fedbox.datasets.femnist

  1import os.path as path
  2import torch
  3from torch.utils.data import Dataset
  4from typing import Any, Tuple, Union
  5
  6from . import utils
  7
  8
  9class FEMNIST(Dataset):
 10    '''
 11    This class loads the FEMNIST dataset, specifically only its 10 classes subset with written digit in [0, 9].
 12
 13    Note
 14    ----
 15    The dataset must be downloaded beforehand using the instructions in the file README.md.
 16    '''
 17
 18    def __init__(
 19        self,
 20        root: str, 
 21        train: bool = True,
 22        transform: Any = None,
 23        target_transform: Any = None, 
 24        download = True
 25    ):
 26        super().__init__()
 27
 28        self.root = root
 29        self.train = train
 30        self.transform = transform
 31        self.target_transform = target_transform
 32        self.download = download
 33        self.data = None
 34        self.targets = None
 35
 36        directory = path.join(self.root, 'femnist')
 37        # NOTE: dataset must be downloaded first by following the instructions in README.md file
 38        if path.exists(directory):
 39            self.data, self.targets, _ = torch.load(path.join(directory, 'training.pt' if self.train else 'testing.pt'))
 40            self.targets = self.targets.to(dtype = torch.float32)
 41        else:
 42            raise RuntimeError('FEMNIST dataset not found in directory {}. It must be downloaded first using the README.md file.'.format(root))
 43
 44        self.data = self.data.unsqueeze(3)
 45        self.targets = self.targets.to(dtype = torch.long)
 46
 47    def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:
 48        x, y = self.data[index], self.targets[index]
 49
 50        if self.transform:
 51            x = self.transform(x)
 52
 53        if self.target_transform:
 54            y = self.target_transform(y)
 55
 56        return x, y
 57    
 58    def __len__(self) -> int:
 59        return self.data.shape[0]
 60
 61def femnist(
 62    directory: str,
 63    n_subsets: int = 1000,
 64    heterogeneity_degree: float = None,
 65    samples_variance: float = 0.0,
 66    transform: Any = None,
 67    target_transform: Any = None,
 68    seed: int = None,
 69    strict: bool = True
 70) -> dict[str, list[utils.FederatedSubset]]:
 71    '''
 72    Loads the `FEMNIST` dataset and partitions it into `n_subsets` training subsets and 
 73    `n_subsets` testing subsets according to heterogeneity parameters `heterogeneity_degree` 
 74    and `samples_variance`.
 75
 76    Parameters
 77    ----------
 78    directory: str
 79        Root directory to load the dataset
 80    n_subsets: int
 81        Number of datasets for splitting
 82    heterogeneity_degree: float
 83        Class heterogeneity degree, by default is homogeneous
 84    samples_variance: float
 85        Standard deviation (%) in the number of samples for each client, `0` by default
 86    transform: Any
 87        Transformation to apply on data samples, `None` by default
 88    target_transform: Any
 89       Transformation to apply on data labels, `None` by default
 90    seed: int
 91        Random seed initializer
 92    strict: bool
 93        In strict mode `heterogeneity_degree` and `samples_variance` are highly respected,
 94        otherwise flexibility is allowed
 95
 96    Returns
 97    -------
 98    dict[str, list[utils.FederatedSubset]]
 99        Returns the lists of subsets of training clients and testing clients
100    '''
101
102    training = FEMNIST(
103        root = directory, 
104        train = True, 
105        download = True,
106        transform = transform,
107        target_transform = target_transform
108    )
109
110    testing = FEMNIST(
111        root = directory, 
112        train = False, 
113        download = True,
114        transform = transform,
115        target_transform = target_transform
116    )
117
118    return {
119        'training': utils.partition(
120            training,
121            n_subsets = min(len(training), n_subsets),
122            n_classes = 10,
123            heterogeneity_degree = heterogeneity_degree,
124            samples_variance = samples_variance,
125            return_indices = False,
126            seed = seed,
127            strict = strict
128        ),
129        'testing': utils.partition(
130            testing,
131            n_subsets = min(len(testing), n_subsets),
132            n_classes = 10,
133            heterogeneity_degree = heterogeneity_degree,
134            samples_variance = samples_variance,
135            return_indices = False,
136            seed = seed,
137            strict = strict
138        )
139    }
class FEMNIST(typing.Generic[+T_co]):
class FEMNIST(Dataset):
    '''
    Loads one split of the FEMNIST dataset, specifically only its 10-class subset of
    handwritten digits in [0, 9].

    The split is read with `torch.load` from `<root>/femnist/training.pt` when `train` is
    true and from `<root>/femnist/testing.pt` otherwise. The stored object is unpacked as a
    3-tuple: samples tensor, labels tensor and a third item that is ignored.

    Note
    ----
    The dataset must be downloaded beforehand using the instructions in the file README.md.
    The `download` flag is only stored on the instance, this class never downloads anything.

    Parameters
    ----------
    root: str
        Directory that contains the `femnist` folder
    train: bool
        Loads the training split when true, the testing split otherwise
    transform: Any
        Callable applied to every sample in `__getitem__`, skipped when falsy
    target_transform: Any
        Callable applied to every label in `__getitem__`, skipped when falsy
    download: bool
        Stored as `self.download`, it does not trigger any download

    Raises
    ------
    RuntimeError
        If the directory `<root>/femnist` does not exist
    '''

    def __init__(
        self,
        root: str,
        train: bool = True,
        transform: Any = None,
        target_transform: Any = None,
        download = True
    ):
        super().__init__()

        self.root = root
        self.train = train
        self.transform = transform
        self.target_transform = target_transform
        self.download = download
        self.data = None
        self.targets = None

        directory = path.join(self.root, 'femnist')
        # NOTE: dataset must be downloaded first by following the instructions in README.md file
        if not path.exists(directory):
            raise RuntimeError('FEMNIST dataset not found in directory {}. It must be downloaded first using the README.md file.'.format(root))

        filename = 'training.pt' if self.train else 'testing.pt'
        # NOTE(review): torch.load relies on pickle, only point `root` at trusted files
        samples, labels, _ = torch.load(path.join(directory, filename))

        # a new axis is appended at position 3
        # presumably (N, H, W) -> (N, H, W, 1), confirm against the stored files
        self.data = samples.unsqueeze(3)
        # labels are converted to long directly: the former intermediate float32 cast was an
        # extra copy that rounded integer values above 2**24
        self.targets = labels.to(dtype = torch.long)

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        '''
        Returns the pair `(sample, label)` stored at position `index`.

        `transform` and `target_transform` are applied to the sample and to the label
        respectively when they are truthy, so the returned types depend on them. Without
        transforms both items are tensors indexed out of `self.data` and `self.targets`.
        '''

        x, y = self.data[index], self.targets[index]

        if self.transform:
            x = self.transform(x)

        if self.target_transform:
            y = self.target_transform(y)

        return x, y

    def __len__(self) -> int:
        '''
        Returns the number of samples, namely the size of the first dimension of `self.data`.
        '''

        return self.data.shape[0]

This class loads the FEMNIST dataset, specifically only its 10-class subset of handwritten digits in [0, 9].

Note

The dataset must be downloaded beforehand using the instructions in the file README.md.

FEMNIST( root: str, train: bool = True, transform: Any = None, target_transform: Any = None, download=True)
19    def __init__(
20        self,
21        root: str, 
22        train: bool = True,
23        transform: Any = None,
24        target_transform: Any = None, 
25        download = True
26    ):
27        super().__init__()
28
29        self.root = root
30        self.train = train
31        self.transform = transform
32        self.target_transform = target_transform
33        self.download = download
34        self.data = None
35        self.targets = None
36
37        directory = path.join(self.root, 'femnist')
38        # NOTE: dataset must be downloaded first by following the instructions in README.md file
39        if path.exists(directory):
40            self.data, self.targets, _ = torch.load(path.join(directory, 'training.pt' if self.train else 'testing.pt'))
41            self.targets = self.targets.to(dtype = torch.float32)
42        else:
43            raise RuntimeError('FEMNIST dataset not found in directory {}. It must be downloaded first using the README.md file.'.format(root))
44
45        self.data = self.data.unsqueeze(3)
46        self.targets = self.targets.to(dtype = torch.long)
root
train
transform
target_transform
download
data
targets
def femnist( directory: str, n_subsets: int = 1000, heterogeneity_degree: float = None, samples_variance: float = 0.0, transform: Any = None, target_transform: Any = None, seed: int = None, strict: bool = True) -> dict[str, list[fedbox.datasets.utils.FederatedSubset]]:
 62def femnist(
 63    directory: str,
 64    n_subsets: int = 1000,
 65    heterogeneity_degree: float = None,
 66    samples_variance: float = 0.0,
 67    transform: Any = None,
 68    target_transform: Any = None,
 69    seed: int = None,
 70    strict: bool = True
 71) -> dict[str, list[utils.FederatedSubset]]:
 72    '''
 73    Loads the `FEMNIST` dataset and partitions it into `n_subsets` training subsets and 
 74    `n_subsets` testing subsets according to heterogeneity parameters `heterogeneity_degree` 
 75    and `samples_variance`.
 76
 77    Parameters
 78    ----------
 79    directory: str
 80        Root directory to load the dataset
 81    n_subsets: int
 82        Number of datasets for splitting
 83    heterogeneity_degree: float
 84        Class heterogeneity degree, by default is homogeneous
 85    samples_variance: float
 86        Standard deviation (%) in the number of samples for each client, `0` by default
 87    transform: Any
 88        Transformation to apply on data samples, `None` by default
 89    target_transform: Any
 90       Transformation to apply on data labels, `None` by default
 91    seed: int
 92        Random seed initializer
 93    strict: bool
 94        In strict mode `heterogeneity_degree` and `samples_variance` are highly respected,
 95        otherwise flexibility is allowed
 96
 97    Returns
 98    -------
 99    dict[str, list[utils.FederatedSubset]]
100        Returns the lists of subsets of training clients and testing clients
101    '''
102
103    training = FEMNIST(
104        root = directory, 
105        train = True, 
106        download = True,
107        transform = transform,
108        target_transform = target_transform
109    )
110
111    testing = FEMNIST(
112        root = directory, 
113        train = False, 
114        download = True,
115        transform = transform,
116        target_transform = target_transform
117    )
118
119    return {
120        'training': utils.partition(
121            training,
122            n_subsets = min(len(training), n_subsets),
123            n_classes = 10,
124            heterogeneity_degree = heterogeneity_degree,
125            samples_variance = samples_variance,
126            return_indices = False,
127            seed = seed,
128            strict = strict
129        ),
130        'testing': utils.partition(
131            testing,
132            n_subsets = min(len(testing), n_subsets),
133            n_classes = 10,
134            heterogeneity_degree = heterogeneity_degree,
135            samples_variance = samples_variance,
136            return_indices = False,
137            seed = seed,
138            strict = strict
139        )
140    }

Loads the FEMNIST dataset and partitions it into n_subsets training subsets and n_subsets testing subsets according to heterogeneity parameters heterogeneity_degree and samples_variance.

Parameters
  • directory (str): Root directory to load the dataset
  • n_subsets (int): Number of subsets each split (training and testing) is partitioned into
  • heterogeneity_degree (float): Class heterogeneity degree, by default is homogeneous
  • samples_variance (float): Standard deviation (%) in the number of samples for each client, 0 by default
  • transform (Any): Transformation to apply on data samples, None by default
  • target_transform (Any): Transformation to apply on data labels, None by default
  • seed (int): Random seed initializer
  • strict (bool): In strict mode heterogeneity_degree and samples_variance are strictly respected; otherwise some flexibility is allowed
Returns
  • dict[str, list[utils.FederatedSubset]]: Returns the lists of subsets of training clients and testing clients