fedbox.datasets.utils
import os
import random
import warnings
from typing import Sequence, Union

import numpy
import torch
from torch.utils.data import Dataset, Subset, TensorDataset
from torchvision.datasets import CIFAR10, CIFAR100


class FederatedSubset(Subset):
    '''
    This class represents the local subset held by each client in a federated simulation.
    '''

    def __init__(self, dataset: Dataset, indices: Sequence[int]) -> None:
        super().__init__(dataset, indices)
        # CIFAR images are stored as uint8 arrays in [0, 255]; other datasets are
        # assumed to be already scaled
        self.normalization = 255 if isinstance(dataset, (CIFAR10, CIFAR100)) else 1

    @property
    def data(self) -> torch.Tensor:
        '''Examples belonging to this local subset.'''
        if isinstance(self.dataset, TensorDataset):
            return self.dataset.tensors[0][self.indices]
        return self.dataset.data[self.indices]

    @property
    def targets(self) -> torch.Tensor:
        '''Labels belonging to this local subset.'''
        if isinstance(self.dataset, TensorDataset):
            return self.dataset.tensors[1][self.indices]
        if isinstance(self.dataset.targets, list):
            return torch.Tensor(self.dataset.targets)[self.indices]
        return self.dataset.targets[self.indices]


def set_seed(s: int):
    '''
    Sets the same random initialization seed across multiple libraries, and
    enables the usage of deterministic algorithms in PyTorch.

    Parameters
    ----------
    s: int
        Seed initialization value
    '''

    random.seed(s)
    numpy.random.seed(s)
    torch.manual_seed(s)
    # cuBLAS needs this workspace configuration to behave deterministically on
    # CUDA >= 10.2, see the PyTorch reproducibility notes; versions are compared
    # numerically (FIX: the original compared strings, so e.g. '9.2' >= '10.2')
    if torch.version.cuda is not None and tuple(map(int, torch.version.cuda.split('.')[:2])) >= (10, 2):
        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
    # FIX: deterministic algorithms must be enabled in every case; the original
    # `if/else` skipped this call exactly when the cuBLAS workaround was applied
    torch.use_deterministic_algorithms(True)


def partition(
    dataset: Dataset,
    n_subsets: int,
    n_classes: int = None,
    heterogeneity_degree: float = 0.0,
    samples_variance: float = 0.0,
    return_indices: bool = False,
    seed: int = None,
    strict: bool = True
) -> Union[list[list[int]], list[FederatedSubset]]:
    '''
    Partitions a dataset into `n_subsets` heterogeneously or homogeneously according to
    `heterogeneity_degree` and `samples_variance`.

    Parameters
    ----------
    dataset: Dataset
        Dataset (pytorch)
    n_subsets: int
        Number of datasets for splitting
    n_classes: int
        Number of classes in the dataset, inferred if `None`
    heterogeneity_degree: float
        Class heterogeneity degree, by default is homogeneous
    samples_variance: float
        Standard deviation (%) in the number of samples for each client, `0` by default
    return_indices: bool
        To return indices instead of federated subsets
    seed: int
        Random seed initializer
    strict: bool
        In strict mode `heterogeneity_degree` and `samples_variance` are highly respected,
        otherwise flexibility is allowed

    Returns
    -------
    Union[list[list[int]], list[FederatedSubset]]
        If `return_indices` is `False` then returns federated subsets, otherwise indices

    Note
    ----
    Heterogeneity degree is the inverse of the concentration parameter of a Dirichlet distribution
    used to sample class ratios across each subset, whilst sample variance refers to the variance in
    the number of samples assigned to each subset, which is extracted from a log-normal distribution.
    '''

    # labels of dataset, extracted according to the kind of dataset
    if isinstance(dataset, TensorDataset):
        labels = dataset.tensors[1]
    else:
        assert hasattr(dataset, 'data'), "Dataset needs to have .data attribute"
        assert hasattr(dataset, 'targets'), "Dataset needs to have .targets attribute"
        labels = dataset.targets
    # number of classes extracted from data
    n_class_inferred = numpy.unique(labels).shape[0]
    # parameter checking
    assert 0 < n_subsets <= len(dataset), "Number of subsets must be between 1 and number of samples"
    assert heterogeneity_degree is None or heterogeneity_degree >= 0, "Dirichelet concentration must be a positive number"
    assert samples_variance is None or samples_variance >= 0, "Log-normal variance must be a positive number"
    assert n_classes is None or 0 < n_classes <= n_class_inferred, "Number of classes must be between 1 and number of labels' classes"
    # FIX: infer the number of classes when not given; the original evaluated
    # `None < n_class_inferred`, raising TypeError whenever `n_classes` was omitted
    if n_classes is None:
        n_classes = n_class_inferred
    # using fewer classes than available is admissible even though it is signaled
    elif n_classes < n_class_inferred:
        warnings.warn(f"Number of classes specified {n_classes} is inferior to inferred number of classes {n_class_inferred}")
    # random generator initialization for reproducibility
    # FIX: `if seed:` silently ignored the legitimate seed value 0
    if seed is not None:
        set_seed(seed)
    # for each class extracts the indices of the corresponding samples
    samples_indices = [ numpy.argwhere(numpy.array(labels) == k).reshape(-1) for k in range(n_classes) ]
    # number of samples in each subset of the partition (computed with respect to availability of samples)
    n_subset_samples = sum(len(x) // n_subsets for x in samples_indices)
    # no sample variance, so each user will receive the same amount of samples
    if samples_variance is None or samples_variance == 0:
        n_subset_samples_sampled = numpy.repeat(n_subset_samples, n_subsets)
    else:
        # number of samples of each user sampled from a log-normal distribution
        n_subset_samples_sampled = numpy.random.lognormal(numpy.log(n_subset_samples), samples_variance, size = n_subsets)
        # normalization with respect to dataset size
        n_subset_samples_sampled = ((n_subset_samples_sampled / n_subset_samples_sampled.sum()) * n_subset_samples * n_subsets).astype(int)
    # in case of homogeneity each subset has the same fraction of data for each class
    if heterogeneity_degree is None or heterogeneity_degree == 0:
        n_partition_label_ratios_sampled = numpy.ones((n_subsets, n_classes)) / n_classes
    else:
        # otherwise, for each class, the ratios of data assigned are sampled from a Dirichlet distribution
        alpha = 1.0 / heterogeneity_degree * numpy.ones(n_subsets)
        n_partition_label_ratios_sampled = numpy.random.dirichlet(alpha, size = n_classes).T
    # number of assigned samples used for each class
    n_consumed_class_samples = [ 0 for _ in range(n_classes) ]
    # indices of samples assigned to each subset
    subset_indices = [ [] for _ in range(n_subsets) ]
    # number of samples that each client expects from each class (using floor)
    n_expected_samples_subset = numpy.floor(n_subset_samples_sampled.reshape(n_subsets, 1) * n_partition_label_ratios_sampled / n_partition_label_ratios_sampled.sum(axis = 1).reshape(n_subsets, 1)).astype(int)
    # number of samples effectively assigned to each client from each class
    n_effective_samples_subset = numpy.zeros((n_subsets, n_classes))
    # number of subsets (can be less than requested because some are empty, especially in strict mode)
    n_effective_subsets = None
    # shuffle data samples' indices of each class
    for k in range(n_classes):
        numpy.random.shuffle(samples_indices[k])
    # assigns samples to each subset
    for subset_index in range(n_subsets):
        # counter of number of samples assigned to each class for the current client
        n_received_samples = numpy.zeros(n_classes)
        # assigns samples of each class
        for k in range(n_classes):
            # in strict mode stops assigning and adding clients whenever the samples of any class are exhausted
            if strict and len(samples_indices[k]) - n_consumed_class_samples[k] < n_expected_samples_subset[subset_index, k]:
                n_effective_subsets = subset_index
                break
            # how many samples should be given to the current client for the current class, without exceeding availability
            n_consumed_already = min(n_consumed_class_samples[k], len(samples_indices[k]))
            n_consumed_current = min(n_consumed_class_samples[k] + n_expected_samples_subset[subset_index, k], len(samples_indices[k]))
            # assigns samples to subset
            subset_indices[subset_index].extend(samples_indices[k][n_consumed_already:n_consumed_current])
            # updates counters
            n_received_samples[k] = max(n_consumed_current - n_consumed_already, 0)
            n_consumed_class_samples[k] = n_consumed_current
        # if any class availability is over then we stop building subsets and we exit
        if strict and n_effective_subsets is not None:
            subset_indices = subset_indices[:n_effective_subsets]
            break
        # updates counter of data assigned to subset
        n_effective_samples_subset[subset_index] = n_received_samples
    # indices of subsets
    indices = range(n_subsets)
    # importance given to the lack of data in a subset when distributing leftover samples
    inter_subset_samples_amount_importance = 0.25
    # distribution of leftover samples happens only in non-strict mode
    if not strict:
        for k in range(n_classes):
            # how many samples are left for the current class
            n_class_samples_left = len(samples_indices[k]) - n_consumed_class_samples[k]
            # how many samples are missing from each subset
            n_samples_left = n_expected_samples_subset.sum(axis = 1) - n_effective_samples_subset.sum(axis = 1)
            n_samples_left[n_samples_left < 0] = 0
            # skip class samples distribution in case of no sample left
            if n_class_samples_left <= 0:
                continue
            # probability, for each subset, of getting new data of the current class
            probabilities = 1 + (n_expected_samples_subset[:, k] - n_effective_samples_subset[:, k]) + inter_subset_samples_amount_importance * n_samples_left
            probabilities /= probabilities.sum()
            # weighted sampling with replacement of a number of subsets equal to the number of samples left
            chosen_subsets = numpy.random.choice(
                indices,
                p = probabilities,
                size = n_class_samples_left,
                replace = True
            )
            # moves leftover samples to the selected subsets
            for chosen_subset in chosen_subsets:
                subset_indices[chosen_subset].append(samples_indices[k][n_consumed_class_samples[k]])
                # updates number of samples assigned and consumed
                n_consumed_class_samples[k] += 1
                n_effective_samples_subset[chosen_subset, k] += 1
    # does not construct federated subsets
    if return_indices:
        return subset_indices
    # returns federated subsets, exclusively non empty
    return [ FederatedSubset(dataset, indices) for indices in subset_indices if len(indices) > 0 ]
class
FederatedSubset(torch.utils.data.dataset.Dataset[+T_co]):
class FederatedSubset(Subset):
    '''
    This class represents the local subset held by each client in a federated simulation.
    '''

    def __init__(self, dataset: Dataset, indices: Sequence[int]) -> None:
        super().__init__(dataset, indices)
        # CIFAR datasets store raw uint8 pixels, hence the 255 scaling factor
        holds_cifar = isinstance(dataset, (CIFAR10, CIFAR100))
        self.normalization = 255 if holds_cifar else 1

    @property
    def data(self) -> torch.Tensor:
        '''Examples belonging to this local subset.'''
        source = self.dataset
        if isinstance(source, TensorDataset):
            return source.tensors[0][self.indices]
        return source.data[self.indices]

    @property
    def targets(self) -> torch.Tensor:
        '''Labels belonging to this local subset.'''
        source = self.dataset
        if isinstance(source, TensorDataset):
            return source.tensors[1][self.indices]
        labels = source.targets
        if isinstance(labels, list):
            labels = torch.Tensor(labels)
        return labels[self.indices]
This class represents the local subset held by each client in a federated simulation.
Inherited Members
- torch.utils.data.dataset.Subset
- dataset
- indices
def
set_seed(s: int):
def set_seed(s: int):
    '''
    Sets the same random initialization seed across multiple libraries, and
    enables the usage of deterministic algorithms in PyTorch.

    Parameters
    ----------
    s: int
        Seed initialization value
    '''

    random.seed(s)
    numpy.random.seed(s)
    torch.manual_seed(s)
    # cuBLAS needs this workspace configuration to behave deterministically on
    # CUDA >= 10.2, see the PyTorch reproducibility notes; versions are compared
    # numerically (FIX: the original compared strings, so e.g. '9.2' >= '10.2')
    if torch.version.cuda is not None and tuple(map(int, torch.version.cuda.split('.')[:2])) >= (10, 2):
        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
    # FIX: deterministic algorithms must be enabled in every case; the original
    # `if/else` skipped this call exactly when the cuBLAS workaround was applied
    torch.use_deterministic_algorithms(True)
Sets the same random initialization seed across multiple libraries, and enables the usage of deterministic algorithms in PyTorch.
Parameters
- s (int): Seed initialization value
def
partition( dataset: torch.utils.data.dataset.Dataset, n_subsets: int, n_classes: int = None, heterogeneity_degree: float = 0.0, samples_variance: float = 0.0, return_indices: bool = False, seed: int = None, strict: bool = True) -> Union[list[list[int]], list[FederatedSubset]]:
def partition(
    dataset: Dataset,
    n_subsets: int,
    n_classes: int = None,
    heterogeneity_degree: float = 0.0,
    samples_variance: float = 0.0,
    return_indices: bool = False,
    seed: int = None,
    strict: bool = True
) -> Union[list[list[int]], list[FederatedSubset]]:
    '''
    Partitions a dataset into `n_subsets` heterogeneously or homogeneously according to
    `heterogeneity_degree` and `samples_variance`.

    Parameters
    ----------
    dataset: Dataset
        Dataset (pytorch)
    n_subsets: int
        Number of datasets for splitting
    n_classes: int
        Number of classes in the dataset, inferred if `None`
    heterogeneity_degree: float
        Class heterogeneity degree, by default is homogeneous
    samples_variance: float
        Standard deviation (%) in the number of samples for each client, `0` by default
    return_indices: bool
        To return indices instead of federated subsets
    seed: int
        Random seed initializer
    strict: bool
        In strict mode `heterogeneity_degree` and `samples_variance` are highly respected,
        otherwise flexibility is allowed

    Returns
    -------
    Union[list[list[int]], list[FederatedSubset]]
        If `return_indices` is `False` then returns federated subsets, otherwise indices

    Note
    ----
    Heterogeneity degree is the inverse of the concentration parameter of a Dirichlet distribution
    used to sample class ratios across each subset, whilst sample variance refers to the variance in
    the number of samples assigned to each subset, which is extracted from a log-normal distribution.
    '''

    # labels of dataset, extracted according to the kind of dataset
    if isinstance(dataset, TensorDataset):
        labels = dataset.tensors[1]
    else:
        assert hasattr(dataset, 'data'), "Dataset needs to have .data attribute"
        assert hasattr(dataset, 'targets'), "Dataset needs to have .targets attribute"
        labels = dataset.targets
    # number of classes extracted from data
    n_class_inferred = numpy.unique(labels).shape[0]
    # parameter checking
    assert 0 < n_subsets <= len(dataset), "Number of subsets must be between 1 and number of samples"
    assert heterogeneity_degree is None or heterogeneity_degree >= 0, "Dirichelet concentration must be a positive number"
    assert samples_variance is None or samples_variance >= 0, "Log-normal variance must be a positive number"
    assert n_classes is None or 0 < n_classes <= n_class_inferred, "Number of classes must be between 1 and number of labels' classes"
    # FIX: infer the number of classes when not given; the original evaluated
    # `None < n_class_inferred`, raising TypeError whenever `n_classes` was omitted
    if n_classes is None:
        n_classes = n_class_inferred
    # using fewer classes than available is admissible even though it is signaled
    elif n_classes < n_class_inferred:
        warnings.warn(f"Number of classes specified {n_classes} is inferior to inferred number of classes {n_class_inferred}")
    # random generator initialization for reproducibility
    # FIX: `if seed:` silently ignored the legitimate seed value 0
    if seed is not None:
        set_seed(seed)
    # for each class extracts the indices of the corresponding samples
    samples_indices = [ numpy.argwhere(numpy.array(labels) == k).reshape(-1) for k in range(n_classes) ]
    # number of samples in each subset of the partition (computed with respect to availability of samples)
    n_subset_samples = sum(len(x) // n_subsets for x in samples_indices)
    # no sample variance, so each user will receive the same amount of samples
    if samples_variance is None or samples_variance == 0:
        n_subset_samples_sampled = numpy.repeat(n_subset_samples, n_subsets)
    else:
        # number of samples of each user sampled from a log-normal distribution
        n_subset_samples_sampled = numpy.random.lognormal(numpy.log(n_subset_samples), samples_variance, size = n_subsets)
        # normalization with respect to dataset size
        n_subset_samples_sampled = ((n_subset_samples_sampled / n_subset_samples_sampled.sum()) * n_subset_samples * n_subsets).astype(int)
    # in case of homogeneity each subset has the same fraction of data for each class
    if heterogeneity_degree is None or heterogeneity_degree == 0:
        n_partition_label_ratios_sampled = numpy.ones((n_subsets, n_classes)) / n_classes
    else:
        # otherwise, for each class, the ratios of data assigned are sampled from a Dirichlet distribution
        alpha = 1.0 / heterogeneity_degree * numpy.ones(n_subsets)
        n_partition_label_ratios_sampled = numpy.random.dirichlet(alpha, size = n_classes).T
    # number of assigned samples used for each class
    n_consumed_class_samples = [ 0 for _ in range(n_classes) ]
    # indices of samples assigned to each subset
    subset_indices = [ [] for _ in range(n_subsets) ]
    # number of samples that each client expects from each class (using floor)
    n_expected_samples_subset = numpy.floor(n_subset_samples_sampled.reshape(n_subsets, 1) * n_partition_label_ratios_sampled / n_partition_label_ratios_sampled.sum(axis = 1).reshape(n_subsets, 1)).astype(int)
    # number of samples effectively assigned to each client from each class
    n_effective_samples_subset = numpy.zeros((n_subsets, n_classes))
    # number of subsets (can be less than requested because some are empty, especially in strict mode)
    n_effective_subsets = None
    # shuffle data samples' indices of each class
    for k in range(n_classes):
        numpy.random.shuffle(samples_indices[k])
    # assigns samples to each subset
    for subset_index in range(n_subsets):
        # counter of number of samples assigned to each class for the current client
        n_received_samples = numpy.zeros(n_classes)
        # assigns samples of each class
        for k in range(n_classes):
            # in strict mode stops assigning and adding clients whenever the samples of any class are exhausted
            if strict and len(samples_indices[k]) - n_consumed_class_samples[k] < n_expected_samples_subset[subset_index, k]:
                n_effective_subsets = subset_index
                break
            # how many samples should be given to the current client for the current class, without exceeding availability
            n_consumed_already = min(n_consumed_class_samples[k], len(samples_indices[k]))
            n_consumed_current = min(n_consumed_class_samples[k] + n_expected_samples_subset[subset_index, k], len(samples_indices[k]))
            # assigns samples to subset
            subset_indices[subset_index].extend(samples_indices[k][n_consumed_already:n_consumed_current])
            # updates counters
            n_received_samples[k] = max(n_consumed_current - n_consumed_already, 0)
            n_consumed_class_samples[k] = n_consumed_current
        # if any class availability is over then we stop building subsets and we exit
        if strict and n_effective_subsets is not None:
            subset_indices = subset_indices[:n_effective_subsets]
            break
        # updates counter of data assigned to subset
        n_effective_samples_subset[subset_index] = n_received_samples
    # indices of subsets
    indices = range(n_subsets)
    # importance given to the lack of data in a subset when distributing leftover samples
    inter_subset_samples_amount_importance = 0.25
    # distribution of leftover samples happens only in non-strict mode
    if not strict:
        for k in range(n_classes):
            # how many samples are left for the current class
            n_class_samples_left = len(samples_indices[k]) - n_consumed_class_samples[k]
            # how many samples are missing from each subset
            n_samples_left = n_expected_samples_subset.sum(axis = 1) - n_effective_samples_subset.sum(axis = 1)
            n_samples_left[n_samples_left < 0] = 0
            # skip class samples distribution in case of no sample left
            if n_class_samples_left <= 0:
                continue
            # probability, for each subset, of getting new data of the current class
            probabilities = 1 + (n_expected_samples_subset[:, k] - n_effective_samples_subset[:, k]) + inter_subset_samples_amount_importance * n_samples_left
            probabilities /= probabilities.sum()
            # weighted sampling with replacement of a number of subsets equal to the number of samples left
            chosen_subsets = numpy.random.choice(
                indices,
                p = probabilities,
                size = n_class_samples_left,
                replace = True
            )
            # moves leftover samples to the selected subsets
            for chosen_subset in chosen_subsets:
                subset_indices[chosen_subset].append(samples_indices[k][n_consumed_class_samples[k]])
                # updates number of samples assigned and consumed
                n_consumed_class_samples[k] += 1
                n_effective_samples_subset[chosen_subset, k] += 1
    # does not construct federated subsets
    if return_indices:
        return subset_indices
    # returns federated subsets, exclusively non empty
    return [ FederatedSubset(dataset, indices) for indices in subset_indices if len(indices) > 0 ]
Partitions a dataset into n_subsets heterogeneously or homogeneously according to
heterogeneity_degree and samples_variance.
Parameters
- dataset (Dataset): Dataset (pytorch)
- n_subsets (int): Number of datasets for splitting
- n_classes (int): Number of classes in the dataset, inferred if None
- heterogeneity_degree (float): Class heterogeneity degree, by default is homogeneous
- samples_variance (float): Standard deviation (%) in the number of samples for each client, 0 by default
- return_indices (bool): To return indices instead of federated subsets
- seed (int): Random seed initializer
- strict (bool): In strict mode heterogeneity_degree and samples_variance are highly respected, otherwise flexibility is allowed
Returns
- Union[list[list[int]], list[FederatedSubset]]: If return_indices is False then returns federated subsets, otherwise indices
Note
Heterogeneity degree is the inverse of the concentration parameter of a Dirichlet distribution used to sample class ratios across each subset, whilst sample variance refers to the variance in the number of samples assigned to each subset, which is extracted from a log-normal distribution.