fedbox.datasets.utils

  1import numpy
  2import os
  3import random
  4import torch
  5from torch.utils.data import Dataset, Subset, TensorDataset
  6from typing import Sequence, Union
  7import warnings
  8
  9from torchvision.datasets import CIFAR10, CIFAR100
 10
 11class FederatedSubset(Subset):
 12    '''
 13    This class represents the local subset held by each client in a federated simulation.
 14    '''
 15
 16    def __init__(self, dataset: Dataset, indices: Sequence[int]) -> None:
 17        super().__init__(dataset, indices)
 18
 19        self.normalization = 255 if (isinstance(dataset, CIFAR10) or isinstance(dataset, CIFAR100)) else 1
 20
 21    @property
 22    def data(self) -> torch.Tensor:
 23        if isinstance(self.dataset, TensorDataset):
 24            return self.dataset.tensors[0][self.indices]
 25        return self.dataset.data[self.indices]
 26    
 27    @property
 28    def targets(self) -> torch.Tensor:
 29        if isinstance(self.dataset, TensorDataset):
 30            return self.dataset.tensors[1][self.indices]
 31        if isinstance(self.dataset.targets, list):
 32            return torch.Tensor(self.dataset.targets)[self.indices]
 33        return self.dataset.targets[self.indices]
 34    
 35def set_seed(s: int):
 36    '''
 37    Sets the same random initialization seed across multiple libraries, and
 38    enables the usage of deterministic algorithms in PyTorch.
 39
 40    Parameters
 41    ----------
 42    s: int
 43        Seed initialization value
 44    '''
 45    
 46    random.seed(s)
 47    numpy.random.seed(s)
 48    torch.manual_seed(s)
 49    
 50    if torch.version.cuda is not None and torch.version.cuda >= '10.2':
 51        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
 52    else:
 53        torch.use_deterministic_algorithms(True)
 54
 55def partition(
 56    dataset: Dataset,
 57    n_subsets: int,
 58    n_classes: int = None,
 59    heterogeneity_degree: float = 0.0,
 60    samples_variance: float = 0.0,
 61    return_indices: bool = False,
 62    seed: int = None,
 63    strict: bool = True
 64) -> Union[list[list[int]], list[FederatedSubset]]:
 65    '''
 66    Partitions a dataset in `n_subsets` heterogenously or homogenously according to
 67    `heterogeneity_degree` and `samples_variance`.
 68
 69    Parameters
 70    ----------
 71    dataset: Dataset
 72        Dataset (pytorch)
 73    n_subsets: int
 74        Number of datasets for splitting
 75    n_classes: int
 76        Number of classes in the dataset, inferred if `None`
 77    heterogeneity_degree: float
 78        Class heterogeneity degree, by default is homogeneous
 79    samples_variance: float
 80        Standard deviation (%) in the number of samples for each client, `0` by default
 81    return_indices: bool
 82        To return indices instead of federated subsets
 83    seed: int
 84        Random seed initializer
 85    strict: bool
 86        In strict mode `heterogeneity_degree` and `samples_variance` are highly respected,
 87        otherwise flexibility is allowed
 88
 89    Returns
 90    -------
 91    Union[list[list[int]], list[FederatedSubset]]
 92        If `return_indices` is `False` then returns federated subsets, otherwise indices
 93
 94    Note
 95    ----
 96    Heterogeneity degree is the inverse of the concentration parameter of a Dirichelet distribution
 97    used to sample class ratios across each subset, whilst sample variance refers to the variance in
 98    the number of samples assigned to each subset, which is extracted from a log-normal distribution.
 99    '''
100
101    # labels of dataset, to be computed
102    labels: torch.Tensor = None
103    # discriminate between kind of datasets
104    if isinstance(dataset, TensorDataset):
105        labels = dataset.tensors[1]
106    else:
107        assert hasattr(dataset, 'data'), "Dataset needs to have .data attribute"
108        assert hasattr(dataset, 'targets'), "Dataset needs to have .targets attribute"
109        labels = dataset.targets
110    # number of classes extracted from data
111    n_class_inferred = numpy.unique(labels).shape[0]
112    # parameter checking
113    assert 0 < n_subsets <= len(dataset), "Number of subsets must be between 1 and number of samples"
114    assert heterogeneity_degree is None or heterogeneity_degree >= 0, "Dirichelet concentration must be a positive number"
115    assert samples_variance is None or samples_variance >= 0, "Log-normal variance must be a positive number"
116    assert n_classes is None or 0 < n_classes <= n_class_inferred, "Number of classes must be between 1 and number of labels' classes"
117    # using less classes than expected is admissible even though it is signaled
118    if n_classes < n_class_inferred:
119        warnings.warn(f"Number of classes specified {n_classes} is inferior to inferred number of classes {n_class_inferred}")
120    # random generator initialization for reproducibility
121    if seed:
122        set_seed(seed)
123    # for each class extracts the indices of the corresponding samples 
124    samples_indices = [ numpy.argwhere(numpy.array(labels) == k).reshape(-1) for k in range(n_classes) ]
125    # number of samples in each subset of the partition (computed with respect to availability of samples)
126    n_subset_samples = sum([ len(x) // n_subsets for x in samples_indices ])
127    # no sample variance, so each user will receive same amount of samples
128    if samples_variance is None or samples_variance == 0:
129        n_subset_samples_sampled = numpy.repeat(n_subset_samples, n_subsets)
130    # otherwise number of samples of each user are sampled according to lognormal distribution
131    else:
132        # log normal extraction
133        n_subset_samples_sampled = numpy.random.lognormal(numpy.log(n_subset_samples), samples_variance, size = n_subsets)
134        # normalization with respect to dataset size
135        n_subset_samples_sampled = ((n_subset_samples_sampled / n_subset_samples_sampled.sum()) * n_subset_samples * n_subsets).astype(int)
136    # in case of homogeneity each subset has the same fraction of data for each class
137    if heterogeneity_degree is None or heterogeneity_degree == 0:
138        n_partition_label_ratios_sampled = numpy.ones((n_subsets, n_classes)) / n_classes
139    else:
140        # otherwise for each class the ratio of corresponding data assigned are sampled according to dirichelet distribution
141        alpha = 1.0 / heterogeneity_degree * numpy.ones(n_subsets)
142        n_partition_label_ratios_sampled = numpy.random.dirichlet(alpha, size = n_classes).T
143    # number of assigned samples used for each class
144    n_consumed_class_samples = [ 0 for _ in range(n_classes) ]
145    # indices of samples assigned to each subset
146    subset_indices = [ [] for _ in range(n_subsets) ]
147    # computation of number of samples that each client expects from each class (using floor)
148    n_expected_samples_subset = numpy.floor(n_subset_samples_sampled.reshape(n_subsets, 1) * n_partition_label_ratios_sampled / n_partition_label_ratios_sampled.sum(axis = 1).reshape(n_subsets, 1)).astype(int)
149    # number of samples effectively assigned to each client from each class
150    n_effective_samples_subset = numpy.zeros((n_subsets, n_classes))
151    # number of subsets (can be less that requested because some are empty, especially in strict mode)
152    n_effective_subsets = None
153    # shuffle data samples' indices of each class
154    for k in range(n_classes):
155        numpy.random.shuffle(samples_indices[k])
156    # assigns samples to each subset
157    for subset_index in range(n_subsets):
158        # counter of number of samples assigned to each class for the current client
159        n_received_samples = numpy.zeros(n_classes)
160        # assigns samples of each class
161        for k in range(n_classes):
162            # in strict mode stops assigning and adding clients whenever the samples of any class are exhausted
163            if strict and len(samples_indices[k]) - n_consumed_class_samples[k] < n_expected_samples_subset[subset_index, k]:
164                n_effective_subsets = subset_index
165                break
166            # computes how many samples should be given to current client for current class, without exceeding the availability
167            n_consumed_already = min(n_consumed_class_samples[k], len(samples_indices[k]))
168            n_consumed_current = min(n_consumed_class_samples[k] + n_expected_samples_subset[subset_index, k], len(samples_indices[k]))
169            # assigns samples to subset
170            subset_indices[subset_index].extend(samples_indices[k][n_consumed_already:n_consumed_current])
171            # updates counter
172            n_received_samples[k] = max(n_consumed_current - n_consumed_already, 0)
173            n_consumed_class_samples[k] = n_consumed_current
174        # if any class availabilty is over then we stop building subsets and we exit
175        if strict and n_effective_subsets is not None:
176            subset_indices = subset_indices[:n_effective_subsets]
177            break
178        # updates counter of data assigned to subset
179        n_effective_samples_subset[subset_index] = n_received_samples
180    # indices of subsets
181    indices = range(n_subsets)
182    # this indicates how much importance is given to lack of data in subset when we need to account for
183    # probabilities of distributing samples left from previous iterations
184    inter_subset_samples_amount_importance = 0.25
185    # distribution of left samples happens only in non-strict mode
186    if not strict:
187        for k in range(n_classes):
188            # computes how many samples are left for each subset and for each class
189            n_class_samples_left = len(samples_indices[k]) - n_consumed_class_samples[k]
190            # computes how many samples are left for each subset
191            n_samples_left = n_expected_samples_subset.sum(axis = 1) - n_effective_samples_subset.sum(axis = 1)
192            n_samples_left[n_samples_left < 0] = 0
193            # skip class samples distribution in case of no sample left
194            if n_class_samples_left <= 0:
195                continue
196            # computes for each subset probability of getting new data of current class
197            probabilities = 1 + (n_expected_samples_subset[:, k] - n_effective_samples_subset[:, k]) + inter_subset_samples_amount_importance * n_samples_left
198            probabilities /= probabilities.sum()
199            # weighted sampling with replacement a number of subsets equal to the number of samples left
200            chosen_subsets = numpy.random.choice(
201                indices,
202                p = probabilities,
203                size = n_class_samples_left,
204                replace = True
205            )
206            # moves left samples to selected subsets for each subset
207            for chosen_subset in chosen_subsets:
208                subset_indices[chosen_subset].append(samples_indices[k][n_consumed_class_samples[k]])
209                # updates number of samples assigned and consumed
210                n_consumed_class_samples[k] += 1
211                n_effective_samples_subset[chosen_subset, k] += 1
212    # does not construct federated subset
213    if return_indices:
214        return subset_indices
215    # returns federated subset, exclusively non empty
216    return [ FederatedSubset(dataset, indices) for indices in subset_indices if len(indices) > 0 ]
class FederatedSubset(torch.utils.data.dataset.Dataset[+T_co]):
12class FederatedSubset(Subset):
13    '''
14    This class represents the local subset held by each client in a federated simulation.
15    '''
16
17    def __init__(self, dataset: Dataset, indices: Sequence[int]) -> None:
18        super().__init__(dataset, indices)
19
20        self.normalization = 255 if (isinstance(dataset, CIFAR10) or isinstance(dataset, CIFAR100)) else 1
21
22    @property
23    def data(self) -> torch.Tensor:
24        if isinstance(self.dataset, TensorDataset):
25            return self.dataset.tensors[0][self.indices]
26        return self.dataset.data[self.indices]
27    
28    @property
29    def targets(self) -> torch.Tensor:
30        if isinstance(self.dataset, TensorDataset):
31            return self.dataset.tensors[1][self.indices]
32        if isinstance(self.dataset.targets, list):
33            return torch.Tensor(self.dataset.targets)[self.indices]
34        return self.dataset.targets[self.indices]

This class represents the local subset held by each client in a federated simulation.

FederatedSubset(dataset: torch.utils.data.dataset.Dataset, indices: Sequence[int])
17    def __init__(self, dataset: Dataset, indices: Sequence[int]) -> None:
18        super().__init__(dataset, indices)
19
20        self.normalization = 255 if (isinstance(dataset, CIFAR10) or isinstance(dataset, CIFAR100)) else 1
normalization
data: torch.Tensor
22    @property
23    def data(self) -> torch.Tensor:
24        if isinstance(self.dataset, TensorDataset):
25            return self.dataset.tensors[0][self.indices]
26        return self.dataset.data[self.indices]
targets: torch.Tensor
28    @property
29    def targets(self) -> torch.Tensor:
30        if isinstance(self.dataset, TensorDataset):
31            return self.dataset.tensors[1][self.indices]
32        if isinstance(self.dataset.targets, list):
33            return torch.Tensor(self.dataset.targets)[self.indices]
34        return self.dataset.targets[self.indices]
Inherited Members
torch.utils.data.dataset.Subset
dataset
indices
def set_seed(s: int):
36def set_seed(s: int):
37    '''
38    Sets the same random initialization seed across multiple libraries, and
39    enables the usage of deterministic algorithms in PyTorch.
40
41    Parameters
42    ----------
43    s: int
44        Seed initialization value
45    '''
46    
47    random.seed(s)
48    numpy.random.seed(s)
49    torch.manual_seed(s)
50    
51    if torch.version.cuda is not None and torch.version.cuda >= '10.2':
52        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
53    else:
54        torch.use_deterministic_algorithms(True)

Sets the same random initialization seed across multiple libraries, and enables the usage of deterministic algorithms in PyTorch.

Parameters
  • s (int): Seed initialization value
def partition( dataset: torch.utils.data.dataset.Dataset, n_subsets: int, n_classes: int = None, heterogeneity_degree: float = 0.0, samples_variance: float = 0.0, return_indices: bool = False, seed: int = None, strict: bool = True) -> Union[list[list[int]], list[FederatedSubset]]:
 56def partition(
 57    dataset: Dataset,
 58    n_subsets: int,
 59    n_classes: int = None,
 60    heterogeneity_degree: float = 0.0,
 61    samples_variance: float = 0.0,
 62    return_indices: bool = False,
 63    seed: int = None,
 64    strict: bool = True
 65) -> Union[list[list[int]], list[FederatedSubset]]:
 66    '''
 67    Partitions a dataset in `n_subsets` heterogenously or homogenously according to
 68    `heterogeneity_degree` and `samples_variance`.
 69
 70    Parameters
 71    ----------
 72    dataset: Dataset
 73        Dataset (pytorch)
 74    n_subsets: int
 75        Number of datasets for splitting
 76    n_classes: int
 77        Number of classes in the dataset, inferred if `None`
 78    heterogeneity_degree: float
 79        Class heterogeneity degree, by default is homogeneous
 80    samples_variance: float
 81        Standard deviation (%) in the number of samples for each client, `0` by default
 82    return_indices: bool
 83        To return indices instead of federated subsets
 84    seed: int
 85        Random seed initializer
 86    strict: bool
 87        In strict mode `heterogeneity_degree` and `samples_variance` are highly respected,
 88        otherwise flexibility is allowed
 89
 90    Returns
 91    -------
 92    Union[list[list[int]], list[FederatedSubset]]
 93        If `return_indices` is `False` then returns federated subsets, otherwise indices
 94
 95    Note
 96    ----
 97    Heterogeneity degree is the inverse of the concentration parameter of a Dirichelet distribution
 98    used to sample class ratios across each subset, whilst sample variance refers to the variance in
 99    the number of samples assigned to each subset, which is extracted from a log-normal distribution.
100    '''
101
102    # labels of dataset, to be computed
103    labels: torch.Tensor = None
104    # discriminate between kind of datasets
105    if isinstance(dataset, TensorDataset):
106        labels = dataset.tensors[1]
107    else:
108        assert hasattr(dataset, 'data'), "Dataset needs to have .data attribute"
109        assert hasattr(dataset, 'targets'), "Dataset needs to have .targets attribute"
110        labels = dataset.targets
111    # number of classes extracted from data
112    n_class_inferred = numpy.unique(labels).shape[0]
113    # parameter checking
114    assert 0 < n_subsets <= len(dataset), "Number of subsets must be between 1 and number of samples"
115    assert heterogeneity_degree is None or heterogeneity_degree >= 0, "Dirichelet concentration must be a positive number"
116    assert samples_variance is None or samples_variance >= 0, "Log-normal variance must be a positive number"
117    assert n_classes is None or 0 < n_classes <= n_class_inferred, "Number of classes must be between 1 and number of labels' classes"
118    # using less classes than expected is admissible even though it is signaled
119    if n_classes < n_class_inferred:
120        warnings.warn(f"Number of classes specified {n_classes} is inferior to inferred number of classes {n_class_inferred}")
121    # random generator initialization for reproducibility
122    if seed:
123        set_seed(seed)
124    # for each class extracts the indices of the corresponding samples 
125    samples_indices = [ numpy.argwhere(numpy.array(labels) == k).reshape(-1) for k in range(n_classes) ]
126    # number of samples in each subset of the partition (computed with respect to availability of samples)
127    n_subset_samples = sum([ len(x) // n_subsets for x in samples_indices ])
128    # no sample variance, so each user will receive same amount of samples
129    if samples_variance is None or samples_variance == 0:
130        n_subset_samples_sampled = numpy.repeat(n_subset_samples, n_subsets)
131    # otherwise number of samples of each user are sampled according to lognormal distribution
132    else:
133        # log normal extraction
134        n_subset_samples_sampled = numpy.random.lognormal(numpy.log(n_subset_samples), samples_variance, size = n_subsets)
135        # normalization with respect to dataset size
136        n_subset_samples_sampled = ((n_subset_samples_sampled / n_subset_samples_sampled.sum()) * n_subset_samples * n_subsets).astype(int)
137    # in case of homogeneity each subset has the same fraction of data for each class
138    if heterogeneity_degree is None or heterogeneity_degree == 0:
139        n_partition_label_ratios_sampled = numpy.ones((n_subsets, n_classes)) / n_classes
140    else:
141        # otherwise for each class the ratio of corresponding data assigned are sampled according to dirichelet distribution
142        alpha = 1.0 / heterogeneity_degree * numpy.ones(n_subsets)
143        n_partition_label_ratios_sampled = numpy.random.dirichlet(alpha, size = n_classes).T
144    # number of assigned samples used for each class
145    n_consumed_class_samples = [ 0 for _ in range(n_classes) ]
146    # indices of samples assigned to each subset
147    subset_indices = [ [] for _ in range(n_subsets) ]
148    # computation of number of samples that each client expects from each class (using floor)
149    n_expected_samples_subset = numpy.floor(n_subset_samples_sampled.reshape(n_subsets, 1) * n_partition_label_ratios_sampled / n_partition_label_ratios_sampled.sum(axis = 1).reshape(n_subsets, 1)).astype(int)
150    # number of samples effectively assigned to each client from each class
151    n_effective_samples_subset = numpy.zeros((n_subsets, n_classes))
152    # number of subsets (can be less that requested because some are empty, especially in strict mode)
153    n_effective_subsets = None
154    # shuffle data samples' indices of each class
155    for k in range(n_classes):
156        numpy.random.shuffle(samples_indices[k])
157    # assigns samples to each subset
158    for subset_index in range(n_subsets):
159        # counter of number of samples assigned to each class for the current client
160        n_received_samples = numpy.zeros(n_classes)
161        # assigns samples of each class
162        for k in range(n_classes):
163            # in strict mode stops assigning and adding clients whenever the samples of any class are exhausted
164            if strict and len(samples_indices[k]) - n_consumed_class_samples[k] < n_expected_samples_subset[subset_index, k]:
165                n_effective_subsets = subset_index
166                break
167            # computes how many samples should be given to current client for current class, without exceeding the availability
168            n_consumed_already = min(n_consumed_class_samples[k], len(samples_indices[k]))
169            n_consumed_current = min(n_consumed_class_samples[k] + n_expected_samples_subset[subset_index, k], len(samples_indices[k]))
170            # assigns samples to subset
171            subset_indices[subset_index].extend(samples_indices[k][n_consumed_already:n_consumed_current])
172            # updates counter
173            n_received_samples[k] = max(n_consumed_current - n_consumed_already, 0)
174            n_consumed_class_samples[k] = n_consumed_current
175        # if any class availabilty is over then we stop building subsets and we exit
176        if strict and n_effective_subsets is not None:
177            subset_indices = subset_indices[:n_effective_subsets]
178            break
179        # updates counter of data assigned to subset
180        n_effective_samples_subset[subset_index] = n_received_samples
181    # indices of subsets
182    indices = range(n_subsets)
183    # this indicates how much importance is given to lack of data in subset when we need to account for
184    # probabilities of distributing samples left from previous iterations
185    inter_subset_samples_amount_importance = 0.25
186    # distribution of left samples happens only in non-strict mode
187    if not strict:
188        for k in range(n_classes):
189            # computes how many samples are left for each subset and for each class
190            n_class_samples_left = len(samples_indices[k]) - n_consumed_class_samples[k]
191            # computes how many samples are left for each subset
192            n_samples_left = n_expected_samples_subset.sum(axis = 1) - n_effective_samples_subset.sum(axis = 1)
193            n_samples_left[n_samples_left < 0] = 0
194            # skip class samples distribution in case of no sample left
195            if n_class_samples_left <= 0:
196                continue
197            # computes for each subset probability of getting new data of current class
198            probabilities = 1 + (n_expected_samples_subset[:, k] - n_effective_samples_subset[:, k]) + inter_subset_samples_amount_importance * n_samples_left
199            probabilities /= probabilities.sum()
200            # weighted sampling with replacement a number of subsets equal to the number of samples left
201            chosen_subsets = numpy.random.choice(
202                indices,
203                p = probabilities,
204                size = n_class_samples_left,
205                replace = True
206            )
207            # moves left samples to selected subsets for each subset
208            for chosen_subset in chosen_subsets:
209                subset_indices[chosen_subset].append(samples_indices[k][n_consumed_class_samples[k]])
210                # updates number of samples assigned and consumed
211                n_consumed_class_samples[k] += 1
212                n_effective_samples_subset[chosen_subset, k] += 1
213    # does not construct federated subset
214    if return_indices:
215        return subset_indices
216    # returns federated subset, exclusively non empty
217    return [ FederatedSubset(dataset, indices) for indices in subset_indices if len(indices) > 0 ]

Partitions a dataset in n_subsets heterogeneously or homogeneously according to heterogeneity_degree and samples_variance.

Parameters
  • dataset (Dataset): Dataset (pytorch)
  • n_subsets (int): Number of datasets for splitting
  • n_classes (int): Number of classes in the dataset, inferred if None
  • heterogeneity_degree (float): Class heterogeneity degree, by default is homogeneous
  • samples_variance (float): Standard deviation (%) in the number of samples for each client, 0 by default
  • return_indices (bool): To return indices instead of federated subsets
  • seed (int): Random seed initializer
  • strict (bool): In strict mode heterogeneity_degree and samples_variance are highly respected, otherwise flexibility is allowed
Returns
  • Union[list[list[int]], list[FederatedSubset]]: If return_indices is False then returns federated subsets, otherwise indices
Note

Heterogeneity degree is the inverse of the concentration parameter of a Dirichlet distribution used to sample class ratios across each subset, whilst sample variance refers to the variance in the number of samples assigned to each subset, which is extracted from a log-normal distribution.