fedbox.optimization.ours

  1from ..datasets import utils
  2from . import fedavg
  3from .utils import WeightingScheme, AdjancencyWeightingScheme, Logger
  4
  5from collections import OrderedDict
  6from copy import deepcopy
  7import random
  8import torch
  9import torch.nn as nn
 10import torch.optim as optim
 11import torch.utils.data as data
 12from tqdm import tqdm
 13
 14
 15class Agent(fedavg.Agent):
 16    '''
 17    An agent (client) uses our novel scheme to optimize a shared model on its local subset.
 18    '''
 19
 20    def __init__(self, subset: utils.FederatedSubset):
 21        '''
 22        Initializes the agent with a local `subset` of data samples and labels.
 23
 24        Parameters
 25        ----------
 26        subset: utils.FederatedSubset
 27            Subset of data samples and labels
 28        '''
 29        
 30        self.subset = subset
 31
 32    def step(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, x: torch.Tensor, y: torch.Tensor, optimizer: optim.Optimizer, max_gradient_norm: float):
 33        '''
 34        Performs an optimization step on `model` using minibatch (`x`, `y`) by introducing our `beta`-specific perturbation.
 35
 36        Parameters
 37        ----------
 38        beta: float
 39            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
 40        u: torch.nn.Module
 41            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
 42        model: torch.nn.Module
 43            Model that is optimized locally
 44        x: torch.Tensor
 45            Data samples in the minibatch
 46        y: torch.Tensor
 47            Data labels in the minibatch
 48        optimizer: optim.Optimizer
 49            Gradient-based optimizer
 50        max_gradient_norm: float
 51            Value used to clip the norm of the stochastic gradient
 52        '''
 53        
 54        perturbed = self.perturb(beta, u, model)
 55        prediction = perturbed(x)
 56        
 57        loss = nn.functional.cross_entropy(prediction, y)
 58        loss.backward()
 59
 60        torch.nn.utils.clip_grad_norm_(perturbed.parameters(), max_gradient_norm, error_if_nonfinite = True)
 61
 62        model.load_state_dict(OrderedDict([
 63            (name, model.get_parameter(name) - optimizer.param_groups[-1]['lr'] * perturbed.get_parameter(name).grad.detach())
 64            for name in model.state_dict().keys()
 65        ]))
 66
 67    def optimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_steps: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
 68        '''
 69        Runs `n_steps` stochastic gradient descent steps while injecting `beta`-specific perturbation on the local dataset (one step for each minibatch).
 70
 71        Parameters
 72        ----------
 73        beta: float
 74            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
 75        u: torch.nn.Module
 76            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
 77        model: torch.nn.Module
 78            Model that is locally optimized
 79        n_steps: int
 80            Number of local SGD steps, i.e. number of minibatches
 81        step_size: float
 82            Step size or learning rate
 83        l2_penalty: float
 84            Weight of L2 (Tikhonov) regularization term
 85        max_gradient_norm: float
 86            Value used to clip the norm of the stochastic gradient
 87        device: torch.device
 88            Accelerator to run the code
 89        '''
 90
 91        loader = data.DataLoader(self.subset, batch_size = len(self.subset) // n_steps)
 92        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)
 93
 94        model.train()
 95
 96        for x, y in loader:
 97            self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)
 98
 99        return model
100
101    def multioptimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
102        '''
103        Runs `n_epochs` stochastic gradient descent epochs while injecting `beta`-specific perturbation on the local dataset.
104
105        Parameters
106        ----------
107        beta: float
108            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
109        u: torch.nn.Module
110            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
111        model: torch.nn.Module
112            Model that is locally optimized
113        n_epochs: int
114            Number of local epochs to pass over the entire local dataset
115        step_size: float
116            Step size or learning rate
117        l2_penalty: float
118            Weight of L2 (Tikhonov) regularization term
119        max_gradient_norm: float
120            Value used to clip the norm of the stochastic gradient
121        device: torch.device
122            Accelerator to run the code
123
124        Note
125        ----
126        Differently from `optimize(...)`, each epoch corresponds to passing over the entire dataset using SGD.
127        '''
128
129        loader = data.DataLoader(self.subset, batch_size = batch_size)
130        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)
131
132        model.train()
133
134        for _ in range(n_epochs):
135            for x, y in loader:
136                self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)
137
138        return model
139
140    def evaluate(self, model: torch.nn.Module, device: torch.device) -> tuple[float, float]:
141        '''
142        Evaluate the `model` by computing the average sample loss and accuracy.
143
144        Parameters
145        ----------
146        model: torch.nn.Module
147            Model that is locally optimized
148        device: torch.device
149            Accelerator to run the code
150
151        Returns
152        -------
153        tuple[float, float]
154            Tuple of average sample loss and accuracy on the local dataset
155        '''
156
157        loader = data.DataLoader(self.subset, batch_size = len(self.subset))
158        x, y = next(iter(loader))
159        x = x.to(device)
160        y = y.to(device)
161
162        model.eval()
163
164        with torch.no_grad():
165            prediction = model(x)
166            loss = nn.functional.cross_entropy(prediction, y)
167            accuracy = torch.sum(torch.argmax(prediction, dim = 1) == y)
168            return loss.item(), accuracy.item()
169        
170    def perturb(self, beta: float, u: torch.nn.Module, model: torch.nn.Module) -> torch.nn.Module:
171        '''
172        Computes the perturbed model as `beta`-weighted average between `model` and `u`.
173
174        Parameters
175        ----------
176        beta: float
177            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
178        u: torch.nn.Module
179            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
180        model: torch.nn.Module
181            Model that is locally optimized
182
183        Returns
184        -------
185        torch.nn.Module
186            Perturbed model as `beta`-weighted average between `model` and `u`
187        '''
188        
189        perturbed_model = deepcopy(model)
190        perturbed_model.load_state_dict(OrderedDict([
191            (name, beta * model.state_dict()[name].detach() + (1 - beta) * u.state_dict()[name].detach())
192            for name in model.state_dict().keys()
193        ]))
194        return perturbed_model
195
196
class Coordinator(fedavg.Coordinator):
    '''
    This class represents a centralized server coordinating the training of a shared model across multiple agents (i.e. clients).

    Note
    ----
    The agents locally update their models using our novel algorithmic framework.
    '''

    def __init__(
        self,
        beta: float,
        model: torch.nn.Module,
        datasets: dict[str, list[utils.FederatedSubset]],
        scheme: WeightingScheme,
        logger: Logger = Logger.default()
    ):
        '''
        Constructs the centralized coordinator, i.e. server, in the federated learning simulation.

        Parameters
        ----------
        beta: float
            Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
        model: torch.nn.Module
            Initial shared model
        datasets: dict[str, list[utils.FederatedSubset]]
            Training clients' subsets ('training') and testing clients' subsets ('testing')
        scheme: WeightingScheme
            Aggregation scheme to weight local updates from clients
        logger: Logger
            Logger instance to save progress during the simulation

        Note
        ----
        The `logger` default is evaluated once at class-definition time, so every
        coordinator constructed without an explicit logger shares the same instance —
        TODO confirm `Logger.default()` is safe to share.
        '''

        # Our algorithm requires a similarity/adjacency based scheme
        # (the 'Adjancency' spelling originates in the .utils module).
        assert isinstance(scheme, AdjancencyWeightingScheme)

        assert 0 < beta < 1

        self.beta = beta
        self.datasets = datasets
        self.model = model
        self.agents = {
            group: [ Agent(subset) for subset in dataset ] for group, dataset in datasets.items()
        }
        self.weights = scheme.weights()
        self.adjacency_matrices = scheme.adjacencies()
        self.logger = logger

    def run(self, n_iterations: int, n_steps: int = None, n_epochs: int = None, batch_size: int = 32, step_size: float = 1e-3, step_size_diminishing: bool = False, l2_penalty: float = 1e-4, max_gradient_norm: float = 1.0, device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        '''
        Runs `n_iterations` optimization (with our novel algorithm) and evaluation rounds on training clients.

        Parameters
        ----------
        n_iterations: int
            Number of global rounds
        n_steps: int
            Number of local SGD steps used for optimization on clients
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        step_size_diminishing: bool
            This enables diminishing the step size linearly in time
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code

        Note
        ----
        Runs `n_iterations` times function `iterate(...)`.
        '''

        # Exactly one of the two local-work budgets must be provided.
        assert n_steps is not None or n_epochs is not None

        self.model = self.model.to(device)
        self.model.compile()
        # One neighbor-averaged model per training client, all seeded with the initial model.
        self.u = [ deepcopy(self.model) for _ in self.agents['training'] ]

        for iteration in range(n_iterations):
            # Linearly diminishing schedule: step_size / (t + 1) when enabled.
            step_size_updated = step_size if not step_size_diminishing else step_size / (iteration + 1)
            metrics = self.iterate(iteration, n_steps, n_epochs, batch_size, step_size_updated, l2_penalty, max_gradient_norm, device, evaluate = True)

            self.logger.log({
                'step': iteration,
                'loss.training': metrics['training']['loss'],
                'loss.testing': metrics['testing']['loss'],
                'accuracy.training': metrics['training']['accuracy'],
                'accuracy.testing': metrics['testing']['accuracy'],
            })

            print(iteration, metrics)

    def iterate(self, iteration: int, n_steps: int, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device, evaluate: bool = False) -> dict[str, float]:
        '''
        Runs a single optimization round with our novel algorithm on all training clients.

        Parameters
        ----------
        iteration: int
            Current global round
        n_steps: int
            Number of local SGD steps used for optimization on clients
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code
        evaluate: bool
            Flag that enables evaluation of the updated global model on training and testing clients

        Returns
        -------
        dict[str, float]
            Dictionary of current round's metrics
        '''

        indices = list(range(0, len(self.agents['training'])))
        # k equals the full client count, so the shuffle + slice below currently
        # keeps every client (scaffolding for future partial participation).
        k = len(self.agents['training'])

        random.shuffle(indices)

        indices = indices[:k]
        participants = [ self.agents['training'][i] for i in indices ]
        # Neighbor-averaged models aligned with `participants` (selected_u[j]
        # belongs to the client at global index indices[j]).
        selected_u = [ self.u[i] for i in indices ]

        initial_model = deepcopy(self.model)
        # Indexed by global client index; non-participants would all alias the
        # same unmodified `initial_model` (today every client participates).
        updates: list[nn.Module] = [ initial_model for _ in self.agents['training'] ]

        description = 'Optimization on training agents (iteration {})'.format(iteration)

        # FIX: previously the loops zipped `self.u` (global order) with the shuffled
        # `participants`, pairing each client with the wrong neighbor-averaged model;
        # `selected_u` is the list aligned with `participants`.
        if n_steps is not None:
            for i, participant, u_i in tqdm(zip(indices, participants, selected_u), total = len(participants), desc = description):
                updates[i] = participant.optimize(self.beta, u_i, deepcopy(initial_model), n_steps = n_steps, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)
        else:
            for i, participant, u_i in tqdm(zip(indices, participants, selected_u), total = len(participants), desc = description):
                updates[i] = participant.multioptimize(self.beta, u_i, deepcopy(initial_model), n_epochs = n_epochs, batch_size = batch_size, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)

        # FIX: `updates` is ordered by global client index, so the aggregation
        # weights must follow the same ordering (previously they were selected in
        # shuffled order, mispairing weights and updates).
        self.average(updates, weights = list(self.weights['training']))

        # Hoist state_dict() calls out of the per-key comprehensions below — each
        # call rebuilds the full OrderedDict.
        update_states = [ update.state_dict() for update in updates ]
        names = self.model.state_dict().keys()

        for i, degree in enumerate(self.weights['training']):
            # for each client, this computes the average of latest updates from other clients weighted accordingly to the similarity measure (normalized by client's degree)
            self.u[i].load_state_dict(OrderedDict([
                (
                    name,
                    torch.stack([
                        similarity * state[name].detach()
                        for j, (state, similarity) in enumerate(zip(update_states, self.adjacency_matrices['training'][i]))
                        if j != i
                    ]).sum(dim = 0) / degree
                )
                for name in names
            ]))

        if not evaluate:
            return {}

        return self.evaluate(iteration, device)
class Agent(fedbox.optimization.fedavg.Agent):
 16class Agent(fedavg.Agent):
 17    '''
 18    An agent (client) uses our novel scheme to optimize a shared model on its local subset.
 19    '''
 20
 21    def __init__(self, subset: utils.FederatedSubset):
 22        '''
 23        Initializes the agent with a local `subset` of data samples and labels.
 24
 25        Parameters
 26        ----------
 27        subset: utils.FederatedSubset
 28            Subset of data samples and labels
 29        '''
 30        
 31        self.subset = subset
 32
 33    def step(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, x: torch.Tensor, y: torch.Tensor, optimizer: optim.Optimizer, max_gradient_norm: float):
 34        '''
 35        Performs an optimization step on `model` using minibatch (`x`, `y`) by introducing our `beta`-specific perturbation.
 36
 37        Parameters
 38        ----------
 39        beta: float
 40            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
 41        u: torch.nn.Module
 42            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
 43        model: torch.nn.Module
 44            Model that is optimized locally
 45        x: torch.Tensor
 46            Data samples in the minibatch
 47        y: torch.Tensor
 48            Data labels in the minibatch
 49        optimizer: optim.Optimizer
 50            Gradient-based optimizer
 51        max_gradient_norm: float
 52            Value used to clip the norm of the stochastic gradient
 53        '''
 54        
 55        perturbed = self.perturb(beta, u, model)
 56        prediction = perturbed(x)
 57        
 58        loss = nn.functional.cross_entropy(prediction, y)
 59        loss.backward()
 60
 61        torch.nn.utils.clip_grad_norm_(perturbed.parameters(), max_gradient_norm, error_if_nonfinite = True)
 62
 63        model.load_state_dict(OrderedDict([
 64            (name, model.get_parameter(name) - optimizer.param_groups[-1]['lr'] * perturbed.get_parameter(name).grad.detach())
 65            for name in model.state_dict().keys()
 66        ]))
 67
 68    def optimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_steps: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
 69        '''
 70        Runs `n_steps` stochastic gradient descent steps while injecting `beta`-specific perturbation on the local dataset (one step for each minibatch).
 71
 72        Parameters
 73        ----------
 74        beta: float
 75            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
 76        u: torch.nn.Module
 77            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
 78        model: torch.nn.Module
 79            Model that is locally optimized
 80        n_steps: int
 81            Number of local SGD steps, i.e. number of minibatches
 82        step_size: float
 83            Step size or learning rate
 84        l2_penalty: float
 85            Weight of L2 (Tikhonov) regularization term
 86        max_gradient_norm: float
 87            Value used to clip the norm of the stochastic gradient
 88        device: torch.device
 89            Accelerator to run the code
 90        '''
 91
 92        loader = data.DataLoader(self.subset, batch_size = len(self.subset) // n_steps)
 93        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)
 94
 95        model.train()
 96
 97        for x, y in loader:
 98            self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)
 99
100        return model
101
102    def multioptimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
103        '''
104        Runs `n_epochs` stochastic gradient descent epochs while injecting `beta`-specific perturbation on the local dataset.
105
106        Parameters
107        ----------
108        beta: float
109            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
110        u: torch.nn.Module
111            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
112        model: torch.nn.Module
113            Model that is locally optimized
114        n_epochs: int
115            Number of local epochs to pass over the entire local dataset
116        step_size: float
117            Step size or learning rate
118        l2_penalty: float
119            Weight of L2 (Tikhonov) regularization term
120        max_gradient_norm: float
121            Value used to clip the norm of the stochastic gradient
122        device: torch.device
123            Accelerator to run the code
124
125        Note
126        ----
127        Differently from `optimize(...)`, each epoch corresponds to passing over the entire dataset using SGD.
128        '''
129
130        loader = data.DataLoader(self.subset, batch_size = batch_size)
131        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)
132
133        model.train()
134
135        for _ in range(n_epochs):
136            for x, y in loader:
137                self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)
138
139        return model
140
141    def evaluate(self, model: torch.nn.Module, device: torch.device) -> tuple[float, float]:
142        '''
143        Evaluate the `model` by computing the average sample loss and accuracy.
144
145        Parameters
146        ----------
147        model: torch.nn.Module
148            Model that is locally optimized
149        device: torch.device
150            Accelerator to run the code
151
152        Returns
153        -------
154        tuple[float, float]
155            Tuple of average sample loss and accuracy on the local dataset
156        '''
157
158        loader = data.DataLoader(self.subset, batch_size = len(self.subset))
159        x, y = next(iter(loader))
160        x = x.to(device)
161        y = y.to(device)
162
163        model.eval()
164
165        with torch.no_grad():
166            prediction = model(x)
167            loss = nn.functional.cross_entropy(prediction, y)
168            accuracy = torch.sum(torch.argmax(prediction, dim = 1) == y)
169            return loss.item(), accuracy.item()
170        
171    def perturb(self, beta: float, u: torch.nn.Module, model: torch.nn.Module) -> torch.nn.Module:
172        '''
173        Computes the perturbed model as `beta`-weighted average between `model` and `u`.
174
175        Parameters
176        ----------
177        beta: float
178            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
179        u: torch.nn.Module
180            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
181        model: torch.nn.Module
182            Model that is locally optimized
183
184        Returns
185        -------
186        torch.nn.Module
187            Perturbed model as `beta`-weighted average between `model` and `u`
188        '''
189        
190        perturbed_model = deepcopy(model)
191        perturbed_model.load_state_dict(OrderedDict([
192            (name, beta * model.state_dict()[name].detach() + (1 - beta) * u.state_dict()[name].detach())
193            for name in model.state_dict().keys()
194        ]))
195        return perturbed_model

An agent (client) uses our novel scheme to optimize a shared model on its local subset.

Agent(subset: fedbox.datasets.utils.FederatedSubset)
21    def __init__(self, subset: utils.FederatedSubset):
22        '''
23        Initializes the agent with a local `subset` of data samples and labels.
24
25        Parameters
26        ----------
27        subset: utils.FederatedSubset
28            Subset of data samples and labels
29        '''
30        
31        self.subset = subset

Initializes the agent with a local subset of data samples and labels.

Parameters
  • subset (utils.FederatedSubset): Subset of data samples and labels
subset
def step( self, beta: float, u: torch.nn.modules.module.Module, model: torch.nn.modules.module.Module, x: torch.Tensor, y: torch.Tensor, optimizer: torch.optim.optimizer.Optimizer, max_gradient_norm: float):
33    def step(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, x: torch.Tensor, y: torch.Tensor, optimizer: optim.Optimizer, max_gradient_norm: float):
34        '''
35        Performs an optimization step on `model` using minibatch (`x`, `y`) by introducing our `beta`-specific perturbation.
36
37        Parameters
38        ----------
39        beta: float
40            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
41        u: torch.nn.Module
42            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
43        model: torch.nn.Module
44            Model that is optimized locally
45        x: torch.Tensor
46            Data samples in the minibatch
47        y: torch.Tensor
48            Data labels in the minibatch
49        optimizer: optim.Optimizer
50            Gradient-based optimizer
51        max_gradient_norm: float
52            Value used to clip the norm of the stochastic gradient
53        '''
54        
55        perturbed = self.perturb(beta, u, model)
56        prediction = perturbed(x)
57        
58        loss = nn.functional.cross_entropy(prediction, y)
59        loss.backward()
60
61        torch.nn.utils.clip_grad_norm_(perturbed.parameters(), max_gradient_norm, error_if_nonfinite = True)
62
63        model.load_state_dict(OrderedDict([
64            (name, model.get_parameter(name) - optimizer.param_groups[-1]['lr'] * perturbed.get_parameter(name).grad.detach())
65            for name in model.state_dict().keys()
66        ]))

Performs an optimization step on model using minibatch (x, y) by introducing our beta-specific perturbation.

Parameters
  • beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
  • u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
  • model (torch.nn.Module): Model that is optimized locally
  • x (torch.Tensor): Data samples in the minibatch
  • y (torch.Tensor): Data labels in the minibatch
  • optimizer (optim.Optimizer): Gradient-based optimizer
  • max_gradient_norm (float): Value used to clip the norm of the stochastic gradient
def optimize( self, beta: float, u: torch.nn.modules.module.Module, model: torch.nn.modules.module.Module, n_steps: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
 68    def optimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_steps: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
 69        '''
 70        Runs `n_steps` stochastic gradient descent steps while injecting `beta`-specific perturbation on the local dataset (one step for each minibatch).
 71
 72        Parameters
 73        ----------
 74        beta: float
 75            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
 76        u: torch.nn.Module
 77            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
 78        model: torch.nn.Module
 79            Model that is locally optimized
 80        n_steps: int
 81            Number of local SGD steps, i.e. number of minibatches
 82        step_size: float
 83            Step size or learning rate
 84        l2_penalty: float
 85            Weight of L2 (Tikhonov) regularization term
 86        max_gradient_norm: float
 87            Value used to clip the norm of the stochastic gradient
 88        device: torch.device
 89            Accelerator to run the code
 90        '''
 91
 92        loader = data.DataLoader(self.subset, batch_size = len(self.subset) // n_steps)
 93        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)
 94
 95        model.train()
 96
 97        for x, y in loader:
 98            self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)
 99
100        return model

Runs n_steps stochastic gradient descent steps while injecting beta-specific perturbation on the local dataset (one step for each minibatch).

Parameters
  • beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
  • u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
  • model (torch.nn.Module): Model that is locally optimized
  • n_steps (int): Number of local SGD steps, i.e. number of minibatches
  • step_size (float): Step size or learning rate
  • l2_penalty (float): Weight of L2 (Tikhonov) regularization term
  • max_gradient_norm (float): Value used to clip the norm of the stochastic gradient
  • device (torch.device): Accelerator to run the code
def multioptimize( self, beta: float, u: torch.nn.modules.module.Module, model: torch.nn.modules.module.Module, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
102    def multioptimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
103        '''
104        Runs `n_epochs` stochastic gradient descent epochs while injecting `beta`-specific perturbation on the local dataset.
105
106        Parameters
107        ----------
108        beta: float
109            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
110        u: torch.nn.Module
111            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
112        model: torch.nn.Module
113            Model that is locally optimized
114        n_epochs: int
115            Number of local epochs to pass over the entire local dataset
116        step_size: float
117            Step size or learning rate
118        l2_penalty: float
119            Weight of L2 (Tikhonov) regularization term
120        max_gradient_norm: float
121            Value used to clip the norm of the stochastic gradient
122        device: torch.device
123            Accelerator to run the code
124
125        Note
126        ----
127        Differently from `optimize(...)`, each epoch corresponds to passing over the entire dataset using SGD.
128        '''
129
130        loader = data.DataLoader(self.subset, batch_size = batch_size)
131        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)
132
133        model.train()
134
135        for _ in range(n_epochs):
136            for x, y in loader:
137                self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)
138
139        return model

Runs n_epochs stochastic gradient descent epochs while injecting beta-specific perturbation on the local dataset.

Parameters
  • beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
  • u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
  • model (torch.nn.Module): Model that is locally optimized
  • n_epochs (int): Number of local epochs to pass over the entire local dataset
  • step_size (float): Step size or learning rate
  • l2_penalty (float): Weight of L2 (Tikhonov) regularization term
  • max_gradient_norm (float): Value used to clip the norm of the stochastic gradient
  • device (torch.device): Accelerator to run the code
Note

Differently from optimize(...), each epoch corresponds to passing over the entire dataset using SGD.

def evaluate( self, model: torch.nn.modules.module.Module, device: torch.device) -> tuple[float, float]:
def evaluate(self, model: torch.nn.Module, device: torch.device) -> tuple[float, float]:
    '''
    Evaluate the `model` by computing the average sample loss and accuracy.

    Parameters
    ----------
    model: torch.nn.Module
        Model that is locally optimized
    device: torch.device
        Accelerator to run the code

    Returns
    -------
    tuple[float, float]
        Tuple of average sample loss and accuracy (fraction of correctly classified samples) on the local dataset
    '''

    # A single batch containing the entire local subset.
    # NOTE(review): this materializes the whole subset on `device` at once — fine for small
    # federated shards, revisit if local datasets grow large.
    loader = data.DataLoader(self.subset, batch_size = len(self.subset))
    x, y = next(iter(loader))
    x = x.to(device)
    y = y.to(device)

    model.eval()

    with torch.no_grad():
        prediction = model(x)
        # cross_entropy defaults to reduction = 'mean', i.e. the average sample loss.
        loss = nn.functional.cross_entropy(prediction, y)
        # Fraction of correct predictions. Previously this returned the raw COUNT of correct
        # samples, contradicting the documented "accuracy" semantics (the loss was already a
        # mean, so the two returned values were inconsistent).
        # NOTE(review): confirm no caller relied on receiving the raw count.
        accuracy = torch.sum(torch.argmax(prediction, dim = 1) == y) / len(y)
        return loss.item(), accuracy.item()

Evaluate the model by computing the average sample loss and accuracy.

Parameters
  • model (torch.nn.Module): Model that is locally optimized
  • device (torch.device): Accelerator to run the code
Returns
  • tuple[float, float]: Tuple of average sample loss and accuracy on the local dataset
def perturb( self, beta: float, u: torch.nn.modules.module.Module, model: torch.nn.modules.module.Module) -> torch.nn.modules.module.Module:
def perturb(self, beta: float, u: torch.nn.Module, model: torch.nn.Module) -> torch.nn.Module:
    '''
    Computes the perturbed model as `beta`-weighted average between `model` and `u`.

    Parameters
    ----------
    beta: float
        Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
    u: torch.nn.Module
        Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
    model: torch.nn.Module
        Model that is locally optimized

    Returns
    -------
    torch.nn.Module
        Perturbed model as `beta`-weighted average between `model` and `u`
    '''

    # Materialize both state dictionaries once, instead of once per parameter name.
    local_state = model.state_dict()
    anchor_state = u.state_dict()

    # Blend every tensor as a convex combination: beta * local + (1 - beta) * anchor.
    blended = deepcopy(model)
    blended.load_state_dict(OrderedDict(
        (name, beta * local_state[name].detach() + (1 - beta) * anchor_state[name].detach())
        for name in local_state
    ))
    return blended

Computes the perturbed model as beta-weighted average between model and u.

Parameters
  • beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
  • u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
  • model (torch.nn.Module): Model that is locally optimized
Returns
  • torch.nn.Module: Perturbed model as beta-weighted average between model and u
class Coordinator(fedbox.optimization.fedavg.Coordinator):
class Coordinator(fedavg.Coordinator):
    '''
    This class represents a centralized server coordinating the training of a shared model across multiple agents (i.e. clients).

    Note
    ----
    The agents locally update their models using our novel algorithmic framework.
    '''

    def __init__(
        self,
        beta: float,
        model: torch.nn.Module,
        datasets: dict[str, list[utils.FederatedSubset]],
        scheme: WeightingScheme,
        logger: Logger = Logger.default()
    ):
        '''
        Constructs the centralized coordinator, i.e. server, in the federated learning simulation.

        Parameters
        ----------
        beta: float
            Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
        model: torch.nn.Module
            Initial shared model
        datasets: dict[str, list[utils.FederatedSubset]]
            Training clients' subsets ('training') and testing clients' subsets ('testing')
        scheme: WeightingScheme
            Aggregation scheme to weight local updates from clients
        logger: Logger
            Logger instance to save progress during the simulation

        Note
        ----
        The default `logger` is evaluated once at import time and shared by every coordinator instance.
        '''

        # Our algorithm relies on pairwise client similarities, hence the scheme must expose
        # adjacency matrices in addition to aggregation weights.
        assert isinstance(scheme, AdjancencyWeightingScheme)

        # beta is a strict convex-combination coefficient between local model and anchor.
        assert 0 < beta < 1

        self.beta = beta
        self.datasets = datasets
        self.model = model
        # One optimization agent per local subset, grouped exactly as in `datasets`.
        self.agents = {
            group: [ Agent(subset) for subset in dataset ] for group, dataset in datasets.items()
        }
        self.weights = scheme.weights()
        self.adjacency_matrices = scheme.adjacencies()
        self.logger = logger

    def run(self, n_iterations: int, n_steps: int = None, n_epochs = None, batch_size: int = 32, step_size: float = 1e-3, step_size_diminishing: bool = False, l2_penalty: float = 1e-4, max_gradient_norm: float = 1.0, device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        '''
        Runs `n_iterations` optimization (with our novel algorithm) and evaluation rounds on training clients.

        Parameters
        ----------
        n_iterations: int
            Number of global rounds
        n_steps: int
            Number of local SGD steps used for optimization on clients (takes precedence over n_epochs when both are given)
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        step_size_diminishing: bool
            This enables diminishing the step size linearly in time
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code

        Note
        ----
        Runs `n_iterations` times function `iterate(...)`, always evaluating the updated global model on training and testing clients.
        '''

        # At least one local-work budget must be provided; `iterate(...)` prefers `n_steps`.
        assert n_steps is not None or n_epochs is not None

        self.model = self.model.to(device)
        # NOTE(review): torch.nn.Module.compile() wraps forward with torch.compile in place —
        # confirm the targeted torch version supports it.
        self.model.compile()
        # One anchor model per training client, all starting from the initial shared model.
        self.u = [ deepcopy(self.model) for _ in self.agents['training'] ]

        for iteration in range(n_iterations):
            # Linearly diminishing schedule: step_size / (t + 1) at global round t.
            step_size_updated = step_size if not step_size_diminishing else step_size / (iteration + 1)
            metrics = self.iterate(iteration, n_steps, n_epochs, batch_size, step_size_updated, l2_penalty, max_gradient_norm, device, evaluate = True)

            self.logger.log({
                'step': iteration,
                'loss.training': metrics['training']['loss'],
                'loss.testing': metrics['testing']['loss'],
                'accuracy.training': metrics['training']['accuracy'],
                'accuracy.testing': metrics['testing']['accuracy'],
            })

            print(iteration, metrics)

    def iterate(self, iteration: int, n_steps: int, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device, evaluate: bool = False) -> dict[str, float]:
        '''
        Runs a single optimization round with our novel algorithm on all training clients.

        Parameters
        ----------
        iteration: int
            Current global round
        n_steps: int
            Number of local SGD steps used for optimization on clients (takes precedence over n_epochs when both are given)
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code
        evaluate: bool
            Flag that enables evaluation of the updated global model on training and testing clients

        Returns
        -------
        dict[str, float]
            Dictionary of current round's metrics (empty when `evaluate` is False)
        '''

        n_agents = len(self.agents['training'])
        indices = list(range(n_agents))
        # Every training client participates (k == n_agents); the shuffle only randomizes the
        # visiting order.
        k = n_agents

        random.shuffle(indices)

        indices = indices[:k]
        participants = [ self.agents['training'][i] for i in indices ]
        # Anchor models aligned with the shuffled participant order. Previously this list was
        # built but never used: the loops below zipped `participants` with `self.u` directly,
        # pairing each shuffled participant with an unrelated client's anchor.
        anchors = [ self.u[i] for i in indices ]

        initial_model = deepcopy(self.model)
        updates: list[nn.Module] = [ initial_model for _ in self.agents['training'] ]

        if n_steps is not None:
            for i, participant, anchor in tqdm(zip(indices, participants, anchors), total = len(participants), desc = 'Optimization on training agents (iteration {})'.format(iteration)):
                updates[i] = participant.optimize(self.beta, anchor, deepcopy(initial_model), n_steps = n_steps, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)
        else:
            for i, participant, anchor in tqdm(zip(indices, participants, anchors), total = len(participants), desc = 'Optimization on training agents (iteration {})'.format(iteration)):
                updates[i] = participant.multioptimize(self.beta, anchor, deepcopy(initial_model), n_epochs = n_epochs, batch_size = batch_size, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)

        # `updates` is indexed by agent, so the aggregation weights must follow the same
        # (unshuffled) agent order. Previously the weights followed the shuffled participant
        # order, silently pairing each update with another client's weight.
        self.average(updates, weights = [ self.weights['training'][i] for i in sorted(indices) ])

        # Materialize every update's state dictionary once, instead of once per parameter name.
        update_states = [ update.state_dict() for update in updates ]

        for i, degree in enumerate(self.weights['training']):
            # For each client, this computes the average of latest updates from other clients
            # weighted accordingly to the similarity measure (normalized by client's degree).
            # NOTE(review): assumes degree != 0 — confirm the weighting scheme never yields
            # isolated clients.
            similarities = self.adjacency_matrices['training'][i]
            self.u[i].load_state_dict(OrderedDict([
                (
                    name,
                    torch.stack([
                        similarity * state[name].detach()
                        for j, (state, similarity) in enumerate(zip(update_states, similarities))
                        if j != i
                    ]).sum(dim = 0) / degree
                )
                for name in self.model.state_dict().keys()
            ]))

        if not evaluate:
            return {}

        return self.evaluate(iteration, device)

This class represents a centralized server coordinating the training of a shared model across multiple agents (i.e. clients).

Note

The agents locally update their models using our novel algorithmic framework.

Coordinator( beta: float, model: torch.nn.modules.module.Module, datasets: dict[str, list[fedbox.datasets.utils.FederatedSubset]], scheme: fedbox.optimization.utils.WeightingScheme, logger: fedbox.optimization.utils.Logger = <fedbox.optimization.utils.Logger object>)
207    def __init__(
208        self,
209        beta: float,
210        model: torch.nn.Module,
211        datasets: dict[str, list[utils.FederatedSubset]],
212        scheme: WeightingScheme,
213        logger: Logger = Logger.default()
214    ):
215        '''
216        Constructs the centralized coordinator, i.e. server, in the federated learning simulation.
217
218        Parameters
219        ----------
220        beta: float
221            Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
222        model: torch.nn.Module
223            Initial shared model
224        datasets: dict[str, list[utils.FederatedSubset]]
225            Training clients' subsets ('training') and testing clients' subsets ('testing')
226        scheme: WeightingScheme
227            Aggregation scheme to weight local updates from clients
228        logger: Logger
229            Logger instance to save progress during the simulation
230        '''
231
232        assert isinstance(scheme, AdjancencyWeightingScheme)
233
234        assert beta > 0 and beta < 1
235
236        self.beta = beta
237        self.datasets = datasets
238        self.model = model
239        self.agents = {
240            group: [ Agent(subset) for subset in dataset ] for group, dataset in datasets.items() 
241        }
242        self.weights = scheme.weights()
243        self.adjacency_matrices = scheme.adjacencies()
244        self.logger = logger

Constructs the centralized coordinator, i.e. server, in the federated learning simulation.

Parameters
  • beta (float): Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
  • model (torch.nn.Module): Initial shared model
  • datasets (dict[str, list[utils.FederatedSubset]]): Training clients' subsets ('training') and testing clients' subsets ('testing')
  • scheme (WeightingScheme): Aggregation scheme to weight local updates from clients
  • logger (Logger): Logger instance to save progress during the simulation
beta
datasets
model
agents
weights
adjacency_matrices
logger
def run( self, n_iterations: int, n_steps: int = None, n_epochs=None, batch_size: int = 32, step_size: float = 0.001, step_size_diminishing: bool = False, l2_penalty: float = 0.0001, max_gradient_norm: float = 1.0, device: torch.device = device(type='cpu')):
246    def run(self, n_iterations: int, n_steps: int = None, n_epochs = None, batch_size: int = 32, step_size: float = 1e-3, step_size_diminishing: bool = False, l2_penalty: float = 1e-4, max_gradient_norm: float = 1.0, device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
247        '''
248        Runs `n_iterations` optimization (with our novel algorithm) and evaluation rounds on training clients.
249
250        Parameters
251        ----------
252        n_iterations: int
253            Number of global rounds
254        n_steps: int
255            Number of local SGD steps used for optimization on clients
256        n_epochs: int
257            Number of local epochs used for optimization on clients (mutually excludes n_steps)
258        batch_size: int
259            Number of samples in one SGD minibatch
260        step_size: float
261            Learning rate
262        step_size_diminishing: bool
263            This enables diminishing the step size linearly in time
264        l2_penalty: float
265            Weight of the L2 (Tikhonov) regularization used to penalize local models
266        max_gradient_norm: float
267            Value used to clip the norm of the stochastic gradient during local optimization
268        device: torch.device
269            Accelerator to run the code
270        evaluate: bool
271            Flag that enables evaluation of the update global model on training and testing clients
272
273        Note
274        ----
275        Runs `n_iterations` times function `iterate(...)`.
276        '''
277
278        assert n_steps is not None or n_epochs is not None
279
280        self.model = self.model.to(device)
281        self.model.compile()
282        self.u = [ deepcopy(self.model) for _ in self.agents['training']  ]
283
284        for iteration in range(n_iterations):
285            step_size_updated = step_size if not step_size_diminishing else step_size / (iteration + 1)
286            metrics = self.iterate(iteration, n_steps, n_epochs, batch_size, step_size_updated, l2_penalty, max_gradient_norm, device, evaluate = True)
287            
288            self.logger.log({
289                'step': iteration,
290                'loss.training': metrics['training']['loss'],
291                'loss.testing': metrics['testing']['loss'],
292                'accuracy.training': metrics['training']['accuracy'],
293                'accuracy.testing': metrics['testing']['accuracy'],
294            })
295            
296            print(iteration, metrics)

Runs n_iterations optimization (with our novel algorithm) and evaluation rounds on training clients.

Parameters
  • n_iterations (int): Number of global rounds
  • n_steps (int): Number of local SGD steps used for optimization on clients
  • n_epochs (int): Number of local epochs used for optimization on clients (mutually excludes n_steps)
  • batch_size (int): Number of samples in one SGD minibatch
  • step_size (float): Learning rate
  • step_size_diminishing (bool): This enables diminishing the step size linearly in time
  • l2_penalty (float): Weight of the L2 (Tikhonov) regularization used to penalize local models
  • max_gradient_norm (float): Value used to clip the norm of the stochastic gradient during local optimization
  • device (torch.device): Accelerator to run the code
  • evaluate (bool): Flag that enables evaluation of the updated global model on training and testing clients
Note

Runs n_iterations times function iterate(...).

def iterate( self, iteration: int, n_steps: int, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device, evaluate: bool = False) -> dict[str, float]:
298    def iterate(self, iteration: int, n_steps: int, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device, evaluate: bool = False) -> dict[str, float]:
299        '''
300        Runs a single optimization round with our novel algorithm on all training clients.
301
302        Parameters
303        ----------
304        iteration: int
305            Current global round
306        n_steps: int
307            Number of local SGD steps used for optimization on clients
308        n_epochs: int
309            Number of local epochs used for optimization on clients (mutually excludes n_steps)
310        batch_size: int
311            Number of samples in one SGD minibatch
312        step_size: float
313            Learning rate
314        l2_penalty: float
315            Weight of the L2 (Tikhonov) regularization used to penalize local models
316        max_gradient_norm: float
317            Value used to clip the norm of the stochastic gradient during local optimization
318        device: torch.device
319            Accelerator to run the code
320        evaluate: bool
321            Flag that enables evaluation of the update global model on training and testing clients
322
323        Returns
324        -------
325        dict[str, float]
326            Dictionary of current round's metrics
327        '''
328
329        indices = list(range(0, len(self.agents['training'])))
330        k = len(self.agents['training'])
331        
332        random.shuffle(indices)
333        
334        indices = indices[:k]
335        participants = [ self.agents['training'][i] for i in indices ]
336        weights = [ self.weights['training'][i] for i in indices ]
337        u = [ self.u[i] for i in indices ]
338
339        initial_model = deepcopy(self.model)
340        updates: list[nn.Module] = [ initial_model for _ in self.agents['training'] ]
341
342        if n_steps is not None:
343            for i, participant, u in tqdm(zip(indices, participants, self.u), total = len(participants), desc = 'Optimization on training agents (iteration {})'.format(iteration)):
344                updates[i] = participant.optimize(self.beta, u, deepcopy(initial_model), n_steps = n_steps, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)
345        else:
346            for i, participant, u in tqdm(zip(indices, participants, self.u), total = len(participants), desc = 'Optimization on training agents (iteration {})'.format(iteration)):
347                updates[i] = participant.multioptimize(self.beta, u, deepcopy(initial_model), n_epochs = n_epochs, batch_size = batch_size, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)
348
349        self.average(updates, weights = weights)
350
351        for i, degree in enumerate(self.weights['training']):
352            # for each client, this computes the average of latest updates from other clients weighted accordingly to the similarity measure (normalized by client's degree)
353            self.u[i].load_state_dict(OrderedDict([
354                (
355                    name, 
356                    torch.stack([ 
357                        similarity * update.state_dict()[name].detach()
358                        for j, (update, similarity) in enumerate(zip(updates, self.adjacency_matrices['training'][i]))
359                        if j != i
360                    ]).sum(dim = 0) / degree
361                )
362                for name in self.model.state_dict().keys()
363            ]))
364
365        if not evaluate:
366            return {}
367        
368        return self.evaluate(iteration, device)

Runs a single optimization round with our novel algorithm on all training clients.

Parameters
  • iteration (int): Current global round
  • n_steps (int): Number of local SGD steps used for optimization on clients
  • n_epochs (int): Number of local epochs used for optimization on clients (mutually excludes n_steps)
  • batch_size (int): Number of samples in one SGD minibatch
  • step_size (float): Learning rate
  • l2_penalty (float): Weight of the L2 (Tikhonov) regularization used to penalize local models
  • max_gradient_norm (float): Value used to clip the norm of the stochastic gradient during local optimization
  • device (torch.device): Accelerator to run the code
  • evaluate (bool): Flag that enables evaluation of the updated global model on training and testing clients
Returns
  • dict[str, float]: Dictionary of current round's metrics