fedbox.optimization.ours
from ..datasets import utils
from . import fedavg
from .utils import WeightingScheme, AdjancencyWeightingScheme, Logger

from collections import OrderedDict
from copy import deepcopy
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tqdm import tqdm


class Agent(fedavg.Agent):
    '''
    An agent (client) that uses our novel scheme to optimize a shared model on its local subset.
    '''

    def __init__(self, subset: utils.FederatedSubset):
        '''
        Initializes the agent with a local `subset` of data samples and labels.

        Parameters
        ----------
        subset: utils.FederatedSubset
            Subset of data samples and labels
        '''

        # Keeps a reference to the subset; no copy is made.
        self.subset = subset

    def step(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, x: torch.Tensor, y: torch.Tensor, optimizer: optim.Optimizer, max_gradient_norm: float):
        '''
        Performs an optimization step on `model` using minibatch (`x`, `y`) by introducing our `beta`-specific perturbation.

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is optimized locally
        x: torch.Tensor
            Data samples in the minibatch
        y: torch.Tensor
            Data labels in the minibatch
        optimizer: optim.Optimizer
            Gradient-based optimizer (only its learning rate is read here)
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient
        '''

        # The gradient is computed on the perturbed model but applied to the unperturbed one.
        perturbed = self.perturb(beta, u, model)
        prediction = perturbed(x)

        loss = nn.functional.cross_entropy(prediction, y)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(perturbed.parameters(), max_gradient_norm, error_if_nonfinite = True)

        # NOTE(review): `optimizer.step()` is never invoked, so the `weight_decay`
        # (L2 penalty) configured by the callers has no effect on this manual
        # update — confirm this is intended.
        model.load_state_dict(OrderedDict([
            (name, model.get_parameter(name) - optimizer.param_groups[-1]['lr'] * perturbed.get_parameter(name).grad.detach())
            for name in model.state_dict().keys()
        ]))

    def optimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_steps: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
        '''
        Runs `n_steps` stochastic gradient descent steps while injecting `beta`-specific perturbation on the local dataset (one step for each minibatch).

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is locally optimized
        n_steps: int
            Number of local SGD steps, i.e. number of minibatches
        step_size: float
            Step size or learning rate
        l2_penalty: float
            Weight of L2 (Tikhonov) regularization term
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient
        device: torch.device
            Accelerator to run the code

        Returns
        -------
        torch.nn.Module
            The locally optimized model (same object as `model`)
        '''

        # BUGFIX: guard against datasets smaller than n_steps, which would
        # otherwise produce a zero batch size and crash the loader.
        # NOTE(review): when the dataset size is not divisible by n_steps the
        # loader yields one extra (smaller) batch — confirm this is acceptable.
        loader = data.DataLoader(self.subset, batch_size = max(1, len(self.subset) // n_steps))
        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)

        model.train()

        for x, y in loader:
            self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)

        return model

    def multioptimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
        '''
        Runs `n_epochs` stochastic gradient descent epochs while injecting `beta`-specific perturbation on the local dataset.

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is locally optimized
        n_epochs: int
            Number of local epochs to pass over the entire local dataset
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Step size or learning rate
        l2_penalty: float
            Weight of L2 (Tikhonov) regularization term
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient
        device: torch.device
            Accelerator to run the code

        Returns
        -------
        torch.nn.Module
            The locally optimized model (same object as `model`)

        Note
        ----
        Differently from `optimize(...)`, each epoch corresponds to passing over the entire dataset using SGD.
        '''

        loader = data.DataLoader(self.subset, batch_size = batch_size)
        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)

        model.train()

        for _ in range(n_epochs):
            for x, y in loader:
                self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)

        return model

    def evaluate(self, model: torch.nn.Module, device: torch.device) -> tuple[float, float]:
        '''
        Evaluates `model` by computing the average sample loss and accuracy.

        Parameters
        ----------
        model: torch.nn.Module
            Model that is locally optimized
        device: torch.device
            Accelerator to run the code

        Returns
        -------
        tuple[float, float]
            Tuple of average sample loss and accuracy (fraction of correct predictions) on the local dataset
        '''

        # A single batch containing the entire local dataset.
        loader = data.DataLoader(self.subset, batch_size = len(self.subset))
        x, y = next(iter(loader))
        x = x.to(device)
        y = y.to(device)

        model.eval()

        with torch.no_grad():
            prediction = model(x)
            loss = nn.functional.cross_entropy(prediction, y)
            # BUGFIX: normalize by the number of samples — the previous version
            # returned the raw count of correct predictions, not an accuracy,
            # despite the documented contract.
            accuracy = torch.mean((torch.argmax(prediction, dim = 1) == y).float())
            return loss.item(), accuracy.item()

    def perturb(self, beta: float, u: torch.nn.Module, model: torch.nn.Module) -> torch.nn.Module:
        '''
        Computes the perturbed model as `beta`-weighted average between `model` and `u`.

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is locally optimized

        Returns
        -------
        torch.nn.Module
            Perturbed model whose state is `beta * model + (1 - beta) * u`
        '''

        # Hoisted: the previous version rebuilt the state dicts (three
        # `state_dict()` calls) for every entry of the state.
        model_state = model.state_dict()
        u_state = u.state_dict()

        perturbed_model = deepcopy(model)
        perturbed_model.load_state_dict(OrderedDict([
            (name, beta * tensor.detach() + (1 - beta) * u_state[name].detach())
            for name, tensor in model_state.items()
        ]))
        return perturbed_model


class Coordinator(fedavg.Coordinator):
    '''
    This class represents a centralized server coordinating the training of a shared model across multiple agents (i.e. clients).

    Note
    ----
    The agents locally update their models using our novel algorithmic framework.
    '''

    def __init__(
        self,
        beta: float,
        model: torch.nn.Module,
        datasets: dict[str, list[utils.FederatedSubset]],
        scheme: WeightingScheme,
        logger: Logger = Logger.default()
    ):
        '''
        Constructs the centralized coordinator, i.e. server, in the federated learning simulation.

        Parameters
        ----------
        beta: float
            Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
        model: torch.nn.Module
            Initial shared model
        datasets: dict[str, list[utils.FederatedSubset]]
            Training clients' subsets ('training') and testing clients' subsets ('testing')
        scheme: WeightingScheme
            Aggregation scheme to weight local updates from clients; must expose adjacency information
        logger: Logger
            Logger instance to save progress during the simulation
        '''

        # This algorithm needs pairwise similarities, hence the stricter scheme type.
        assert isinstance(scheme, AdjancencyWeightingScheme)

        assert 0 < beta < 1

        self.beta = beta
        self.datasets = datasets
        self.model = model
        # One agent per local subset, per group ('training' / 'testing').
        self.agents = {
            group: [ Agent(subset) for subset in dataset ] for group, dataset in datasets.items()
        }
        self.weights = scheme.weights()
        self.adjacency_matrices = scheme.adjacencies()
        self.logger = logger

    def run(self, n_iterations: int, n_steps: int = None, n_epochs: int = None, batch_size: int = 32, step_size: float = 1e-3, step_size_diminishing: bool = False, l2_penalty: float = 1e-4, max_gradient_norm: float = 1.0, device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        '''
        Runs `n_iterations` optimization (with our novel algorithm) and evaluation rounds on training clients.

        Parameters
        ----------
        n_iterations: int
            Number of global rounds
        n_steps: int
            Number of local SGD steps used for optimization on clients
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        step_size_diminishing: bool
            This enables diminishing the step size linearly in time
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code

        Note
        ----
        Runs `n_iterations` times function `iterate(...)`.
        '''

        assert n_steps is not None or n_epochs is not None

        self.model = self.model.to(device)
        self.model.compile()
        # Auxiliary per-agent models, initialized from the shared model.
        self.u = [ deepcopy(self.model) for _ in self.agents['training'] ]

        for iteration in range(n_iterations):
            # Linearly diminishing schedule: step_size / (t + 1).
            step_size_updated = step_size if not step_size_diminishing else step_size / (iteration + 1)
            metrics = self.iterate(iteration, n_steps, n_epochs, batch_size, step_size_updated, l2_penalty, max_gradient_norm, device, evaluate = True)

            self.logger.log({
                'step': iteration,
                'loss.training': metrics['training']['loss'],
                'loss.testing': metrics['testing']['loss'],
                'accuracy.training': metrics['training']['accuracy'],
                'accuracy.testing': metrics['testing']['accuracy'],
            })

            print(iteration, metrics)

    def iterate(self, iteration: int, n_steps: int, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device, evaluate: bool = False) -> dict[str, float]:
        '''
        Runs a single optimization round with our novel algorithm on all training clients.

        Parameters
        ----------
        iteration: int
            Current global round
        n_steps: int
            Number of local SGD steps used for optimization on clients
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code
        evaluate: bool
            Flag that enables evaluation of the updated global model on training and testing clients

        Returns
        -------
        dict[str, float]
            Dictionary of current round's metrics (empty when `evaluate` is False)
        '''

        indices = list(range(0, len(self.agents['training'])))
        k = len(self.agents['training'])

        random.shuffle(indices)

        # With k equal to the number of training agents, every agent participates
        # (in shuffled order).
        indices = indices[:k]
        participants = [ self.agents['training'][i] for i in indices ]
        weights = [ self.weights['training'][i] for i in indices ]
        # BUGFIX: the previous implementation built this list but then iterated
        # `self.u` directly, pairing shuffled participants with the wrong
        # auxiliary models.
        selected_u = [ self.u[i] for i in indices ]

        initial_model = deepcopy(self.model)
        # Every slot is overwritten below because all training agents participate.
        updates: list[nn.Module] = [ initial_model for _ in self.agents['training'] ]

        progress = tqdm(zip(indices, participants, selected_u), total = len(participants), desc = 'Optimization on training agents (iteration {})'.format(iteration))

        for i, participant, u_i in progress:
            if n_steps is not None:
                updates[i] = participant.optimize(self.beta, u_i, deepcopy(initial_model), n_steps = n_steps, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)
            else:
                updates[i] = participant.multioptimize(self.beta, u_i, deepcopy(initial_model), n_epochs = n_epochs, batch_size = batch_size, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)

        self.average(updates, weights = weights)

        for i, degree in enumerate(self.weights['training']):
            # For each client, average the latest updates from the other clients,
            # weighted by statistical similarity and normalized by the client's degree.
            self.u[i].load_state_dict(OrderedDict([
                (
                    name,
                    torch.stack([
                        similarity * update.state_dict()[name].detach()
                        for j, (update, similarity) in enumerate(zip(updates, self.adjacency_matrices['training'][i]))
                        if j != i
                    ]).sum(dim = 0) / degree
                )
                for name in self.model.state_dict().keys()
            ]))

        if not evaluate:
            return {}

        return self.evaluate(iteration, device)
class Agent(fedavg.Agent):
    '''
    An agent (client) that uses our novel scheme to optimize a shared model on its local subset.
    '''

    def __init__(self, subset: utils.FederatedSubset):
        '''
        Initializes the agent with a local `subset` of data samples and labels.

        Parameters
        ----------
        subset: utils.FederatedSubset
            Subset of data samples and labels
        '''

        # Keeps a reference to the subset; no copy is made.
        self.subset = subset

    def step(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, x: torch.Tensor, y: torch.Tensor, optimizer: optim.Optimizer, max_gradient_norm: float):
        '''
        Performs an optimization step on `model` using minibatch (`x`, `y`) by introducing our `beta`-specific perturbation.

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is optimized locally
        x: torch.Tensor
            Data samples in the minibatch
        y: torch.Tensor
            Data labels in the minibatch
        optimizer: optim.Optimizer
            Gradient-based optimizer (only its learning rate is read here)
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient
        '''

        # The gradient is computed on the perturbed model but applied to the unperturbed one.
        perturbed = self.perturb(beta, u, model)
        prediction = perturbed(x)

        loss = nn.functional.cross_entropy(prediction, y)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(perturbed.parameters(), max_gradient_norm, error_if_nonfinite = True)

        # NOTE(review): `optimizer.step()` is never invoked, so the `weight_decay`
        # (L2 penalty) configured by the callers has no effect on this manual
        # update — confirm this is intended.
        model.load_state_dict(OrderedDict([
            (name, model.get_parameter(name) - optimizer.param_groups[-1]['lr'] * perturbed.get_parameter(name).grad.detach())
            for name in model.state_dict().keys()
        ]))

    def optimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_steps: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
        '''
        Runs `n_steps` stochastic gradient descent steps while injecting `beta`-specific perturbation on the local dataset (one step for each minibatch).

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is locally optimized
        n_steps: int
            Number of local SGD steps, i.e. number of minibatches
        step_size: float
            Step size or learning rate
        l2_penalty: float
            Weight of L2 (Tikhonov) regularization term
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient
        device: torch.device
            Accelerator to run the code

        Returns
        -------
        torch.nn.Module
            The locally optimized model (same object as `model`)
        '''

        # BUGFIX: guard against datasets smaller than n_steps, which would
        # otherwise produce a zero batch size and crash the loader.
        loader = data.DataLoader(self.subset, batch_size = max(1, len(self.subset) // n_steps))
        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)

        model.train()

        for x, y in loader:
            self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)

        return model

    def multioptimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device):
        '''
        Runs `n_epochs` stochastic gradient descent epochs while injecting `beta`-specific perturbation on the local dataset.

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is locally optimized
        n_epochs: int
            Number of local epochs to pass over the entire local dataset
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Step size or learning rate
        l2_penalty: float
            Weight of L2 (Tikhonov) regularization term
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient
        device: torch.device
            Accelerator to run the code

        Returns
        -------
        torch.nn.Module
            The locally optimized model (same object as `model`)

        Note
        ----
        Differently from `optimize(...)`, each epoch corresponds to passing over the entire dataset using SGD.
        '''

        loader = data.DataLoader(self.subset, batch_size = batch_size)
        optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty)

        model.train()

        for _ in range(n_epochs):
            for x, y in loader:
                self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm)

        return model

    def evaluate(self, model: torch.nn.Module, device: torch.device) -> tuple[float, float]:
        '''
        Evaluates `model` by computing the average sample loss and accuracy.

        Parameters
        ----------
        model: torch.nn.Module
            Model that is locally optimized
        device: torch.device
            Accelerator to run the code

        Returns
        -------
        tuple[float, float]
            Tuple of average sample loss and accuracy (fraction of correct predictions) on the local dataset
        '''

        # A single batch containing the entire local dataset.
        loader = data.DataLoader(self.subset, batch_size = len(self.subset))
        x, y = next(iter(loader))
        x = x.to(device)
        y = y.to(device)

        model.eval()

        with torch.no_grad():
            prediction = model(x)
            loss = nn.functional.cross_entropy(prediction, y)
            # BUGFIX: normalize by the number of samples — the previous version
            # returned the raw count of correct predictions, not an accuracy.
            accuracy = torch.mean((torch.argmax(prediction, dim = 1) == y).float())
            return loss.item(), accuracy.item()

    def perturb(self, beta: float, u: torch.nn.Module, model: torch.nn.Module) -> torch.nn.Module:
        '''
        Computes the perturbed model as `beta`-weighted average between `model` and `u`.

        Parameters
        ----------
        beta: float
            Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
        u: torch.nn.Module
            Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
        model: torch.nn.Module
            Model that is locally optimized

        Returns
        -------
        torch.nn.Module
            Perturbed model whose state is `beta * model + (1 - beta) * u`
        '''

        # Hoisted: the previous version rebuilt the state dicts (three
        # `state_dict()` calls) for every entry of the state.
        model_state = model.state_dict()
        u_state = u.state_dict()

        perturbed_model = deepcopy(model)
        perturbed_model.load_state_dict(OrderedDict([
            (name, beta * tensor.detach() + (1 - beta) * u_state[name].detach())
            for name, tensor in model_state.items()
        ]))
        return perturbed_model
An agent (client) uses our novel scheme to optimize a shared model on its local subset.
21 def __init__(self, subset: utils.FederatedSubset): 22 ''' 23 Initializes the agent with a local `subset` of data samples and labels. 24 25 Parameters 26 ---------- 27 subset: utils.FederatedSubset 28 Subset of data samples and labels 29 ''' 30 31 self.subset = subset
Initializes the agent with a local subset of data samples and labels.
Parameters
- subset (utils.FederatedSubset): Subset of data samples and labels
33 def step(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, x: torch.Tensor, y: torch.Tensor, optimizer: optim.Optimizer, max_gradient_norm: float): 34 ''' 35 Performs an optimization step on `model` using minibatch (`x`, `y`) by introducing our `beta`-specific perturbation. 36 37 Parameters 38 ---------- 39 beta: float 40 Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation) 41 u: torch.nn.Module 42 Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents 43 model: torch.nn.Module 44 Model that is optimized locally 45 x: torch.Tensor 46 Data samples in the minibatch 47 y: torch.Tensor 48 Data labels in the minibatch 49 optimizer: optim.Optimizer 50 Gradient-based optimizer 51 max_gradient_norm: float 52 Value used to clip the norm of the stochastic gradient 53 ''' 54 55 perturbed = self.perturb(beta, u, model) 56 prediction = perturbed(x) 57 58 loss = nn.functional.cross_entropy(prediction, y) 59 loss.backward() 60 61 torch.nn.utils.clip_grad_norm_(perturbed.parameters(), max_gradient_norm, error_if_nonfinite = True) 62 63 model.load_state_dict(OrderedDict([ 64 (name, model.get_parameter(name) - optimizer.param_groups[-1]['lr'] * perturbed.get_parameter(name).grad.detach()) 65 for name in model.state_dict().keys() 66 ]))
Performs an optimization step on model using minibatch (x, y) by introducing our beta-specific perturbation.
Parameters
- beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
- u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
- model (torch.nn.Module): Model that is optimized locally
- x (torch.Tensor): Data samples in the minibatch
- y (torch.Tensor): Data labels in the minibatch
- optimizer (optim.Optimizer): Gradient-based optimizer
- max_gradient_norm (float): Value used to clip the norm of the stochastic gradient
68 def optimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_steps: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device): 69 ''' 70 Runs `n_steps` stochastic gradient descent steps while injecting `beta`-specific perturbation on the local dataset (one step for each minibatch). 71 72 Parameters 73 ---------- 74 beta: float 75 Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation) 76 u: torch.nn.Module 77 Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents 78 model: torch.nn.Module 79 Model that is locally optimized 80 n_steps: int 81 Number of local SGD steps, i.e. number of minibatches 82 step_size: float 83 Step size or learning rate 84 l2_penalty: float 85 Weight of L2 (Tikhonov) regularization term 86 max_gradient_norm: float 87 Value used to clip the norm of the stochastic gradient 88 device: torch.device 89 Accelerator to run the code 90 ''' 91 92 loader = data.DataLoader(self.subset, batch_size = len(self.subset) // n_steps) 93 optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty) 94 95 model.train() 96 97 for x, y in loader: 98 self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm) 99 100 return model
Runs n_steps stochastic gradient descent steps while injecting beta-specific perturbation on the local dataset (one step for each minibatch).
Parameters
- beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
- u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
- model (torch.nn.Module): Model that is locally optimized
- n_steps (int): Number of local SGD steps, i.e. number of minibatches
- step_size (float): Step size or learning rate
- l2_penalty (float): Weight of L2 (Tikhonov) regularization term
- max_gradient_norm (float): Value used to clip the norm of the stochastic gradient
- device (torch.device): Accelerator to run the code
102 def multioptimize(self, beta: float, u: torch.nn.Module, model: torch.nn.Module, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device): 103 ''' 104 Runs `n_epochs` stochastic gradient descent epochs while injecting `beta`-specific perturbation on the local dataset. 105 106 Parameters 107 ---------- 108 beta: float 109 Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation) 110 u: torch.nn.Module 111 Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents 112 model: torch.nn.Module 113 Model that is locally optimized 114 n_epochs: int 115 Number of local epochs to pass over the entire local dataset 116 step_size: float 117 Step size or learning rate 118 l2_penalty: float 119 Weight of L2 (Tikhonov) regularization term 120 max_gradient_norm: float 121 Value used to clip the norm of the stochastic gradient 122 device: torch.device 123 Accelerator to run the code 124 125 Note 126 ---- 127 Differently from `optimize(...)`, each epoch corresponds to passing over the entire dataset using SGD. 128 ''' 129 130 loader = data.DataLoader(self.subset, batch_size = batch_size) 131 optimizer = optim.SGD(model.parameters(), lr = step_size, weight_decay = l2_penalty) 132 133 model.train() 134 135 for _ in range(n_epochs): 136 for x, y in loader: 137 self.step(beta, u, model, x.to(device), y.to(device), optimizer = optimizer, max_gradient_norm = max_gradient_norm) 138 139 return model
Runs n_epochs stochastic gradient descent epochs while injecting beta-specific perturbation on the local dataset.
Parameters
- beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
- u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
- model (torch.nn.Module): Model that is locally optimized
- n_epochs (int): Number of local epochs to pass over the entire local dataset
- step_size (float): Step size or learning rate
- l2_penalty (float): Weight of L2 (Tikhonov) regularization term
- max_gradient_norm (float): Value used to clip the norm of the stochastic gradient
- device (torch.device): Accelerator to run the code
Note
Differently from optimize(...), each epoch corresponds to passing over the entire dataset using SGD.
141 def evaluate(self, model: torch.nn.Module, device: torch.device) -> tuple[float, float]: 142 ''' 143 Evaluate the `model` by computing the average sample loss and accuracy. 144 145 Parameters 146 ---------- 147 model: torch.nn.Module 148 Model that is locally optimized 149 device: torch.device 150 Accelerator to run the code 151 152 Returns 153 ------- 154 tuple[float, float] 155 Tuple of average sample loss and accuracy on the local dataset 156 ''' 157 158 loader = data.DataLoader(self.subset, batch_size = len(self.subset)) 159 x, y = next(iter(loader)) 160 x = x.to(device) 161 y = y.to(device) 162 163 model.eval() 164 165 with torch.no_grad(): 166 prediction = model(x) 167 loss = nn.functional.cross_entropy(prediction, y) 168 accuracy = torch.sum(torch.argmax(prediction, dim = 1) == y) 169 return loss.item(), accuracy.item()
Evaluate the model by computing the average sample loss and accuracy.
Parameters
- model (torch.nn.Module): Model that is locally optimized
- device (torch.device): Accelerator to run the code
Returns
- tuple[float, float]: Tuple of average sample loss and accuracy on the local dataset
171 def perturb(self, beta: float, u: torch.nn.Module, model: torch.nn.Module) -> torch.nn.Module: 172 ''' 173 Computes the perturbed model as `beta`-weighted average between `model` and `u`. 174 175 Parameters 176 ---------- 177 beta: float 178 Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation) 179 u: torch.nn.Module 180 Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents 181 model: torch.nn.Module 182 Model that is locally optimized 183 184 Returns 185 ------- 186 torch.nn.Module 187 Perturbed model as `beta`-weighted average between `model` and `u` 188 ''' 189 190 perturbed_model = deepcopy(model) 191 perturbed_model.load_state_dict(OrderedDict([ 192 (name, beta * model.state_dict()[name].detach() + (1 - beta) * u.state_dict()[name].detach()) 193 for name in model.state_dict().keys() 194 ])) 195 return perturbed_model
Computes the perturbed model as beta-weighted average between model and u.
Parameters
- beta (float): Parameter of our algorithm that controls the perturbation (the lower beta, the higher the perturbation)
- u (torch.nn.Module): Averaged model with the latest updates (previous round) weighted by mutual statistical similarities with other agents
- model (torch.nn.Module): Model that is locally optimized
Returns
- torch.nn.Module: Perturbed model as beta-weighted average between model and u
class Coordinator(fedavg.Coordinator):
    '''
    This class represents a centralized server coordinating the training of a shared model across multiple agents (i.e. clients).

    Note
    ----
    The agents locally update their models using our novel algorithmic framework.
    '''

    def __init__(
        self,
        beta: float,
        model: torch.nn.Module,
        datasets: dict[str, list[utils.FederatedSubset]],
        scheme: WeightingScheme,
        logger: Logger = Logger.default()
    ):
        '''
        Constructs the centralized coordinator, i.e. server, in the federated learning simulation.

        Parameters
        ----------
        beta: float
            Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
        model: torch.nn.Module
            Initial shared model
        datasets: dict[str, list[utils.FederatedSubset]]
            Training clients' subsets ('training') and testing clients' subsets ('testing')
        scheme: WeightingScheme
            Aggregation scheme to weight local updates from clients; must expose pairwise adjacencies
        logger: Logger
            Logger instance to save progress during the simulation
        '''

        # the similarity-weighted u-update in `iterate` needs pairwise adjacencies,
        # so a plain WeightingScheme is not enough
        assert isinstance(scheme, AdjancencyWeightingScheme)

        # beta must lie strictly inside (0, 1)
        assert 0 < beta < 1

        self.beta = beta
        self.datasets = datasets
        self.model = model
        self.agents = {
            group: [Agent(subset) for subset in dataset]
            for group, dataset in datasets.items()
        }
        self.weights = scheme.weights()
        self.adjacency_matrices = scheme.adjacencies()
        self.logger = logger

    def run(self, n_iterations: int, n_steps: int = None, n_epochs = None, batch_size: int = 32, step_size: float = 1e-3, step_size_diminishing: bool = False, l2_penalty: float = 1e-4, max_gradient_norm: float = 1.0, device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        '''
        Runs `n_iterations` optimization (with our novel algorithm) and evaluation rounds on training clients.

        Parameters
        ----------
        n_iterations: int
            Number of global rounds
        n_steps: int
            Number of local SGD steps used for optimization on clients (None means use n_epochs instead)
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        step_size_diminishing: bool
            This enables diminishing the step size linearly in time
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code

        Note
        ----
        Runs `n_iterations` times function `iterate(...)`. Also initializes `self.u`,
        the per-client similarity-weighted reference models used by `iterate`.
        '''

        # exactly one of the two local-work budgets must be provided
        assert n_steps is not None or n_epochs is not None

        self.model = self.model.to(device)
        self.model.compile()
        # one reference model per training client, initialized from the shared model
        self.u = [deepcopy(self.model) for _ in self.agents['training']]

        for iteration in range(n_iterations):
            # optionally diminish the step size linearly in time
            step_size_updated = step_size / (iteration + 1) if step_size_diminishing else step_size
            metrics = self.iterate(iteration, n_steps, n_epochs, batch_size, step_size_updated, l2_penalty, max_gradient_norm, device, evaluate = True)

            self.logger.log({
                'step': iteration,
                'loss.training': metrics['training']['loss'],
                'loss.testing': metrics['testing']['loss'],
                'accuracy.training': metrics['training']['accuracy'],
                'accuracy.testing': metrics['testing']['accuracy'],
            })

            print(iteration, metrics)

    def iterate(self, iteration: int, n_steps: int, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device, evaluate: bool = False) -> dict[str, float]:
        '''
        Runs a single optimization round with our novel algorithm on all training clients.

        Parameters
        ----------
        iteration: int
            Current global round
        n_steps: int
            Number of local SGD steps used for optimization on clients
        n_epochs: int
            Number of local epochs used for optimization on clients (mutually excludes n_steps)
        batch_size: int
            Number of samples in one SGD minibatch
        step_size: float
            Learning rate
        l2_penalty: float
            Weight of the L2 (Tikhonov) regularization used to penalize local models
        max_gradient_norm: float
            Value used to clip the norm of the stochastic gradient during local optimization
        device: torch.device
            Accelerator to run the code
        evaluate: bool
            Flag that enables evaluation of the updated global model on training and testing clients

        Returns
        -------
        dict[str, float]
            Dictionary of current round's metrics (empty when `evaluate` is False)

        Note
        ----
        Requires `run(...)` to have initialized `self.u` first.
        '''

        k = len(self.agents['training'])
        indices = list(range(k))

        random.shuffle(indices)

        # currently full participation (k == number of clients); the slice is kept
        # so that partial participation only requires changing k
        indices = indices[:k]
        participants = [self.agents['training'][i] for i in indices]
        u = [self.u[i] for i in indices]

        initial_model = deepcopy(self.model)
        # every slot aliases the same initial model; participants overwrite their own slot below
        updates: list[nn.Module] = [initial_model for _ in self.agents['training']]

        # fix: zip against the reordered `u` (previously `self.u`, which mispaired
        # each participant with another client's reference model after the shuffle);
        # the loop variable is renamed so it no longer shadows the list `u`
        progress = tqdm(zip(indices, participants, u), total = len(participants), desc = 'Optimization on training agents (iteration {})'.format(iteration))

        if n_steps is not None:
            for i, participant, u_i in progress:
                updates[i] = participant.optimize(self.beta, u_i, deepcopy(initial_model), n_steps = n_steps, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)
        else:
            for i, participant, u_i in progress:
                updates[i] = participant.multioptimize(self.beta, u_i, deepcopy(initial_model), n_epochs = n_epochs, batch_size = batch_size, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)

        # fix: `updates` is stored in agent order, so the weights must be in agent
        # order too (previously they were taken in shuffled order, misweighting updates);
        # sorted(indices) == range(k) under full participation
        weights = [self.weights['training'][i] for i in sorted(indices)]

        self.average(updates, weights = weights)

        for i, degree in enumerate(self.weights['training']):
            # for each client, this computes the average of latest updates from other clients
            # weighted accordingly to the similarity measure (normalized by client's degree)
            self.u[i].load_state_dict(OrderedDict([
                (
                    name,
                    torch.stack([
                        similarity * update.state_dict()[name].detach()
                        for j, (update, similarity) in enumerate(zip(updates, self.adjacency_matrices['training'][i]))
                        if j != i
                    ]).sum(dim = 0) / degree
                )
                for name in self.model.state_dict().keys()
            ]))

        if not evaluate:
            return {}

        return self.evaluate(iteration, device)
This class represents a centralized server coordinating the training of a shared model across multiple agents (i.e. clients).
Note
The agents locally update their models using our novel algorithmic framework.
def __init__(
    self,
    beta: float,
    model: torch.nn.Module,
    datasets: dict[str, list[utils.FederatedSubset]],
    scheme: WeightingScheme,
    logger: Logger = Logger.default()
):
    '''
    Builds the central server of the federated learning simulation.

    Parameters
    ----------
    beta: float
        Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
    model: torch.nn.Module
        Initial shared model
    datasets: dict[str, list[utils.FederatedSubset]]
        Training clients' subsets ('training') and testing clients' subsets ('testing')
    scheme: WeightingScheme
        Aggregation scheme to weight local updates from clients
    logger: Logger
        Logger instance to save progress during the simulation
    '''

    # the scheme must provide pairwise adjacencies, not just per-client weights
    assert isinstance(scheme, AdjancencyWeightingScheme)

    # beta must lie strictly inside the open interval (0, 1)
    assert 0 < beta < 1

    self.model = model
    self.beta = beta
    self.logger = logger
    self.datasets = datasets

    # one Agent per local subset, grouped into 'training' and 'testing'
    self.agents = {
        group: [Agent(local_subset) for local_subset in subsets]
        for group, subsets in datasets.items()
    }

    # aggregation weights and similarity adjacencies come from the scheme
    self.weights = scheme.weights()
    self.adjacency_matrices = scheme.adjacencies()
Constructs the centralized coordinator, i.e. server, in the federated learning simulation.
Parameters
- beta (float): Parameter controlling the perturbation (the lower beta, the higher the perturbation) while running our novel algorithm on clients
- model (torch.nn.Module): Initial shared model
- datasets (dict[str, list[utils.FederatedSubset]]): Training clients' subsets ('training') and testing clients' subsets ('testing')
- scheme (WeightingScheme): Aggregation scheme to weight local updates from clients
- logger (Logger): Logger instance to save progress during the simulation
def run(self, n_iterations: int, n_steps: int = None, n_epochs = None, batch_size: int = 32, step_size: float = 1e-3, step_size_diminishing: bool = False, l2_penalty: float = 1e-4, max_gradient_norm: float = 1.0, device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    '''
    Runs `n_iterations` optimization (with our novel algorithm) and evaluation rounds on training clients.

    Parameters
    ----------
    n_iterations: int
        Number of global rounds
    n_steps: int
        Number of local SGD steps used for optimization on clients (None means use n_epochs instead)
    n_epochs: int
        Number of local epochs used for optimization on clients (mutually excludes n_steps)
    batch_size: int
        Number of samples in one SGD minibatch
    step_size: float
        Learning rate
    step_size_diminishing: bool
        This enables diminishing the step size linearly in time
    l2_penalty: float
        Weight of the L2 (Tikhonov) regularization used to penalize local models
    max_gradient_norm: float
        Value used to clip the norm of the stochastic gradient during local optimization
    device: torch.device
        Accelerator to run the code

    Note
    ----
    Runs `n_iterations` times function `iterate(...)` (always with evaluation enabled).
    Also initializes `self.u`, the per-client reference models that `iterate` consumes.
    '''

    # exactly one of the two local-work budgets must be supplied
    assert n_steps is not None or n_epochs is not None

    self.model = self.model.to(device)
    self.model.compile()
    # one reference model per training client, starting from the shared model
    self.u = [ deepcopy(self.model) for _ in self.agents['training'] ]

    for iteration in range(n_iterations):
        # diminish the step size linearly in time when requested
        step_size_updated = step_size if not step_size_diminishing else step_size / (iteration + 1)
        metrics = self.iterate(iteration, n_steps, n_epochs, batch_size, step_size_updated, l2_penalty, max_gradient_norm, device, evaluate = True)

        self.logger.log({
            'step': iteration,
            'loss.training': metrics['training']['loss'],
            'loss.testing': metrics['testing']['loss'],
            'accuracy.training': metrics['training']['accuracy'],
            'accuracy.testing': metrics['testing']['accuracy'],
        })

        print(iteration, metrics)
Runs n_iterations optimization (with our novel algorithm) and evaluation rounds on training clients.
Parameters
- n_iterations (int): Number of global rounds
- n_steps (int): Number of local SGD steps used for optimization on clients
- n_epochs (int): Number of local epochs used for optimization on clients (mutually excludes n_steps)
- batch_size (int): Number of samples in one SGD minibatch
- step_size (float): Learning rate
- step_size_diminishing (bool): This enables diminishing the step size linearly in time
- l2_penalty (float): Weight of the L2 (Tikhonov) regularization used to penalize local models
- max_gradient_norm (float): Value used to clip the norm of the stochastic gradient during local optimization
- device (torch.device): Accelerator to run the code
- evaluate (bool): Flag that enables evaluation of the updated global model on training and testing clients
Note
Runs n_iterations times function iterate(...).
def iterate(self, iteration: int, n_steps: int, n_epochs: int, batch_size: int, step_size: float, l2_penalty: float, max_gradient_norm: float, device: torch.device, evaluate: bool = False) -> dict[str, float]:
    '''
    Runs a single optimization round with our novel algorithm on all training clients.

    Parameters
    ----------
    iteration: int
        Current global round
    n_steps: int
        Number of local SGD steps used for optimization on clients
    n_epochs: int
        Number of local epochs used for optimization on clients (mutually excludes n_steps)
    batch_size: int
        Number of samples in one SGD minibatch
    step_size: float
        Learning rate
    l2_penalty: float
        Weight of the L2 (Tikhonov) regularization used to penalize local models
    max_gradient_norm: float
        Value used to clip the norm of the stochastic gradient during local optimization
    device: torch.device
        Accelerator to run the code
    evaluate: bool
        Flag that enables evaluation of the updated global model on training and testing clients

    Returns
    -------
    dict[str, float]
        Dictionary of current round's metrics (empty when `evaluate` is False)

    Note
    ----
    Requires `run(...)` to have initialized `self.u` first.
    '''

    k = len(self.agents['training'])
    indices = list(range(k))

    random.shuffle(indices)

    # currently full participation (k == number of clients); the slice is kept so that
    # partial participation only requires changing k
    indices = indices[:k]
    participants = [self.agents['training'][i] for i in indices]
    u = [self.u[i] for i in indices]

    initial_model = deepcopy(self.model)
    # every slot aliases the same initial model; participants overwrite their own slot below
    updates: list[nn.Module] = [initial_model for _ in self.agents['training']]

    # fix: zip against the reordered `u` (previously `self.u`, which mispaired each
    # participant with another client's reference model after the shuffle); the loop
    # variable is renamed so it no longer shadows the list `u`
    progress = tqdm(zip(indices, participants, u), total = len(participants), desc = 'Optimization on training agents (iteration {})'.format(iteration))

    if n_steps is not None:
        for i, participant, u_i in progress:
            updates[i] = participant.optimize(self.beta, u_i, deepcopy(initial_model), n_steps = n_steps, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)
    else:
        for i, participant, u_i in progress:
            updates[i] = participant.multioptimize(self.beta, u_i, deepcopy(initial_model), n_epochs = n_epochs, batch_size = batch_size, step_size = step_size, l2_penalty = l2_penalty, max_gradient_norm = max_gradient_norm, device = device)

    # fix: `updates` is stored in agent order, so the weights must be in agent order too
    # (previously they were taken in shuffled order, pairing weights with wrong updates);
    # sorted(indices) == range(k) under full participation
    weights = [self.weights['training'][i] for i in sorted(indices)]

    self.average(updates, weights = weights)

    for i, degree in enumerate(self.weights['training']):
        # for each client, this computes the average of latest updates from other clients
        # weighted accordingly to the similarity measure (normalized by client's degree)
        self.u[i].load_state_dict(OrderedDict([
            (
                name,
                torch.stack([
                    similarity * update.state_dict()[name].detach()
                    for j, (update, similarity) in enumerate(zip(updates, self.adjacency_matrices['training'][i]))
                    if j != i
                ]).sum(dim = 0) / degree
            )
            for name in self.model.state_dict().keys()
        ]))

    if not evaluate:
        return {}

    return self.evaluate(iteration, device)
Runs a single optimization round with our novel algorithm on all training clients.
Parameters
- iteration (int): Current global round
- n_steps (int): Number of local SGD steps used for optimization on clients
- n_epochs (int): Number of local epochs used for optimization on clients (mutually excludes n_steps)
- batch_size (int): Number of samples in one SGD minibatch
- step_size (float): Learning rate
- l2_penalty (float): Weight of the L2 (Tikhonov) regularization used to penalize local models
- max_gradient_norm (float): Value used to clip the norm of the stochastic gradient during local optimization
- device (torch.device): Accelerator to run the code
- evaluate (bool): Flag that enables evaluation of the updated global model on training and testing clients
Returns
- dict[str, float]: Dictionary of current round's metrics