@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad
            if weight_decay != 0:
                d_p = d_p.add(p, alpha=weight_decay)
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                if nesterov:
                    d_p = d_p.add(buf, alpha=momentum)
                else:
                    d_p = buf
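The excerpt above is the momentum/Nesterov path of torch.optim.SGD.step; in the full source the parameter update p.add_(d_p, alpha=-group['lr']) follows it. Below is a minimal sketch of how this step routine is typically driven, assuming an illustrative linear model and synthetic data; nothing in it comes from the excerpt itself.

import torch
import torch.nn as nn

# Illustrative model and data; only the optimizer call pattern matters here.
model = nn.Linear(10, 1)
inputs, targets = torch.randn(32, 10), torch.randn(32, 1)

# These keyword arguments map one-to-one onto the group keys read in step():
# 'momentum', 'dampening', 'weight_decay', 'nesterov'.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                            momentum=0.9, dampening=0,
                            weight_decay=1e-4, nesterov=True)

def closure():
    # Reevaluates the model and returns the loss, as the docstring requires.
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(inputs), targets)
    loss.backward()
    return loss

loss = optimizer.step(closure)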
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue

            grad = p.grad
            state = self.state[p]

            state['step'] += 1

            if group['weight_decay'] != 0:
                if p.grad.is_sparse:
                    raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                grad = grad.add(p, alpha=group['weight_decay'])

            if grad.is_sparse:
                grad = grad.coalesce()  # the update is non-linear so indices must be unique
                grad_indices = grad._indices()
                grad_values = grad._values()
                size = grad.size()
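The coalesce()/_indices()/_values() branch above is Adagrad's dedicated path for sparse gradients. A minimal sketch of a module that actually exercises it, assuming an illustrative nn.Embedding built with sparse=True; the sizes and learning rate are arbitrary.

import torch
import torch.nn as nn

# An embedding created with sparse=True produces sparse gradients, which is
# exactly the case the coalesce()/_indices()/_values() branch above handles.
embedding = nn.Embedding(1000, 16, sparse=True)
optimizer = torch.optim.Adagrad(embedding.parameters(), lr=0.01)

ids = torch.randint(0, 1000, (8,))
loss = embedding(ids).sum()
loss.backward()

print(embedding.weight.grad.is_sparse)  # True -> sparse code path in step()
optimizer.step()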
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError('Adadelta does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                state['acc_delta'] = torch.zeros_like(p, memory_format=torch.preserve_format)
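The lazy initialization above creates the two running accumulators Adadelta needs, 'square_avg' and 'acc_delta', the first time a parameter receives a gradient. A small sketch that inspects them, assuming an illustrative linear model; rho and eps below are the library defaults.

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-6)

loss = model(torch.randn(3, 4)).sum()
loss.backward()
optimizer.step()

# The per-parameter buffers created lazily by the block above:
for p in model.parameters():
    print(sorted(optimizer.state[p].keys()))  # ['acc_delta', 'square_avg', 'step']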
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError('RMSprop does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if group['momentum'] > 0:
                    state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if group['centered']:
                    state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
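Which buffers get created above depends on the constructor flags: 'momentum_buffer' only when momentum > 0, and 'grad_avg' only for the centered variant. A sketch with both enabled, using an illustrative model and otherwise default-style hyperparameters.

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
# momentum > 0 adds 'momentum_buffer' to the state above; centered=True adds 'grad_avg'.
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99,
                                eps=1e-8, momentum=0.9, centered=True)

loss = model(torch.randn(3, 4)).sum()
loss.backward()
optimizer.step()

for p in model.parameters():
    print(sorted(optimizer.state[p].keys()))
    # ['grad_avg', 'momentum_buffer', 'square_avg', 'step']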
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue

            # Perform optimization step
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError('AdamW does not support sparse gradients')
            amsgrad = group['amsgrad']

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            if amsgrad:
                max_exp_avg_sq = state['max_exp_avg_sq']
            beta1, beta2 = group['betas']

            state['step'] += 1
            bias_correction2 = 1 - beta2 ** state['step']

            # Decay the first and second moment running average coefficient
            exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                # Use the max. for normalizing running avg. of gradient
                denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
            else:
                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
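Passing amsgrad=True is what populates and uses 'max_exp_avg_sq' in the branch above; note also that AdamW's weight decay is decoupled, applied to the parameters themselves earlier in step() rather than folded into grad. A usage sketch with an illustrative model, data, and hyperparameter values.

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
# amsgrad=True enables the max_exp_avg_sq branch shown above.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3,
                              betas=(0.9, 0.999), eps=1e-8,
                              weight_decay=1e-2, amsgrad=True)

for _ in range(3):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(torch.randn(8, 4)), torch.randn(8, 2))
    loss.backward()
    optimizer.step()

# Buffers created by the state-initialization block above:
print(sorted(optimizer.state[model.weight].keys()))
# ['exp_avg', 'exp_avg_sq', 'max_exp_avg_sq', 'step']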