How optimizers update parameters

Require: initial parameters $\theta$
Require: learning rate $\epsilon$
while the stopping criterion is not met do
     Compute the gradient estimate: $\boldsymbol{g} \leftarrow \frac{1}{m} \nabla_{\boldsymbol{\theta}} \sum_{i} L\left(f\left(\boldsymbol{x}^{(i)} ; \boldsymbol{\theta}\right), \boldsymbol{y}^{(i)}\right)$
     Compute the first and second moments: $s = \phi(g)$, $r = \psi(g)$
     Compute the update: $\Delta \theta = -\epsilon \cdot s / r$
     Apply the update: $\theta = \theta + \Delta \theta$
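
The optimizers below differ only in how $\phi$ and $\psi$ are chosen. As a rough illustration of this loop (not any specific PyTorch optimizer), here is a minimal NumPy sketch under the assumption that $\phi$ is an exponential moving average of the gradient, $\psi$ an exponential moving average of its square, and `grad_fn` a hypothetical function returning a minibatch gradient:

import numpy as np

def generic_update_loop(theta, grad_fn, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, steps=1000):
    # s = phi(g): first moment (EMA of the gradient)
    # r = psi(g): second moment (EMA of the squared gradient)
    s = np.zeros_like(theta)
    r = np.zeros_like(theta)
    for _ in range(steps):
        g = grad_fn(theta)                            # minibatch gradient estimate
        s = beta1 * s + (1 - beta1) * g               # phi(g)
        r = beta2 * r + (1 - beta2) * g * g           # psi(g)
        theta = theta - lr * s / (np.sqrt(r) + eps)   # apply the update
    return theta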

SGD / SGD with momentum / SGD with Nesterov

Usage

  • weight_decay - L2 regularization coefficient
  • momentum - weight of the first-moment (momentum) term
  • dampening - dampening factor for momentum: the larger the value, the stronger the dampening, i.e. the less the current gradient contributes to the momentum buffer
  • nesterov - whether to use Nesterov momentum, a refinement of classical momentum (see the construction sketch below)
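
A minimal construction sketch for torch.optim.SGD with these arguments; the model and batch here are placeholders, not from the original text:

import torch

model = torch.nn.Linear(10, 1)        # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                            dampening=0, weight_decay=1e-4, nesterov=True)

x, y = torch.randn(32, 10), torch.randn(32, 1)   # placeholder batch
optimizer.zero_grad()
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()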
class SGD(Optimizer):

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        ...

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad
                if weight_decay != 0:
                    d_p = d_p.add(p, alpha=weight_decay)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.add_(d_p, alpha=-group['lr'])

        return loss

AdaGrad

AdaGrad uses a second moment, the accumulated sum of squared gradients: parameters that are updated frequently have their learning rate scaled down, while rarely updated parameters keep a larger effective learning rate, which is what makes the learning rate adaptive.
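
Written out (a standard formulation, with $\epsilon$ the learning rate, $\delta$ the eps smoothing term, and $r$ the running sum of squared gradients; this matches the dense branch of the code below):

$$\begin{aligned}
r_{t} &= r_{t-1} + g_{t} \odot g_{t} \\
\theta_{t} &= \theta_{t-1} - \frac{\epsilon}{\sqrt{r_{t}} + \delta}\, g_{t}
\end{aligned}$$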

Usage

  • lr_decay - learning-rate decay coefficient
  • weight_decay - L2 regularization coefficient
  • initial_accumulator_value - initial value of the squared-gradient accumulator (sum)
  • eps - smoothing term added to the denominator (a construction sketch follows the list)
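
For example (model is a placeholder; the training-loop calls are the same as in the SGD sketch above):

import torch

model = torch.nn.Linear(10, 1)        # placeholder model
optimizer = torch.optim.Adagrad(model.parameters(), lr=1e-2, lr_decay=1e-4,
                                weight_decay=1e-4, initial_accumulator_value=0, eps=1e-10)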
class Adagrad(Optimizer):

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
        ...

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]

                state['step'] += 1

                if group['weight_decay'] != 0:
                    if p.grad.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    grad = grad.add(p, alpha=group['weight_decay'])

                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)
                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    p.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    # sum = sum + value*grad*grad
                    state['sum'].addcmul_(grad, grad, value=1)
                    # std = sqrt(sum) + eps
                    std = state['sum'].sqrt().add_(group['eps'])
                    # p_t = p_(t-1) - clr*grad/std
                    p.addcdiv_(grad, std, value=-clr)

        return loss

Adadelta

$$\begin{aligned}
v_{t} &= \gamma v_{t-1} + (1-\gamma)\,\nabla J(\theta_{t-1})^{2} \\
\Delta \theta_{t-1} &= -\frac{RMS[\Delta\theta]_{t-2}}{RMS[g]_{t-1}}\, g_{t-1} \\
\theta_{t} &= \theta_{t-1} + \Delta \theta_{t-1}
\end{aligned}$$

Usage

  • rho - coefficient of the exponential moving averages
  • eps - smoothing term added to the denominator
  • weight_decay - L2 regularization coefficient (a construction sketch follows)
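
For example (model is a placeholder):

import torch

model = torch.nn.Linear(10, 1)        # placeholder model
optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.9,
                                 eps=1e-6, weight_decay=1e-4)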
class Adadelta(Optimizer):

    def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0):
        ...

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Adadelta does not support sparse gradients')
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    state['acc_delta'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                square_avg, acc_delta = state['square_avg'], state['acc_delta']
                rho, eps = group['rho'], group['eps']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                # exponentially weighted average of the squared gradients
                square_avg.mul_(rho).addcmul_(grad, grad, value=1 - rho)
                # add the smoothing term before taking the square root
                std = square_avg.add(eps).sqrt_()
                delta = acc_delta.add(eps).sqrt_().div_(std).mul_(grad)
                # p_t = p_(t-1) - lr * delta
                p.add_(delta, alpha=-group['lr'])
                acc_delta.mul_(rho).addcmul_(delta, delta, value=1 - rho)

        return loss

RMSProp

Essentially the same as Adadelta, except that the step size is not derived from the accumulated parameter updates; the learning rate is set entirely by the user.
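
For reference, the plain (non-centered, momentum-free) update that the code below implements, with $\epsilon$ the user-set lr, $\alpha$ the moving-average coefficient, and $\delta$ the eps term:

$$\begin{aligned}
v_{t} &= \alpha v_{t-1} + (1-\alpha)\, g_{t}^{2} \\
\theta_{t} &= \theta_{t-1} - \frac{\epsilon}{\sqrt{v_{t}} + \delta}\, g_{t}
\end{aligned}$$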

Usage

  • alpha - exponential-moving-average coefficient of the second moment
  • eps - smoothing term added to the denominator
  • weight_decay - L2 regularization coefficient
  • momentum - first-moment coefficient: past (adaptively scaled) gradients decay exponentially into a buffer that drives the current update
  • centered - whether to center the second moment by subtracting the squared running mean of the gradient, i.e. normalize the gradient by an estimate of its variance (a construction sketch follows)
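
For example (model is a placeholder):

import torch

model = torch.nn.Linear(10, 1)        # placeholder model
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-2, alpha=0.99, eps=1e-8,
                                weight_decay=1e-4, momentum=0.9, centered=False)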
class RMSprop(Optimizer):

    def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False):
        ...

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('RMSprop does not support sparse gradients')
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if group['momentum'] > 0:
                        state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if group['centered']:
                        state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                square_avg = state['square_avg']
                alpha = group['alpha']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)

                if group['centered']:
                    grad_avg = state['grad_avg']
                    grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha)
                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(group['eps'])
                else:
                    avg = square_avg.sqrt().add_(group['eps'])

                if group['momentum'] > 0:
                    buf = state['momentum_buffer']
                    buf.mul_(group['momentum']).addcdiv_(grad, avg)
                    p.add_(buf, alpha=-group['lr'])
                else:
                    p.addcdiv_(grad, avg, value=-group['lr'])

        return loss

AdamW / AMSGrad

Uses both the first moment and the second moment to obtain an adaptive learning rate, while keeping weight decay decoupled from them so that the L2-style regularization behaves correctly.
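
In update form (standard AdamW notation, not taken verbatim from the source; $\epsilon$ = lr, $\lambda$ = weight_decay, $\delta$ = eps), the decay term is applied to the parameters directly and never passes through the moment estimates:

$$\begin{aligned}
m_{t} &= \beta_{1} m_{t-1} + (1-\beta_{1})\, g_{t}, \qquad v_{t} = \beta_{2} v_{t-1} + (1-\beta_{2})\, g_{t}^{2} \\
\hat{m}_{t} &= m_{t} / (1-\beta_{1}^{t}), \qquad \hat{v}_{t} = v_{t} / (1-\beta_{2}^{t}) \\
\theta_{t} &= \theta_{t-1} - \epsilon \left( \frac{\hat{m}_{t}}{\sqrt{\hat{v}_{t}} + \delta} + \lambda\, \theta_{t-1} \right)
\end{aligned}$$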

Usage

  • params - iterable of parameters to optimize; per-group (e.g. per-layer) learning rates are supported via parameter groups
  • lr - learning rate
  • betas - betas[0] is the exponential-moving-average coefficient for the first moment, betas[1] for the second moment
  • eps - smoothing term added to the denominator
  • weight_decay - weight-decay coefficient (applied in decoupled form)
  • amsgrad - whether to use the AMSGrad variant (a construction sketch follows)
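
For example (model is a placeholder):

import torch

model = torch.nn.Linear(10, 1)        # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                              eps=1e-8, weight_decay=1e-2, amsgrad=False)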
class AdamW(Optimizer):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=1e-2, amsgrad=False):
        ...

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                # Perform stepweight decay
                p.mul_(1 - group['lr'] * group['weight_decay'])

                # Perform optimization step
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('AdamW does not support sparse gradients')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss

Adam / AMSGrad

Also uses both the first and second moments for an adaptive learning rate, but in this implementation the L2 penalty is added to the gradient and therefore scaled by the moments, so the regularization no longer acts isotropically across the model's parameters.
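
The only change relative to the AdamW update above is where the decay enters: the L2 term is folded into the gradient before the moments are formed, so it is rescaled by the adaptive denominator along with the gradient:

$$g_{t} \leftarrow \nabla L(\theta_{t-1}) + \lambda\,\theta_{t-1}, \qquad \theta_{t} = \theta_{t-1} - \epsilon\,\frac{\hat{m}_{t}}{\sqrt{\hat{v}_{t}} + \delta}$$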

Usage

Same as AdamW.

class Adam(Optimizer):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        ...

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Coupled L2: the weight-decay term is added to the gradient itself,
                # so it is later rescaled by the adaptive denominator (unlike AdamW)
                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss