PyTorch 自动微分
计算图原理
PyTorch 使用动态计算图(Define-by-Run),每次前向传播都构建新的计算图。
python
import torch

# Leaf tensors: autograd accumulates gradients on these.
x = torch.tensor(3.0, requires_grad=True)
w = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)

# Forward pass builds the dynamic graph: y = 2*3 + 1 = 7.
y = w * x + b

# Backward pass fills in .grad for every leaf.
y.backward()

# Partial derivatives: dy/dx = w = 2, dy/dw = x = 3, dy/db = 1.
for name, leaf in (("x", x), ("w", w), ("b", b)):
    print(f"dy/d{name} = {leaf.grad}")
x.grad.zero_()  # 清零梯度

自定义梯度函数
python
class SigmoidFunction(torch.autograd.Function):
    """Custom autograd implementation of the logistic sigmoid."""

    @staticmethod
    def forward(ctx, x):
        # sigmoid(x) = 1 / (1 + exp(-x))
        result = 1 / (1 + torch.exp(-x))
        # Stash the activation; backward() reuses it instead of recomputing.
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
        (result,) = ctx.saved_tensors
        return grad_output * result * (1 - result)
# Bind the custom Function to a plain callable.
sigmoid = SigmoidFunction.apply

x = torch.randn(3, requires_grad=True)
y = sigmoid(x)
# Reduce to a scalar so backward() needs no explicit grad argument.
total = y.sum()
total.backward()
print(x.grad)

梯度检查
python
# Verify the hand-written backward() against finite differences.
from torch.autograd import gradcheck

# gradcheck requires float64 inputs for sufficient numerical precision.
x = torch.randn(3, dtype=torch.float64, requires_grad=True)
test = gradcheck(SigmoidFunction.apply, (x,), eps=1e-6)
print(f"梯度检查通过: {test}")

梯度裁剪(LLM 训练必备)
python
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for minibatch in dataloader:
    # NOTE(review): assumes the model returns its loss directly — confirm.
    loss = model(minibatch)
    loss.backward()
    # Clip the global gradient norm to tame exploding gradients
    # before the optimizer consumes them.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
optimizer.zero_grad()

混合精度训练
python
from torch.cuda.amp import autocast, GradScaler

# Loss scaler: keeps small fp16 gradients from underflowing to zero.
scaler = GradScaler()

for minibatch in dataloader:
    with autocast():  # ops run in fp16 where it is numerically safe
        loss = model(minibatch)
    # Backward on the scaled loss; gradients are therefore scaled too.
    scaler.scale(loss).backward()
    # Un-scale before clipping so the norm threshold applies to true gradients.
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # step() skips the update if any gradient overflowed; update() adapts the scale.
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()