Skip to content

PyTorch 自动微分

计算图原理

PyTorch 使用动态计算图(Define-by-Run),每次前向传播都构建新的计算图。

python
import torch

# Leaf tensors: gradients will be accumulated into their .grad fields.
x = torch.tensor(3.0, requires_grad=True)
w = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)

# Forward pass builds the dynamic graph node by node.
product = w * x          # 2 * 3 = 6
y = product + b          # 6 + 1 = 7

# Backward pass walks the graph and fills in the leaf gradients.
y.backward()

print(f"dy/dx = {x.grad}")  # 2.0 (= w)
print(f"dy/dw = {w.grad}")  # 3.0 (= x)
print(f"dy/db = {b.grad}")  # 1.0

# Gradients accumulate across repeated backward() calls,
# so clear them explicitly before the next pass.
x.grad.zero_()

自定义梯度函数

python
class SigmoidFunction(torch.autograd.Function):
    """Sigmoid with a hand-written backward pass (autograd.Function demo)."""

    @staticmethod
    def forward(ctx, x):
        # sigma(x) = 1 / (1 + e^{-x}), spelled out instead of torch.sigmoid.
        result = (1 + (-x).exp()).reciprocal()
        # Stash the activation; the derivative is expressible in terms of it.
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        (result,) = ctx.saved_tensors
        # d sigma/dx = sigma * (1 - sigma), chained with the upstream gradient.
        return grad_output * result * (1 - result)

# The public entry point of a custom Function is its .apply attribute.
sigmoid = SigmoidFunction.apply
inputs = torch.randn(3, requires_grad=True)
activated = sigmoid(inputs)
# Reduce to a scalar so backward() needs no explicit grad argument.
activated.sum().backward()
print(inputs.grad)

梯度检查

python
# 验证自定义梯度是否正确
from torch.autograd import gradcheck

x = torch.randn(3, dtype=torch.float64, requires_grad=True)
test = gradcheck(SigmoidFunction.apply, (x,), eps=1e-6)
print(f"梯度检查通过: {test}")

梯度裁剪(LLM 训练必备)

python
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def _clipped_update(loss):
    """Backprop, clip the global grad norm at 1.0, then step and reset."""
    loss.backward()
    # Clipping guards against exploding gradients (a must for LLM training).
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    optimizer.zero_grad()

for batch in dataloader:
    _clipped_update(model(batch))

混合精度训练

python
# Mixed-precision training: autocast runs eligible ops in fp16, and the
# GradScaler scales the loss so tiny fp16 gradients do not underflow to zero.
# NOTE: torch.cuda.amp.autocast/GradScaler are deprecated since PyTorch 2.4;
# use torch.amp with an explicit device type instead.
from torch.amp import autocast, GradScaler

scaler = GradScaler("cuda")

for batch in dataloader:
    with autocast("cuda"):  # ops run in fp16 where it is safe
        loss = model(batch)

    # Scale the loss before backward so gradients stay representable in fp16.
    scaler.scale(loss).backward()

    # Un-scale in place so clipping sees the true gradient magnitudes.
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    scaler.step(optimizer)  # skips the step if any gradient is inf/nan
    scaler.update()         # adapts the scale factor for the next iteration
    optimizer.zero_grad()

本站内容由 褚成志 整理编写,仅供学习参考