Appendix B: PyTorch API Reference

A practical reference for the PyTorch patterns used throughout this textbook, organized by topic. All code targets PyTorch 2.x and assumes import torch and import torch.nn as nn unless noted otherwise.


B.1 Tensor Fundamentals

Creation

import torch

# From Python data
x = torch.tensor([1.0, 2.0, 3.0])                     # float32 by default
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float64) # explicit dtype

# Common constructors
zeros = torch.zeros(3, 4)                # 3x4 of zeros
ones = torch.ones(2, 3)                  # 2x3 of ones
eye = torch.eye(4)                       # 4x4 identity
rng = torch.arange(0, 10, 2)            # [0, 2, 4, 6, 8]
lin = torch.linspace(0, 1, steps=5)     # [0.0, 0.25, 0.5, 0.75, 1.0]
empty = torch.empty(3, 3)               # uninitialized memory

# Random tensors
uniform = torch.rand(3, 4)              # Uniform [0, 1)
normal = torch.randn(3, 4)              # Standard normal
randint = torch.randint(0, 10, (3, 4))  # Integers in [0, 10)

# From NumPy (shares memory — no copy)
import numpy as np
arr = np.array([1.0, 2.0, 3.0])
t = torch.from_numpy(arr)
back_to_np = t.numpy()

# Like-constructors (match shape, dtype, device)
y = torch.zeros_like(x)
y = torch.randn_like(x)

Shape Manipulation

x = torch.randn(2, 3, 4)

x.shape                    # torch.Size([2, 3, 4])
x.view(6, 4)              # reshape (contiguous memory required)
x.reshape(6, 4)           # reshape (works even if non-contiguous)
x.permute(2, 0, 1)        # reorder dimensions -> (4, 2, 3)
x.transpose(0, 2)         # swap dim 0 and dim 2 -> (4, 3, 2)
x.unsqueeze(0)             # add dim at position 0 -> (1, 2, 3, 4)
x.squeeze()                # remove all size-1 dims
x.flatten()                # -> (24,)
x.flatten(start_dim=1)    # -> (2, 12)
x.contiguous()             # force contiguous memory layout

# Concatenation and stacking
a = torch.randn(2, 3)
b = torch.randn(2, 3)
torch.cat([a, b], dim=0)    # (4, 3) — concatenate along existing dim
torch.stack([a, b], dim=0)  # (2, 2, 3) — creates new dim

Indexing and Slicing

x = torch.randn(4, 5)

x[0]                       # first row
x[:, 2]                    # third column
x[1:3, :]                 # rows 1-2
x[x > 0]                  # boolean mask (returns 1D)
x[[0, 2, 3], :]           # fancy indexing: rows 0, 2, 3

# Gather and scatter (used in embedding lookups, advanced indexing)
indices = torch.tensor([0, 2, 1])
torch.gather(x, dim=1, index=indices.unsqueeze(0).expand(4, -1))

Key Operations

a = torch.randn(3, 4)
b = torch.randn(4, 5)

# Matrix multiplication (three equivalent forms)
c = a @ b                      # operator syntax
c = torch.matmul(a, b)        # function syntax
c = torch.mm(a, b)            # explicit 2D matmul

# Batched matrix multiply
batch_a = torch.randn(8, 3, 4)
batch_b = torch.randn(8, 4, 5)
batch_c = torch.bmm(batch_a, batch_b)  # (8, 3, 5)

# Einstein summation (flexible, readable)
c = torch.einsum("ij,jk->ik", a, b)           # matmul
d = torch.einsum("bij,bjk->bik", batch_a, batch_b)  # batched matmul

# Element-wise operations
a * b_broadcast      # Hadamard product (with broadcasting)
a + 1                # scalar broadcast
torch.exp(a)
torch.log(a)
torch.clamp(a, min=0)  # equivalent to ReLU

# Reductions
a.sum()                # scalar
a.sum(dim=1)           # sum over columns -> (3,)
a.mean(dim=0)          # mean over rows -> (4,)
a.max(dim=1)           # returns (values, indices)
a.argmax(dim=1)        # indices only

Device Management

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x = torch.randn(3, 4, device=device)     # create on device
x = x.to(device)                          # move existing tensor
x = x.cpu()                               # move to CPU
x = x.cuda()                              # move to first GPU
x = x.to("cuda:1")                       # move to specific GPU

# Check device
x.device      # e.g., device(type='cuda', index=0)
x.is_cuda     # True/False

B.2 nn.Module Patterns

Basic Module

import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """Three-layer MLP with LayerNorm, GELU activations, and dropout.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output logits.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int,
                 dropout: float = 0.1):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        # One LayerNorm per hidden layer.  The original used a single shared
        # module for both normalization sites, which ties their affine
        # (weight/bias) parameters together — almost never what you want.
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map (batch, input_dim) inputs to (batch, output_dim) logits."""
        x = F.gelu(self.norm1(self.fc1(x)))
        x = self.dropout(x)
        x = F.gelu(self.norm2(self.fc2(x)))
        x = self.dropout(x)
        return self.fc3(x)

model = MLP(input_dim=784, hidden_dim=256, output_dim=10)

Common Layers Reference

# Linear (fully connected)
nn.Linear(in_features=128, out_features=64, bias=True)

# Convolution
nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
nn.Conv1d(in_channels=128, out_channels=256, kernel_size=5, padding=2)

# Recurrent
nn.LSTM(input_size=128, hidden_size=256, num_layers=2,
        batch_first=True, dropout=0.1, bidirectional=True)
nn.GRU(input_size=128, hidden_size=256, num_layers=1, batch_first=True)

# Normalization
nn.BatchNorm2d(num_features=64)      # for conv outputs (N, C, H, W)
nn.LayerNorm(normalized_shape=256)    # for transformer / MLP
nn.GroupNorm(num_groups=8, num_channels=64)

# Regularization
nn.Dropout(p=0.1)
nn.Dropout2d(p=0.1)                   # spatial dropout for conv

# Pooling
nn.MaxPool2d(kernel_size=2, stride=2)
nn.AdaptiveAvgPool2d(output_size=(1, 1))  # global average pooling

# Embedding
nn.Embedding(num_embeddings=10000, embedding_dim=256, padding_idx=0)

# Containers
nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))
nn.ModuleList([nn.Linear(128, 128) for _ in range(4)])
nn.ModuleDict({"encoder": nn.Linear(128, 64), "decoder": nn.Linear(64, 128)})

Weight Initialization

def init_weights(module: nn.Module):
    """Per-layer weight initialization; apply with ``model.apply(init_weights)``.

    Linear layers get Kaiming-normal weights and zero bias, embeddings get
    N(0, 0.02) weights, and LayerNorm is reset to the identity transform.
    """
    if isinstance(module, nn.LayerNorm):
        nn.init.ones_(module.weight)
        nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=0.02)
    elif isinstance(module, nn.Linear):
        nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
        if module.bias is not None:
            nn.init.zeros_(module.bias)

model.apply(init_weights)

Parameter Inspection

# Count parameters
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Parameters: {total:,} total, {trainable:,} trainable")

# Iterate named parameters
for name, param in model.named_parameters():
    print(f"{name}: shape={param.shape}, requires_grad={param.requires_grad}")

# Freeze parameters
for param in model.fc1.parameters():
    param.requires_grad = False

B.3 Autograd

Gradient Computation

x = torch.tensor([2.0, 3.0], requires_grad=True)
y = (x ** 2).sum()     # y = x0^2 + x1^2
y.backward()            # compute gradients
print(x.grad)           # tensor([4., 6.])  (dy/dx = 2x)

# Gradient accumulates — always zero before backward
x.grad.zero_()

Controlling Gradient Flow

# Detach from computation graph
z = x.detach()              # new tensor, no grad tracking
z = x.clone().detach()      # safe copy

# No-grad context (inference, evaluation)
with torch.no_grad():
    output = model(input_data)

# Inference mode (faster than no_grad, more restrictive)
with torch.inference_mode():
    output = model(input_data)

# Selective gradient: stop gradient through one branch
loss = loss_a + loss_b.detach()  # gradients flow only through loss_a

# Gradient clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)

Custom Autograd Function

class StraightThroughEstimator(torch.autograd.Function):
    """Binarize in the forward pass; pass gradients through untouched.

    The hard threshold has zero gradient almost everywhere, so backward
    pretends the forward was the identity (the "straight-through" trick).
    """

    @staticmethod
    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
        # Hard 0/1 threshold, returned as float32.
        return torch.gt(x, 0).float()

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
        # Identity gradient.
        return grad_output

# Usage
ste = StraightThroughEstimator.apply
y = ste(x)

B.4 DataLoader and Dataset

Map-Style Dataset

from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
    """Map-style dataset over in-memory feature and label arrays."""

    def __init__(self, features: np.ndarray, labels: np.ndarray):
        # Convert (and copy) once up front so __getitem__ is a plain index.
        self._x = torch.tensor(features, dtype=torch.float32)
        self._y = torch.tensor(labels, dtype=torch.long)

    def __len__(self) -> int:
        return self._y.shape[0]

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self._x[idx], self._y[idx]

dataset = TabularDataset(X_train, y_train)
loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,         # parallel data loading
    pin_memory=True,       # faster CPU-to-GPU transfer
    drop_last=True,        # drop incomplete final batch
    persistent_workers=True # keep workers alive between epochs
)

for batch_features, batch_labels in loader:
    batch_features = batch_features.to(device)
    batch_labels = batch_labels.to(device)
    # forward pass ...

Iterable-Style Dataset (for Streaming Data)

from torch.utils.data import IterableDataset

class StreamingDataset(IterableDataset):
    """Stream (features, label) rows from CSV files, sharding the file
    list across DataLoader workers.

    Each line is parsed as comma-separated floats with the final column
    being an integer class label.
    """

    def __init__(self, file_paths: list[str]):
        self.file_paths = file_paths

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            # Single-process loading: this process owns every file.
            paths = self.file_paths
        else:
            # Round-robin assignment so every file is covered exactly once.
            # The original contiguous split (len // num_workers files per
            # worker) silently dropped the remainder files whenever the
            # file count was not divisible by num_workers.
            paths = self.file_paths[worker_info.id::worker_info.num_workers]

        for path in paths:
            with open(path, "r") as f:
                for line in f:
                    features, label = self._parse_line(line)
                    yield features, label

    def _parse_line(self, line: str) -> tuple[torch.Tensor, torch.Tensor]:
        """Parse one CSV line into (float32 features, scalar int label)."""
        parts = line.strip().split(",")
        features = torch.tensor([float(x) for x in parts[:-1]])
        label = torch.tensor(int(parts[-1]))
        return features, label

Data Transforms (Torchvision Example)

from torchvision import transforms

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

B.5 Training Loop

Standard Training Loop

# Standard supervised classification loop.
# NOTE(review): assumes `device` (Section B.1), `train_loader` and
# `val_loader` (Section B.4) are already defined.
model = MLP(input_dim=784, hidden_dim=256, output_dim=10).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
criterion = nn.CrossEntropyLoss()

for epoch in range(100):
    model.train()  # enable dropout (and batchnorm updates, if any)
    epoch_loss = 0.0
    correct = 0
    total = 0

    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad(set_to_none=True)  # more efficient than zero_grad()
        logits = model(features)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is
        # exact even when the final batch is smaller.
        epoch_loss += loss.item() * features.size(0)
        correct += (logits.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)

    scheduler.step()  # per-epoch LR update

    train_loss = epoch_loss / total
    train_acc = correct / total

    # Validation
    model.eval()  # disable dropout; normalization uses running stats
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():  # no autograd graph during evaluation
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            logits = model(features)
            loss = criterion(logits, labels)
            val_loss += loss.item() * features.size(0)
            val_correct += (logits.argmax(dim=1) == labels).sum().item()
            val_total += labels.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total

    print(f"Epoch {epoch+1:3d} | "
          f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} | "
          f"LR: {scheduler.get_last_lr()[0]:.2e}")

B.6 Distributed Training

DistributedDataParallel (DDP)

DDP replicates the model on each GPU and synchronizes gradients with bucketed all-reduce operations that overlap with the backward pass: each bucket of gradients is averaged across ranks as soon as it is ready.

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def setup(rank: int, world_size: int):
    """Initialize the default process group for this rank.

    With init_method="env://", MASTER_ADDR and MASTER_PORT must be set in
    the environment (torchrun does this automatically).
    """
    dist.init_process_group(
        backend="nccl",  # GPU collectives; use "gloo" for CPU-only runs
        init_method="env://",
        rank=rank,
        world_size=world_size,
    )
    torch.cuda.set_device(rank)  # pin this process to its own GPU

def cleanup():
    """Tear down the process group created by setup()."""
    dist.destroy_process_group()

def train_ddp(rank: int, world_size: int):
    """Per-process DDP training entry point (one process per GPU).

    NOTE(review): `MLP` and `dataset` are assumed to exist at module scope
    in the real training script — confirm before running.
    """
    setup(rank, world_size)
    device = torch.device(f"cuda:{rank}")

    model = MLP(784, 256, 10).to(device)
    # Wrapping in DDP registers the gradient-synchronization hooks.
    model = DDP(model, device_ids=[rank])

    # DistributedSampler gives each rank a disjoint shard of the dataset.
    sampler = DistributedSampler(dataset, num_replicas=world_size,
                                  rank=rank, shuffle=True)
    loader = DataLoader(dataset, batch_size=64, sampler=sampler,
                        num_workers=4, pin_memory=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

    for epoch in range(100):
        sampler.set_epoch(epoch)  # ensures proper shuffling each epoch
        model.train()
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad(set_to_none=True)
            loss = F.cross_entropy(model(features), labels)
            loss.backward()  # gradients are all-reduced across ranks here
            optimizer.step()

    cleanup()

# Launch: torchrun --nproc_per_node=4 train_script.py

Fully Sharded Data Parallel (FSDP)

FSDP shards model parameters, gradients, and optimizer states across GPUs, allowing training of models that do not fit on a single GPU.

from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from functools import partial

# Define a wrapping policy: shard at the TransformerBlock level
wrap_policy = partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={TransformerBlock},
)

# Mixed precision policy for FSDP
mp_policy = MixedPrecision(
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.bfloat16,
    buffer_dtype=torch.bfloat16,
)

model = LargeTransformer().to(device)
model = FSDP(
    model,
    auto_wrap_policy=wrap_policy,
    mixed_precision=mp_policy,
    device_id=rank,
)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(num_epochs):
    model.train()
    for batch in loader:
        optimizer.zero_grad(set_to_none=True)
        loss = model(batch).loss
        loss.backward()
        model.clip_grad_norm_(max_norm=1.0)
        optimizer.step()

B.7 Mixed Precision Training (AMP)

Automatic mixed precision uses float16 (or bfloat16) for most operations and float32 for numerically sensitive operations (e.g., loss computation, softmax).

from torch.amp import autocast, GradScaler

scaler = GradScaler("cuda")

for features, labels in train_loader:
    features, labels = features.to(device), labels.to(device)
    optimizer.zero_grad(set_to_none=True)

    with autocast("cuda"):               # forward pass in mixed precision
        logits = model(features)
        loss = criterion(logits, labels)  # loss computed in float32

    scaler.scale(loss).backward()         # scale loss to prevent underflow
    scaler.unscale_(optimizer)            # unscale before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)                # optimizer step with unscaled grads
    scaler.update()                       # adjust scale factor

Using bfloat16 (no scaler needed on Ampere+ GPUs):

# bfloat16 has the same exponent range as float32, so gradient scaling
# is typically unnecessary.
with autocast("cuda", dtype=torch.bfloat16):
    logits = model(features)
    loss = criterion(logits, labels)

loss.backward()
optimizer.step()

B.8 Model Saving and Loading

# Save
torch.save(model.state_dict(), "model_weights.pt")

# Load
model = MLP(784, 256, 10)
model.load_state_dict(torch.load("model_weights.pt", weights_only=True))
model.eval()

Save Full Checkpoint (for Resuming Training)

checkpoint = {
    "epoch": epoch,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
    "best_val_loss": best_val_loss,
    "scaler_state_dict": scaler.state_dict(),   # if using AMP
}
torch.save(checkpoint, "checkpoint_epoch_50.pt")

# Resume
checkpoint = torch.load("checkpoint_epoch_50.pt", weights_only=False)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
scaler.load_state_dict(checkpoint["scaler_state_dict"])
start_epoch = checkpoint["epoch"] + 1
best_val_loss = checkpoint["best_val_loss"]

Save DDP Model

# DDP wraps model in .module — save the inner model
if rank == 0:  # only save on rank 0
    torch.save(model.module.state_dict(), "model_ddp.pt")

Export for Inference (TorchScript)

# Tracing (works for models without control flow)
example_input = torch.randn(1, 784).to(device)
traced = torch.jit.trace(model, example_input)
traced.save("model_traced.pt")

# Load in production (no Python dependency needed for C++ runtime)
loaded = torch.jit.load("model_traced.pt")
output = loaded(example_input)

Export to ONNX

example_input = torch.randn(1, 784).to(device)
torch.onnx.export(
    model,
    example_input,
    "model.onnx",
    input_names=["input"],
    output_names=["logits"],
    dynamic_axes={"input": {0: "batch_size"}, "logits": {0: "batch_size"}},
    opset_version=17,
)

B.9 Debugging Common Issues

NaN Gradients

NaN gradients typically originate from numerical instability: division by zero, log of zero, or exploding values.

# Detect NaN in forward pass
torch.autograd.set_detect_anomaly(True)   # enables anomaly detection

# Pinpoint the first NaN
for name, param in model.named_parameters():
    if param.grad is not None and torch.isnan(param.grad).any():
        print(f"NaN gradient in: {name}")

# Common fixes
# 1. Add epsilon to denominators
safe_div = numerator / (denominator + 1e-8)

# 2. Clamp log inputs
safe_log = torch.log(torch.clamp(prob, min=1e-7))

# 3. Use numerically stable loss
loss = F.cross_entropy(logits, labels)   # internally applies log_softmax
# NOT: loss = -torch.log(F.softmax(logits, dim=1))  # numerically unstable

# 4. Gradient clipping (see Section B.3)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

Important: Disable anomaly detection for production training, as it significantly slows down computation.

Memory Leaks

# Symptom: GPU memory grows each iteration

# Cause 1: Accumulating loss tensors on the computation graph
# WRONG:
total_loss += loss               # keeps entire graph in memory
# CORRECT:
total_loss += loss.item()        # .item() extracts Python scalar

# Cause 2: Storing tensors with gradient history in lists
# WRONG:
all_outputs.append(output)       # holds references to graph
# CORRECT:
all_outputs.append(output.detach().cpu())

# Cause 3: Forgetting no_grad during evaluation
# WRONG:
model.eval()
output = model(x)               # still builds computation graph
# CORRECT:
model.eval()
with torch.no_grad():
    output = model(x)

# Monitor GPU memory
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")
torch.cuda.memory_summary()     # detailed breakdown

# Force garbage collection
import gc
gc.collect()
torch.cuda.empty_cache()        # releases cached memory back to CUDA

CUDA Errors

# RuntimeError: CUDA out of memory
# Solutions (in order of preference):
# 1. Reduce batch size
# 2. Use gradient accumulation
accumulation_steps = 4
for i, (features, labels) in enumerate(loader):
    features, labels = features.to(device), labels.to(device)
    with autocast("cuda"):
        loss = criterion(model(features), labels) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

# 3. Use gradient checkpointing (trades compute for memory)
from torch.utils.checkpoint import checkpoint

class MemoryEfficientBlock(nn.Module):
    """Activation-checkpointed wrapper around an arbitrary layer.

    Intermediate activations inside ``layer`` are not stored during the
    forward pass; they are recomputed during backward, trading extra
    compute for lower peak memory.
    """

    def __init__(self, layer):
        super().__init__()
        self.layer = layer

    def forward(self, x):
        # use_reentrant=False selects the recommended autograd-based
        # checkpointing implementation.
        return checkpoint(self.layer, x, use_reentrant=False)

# 4. Move to mixed precision (Section B.7)
# 5. Move to FSDP (Section B.6)

# RuntimeError: CUDA error: device-side assert triggered
# This usually means an index is out of bounds.
# Debug by running with CUDA_LAUNCH_BLOCKING=1:
#   CUDA_LAUNCH_BLOCKING=1 python train.py
# Common causes:
# - Label index >= num_classes in CrossEntropyLoss
# - Negative indices in embedding lookup

# RuntimeError: Expected all tensors to be on the same device
# Fix: ensure model and data are on the same device
model = model.to(device)
x = x.to(device)
y = y.to(device)

Shape Mismatches

# RuntimeError: mat1 and mat2 shapes cannot be multiplied
# Debug with explicit shape printing:
class DebugModel(nn.Module):
    """Pattern: print tensor shapes at each stage of forward to locate a
    shape mismatch.

    NOTE(review): ``self.encoder`` and ``self.fc`` must be defined in an
    ``__init__`` (omitted here) — this class only illustrates the pattern.
    """
    def forward(self, x):
        print(f"Input: {x.shape}")
        x = self.encoder(x)
        print(f"After encoder: {x.shape}")
        x = x.flatten(start_dim=1)  # collapse everything after the batch dim
        print(f"After flatten: {x.shape}")
        x = self.fc(x)
        print(f"After fc: {x.shape}")
        return x

# Common pattern: conv output size calculation
# out_size = floor((in_size + 2*padding - kernel_size) / stride) + 1
# Use a dummy forward pass to determine sizes:
with torch.no_grad():
    dummy = torch.randn(1, 3, 224, 224)
    dummy = conv_layers(dummy)
    print(f"Conv output: {dummy.shape}")
    flat_size = dummy.numel()  # use this for first Linear layer

Reproducibility

import random
import numpy as np

def seed_everything(seed: int = 42):
    """Seed every RNG in play: Python, NumPy, PyTorch CPU, and all GPUs,
    and configure cuDNN for deterministic kernel selection."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    # Deterministic cuDNN: disables algorithm autotuning (may be slower).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

# Note: full determinism requires setting environment variable:
#   CUBLAS_WORKSPACE_CONFIG=:4096:8
# and calling:
torch.use_deterministic_algorithms(True)

B.10 Useful Utilities

Learning Rate Schedulers

# Cosine annealing
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=100, eta_min=1e-6
)

# Cosine annealing with warm restarts
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=10, T_mult=2
)

# Linear warmup + cosine decay (common in transformers)
from torch.optim.lr_scheduler import LambdaLR

def warmup_cosine_schedule(step: int, warmup_steps: int = 1000,
                            total_steps: int = 100000) -> float:
    """LR multiplier for LambdaLR: linear warmup to 1.0, then cosine
    decay to 0.0 at ``total_steps``.

    Args:
        step: Current optimizer step.
        warmup_steps: Steps over which the factor ramps linearly 0 -> 1.
        total_steps: Step at which the cosine decay reaches 0.

    Returns:
        A factor in [0, 1] that scales the optimizer's base learning rate.
    """
    import math  # local import keeps this snippet self-contained

    if step < warmup_steps:
        return step / warmup_steps
    # Clamp progress to 1.0 so the factor stays at 0 once total_steps is
    # passed — the original formula let cos() come back up, making the LR
    # rise again.  max(1, ...) also guards total_steps == warmup_steps.
    progress = min(1.0, (step - warmup_steps) / max(1, total_steps - warmup_steps))
    return 0.5 * (1.0 + math.cos(math.pi * progress))

scheduler = LambdaLR(optimizer, lr_lambda=warmup_cosine_schedule)

# ReduceLROnPlateau (adaptive)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=10
)
# Note: call scheduler.step(val_loss) after validation

torch.compile (PyTorch 2.x)

# Compiles the model into optimized kernels
model = torch.compile(model)

# Compile with specific backend
model = torch.compile(model, backend="inductor")     # default, best for CUDA
model = torch.compile(model, mode="reduce-overhead") # optimizes for small batches
model = torch.compile(model, mode="max-autotune")    # slower compile, faster run

Profiling

from torch.profiler import profile, ProfilerActivity, schedule, tensorboard_trace_handler

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=tensorboard_trace_handler("./profiler_logs"),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    for step, (features, labels) in enumerate(train_loader):
        if step >= 5:
            break
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad(set_to_none=True)
        loss = criterion(model(features), labels)
        loss.backward()
        optimizer.step()
        prof.step()

# View results: tensorboard --logdir=./profiler_logs

For the most current PyTorch API documentation, see pytorch.org/docs/stable. The patterns in this appendix target PyTorch 2.x; syntax for earlier versions may differ.