Appendix C: Python Reference for AI Engineering
This appendix provides a quick reference for the Python libraries most heavily used in this textbook: NumPy, PyTorch, and HuggingFace Transformers. It is designed as a lookup resource, not a tutorial. For conceptual explanations, refer to the relevant chapters.
C.1 NumPy Quick Reference
NumPy is the foundation of numerical computing in Python. Nearly every ML library builds on its array abstraction.
C.1.1 Array Creation
import numpy as np
# From Python lists
a = np.array([1, 2, 3]) # 1D array
A = np.array([[1, 2], [3, 4]]) # 2D array (matrix)
# Common constructors
np.zeros((3, 4)) # 3x4 matrix of zeros
np.ones((2, 3)) # 2x3 matrix of ones
np.full((3, 3), 7.0) # 3x3 matrix filled with 7.0
np.eye(4) # 4x4 identity matrix
np.diag([1, 2, 3]) # 3x3 diagonal matrix
# Ranges and spacing
np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
np.linspace(0, 1, 5) # [0.0, 0.25, 0.5, 0.75, 1.0]
np.logspace(-3, 3, 7) # 7 points from 10^-3 to 10^3
# Random arrays
rng = np.random.default_rng(seed=42) # Modern random generator
rng.standard_normal((3, 4)) # Standard normal samples
rng.uniform(0, 1, size=(2, 3)) # Uniform [0, 1)
rng.integers(0, 10, size=(5,)) # Random integers in [0, 10)
rng.choice([1, 2, 3, 4], size=2, replace=False) # Sampling without replacement
C.1.2 Array Properties and Reshaping
a = np.zeros((3, 4, 5))
a.shape # (3, 4, 5)
a.ndim # 3
a.size # 60
a.dtype # float64
# Reshaping
a.reshape(12, 5) # New shape, same data
a.reshape(-1, 5) # Infer first dimension: (12, 5)
a.flatten() # 1D copy
a.ravel() # 1D view (no copy if possible)
a.T # Transpose (reverses all axes)
a.transpose(2, 0, 1) # Custom axis permutation
np.expand_dims(a, axis=0) # Add dimension: (1, 3, 4, 5)
np.squeeze(a) # Remove dimensions of size 1
C.1.3 Indexing and Slicing
A = np.arange(12).reshape(3, 4)
# A = [[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]]
A[0, 1] # 1 (scalar)
A[1, :] # [4, 5, 6, 7] (row 1)
A[:, 2] # [2, 6, 10] (column 2)
A[0:2, 1:3] # [[1, 2], [5, 6]] (submatrix)
A[A > 5] # [6, 7, 8, 9, 10, 11] (boolean indexing)
A[[0, 2], :] # Rows 0 and 2 (fancy indexing)
C.1.4 Mathematical Operations
a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
# Element-wise operations
a + b # [5, 7, 9]
a * b # [4, 10, 18]
a ** 2 # [1, 4, 9]
np.exp(a) # Element-wise exponential
np.log(a) # Element-wise natural log
np.sqrt(a) # Element-wise square root
np.clip(a, 1.5, 2.5) # Clamp values to [1.5, 2.5]
# Dot products and matrix multiplication
np.dot(a, b) # 32.0 (dot product)
A @ B # Matrix multiplication (Python 3.5+)
np.matmul(A, B) # Equivalent to @
# Reductions
np.sum(A, axis=0) # Column sums
np.sum(A, axis=1) # Row sums
np.mean(A, axis=0) # Column means
np.std(A, axis=1) # Row standard deviations
np.max(A, axis=1) # Row maxima
np.argmax(A, axis=1) # Indices of row maxima
np.cumsum(a) # Cumulative sum
# Linear algebra (inv, det, eig, and solve below require a square matrix)
np.linalg.norm(a) # L2 norm
np.linalg.norm(a, ord=1) # L1 norm
np.linalg.inv(A) # Matrix inverse
np.linalg.det(A) # Determinant
np.linalg.eig(A) # Eigenvalues and eigenvectors
np.linalg.svd(A) # Singular value decomposition
np.linalg.solve(A, b) # Solve Ax = b
C.1.5 Broadcasting
Broadcasting allows operations on arrays of different shapes. The rules:
1. Shapes are compared from the trailing (rightmost) dimension.
2. Two dimensions are compatible if they are equal or if one of them is 1.
3. Missing leading dimensions are treated as 1.
A = np.ones((3, 4)) # Shape: (3, 4)
b = np.array([1, 2, 3, 4]) # Shape: (4,) -> broadcast to (3, 4)
A + b # Works: each row of A gets b added
c = np.array([[10], [20], [30]]) # Shape: (3, 1) -> broadcast to (3, 4)
A + c # Works: each column gets c added
# Common pattern: normalize rows
row_means = A.mean(axis=1, keepdims=True) # Shape: (3, 1)
A_normalized = A - row_means # Broadcasting subtracts row means
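Broadcasting also covers pairwise computations by inserting size-1 axes. A minimal sketch (the arrays X and Y below are illustrative and not part of the examples above):
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 3))        # 5 points in 3 dimensions
Y = rng.standard_normal((4, 3))        # 4 points in 3 dimensions
diff = X[:, None, :] - Y[None, :, :]   # (5, 1, 3) - (1, 4, 3) -> (5, 4, 3)
sq_dists = (diff ** 2).sum(axis=-1)    # (5, 4) pairwise squared distances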
C.2 PyTorch Quick Reference
PyTorch is the primary deep learning framework used in this textbook.
C.2.1 Tensors
import torch
# Creation (similar to NumPy)
x = torch.tensor([1.0, 2.0, 3.0])
x = torch.zeros(3, 4)
x = torch.ones(2, 3)
x = torch.randn(3, 4) # Standard normal
x = torch.rand(3, 4) # Uniform [0, 1)
x = torch.arange(0, 10, 2)
x = torch.linspace(0, 1, 5)
x = torch.eye(4)
# Device management
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.randn(3, 4, device=device)
x = x.to(device)
x = x.cpu() # Move to CPU
x = x.cuda() # Move to GPU (if available)
# Dtype management
x = torch.randn(3, 4, dtype=torch.float32)
x = x.half() # Convert to float16
x = x.to(torch.bfloat16) # Convert to bfloat16
# Conversion to/from NumPy
numpy_array = x.cpu().numpy() # Tensor -> NumPy
tensor = torch.from_numpy(numpy_array) # NumPy -> Tensor
C.2.2 Autograd (Automatic Differentiation)
# Requires grad tracking
x = torch.randn(3, requires_grad=True)
y = (x ** 2).sum()
y.backward() # Compute gradients
print(x.grad) # dy/dx = 2x
# Disabling gradient tracking
with torch.no_grad():
    predictions = model(inputs)  # Inference mode, no grad computation
# Detaching from computation graph
z = x.detach() # z shares data but has no grad_fn
# Zeroing gradients (critical in training loops)
optimizer.zero_grad()
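A minimal sketch of a hand-rolled gradient step using autograd (w, x, and y below are illustrative tensors, not part of any earlier example):
w = torch.randn(3, requires_grad=True)
x = torch.randn(10, 3)
y = torch.randn(10)
loss = ((x @ w - y) ** 2).mean()
loss.backward()                  # Populates w.grad
with torch.no_grad():
    w -= 0.1 * w.grad            # Update the parameter outside the graph
w.grad.zero_()                   # Clear the gradient before the next step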
C.2.3 Building Models with nn.Module
import torch.nn as nn
import torch.nn.functional as F
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))  # Learned positions, max length 512
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.classifier = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids) + self.pos_encoding[:, :input_ids.size(1), :]
        x = self.dropout(x)
        # src_key_padding_mask expects True at positions to ignore
        padding_mask = ~attention_mask.bool() if attention_mask is not None else None
        x = self.transformer(x, src_key_padding_mask=padding_mask)
        x = x[:, 0, :]  # Use the first token as a [CLS]-style summary
        return self.classifier(x)
# Inspect model
model = TransformerClassifier(30000, 256, 8, 4, 3)
print(model)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
C.2.4 Common Layers Reference
# Linear (fully connected)
nn.Linear(in_features, out_features, bias=True)
# Convolution
nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0)
# Recurrent
nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=False)
nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
# Normalization
nn.LayerNorm(normalized_shape)
nn.BatchNorm1d(num_features)
nn.RMSNorm(normalized_shape) # PyTorch 2.4+
# Activation
nn.ReLU()
nn.GELU()
nn.SiLU() # Swish activation
# Regularization
nn.Dropout(p=0.1)
# Embedding
nn.Embedding(num_embeddings, embedding_dim, padding_idx=None)
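A small sketch of composing these layers with nn.Sequential (sizes are arbitrary; assumes the torch and nn imports above):
mlp = nn.Sequential(
    nn.Linear(256, 1024),
    nn.GELU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, 256),
)
out = mlp(torch.randn(8, 256))   # (batch=8, 256) -> (8, 256)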
C.2.5 Loss Functions
# Classification
nn.CrossEntropyLoss() # Combines LogSoftmax + NLLLoss; input: logits
nn.BCEWithLogitsLoss() # Binary cross-entropy; input: logits
nn.NLLLoss() # Negative log-likelihood; input: log-probabilities
# Regression
nn.MSELoss() # Mean squared error
nn.L1Loss() # Mean absolute error
nn.SmoothL1Loss() # Huber loss
# Contrastive / ranking
nn.CosineEmbeddingLoss()
nn.TripletMarginLoss()
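A quick sketch of the inputs nn.CrossEntropyLoss expects: raw logits and integer class indices, not probabilities or one-hot vectors (the tensors are illustrative):
criterion = nn.CrossEntropyLoss()
logits = torch.randn(4, 10)            # (batch, num_classes), unnormalized scores
targets = torch.tensor([1, 0, 9, 3])   # (batch,), class indices
loss = criterion(logits, targets)      # Scalar tensor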
C.2.6 Optimizers and Schedulers
from torch.optim import Adam, AdamW, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR, LinearLR
# AdamW (most common for transformers)
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
# Learning rate schedulers
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
scheduler = OneCycleLR(optimizer, max_lr=1e-3, total_steps=total_steps)
# Warmup + decay (common for LLM fine-tuning)
warmup = LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)
decay = CosineAnnealingLR(optimizer, T_max=total_steps - warmup_steps)
scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer, schedulers=[warmup, decay], milestones=[warmup_steps]
)
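A common refinement is excluding biases and normalization weights from weight decay. A sketch, assuming a model as above (the name-matching rule is illustrative, not a library convention):
decay, no_decay = [], []
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    if name.endswith("bias") or "norm" in name.lower():
        no_decay.append(param)
    else:
        decay.append(param)
optimizer = AdamW(
    [{"params": decay, "weight_decay": 0.01},
     {"params": no_decay, "weight_decay": 0.0}],
    lr=3e-5,
)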
C.2.7 Data Loading
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
# DataLoader
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,  # Faster GPU transfer
    drop_last=True,   # Drop incomplete last batch
)
C.2.8 Training Loop Pattern
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
C.2.9 Mixed Precision Training
from torch.amp import autocast, GradScaler
scaler = GradScaler("cuda")
model.train()
for batch in train_loader:
    optimizer.zero_grad()
    with autocast("cuda"):
        outputs = model(batch["input_ids"].to(device))
        loss = criterion(outputs, batch["labels"].to(device))
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()
C.2.10 Model Saving and Loading
# Save full checkpoint
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "epoch": epoch,
    "loss": loss,
}, "checkpoint.pt")
# Load checkpoint
checkpoint = torch.load("checkpoint.pt", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
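For inference-only deployment, saving just the model weights is often enough. A minimal sketch:
torch.save(model.state_dict(), "model_weights.pt")
model.load_state_dict(torch.load("model_weights.pt", map_location=device))
model.eval()  # Disable dropout and switch norm layers to inference behavior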
C.3 HuggingFace Transformers Quick Reference
The HuggingFace ecosystem is the standard toolkit for working with pre-trained language models.
C.3.1 Pipeline API (Fastest Start)
from transformers import pipeline
# Text classification
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
result = classifier("This movie is wonderful!")
# [{'label': 'POSITIVE', 'score': 0.9998}]
# Text generation
generator = pipeline("text-generation", model="gpt2")
output = generator("The future of AI is", max_new_tokens=50, do_sample=True, temperature=0.7)
# Question answering
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
answer = qa(question="What is the capital of France?", context="France is a country in Europe. Its capital is Paris.")
# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(long_text, max_length=150, min_length=40)
# Embeddings (feature extraction)
embedder = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedder("Hello world")
# Zero-shot classification
zsc = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = zsc("I love playing tennis", candidate_labels=["sports", "cooking", "politics"])
C.3.2 AutoModel and Tokenizer
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
# Tokenization
inputs = tokenizer(
    "Hello, world!",
    return_tensors="pt",  # "pt" for PyTorch, "tf" for TensorFlow
    padding=True,
    truncation=True,
    max_length=512,
)
# inputs contains: input_ids, attention_mask, (token_type_ids for BERT)
# Forward pass
with torch.no_grad():
    outputs = model(**inputs)
last_hidden = outputs.last_hidden_state # (batch, seq_len, hidden_dim)
pooler = outputs.pooler_output # (batch, hidden_dim), BERT only
# Causal LM (text generation)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
inputs = tokenizer("The meaning of life is", return_tensors="pt")
output_ids = model.generate(
**inputs,
max_new_tokens=100,
temperature=0.7,
top_p=0.9,
do_sample=True,
repetition_penalty=1.1,
)
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
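For sentence embeddings, mean pooling over non-padding tokens is a common recipe. A sketch assuming `inputs` and `outputs` from the BERT encoder example earlier in this subsection (before the causal-LM block):
mask = inputs["attention_mask"].unsqueeze(-1).float()    # (batch, seq_len, 1)
summed = (outputs.last_hidden_state * mask).sum(dim=1)   # Zero out padding positions
counts = mask.sum(dim=1).clamp(min=1e-9)
sentence_embeddings = summed / counts                    # (batch, hidden_dim)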
C.3.3 Tokenizer Deep Dive
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Encode
token_ids = tokenizer.encode("Hello world") # List of int IDs
tokens = tokenizer.tokenize("Hello world") # List of string tokens
# ['hello', 'world']
# Decode
text = tokenizer.decode(token_ids, skip_special_tokens=True)
# Batch encoding with padding
batch = tokenizer(
    ["Short text.", "This is a much longer piece of text that needs padding."],
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
# batch["input_ids"].shape: (2, max_len_in_batch)
# batch["attention_mask"]: 1 for real tokens, 0 for padding
# Special tokens
tokenizer.cls_token # '[CLS]'
tokenizer.sep_token # '[SEP]'
tokenizer.pad_token # '[PAD]'
tokenizer.unk_token # '[UNK]'
tokenizer.cls_token_id # 101
tokenizer.vocab_size # 30522
# Adding custom tokens
tokenizer.add_tokens(["<CUSTOM>"])
model.resize_token_embeddings(len(tokenizer))
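For chat-tuned models whose tokenizers define a chat template (the BERT tokenizer above does not), prompts are usually built with apply_chat_template. A sketch, assuming such a tokenizer; the messages are illustrative:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize attention in one sentence."},
]
prompt_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)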
C.3.4 Fine-Tuning with Trainer API
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import evaluate
# Load dataset
dataset = load_dataset("glue", "sst2")
# Tokenize
def preprocess(examples):
    return tokenizer(examples["sentence"], truncation=True, max_length=128)
tokenized = dataset.map(preprocess, batched=True)
# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)
# Define metrics
accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    dataloader_num_workers=4,
    report_to="wandb",  # or "tensorboard", "none"
)
# Create Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()
trainer.evaluate()
trainer.save_model("./final_model")
C.3.5 Parameter-Efficient Fine-Tuning (PEFT / LoRA)
from peft import LoraConfig, get_peft_model, TaskType
# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,               # Rank
    lora_alpha=32,      # Scaling factor
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
)
# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622
# Save and load LoRA adapters (only saves the adapter weights)
model.save_pretrained("./lora_adapter")
# To load:
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
model = PeftModel.from_pretrained(base_model, "./lora_adapter")
model = model.merge_and_unload() # Merge LoRA weights into base model
C.3.6 Quantized Model Loading
from transformers import BitsAndBytesConfig
# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B",
    quantization_config=bnb_config,
    device_map="auto",  # Automatically distribute across GPUs
)
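To fine-tune on top of a 4-bit base model (QLoRA-style), the PEFT library provides a preparation helper. A sketch that reuses the lora_config from C.3.5:
from peft import prepare_model_for_kbit_training, get_peft_model

model = prepare_model_for_kbit_training(model)  # Prepares norms/inputs for training on a quantized base
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()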
C.4 Common Patterns and Idioms
C.4.1 Reproducibility
import random
import numpy as np
import torch
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
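DataLoader shuffling and worker processes carry their own randomness. A sketch of seeding them as well, following the pattern from the PyTorch reproducibility notes (train_dataset is assumed to exist):
from torch.utils.data import DataLoader

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(42)
loader = DataLoader(
    train_dataset, batch_size=32, shuffle=True,
    worker_init_fn=seed_worker, generator=g,
)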
C.4.2 GPU Memory Management
# Check GPU memory
torch.cuda.memory_allocated() / 1e9 # GB currently allocated
torch.cuda.max_memory_allocated() / 1e9 # Peak GB allocated
# Free memory
del model
torch.cuda.empty_cache()
import gc; gc.collect()
# Gradient accumulation (simulate larger batch sizes)
accumulation_steps = 4
for i, batch in enumerate(train_loader):
    loss = model(**batch).loss / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Gradient checkpointing (trade compute for memory)
model.gradient_checkpointing_enable()
C.4.3 Distributed Training
# Launch: torchrun --nproc_per_node=4 train.py
import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
dist.init_process_group("nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
model = model.to(local_rank)
model = DDP(model, device_ids=[local_rank])
# With HuggingFace Accelerate (recommended)
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
for batch in train_loader:
    outputs = model(**batch)
    accelerator.backward(outputs.loss)
    optimizer.step()
    optimizer.zero_grad()
C.4.4 Inference Optimization
# Compile model (PyTorch 2.0+)
model = torch.compile(model, mode="reduce-overhead")
# Export to ONNX
torch.onnx.export(
    model, dummy_input, "model.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch", 1: "seq_len"}},
)
# Key-value cache for autoregressive generation
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    use_cache=True,  # Enable KV caching (default)
)
C.4.5 Configuration Management
from dataclasses import dataclass, field

@dataclass
class TrainConfig:
    model_name: str = "bert-base-uncased"
    learning_rate: float = 2e-5
    batch_size: int = 16
    num_epochs: int = 3
    max_length: int = 512
    seed: int = 42
    output_dir: str = "./output"
    fp16: bool = True
    gradient_accumulation_steps: int = 1
    warmup_ratio: float = 0.1
    weight_decay: float = 0.01
    logging_steps: int = 100
    eval_steps: int = 500
    tags: list[str] = field(default_factory=list)
# Parse from command line with HuggingFace
from transformers import HfArgumentParser
parser = HfArgumentParser(TrainConfig)
config = parser.parse_args_into_dataclasses()[0]
C.4.6 Evaluation Utilities
import evaluate
# Load multiple metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
# Combined evaluation
def compute_metrics(predictions, references):
    return {
        **accuracy.compute(predictions=predictions, references=references),
        **f1.compute(predictions=predictions, references=references, average="weighted"),
    }
# BLEU for generation tasks
bleu_score = bleu.compute(
    predictions=["the cat sat on mat"],
    references=[["the cat is on the mat"]],
)
# ROUGE for summarization
rouge_score = rouge.compute(
    predictions=["the cat sat on the mat"],
    references=["the cat is sitting on a mat"],
)
C.4.7 Logging and Experiment Tracking
# Weights & Biases
import wandb
wandb.init(project="my-project", config=config.__dict__)
wandb.log({"loss": loss.item(), "accuracy": acc, "epoch": epoch})
wandb.finish()
# TensorBoard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/experiment_1")
writer.add_scalar("Loss/train", loss.item(), global_step)
writer.add_scalar("Accuracy/val", accuracy, global_step)
writer.close()