Appendix C: Python Reference

This appendix provides a concise reference for the Python libraries and patterns used throughout this textbook. It is not a Python tutorial; readers are assumed to have basic proficiency. The focus is on the specific functions, idioms, and workflows that appear repeatedly in sports betting analysis.

All code assumes Python 3.9+ and the following imports unless otherwise noted:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss

C.1 Pandas Operations

C.1.1 Loading and Inspecting Data

# Load data from CSV
df = pd.read_csv('nfl_games_2023.csv', parse_dates=['game_date'])

# Load from multiple file types
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')
df_json = pd.read_json('odds_data.json')
df_sql = pd.read_sql('SELECT * FROM games', connection)

# Quick inspection
df.shape              # (rows, columns)
df.dtypes             # Column types
df.describe()         # Summary statistics
df.info()             # Non-null counts and memory usage
df.head(10)           # First 10 rows
df.sample(5)          # 5 random rows
df.isnull().sum()     # Missing values per column
df.nunique()          # Unique values per column

C.1.2 Selection and Filtering

# Column selection
df['spread']                      # Single column (Series)
df[['team', 'spread', 'total']]   # Multiple columns (DataFrame)

# Row filtering
df[df['spread'] < -3]                                  # Simple condition
df[(df['spread'] < -3) & (df['total'] > 45)]           # Multiple conditions
df[df['team'].isin(['KC', 'BUF', 'SF'])]               # In-list filter
df.query('spread < -3 and total > 45')                  # Query syntax

# loc and iloc
df.loc[df['team'] == 'KC', 'spread']        # Label-based
df.iloc[0:10, 2:5]                          # Position-based

C.1.3 Transformations and Feature Engineering

# New columns
df['cover'] = (df['score_diff'] + df['spread'] > 0).astype(int)
df['total_points'] = df['home_score'] + df['away_score']
df['over'] = (df['total_points'] > df['total_line']).astype(int)
df['implied_prob'] = 1 / df['decimal_odds']

# Apply functions
df['elo_diff'] = df.apply(lambda row: elo[row['home']] - elo[row['away']], axis=1)

# Rolling calculations (important for form/momentum features)
df['rolling_ppg'] = df.groupby('team')['points'].transform(
    lambda x: x.rolling(5, min_periods=1).mean()
)

# Lag features
df['prev_score'] = df.groupby('team')['points'].shift(1)

# Binning
df['spread_bucket'] = pd.cut(df['spread'], bins=[-30, -10, -3, 0, 3, 10, 30])

# Encoding categorical variables
df_encoded = pd.get_dummies(df, columns=['day_of_week', 'surface'], drop_first=True)

C.1.4 Grouping and Aggregation

# Basic groupby
df.groupby('team')['cover'].mean()                    # ATS win rate by team
df.groupby('season')['total_points'].agg(['mean', 'std', 'count'])
df.groupby(['season', 'team']).agg(
    wins=('result', 'sum'),
    games=('result', 'count'),
    avg_spread=('spread', 'mean')
).reset_index()

# Pivot tables
pd.pivot_table(df, values='cover', index='home_team',
               columns='season', aggfunc='mean')

# Cross-tabulation
pd.crosstab(df['favorite_cover'], df['over'], margins=True)

C.1.5 Merging and Joining

# Merge game data with odds data
merged = pd.merge(games, odds, on=['game_id'], how='inner')

# Merge with team-level features
merged = pd.merge(games, team_stats, left_on='home_team', right_on='team')

# Concatenate DataFrames
all_seasons = pd.concat([df_2021, df_2022, df_2023], ignore_index=True)

C.2 NumPy Operations

C.2.1 Array Creation and Manipulation

# Creation
a = np.array([1.5, 2.3, 4.1, 3.7])
zeros = np.zeros((100, 5))
ones = np.ones(50)
grid = np.linspace(0, 1, 101)           # 101 points from 0 to 1
steps = np.arange(0, 10, 0.5)           # 0, 0.5, 1, ..., 9.5

# Random number generation (preferred modern API)
rng = np.random.default_rng(seed=42)
samples = rng.normal(loc=0, scale=13.5, size=10000)   # NFL score diffs
bets = rng.binomial(n=1, p=0.53, size=1000)            # Simulated bet outcomes
poisson_goals = rng.poisson(lam=1.4, size=10000)        # Goal simulation

C.2.2 Mathematical Operations

# Element-wise operations
returns = np.log(1 + profit_pct)            # Log returns
growth = np.exp(np.cumsum(returns))          # Cumulative bankroll growth
kelly = (prob * odds - 1) / (odds - 1)      # Vectorized Kelly fraction

# Aggregations
np.mean(a), np.median(a), np.std(a, ddof=1)
np.percentile(a, [25, 50, 75])
np.corrcoef(x, y)                           # Correlation matrix
np.cov(x, y)                                # Covariance matrix

# Linear algebra
np.dot(w, x)                                # Dot product
np.linalg.inv(X.T @ X) @ X.T @ y           # OLS in matrix form
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

C.2.3 Simulation Patterns

# Monte Carlo simulation for bankroll paths (Chapter 8)
def simulate_bankroll(p, odds, kelly_frac, n_bets, n_sims, bankroll=1000):
    """Simulate flat-fraction betting paths under a fixed win probability.

    Stakes `kelly_frac` of the current roll on every bet at decimal odds
    `odds` with win probability `p`.  Returns an (n_sims, n_bets) array of
    bankroll values after each bet, starting from `bankroll`.
    """
    generator = np.random.default_rng(42)
    win_mult = 1 + kelly_frac * (odds - 1)   # roll multiplier on a win
    lose_mult = 1 - kelly_frac               # roll multiplier on a loss
    wins = generator.binomial(1, p, size=(n_sims, n_bets)).astype(bool)
    step = np.where(wins, win_mult, lose_mult)
    return bankroll * np.cumprod(step, axis=1)

# Poisson match simulation (Chapter 7)
def simulate_match(home_rate, away_rate, n_sims=100000):
    """Estimate 1X2 probabilities by simulating Poisson scorelines.

    Draws `n_sims` independent (home, away) goal counts with the given
    expectations and returns (home_win, draw, away_win) as Monte Carlo
    frequencies.
    """
    gen = np.random.default_rng(42)
    h_goals = gen.poisson(home_rate, n_sims)
    a_goals = gen.poisson(away_rate, n_sims)
    margin = h_goals - a_goals
    p_home = np.count_nonzero(margin > 0) / n_sims
    p_draw = np.count_nonzero(margin == 0) / n_sims
    p_away = np.count_nonzero(margin < 0) / n_sims
    return p_home, p_draw, p_away

C.3 SciPy Statistics Functions

C.3.1 Distribution Objects

from scipy import stats

# Normal distribution
norm = stats.norm(loc=0, scale=13.5)      # NFL point spread distribution
norm.pdf(3)                                # Density at x=3
norm.cdf(0)                                # P(X <= 0)
norm.sf(7)                                 # P(X > 7) = 1 - cdf(7)
norm.ppf(0.95)                             # 95th percentile (inverse CDF)
norm.rvs(size=1000, random_state=42)       # Random samples

# Poisson distribution
pois = stats.poisson(mu=2.5)
pois.pmf(3)                                # P(X = 3)
pois.cdf(2)                                # P(X <= 2)

# Beta distribution (for Bayesian win probability)
prior = stats.beta(a=2, b=2)
posterior = stats.beta(a=2 + wins, b=2 + losses)
posterior.mean()
posterior.interval(0.95)                    # 95% credible interval

# Student's t
t_dist = stats.t(df=29)
t_dist.ppf(0.975)                          # Critical value for two-tailed test

C.3.2 Hypothesis Tests

# One-sample t-test: Is mean ROI different from 0?
t_stat, p_value = stats.ttest_1samp(roi_values, popmean=0)

# Two-sample t-test: Does model A outperform model B?
t_stat, p_value = stats.ttest_ind(returns_A, returns_B)

# Paired t-test: Same games, different models
t_stat, p_value = stats.ttest_rel(preds_A, preds_B)

# Chi-squared goodness of fit (calibration test)
chi2, p_value = stats.chisquare(f_obs=observed, f_exp=expected)

# Proportions z-test
from statsmodels.stats.proportion import proportions_ztest
z_stat, p_value = proportions_ztest(count=wins, nobs=n_bets, value=0.5238)

# Kolmogorov-Smirnov test (distribution comparison)
ks_stat, p_value = stats.ks_2samp(sample1, sample2)

# Mann-Whitney U test (non-parametric comparison)
u_stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')

C.3.3 Regression and Correlation

# Simple linear regression
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

# Spearman rank correlation
rho, p_value = stats.spearmanr(predicted_spread, actual_spread)

# Pearson correlation
r, p_value = stats.pearsonr(clv, roi)

C.4 Matplotlib and Seaborn Plotting

C.4.1 Essential Plot Types

# Figure setup
plt.style.use('seaborn-v0_8-whitegrid')   # set the style BEFORE creating the figure
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of point spreads
axes[0].hist(df['spread'], bins=30, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Point Spread')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of NFL Point Spreads')

# Scatter plot with regression line
sns.regplot(x='predicted_prob', y='actual_outcome', data=df,
            logistic=True, scatter_kws={'alpha': 0.3}, ax=axes[1])
axes[1].set_xlabel('Predicted Probability')
axes[1].set_ylabel('Actual Outcome')
plt.tight_layout()
plt.savefig('figures/spread_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

C.4.2 Betting-Specific Visualizations

# Calibration plot (Chapter 17)
from sklearn.calibration import calibration_curve
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10)
plt.plot(prob_pred, prob_true, 's-', label='Model')
plt.plot([0, 1], [0, 1], '--', color='gray', label='Perfect')
plt.xlabel('Predicted Probability')
plt.ylabel('Observed Frequency')
plt.legend()
plt.title('Calibration Plot')

# Bankroll trajectory plot
for i in range(min(50, paths.shape[0])):
    plt.plot(paths[i], alpha=0.15, color='steelblue')
plt.plot(np.median(paths, axis=0), color='darkblue', linewidth=2, label='Median')
plt.xlabel('Bet Number')
plt.ylabel('Bankroll ($)')
plt.yscale('log')
plt.title('Simulated Bankroll Trajectories (Kelly Criterion)')
plt.legend()

# ROC curve
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_true, y_prob)
plt.plot(fpr, tpr, label=f'AUC = {auc(fpr, tpr):.3f}')
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()

# Heatmap of confusion matrix or correlation matrix
sns.heatmap(df[features].corr(), annot=True, fmt='.2f',
            cmap='RdBu_r', center=0, vmin=-1, vmax=1)
plt.title('Feature Correlation Matrix')

C.4.3 Seaborn Specialized Plots

# Box plots for spread performance by team
sns.boxplot(x='team', y='margin', data=df, order=team_order)
plt.xticks(rotation=90)

# Violin plot for score distributions
sns.violinplot(x='surface', y='total_points', data=df, inner='quartile')

# Pair plot for feature exploration
sns.pairplot(df[['spread', 'total', 'elo_diff', 'rest_days', 'cover']],
             hue='cover', diag_kind='kde')

# Joint plot for bivariate analysis
sns.jointplot(x='closing_spread', y='actual_margin', data=df,
              kind='reg', height=7)

C.5 Scikit-Learn Common Patterns

C.5.1 Data Preparation

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# IMPORTANT: For betting, use temporal splits, not random splits
df_sorted = df.sort_values('game_date')
train = df_sorted[df_sorted['season'] < 2023]
test = df_sorted[df_sorted['season'] == 2023]

X_train, y_train = train[features], train['cover']
X_test, y_test = test[features], test['cover']

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)       # Use train statistics!

# Time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
for train_idx, val_idx in tscv.split(X):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

C.5.2 Model Training and Evaluation

from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, log_loss, brier_score_loss,
                             classification_report, roc_auc_score)

# Logistic Regression (the workhorse model)
lr = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', max_iter=1000)
lr.fit(X_train_scaled, y_train)
probs = lr.predict_proba(X_test_scaled)[:, 1]

# Random Forest
rf = RandomForestClassifier(n_estimators=500, max_depth=6,
                            min_samples_leaf=20, random_state=42)
rf.fit(X_train, y_train)

# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05,
                                 max_depth=4, subsample=0.8, random_state=42)
gb.fit(X_train, y_train)

# XGBoost (if installed)
import xgboost as xgb
xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.05,
                                max_depth=4, subsample=0.8,
                                colsample_bytree=0.8, random_state=42)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
              verbose=False)

# Evaluation
print(f"Accuracy:    {accuracy_score(y_test, probs > 0.5):.4f}")
print(f"Log Loss:    {log_loss(y_test, probs):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, probs):.4f}")
print(f"ROC AUC:     {roc_auc_score(y_test, probs):.4f}")

C.5.3 Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['saga']
}

grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid,
                    cv=TimeSeriesSplit(5), scoring='neg_log_loss',
                    n_jobs=-1, verbose=1)
grid.fit(X_train_scaled, y_train)
print(f"Best params: {grid.best_params_}")
print(f"Best score:  {grid.best_score_:.4f}")
best_model = grid.best_estimator_

C.5.4 Feature Importance

# Logistic regression coefficients
coef_df = pd.DataFrame({'feature': features, 'coef': lr.coef_[0]})
coef_df['abs_coef'] = coef_df['coef'].abs()
coef_df.sort_values('abs_coef', ascending=False, inplace=True)

# Random forest feature importance
imp_df = pd.DataFrame({'feature': features,
                        'importance': rf.feature_importances_})
imp_df.sort_values('importance', ascending=False, inplace=True)

# Permutation importance (model-agnostic)
from sklearn.inspection import permutation_importance
perm_imp = permutation_importance(rf, X_test, y_test,
                                   n_repeats=10, random_state=42)

C.6 PyTorch Basics

C.6.1 Tensor Operations

import torch
import torch.nn as nn
import torch.optim as optim

# Tensor creation
x = torch.tensor([1.0, 2.0, 3.0])
X = torch.from_numpy(X_train_scaled).float()
y = torch.from_numpy(y_train.values).float().unsqueeze(1)

# Move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = X.to(device)
y = y.to(device)

C.6.2 Neural Network for Game Prediction

class BettingNet(nn.Module):
    """Feed-forward binary classifier emitting a win probability in (0, 1).

    Architecture: for each width in `hidden_dims`, a block of
    Linear -> ReLU -> BatchNorm1d -> Dropout(0.3); then a single-unit
    Linear head squashed by a sigmoid.

    Args:
        input_dim: number of input features.
        hidden_dims: widths of the hidden layers; defaults to (64, 32).
            The default is a None sentinel rather than a mutable list to
            avoid the shared-mutable-default pitfall.
    """

    def __init__(self, input_dim, hidden_dims=None):
        super().__init__()
        if hidden_dims is None:
            hidden_dims = (64, 32)
        layers = []
        prev_dim = input_dim
        for h in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h),
                nn.ReLU(),
                nn.BatchNorm1d(h),
                nn.Dropout(0.3),
            ])
            prev_dim = h
        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return P(positive class) with shape (batch, 1)."""
        return self.network(x)

# Initialize
model = BettingNet(input_dim=X.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

C.6.3 Training Loop

from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

model.train()
for epoch in range(100):
    epoch_loss = 0.0
    for batch_X, batch_y in loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * batch_X.size(0)
    epoch_loss /= len(dataset)
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1:3d}, Loss: {epoch_loss:.4f}")

# Evaluation
model.eval()
with torch.no_grad():
    X_test_t = torch.from_numpy(X_test_scaled).float().to(device)
    test_probs = model(X_test_t).cpu().numpy().flatten()

C.6.4 Early Stopping Pattern

best_val_loss = float('inf')
patience = 10
patience_counter = 0
best_state = None

for epoch in range(500):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(batch_X), batch_y)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_loss = criterion(model(X_val_t), y_val_t).item()

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # .copy() would be shallow -- its tensors alias the live parameters,
        # which later training steps keep mutating.  Clone the values instead.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

model.load_state_dict(best_state)

C.7 Common Idioms Used Throughout the Book

C.7.1 Odds Conversion Utilities

def american_to_decimal(american):
    """Convert American odds to decimal odds.

    Positive odds (e.g. +150) quote profit per $100 staked; negative odds
    (e.g. -120) quote the stake required to win $100.  Valid American
    odds satisfy |odds| >= 100.

    Raises:
        ValueError: if `american` lies strictly between -100 and 100
            (no such American price exists, and 0 would otherwise cause
            a confusing ZeroDivisionError).
    """
    if abs(american) < 100:
        raise ValueError(f"invalid American odds: {american}")
    if american > 0:
        return 1 + american / 100
    else:
        return 1 + 100 / abs(american)

def decimal_to_implied(decimal_odds):
    """Return the implied probability of a decimal price (1 / odds)."""
    return 1.0 / decimal_odds

def remove_vig(prob_a, prob_b):
    """Normalize a two-way market's implied probabilities to sum to 1.

    Bookmaker implied probabilities overround (sum past 1 by the vig);
    dividing each by their sum recovers the fair two-way probabilities.
    """
    overround = prob_a + prob_b
    return prob_a / overround, prob_b / overround

def implied_to_american(prob):
    """Convert implied probability to American odds.

    Probabilities of 0.5 and above map to negative (favorite) prices;
    probabilities below 0.5 map to positive (underdog) prices.

    Raises:
        ValueError: if `prob` is not strictly between 0 and 1 -- the
            conversion is undefined at the endpoints (it would otherwise
            raise ZeroDivisionError).
    """
    if not 0 < prob < 1:
        raise ValueError(f"implied probability must be in (0, 1), got {prob}")
    if prob >= 0.5:
        return -100 * prob / (1 - prob)
    else:
        return 100 * (1 - prob) / prob

C.7.2 Kelly Criterion Implementation

def kelly_fraction(prob, decimal_odds, kelly_multiplier=0.25):
    """Calculate the fractional Kelly bet size.

    Returns the fraction of bankroll to stake: `kelly_multiplier` times
    the full Kelly fraction edge / (odds - 1), or 0.0 when there is no
    positive edge.
    """
    net_odds = decimal_odds - 1
    expected_edge = prob * net_odds - (1 - prob)
    if expected_edge <= 0:
        # No value in the bet -- stake nothing.
        return 0.0
    return (expected_edge / net_odds) * kelly_multiplier

def kelly_multi_outcome(probs, odds):
    """Kelly for multiple simultaneous independent bets (approximate).

    Sizes each bet with the single-bet fractional Kelly, then rescales
    all stakes proportionally if together they would exceed the whole
    bankroll.
    """
    stakes = [kelly_fraction(win_p, price) for win_p, price in zip(probs, odds)]
    exposure = sum(stakes)
    if exposure > 1.0:
        return [s / exposure for s in stakes]
    return stakes

C.7.3 Elo Rating System

class EloSystem:
    """Elo ratings with home advantage and between-season mean reversion.

    Teams start at `initial_rating`.  After each game both teams move by
    `k` times the gap between the actual and expected result, with the
    home side credited `home_advantage` rating points for the expectancy
    only.  `new_season` pulls every rating `season_reversion` of the way
    back toward the current league mean.
    """

    def __init__(self, k=20, home_advantage=65, initial_rating=1500,
                 season_reversion=0.33):
        self.k = k
        self.home_advantage = home_advantage
        self.initial = initial_rating
        self.reversion = season_reversion
        self.ratings = {}  # team -> current rating

    def get_rating(self, team):
        """Return a team's rating, defaulting to the initial rating."""
        return self.ratings.get(team, self.initial)

    def expected_score(self, rating_a, rating_b):
        """Win expectancy for side A under the standard logistic curve."""
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

    def update(self, home, away, home_score, away_score):
        """Apply one game result; a draw counts as half a win each."""
        exp_home = self.expected_score(
            self.get_rating(home) + self.home_advantage,
            self.get_rating(away))
        if home_score > away_score:
            result = 1.0
        elif home_score == away_score:
            result = 0.5
        else:
            result = 0.0
        shift = self.k * (result - exp_home)
        # Zero-sum update: home gains exactly what away loses.
        self.ratings[home] = self.get_rating(home) + shift
        self.ratings[away] = self.get_rating(away) - shift

    def new_season(self):
        """Regress every rating toward the current league mean."""
        if self.ratings:
            league_mean = np.mean(list(self.ratings.values()))
        else:
            league_mean = self.initial
        for team, rating in self.ratings.items():
            self.ratings[team] = (rating * (1 - self.reversion)
                                  + league_mean * self.reversion)

C.7.4 Backtest Framework

def backtest_strategy(df, model, features, stake_func, initial_bankroll=10000):
    """Walk-forward backtest of a betting strategy.

    For each season in chronological order, refit `model` on all prior
    seasons (seasons with fewer than 100 rows of history are skipped),
    predict cover probabilities for that season's games, and place the
    stake returned by `stake_func(prob, odds, bankroll)`.  Returns a
    DataFrame with one row per placed bet and the running bankroll.
    """
    bankroll = initial_bankroll
    ledger = []

    for season in sorted(df['season'].unique()):
        history = df[df['season'] < season]
        current = df[df['season'] == season]
        if len(history) < 100:
            # Not enough past data to fit a meaningful model yet.
            continue

        model.fit(history[features], history['cover'])
        probs = model.predict_proba(current[features])[:, 1]

        for prob, (_, game) in zip(probs, current.iterrows()):
            price = game['decimal_odds']
            stake = stake_func(prob, price, bankroll)
            if not stake > 0:
                continue  # no bet placed (also skips NaN stakes)
            profit = stake * (price - 1) if game['cover'] else -stake
            bankroll += profit
            ledger.append({
                'game_date': game['game_date'],
                'prob': prob,
                'odds': price,
                'stake': stake,
                'profit': profit,
                'bankroll': bankroll
            })

    return pd.DataFrame(ledger)

C.7.5 Poisson Goal Model

from scipy.stats import poisson

def poisson_match_probs(home_rate, away_rate, max_goals=10):
    """Calculate exact match outcome probabilities from Poisson rates.

    Builds the joint scoreline grid P(home=i, away=j) for independent
    Poisson goal counts truncated at `max_goals`, then reads off the 1X2
    and over/under 2.5 probabilities from the grid.
    """
    goals = np.arange(max_goals + 1)
    home_pmf = poisson.pmf(goals, home_rate)
    away_pmf = poisson.pmf(goals, away_rate)
    grid = np.outer(home_pmf, away_pmf)   # rows = home goals, cols = away

    p_home = np.sum(np.tril(grid, -1))    # home goals strictly greater
    p_draw = np.trace(grid)               # diagonal: equal goals
    p_away = np.sum(np.triu(grid, 1))     # away goals strictly greater

    totals = goals[:, None] + goals[None, :]
    p_over = grid[totals > 2].sum()

    return {
        'home_win': p_home, 'draw': p_draw, 'away_win': p_away,
        'over_2.5': p_over, 'under_2.5': 1 - p_over,
        'score_matrix': grid
    }

C.7.6 Profit and Performance Metrics

def calculate_metrics(results_df):
    """Calculate standard betting performance metrics.

    Expects one row per settled bet with at least 'profit', 'stake',
    'decimal_odds' and 'game_date' columns ('closing_odds' optional).

    Returns a dict with bet count, win rate, total profit, ROI, yield,
    maximum drawdown (a non-positive number), an annualized Sharpe-like
    ratio, and mean CLV (None when closing odds are unavailable).
    """
    n = len(results_df)
    if n == 0:
        # Nothing settled yet -- return neutral metrics rather than
        # raising ZeroDivisionError on wins / n.
        return {'n_bets': 0, 'win_rate': 0.0, 'total_profit': 0.0,
                'roi': 0.0, 'yield_pct': 0.0, 'max_drawdown': 0.0,
                'sharpe': 0.0, 'clv': None}

    wins = (results_df['profit'] > 0).sum()
    total_staked = results_df['stake'].sum()
    total_profit = results_df['profit'].sum()
    roi = total_profit / total_staked if total_staked > 0 else 0

    # Yield (profit per unit staked, as a percentage)
    yield_pct = roi * 100

    # Maximum drawdown of the cumulative profit curve
    cumulative = results_df['profit'].cumsum()
    running_max = cumulative.cummax()
    drawdowns = cumulative - running_max
    max_drawdown = drawdowns.min()

    # Sharpe-like ratio on daily profit totals, annualized with sqrt(252)
    daily_returns = results_df.groupby('game_date')['profit'].sum()
    sharpe = (daily_returns.mean() / daily_returns.std()
              * np.sqrt(252) if daily_returns.std() > 0 else 0)

    # CLV = mean (closing implied prob - bet implied prob): positive when
    # the bets beat the closing line (the original 1/decimal - 1/closing
    # had the conventional sign inverted).
    if 'closing_odds' in results_df.columns:
        clv = (1 / results_df['closing_odds'] - 1 / results_df['decimal_odds']).mean()
    else:
        clv = None

    return {
        'n_bets': n, 'win_rate': wins / n, 'total_profit': total_profit,
        'roi': roi, 'yield_pct': yield_pct, 'max_drawdown': max_drawdown,
        'sharpe': sharpe, 'clv': clv
    }

For installation instructions and environment setup, see the companion repository at the URL listed in the Preface. All code in this appendix has been tested with Python 3.11, pandas 2.1, numpy 1.26, scipy 1.12, scikit-learn 1.4, and PyTorch 2.2.