Appendix C: Python Quick Reference for Soccer Analytics

This appendix provides a concise reference for the Python libraries, functions, and patterns used throughout this book. It assumes basic Python familiarity and focuses on the specific workflows most commonly encountered in soccer analytics.


C.1 Essential Libraries and Functions

C.1.1 Environment Setup

# Recommended imports for soccer analytics work
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    log_loss, brier_score_loss, roc_auc_score,
    confusion_matrix, classification_report
)
import warnings
warnings.filterwarnings('ignore')  # Silences all warnings; use sparingly in real work

# Display settings
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.4f}'.format)
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 100
plt.rcParams['font.size'] = 12

C.1.2 NumPy Essentials

# Array creation
x = np.array([1.2, 0.8, 2.1, 0.0, 1.5])    # From list
zeros = np.zeros((11, 2))                      # 11 players, 2 coordinates
grid = np.linspace(0, 105, 100)                # 100 evenly spaced points (pitch length)
rng = np.random.default_rng(42)                # Reproducible random generator

# Common operations
np.mean(x)                    # Mean
np.std(x, ddof=1)            # Sample standard deviation
np.median(x)                  # Median
np.percentile(x, [25, 50, 75])  # Quartiles
np.corrcoef(x, y)            # Correlation matrix
np.linalg.norm(v)            # Euclidean norm (distance)

# Distance calculations (common in tracking data)
def euclidean_distance(pos1, pos2):
    """Distance between two (x, y) positions."""
    return np.sqrt((pos1[0] - pos2[0])**2 + (pos1[1] - pos2[1])**2)

# Vectorized distance for arrays of positions
positions = np.array([[10, 20], [30, 40], [50, 60]])  # shape (n, 2)
target = np.array([25, 35])
distances = np.linalg.norm(positions - target, axis=1)

# Poisson probability (goal scoring)
from scipy.stats import poisson
prob_2_goals = poisson.pmf(2, mu=1.5)   # P(X=2) with lambda=1.5
prob_at_least_1 = 1 - poisson.pmf(0, mu=1.5)

C.1.3 Pandas Essentials

# Loading data
df = pd.read_csv('matches.csv')
df = pd.read_json('events.json')
df = pd.read_parquet('tracking_data.parquet')  # Preferred for large datasets

# Inspecting data
df.shape                      # (rows, columns)
df.info()                     # Column types and non-null counts
df.describe()                 # Summary statistics
df.head(10)                   # First 10 rows
df.columns.tolist()           # All column names
df.dtypes                     # Data types
df.isnull().sum()             # Count missing values per column

# Filtering
shots = df[df['type'] == 'Shot']
goals = df[(df['type'] == 'Shot') & (df['outcome'] == 'Goal')]
home_wins = df.query("home_score > away_score")
big_chances = df.loc[df['xG'] > 0.3]

# Grouping and aggregation
team_stats = df.groupby('team').agg(
    goals=('goals', 'sum'),
    xG=('xG', 'sum'),
    matches=('match_id', 'nunique'),
    shots=('shot', 'sum')
).reset_index()

# Adding derived columns
df['xG_diff'] = df['xG'] - df['xGA']
df['points_per_game'] = df['points'] / df['matches']
df['conversion_rate'] = df['goals'] / df['shots']
df['minutes_per_goal'] = 90 / df['goals_per_90']

# Pivot tables
pivot = df.pivot_table(
    values='xG',
    index='team',
    columns='season',
    aggfunc='mean'
)

# Rolling averages (form analysis)
df = df.sort_values(['team', 'date'])
df['xG_rolling_5'] = (
    df.groupby('team')['xG']
    .transform(lambda x: x.rolling(5, min_periods=1).mean())
)

# Merging datasets
merged = pd.merge(events, matches, on='match_id', how='left')
combined = pd.merge(
    player_stats, player_bio,
    on='player_id', how='inner'
)
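
Merges can fail silently when keys are duplicated or unmatched. pandas' validate
and indicator arguments catch this early; a short sketch:

combined = pd.merge(
    player_stats, player_bio,
    on='player_id', how='left',
    validate='m:1',     # raise if player_id is not unique in player_bio
    indicator=True      # adds a _merge column flagging unmatched rows
)
print(combined['_merge'].value_counts())   # 'left_only' rows are missing bios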

# Ranking
df['xG_rank'] = df.groupby('season')['xG'].rank(ascending=False)

C.1.4 Matplotlib and Visualization

# Basic scatter plot: xG vs actual goals
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df['xG'], df['goals'], alpha=0.6, s=40)
ax.plot([0, 80], [0, 80], 'r--', label='Goals = xG')
ax.set_xlabel('Expected Goals (xG)')
ax.set_ylabel('Actual Goals')
ax.set_title('xG vs Actual Goals by Team (2023-24)')
ax.legend()
ax.set_aspect('equal')
plt.tight_layout()
plt.savefig('xg_vs_goals.png', dpi=150, bbox_inches='tight')
plt.show()

# Histogram of goal distributions
fig, ax = plt.subplots()
ax.hist(df['home_goals'], bins=range(0, 9), density=True,
        alpha=0.7, label='Observed', edgecolor='black')
# Overlay Poisson fit
from scipy.stats import poisson
x_vals = np.arange(0, 8)
lambda_hat = df['home_goals'].mean()
ax.plot(x_vals + 0.5, poisson.pmf(x_vals, lambda_hat), 'ro-',
        label=f'Poisson (lambda={lambda_hat:.2f})')   # +0.5 aligns markers with bin centers
ax.set_xlabel('Goals')
ax.set_ylabel('Proportion')
ax.legend()
plt.show()

# Heatmap
fig, ax = plt.subplots(figsize=(10, 8))
corr = df[['xG', 'xGA', 'possession', 'ppda', 'points']].corr()
sns.heatmap(corr, annot=True, cmap='RdBu_r', center=0,
            vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix: Team Metrics')
plt.tight_layout()
plt.show()

# Bar chart with error bars
fig, ax = plt.subplots(figsize=(12, 5))
teams = team_stats['team']
means = team_stats['xG_per_90']
stds = team_stats['xG_std']
ax.barh(teams, means, xerr=stds, capsize=3, color='steelblue')
ax.set_xlabel('xG per 90')
ax.set_title('Team xG per 90 with Standard Deviation')
plt.tight_layout()
plt.show()

C.1.5 Statistical Modeling with Statsmodels

# Linear regression: Goals ~ xG
model = smf.ols('goals ~ xG', data=team_season).fit()
print(model.summary())

# Multiple regression
model = smf.ols('points ~ xG + xGA + possession', data=df).fit()
print(model.summary())
print(f"R-squared: {model.rsquared:.4f}")
print(f"AIC: {model.aic:.1f}")

# Logistic regression for xG model
xg_model = smf.logit(
    'goal ~ distance + angle + body_part_head + is_first_time',
    data=shots
).fit()
print(xg_model.summary())
shots['xG_pred'] = xg_model.predict(shots)

# Poisson regression (goal scoring model)
poisson_model = smf.glm(
    'goals ~ home + attack_strength + defense_weakness',
    data=match_teams,
    family=sm.families.Poisson()
).fit()
print(poisson_model.summary())

# Negative Binomial regression (for overdispersed counts)
nb_model = smf.glm(
    'goals ~ home + attack_strength + defense_weakness',
    data=match_teams,
    family=sm.families.NegativeBinomial()
).fit()
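
A quick way to choose between the two (an informal check, not a formal test):
under a well-specified Poisson model, the Pearson chi-square statistic divided
by the residual degrees of freedom should be close to 1.

dispersion = poisson_model.pearson_chi2 / poisson_model.df_resid
print(f"Dispersion ratio: {dispersion:.2f}")   # well above 1 favors NegativeBinomial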

C.1.6 Scikit-learn for Predictive Models

# Prepare data
features = ['distance', 'angle', 'is_header', 'is_first_time',
            'num_defenders', 'gk_distance']
X = shots[features].values
y = shots['is_goal'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(C=1.0, max_iter=1000))
])
pipe.fit(X_train, y_train)

# Evaluation
y_prob = pipe.predict_proba(X_test)[:, 1]
print(f"Log Loss:      {log_loss(y_test, y_prob):.4f}")
print(f"Brier Score:   {brier_score_loss(y_test, y_prob):.4f}")
print(f"ROC AUC:       {roc_auc_score(y_test, y_prob):.4f}")

# Cross-validation
cv_scores = cross_val_score(pipe, X, y, cv=5, scoring='neg_log_loss')
print(f"CV Log Loss: {-cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Random Forest
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)
rf.fit(X_train, y_train)

# Feature importance
importances = pd.Series(rf.feature_importances_, index=features)
importances.sort_values(ascending=True).plot(kind='barh')
plt.xlabel('Feature Importance')
plt.title('xG Model Feature Importance')
plt.tight_layout()
plt.show()
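
Impurity-based importances tend to favor continuous and high-cardinality
features. A common cross-check, sketched here with scikit-learn's inspection
module, is permutation importance computed on the held-out set:

from sklearn.inspection import permutation_importance

perm = permutation_importance(rf, X_test, y_test, n_repeats=10,
                              random_state=42, scoring='neg_log_loss')
perm_importances = pd.Series(perm.importances_mean, index=features)
print(perm_importances.sort_values(ascending=False))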

C.2 Common Soccer Analytics Patterns

C.2.1 Loading StatsBomb Open Data

# Method 1: Using the statsbombpy library
from statsbombpy import sb

# List available competitions
comps = sb.competitions()
print(comps[['competition_name', 'season_name', 'competition_id', 'season_id']])

# Get matches for a competition/season
matches = sb.matches(competition_id=11, season_id=90)  # La Liga; check IDs via sb.competitions()

# Get events for a specific match
events = sb.events(match_id=3869685)

# Get all shots from a match
shots = events[events['type'] == 'Shot']
print(shots[['player', 'shot_statsbomb_xg', 'shot_outcome', 'location']])

# Method 2: Loading from JSON files directly
import json
import os

def load_statsbomb_events(match_id, data_dir='open-data/data'):
    filepath = os.path.join(data_dir, 'events', f'{match_id}.json')
    with open(filepath, 'r', encoding='utf-8') as f:
        events = json.load(f)
    return pd.json_normalize(events, sep='_')

# Extract x, y from location lists
events['x'] = events['location'].apply(lambda loc: loc[0] if isinstance(loc, list) else None)
events['y'] = events['location'].apply(lambda loc: loc[1] if isinstance(loc, list) else None)

C.2.2 Drawing a Soccer Pitch

def draw_pitch(ax=None, pitch_length=120, pitch_width=80,
               color='white', linecolor='black', linewidth=1.5):
    """
    Draw a soccer pitch on the given axes.
    Uses StatsBomb coordinate system (120 x 80 yards) by default.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))

    ax.set_xlim(-2, pitch_length + 2)
    ax.set_ylim(-2, pitch_width + 2)
    ax.set_aspect('equal')
    ax.set_facecolor(color)

    # Pitch outline
    pitch_rect = patches.Rectangle((0, 0), pitch_length, pitch_width,
                                    fill=False, edgecolor=linecolor,
                                    linewidth=linewidth)
    ax.add_patch(pitch_rect)

    # Halfway line
    ax.plot([pitch_length/2, pitch_length/2], [0, pitch_width],
            color=linecolor, linewidth=linewidth)

    # Center circle
    center_circle = patches.Circle((pitch_length/2, pitch_width/2), 10,
                                    fill=False, edgecolor=linecolor,
                                    linewidth=linewidth)
    ax.add_patch(center_circle)
    ax.plot(pitch_length/2, pitch_width/2, 'o', color=linecolor, markersize=3)

    # Left penalty area
    left_pen = patches.Rectangle((0, 18), 18, 44,
                                  fill=False, edgecolor=linecolor,
                                  linewidth=linewidth)
    ax.add_patch(left_pen)

    # Right penalty area
    right_pen = patches.Rectangle((pitch_length - 18, 18), 18, 44,
                                   fill=False, edgecolor=linecolor,
                                   linewidth=linewidth)
    ax.add_patch(right_pen)

    # Left 6-yard box
    left_six = patches.Rectangle((0, 30), 6, 20,
                                  fill=False, edgecolor=linecolor,
                                  linewidth=linewidth)
    ax.add_patch(left_six)

    # Right 6-yard box
    right_six = patches.Rectangle((pitch_length - 6, 30), 6, 20,
                                   fill=False, edgecolor=linecolor,
                                   linewidth=linewidth)
    ax.add_patch(right_six)

    # Goals
    ax.plot([0, 0], [36, 44], color=linecolor, linewidth=linewidth * 2)
    ax.plot([pitch_length, pitch_length], [36, 44],
            color=linecolor, linewidth=linewidth * 2)

    # Penalty spots
    ax.plot(12, pitch_width/2, 'o', color=linecolor, markersize=4)
    ax.plot(pitch_length - 12, pitch_width/2, 'o', color=linecolor, markersize=4)

    # Penalty arcs
    left_arc = patches.Arc((12, pitch_width/2), 20, 20,
                            angle=0, theta1=-53, theta2=53,
                            color=linecolor, linewidth=linewidth)
    right_arc = patches.Arc((pitch_length - 12, pitch_width/2), 20, 20,
                             angle=0, theta1=127, theta2=233,
                             color=linecolor, linewidth=linewidth)
    ax.add_patch(left_arc)
    ax.add_patch(right_arc)

    # Remove axis labels
    ax.set_xticks([])
    ax.set_yticks([])

    return ax


# Usage: Shot map
fig, ax = plt.subplots(figsize=(12, 8))
ax = draw_pitch(ax)

goals = shots[shots['shot_outcome'] == 'Goal']
non_goals = shots[shots['shot_outcome'] != 'Goal']

ax.scatter(non_goals['x'], non_goals['y'], c='gray', s=non_goals['xG']*500,
           alpha=0.5, edgecolors='black', linewidth=0.5, zorder=5, label='No Goal')
ax.scatter(goals['x'], goals['y'], c='red', s=goals['xG']*500,
           alpha=0.8, edgecolors='black', linewidth=0.5, zorder=6, label='Goal')
ax.legend(loc='upper left')
ax.set_title('Shot Map: Team A vs Team B')
plt.tight_layout()
plt.show()

C.2.3 Building a Basic xG Model

def build_xg_model(shots_df):
    """
    Build a basic expected goals model from shot data.
    Requires columns: x, y, is_goal, body_part.
    """
    # Feature engineering
    df = shots_df.copy()

    # Distance to center of goal (assuming attacking right, goal at x=120)
    goal_x, goal_y = 120, 40
    df['distance'] = np.sqrt((df['x'] - goal_x)**2 + (df['y'] - goal_y)**2)

    # Angle to goal (simple proxy: goal width over distance, in radians)
    df['angle'] = np.arctan2(
        8,  # goal width: 8 yards (7.32 m); StatsBomb coordinates are in yards
        df['distance']
    )

    # Dummy variables
    df['is_header'] = (df['body_part'] == 'Head').astype(int)
    df['is_right_foot'] = (df['body_part'] == 'Right Foot').astype(int)

    features = ['distance', 'angle', 'is_header']
    X = df[features]
    y = df['is_goal'].astype(int)

    # Fit logistic regression
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)

    # Predict xG
    df['xG'] = model.predict_proba(X)[:, 1]

    return model, df
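
The arctan2(width, distance) feature above is a deliberately crude proxy. A
common refinement, not used in build_xg_model, is the true angle subtended by
the goal mouth, which separates central shots from wide ones at equal distance.
A sketch in StatsBomb yards:

def shot_angle(x, y, goal_x=120.0, goal_y=40.0, goal_width=8.0):
    """Angle (radians) subtended by the goal mouth at shot location (x, y)."""
    dx = goal_x - x                    # distance to the goal line
    dy = y - goal_y                    # lateral offset from the goal center
    half_width = goal_width / 2
    # arctan2 keeps the angle correct when the denominator goes negative
    # (very close to goal, where the subtended angle exceeds 90 degrees)
    return np.arctan2(goal_width * dx, dx**2 + dy**2 - half_width**2)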


def evaluate_xg_model(y_true, y_pred_prob):
    """Evaluate an xG model with standard metrics."""
    results = {
        'Log Loss': log_loss(y_true, y_pred_prob),
        'Brier Score': brier_score_loss(y_true, y_pred_prob),
        'ROC AUC': roc_auc_score(y_true, y_pred_prob),
        'Calibration (mean pred)': y_pred_prob.mean(),
        'Calibration (mean actual)': y_true.mean(),
    }
    for metric, value in results.items():
        print(f"  {metric:25s}: {value:.4f}")
    return results

C.2.4 Match Simulation (Poisson and Dixon-Coles)

def simulate_match(home_xg, away_xg, n_simulations=10000, rng=None):
    """
    Simulate a match outcome using independent Poisson distributions.

    Parameters
    ----------
    home_xg : float
        Expected goals for the home team.
    away_xg : float
        Expected goals for the away team.
    n_simulations : int
        Number of Monte Carlo simulations.
    rng : numpy.random.Generator, optional
        Random generator; a seeded default is used for reproducibility.

    Returns
    -------
    dict with keys: home_win_prob, draw_prob, away_win_prob,
                    expected_home_goals, expected_away_goals,
                    most_likely_scores
    """
    if rng is None:
        rng = np.random.default_rng(42)

    home_goals = rng.poisson(home_xg, size=n_simulations)
    away_goals = rng.poisson(away_xg, size=n_simulations)

    home_wins = np.mean(home_goals > away_goals)
    draws = np.mean(home_goals == away_goals)
    away_wins = np.mean(home_goals < away_goals)

    # Most likely scoreline
    from collections import Counter
    scores = list(zip(home_goals, away_goals))
    most_common = Counter(scores).most_common(5)

    return {
        'home_win_prob': home_wins,
        'draw_prob': draws,
        'away_win_prob': away_wins,
        'expected_home_goals': home_goals.mean(),
        'expected_away_goals': away_goals.mean(),
        'most_likely_scores': most_common
    }

# Example
n_sims = 10_000
result = simulate_match(1.8, 1.2, n_simulations=n_sims)
print(f"Home Win: {result['home_win_prob']:.1%}")
print(f"Draw:     {result['draw_prob']:.1%}")
print(f"Away Win: {result['away_win_prob']:.1%}")
for score, count in result['most_likely_scores']:
    print(f"  {score[0]}-{score[1]}: {count / n_sims:.1%}")

C.2.5 Pass Network Visualization

def plot_pass_network(passes_df, team, ax=None, min_passes=3):
    """
    Draw a pass network showing average positions and passing connections.

    Parameters
    ----------
    passes_df : DataFrame
        Must have columns: passer, recipient, passer_x, passer_y,
        recipient_x, recipient_y
    team : str
        Team name for the title.
    min_passes : int
        Minimum number of passes between two players to draw a connection.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))
    ax = draw_pitch(ax)

    # Average positions
    avg_pos = passes_df.groupby('passer').agg(
        x=('passer_x', 'mean'),
        y=('passer_y', 'mean'),
        count=('passer', 'count')
    ).reset_index()

    # Pass combinations
    pair_counts = (passes_df.groupby(['passer', 'recipient'])
                   .size().reset_index(name='num_passes'))
    pair_counts = pair_counts[pair_counts['num_passes'] >= min_passes]

    # Draw connections (guard against recipients who never appear as passers)
    for _, row in pair_counts.iterrows():
        p1 = avg_pos[avg_pos['passer'] == row['passer']]
        p2 = avg_pos[avg_pos['passer'] == row['recipient']]
        if p1.empty or p2.empty:
            continue
        p1, p2 = p1.iloc[0], p2.iloc[0]
        line_width = row['num_passes'] / pair_counts['num_passes'].max() * 5
        ax.plot([p1['x'], p2['x']], [p1['y'], p2['y']],
                color='blue', linewidth=line_width, alpha=0.5, zorder=3)

    # Draw player nodes
    ax.scatter(avg_pos['x'], avg_pos['y'],
               s=avg_pos['count'] * 3, c='red',
               edgecolors='black', linewidth=1.5, zorder=5)

    # Player labels
    for _, player in avg_pos.iterrows():
        name = player['passer'].split()[-1]  # Last name only
        ax.annotate(name, (player['x'], player['y']),
                    textcoords="offset points", xytext=(0, 10),
                    ha='center', fontsize=8, fontweight='bold',
                    zorder=6)

    ax.set_title(f'Pass Network: {team}', fontsize=14, fontweight='bold')
    return ax

C.3 Performance Tips

C.3.1 Working with Large Datasets

# Use appropriate data types to reduce memory
def optimize_dtypes(df):
    """Reduce memory usage by downcasting numeric types."""
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

# Before: df.info(memory_usage='deep')  -> e.g., 450 MB
# After:  optimize_dtypes(df).info(memory_usage='deep')  -> e.g., 180 MB

# Use Parquet for storage (faster reads, smaller files, preserves types)
df.to_parquet('tracking_data.parquet', index=False)
df = pd.read_parquet('tracking_data.parquet')

# Process tracking data in chunks
chunk_size = 100000
chunks = []
for chunk in pd.read_csv('large_tracking.csv', chunksize=chunk_size):
    processed = process_frame(chunk)  # Your processing function
    chunks.append(processed)
result = pd.concat(chunks, ignore_index=True)

# Use vectorized operations instead of loops
# BAD (slow):
for i, row in df.iterrows():
    df.loc[i, 'distance'] = np.sqrt(row['x']**2 + row['y']**2)

# GOOD (fast):
df['distance'] = np.sqrt(df['x']**2 + df['y']**2)

# Use .apply() only when vectorization is not possible
# (axis=1 applies the function row by row, which is itself slow; a last resort)
df['custom_metric'] = df.apply(lambda row: complex_calc(row), axis=1)

C.3.2 Speeding Up Computation

# NumPy broadcasting for pairwise distances (tracking data)
def pairwise_distances(positions):
    """
    Compute all pairwise distances between n players.
    positions: array of shape (n, 2)
    Returns: distance matrix of shape (n, n)
    """
    diff = positions[:, np.newaxis, :] - positions[np.newaxis, :, :]
    return np.sqrt(np.sum(diff**2, axis=-1))
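
SciPy can produce the same matrix in one call, which is also convenient for
distances between two different sets of positions (a sketch; positions is any
(n, 2) array):

from scipy.spatial.distance import cdist

dist_matrix = cdist(positions, positions)   # same (n, n) Euclidean matrix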

# Caching expensive computations
from functools import lru_cache

@lru_cache(maxsize=1024)
def get_player_season_stats(player_id, season):
    """Cache player stats to avoid repeated database queries."""
    return query_database(player_id, season)

# Parallel processing for independent match computations
from concurrent.futures import ProcessPoolExecutor

def process_match(match_id):
    events = load_events(match_id)
    return compute_xg(events)

match_ids = df['match_id'].unique()
with ProcessPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_match, match_ids))

C.3.3 Reproducibility

# Always set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)
rng = np.random.default_rng(SEED)

# For scikit-learn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# For PyTorch (if using deep learning)
import torch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Save model artifacts
import joblib
joblib.dump(model, 'xg_model_v1.pkl')
loaded_model = joblib.load('xg_model_v1.pkl')

# Environment tracking
# Create requirements.txt:
# pip freeze > requirements.txt
# Or use a pyproject.toml / environment.yml for conda

C.4 Debugging Guide

C.4.1 Common Data Issues

# Problem: NaN values causing unexpected results
# Diagnosis:
df.isnull().sum()                     # Count NaNs per column
df[df['xG'].isnull()]                 # Inspect rows with NaN xG

# Solutions:
df = df.dropna(subset=['xG'])         # Drop rows where xG is NaN
df['xG'] = df['xG'].fillna(0)        # Fill NaN with 0
df['xG'] = df.groupby('team')['xG'].transform(
    lambda x: x.fillna(x.median())    # Fill with group median
)

# Problem: Duplicate entries
dupes = df[df.duplicated(subset=['match_id', 'event_id'], keep=False)]
print(f"Found {len(dupes)} duplicate rows")
df = df.drop_duplicates(subset=['match_id', 'event_id'], keep='first')

# Problem: Inconsistent team/player names
# Diagnosis:
print(df['team'].unique())           # Check for variations
# E.g., ['Man United', 'Manchester United', 'Man Utd']

# Solution: Create a mapping
name_map = {
    'Man United': 'Manchester United',
    'Man Utd': 'Manchester United',
}
df['team'] = df['team'].replace(name_map)

# Problem: Coordinate systems differ between providers
# StatsBomb: 120 x 80 (yards), origin bottom-left
# Opta: 100 x 100 (percentage), origin bottom-left
# Wyscout: 100 x 100 (percentage), origin top-left

def convert_opta_to_statsbomb(x, y):
    """Convert Opta (100x100) coordinates to StatsBomb (120x80)."""
    return x * 1.2, y * 0.8

def convert_wyscout_to_statsbomb(x, y):
    """Convert Wyscout (100x100, y-inverted) to StatsBomb (120x80)."""
    return x * 1.2, (100 - y) * 0.8

C.4.2 Common Modeling Issues

# Problem: Class imbalance (goals are rare events, ~10% of shots)
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

# Option 1: Class weights (compute_class_weight shows what 'balanced' applies;
# passing class_weight='balanced' does the same reweighting inside the model)
weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
model = LogisticRegression(class_weight='balanced')

# Option 2: SMOTE oversampling (use only on training data!)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
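# Caution: resampling changes the base rate, so a model trained on SMOTE output
# no longer yields calibrated probabilities; recalibrate (e.g., with
# sklearn.calibration.CalibratedClassifierCV) before using predictions as xG.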

# Problem: Overfitting (training score >> test score)
# Diagnosis:
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=5, scoring='neg_log_loss',
    train_sizes=np.linspace(0.1, 1.0, 10)
)
# Plot learning curves to visualize
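# A minimal sketch of that plot (scores are negative log loss, so negate them):
fig, ax = plt.subplots()
ax.plot(train_sizes, -train_scores.mean(axis=1), 'o-', label='Train')
ax.plot(train_sizes, -test_scores.mean(axis=1), 'o-', label='Cross-validation')
ax.set_xlabel('Training set size')
ax.set_ylabel('Log loss')
ax.legend()
plt.show()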

# Solutions:
# - Regularization (decrease C in logistic regression; C is inverse regularization strength)
# - Reduce model complexity (fewer features, shallower trees)
# - Get more data
# - Cross-validation for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid,
                    cv=5, scoring='neg_log_loss')
grid.fit(X_train, y_train)
print(f"Best C: {grid.best_params_['C']}")

# Problem: Multicollinearity in regression
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_df = shots[features]                 # VIF needs the predictors as a DataFrame
vif_data = pd.DataFrame()
vif_data['feature'] = features
vif_data['VIF'] = [variance_inflation_factor(X_df.values, i)
                   for i in range(len(features))]
print(vif_data.sort_values('VIF', ascending=False))
# VIF > 10 suggests problematic multicollinearity

C.4.3 Common Visualization Issues

# Problem: Overlapping points in scatter plots
# Solution 1: Transparency
ax.scatter(x, y, alpha=0.3)

# Solution 2: Hexbin for dense data
ax.hexbin(x, y, gridsize=30, cmap='YlOrRd')

# Solution 3: 2D KDE
sns.kdeplot(x=x, y=y, fill=True, cmap='Reds', levels=10, ax=ax)

# Problem: Pitch plot looks stretched
# Solution: Always set equal aspect ratio
ax.set_aspect('equal')

# Problem: Labels overlapping
# Solution: Use adjustText
from adjustText import adjust_text
texts = [ax.annotate(name, (x, y)) for name, x, y in zip(names, xs, ys)]
adjust_text(texts)

# Problem: Color scale not informative
# Solution: Use appropriate colormaps and normalization
from matplotlib.colors import Normalize, TwoSlopeNorm
# For diverging data (e.g., xG difference):
norm = TwoSlopeNorm(vmin=-2, vcenter=0, vmax=2)
scatter = ax.scatter(x, y, c=values, cmap='RdBu_r', norm=norm)
plt.colorbar(scatter, label='xG Difference')

C.4.4 Quick Diagnostic Checks

def diagnostic_report(df, target_col='is_goal', pred_col='xG'):
    """Generate a quick diagnostic report for an xG model."""
    df = df.copy()  # qcut below adds a column; work on a copy, not the caller's frame
    print("=" * 50)
    print("DATA DIAGNOSTICS")
    print("=" * 50)
    print(f"  Rows:            {len(df):,}")
    print(f"  Columns:         {len(df.columns)}")
    print(f"  Missing values:  {df.isnull().sum().sum():,}")
    print(f"  Duplicates:      {df.duplicated().sum():,}")
    print()
    print(f"  Target rate:     {df[target_col].mean():.4f}")
    print(f"  Prediction mean: {df[pred_col].mean():.4f}")
    print(f"  Prediction std:  {df[pred_col].std():.4f}")
    print(f"  Prediction min:  {df[pred_col].min():.4f}")
    print(f"  Prediction max:  {df[pred_col].max():.4f}")
    print()

    # Calibration check by decile
    df['decile'] = pd.qcut(df[pred_col], 10, labels=False, duplicates='drop')
    cal = df.groupby('decile').agg(
        mean_pred=(pred_col, 'mean'),
        mean_actual=(target_col, 'mean'),
        count=(target_col, 'count')
    )
    print("CALIBRATION BY DECILE:")
    print(cal.to_string())
    print()

    # Overall metrics
    print("MODEL METRICS:")
    print(f"  Log Loss:    {log_loss(df[target_col], df[pred_col]):.4f}")
    print(f"  Brier Score: {brier_score_loss(df[target_col], df[pred_col]):.4f}")
    print(f"  ROC AUC:     {roc_auc_score(df[target_col], df[pred_col]):.4f}")
    print("=" * 50)

For additional Python resources, see the online companion repository referenced in Appendix D. For the mathematical foundations behind the statistical methods used here, see Appendix A.