Case Study 1: Building a Game Prediction System
Overview
This case study develops a complete machine learning system for predicting college football game outcomes, from data collection through model deployment and weekly predictions.
Business Context
A sports media company needs:
- Weekly game outcome predictions for editorial content
- Confidence levels for each prediction
- Model performance tracking over the season
- Interpretable explanations for key matchups
Data Description
# Historical game data schema
game_schema = {
'game_id': 'unique identifier',
'season': 'year (2018-2023)',
'week': 'week number',
'home_team': 'team name',
'away_team': 'team name',
# Pre-game metrics
'home_elo': 'pre-game Elo rating',
'away_elo': 'pre-game Elo rating',
'home_sp_plus': 'SP+ rating',
'away_sp_plus': 'SP+ rating',
    'spread': 'Vegas spread from home perspective (negative = home team favored)',
# Team statistics (entering game)
'home_off_efficiency': 'offensive efficiency',
'home_def_efficiency': 'defensive efficiency',
'away_off_efficiency': 'offensive efficiency',
'away_def_efficiency': 'defensive efficiency',
# Situational
'neutral_site': 'boolean',
'conference_game': 'boolean',
'rivalry': 'boolean',
# Outcome
'home_score': 'final score',
'away_score': 'final score',
'home_win': 'boolean'
}
data_summary = {
'total_games': 4200,
'seasons': 6,
'features': 25,
'home_win_rate': 0.57
}
Implementation
Step 1: Data Preparation Pipeline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
class GameDataPipeline:
"""Prepare game data for ML modeling."""
def __init__(self):
self.scaler = StandardScaler()
self.feature_cols = []
def load_and_clean(self, filepath: str) -> pd.DataFrame:
"""Load and clean game data."""
df = pd.read_csv(filepath)
# Remove games with missing key data
required_cols = ['home_elo', 'away_elo', 'home_score', 'away_score']
df = df.dropna(subset=required_cols)
        # Remove FCS games (incomplete data); assumes the raw file includes an 'fbs_game' flag
        df = df[df['fbs_game']]
# Create outcome variable
df['home_win'] = (df['home_score'] > df['away_score']).astype(int)
return df
def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Create features for prediction."""
df = df.copy()
# Power rating differentials
df['elo_diff'] = df['home_elo'] - df['away_elo']
df['sp_diff'] = df['home_sp_plus'] - df['away_sp_plus']
# Efficiency differentials
df['off_diff'] = df['home_off_efficiency'] - df['away_off_efficiency']
df['def_diff'] = df['away_def_efficiency'] - df['home_def_efficiency']
# Combined efficiency
df['home_net_eff'] = df['home_off_efficiency'] - df['home_def_efficiency']
df['away_net_eff'] = df['away_off_efficiency'] - df['away_def_efficiency']
df['net_eff_diff'] = df['home_net_eff'] - df['away_net_eff']
# Situational factors
df['home_advantage'] = (~df['neutral_site']).astype(int)
df['is_conference'] = df['conference_game'].astype(int)
df['is_rivalry'] = df['rivalry'].astype(int)
        # Vegas implied win probability. Negative spread = home favorite, so a
        # larger home point-spread advantage maps to a higher home win probability.
        if 'spread' in df.columns:
            df['vegas_implied_wp'] = 1 / (1 + 10 ** (df['spread'] / 10))
        # Recent form (rolling 3-game home win %, shifted to avoid using the current game).
        # Upcoming games have no 'home_win' column yet, so fall back to a neutral 0.5.
        if 'home_win' in df.columns:
            df['home_recent_form'] = df.groupby('home_team')['home_win'].transform(
                lambda x: x.shift().rolling(3, min_periods=1).mean()
            ).fillna(0.5)
        else:
            df['home_recent_form'] = 0.5
self.feature_cols = [
'elo_diff', 'sp_diff', 'off_diff', 'def_diff', 'net_eff_diff',
'home_advantage', 'is_conference', 'is_rivalry',
'vegas_implied_wp', 'home_recent_form'
]
return df
def create_temporal_split(self,
df: pd.DataFrame,
train_seasons: list,
val_season: int,
test_season: int) -> tuple:
"""Create temporal train/val/test splits."""
train = df[df['season'].isin(train_seasons)]
val = df[df['season'] == val_season]
test = df[df['season'] == test_season]
X_train = train[self.feature_cols].values
X_val = val[self.feature_cols].values
X_test = test[self.feature_cols].values
y_train = train['home_win'].values
y_val = val['home_win'].values
y_test = test['home_win'].values
# Scale features
X_train = self.scaler.fit_transform(X_train)
X_val = self.scaler.transform(X_val)
X_test = self.scaler.transform(X_test)
return (X_train, X_val, X_test, y_train, y_val, y_test)
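The pipeline above is intended to be run once per modeling cycle. A minimal usage sketch, assuming the historical games live in a single CSV (the filename is illustrative) and using the 2018-2021 / 2022 / 2023 season split implied by the results below:

# Hypothetical end-to-end run of the data preparation pipeline
pipeline = GameDataPipeline()
games = pipeline.load_and_clean('games_2018_2023.csv')   # illustrative filename
games = pipeline.engineer_features(games)

# Temporal split: train on 2018-2021, validate on 2022, test on 2023
X_train, X_val, X_test, y_train, y_val, y_test = pipeline.create_temporal_split(
    games, train_seasons=[2018, 2019, 2020, 2021], val_season=2022, test_season=2023
)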
Step 2: Model Training and Selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, brier_score_loss
class ModelTrainer:
"""Train and compare ML models."""
def __init__(self):
self.models = {}
self.results = {}
def train_all_models(self, X_train, y_train):
"""Train suite of classification models."""
self.models = {
'logistic': LogisticRegression(C=1.0, max_iter=1000),
'random_forest': RandomForestClassifier(
n_estimators=200, max_depth=6,
min_samples_leaf=20, random_state=42
),
'gbm': GradientBoostingClassifier(
n_estimators=150, max_depth=4,
learning_rate=0.1, random_state=42
),
'xgboost': xgb.XGBClassifier(
n_estimators=150, max_depth=4,
learning_rate=0.1, random_state=42,
use_label_encoder=False, eval_metric='logloss'
)
}
for name, model in self.models.items():
model.fit(X_train, y_train)
print(f"Trained {name}")
def evaluate_on_validation(self, X_val, y_val) -> pd.DataFrame:
"""Evaluate all models on validation set."""
results = []
for name, model in self.models.items():
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]
results.append({
'model': name,
'accuracy': accuracy_score(y_val, y_pred),
'auc': roc_auc_score(y_val, y_prob),
'brier': brier_score_loss(y_val, y_prob)
})
return pd.DataFrame(results).sort_values('brier')
    def select_best_model(self, results: pd.DataFrame, metric='brier'):
        """Select the best model by metric (lower is better for 'brier', higher otherwise)."""
        if metric == 'brier':
            best_name = results.loc[results[metric].idxmin(), 'model']
        else:
            best_name = results.loc[results[metric].idxmax(), 'model']
        return best_name, self.models[best_name]
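Continuing the sketch from Step 1 (same variable names), the trainer fits all four models, compares them on the validation season, and picks the best single model by Brier score:

# Train, compare on validation, and select the best single model
trainer = ModelTrainer()
trainer.train_all_models(X_train, y_train)
val_results = trainer.evaluate_on_validation(X_val, y_val)
print(val_results)

best_name, best_model = trainer.select_best_model(val_results, metric='brier')
print(f"Best single model: {best_name}")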
Step 3: Ensemble Construction
from sklearn.isotonic import IsotonicRegression
class GamePredictionEnsemble:
"""Ensemble model for game prediction."""
def __init__(self, models: dict, weights: dict = None):
self.models = models
self.weights = weights or {name: 1/len(models) for name in models}
self.calibrator = None
def fit_weights(self, X_val, y_val):
"""Fit weights based on validation performance."""
performances = {}
for name, model in self.models.items():
y_prob = model.predict_proba(X_val)[:, 1]
brier = brier_score_loss(y_val, y_prob)
# Inverse Brier (lower is better)
performances[name] = 1 / (brier + 0.01)
total = sum(performances.values())
self.weights = {name: perf/total for name, perf in performances.items()}
print("Ensemble weights:")
for name, weight in sorted(self.weights.items(), key=lambda x: -x[1]):
print(f" {name}: {weight:.3f}")
def predict_proba(self, X) -> np.ndarray:
"""Get weighted ensemble predictions."""
weighted_sum = np.zeros(len(X))
for name, model in self.models.items():
proba = model.predict_proba(X)[:, 1]
weighted_sum += self.weights[name] * proba
return weighted_sum
def fit_calibration(self, X_cal, y_cal):
"""Fit isotonic regression calibration."""
raw_probs = self.predict_proba(X_cal)
self.calibrator = IsotonicRegression(out_of_bounds='clip')
self.calibrator.fit(raw_probs, y_cal)
def predict_calibrated(self, X) -> np.ndarray:
"""Get calibrated predictions."""
raw_probs = self.predict_proba(X)
if self.calibrator:
return self.calibrator.predict(raw_probs)
return raw_probs
def evaluate(self, X_test, y_test) -> dict:
"""Evaluate ensemble performance."""
y_prob = self.predict_calibrated(X_test)
y_pred = (y_prob > 0.5).astype(int)
return {
'accuracy': accuracy_score(y_test, y_pred),
'auc': roc_auc_score(y_test, y_prob),
'brier': brier_score_loss(y_test, y_prob)
}
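A short sketch of wiring Steps 2 and 3 together. Here the validation season is reused both to fit the ensemble weights and to fit the isotonic calibrator, which is one reasonable choice given the limited data; a separate calibration slice would also work:

# Build, weight, and calibrate the ensemble, then evaluate on the held-out test season
ensemble = GamePredictionEnsemble(trainer.models)
ensemble.fit_weights(X_val, y_val)
ensemble.fit_calibration(X_val, y_val)
print(ensemble.evaluate(X_test, y_test))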
Step 4: Production System
class WeeklyPredictionSystem:
"""Production system for weekly predictions."""
def __init__(self, ensemble: GamePredictionEnsemble, pipeline: GameDataPipeline):
self.ensemble = ensemble
self.pipeline = pipeline
self.prediction_history = []
def generate_predictions(self, upcoming_games: pd.DataFrame) -> pd.DataFrame:
"""Generate predictions for upcoming games."""
# Engineer features
games = self.pipeline.engineer_features(upcoming_games)
X = games[self.pipeline.feature_cols].values
X = self.pipeline.scaler.transform(X)
# Get calibrated probabilities
home_win_prob = self.ensemble.predict_calibrated(X)
# Build predictions DataFrame
predictions = pd.DataFrame({
'game_id': games['game_id'],
'home_team': games['home_team'],
'away_team': games['away_team'],
'home_win_prob': home_win_prob,
'predicted_winner': np.where(
home_win_prob > 0.5,
games['home_team'],
games['away_team']
),
'confidence': self._calculate_confidence(home_win_prob),
'model_agreement': self._calculate_agreement(X)
})
return predictions.sort_values('confidence', ascending=False)
def _calculate_confidence(self, probs: np.ndarray) -> np.ndarray:
"""Assign confidence levels."""
confidence = np.where(
np.abs(probs - 0.5) > 0.25, 'High',
np.where(np.abs(probs - 0.5) > 0.15, 'Medium', 'Low')
)
return confidence
def _calculate_agreement(self, X: np.ndarray) -> np.ndarray:
"""Calculate model agreement percentage."""
predictions = []
for model in self.ensemble.models.values():
pred = (model.predict_proba(X)[:, 1] > 0.5).astype(int)
predictions.append(pred)
predictions = np.array(predictions)
agreement = np.apply_along_axis(
lambda x: max(x.mean(), 1-x.mean()),
axis=0, arr=predictions
)
return agreement
def track_results(self, predictions: pd.DataFrame,
actual_results: pd.DataFrame):
"""Track prediction results for performance monitoring."""
merged = predictions.merge(
actual_results[['game_id', 'home_win']],
on='game_id'
)
merged['correct'] = (
(merged['home_win_prob'] > 0.5) == merged['home_win']
).astype(int)
self.prediction_history.append(merged)
return {
'games': len(merged),
'correct': merged['correct'].sum(),
'accuracy': merged['correct'].mean(),
'brier': brier_score_loss(merged['home_win'], merged['home_win_prob'])
}
def generate_report(self, predictions: pd.DataFrame) -> str:
"""Generate formatted prediction report."""
report = []
report.append("=" * 60)
report.append("WEEKLY GAME PREDICTIONS")
report.append("=" * 60)
report.append("")
for conf in ['High', 'Medium', 'Low']:
conf_games = predictions[predictions['confidence'] == conf]
if len(conf_games) == 0:
continue
report.append(f"\n{conf.upper()} CONFIDENCE ({len(conf_games)} games)")
report.append("-" * 40)
for _, game in conf_games.iterrows():
winner = game['predicted_winner']
prob = game['home_win_prob']
if game['predicted_winner'] != game['home_team']:
prob = 1 - prob
report.append(
f" {winner} over "
f"{game['away_team'] if winner == game['home_team'] else game['home_team']}"
f" ({prob*100:.1f}%)"
)
return "\n".join(report)
Results
Model Comparison
MODEL PERFORMANCE (2023 Season Holdout)
========================================
Model | Accuracy | AUC | Brier Score
----------------|----------|-------|------------
XGBoost | 73.2% | 0.811 | 0.178
Gradient Boost | 72.8% | 0.806 | 0.181
Random Forest | 71.5% | 0.794 | 0.189
Logistic Reg | 71.1% | 0.788 | 0.192
Ensemble | 74.1% | 0.819 | 0.172
Calibrated Ens | 74.1% | 0.819 | 0.168
Key Findings:
- The ensemble lowers the Brier score from 0.178 (best single model) to 0.172, roughly a 3% improvement
- Calibration lowers it further to 0.168, roughly another 2% (a minimal calibration check is sketched below)
- XGBoost is the strongest single model
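The calibration effect can be checked directly with scikit-learn's calibration_curve, comparing raw and calibrated ensemble probabilities on the test season. A minimal sketch, reusing the objects from the implementation steps:

from sklearn.calibration import calibration_curve

# Compare raw vs. calibrated ensemble probabilities against observed win rates
for label, probs in [('raw', ensemble.predict_proba(X_test)),
                     ('calibrated', ensemble.predict_calibrated(X_test))]:
    frac_pos, mean_pred = calibration_curve(y_test, probs, n_bins=10)
    gap = np.abs(frac_pos - mean_pred).mean()
    print(f"{label}: mean |observed - predicted| across bins = {gap:.3f}")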
Feature Importance
FEATURE IMPORTANCE (XGBoost)
============================
Rank | Feature | Importance
-----|------------------|----------
1 | vegas_implied_wp | 0.324
2 | elo_diff | 0.218
3 | sp_diff | 0.156
4 | net_eff_diff | 0.089
5 | home_advantage | 0.072
6 | off_diff | 0.054
7 | def_diff | 0.041
8 | home_recent_form | 0.028
9 | is_rivalry | 0.012
10 | is_conference | 0.006
Insight: the Vegas line is the most predictive feature, followed by the power-rating differentials.
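The importance table can be reproduced from the trained XGBoost model's feature_importances_ attribute, paired with the pipeline's feature list. A minimal sketch:

# Rank features by XGBoost's built-in importance scores
xgb_model = trainer.models['xgboost']
importance = pd.Series(xgb_model.feature_importances_, index=pipeline.feature_cols)
print(importance.sort_values(ascending=False).round(3))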
Weekly Performance Tracking
2023 SEASON PREDICTION TRACKING
===============================
Week | Games | Correct | Accuracy | Brier | Notes
-----|-------|---------|----------|--------|-------
1 | 58 | 39 | 67.2% | 0.215 | Early season volatility
2 | 62 | 44 | 71.0% | 0.194 |
3 | 60 | 47 | 78.3% | 0.158 | Strong week
4 | 55 | 40 | 72.7% | 0.182 |
...
12 | 52 | 40 | 76.9% | 0.161 |
13 | 48 | 37 | 77.1% | 0.155 | Rivalry week
14 | 12 | 9 | 75.0% | 0.168 | Championship week
SEASON TOTAL
Games: 680 | Correct: 504 | Accuracy: 74.1%
Brier Score: 0.172 | vs. Baseline: +8.3%
Accuracy by Confidence:
- High confidence: 82.4% (156/189)
- Medium confidence: 74.8% (224/300)
- Low confidence: 64.9% (124/191)
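The confidence breakdown is a simple aggregation over the tracked predictions. A sketch, assuming prediction_history has been populated each week by track_results:

# Accuracy by confidence tier, computed from the season's tracked predictions
season = pd.concat(system.prediction_history, ignore_index=True)
by_conf = season.groupby('confidence')['correct'].agg(['count', 'mean'])
print(by_conf.rename(columns={'count': 'games', 'mean': 'accuracy'}))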
Sample Prediction Output
==============================================================
WEEKLY GAME PREDICTIONS - WEEK 12
==============================================================
HIGH CONFIDENCE (15 games)
----------------------------------------
Georgia over Tennessee (81.2%)
Ohio State over Wisconsin (79.8%)
Alabama over Auburn (78.5%)
Michigan over Purdue (77.9%)
...
MEDIUM CONFIDENCE (28 games)
----------------------------------------
Oregon over Utah (68.4%)
Penn State over Rutgers (66.2%)
LSU over Texas A&M (64.1%)
...
LOW CONFIDENCE (12 games)
----------------------------------------
Florida vs Florida State (52.8% - Home: Florida)
USC vs UCLA (51.2% - Home: USC)
...
Lessons Learned
- Ensemble > Single Model: the weighted ensemble consistently outperformed every single model (74.1% vs. 73.2% accuracy for XGBoost on the 2023 holdout)
- Calibration Matters: post-hoc calibration improved the Brier score without changing accuracy
- Vegas is Informative: the spread remains the single most predictive feature
- Confidence Stratification: high-confidence predictions are significantly more accurate than low-confidence ones (82.4% vs. 64.9%)
- Early Season Challenge: model performance is weakest in weeks 1-3 due to limited current-season data
Production Deployment
SYSTEM ARCHITECTURE
===================
┌─────────────────────────────────────────────────────────────┐
│ WEEKLY WORKFLOW │
├─────────────────────────────────────────────────────────────┤
│ │
│ Monday: Data Update │
│ ├── Ingest previous week results │
│ ├── Update team statistics │
│ └── Refresh Elo ratings │
│ │
│ Tuesday: Model Refresh │
│ ├── Retrain on updated data (monthly) │
│ └── Validate calibration │
│ │
│ Wednesday: Prediction Generation │
│ ├── Generate week predictions │
│ ├── Calculate confidence levels │
│ └── Create editorial report │
│ │
│ Saturday/Sunday: Result Tracking │
│ ├── Record actual outcomes │
│ ├── Update performance metrics │
│ └── Flag model drift if needed │
│ │
└─────────────────────────────────────────────────────────────┘
Latency: Predictions generated in <30 seconds
Accuracy Target: >72% overall, >80% high-confidence
Monitoring: Weekly performance reports, drift detection
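The drift flag in the weekend step can be implemented as a simple rolling check of the weekly metrics returned by track_results against the stated targets. A sketch; the four-week window and the Brier ceiling are assumptions:

def check_model_drift(weekly_metrics: list, window: int = 4,
                      accuracy_floor: float = 0.72, brier_ceiling: float = 0.20) -> bool:
    """Flag drift when the rolling average over the last `window` weeks misses targets."""
    recent = weekly_metrics[-window:]
    if len(recent) < window:
        return False  # not enough history yet
    avg_accuracy = sum(m['accuracy'] for m in recent) / window
    avg_brier = sum(m['brier'] for m in recent) / window
    return avg_accuracy < accuracy_floor or avg_brier > brier_ceiling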