Case Study 1: Building a College Football Win Probability System
Overview
This case study develops a complete win probability system for college football, from data preparation through model deployment and in-game application.
Business Context
A college football analytics team needs:
- Real-time win probability for broadcasts
- Fourth-down decision recommendations
- Post-game analysis of key moments
- Player WPA attribution
Data Description
# Play-by-play data structure
pbp_schema = {
    'game_id': 'unique game identifier',
    'play_id': 'play number within game',
    'home_team': 'home team name',
    'away_team': 'away team name',
    # Game state
    'home_score': 'int',
    'away_score': 'int',
    'quarter': 'int (1-4, or 5 for OT)',
    'seconds_remaining': 'seconds remaining in quarter',
    'yard_line': "int (1-99, measured from the offense's own goal line)",
    'down': 'int (1-4)',
    'distance': 'int (yards to first down)',
    'home_possession': 'bool',
    # Team info
    'home_pregame_wp': 'pregame home win probability',
    'home_elo': 'home Elo rating',
    'away_elo': 'away Elo rating',
    # Outcome
    'home_win': 'bool (game outcome)'
}
sample_data = {
    'seasons': '2018-2023',
    'total_games': 4200,
    'total_plays': 680000,
    'features': 15,
    'outcome_coverage': 1.0
}
Implementation
Step 1: Data Preparation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


class WPDataPipeline:
    """Prepare data for win probability modeling."""

    def __init__(self):
        self.feature_cols = None

    def load_and_clean(self, filepath: str) -> pd.DataFrame:
        """Load and clean play-by-play data."""
        df = pd.read_csv(filepath)

        # Remove overtime (different dynamics)
        df = df[df['quarter'] <= 4]

        # Remove garbage time
        df = df[~self._is_garbage_time(df)]

        # Calculate derived features
        df = self._engineer_features(df)
        return df

    def _is_garbage_time(self, df: pd.DataFrame) -> pd.Series:
        """Identify garbage time plays."""
        score_diff = abs(df['home_score'] - df['away_score'])
        time_remaining = (4 - df['quarter']) * 900 + df['seconds_remaining']
        return (score_diff >= 28) & (time_remaining < 900)

    def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create model features."""
        df = df.copy()

        # Score differential (home perspective)
        df['score_diff'] = df['home_score'] - df['away_score']

        # Possessing team's score differential
        df['poss_score_diff'] = np.where(
            df['home_possession'],
            df['score_diff'],
            -df['score_diff']
        )

        # Time features
        df['game_seconds'] = (4 - df['quarter']) * 900 + df['seconds_remaining']
        df['game_pct_remaining'] = df['game_seconds'] / 3600

        # Field position (0-1 scale)
        df['field_position_pct'] = df['yard_line'] / 100

        # Down features
        df['down_1'] = (df['down'] == 1).astype(int)
        df['down_2'] = (df['down'] == 2).astype(int)
        df['down_3'] = (df['down'] == 3).astype(int)
        df['down_4'] = (df['down'] == 4).astype(int)

        # Distance features
        df['distance_norm'] = df['distance'].clip(upper=20) / 20

        # Interaction features
        df['score_time'] = df['score_diff'] * df['game_pct_remaining']
        df['score_per_time'] = df['score_diff'] / (df['game_pct_remaining'] + 0.01)

        # Situational indicators
        df['is_red_zone'] = (df['yard_line'] >= 80).astype(int)
        df['is_fg_range'] = (df['yard_line'] >= 60).astype(int)
        df['trailing_late'] = (
            (df['poss_score_diff'] < 0) &
            (df['game_seconds'] < 600)
        ).astype(int)

        # Team strength
        df['elo_diff'] = df.get('home_elo', 1500) - df.get('away_elo', 1500)
        df['pregame_wp'] = df.get('home_pregame_wp', 0.5)

        # Define feature columns
        self.feature_cols = [
            'score_diff', 'game_pct_remaining', 'field_position_pct',
            'down_1', 'down_2', 'down_3', 'down_4', 'distance_norm',
            'score_time', 'is_red_zone', 'trailing_late',
            'pregame_wp'
        ]
        return df

    def prepare_train_test(self,
                           df: pd.DataFrame,
                           test_size: float = 0.2):
        """Prepare train/test split."""
        X = df[self.feature_cols].values
        y = df['home_win'].values
        return train_test_split(X, y, test_size=test_size, random_state=42)
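Putting the pipeline together, typical usage might look like the sketch below; the CSV path is a placeholder. One design note: train_test_split here splits at the play level, so plays from the same game can land in both sets; splitting on game_id (for example with GroupShuffleSplit) would avoid that leakage if it becomes a concern.

# Hypothetical usage; the CSV path is a placeholder.
pipeline = WPDataPipeline()
plays = pipeline.load_and_clean('pbp_2018_2023.csv')
X_train, X_test, y_train, y_test = pipeline.prepare_train_test(plays, test_size=0.2)

print(f'{len(plays)} plays, {len(pipeline.feature_cols)} features')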
Step 2: Model Training
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss, log_loss, roc_auc_score


class WPModelTrainer:
    """Train and evaluate WP models."""

    def __init__(self):
        self.models = {}

    def train_logistic(self, X_train, y_train):
        """Train logistic regression baseline."""
        model = LogisticRegression(C=1.0, max_iter=1000)
        model.fit(X_train, y_train)
        self.models['logistic'] = model
        return model

    def train_gradient_boosting(self, X_train, y_train):
        """Train gradient boosting model."""
        model = GradientBoostingClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42
        )
        model.fit(X_train, y_train)
        self.models['gbm'] = model
        return model

    def evaluate_model(self, model, X_test, y_test) -> dict:
        """Evaluate model performance."""
        predictions = model.predict_proba(X_test)[:, 1]
        return {
            'brier_score': brier_score_loss(y_test, predictions),
            'log_loss': log_loss(y_test, predictions),
            'auc': roc_auc_score(y_test, predictions)
        }

    def compare_models(self, X_test, y_test) -> pd.DataFrame:
        """Compare all trained models."""
        results = []
        for name, model in self.models.items():
            metrics = self.evaluate_model(model, X_test, y_test)
            results.append({'model': name, **metrics})
        return pd.DataFrame(results).sort_values('brier_score')
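Continuing from the split above, a short sketch of training both models and comparing them on the held-out plays:

# Hypothetical usage, continuing from the pipeline split above.
trainer = WPModelTrainer()
trainer.train_logistic(X_train, y_train)
trainer.train_gradient_boosting(X_train, y_train)

comparison = trainer.compare_models(X_test, y_test)
print(comparison)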
Step 3: Calibration
from sklearn.isotonic import IsotonicRegression


class WPCalibrator:
    """Calibrate win probability model."""

    def __init__(self, model):
        self.model = model
        self.calibrator = IsotonicRegression(out_of_bounds='clip')

    def fit(self, X_cal, y_cal):
        """Fit calibration on validation set."""
        raw_probs = self.model.predict_proba(X_cal)[:, 1]
        self.calibrator.fit(raw_probs, y_cal)

    def predict(self, X) -> np.ndarray:
        """Get calibrated predictions."""
        raw_probs = self.model.predict_proba(X)[:, 1]
        return self.calibrator.predict(raw_probs)

    def evaluate_calibration(self, X_test, y_test, n_bins=10):
        """Evaluate calibration quality."""
        predictions = self.predict(X_test)
        bins = np.linspace(0, 1, n_bins + 1)

        results = []
        for i in range(n_bins):
            # Make the final bin right-inclusive so a prediction of exactly 1.0 is counted
            if i == n_bins - 1:
                mask = (predictions >= bins[i]) & (predictions <= bins[i + 1])
            else:
                mask = (predictions >= bins[i]) & (predictions < bins[i + 1])
            if mask.sum() > 0:
                results.append({
                    'bin': f'{bins[i]:.1f}-{bins[i+1]:.1f}',
                    'predicted': predictions[mask].mean(),
                    'actual': y_test[mask].mean(),
                    'count': mask.sum()
                })

        cal_df = pd.DataFrame(results)
        ece = np.average(
            np.abs(cal_df['predicted'] - cal_df['actual']),
            weights=cal_df['count']
        )
        return {'calibration': cal_df, 'ece': ece}
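Tying Step 3 to the earlier steps, a sketch of calibrating the GBM. Splitting the held-out data into a calibration fold and an evaluation fold is an illustration choice here, so the calibrator is not scored on the same plays it was fit on.

from sklearn.model_selection import train_test_split

# Hypothetical usage: carve a calibration fold out of the held-out plays.
X_cal, X_eval, y_cal, y_eval = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

calibrator = WPCalibrator(trainer.models['gbm'])
calibrator.fit(X_cal, y_cal)

report = calibrator.evaluate_calibration(X_eval, y_eval, n_bins=10)
print(f"ECE: {report['ece']:.3f}")
print(report['calibration'])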
Results
Model Comparison
WIN PROBABILITY MODEL COMPARISON
================================
Model | Brier Score | Log Loss | AUC
----------------|-------------|----------|------
GBM Calibrated | 0.168 | 0.492 | 0.846
GBM Raw | 0.175 | 0.512 | 0.846
Logistic | 0.182 | 0.528 | 0.831
Calibration improved Brier Score by 4%
Calibration Analysis
CALIBRATION QUALITY
===================
Before Calibration (GBM):
ECE: 0.032
MCE: 0.065
After Calibration:
ECE: 0.018
MCE: 0.038
Calibration curve shows improved alignment with
perfect calibration line after isotonic regression.
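For reference, the ECE reported here is the count-weighted average of the per-bin gaps between predicted and observed win rate, and MCE is the largest single-bin gap. evaluate_calibration only returns ECE, but MCE follows directly from the calibration table it produces; a small sketch, continuing from the usage above:

# MCE: the largest absolute gap between predicted and observed win rate in any bin.
cal_df = report['calibration']
mce = (cal_df['predicted'] - cal_df['actual']).abs().max()
print(f"MCE: {mce:.3f}")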
Feature Importance
FEATURE IMPORTANCE (GBM)
========================
Rank | Feature | Importance
-----|--------------------|-----------
1 | score_time | 0.312
2 | score_diff | 0.245
3 | game_pct_remaining | 0.178
4 | pregame_wp | 0.089
5 | field_position_pct | 0.056
6 | trailing_late | 0.042
7 | is_red_zone | 0.028
8 | distance_norm | 0.021
...
Key insight: Score-time interaction is most important,
capturing how score differential matters more late in games.
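A ranking like the one above can be read off the fitted GBM's impurity-based importances, paired with the pipeline's feature list; a sketch, assuming the objects from the earlier steps:

# Hypothetical: rank features by the GBM's impurity-based importances.
importance = pd.DataFrame({
    'feature': pipeline.feature_cols,
    'importance': trainer.models['gbm'].feature_importances_
}).sort_values('importance', ascending=False)
print(importance)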
Application: Live Game Example
GAME: Alabama vs Georgia (SEC Championship)
===========================================
PRE-GAME:
Alabama pregame WP: 48% (slight underdog)
KEY MOMENT 1: Q2 4:32 - Georgia TD
Score: Georgia 14, Alabama 7
WP Before: Alabama 41%
WP After: Alabama 32%
WPA: -9%
KEY MOMENT 2: Q3 11:45 - Alabama INT returned for TD
Score: Alabama 21, Georgia 14
WP Before: Alabama 38%
WP After: Alabama 62%
WPA: +24% (game's highest WPA play)
KEY MOMENT 3: Q4 2:15 - 4th Down Conversion
Situation: Alabama 4th & 3 at own 32, down 3
GO FOR IT: Expected WP = 41%
PUNT: Expected WP = 38%
Recommendation: GO FOR IT (+3% WP advantage)
Actual: Alabama converted, WP jumped to 48%
FINAL:
Alabama 28, Georgia 24
Alabama WP at final whistle: 100%
Total WP swing: +52% (from 48% to 100%)
Top WPA Plays:
1. Q3 INT return TD: +24%
2. Q4 game-winning TD: +18%
3. Q4 4th down conversion: +7%
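The fourth-down call at KEY MOMENT 3 comes from comparing the expected win probability of each option. The sketch below illustrates that comparison only; encode_state is an assumed helper that turns a game-state dict into the model's feature row, p_convert and punt_net are placeholder inputs rather than fitted estimates, and yard_line is assumed to be measured from the offense's own goal line.

def fourth_down_decision(model, encode_state, state, p_convert=0.55, punt_net=40):
    """Hypothetical sketch: expected WP of going for it vs. punting."""
    # Successful conversion: fresh set of downs past the sticks.
    made = dict(state, down=1, distance=10,
                yard_line=min(state['yard_line'] + state['distance'], 99))
    # Turnover on downs: opponent takes over at the same spot (flipped field).
    failed = dict(state, down=1, distance=10,
                  yard_line=100 - state['yard_line'],
                  home_possession=not state['home_possession'])
    # Punt: opponent takes over roughly punt_net yards downfield.
    punted = dict(state, down=1, distance=10,
                  yard_line=100 - min(state['yard_line'] + punt_net, 99),
                  home_possession=not state['home_possession'])

    decider_is_home = state['home_possession']

    def wp(s):
        # Win probability from the deciding team's perspective.
        home_wp = model.predict_proba(encode_state(s))[0, 1]
        return home_wp if decider_is_home else 1 - home_wp

    wp_go = p_convert * wp(made) + (1 - p_convert) * wp(failed)
    wp_punt = wp(punted)
    return {'go_for_it': wp_go, 'punt': wp_punt,
            'recommendation': 'GO FOR IT' if wp_go > wp_punt else 'PUNT'}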
Lessons Learned
- Score-Time Interaction Critical: The interaction between score and time is more important than either alone.
- Calibration Matters: Post-hoc calibration improved prediction quality significantly.
- Pregame WP Valuable: Including team strength improved accuracy by ~2%.
- GBM vs Logistic: Gradient boosting provided an 8% Brier score improvement.
- Real-Time Application: The model runs in <1 ms, suitable for live broadcasts.
Production Implementation
import joblib
from typing import Generator


class ProductionWPModel:
    """Production-ready WP model."""

    def __init__(self, model_path: str):
        self.model = joblib.load(model_path)
        self.feature_cols = [...]  # Feature list
        self.prev_wp = None

    def predict(self, game_state: dict) -> float:
        """Get win probability for game state."""
        # _encode_state builds the feature row in feature_cols order (not shown)
        features = self._encode_state(game_state)
        return self.model.predict_proba(features)[0, 1]

    def get_fourth_down_analysis(self, game_state: dict) -> dict:
        """Analyze fourth down decision."""
        # ... implementation
        pass

    def track_game(self, pbp_stream) -> Generator:
        """Stream WP updates for live game."""
        for play in pbp_stream:
            wp = self.predict(play)
            yield {
                'play_id': play['play_id'],
                'wp': wp,
                'wpa': wp - self.prev_wp if self.prev_wp is not None else 0.0
            }
            self.prev_wp = wp
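Finally, the production class expects a serialized model on disk. Below is a sketch of producing that artifact from the earlier training step and wiring it up; the path is a placeholder, and feature_cols plus _encode_state above must be filled in before track_game can stream a live feed.

import joblib

# Hypothetical: persist the trained GBM for the production service.
joblib.dump(trainer.models['gbm'], 'models/wp_gbm.joblib')

wp_model = ProductionWPModel('models/wp_gbm.joblib')
# Once feature_cols and _encode_state are defined, a live broadcast loop is just:
#   for update in wp_model.track_game(pbp_stream):
#       print(update['play_id'], update['wp'], update['wpa'])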