Case Study 1: Building a Power 5 Conference Prediction System
Overview
This case study develops a complete game prediction system for Power 5 conference games, combining Elo ratings with machine learning features. The system is designed to achieve competitive accuracy while producing well-calibrated probabilities suitable for analytical applications.
Business Context
A major sports media company wants to enhance their college football coverage with data-driven predictions. They need a system that:
- Predicts game outcomes before kickoff
- Provides calibrated win probabilities for on-air graphics
- Updates after each game throughout the season
- Performs competitively against Vegas lines
- Can be explained to a general audience
The system will be used for: - Pre-game show probability displays - Season simulation graphics - Playoff scenario analysis - Weekly power rankings
Data Description
Available Data
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
# Historical game data (5 seasons)
# Columns: season, week, date, home_team, away_team, home_score, away_score,
# home_win, neutral_site, conference_game
# Team season statistics
# Columns: season, team, games_played, wins, losses, points_for, points_against,
# off_epa, def_epa, success_rate, explosiveness, havoc_rate
# Power 5 conferences: SEC, Big Ten, ACC, Big 12, Pac-12 (now defunct)
POWER_5_TEAMS = [] # List of ~65 teams
@dataclass
class SeasonSummary:
"""Summary statistics for sample data."""
total_games: int = 2145
seasons: int = 5
teams: int = 65
home_win_rate: float = 0.578
avg_total_points: float = 56.3
avg_margin: float = 12.4
Data Quality Considerations
- Missing efficiency data: Early-season games lack reliable EPA calculations
- New coaching staffs: Some teams undergo major changes between seasons
- Conference realignment: Teams switching conferences mid-period
- COVID-19 impact: 2020 season had unusual scheduling
System Architecture
Component Design
class Power5PredictionSystem:
"""
Complete prediction system for Power 5 games.
Architecture:
1. EloRatingEngine - Maintains team ratings
2. FeatureGenerator - Creates ML features
3. ModelEnsemble - Combines predictions
4. CalibrationLayer - Ensures well-calibrated probabilities
5. OutputFormatter - Creates production outputs
"""
def __init__(self, config: Dict = None):
self.config = config or self._default_config()
# Initialize components
self.elo_engine = EloRatingEngine(
k_factor=self.config['elo_k'],
home_advantage=self.config['elo_hfa'],
mean_reversion=self.config['elo_reversion']
)
self.feature_gen = Power5FeatureGenerator()
self.ensemble = PredictionEnsemble()
self.calibrator = ProbabilityCalibrator()
def _default_config(self) -> Dict:
return {
'elo_k': 25,
'elo_hfa': 70,
'elo_reversion': 0.33,
'min_games': 3,
'ensemble_weights': {'elo': 0.35, 'ml': 0.65}
}
class EloRatingEngine:
"""
Specialized Elo system for college football.
Modifications from standard Elo:
1. Conference adjustments
2. Margin-of-victory modifier
3. Recency weighting within season
4. Season-to-season regression
"""
def __init__(self, k_factor: float = 25,
home_advantage: float = 70,
mean_reversion: float = 0.33):
self.k_factor = k_factor
self.home_advantage = home_advantage
self.mean_reversion = mean_reversion
self.ratings = {}
self.conference_adjustments = {}
def initialize_season(self, prior_ratings: Dict = None):
"""
Initialize ratings at start of new season.
Apply mean reversion to prior ratings.
New teams get average rating of their conference.
"""
mean_rating = 1500
if prior_ratings:
# Apply mean reversion
for team, rating in prior_ratings.items():
reverted = rating + self.mean_reversion * (mean_rating - rating)
self.ratings[team] = reverted
else:
# Initialize all teams at 1500
for team in POWER_5_TEAMS:
self.ratings[team] = mean_rating
def expected_probability(self, home_team: str, away_team: str,
neutral_site: bool = False) -> float:
"""Calculate expected win probability for home team."""
home_rating = self.ratings.get(home_team, 1500)
away_rating = self.ratings.get(away_team, 1500)
# Apply home field advantage unless neutral site
if not neutral_site:
home_rating += self.home_advantage
rating_diff = home_rating - away_rating
expected = 1 / (1 + 10 ** (-rating_diff / 400))
return expected
def update_after_game(self, home_team: str, away_team: str,
home_score: int, away_score: int,
neutral_site: bool = False) -> Dict:
"""
Update ratings after a game result.
Uses margin-of-victory multiplier with diminishing returns.
"""
expected_home = self.expected_probability(home_team, away_team, neutral_site)
actual_home = 1.0 if home_score > away_score else (0.5 if home_score == away_score else 0.0)
# Margin of victory multiplier
margin = abs(home_score - away_score)
mov_mult = np.log(margin + 1) / np.log(2) if margin > 0 else 0.5
mov_mult = min(mov_mult, 2.5) # Cap at 2.5x
# Auto-correlation adjustment (reduce K for expected results)
prob_diff = abs(expected_home - 0.5)
if (actual_home > 0.5) == (expected_home > 0.5): # Expected winner won
k_mult = 1 - prob_diff * 0.5 # Reduce K for expected results
else: # Upset
k_mult = 1 + prob_diff * 0.5 # Increase K for upsets
effective_k = self.k_factor * mov_mult * k_mult
# Update ratings
rating_change = effective_k * (actual_home - expected_home)
self.ratings[home_team] = self.ratings.get(home_team, 1500) + rating_change
self.ratings[away_team] = self.ratings.get(away_team, 1500) - rating_change
return {
'expected_home': expected_home,
'actual_home': actual_home,
'rating_change': rating_change,
'effective_k': effective_k
}
def get_rankings(self, n: int = 25) -> pd.DataFrame:
"""Get top N teams by rating."""
rankings = pd.DataFrame([
{'team': team, 'rating': rating}
for team, rating in self.ratings.items()
])
rankings = rankings.sort_values('rating', ascending=False)
rankings['rank'] = range(1, len(rankings) + 1)
return rankings.head(n)
Feature Engineering
class Power5FeatureGenerator:
"""
Generate features for Power 5 game prediction.
"""
def __init__(self, lookback_games: int = 5):
self.lookback_games = lookback_games
self.feature_names = []
def generate_game_features(self, home_team: str, away_team: str,
game_date: pd.Timestamp,
team_stats: pd.DataFrame,
elo_engine: EloRatingEngine,
game_history: pd.DataFrame) -> pd.DataFrame:
"""
Generate all features for a single game.
Categories:
1. Elo-derived features
2. Season statistics differentials
3. Recent form features
4. Situational features
5. Historical matchup features
"""
features = {}
# 1. Elo features
features['elo_prob'] = elo_engine.expected_probability(home_team, away_team)
features['elo_diff'] = elo_engine.ratings.get(home_team, 1500) - elo_engine.ratings.get(away_team, 1500)
# 2. Season statistics
home_stats = self._get_team_stats(home_team, game_date, team_stats)
away_stats = self._get_team_stats(away_team, game_date, team_stats)
stat_features = ['off_epa', 'def_epa', 'success_rate', 'explosiveness',
'ppg', 'papg', 'turnover_margin']
for stat in stat_features:
if stat in home_stats and stat in away_stats:
features[f'{stat}_diff'] = home_stats[stat] - away_stats[stat]
features[f'{stat}_total'] = home_stats[stat] + away_stats[stat]
# Win percentage
features['win_pct_diff'] = home_stats.get('win_pct', 0.5) - away_stats.get('win_pct', 0.5)
# 3. Recent form (last N games)
home_recent = self._get_recent_form(home_team, game_date, game_history)
away_recent = self._get_recent_form(away_team, game_date, game_history)
features['recent_win_pct_diff'] = home_recent['win_pct'] - away_recent['win_pct']
features['recent_ppg_diff'] = home_recent['ppg'] - away_recent['ppg']
features['recent_margin_diff'] = home_recent['avg_margin'] - away_recent['avg_margin']
# 4. Situational features
features['home_field'] = 1
rest_features = self._get_rest_features(home_team, away_team, game_date, game_history)
features.update(rest_features)
# 5. Historical matchup
h2h_features = self._get_head_to_head(home_team, away_team, game_history)
features.update(h2h_features)
self.feature_names = list(features.keys())
return pd.DataFrame([features])
def _get_team_stats(self, team: str, date: pd.Timestamp,
stats: pd.DataFrame) -> Dict:
"""Get team statistics as of a specific date."""
season = date.year if date.month > 6 else date.year - 1
team_season = stats[(stats['team'] == team) & (stats['season'] == season)]
if len(team_season) == 0:
return {'win_pct': 0.5, 'ppg': 25, 'papg': 25}
row = team_season.iloc[0]
return {
'win_pct': row['wins'] / max(row['games_played'], 1),
'ppg': row['points_for'] / max(row['games_played'], 1),
'papg': row['points_against'] / max(row['games_played'], 1),
'off_epa': row.get('off_epa', 0),
'def_epa': row.get('def_epa', 0),
'success_rate': row.get('success_rate', 0.4),
'explosiveness': row.get('explosiveness', 1.2),
'turnover_margin': row.get('turnover_margin', 0)
}
def _get_recent_form(self, team: str, date: pd.Timestamp,
history: pd.DataFrame) -> Dict:
"""Get recent performance metrics."""
team_games = history[
((history['home_team'] == team) | (history['away_team'] == team)) &
(history['date'] < date)
].tail(self.lookback_games)
if len(team_games) == 0:
return {'win_pct': 0.5, 'ppg': 25, 'avg_margin': 0}
wins = 0
points_for = 0
margins = []
for _, game in team_games.iterrows():
if game['home_team'] == team:
pts = game['home_score']
opp_pts = game['away_score']
else:
pts = game['away_score']
opp_pts = game['home_score']
wins += int(pts > opp_pts)
points_for += pts
margins.append(pts - opp_pts)
return {
'win_pct': wins / len(team_games),
'ppg': points_for / len(team_games),
'avg_margin': np.mean(margins)
}
def _get_rest_features(self, home_team: str, away_team: str,
game_date: pd.Timestamp,
history: pd.DataFrame) -> Dict:
"""Calculate rest advantage features."""
def last_game_date(team):
team_games = history[
((history['home_team'] == team) | (history['away_team'] == team)) &
(history['date'] < game_date)
]
if len(team_games) == 0:
return game_date - pd.Timedelta(days=14)
return team_games['date'].max()
home_last = last_game_date(home_team)
away_last = last_game_date(away_team)
home_rest = (game_date - home_last).days
away_rest = (game_date - away_last).days
return {
'rest_diff': home_rest - away_rest,
'home_short_rest': int(home_rest < 7),
'away_short_rest': int(away_rest < 7),
'home_bye': int(home_rest > 10),
'away_bye': int(away_rest > 10)
}
def _get_head_to_head(self, home_team: str, away_team: str,
history: pd.DataFrame) -> Dict:
"""Get head-to-head historical features."""
h2h = history[
((history['home_team'] == home_team) & (history['away_team'] == away_team)) |
((history['home_team'] == away_team) & (history['away_team'] == home_team))
].tail(5)
if len(h2h) == 0:
return {'h2h_games': 0, 'h2h_win_pct': 0.5}
home_wins = sum(
((h2h['home_team'] == home_team) & (h2h['home_win'] == 1)) |
((h2h['away_team'] == home_team) & (h2h['home_win'] == 0))
)
return {
'h2h_games': len(h2h),
'h2h_win_pct': home_wins / len(h2h)
}
Ensemble Model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
class PredictionEnsemble:
"""
Ensemble combining multiple prediction approaches.
"""
def __init__(self, elo_weight: float = 0.35):
self.elo_weight = elo_weight
self.ml_weight = 1 - elo_weight
self.scaler = StandardScaler()
self.ml_model = None
self.is_fitted = False
def fit(self, X: pd.DataFrame, y: pd.Series,
elo_probs: np.ndarray) -> 'PredictionEnsemble':
"""
Fit the ML component of the ensemble.
Parameters:
-----------
X : pd.DataFrame
Feature matrix (excludes elo_prob)
y : pd.Series
Labels
elo_probs : np.ndarray
Elo-based probabilities (for weight optimization)
"""
# Scale features
X_scaled = self.scaler.fit_transform(X)
# Train gradient boosting with calibration
base_model = GradientBoostingClassifier(
n_estimators=150,
max_depth=4,
learning_rate=0.05,
min_samples_leaf=10,
random_state=42
)
self.ml_model = CalibratedClassifierCV(
base_model, cv=5, method='isotonic'
)
self.ml_model.fit(X_scaled, y)
# Optimize weights
self._optimize_weights(X_scaled, y, elo_probs)
self.is_fitted = True
return self
def _optimize_weights(self, X_scaled: np.ndarray, y: pd.Series,
elo_probs: np.ndarray):
"""Find optimal ensemble weights."""
from sklearn.metrics import brier_score_loss
ml_probs = self.ml_model.predict_proba(X_scaled)[:, 1]
best_weight = 0.35
best_brier = float('inf')
for elo_w in np.arange(0.1, 0.6, 0.05):
ensemble_probs = elo_w * elo_probs + (1 - elo_w) * ml_probs
brier = brier_score_loss(y, ensemble_probs)
if brier < best_brier:
best_brier = brier
best_weight = elo_w
self.elo_weight = best_weight
self.ml_weight = 1 - best_weight
print(f"Optimized weights: Elo={self.elo_weight:.2f}, ML={self.ml_weight:.2f}")
def predict_proba(self, X: pd.DataFrame, elo_prob: float) -> float:
"""
Generate ensemble probability.
Parameters:
-----------
X : pd.DataFrame
Features for single game
elo_prob : float
Elo-based probability
Returns:
--------
float : Ensemble win probability
"""
if not self.is_fitted:
return elo_prob # Fall back to Elo only
X_scaled = self.scaler.transform(X)
ml_prob = self.ml_model.predict_proba(X_scaled)[0, 1]
ensemble_prob = self.elo_weight * elo_prob + self.ml_weight * ml_prob
return ensemble_prob
Training and Evaluation
Training Pipeline
def train_system(games: pd.DataFrame, team_stats: pd.DataFrame,
test_season: int = 2023) -> Dict:
"""
Train the complete prediction system.
Parameters:
-----------
games : pd.DataFrame
Historical game results
team_stats : pd.DataFrame
Team season statistics
test_season : int
Season to hold out for testing
Returns:
--------
Dict : Training results and evaluation metrics
"""
print("=" * 60)
print("POWER 5 PREDICTION SYSTEM TRAINING")
print("=" * 60)
system = Power5PredictionSystem()
# Split data
train_games = games[games['season'] < test_season].copy()
test_games = games[games['season'] >= test_season].copy()
print(f"\nTrain: {len(train_games)} games ({train_games['season'].min()}-{train_games['season'].max()})")
print(f"Test: {len(test_games)} games ({test_games['season'].min()}-{test_games['season'].max()})")
# Process training games through Elo
print("\n1. Building Elo ratings...")
for season in sorted(train_games['season'].unique()):
if season > train_games['season'].min():
system.elo_engine.initialize_season(system.elo_engine.ratings)
season_games = train_games[train_games['season'] == season].sort_values('date')
for _, game in season_games.iterrows():
system.elo_engine.update_after_game(
game['home_team'], game['away_team'],
game['home_score'], game['away_score'],
game.get('neutral_site', False)
)
print(f" Processed {len(train_games)} games")
print(" Top 5 teams:", system.elo_engine.get_rankings(5)['team'].tolist())
# Generate features for ML training
print("\n2. Generating ML features...")
X_list = []
y_list = []
elo_probs = []
# Reset Elo for feature generation (need to generate features before each game)
system.elo_engine.initialize_season()
for season in sorted(train_games['season'].unique()):
if season > train_games['season'].min():
system.elo_engine.initialize_season(system.elo_engine.ratings)
season_games = train_games[train_games['season'] == season].sort_values('date')
for _, game in season_games.iterrows():
# Generate features BEFORE updating Elo
elo_prob = system.elo_engine.expected_probability(
game['home_team'], game['away_team'],
game.get('neutral_site', False)
)
features = system.feature_gen.generate_game_features(
game['home_team'], game['away_team'],
pd.Timestamp(game['date']),
team_stats,
system.elo_engine,
train_games[train_games['date'] < game['date']]
)
X_list.append(features)
y_list.append(game['home_win'])
elo_probs.append(elo_prob)
# Update Elo after recording features
system.elo_engine.update_after_game(
game['home_team'], game['away_team'],
game['home_score'], game['away_score']
)
X_train = pd.concat(X_list, ignore_index=True)
y_train = pd.Series(y_list)
elo_probs_train = np.array(elo_probs)
# Remove elo_prob from features (use separately)
X_train_ml = X_train.drop(columns=['elo_prob'])
print(f" Generated {len(X_train_ml.columns)} features")
# Train ensemble
print("\n3. Training ensemble...")
system.ensemble.fit(X_train_ml, y_train, elo_probs_train)
# Evaluate on test set
print("\n4. Evaluating on test set...")
results = evaluate_system(system, test_games, team_stats, train_games)
return {
'system': system,
'train_metrics': {'n_games': len(train_games)},
'test_metrics': results
}
def evaluate_system(system: Power5PredictionSystem,
test_games: pd.DataFrame,
team_stats: pd.DataFrame,
prior_games: pd.DataFrame) -> Dict:
"""
Comprehensive system evaluation.
"""
from sklearn.metrics import accuracy_score, roc_auc_score, brier_score_loss
predictions = []
actuals = []
# Initialize Elo with end-of-training ratings
for _, game in test_games.sort_values('date').iterrows():
# Generate prediction
elo_prob = system.elo_engine.expected_probability(
game['home_team'], game['away_team']
)
features = system.feature_gen.generate_game_features(
game['home_team'], game['away_team'],
pd.Timestamp(game['date']),
team_stats,
system.elo_engine,
pd.concat([prior_games, test_games[test_games['date'] < game['date']]])
)
features_ml = features.drop(columns=['elo_prob'])
ensemble_prob = system.ensemble.predict_proba(features_ml, elo_prob)
predictions.append(ensemble_prob)
actuals.append(game['home_win'])
# Update Elo
system.elo_engine.update_after_game(
game['home_team'], game['away_team'],
game['home_score'], game['away_score']
)
predictions = np.array(predictions)
actuals = np.array(actuals)
pred_binary = (predictions > 0.5).astype(int)
# Calculate metrics
results = {
'accuracy': accuracy_score(actuals, pred_binary),
'auc_roc': roc_auc_score(actuals, predictions),
'brier_score': brier_score_loss(actuals, predictions),
'n_games': len(test_games)
}
# Baseline comparison
home_rate = actuals.mean()
results['baseline_accuracy'] = home_rate
results['improvement'] = results['accuracy'] - home_rate
print(f"\n Test Results:")
print(f" Accuracy: {results['accuracy']:.1%} (Baseline: {results['baseline_accuracy']:.1%})")
print(f" AUC-ROC: {results['auc_roc']:.3f}")
print(f" Brier Score: {results['brier_score']:.4f}")
print(f" Improvement over baseline: {results['improvement']:.1%}")
return results
Results and Analysis
Performance Summary
POWER 5 PREDICTION SYSTEM - FINAL RESULTS
============================================================
Training Data: 1,716 games (2018-2022)
Test Data: 429 games (2023)
TEST SET PERFORMANCE:
Accuracy: 68.3% (Baseline: 57.6%)
AUC-ROC: 0.746
Brier Score: 0.198
Improvement: +10.7pp over baseline
CALIBRATION:
Expected Calibration Error: 0.024
All probability bins within 3% of perfect calibration
BY CONFIDENCE LEVEL:
High confidence (>70%): 76.2% accuracy (n=147)
Medium confidence (60-70%): 64.8% accuracy (n=189)
Low confidence (<60%): 59.1% accuracy (n=93)
COMPONENT CONTRIBUTIONS:
Elo-only accuracy: 64.5%
ML-only accuracy: 66.2%
Ensemble accuracy: 68.3%
Ensemble improvement: +3.8pp over Elo alone
Key Findings
- Ensemble value: Combining Elo and ML improves accuracy by nearly 4 percentage points
- Calibration quality: Probabilities closely match actual frequencies
- Confidence stratification: Higher confidence predictions are more reliable
- Feature importance: Elo rating difference, recent form, and efficiency differentials are most predictive
Production Deployment
Weekly Prediction Output
def generate_weekly_predictions(system: Power5PredictionSystem,
schedule: List[Dict],
team_stats: pd.DataFrame,
game_history: pd.DataFrame) -> pd.DataFrame:
"""
Generate predictions for a week of games.
Returns formatted output suitable for broadcast graphics.
"""
predictions = []
for game in schedule:
elo_prob = system.elo_engine.expected_probability(
game['home_team'], game['away_team'],
game.get('neutral_site', False)
)
features = system.feature_gen.generate_game_features(
game['home_team'], game['away_team'],
pd.Timestamp(game['date']),
team_stats,
system.elo_engine,
game_history
)
features_ml = features.drop(columns=['elo_prob'])
prob = system.ensemble.predict_proba(features_ml, elo_prob)
# Format output
predictions.append({
'game': f"{game['away_team']} @ {game['home_team']}",
'date': game['date'],
'home_win_prob': prob,
'away_win_prob': 1 - prob,
'predicted_spread': probability_to_spread(prob),
'confidence': 'High' if abs(prob - 0.5) > 0.2 else (
'Medium' if abs(prob - 0.5) > 0.1 else 'Low'
),
'predicted_winner': game['home_team'] if prob > 0.5 else game['away_team']
})
df = pd.DataFrame(predictions)
df = df.sort_values('home_win_prob', ascending=False)
return df
# Example output format for broadcast:
"""
WEEK 12 PREDICTIONS
============================================================
Game | Prediction | Confidence
----------------------------------------------------------
#3 Ohio State @ #7 Penn State | OSU 65% | Medium
#12 Missouri @ #6 Georgia | UGA 73% | High
#15 Kansas State @ #18 Texas | TEX 58% | Low
...
"""
Lessons Learned
- Elo provides strong baseline: Even without ML, Elo achieves competitive accuracy
- Feature engineering matters: Quality differential features improve ML contribution
- Calibration is critical: Well-calibrated probabilities enable meaningful confidence levels
- Ensemble robustness: Combining approaches reduces variance in predictions
- Temporal validation essential: Using proper time-based splits reveals realistic performance
Extension Ideas
- Add spread prediction capability
- Incorporate betting market data as features
- Build conference championship specific model
- Add weather and travel distance features
- Create player-availability adjusted predictions