Case Study 1: Building a Game Prediction System

Overview

This case study develops a complete machine learning system for predicting college football game outcomes, from data collection through model deployment and weekly predictions.

Business Context

A sports media company needs:

- Weekly game outcome predictions for editorial content
- Confidence levels for each prediction
- Model performance tracking over the season
- Interpretable explanations for key matchups

Data Description

# Historical game data schema
game_schema = {
    'game_id': 'unique identifier',
    'season': 'year (2018-2023)',
    'week': 'week number',
    'home_team': 'team name',
    'away_team': 'team name',

    # Pre-game metrics
    'home_elo': 'pre-game Elo rating',
    'away_elo': 'pre-game Elo rating',
    'home_sp_plus': 'SP+ rating',
    'away_sp_plus': 'SP+ rating',
    'spread': 'Vegas spread (home perspective)',

    # Team statistics (entering game)
    'home_off_efficiency': 'offensive efficiency',
    'home_def_efficiency': 'defensive efficiency',
    'away_off_efficiency': 'offensive efficiency',
    'away_def_efficiency': 'defensive efficiency',

    # Situational
    'neutral_site': 'boolean',
    'conference_game': 'boolean',
    'rivalry': 'boolean',

    # Outcome
    'home_score': 'final score',
    'away_score': 'final score',
    'home_win': 'boolean'
}

data_summary = {
    'total_games': 4200,
    'seasons': 6,
    'features': 25,
    'home_win_rate': 0.57
}

Implementation

Step 1: Data Preparation Pipeline

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class GameDataPipeline:
    """Prepare game data for ML modeling."""

    def __init__(self):
        self.scaler = StandardScaler()
        self.feature_cols = []

    def load_and_clean(self, filepath: str) -> pd.DataFrame:
        """Load and clean game data."""
        df = pd.read_csv(filepath)

        # Remove games with missing key data
        required_cols = ['home_elo', 'away_elo', 'home_score', 'away_score']
        df = df.dropna(subset=required_cols)

        # Remove FCS games (incomplete data)
        df = df[df['fbs_game'] == True]

        # Create outcome variable
        df['home_win'] = (df['home_score'] > df['away_score']).astype(int)

        return df

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create features for prediction."""
        df = df.copy()

        # Power rating differentials
        df['elo_diff'] = df['home_elo'] - df['away_elo']
        df['sp_diff'] = df['home_sp_plus'] - df['away_sp_plus']

        # Efficiency differentials
        df['off_diff'] = df['home_off_efficiency'] - df['away_off_efficiency']
        df['def_diff'] = df['away_def_efficiency'] - df['home_def_efficiency']

        # Combined efficiency
        df['home_net_eff'] = df['home_off_efficiency'] - df['home_def_efficiency']
        df['away_net_eff'] = df['away_off_efficiency'] - df['away_def_efficiency']
        df['net_eff_diff'] = df['home_net_eff'] - df['away_net_eff']

        # Situational factors
        df['home_advantage'] = (~df['neutral_site']).astype(int)
        df['is_conference'] = df['conference_game'].astype(int)
        df['is_rivalry'] = df['rivalry'].astype(int)

        # Vegas implied win probability (assumes the spread is expressed as the
        # expected home margin, i.e. positive = home team favored)
        if 'spread' in df.columns:
            df['vegas_implied_wp'] = 1 / (1 + 10 ** (-df['spread'] / 10))

        # Recent form (rolling 3-game win %, prior games only); default to a
        # neutral 0.5 when outcomes are not yet available (e.g. upcoming games)
        if 'home_win' in df.columns:
            df['home_recent_form'] = df.groupby('home_team')['home_win'].transform(
                lambda x: x.shift().rolling(3, min_periods=1).mean()
            ).fillna(0.5)
        else:
            df['home_recent_form'] = 0.5

        self.feature_cols = [
            'elo_diff', 'sp_diff', 'off_diff', 'def_diff', 'net_eff_diff',
            'home_advantage', 'is_conference', 'is_rivalry',
            'vegas_implied_wp', 'home_recent_form'
        ]

        return df

    def create_temporal_split(self,
                              df: pd.DataFrame,
                              train_seasons: list,
                              val_season: int,
                              test_season: int) -> tuple:
        """Create temporal train/val/test splits."""
        train = df[df['season'].isin(train_seasons)]
        val = df[df['season'] == val_season]
        test = df[df['season'] == test_season]

        X_train = train[self.feature_cols].values
        X_val = val[self.feature_cols].values
        X_test = test[self.feature_cols].values

        y_train = train['home_win'].values
        y_val = val['home_win'].values
        y_test = test['home_win'].values

        # Scale features
        X_train = self.scaler.fit_transform(X_train)
        X_val = self.scaler.transform(X_val)
        X_test = self.scaler.transform(X_test)

        return (X_train, X_val, X_test, y_train, y_val, y_test)

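A minimal usage sketch for the pipeline, assuming the historical games live in a local CSV (games.csv is a placeholder path) and that 2018-2021 train, 2022 validates, and 2023 is held out for testing:

# Sketch: wiring the pipeline end to end (filepath and season split are assumptions)
pipeline = GameDataPipeline()
games = pipeline.load_and_clean('games.csv')
games = pipeline.engineer_features(games)

X_train, X_val, X_test, y_train, y_val, y_test = pipeline.create_temporal_split(
    games,
    train_seasons=[2018, 2019, 2020, 2021],
    val_season=2022,
    test_season=2023
)
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
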
Step 2: Model Training and Selection

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, brier_score_loss

class ModelTrainer:
    """Train and compare ML models."""

    def __init__(self):
        self.models = {}
        self.results = {}

    def train_all_models(self, X_train, y_train):
        """Train suite of classification models."""
        self.models = {
            'logistic': LogisticRegression(C=1.0, max_iter=1000),
            'random_forest': RandomForestClassifier(
                n_estimators=200, max_depth=6,
                min_samples_leaf=20, random_state=42
            ),
            'gbm': GradientBoostingClassifier(
                n_estimators=150, max_depth=4,
                learning_rate=0.1, random_state=42
            ),
            'xgboost': xgb.XGBClassifier(
                n_estimators=150, max_depth=4,
                learning_rate=0.1, random_state=42,
                use_label_encoder=False, eval_metric='logloss'
            )
        }

        for name, model in self.models.items():
            model.fit(X_train, y_train)
            print(f"Trained {name}")

    def evaluate_on_validation(self, X_val, y_val) -> pd.DataFrame:
        """Evaluate all models on validation set."""
        results = []

        for name, model in self.models.items():
            y_pred = model.predict(X_val)
            y_prob = model.predict_proba(X_val)[:, 1]

            results.append({
                'model': name,
                'accuracy': accuracy_score(y_val, y_pred),
                'auc': roc_auc_score(y_val, y_prob),
                'brier': brier_score_loss(y_val, y_prob)
            })

        return pd.DataFrame(results).sort_values('brier')

    def select_best_model(self, results: pd.DataFrame, metric='brier'):
        """Select best model (lower is better for Brier, higher for accuracy/AUC)."""
        if metric == 'brier':
            best_name = results.loc[results[metric].idxmin(), 'model']
        else:
            best_name = results.loc[results[metric].idxmax(), 'model']
        return best_name, self.models[best_name]

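A short sketch of driving the trainer, reusing the split produced by the Step 1 sketch:

trainer = ModelTrainer()
trainer.train_all_models(X_train, y_train)

val_results = trainer.evaluate_on_validation(X_val, y_val)
print(val_results)

best_name, best_model = trainer.select_best_model(val_results, metric='brier')
print(f"Best single model: {best_name}")
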
Step 3: Ensemble Construction

from sklearn.isotonic import IsotonicRegression

class GamePredictionEnsemble:
    """Ensemble model for game prediction."""

    def __init__(self, models: dict, weights: dict = None):
        self.models = models
        self.weights = weights or {name: 1/len(models) for name in models}
        self.calibrator = None

    def fit_weights(self, X_val, y_val):
        """Fit weights based on validation performance."""
        performances = {}

        for name, model in self.models.items():
            y_prob = model.predict_proba(X_val)[:, 1]
            brier = brier_score_loss(y_val, y_prob)
            # Inverse Brier (lower is better)
            performances[name] = 1 / (brier + 0.01)

        total = sum(performances.values())
        self.weights = {name: perf/total for name, perf in performances.items()}

        print("Ensemble weights:")
        for name, weight in sorted(self.weights.items(), key=lambda x: -x[1]):
            print(f"  {name}: {weight:.3f}")

    def predict_proba(self, X) -> np.ndarray:
        """Get weighted ensemble predictions."""
        weighted_sum = np.zeros(len(X))

        for name, model in self.models.items():
            proba = model.predict_proba(X)[:, 1]
            weighted_sum += self.weights[name] * proba

        return weighted_sum

    def fit_calibration(self, X_cal, y_cal):
        """Fit isotonic regression calibration."""
        raw_probs = self.predict_proba(X_cal)
        self.calibrator = IsotonicRegression(out_of_bounds='clip')
        self.calibrator.fit(raw_probs, y_cal)

    def predict_calibrated(self, X) -> np.ndarray:
        """Get calibrated predictions."""
        raw_probs = self.predict_proba(X)
        if self.calibrator:
            return self.calibrator.predict(raw_probs)
        return raw_probs

    def evaluate(self, X_test, y_test) -> dict:
        """Evaluate ensemble performance."""
        y_prob = self.predict_calibrated(X_test)
        y_pred = (y_prob > 0.5).astype(int)

        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_prob),
            'brier': brier_score_loss(y_test, y_prob)
        }

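A sketch of assembling and calibrating the ensemble on the validation season. Fitting the weights and the isotonic calibrator on the same fold is a simplification; a separate calibration split would be safer.

ensemble = GamePredictionEnsemble(trainer.models)
ensemble.fit_weights(X_val, y_val)
ensemble.fit_calibration(X_val, y_val)

print(ensemble.evaluate(X_test, y_test))
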
Step 4: Production System

class WeeklyPredictionSystem:
    """Production system for weekly predictions."""

    def __init__(self, ensemble: GamePredictionEnsemble, pipeline: GameDataPipeline):
        self.ensemble = ensemble
        self.pipeline = pipeline
        self.prediction_history = []

    def generate_predictions(self, upcoming_games: pd.DataFrame) -> pd.DataFrame:
        """Generate predictions for upcoming games."""
        # Engineer features
        games = self.pipeline.engineer_features(upcoming_games)
        X = games[self.pipeline.feature_cols].values
        X = self.pipeline.scaler.transform(X)

        # Get calibrated probabilities
        home_win_prob = self.ensemble.predict_calibrated(X)

        # Build predictions DataFrame
        predictions = pd.DataFrame({
            'game_id': games['game_id'],
            'home_team': games['home_team'],
            'away_team': games['away_team'],
            'home_win_prob': home_win_prob,
            'predicted_winner': np.where(
                home_win_prob > 0.5,
                games['home_team'],
                games['away_team']
            ),
            'confidence': self._calculate_confidence(home_win_prob),
            'model_agreement': self._calculate_agreement(X)
        })

        return predictions.sort_values('confidence', ascending=False)

    def _calculate_confidence(self, probs: np.ndarray) -> np.ndarray:
        """Assign confidence levels."""
        confidence = np.where(
            np.abs(probs - 0.5) > 0.25, 'High',
            np.where(np.abs(probs - 0.5) > 0.15, 'Medium', 'Low')
        )
        return confidence

    def _calculate_agreement(self, X: np.ndarray) -> np.ndarray:
        """Calculate model agreement percentage."""
        predictions = []
        for model in self.ensemble.models.values():
            pred = (model.predict_proba(X)[:, 1] > 0.5).astype(int)
            predictions.append(pred)

        predictions = np.array(predictions)
        agreement = np.apply_along_axis(
            lambda x: max(x.mean(), 1-x.mean()),
            axis=0, arr=predictions
        )
        return agreement

    def track_results(self, predictions: pd.DataFrame,
                      actual_results: pd.DataFrame):
        """Track prediction results for performance monitoring."""
        merged = predictions.merge(
            actual_results[['game_id', 'home_win']],
            on='game_id'
        )

        merged['correct'] = (
            (merged['home_win_prob'] > 0.5) == merged['home_win']
        ).astype(int)

        self.prediction_history.append(merged)

        return {
            'games': len(merged),
            'correct': merged['correct'].sum(),
            'accuracy': merged['correct'].mean(),
            'brier': brier_score_loss(merged['home_win'], merged['home_win_prob'])
        }

    def generate_report(self, predictions: pd.DataFrame) -> str:
        """Generate formatted prediction report."""
        report = []
        report.append("=" * 60)
        report.append("WEEKLY GAME PREDICTIONS")
        report.append("=" * 60)
        report.append("")

        for conf in ['High', 'Medium', 'Low']:
            conf_games = predictions[predictions['confidence'] == conf]
            if len(conf_games) == 0:
                continue

            report.append(f"\n{conf.upper()} CONFIDENCE ({len(conf_games)} games)")
            report.append("-" * 40)

            for _, game in conf_games.iterrows():
                winner = game['predicted_winner']
                prob = game['home_win_prob']
                if game['predicted_winner'] != game['home_team']:
                    prob = 1 - prob

                report.append(
                    f"  {winner} over "
                    f"{game['away_team'] if winner == game['home_team'] else game['home_team']}"
                    f" ({prob*100:.1f}%)"
                )

        return "\n".join(report)

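A sketch of a weekly run, assuming upcoming_week.csv and week_results.csv are placeholder files that share the historical schema (the upcoming file simply lacks final scores):

system = WeeklyPredictionSystem(ensemble, pipeline)

upcoming = pd.read_csv('upcoming_week.csv')   # placeholder path
predictions = system.generate_predictions(upcoming)
print(system.generate_report(predictions))

# After the games are played
actuals = pd.read_csv('week_results.csv')     # placeholder path
print(system.track_results(predictions, actuals))
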
Results

Model Comparison

MODEL PERFORMANCE (2023 Season Holdout)
========================================

Model           | Accuracy | AUC   | Brier Score
----------------|----------|-------|------------
XGBoost         | 73.2%    | 0.811 | 0.178
Gradient Boost  | 72.8%    | 0.806 | 0.181
Random Forest   | 71.5%    | 0.794 | 0.189
Logistic Reg    | 71.1%    | 0.788 | 0.192

Ensemble        | 74.1%    | 0.819 | 0.172
Calibrated Ens  | 74.1%    | 0.819 | 0.168

Key Findings:
- Calibrated ensemble improves Brier score by ~6% over the best single model (0.178 → 0.168)
- Ensembling alone accounts for ~3% of that gain; calibration adds the remaining ~2%
- XGBoost is the strongest single model
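
One way to sanity-check the calibration gain is a reliability curve on the holdout season, reusing the ensemble and split from the sketches above (the bin count is an assumption):

from sklearn.calibration import calibration_curve

# Compare predicted probabilities against observed win rates in 10 bins
probs = ensemble.predict_calibrated(X_test)
frac_pos, mean_pred = calibration_curve(y_test, probs, n_bins=10)
for p, f in zip(mean_pred, frac_pos):
    print(f"predicted {p:.2f} -> observed {f:.2f}")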

Feature Importance

FEATURE IMPORTANCE (XGBoost)
============================

Rank | Feature          | Importance
-----|------------------|----------
1    | vegas_implied_wp | 0.324
2    | elo_diff         | 0.218
3    | sp_diff          | 0.156
4    | net_eff_diff     | 0.089
5    | home_advantage   | 0.072
6    | off_diff         | 0.054
7    | def_diff         | 0.041
8    | home_recent_form | 0.028
9    | is_rivalry       | 0.012
10   | is_conference    | 0.006

Insight: The Vegas line is the most predictive feature,
followed by the power rating differentials.
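
The table above can be reproduced (up to training randomness) by pairing the fitted XGBoost model's importances with the pipeline's feature names; a sketch reusing objects from the earlier steps:

xgb_model = trainer.models['xgboost']
importance = pd.Series(
    xgb_model.feature_importances_,
    index=pipeline.feature_cols
).sort_values(ascending=False)
print(importance)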

Weekly Performance Tracking

2023 SEASON PREDICTION TRACKING
===============================

Week | Games | Correct | Accuracy | Brier  | Notes
-----|-------|---------|----------|--------|-------
1    | 58    | 39      | 67.2%    | 0.215  | Early season volatility
2    | 62    | 44      | 71.0%    | 0.194  |
3    | 60    | 47      | 78.3%    | 0.158  | Strong week
4    | 55    | 40      | 72.7%    | 0.182  |
...
12   | 52    | 40      | 76.9%    | 0.161  |
13   | 48    | 37      | 77.1%    | 0.155  | Rivalry week
14   | 12    | 9       | 75.0%    | 0.168  | Championship week

SEASON TOTAL
Games: 680 | Correct: 504 | Accuracy: 74.1%
Brier Score: 0.172 | vs. Baseline: +8.3%

Accuracy by Confidence:
- High confidence: 82.4% (156/189)
- Medium confidence: 74.7% (224/300)
- Low confidence: 64.9% (124/191)

Sample Prediction Output

==============================================================
WEEKLY GAME PREDICTIONS - WEEK 12
==============================================================

HIGH CONFIDENCE (15 games)
----------------------------------------
  Georgia over Tennessee (81.2%)
  Ohio State over Wisconsin (79.8%)
  Alabama over Auburn (78.5%)
  Michigan over Purdue (77.9%)
  ...

MEDIUM CONFIDENCE (28 games)
----------------------------------------
  Oregon over Utah (68.4%)
  Penn State over Rutgers (66.2%)
  LSU over Texas A&M (64.1%)
  ...

LOW CONFIDENCE (12 games)
----------------------------------------
  Florida vs Florida State (52.8% - Home: Florida)
  USC vs UCLA (51.2% - Home: USC)
  ...

Lessons Learned

  1. Ensemble > Single Model: The weighted ensemble consistently outperformed every single model, by roughly one percentage point of accuracy on the holdout season

  2. Calibration Matters: Post-hoc calibration improved Brier score without changing accuracy

  3. Vegas is Informative: The spread remains the single most predictive feature

  4. Confidence Stratification: High-confidence predictions were far more accurate than low-confidence ones (82.4% vs. 64.9% in 2023)

  5. Early Season Challenge: Model performance is weakest in weeks 1-3 due to limited current-season data

Production Deployment

SYSTEM ARCHITECTURE
===================

┌─────────────────────────────────────────────────────────────┐
│                    WEEKLY WORKFLOW                          │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Monday: Data Update                                        │
│  ├── Ingest previous week results                          │
│  ├── Update team statistics                                │
│  └── Refresh Elo ratings                                   │
│                                                             │
│  Tuesday: Model Refresh                                     │
│  ├── Retrain on updated data (monthly)                     │
│  └── Validate calibration                                  │
│                                                             │
│  Wednesday: Prediction Generation                           │
│  ├── Generate week predictions                             │
│  ├── Calculate confidence levels                           │
│  └── Create editorial report                               │
│                                                             │
│  Saturday/Sunday: Result Tracking                           │
│  ├── Record actual outcomes                                │
│  ├── Update performance metrics                            │
│  └── Flag model drift if needed                            │
│                                                             │
└─────────────────────────────────────────────────────────────┘

Latency: Predictions generated in <30 seconds
Accuracy Target: >72% overall, >80% high-confidence
Monitoring: Weekly performance reports, drift detection
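
Drift flagging can be as simple as comparing a rolling Brier score against the season target; a minimal sketch, where the 0.20 threshold and 3-week window are assumptions rather than part of the deployed system:

def check_drift(prediction_history, brier_threshold=0.20, window=3):
    """Flag drift if the rolling Brier score over recent weeks exceeds the threshold."""
    if not prediction_history:
        return False, None
    recent = pd.concat(prediction_history[-window:])
    recent_brier = brier_score_loss(recent['home_win'], recent['home_win_prob'])
    return recent_brier > brier_threshold, recent_brier

drifting, score = check_drift(system.prediction_history)
if drifting:
    print(f"Model drift flagged: rolling Brier {score:.3f}")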