Case Study 2: Upset Prediction and Variance Analysis
Overview
This case study focuses on predicting and analyzing upsets in college football—games where the underdog wins despite being predicted to lose. Understanding upset dynamics helps improve model confidence calibration and identifies systematic blind spots in prediction systems.
Business Context
A sports betting analytics firm wants to:
- Identify games with elevated upset potential
- Understand factors that contribute to upsets
- Improve model calibration for high-spread games
- Develop an "upset alert" feature for subscribers
Key questions:
- What distinguishes games that result in upsets?
- Are certain types of favorites more vulnerable?
- Can we predict which "likely" winners are actually at risk?
- How should confidence levels be adjusted for high-variance situations?
Defining Upsets
Classification Framework
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from enum import Enum
class UpsetCategory(Enum):
"""Categories of upset magnitude."""
MINOR = "minor" # Favorite by 3-7 points loses
MODERATE = "moderate" # Favorite by 7-14 points loses
MAJOR = "major" # Favorite by 14-21 points loses
MASSIVE = "massive" # Favorite by 21+ points loses
@dataclass
class UpsetDefinition:
"""
Framework for defining upsets.
An upset occurs when:
1. Pre-game favorite loses, OR
2. Underdog covers by significant margin
Upset severity based on:
- Pre-game win probability gap
- Point spread size
- Actual margin vs expected margin
"""
@staticmethod
def is_upset(predicted_prob: float, actual_win: bool,
threshold: float = 0.55) -> bool:
"""
Determine if game was an upset.
Parameters:
-----------
predicted_prob : float
Pre-game win probability for team A
actual_win : bool
Whether team A won
threshold : float
Minimum probability to be considered favorite
Returns:
--------
bool : True if result was an upset
"""
predicted_winner = predicted_prob > 0.5
return predicted_winner != actual_win and max(predicted_prob, 1-predicted_prob) >= threshold
@staticmethod
    def categorize_upset(predicted_spread: float, actual_margin: float) -> Optional[UpsetCategory]:
"""
Categorize upset by magnitude.
Parameters:
-----------
predicted_spread : float
Expected margin (positive = favorite)
actual_margin : float
Actual margin (from favorite's perspective)
"""
if actual_margin >= 0:
return None # Not an upset
margin_swing = predicted_spread - actual_margin
if margin_swing < 14:
return UpsetCategory.MINOR
elif margin_swing < 24:
return UpsetCategory.MODERATE
elif margin_swing < 35:
return UpsetCategory.MAJOR
else:
return UpsetCategory.MASSIVE
@staticmethod
def upset_severity_score(predicted_prob: float,
actual_result: bool) -> float:
"""
Calculate upset severity on 0-1 scale.
Higher scores indicate more surprising results.
"""
if predicted_prob > 0.5 and actual_result:
return 0 # Expected result
if predicted_prob < 0.5 and not actual_result:
return 0 # Expected result
# Upset occurred
expected_prob = predicted_prob if predicted_prob > 0.5 else 1 - predicted_prob
return expected_prob # Severity = how much we expected the other outcome
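As a quick, hypothetical sanity check of the definitions above, consider a 70% home favorite that was expected to win by 10 but loses by 7:

# Hypothetical example: a 70% favorite, predicted to win by 10, loses by 7.
prob, home_won = 0.70, False

print(UpsetDefinition.is_upset(prob, home_won))              # True
print(UpsetDefinition.categorize_upset(10.0, -7.0))          # swing = 10 - (-7) = 17 -> UpsetCategory.MODERATE
print(UpsetDefinition.upset_severity_score(prob, home_won))  # 0.70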
Upset Analysis
class UpsetAnalyzer:
"""
Comprehensive analysis of upsets in game data.
"""
def __init__(self, data: pd.DataFrame):
"""
Initialize with game data.
Required columns:
- predicted_prob: Model's pre-game probability for home team
- home_win: Actual outcome (0/1)
- home_score, away_score: Final scores
- predicted_spread: Expected margin
- Various feature columns for analysis
"""
self.data = data
self.upsets = None
self.analysis_results = {}
def identify_upsets(self, prob_threshold: float = 0.55) -> pd.DataFrame:
"""
Identify all upsets in the dataset.
Parameters:
-----------
prob_threshold : float
Minimum probability to be considered favorite
Returns:
--------
pd.DataFrame : Upset games with severity metrics
"""
upset_mask = self.data.apply(
lambda row: UpsetDefinition.is_upset(
row['predicted_prob'],
row['home_win'],
prob_threshold
),
axis=1
)
self.upsets = self.data[upset_mask].copy()
# Add severity metrics
self.upsets['upset_severity'] = self.upsets.apply(
lambda row: UpsetDefinition.upset_severity_score(
row['predicted_prob'],
row['home_win']
),
axis=1
)
self.upsets['actual_margin'] = (
self.upsets['home_score'] - self.upsets['away_score']
)
# Margin from favorite's perspective
self.upsets['favorite_margin'] = np.where(
self.upsets['predicted_prob'] > 0.5,
self.upsets['actual_margin'],
-self.upsets['actual_margin']
)
return self.upsets
def calculate_upset_rates(self) -> pd.DataFrame:
"""
Calculate upset rates by favorite probability.
Returns:
--------
pd.DataFrame : Upset rates by probability bin
"""
# Create probability bins
bins = [0.5, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 1.0]
labels = ['50-55%', '55-60%', '60-65%', '65-70%', '70-75%',
'75-80%', '80-85%', '85-90%', '90%+']
# Get favorite probability for each game
self.data['favorite_prob'] = self.data['predicted_prob'].apply(
lambda p: max(p, 1-p)
)
self.data['prob_bin'] = pd.cut(
self.data['favorite_prob'],
bins=bins,
labels=labels,
include_lowest=True
)
# Determine if favorite won
self.data['favorite_won'] = np.where(
self.data['predicted_prob'] > 0.5,
self.data['home_win'],
1 - self.data['home_win']
)
# Calculate rates by bin
rates = self.data.groupby('prob_bin', observed=True).agg({
'favorite_won': ['mean', 'count'],
'favorite_prob': 'mean'
}).round(3)
rates.columns = ['win_rate', 'n_games', 'avg_prob']
rates['upset_rate'] = 1 - rates['win_rate']
rates['expected_win_rate'] = rates['avg_prob']
rates['calibration_error'] = rates['win_rate'] - rates['expected_win_rate']
return rates.reset_index()
def analyze_upset_factors(self) -> Dict:
"""
Analyze factors that correlate with upsets.
Returns:
--------
Dict : Factor analysis results
"""
if self.upsets is None:
self.identify_upsets()
non_upsets = self.data[~self.data.index.isin(self.upsets.index)]
# Factors to analyze
factors = {
'rest_diff': 'Rest advantage',
'recent_form_diff': 'Recent form difference',
'turnover_diff': 'Turnover margin difference',
'home_field': 'Home field',
'week': 'Week of season'
}
analysis = {}
for col, name in factors.items():
if col not in self.data.columns:
continue
upset_mean = self.upsets[col].mean()
non_upset_mean = non_upsets[col].mean()
overall_mean = self.data[col].mean()
analysis[name] = {
'upset_mean': upset_mean,
'non_upset_mean': non_upset_mean,
'difference': upset_mean - non_upset_mean,
'pct_difference': (upset_mean - non_upset_mean) / (abs(overall_mean) + 0.01) * 100
}
self.analysis_results['factor_analysis'] = analysis
return analysis
def find_upset_patterns(self) -> Dict:
"""
Identify patterns in upset games.
Returns:
--------
Dict : Pattern analysis
"""
if self.upsets is None:
self.identify_upsets()
        patterns = {}
        # The groupings below rely on 'favorite_won'; compute it (via the
        # rate calculation) if it has not been created yet.
        if 'favorite_won' not in self.data.columns:
            self.calculate_upset_rates()
        # Upset rate by game type
if 'game_type' in self.data.columns:
by_type = self.data.groupby('game_type').apply(
lambda x: 1 - x['favorite_won'].mean()
)
patterns['by_game_type'] = by_type.to_dict()
# Upset rate by week
if 'week' in self.data.columns:
by_week = self.data.groupby('week').apply(
lambda x: 1 - x['favorite_won'].mean()
)
patterns['by_week'] = by_week.to_dict()
# Upset rate by conference matchup
if 'same_conference' in self.data.columns:
conf_upset = self.data.groupby('same_conference').apply(
lambda x: 1 - x['favorite_won'].mean()
)
patterns['conference_game'] = {
True: conf_upset.get(True, 0),
False: conf_upset.get(False, 0)
}
# Home vs away favorite
home_fav = self.data[self.data['predicted_prob'] > 0.5]
away_fav = self.data[self.data['predicted_prob'] < 0.5]
patterns['home_favorite_upset_rate'] = 1 - home_fav['home_win'].mean()
patterns['away_favorite_upset_rate'] = away_fav['home_win'].mean()
self.analysis_results['patterns'] = patterns
return patterns
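A minimal sketch of running the analyzer end to end, assuming `games_df` is a historical game table with the columns listed in the constructor docstring:

analyzer = UpsetAnalyzer(games_df)

upsets = analyzer.identify_upsets(prob_threshold=0.55)   # flag all upsets with severity metrics
rates = analyzer.calculate_upset_rates()                 # calibration by favorite-probability bin
factors = analyzer.analyze_upset_factors()               # upset vs. non-upset feature means
patterns = analyzer.find_upset_patterns()                # upset rates by game type, week, etc.

print(f"{len(upsets)} upsets in {len(games_df)} games")
print(rates[['prob_bin', 'win_rate', 'upset_rate', 'calibration_error']])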
Upset Prediction Model
Building an Upset Probability Model
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score
class UpsetPredictionModel:
"""
Model specifically for predicting upset probability.
Key insight: Standard game prediction focuses on WHO wins.
This model focuses on VARIANCE - when is a predicted winner
actually at risk?
"""
def __init__(self):
self.scaler = StandardScaler()
self.model = None
self.feature_columns = None
def engineer_upset_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Create features specifically for upset prediction.
These features capture volatility, not just team strength.
"""
features = data.copy()
# Base prediction features (how confident is the base model?)
features['predicted_margin'] = abs(features['predicted_spread'])
features['pred_prob_deviation'] = abs(features['predicted_prob'] - 0.5)
# Volatility indicators
if 'home_scoring_std' in data.columns:
features['scoring_volatility'] = (
features['home_scoring_std'] + features['away_scoring_std']
) / 2
# Underdog factors (things that help underdogs)
features['underdog_rest_advantage'] = np.where(
features['predicted_prob'] > 0.5,
-features.get('rest_diff', 0),
features.get('rest_diff', 0)
)
features['underdog_home'] = np.where(
features['predicted_prob'] > 0.5,
1 - features.get('home_field', 1),
features.get('home_field', 1)
)
# Recent form convergence (are teams trending toward each other?)
if 'recent_form_diff' in data.columns:
features['form_convergence'] = -abs(features['recent_form_diff'])
# Turnover risk (high turnover variance = upset potential)
if 'turnover_diff' in data.columns:
features['turnover_volatility'] = abs(features['turnover_diff'])
# Style matchup features
if 'off_epa_diff' in data.columns and 'def_epa_diff' in data.columns:
# Are teams mismatched in style?
features['style_mismatch'] = abs(
features['off_epa_diff'] - features['def_epa_diff']
)
# Historical upset indicators
if 'h2h_upset_history' in data.columns:
features['h2h_upset_history'] = data['h2h_upset_history']
return features
def fit(self, X: pd.DataFrame, y: pd.Series) -> 'UpsetPredictionModel':
"""
Train the upset prediction model.
Parameters:
-----------
X : pd.DataFrame
Features
y : pd.Series
Binary upset indicator (1 = upset occurred)
"""
# Engineer features
X_eng = self.engineer_upset_features(X)
# Select relevant columns
self.feature_columns = [
col for col in X_eng.columns
if col not in ['predicted_prob', 'home_win', 'home_score',
'away_score', 'date', 'home_team', 'away_team']
]
X_model = X_eng[self.feature_columns].fillna(0)
# Scale
X_scaled = self.scaler.fit_transform(X_model)
# Use gradient boosting with calibration
base_model = GradientBoostingClassifier(
n_estimators=100,
max_depth=4,
learning_rate=0.05,
min_samples_leaf=20,
random_state=42
)
self.model = CalibratedClassifierCV(base_model, cv=5, method='sigmoid')
self.model.fit(X_scaled, y)
return self
def predict_upset_probability(self, X: pd.DataFrame) -> np.ndarray:
"""
Predict probability that an upset will occur.
Returns:
--------
np.ndarray : Upset probabilities
"""
X_eng = self.engineer_upset_features(X)
X_model = X_eng[self.feature_columns].fillna(0)
X_scaled = self.scaler.transform(X_model)
return self.model.predict_proba(X_scaled)[:, 1]
def identify_high_risk_games(self, X: pd.DataFrame,
threshold: float = 0.40) -> pd.DataFrame:
"""
Identify games with elevated upset risk.
Parameters:
-----------
X : pd.DataFrame
Game features
threshold : float
Upset probability threshold for "high risk"
Returns:
--------
pd.DataFrame : High-risk games with risk scores
"""
upset_probs = self.predict_upset_probability(X)
high_risk = X[upset_probs >= threshold].copy()
high_risk['upset_probability'] = upset_probs[upset_probs >= threshold]
# Calculate risk-adjusted confidence
high_risk['adjusted_confidence'] = (
high_risk['predicted_prob'].apply(lambda p: abs(p - 0.5)) *
(1 - high_risk['upset_probability'])
)
return high_risk.sort_values('upset_probability', ascending=False)
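One way to wire the model into a workflow is sketched below; `train_games` and `upcoming_games` are assumed DataFrames containing the prediction columns referenced above (predicted_prob, predicted_spread, and, for the training set, home_win) plus team names:

# Build the binary upset label from the framework above, then fit.
y_upset = train_games.apply(
    lambda row: int(UpsetDefinition.is_upset(row['predicted_prob'], row['home_win'])),
    axis=1
)

upset_model = UpsetPredictionModel().fit(train_games, y_upset)

# Score the upcoming slate and surface the riskiest matchups.
risky = upset_model.identify_high_risk_games(upcoming_games, threshold=0.40)
print(risky[['home_team', 'away_team', 'upset_probability', 'adjusted_confidence']].head())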
Confidence Adjustment System
class ConfidenceAdjuster:
"""
Adjust prediction confidence based on upset risk analysis.
"""
def __init__(self, upset_model: UpsetPredictionModel):
self.upset_model = upset_model
self.adjustment_params = {
'max_adjustment': 0.15, # Maximum confidence reduction
'upset_threshold': 0.25 # When to start adjusting
}
def adjust_probability(self, original_prob: float,
upset_prob: float) -> float:
"""
Adjust win probability based on upset risk.
When upset probability is elevated, pull prediction toward 0.5.
Parameters:
-----------
original_prob : float
Original model win probability
upset_prob : float
Estimated upset probability
Returns:
--------
float : Adjusted win probability
"""
# Only adjust if upset prob exceeds threshold
if upset_prob < self.adjustment_params['upset_threshold']:
return original_prob
# Calculate adjustment magnitude
adjustment_factor = min(
(upset_prob - self.adjustment_params['upset_threshold']) /
(1 - self.adjustment_params['upset_threshold']),
1.0
) * self.adjustment_params['max_adjustment']
# Pull toward 0.5
adjusted = original_prob + adjustment_factor * (0.5 - original_prob)
return adjusted
def create_adjusted_predictions(self, games: pd.DataFrame) -> pd.DataFrame:
"""
Create predictions with confidence adjustments.
Returns:
--------
pd.DataFrame : Predictions with original and adjusted probabilities
"""
results = games.copy()
# Get upset probabilities
upset_probs = self.upset_model.predict_upset_probability(games)
results['upset_probability'] = upset_probs
# Adjust probabilities
results['adjusted_prob'] = results.apply(
lambda row: self.adjust_probability(
row['predicted_prob'],
row['upset_probability']
),
axis=1
)
# Calculate confidence metrics
results['original_confidence'] = abs(results['predicted_prob'] - 0.5) * 2
results['adjusted_confidence'] = abs(results['adjusted_prob'] - 0.5) * 2
results['confidence_reduction'] = (
results['original_confidence'] - results['adjusted_confidence']
)
# Flag high-risk games
results['upset_alert'] = results['upset_probability'] > 0.35
return results
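A worked example with hypothetical numbers makes the adjustment concrete: a 78% favorite with a 45% estimated upset risk is pulled slightly back toward a coin flip.

adjuster = ConfidenceAdjuster(upset_model)  # upset_model fit as in the earlier sketch

# adjustment_factor = min((0.45 - 0.25) / (1 - 0.25), 1.0) * 0.15  ->  ~0.04
# adjusted          = 0.78 + 0.04 * (0.5 - 0.78)                   ->  ~0.769
print(round(adjuster.adjust_probability(original_prob=0.78, upset_prob=0.45), 3))  # 0.769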
Evaluation and Results
Model Performance
def evaluate_upset_prediction(model: UpsetPredictionModel,
test_data: pd.DataFrame,
actual_upsets: pd.Series) -> Dict:
"""
Evaluate upset prediction model.
Parameters:
-----------
model : UpsetPredictionModel
Trained model
test_data : pd.DataFrame
Test features
actual_upsets : pd.Series
Binary indicator of actual upsets
Returns:
--------
Dict : Evaluation metrics
"""
upset_probs = model.predict_upset_probability(test_data)
upset_preds = (upset_probs > 0.35).astype(int)
# Standard metrics
from sklearn.metrics import (
accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score
)
results = {
'accuracy': accuracy_score(actual_upsets, upset_preds),
'precision': precision_score(actual_upsets, upset_preds),
'recall': recall_score(actual_upsets, upset_preds),
'f1': f1_score(actual_upsets, upset_preds),
'auc_roc': roc_auc_score(actual_upsets, upset_probs)
}
    # Upset-specific metrics
    # Capture rate: what fraction of actual upsets did we flag as high-risk?
    total_upsets = actual_upsets.sum()
    if total_upsets > 0:
        flagged = actual_upsets[upset_preds == 1].sum()
        results['upset_capture_rate'] = flagged / total_upsets
    else:
        results['upset_capture_rate'] = 0
# Value of alert system
high_risk_games = test_data[upset_probs > 0.35]
if len(high_risk_games) > 0:
high_risk_upset_rate = actual_upsets[upset_probs > 0.35].mean()
overall_upset_rate = actual_upsets.mean()
results['alert_lift'] = high_risk_upset_rate / overall_upset_rate
return results
Sample Results
UPSET PREDICTION MODEL RESULTS
============================================================
Test Set: 429 games (2023 season)
Actual Upsets: 137 (31.9% of games with 55%+ favorite)
UPSET PREDICTION METRICS:
AUC-ROC: 0.687
Precision: 0.412 (of games flagged as high-risk, 41.2% were upsets)
Recall: 0.485 (captured 48.5% of actual upsets)
F1 Score: 0.446
ALERT SYSTEM VALUE:
Games flagged as high-risk: 162
Actual upset rate in flagged games: 41.2%
Baseline upset rate: 31.9%
Alert lift: 1.29x (29% more upsets than baseline)
CONFIDENCE ADJUSTMENT IMPACT:
Original model accuracy: 68.3%
Adjusted model accuracy: 67.1%
Brier score improvement: 0.004 (better calibration)
High-risk game accuracy (original): 54.3%
High-risk game accuracy (adjusted): 61.7%
MOST PREDICTIVE UPSET FACTORS:
1. Underdog rest advantage: +12.3%
2. Conference game: +8.7%
3. Underdog playing at home: +7.2%
4. Late season (week 10+): +5.1%
5. Recent form convergence: +4.8%
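The calibration claim can be checked directly with the Brier score (mean squared error between predicted probability and outcome). A sketch, assuming `results` is the frame returned by create_adjusted_predictions for historical games with the actual home_win column available:

from sklearn.metrics import brier_score_loss

# Lower is better: the adjusted probabilities should score slightly lower
# even if raw pick accuracy barely moves.
brier_original = brier_score_loss(results['home_win'], results['predicted_prob'])
brier_adjusted = brier_score_loss(results['home_win'], results['adjusted_prob'])
print(f"Brier (original): {brier_original:.4f}  Brier (adjusted): {brier_adjusted:.4f}")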
Production Implementation
Upset Alert System
@dataclass
class UpsetAlert:
"""Alert for games with elevated upset risk."""
game_id: str
home_team: str
away_team: str
favorite: str
original_probability: float
upset_probability: float
adjusted_probability: float
risk_factors: List[str]
alert_level: str # 'watch', 'elevated', 'high'
class UpsetAlertSystem:
"""
Production system for generating upset alerts.
"""
def __init__(self, upset_model: UpsetPredictionModel,
confidence_adjuster: ConfidenceAdjuster):
self.upset_model = upset_model
self.adjuster = confidence_adjuster
def generate_weekly_alerts(self, games: pd.DataFrame) -> List[UpsetAlert]:
"""
Generate upset alerts for a week of games.
Parameters:
-----------
games : pd.DataFrame
Week's games with predictions
Returns:
--------
List[UpsetAlert] : Sorted list of alerts
"""
alerts = []
upset_probs = self.upset_model.predict_upset_probability(games)
        # Iterate positionally so upset_probs stays aligned even if the
        # DataFrame index is not a default 0..n-1 range.
        for position, (idx, row) in enumerate(games.iterrows()):
            upset_prob = float(np.asarray(upset_probs)[position])
# Skip low-risk games
if upset_prob < 0.25:
continue
# Identify risk factors
risk_factors = self._identify_risk_factors(row, upset_prob)
# Determine alert level
if upset_prob >= 0.45:
alert_level = 'high'
elif upset_prob >= 0.35:
alert_level = 'elevated'
else:
alert_level = 'watch'
# Create alert
favorite = row['home_team'] if row['predicted_prob'] > 0.5 else row['away_team']
adjusted = self.adjuster.adjust_probability(row['predicted_prob'], upset_prob)
alert = UpsetAlert(
game_id=f"{row.get('game_id', idx)}",
home_team=row['home_team'],
away_team=row['away_team'],
favorite=favorite,
original_probability=max(row['predicted_prob'], 1-row['predicted_prob']),
upset_probability=upset_prob,
adjusted_probability=max(adjusted, 1-adjusted),
risk_factors=risk_factors,
alert_level=alert_level
)
alerts.append(alert)
# Sort by upset probability
return sorted(alerts, key=lambda x: x.upset_probability, reverse=True)
def _identify_risk_factors(self, game: pd.Series,
upset_prob: float) -> List[str]:
"""Identify specific risk factors for a game."""
factors = []
# Check various risk indicators
if game.get('underdog_rest_advantage', 0) > 3:
factors.append("Underdog rest advantage")
if game.get('underdog_home', 0) == 1:
factors.append("Underdog at home")
if game.get('week', 0) >= 10:
factors.append("Late season game")
if abs(game.get('recent_form_diff', 0)) < 2:
factors.append("Recent form convergence")
if game.get('rivalry_game', False):
factors.append("Rivalry game")
if game.get('turnover_volatility', 0) > 1.5:
factors.append("High turnover variance")
return factors[:4] # Return top 4 factors
def format_alerts_for_display(self, alerts: List[UpsetAlert]) -> str:
"""Format alerts for subscriber display."""
output = []
output.append("=" * 60)
output.append("UPSET ALERT REPORT")
output.append("=" * 60)
for level in ['high', 'elevated', 'watch']:
level_alerts = [a for a in alerts if a.alert_level == level]
if level_alerts:
output.append(f"\n{level.upper()} RISK ({len(level_alerts)} games):")
output.append("-" * 40)
for alert in level_alerts:
output.append(f"\n{alert.away_team} @ {alert.home_team}")
output.append(f" Favorite: {alert.favorite} ({alert.original_probability:.0%})")
output.append(f" Upset Risk: {alert.upset_probability:.0%}")
output.append(f" Adjusted: {alert.adjusted_probability:.0%}")
output.append(f" Risk Factors: {', '.join(alert.risk_factors)}")
return "\n".join(output)
Key Findings
- Upset patterns are partially predictable: ~48% of upsets can be flagged in advance
- Key risk factors: Rest advantage, home underdog, late season, form convergence
- Calibration value: Adjusted probabilities are better calibrated for high-risk games
- Alert system ROI: Flagged games have 29% higher upset rate than baseline
Recommendations
- Display adjusted confidence for games with elevated upset risk
- Include risk factor explanations in predictions
- Weight predictions differently when upset risk is high
- Track upset alert accuracy to refine model over time
- Consider upset risk in ensemble weighting
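One simple way to act on the last recommendation is to shrink a model's ensemble weight as estimated upset risk rises; a minimal sketch with hypothetical weights:

def blend_probabilities(primary_prob: float, secondary_prob: float, upset_prob: float,
                        base_primary_weight: float = 0.7) -> float:
    """Blend two win probabilities, trusting the primary model less as upset risk rises."""
    primary_weight = base_primary_weight * (1 - upset_prob)
    secondary_weight = 1 - primary_weight
    return primary_weight * primary_prob + secondary_weight * secondary_prob

print(blend_probabilities(primary_prob=0.78, secondary_prob=0.70, upset_prob=0.45))  # ~0.731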