Case Study: Building a QB Comparison Tool
"The best code is code you don't have to debug because it's clear enough to understand at a glance."
Executive Summary
This case study applies Chapter 3 programming concepts to build a reusable quarterback comparison tool. You'll practice function design, data manipulation, and code organization while creating something genuinely useful for football analysis.
Skills Applied:
- Function design with type hints and docstrings
- Pandas aggregation and transformation
- Method chaining
- Class-based tool design
- Error handling
The Problem
You need a tool that can:
1. Compare any set of quarterbacks on key metrics
2. Rank quarterbacks across multiple dimensions
3. Handle different time periods (season, week ranges)
4. Produce clean, readable output
Implementation
Part 1: Core Functions
"""
QB Comparison Tool - Core Functions
"""
import pandas as pd
import numpy as np
from typing import List, Optional, Dict, Union
import nfl_data_py as nfl
def load_passing_data(
    seasons: List[int],
    min_dropbacks: int = 100
) -> pd.DataFrame:
    """
    Build a per-quarterback, per-season passing table from play-by-play data.

    Parameters
    ----------
    seasons : List[int]
        Seasons handed to ``nfl.import_pbp_data``
    min_dropbacks : int
        Player-seasons with fewer dropbacks than this are excluded

    Returns
    -------
    pd.DataFrame
        One row per (season, passer id, passer name) holding summed counting
        stats, mean EPA / CPOE / success, and derived rate columns
    """
    plays = nfl.import_pbp_data(seasons)

    # Keep only rows flagged pass == 1; each surviving row counts as a dropback
    dropback_plays = plays.loc[plays['pass'] == 1].copy()

    # Output column -> (source column, aggregation function)
    aggregations = {
        'dropbacks': ('pass', 'count'),
        'completions': ('complete_pass', 'sum'),
        'attempts': ('pass_attempt', 'sum'),
        'yards': ('yards_gained', 'sum'),
        'air_yards': ('air_yards', 'sum'),
        'yac': ('yards_after_catch', 'sum'),
        'tds': ('pass_touchdown', 'sum'),
        'ints': ('interception', 'sum'),
        'sacks': ('sack', 'sum'),
        'epa_total': ('epa', 'sum'),
        'epa_mean': ('epa', 'mean'),
        'cpoe_mean': ('cpoe', 'mean'),
        'success_rate': ('success', 'mean'),
    }

    by_passer = dropback_plays.groupby(
        ['season', 'passer_player_id', 'passer_player_name']
    )
    totals = by_passer.agg(**aggregations).reset_index()

    # Volume filter is applied after reset_index, so surviving rows keep the
    # positional labels they had in the unfiltered table
    qualified = totals[totals['dropbacks'] >= min_dropbacks]

    # Rates per attempt, except sack rate which is per dropback
    attempts = qualified['attempts']
    return qualified.assign(
        comp_pct=qualified['completions'] / attempts,
        yards_per_att=qualified['yards'] / attempts,
        td_rate=qualified['tds'] / attempts,
        int_rate=qualified['ints'] / attempts,
        sack_rate=qualified['sacks'] / qualified['dropbacks'],
    )
def rank_quarterbacks(
    qb_stats: pd.DataFrame,
    metrics: List[str],
    weights: Optional[Dict[str, float]] = None
) -> pd.DataFrame:
    """
    Rank quarterbacks across multiple metrics.

    Rank 1 is always the best value: the largest for most metrics, the
    smallest for ``int_rate`` and ``sack_rate``. Ties receive the average
    rank (the pandas default).

    Parameters
    ----------
    qb_stats : pd.DataFrame
        QB statistics; not modified
    metrics : List[str]
        Columns to rank on; each gets a ``<metric>_rank`` column
    weights : Dict[str, float], optional
        Weight for each metric in the composite score. A weighted metric
        that is not listed in ``metrics`` is ranked automatically.

    Returns
    -------
    pd.DataFrame
        Copy of the input with rank columns added, plus ``composite_score``
        and ``overall_rank`` when weights are given (lower is better)

    Raises
    ------
    KeyError
        If a metric in ``metrics`` or ``weights`` is not a column
    """
    # Any metric not listed here is treated as higher-is-better
    lower_is_better = {'int_rate', 'sack_rate'}

    df = qb_stats.copy()

    def add_rank(metric: str) -> str:
        """Add the rank column for ``metric`` and return its name."""
        rank_col = f'{metric}_rank'
        df[rank_col] = df[metric].rank(ascending=metric in lower_is_better)
        return rank_col

    for metric in metrics:
        add_rank(metric)

    # Composite score if weights provided
    if weights:
        composite = pd.Series(0.0, index=df.index)
        for metric, weight in weights.items():
            rank_col = f'{metric}_rank'
            # A weighted metric may not have been requested in `metrics`;
            # rank it now instead of failing on the missing column
            if rank_col not in df.columns:
                add_rank(metric)
            composite = composite + df[rank_col] * weight
        df['composite_score'] = composite
        df['overall_rank'] = composite.rank()

    return df
def compare_qbs(
    qb_stats: pd.DataFrame,
    player_names: List[str],
    metrics: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Compare specific quarterbacks side by side.

    Parameters
    ----------
    qb_stats : pd.DataFrame
        QB statistics with ``season`` and ``passer_player_name`` columns
    player_names : List[str]
        Names to compare. Matching is a case-insensitive literal substring
        test, so ``'mahomes'`` matches ``'P.Mahomes'``.
    metrics : List[str], optional
        Metrics to show (default: standard set)

    Returns
    -------
    pd.DataFrame
        One row per matching QB-season with ``season``,
        ``passer_player_name`` and the requested metrics. Sorted by
        ``epa_mean`` (best first) when that column is among the metrics;
        otherwise rows keep their order from ``qb_stats``.

    Raises
    ------
    KeyError
        If a requested metric is not a column of ``qb_stats``
    """
    if metrics is None:
        metrics = ['dropbacks', 'epa_mean', 'cpoe_mean', 'success_rate',
                   'comp_pct', 'yards_per_att', 'td_rate', 'int_rate']

    # Missing names become '' so they can never match a non-empty query
    # (and never raise, unlike a substring test against NaN)
    lowered = (
        qb_stats['passer_player_name'].fillna('').astype(str).str.lower()
    )

    # OR together one literal-substring mask per requested name
    mask = pd.Series(False, index=qb_stats.index)
    for name in player_names:
        hits = lowered.str.contains(name.lower(), regex=False)
        mask = mask | hits.astype(bool)

    columns = ['season', 'passer_player_name'] + list(metrics)
    comparison = qb_stats.loc[mask, columns].copy()

    # Only sort on EPA when the caller actually asked for it; sorting on an
    # absent column would raise KeyError for custom metric lists
    if 'epa_mean' in comparison.columns:
        comparison = comparison.sort_values('epa_mean', ascending=False)
    return comparison
Part 2: Class-Based Tool
class QBComparisonTool:
    """
    Tool for comparing NFL quarterbacks.

    Aggregated QB statistics and raw play-by-play are each loaded on first
    access and then cached on the instance.

    Example
    -------
    >>> tool = QBComparisonTool([2022, 2023])
    >>> tool.top_qbs(10, metric='epa_mean')
    >>> tool.compare(['Mahomes', 'Burrow', 'Allen'])
    >>> tool.situation_analysis('P.Mahomes', situation='third_down')
    """

    DEFAULT_METRICS = [
        'dropbacks', 'epa_mean', 'cpoe_mean', 'success_rate',
        'comp_pct', 'yards_per_att', 'td_rate', 'int_rate'
    ]

    # Metrics where a smaller value is the better outcome
    LOWER_IS_BETTER = ('int_rate', 'sack_rate')

    # Situation name -> row predicate over play-by-play (None keeps every row)
    _SITUATION_FILTERS = {
        'all': None,
        'third_down': lambda plays: plays['down'] == 3,
        'red_zone': lambda plays: plays['yardline_100'] <= 20,
        'two_minute': lambda plays: plays['half_seconds_remaining'] <= 120,
        'trailing': lambda plays: (
            plays['posteam_score'] < plays['defteam_score']
        ),
    }

    def __init__(
        self,
        seasons: List[int],
        min_dropbacks: int = 100
    ):
        """
        Store the configuration; no data is loaded until first use.

        Parameters
        ----------
        seasons : List[int]
            Seasons to analyse
        min_dropbacks : int
            Minimum dropbacks for a QB-season to appear in ``qb_stats``
        """
        # Copy so later mutation of the caller's list cannot desynchronise
        # the configuration from the cached data
        self.seasons = list(seasons)
        self.min_dropbacks = min_dropbacks
        self._qb_stats = None
        self._pbp = None

    @property
    def qb_stats(self) -> pd.DataFrame:
        """Aggregated QB statistics, loaded on first access and cached."""
        if self._qb_stats is None:
            self._qb_stats = load_passing_data(
                self.seasons, self.min_dropbacks
            )
        return self._qb_stats

    @property
    def pbp(self) -> pd.DataFrame:
        """Raw play-by-play, loaded on first access and cached."""
        if self._pbp is None:
            self._pbp = nfl.import_pbp_data(self.seasons)
        return self._pbp

    @staticmethod
    def _name_mask(names: pd.Series, name: str) -> pd.Series:
        """Case-insensitive literal substring match; missing names never match."""
        return names.str.lower().str.contains(
            name.lower(), regex=False, na=False
        )

    def _percentiles_for(self, row: pd.Series) -> Dict[str, float]:
        """Share of ``qb_stats`` rows that ``row`` strictly beats, per metric."""
        df = self.qb_stats
        percentiles = {}
        for metric in self.DEFAULT_METRICS:
            if metric == 'dropbacks':
                # Volume, not quality: no percentile reported
                continue
            # Strict comparison in both directions, so the player's own row
            # and exact ties are never counted as beaten
            if metric in self.LOWER_IS_BETTER:
                beaten = df[metric] > row[metric]
            else:
                beaten = df[metric] < row[metric]
            percentiles[metric] = beaten.mean()
        return percentiles

    def top_qbs(
        self,
        n: int = 10,
        metric: str = 'epa_mean',
        season: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Get top N quarterbacks by specified metric.

        Parameters
        ----------
        n : int
            Number of QBs to return
        metric : str
            Metric to rank by; ``int_rate`` and ``sack_rate`` are sorted
            smallest-first, everything else largest-first
        season : int, optional
            Filter to specific season

        Returns
        -------
        pd.DataFrame
            Top QB-seasons with ``season``, ``passer_player_name`` and the
            default metric columns

        Raises
        ------
        KeyError
            If ``metric`` is not a column of the QB statistics
        """
        df = self.qb_stats
        if season is not None:
            df = df[df['season'] == season]

        ascending = metric in self.LOWER_IS_BETTER
        return (
            df
            .sort_values(metric, ascending=ascending)
            .head(n)
            [['season', 'passer_player_name'] + self.DEFAULT_METRICS]
        )

    def compare(
        self,
        names: List[str],
        metrics: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Compare specific quarterbacks.

        Parameters
        ----------
        names : List[str]
            QB names (partial match)
        metrics : List[str], optional
            Metrics to include (default: ``DEFAULT_METRICS``)

        Returns
        -------
        pd.DataFrame
            Side-by-side comparison, as returned by ``compare_qbs``
        """
        return compare_qbs(
            self.qb_stats,
            names,
            metrics or self.DEFAULT_METRICS
        )

    def percentile_rank(self, name: str) -> pd.DataFrame:
        """
        Get percentile rankings for a QB.

        Each percentile is the share of rows in ``qb_stats`` (all loaded
        seasons) that the player strictly beats on that metric. When several
        rows match ``name``, the first matching row in table order is used.

        Parameters
        ----------
        name : str
            QB name (case-insensitive literal partial match)

        Returns
        -------
        pd.DataFrame
            Single row indexed by the player's name, one column per metric

        Raises
        ------
        ValueError
            If no QB matches ``name``
        """
        df = self.qb_stats

        # Find the player
        mask = self._name_mask(df['passer_player_name'], name)
        if not mask.any():
            raise ValueError(f"No QB found matching '{name}'")
        player_row = df[mask].iloc[0]

        return pd.DataFrame(
            [self._percentiles_for(player_row)],
            index=[player_row['passer_player_name']]
        )

    def situation_analysis(
        self,
        name: str,
        situation: str = 'all'
    ) -> pd.DataFrame:
        """
        Analyze QB performance in specific situations.

        Parameters
        ----------
        name : str
            QB name (case-insensitive literal partial match)
        situation : str
            'all', 'third_down', 'red_zone', 'two_minute', 'trailing'

        Returns
        -------
        pd.DataFrame
            One row per season with play count, mean EPA, mean CPOE,
            success rate and completion rate

        Raises
        ------
        ValueError
            If ``situation`` is not one of the supported names
        """
        # Validate before touching play-by-play so a typo fails fast instead
        # of silently returning all-situation numbers
        if situation not in self._SITUATION_FILTERS:
            valid = ", ".join(repr(key) for key in self._SITUATION_FILTERS)
            raise ValueError(
                f"Unknown situation '{situation}'; expected one of {valid}"
            )

        pbp = self.pbp
        passes = pbp[
            (pbp['pass'] == 1) &
            self._name_mask(pbp['passer_player_name'], name)
        ].copy()

        predicate = self._SITUATION_FILTERS[situation]
        if predicate is not None:
            passes = passes[predicate(passes)]

        return (
            passes
            .groupby('season')
            .agg(
                plays=('play_id', 'count'),
                epa=('epa', 'mean'),
                cpoe=('cpoe', 'mean'),
                success_rate=('success', 'mean'),
                comp_pct=('complete_pass', 'mean')
            )
            .reset_index()
        )

    def summary(self, name: str) -> str:
        """
        Generate text summary for a QB.

        The summarised row is the first row returned by ``compare`` for
        ``name``; the percentiles are computed for that same row.

        Parameters
        ----------
        name : str
            QB name (partial match)

        Returns
        -------
        str
            Formatted summary text, or a "No data found" message when
            nothing matches
        """
        comparison = self.compare([name])

        if comparison.empty:
            return f"No data found for {name}"

        row = comparison.iloc[0]
        # Use the row that is being printed, so stats and percentiles always
        # describe the same QB-season
        percentiles = self._percentiles_for(row)

        lines = [
            f"QB Summary: {row['passer_player_name']} ({int(row['season'])})",
            "=" * 50,
            f"Dropbacks: {int(row['dropbacks'])}",
            f"EPA/Play: {row['epa_mean']:.3f} ({percentiles['epa_mean']:.0%}tile)",
            f"CPOE: {row['cpoe_mean']:.1f}% ({percentiles['cpoe_mean']:.0%}tile)",
            f"Success Rate: {row['success_rate']:.1%}",
            f"Completion %: {row['comp_pct']:.1%}",
            f"Y/A: {row['yards_per_att']:.1f}",
            f"TD Rate: {row['td_rate']:.1%} | INT Rate: {row['int_rate']:.1%}",
        ]

        return "\n".join(lines)
Part 3: Usage Examples
# Build the tool for the 2023 season (data is fetched on first use)
tool = QBComparisonTool([2023])

# Leaderboard: ten best QBs by EPA per play
top_10 = tool.top_qbs(10, metric='epa_mean')
print("Top 10 QBs by EPA/Play (2023):", top_10, sep="\n")

# Head-to-head table for a hand-picked group
comparison = tool.compare(['Mahomes', 'Burrow', 'Allen', 'Hurts'])
print("\nQB Comparison:", comparison, sep="\n")

# Where one QB sits relative to the rest of the table
print("\nPatrick Mahomes Percentile Rankings:")
mahomes_percentiles = tool.percentile_rank('Mahomes')
print(mahomes_percentiles)

# Split by game situation
print("\nMahomes on Third Down:")
mahomes_third_down = tool.situation_analysis('Mahomes', situation='third_down')
print(mahomes_third_down)

# One-page text report
mahomes_summary = tool.summary('Mahomes')
print("\n" + mahomes_summary)
Discussion Questions
- Why do we use lazy loading (the `@property` pattern) for data?
- What are the benefits of the class-based approach vs. standalone functions?
- How would you extend this tool to compare across different seasons?
- What error handling should be added for production use?
Extensions
Option A: Add visualization methods (bar charts, radar plots)
Option B: Add weekly trend analysis
Option C: Add opponent-adjusted metrics
Option D: Export results to formatted reports (HTML, PDF)