Case Study: Building a Team Performance Dashboard
"The goal is to turn data into information, and information into insight." — Carly Fiorina
Executive Summary
In this case study, you'll build a comprehensive team performance analysis system using pandas and NumPy. Starting with raw game data, you'll create functions that calculate standings, compare teams, and identify trends—the foundation of any football analytics dashboard.
Skills Applied: - DataFrame manipulation and aggregation - Complex groupby operations - Function design for reusable analysis - Performance optimization
Background
The Scenario
You're a data analyst for a college football media company. Your editor needs a weekly report showing:
- Conference standings
- Team performance trends
- Head-to-head comparisons
- Statistical leaders
You need to build a reusable system that processes game data and produces these insights automatically.
The Data
We'll work with simulated 2023 SEC season data including: - All regular season games - Scores and basic statistics - Home/away designations
Phase 1: Data Preparation
Creating the Dataset
First, let's create a realistic season dataset:
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
# Set seed for reproducibility
np.random.seed(2023)
def create_sec_schedule() -> pd.DataFrame:
"""
Create a simulated SEC season schedule with results.
Returns
-------
pd.DataFrame
Complete season of SEC games
"""
# SEC teams
sec_east = ["Georgia", "Florida", "Tennessee", "Kentucky",
"South Carolina", "Missouri", "Vanderbilt"]
sec_west = ["Alabama", "LSU", "Ole Miss", "Texas A&M",
"Auburn", "Mississippi State", "Arkansas"]
all_teams = sec_east + sec_west
games = []
game_id = 1
# Conference games (8 per team)
for week in range(1, 13): # 12 weeks
# Generate matchups for this week
week_games = []
teams_scheduled = set()
# Prioritize division games
for team in all_teams:
if team in teams_scheduled:
continue
# Find opponent
division = sec_east if team in sec_east else sec_west
cross_div = sec_west if team in sec_east else sec_east
# Mix of division and cross-division games
if week % 3 == 0:
possible_opponents = [t for t in cross_div if t not in teams_scheduled]
else:
possible_opponents = [t for t in division if t not in teams_scheduled and t != team]
if not possible_opponents:
possible_opponents = [t for t in all_teams if t not in teams_scheduled and t != team]
if possible_opponents:
opponent = np.random.choice(possible_opponents)
teams_scheduled.add(team)
teams_scheduled.add(opponent)
# Determine home team
home_team = team if np.random.random() > 0.5 else opponent
away_team = opponent if home_team == team else team
# Generate realistic score
# Better teams tend to score more
team_strength = get_team_strength(home_team)
opp_strength = get_team_strength(away_team)
home_advantage = 3
expected_home = 28 + team_strength + home_advantage
expected_away = 28 + opp_strength
home_score = max(0, int(np.random.normal(expected_home, 10)))
away_score = max(0, int(np.random.normal(expected_away, 10)))
games.append({
"game_id": game_id,
"week": week,
"home_team": home_team,
"away_team": away_team,
"home_score": home_score,
"away_score": away_score,
"home_rush_yards": int(np.random.normal(150 + team_strength * 5, 40)),
"home_pass_yards": int(np.random.normal(250 + team_strength * 8, 60)),
"away_rush_yards": int(np.random.normal(150 + opp_strength * 5, 40)),
"away_pass_yards": int(np.random.normal(250 + opp_strength * 8, 60)),
"attendance": int(np.random.normal(85000, 15000))
})
game_id += 1
return pd.DataFrame(games)
def get_team_strength(team: str) -> int:
"""
Return team strength modifier (-5 to +10).
Based on recent historical performance.
"""
strengths = {
"Georgia": 10, "Alabama": 9, "Tennessee": 6, "LSU": 6,
"Ole Miss": 5, "Texas A&M": 3, "Kentucky": 2, "Florida": 2,
"Auburn": 1, "Missouri": 1, "South Carolina": 0,
"Mississippi State": -1, "Arkansas": -2, "Vanderbilt": -5
}
return strengths.get(team, 0)
# Create the dataset
games_df = create_sec_schedule()
print(f"Created {len(games_df)} games")
print(games_df.head(10))
Data Validation
Before analysis, validate the data:
def validate_game_data(df: pd.DataFrame) -> Dict[str, any]:
"""
Validate game data integrity.
Parameters
----------
df : pd.DataFrame
Games DataFrame
Returns
-------
Dict
Validation results
"""
results = {
"total_games": len(df),
"issues": []
}
# Check for missing values
missing = df.isnull().sum()
if missing.any():
results["issues"].append(f"Missing values: {missing[missing > 0].to_dict()}")
# Check for negative scores
neg_scores = ((df["home_score"] < 0) | (df["away_score"] < 0)).sum()
if neg_scores > 0:
results["issues"].append(f"{neg_scores} games with negative scores")
# Check for duplicate games
duplicates = df.duplicated(subset=["week", "home_team", "away_team"]).sum()
if duplicates > 0:
results["issues"].append(f"{duplicates} duplicate games")
# Check team plays same opponent multiple times
for team in df["home_team"].unique():
team_games = df[(df["home_team"] == team) | (df["away_team"] == team)]
opponents = []
for _, game in team_games.iterrows():
opp = game["away_team"] if game["home_team"] == team else game["home_team"]
opponents.append(opp)
from collections import Counter
counts = Counter(opponents)
excess = {k: v for k, v in counts.items() if v > 1}
if excess:
# This is actually normal for conferences, so just note it
pass
results["valid"] = len(results["issues"]) == 0
return results
validation = validate_game_data(games_df)
print(f"Validation: {'PASSED' if validation['valid'] else 'FAILED'}")
for issue in validation["issues"]:
print(f" - {issue}")
Phase 2: Building the Standings Calculator
Core Standings Function
def calculate_standings(games_df: pd.DataFrame) -> pd.DataFrame:
"""
Calculate conference standings from game results.
Parameters
----------
games_df : pd.DataFrame
Games with home_team, away_team, home_score, away_score
Returns
-------
pd.DataFrame
Standings with wins, losses, points for/against
"""
# Process home games
home = games_df.copy()
home["team"] = home["home_team"]
home["opponent"] = home["away_team"]
home["points_for"] = home["home_score"]
home["points_against"] = home["away_score"]
home["is_win"] = home["home_score"] > home["away_score"]
home["is_home"] = True
home["rush_yards"] = home["home_rush_yards"]
home["pass_yards"] = home["home_pass_yards"]
# Process away games
away = games_df.copy()
away["team"] = away["away_team"]
away["opponent"] = away["home_team"]
away["points_for"] = away["away_score"]
away["points_against"] = away["home_score"]
away["is_win"] = away["away_score"] > away["home_score"]
away["is_home"] = False
away["rush_yards"] = away["away_rush_yards"]
away["pass_yards"] = away["away_pass_yards"]
# Combine
all_games = pd.concat([
home[["team", "opponent", "points_for", "points_against",
"is_win", "is_home", "rush_yards", "pass_yards"]],
away[["team", "opponent", "points_for", "points_against",
"is_win", "is_home", "rush_yards", "pass_yards"]]
])
# Aggregate by team
standings = all_games.groupby("team").agg(
games=("is_win", "count"),
wins=("is_win", "sum"),
home_wins=("is_win", lambda x: x[all_games.loc[x.index, "is_home"]].sum()),
points_for=("points_for", "sum"),
points_against=("points_against", "sum"),
total_rush_yards=("rush_yards", "sum"),
total_pass_yards=("pass_yards", "sum")
).reset_index()
# Calculate derived metrics
standings["losses"] = standings["games"] - standings["wins"]
standings["away_wins"] = standings["wins"] - standings["home_wins"]
standings["win_pct"] = (standings["wins"] / standings["games"]).round(3)
standings["ppg"] = (standings["points_for"] / standings["games"]).round(1)
standings["ppg_allowed"] = (standings["points_against"] / standings["games"]).round(1)
standings["point_diff"] = standings["points_for"] - standings["points_against"]
standings["ypg"] = ((standings["total_rush_yards"] + standings["total_pass_yards"])
/ standings["games"]).round(1)
# Sort by win percentage, then point differential
standings = standings.sort_values(
["win_pct", "point_diff"],
ascending=[False, False]
).reset_index(drop=True)
# Add rank
standings["rank"] = range(1, len(standings) + 1)
return standings
# Calculate standings
standings = calculate_standings(games_df)
print("\nSEC STANDINGS")
print("=" * 80)
print(standings[["rank", "team", "wins", "losses", "win_pct", "ppg", "ppg_allowed", "point_diff"]])
Division Standings
def calculate_division_standings(
games_df: pd.DataFrame,
divisions: Dict[str, List[str]]
) -> Dict[str, pd.DataFrame]:
"""
Calculate standings by division.
Parameters
----------
games_df : pd.DataFrame
All games
divisions : Dict[str, List[str]]
Dictionary mapping division name to team list
Returns
-------
Dict[str, pd.DataFrame]
Standings for each division
"""
# Get overall standings
all_standings = calculate_standings(games_df)
division_standings = {}
for div_name, teams in divisions.items():
# Filter to division teams
div_standings = all_standings[all_standings["team"].isin(teams)].copy()
# Re-rank within division
div_standings = div_standings.sort_values(
["win_pct", "point_diff"],
ascending=[False, False]
).reset_index(drop=True)
div_standings["div_rank"] = range(1, len(div_standings) + 1)
division_standings[div_name] = div_standings
return division_standings
# Define divisions
sec_divisions = {
"SEC East": ["Georgia", "Florida", "Tennessee", "Kentucky",
"South Carolina", "Missouri", "Vanderbilt"],
"SEC West": ["Alabama", "LSU", "Ole Miss", "Texas A&M",
"Auburn", "Mississippi State", "Arkansas"]
}
div_standings = calculate_division_standings(games_df, sec_divisions)
for div_name, standings in div_standings.items():
print(f"\n{div_name.upper()}")
print("-" * 60)
print(standings[["div_rank", "team", "wins", "losses", "ppg", "ppg_allowed"]])
Phase 3: Team Performance Analysis
Team Profile Function
def create_team_profile(
games_df: pd.DataFrame,
team: str
) -> Dict[str, any]:
"""
Create comprehensive profile for a single team.
Parameters
----------
games_df : pd.DataFrame
All games
team : str
Team name
Returns
-------
Dict
Complete team profile
"""
# Get team's games
home_games = games_df[games_df["home_team"] == team].copy()
away_games = games_df[games_df["away_team"] == team].copy()
# Standardize
home_games["points_for"] = home_games["home_score"]
home_games["points_against"] = home_games["away_score"]
home_games["is_win"] = home_games["home_score"] > home_games["away_score"]
home_games["opponent"] = home_games["away_team"]
home_games["location"] = "Home"
away_games["points_for"] = away_games["away_score"]
away_games["points_against"] = away_games["home_score"]
away_games["is_win"] = away_games["away_score"] > away_games["home_score"]
away_games["opponent"] = away_games["home_team"]
away_games["location"] = "Away"
all_games = pd.concat([
home_games[["week", "opponent", "points_for", "points_against", "is_win", "location"]],
away_games[["week", "opponent", "points_for", "points_against", "is_win", "location"]]
]).sort_values("week")
# Calculate profile
profile = {
"team": team,
"games_played": len(all_games),
"record": {
"wins": all_games["is_win"].sum(),
"losses": len(all_games) - all_games["is_win"].sum(),
"home_record": f"{home_games['is_win'].sum()}-{len(home_games) - home_games['is_win'].sum()}",
"away_record": f"{away_games['is_win'].sum()}-{len(away_games) - away_games['is_win'].sum()}"
},
"scoring": {
"total_points_for": all_games["points_for"].sum(),
"total_points_against": all_games["points_against"].sum(),
"ppg": round(all_games["points_for"].mean(), 1),
"ppg_allowed": round(all_games["points_against"].mean(), 1),
"point_differential": all_games["points_for"].sum() - all_games["points_against"].sum()
},
"margins": {
"average_margin": round((all_games["points_for"] - all_games["points_against"]).mean(), 1),
"biggest_win": (all_games["points_for"] - all_games["points_against"]).max(),
"worst_loss": (all_games["points_for"] - all_games["points_against"]).min()
},
"game_log": all_games.to_dict("records")
}
# Calculate streaks
profile["streaks"] = calculate_streaks(all_games)
return profile
def calculate_streaks(team_games: pd.DataFrame) -> Dict[str, int]:
"""Calculate win/loss streaks."""
games = team_games.sort_values("week")
current_streak = 0
max_win_streak = 0
max_loss_streak = 0
temp_streak = 0
last_result = None
for _, game in games.iterrows():
if game["is_win"]:
if last_result == True:
temp_streak += 1
else:
temp_streak = 1
max_win_streak = max(max_win_streak, temp_streak)
else:
if last_result == False:
temp_streak += 1
else:
temp_streak = 1
max_loss_streak = max(max_loss_streak, temp_streak)
last_result = game["is_win"]
# Current streak
current = 0
last = None
for _, game in games.iloc[::-1].iterrows():
if last is None:
last = game["is_win"]
current = 1
elif game["is_win"] == last:
current += 1
else:
break
return {
"current_streak": current if last else -current,
"longest_win_streak": max_win_streak,
"longest_loss_streak": max_loss_streak
}
# Create profile for Georgia
georgia_profile = create_team_profile(games_df, "Georgia")
print("\nGEORGIA BULLDOGS PROFILE")
print("=" * 60)
print(f"Record: {georgia_profile['record']['wins']}-{georgia_profile['record']['losses']}")
print(f"Home: {georgia_profile['record']['home_record']}")
print(f"Away: {georgia_profile['record']['away_record']}")
print(f"\nScoring:")
print(f" PPG: {georgia_profile['scoring']['ppg']}")
print(f" PPG Allowed: {georgia_profile['scoring']['ppg_allowed']}")
print(f" Point Differential: {georgia_profile['scoring']['point_differential']}")
print(f"\nStreaks:")
print(f" Current: {'W' if georgia_profile['streaks']['current_streak'] > 0 else 'L'}{abs(georgia_profile['streaks']['current_streak'])}")
print(f" Longest Win Streak: {georgia_profile['streaks']['longest_win_streak']}")
Phase 4: Comparative Analysis
Head-to-Head Comparison
def compare_teams(
games_df: pd.DataFrame,
team1: str,
team2: str
) -> Dict[str, any]:
"""
Compare two teams head-to-head and statistically.
Parameters
----------
games_df : pd.DataFrame
All games
team1, team2 : str
Teams to compare
Returns
-------
Dict
Comparison results
"""
# Get profiles
profile1 = create_team_profile(games_df, team1)
profile2 = create_team_profile(games_df, team2)
# Find head-to-head games
h2h_games = games_df[
((games_df["home_team"] == team1) & (games_df["away_team"] == team2)) |
((games_df["home_team"] == team2) & (games_df["away_team"] == team1))
]
h2h_record = {"team1_wins": 0, "team2_wins": 0, "games": []}
for _, game in h2h_games.iterrows():
game_info = {
"week": game["week"],
"home_team": game["home_team"],
"away_team": game["away_team"],
"score": f"{game['home_score']}-{game['away_score']}"
}
if game["home_team"] == team1:
if game["home_score"] > game["away_score"]:
h2h_record["team1_wins"] += 1
game_info["winner"] = team1
else:
h2h_record["team2_wins"] += 1
game_info["winner"] = team2
else:
if game["away_score"] > game["home_score"]:
h2h_record["team1_wins"] += 1
game_info["winner"] = team1
else:
h2h_record["team2_wins"] += 1
game_info["winner"] = team2
h2h_record["games"].append(game_info)
# Statistical comparison
comparison = {
"teams": [team1, team2],
"head_to_head": h2h_record,
"records": {
team1: f"{profile1['record']['wins']}-{profile1['record']['losses']}",
team2: f"{profile2['record']['wins']}-{profile2['record']['losses']}"
},
"statistics": {
"ppg": {team1: profile1['scoring']['ppg'], team2: profile2['scoring']['ppg']},
"ppg_allowed": {team1: profile1['scoring']['ppg_allowed'],
team2: profile2['scoring']['ppg_allowed']},
"point_diff": {team1: profile1['scoring']['point_differential'],
team2: profile2['scoring']['point_differential']},
"avg_margin": {team1: profile1['margins']['average_margin'],
team2: profile2['margins']['average_margin']}
},
"advantages": {}
}
# Determine advantages
for stat in comparison["statistics"]:
vals = comparison["statistics"][stat]
if stat == "ppg_allowed":
better = team1 if vals[team1] < vals[team2] else team2
else:
better = team1 if vals[team1] > vals[team2] else team2
comparison["advantages"][stat] = better
return comparison
# Compare Georgia vs Alabama
comparison = compare_teams(games_df, "Georgia", "Alabama")
print("\nGEORGIA vs ALABAMA COMPARISON")
print("=" * 60)
print(f"\nRecords:")
print(f" Georgia: {comparison['records']['Georgia']}")
print(f" Alabama: {comparison['records']['Alabama']}")
print(f"\nHead-to-Head: Georgia {comparison['head_to_head']['team1_wins']}, Alabama {comparison['head_to_head']['team2_wins']}")
print(f"\nStatistical Comparison:")
for stat, values in comparison["statistics"].items():
advantage = comparison["advantages"][stat]
print(f" {stat}: Georgia {values['Georgia']} | Alabama {values['Alabama']} -> {advantage}")
Phase 5: Weekly Report Generator
Putting It All Together
def generate_weekly_report(
games_df: pd.DataFrame,
week: int,
divisions: Dict[str, List[str]]
) -> str:
"""
Generate comprehensive weekly report.
Parameters
----------
games_df : pd.DataFrame
All games through current week
week : int
Week number
divisions : Dict[str, List[str]]
Division definitions
Returns
-------
str
Formatted report
"""
# Filter to games through this week
through_week = games_df[games_df["week"] <= week]
this_week = games_df[games_df["week"] == week]
report_lines = []
report_lines.append("=" * 70)
report_lines.append(f"SEC WEEKLY REPORT - WEEK {week}")
report_lines.append("=" * 70)
# This week's results
report_lines.append(f"\nWEEK {week} RESULTS")
report_lines.append("-" * 40)
for _, game in this_week.iterrows():
winner = game["home_team"] if game["home_score"] > game["away_score"] else game["away_team"]
margin = abs(game["home_score"] - game["away_score"])
report_lines.append(
f" {game['away_team']} {game['away_score']} @ "
f"{game['home_team']} {game['home_score']} "
f"({winner} wins by {margin})"
)
# Division standings
div_standings = calculate_division_standings(through_week, divisions)
for div_name, standings in div_standings.items():
report_lines.append(f"\n{div_name.upper()} STANDINGS")
report_lines.append("-" * 40)
report_lines.append(f"{'Rank':<5} {'Team':<20} {'W-L':<8} {'PPG':<8} {'PAPG':<8}")
for _, row in standings.iterrows():
record = f"{int(row['wins'])}-{int(row['losses'])}"
report_lines.append(
f"{int(row['div_rank']):<5} {row['team']:<20} {record:<8} "
f"{row['ppg']:<8} {row['ppg_allowed']:<8}"
)
# Statistical leaders
standings = calculate_standings(through_week)
report_lines.append("\nSTATISTICAL LEADERS")
report_lines.append("-" * 40)
# Scoring offense
top_offense = standings.nlargest(3, "ppg")
report_lines.append("Scoring Offense (PPG):")
for _, row in top_offense.iterrows():
report_lines.append(f" {row['team']}: {row['ppg']}")
# Scoring defense
top_defense = standings.nsmallest(3, "ppg_allowed")
report_lines.append("Scoring Defense (PPG Allowed):")
for _, row in top_defense.iterrows():
report_lines.append(f" {row['team']}: {row['ppg_allowed']}")
# Total offense
top_total = standings.nlargest(3, "ypg")
report_lines.append("Total Offense (YPG):")
for _, row in top_total.iterrows():
report_lines.append(f" {row['team']}: {row['ypg']}")
report_lines.append("\n" + "=" * 70)
return "\n".join(report_lines)
# Generate report for week 8
report = generate_weekly_report(games_df, 8, sec_divisions)
print(report)
Discussion Questions
-
How would you modify the standings calculation to handle ties (same win percentage)?
-
What additional metrics would make the team comparison more useful?
-
How would you handle conference vs. non-conference games differently?
-
What visualizations would complement this analysis?
-
How would you scale this system to handle multiple conferences?
Your Turn: Extensions
Option A: Add Trend Analysis
Modify the team profile to include week-over-week trends: - Is scoring improving or declining? - Rolling 3-game averages - Performance trajectory visualization
Option B: Power Rankings
Create a simple power ranking algorithm that considers: - Win percentage - Strength of schedule - Point differential - Recent performance (recency weighting)
Option C: Playoff Scenarios
Build a function that simulates remaining games and calculates probability of each team making the conference championship.
Key Takeaways
- Structured data processing: Breaking game data into team-centric views enables flexible analysis
- Reusable functions: Building modular functions allows combining analyses in different ways
- Aggregation patterns: GroupBy operations are fundamental to sports analytics
- Comparative analysis: Head-to-head and statistical comparisons provide context for rankings