Case Study 2: Season-Long Performance Tracker
Introduction
Tracking team and player performance across an entire season requires different approaches than single-match analysis. Data must be aggregated efficiently, trends identified, and comparisons made against historical baselines. This case study builds a comprehensive season tracking system using advanced pandas techniques.
The Scenario
A club's analytics department needs to:
- Track weekly xG performance vs actual goals
- Monitor player form over rolling windows
- Identify early warning signs of declining performance
- Compare current season to historical averages
- Generate automated weekly reports
Step 1: Data Architecture
Season Data Structure
"""
season_tracker/data_models.py
Define data structures for season tracking.
"""
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Optional
from datetime import date
@dataclass
class MatchResult:
    """Scoreline and expected-goals figures for one fixture.

    ``result`` and ``points`` are derived from the scoreline each time
    they are read; they are not stored as fields.
    """

    match_id: int
    date: date
    opponent: str
    venue: str  # 'home' or 'away'
    goals_for: int
    goals_against: int
    xg_for: float
    xg_against: float

    @property
    def result(self) -> str:
        """Return 'W', 'D' or 'L' from this team's point of view."""
        scored, conceded = self.goals_for, self.goals_against
        if scored == conceded:
            return 'D'
        return 'W' if scored > conceded else 'L'

    @property
    def points(self) -> int:
        """Return points earned: 3 for a win, 1 for a draw, 0 for a loss."""
        return {'W': 3, 'D': 1, 'L': 0}[self.result]
@dataclass
class PlayerMatchStats:
    """One player's statistical line for one match.

    ``match_id`` ties the record to the match it was recorded in.
    """

    # Identity
    player_id: str
    player_name: str
    match_id: int

    # Playing time
    minutes: int

    # Attacking output
    goals: int
    assists: int
    xg: float  # expected goals
    xa: float  # presumably expected assists -- TODO confirm with data provider
    shots: int

    # Passing
    passes: int
    pass_accuracy: float  # NOTE(review): looks like a percentage (0-100); confirm
class SeasonData:
    """Accumulate one team's match results and player stats for a season.

    Records are appended one at a time and exported to pandas DataFrames
    on demand. Exports always carry the full column schema, even when no
    records have been added yet, so downstream code can rely on the
    columns being present.
    """

    # Column order of the exported frames; also gives empty exports the
    # same schema as populated ones.
    _MATCH_COLUMNS = (
        'match_id', 'date', 'opponent', 'venue', 'goals_for',
        'goals_against', 'xg_for', 'xg_against', 'points', 'result',
    )
    _PLAYER_COLUMNS = (
        'player_id', 'player_name', 'match_id', 'minutes', 'goals',
        'assists', 'xg', 'xa', 'shots', 'passes', 'pass_accuracy',
    )

    def __init__(self, team_name: str, season: str):
        """
        Create an empty container.

        Parameters
        ----------
        team_name : str
            Name of the tracked team
        season : str
            Season label (the example usage passes "2023-24")
        """
        self.team_name = team_name
        self.season = season
        self.matches: List[MatchResult] = []
        self.player_stats: List[PlayerMatchStats] = []

    def add_match(self, match: MatchResult):
        """Append one match result."""
        self.matches.append(match)

    def add_player_stats(self, stats: PlayerMatchStats):
        """Append one player's stats for one match."""
        self.player_stats.append(stats)

    def to_matches_df(self) -> pd.DataFrame:
        """
        Convert matches to a DataFrame sorted by date.

        Returns
        -------
        pd.DataFrame
            One row per match, including the derived ``points`` and
            ``result``. ``date`` is converted to a pandas datetime. With
            no matches the frame is empty but still has every column.
        """
        columns = list(self._MATCH_COLUMNS)
        rows = [
            {col: getattr(match, col) for col in columns}
            for match in self.matches
        ]
        # Passing `columns` keeps the schema even when `rows` is empty, so
        # the date conversion and sort below work in both cases.
        df = pd.DataFrame(rows, columns=columns)
        df['date'] = pd.to_datetime(df['date'])
        return df.sort_values('date').reset_index(drop=True)

    def to_players_df(self) -> pd.DataFrame:
        """
        Convert player stats to a DataFrame.

        Returns
        -------
        pd.DataFrame
            One row per player per match, in insertion order. With no
            stats the frame is empty but still has the expected columns.
        """
        if not self.player_stats:
            return pd.DataFrame(columns=list(self._PLAYER_COLUMNS))
        return pd.DataFrame([vars(stats) for stats in self.player_stats])
Step 2: Performance Metrics Calculator
"""
season_tracker/metrics.py
Calculate season-level and rolling metrics.
"""
import pandas as pd
import numpy as np
from typing import Dict, Tuple
class SeasonMetrics:
    """Compute cumulative, rolling and summary metrics for a team's season."""

    def __init__(self, matches_df: pd.DataFrame):
        """
        Initialize with matches DataFrame.

        Expected columns: date, goals_for, goals_against, xg_for,
        xg_against, points, result

        The input is not modified: ``sort_values`` returns a new frame,
        and the derived columns are added to that private copy.
        """
        self.matches = matches_df.sort_values('date').reset_index(drop=True)
        self._add_cumulative_stats()

    def _add_cumulative_stats(self):
        """Add running totals and per-match difference columns in place."""
        df = self.matches

        # Cumulative totals
        for col in ('goals_for', 'goals_against', 'xg_for', 'xg_against', 'points'):
            df[f'cum_{col}'] = df[col].cumsum()

        # Match number (1-based, in date order)
        df['match_num'] = range(1, len(df) + 1)

        # Goal difference
        df['goal_diff'] = df['goals_for'] - df['goals_against']
        df['cum_goal_diff'] = df['cum_goals_for'] - df['cum_goals_against']

        # xG performance (positive goals_vs_xg means scoring more than expected)
        df['xg_diff'] = df['xg_for'] - df['xg_against']
        df['goals_vs_xg'] = df['goals_for'] - df['xg_for']
        df['cum_goals_vs_xg'] = df['cum_goals_for'] - df['cum_xg_for']

    def rolling_metrics(self, window: int = 5) -> pd.DataFrame:
        """
        Calculate rolling window metrics.

        Parameters
        ----------
        window : int
            Number of matches for rolling window

        Returns
        -------
        pd.DataFrame
            Copy of the match table with ``rolling_{window}_*`` columns
            added. Early rows use however many matches are available
            (``min_periods=1``), so no row is NaN.
        """
        df = self.matches.copy()

        def roll(series: pd.Series):
            return series.rolling(window, min_periods=1)

        # Rolling averages
        mean_sources = {
            'goals': 'goals_for',
            'xg': 'xg_for',
            'conceded': 'goals_against',
            'xga': 'xg_against',
        }
        for label, source in mean_sources.items():
            df[f'rolling_{window}_{label}'] = roll(df[source]).mean()

        # Rolling points (form)
        df[f'rolling_{window}_points'] = roll(df['points']).sum()

        # Rolling win rate: vectorised comparison instead of a per-row lambda
        is_win = (df['result'] == 'W').astype(int)
        df[f'rolling_{window}_win_rate'] = roll(is_win).mean()
        return df

    def current_position(self) -> Dict:
        """
        Calculate current season position metrics.

        Returns
        -------
        Dict
            Dictionary of current season statistics, or an empty dict
            when no matches have been played.
        """
        df = self.matches
        if df.empty:
            return {}

        n_matches = len(df)

        def last(column: str) -> float:
            return float(df[column].iloc[-1])

        outcomes = df['result']
        return {
            'matches_played': n_matches,
            'points': int(last('cum_points')),
            'ppg': round(last('cum_points') / n_matches, 2),
            'goals_for': int(last('cum_goals_for')),
            'goals_against': int(last('cum_goals_against')),
            'goal_difference': int(last('cum_goal_diff')),
            'xg_for': round(last('cum_xg_for'), 2),
            'xg_against': round(last('cum_xg_against'), 2),
            'xg_difference': round(last('cum_xg_for') - last('cum_xg_against'), 2),
            'goals_vs_xg': round(last('cum_goals_vs_xg'), 2),
            'wins': int((outcomes == 'W').sum()),
            'draws': int((outcomes == 'D').sum()),
            'losses': int((outcomes == 'L').sum()),
        }

    def form_assessment(self, window: int = 5) -> Dict:
        """
        Assess current form vs season average.

        Parameters
        ----------
        window : int
            Number of most recent matches that count as "current form"

        Returns
        -------
        Dict
            Form assessment with comparisons, or
            ``{'status': 'insufficient_data'}`` when fewer than
            ``window`` matches have been played.
        """
        if len(self.matches) < window:
            return {'status': 'insufficient_data'}

        df = self.rolling_metrics(window)

        # Average only the numeric columns that are needed. A bare
        # df.mean() also visits the string/datetime columns (opponent,
        # venue, result, date) and raises TypeError on pandas >= 2.0.
        season_avg = df[['goals_for', 'xg_for', 'goals_against']].mean()

        def latest(label: str) -> float:
            return float(df[f'rolling_{window}_{label}'].iloc[-1])

        avg_goals = float(season_avg['goals_for'])
        avg_xg = float(season_avg['xg_for'])
        avg_conceded = float(season_avg['goals_against'])

        # Compare rolling to season average
        goals_trend = latest('goals') - avg_goals
        xg_trend = latest('xg') - avg_xg
        # Conceding fewer than the season average counts as improvement.
        defensive_trend = avg_conceded - latest('conceded')

        return {
            'window': window,
            'rolling_goals_per_match': round(latest('goals'), 2),
            'season_avg_goals': round(avg_goals, 2),
            'goals_trend': round(goals_trend, 2),
            'goals_trend_direction': 'up' if goals_trend > 0 else 'down',
            'rolling_xg': round(latest('xg'), 2),
            'season_avg_xg': round(avg_xg, 2),
            'xg_trend': round(xg_trend, 2),
            'defensive_trend': round(defensive_trend, 2),
            'defensive_trend_direction': 'improving' if defensive_trend > 0 else 'declining',
            # round() first so float noise in the rolling sum cannot
            # truncate e.g. 9.999999 down to 9.
            'form_points': int(round(latest('points'))),
            'max_possible': window * 3,
        }
class PlayerSeasonMetrics:
    """Calculate player-level season metrics."""

    def __init__(self, player_df: pd.DataFrame, matches_df: pd.DataFrame):
        """
        Initialize with player and match data.

        Parameters
        ----------
        player_df : pd.DataFrame
            Player match statistics
        matches_df : pd.DataFrame
            Match information (for dates)
        """
        # A left join keeps every player row even when its match is
        # missing from matches_df (its date is then NaT).
        self.players = player_df.merge(
            matches_df[['match_id', 'date']],
            on='match_id',
            how='left'
        ).sort_values(['player_name', 'date'])

    def player_totals(self) -> pd.DataFrame:
        """
        Calculate season totals per player.

        Returns
        -------
        pd.DataFrame
            One row per (player_id, player_name) with summed counting
            stats and an ``appearances`` column (number of match rows).
        """
        aggregations = {
            col: 'sum'
            for col in ('minutes', 'goals', 'assists', 'xg', 'xa', 'shots', 'passes')
        }
        aggregations['match_id'] = 'count'
        grouped = self.players.groupby(['player_id', 'player_name'])
        totals = grouped.agg(aggregations)
        return totals.rename(columns={'match_id': 'appearances'}).reset_index()

    def player_per_90(self, min_minutes: int = 450) -> pd.DataFrame:
        """
        Calculate per-90 statistics.

        Parameters
        ----------
        min_minutes : int
            Minimum minutes for inclusion

        Returns
        -------
        pd.DataFrame
            Season totals for qualified players plus a ``*_per90``
            column for each counting stat, rounded to 2 decimals.
        """
        totals = self.player_totals()
        qualified = totals[totals['minutes'] >= min_minutes].copy()

        # With min_minutes <= 0 a player with zero minutes can qualify;
        # mask those so the rate is NaN rather than inf.
        minutes = qualified['minutes'].where(qualified['minutes'] > 0)

        # Calculate per 90
        for col in ['goals', 'assists', 'xg', 'xa', 'shots', 'passes']:
            qualified[f'{col}_per90'] = (qualified[col] / minutes * 90).round(2)
        return qualified

    def player_form(self, player_name: str, window: int = 5) -> pd.DataFrame:
        """
        Calculate rolling form for a specific player.

        Parameters
        ----------
        player_name : str
            Name of the player
        window : int
            Rolling window size

        Returns
        -------
        pd.DataFrame
            The player's match rows in date order with ``rolling_goals``,
            ``rolling_xg`` and ``rolling_minutes`` (rolling sums). The
            columns are always present: with fewer than ``window``
            appearances the sums cover the matches available.
        """
        player_data = self.players[
            self.players['player_name'] == player_name
        ].copy()

        # No early return for short histories: min_periods=1 already
        # handles them, and skipping the columns would hand callers a
        # frame with a different schema.
        for stat in ('goals', 'xg', 'minutes'):
            player_data[f'rolling_{stat}'] = player_data[stat].rolling(
                window, min_periods=1
            ).sum()
        return player_data
Step 3: Visualization Dashboard
"""
season_tracker/visualizations.py
Create season tracking visualizations.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Optional
def plot_xg_vs_goals_progression(
    matches_df: pd.DataFrame,
    team_name: str = "Team"
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Chart cumulative actual goals against cumulative xG by match number.

    The gap between the two lines is shaded green where actual goals are
    at or above xG and red where they are below. Reads the ``match_num``,
    ``cum_goals_for`` and ``cum_xg_for`` columns of ``matches_df``.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    data = matches_df.copy()
    match_num = data['match_num']
    goals = data['cum_goals_for']
    expected = data['cum_xg_for']

    # The two cumulative curves
    ax.plot(match_num, goals, 'b-', linewidth=2,
            label='Actual Goals', marker='o', markersize=4)
    ax.plot(match_num, expected, 'r--', linewidth=2,
            label='Expected Goals (xG)', marker='s', markersize=4)

    # Shade the gap between the curves, one colour per side
    shaded_regions = (
        (goals >= expected, 'green', 'Overperformance'),
        (goals < expected, 'red', 'Underperformance'),
    )
    for mask, colour, legend_label in shaded_regions:
        ax.fill_between(
            match_num,
            goals,
            expected,
            where=mask,
            interpolate=True,
            alpha=0.3,
            color=colour,
            label=legend_label
        )

    ax.set_xlabel('Match Number', fontsize=12)
    ax.set_ylabel('Cumulative Goals / xG', fontsize=12)
    ax.set_title(f'{team_name}: Goals vs xG Progression', fontsize=14)
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig, ax
def plot_rolling_form(
    matches_df: pd.DataFrame,
    window: int = 5,
    team_name: str = "Team",
    season_length: int = 38
) -> Tuple[plt.Figure, np.ndarray]:
    """
    Plot a 2x2 dashboard of rolling form metrics over the season.

    Panels: rolling points against a 2-points-per-game target, rolling
    goals scored vs conceded, rolling xG for vs against, and projected
    final points.

    Parameters
    ----------
    matches_df : pd.DataFrame
        Match table; must contain ``match_num``, ``points``,
        ``goals_for``, ``goals_against``, ``xg_for``, ``xg_against`` and
        ``cum_points``. The input is copied, not modified.
    window : int
        Number of matches for rolling window
    team_name : str
        Name shown in the figure title
    season_length : int
        Total matches in the season, used to extrapolate points per
        game to a final total

    Returns
    -------
    Tuple[plt.Figure, np.ndarray]
        The figure and the 2x2 array of Axes.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    df = matches_df.copy()
    match_num = df['match_num']

    def rolling(column: str):
        return df[column].rolling(window, min_periods=1)

    # 1. Rolling points
    ax = axes[0, 0]
    target = window * 2  # 2 points per game over the window
    points_col = f'rolling_{window}_points'
    df[points_col] = rolling('points').sum()
    ax.plot(match_num, df[points_col], 'b-', linewidth=2)
    ax.axhline(target, color='orange', linestyle='--', alpha=0.7,
               label=f'Target ({target} pts)')
    ax.fill_between(match_num, df[points_col], target, alpha=0.3,
                    where=(df[points_col] >= target), color='green')
    ax.fill_between(match_num, df[points_col], target, alpha=0.3,
                    where=(df[points_col] < target), color='red')
    ax.set_xlabel('Match Number')
    ax.set_ylabel(f'Points (last {window} games)')
    ax.set_title(f'Rolling {window}-Match Points')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 2. Rolling goals scored vs conceded
    ax = axes[0, 1]
    df['rolling_scored'] = rolling('goals_for').mean()
    df['rolling_conceded'] = rolling('goals_against').mean()
    ax.plot(match_num, df['rolling_scored'], 'g-', linewidth=2, label='Scored')
    ax.plot(match_num, df['rolling_conceded'], 'r-', linewidth=2, label='Conceded')
    ax.set_xlabel('Match Number')
    ax.set_ylabel(f'Goals per match (avg of {window})')
    ax.set_title('Rolling Goals Scored vs Conceded')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 3. Rolling xG
    ax = axes[1, 0]
    df['rolling_xg'] = rolling('xg_for').mean()
    df['rolling_xga'] = rolling('xg_against').mean()
    ax.plot(match_num, df['rolling_xg'], 'g--', linewidth=2, label='xG For')
    ax.plot(match_num, df['rolling_xga'], 'r--', linewidth=2, label='xG Against')
    ax.set_xlabel('Match Number')
    ax.set_ylabel(f'xG per match (avg of {window})')
    ax.set_title('Rolling xG For vs Against')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 4. Points projection: points per game extrapolated to a full season
    ax = axes[1, 1]
    df['projected_points'] = (df['cum_points'] / match_num) * season_length
    ax.plot(match_num, df['projected_points'], 'b-', linewidth=2)
    # NOTE(review): the 65/40 reference lines are fixed values; confirm
    # they suit the league when season_length is not 38.
    ax.axhline(65, color='green', linestyle='--', alpha=0.7, label='Top 4 (~65)')
    ax.axhline(40, color='orange', linestyle='--', alpha=0.7, label='Safety (~40)')
    ax.set_xlabel('Match Number')
    ax.set_ylabel('Projected Final Points')
    ax.set_title('Season Points Projection')
    ax.legend()
    ax.grid(True, alpha=0.3)
    # 3 points per game is the highest possible projection; a fixed
    # upper limit of 100 would clip a winning start (3 * 38 = 114).
    ax.set_ylim(0, max(100, season_length * 3))

    fig.suptitle(f'{team_name} Season Tracking', fontsize=16, y=1.02)
    plt.tight_layout()
    return fig, axes
def plot_player_contribution_chart(
    player_totals: pd.DataFrame,
    metric: str = 'goals',
    top_n: int = 10,
    team_name: str = "Team"
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create horizontal bar chart of player contributions.

    Parameters
    ----------
    player_totals : pd.DataFrame
        Per-player totals; must contain ``player_name`` and ``metric``
    metric : str
        Column to rank and plot
    top_n : int
        Maximum number of players to show
    team_name : str
        Name shown in the chart title

    Returns
    -------
    Tuple[plt.Figure, plt.Axes]
        The figure and its single Axes.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    # Get top N players
    top_players = player_totals.nlargest(top_n, metric)
    values = top_players[metric]

    # One label for both axis and title, so a metric such as
    # 'goals_per90' is not shown with a raw underscore in the title.
    metric_label = metric.replace('_', ' ').title()

    # Create bars
    colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(top_players)))
    bars = ax.barh(top_players['player_name'], values,
                   color=colors, edgecolor='black')

    # Add value labels just right of each bar: one decimal for floats,
    # whole numbers otherwise
    for bar, val in zip(bars, values):
        text = f'{val:.1f}' if isinstance(val, float) else str(int(val))
        ax.text(val + 0.1, bar.get_y() + bar.get_height() / 2,
                text, va='center', fontsize=10)

    ax.set_xlabel(metric_label, fontsize=12)
    ax.set_title(f'{team_name}: Top {top_n} by {metric_label}', fontsize=14)
    ax.invert_yaxis()  # Highest at top
    ax.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    return fig, ax
Step 4: Automated Reporting System
"""
season_tracker/reporter.py
Generate automated weekly reports.
"""
import pandas as pd
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
from .metrics import SeasonMetrics, PlayerSeasonMetrics
from . import visualizations as viz
class WeeklyReporter:
    """Generate automated weekly performance reports.

    Each report is a ``week_<n>`` directory holding a text summary, PNG
    charts and CSV exports.
    """

    def __init__(
        self,
        team_name: str,
        matches_df: pd.DataFrame,
        players_df: pd.DataFrame,
        output_dir: Path,
        form_window: int = 5
    ):
        """
        Prepare metrics and create the output directory.

        Parameters
        ----------
        team_name : str
            Name used in headings and chart titles
        matches_df : pd.DataFrame
            Match table passed to ``SeasonMetrics``
        players_df : pd.DataFrame
            Player match statistics passed to ``PlayerSeasonMetrics``
        output_dir : Path
            Directory under which report folders are created; a plain
            string is accepted as well
        form_window : int
            Number of recent matches used for form figures and charts
        """
        self.team_name = team_name
        self.matches = matches_df
        self.players = players_df
        self.form_window = form_window
        # Coerce so that a str path works with the `/` joins below.
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.team_metrics = SeasonMetrics(matches_df)
        self.player_metrics = PlayerSeasonMetrics(players_df, matches_df)

    def generate_weekly_report(self, week_number: int = None) -> Path:
        """
        Generate comprehensive weekly report.

        Parameters
        ----------
        week_number : int, optional
            Week number (defaults to the number of matches played)

        Returns
        -------
        Path
            Path to the directory containing the generated report
        """
        # Explicit None check so that week 0 is honoured, not replaced.
        week = len(self.matches) if week_number is None else week_number
        report_dir = self.output_dir / f"week_{week}"
        report_dir.mkdir(parents=True, exist_ok=True)

        # Generate components
        self._write_summary(report_dir, week)
        self._generate_visualizations(report_dir)
        self._write_player_stats(report_dir)
        self._write_form_analysis(report_dir)
        return report_dir

    def _write_summary(self, report_dir: Path, week: int):
        """Write the plain-text summary to ``summary.txt``."""
        position = self.team_metrics.current_position()
        form = self.team_metrics.form_assessment(self.form_window)

        lines = [
            "=" * 60 + "\n",
            f"WEEKLY REPORT: {self.team_name} - Week {week}\n",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n",
            "=" * 60 + "\n\n",
            "SEASON POSITION\n",
            "-" * 40 + "\n",
        ]

        if not position:
            # current_position() returns {} before the first match;
            # indexing into it would raise KeyError.
            lines.append("No matches played yet.\n\n")
        else:
            lines += [
                f"Matches Played: {position['matches_played']}\n",
                f"Points: {position['points']} "
                f"({position['ppg']} per game)\n",
                f"Record: {position['wins']}W - {position['draws']}D - "
                f"{position['losses']}L\n",
                f"Goals: {position['goals_for']} scored, "
                f"{position['goals_against']} conceded "
                f"(GD: {position['goal_difference']:+d})\n",
                f"xG: {position['xg_for']} for, {position['xg_against']} against "
                f"(xGD: {position['xg_difference']:+.1f})\n",
                f"Goals vs xG: {position['goals_vs_xg']:+.1f}\n\n",
            ]

        if form.get('status') != 'insufficient_data':
            lines += [
                f"CURRENT FORM (Last {self.form_window} matches)\n",
                "-" * 40 + "\n",
                f"Points: {form['form_points']}/{form['max_possible']}\n",
                f"Goals/match: {form['rolling_goals_per_match']} "
                f"(season avg: {form['season_avg_goals']})\n",
                f"xG/match: {form['rolling_xg']} "
                f"(season avg: {form['season_avg_xg']})\n",
                f"Trend: {form['goals_trend_direction'].upper()} "
                f"({form['goals_trend']:+.2f} vs avg)\n",
                f"Defense: {form['defensive_trend_direction'].upper()}\n",
            ]

        # Single write with an explicit encoding, so a non-ASCII team
        # name does not depend on the platform's default.
        (report_dir / "summary.txt").write_text("".join(lines), encoding="utf-8")

    def _generate_visualizations(self, report_dir: Path):
        """Render and save every chart, closing each figure after saving."""
        # xG progression
        fig, _ = viz.plot_xg_vs_goals_progression(
            self.team_metrics.matches,
            self.team_name
        )
        fig.savefig(report_dir / "xg_progression.png", dpi=150, bbox_inches='tight')
        plt.close(fig)

        # Rolling form
        fig, _ = viz.plot_rolling_form(
            self.team_metrics.matches,
            window=self.form_window,
            team_name=self.team_name
        )
        fig.savefig(report_dir / "rolling_form.png", dpi=150, bbox_inches='tight')
        plt.close(fig)

        # Player contributions
        totals = self.player_metrics.player_totals()
        for metric in ['goals', 'assists', 'xg']:
            fig, _ = viz.plot_player_contribution_chart(
                totals, metric=metric, team_name=self.team_name
            )
            fig.savefig(report_dir / f"player_{metric}.png",
                        dpi=150, bbox_inches='tight')
            plt.close(fig)

    def _write_player_stats(self, report_dir: Path):
        """Write player totals and per-90 statistics as CSV files."""
        totals = self.player_metrics.player_totals()
        # 270 minutes is the cut-off for appearing in the per-90 table.
        per90 = self.player_metrics.player_per_90(min_minutes=270)
        totals.to_csv(report_dir / "player_totals.csv", index=False)
        per90.to_csv(report_dir / "player_per90.csv", index=False)

    def _write_form_analysis(self, report_dir: Path):
        """Write the match table with rolling metrics as a CSV file."""
        rolling_df = self.team_metrics.rolling_metrics(self.form_window)
        rolling_df.to_csv(report_dir / "rolling_metrics.csv", index=False)
Example Usage
"""
main.py - Example usage of the season tracker.
"""
import pandas as pd
import numpy as np
from datetime import date, timedelta
from pathlib import Path
from season_tracker.data_models import SeasonData, MatchResult, PlayerMatchStats
from season_tracker.reporter import WeeklyReporter
def generate_sample_season(n_matches: int = 20, seed: int = 42) -> SeasonData:
    """
    Generate sample season data for demonstration.

    Parameters
    ----------
    n_matches : int
        Number of weekly matches to simulate
    seed : int
        Seed for the random generator; the same seed gives the same data

    Returns
    -------
    SeasonData
        Season for "Sample FC" with ``n_matches`` matches and eleven
        player stat lines per match.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # with the module-level functions, without reseeding the global
    # generator for the rest of the program.
    rng = np.random.RandomState(seed)
    season = SeasonData("Sample FC", "2023-24")
    start_date = date(2023, 8, 12)

    for i in range(n_matches):
        match_date = start_date + timedelta(days=i * 7)
        venue = 'home' if i % 2 == 0 else 'away'

        # Opponents are lettered A-Z; past 26 matches a cycle number is
        # appended so names stay readable and unique.
        letter = chr(65 + i % 26)
        cycle = i // 26
        opponent = f"Team {letter}" if cycle == 0 else f"Team {letter}{cycle + 1}"

        # Generate realistic match data. Clamp xG *before* drawing goals:
        # a negative normal draw would make poisson() raise, and the
        # stored xG should be the rate the goals were drawn from.
        xg_for = max(0.2, float(rng.normal(1.5, 0.5)))
        xg_against = max(0.2, float(rng.normal(1.2, 0.4)))
        goals_for = int(rng.poisson(xg_for))
        goals_against = int(rng.poisson(xg_against))

        season.add_match(MatchResult(
            match_id=i + 1,
            date=match_date,
            opponent=opponent,
            venue=venue,
            goals_for=goals_for,
            goals_against=goals_against,
            xg_for=round(xg_for, 2),
            xg_against=round(xg_against, 2)
        ))

        # Generate player stats; only the first four players are given
        # shots and xG.
        for player_num in range(11):
            takes_shots = player_num < 4
            player_xg = float(rng.exponential(0.1)) if takes_shots else 0
            player_goals = 1 if rng.random_sample() < player_xg else 0
            season.add_player_stats(PlayerMatchStats(
                player_id=f"P{player_num}",
                player_name=f"Player {player_num + 1}",
                match_id=i + 1,
                minutes=int(rng.randint(60, 91)),
                goals=player_goals,
                assists=int(rng.choice([0, 1], p=[0.85, 0.15])),
                xg=round(player_xg, 2),
                xa=round(float(rng.exponential(0.08)), 2),
                shots=int(rng.poisson(1.5)) if takes_shots else 0,
                passes=int(rng.poisson(35)),
                pass_accuracy=round(float(rng.normal(82, 5)), 1)
            ))
    return season
def main():
    """Build the demo season, write its weekly report and print the path."""
    sample = generate_sample_season()

    # The reporter works on DataFrames, not on the SeasonData container.
    reporter = WeeklyReporter(
        team_name=sample.team_name,
        matches_df=sample.to_matches_df(),
        players_df=sample.to_players_df(),
        output_dir=Path("outputs/season_reports"),
    )

    report_path = reporter.generate_weekly_report()
    print(f"Report generated at: {report_path}")


if __name__ == "__main__":
    main()
Key Learnings
- Data Modeling: Using dataclasses and clear structures makes data handling predictable and explicitly typed.
- Metric Calculation: Separating metrics into their own class enables reuse and testing.
- Rolling Windows: pandas rolling functions efficiently calculate form indicators.
- Automated Reporting: Structured output generation enables scheduled report automation.
- Visualization Consistency: Reusable plotting functions ensure uniform styling across reports.
Summary
This case study demonstrated building a production-quality season tracking system. The modular architecture separates concerns (data, metrics, visualization, reporting), making the system maintainable and extensible. The rolling window calculations provide crucial form indicators that help identify performance trends before they become obvious in raw results.