Case Study 2: Season-Long Performance Tracker

Introduction

Tracking team and player performance across an entire season requires different approaches than single-match analysis. Data must be aggregated efficiently, trends identified, and comparisons made against historical baselines. This case study builds a comprehensive season tracking system using advanced pandas techniques.

The Scenario

A club's analytics department needs to:

- Track weekly xG performance vs. actual goals
- Monitor player form over rolling windows
- Identify early warning signs of declining performance
- Compare the current season to historical averages
- Generate automated weekly reports

Step 1: Data Architecture

Season Data Structure

"""
season_tracker/data_models.py

Define data structures for season tracking.
"""

import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Optional
from datetime import date


@dataclass
class MatchResult:
    """Outcome of one fixture, seen from the tracked team's side."""
    match_id: int
    date: date
    opponent: str
    venue: str  # 'home' or 'away'
    goals_for: int
    goals_against: int
    xg_for: float
    xg_against: float

    @property
    def points(self) -> int:
        """League points earned: 3 for a win, 1 for a draw, 0 for a loss."""
        if self.goals_for == self.goals_against:
            return 1
        return 3 if self.goals_for > self.goals_against else 0

    @property
    def result(self) -> str:
        """Single-letter outcome code: 'W', 'D' or 'L'."""
        scored, conceded = self.goals_for, self.goals_against
        if scored == conceded:
            return 'D'
        return 'W' if scored > conceded else 'L'


@dataclass
class PlayerMatchStats:
    """Player statistics for a single match.

    One record describes one player's contribution to one match.
    ``match_id`` is the key used to join these records to match data
    (``PlayerSeasonMetrics`` merges on it to obtain the match date).
    """
    player_id: str  # grouped on, together with player_name, for season totals
    player_name: str  # display name; also used to select a player's form rows
    match_id: int  # join key to the match records
    minutes: int  # presumably minutes played in this match; denominator for per-90 rates
    goals: int
    assists: int
    xg: float  # expected goals (xG) attributed to the player
    xa: float  # presumably expected assists -- confirm with the data provider
    shots: int
    passes: int
    pass_accuracy: float  # presumably a percentage (sample data centres on 82) -- confirm


class SeasonData:
    """Container for one team's match results and player stats in a season.

    Records are accumulated with ``add_match`` / ``add_player_stats`` and
    exported as DataFrames. The exported frames carry their full column
    set even when no records have been added, so consumers can select
    columns such as ``'date'`` without special-casing an empty season.
    """

    # Column layout of the frame returned by ``to_matches_df``: the
    # MatchResult fields followed by its two derived properties.
    _MATCH_COLUMNS = (
        'match_id', 'date', 'opponent', 'venue', 'goals_for',
        'goals_against', 'xg_for', 'xg_against', 'points', 'result',
    )

    # Column layout used for an empty ``to_players_df`` frame; mirrors the
    # field order of PlayerMatchStats.
    _PLAYER_COLUMNS = (
        'player_id', 'player_name', 'match_id', 'minutes', 'goals',
        'assists', 'xg', 'xa', 'shots', 'passes', 'pass_accuracy',
    )

    def __init__(self, team_name: str, season: str):
        """Create an empty container for ``team_name`` in ``season``."""
        self.team_name = team_name
        self.season = season
        self.matches: List[MatchResult] = []
        self.player_stats: List[PlayerMatchStats] = []

    def add_match(self, match: "MatchResult"):
        """Append one match result to the season."""
        self.matches.append(match)

    def add_player_stats(self, stats: "PlayerMatchStats"):
        """Append one player's statistics for one match."""
        self.player_stats.append(stats)

    def to_matches_df(self) -> pd.DataFrame:
        """Convert matches to a DataFrame sorted by date.

        Returns
        -------
        pd.DataFrame
            One row per match with the columns listed in
            ``_MATCH_COLUMNS``; ``date`` is converted to pandas datetime.
            With no matches the frame is empty but keeps those columns.
        """
        if not self.matches:
            empty = pd.DataFrame(columns=list(self._MATCH_COLUMNS))
            empty['date'] = pd.to_datetime(empty['date'])
            return empty

        # getattr covers both the stored fields and the computed
        # ``points`` / ``result`` properties.
        rows = [
            {column: getattr(m, column) for column in self._MATCH_COLUMNS}
            for m in self.matches
        ]

        df = pd.DataFrame(rows)
        df['date'] = pd.to_datetime(df['date'])
        return df.sort_values('date').reset_index(drop=True)

    def to_players_df(self) -> pd.DataFrame:
        """Convert player stats to a DataFrame.

        Returns
        -------
        pd.DataFrame
            One row per stored record, in insertion order, with one
            column per record attribute. With no records the frame is
            empty but keeps the columns in ``_PLAYER_COLUMNS``.
        """
        if not self.player_stats:
            return pd.DataFrame(columns=list(self._PLAYER_COLUMNS))

        return pd.DataFrame([vars(p) for p in self.player_stats])

Step 2: Performance Metrics Calculator

"""
season_tracker/metrics.py

Calculate season-level and rolling metrics.
"""

import pandas as pd
import numpy as np
from typing import Dict, Tuple


class SeasonMetrics:
    """Calculate comprehensive season metrics.

    Wraps a per-match DataFrame (one row per match), orders it by date
    and augments it with cumulative columns on construction. The
    augmented frame is available as ``self.matches``.
    """

    def __init__(self, matches_df: pd.DataFrame):
        """
        Initialize with matches DataFrame.

        Parameters
        ----------
        matches_df : pd.DataFrame
            Expected columns: date, goals_for, goals_against, xg_for,
            xg_against, points, result. The frame is copied, so the
            caller's object is left untouched.
        """
        self.matches = matches_df.copy()
        self.matches = self.matches.sort_values('date').reset_index(drop=True)
        self._add_cumulative_stats()

    def _add_cumulative_stats(self):
        """Add cumulative and per-match difference columns in place."""
        df = self.matches

        # Cumulative totals: cum_goals_for, cum_goals_against, cum_xg_for,
        # cum_xg_against, cum_points (in that column order).
        for column in ('goals_for', 'goals_against', 'xg_for',
                       'xg_against', 'points'):
            df[f'cum_{column}'] = df[column].cumsum()

        # 1-based match counter in date order
        df['match_num'] = range(1, len(df) + 1)

        # Goal difference
        df['goal_diff'] = df['goals_for'] - df['goals_against']
        df['cum_goal_diff'] = df['cum_goals_for'] - df['cum_goals_against']

        # xG performance (positive goals_vs_xg = scoring more than expected)
        df['xg_diff'] = df['xg_for'] - df['xg_against']
        df['goals_vs_xg'] = df['goals_for'] - df['xg_for']
        df['cum_goals_vs_xg'] = df['cum_goals_for'] - df['cum_xg_for']

    def rolling_metrics(self, window: int = 5) -> pd.DataFrame:
        """
        Calculate rolling window metrics.

        Parameters
        ----------
        window : int
            Number of matches for rolling window

        Returns
        -------
        pd.DataFrame
            Copy of ``self.matches`` with added ``rolling_{window}_*``
            columns: mean goals, xg, conceded and xga, summed points, and
            win rate. ``min_periods=1`` means early rows use however many
            matches are available rather than being NaN.
        """
        df = self.matches.copy()

        # Rolling averages: output suffix -> source column
        averaged = {
            'goals': 'goals_for',
            'xg': 'xg_for',
            'conceded': 'goals_against',
            'xga': 'xg_against',
        }
        for suffix, source in averaged.items():
            df[f'rolling_{window}_{suffix}'] = df[source].rolling(
                window, min_periods=1
            ).mean()

        # Rolling points (form)
        df[f'rolling_{window}_points'] = df['points'].rolling(
            window, min_periods=1
        ).sum()

        # Rolling win rate: mean of a 0/1 win indicator
        is_win = (df['result'] == 'W').astype(int)
        df[f'rolling_{window}_win_rate'] = is_win.rolling(
            window, min_periods=1
        ).mean()

        return df

    def current_position(self) -> Dict:
        """
        Calculate current season position metrics.

        Returns
        -------
        Dict
            Dictionary of current season statistics, or an empty dict
            when no matches have been played.
        """
        df = self.matches

        if len(df) == 0:
            return {}

        # The last row holds the season-to-date cumulative totals.
        latest = df.iloc[-1]
        n_matches = len(df)
        outcomes = df['result']

        return {
            'matches_played': n_matches,
            'points': int(latest['cum_points']),
            'ppg': round(latest['cum_points'] / n_matches, 2),
            'goals_for': int(latest['cum_goals_for']),
            'goals_against': int(latest['cum_goals_against']),
            'goal_difference': int(latest['cum_goal_diff']),
            'xg_for': round(latest['cum_xg_for'], 2),
            'xg_against': round(latest['cum_xg_against'], 2),
            'xg_difference': round(latest['cum_xg_for'] - latest['cum_xg_against'], 2),
            'goals_vs_xg': round(latest['cum_goals_vs_xg'], 2),
            'wins': int((outcomes == 'W').sum()),
            'draws': int((outcomes == 'D').sum()),
            'losses': int((outcomes == 'L').sum()),
        }

    def form_assessment(self, window: int = 5) -> Dict:
        """
        Assess current form vs season average.

        Parameters
        ----------
        window : int
            Number of most recent matches that define "current form".

        Returns
        -------
        Dict
            Form assessment with comparisons, or
            ``{'status': 'insufficient_data'}`` when fewer than
            ``window`` matches have been played. A trend of exactly zero
            is reported as 'down' / 'declining'.
        """
        if len(self.matches) < window:
            return {'status': 'insufficient_data'}

        df = self.rolling_metrics(window)
        latest = df.iloc[-1]

        # Average only the numeric columns that are needed. A bare
        # ``df.mean()`` also visits the string columns (opponent, venue,
        # result) and raises TypeError on pandas >= 2.0.
        season_avg = df[['goals_for', 'xg_for', 'goals_against']].mean()

        # Compare rolling to season average
        goals_trend = latest[f'rolling_{window}_goals'] - season_avg['goals_for']
        xg_trend = latest[f'rolling_{window}_xg'] - season_avg['xg_for']
        # Sign flipped so that conceding fewer than average is positive.
        defensive_trend = season_avg['goals_against'] - latest[f'rolling_{window}_conceded']

        return {
            'window': window,
            'rolling_goals_per_match': round(latest[f'rolling_{window}_goals'], 2),
            'season_avg_goals': round(season_avg['goals_for'], 2),
            'goals_trend': round(goals_trend, 2),
            'goals_trend_direction': 'up' if goals_trend > 0 else 'down',
            'rolling_xg': round(latest[f'rolling_{window}_xg'], 2),
            'season_avg_xg': round(season_avg['xg_for'], 2),
            'xg_trend': round(xg_trend, 2),
            'defensive_trend': round(defensive_trend, 2),
            'defensive_trend_direction': 'improving' if defensive_trend > 0 else 'declining',
            'form_points': int(latest[f'rolling_{window}_points']),
            'max_possible': window * 3,
        }


class PlayerSeasonMetrics:
    """Calculate player-level season metrics.

    Holds the per-match player records joined with each match's date and
    ordered by player name, then date, in ``self.players``.
    """

    def __init__(self, player_df: pd.DataFrame, matches_df: pd.DataFrame):
        """
        Initialize with player and match data.

        Parameters
        ----------
        player_df : pd.DataFrame
            Player match statistics
        matches_df : pd.DataFrame
            Match information (for dates); must contain ``match_id`` and
            ``date``. The join is a left merge, so player rows whose
            ``match_id`` has no match record are kept with a missing date.
        """
        self.players = player_df.merge(
            matches_df[['match_id', 'date']],
            on='match_id',
            how='left'
        ).sort_values(['player_name', 'date'])

    def player_totals(self) -> pd.DataFrame:
        """Calculate season totals per player.

        Returns
        -------
        pd.DataFrame
            One row per (player_id, player_name) with summed minutes,
            goals, assists, xg, xa, shots and passes, plus an
            ``appearances`` column counting that player's rows.
        """
        return self.players.groupby(['player_id', 'player_name']).agg({
            'minutes': 'sum',
            'goals': 'sum',
            'assists': 'sum',
            'xg': 'sum',
            'xa': 'sum',
            'shots': 'sum',
            'passes': 'sum',
            'match_id': 'count'
        }).rename(columns={'match_id': 'appearances'}).reset_index()

    def player_per_90(self, min_minutes: int = 450) -> pd.DataFrame:
        """
        Calculate per-90 statistics.

        Parameters
        ----------
        min_minutes : int
            Minimum minutes for inclusion

        Returns
        -------
        pd.DataFrame
            Season totals for qualified players with an added
            ``{stat}_per90`` column per counting stat, rounded to two
            decimals.
        """
        totals = self.player_totals()
        qualified = totals[totals['minutes'] >= min_minutes].copy()

        # Calculate per 90: season total scaled to a 90-minute match
        for col in ['goals', 'assists', 'xg', 'xa', 'shots', 'passes']:
            qualified[f'{col}_per90'] = (
                qualified[col] / qualified['minutes'] * 90
            ).round(2)

        return qualified

    def player_form(self, player_name: str, window: int = 5) -> pd.DataFrame:
        """
        Calculate rolling form for a specific player.

        Parameters
        ----------
        player_name : str
            Name of the player
        window : int
            Rolling window size

        Returns
        -------
        pd.DataFrame
            The player's rows in date order with ``rolling_goals``,
            ``rolling_xg`` and ``rolling_minutes`` (sums over the last
            ``window`` appearances). The columns are always present:
            with fewer than ``window`` appearances the sums cover the
            appearances available so far.
        """
        player_data = self.players[
            self.players['player_name'] == player_name
        ].copy()

        # No early return for short histories: min_periods=1 already
        # handles partial windows, and callers get one stable schema.
        for stat in ('goals', 'xg', 'minutes'):
            player_data[f'rolling_{stat}'] = player_data[stat].rolling(
                window, min_periods=1
            ).sum()

        return player_data

Step 3: Visualization Dashboard

"""
season_tracker/visualizations.py

Create season tracking visualizations.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Optional


def plot_xg_vs_goals_progression(
    matches_df: pd.DataFrame,
    team_name: str = "Team"
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Draw cumulative goals against cumulative xG, match by match.

    The gap between the two lines is shaded green where goals are at or
    above xG and red where they are below. ``matches_df`` must provide
    the ``match_num``, ``cum_goals_for`` and ``cum_xg_for`` columns.
    Returns the created figure and its single axes.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    data = matches_df.copy()
    match_no = data['match_num']
    goals = data['cum_goals_for']
    xg = data['cum_xg_for']

    # The two cumulative curves
    ax.plot(match_no, goals, 'b-', linewidth=2,
            label='Actual Goals', marker='o', markersize=4)
    ax.plot(match_no, xg, 'r--', linewidth=2,
            label='Expected Goals (xG)', marker='s', markersize=4)

    # Shade the gap: (mask, colour, legend label) per region
    shaded_regions = (
        (goals >= xg, 'green', 'Overperformance'),
        (goals < xg, 'red', 'Underperformance'),
    )
    for mask, colour, region_label in shaded_regions:
        ax.fill_between(
            match_no,
            goals,
            xg,
            where=mask,
            interpolate=True,
            alpha=0.3,
            color=colour,
            label=region_label
        )

    ax.set_xlabel('Match Number', fontsize=12)
    ax.set_ylabel('Cumulative Goals / xG', fontsize=12)
    ax.set_title(f'{team_name}: Goals vs xG Progression', fontsize=14)
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig, ax


def plot_rolling_form(
    matches_df: pd.DataFrame,
    window: int = 5,
    team_name: str = "Team",
    season_length: int = 38,
    top4_points: float = 65,
    safety_points: float = 40
) -> Tuple[plt.Figure, np.ndarray]:
    """
    Plot rolling form metrics over the season on a 2x2 grid.

    Panels: rolling points against a two-points-per-game target, rolling
    goals scored vs conceded, rolling xG for vs against, and the final
    points total projected from points-per-game so far.

    Parameters
    ----------
    matches_df : pd.DataFrame
        Per-match data with columns match_num, points, goals_for,
        goals_against, xg_for, xg_against and cum_points.
    window : int
        Number of matches in each rolling window.
    team_name : str
        Name used in the figure title.
    season_length : int
        Matches in a full season, used to extrapolate the points
        projection. The default of 38 matches the original behaviour.
    top4_points, safety_points : float
        Heights of the two reference lines on the projection panel.

    Returns
    -------
    Tuple[plt.Figure, np.ndarray]
        The figure and the 2x2 array of axes.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    df = matches_df.copy()
    match_num = df['match_num']

    def rolling(column):
        # Partial windows are allowed so the first matches are plotted too.
        return df[column].rolling(window, min_periods=1)

    def finish(ax, ylabel, title):
        # Axis dressing shared by all four panels.
        ax.set_xlabel('Match Number')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    # 1. Rolling points
    ax = axes[0, 0]
    target = window * 2  # two points per game over the window
    form_points = rolling('points').sum()
    ax.plot(match_num, form_points, 'b-', linewidth=2)
    ax.axhline(target, color='orange', linestyle='--', alpha=0.7,
               label=f'Target ({target} pts)')
    ax.fill_between(match_num, form_points, target, alpha=0.3,
                    where=(form_points >= target), color='green')
    ax.fill_between(match_num, form_points, target, alpha=0.3,
                    where=(form_points < target), color='red')
    finish(ax, f'Points (last {window} games)', f'Rolling {window}-Match Points')

    # 2. Rolling goals scored vs conceded
    ax = axes[0, 1]
    ax.plot(match_num, rolling('goals_for').mean(), 'g-', linewidth=2,
            label='Scored')
    ax.plot(match_num, rolling('goals_against').mean(), 'r-', linewidth=2,
            label='Conceded')
    finish(ax, f'Goals per match (avg of {window})',
           'Rolling Goals Scored vs Conceded')

    # 3. Rolling xG
    ax = axes[1, 0]
    ax.plot(match_num, rolling('xg_for').mean(), 'g--', linewidth=2,
            label='xG For')
    ax.plot(match_num, rolling('xg_against').mean(), 'r--', linewidth=2,
            label='xG Against')
    finish(ax, f'xG per match (avg of {window})', 'Rolling xG For vs Against')

    # 4. Points projection: points-per-game so far times season length
    ax = axes[1, 1]
    projected_points = (df['cum_points'] / match_num) * season_length
    ax.plot(match_num, projected_points, 'b-', linewidth=2)
    ax.axhline(top4_points, color='green', linestyle='--', alpha=0.7,
               label=f'Top 4 (~{top4_points})')
    ax.axhline(safety_points, color='orange', linestyle='--', alpha=0.7,
               label=f'Safety (~{safety_points})')
    finish(ax, 'Projected Final Points', 'Season Points Projection')
    # 0-100 for a 38-match season; scaled proportionally for other lengths.
    ax.set_ylim(0, 100 * season_length / 38)

    fig.suptitle(f'{team_name} Season Tracking', fontsize=16, y=1.02)
    plt.tight_layout()
    return fig, axes


def plot_player_contribution_chart(
    player_totals: pd.DataFrame,
    metric: str = 'goals',
    top_n: int = 10,
    team_name: str = "Team"
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Draw a horizontal bar chart ranking players by ``metric``.

    Only the ``top_n`` largest values are shown, highest at the top, and
    each bar is annotated with its value. Returns the figure and axes.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    # Rank the players and pull out the values to plot
    leaders = player_totals.nlargest(top_n, metric)
    values = leaders[metric]

    # One shade of blue per bar
    shades = plt.cm.Blues(np.linspace(0.4, 0.8, len(leaders)))
    bars = ax.barh(leaders['player_name'], values,
                   color=shades, edgecolor='black')

    # Annotate each bar just past its end
    for bar, val in zip(bars, values):
        if isinstance(val, float):
            text = f'{val:.1f}'
        else:
            text = str(int(val))
        ax.text(val + 0.1, bar.get_y() + bar.get_height()/2,
                text, va='center', fontsize=10)

    ax.set_xlabel(metric.replace('_', ' ').title(), fontsize=12)
    ax.set_title(f'{team_name}: Top {top_n} by {metric.title()}', fontsize=14)
    ax.invert_yaxis()  # Highest at top
    ax.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    return fig, ax

Step 4: Automated Reporting System

"""
season_tracker/reporter.py

Generate automated weekly reports.
"""

import pandas as pd
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt

from .metrics import SeasonMetrics, PlayerSeasonMetrics
from . import visualizations as viz


class WeeklyReporter:
    """Generate automated weekly performance reports.

    Each report is a ``week_<n>`` directory under ``output_dir`` holding
    a text summary, PNG charts and CSV tables.
    """

    # Minimum season minutes for a player to appear in the per-90 table.
    PER90_MIN_MINUTES = 270

    def __init__(
        self,
        team_name: str,
        matches_df: pd.DataFrame,
        players_df: pd.DataFrame,
        output_dir: Path,
        form_window: int = 5
    ):
        """
        Parameters
        ----------
        team_name : str
            Name used in report titles.
        matches_df : pd.DataFrame
            Per-match team data, passed to ``SeasonMetrics``.
        players_df : pd.DataFrame
            Per-match player data, passed to ``PlayerSeasonMetrics``.
        output_dir : Path
            Root directory for reports; created if missing. A plain
            string is accepted as well.
        form_window : int
            Number of recent matches used for the form section, the
            rolling-form chart and the rolling-metrics CSV.
        """
        self.team_name = team_name
        self.matches = matches_df
        self.players = players_df
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.form_window = form_window

        self.team_metrics = SeasonMetrics(matches_df)
        self.player_metrics = PlayerSeasonMetrics(players_df, matches_df)

    def generate_weekly_report(self, week_number: int = None) -> Path:
        """
        Generate comprehensive weekly report.

        Parameters
        ----------
        week_number : int, optional
            Week number; when omitted (None) the number of matches
            played so far is used.

        Returns
        -------
        Path
            Path to the generated report directory
        """
        # Explicit None test so that week 0 is honoured rather than
        # silently replaced by the match count.
        week = len(self.matches) if week_number is None else week_number
        report_dir = self.output_dir / f"week_{week}"
        report_dir.mkdir(parents=True, exist_ok=True)

        # Generate components
        self._write_summary(report_dir, week)
        self._generate_visualizations(report_dir)
        self._write_player_stats(report_dir)
        self._write_form_analysis(report_dir)

        return report_dir

    def _write_summary(self, report_dir: Path, week: int):
        """Write the text summary to ``summary.txt`` in ``report_dir``."""
        position = self.team_metrics.current_position()
        form = self.team_metrics.form_assessment(self.form_window)

        lines = [
            "=" * 60 + "\n",
            f"WEEKLY REPORT: {self.team_name} - Week {week}\n",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n",
            "=" * 60 + "\n\n",
        ]

        if not position:
            # current_position() returns {} before the first match.
            lines.append("No matches played yet.\n")
        else:
            lines += [
                "SEASON POSITION\n",
                "-" * 40 + "\n",
                f"Matches Played: {position['matches_played']}\n",
                f"Points: {position['points']} "
                f"({position['ppg']} per game)\n",
                f"Record: {position['wins']}W - {position['draws']}D - "
                f"{position['losses']}L\n",
                f"Goals: {position['goals_for']} scored, "
                f"{position['goals_against']} conceded "
                f"(GD: {position['goal_difference']:+d})\n",
                f"xG: {position['xg_for']} for, {position['xg_against']} against "
                f"(xGD: {position['xg_difference']:+.1f})\n",
                f"Goals vs xG: {position['goals_vs_xg']:+.1f}\n\n",
            ]

            if form.get('status') != 'insufficient_data':
                lines += [
                    f"CURRENT FORM (Last {self.form_window} matches)\n",
                    "-" * 40 + "\n",
                    f"Points: {form['form_points']}/{form['max_possible']}\n",
                    f"Goals/match: {form['rolling_goals_per_match']} "
                    f"(season avg: {form['season_avg_goals']})\n",
                    f"xG/match: {form['rolling_xg']} "
                    f"(season avg: {form['season_avg_xg']})\n",
                    f"Trend: {form['goals_trend_direction'].upper()} "
                    f"({form['goals_trend']:+.2f} vs avg)\n",
                    f"Defense: {form['defensive_trend_direction'].upper()}\n",
                ]

        # Explicit encoding so non-ASCII team names are written the same
        # way on every platform.
        with open(report_dir / "summary.txt", 'w', encoding='utf-8') as f:
            f.writelines(lines)

    @staticmethod
    def _save_figure(fig, path: Path):
        """Save ``fig`` to ``path`` and close it, even if saving fails."""
        try:
            fig.savefig(path, dpi=150, bbox_inches='tight')
        finally:
            plt.close(fig)

    def _generate_visualizations(self, report_dir: Path):
        """Generate all visualizations as PNG files in ``report_dir``."""
        # xG progression
        fig, _ = viz.plot_xg_vs_goals_progression(
            self.team_metrics.matches,
            self.team_name
        )
        self._save_figure(fig, report_dir / "xg_progression.png")

        # Rolling form
        fig, _ = viz.plot_rolling_form(
            self.team_metrics.matches,
            window=self.form_window,
            team_name=self.team_name
        )
        self._save_figure(fig, report_dir / "rolling_form.png")

        # Player contributions
        totals = self.player_metrics.player_totals()
        for metric in ['goals', 'assists', 'xg']:
            fig, _ = viz.plot_player_contribution_chart(
                totals, metric=metric, team_name=self.team_name
            )
            self._save_figure(fig, report_dir / f"player_{metric}.png")

    def _write_player_stats(self, report_dir: Path):
        """Write player season totals and per-90 statistics as CSV."""
        totals = self.player_metrics.player_totals()
        per90 = self.player_metrics.player_per_90(
            min_minutes=self.PER90_MIN_MINUTES
        )

        totals.to_csv(report_dir / "player_totals.csv", index=False)
        per90.to_csv(report_dir / "player_per90.csv", index=False)

    def _write_form_analysis(self, report_dir: Path):
        """Write the per-match rolling metrics table as CSV."""
        rolling_df = self.team_metrics.rolling_metrics(self.form_window)
        rolling_df.to_csv(report_dir / "rolling_metrics.csv", index=False)

Example Usage

"""
main.py - Example usage of the season tracker.
"""

import pandas as pd
import numpy as np
from datetime import date, timedelta
from pathlib import Path

from season_tracker.data_models import SeasonData, MatchResult, PlayerMatchStats
from season_tracker.reporter import WeeklyReporter


def generate_sample_season(n_matches: int = 20, seed: int = 42) -> SeasonData:
    """Generate sample season data for demonstration.

    Parameters
    ----------
    n_matches : int
        Number of weekly matches to simulate (20 by default).
    seed : int
        Seed for NumPy's global random generator, so the sample is
        reproducible.

    Returns
    -------
    SeasonData
        Season for "Sample FC" with ``n_matches`` matches, alternating
        home and away, and eleven player records per match.
    """
    np.random.seed(seed)

    season = SeasonData("Sample FC", "2023-24")
    start_date = date(2023, 8, 12)

    for i in range(n_matches):
        match_date = start_date + timedelta(days=i * 7)
        venue = 'home' if i % 2 == 0 else 'away'

        # Generate realistic match data
        xg_for = np.random.normal(1.5, 0.5)
        xg_against = np.random.normal(1.2, 0.4)
        # A normal draw can be negative and np.random.poisson rejects a
        # negative rate, so clamp at zero. Non-negative draws pass through
        # unchanged, leaving the random sequence as before.
        goals_for = np.random.poisson(max(0.0, xg_for))
        goals_against = np.random.poisson(max(0.0, xg_against))

        match = MatchResult(
            match_id=i + 1,
            date=match_date,
            # Letters wrap around after 'Z' when more than 26 matches are requested.
            opponent=f"Team {chr(65 + i % 26)}",
            venue=venue,
            goals_for=goals_for,
            goals_against=goals_against,
            # Recorded xG has a floor of 0.2
            xg_for=round(max(0.2, xg_for), 2),
            xg_against=round(max(0.2, xg_against), 2)
        )
        season.add_match(match)

        # Generate player stats; only the first four players get shots and xG
        for player_num in range(11):
            player_xg = np.random.exponential(0.1) if player_num < 4 else 0
            player_goals = 1 if np.random.random() < player_xg else 0

            stats = PlayerMatchStats(
                player_id=f"P{player_num}",
                player_name=f"Player {player_num + 1}",
                match_id=i + 1,
                minutes=np.random.randint(60, 91),
                goals=player_goals,
                assists=np.random.choice([0, 1], p=[0.85, 0.15]),
                xg=round(player_xg, 2),
                xa=round(np.random.exponential(0.08), 2),
                shots=np.random.poisson(1.5) if player_num < 4 else 0,
                passes=np.random.poisson(35),
                pass_accuracy=round(np.random.normal(82, 5), 1)
            )
            season.add_player_stats(stats)

    return season


def main():
    """Build the sample season and write one weekly report for it."""
    sample = generate_sample_season()

    # Hand the season's DataFrames to a reporter rooted at the output folder
    reporter = WeeklyReporter(
        team_name=sample.team_name,
        matches_df=sample.to_matches_df(),
        players_df=sample.to_players_df(),
        output_dir=Path("outputs/season_reports"),
    )

    # Produce the report and tell the user where it landed
    location = reporter.generate_weekly_report()
    print(f"Report generated at: {location}")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Key Learnings

- Data Modeling: Using dataclasses and clear structures makes data handling predictable and self-documenting. The type annotations guide readers and static checkers; Python does not enforce them at runtime.
- Metric Calculation: Separating metrics into their own classes enables reuse and testing.
- Rolling Windows: pandas rolling functions calculate form indicators efficiently.
- Automated Reporting: Structured output generation makes it possible to schedule reports automatically.
- Visualization Consistency: Reusable plotting functions ensure uniform styling across reports.

Summary

This case study demonstrated building a production-quality season tracking system. The modular architecture separates concerns (data, metrics, visualization, reporting), making the system maintainable and extensible. The rolling window calculations provide crucial form indicators that help identify performance trends before they become obvious in raw results.