Case Study 1: Building a Match Analysis Dashboard

Introduction

Professional soccer clubs need rapid, comprehensive match analysis immediately after games end. Analysts must quickly process event data, calculate key metrics, and present findings to coaching staff. This case study walks through building a complete match analysis dashboard using the Python tools covered in this chapter.

The Scenario

You are a data analyst for a Premier League club. The head coach wants a standardized post-match report that includes:

  1. Match Summary: Goals, shots, possession
  2. Team Comparison: Key statistics side by side
  3. Shot Analysis: Shot map and xG breakdown
  4. Passing Analysis: Completion rates and progressive passes
  5. Key Player Performances: Top contributors

The report must be generated within 30 minutes of match end.

Step 1: Project Setup

Directory Structure

match_analysis/
├── src/
│   ├── __init__.py
│   ├── data_loader.py
│   ├── metrics.py
│   ├── visualizations.py
│   └── report_generator.py
├── outputs/
│   └── reports/
├── config.py
└── main.py

Configuration

# config.py
"""Configuration for match analysis dashboard."""

from pathlib import Path

# Directory that contains this config file; every other path is derived from it.
PROJECT_ROOT = Path(__file__).parent
# Root folder for generated reports (the report generator adds one sub-folder per match).
OUTPUT_DIR = PROJECT_ROOT / "outputs" / "reports"
# NOTE: executed at import time, so merely importing config creates the folder.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# StatsBomb competition/season (World Cup 2018 for demo)
# NOTE(review): neither ID is referenced by the modules shown in this case study.
COMPETITION_ID = 43
SEASON_ID = 3

# Visualization settings
FIGURE_DPI = 150          # dpi passed to fig.savefig() for every saved figure
PITCH_COLOR = '#22312b'   # pitch background; also used as the shot-map figure facecolor
LINE_COLOR = 'white'      # colour of the pitch markings

# Metrics thresholds
# NOTE(review): the metrics module shown uses its own literal 0.15 instead of
# this constant - keep the two values in sync or wire this one through.
HIGH_XG_THRESHOLD = 0.15  # "Big chance" xG threshold

Step 2: Data Loading Module

# src/data_loader.py
"""Load and preprocess match data from StatsBomb."""

import pandas as pd
from statsbombpy import sb
from typing import Tuple, Optional
import logging

logger = logging.getLogger(__name__)


class MatchDataLoader:
    """Load and preprocess StatsBomb event data for a single match.

    ``events``, ``lineups`` and ``match_info`` stay ``None`` until
    :meth:`load` has been called.
    """

    def __init__(self, match_id: int):
        """
        Initialize loader for a specific match.

        Parameters
        ----------
        match_id : int
            StatsBomb match ID
        """
        self.match_id = match_id
        self.events = None
        self.lineups = None
        self.match_info = None

    def load(self) -> 'MatchDataLoader':
        """Fetch events and lineups, preprocess the events and return ``self``."""
        logger.info("Loading data for match %s", self.match_id)

        # Load events
        self.events = sb.events(match_id=self.match_id)
        self._preprocess_events()

        # Load lineups
        self.lineups = sb.lineups(match_id=self.match_id)

        # Extract match info from events
        self.match_info = self._extract_match_info()

        logger.info("Loaded %d events", len(self.events))
        return self

    @staticmethod
    def _coordinate(loc, axis: int):
        """Return ``loc[axis]`` for a list/tuple location, otherwise ``None``."""
        if isinstance(loc, (list, tuple)) and len(loc) > axis:
            return loc[axis]
        return None

    def _split_location(self, df: pd.DataFrame, source: str,
                        x_col: str, y_col: str) -> None:
        """Write the x/y parts of the ``source`` location column into ``df``.

        When ``source`` is absent (e.g. an event set without any passes)
        the target columns are still created, filled with NaN, so that
        downstream code can rely on them existing.
        """
        if source in df.columns:
            df[x_col] = df[source].apply(lambda loc: self._coordinate(loc, 0))
            df[y_col] = df[source].apply(lambda loc: self._coordinate(loc, 1))
        else:
            df[x_col] = float('nan')
            df[y_col] = float('nan')

    def _preprocess_events(self):
        """Add flat coordinate columns and compact integer dtypes.

        Adds ``x``/``y`` (from ``location``) and ``end_x``/``end_y``
        (from ``pass_end_location``) and downcasts ``minute``/``period``.
        The frame held in ``self.events`` is modified in place.
        """
        df = self.events

        # Extract coordinates
        self._split_location(df, 'location', 'x', 'y')

        # Extract end coordinates for passes
        self._split_location(df, 'pass_end_location', 'end_x', 'end_y')

        # Convert types for memory efficiency
        df['minute'] = df['minute'].astype('int16')
        df['period'] = df['period'].astype('int8')

        self.events = df

    def _extract_match_info(self) -> dict:
        """Extract match metadata from events.

        NOTE: ``teams`` is ordered by first appearance in the event stream,
        so ``home_team``/``away_team`` are simply the first and second team
        seen - the events alone do not say which side actually played at
        home. Verify against the match listing if the distinction matters.
        """
        teams = self.events['team'].dropna().unique()

        return {
            'match_id': self.match_id,
            'teams': list(teams),
            'home_team': teams[0] if len(teams) > 0 else None,
            'away_team': teams[1] if len(teams) > 1 else None,
        }

    def _loaded_events(self) -> pd.DataFrame:
        """Return the events frame, failing clearly if nothing is loaded yet."""
        if self.events is None:
            raise RuntimeError("Match data not loaded; call load() first.")
        return self.events

    def get_team_events(self, team: str) -> pd.DataFrame:
        """Get a copy of the events belonging to ``team``."""
        events = self._loaded_events()
        return events[events['team'] == team].copy()

    def get_shots(self) -> pd.DataFrame:
        """Get a copy of all shot events in the match."""
        events = self._loaded_events()
        return events[events['type'] == 'Shot'].copy()

    def get_passes(self) -> pd.DataFrame:
        """Get a copy of all pass events in the match."""
        events = self._loaded_events()
        return events[events['type'] == 'Pass'].copy()

Step 3: Metrics Calculation Module

# src/metrics.py
"""Calculate match and player metrics."""

import pandas as pd
import numpy as np
from typing import Dict, List


class MatchMetrics:
    """Calculate team-, player- and shot-level statistics from match events.

    Parameters
    ----------
    events : pd.DataFrame
        Flattened StatsBomb-style events. The methods read the columns
        ``team``, ``player``, ``type``, ``shot_outcome``, ``pass_outcome``,
        ``shot_statsbomb_xg`` and the coordinate columns ``x`` / ``y``.
    teams : List[str]
        Team names to report on.
    """

    # StatsBomb pitches are 120 x 80; the attacking penalty area covers
    # x >= 102 and 18 <= y <= 62.
    _BOX_MIN_X = 102
    _BOX_MIN_Y = 18
    _BOX_MAX_Y = 62

    _PLAYER_COLUMNS = [
        'player', 'team', 'passes', 'pass_accuracy',
        'shots', 'goals', 'xG', 'touches',
    ]

    def __init__(self, events: pd.DataFrame, teams: List[str]):
        self.events = events
        self.teams = teams

    @staticmethod
    def _total_xg(shots: pd.DataFrame):
        """Sum the StatsBomb xG of ``shots``; 0 when the xG column is absent."""
        if 'shot_statsbomb_xg' in shots.columns:
            return shots['shot_statsbomb_xg'].sum()
        return 0

    def calculate_team_stats(self) -> pd.DataFrame:
        """Calculate headline statistics for each team.

        Returns
        -------
        pd.DataFrame
            One row per team with ``goals``, ``shots``, ``shots_on_target``,
            ``xG``, ``passes``, ``pass_accuracy`` (%) and ``possession`` (%).

        Notes
        -----
        * ``goals`` counts scored shots plus ``'Own Goal For'`` events.
          StatsBomb records own goals as their own event type rather than
          as shots, so counting shots alone under-reports the score.
        * ``possession`` is an approximation: the team's share of all events.
        """
        team_stats = []
        total_events = len(self.events)

        for team in self.teams:
            team_events = self.events[self.events['team'] == team]

            # Shots and xG
            shots = team_events[team_events['type'] == 'Shot']
            shot_goals = int((shots['shot_outcome'] == 'Goal').sum())
            own_goals_for = int((team_events['type'] == 'Own Goal For').sum())
            total_xg = self._total_xg(shots)

            # Passes: StatsBomb leaves pass_outcome empty for completed passes
            passes = team_events[team_events['type'] == 'Pass']
            completed_passes = int(passes['pass_outcome'].isna().sum())
            pass_accuracy = completed_passes / len(passes) if len(passes) > 0 else 0

            # Possession approximation (% of events); guard the empty frame
            if total_events > 0:
                possession = len(team_events) / total_events * 100
            else:
                possession = 0.0

            team_stats.append({
                'team': team,
                'goals': shot_goals + own_goals_for,
                'shots': len(shots),
                'shots_on_target': len(shots[shots['shot_outcome'].isin(['Goal', 'Saved'])]),
                'xG': round(total_xg, 2),
                'passes': len(passes),
                'pass_accuracy': round(pass_accuracy * 100, 1),
                'possession': round(possession, 1),
            })

        return pd.DataFrame(team_stats)

    def calculate_player_stats(self, team: str = None) -> pd.DataFrame:
        """Calculate per-player statistics, sorted by touches (descending).

        Parameters
        ----------
        team : str, optional
            Restrict the output to this team's players.

        Returns
        -------
        pd.DataFrame
            One row per player. When no player events exist, an empty frame
            with the usual columns is returned instead of raising.
        """
        events = self.events
        if team:
            events = events[events['team'] == team]

        player_stats = []
        # A single grouping pass replaces re-scanning every event per player.
        # sort=False keeps players in order of first appearance; rows with a
        # missing player are dropped by groupby.
        for player, player_events in events.groupby('player', sort=False):
            # Passes
            passes = player_events[player_events['type'] == 'Pass']
            completed = int(passes['pass_outcome'].isna().sum())

            # Shots
            shots = player_events[player_events['type'] == 'Shot']
            goals = int((shots['shot_outcome'] == 'Goal').sum())
            xg = self._total_xg(shots)

            player_stats.append({
                'player': player,
                'team': player_events['team'].iloc[0],
                'passes': len(passes),
                'pass_accuracy': round(completed / len(passes) * 100, 1) if len(passes) > 0 else 0,
                'shots': len(shots),
                'goals': goals,
                'xG': round(xg, 2),
                'touches': len(player_events),
            })

        # Passing the column list keeps the schema stable for an empty result.
        result = pd.DataFrame(player_stats, columns=self._PLAYER_COLUMNS)
        return result.sort_values('touches', ascending=False)

    def calculate_shot_analysis(self, big_chance_xg: float = 0.15) -> Dict:
        """Detailed shot analysis per team.

        Parameters
        ----------
        big_chance_xg : float, default 0.15
            Minimum xG for a shot to count as a "big chance".

        Returns
        -------
        Dict
            ``{team: {...}}`` with ``total_shots``, ``big_chances``,
            ``shots_inside_box``, ``shots_outside_box``, ``total_xg`` and
            ``avg_xg_per_shot`` (NaN for a team without shots).

        Notes
        -----
        A shot is inside the box when ``x >= 102`` and ``18 <= y <= 62``.
        If the events carry no ``y`` column only the ``x`` test is applied.
        Shots without a location are counted in neither box bucket.
        """
        shots = self.events[self.events['type'] == 'Shot']
        has_y = 'y' in shots.columns

        analysis = {}
        for team in self.teams:
            team_shots = shots[shots['team'] == team]
            xg = team_shots['shot_statsbomb_xg']

            in_box = team_shots['x'] >= self._BOX_MIN_X
            if has_y:
                in_box = in_box & team_shots['y'].between(
                    self._BOX_MIN_Y, self._BOX_MAX_Y
                )
            located = team_shots['x'].notna()

            analysis[team] = {
                'total_shots': len(team_shots),
                'big_chances': int((xg >= big_chance_xg).sum()),
                'shots_inside_box': int(in_box.sum()),
                'shots_outside_box': int((located & ~in_box).sum()),
                'total_xg': xg.sum(),
                'avg_xg_per_shot': xg.mean(),
            }

        return analysis

Step 4: Visualization Module

# src/visualizations.py
"""Create match visualizations."""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mplsoccer import Pitch
from typing import Tuple, Optional
import config


def create_shot_map(
    shots: pd.DataFrame,
    team: str,
    title: str = None
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create a half-pitch shot map for one team.

    Marker area grows with the shot's xG; goals are drawn in red on top of
    the other shots, and a text line reports total xG against goals scored.

    Parameters
    ----------
    shots : pd.DataFrame
        Shot data with ``team``, ``x``, ``y``, ``shot_statsbomb_xg`` and
        ``shot_outcome`` columns
    team : str
        Team name
    title : str, optional
        Plot title

    Returns
    -------
    Tuple[plt.Figure, plt.Axes]
        Matplotlib figure and axes
    """
    pitch = Pitch(
        pitch_type='statsbomb',
        pitch_color=config.PITCH_COLOR,
        line_color=config.LINE_COLOR,
        half=True
    )

    fig, ax = pitch.draw(figsize=(10, 7))

    team_shots = shots[shots['team'] == team]
    scored = team_shots['shot_outcome'] == 'Goal'
    goals = team_shots[scored]
    non_goals = team_shots[~scored]

    def marker_sizes(frame: pd.DataFrame) -> pd.Series:
        # The +50 floor keeps very low-xG shots visible.
        return frame['shot_statsbomb_xg'] * 500 + 50

    # Plot non-goals
    if len(non_goals) > 0:
        pitch.scatter(
            non_goals['x'], non_goals['y'],
            s=marker_sizes(non_goals),
            c='white', edgecolors='black', alpha=0.6,
            ax=ax, zorder=2, label='No Goal'
        )

    # Plot goals above the non-goals (higher zorder)
    if len(goals) > 0:
        pitch.scatter(
            goals['x'], goals['y'],
            s=marker_sizes(goals),
            c='red', edgecolors='black', alpha=0.9,
            ax=ax, zorder=3, label='Goal'
        )

    # Add xG annotation. With half=True only x in [60, 120] is drawn, so
    # x=90 is the horizontal centre of the visible half; anchoring the
    # centred text at x=60 would leave it hanging over the halfway line.
    total_xg = team_shots['shot_statsbomb_xg'].sum()
    actual_goals = len(goals)
    ax.text(
        90, 75,
        f"xG: {total_xg:.2f} | Goals: {actual_goals}",
        ha='center', fontsize=12, color='white'
    )

    if title:
        ax.set_title(title, fontsize=14, color='white')

    # Only build a legend when something was plotted; otherwise matplotlib
    # warns that there are no labelled artists.
    if len(team_shots) > 0:
        ax.legend(loc='upper left', fontsize=10)

    return fig, ax


def create_comparison_bar_chart(
    team_stats: pd.DataFrame,
    metrics: list = None
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Draw one small bar chart per metric, with one bar per team.

    Parameters
    ----------
    team_stats : pd.DataFrame
        Team statistics; needs a ``team`` column plus one column per metric
    metrics : list, optional
        Metric column names to plot. Defaults to shots, shots on target,
        pass accuracy and possession.

    Returns
    -------
    Tuple[plt.Figure, plt.Axes]
        The figure and whatever ``plt.subplots`` returned for the axes
        (a single Axes for one metric, an array of Axes otherwise)
    """
    if metrics is None:
        metrics = ['shots', 'shots_on_target', 'pass_accuracy', 'possession']

    team_names = team_stats['team'].tolist()
    palette = ['#1f77b4', '#ff7f0e']
    panel_count = len(metrics)

    fig, axes = plt.subplots(1, panel_count, figsize=(4 * panel_count, 5))

    # plt.subplots hands back a bare Axes (not an array) for a single panel.
    panels = axes if panel_count > 1 else [axes]

    for panel, metric in zip(panels, metrics):
        heading = metric.replace('_', ' ').title()
        heights = team_stats[metric].tolist()

        rects = panel.bar(team_names, heights, color=palette, edgecolor='black')
        panel.set_ylabel(heading)
        panel.set_title(heading)

        # Print each value just above its bar
        for rect, height in zip(rects, heights):
            if isinstance(height, float):
                caption = f'{height:.1f}'
            else:
                caption = str(height)
            panel.text(
                rect.get_x() + rect.get_width() / 2,
                rect.get_height() + 0.5,
                caption,
                ha='center', va='bottom', fontsize=10
            )

    plt.tight_layout()
    return fig, axes


def create_xg_timeline(
    events: pd.DataFrame,
    teams: list
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create cumulative xG timeline.

    Each team gets a step line that rises by a shot's xG at the minute the
    shot was taken; goals are marked with a dot on the line.

    Parameters
    ----------
    events : pd.DataFrame
        Match events with ``team``, ``type``, ``minute``,
        ``shot_statsbomb_xg`` and ``shot_outcome`` columns. A ``second``
        column, when present, is used to order shots within a minute.
    teams : list
        Team names (only the first two receive a colour and are drawn)

    Returns
    -------
    Tuple[plt.Figure, plt.Axes]
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    colors = ['#1f77b4', '#ff7f0e']
    last_shot_minute = 0

    for team, color in zip(teams, colors):
        shots = events[
            (events['team'] == team) &
            (events['type'] == 'Shot')
        ].copy()

        # A stable sort keeps same-minute shots in their recorded order, so
        # the cumulative xG shown at each goal marker is reproducible.
        sort_keys = ['minute', 'second'] if 'second' in shots.columns else ['minute']
        shots = shots.sort_values(sort_keys, kind='mergesort')
        shots['cumulative_xg'] = shots['shot_statsbomb_xg'].cumsum()

        if len(shots) > 0:
            last_shot_minute = max(last_shot_minute, int(shots['minute'].max()))

        # Add starting point
        minutes = [0] + shots['minute'].tolist()
        xg_values = [0] + shots['cumulative_xg'].tolist()

        ax.step(minutes, xg_values, where='post', linewidth=2,
                color=color, label=f"{team}")

        # Mark goals
        goals = shots[shots['shot_outcome'] == 'Goal']
        if len(goals) > 0:
            ax.scatter(
                goals['minute'],
                goals['cumulative_xg'],
                s=100, c=color, edgecolors='black',
                zorder=5, marker='o'
            )

    ax.set_xlabel('Minute', fontsize=12)
    ax.set_ylabel('Cumulative xG', fontsize=12)
    ax.set_title('xG Timeline', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Default to a regulation-length axis but widen it when shots were taken
    # later (extra time), instead of cutting them off at minute 95.
    ax.set_xlim(0, max(95, last_shot_minute + 1))

    plt.tight_layout()
    return fig, ax

Step 5: Report Generator

# src/report_generator.py
"""Generate comprehensive match reports."""

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging

from .data_loader import MatchDataLoader
from .metrics import MatchMetrics
from . import visualizations as viz
import config

logger = logging.getLogger(__name__)


class MatchReport:
    """Generate a complete post-match analysis report.

    The report is a directory holding a text summary, one shot map per
    team, a team comparison chart, an xG timeline and a player stats CSV.
    """

    def __init__(self, match_id: int):
        self.match_id = match_id
        self.loader = None
        self.metrics = None
        self.output_dir = None

    def generate(self, output_dir: Path = None) -> Path:
        """
        Generate complete match report.

        Parameters
        ----------
        output_dir : Path, optional
            Output directory (a plain string path is accepted too).
            Defaults to ``config.OUTPUT_DIR / "match_<match_id>"``.

        Returns
        -------
        Path
            Path to report directory
        """
        # Setup output; Path() lets callers pass a string without breaking mkdir
        if output_dir:
            self.output_dir = Path(output_dir)
        else:
            self.output_dir = config.OUTPUT_DIR / f"match_{self.match_id}"
        self.output_dir.mkdir(parents=True, exist_ok=True)

        logger.info("Generating report for match %s", self.match_id)

        # Load data
        self.loader = MatchDataLoader(self.match_id)
        self.loader.load()

        # Calculate metrics
        teams = self.loader.match_info['teams']
        self.metrics = MatchMetrics(self.loader.events, teams)

        # Generate components
        self._generate_summary()
        self._generate_shot_maps()
        self._generate_comparison_charts()
        self._generate_xg_timeline()
        self._generate_player_stats()

        logger.info("Report saved to %s", self.output_dir)
        return self.output_dir

    def _generate_summary(self):
        """Write ``summary.txt`` with the headline numbers for each team."""
        team_stats = self.metrics.calculate_team_stats()

        rule = "=" * 50
        lines = [rule + "\n", "MATCH SUMMARY\n", rule + "\n\n"]

        for _, row in team_stats.iterrows():
            lines.append(f"{row['team']}\n")
            lines.append(f"  Goals: {row['goals']}\n")
            lines.append(f"  Shots: {row['shots']} ({row['shots_on_target']} on target)\n")
            lines.append(f"  xG: {row['xG']}\n")
            lines.append(f"  Pass Accuracy: {row['pass_accuracy']}%\n")
            lines.append(f"  Possession: {row['possession']}%\n\n")

        lines.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")

        summary_path = self.output_dir / "summary.txt"
        # Explicit UTF-8: team names can contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)

    def _generate_shot_maps(self):
        """Save one ``shot_map_<team>.png`` per team."""
        shots = self.loader.get_shots()
        teams = self.loader.match_info['teams']

        for team in teams:
            fig, ax = viz.create_shot_map(shots, team, f"{team} Shot Map")
            fig.savefig(
                self.output_dir / f"shot_map_{team.lower().replace(' ', '_')}.png",
                dpi=config.FIGURE_DPI,
                # match the pitch colour so the saved image has no white border
                facecolor=config.PITCH_COLOR,
                bbox_inches='tight'
            )
            # close explicitly so figures do not accumulate in pyplot
            plt.close(fig)

    def _generate_comparison_charts(self):
        """Save the side-by-side team comparison as ``team_comparison.png``."""
        team_stats = self.metrics.calculate_team_stats()

        fig, ax = viz.create_comparison_bar_chart(team_stats)
        fig.savefig(
            self.output_dir / "team_comparison.png",
            dpi=config.FIGURE_DPI,
            bbox_inches='tight'
        )
        plt.close(fig)

    def _generate_xg_timeline(self):
        """Save the cumulative xG chart as ``xg_timeline.png``."""
        teams = self.loader.match_info['teams']

        fig, ax = viz.create_xg_timeline(self.loader.events, teams)
        fig.savefig(
            self.output_dir / "xg_timeline.png",
            dpi=config.FIGURE_DPI,
            bbox_inches='tight'
        )
        plt.close(fig)

    def _generate_player_stats(self):
        """Write per-player statistics to ``player_stats.csv``."""
        player_stats = self.metrics.calculate_player_stats()
        player_stats.to_csv(
            self.output_dir / "player_stats.csv",
            index=False
        )

Step 6: Main Entry Point

# main.py
"""Main entry point for match analysis."""

import logging
from src.report_generator import MatchReport

# Configure logging
# Root logger at INFO so the INFO-level progress messages from the src.*
# module loggers are emitted; every record is prefixed with timestamp,
# logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


def main():
    """Build the report for the demo match and print where it was written."""
    # World Cup 2018 Final: France vs Croatia
    # NOTE(review): confirm this ID against the StatsBomb match listing for
    # competition 43 / season 3 - the 2018 final is commonly listed as 8658.
    demo_match_id = 7298

    saved_to = MatchReport(demo_match_id).generate()

    print("\nReport generated successfully!")
    print(f"Output: {saved_to}")


if __name__ == "__main__":
    main()

Running the Dashboard

# Navigate to project directory
cd match_analysis

# Run the analysis
python main.py

Output:

2024-01-15 14:32:01 - src.report_generator - INFO - Generating report for match 7298
2024-01-15 14:32:01 - src.data_loader - INFO - Loading data for match 7298
2024-01-15 14:32:05 - src.data_loader - INFO - Loaded 3524 events
2024-01-15 14:32:08 - src.report_generator - INFO - Report saved to outputs/reports/match_7298

Report generated successfully!
Output: outputs/reports/match_7298

Generated Report Contents

The report directory contains:

match_7298/
├── summary.txt              # Text summary of match
├── shot_map_france.png      # France shot map
├── shot_map_croatia.png     # Croatia shot map
├── team_comparison.png      # Side-by-side statistics
├── xg_timeline.png          # Cumulative xG chart
└── player_stats.csv         # Detailed player statistics

Key Learnings

  1. Modular Design: Separating data loading, metrics, visualization, and reporting makes code maintainable and testable.

  2. Configuration Management: Centralizing settings in config.py makes adjustments easy.

  3. Logging: Proper logging helps debug issues in production.

  4. Efficient Data Processing: Filtering events with pandas boolean masks and using built-in aggregations (sum, mean, cumsum) keeps metric calculation fast for single-match data.

  5. Visualization Consistency: Using shared configurations ensures uniform styling.

Extension Ideas

  1. Add pass network visualization
  2. Include defensive metrics (pressures, tackles)
  3. Generate HTML report with embedded images
  4. Add comparison to team's season averages
  5. Create automated email distribution

Summary

This case study demonstrated how to build a production-quality match analysis dashboard using the Python tools from Chapter 4. The modular architecture allows easy extension and maintenance, while efficient pandas operations ensure reports generate quickly.