Case Study 1: Building a Match Analysis Dashboard
Introduction
Professional soccer clubs need rapid, comprehensive match analysis immediately after games end. Analysts must quickly process event data, calculate key metrics, and present findings to coaching staff. This case study walks through building a complete match analysis dashboard using the Python tools covered in this chapter.
The Scenario
You are a data analyst for a Premier League club. The head coach wants a standardized post-match report that includes:
- Match Summary: Goals, shots, possession
- Team Comparison: Key statistics side by side
- Shot Analysis: Shot map and xG breakdown
- Passing Analysis: Completion rates and progressive passes
- Key Player Performances: Top contributors
The report must be generated within 30 minutes of match end.
Step 1: Project Setup
Directory Structure
match_analysis/
├── src/
│ ├── __init__.py
│ ├── data_loader.py
│ ├── metrics.py
│ ├── visualizations.py
│ └── report_generator.py
├── outputs/
│ └── reports/
├── config.py
└── main.py
Configuration
# config.py
"""Configuration for match analysis dashboard.

Central place for filesystem paths, the StatsBomb competition/season
selection, and plotting/metric constants used by the other modules.
Importing this module creates ``OUTPUT_DIR`` on disk if it is missing.
"""
from pathlib import Path
# Directory that contains this config file; all other paths derive from it.
PROJECT_ROOT = Path(__file__).parent
# Default parent directory for generated match reports.
OUTPUT_DIR = PROJECT_ROOT / "outputs" / "reports"
# Import-time side effect: make sure the report directory exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# StatsBomb competition/season (World Cup 2018 for demo)
COMPETITION_ID = 43
SEASON_ID = 3
# Visualization settings
FIGURE_DPI = 150  # dpi used when figures are saved to disk
PITCH_COLOR = '#22312b'  # pitch background; also the facecolor of saved shot maps
LINE_COLOR = 'white'  # pitch line markings
# Metrics thresholds
HIGH_XG_THRESHOLD = 0.15 # "Big chance" xG threshold
Step 2: Data Loading Module
# src/data_loader.py
"""Load and preprocess match data from StatsBomb."""
import pandas as pd
from statsbombpy import sb
from typing import Tuple, Optional
import logging
logger = logging.getLogger(__name__)
class MatchDataLoader:
    """Load and preprocess StatsBomb event data for a single match.

    Call :meth:`load` first; it fills ``events``, ``lineups`` and
    ``match_info``. The ``get_*`` accessors return filtered *copies* of
    ``events`` so callers can modify them freely.
    """

    # Raw location column -> names of the (x, y) columns derived from it.
    _COORDINATE_COLUMNS = {
        'location': ('x', 'y'),
        'pass_end_location': ('end_x', 'end_y'),
    }

    def __init__(self, match_id: int):
        """
        Initialize loader for a specific match.

        Parameters
        ----------
        match_id : int
            StatsBomb match ID
        """
        self.match_id = match_id
        # All three stay None until load() has run.
        self.events = None
        self.lineups = None
        self.match_info = None

    def load(self) -> 'MatchDataLoader':
        """
        Fetch events and lineups, preprocess them and derive match info.

        Returns
        -------
        MatchDataLoader
            ``self``, so calls can be chained.
        """
        logger.info("Loading data for match %s", self.match_id)
        # Load events
        self.events = sb.events(match_id=self.match_id)
        self._preprocess_events()
        # Load lineups
        self.lineups = sb.lineups(match_id=self.match_id)
        # Extract match info from events
        self.match_info = self._extract_match_info()
        logger.info("Loaded %s events", len(self.events))
        return self

    @staticmethod
    def _coordinate(loc, axis: int):
        """Return element ``axis`` of a location list, or None if unavailable."""
        if isinstance(loc, (list, tuple)) and len(loc) > axis:
            return loc[axis]
        return None

    def _preprocess_events(self):
        """
        Add ``x``/``y``/``end_x``/``end_y`` columns and downcast time columns.

        Works in place on ``self.events``. A missing source column (for
        example no ``pass_end_location`` in a filtered extract) yields
        all-NaN coordinate columns instead of a ``KeyError``.
        """
        df = self.events
        for source, targets in self._COORDINATE_COLUMNS.items():
            for axis, target in enumerate(targets):
                if source in df.columns:
                    values = df[source].apply(
                        lambda loc, axis=axis: self._coordinate(loc, axis)
                    )
                    # Force a float column so later numeric comparisons
                    # work even when every location is missing.
                    df[target] = values.astype('float64')
                else:
                    df[target] = float('nan')
        # Convert types for memory efficiency
        df['minute'] = df['minute'].astype('int16')
        df['period'] = df['period'].astype('int8')
        self.events = df

    def _extract_match_info(self) -> dict:
        """
        Extract match metadata from events.

        Returns
        -------
        dict
            ``match_id``, ``teams`` (in order of first appearance in the
            events), ``home_team`` and ``away_team``.
        """
        teams = self.events['team'].dropna().unique()
        return {
            'match_id': self.match_id,
            'teams': list(teams),
            # NOTE(review): home/away are simply the first and second team
            # seen in the event data - confirm this matches the fixture.
            'home_team': teams[0] if len(teams) > 0 else None,
            'away_team': teams[1] if len(teams) > 1 else None,
        }

    def _require_events(self) -> pd.DataFrame:
        """Return the loaded events, or fail clearly if load() was skipped."""
        if self.events is None:
            raise RuntimeError(
                "No events loaded; call load() before querying match data."
            )
        return self.events

    def get_team_events(self, team: str) -> pd.DataFrame:
        """Get a copy of the events for a specific team."""
        events = self._require_events()
        return events[events['team'] == team].copy()

    def get_shots(self) -> pd.DataFrame:
        """Get a copy of all shots in the match."""
        events = self._require_events()
        return events[events['type'] == 'Shot'].copy()

    def get_passes(self) -> pd.DataFrame:
        """Get a copy of all passes in the match."""
        events = self._require_events()
        return events[events['type'] == 'Pass'].copy()
Step 3: Metrics Calculation Module
# src/metrics.py
"""Calculate match and player metrics."""
import pandas as pd
import numpy as np
from typing import Dict, List
class MatchMetrics:
    """Calculate team, player and shot statistics from match events.

    Parameters
    ----------
    events : pd.DataFrame
        Event rows with at least ``team``, ``player`` and ``type`` columns.
        Optional columns (``shot_outcome``, ``shot_statsbomb_xg``,
        ``pass_outcome``, ``x``, ``y``) are treated as all-missing when
        absent.
    teams : List[str]
        Team names to report on.
    """

    # Penalty area of the attacked goal in StatsBomb coordinates
    # (120 x 80 pitch): 18 deep from the goal line, 44 wide and centred.
    _BOX_MIN_X = 102
    _BOX_MIN_Y = 18
    _BOX_MAX_Y = 62

    _TEAM_COLUMNS = [
        'team', 'goals', 'shots', 'shots_on_target', 'xG',
        'passes', 'pass_accuracy', 'possession',
    ]
    _PLAYER_COLUMNS = [
        'player', 'team', 'passes', 'pass_accuracy',
        'shots', 'goals', 'xG', 'touches',
    ]

    def __init__(self, events: pd.DataFrame, teams: List[str]):
        self.events = events
        self.teams = teams

    @staticmethod
    def _column(frame: pd.DataFrame, name: str) -> pd.Series:
        """Return ``frame[name]``, or an all-NaN Series if the column is absent."""
        if name in frame.columns:
            return frame[name]
        return pd.Series(np.nan, index=frame.index, dtype='float64')

    def _pass_counts(self, events: pd.DataFrame):
        """Return ``(attempted, completed)`` passes; a missing outcome means completed."""
        passes = events[events['type'] == 'Pass']
        completed = int(self._column(passes, 'pass_outcome').isna().sum())
        return len(passes), completed

    def calculate_team_stats(self) -> pd.DataFrame:
        """
        Calculate statistics for each team.

        Returns
        -------
        pd.DataFrame
            One row per team with goals, shots, shots on target, xG,
            passes, pass accuracy (%) and possession (%). ``goals``
            counts scored shots plus ``Own Goal For`` events credited to
            the team. Possession is approximated as the team's share of
            all events and is 0 when there are no events.
        """
        total_events = len(self.events)
        rows = []
        for team in self.teams:
            team_events = self.events[self.events['team'] == team]
            # Shots and xG
            shots = team_events[team_events['type'] == 'Shot']
            outcomes = self._column(shots, 'shot_outcome')
            total_xg = float(self._column(shots, 'shot_statsbomb_xg').sum())
            # Own goals are separate event types, not shots, so they must
            # be added explicitly for the scoreline to be right.
            own_goals_for = int((team_events['type'] == 'Own Goal For').sum())
            goals = int((outcomes == 'Goal').sum()) + own_goals_for
            # Passes
            attempted, completed = self._pass_counts(team_events)
            pass_accuracy = completed / attempted if attempted > 0 else 0
            # Possession approximation (% of events)
            possession = (
                len(team_events) / total_events * 100 if total_events else 0.0
            )
            rows.append({
                'team': team,
                'goals': goals,
                'shots': len(shots),
                'shots_on_target': int(outcomes.isin(['Goal', 'Saved']).sum()),
                'xG': round(total_xg, 2),
                'passes': attempted,
                'pass_accuracy': round(pass_accuracy * 100, 1),
                'possession': round(possession, 1),
            })
        return pd.DataFrame(rows, columns=self._TEAM_COLUMNS)

    def calculate_player_stats(self, team: str = None) -> pd.DataFrame:
        """
        Calculate per-player statistics.

        Parameters
        ----------
        team : str, optional
            Restrict the result to this team's players.

        Returns
        -------
        pd.DataFrame
            One row per player, sorted by ``touches`` (number of events)
            in descending order. Empty, but with all columns present,
            when no player events exist.
        """
        events = self.events
        if team:
            events = events[events['team'] == team]
        rows = []
        # One pass over the data; rows without a player are dropped by groupby.
        for player, player_events in events.groupby('player', sort=False):
            attempted, completed = self._pass_counts(player_events)
            shots = player_events[player_events['type'] == 'Shot']
            goals = int((self._column(shots, 'shot_outcome') == 'Goal').sum())
            xg = float(self._column(shots, 'shot_statsbomb_xg').sum())
            rows.append({
                'player': player,
                'team': player_events['team'].iloc[0],
                'passes': attempted,
                'pass_accuracy': (
                    round(completed / attempted * 100, 1) if attempted > 0 else 0
                ),
                'shots': len(shots),
                'goals': goals,
                'xG': round(xg, 2),
                'touches': len(player_events),
            })
        stats = pd.DataFrame(rows, columns=self._PLAYER_COLUMNS)
        # Stable sort keeps players with equal touches in a deterministic order.
        return stats.sort_values('touches', ascending=False, kind='mergesort')

    def calculate_shot_analysis(self, big_chance_threshold: float = 0.15) -> Dict:
        """
        Detailed shot analysis per team.

        Parameters
        ----------
        big_chance_threshold : float, optional
            Minimum xG (inclusive) for a shot to count as a big chance.

        Returns
        -------
        Dict
            ``{team: {...}}`` with total shots, big chances, shots
            inside/outside the penalty area, total xG and average xG per
            shot (NaN for a team without shots). Shots without a location
            are counted in neither the inside nor the outside bucket.
        """
        shots = self.events[self.events['type'] == 'Shot']
        analysis = {}
        for team in self.teams:
            team_shots = shots[shots['team'] == team]
            xg = self._column(team_shots, 'shot_statsbomb_xg')
            x = self._column(team_shots, 'x')
            y = self._column(team_shots, 'y')
            # The box is bounded in width too; depth alone would count
            # shots taken near the corner flags as "inside the box".
            in_box = (x >= self._BOX_MIN_X) & y.between(
                self._BOX_MIN_Y, self._BOX_MAX_Y
            )
            located = x.notna() & y.notna()
            analysis[team] = {
                'total_shots': len(team_shots),
                'big_chances': int((xg >= big_chance_threshold).sum()),
                'shots_inside_box': int(in_box.sum()),
                'shots_outside_box': int((located & ~in_box).sum()),
                'total_xg': float(xg.sum()),
                'avg_xg_per_shot': float(xg.mean()),
            }
        return analysis
Step 4: Visualization Module
# src/visualizations.py
"""Create match visualizations."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mplsoccer import Pitch
from typing import Tuple, Optional
import config
def create_shot_map(
    shots: pd.DataFrame,
    team: str,
    title: str = None
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create a half-pitch shot map for one team.

    Marker area grows with xG; goals are drawn in red above non-goals.
    A line with total xG and goals scored from shots is written near the
    bottom of the plot.

    Parameters
    ----------
    shots : pd.DataFrame
        Shot data with ``team``, ``x``, ``y``, ``shot_statsbomb_xg`` and
        ``shot_outcome`` columns
    team : str
        Team name
    title : str, optional
        Plot title

    Returns
    -------
    Tuple[plt.Figure, plt.Axes]
        Matplotlib figure and axes
    """
    pitch = Pitch(
        pitch_type='statsbomb',
        pitch_color=config.PITCH_COLOR,
        line_color=config.LINE_COLOR,
        half=True
    )
    fig, ax = pitch.draw(figsize=(10, 7))

    team_shots = shots[shots['team'] == team].copy()
    # A missing xG would produce a NaN marker size; treat it as 0 so the
    # shot is still drawn at the minimum size.
    team_shots['shot_statsbomb_xg'] = team_shots['shot_statsbomb_xg'].fillna(0)
    scored = team_shots['shot_outcome'] == 'Goal'

    # (subset, colour, alpha, zorder, legend label); goals sit on top.
    layers = (
        (team_shots[~scored], 'white', 0.6, 2, 'No Goal'),
        (team_shots[scored], 'red', 0.9, 3, 'Goal'),
    )
    for subset, colour, alpha, zorder, label in layers:
        if subset.empty:
            continue
        pitch.scatter(
            subset['x'], subset['y'],
            s=subset['shot_statsbomb_xg'] * 500 + 50,
            c=colour, edgecolors='black', alpha=alpha,
            ax=ax, zorder=zorder, label=label
        )

    # Add xG annotation
    total_xg = team_shots['shot_statsbomb_xg'].sum()
    actual_goals = int(scored.sum())
    # The half pitch spans x = 60..120, so x = 90 is its horizontal centre;
    # x = 60 would centre the text on the left edge of the plot.
    ax.text(
        90, 75,
        f"xG: {total_xg:.2f} | Goals: {actual_goals}",
        ha='center', fontsize=12, color='white'
    )
    if title:
        ax.set_title(title, fontsize=14, color='white')
    # Only draw a legend when something was plotted; an empty legend
    # triggers a matplotlib warning.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(loc='upper left', fontsize=10)
    return fig, ax
def create_comparison_bar_chart(
    team_stats: pd.DataFrame,
    metrics: list = None
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Draw one bar-chart panel per metric, comparing the teams side by side.

    Parameters
    ----------
    team_stats : pd.DataFrame
        Team statistics with a ``team`` column and one column per metric
    metrics : list, optional
        Metric column names to plot; defaults to shots, shots on target,
        pass accuracy and possession

    Returns
    -------
    Tuple[plt.Figure, plt.Axes]
        The figure and whatever ``plt.subplots`` returned for the axes:
        a single Axes for one metric, an array of Axes otherwise.
    """
    if metrics is None:
        metrics = ['shots', 'shots_on_target', 'pass_accuracy', 'possession']

    team_names = team_stats['team'].tolist()
    panel_count = len(metrics)
    fig, axes = plt.subplots(1, panel_count, figsize=(4 * panel_count, 5))
    # plt.subplots hands back a bare Axes when there is only one panel.
    panels = list(axes) if panel_count > 1 else [axes]

    for metric, panel in zip(metrics, panels):
        heights = team_stats[metric].tolist()
        pretty_name = metric.replace('_', ' ').title()
        rects = panel.bar(
            team_names, heights,
            color=['#1f77b4', '#ff7f0e'], edgecolor='black'
        )
        panel.set_ylabel(pretty_name)
        panel.set_title(pretty_name)
        # Write each value just above its bar.
        for rect, height in zip(rects, heights):
            if isinstance(height, float):
                label = f'{height:.1f}'
            else:
                label = str(height)
            panel.text(
                rect.get_x() + rect.get_width() / 2,
                rect.get_height() + 0.5,
                label,
                ha='center', va='bottom', fontsize=10
            )

    plt.tight_layout()
    return fig, axes
def create_xg_timeline(
    events: pd.DataFrame,
    teams: list
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create cumulative xG timeline.

    Each team gets a step line that starts at 0, rises at every shot and
    runs to the end of the match; goals are marked with a dot on the line.

    Parameters
    ----------
    events : pd.DataFrame
        Match events with ``team``, ``type``, ``minute``,
        ``shot_statsbomb_xg`` and ``shot_outcome`` columns; ``second`` is
        used to order shots within a minute when present
    teams : list
        Team names (only the first two are drawn, one per colour)

    Returns
    -------
    Tuple[plt.Figure, plt.Axes]
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    colors = ['#1f77b4', '#ff7f0e']

    # Show at least 0-95, but widen the axis for extra time or long
    # stoppage so late shots are not clipped.
    last_minute = events['minute'].max()
    end_minute = 95 if pd.isna(last_minute) else max(95, int(last_minute) + 1)
    sort_keys = ['minute', 'second'] if 'second' in events.columns else ['minute']

    for team, color in zip(teams, colors):
        shots = events[
            (events['team'] == team) &
            (events['type'] == 'Shot')
        ].copy()
        # Stable sort so shots in the same minute keep a deterministic order.
        shots = shots.sort_values(sort_keys, kind='mergesort')
        # A shot without xG adds nothing rather than leaving a NaN gap.
        shots['cumulative_xg'] = shots['shot_statsbomb_xg'].fillna(0).cumsum()
        final_xg = shots['cumulative_xg'].iloc[-1] if len(shots) > 0 else 0

        # Anchor the line at kick-off and carry the final total to full time.
        minutes = [0] + shots['minute'].tolist() + [end_minute]
        xg_values = [0] + shots['cumulative_xg'].tolist() + [final_xg]
        ax.step(minutes, xg_values, where='post', linewidth=2,
                color=color, label=f"{team}")

        # Mark goals
        goals = shots[shots['shot_outcome'] == 'Goal']
        if len(goals) > 0:
            ax.scatter(
                goals['minute'],
                goals['cumulative_xg'],
                s=100, c=color, edgecolors='black',
                zorder=5, marker='o'
            )

    ax.set_xlabel('Minute', fontsize=12)
    ax.set_ylabel('Cumulative xG', fontsize=12)
    ax.set_title('xG Timeline', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, end_minute)
    plt.tight_layout()
    return fig, ax
Step 5: Report Generator
# src/report_generator.py
"""Generate comprehensive match reports."""
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging
from .data_loader import MatchDataLoader
from .metrics import MatchMetrics
from . import visualizations as viz
import config
logger = logging.getLogger(__name__)
class MatchReport:
    """Generate a complete post-match analysis report.

    The report is a directory holding a text summary, one shot map per
    team, a team comparison chart, an xG timeline and a player stats CSV.
    """

    def __init__(self, match_id: int):
        self.match_id = match_id
        # Populated by generate().
        self.loader = None
        self.metrics = None
        self.output_dir = None

    def generate(self, output_dir: Path = None) -> Path:
        """
        Generate complete match report.

        Parameters
        ----------
        output_dir : Path, optional
            Output directory (a plain string path is accepted too).
            Defaults to ``config.OUTPUT_DIR / "match_<match_id>"``.

        Returns
        -------
        Path
            Path to report directory
        """
        # Setup output
        if output_dir:
            # Coerce so string paths work with mkdir and the "/" operator.
            self.output_dir = Path(output_dir)
        else:
            self.output_dir = config.OUTPUT_DIR / f"match_{self.match_id}"
        self.output_dir.mkdir(parents=True, exist_ok=True)
        logger.info("Generating report for match %s", self.match_id)

        # Load data
        self.loader = MatchDataLoader(self.match_id)
        self.loader.load()

        # Calculate metrics
        teams = self.loader.match_info['teams']
        self.metrics = MatchMetrics(self.loader.events, teams)

        # Generate components
        self._generate_summary()
        self._generate_shot_maps()
        self._generate_comparison_charts()
        self._generate_xg_timeline()
        self._generate_player_stats()

        logger.info("Report saved to %s", self.output_dir)
        return self.output_dir

    def _generate_summary(self):
        """Write ``summary.txt`` with the headline numbers for each team."""
        team_stats = self.metrics.calculate_team_stats()
        rule = "=" * 50 + "\n"
        lines = [rule, "MATCH SUMMARY\n", rule + "\n"]
        for _, row in team_stats.iterrows():
            lines.extend([
                f"{row['team']}\n",
                f" Goals: {row['goals']}\n",
                f" Shots: {row['shots']} ({row['shots_on_target']} on target)\n",
                f" xG: {row['xG']}\n",
                f" Pass Accuracy: {row['pass_accuracy']}%\n",
                f" Possession: {row['possession']}%\n\n",
            ])
        lines.append(
            f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n"
        )
        summary_path = self.output_dir / "summary.txt"
        # Explicit UTF-8: team names may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to handle them.
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)

    def _generate_shot_maps(self):
        """Save one ``shot_map_<team>.png`` per team."""
        shots = self.loader.get_shots()
        teams = self.loader.match_info['teams']
        for team in teams:
            fig, ax = viz.create_shot_map(shots, team, f"{team} Shot Map")
            fig.savefig(
                self.output_dir / f"shot_map_{team.lower().replace(' ', '_')}.png",
                dpi=config.FIGURE_DPI,
                # Match the pitch colour so the border is not white.
                facecolor=config.PITCH_COLOR,
                bbox_inches='tight'
            )
            # Release the figure; reports are generated in a loop.
            plt.close(fig)

    def _generate_comparison_charts(self):
        """Save the team comparison chart as ``team_comparison.png``."""
        team_stats = self.metrics.calculate_team_stats()
        fig, ax = viz.create_comparison_bar_chart(team_stats)
        fig.savefig(
            self.output_dir / "team_comparison.png",
            dpi=config.FIGURE_DPI,
            bbox_inches='tight'
        )
        plt.close(fig)

    def _generate_xg_timeline(self):
        """Save the cumulative xG chart as ``xg_timeline.png``."""
        teams = self.loader.match_info['teams']
        fig, ax = viz.create_xg_timeline(self.loader.events, teams)
        fig.savefig(
            self.output_dir / "xg_timeline.png",
            dpi=config.FIGURE_DPI,
            bbox_inches='tight'
        )
        plt.close(fig)

    def _generate_player_stats(self):
        """Write per-player statistics to ``player_stats.csv``."""
        player_stats = self.metrics.calculate_player_stats()
        player_stats.to_csv(
            self.output_dir / "player_stats.csv",
            index=False
        )
Step 6: Main Entry Point
# main.py
"""Main entry point for match analysis."""
import logging
from src.report_generator import MatchReport
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


def main(match_id: int = 7298):
    """
    Generate a match analysis report and print where it was written.

    Parameters
    ----------
    match_id : int, optional
        StatsBomb match ID to analyse. The default is the demo match.
    """
    # World Cup 2018 Final: France vs Croatia
    # NOTE(review): confirm the default ID really is this fixture, e.g. via
    # sb.matches(competition_id=43, season_id=3).
    report = MatchReport(match_id)
    output_path = report.generate()
    print("\nReport generated successfully!")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()
Running the Dashboard
# Navigate to project directory
cd match_analysis
# Run the analysis
python main.py
Output:
2024-01-15 14:32:01 - src.report_generator - INFO - Generating report for match 7298
2024-01-15 14:32:01 - src.data_loader - INFO - Loading data for match 7298
2024-01-15 14:32:05 - src.data_loader - INFO - Loaded 3524 events
2024-01-15 14:32:08 - src.report_generator - INFO - Report saved to outputs/reports/match_7298
Report generated successfully!
Output: outputs/reports/match_7298
Generated Report Contents
The report directory contains:
match_7298/
├── summary.txt # Text summary of match
├── shot_map_france.png # France shot map
├── shot_map_croatia.png # Croatia shot map
├── team_comparison.png # Side-by-side statistics
├── xg_timeline.png # Cumulative xG chart
└── player_stats.csv # Detailed player statistics
Key Learnings
- Modular Design: Separating data loading, metrics, visualization, and reporting makes code maintainable and testable.
- Configuration Management: Centralizing settings in config.py makes adjustments easy.
- Logging: Proper logging helps debug issues in production.
- Efficient Data Processing: Using pandas vectorized operations enables fast metric calculation.
- Visualization Consistency: Using shared configurations ensures uniform styling.
Extension Ideas
- Add pass network visualization
- Include defensive metrics (pressures, tackles)
- Generate HTML report with embedded images
- Add comparison to team's season averages
- Create automated email distribution
Summary
This case study demonstrated how to build a production-quality match analysis dashboard using the Python tools from Chapter 4. The modular architecture allows easy extension and maintenance, while efficient pandas operations ensure reports generate quickly.