Your First Soccer Analysis
Beginner
10 min read
1 view
Nov 27, 2025
Hands-On Match Analysis Tutorial
Let's analyze a real match from the 2018 FIFA World Cup using StatsBomb's free data. This tutorial covers loading data, performing basic analysis, and creating professional visualizations.
What You'll Learn
- Loading and exploring match data
- Analyzing passing networks and shot quality
- Creating professional visualizations
- Comparing player performance
- Calculating key metrics (xG, pass completion, etc.)
Part 1: Match Overview Analysis
Python: Loading Match Data
from statsbombpy import sb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mplsoccer import Pitch, VerticalPitch
import seaborn as sns
# Plot defaults applied to every figure produced below
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# Match id used for the World Cup 2018 Final (France vs Croatia)
match_id = 8658

# Pull the full event stream for that match
print("Loading match data...")
events = sb.events(match_id=match_id)
print(f"Loaded {len(events)} events")

# Pull the lineups for the same match
lineups = sb.lineups(match_id=match_id)

# Headline facts: the two sides and the number of distinct event types
teams = events['team'].unique()
print(f"\nTeams: {teams[0]} vs {teams[1]}")
print(f"Event types: {events['type'].nunique()}")

# Frequency table of event types (most common first); show the first ten
event_counts = events['type'].value_counts()
print("\nEvent Type Distribution:")
print(event_counts.iloc[:10])
R: Loading Match Data
library(StatsBombR)
library(tidyverse)
library(ggsoccer)
# Load World Cup 2018 Final: France vs Croatia
match_id <- 8658

# The free-data helpers read `match_id` from a data frame of matches, so wrap
# the id once and reuse the same row for both the events and the lineups.
match_row <- data.frame(match_id = match_id)

# Get match events
cat("Loading match data...\n")
# NOTE(review): passed positionally because get.matchFree() names its
# parameter `Match`; a `MatchesDF =` argument is rejected as unused.
match_data <- get.matchFree(match_row)
# allclean() adds the flattened columns used later (e.g. location.x/location.y)
events <- allclean(match_data)
cat(sprintf("Loaded %d events\n", nrow(events)))

# Get lineups
# NOTE(review): get.lineups() is the credentialed-API variant (username and
# password required); get.lineupsFree() is the open-data equivalent.
lineups <- get.lineupsFree(match_row)

# Basic match info
teams <- unique(events$team.name)
cat(sprintf("\nTeams: %s vs %s\n", teams[1], teams[2]))

# Event type distribution, most frequent first
event_counts <- events %>%
  count(type.name, sort = TRUE)
cat("\nTop Event Types:\n")
print(head(event_counts, 10))
Part 2: Shot Analysis and xG
Python: Analyzing Shots
# Filter shots
shots = events[events['type'] == 'Shot'].copy()
print(f"\nTotal shots: {len(shots)}")
print(f"\nShots by team:")
print(shots.groupby('team')['shot_outcome'].value_counts())

# Calculate xG by team
xg_by_team = shots.groupby('team').agg({
    'shot_statsbomb_xg': 'sum',
    'id': 'count'
}).round(2)
xg_by_team.columns = ['Total xG', 'Total Shots']

# Count actual goals. A team that took shots but never scored is missing from
# the grouped result, so align to the xG index and fill with 0 rather than
# leaving NaN in the summary.
goals_by_team = shots[shots['shot_outcome'] == 'Goal'].groupby('team').size()
xg_by_team['Goals'] = goals_by_team.reindex(xg_by_team.index, fill_value=0)
print("\nExpected Goals (xG) Summary:")
print(xg_by_team)

# Performance vs expectation (positive = scored more than the chances implied)
xg_by_team['Difference'] = xg_by_team['Goals'] - xg_by_team['Total xG']
print("\nGoals vs xG:")
print(xg_by_team[['Goals', 'Total xG', 'Difference']])

# Create shot map: one pitch per team, side by side
pitch = Pitch(pitch_type='statsbomb', pitch_color='#22312b',
              line_color='white', linewidth=2)
fig, axes = pitch.draw(nrows=1, ncols=2, figsize=(16, 8))

for ax, team in zip(axes, teams):
    team_shots = shots[shots['team'] == team]

    # Plot shots
    for _, shot in team_shots.iterrows():
        location = shot['location']
        # `location` is an [x, y] list. pd.notna() on a list returns an array,
        # whose truth value is ambiguous inside `if`, so test the type instead.
        if not isinstance(location, (list, tuple)) or len(location) < 2:
            continue
        x, y = location[:2]

        # Size by xG
        size = shot['shot_statsbomb_xg'] * 500

        # Color by outcome; goals are drawn on top of the other shots
        is_goal = shot['shot_outcome'] == 'Goal'
        color = '#00ff00' if is_goal else '#ff4444'
        zorder = 2 if is_goal else 1

        ax.scatter(x, y, s=size, c=color, edgecolors='white',
                   linewidth=2, alpha=0.8, zorder=zorder)

    ax.set_title(f'{team}\nShots: {len(team_shots)} | xG: {team_shots["shot_statsbomb_xg"].sum():.2f}',
                 fontsize=14, fontweight='bold', color='white')

fig.suptitle('2018 World Cup Final - Shot Map\nSize = xG Value | Green = Goal | Red = No Goal',
             fontsize=16, fontweight='bold', color='white', y=0.98)
plt.tight_layout()
plt.savefig('shot_map.png', dpi=300, bbox_inches='tight', facecolor='#22312b')
print("\nShot map saved as 'shot_map.png'")
R: Shot Analysis and Visualization
# Filter shots
shots <- events %>%
  filter(type.name == "Shot")
cat(sprintf("\nTotal shots: %d\n", nrow(shots)))

# xG by team: shot count, summed xG, goals, and goals minus xG
xg_summary <- shots %>%
  group_by(team.name) %>%
  summarise(
    total_shots = n(),
    total_xG = sum(shot.statsbomb_xg, na.rm = TRUE),
    goals = sum(shot.outcome.name == "Goal", na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    difference = goals - total_xG
  )
cat("\nExpected Goals (xG) Summary:\n")
print(xg_summary)

# One colour per shot outcome. With a named palette, any outcome that is not
# listed is drawn in the NA colour, so list every outcome rather than only four.
# NOTE(review): outcome spellings follow the StatsBomb event spec - confirm
# against unique(shots$shot.outcome.name).
outcome_colours <- c(
  "Goal" = "#00ff00",
  "Saved" = "#ffff00",
  "Saved to Post" = "#ffd966",
  "Saved Off Target" = "#ffb347",
  "Off T" = "#ff4444",
  "Wayward" = "#cc3333",
  "Post" = "#ffffff",
  "Blocked" = "#ff8800"
)

# Create shot map
shot_map <- ggplot(shots, aes(x = location.x, y = location.y)) +
  annotate_pitch(dimensions = pitch_statsbomb,
                 fill = "#22312b", colour = "white") +
  geom_point(aes(size = shot.statsbomb_xg,
                 color = shot.outcome.name),
             alpha = 0.8) +
  scale_size_continuous(range = c(2, 10), name = "xG Value") +
  scale_color_manual(values = outcome_colours, na.value = "grey70",
                     name = "Outcome") +
  # StatsBomb measures y from the top touchline; flip the axis so the flanks
  # are not mirrored.
  scale_y_reverse() +
  facet_wrap(~team.name) +
  theme_pitch() +
  theme(
    plot.background = element_rect(fill = "#22312b"),
    strip.background = element_rect(fill = "#1a1a1a"),
    strip.text = element_text(color = "white", face = "bold", size = 12),
    legend.background = element_rect(fill = "#1a1a1a"),
    legend.text = element_text(color = "white"),
    legend.title = element_text(color = "white", face = "bold"),
    plot.title = element_text(color = "white", face = "bold",
                              size = 16, hjust = 0.5)
  ) +
  coord_fixed(ratio = 1) +
  labs(title = "2018 World Cup Final - Shot Map")

ggsave("shot_map.png", shot_map, width = 14, height = 8, dpi = 300)
cat("\nShot map saved as 'shot_map.png'\n")
Part 3: Passing Network Analysis
Python: Building Passing Networks
# Filter passes (first half only for clarity)
passes = events[
    (events['type'] == 'Pass') &
    (events['period'] == 1)
].copy()

# Choose one team for analysis.
# NOTE(review): `teams` follows the order of the event data; the original
# tutorial expects index 0 to be France - confirm from the printed team names.
team_name = teams[0]
team_passes = passes[passes['team'] == team_name].copy()
print(f"\nAnalyzing {team_name} passing in first half")
print(f"Total passes: {len(team_passes)}")
# A pass with no recorded outcome is treated as completed
print(f"Completed: {team_passes['pass_outcome'].isna().sum()}")
print(f"Incomplete: {team_passes['pass_outcome'].notna().sum()}")

# Calculate pass completion by player
player_passes = team_passes.groupby('player').agg({
    'id': 'count',
    'pass_outcome': lambda x: (x.isna().sum() / len(x) * 100)
}).round(1)
player_passes.columns = ['Total Passes', 'Completion %']
player_passes = player_passes[player_passes['Total Passes'] >= 10]
player_passes = player_passes.sort_values('Total Passes', ascending=False)
print(f"\nTop Passers (10+ passes):")
print(player_passes.head(10))


def _mean_location(locations):
    """Return [mean_x, mean_y] over the [x, y] entries in `locations`.

    Entries that are not lists/tuples (e.g. NaN) are skipped. The type check
    replaces pd.notna(loc), which returns an array for a list and cannot be
    used as an `if` condition. A plain list is returned because the groupby
    aggregation needs one object per group.
    """
    points = [loc[:2] for loc in locations if isinstance(loc, (list, tuple))]
    if not points:
        return [np.nan, np.nan]
    return list(np.mean(points, axis=0))


# Calculate average pass locations for network
player_positions = team_passes.groupby('player').agg({
    'location': _mean_location,
    'id': 'count'
})
player_positions.columns = ['avg_location', 'pass_count']
player_positions = player_positions[player_positions['pass_count'] >= 10]

# Create passing network visualization
pitch = Pitch(pitch_type='statsbomb', pitch_color='#22312b',
              line_color='white', linewidth=2)
fig, ax = pitch.draw(figsize=(14, 10))

# Plot player nodes at their average passing position
for player, row in player_positions.iterrows():
    x, y = row['avg_location']
    size = row['pass_count'] * 3
    ax.scatter(x, y, s=size, c='#4CAF50', edgecolors='white',
               linewidth=2, alpha=0.9, zorder=3)
    # Label the node with the last word of the player's name
    ax.annotate(player.split()[-1], (x, y),
                fontsize=8, fontweight='bold', color='white',
                ha='center', va='center', zorder=4)

ax.set_title(f'{team_name} - Passing Network (First Half)\n' +
             'Node size = number of passes',
             fontsize=16, fontweight='bold', color='white', pad=20)
plt.tight_layout()
plt.savefig('passing_network.png', dpi=300, bbox_inches='tight',
            facecolor='#22312b')
print("\nPassing network saved as 'passing_network.png'")
R: Passing Network Visualization
# Filter passes (first half)
passes <- events %>%
  filter(type.name == "Pass", period == 1)

# Choose one team.
# NOTE(review): `teams` follows the order of the event data; the original
# tutorial expects element 1 to be France - confirm from the printed names.
team_name <- teams[1]
team_passes <- passes %>%
  filter(team.name == team_name)
cat(sprintf("\nAnalyzing %s passing in first half\n", team_name))
cat(sprintf("Total passes: %d\n", nrow(team_passes)))

# Pass completion by player; a pass with no recorded outcome counts as completed
player_passes <- team_passes %>%
  group_by(player.name) %>%
  summarise(
    total_passes = n(),
    completed = sum(is.na(pass.outcome.name)),
    completion_pct = (completed / total_passes) * 100,
    .groups = "drop"
  ) %>%
  filter(total_passes >= 10) %>%
  arrange(desc(total_passes))
cat("\nTop Passers (10+ passes):\n")
print(head(player_passes, 10))

# Calculate average positions (players with at least 10 passes)
player_positions <- team_passes %>%
  group_by(player.name) %>%
  summarise(
    avg_x = mean(location.x, na.rm = TRUE),
    avg_y = mean(location.y, na.rm = TRUE),
    pass_count = n(),
    .groups = "drop"
  ) %>%
  filter(pass_count >= 10)

# Create passing network
network_plot <- ggplot() +
  annotate_pitch(dimensions = pitch_statsbomb,
                 fill = "#22312b", colour = "white") +
  geom_point(data = player_positions,
             aes(x = avg_x, y = avg_y, size = pass_count),
             color = "#4CAF50", alpha = 0.9) +
  # Label each node with the last word of the player's name
  geom_text(data = player_positions,
            aes(x = avg_x, y = avg_y,
                label = word(player.name, -1)),
            color = "white", fontface = "bold", size = 3) +
  scale_size_continuous(range = c(5, 15), name = "Pass Count") +
  # StatsBomb measures y from the top touchline; flip the axis so the flanks
  # are not mirrored.
  scale_y_reverse() +
  theme_pitch() +
  theme(
    plot.background = element_rect(fill = "#22312b"),
    legend.background = element_rect(fill = "#1a1a1a"),
    legend.text = element_text(color = "white"),
    legend.title = element_text(color = "white", face = "bold"),
    plot.title = element_text(color = "white", face = "bold",
                              size = 16, hjust = 0.5),
    # Without this the subtitle keeps its default dark colour and is
    # unreadable on the dark background.
    plot.subtitle = element_text(color = "white", hjust = 0.5)
  ) +
  coord_fixed(ratio = 1) +
  labs(title = sprintf("%s - Passing Network (First Half)", team_name),
       subtitle = "Node size = number of passes")

ggsave("passing_network.png", network_plot,
       width = 12, height = 10, dpi = 300)
cat("\nPassing network saved as 'passing_network.png'\n")
Part 4: Player Performance Comparison
Python: Comparing Player Statistics
# Aggregate player statistics.
# groupby() visits each player's rows once instead of re-filtering the whole
# event table per player. It skips rows with a missing player, and sort=False
# keeps players in order of first appearance. The per-player frames use their
# own names so the top-level `passes` and `shots` frames are not overwritten.
player_stats = []
for player, player_events in events.groupby('player', sort=False):
    team = player_events['team'].iloc[0]

    # Calculate statistics
    passes_made = player_events[player_events['type'] == 'Pass']
    shots_taken = player_events[player_events['type'] == 'Shot']

    stats = {
        'Player': player,
        'Team': team,
        'Passes': len(passes_made),
        # A pass with no recorded outcome is treated as completed
        'Pass_Completion': (passes_made['pass_outcome'].isna().sum() / len(passes_made) * 100) if len(passes_made) > 0 else 0,
        'Shots': len(shots_taken),
        'xG': shots_taken['shot_statsbomb_xg'].sum() if len(shots_taken) > 0 else 0,
        'Goals': (shots_taken['shot_outcome'] == 'Goal').sum(),
        # Counts every event attributed to the player, not only ball touches
        'Touches': len(player_events)
    }
    player_stats.append(stats)

# Create DataFrame
player_df = pd.DataFrame(player_stats)
player_df = player_df[player_df['Passes'] >= 10]  # Filter for meaningful data
player_df = player_df.round(2)

# Sort by touches
player_df = player_df.sort_values('Touches', ascending=False)
print("\nTop 10 Players by Touches:")
print(player_df.head(10))

# One colour per team, reused in all four panels so a team looks the same
# in the scatter plots and the bar charts.
first_team_color, other_team_color = '#FF6B6B', '#4ECDC4'


def _team_color(team):
    """Return the plot colour for `team` (first listed team vs everyone else)."""
    return first_team_color if team == teams[0] else other_team_color


# Create comparison visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Player Performance Comparison - World Cup 2018 Final',
             fontsize=16, fontweight='bold')

# 1. Passes vs Completion Rate
ax1 = axes[0, 0]
for team in teams:
    team_data = player_df[player_df['Team'] == team]
    ax1.scatter(team_data['Passes'], team_data['Pass_Completion'],
                s=100, alpha=0.6, label=team, color=_team_color(team))
ax1.set_xlabel('Total Passes', fontsize=12)
ax1.set_ylabel('Pass Completion %', fontsize=12)
ax1.set_title('Passing Performance', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(alpha=0.3)

# 2. Shots vs xG
ax2 = axes[0, 1]
for team in teams:
    team_data = player_df[player_df['Team'] == team]
    ax2.scatter(team_data['Shots'], team_data['xG'],
                s=100, alpha=0.6, label=team, color=_team_color(team))
ax2.set_xlabel('Total Shots', fontsize=12)
ax2.set_ylabel('Expected Goals (xG)', fontsize=12)
ax2.set_title('Shooting Performance', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(alpha=0.3)

# 3. Top Passers
ax3 = axes[1, 0]
top_passers = player_df.nlargest(10, 'Passes')
colors = [_team_color(team) for team in top_passers['Team']]
ax3.barh(range(len(top_passers)), top_passers['Passes'], color=colors, alpha=0.7)
ax3.set_yticks(range(len(top_passers)))
ax3.set_yticklabels([name.split()[-1] for name in top_passers['Player']])
ax3.set_xlabel('Total Passes', fontsize=12)
ax3.set_title('Top 10 Passers', fontsize=14, fontweight='bold')
ax3.invert_yaxis()  # largest bar at the top
ax3.grid(axis='x', alpha=0.3)

# 4. Touches Distribution
ax4 = axes[1, 1]
top_touches = player_df.nlargest(10, 'Touches')
colors = [_team_color(team) for team in top_touches['Team']]
ax4.barh(range(len(top_touches)), top_touches['Touches'], color=colors, alpha=0.7)
ax4.set_yticks(range(len(top_touches)))
ax4.set_yticklabels([name.split()[-1] for name in top_touches['Player']])
ax4.set_xlabel('Total Touches', fontsize=12)
ax4.set_title('Most Involved Players', fontsize=14, fontweight='bold')
ax4.invert_yaxis()  # largest bar at the top
ax4.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('player_comparison.png', dpi=300, bbox_inches='tight')
print("\nPlayer comparison saved as 'player_comparison.png'")
R: Player Statistics Dashboard
library(patchwork)
# Per-player totals over every event, kept for named players with 10+ passes
player_stats <- events %>%
  group_by(player.name, team.name) %>%
  summarise(
    total_touches = n(),
    passes = sum(type.name == "Pass", na.rm = TRUE),
    shots = sum(type.name == "Shot", na.rm = TRUE),
    xG = sum(shot.statsbomb_xg, na.rm = TRUE),
    goals = sum(shot.outcome.name == "Goal", na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  filter(!is.na(player.name), passes >= 10)

# Share of each player's passes with no recorded outcome (treated as completed)
pass_events <- events %>%
  filter(type.name == "Pass")
pass_completion <- pass_events %>%
  group_by(player.name) %>%
  summarise(
    pass_completion = (sum(is.na(pass.outcome.name)) / n()) * 100,
    .groups = 'drop'
  )

# Attach the completion rate to the per-player totals
player_stats <- player_stats %>%
  left_join(pass_completion, by = "player.name")

cat("\nTop 10 Players by Touches:\n")
busiest_players <- player_stats %>%
  arrange(desc(total_touches)) %>%
  head(10)
print(busiest_players)

# Pieces shared by all four panels: the two team colours and the theme,
# added to each plot in the same order as before (theme_minimal, then legend)
team_palette <- c("#FF6B6B", "#4ECDC4")
panel_theme <- list(
  theme_minimal(),
  theme(legend.position = "bottom")
)

# Panel 1: pass volume against completion rate
p1 <- ggplot(player_stats,
             aes(x = passes, y = pass_completion, color = team.name)) +
  geom_point(size = 3, alpha = 0.7) +
  scale_color_manual(values = team_palette) +
  labs(title = "Passing Performance",
       x = "Total Passes", y = "Completion %",
       color = "Team") +
  panel_theme

# Panel 2: shot volume against xG, only for players who took a shot
shooters <- player_stats %>%
  filter(shots > 0)
p2 <- ggplot(shooters,
             aes(x = shots, y = xG, color = team.name)) +
  geom_point(size = 3, alpha = 0.7) +
  scale_color_manual(values = team_palette) +
  labs(title = "Shooting Performance",
       x = "Total Shots", y = "xG",
       color = "Team") +
  panel_theme

# Panel 3: the ten players with the most passes, labelled by last name
top_passers <- player_stats %>%
  arrange(desc(passes)) %>%
  head(10)
p3 <- ggplot(top_passers,
             aes(x = reorder(word(player.name, -1), passes),
                 y = passes, fill = team.name)) +
  geom_col(alpha = 0.7) +
  scale_fill_manual(values = team_palette) +
  coord_flip() +
  labs(title = "Top 10 Passers", x = "", y = "Total Passes", fill = "Team") +
  panel_theme

# Panel 4: the ten players with the most recorded events
p4 <- ggplot(busiest_players,
             aes(x = reorder(word(player.name, -1), total_touches),
                 y = total_touches, fill = team.name)) +
  geom_col(alpha = 0.7) +
  scale_fill_manual(values = team_palette) +
  coord_flip() +
  labs(title = "Most Involved Players",
       x = "", y = "Total Touches", fill = "Team") +
  panel_theme

# Arrange the panels in a 2 x 2 grid under one shared title
top_row <- p1 | p2
bottom_row <- p3 | p4
combined <- top_row / bottom_row +
  plot_annotation(
    title = "Player Performance - World Cup 2018 Final",
    theme = theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
  )

ggsave("player_comparison.png", combined, width = 16, height = 12, dpi = 300)
cat("\nPlayer comparison saved as 'player_comparison.png'\n")
Part 5: Match Summary Report
Python: Generate Summary Statistics
# Create comprehensive match summary
def generate_match_report(events, teams):
    """Build a per-team summary table from a match's event data.

    Parameters
    ----------
    events : pandas.DataFrame
        Event rows with at least the columns 'team', 'type', 'pass_outcome',
        'shot_outcome' and 'shot_statsbomb_xg'.
    teams : iterable of str
        Team names to report on; one output row per team, in this order.

    Returns
    -------
    pandas.DataFrame
        Indexed by team, with columns 'Passes', 'Pass Completion %', 'Shots',
        'Shots on Target', 'Goals', 'xG' and 'xG per Shot'. Count columns are
        integers; rates and xG are floats. A team with no passes or no shots
        gets 0 for the corresponding rate instead of a division error.
    """
    # NOTE(review): 'Saved to Post' (keeper save that then hits the post) is
    # counted as on target; spelling follows the StatsBomb event spec - confirm.
    on_target_outcomes = ['Goal', 'Saved', 'Saved to Post']

    rows = []
    labels = []
    for team in teams:
        team_events = events[events['team'] == team]
        passes = team_events[team_events['type'] == 'Pass']
        shots = team_events[team_events['type'] == 'Shot']

        n_passes = len(passes)
        n_shots = len(shots)
        # A pass with no recorded outcome is treated as completed
        completed = passes['pass_outcome'].isna().sum()
        xg = float(shots['shot_statsbomb_xg'].sum())

        labels.append(team)
        rows.append({
            'Passes': n_passes,
            'Pass Completion %': round(completed / n_passes * 100, 1) if n_passes > 0 else 0.0,
            'Shots': n_shots,
            'Shots on Target': int(shots['shot_outcome'].isin(on_target_outcomes).sum()),
            'Goals': int((shots['shot_outcome'] == 'Goal').sum()),
            'xG': round(xg, 2),
            'xG per Shot': round(xg / n_shots, 3) if n_shots > 0 else 0.0
        })

    # Building from one dict per team keeps a dtype per column; transposing a
    # team-per-column frame would turn every count into a float.
    return pd.DataFrame(rows, index=labels)
# Build the per-team summary table
match_report = generate_match_report(events, teams)

# Print it framed by two 60-character rules
rule = "="*60
print("\n" + rule)
print("MATCH SUMMARY REPORT")
print("2018 FIFA World Cup Final")
print(rule)
print(match_report)
print(rule)

# Persist the same table to disk
match_report.to_csv('match_report.csv')
print("\nDetailed report saved as 'match_report.csv'")
Congratulations!
You've completed your first soccer analysis! You've learned to:
- Load and explore match-level event data
- Calculate and visualize expected goals (xG)
- Build passing networks
- Compare player performance
- Generate comprehensive match reports
Next Steps
Now that you understand the basics, explore:
- Multi-match analysis across a full season
- Advanced metrics (PPDA, progressive passes, xA)
- Player recruitment and scouting analysis
- Predictive modeling for match outcomes
- Custom xG models using machine learning
Practice Exercise
Try analyzing different matches from the dataset:
- Pick a different World Cup 2018 match
- Modify the code to analyze both halves separately
- Create a heat map showing player positioning
- Calculate defensive actions (tackles, interceptions, blocks)
- Compare team pressing intensity using PPDA
Discussion
Have questions or feedback? Join our community discussion on
Discord or
GitHub Discussions.
Table of Contents
Related Topics
Quick Actions