Your First Soccer Analysis

Beginner 10 min read 1 view Nov 27, 2025

Hands-On Match Analysis Tutorial

Let's analyze a real match, the 2018 FIFA World Cup Final between France and Croatia, using StatsBomb's free open data. This tutorial covers loading the data, performing basic analysis, and creating professional visualizations, with each step shown in both Python and R.

What You'll Learn

  • Loading and exploring match data
  • Analyzing passing networks and shot quality
  • Creating professional visualizations
  • Comparing player performance
  • Calculating key metrics (xG, pass completion, etc.)

Part 1: Match Overview Analysis

Python: Loading Match Data

from statsbombpy import sb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mplsoccer import Pitch, VerticalPitch
import seaborn as sns

# Global plotting defaults used by the figures below
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# StatsBomb match id of the World Cup 2018 Final (France vs Croatia)
match_id = 8658

# Download every event recorded for the match
print("Loading match data...")
events = sb.events(match_id=match_id)
n_events = len(events)
print(f"Loaded {n_events} events")

# Download the line-ups for the same match
lineups = sb.lineups(match_id=match_id)

# The two team names, in the order they first appear in the event data
teams = events['team'].unique()
print(f"\nTeams: {teams[0]} vs {teams[1]}")

# How many distinct event types occur, and which are the most frequent
event_types = events['type']
print(f"Event types: {event_types.nunique()}")

print("\nEvent Type Distribution:")
event_counts = event_types.value_counts()
print(event_counts.head(10))

R: Loading Match Data

library(StatsBombR)
library(tidyverse)
library(ggsoccer)

# Load World Cup 2018 Final: France vs Croatia.
# In StatsBomb's open data the 2018 FIFA World Cup is competition 43, season 3.
match_id <- 8658

# get.matchFree() and get.lineupsFree() take a row of the matches table
# returned by FreeMatches(), not a bare id, so look the match up first.
# (get.lineups() is the credentialed-API variant and needs a username and
# password; the open data uses get.lineupsFree().)
competitions <- FreeCompetitions() %>%
  filter(competition_id == 43, season_id == 3)
matches <- FreeMatches(competitions)
match_info <- matches[matches$match_id == match_id, ]
stopifnot(nrow(match_info) == 1)

# Get match events
cat("Loading match data...\n")
match_data <- get.matchFree(match_info)
# allclean() applies StatsBombR's standard cleaning helpers
# (e.g. it adds the location.x / location.y columns used below)
events <- allclean(match_data)

cat(sprintf("Loaded %d events\n", nrow(events)))

# Get lineups
lineups <- get.lineupsFree(match_info)

# Basic match info: team names in the order they first appear in the events
teams <- unique(events$team.name)
cat(sprintf("\nTeams: %s vs %s\n", teams[1], teams[2]))

# Event type distribution, most frequent first
event_counts <- events %>%
  count(type.name, sort = TRUE)

cat("\nTop Event Types:\n")
print(head(event_counts, 10))

Part 2: Shot Analysis and xG

Python: Analyzing Shots

# Filter shots
shots = events[events['type'] == 'Shot'].copy()

print(f"\nTotal shots: {len(shots)}")
print(f"\nShots by team:")
print(shots.groupby('team')['shot_outcome'].value_counts())

# Calculate xG by team
xg_by_team = shots.groupby('team').agg({
    'shot_statsbomb_xg': 'sum',
    'id': 'count'
}).round(2)
xg_by_team.columns = ['Total xG', 'Total Shots']

# Count actual goals scored from shots (own goals are not shot events, so
# they are not part of this xG comparison). A team that scored from none of
# its shots has no entry in goals_by_team, so fill it with 0 instead of NaN.
goals_by_team = shots[shots['shot_outcome'] == 'Goal'].groupby('team').size()
xg_by_team['Goals'] = goals_by_team.reindex(xg_by_team.index, fill_value=0)

print("\nExpected Goals (xG) Summary:")
print(xg_by_team)

# Performance vs expectation: positive = scored more than the chances were worth
xg_by_team['Difference'] = xg_by_team['Goals'] - xg_by_team['Total xG']
print("\nGoals vs xG:")
print(xg_by_team[['Goals', 'Total xG', 'Difference']])

# Create shot map: one pitch per team, side by side
pitch = Pitch(pitch_type='statsbomb', pitch_color='#22312b',
              line_color='white', linewidth=2)
fig, axes = pitch.draw(nrows=1, ncols=2, figsize=(16, 8))

for ax, team in zip(axes, teams):
    team_shots = shots[shots['team'] == team]

    # Plot shots
    for _, shot in team_shots.iterrows():
        location = shot['location']

        # 'location' holds an [x, y] sequence. pd.notna() on a sequence
        # returns an array, which cannot be used in an `if`, so test the
        # type instead and skip anything that is not a coordinate pair.
        if not isinstance(location, (list, tuple, np.ndarray)) or len(location) < 2:
            continue
        x, y = location[0], location[1]

        # Size by xG
        size = shot['shot_statsbomb_xg'] * 500

        # Color by outcome; goals are drawn on top of the other shots
        is_goal = shot['shot_outcome'] == 'Goal'
        color = '#00ff00' if is_goal else '#ff4444'
        zorder = 2 if is_goal else 1

        ax.scatter(x, y, s=size, c=color, edgecolors='white',
                   linewidth=2, alpha=0.8, zorder=zorder)

    ax.set_title(f'{team}\nShots: {len(team_shots)} | xG: {team_shots["shot_statsbomb_xg"].sum():.2f}',
                 fontsize=14, fontweight='bold', color='white')

fig.suptitle('2018 World Cup Final - Shot Map\nSize = xG Value | Green = Goal | Red = No Goal',
             fontsize=16, fontweight='bold', color='white', y=0.98)
plt.tight_layout()
plt.savefig('shot_map.png', dpi=300, bbox_inches='tight', facecolor='#22312b')
print("\nShot map saved as 'shot_map.png'")

R: Shot Analysis and Visualization

# Filter shots
shots <- events %>%
  filter(type.name == "Shot")

cat(sprintf("\nTotal shots: %d\n", nrow(shots)))

# xG by team. `goals` counts shots whose outcome is "Goal"; own goals are
# separate (non-shot) events and are therefore not included.
xg_summary <- shots %>%
  group_by(team.name) %>%
  summarise(
    total_shots = n(),
    total_xG = sum(shot.statsbomb_xg, na.rm = TRUE),
    goals = sum(shot.outcome.name == "Goal", na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    # positive = scored more than the chances were worth
    difference = goals - total_xG
  )

cat("\nExpected Goals (xG) Summary:\n")
print(xg_summary)

# One colour per StatsBomb shot outcome. Every outcome is listed so that
# none of them falls back to the grey NA colour.
shot_outcome_colours <- c(
  "Goal" = "#00ff00",
  "Saved" = "#ffff00",
  "Saved to Post" = "#ffd54f",
  "Saved Off Target" = "#ffd54f",
  "Off T" = "#ff4444",
  "Wayward" = "#c62828",
  "Post" = "#ffffff",
  "Blocked" = "#ff8800"
)

# Create shot map: one facet per team, point size = xG, colour = outcome
shot_map <- ggplot(shots, aes(x = location.x, y = location.y)) +
  annotate_pitch(
    dimensions = pitch_statsbomb,
    fill = "#22312b", colour = "white"
  ) +
  geom_point(
    aes(size = shot.statsbomb_xg, color = shot.outcome.name),
    alpha = 0.8
  ) +
  scale_size_continuous(range = c(2, 10), name = "xG Value") +
  scale_color_manual(values = shot_outcome_colours, name = "Outcome") +
  # StatsBomb's y axis starts at the top of the pitch; reverse it so the
  # plot is not mirrored top-to-bottom
  scale_y_reverse() +
  facet_wrap(~team.name) +
  theme_pitch() +
  theme(
    plot.background = element_rect(fill = "#22312b"),
    strip.background = element_rect(fill = "#1a1a1a"),
    strip.text = element_text(color = "white", face = "bold", size = 12),
    legend.background = element_rect(fill = "#1a1a1a"),
    legend.text = element_text(color = "white"),
    legend.title = element_text(color = "white", face = "bold"),
    plot.title = element_text(
      color = "white", face = "bold", size = 16, hjust = 0.5
    )
  ) +
  coord_fixed(ratio = 1) +
  labs(title = "2018 World Cup Final - Shot Map")

ggsave("shot_map.png", shot_map, width = 14, height = 8, dpi = 300)
cat("\nShot map saved as 'shot_map.png'\n")

Part 3: Passing Network Analysis

Python: Building Passing Networks

# Filter passes (first half only for clarity)
passes = events[
    (events['type'] == 'Pass') &
    (events['period'] == 1)
].copy()

# Choose one team for analysis.
# NOTE: teams[0] is whichever team appears first in the event data (expected
# to be France for this match) -- check the name printed below.
team_name = teams[0]
team_passes = passes[passes['team'] == team_name].copy()

# A pass with no 'pass_outcome' value is treated as completed; any value
# present marks the pass as incomplete.
print(f"\nAnalyzing {team_name} passing in first half")
print(f"Total passes: {len(team_passes)}")
print(f"Completed: {team_passes['pass_outcome'].isna().sum()}")
print(f"Incomplete: {team_passes['pass_outcome'].notna().sum()}")

# Calculate pass completion by player
player_passes = team_passes.groupby('player').agg({
    'id': 'count',
    'pass_outcome': lambda x: (x.isna().sum() / len(x) * 100)
}).round(1)
player_passes.columns = ['Total Passes', 'Completion %']
player_passes = player_passes[player_passes['Total Passes'] >= 10]
player_passes = player_passes.sort_values('Total Passes', ascending=False)

print(f"\nTop Passers (10+ passes):")
print(player_passes.head(10))


def _coordinate(location, index):
    """Return location[index], or NaN when the event has no usable location.

    'location' is an [x, y] sequence. pd.notna() cannot be used to test it
    inside an `if`, because on a sequence it returns an array.
    """
    if isinstance(location, (list, tuple, np.ndarray)) and len(location) > index:
        return location[index]
    return np.nan


# Calculate average pass locations for network
pass_xy = pd.DataFrame({
    'player': team_passes['player'],
    'x': team_passes['location'].apply(_coordinate, args=(0,)),
    'y': team_passes['location'].apply(_coordinate, args=(1,)),
    'id': team_passes['id'],
})
player_positions = pass_xy.groupby('player').agg(
    avg_x=('x', 'mean'),
    avg_y=('y', 'mean'),
    pass_count=('id', 'count'),
)
# Keep the combined [x, y] column alongside the separate coordinates
player_positions['avg_location'] = player_positions[['avg_x', 'avg_y']].values.tolist()
player_positions = player_positions[player_positions['pass_count'] >= 10]

# Create passing network visualization
pitch = Pitch(pitch_type='statsbomb', pitch_color='#22312b',
              line_color='white', linewidth=2)
fig, ax = pitch.draw(figsize=(14, 10))

# Plot player nodes at their average passing position
for player, row in player_positions.iterrows():
    x, y = row['avg_x'], row['avg_y']
    size = row['pass_count'] * 3

    ax.scatter(x, y, s=size, c='#4CAF50', edgecolors='white',
               linewidth=2, alpha=0.9, zorder=3)

    # Label the node with the last word of the player's name
    ax.annotate(player.split()[-1], (x, y),
                fontsize=8, fontweight='bold', color='white',
                ha='center', va='center', zorder=4)

ax.set_title(f'{team_name} - Passing Network (First Half)\n' +
             'Node size = number of passes',
             fontsize=16, fontweight='bold', color='white', pad=20)

plt.tight_layout()
plt.savefig('passing_network.png', dpi=300, bbox_inches='tight',
            facecolor='#22312b')
print("\nPassing network saved as 'passing_network.png'")

R: Passing Network Visualization

# Filter passes (first half)
passes <- events %>%
  filter(type.name == "Pass", period == 1)

# Choose one team.
# NOTE: teams[1] is whichever team appears first in the event data (expected
# to be France for this match) -- check the name printed below.
team_name <- teams[1]
team_passes <- passes %>%
  filter(team.name == team_name)

cat(sprintf("\nAnalyzing %s passing in first half\n", team_name))
cat(sprintf("Total passes: %d\n", nrow(team_passes)))

# Pass completion by player. A pass with no pass.outcome.name is treated as
# completed; any value present marks it as incomplete.
player_passes <- team_passes %>%
  group_by(player.name) %>%
  summarise(
    total_passes = n(),
    completed = sum(is.na(pass.outcome.name)),
    completion_pct = (completed / total_passes) * 100,
    .groups = "drop"
  ) %>%
  filter(total_passes >= 10) %>%
  arrange(desc(total_passes))

cat("\nTop Passers (10+ passes):\n")
print(head(player_passes, 10))

# Calculate average positions (players with at least 10 passes)
player_positions <- team_passes %>%
  group_by(player.name) %>%
  summarise(
    avg_x = mean(location.x, na.rm = TRUE),
    avg_y = mean(location.y, na.rm = TRUE),
    pass_count = n(),
    .groups = "drop"
  ) %>%
  filter(pass_count >= 10)

# Create passing network: one node per player at the average pass origin
network_plot <- ggplot() +
  annotate_pitch(
    dimensions = pitch_statsbomb,
    fill = "#22312b", colour = "white"
  ) +
  geom_point(
    data = player_positions,
    aes(x = avg_x, y = avg_y, size = pass_count),
    color = "#4CAF50", alpha = 0.9
  ) +
  geom_text(
    data = player_positions,
    # label each node with the last word of the player's name
    aes(x = avg_x, y = avg_y, label = word(player.name, -1)),
    color = "white", fontface = "bold", size = 3
  ) +
  scale_size_continuous(range = c(5, 15), name = "Pass Count") +
  # StatsBomb's y axis starts at the top of the pitch; reverse it so the
  # plot is not mirrored top-to-bottom
  scale_y_reverse() +
  theme_pitch() +
  theme(
    plot.background = element_rect(fill = "#22312b"),
    legend.background = element_rect(fill = "#1a1a1a"),
    legend.text = element_text(color = "white"),
    legend.title = element_text(color = "white", face = "bold"),
    plot.title = element_text(
      color = "white", face = "bold", size = 16, hjust = 0.5
    ),
    # the default subtitle colour is unreadable on the dark background
    plot.subtitle = element_text(color = "white", hjust = 0.5)
  ) +
  coord_fixed(ratio = 1) +
  labs(
    title = sprintf("%s - Passing Network (First Half)", team_name),
    subtitle = "Node size = number of passes"
  )

ggsave("passing_network.png", network_plot,
  width = 12, height = 10, dpi = 300
)
cat("\nPassing network saved as 'passing_network.png'\n")

Part 4: Player Performance Comparison

Python: Comparing Player Statistics

# Aggregate player statistics.
# The events are grouped by player once instead of re-filtering the whole
# table for every player; groupby() skips events that have no player.
# The per-player subsets are named pass_events / shot_events so that the
# `passes` and `shots` DataFrames built in the earlier sections are not
# overwritten.
stat_columns = ['Player', 'Team', 'Passes', 'Pass_Completion',
                'Shots', 'xG', 'Goals', 'Touches']
player_stats = []

for player, player_events in events.groupby('player', sort=False):
    pass_events = player_events[player_events['type'] == 'Pass']
    shot_events = player_events[player_events['type'] == 'Shot']
    n_passes = len(pass_events)
    n_shots = len(shot_events)

    player_stats.append({
        'Player': player,
        'Team': player_events['team'].iloc[0],
        'Passes': n_passes,
        # a pass with no 'pass_outcome' value is treated as completed
        'Pass_Completion': (pass_events['pass_outcome'].isna().sum() / n_passes * 100) if n_passes > 0 else 0,
        'Shots': n_shots,
        'xG': shot_events['shot_statsbomb_xg'].sum() if n_shots > 0 else 0,
        'Goals': (shot_events['shot_outcome'] == 'Goal').sum(),
        # every event attributed to the player, not only ball touches
        'Touches': len(player_events)
    })

# Create DataFrame (explicit columns keep the filters below working even if
# no player rows were collected)
player_df = pd.DataFrame(player_stats, columns=stat_columns)
player_df = player_df[player_df['Passes'] >= 10]  # Filter for meaningful data
player_df = player_df.round(2)

# Sort by touches
player_df = player_df.sort_values('Touches', ascending=False)

print("\nTop 10 Players by Touches:")
print(player_df.head(10))


def _scatter_by_team(ax, x_col, y_col, xlabel, ylabel, title):
    """Draw one point per player on `ax`, one colour/legend entry per team."""
    for team in teams:
        team_data = player_df[player_df['Team'] == team]
        ax.scatter(team_data[x_col], team_data[y_col],
                   s=100, alpha=0.6, label=team)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)


def _ranking_bars(ax, top_players, column, xlabel, title):
    """Draw a horizontal bar chart of `column`, largest value at the top."""
    bar_colors = ['#FF6B6B' if team == teams[0] else '#4ECDC4'
                  for team in top_players['Team']]
    positions = range(len(top_players))
    ax.barh(positions, top_players[column], color=bar_colors, alpha=0.7)
    ax.set_yticks(positions)
    # label each bar with the last word of the player's name
    ax.set_yticklabels([name.split()[-1] for name in top_players['Player']])
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.invert_yaxis()
    ax.grid(axis='x', alpha=0.3)


# Create comparison visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Player Performance Comparison - World Cup 2018 Final',
             fontsize=16, fontweight='bold')

# 1. Passes vs Completion Rate
ax1 = axes[0, 0]
_scatter_by_team(ax1, 'Passes', 'Pass_Completion',
                 'Total Passes', 'Pass Completion %', 'Passing Performance')

# 2. Shots vs xG
ax2 = axes[0, 1]
_scatter_by_team(ax2, 'Shots', 'xG',
                 'Total Shots', 'Expected Goals (xG)', 'Shooting Performance')

# 3. Top Passers
ax3 = axes[1, 0]
top_passers = player_df.nlargest(10, 'Passes')
_ranking_bars(ax3, top_passers, 'Passes', 'Total Passes', 'Top 10 Passers')

# 4. Touches Distribution
ax4 = axes[1, 1]
top_touches = player_df.nlargest(10, 'Touches')
_ranking_bars(ax4, top_touches, 'Touches', 'Total Touches', 'Most Involved Players')

plt.tight_layout()
plt.savefig('player_comparison.png', dpi=300, bbox_inches='tight')
print("\nPlayer comparison saved as 'player_comparison.png'")

R: Player Statistics Dashboard

library(patchwork)

# Colours assigned to the two teams in every panel
team_palette <- c("#FF6B6B", "#4ECDC4")

# Share of each player's passes that have no recorded outcome
# (i.e. the passes counted as completed), in percent
pass_completion <- events %>%
  filter(type.name == "Pass") %>%
  group_by(player.name) %>%
  summarise(
    pass_completion = (sum(is.na(pass.outcome.name)) / n()) * 100,
    .groups = "drop"
  )

# Per-player totals for players with at least 10 passes, with the
# completion percentage attached
player_stats <- events %>%
  filter(!is.na(player.name)) %>%
  group_by(player.name, team.name) %>%
  summarise(
    total_touches = n(),
    passes = sum(type.name == "Pass", na.rm = TRUE),
    shots = sum(type.name == "Shot", na.rm = TRUE),
    xG = sum(shot.statsbomb_xg, na.rm = TRUE),
    goals = sum(shot.outcome.name == "Goal", na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(passes >= 10) %>%
  left_join(pass_completion, by = "player.name")

cat("\nTop 10 Players by Touches:\n")
most_involved <- player_stats %>%
  arrange(desc(total_touches)) %>%
  head(10)
print(most_involved)

# Scatter plot of two player metrics, coloured by team
team_scatter <- function(data, x_var, y_var, title, x_lab, y_lab) {
  ggplot(data, aes(x = {{ x_var }}, y = {{ y_var }}, color = team.name)) +
    geom_point(size = 3, alpha = 0.7) +
    scale_color_manual(values = team_palette) +
    labs(title = title, x = x_lab, y = y_lab, color = "Team") +
    theme_minimal() +
    theme(legend.position = "bottom")
}

# Horizontal bar chart of the ten players with the highest `metric`,
# labelled by the last word of the player's name
top_ten_bars <- function(data, metric, title, y_lab) {
  data %>%
    arrange(desc({{ metric }})) %>%
    head(10) %>%
    ggplot(aes(
      x = reorder(word(player.name, -1), {{ metric }}),
      y = {{ metric }},
      fill = team.name
    )) +
    geom_col(alpha = 0.7) +
    scale_fill_manual(values = team_palette) +
    coord_flip() +
    labs(title = title, x = "", y = y_lab, fill = "Team") +
    theme_minimal() +
    theme(legend.position = "bottom")
}

# Create visualizations
p1 <- team_scatter(
  player_stats, passes, pass_completion,
  "Passing Performance", "Total Passes", "Completion %"
)

p2 <- team_scatter(
  player_stats %>% filter(shots > 0), shots, xG,
  "Shooting Performance", "Total Shots", "xG"
)

p3 <- top_ten_bars(player_stats, passes, "Top 10 Passers", "Total Passes")

p4 <- top_ten_bars(
  player_stats, total_touches,
  "Most Involved Players", "Total Touches"
)

# Combine plots into a 2 x 2 grid with a shared title
panel_grid <- (p1 | p2) / (p3 | p4)
combined <- panel_grid +
  plot_annotation(
    title = "Player Performance - World Cup 2018 Final",
    theme = theme(
      plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
    )
  )

ggsave("player_comparison.png", combined, width = 16, height = 12, dpi = 300)
cat("\nPlayer comparison saved as 'player_comparison.png'\n")

Part 5: Match Summary Report

Python: Generate Summary Statistics

# Create comprehensive match summary
def generate_match_report(events, teams):
    """Generate per-team summary statistics for a match.

    Parameters
    ----------
    events : pandas.DataFrame
        Match events with at least the columns 'team', 'type',
        'pass_outcome', 'shot_outcome' and 'shot_statsbomb_xg'.
    teams : iterable of str
        Team names to report on; each becomes one row of the result.

    Returns
    -------
    pandas.DataFrame
        One row per team with the columns 'Passes', 'Pass Completion %',
        'Shots', 'Shots on Target', 'Goals', 'xG' and 'xG per Shot'.

    Notes
    -----
    'Goals' is the number of shots that ended in a goal plus the number of
    'Own Goal For' events credited to the team, so it matches the score
    line. Own goals are not shots and do not contribute to 'Shots' or 'xG'.
    """

    report = {}

    for team in teams:
        team_events = events[events['team'] == team]

        passes = team_events[team_events['type'] == 'Pass']
        shots = team_events[team_events['type'] == 'Shot']
        n_passes = len(passes)
        n_shots = len(shots)

        # Goals scored from the team's own shots ...
        shot_goals = (shots['shot_outcome'] == 'Goal').sum()
        # ... plus own goals by the opponent, which are recorded as a
        # separate event type credited to the benefiting team
        own_goals_for = (team_events['type'] == 'Own Goal For').sum()
        goals = shot_goals + own_goals_for

        # xG
        xg = shots['shot_statsbomb_xg'].sum()

        report[team] = {
            'Passes': n_passes,
            # a pass with no 'pass_outcome' value is treated as completed
            'Pass Completion %': round((passes['pass_outcome'].isna().sum() / n_passes * 100), 1) if n_passes > 0 else 0,
            'Shots': n_shots,
            'Shots on Target': (shots['shot_outcome'].isin(['Goal', 'Saved'])).sum(),
            'Goals': goals,
            'xG': round(xg, 2),
            'xG per Shot': round(xg / n_shots, 3) if n_shots > 0 else 0
        }

    # Teams are the dict keys (columns); transpose so each team is a row
    return pd.DataFrame(report).T

# Build the per-team summary table
match_report = generate_match_report(events, teams)

# Print it between 60-character divider lines
divider = "=" * 60
print("\n" + divider)
print("MATCH SUMMARY REPORT")
print("2018 FIFA World Cup Final")
print(divider)
print(match_report)
print(divider)

# Write the same table to disk
match_report.to_csv('match_report.csv')
print("\nDetailed report saved as 'match_report.csv'")

Congratulations!

You've completed your first soccer analysis! You've learned to:

  • Load and explore match-level event data
  • Calculate and visualize expected goals (xG)
  • Build passing networks
  • Compare player performance
  • Generate comprehensive match reports

Next Steps

Now that you understand the basics, explore:

  • Multi-match analysis across a full season
  • Advanced metrics (PPDA, i.e. passes allowed per defensive action; progressive passes; xA, i.e. expected assists)
  • Player recruitment and scouting analysis
  • Predictive modeling for match outcomes
  • Custom xG models using machine learning

Practice Exercise

Try analyzing different matches from the dataset:

  1. Pick a different World Cup 2018 match
  2. Modify the code to analyze both halves separately
  3. Create a heat map showing player positioning
  4. Calculate defensive actions (tackles, interceptions, blocks)
  5. Compare team pressing intensity using PPDA

Discussion

Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.