Case Study 1: Identifying Elite Chance Creators
Overview
This case study uses Expected Assists (xA) and related metrics to identify and evaluate elite chance creators in the 2018 World Cup. We'll build a comprehensive creativity assessment framework and create a scouting shortlist based on our analysis.
Learning Objectives:
- Calculate and interpret xA at the tournament level
- Build creativity profiles for individual players
- Compare chance creation methods across players
- Create actionable scouting recommendations
The Scenario
You're an analyst for a club looking to sign a creative midfielder or attacking player. The sporting director has asked you to use the 2018 World Cup as a scouting showcase, identifying players who demonstrate elite chance creation abilities. Your task:
- Quantify creativity using xA and related metrics
- Build detailed profiles of the top creators
- Identify different "types" of creative players
- Provide a prioritized shortlist with justification
Part 1: Data Collection and Preparation
1.1 Loading Tournament Data
import pandas as pd
import numpy as np
from statsbombpy import sb
import matplotlib.pyplot as plt
from collections import defaultdict
def load_world_cup_creativity_data():
    """
    Fetch every 2018 World Cup match and stack its events into one table.

    Returns:
        tuple: ``(events_df, match_minutes)``.

        * ``events_df`` - all per-match event frames concatenated with a
          fresh integer index; each row carries its ``match_id``.
        * ``match_minutes`` - nested ``defaultdict`` keyed
          ``player -> team -> minutes``. A flat 90 minutes is credited for
          every match in which the player has at least one event for that
          team, so substitutions are not accounted for.
    """
    # Get all World Cup 2018 matches
    fixtures = sb.matches(competition_id=43, season_id=3)
    print(f"Total matches: {len(fixtures)}")

    per_match_events = []
    match_minutes = defaultdict(lambda: defaultdict(int))

    for _, fixture in fixtures.iterrows():
        match_id = fixture['match_id']
        events = sb.events(match_id=match_id)
        events['match_id'] = match_id  # add match context to every event
        per_match_events.append(events)

        # Simplified minutes: anyone with an event is credited a full match.
        for side in ('home_team', 'away_team'):
            team = fixture[side]
            on_team = events['team'] == team
            appeared = events[on_team]['player'].dropna().unique()
            for player in appeared:
                match_minutes[player][team] += 90

    events_df = pd.concat(per_match_events, ignore_index=True)
    print(f"Total events: {len(events_df)}")
    return events_df, match_minutes
# Load the tournament once; the sections below reuse events_df.
events_df, player_minutes = load_world_cup_creativity_data()
1.2 Calculating Base xA
def calculate_tournament_xa(events_df):
    """
    Calculate xA for all players in the tournament.

    Each shot that references a key pass (``shot_key_pass_id``) credits the
    passer with that shot's xG. Shots whose key pass id is missing, or does
    not match any pass event in ``events_df``, are skipped.

    Args:
        events_df: Event table. Reads ``id``, ``type``, ``player``, ``team``,
            ``match_id`` and ``shot_outcome``; ``shot_key_pass_id``,
            ``shot_statsbomb_xg`` and ``shot_type`` are read when present.

    Returns:
        tuple: ``(player_xa, xa_df)``.

        * ``player_xa`` - one row per (player, team) with columns
          ``player, team, total_xa, key_passes, xa_per_kp, assists``.
        * ``xa_df`` - one row per key pass with columns ``passer,
          passer_team, shooter, match_id, shot_xg, goal, pass_type,
          shot_type``.

        Both frames are empty (but carry these columns) when no shot has a
        resolvable key pass.
    """
    xa_columns = ['passer', 'passer_team', 'shooter', 'match_id',
                  'shot_xg', 'goal', 'pass_type', 'shot_type']
    summary_columns = ['player', 'team', 'total_xa', 'key_passes',
                       'xa_per_kp', 'assists']

    shots = events_df[events_df['type'] == 'Shot']
    passes = events_df[events_df['type'] == 'Pass']

    # Index passes by event id once, instead of scanning every pass for
    # every shot. Keeping only the first row per id mirrors taking the
    # first match of a boolean filter.
    passes_by_id = passes.drop_duplicates(subset='id').set_index('id', drop=False)

    xa_records = []
    for _, shot in shots.iterrows():
        # Get key pass information
        kp_id = shot.get('shot_key_pass_id')
        if pd.isna(kp_id) or kp_id not in passes_by_id.index:
            continue
        kp = passes_by_id.loc[kp_id]
        xa_records.append({
            'passer': kp['player'],
            'passer_team': kp['team'],
            'shooter': shot['player'],
            'match_id': shot['match_id'],
            'shot_xg': shot.get('shot_statsbomb_xg', 0),
            'goal': shot['shot_outcome'] == 'Goal',
            'pass_type': classify_pass(kp),
            'shot_type': shot.get('shot_type', 'Open Play')
        })

    if not xa_records:
        # Without this guard the groupby below raises KeyError('passer')
        # because a frame built from an empty list has no columns.
        return pd.DataFrame(columns=summary_columns), pd.DataFrame(columns=xa_columns)

    xa_df = pd.DataFrame(xa_records)

    # Aggregate by player
    player_xa = xa_df.groupby(['passer', 'passer_team']).agg({
        'shot_xg': ['sum', 'count', 'mean'],
        'goal': 'sum'
    })
    player_xa.columns = ['total_xa', 'key_passes', 'xa_per_kp', 'assists']
    player_xa = player_xa.reset_index()
    player_xa = player_xa.rename(columns={'passer': 'player', 'passer_team': 'team'})
    return player_xa, xa_df
def classify_pass(pass_row):
    """Classify a key pass by type.

    Args:
        pass_row: Mapping-like pass event (``dict`` or ``pandas.Series``)
            that may carry the boolean flags ``pass_through_ball``,
            ``pass_cross`` and ``pass_cut_back``.

    Returns:
        str: ``'through_ball'``, ``'cross'``, ``'cutback'`` or ``'regular'``.
        Flags are checked in that order and the first one set wins.
    """
    def flag_is_set(column):
        value = pass_row.get(column)
        # A missing flag arrives as NaN in a flattened event frame, and NaN
        # is truthy in Python, so missing values are rejected explicitly.
        return value is not None and bool(pd.notna(value)) and bool(value)

    flag_to_label = (
        ('pass_through_ball', 'through_ball'),
        ('pass_cross', 'cross'),
        ('pass_cut_back', 'cutback'),
    )
    for column, label in flag_to_label:
        if flag_is_set(column):
            return label
    return 'regular'
# player_xa: per-player totals; xa_details: one row per key pass.
player_xa, xa_details = calculate_tournament_xa(events_df)
Part 2: Identifying Top Creators
2.1 Tournament xA Leaders
def analyze_xa_leaders(player_xa, min_key_passes=3):
    """
    Print the xA leaderboards and return the qualifying players.

    Args:
        player_xa: Per-player summary with columns ``player``, ``team``,
            ``total_xa``, ``key_passes``, ``xa_per_kp`` and ``assists``.
        min_key_passes: Minimum key passes a player needs to be listed.

    Returns:
        DataFrame: Players meeting the cutoff, sorted by ``total_xa``
        descending (values unrounded).
    """
    banner = "=" * 60
    display_cols = ['player', 'team', 'total_xa', 'key_passes',
                    'xa_per_kp', 'assists']

    # Minimum-sample filter, then rank by volume of chance creation.
    meets_cutoff = player_xa['key_passes'] >= min_key_passes
    qualified = player_xa[meets_cutoff].copy()
    qualified = qualified.sort_values('total_xa', ascending=False)

    print(banner)
    print("2018 WORLD CUP - EXPECTED ASSISTS LEADERS")
    print(banner)
    print(f"\nTop 15 by Total xA (minimum {min_key_passes} key passes):")
    print("-" * 60)

    # Rounding is for display only; `qualified` keeps full precision.
    leaders = qualified.head(15)[display_cols].assign(
        total_xa=lambda frame: frame['total_xa'].round(2),
        xa_per_kp=lambda frame: frame['xa_per_kp'].round(3),
    )
    print(leaders.to_string(index=False))

    # xA efficiency leaders (high xA per key pass), needing 5+ key passes.
    print("\n" + banner)
    print("EFFICIENCY LEADERS (xA per Key Pass)")
    print(banner)
    high_volume = qualified[qualified['key_passes'] >= 5]
    efficiency = high_volume.sort_values('xa_per_kp', ascending=False).head(10)
    print(efficiency[display_cols].to_string(index=False))

    return qualified
# Prints both leaderboards; the result is sorted by total xA, highest first.
xa_leaders = analyze_xa_leaders(player_xa)
2.2 Assist vs. xA Comparison
def analyze_xa_conversion(player_xa, min_key_passes=3):
    """
    Analyze which players over/underperform their xA.

    Args:
        player_xa: Per-player summary with columns ``player``,
            ``total_xa``, ``key_passes`` and ``assists``.
        min_key_passes: Minimum key passes for a player to be included.
            Defaults to 3, the cutoff this analysis always used.

    Returns:
        DataFrame: Copy of the qualifying rows with two added columns:
        ``xa_diff`` (assists minus total xA) and ``conversion_rate``
        (assists divided by total xA; NaN when total xA is zero).
    """
    df = player_xa[player_xa['key_passes'] >= min_key_passes].copy()
    df['xa_diff'] = df['assists'] - df['total_xa']

    # The ratio is undefined for zero xA; masking the denominator yields
    # NaN there instead of +/-inf leaking into later sorting or averaging.
    nonzero_xa = df['total_xa'].where(df['total_xa'] != 0)
    df['conversion_rate'] = df['assists'] / nonzero_xa

    def report(heading, rows, empty_message):
        """Print up to five players from `rows`, or a placeholder line."""
        print(heading)
        if len(rows) == 0:
            print(empty_message)
            return
        for _, row in rows.head(5).iterrows():
            # `+` in the format spec prints the sign for both directions.
            print(f" {row['player']}: {row['assists']} assists from "
                  f"{row['total_xa']:.2f} xA ({row['xa_diff']:+.2f})")

    print("\n" + "=" * 60)
    print("xA CONVERSION ANALYSIS")
    print("=" * 60)

    # Only gaps larger than half a goal in either direction are reported.
    over = df[df['xa_diff'] > 0.5].sort_values('xa_diff', ascending=False)
    report("\nOver-performers (Assists > xA):", over,
           " No significant over-performers")

    under = df[df['xa_diff'] < -0.5].sort_values('xa_diff', ascending=True)
    report("\nUnder-performers (Assists < xA):", under,
           " No significant under-performers")

    return df
# Compare actual assists with expected assists for the same player summary.
conversion_analysis = analyze_xa_conversion(player_xa)
Part 3: Building Creativity Profiles
3.1 Comprehensive Player Profile
def build_creativity_profile(events_df, xa_details, player_name):
    """
    Build comprehensive creativity profile for a specific player.

    Prints a report and returns the headline numbers. A player with no
    pass events gets a report with "N/A" ratios instead of raising.

    Args:
        events_df: Event table. Reads ``type``, ``player``,
            ``pass_outcome``, ``location`` and ``pass_end_location``;
            ``pass_through_ball`` and ``pass_cross`` are read when present.
        xa_details: One row per key pass with columns ``passer``,
            ``shot_xg``, ``goal`` and ``pass_type``.
        player_name: Player to profile, matched exactly against
            ``events_df['player']`` and ``xa_details['passer']``.

    Returns:
        dict: ``player``, ``total_passes``, ``key_passes``, ``total_xa``,
        ``assists``, ``through_balls``, ``crosses``, ``progressive_passes``.
    """
    def share(part, whole):
        """Format part/whole as a percentage, or 'N/A' when whole is zero."""
        return f"{part/whole:.1%}" if whole else "N/A"

    def flagged(passes, column):
        """Rows whose boolean flag `column` is True (none if column absent)."""
        if column not in passes.columns:
            return passes.iloc[0:0]
        return passes[passes[column] == True]

    def x_coordinate(location):
        """First element of a list-valued location, else NaN."""
        return location[0] if isinstance(location, list) else float('nan')

    print(f"\n{'=' * 60}")
    print(f"CREATIVITY PROFILE: {player_name}")
    print("=" * 60)

    # Get player's passes
    player_passes = events_df[
        (events_df['type'] == 'Pass') &
        (events_df['player'] == player_name)
    ].copy()
    # Get player's key passes (xA contributions)
    player_key_passes = xa_details[xa_details['passer'] == player_name]

    # Basic stats. A pass counts as successful when pass_outcome is missing.
    completed = player_passes['pass_outcome'].isna()
    total_passes = len(player_passes)
    successful = completed.sum()
    key_passes = len(player_key_passes)
    total_xa = player_key_passes['shot_xg'].sum()
    assists = player_key_passes['goal'].sum()

    print(f"\nPassing Overview:")
    print(f" Total passes: {total_passes}")
    print(f" Successful: {successful} ({share(successful, total_passes)})")
    print(f" Key passes: {key_passes} ({share(key_passes, total_passes)} of passes)")
    print(f"\nChance Creation:")
    print(f" Total xA: {total_xa:.2f}")
    print(f" Assists: {assists}")
    print(f" xA per key pass: {total_xa/key_passes:.3f}" if key_passes > 0 else " No key passes")

    # Pass type breakdown
    if key_passes > 0:
        print(f"\nKey Pass Types:")
        type_breakdown = player_key_passes.groupby('pass_type').agg({
            'shot_xg': ['count', 'sum']
        })
        type_breakdown.columns = ['count', 'xa']
        # Read the columns directly: iterating rows would upcast the
        # integer count to float and print e.g. "3.0 passes".
        for pass_type, n_passes, xa_sum in zip(type_breakdown.index,
                                               type_breakdown['count'],
                                               type_breakdown['xa']):
            print(f" {pass_type}: {int(n_passes)} passes, {xa_sum:.2f} xA")

    # Through ball analysis
    through_balls = flagged(player_passes, 'pass_through_ball')
    print(f"\nThrough Balls:")
    print(f" Attempted: {len(through_balls)}")
    successful_tb = through_balls['pass_outcome'].isna().sum()
    print(f" Successful: {successful_tb} ({successful_tb/len(through_balls):.1%})" if len(through_balls) > 0 else " N/A")

    # Cross analysis
    crosses = flagged(player_passes, 'pass_cross')
    print(f"\nCrosses:")
    print(f" Attempted: {len(crosses)}")
    successful_cross = crosses['pass_outcome'].isna().sum()
    print(f" Successful: {successful_cross} ({successful_cross/len(crosses):.1%})" if len(crosses) > 0 else " N/A")

    # Progressive passes: completed passes gaining more than 10 units in x.
    # Rows without a usable start or end location yield NaN and drop out of
    # the comparison, which matches discarding them beforehand.
    start_x = player_passes['location'].map(x_coordinate).astype(float)
    end_x = player_passes['pass_end_location'].map(x_coordinate).astype(float)
    progression = end_x - start_x
    is_progressive = completed & (progression > 10)
    progressive = player_passes[is_progressive]
    print(f"\nProgressive Passes (>10m forward):")
    print(f" Count: {len(progressive)}")
    print(f" Avg progression: {progression[is_progressive].mean():.1f}m" if len(progressive) > 0 else " N/A")

    return {
        'player': player_name,
        'total_passes': total_passes,
        'key_passes': key_passes,
        'total_xa': total_xa,
        'assists': assists,
        'through_balls': len(through_balls),
        'crosses': len(crosses),
        'progressive_passes': len(progressive)
    }
# Build profiles for top creators
# xa_leaders is sorted by total xA, so head(5) takes the five highest.
top_creators = xa_leaders.head(5)['player'].tolist()
# Maps player name -> summary dict returned by build_creativity_profile.
profiles = {}
for player in top_creators:
    profiles[player] = build_creativity_profile(events_df, xa_details, player)
3.2 Creativity Style Classification
def classify_creativity_styles(xa_details, player_xa, min_key_passes=3):
    """
    Label each qualifying player with a creativity style.

    The style comes from how a player's xA splits across key-pass types.

    Args:
        xa_details: One row per key pass with columns ``passer``,
            ``pass_type`` and ``shot_xg``.
        player_xa: Per-player summary with columns ``player``, ``team``,
            ``total_xa`` and ``key_passes``.
        min_key_passes: Minimum key passes for a player to be classified.

    Returns:
        DataFrame: Qualifying players with one xA column per pass type seen
        in ``xa_details``, a ``<type>_pct`` share column for each known
        type present, and a ``style`` label.
    """
    known_types = ('through_ball', 'cross', 'cutback', 'regular')
    # Checked in order; the first share strictly above its cutoff wins.
    style_rules = (
        ('cross_pct', 0.4, 'Wide Creator (Cross-Heavy)'),
        ('through_ball_pct', 0.3, 'Penetrative Playmaker'),
        ('cutback_pct', 0.2, 'Byline Specialist'),
    )

    def label_for(row):
        for share_col, cutoff, label in style_rules:
            if row.get(share_col, 0) > cutoff:
                return label
        return 'Central Creator'

    meets_cutoff = player_xa['key_passes'] >= min_key_passes
    qualified = player_xa[meets_cutoff].copy()

    # One row per passer, one column of summed xA per pass type.
    xa_by_type = (
        xa_details.groupby(['passer', 'pass_type'])['shot_xg']
        .sum()
        .unstack(fill_value=0)
        .reset_index()
        .rename(columns={'passer': 'player'})
    )

    # Players missing from the breakdown get zeros rather than NaN.
    style_df = qualified.merge(xa_by_type, on='player', how='left').fillna(0)

    # Convert each type's xA into a share of the player's total xA.
    for pass_type in known_types:
        if pass_type not in style_df.columns:
            continue
        style_df[f'{pass_type}_pct'] = style_df[pass_type] / style_df['total_xa']

    style_df['style'] = style_df.apply(label_for, axis=1)

    print("\n" + "=" * 60)
    print("CREATIVITY STYLE CLASSIFICATION")
    print("=" * 60)
    for style in style_df['style'].unique():
        print(f"\n{style}:")
        in_style = style_df[style_df['style'] == style]
        top_five = in_style.sort_values('total_xa', ascending=False).head(5)
        for _, row in top_five.iterrows():
            print(f" {row['player']} ({row['team']}): {row['total_xa']:.2f} xA")

    return style_df
# Uses the default cutoff of 3 key passes.
creativity_styles = classify_creativity_styles(xa_details, player_xa)
Part 4: Scouting Shortlist
4.1 Building Composite Score
def create_scouting_shortlist(player_xa, xa_details, events_df, min_key_passes=4):
    """
    Rank qualifying players by a weighted composite and print the top ten.

    Args:
        player_xa: Per-player summary with columns ``player``, ``team``,
            ``total_xa``, ``key_passes`` and ``assists``.
        xa_details: One row per key pass with columns ``passer``,
            ``pass_type`` and ``shot_xg``.
        events_df: Accepted for interface compatibility; not read here.
        min_key_passes: Minimum key passes for a player to be considered.

    Returns:
        DataFrame: The ten highest-scoring players, with added columns
        ``xa_efficiency``, ``through_balls``, ``tb_xa`` and
        ``composite_score``.
    """
    # Filter qualified players
    candidates = player_xa.loc[player_xa['key_passes'] >= min_key_passes].copy()

    # xA efficiency: average chance quality per key pass.
    candidates['xa_efficiency'] = candidates['total_xa'] / candidates['key_passes']

    # Count and xA of key passes classified as through balls, per passer.
    through_ball_rows = xa_details[xa_details['pass_type'] == 'through_ball']
    tb_data = through_ball_rows.groupby('passer').agg({
        'shot_xg': ['count', 'sum']
    })
    tb_data.columns = ['through_balls', 'tb_xa']
    tb_data = tb_data.reset_index().rename(columns={'passer': 'player'})

    # Players without through-ball key passes get zeros after the left join.
    candidates = candidates.merge(tb_data, on='player', how='left').fillna(0)

    # Composite score (weighted): heavy weight on total xA and efficiency.
    volume_term = candidates['total_xa'] * 3          # Volume of creativity
    quality_term = candidates['xa_efficiency'] * 5    # Quality of chances
    through_ball_term = candidates['through_balls'] * 0.5  # High-value pass type
    output_term = candidates['assists'] * 0.3         # Proven output
    candidates['composite_score'] = (
        volume_term + quality_term + through_ball_term + output_term
    )

    # Rank
    shortlist = candidates.sort_values('composite_score', ascending=False).head(10)

    print("\n" + "=" * 60)
    print("SCOUTING SHORTLIST - TOP 10 CREATIVE PLAYERS")
    print("=" * 60)
    position = 0
    for _, player in shortlist.iterrows():
        position += 1
        print(f"\n{position}. {player['player']} ({player['team']})")
        print(f" Total xA: {player['total_xa']:.2f}")
        print(f" Key Passes: {player['key_passes']}")
        print(f" xA/Key Pass: {player['xa_efficiency']:.3f}")
        print(f" Assists: {player['assists']}")
        print(f" Through Balls: {player['through_balls']:.0f}")
        print(f" Composite Score: {player['composite_score']:.2f}")

    return shortlist
# Uses the default cutoff of 4 key passes.
shortlist = create_scouting_shortlist(player_xa, xa_details, events_df)
4.2 Final Recommendations
def generate_recommendations(shortlist, profiles):
    """
    Turn the top three shortlist rows into printed scouting verdicts.

    Args:
        shortlist: Ranked players with columns ``player``, ``team``,
            ``total_xa``, ``key_passes``, ``xa_efficiency``,
            ``through_balls`` and ``assists``; only the first three rows
            are used.
        profiles: Accepted for interface compatibility; not read here.

    Returns:
        list[dict]: One dict per player with keys ``rank``, ``player``,
        ``team``, ``total_xa``, ``strengths``, ``concerns`` and
        ``recommendation``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("SCOUTING RECOMMENDATIONS")
    print(banner)

    recommendations = []
    # Top 3 detailed recommendations
    for rank, (_, player) in enumerate(shortlist.head(3).iterrows(), start=1):
        name, team = player['player'], player['team']
        total_xa = player['total_xa']
        efficiency = player['xa_efficiency']

        # Each check pairs a condition with the note it contributes.
        strength_checks = (
            (efficiency > 0.15, "High-quality chance creation"),
            (player['through_balls'] >= 2, "Excellent at through balls"),
            (player['assists'] >= total_xa * 0.8, "Chances lead to goals"),
        )
        concern_checks = (
            (player['key_passes'] < 6, "Limited sample size in tournament"),
            (efficiency < 0.10, "Volume over quality in creation"),
        )

        # Overall recommendation: volume and quality together rank highest.
        if total_xa > 1.5 and efficiency > 0.12:
            verdict = "PRIORITY TARGET"
        elif total_xa > 1.0:
            verdict = "STRONG INTEREST"
        else:
            verdict = "MONITOR"

        rec = {
            'rank': rank,
            'player': name,
            'team': team,
            'total_xa': total_xa,
            'strengths': [note for holds, note in strength_checks if holds],
            'concerns': [note for holds, note in concern_checks if holds],
            'recommendation': verdict,
        }
        recommendations.append(rec)

        # Print recommendation
        print(f"\n--- {rank}. {name} ({team}) ---")
        print(f"Verdict: {rec['recommendation']}")
        print(f"\nStrengths:")
        for strength in rec['strengths']:
            print(f" + {strength}")
        if rec['concerns']:
            print(f"\nConcerns:")
            for concern in rec['concerns']:
                print(f" - {concern}")

    return recommendations
# Prints a verdict for each of the top three shortlist entries.
recommendations = generate_recommendations(shortlist, profiles)
Conclusions
Key Findings
- Elite creators identified: The analysis identified several players demonstrating exceptional chance creation abilities at the World Cup.
- Different creativity styles: Players varied significantly in how they created chances - some relied on crosses, others on through balls, others on cutbacks.
- xA efficiency matters: Total xA alone doesn't tell the full story; the quality of chances per key pass distinguishes truly elite creators.
- Sample size caution: Even at a World Cup, sample sizes are small. These findings should be combined with league data for definitive conclusions.
Recommendations for the Club
- Priority targets should have both high total xA and high xA efficiency, demonstrating they create quality chances consistently.
- Consider playing style fit: A cross-heavy creator might not suit a team that doesn't play with target strikers.
- Validate with league data: Tournament performance should be confirmed against domestic league statistics.
- Age and development: Younger players showing elite creativity may be better long-term investments.
Code Files
Complete implementation available in:
- code/case-study-code.py - Full analysis pipeline
- code/example-01-xa-calculation.py - xA calculation methods
- code/example-02-creativity-profiles.py - Profile building