Case Study 1: Identifying Elite Chance Creators

Overview

This case study uses Expected Assists (xA) — the expected-goals (xG) value of the shot a pass sets up, credited to the passer — and related metrics to identify and evaluate elite chance creators in the 2018 World Cup. We'll build a comprehensive creativity assessment framework and create a scouting shortlist based on our analysis.

Learning Objectives:

  - Calculate and interpret xA at the tournament level
  - Build creativity profiles for individual players
  - Compare chance creation methods across players
  - Create actionable scouting recommendations


The Scenario

You're an analyst for a club looking to sign a creative midfielder or attacking player. The sporting director has asked you to use the 2018 World Cup as a scouting showcase, identifying players who demonstrate elite chance creation abilities. Your task:

  1. Quantify creativity using xA and related metrics
  2. Build detailed profiles of the top creators
  3. Identify different "types" of creative players
  4. Provide a prioritized shortlist with justification

Part 1: Data Collection and Preparation

1.1 Loading Tournament Data

import pandas as pd
import numpy as np
from statsbombpy import sb
import matplotlib.pyplot as plt
from collections import defaultdict

def load_world_cup_creativity_data():
    """
    Download every match's events and a rough minutes estimate.

    Matches are fetched with ``competition_id=43, season_id=3`` and the
    events of each match are pulled one by one and stacked into a single
    DataFrame.

    Returns:
        tuple: ``(events_df, match_minutes)`` where ``events_df`` holds all
        events with a ``match_id`` column, and ``match_minutes`` is a nested
        mapping ``player -> team -> minutes``. Minutes are a simplification:
        any player with at least one event in a match is credited with a
        full 90, regardless of substitutions or extra time.
    """
    # World Cup 2018 fixture list
    fixtures = sb.matches(competition_id=43, season_id=3)
    print(f"Total matches: {len(fixtures)}")

    per_match_frames = []
    minutes_by_player = defaultdict(lambda: defaultdict(int))

    for _, fixture in fixtures.iterrows():
        fixture_id = fixture['match_id']
        frame = sb.events(match_id=fixture_id)

        # Tag every event with the match it came from
        frame['match_id'] = fixture_id
        per_match_frames.append(frame)

        # Flat 90 minutes for anyone who shows up in the event log (simplified)
        for side in (fixture['home_team'], fixture['away_team']):
            appeared = frame.loc[frame['team'] == side, 'player'].dropna().unique()
            for name in appeared:
                minutes_by_player[name][side] += 90

    combined = pd.concat(per_match_frames, ignore_index=True)

    print(f"Total events: {len(combined)}")

    return combined, minutes_by_player

# Load the tournament once at module level; the sections below reuse events_df.
# player_minutes is only a flat 90-minutes-per-appearance estimate.
events_df, player_minutes = load_world_cup_creativity_data()

1.2 Calculating Base xA

def calculate_tournament_xa(events_df):
    """
    Calculate xA for all players in the tournament.

    Every shot that references a key pass (``shot_key_pass_id``) credits the
    shot's xG to the player who made that pass. Shots without a key pass, or
    whose key pass id cannot be found among the pass events, are skipped.

    Args:
        events_df: Event-level DataFrame containing at least the columns
            ``type``, ``id``, ``player``, ``team``, ``match_id`` and the
            shot columns read below.

    Returns:
        tuple: ``(player_xa, xa_df)``.
            ``player_xa`` has one row per (player, team) with columns
            ``player``, ``team``, ``total_xa``, ``key_passes``,
            ``xa_per_kp`` and ``assists``.
            ``xa_df`` has one row per key pass / shot pair.
            Both are empty (but carry their columns) when no shot in
            ``events_df`` has a resolvable key pass.
    """
    xa_columns = ['passer', 'passer_team', 'shooter', 'match_id',
                  'shot_xg', 'goal', 'pass_type', 'shot_type']
    player_columns = ['player', 'team', 'total_xa', 'key_passes',
                      'xa_per_kp', 'assists']

    shots = events_df[events_df['type'] == 'Shot']
    passes = events_df[events_df['type'] == 'Pass']

    # Index passes by event id once instead of re-scanning the whole pass
    # table for every shot. Keeping the first row per id mirrors the
    # "take the first match" behaviour of a filtered lookup.
    passes_by_id = passes.drop_duplicates(subset='id').set_index('id', drop=False)

    xa_records = []

    for _, shot in shots.iterrows():
        # Get key pass information
        kp_id = shot.get('shot_key_pass_id')

        if pd.isna(kp_id) or kp_id not in passes_by_id.index:
            continue

        kp = passes_by_id.loc[kp_id]

        xa_records.append({
            'passer': kp['player'],
            'passer_team': kp['team'],
            'shooter': shot['player'],
            'match_id': shot['match_id'],
            'shot_xg': shot.get('shot_statsbomb_xg', 0),
            'goal': shot['shot_outcome'] == 'Goal',
            'pass_type': classify_pass(kp),
            'shot_type': shot.get('shot_type', 'Open Play')
        })

    xa_df = pd.DataFrame(xa_records, columns=xa_columns)

    # With no records there is nothing to group; return empty frames with the
    # expected columns so downstream filters keep working instead of raising.
    if xa_df.empty:
        return pd.DataFrame(columns=player_columns), xa_df

    # Aggregate by player
    player_xa = xa_df.groupby(['passer', 'passer_team']).agg({
        'shot_xg': ['sum', 'count', 'mean'],
        'goal': 'sum'
    })
    player_xa.columns = ['total_xa', 'key_passes', 'xa_per_kp', 'assists']
    player_xa = player_xa.reset_index()
    player_xa = player_xa.rename(columns={'passer': 'player', 'passer_team': 'team'})

    return player_xa, xa_df

def classify_pass(pass_row):
    """
    Classify a key pass by type.

    Checks the ``pass_through_ball``, ``pass_cross`` and ``pass_cut_back``
    flags in that order and returns the first one that is set.

    Args:
        pass_row: Mapping-like pass event (dict or pandas Series) exposing
            ``.get``.

    Returns:
        str: ``'through_ball'``, ``'cross'``, ``'cutback'`` or ``'regular'``.
    """
    def is_set(flag):
        # A flag that is absent for a row shows up as NaN in a DataFrame, and
        # NaN is truthy in Python, so a plain truthiness test would label
        # every such pass with the first flag checked.
        value = pass_row.get(flag)
        return bool(pd.notna(value) and value)

    if is_set('pass_through_ball'):
        return 'through_ball'
    if is_set('pass_cross'):
        return 'cross'
    if is_set('pass_cut_back'):
        return 'cutback'
    return 'regular'

# player_xa: one row per player/team; xa_details: one row per key pass.
player_xa, xa_details = calculate_tournament_xa(events_df)

Part 2: Identifying Top Creators

2.1 Tournament xA Leaders

def analyze_xa_leaders(player_xa, min_key_passes=3):
    """
    Print the xA leaderboards and return the qualified players.

    Two tables are printed: the top 15 by total xA among players with at
    least ``min_key_passes`` key passes, and the top 10 by xA per key pass
    among those of them with at least 5 key passes.

    Args:
        player_xa: Per-player table with ``player``, ``team``, ``total_xa``,
            ``key_passes``, ``xa_per_kp`` and ``assists`` columns.
        min_key_passes: Minimum key passes needed to qualify.

    Returns:
        DataFrame of qualified players sorted by ``total_xa`` descending.
    """
    shown_columns = ['player', 'team', 'total_xa', 'key_passes',
                     'xa_per_kp', 'assists']

    # Minimum-sample filter, then rank by volume
    meets_minimum = player_xa['key_passes'] >= min_key_passes
    qualified = player_xa[meets_minimum].copy()
    qualified = qualified.sort_values('total_xa', ascending=False)

    print("=" * 60)
    print("2018 WORLD CUP - EXPECTED ASSISTS LEADERS")
    print("=" * 60)
    print(f"\nTop 15 by Total xA (minimum {min_key_passes} key passes):")
    print("-" * 60)

    # Rounded copy for display only; the returned frame keeps full precision
    volume_table = qualified.head(15)[shown_columns].copy()
    volume_table = volume_table.assign(
        total_xa=volume_table['total_xa'].round(2),
        xa_per_kp=volume_table['xa_per_kp'].round(3),
    )
    print(volume_table.to_string(index=False))

    # xA efficiency leaders (high xA per key pass)
    print("\n" + "=" * 60)
    print("EFFICIENCY LEADERS (xA per Key Pass)")
    print("=" * 60)

    enough_volume = qualified[qualified['key_passes'] >= 5]
    efficiency_table = enough_volume.sort_values('xa_per_kp', ascending=False).head(10)
    print(efficiency_table[shown_columns].to_string(index=False))

    return qualified

# Qualified players sorted by total xA (default minimum of 3 key passes).
xa_leaders = analyze_xa_leaders(player_xa)

2.2 Assist vs. xA Comparison

def analyze_xa_conversion(player_xa):
    """
    Analyze which players over/underperform their xA.

    Only players with at least 3 key passes are considered. A player is
    reported as an over- or under-performer when assists differ from total
    xA by more than 0.5 in either direction; up to five are printed per side.

    Args:
        player_xa: Per-player table with ``player``, ``key_passes``,
            ``total_xa`` and ``assists`` columns.

    Returns:
        Copy of the filtered table with two extra columns: ``xa_diff``
        (assists minus xA) and ``conversion_rate`` (assists divided by xA,
        NaN where total xA is zero).
    """
    df = player_xa[player_xa['key_passes'] >= 3].copy()

    df['xa_diff'] = df['assists'] - df['total_xa']
    # Mask zero xA so the ratio is NaN rather than inf, which would otherwise
    # dominate any later sort or average of conversion rates.
    df['conversion_rate'] = df['assists'] / df['total_xa'].where(df['total_xa'] > 0)

    print("\n" + "=" * 60)
    print("xA CONVERSION ANALYSIS")
    print("=" * 60)

    # Over-performers
    print("\nOver-performers (Assists > xA):")
    over = df[df['xa_diff'] > 0.5].sort_values('xa_diff', ascending=False)
    if len(over) > 0:
        for _, row in over.head(5).iterrows():
            print(f"  {row['player']}: {row['assists']} assists from "
                  f"{row['total_xa']:.2f} xA (+{row['xa_diff']:.2f})")
    else:
        print("  No significant over-performers")

    # Under-performers
    print("\nUnder-performers (Assists < xA):")
    under = df[df['xa_diff'] < -0.5].sort_values('xa_diff', ascending=True)
    if len(under) > 0:
        for _, row in under.head(5).iterrows():
            print(f"  {row['player']}: {row['assists']} assists from "
                  f"{row['total_xa']:.2f} xA ({row['xa_diff']:.2f})")
    else:
        # Mirror the over-performer section so an empty list is explicit
        print("  No significant under-performers")

    return df

# Adds xa_diff and conversion_rate columns for players with 3+ key passes.
conversion_analysis = analyze_xa_conversion(player_xa)

Part 3: Building Creativity Profiles

3.1 Comprehensive Player Profile

def _format_share(part, whole):
    """Return part/whole as a one-decimal percentage, or 'N/A' when whole is 0."""
    return f"{part / whole:.1%}" if whole else "N/A"


def build_creativity_profile(events_df, xa_details, player_name):
    """
    Build comprehensive creativity profile for a specific player.

    Prints a passing overview, chance-creation totals, a key-pass type
    breakdown, through-ball and cross success rates, and a progressive-pass
    summary. A player with no pass events (for example a misspelt name)
    yields an all-zero profile instead of raising.

    Args:
        events_df: Event-level DataFrame with ``type``, ``player``,
            ``pass_outcome``, ``pass_through_ball``, ``pass_cross``,
            ``location`` and ``pass_end_location`` columns.
        xa_details: One row per key pass with ``passer``, ``shot_xg``,
            ``goal`` and ``pass_type`` columns.
        player_name: Exact player name as it appears in both frames.

    Returns:
        dict with keys ``player``, ``total_passes``, ``key_passes``,
        ``total_xa``, ``assists``, ``through_balls``, ``crosses`` and
        ``progressive_passes``.
    """
    print(f"\n{'=' * 60}")
    print(f"CREATIVITY PROFILE: {player_name}")
    print("=" * 60)

    # Get player's passes
    player_passes = events_df[
        (events_df['type'] == 'Pass') &
        (events_df['player'] == player_name)
    ].copy()

    # Get player's key passes (xA contributions)
    player_key_passes = xa_details[xa_details['passer'] == player_name]

    # Basic stats. A pass counts as successful when pass_outcome is missing.
    total_passes = len(player_passes)
    successful = player_passes['pass_outcome'].isna().sum()
    key_passes = len(player_key_passes)
    total_xa = player_key_passes['shot_xg'].sum()
    assists = player_key_passes['goal'].sum()

    print("\nPassing Overview:")
    print(f"  Total passes: {total_passes}")
    # _format_share guards the zero-pass case that would otherwise divide by 0
    print(f"  Successful: {successful} ({_format_share(successful, total_passes)})")
    print(f"  Key passes: {key_passes} ({_format_share(key_passes, total_passes)} of passes)")

    print("\nChance Creation:")
    print(f"  Total xA: {total_xa:.2f}")
    print(f"  Assists: {assists}")
    if key_passes > 0:
        print(f"  xA per key pass: {total_xa/key_passes:.3f}")
    else:
        print("  No key passes")

    # Pass type breakdown
    if key_passes > 0:
        print("\nKey Pass Types:")
        type_breakdown = player_key_passes.groupby('pass_type').agg({
            'shot_xg': ['count', 'sum']
        })
        type_breakdown.columns = ['count', 'xa']
        for pass_type, row in type_breakdown.iterrows():
            # iterrows upcasts the count to float; show it as a whole number
            print(f"  {pass_type}: {int(row['count'])} passes, {row['xa']:.2f} xA")

    # Through ball analysis
    through_balls = player_passes[
        player_passes['pass_through_ball'] == True
    ]
    print("\nThrough Balls:")
    print(f"  Attempted: {len(through_balls)}")
    successful_tb = through_balls['pass_outcome'].isna().sum()
    if len(through_balls) > 0:
        print(f"  Successful: {successful_tb} ({successful_tb/len(through_balls):.1%})")
    else:
        print("  N/A")

    # Cross analysis
    crosses = player_passes[player_passes['pass_cross'] == True]
    print("\nCrosses:")
    print(f"  Attempted: {len(crosses)}")
    successful_cross = crosses['pass_outcome'].isna().sum()
    if len(crosses) > 0:
        print(f"  Successful: {successful_cross} ({successful_cross/len(crosses):.1%})")
    else:
        print("  N/A")

    # Progressive passes: completed passes gaining more than 10 units in x.
    # NOTE(review): the labels say "m", but the unit is whatever the data's
    # location coordinates use (presumably yards for StatsBomb) - confirm.
    player_passes['start_x'] = player_passes['location'].apply(
        lambda x: x[0] if isinstance(x, list) else None
    )
    player_passes['end_x'] = player_passes['pass_end_location'].apply(
        lambda x: x[0] if isinstance(x, list) else None
    )
    player_passes = player_passes.dropna(subset=['start_x', 'end_x'])
    # Force a numeric dtype so the comparison below also works on an empty frame
    player_passes['progression'] = (
        player_passes['end_x'] - player_passes['start_x']
    ).astype(float)

    progressive = player_passes[
        (player_passes['pass_outcome'].isna()) &
        (player_passes['progression'] > 10)
    ]
    print("\nProgressive Passes (>10m forward):")
    print(f"  Count: {len(progressive)}")
    if len(progressive) > 0:
        print(f"  Avg progression: {progressive['progression'].mean():.1f}m")
    else:
        print("  N/A")

    return {
        'player': player_name,
        'total_passes': total_passes,
        'key_passes': key_passes,
        'total_xa': total_xa,
        'assists': assists,
        'through_balls': len(through_balls),
        'crosses': len(crosses),
        'progressive_passes': len(progressive)
    }

# Build profiles for the five players with the highest total xA.
# profiles maps player name -> summary dict returned by build_creativity_profile.
top_creators = xa_leaders.head(5)['player'].tolist()
profiles = {}
for player in top_creators:
    profiles[player] = build_creativity_profile(events_df, xa_details, player)

3.2 Creativity Style Classification

def classify_creativity_styles(xa_details, player_xa, min_key_passes=3):
    """
    Label each qualified player with a creativity style.

    The share of a player's total xA coming from each key-pass type is
    computed, and the first matching rule wins: crosses above 40%, then
    through balls above 30%, then cutbacks above 20%; everyone else is a
    'Central Creator'. The top five players per style are printed.

    Args:
        xa_details: One row per key pass with ``passer``, ``pass_type`` and
            ``shot_xg`` columns.
        player_xa: Per-player table with ``player``, ``team``, ``total_xa``
            and ``key_passes`` columns.
        min_key_passes: Minimum key passes needed to be classified.

    Returns:
        DataFrame of qualified players with per-type xA, ``*_pct`` share
        columns and a ``style`` column.
    """
    eligible = player_xa[player_xa['key_passes'] >= min_key_passes].copy()

    # xA per (player, pass type), pivoted so each pass type is a column
    xa_by_type = (
        xa_details.groupby(['passer', 'pass_type'])
        .agg({'shot_xg': 'sum'})
        .unstack(fill_value=0)
        .droplevel(0, axis=1)
        .reset_index()
        .rename(columns={'passer': 'player'})
    )

    # Attach the breakdown; players without a given type get 0
    style_df = eligible.merge(xa_by_type, on='player', how='left').fillna(0)

    # Share of total xA per pass type (only for types that occur in the data)
    for pass_type in ('through_ball', 'cross', 'cutback', 'regular'):
        if pass_type not in style_df.columns:
            continue
        style_df[f'{pass_type}_pct'] = style_df[pass_type].div(style_df['total_xa'])

    # Ordered rules: (share column, threshold, label); first hit wins
    style_rules = (
        ('cross_pct', 0.4, 'Wide Creator (Cross-Heavy)'),
        ('through_ball_pct', 0.3, 'Penetrative Playmaker'),
        ('cutback_pct', 0.2, 'Byline Specialist'),
    )

    def pick_style(row):
        for share_column, threshold, label in style_rules:
            if row.get(share_column, 0) > threshold:
                return label
        return 'Central Creator'

    style_df['style'] = style_df.apply(pick_style, axis=1)

    print("\n" + "=" * 60)
    print("CREATIVITY STYLE CLASSIFICATION")
    print("=" * 60)

    for style_name in style_df['style'].unique():
        print(f"\n{style_name}:")
        members = style_df[style_df['style'] == style_name]
        best_five = members.sort_values('total_xa', ascending=False).head(5)
        for _, entry in best_five.iterrows():
            print(f"  {entry['player']} ({entry['team']}): {entry['total_xa']:.2f} xA")

    return style_df

# One style label per qualified player, based on where their xA comes from.
creativity_styles = classify_creativity_styles(xa_details, player_xa)

Part 4: Scouting Shortlist

4.1 Building Composite Score

def create_scouting_shortlist(player_xa, xa_details, events_df, min_key_passes=4):
    """
    Rank qualified players by a weighted composite score and print the top 10.

    The score is ``3 * total_xa + 5 * xa_efficiency + 0.5 * through_balls
    + 0.3 * assists``, where ``xa_efficiency`` is total xA per key pass and
    ``through_balls`` counts key passes classified as through balls.

    Args:
        player_xa: Per-player table with ``player``, ``team``, ``total_xa``,
            ``key_passes`` and ``assists`` columns.
        xa_details: One row per key pass with ``passer``, ``pass_type`` and
            ``shot_xg`` columns.
        events_df: Accepted for interface compatibility; not read here.
        min_key_passes: Minimum key passes needed to be considered.

    Returns:
        DataFrame with the ten highest-scoring players, best first.
    """
    # Weights: volume and chance quality dominate; through balls and actual
    # assists act as smaller bonuses.
    score_weights = {
        'total_xa': 3,          # Volume of creativity
        'xa_efficiency': 5,     # Quality of chances
        'through_balls': 0.5,   # High-value pass type
        'assists': 0.3,         # Proven output
    }

    candidates = player_xa[player_xa['key_passes'] >= min_key_passes].copy()
    candidates['xa_efficiency'] = candidates['total_xa'] / candidates['key_passes']

    # Through-ball key passes per player: how many, and their combined xA
    through_ball_rows = xa_details[xa_details['pass_type'] == 'through_ball']
    tb_summary = through_ball_rows.groupby('passer').agg({'shot_xg': ['count', 'sum']})
    tb_summary.columns = ['through_balls', 'tb_xa']
    tb_summary = tb_summary.reset_index().rename(columns={'passer': 'player'})

    # Players with no through balls come out of the left join as NaN -> 0
    candidates = candidates.merge(tb_summary, on='player', how='left').fillna(0)

    candidates['composite_score'] = sum(
        candidates[metric] * weight for metric, weight in score_weights.items()
    )

    shortlist = candidates.sort_values('composite_score', ascending=False).head(10)

    print("\n" + "=" * 60)
    print("SCOUTING SHORTLIST - TOP 10 CREATIVE PLAYERS")
    print("=" * 60)

    for position, (_, entry) in enumerate(shortlist.iterrows(), start=1):
        print(f"\n{position}. {entry['player']} ({entry['team']})")
        print(f"   Total xA: {entry['total_xa']:.2f}")
        print(f"   Key Passes: {entry['key_passes']}")
        print(f"   xA/Key Pass: {entry['xa_efficiency']:.3f}")
        print(f"   Assists: {entry['assists']}")
        print(f"   Through Balls: {entry['through_balls']:.0f}")
        print(f"   Composite Score: {entry['composite_score']:.2f}")

    return shortlist

# Top 10 by composite score (default minimum of 4 key passes).
shortlist = create_scouting_shortlist(player_xa, xa_details, events_df)

4.2 Final Recommendations

def generate_recommendations(shortlist, profiles):
    """
    Turn the top three shortlist rows into printed scouting verdicts.

    For each player a list of strengths and concerns is derived from fixed
    thresholds on xA efficiency, through balls, assists and key passes, and
    a verdict of PRIORITY TARGET, STRONG INTEREST or MONITOR is assigned.

    Args:
        shortlist: Ranked DataFrame with ``player``, ``team``, ``total_xa``,
            ``xa_efficiency``, ``through_balls``, ``assists`` and
            ``key_passes`` columns.
        profiles: Accepted for interface compatibility; not read here.

    Returns:
        list of dicts, one per player, with keys ``rank``, ``player``,
        ``team``, ``total_xa``, ``strengths``, ``concerns`` and
        ``recommendation``.
    """
    print("\n" + "=" * 60)
    print("SCOUTING RECOMMENDATIONS")
    print("=" * 60)

    recommendations = []

    # Only the first three rows get a detailed write-up
    finalists = shortlist.head(3)

    for rank, (_, candidate) in enumerate(finalists.iterrows(), 1):
        name, team = candidate['player'], candidate['team']
        total_xa = candidate['total_xa']
        efficiency = candidate['xa_efficiency']

        # Each check pairs a condition with the note it contributes
        strength_checks = [
            (efficiency > 0.15, "High-quality chance creation"),
            (candidate['through_balls'] >= 2, "Excellent at through balls"),
            (candidate['assists'] >= total_xa * 0.8, "Chances lead to goals"),
        ]
        concern_checks = [
            (candidate['key_passes'] < 6, "Limited sample size in tournament"),
            (efficiency < 0.10, "Volume over quality in creation"),
        ]
        strengths = [note for triggered, note in strength_checks if triggered]
        concerns = [note for triggered, note in concern_checks if triggered]

        # Overall verdict
        if total_xa > 1.5 and efficiency > 0.12:
            verdict = "PRIORITY TARGET"
        elif total_xa > 1.0:
            verdict = "STRONG INTEREST"
        else:
            verdict = "MONITOR"

        rec = {
            'rank': rank,
            'player': name,
            'team': team,
            'total_xa': total_xa,
            'strengths': strengths,
            'concerns': concerns,
            'recommendation': verdict
        }
        recommendations.append(rec)

        # Print recommendation
        print(f"\n--- {rank}. {name} ({team}) ---")
        print(f"Verdict: {verdict}")
        print(f"\nStrengths:")
        for note in strengths:
            print(f"  + {note}")
        if concerns:
            print(f"\nConcerns:")
            for note in concerns:
                print(f"  - {note}")

    return recommendations

# Verdicts for the top three shortlist entries.
recommendations = generate_recommendations(shortlist, profiles)

Conclusions

Key Findings

  1. Elite creators identified: The analysis identified several players demonstrating exceptional chance creation abilities at the World Cup.

  2. Different creativity styles: Players varied significantly in how they created chances - some relied on crosses, others on through balls, others on cutbacks.

  3. xA efficiency matters: Total xA alone doesn't tell the full story; the quality of chances per key pass distinguishes truly elite creators.

  4. Sample size caution: Even at a World Cup, sample sizes are small. These findings should be combined with league data for definitive conclusions.

Recommendations for the Club

  1. Priority targets should have both high total xA and high xA efficiency, demonstrating they create quality chances consistently.

  2. Consider playing style fit: A cross-heavy creator might not suit a team that doesn't play with target strikers.

  3. Validate with league data: Tournament performance should be confirmed against domestic league statistics.

  4. Age and development: Younger players showing elite creativity may be better long-term investments.


Code Files

Complete implementation available in:

  - code/case-study-code.py - Full analysis pipeline
  - code/example-01-xa-calculation.py - xA calculation methods
  - code/example-02-creativity-profiles.py - Profile building