Processing Event Stream Data

Beginner 10 min read 0 views Nov 27, 2025
Event data records discrete actions during a match - passes, shots, tackles, and more. Processing this data effectively is fundamental to soccer analytics.

## Event Data Structure

Event data typically includes:

- Timestamp and period
- Event type and subtype
- Player and team identifiers
- Location coordinates
- Outcome (success/failure)
- Additional context (body part, technique, etc.)

## Loading and Initial Processing

```python
import pandas as pd
import json

def load_event_data(filepath):
    with open(filepath, 'r') as f:
        events = json.load(f)

    # Flatten nested structures
    events_df = pd.json_normalize(events)

    # Convert timestamp to seconds
    events_df['minute'] = events_df['minute'].astype(int)
    events_df['second'] = events_df['second'].astype(int)
    events_df['match_second'] = events_df['minute'] * 60 + events_df['second']

    return events_df
```

## Filtering and Querying Events

```python
def get_team_passes(events_df, team_name):
    """Extract all passes for a specific team"""
    passes = events_df[
        (events_df['type'] == 'Pass') &
        (events_df['team'] == team_name)
    ].copy()

    # Separate successful and unsuccessful
    passes['successful'] = passes['pass_outcome'].isna()

    return passes

def get_possession_sequences(events_df):
    """Split events into possession sequences"""
    events_df['possession_change'] = (
        events_df['team'] != events_df['team'].shift()
    ).cumsum()

    return events_df.groupby('possession_change')
```

## Calculating Basic Metrics

```python
import numpy as np

def calculate_passing_metrics(passes_df):
    """Calculate team passing statistics"""
    metrics = {
        'total_passes': len(passes_df),
        'completed_passes': passes_df['successful'].sum(),
        'pass_accuracy': passes_df['successful'].mean() * 100,
        'forward_passes': (passes_df['pass_end_x'] > passes_df['x']).sum(),
        'avg_pass_length': np.sqrt(
            (passes_df['pass_end_x'] - passes_df['x'])**2 +
            (passes_df['pass_end_y'] - passes_df['y'])**2
        ).mean()
    }

    return metrics
```

## Building Sequence Analysis

```python
def analyze_sequences(events_df):
    """Analyze possession sequences"""
    sequences = []

    for seq_id, sequence in events_df.groupby('possession_change'):
        sequences.append({
            'sequence_id': seq_id,
            'team': sequence.iloc[0]['team'],
            'duration': sequence['match_second'].max() - sequence['match_second'].min(),
            'num_events': len(sequence),
            'num_passes': (sequence['type'] == 'Pass').sum(),
            'ended_in_shot': (sequence['type'] == 'Shot').any(),
            'start_x': sequence.iloc[0]['x'],
            'end_x': sequence.iloc[-1]['x']
        })

    return pd.DataFrame(sequences)
```

## Handling Data Quality Issues

Event data often requires cleaning:

```python
def clean_event_data(events_df):
    # Remove events with missing critical data
    events_df = events_df.dropna(subset=['x', 'y', 'type'])

    # Clamp coordinates to the 0-105 x 0-68 pitch bounds
    events_df['x'] = events_df['x'].clip(0, 105)
    events_df['y'] = events_df['y'].clip(0, 68)

    # Handle duplicate events
    events_df = events_df.drop_duplicates(subset=['timestamp', 'player', 'type'])

    # Sort by timestamp
    events_df = events_df.sort_values('match_second').reset_index(drop=True)

    return events_df
```

## Advanced: Pass Network Construction

```python
def build_pass_network(passes_df):
    """Create pass network between players"""
    successful_passes = passes_df[passes_df['successful']].copy()

    network = successful_passes.groupby(['player', 'pass_recipient']).agg({
        'id': 'count'
    }).rename(columns={'id': 'passes'}).reset_index()

    # Calculate edge weights
    total_passes = network.groupby('player')['passes'].transform('sum')
    network['weight'] = network['passes'] / total_passes

    return network
```

Proper event data processing creates the foundation for all downstream analysis, from simple statistics to complex machine learning models.

Discussion

Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.