Code Library
Ready-to-use code snippets for sports analytics projects
Stolen Base Break-Even
Calculate stolen base break-even point.
import pandas as pd
def sb_breakeven(run_environment=None, sb_value=0.175, cs_value=-0.40):
    """Calculate the stolen-base break-even success rate.

    The break-even rate is the success rate at which the expected run value
    of an attempt is zero:

        sb_value * rate + cs_value * (1 - rate) = 0
        rate = -cs_value / (sb_value - cs_value)

    Args:
        run_environment: Accepted for backward compatibility; it is not
            used by the calculation.
        sb_value: Run value gained by a successful steal.
        cs_value: Run value of a caught stealing (a negative number by
            default).

    Returns:
        The break-even success rate as a fraction.

    Raises:
        ValueError: If ``sb_value`` equals ``cs_value``; no break-even
            point exists in that case.
    """
    spread = sb_value - cs_value
    if spread == 0:
        raise ValueError("sb_value and cs_value must differ")
    return -cs_value / spread
def evaluate_sb_attempts(player_stats):
    """Summarize a player's base-stealing record against the break-even rate.

    Args:
        player_stats: Mapping with ``"sb"`` (successful steals) and ``"cs"``
            (times caught stealing).

    Returns:
        A dict with the raw counts, the success rate (0 when there were no
        attempts), the break-even rate, whether the player is above it, the
        net run value, and a recommendation string. The recommendation
        only changes when the success rate is more than 0.05 away from the
        break-even rate.
    """
    threshold = sb_breakeven()
    steals = player_stats["sb"]
    caught = player_stats["cs"]
    total_attempts = steals + caught

    if total_attempts > 0:
        rate = steals / total_attempts
    else:
        rate = 0

    # A 5-point band around break-even maps to "keep doing what you do".
    if rate > threshold + 0.05:
        advice = "Attempt more"
    elif rate < threshold - 0.05:
        advice = "Reduce attempts"
    else:
        advice = "Maintain current rate"

    run_value = steals * 0.175 + caught * -0.40
    return {
        "sb": steals,
        "cs": caught,
        "success_rate": rate,
        "breakeven": threshold,
        "above_breakeven": rate > threshold,
        "net_run_value": run_value,
        "recommendation": advice,
    }
Trade Value Calculator
Calculate player trade values.
import pandas as pd
import numpy as np
def calculate_surplus_value(player_df, dollars_per_war=8_000_000):
    """Attach dollar-value and surplus columns to a table of players.

    Args:
        player_df: DataFrame with ``player_id``, ``name``, ``age``,
            ``projected_war``, ``salary`` and ``contract_years_remaining``.
        dollars_per_war: Dollar value assigned to one WAR.

    Returns:
        A new DataFrame (the input is not modified) with the identifying
        columns plus ``war_value``, ``surplus_value`` (value minus salary)
        and ``total_surplus`` (surplus times remaining contract years).
    """
    output_columns = [
        "player_id", "name", "age", "projected_war",
        "salary", "war_value", "surplus_value", "total_surplus",
    ]
    # assign() evaluates keyword arguments in order, so later columns can
    # build on earlier ones.
    valued = player_df.assign(
        war_value=lambda d: d["projected_war"] * dollars_per_war,
        surplus_value=lambda d: d["war_value"] - d["salary"],
        total_surplus=lambda d: d["surplus_value"] * d["contract_years_remaining"],
    )
    return valued.loc[:, output_columns]
def trade_analysis(team1_players, team2_players, dollars_per_war=8_000_000):
    """Compare the total surplus value of two groups of players.

    Args:
        team1_players: DataFrame of players in the first group, in the
            format expected by ``calculate_surplus_value``.
        team2_players: DataFrame of players in the second group.
        dollars_per_war: Dollar value assigned to one WAR.

    Returns:
        A dict with each group's summed ``total_surplus``, their difference
        (team 1 minus team 2) and a ``"winner"`` label naming the group with
        the larger total. Equal totals are labelled ``"Even"`` rather than
        being credited to either side.
    """
    def _package_value(players):
        # Sum of contract-adjusted surplus over every player in the group.
        return calculate_surplus_value(players, dollars_per_war)["total_surplus"].sum()

    team1_value = _package_value(team1_players)
    team2_value = _package_value(team2_players)

    if team1_value > team2_value:
        winner = "Team 1"
    elif team2_value > team1_value:
        winner = "Team 2"
    else:
        winner = "Even"

    return {
        "team1_value": team1_value,
        "team2_value": team2_value,
        "difference": team1_value - team2_value,
        "winner": winner,
    }
Player WAR Projection
Project future WAR from current stats.
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
def project_war(historical_df, current_season_df, years_forward=1):
    """Fit a gradient-boosted model on past seasons and project WAR.

    Args:
        historical_df: Training rows containing the predictor columns and a
            ``war_y{years_forward}`` target column; rows with a missing
            target are ignored.
        current_season_df: Players to project; must contain the predictor
            columns plus ``player_id``, ``name`` and ``war``.
        years_forward: Selects which ``war_y{N}`` column is the target.

    Returns:
        DataFrame with ``player_id``, ``name``, ``age``, ``war``, the raw
        model output ``projected_war`` and ``projected_war_adj``, which
        scales the projection down by 5% per year of age above 32.
    """
    predictors = ["age", "pa", "war_ly", "war_2y_avg", "war_3y_avg", "obp", "slg", "k_rate", "bb_rate"]
    target = f"war_y{years_forward}"

    # Only rows whose future WAR is known can be used for fitting.
    known_future = historical_df[target].notna()
    training_rows = historical_df[known_future].copy()

    regressor = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
    regressor.fit(training_rows[predictors], training_rows[target])

    projected = current_season_df.copy()
    projected["projected_war"] = regressor.predict(projected[predictors])

    def _apply_aging(player):
        raw = player["projected_war"]
        if player["age"] > 32:
            return raw * (1 - 0.05 * (player["age"] - 32))
        return raw

    projected["projected_war_adj"] = projected.apply(_apply_aging, axis=1)
    return projected[["player_id", "name", "age", "war", "projected_war", "projected_war_adj"]]
Defensive Runs Saved Estimator
Estimate defensive runs saved from basic stats.
import pandas as pd
import numpy as np
def estimate_drs(fielding_stats, position):
    """Estimate defensive runs saved from basic fielding stats (simplified).

    The first row of ``fielding_stats`` is treated as the player and the
    column mean as the league average. Each stat's difference from that
    average is multiplied by a position-specific weight and summed.

    Args:
        fielding_stats: DataFrame whose columns may include the stats
            weighted for the position; columns that are absent are skipped.
        position: Position code such as ``"SS"``. Unknown codes fall back
            to an errors-only weighting.

    Returns:
        The weighted sum; 0 when none of the weighted stats are present.
    """
    weights_by_position = {
        "C": {"passed_balls": -0.25, "cs_pct": 0.15},
        "1B": {"errors": -0.5, "assists": 0.1},
        "2B": {"errors": -0.6, "range_factor": 0.3},
        "SS": {"errors": -0.6, "range_factor": 0.35},
        "3B": {"errors": -0.55, "range_factor": 0.25},
        "LF": {"errors": -0.45, "outfield_assists": 0.2},
        "CF": {"errors": -0.5, "outfield_assists": 0.25},
        "RF": {"errors": -0.45, "outfield_assists": 0.3},
    }
    fallback = {"errors": -0.5}
    active = weights_by_position.get(position, fallback)

    total = 0
    for stat_name in active:
        if stat_name not in fielding_stats:
            continue
        column = fielding_stats[stat_name]
        baseline = column.mean()
        total += (column.iloc[0] - baseline) * active[stat_name]
    return total
def calculate_uzr_components(zone_data, position):
    """Compute simplified UZR components from per-play zone data.

    Only ``range_runs`` is actually calculated; ``error_runs`` and
    ``arm_runs`` are always returned as 0.

    Args:
        zone_data: DataFrame with one row per opportunity and the columns
            ``zone``, ``plays_made`` and ``league_make_pct``.
        position: Position code; it does not influence the result.

    Returns:
        Dict with ``range_runs``, ``error_runs`` and ``arm_runs``.
    """
    zone_labels = zone_data["zone"]
    range_total = 0
    for label in zone_labels.unique():
        in_zone = zone_data[zone_labels == label]
        # The league rate is read from the zone's first row.
        expected_rate = in_zone["league_make_pct"].iloc[0]
        chances = len(in_zone)
        made_rate = in_zone["plays_made"].sum() / chances
        # 0.8 is the run value applied per play above or below expectation.
        range_total += (made_rate - expected_rate) * chances * 0.8

    return {
        "range_runs": range_total,
        "error_runs": 0,
        "arm_runs": 0,
    }
FIP Calculator
Calculate Fielding Independent Pitching.
import pandas as pd
def calculate_fip(pitcher_stats, fip_constant=3.10):
    """Calculate Fielding Independent Pitching.

    FIP = (13*HR + 3*(BB + HBP) - 2*K) / IP + constant

    Args:
        pitcher_stats: Mapping or DataFrame with ``hr``, ``bb``, ``hbp``,
            ``so`` and ``ip``.
        fip_constant: Additive constant applied after the per-inning term.

    Returns:
        FIP, as a scalar or Series depending on the input.
    """
    home_runs = pitcher_stats["hr"]
    free_passes = pitcher_stats["bb"] + pitcher_stats["hbp"]
    strikeouts = pitcher_stats["so"]
    weighted_events = 13 * home_runs + 3 * free_passes - 2 * strikeouts
    return weighted_events / pitcher_stats["ip"] + fip_constant
def calculate_xfip(pitcher_stats, league_hr_fb_rate=0.10, fip_constant=3.10):
    """Calculate expected FIP (xFIP).

    Uses the FIP formula but replaces actual home runs with fly balls
    multiplied by the league HR/FB rate.

    Args:
        pitcher_stats: Mapping or DataFrame with ``fb``, ``bb``, ``hbp``,
            ``so`` and ``ip``.
        league_hr_fb_rate: League-wide home runs per fly ball.
        fip_constant: Additive constant applied after the per-inning term.

    Returns:
        xFIP, as a scalar or Series depending on the input.
    """
    expected_home_runs = pitcher_stats["fb"] * league_hr_fb_rate
    free_passes = pitcher_stats["bb"] + pitcher_stats["hbp"]
    strikeouts = pitcher_stats["so"]
    weighted_events = 13 * expected_home_runs + 3 * free_passes - 2 * strikeouts
    return weighted_events / pitcher_stats["ip"] + fip_constant
def calculate_siera(pitcher_stats):
    """Calculate a simplified, linear-terms-only SIERA.

    Args:
        pitcher_stats: Mapping or DataFrame with ``so``, ``bb``, ``bf``
            (batters faced), ``gb`` and ``fb``.

    Returns:
        6.145 - 16.986*K% + 11.434*BB% - 1.858*GB%, where K% and BB% are
        per batter faced and GB% is ground balls over ground plus fly balls.
    """
    batters_faced = pitcher_stats["bf"]
    strikeout_rate = pitcher_stats["so"] / batters_faced
    walk_rate = pitcher_stats["bb"] / batters_faced

    ground_balls = pitcher_stats["gb"]
    ground_ball_rate = ground_balls / (ground_balls + pitcher_stats["fb"])

    return 6.145 - 16.986 * strikeout_rate + 11.434 * walk_rate - 1.858 * ground_ball_rate
Weighted OBA Calculator
Calculate weighted on-base average (wOBA).
import pandas as pd
# Linear weights per offensive event used by calculate_woba
# (2023 values - update annually).
WOBA_WEIGHTS = {
    "uBB": 0.690,
    "HBP": 0.722,
    "1B": 0.888,
    "2B": 1.271,
    "3B": 1.616,
    "HR": 2.101
}


def calculate_woba(player_stats):
    """Calculate weighted on-base average (wOBA).

    Args:
        player_stats: Mapping or DataFrame with ``ubb`` (unintentional
            walks), ``hbp``, ``singles``, ``doubles``, ``triples``, ``hr``,
            ``ab`` and ``sf``.

    Returns:
        The weighted sum of events divided by AB + uBB + SF + HBP.
    """
    # (weight key, stat column) pairs, in the order they are accumulated.
    event_columns = (
        ("uBB", "ubb"),
        ("HBP", "hbp"),
        ("1B", "singles"),
        ("2B", "doubles"),
        ("3B", "triples"),
        ("HR", "hr"),
    )
    weighted_events = sum(
        WOBA_WEIGHTS[weight_key] * player_stats[column]
        for weight_key, column in event_columns
    )
    opportunities = sum(player_stats[column] for column in ("ab", "ubb", "sf", "hbp"))
    return weighted_events / opportunities
def woba_to_wrc_plus(woba, league_woba=0.320, park_factor=1.0, league_runs_pa=0.12):
    """Convert wOBA to an approximate wRC+.

    The conversion is evaluated over a fixed 600 plate appearances with a
    wOBA scale of 1.25.

    Args:
        woba: Player wOBA.
        league_woba: League-average wOBA.
        park_factor: Divisor applied to the player's runs created.
        league_runs_pa: League runs per plate appearance.

    Returns:
        Park-adjusted runs created as a percentage of league average;
        100 corresponds to league average when ``park_factor`` is 1.0.
    """
    woba_scale = 1.25
    season_pa = 600

    league_runs = season_pa * league_runs_pa
    runs_above_average = ((woba - league_woba) / woba_scale) * season_pa
    runs_created = runs_above_average + league_runs
    return (runs_created / park_factor) / league_runs * 100
Streakiness Analysis
Analyze player performance streakiness.
import pandas as pd
import numpy as np
from scipy import stats
def calculate_streakiness(game_log_df, player_id, stat="hits"):
    """Measure how streaky a player's game-by-game results are.

    Games are ordered by ``game_date`` and each is classified as above or
    not above the player's own mean. The number of runs of consecutive
    same-class games is compared with the count expected under random
    ordering, and the lag-1 autocorrelation of the raw values is reported.

    Args:
        game_log_df: DataFrame with ``player_id``, ``game_date`` and the
            column named by ``stat``.
        player_id: Player to analyze.
        stat: Column holding the per-game value.

    Returns:
        ``None`` when the player has fewer than 20 games; otherwise a dict
        with the run counts, the runs z-score (negative means fewer runs,
        i.e. streakier than random), the autocorrelation and an
        ``is_streaky`` flag (z-score below -2 or autocorrelation above 0.2).
    """
    is_player = game_log_df["player_id"] == player_id
    series = game_log_df[is_player].sort_values("game_date")[stat].values
    n_games = len(series)
    if n_games < 20:
        return None

    above = series > series.mean()

    # A new run starts wherever neighbouring games fall on different sides.
    observed = 1 + int(np.count_nonzero(above[1:] != above[:-1]))

    n_above = above.sum()
    n_below = n_games - n_above
    n_total = n_above + n_below
    pair_term = 2 * n_above * n_below

    expected = pair_term / n_total + 1
    spread = np.sqrt((pair_term * (pair_term - n_above - n_below)) / (n_total ** 2 * (n_total - 1)))
    z_value = (observed - expected) / spread

    lag_one = np.corrcoef(series[:-1], series[1:])[0, 1]

    return {
        "player_id": player_id,
        "games": n_games,
        "observed_runs": observed,
        "expected_runs": expected,
        "runs_z_score": z_value,
        "autocorrelation": lag_one,
        "is_streaky": z_value < -2 or lag_one > 0.2,
    }
Pythagorean Wins Calculator
Calculate expected wins using Pythagorean expectation.
import pandas as pd
import numpy as np
def pythagorean_wins(runs_scored, runs_allowed, games, exponent=None, sport="baseball"):
    """Calculate Pythagorean expected wins.

    Args:
        runs_scored: Points or runs scored.
        runs_allowed: Points or runs allowed.
        games: Number of games the expectation is scaled to.
        exponent: Pythagorean exponent; when ``None`` a sport-specific
            default is used.
        sport: Key used to pick the default exponent; unknown sports use 2.0.

    Returns:
        Tuple ``(expected_wins, expected_win_pct)``.
    """
    if exponent is None:
        sport_exponents = {
            "baseball": 1.83,
            "basketball": 13.91,
            "football": 2.37,
            "hockey": 2.0,
        }
        exponent = sport_exponents.get(sport, 2.0)

    scored_term = runs_scored ** exponent
    allowed_term = runs_allowed ** exponent
    win_pct = scored_term / (scored_term + allowed_term)
    return win_pct * games, win_pct
def pythagenpat(runs_scored, runs_allowed, games):
    """Calculate Pythagenpat wins, where the exponent tracks scoring level.

    The exponent is (total runs per game) ** 0.287 and is passed to
    ``pythagorean_wins``.

    Args:
        runs_scored: Runs scored.
        runs_allowed: Runs allowed.
        games: Games played.

    Returns:
        Tuple ``(expected_wins, expected_win_pct)``.
    """
    runs_per_game = (runs_scored + runs_allowed) / games
    variable_exponent = runs_per_game ** 0.287
    return pythagorean_wins(runs_scored, runs_allowed, games, variable_exponent)
def calculate_luck(actual_wins, expected_wins):
    """Return actual wins minus expected wins.

    A positive result means the team won more games than expected.
    """
    surplus_wins = actual_wins - expected_wins
    return surplus_wins
Regression to Mean Calculator
Calculate regressed statistics.
import pandas as pd
import numpy as np
def regress_to_mean(observed, sample_size, league_avg, reliability_denominator):
    """Shrink an observed value toward the league average.

    The weight on the observed value is
    ``sample_size / (sample_size + reliability_denominator)``, so a sample
    equal to the denominator is regressed exactly halfway.

    Args:
        observed: Observed statistic.
        sample_size: Size of the sample behind ``observed``.
        league_avg: Value to regress toward.
        reliability_denominator: Sample size at which 50% regression occurs.

    Returns:
        The regressed value.
    """
    weight_on_observed = sample_size / (sample_size + reliability_denominator)
    weight_on_league = 1 - weight_on_observed
    return weight_on_observed * observed + weight_on_league * league_avg
def regress_batting_stats(player_df, league_df):
    """Add regressed versions of batting rate stats to a player table.

    For each supported stat present in ``player_df`` a ``{stat}_regressed``
    column is added, computed by ``regress_to_mean`` with the player's
    ``pa`` as the sample size and the mean of the matching ``league_df``
    column as the target.

    Args:
        player_df: DataFrame with ``pa`` and any of ``avg``, ``babip``,
            ``hr_rate``, ``k_rate``, ``bb_rate``.
        league_df: DataFrame whose numeric column means serve as league
            averages. Non-numeric columns are ignored.

    Returns:
        A copy of ``player_df`` with the regressed columns appended.

    Raises:
        KeyError: If a stat present in ``player_df`` has no numeric column
            in ``league_df``.
    """
    regressed = player_df.copy()

    # Approximate PA at which a stat is 50% signal / 50% league average.
    denominators = {
        "avg": 500,
        "babip": 800,
        "hr_rate": 300,
        "k_rate": 200,
        "bb_rate": 400,
    }

    # numeric_only avoids a TypeError on text columns such as player names.
    league_avgs = league_df.mean(numeric_only=True)

    for stat, denom in denominators.items():
        if stat not in regressed.columns:
            continue
        # regress_to_mean is plain arithmetic, so it runs on whole columns;
        # this also works for an empty player table.
        regressed[f"{stat}_regressed"] = regress_to_mean(
            regressed[stat], regressed["pa"], league_avgs[stat], denom
        )
    return regressed
# Usage: player_stats_df and league_stats_df must be supplied by the caller.
regressed_stats = regress_batting_stats(player_stats_df, league_stats_df)
print(regressed_stats.loc[:, ["name", "avg", "avg_regressed", "pa"]].head())
Park Factors Adjustment
Adjust statistics for park effects.
import pandas as pd
def calculate_park_factors(games_df):
    """Calculate run and home-run park factors from game data.

    A park's factor is its per-game total (both teams combined) divided by
    the per-game total across all games in ``games_df``.

    Args:
        games_df: One row per game with ``park_id``, ``game_id``,
            ``home_runs``, ``away_runs``, ``home_hr`` and ``away_hr``.

    Returns:
        DataFrame indexed by ``park_id`` with ``runs_pf``, ``hr_pf`` and
        ``games``.
    """
    by_park = games_df.groupby("park_id")
    totals = by_park[["home_runs", "away_runs", "home_hr", "away_hr"]].sum()
    games = by_park["game_id"].count()

    runs = totals["home_runs"] + totals["away_runs"]
    homers = totals["home_hr"] + totals["away_hr"]

    all_games = games.sum()
    league_rpg = runs.sum() / all_games
    league_hrpg = homers.sum() / all_games

    return pd.DataFrame({
        "runs_pf": (runs / games) / league_rpg,
        "hr_pf": (homers / games) / league_hrpg,
        "games": games,
    })
def park_adjust_stats(player_stats, park_factors):
    """Adjust counting stats for park effects.

    Players are joined to their park's factors on ``park_id`` with an inner
    merge, so players whose park has no factor are dropped from the result.

    Args:
        player_stats: DataFrame with ``park_id``, ``hr``, ``runs``, ``rbi``.
        park_factors: DataFrame providing ``park_id``, ``hr_pf`` and
            ``runs_pf``.

    Returns:
        The merged DataFrame with ``adj_hr`` (HR / hr_pf) plus ``adj_runs``
        and ``adj_rbi`` (each divided by runs_pf).
    """
    merged = player_stats.merge(park_factors, on="park_id")
    return merged.assign(
        adj_hr=merged["hr"] / merged["hr_pf"],
        adj_runs=merged["runs"] / merged["runs_pf"],
        adj_rbi=merged["rbi"] / merged["runs_pf"],
    )
Batting Order Optimizer
Optimize batting order using run expectancy.
import numpy as np
from itertools import permutations
import random
def expected_runs_lineup(lineup, players_df, innings=9, simulations=1000):
    """Estimate runs per game for a batting order by Monte Carlo simulation.

    Each plate appearance draws one uniform random number and resolves it
    as a walk, home run, single or out using the batter's rates. The model
    is deliberately simple: only home runs score (one run each); walks and
    singles neither place runners nor produce outs.

    Args:
        lineup: Sequence of positional row indices into ``players_df``, in
            batting order. The order cycles over the whole sequence.
        players_df: DataFrame with ``bb_rate``, ``hr_rate`` and ``avg``.
        innings: Innings per simulated game (three outs each).
        simulations: Number of games to simulate.

    Returns:
        Mean runs per simulated game.

    Raises:
        ValueError: If ``lineup`` is empty.

    Note:
        A batter whose rates leave no probability of an out never ends the
        game; callers must supply rates that allow outs.
    """
    if len(lineup) == 0:
        raise ValueError("lineup must contain at least one batter")

    # Resolve every lineup slot to cumulative outcome thresholds once,
    # instead of hitting the DataFrame on every plate appearance.
    slots = []
    for player_idx in lineup:
        player = players_df.iloc[player_idx]
        walk_cut = float(player["bb_rate"])
        hr_cut = walk_cut + float(player["hr_rate"])
        hit_cut = hr_cut + (float(player["avg"]) - float(player["hr_rate"]))
        slots.append((walk_cut, hr_cut, hit_cut))

    n_slots = len(slots)
    outs_per_game = innings * 3
    total_runs = 0

    for _ in range(simulations):
        runs = 0
        outs = 0
        batter_idx = 0
        while outs < outs_per_game:
            walk_cut, hr_cut, hit_cut = slots[batter_idx % n_slots]
            rand = random.random()
            if rand < walk_cut:
                pass  # walk: no effect in this model
            elif rand < hr_cut:
                runs += 1  # home run
            elif rand < hit_cut:
                pass  # single: no effect in this model
            else:
                outs += 1
            batter_idx += 1
        total_runs += runs

    return total_runs / simulations
def optimize_lineup(players_df, n_iterations=1000):
    """Search for a high-scoring batting order with simulated annealing.

    Starting from a random order of nine batters, each iteration swaps two
    slots, scores the candidate with ``expected_runs_lineup`` and accepts
    it if it scores higher, or otherwise with probability
    ``exp(score_change / temperature)``. The temperature starts at 1.0 and
    is multiplied by 0.995 each iteration.

    Args:
        players_df: DataFrame passed through to ``expected_runs_lineup``;
            rows 0-8 are the candidate batters.
        n_iterations: Number of swap proposals.

    Returns:
        Tuple ``(best_order, best_score)``.
    """
    order = list(range(9))
    random.shuffle(order)
    order_score = expected_runs_lineup(order, players_df)

    best_order, best_score = order.copy(), order_score
    temperature = 1.0

    for _ in range(n_iterations):
        first, second = random.sample(range(9), 2)
        candidate = order.copy()
        candidate[first], candidate[second] = candidate[second], candidate[first]
        candidate_score = expected_runs_lineup(candidate, players_df)

        improved = candidate_score > order_score
        if improved or random.random() < np.exp((candidate_score - order_score) / temperature):
            order, order_score = candidate, candidate_score
            if order_score > best_score:
                best_order, best_score = order.copy(), order_score

        temperature *= 0.995

    return best_order, best_score
Player Comparison Tool
Compare players across multiple statistical dimensions.
import pandas as pd
import numpy as np
from scipy import stats
class PlayerComparator:
    """Compare players against each other and against the league."""

    def __init__(self, league_stats_df):
        """Store the league table; it must contain ``player_id`` and ``name``."""
        self.league_stats = league_stats_df

    def compare_players(self, player1_id, player2_id, stats_to_compare=None):
        """Build a stat-by-stat comparison of two players.

        Args:
            player1_id: ``player_id`` of the first player.
            player2_id: ``player_id`` of the second player.
            stats_to_compare: Column names to compare; defaults to
                avg, obp, slg, hr and war.

        Returns:
            DataFrame with one row per stat: both raw values, their
            difference, each player's z-score against the league, and the
            name of the player with the larger value (the second player is
            named when the values are equal).
        """
        league = self.league_stats
        first = league[league["player_id"] == player1_id].iloc[0]
        second = league[league["player_id"] == player2_id].iloc[0]

        chosen = ["avg", "obp", "slg", "hr", "war"] if stats_to_compare is None else stats_to_compare

        rows = []
        for stat in chosen:
            center = league[stat].mean()
            spread = league[stat].std()
            value1, value2 = first[stat], second[stat]
            rows.append({
                "stat": stat,
                "player1": value1,
                "player2": value2,
                "difference": value1 - value2,
                "player1_zscore": (value1 - center) / spread,
                "player2_zscore": (value2 - center) / spread,
                "advantage": first["name"] if value1 > value2 else second["name"],
            })
        return pd.DataFrame(rows)

    def similarity_score(self, player1_id, player2_id, stats=None):
        """Cosine similarity of two players' league-standardized stat vectors.

        Args:
            player1_id: ``player_id`` of the first player.
            player2_id: ``player_id`` of the second player.
            stats: Column names to use; defaults to avg, obp, slg, hr_rate,
                k_rate and bb_rate.

        Returns:
            Cosine similarity of the two z-score vectors.
        """
        if stats is None:
            stats = ["avg", "obp", "slg", "hr_rate", "k_rate", "bb_rate"]

        league = self.league_stats
        raw1 = league[league["player_id"] == player1_id][stats].values[0]
        raw2 = league[league["player_id"] == player2_id][stats].values[0]

        # Standardize each stat so no single scale dominates the angle.
        centers = league[stats].mean().values
        spreads = league[stats].std().values
        z1 = (raw1 - centers) / spreads
        z2 = (raw2 - centers) / spreads

        magnitude = np.linalg.norm(z1) * np.linalg.norm(z2)
        return np.dot(z1, z2) / magnitude
Schedule Strength Calculator
Calculate strength of schedule for teams.
import pandas as pd
import numpy as np
def calculate_sos(games_df, team_ratings):
    """Calculate overall, past and future strength of schedule per team.

    A game counts as played when its ``status`` equals ``"Final"``; every
    other game counts as remaining. Each team's own games are split on that
    flag, so the past/future figures do not depend on row order.

    Args:
        games_df: One row per game with ``home_team_id``, ``away_team_id``
            and ``status``.
        team_ratings: Mapping (anything with ``.get``) from team id to a
            rating; teams without a rating are treated as 0.5.

    Returns:
        DataFrame sorted by ``sos`` descending with ``team_id``,
        ``games_played``, ``games_remaining``, ``sos`` (mean opponent
        rating over all games), ``past_sos`` (0.5 when nothing has been
        played) and ``future_sos`` (falls back to ``sos`` when nothing
        remains). Teams that appear only as the away side are included.
    """
    output_columns = [
        "team_id", "games_played", "games_remaining",
        "sos", "past_sos", "future_sos",
    ]

    # Plain arrays keep the masks positional, independent of the index.
    home = games_df["home_team_id"].to_numpy()
    away = games_df["away_team_id"].to_numpy()
    final = (games_df["status"] == "Final").to_numpy()

    def _mean_rating(opponent_ids):
        return np.mean([team_ratings.get(opp, 0.5) for opp in opponent_ids])

    results = []
    for team_id in pd.unique(np.concatenate([home, away])):
        at_home = home == team_id
        involved = at_home | (away == team_id)

        # Opponent of each of this team's games, in games_df row order.
        opponents = np.where(at_home, away, home)[involved]
        already_played = final[involved]
        past_opps = opponents[already_played].tolist()
        future_opps = opponents[~already_played].tolist()

        sos = _mean_rating(opponents.tolist())
        results.append({
            "team_id": team_id,
            "games_played": len(past_opps),
            "games_remaining": len(future_opps),
            "sos": sos,
            "past_sos": _mean_rating(past_opps) if past_opps else 0.5,
            "future_sos": _mean_rating(future_opps) if future_opps else sos,
        })

    return pd.DataFrame(results, columns=output_columns).sort_values("sos", ascending=False)
Win Shares Calculator
Calculate basketball win shares.
import pandas as pd
def calculate_win_shares(player_stats, team_stats):
    """Calculate simplified offensive and defensive win shares.

    Args:
        player_stats: DataFrame with ``player_id``, ``pts``, ``ast``,
            ``fga``, ``fta``, ``oreb``, ``tov``, ``drtg`` and ``mp``.
        team_stats: DataFrame with ``pts``, ``possessions``, ``pace`` and
            ``drtg``; its totals and means define the league baseline.

    Returns:
        DataFrame with ``player_id``, ``OWS``, ``DWS`` and their sum ``WS``.
    """
    league_ppp = team_stats["pts"].sum() / team_stats["possessions"].sum()

    # Offense: points produced per possession used, relative to 92% of
    # the league rate.
    points_created = player_stats["pts"] + player_stats["ast"] * 0.5
    possessions_used = (
        player_stats["fga"]
        + 0.44 * player_stats["fta"]
        - player_stats["oreb"]
        + player_stats["tov"]
    )
    player_ppp = points_created / possessions_used
    marginal_offense = (player_ppp - 0.92 * league_ppp) * possessions_used

    points_per_win = league_ppp * team_stats["pace"].mean() * 2 / 0.32
    offensive = marginal_offense / points_per_win

    # Defense (simplified): rating gap to league, scaled by minutes.
    league_drtg = team_stats["drtg"].mean()
    defensive = (league_drtg - player_stats["drtg"]) / 100 * player_stats["mp"] / 48 * 0.1

    return pd.DataFrame({
        "player_id": player_stats["player_id"],
        "OWS": offensive,
        "DWS": defensive,
        "WS": offensive + defensive,
    })
Player Projection Aggregator
Aggregate projections from multiple systems.
import pandas as pd
import numpy as np
def aggregate_projections(projection_systems: dict, weights: dict = None):
    """Combine projections from several systems into weighted averages.

    Every system's columns (except ``player_id``) are suffixed with
    ``_{system name}`` and the tables are outer-joined on ``player_id``.
    For each of ``pa``, ``avg``, ``hr``, ``rbi`` and ``war`` a
    ``{stat}_proj`` column holds the weighted average of the systems that
    actually have a value for that player; weights are renormalized per
    player, so a player missing from one system is still projected from
    the others.

    Args:
        projection_systems: Mapping of system name to a DataFrame with
            ``player_id`` and stat columns. Names may contain underscores.
        weights: Mapping of system name to relative weight. Defaults to
            equal weights; systems without an entry get weight 0.

    Returns:
        The merged DataFrame with the ``{stat}_proj`` columns added. A
        projection is NaN when no weighted system supplies the stat.

    Raises:
        ValueError: If ``projection_systems`` is empty.
    """
    if not projection_systems:
        raise ValueError("projection_systems must contain at least one system")

    if weights is None:
        weights = {name: 1.0 for name in projection_systems}

    # Merge all projections, tagging each system's columns with its name.
    combined = None
    for name, df in projection_systems.items():
        tagged = df.rename(
            columns=lambda col, name=name: col if col == "player_id" else f"{col}_{name}"
        )
        if combined is None:
            combined = tagged
        else:
            combined = combined.merge(tagged, on="player_id", how="outer")

    stat_cols = ["pa", "avg", "hr", "rbi", "war"]
    for stat in stat_cols:
        weighted_sum = pd.Series(0.0, index=combined.index)
        weight_total = pd.Series(0.0, index=combined.index)
        # Iterate by system name so the weight lookup never depends on
        # parsing the name back out of a column label.
        for name in projection_systems:
            col = f"{stat}_{name}"
            if col not in combined.columns:
                continue
            weight = weights.get(name, 0)
            values = combined[col]
            weighted_sum = weighted_sum + values.fillna(0) * weight
            weight_total = weight_total + values.notna() * weight
        # 0 / 0 yields NaN here, marking "no projection available".
        combined[f"{stat}_proj"] = weighted_sum / weight_total
    return combined
# Example usage: steamer_df, zips_df and pecota_df must be supplied by the
# caller, one projection table per system.
projections = dict(
    steamer=steamer_df,
    zips=zips_df,
    pecota=pecota_df,
)
weights = dict(steamer=0.4, zips=0.35, pecota=0.25)
combined = aggregate_projections(projections, weights)
MLB Spray Chart Generator
Generate spray charts from batted ball data.
import matplotlib.pyplot as plt
import numpy as np
def create_spray_chart(batted_balls_df, player_name):
    """Draw a spray chart of one batter's batted balls.

    Args:
        batted_balls_df: DataFrame with ``batter_name``, ``events``,
            ``hc_x`` and ``hc_y``.
        player_name: Value of ``batter_name`` to plot.

    Returns:
        Tuple ``(fig, ax)`` of the created matplotlib figure and axes.
    """
    hits = batted_balls_df[batted_balls_df["batter_name"] == player_name]
    fig, ax = plt.subplots(figsize=(10, 10))

    # Field outline: an outfield arc plus the two foul lines from the origin.
    fence_radius = 400
    right_angle = np.pi / 4
    left_angle = 3 * np.pi / 4
    arc = np.linspace(right_angle, left_angle, 100)
    ax.plot(fence_radius * np.cos(arc), fence_radius * np.sin(arc), "k-", lw=2)
    for foul_angle in (right_angle, left_angle):
        ax.plot(
            [0, fence_radius * np.cos(foul_angle)],
            [0, fence_radius * np.sin(foul_angle)],
            "k-", lw=2,
        )

    # One scatter series per event type so each gets a legend entry.
    event_colors = {"single": "blue", "double": "green", "triple": "orange", "home_run": "red", "out": "gray"}
    for event_name, color in event_colors.items():
        points = hits[hits["events"] == event_name]
        ax.scatter(points["hc_x"], points["hc_y"], c=color, s=30, alpha=0.6, label=event_name)

    ax.set_xlim(-250, 250)
    ax.set_ylim(-50, 450)
    ax.set_title(f"{player_name} Spray Chart")
    ax.legend()
    return fig, ax
NBA Shot Zones Analysis
Analyze shooting efficiency by court zones.
import pandas as pd
import numpy as np
def classify_shot_zone(x, y):
    """Classify a shot location into a named court zone.

    Zones are tested in order, so the first matching rule wins:
    restricted area (under 4 from the origin), paint (under 8), corner
    three (``y <= 7.8`` and ``|x| > 22``), above-the-break three (beyond
    23.75), mid-range (under 16), otherwise long two.

    Args:
        x: Horizontal shot coordinate relative to the basket.
        y: Vertical shot coordinate relative to the basket.

    Returns:
        One of ``"Restricted Area"``, ``"Paint"``, ``"Corner 3"``,
        ``"Above Break 3"``, ``"Mid-Range"`` or ``"Long 2"``.
    """
    distance = np.sqrt(x**2 + y**2)

    if distance < 4:
        return "Restricted Area"
    if distance < 8:
        return "Paint"
    if y <= 7.8 and abs(x) > 22:
        return "Corner 3"
    if distance > 23.75:
        return "Above Break 3"
    if distance < 16:
        return "Mid-Range"
    return "Long 2"
def zone_efficiency(shots_df):
    """Calculate makes, attempts and field-goal percentage by court zone.

    Args:
        shots_df: DataFrame with ``loc_x``, ``loc_y`` and ``is_made``.

    Returns:
        DataFrame indexed by zone. The columns are a two-level index under
        ``is_made`` whose second level is renamed to ``makes``,
        ``attempts`` and ``fg_pct``.
    """
    def _zone_of(shot):
        return classify_shot_zone(shot["loc_x"], shot["loc_y"])

    tagged = shots_df.copy()
    tagged["zone"] = tagged.apply(_zone_of, axis=1)

    summary = tagged.groupby("zone").agg({"is_made": ["sum", "count", "mean"]})
    friendly_names = {"sum": "makes", "count": "attempts", "mean": "fg_pct"}
    return summary.rename(columns=friendly_names)
# Example: shots_df must be supplied by the caller.
print(zone_efficiency(shots_df=shots_df))
Batter vs Pitcher Matchup
Analyze batter vs pitcher historical matchups.
import pandas as pd
import numpy as np
def analyze_matchup(pa_df, batter_id, pitcher_id, min_pa=10):
    """Summarize the plate appearances between one batter and one pitcher.

    Args:
        pa_df: One row per plate appearance with ``batter_id``,
            ``pitcher_id``, ``is_ab``, ``is_hit``, ``event`` and
            ``woba_value``.
        batter_id: Batter to select.
        pitcher_id: Pitcher to select.
        min_pa: Minimum plate appearances needed for a full summary.

    Returns:
        With fewer than ``min_pa`` plate appearances, a dict holding only
        ``sufficient_data=False`` and ``pa``. Otherwise a dict with counts
        (pa, ab, hits, hr, so, bb), ``avg`` (0 when there are no at-bats),
        mean ``woba`` and ``sufficient_data=True``.
    """
    same_batter = pa_df["batter_id"] == batter_id
    same_pitcher = pa_df["pitcher_id"] == pitcher_id
    faced = pa_df[same_batter & same_pitcher]

    n_pa = len(faced)
    if n_pa < min_pa:
        return {"sufficient_data": False, "pa": n_pa}

    at_bats = faced["is_ab"].sum()
    hits = faced["is_hit"].sum()
    events = faced["event"]

    return {
        "pa": n_pa,
        "ab": at_bats,
        "hits": hits,
        "hr": (events == "home_run").sum(),
        "so": (events == "strikeout").sum(),
        "bb": (events == "walk").sum(),
        "avg": hits / at_bats if at_bats > 0 else 0,
        "woba": faced["woba_value"].mean(),
        "sufficient_data": True,
    }
def matchup_projection(batter_stats, pitcher_stats, batter_vs_pitcher=None, pa_weight=30):
    """Project wOBA for a batter-pitcher matchup.

    The baseline is the average of the batter's ``woba`` and the pitcher's
    ``woba_against``. When head-to-head history with
    ``sufficient_data`` is supplied, its wOBA is blended in with weight
    ``pa / (pa + pa_weight)``.

    Args:
        batter_stats: Mapping with ``woba``.
        pitcher_stats: Mapping with ``woba_against``.
        batter_vs_pitcher: Optional summary dict with ``sufficient_data``,
            ``pa`` and ``woba``.
        pa_weight: Plate appearances at which history gets half the weight.

    Returns:
        The projected wOBA.
    """
    combined_woba = batter_stats["woba"] + pitcher_stats["woba_against"]

    has_history = batter_vs_pitcher and batter_vs_pitcher.get("sufficient_data")
    if not has_history:
        return combined_woba / 2

    history_pa = batter_vs_pitcher["pa"]
    history_weight = history_pa / (history_pa + pa_weight)
    return (
        history_weight * batter_vs_pitcher["woba"]
        + (1 - history_weight) * combined_woba / 2
    )
Run Expectancy Matrix
Calculate run expectancy by base-out state.
import pandas as pd
import numpy as np
def calculate_re_matrix(pbp_df):
    """Calculate a run expectancy matrix from play-by-play data.

    Plays are grouped by base-out state (``third``, ``second``, ``first``,
    ``outs``) and the mean of ``runs_to_end_inning`` is taken per state.
    Grouping uses the numeric columns directly rather than string keys, so
    the result does not depend on whether the flags are stored as ints,
    floats or booleans.

    Args:
        pbp_df: One row per play with ``outs`` (0-2), base-occupancy flags
            ``first``, ``second``, ``third`` (0/1) and
            ``runs_to_end_inning``. Rows with a missing state value are
            ignored.

    Returns:
        DataFrame indexed by ``(third, second, first)`` with one column
        per out count. As with any ``pivot_table``, base states or out
        counts with no observed plays are omitted.
    """
    key_cols = ["third", "second", "first", "outs"]

    # All 24 base-out states (3 out counts x 2^3 base configurations).
    all_states = pd.DataFrame([
        {"outs": outs, "first": first, "second": second, "third": third}
        for outs in range(3)
        for first in (0, 1)
        for second in (0, 1)
        for third in (0, 1)
    ])

    # Normalize the state columns to integers so 1, 1.0 and True all
    # describe the same state.
    plays = pbp_df[key_cols + ["runs_to_end_inning"]].dropna(subset=key_cols)
    plays = plays.astype({col: "int64" for col in key_cols})

    observed = (
        plays.groupby(key_cols)["runs_to_end_inning"]
        .mean()
        .rename("run_expectancy")
        .reset_index()
    )

    re_df = all_states.merge(observed, on=key_cols, how="left")
    return re_df.pivot_table(
        index=["third", "second", "first"],
        columns="outs",
        values="run_expectancy"
    )
def calculate_re24(pbp_df, re_matrix):
    """Calculate RE24 (run expectancy change plus runs scored) per play.

    Args:
        pbp_df: One row per play with ``state_before``, ``state_after`` and
            ``runs_on_play``. States are strings of the form
            ``"{third}{second}{first}_{outs}"``.
        re_matrix: Either a mapping from state string to run expectancy,
            or a DataFrame indexed by ``(third, second, first)`` with one
            column per out count (the layout ``calculate_re_matrix``
            returns). States missing from the matrix, such as the
            three-out end of an inning, count as 0.

    Returns:
        A copy of ``pbp_df`` with ``re_before``, ``re_after`` and ``re24``.
    """
    if isinstance(re_matrix, pd.DataFrame):
        # Flatten the pivoted layout into state-string keys; looking a
        # state string up on the DataFrame itself would search its column
        # labels and silently return the default for every play.
        lookup = {}
        for bases, by_outs in re_matrix.iterrows():
            third, second, first = bases
            for outs, value in by_outs.items():
                if pd.notna(value):
                    lookup[f"{third}{second}{first}_{outs}"] = value
    else:
        lookup = re_matrix

    pbp = pbp_df.copy()
    pbp["re_before"] = pbp["state_before"].map(lambda state: lookup.get(state, 0))
    pbp["re_after"] = pbp["state_after"].map(lambda state: lookup.get(state, 0))

    # RE24 = runs scored + (RE after - RE before)
    pbp["re24"] = pbp["runs_on_play"] + (pbp["re_after"] - pbp["re_before"])
    return pbp
Quality Start Probability
Predict probability of a quality start.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
def train_qs_model(historical_starts_df):
    """Train a logistic-regression model that predicts quality starts.

    A start is labelled a quality start when ``ip >= 6`` and ``er <= 3``.
    Features are standardized before fitting.

    Args:
        historical_starts_df: One row per past start with the feature
            columns listed below plus ``ip`` and ``er``.

    Returns:
        Tuple ``(model, scaler, features)``: the fitted classifier, the
        fitted scaler and the ordered list of feature names.
    """
    feature_names = [
        "season_era", "season_fip", "season_k9", "season_bb9",
        "last_3_era", "home_game", "opp_wrc_plus",
        "rest_days", "park_factor",
    ]

    went_deep = historical_starts_df["ip"] >= 6
    limited_runs = historical_starts_df["er"] <= 3
    is_quality_start = went_deep & limited_runs

    standardizer = StandardScaler()
    scaled_features = standardizer.fit_transform(historical_starts_df[feature_names])

    classifier = LogisticRegression()
    classifier.fit(scaled_features, is_quality_start)
    return classifier, standardizer, feature_names
def predict_qs_probability(model, scaler, features, game_data):
    """Predict the quality-start probability for an upcoming start.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted scaler exposing ``transform``.
        features: Feature column names, in training order.
        game_data: Table containing the feature columns; only the first
            row's prediction is returned.

    Returns:
        The second-column probability from ``predict_proba`` for the
        first row.
    """
    scaled_row = scaler.transform(game_data[features])
    class_probabilities = model.predict_proba(scaled_row)
    return class_probabilities[0, 1]
Plate Discipline Metrics
Calculate plate discipline statistics.
import pandas as pd
def calculate_plate_discipline(pitch_df, batter_id):
    """Calculate plate discipline metrics for one batter.

    Args:
        pitch_df: One row per pitch with ``batter_id`` and the flags
            ``in_zone``, ``swing`` and ``contact`` (booleans or 0/1,
            assumed to have no missing values).
        batter_id: Batter to select.

    Returns:
        Dict with the pitch count and percentages (0-100): zone, swing,
        in-zone swing, out-of-zone swing, contact per swing (overall, in
        zone, out of zone) and swinging-strike rate per pitch. Any
        percentage whose denominator is zero is reported as 0, including
        for a batter with no pitches.
    """
    pitches = pitch_df[pitch_df["batter_id"] == batter_id]

    # Coerce to bool so 0/1 integer flags work as masks and with `~`.
    in_zone = pitches["in_zone"].astype(bool)
    swung = pitches["swing"].astype(bool)
    made_contact = pitches["contact"].astype(bool)

    def _pct(numerator, denominator):
        return numerator / denominator * 100 if denominator > 0 else 0

    total = len(pitches)
    zone_count = int(in_zone.sum())
    out_count = total - zone_count

    swings = int(swung.sum())
    swings_zone = int((swung & in_zone).sum())
    swings_out = int((swung & ~in_zone).sum())

    # Contact is only counted on pitches the batter swung at.
    contact = int((swung & made_contact).sum())
    contact_zone = int((swung & made_contact & in_zone).sum())
    contact_out = int((swung & made_contact & ~in_zone).sum())

    return {
        "pitches": total,
        "zone_pct": _pct(zone_count, total),
        "swing_pct": _pct(swings, total),
        "z_swing_pct": _pct(swings_zone, zone_count),
        "o_swing_pct": _pct(swings_out, out_count),
        "contact_pct": _pct(contact, swings),
        "z_contact_pct": _pct(contact_zone, swings_zone),
        "o_contact_pct": _pct(contact_out, swings_out),
        "swstr_pct": _pct(swings - contact, total),  # Swinging strike %
    }
Exit Velocity Analysis
Analyze batted ball exit velocities.
import pandas as pd
import numpy as np
def analyze_exit_velocity(batted_balls_df, player_id):
    """Summarize exit velocity and launch angle for one batter.

    Args:
        batted_balls_df: One row per batted ball with ``batter_id``,
            ``launch_speed`` and ``launch_angle``.
        player_id: Batter to select.

    Returns:
        Dict with the batted-ball count, exit-velocity summary (mean, max,
        50th and 90th percentile), hard-hit % (95+), barrel % (here: 98+
        exit velocity with a 26-30 degree launch angle), sweet-spot %
        (8-32 degrees), mean launch angle, ``gb_pct`` (angle below 10) and
        ``fb_pct`` (angle above 25).
    """
    balls = batted_balls_df[batted_balls_df["batter_id"] == player_id]
    speed = balls["launch_speed"]
    angle = balls["launch_angle"]
    n_balls = len(balls)

    def _share(mask):
        # Percentage of this batter's batted balls matching the mask.
        return int(mask.sum()) / n_balls * 100 if n_balls > 0 else 0

    is_barrel = (speed >= 98) & (angle >= 26) & (angle <= 30)
    in_sweet_spot = (angle >= 8) & (angle <= 32)

    return {
        "batted_balls": n_balls,
        "avg_ev": speed.mean(),
        "max_ev": speed.max(),
        "ev_50th": speed.quantile(0.5),
        "ev_90th": speed.quantile(0.9),
        "hard_hit_pct": (speed >= 95).mean() * 100,
        "barrel_pct": _share(is_barrel),
        "sweet_spot_pct": _share(in_sweet_spot),
        "avg_la": angle.mean(),
        "gb_pct": (angle < 10).mean() * 100,
        "fb_pct": (angle > 25).mean() * 100,
    }
Soccer Pass Network Analysis
Analyze passing networks and identify key players in soccer.
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
def build_pass_network(events_df, team_id, match_id):
    """Build a directed, weighted passing graph for one team in one match.

    Args:
        events_df: Event rows with ``team_id``, ``match_id``,
            ``event_type``, ``pass_outcome``, ``player_id`` and
            ``pass_recipient_id``.
        team_id: Team whose passes are used.
        match_id: Match to restrict to.

    Returns:
        Tuple ``(G, passes)``: a ``networkx.DiGraph`` whose edge ``weight``
        is the number of successful passes from passer to receiver, and
        the filtered pass rows it was built from.
    """
    is_team = events_df["team_id"] == team_id
    is_match = events_df["match_id"] == match_id
    is_pass = events_df["event_type"] == "pass"
    is_completed = events_df["pass_outcome"] == "successful"
    passes = events_df[is_team & is_match & is_pass & is_completed]

    G = nx.DiGraph()
    for _, completed in passes.iterrows():
        source = completed["player_id"]
        target = completed["pass_recipient_id"]
        # Start a new edge at weight 1, otherwise bump the existing count.
        if not G.has_edge(source, target):
            G.add_edge(source, target, weight=1)
        else:
            G[source][target]["weight"] += 1

    return G, passes
def calculate_network_metrics(G, players_df):
    """Calculate centrality metrics for every player in a passing graph.

    Args:
        G: Passing graph with a ``weight`` attribute on its edges.
        players_df: DataFrame with ``player_id``, ``name`` and
            ``position``; players missing from it are dropped by the
            inner merge.

    Returns:
        DataFrame with ``player_id``, ``degree``, ``betweenness``,
        ``pagerank``, ``closeness``, ``name`` and ``position``, sorted by
        ``pagerank`` descending.
    """
    centrality = {
        # How connected a player is.
        "degree": nx.degree_centrality(G),
        # How often a player sits on passing chains between others.
        "betweenness": nx.betweenness_centrality(G, weight="weight"),
        # Importance weighted by who passes to the player.
        "pagerank": nx.pagerank(G, weight="weight"),
        # How quickly a player can reach the others.
        "closeness": nx.closeness_centrality(G),
    }

    table = pd.DataFrame(centrality).rename_axis("player_id").reset_index()
    roster = players_df[["player_id", "name", "position"]]
    return table.merge(roster, on="player_id").sort_values("pagerank", ascending=False)
def visualize_pass_network(G, players_df, avg_positions):
    """Draw a passing graph on a 120 x 80 pitch.

    Args:
        G: Passing graph with a ``weight`` attribute on its edges.
        players_df: DataFrame with ``player_id`` and ``name``; the last
            word of each name is used as the node label.
        avg_positions: DataFrame with ``player_id``, ``avg_x`` and
            ``avg_y`` giving each node's location.

    Returns:
        Tuple ``(fig, ax)`` of the created matplotlib figure and axes.
    """
    fig, ax = plt.subplots(figsize=(14, 9))

    # Pitch background
    ax.set_xlim(0, 120)
    ax.set_ylim(0, 80)
    ax.set_facecolor("#228B22")

    # Nodes sit at each player's average position.
    layout = {}
    for _, spot in avg_positions.iterrows():
        layout[spot["player_id"]] = (spot["avg_x"], spot["avg_y"])

    # Node size scales with degree, edge width with pass count.
    degree_of = dict(G.degree())
    sizes = [degree_of.get(node, 1) * 100 for node in G.nodes()]
    widths = [G[src][dst]["weight"] / 5 for src, dst in G.edges()]

    nx.draw_networkx_nodes(G, layout, node_size=sizes,
                           node_color="white", edgecolors="black", ax=ax)
    nx.draw_networkx_edges(G, layout, width=widths,
                           edge_color="white", alpha=0.6,
                           arrows=True, arrowsize=15, ax=ax)

    surnames = {}
    for _, player in players_df.iterrows():
        if player["player_id"] in G.nodes():
            surnames[player["player_id"]] = player["name"].split()[-1]
    nx.draw_networkx_labels(G, layout, surnames, font_size=8, font_color="black", ax=ax)

    ax.set_title("Team Passing Network", fontsize=14)
    return fig, ax
# Build and analyze the network; events_df and players_df must be supplied
# by the caller.
G, passes = build_pass_network(events_df, 1, 12345)
metrics = calculate_network_metrics(G, players_df)
print("Most Central Players:")
print(metrics.loc[:, ["name", "position", "pagerank", "betweenness"]].head(10))
Hockey Corsi and Fenwick
Calculate Corsi and Fenwick shot attempt metrics for hockey.
import pandas as pd
import numpy as np
def calculate_corsi_fenwick(events_df, player_id):
    """Calculate on-ice Corsi and Fenwick shot-attempt counts for a player.

    Corsi counts goals, shots, missed shots and blocked shots; Fenwick
    excludes blocked shots. An event is "for" when its ``team_id`` matches
    the team on the player's first own event in ``events_df``.

    Args:
        events_df: Event rows with ``players_on_ice`` (a container of
            player ids), ``player_id``, ``team_id``, ``event_type`` and
            ``event_time`` (treated as seconds).
        player_id: Player to evaluate; must appear in ``player_id`` at
            least once.

    Returns:
        Dict with CF, CA, CF%, FF, FA, FF% (percentages default to 50 with
        no attempts), ``CF_rel`` (Corsi differential per 60 minutes) and
        ``TOI`` in minutes, estimated as the span between the player's
        first and last on-ice event.
    """
    corsi_types = ["goal", "shot", "missed_shot", "blocked_shot"]
    fenwick_types = ["goal", "shot", "missed_shot"]

    was_on_ice = events_df["players_on_ice"].apply(lambda skaters: player_id in skaters)
    shifts = events_df[was_on_ice]

    own_team = events_df[events_df["player_id"] == player_id]["team_id"].iloc[0]
    by_own_team = shifts["team_id"] == own_team
    by_opponent = shifts["team_id"] != own_team

    is_corsi = shifts["event_type"].isin(corsi_types)
    is_fenwick = shifts["event_type"].isin(fenwick_types)

    cf = int((is_corsi & by_own_team).sum())
    ca = int((is_corsi & by_opponent).sum())
    ff = int((is_fenwick & by_own_team).sum())
    fa = int((is_fenwick & by_opponent).sum())

    span_seconds = shifts["event_time"].max() - shifts["event_time"].min()
    toi_minutes = span_seconds / 60

    corsi_total = cf + ca
    fenwick_total = ff + fa
    return {
        "player_id": player_id,
        "CF": cf,
        "CA": ca,
        "CF%": cf / corsi_total * 100 if corsi_total > 0 else 50,
        "CF_rel": (cf - ca) / toi_minutes * 60 if toi_minutes > 0 else 0,
        "FF": ff,
        "FA": fa,
        "FF%": ff / fenwick_total * 100 if fenwick_total > 0 else 50,
        "TOI": toi_minutes,
    }
def team_corsi_summary(events_df, team_id, game_state="5v5"):
    """Calculate a team's Corsi for/against summary.

    Args:
        events_df: Event rows with ``team_id``, ``event_type``,
            ``home_skaters`` and ``away_skaters``.
        team_id: Team to summarize; every other ``team_id`` in the data
            counts as the opposition.
        game_state: ``"5v5"`` keeps only events with five skaters per
            side; any other value uses all events.

    Returns:
        Dict with ``team_id``, ``game_state``, ``CF``, ``CA``, ``CF%``
        (50 when there are no attempts) and ``shot_diff`` (CF minus CA).
    """
    corsi_types = ["goal", "shot", "missed_shot", "blocked_shot"]

    pool = events_df
    if game_state == "5v5":
        even_strength = (events_df["home_skaters"] == 5) & (events_df["away_skaters"] == 5)
        pool = events_df[even_strength]

    is_attempt = pool["event_type"].isin(corsi_types)
    by_team = pool["team_id"] == team_id
    by_opponent = pool["team_id"] != team_id

    attempts_for = int((is_attempt & by_team).sum())
    attempts_against = int((is_attempt & by_opponent).sum())
    attempts_total = attempts_for + attempts_against

    return {
        "team_id": team_id,
        "game_state": game_state,
        "CF": attempts_for,
        "CA": attempts_against,
        "CF%": attempts_for / attempts_total * 100 if attempts_total > 0 else 50,
        "shot_diff": attempts_for - attempts_against,
    }
# Calculate for all players; events_df must be supplied by the caller.
all_stats = [
    calculate_corsi_fenwick(events_df, player_id)
    for player_id in events_df["player_id"].unique()
]
corsi_df = pd.DataFrame(all_stats)
print("Top Players by CF%:")
print(corsi_df.nlargest(10, "CF%")[["player_id", "CF%", "CF", "CA", "TOI"]])
Tennis Match Statistics
Analyze tennis match statistics and serve patterns.
import pandas as pd
import numpy as np
def analyze_serve_stats(points_df, player_id):
    """Summarize a player's service points.

    Args:
        points_df: Point-level table with ``server_id`` plus the boolean
            columns ``first_serve_in``, ``double_fault``, ``ace``,
            ``break_point`` and ``point_won``.
        player_id: Server whose points are summarized.

    Returns:
        dict of counts and percentages (0-100). Percentages are NaN when the
        underlying subset of points is empty.
    """
    service = points_df[points_df["server_id"] == player_id]
    first_in = service["first_serve_in"]
    missed_first = ~first_in
    # Second serves that actually landed: first serve missed, no double fault
    second_in_play = missed_first & ~service["double_fault"]
    facing_break = service["break_point"]

    def win_pct(mask):
        # Share of the selected service points flagged as won, as a percentage
        return service[mask]["point_won"].mean() * 100

    return {
        "total_service_points": len(service),
        # First serve
        "first_serves_in": first_in.sum(),
        "first_serve_pct": first_in.mean() * 100,
        "first_serve_won": win_pct(first_in),
        # Second serve
        "second_serves": len(service[missed_first]),
        "double_faults": service["double_fault"].sum(),
        "second_serve_won": win_pct(second_in_play),
        # Aces
        "aces": service["ace"].sum(),
        "ace_pct": service["ace"].mean() * 100,
        # Break points
        "break_points_faced": facing_break.sum(),
        "break_points_saved": win_pct(facing_break),
    }
def analyze_return_stats(points_df, player_id):
    """Summarize a player's return points.

    Args:
        points_df: Point-level table with ``returner_id`` plus the boolean
            columns ``first_serve_in``, ``break_point`` and ``point_won``.
        player_id: Returner whose points are summarized.

    Returns:
        dict with the number of return points, win percentages (0-100) split
        by first/second serve, and break-point count and win percentage.
        Percentages are NaN when the relevant subset is empty.

    NOTE(review): ``point_won`` is averaged as-is, exactly as
    ``analyze_serve_stats`` does for the server. If that column is recorded
    from the server's point of view these figures describe the server, not
    the returner - confirm the schema.
    """
    receiving = points_df[points_df["returner_id"] == player_id]
    vs_first = receiving["first_serve_in"]
    on_break_point = receiving["break_point"]

    def won_share(mask):
        # Percentage of the selected return points flagged as won
        return receiving[mask]["point_won"].mean() * 100

    summary = {"total_return_points": len(receiving)}
    summary["first_serve_return_won"] = won_share(vs_first)
    summary["second_serve_return_won"] = won_share(~vs_first)
    summary["break_points_created"] = on_break_point.sum()
    summary["break_points_converted"] = won_share(on_break_point)
    return summary
def rally_analysis(points_df, player_id):
    """Break down a player's results by rally length.

    Every point where the player served or returned is included. Rallies are
    bucketed as short (<= 4 shots), medium (5-8) and long (> 8).

    Args:
        points_df: Point-level table with ``server_id``, ``returner_id``,
            ``rally_length`` and ``point_won`` columns. It is not modified.
        player_id: Player whose points are analyzed.

    Returns:
        dict with ``short_rally_win_pct``, ``medium_rally_win_pct``,
        ``long_rally_win_pct`` (percentages, NaN for an empty bucket) and
        ``avg_rally_length``.

    NOTE(review): ``point_won`` is averaged regardless of whether the player
    was serving or returning - confirm the column is recorded from this
    player's perspective.
    """
    involved = (points_df["server_id"] == player_id) | (points_df["returner_id"] == player_id)
    player_points = points_df[involved]

    # The previous version also wrote an unused "is_server" column onto this
    # filtered slice (triggering SettingWithCopyWarning) and built a grouped
    # table that was never read; both were dead code and have been dropped.
    rally_length = player_points["rally_length"]
    won = player_points["point_won"]

    def win_pct(mask):
        # Percentage of points won inside one rally-length bucket
        return won[mask].mean() * 100

    return {
        "short_rally_win_pct": win_pct(rally_length <= 4),
        "medium_rally_win_pct": win_pct((rally_length > 4) & (rally_length <= 8)),
        "long_rally_win_pct": win_pct(rally_length > 8),
        "avg_rally_length": rally_length.mean(),
    }
# Comprehensive match analysis
def match_summary(points_df, player1_id, player2_id):
    """Build a side-by-side statistical summary for two players.

    For each player the serve, return and rally dictionaries are merged (in
    that order, later keys overriding earlier ones) into a single column.

    Returns:
        DataFrame with one row per statistic and the columns ``"Player 1"``
        and ``"Player 2"``.
    """
    sections = (analyze_serve_stats, analyze_return_stats, rally_analysis)

    def profile(pid):
        # Merge the three per-player dictionaries into one flat mapping
        merged = {}
        for section in sections:
            merged.update(section(points_df, pid))
        return merged

    columns = {}
    columns["Player 1"] = profile(player1_id)
    columns["Player 2"] = profile(player2_id)
    return pd.DataFrame(columns)
# Example: compare players 1 and 2 and print the resulting table.
# `points_df` must already exist in the session; this snippet does not load it.
summary = match_summary(points_df, player1_id=1, player2_id=2)
print(summary)
Sports Linear Mixed Models in R
Fit linear mixed models for hierarchical sports data.
library(lme4)
library(lmerTest)
library(dplyr)
library(ggplot2)

# Hierarchical data: player-seasons, with players nested in teams.
# `player_seasons` must already be loaded and contain war, age, experience,
# team, player_id and season.

# Fit mixed model
# Fixed effects: age (linear + quadratic), experience
# Random intercepts: team, player within team, season
model <- lmer(
  war ~ age + I(age^2) + experience + (1 | team/player_id) + (1 | season),
  data = player_seasons,
  REML = TRUE
)

# Summary
summary(model)

# Extract variance components
VarCorr(model)

# Random effects
re <- ranef(model)
ranef_team <- re$team
# lme4 expands (1 | team/player_id) into (1 | team) plus a nested term that it
# labels "player_id:team", so indexing with `team:player_id` returns NULL.
# Look the name up instead of hard-coding one ordering.
player_term <- intersect(c("player_id:team", "team:player_id"), names(re))[1]
ranef_player <- re[[player_term]]

# Best teams by random effect
team_effects <- data.frame(
  team = rownames(ranef_team),
  effect = ranef_team[[1]]
) %>%
  arrange(desc(effect))

print("Top teams by random effect:")
print(head(team_effects, 10))

# Diagnostic plots
# NOTE(review): plot() on an lmer fit draws a lattice plot, which presumably
# ignores par(mfrow = ...) - confirm whether the par() call is still wanted.
par(mfrow = c(2, 2))
plot(model)

# Predictions for a hypothetical player
newdata <- data.frame(
  age = 28,
  experience = 5,
  team = "Yankees",
  player_id = "new_player",
  season = 2024
)

# Prediction (population average: all random effects set to zero)
predict(model, newdata, re.form = NA)

# Prediction with team effect only
predict(model, newdata, re.form = ~ (1 | team))

# Compare models with likelihood ratio test.
# The reduced model drops the quadratic age term and the season intercept.
model_reduced <- lmer(
  war ~ age + experience + (1 | team/player_id),
  data = player_seasons
)

anova(model_reduced, model)
Sports GAM Models in R
Fit Generalized Additive Models for non-linear relationships.
library(mgcv)
library(ggplot2)
library(dplyr)

# Fit GAM for WAR prediction.
# Smooth terms let WAR vary non-linearly with age, experience and playing time.
# `player_stats` must already be loaded.
gam_model <- gam(
  war ~ s(age, k = 10) +           # Smooth function of age
    s(experience, k = 5) +
    s(plate_appearances, k = 5) +
    position +                      # Categorical
    ti(age, experience, k = 5),     # Tensor interaction
  data = player_stats,
  family = gaussian(),
  method = "REML"
)

# Summary
summary(gam_model)

# Check effective degrees of freedom / basis dimension
gam.check(gam_model)

# Visualize smooth terms
par(mfrow = c(2, 2))
plot(gam_model, pages = 1, shade = TRUE)

# Get partial effects
library(gratia)
draw(gam_model)

# Predict aging curve for a reference player (everything but age held fixed)
age_pred <- data.frame(
  age = 20:40,
  experience = 5,
  plate_appearances = 500,
  position = "OF"
)

# One predict() call returns both the fit and its standard error; the previous
# version ran the same prediction twice.
age_fit <- predict(gam_model, newdata = age_pred, se.fit = TRUE)
age_pred$predicted_war <- as.numeric(age_fit$fit)
age_pred$se <- as.numeric(age_fit$se.fit)

ggplot(age_pred, aes(x = age, y = predicted_war)) +
  geom_ribbon(aes(ymin = predicted_war - 1.96 * se,
                  ymax = predicted_war + 1.96 * se),
              fill = "lightblue", alpha = 0.5) +
  geom_line(color = "blue", size = 1.5) +
  geom_vline(xintercept = 27, linetype = "dashed") +
  labs(title = "GAM Aging Curve",
       x = "Age", y = "Predicted WAR") +
  theme_minimal()

# Compare to linear model
lm_model <- lm(war ~ poly(age, 2) + experience + plate_appearances + position,
               data = player_stats)

AIC(gam_model, lm_model)
Sports Data Imputation in R
Handle missing data in sports datasets using multiple imputation.
library(mice)
library(dplyr)
library(VIM)

# Visualize missing data patterns (`player_stats` must already be loaded)
md.pattern(player_stats)
aggr(player_stats, col = c("navyblue", "red"),
     numbers = TRUE, sortVars = TRUE)

# Multiple imputation with MICE
# Predictive mean matching for numeric, logistic for binary.
# Build the method vector by column NAME: a positional vector must match the
# number and order of columns in `player_stats` exactly, and the old
# five-entry vector had no slot for `war`, which the models below use.
imp_method <- make.method(player_stats)
to_impute <- names(imp_method)[imp_method != ""]   # columns with missing values
imp_method[intersect(to_impute, c("age", "experience", "avg", "hr"))] <- "pmm"
if ("all_star" %in% to_impute) {
  imp_method["all_star"] <- "logreg"               # all_star (binary)
}

imp <- mice(
  player_stats,
  m = 5,                 # Number of imputations
  method = imp_method,
  maxit = 20,
  seed = 42
)

# Check convergence
plot(imp)

# Pool results from imputed datasets
# Fit model on each imputed dataset
model_imp <- with(imp, lm(war ~ age + I(age^2) + avg + hr))

# Pool estimates
pooled <- pool(model_imp)
summary(pooled)

# Get complete datasets (long format, original data included as .imp == 0)
complete_data <- complete(imp, action = "long", include = TRUE)

# Compare distributions
library(lattice)
densityplot(imp, ~ avg | .imp)

# Sensitivity analysis: compare results across imputations.
# Iterate over however many imputations were actually created.
results <- sapply(seq_len(imp$m), function(i) {
  df <- complete(imp, i)
  coef(lm(war ~ age + avg + hr, data = df))
})

# Show variation across imputations
print(round(results, 4))
Websocket Live Data Feed
Connect to live sports data websocket feeds.
import asyncio
import websockets
import json
from datetime import datetime
from collections import deque
class LiveDataFeed:
    """Receive JSON messages from a websocket feed and hand them to a callback.

    The most recent 1000 decoded messages are kept in ``self.messages`` as
    ``{"timestamp": datetime, "data": decoded_json}`` dicts.
    """

    def __init__(self, url, on_message_callback=None):
        """
        Args:
            url: Websocket URL to connect to.
            on_message_callback: Called with each decoded message. May be a
                coroutine function or a plain function; defaults to
                ``default_handler``.
        """
        self.url = url
        self.callback = on_message_callback or self.default_handler
        self.messages = deque(maxlen=1000)
        self.connected = False

    async def connect(self):
        """Connect and process messages until the server closes the socket."""
        try:
            async with websockets.connect(self.url) as ws:
                self.connected = True
                print(f"Connected to {self.url}")
                while True:
                    try:
                        message = await ws.recv()
                    except websockets.ConnectionClosed:
                        print("Connection closed")
                        break
                    await self._handle_message(message)
        finally:
            # Also reset the flag when connect() exits through an exception
            self.connected = False

    async def _handle_message(self, message):
        """Decode one raw frame, store it and dispatch it to the callback."""
        import inspect

        try:
            data = json.loads(message)
        except (TypeError, ValueError) as exc:
            # One malformed frame should not tear down a live feed
            print(f"Skipping malformed message: {exc}")
            return
        self.messages.append({
            "timestamp": datetime.now(),
            "data": data
        })
        outcome = self.callback(data)
        # Plain (non-async) callbacks return a non-awaitable; only await coroutines
        if inspect.isawaitable(outcome):
            await outcome

    async def default_handler(self, data):
        """Default message handler: print the event type and message."""
        if isinstance(data, dict):
            event_type = data.get("type", "unknown")
            body = data.get("message", data)
        else:
            # Valid JSON need not be an object (e.g. a list or a number)
            event_type, body = "unknown", data
        print(f"[{datetime.now()}] {event_type}: {body}")

    def get_recent_messages(self, n=10):
        """Return up to ``n`` of the most recent stored messages, oldest first."""
        if n <= 0:
            # A slice of [-0:] would return everything rather than nothing
            return []
        return list(self.messages)[-n:]
class SportsScoreTracker:
    """Track live sports scores from ``score_update`` feed messages.

    ``self.games`` maps a game id to a dict with ``home_team``, ``away_team``,
    ``home_score`` and ``away_score``.
    """

    def __init__(self):
        self.games = {}
        self.feed = None

    async def on_score_update(self, data):
        """Apply one feed message to the tracked games and print the score.

        Messages whose ``type`` is not ``"score_update"`` are ignored. The
        first message for a game must carry ``home_team`` and ``away_team``.
        """
        if data.get("type") != "score_update":
            return

        game_id = data["game_id"]
        if game_id not in self.games:
            self.games[game_id] = {
                "home_team": data["home_team"],
                "away_team": data["away_team"],
                "home_score": 0,
                "away_score": 0
            }
        game = self.games[game_id]

        # A partial update (only one side's score present) must not reset the
        # other side to 0, so fall back to the score already on record.
        game["home_score"] = data.get("home_score", game["home_score"])
        game["away_score"] = data.get("away_score", game["away_score"])

        print(f"{game['away_team']} {game['away_score']} @ "
              f"{game['home_team']} {game['home_score']}")

    async def start(self, feed_url):
        """Open the feed at ``feed_url`` and process updates until it closes."""
        self.feed = LiveDataFeed(feed_url, self.on_score_update)
        await self.feed.connect()
# Usage
async def main():
    """Track scores from the example feed until the connection closes."""
    feed_url = "wss://live-scores.example.com/feed"
    score_tracker = SportsScoreTracker()
    await score_tracker.start(feed_url)
# asyncio.run(main())
Sports Data Logger
Structured logging for sports analytics pipelines.
import logging
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict
import sys
class SportsDataLogger:
    """Structured logging for sports analytics.

    Each instance logs INFO and above to stdout in a human-readable format and
    DEBUG and above to ``<log_dir>/<name>_<YYYYMMDD>.jsonl`` as JSON lines.
    Structured fields are passed through ``extra`` so the JSON formatter can
    pick them up.
    """

    def __init__(self, name: str, log_dir: str = "./logs"):
        """
        Args:
            name: Logger name; also used as the log file prefix.
            log_dir: Directory for the JSON-lines file. Created, including
                missing parent directories, if it does not exist.
        """
        self.name = name
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger(name)
        self.logger.setLevel(logging.DEBUG)

        # logging.getLogger() returns the same object for the same name, so a
        # second SportsDataLogger(name) would stack another pair of handlers
        # and every record would be written twice. Attach them only once.
        if getattr(self.logger, "_sports_handlers_attached", False):
            return

        # Console handler
        console = logging.StreamHandler(sys.stdout)
        console.setLevel(logging.INFO)
        console.setFormatter(logging.Formatter(
            "%(asctime)s | %(levelname)s | %(message)s"
        ))
        self.logger.addHandler(console)

        # File handler (JSON lines)
        file_path = self.log_dir / f"{name}_{datetime.now():%Y%m%d}.jsonl"
        file_handler = logging.FileHandler(file_path)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(JsonFormatter())
        self.logger.addHandler(file_handler)

        self.logger._sports_handlers_attached = True

    def log_data_load(self, source: str, records: int, duration: float):
        """Log a data loading operation at INFO level."""
        self.logger.info(
            f"Loaded {records} records from {source}",
            extra={
                "event_type": "data_load",
                "source": source,
                "records": records,
                "duration_seconds": duration
            }
        )

    def log_model_train(self, model_name: str, metrics: Dict[str, float]):
        """Log a completed model training run and its metrics at INFO level."""
        self.logger.info(
            f"Trained model {model_name}",
            extra={
                "event_type": "model_train",
                "model": model_name,
                "metrics": metrics
            }
        )

    def log_prediction(self, model: str, input_data: Dict, prediction: Any):
        """Log a single prediction at DEBUG level (JSON file only)."""
        self.logger.debug(
            f"Prediction from {model}",
            extra={
                "event_type": "prediction",
                "model": model,
                "input": input_data,
                "prediction": prediction
            }
        )

    def log_error(self, error: Exception, context: Dict = None):
        """Log ``error`` with its traceback and optional context at ERROR level."""
        self.logger.error(
            str(error),
            extra={
                "event_type": "error",
                "error_type": type(error).__name__,
                "context": context or {}
            },
            # Pass the exception itself: exc_info=True reads sys.exc_info(),
            # which is empty when this is called outside an except block.
            exc_info=error
        )
class JsonFormatter(logging.Formatter):
    """Format log records as single-line JSON objects."""

    # Structured fields copied from the record when present (set via `extra=`)
    EXTRA_KEYS = ("event_type", "source", "records", "duration_seconds",
                  "model", "metrics", "input", "prediction", "error_type", "context")

    def format(self, record):
        """Return ``record`` serialized as a JSON string.

        The output always has ``timestamp`` (UTC, ISO 8601, no offset suffix),
        ``level``, ``logger`` and ``message``; known extra fields are added
        when set, and ``exception`` holds the formatted traceback when the
        record carries exception info.
        """
        from datetime import timezone

        # Use the time the record was created, not the time it is formatted;
        # the offset is stripped to keep the original naive-UTC text format.
        created_utc = datetime.fromtimestamp(record.created, tz=timezone.utc)
        log_data = {
            "timestamp": created_utc.replace(tzinfo=None).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage()
        }

        # Add extra fields
        for key in self.EXTRA_KEYS:
            if hasattr(record, key):
                log_data[key] = getattr(record, key)

        if record.exc_info and record.exc_info[0] is not None:
            log_data["exception"] = self.formatException(record.exc_info)

        # Extras can hold arbitrary objects (numpy scalars, datetimes, ...).
        # Stringify those deliberately: an exception raised here would be
        # swallowed by the logging machinery and the record lost.
        return json.dumps(log_data, default=str)
# Usage: create a logger named "mlb_analytics" (log_dir defaults to ./logs)
logger = SportsDataLogger("mlb_analytics")
# Log data operations: time the load and report the record count and duration
import time
start = time.time()
# ... load data ...
logger.log_data_load("statcast_api", records=50000, duration=time.time() - start)
# Log model training together with its evaluation metrics
logger.log_model_train("war_predictor", {
    "rmse": 0.85,
    "r2": 0.78,
    "mae": 0.62
})
Sports Alert System
Generate alerts for notable sports events and statistics.
from dataclasses import dataclass
from datetime import datetime
from typing import List, Callable, Dict, Any
from enum import Enum
import json
class AlertPriority(Enum):
    """Alert urgency levels; a larger value means a more urgent alert."""

    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CRITICAL = 4
@dataclass
class Alert:
    """A single alert produced when a rule fires."""

    id: str                  # unique identifier for this alert
    title: str               # short headline
    message: str             # human-readable detail text
    priority: AlertPriority  # urgency level
    timestamp: datetime      # when the alert was created
    data: Dict[str, Any]     # the input data the rule was evaluated against
    category: str            # free-form grouping label
class AlertRule:
    """A named condition that turns input data into an ``Alert`` when it fires."""

    def __init__(self, name: str, condition: Callable, priority: AlertPriority, category: str):
        """
        Args:
            name: Rule name; used in alert ids and as the fallback title.
            condition: Callable taking the data dict. A falsy return means
                "no alert"; a dict return may supply ``title`` and
                ``message``; any other truthy return fires the alert with
                the default title and an empty message.
            priority: Priority assigned to alerts from this rule.
            category: Category assigned to alerts from this rule.
        """
        self.name = name
        self.condition = condition
        self.priority = priority
        self.category = category

    def check(self, data: Dict) -> "Alert | None":
        """Evaluate the rule; return an ``Alert`` if it fires, else ``None``."""
        result = self.condition(data)
        if not result:
            return None

        # Conditions may simply return True; only dicts carry title/message
        details = result if isinstance(result, dict) else {}
        # Take the clock once so the id and the timestamp field agree
        now = datetime.now()
        return Alert(
            id=f"{self.name}_{now.timestamp()}",
            title=details.get("title", self.name),
            message=details.get("message", ""),
            priority=self.priority,
            timestamp=now,
            data=data,
            category=self.category
        )
class SportsAlertSystem:
    """Evaluate registered rules against data and notify registered handlers.

    Every alert that fires is kept in ``self.alerts``.
    """

    def __init__(self):
        self.rules: List[AlertRule] = []
        self.alerts: List[Alert] = []
        self.handlers: List[Callable] = []

    def add_rule(self, rule: AlertRule):
        """Register a rule; rules are evaluated in registration order."""
        self.rules.append(rule)

    def add_handler(self, handler: Callable):
        """Register a notification callable that receives each fired alert."""
        self.handlers.append(handler)

    def check_all(self, data: Dict) -> List[Alert]:
        """Run every rule on ``data`` and return the alerts that fired."""
        fired = []
        for rule in self.rules:
            alert = rule.check(data)
            if not alert:
                continue
            fired.append(alert)
            self.alerts.append(alert)
            self._dispatch(alert)
        return fired

    def _dispatch(self, alert: Alert):
        """Send ``alert`` to each handler; a failing handler is reported and skipped."""
        for notify in self.handlers:
            try:
                notify(alert)
            except Exception as e:
                # Best effort: one broken handler must not block the others
                print(f"Handler error: {e}")
# Example rules
def milestone_rule(data):
    """Fire when ``career_hr`` equals one of the listed home-run milestones.

    Returns a dict with ``title`` and ``message`` when the rule fires (which
    then requires ``player_name`` in ``data``), otherwise ``None``.
    """
    milestones = (500, 600, 700, 714, 755, 762)
    if data.get("career_hr", 0) not in milestones:
        return None
    return {
        "title": "Career HR Milestone!",
        "message": f"{data['player_name']} hit career HR #{data['career_hr']}"
    }
def injury_rule(data):
    """Fire when ``injury_status`` is ``"out"``.

    Returns a dict with ``title`` and ``message`` when the rule fires (which
    then requires ``player_name`` in ``data``), otherwise ``None``. A missing
    ``injury_type`` is reported as ``Unknown``.
    """
    if data.get("injury_status") != "out":
        return None
    injury = data.get('injury_type', 'Unknown')
    return {
        "title": "Player Injury",
        "message": f"{data['player_name']} placed on IL: {injury}"
    }
def blowout_rule(data):
    """Fire when the score margin is at least 15 in the 7th inning or later.

    Missing scores and a missing inning are treated as 0. Returns a dict with
    ``title`` and ``message`` when the rule fires (which then requires
    ``home_team`` and ``away_team`` in ``data``), otherwise ``None``.
    """
    home = data.get("home_score", 0)
    away = data.get("away_score", 0)
    margin = abs(home - away)
    late_enough = data.get("inning", 0) >= 7
    if not (margin >= 15 and late_enough):
        return None
    return {
        "title": "Blowout Alert",
        "message": f"Large margin ({margin}) in {data['away_team']} @ {data['home_team']}"
    }
# Setup system: register the three example rules with their priority and category
alert_system = SportsAlertSystem()
alert_system.add_rule(AlertRule("milestone", milestone_rule, AlertPriority.HIGH, "player"))
alert_system.add_rule(AlertRule("injury", injury_rule, AlertPriority.CRITICAL, "health"))
alert_system.add_rule(AlertRule("blowout", blowout_rule, AlertPriority.LOW, "game"))
# Add handlers: print every fired alert as "[PRIORITY] title: message"
alert_system.add_handler(lambda a: print(f"[{a.priority.name}] {a.title}: {a.message}"))
# Check data: a career_hr of 700 is one of the values milestone_rule looks for
alerts = alert_system.check_all({
    "player_name": "Albert Pujols",
    "career_hr": 700
})
WAR Components Breakdown
Calculate and visualize WAR component breakdown.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def calculate_war_components(player_stats):
    """Calculate individual WAR components with simplified constants.

    Args:
        player_stats: DataFrame with ``woba``, ``pa``, ``stolen_bases``,
            ``caught_stealing``, ``extra_bases_taken``, ``position`` and
            ``games`` columns, and optionally ``uzr``. It is not modified.

    Returns:
        A copy of ``player_stats`` with the run-valued columns
        ``batting_runs``, ``baserunning_runs``, ``fielding_runs``,
        ``position_adj``, ``league_adj`` and ``replacement_runs`` added,
        plus ``war`` (their sum divided by runs per win).
    """
    stats = player_stats.copy()

    # Constants (simplified)
    RUNS_PER_WIN = 10
    LG_WOBA = 0.320
    WOBA_SCALE = 1.25

    # Batting Runs (wRAA)
    stats["batting_runs"] = (
        (stats["woba"] - LG_WOBA) / WOBA_SCALE * stats["pa"]
    )

    # Baserunning Runs
    stats["baserunning_runs"] = (
        stats["stolen_bases"] * 0.2 -
        stats["caught_stealing"] * 0.4 +
        stats["extra_bases_taken"] * 0.15
    )

    # Fielding Runs (using UZR when available). A missing UZR value counts as
    # 0 runs so it does not turn the player's whole WAR into NaN.
    if "uzr" in stats.columns:
        stats["fielding_runs"] = stats["uzr"].fillna(0)
    else:
        stats["fielding_runs"] = 0

    # Positional Adjustment (runs per 162 games)
    position_adj = {
        "C": 12.5, "SS": 7.5, "2B": 2.5, "CF": 2.5, "3B": 2.5,
        "RF": -7.5, "LF": -7.5, "1B": -12.5, "DH": -17.5
    }
    # Positions missing from the table (e.g. "P", "OF") get no adjustment
    # rather than NaN, which would otherwise propagate into `war`.
    stats["position_adj"] = (
        stats["position"].map(position_adj).fillna(0) * stats["games"] / 162
    )

    # League Adjustment (simplified)
    stats["league_adj"] = 0

    # Replacement Level (about 20 runs per 600 PA)
    stats["replacement_runs"] = stats["pa"] / 600 * 20

    # Total WAR
    run_columns = ["batting_runs", "baserunning_runs", "fielding_runs",
                   "position_adj", "league_adj", "replacement_runs"]
    stats["war"] = sum(stats[col] for col in run_columns) / RUNS_PER_WIN

    return stats
def visualize_war_breakdown(player_stats, player_name):
    """Visualize WAR components for a player.

    Draws two panels: a horizontal bar chart of each run component, and a
    waterfall that stacks the components and ends in a total bar.

    Args:
        player_stats: DataFrame with a ``name`` column and the run columns
            ``batting_runs``, ``baserunning_runs``, ``fielding_runs``,
            ``position_adj`` and ``replacement_runs``.
        player_name: Value of ``name`` to plot; the first match is used.

    Returns:
        The matplotlib Figure.

    Raises:
        IndexError: If no row has ``name == player_name``.
    """
    matches = player_stats[player_stats["name"] == player_name]
    if matches.empty:
        # Same exception type as the bare .iloc[0] lookup, with a clear message
        raise IndexError(f"No player named {player_name!r} in player_stats")
    player = matches.iloc[0]

    components = {
        "Batting": player["batting_runs"],
        "Baserunning": player["baserunning_runs"],
        "Fielding": player["fielding_runs"],
        "Position": player["position_adj"],
        "Replacement": player["replacement_runs"]
    }
    labels = list(components.keys())
    values = list(components.values())
    runs_per_win = 10

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Bar chart
    colors = ["green" if v >= 0 else "red" for v in values]
    ax1.barh(labels, values, color=colors)
    ax1.axvline(x=0, color="black", linewidth=0.5)
    ax1.set_xlabel("Runs")
    ax1.set_title(f"{player_name} WAR Components")

    # Waterfall chart: each bar starts where the previous one ended
    running_total = 0
    for label, val in zip(labels, values):
        color = "green" if val >= 0 else "red"
        ax2.bar(label, val, bottom=running_total, color=color, edgecolor="black")
        running_total += val

    # Total bar. The axis is in runs, so plot the total in runs (it used to be
    # divided by 10, mixing wins into a runs axis) and annotate the WAR value.
    ax2.bar("Total", running_total, color="navy")
    ax2.text(
        len(labels), running_total, f"{running_total / runs_per_win:.1f} WAR",
        ha="center", va="bottom" if running_total >= 0 else "top"
    )
    ax2.axhline(y=0, color="black", linewidth=0.5)
    ax2.set_ylabel("Runs")
    ax2.set_title(f"{player_name} WAR Waterfall")

    plt.tight_layout()
    return fig
# Calculate and visualize: add the component columns, then plot one player.
# `raw_stats_df` must already exist in the session; it is not loaded here.
player_stats = calculate_war_components(raw_stats_df)
fig = visualize_war_breakdown(player_stats, "Mike Trout")
Rolling Stats Calculator
Calculate various rolling statistics for time series analysis.
import pandas as pd
import numpy as np
from typing import List, Dict
class RollingStatsCalculator:
    """Calculate rolling statistics for sports data.

    The constructor stores a copy of ``df`` sorted by ``date_col`` as
    ``self.df``; every method reads from that sorted copy, so windows run in
    date order while results keep the original index labels.
    """

    def __init__(self, df: pd.DataFrame, date_col: str = "date"):
        self.df = df.sort_values(date_col).copy()
        self.date_col = date_col

    def rolling_avg(self, value_col: str, window: int, min_periods: int = 1) -> pd.Series:
        """Simple rolling average over ``window`` rows."""
        return self.df[value_col].rolling(window=window, min_periods=min_periods).mean()

    def weighted_rolling_avg(self, value_col: str, window: int) -> pd.Series:
        """Exponentially weighted average with ``span=window``."""
        return self.df[value_col].ewm(span=window, adjust=False).mean()

    def rolling_sum(self, value_col: str, window: int) -> pd.Series:
        """Rolling sum; NaN until a full window is available."""
        return self.df[value_col].rolling(window=window).sum()

    def rolling_percentile(self, value_col: str, window: int, percentile: float) -> pd.Series:
        """Rolling quantile; ``percentile`` is a fraction in [0, 1]."""
        return self.df[value_col].rolling(window=window).quantile(percentile)

    def rolling_zscore(self, value_col: str, window: int) -> pd.Series:
        """Rolling z-score (how many std from rolling mean)."""
        windowed = self.df[value_col].rolling(window=window)
        return (self.df[value_col] - windowed.mean()) / windowed.std()

    def calculate_all_rolling(self, value_cols: List[str], windows: List[int]) -> pd.DataFrame:
        """Return a copy of the data with ``<col>_roll<window>_avg/_sum`` columns."""
        result = self.df.copy()
        for col in value_cols:
            for window in windows:
                result[f"{col}_roll{window}_avg"] = self.rolling_avg(col, window)
                result[f"{col}_roll{window}_sum"] = self.rolling_sum(col, window)
        return result

    def pace_adjusted_rolling(self, value_col: str, attempts_col: str, window: int) -> pd.Series:
        """Calculate pace-adjusted rolling (e.g., per-PA or per-100-possessions)."""
        rolling_value = self.df[value_col].rolling(window=window).sum()
        rolling_attempts = self.df[attempts_col].rolling(window=window).sum()
        return rolling_value / rolling_attempts

    @staticmethod
    def _streaks(flags: pd.Series, min_streak: int):
        """Return (running streak length, whole-run flag) for a boolean series."""
        # Each False row starts a new run id, so consecutive True rows share one
        run_id = (~flags).cumsum()
        length = flags.groupby(run_id).cumsum()
        # Final length of the run each row belongs to
        run_total = length.groupby(run_id).transform("max")
        return length, flags & (run_total >= min_streak)

    def hot_cold_streak(self, value_col: str, threshold_pct: float = 0.75, min_streak: int = 5) -> pd.DataFrame:
        """Identify hot and cold streaks.

        A row is "hot" when its value is at or above the ``threshold_pct``
        quantile of the column and "cold" when at or below the
        ``1 - threshold_pct`` quantile.

        Adds these columns to ``self.df`` and returns it:
            ``above_threshold`` / ``below_threshold``: per-row flags.
            ``hot_streak`` / ``cold_streak``: running count of consecutive
                flagged rows (0 on unflagged rows).
            ``is_hot_streak`` / ``is_cold_streak``: True on every row of a
                run that reaches at least ``min_streak`` rows. ``min_streak``
                was previously accepted but ignored.
        """
        threshold_high = self.df[value_col].quantile(threshold_pct)
        threshold_low = self.df[value_col].quantile(1 - threshold_pct)

        above = self.df[value_col] >= threshold_high
        below = self.df[value_col] <= threshold_low
        self.df["above_threshold"] = above
        self.df["below_threshold"] = below

        # Find consecutive streaks
        self.df["hot_streak"], self.df["is_hot_streak"] = self._streaks(above, min_streak)
        self.df["cold_streak"], self.df["is_cold_streak"] = self._streaks(below, min_streak)

        return self.df
# Usage: the calculator works on its own copy of the log, sorted by game_date.
# `game_log_df` must already exist in the session; it is not loaded here.
calc = RollingStatsCalculator(game_log_df, date_col="game_date")
# Add rolling stats (results are assigned back to the original frame by index label)
game_log_df["avg_roll20"] = calc.rolling_avg("batting_avg", window=20)
game_log_df["avg_roll50_ewm"] = calc.weighted_rolling_avg("batting_avg", window=50)
game_log_df["hr_roll30_sum"] = calc.rolling_sum("home_runs", window=30)
game_log_df["avg_zscore"] = calc.rolling_zscore("batting_avg", window=50)
# Pace-adjusted (batting avg = hits/AB): summed hits over summed at-bats per window
game_log_df["avg_roll20_calc"] = calc.pace_adjusted_rolling("hits", "at_bats", window=20)
# Show the 20 most recent rows
print(game_log_df[["game_date", "batting_avg", "avg_roll20", "avg_zscore"]].tail(20))
Sports Percentile Ranking
Calculate percentile rankings across various statistics.
import pandas as pd
import numpy as np
from scipy import stats
def percentile_rank(series: pd.Series) -> pd.Series:
    """Return each value's percentile rank, scaled so the largest value is 100."""
    fractional_rank = series.rank(pct=True)
    return fractional_rank * 100
def percentile_rank_grouped(df: pd.DataFrame, value_col: str, group_col: str) -> pd.Series:
    """Return the percentile rank of ``value_col`` within each ``group_col`` group.

    The result is aligned to ``df``'s index.
    """
    def rank_within_group(values):
        # Fractional rank within one group, expressed as a percentage
        return values.rank(pct=True) * 100

    grouped_values = df.groupby(group_col)[value_col]
    return grouped_values.transform(rank_within_group)
class PlayerPercentileProfile:
    """Percentile comparisons of players against a league-wide stats table."""

    def __init__(self, league_stats_df: pd.DataFrame):
        self.league_stats = league_stats_df

    def calculate_percentiles(self, player_stats: dict) -> dict:
        """Return the league percentile of each stat value, rounded to 0.1.

        Stats that are not columns of the league table are skipped; missing
        league values are ignored.
        """
        league = self.league_stats
        return {
            stat: round(stats.percentileofscore(league[stat].dropna(), value), 1)
            for stat, value in player_stats.items()
            if stat in league.columns
        }

    def create_profile(self, player_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``player_df`` with a ``<stat>_pct`` column per known stat.

        Ranks are computed within ``player_df`` itself. Higher is better for
        every stat except ``k_pct``, whose ranking is reversed.
        """
        tracked = ["avg", "obp", "slg", "hr", "rbi", "sb", "war",
                   "k_pct", "bb_pct", "iso", "babip", "wrc_plus"]
        lower_is_better = ["k_pct"]

        profile = player_df.copy()
        for col in tracked:
            if col not in profile.columns:
                continue
            ascending = col not in lower_is_better
            profile[f"{col}_pct"] = profile[col].rank(pct=True, ascending=ascending) * 100
        return profile

    def compare_to_position(self, player_stats: dict, position: str) -> dict:
        """Compare each stat value with league players at ``position``.

        Returns, per stat found in the league table, the value, the position
        average, the difference, a z-score (0 when the spread is not positive)
        and the percentile among that position's players.
        """
        peers = self.league_stats[self.league_stats["position"] == position]

        comparison = {}
        for stat, value in player_stats.items():
            if stat not in peers.columns:
                continue
            peer_mean = peers[stat].mean()
            peer_std = peers[stat].std()
            if peer_std > 0:
                z_score = (value - peer_mean) / peer_std
            else:
                z_score = 0
            comparison[stat] = {
                "value": value,
                "position_avg": peer_mean,
                "vs_avg": value - peer_mean,
                "z_score": z_score,
                "percentile": stats.percentileofscore(peers[stat].dropna(), value)
            }
        return comparison
def visualize_percentile_profile(percentiles: dict, player_name: str):
    """Draw a horizontal bar chart of a player's percentiles and return the Figure.

    Bars are green at 70 and above, orange from 30 up to 70, and red below 30.
    Guide lines mark the 25th, 50th and 75th percentiles.
    """
    import matplotlib.pyplot as plt

    stat_names = list(percentiles.keys())
    scores = list(percentiles.values())

    def bar_color(score):
        if score >= 70:
            return "green"
        if score >= 30:
            return "orange"
        return "red"

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(stat_names, scores, color=[bar_color(s) for s in scores], edgecolor="black")

    # Reference lines: the median, then the quartiles
    ax.axvline(x=50, color="gray", linestyle="--", alpha=0.5)
    for quartile in (75, 25):
        ax.axvline(x=quartile, color="blue", linestyle=":", alpha=0.3)

    ax.set_xlim(0, 100)
    ax.set_xlabel("Percentile")
    ax.set_title(f"{player_name} Percentile Profile")

    # Print the rounded percentile just past the end of each bar
    for row, score in enumerate(scores):
        ax.text(score + 2, row, f"{score:.0f}", va="center")
    return fig
# Usage: `league_wide_stats_df` must already exist in the session
profiler = PlayerPercentileProfile(league_wide_stats_df)

# Get player percentiles
player = {"avg": 0.295, "obp": 0.380, "slg": 0.540, "hr": 35, "war": 6.5}
percentiles = profiler.calculate_percentiles(player)

# Print the stats from highest percentile to lowest
print("Player Percentile Rankings:")
ranked = sorted(percentiles.items(), key=lambda x: -x[1])
for stat, pct in ranked:
    print(f" {stat}: {pct:.0f}th percentile")
Sports Data Export Utilities
Export sports data to various formats.
import pandas as pd
import json
from pathlib import Path
from datetime import datetime
from typing import Optional
import xlsxwriter
class SportsDataExporter:
    """Export sports data to various formats.

    Every export writes a new file named ``<name>_<YYYYMMDD_HHMMSS>.<ext>``
    inside ``output_dir`` and returns its path.
    """

    def __init__(self, output_dir: str = "./exports"):
        self.output_dir = Path(output_dir)
        # parents=True so a nested output directory can be created in one go
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _get_filename(self, base_name: str, extension: str) -> Path:
        """Generate timestamped filename."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return self.output_dir / f"{base_name}_{timestamp}.{extension}"

    def to_csv(self, df: pd.DataFrame, name: str, **kwargs) -> Path:
        """Export to CSV without the index; ``kwargs`` go to ``DataFrame.to_csv``."""
        path = self._get_filename(name, "csv")
        df.to_csv(path, index=False, **kwargs)
        return path

    def to_excel(self, data: dict, name: str, include_summary: bool = True) -> Path:
        """Export multiple DataFrames to one Excel workbook.

        Args:
            data: Mapping of sheet name to DataFrame. Sheet names are cut to
                31 characters.
            name: Base file name.
            include_summary: Also write a "Summary" sheet listing each
                sheet's row and column counts.
        """
        path = self._get_filename(name, "xlsx")
        with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
            workbook = writer.book
            header_fmt = workbook.add_format({
                "bold": True,
                "bg_color": "#4472C4",
                "font_color": "white"
            })

            for sheet_name, df in data.items():
                safe_name = sheet_name[:31]
                df.to_excel(writer, sheet_name=safe_name, index=False)
                worksheet = writer.sheets[safe_name]

                for col_num, col in enumerate(df.columns):
                    # Format header
                    worksheet.write(0, col_num, col, header_fmt)

                    # Autofit column. An empty frame has no cells to measure
                    # (max() would be NaN) and column labels may not be
                    # strings, so guard both cases.
                    if df.empty:
                        widest_cell = 0
                    else:
                        widest_cell = int(df.iloc[:, col_num].astype(str).str.len().max())
                    max_len = max(widest_cell, len(str(col))) + 2
                    worksheet.set_column(col_num, col_num, min(max_len, 50))

            # Add summary sheet
            if include_summary:
                summary_data = [
                    {"Sheet": sheet, "Rows": len(frame), "Columns": len(frame.columns)}
                    for sheet, frame in data.items()
                ]
                pd.DataFrame(summary_data).to_excel(
                    writer, sheet_name="Summary", index=False
                )
        return path

    def to_json(self, df: pd.DataFrame, name: str, orient: str = "records") -> Path:
        """Export to indented JSON with ISO-formatted dates."""
        path = self._get_filename(name, "json")
        df.to_json(path, orient=orient, indent=2, date_format="iso")
        return path

    def to_parquet(self, df: pd.DataFrame, name: str) -> Path:
        """Export to Parquet (efficient for large datasets)."""
        path = self._get_filename(name, "parquet")
        df.to_parquet(path, index=False)
        return path

    def to_html_report(self, df: pd.DataFrame, name: str, title: str = "Report") -> Path:
        """Export to a styled, standalone HTML report (written as UTF-8)."""
        from html import escape

        path = self._get_filename(name, "html")
        # The title is free text; escape it so markup characters cannot break
        # or inject into the page. DataFrame.to_html escapes cell values itself.
        safe_title = escape(title)
        generated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        table_html = df.to_html(index=False, classes="data-table")
        document = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1 {{ color: #333; }}
table {{ border-collapse: collapse; width: 100%; }}
th {{ background-color: #4472C4; color: white; padding: 10px; text-align: left; }}
td {{ border: 1px solid #ddd; padding: 8px; }}
tr:nth-child(even) {{ background-color: #f2f2f2; }}
tr:hover {{ background-color: #ddd; }}
</style>
</head>
<body>
<h1>{safe_title}</h1>
<p>Generated: {generated}</p>
<p>Records: {len(df)}</p>
{table_html}
</body>
</html>
"""
        # Explicit encoding: the platform default may not handle player names
        path.write_text(document, encoding="utf-8")
        return path
# Usage: the output directory is created if it does not exist.
# The DataFrames referenced below must already exist in the session.
exporter = SportsDataExporter("./exports")
# Export player stats (the returned path includes a timestamp)
csv_path = exporter.to_csv(player_stats_df, "player_stats")
print(f"CSV exported to: {csv_path}")
# Export multiple sheets to Excel: one sheet per dict key, plus a summary sheet
excel_path = exporter.to_excel({
    "Batting": batting_df,
    "Pitching": pitching_df,
    "Fielding": fielding_df
}, "full_stats_export")
print(f"Excel exported to: {excel_path}")
# Export to Parquet for big data
parquet_path = exporter.to_parquet(large_dataset_df, "historical_data")
Team Roster Constructor
Build and manage team rosters with salary cap considerations.
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import date
@dataclass
class Player:
    """One roster entry: identity, position, contract and value."""

    id: int
    name: str
    position: str
    salary: float
    war: float
    contract_years: int = 1   # defaults to a one-year deal
    status: str = "active"    # roster totals only count players marked "active"
@dataclass
class Roster:
    """A team's players together with its salary cap.

    Salary, cap-space and WAR totals only count players whose ``status`` is
    ``"active"``.
    """

    team_name: str
    salary_cap: float
    players: List[Player] = field(default_factory=list)

    def _active_players(self) -> List[Player]:
        """Players currently marked active, in roster order."""
        return [member for member in self.players if member.status == "active"]

    @property
    def total_salary(self) -> float:
        """Combined salary of active players."""
        return sum(member.salary for member in self._active_players())

    @property
    def cap_space(self) -> float:
        """Salary cap minus the active payroll."""
        return self.salary_cap - self.total_salary

    @property
    def total_war(self) -> float:
        """Combined WAR of active players."""
        return sum(member.war for member in self._active_players())

    def add_player(self, player: Player) -> bool:
        """Add ``player`` if their salary fits in the cap space; report success."""
        if player.salary > self.cap_space:
            return False
        self.players.append(player)
        return True

    def remove_player(self, player_id: int) -> Optional[Player]:
        """Remove and return the first player with ``player_id``, or ``None``."""
        position_in_list = next(
            (idx for idx, member in enumerate(self.players) if member.id == player_id),
            None,
        )
        if position_in_list is None:
            return None
        return self.players.pop(position_in_list)

    def get_by_position(self, position: str) -> List[Player]:
        """Active players listed at ``position``."""
        return [member for member in self._active_players() if member.position == position]

    def to_dataframe(self):
        """Convert the roster (every status) to a DataFrame, one row per player."""
        import pandas as pd

        rows = []
        for member in self.players:
            rows.append({
                "name": member.name,
                "position": member.position,
                "salary": member.salary,
                "war": member.war,
                "status": member.status,
            })
        return pd.DataFrame(rows)

    def summary(self) -> Dict:
        """Headline numbers for the roster, including a head count per position."""
        tracked_positions = ["C", "1B", "2B", "SS", "3B", "LF", "CF", "RF", "DH", "SP", "RP"]
        head_count = {}
        for pos in tracked_positions:
            head_count[pos] = len(self.get_by_position(pos))

        # Avoid dividing by a zero or negative WAR total
        if self.total_war > 0:
            cost_per_win = self.total_salary / self.total_war
        else:
            cost_per_win = 0

        return {
            "team": self.team_name,
            "players": len(self._active_players()),
            "total_salary": self.total_salary,
            "cap_space": self.cap_space,
            "total_war": self.total_war,
            "dollars_per_war": cost_per_win,
            "by_position": head_count,
        }
# Usage: create an empty roster with a $250M cap
roster = Roster("Yankees", salary_cap=250_000_000)
# Add players (positional args: id, name, position, salary, war, contract_years)
roster.add_player(Player(1, "Aaron Judge", "RF", 40_000_000, 8.0, 9))
roster.add_player(Player(2, "Gerrit Cole", "SP", 36_000_000, 5.5, 5))
roster.add_player(Player(3, "Anthony Rizzo", "1B", 17_000_000, 2.0, 2))
# Report payroll figures in millions of dollars
print(f"Total Salary: ${roster.total_salary/1e6:.1f}M")
print(f"Cap Space: ${roster.cap_space/1e6:.1f}M")
print(f"Total WAR: {roster.total_war:.1f}")
print(f"\nRoster Summary:")
print(roster.summary())
Sports Calendar Utilities
Manage sports schedules and calendar operations.
from datetime import datetime, timedelta, date
from typing import List, Dict, Optional
import calendar
class SportsCalendar:
    """Sports calendar and schedule utilities.

    Season landmarks are stored as ``(month, day)`` tuples in
    ``SEASON_DATES``; ``all_star_break`` is ``(start_month, start_day,
    end_month, end_day)``. Leagues missing from the table have no configured
    dates. Methods accept either ``datetime`` or ``date`` values and compare
    on the calendar day, so a time of day never pushes a game on the last day
    of a period outside it.
    """

    SEASON_DATES = {
        "MLB": {
            "spring_training": (2, 20),
            "opening_day": (3, 28),
            "all_star_break": (7, 14, 7, 18),  # Start and end
            "regular_season_end": (9, 30),
            "postseason_end": (11, 5),
            "games_per_season": 162
        },
        "NBA": {
            "preseason_start": (10, 1),
            "regular_season_start": (10, 22),
            "all_star_break": (2, 14, 2, 20),
            "regular_season_end": (4, 14),
            "playoffs_end": (6, 20),
            "games_per_season": 82
        },
        "NFL": {
            "preseason_start": (8, 3),
            "regular_season_start": (9, 7),
            "bye_weeks": (5, 14),
            "regular_season_end": (1, 8),
            "super_bowl": (2, 11),
            "games_per_season": 17
        }
    }

    def __init__(self, league: str):
        self.league = league.upper()
        self.season_info = self.SEASON_DATES.get(self.league, {})

    @staticmethod
    def _as_date(dt):
        """Reduce a datetime to its calendar day; dates pass through unchanged."""
        return dt.date() if isinstance(dt, datetime) else dt

    def _season_start_md(self):
        """(month, day) the regular season starts, or None if not configured."""
        return self.season_info.get("opening_day") or self.season_info.get("regular_season_start")

    def _season_bounds(self, year: int):
        """(first_day, last_day) of the regular season starting in ``year``, or None."""
        start_md = self._season_start_md()
        end_md = self.season_info.get("regular_season_end")
        if not start_md or not end_md:
            return None
        # An end that falls earlier in the calendar than the start means the
        # season runs over New Year (NBA, NFL).
        crosses_new_year = (end_md[0], end_md[1]) < (start_md[0], start_md[1])
        end_year = year + 1 if crosses_new_year else year
        return date(year, start_md[0], start_md[1]), date(end_year, end_md[0], end_md[1])

    def get_season_year(self, dt: datetime) -> int:
        """Get the season year for a date.

        NBA and NFL seasons span two calendar years, so dates before July
        belong to the season that started the previous year. Every other
        league, including unconfigured ones, uses the calendar year.
        """
        if self.league in ["NBA", "NFL"] and dt.month < 7:
            return dt.year - 1
        return dt.year

    def is_regular_season(self, dt: datetime) -> bool:
        """Check if a date falls within the regular season (both ends inclusive).

        Leagues without configured start/end dates return True.
        """
        bounds = self._season_bounds(self.get_season_year(dt))
        if bounds is None:
            return True
        first_day, last_day = bounds
        return first_day <= self._as_date(dt) <= last_day

    def is_all_star_break(self, dt: datetime) -> bool:
        """Check if a date falls within the All-Star break (both ends inclusive)."""
        asb = self.season_info.get("all_star_break")
        if not asb:
            return False

        year = self.get_season_year(dt)
        start_md = self._season_start_md()
        # A break that is earlier in the calendar than the season start takes
        # place in the season's second calendar year (NBA: February).
        if start_md and (asb[0], asb[1]) < (start_md[0], start_md[1]):
            year += 1
        break_start = date(year, asb[0], asb[1])
        break_end = date(year, asb[2], asb[3])
        return break_start <= self._as_date(dt) <= break_end

    def games_remaining(self, dt: datetime, current_games: int) -> int:
        """Games left in the season after ``current_games``; never negative.

        ``dt`` is accepted for interface compatibility and is not used.
        """
        total = self.season_info.get("games_per_season", 162)
        return max(total - current_games, 0)

    def generate_schedule_dates(self, year: int) -> List[date]:
        """Generate potential game dates for the season starting in ``year``.

        Every day from the season start through the regular-season end is
        included except All-Star break days. Leagues without configured
        dates fall back to March 28 - September 30.
        """
        bounds = self._season_bounds(year)
        if bounds is None:
            bounds = (date(year, 3, 28), date(year, 9, 30))
        current, last_day = bounds

        dates = []
        while current <= last_day:
            if not self.is_all_star_break(current):
                dates.append(current)
            current += timedelta(days=1)
        return dates

    def get_rest_days(self, game_dates: List[date], current: date) -> int:
        """Full days off between the most recent earlier game and ``current``.

        Returns 7 when there is no earlier game.
        """
        past_games = [d for d in game_dates if d < current]
        if not past_games:
            return 7
        return (current - max(past_games)).days - 1
# Usage: build one calendar per league and query it for the current moment.
mlb_cal = SportsCalendar("MLB")
nba_cal = SportsCalendar("NBA")  # created for illustration; not queried below
today = datetime.now()
print(f"MLB Season Year: {mlb_cal.get_season_year(today)}")
print(f"Is Regular Season: {mlb_cal.is_regular_season(today)}")
print(f"Is All-Star Break: {mlb_cal.is_all_star_break(today)}")
# Generate the list of candidate game dates for the 2024 MLB season and
# report how many there are.
schedule_dates = mlb_cal.generate_schedule_dates(2024)
print(f"\n2024 MLB potential game dates: {len(schedule_dates)}")
API Rate Limiter
Rate limit API calls for sports data fetching.
import time
from datetime import datetime
from functools import wraps
from typing import Optional, Callable
import threading
from collections import deque
class RateLimiter:
    """Sliding-window rate limiter for API calls.

    Timestamps of admitted calls are kept in ``call_times``.  A new call is
    admitted only once it stays under both the per-second and the per-minute
    budget.  Use ``wait_if_needed()`` directly or apply the instance as a
    decorator.

    A budget below one call per window (for example ``calls_per_second=0.5``)
    is enforced as one call per proportionally longer window.
    """

    def __init__(self, calls_per_second: float = 1.0, calls_per_minute: float = 60.0):
        self.calls_per_second = calls_per_second
        self.calls_per_minute = calls_per_minute
        self.call_times = deque()
        self.lock = threading.Lock()

    @staticmethod
    def _normalize(window: float, limit: float):
        """Return ``(window, limit)`` with sub-1 limits turned into longer windows.

        Raises:
            ValueError: if ``limit`` is not positive.
        """
        if limit <= 0:
            raise ValueError("rate limits must be positive")
        if limit < 1:
            return window / limit, 1
        return window, limit

    def _delay(self, now: float, window: float, limit: float) -> float:
        """Seconds to wait before a call at ``now`` fits ``limit`` per ``window``."""
        cutoff = now - window
        # call_times is append-only, so this list is in chronological order.
        recent = [t for t in self.call_times if t > cutoff]
        # A call is admitted while fewer than `limit` calls are in the window.
        # These are the calls that still have to age out for that to hold.
        blockers = [t for idx, t in enumerate(recent) if len(recent) - idx >= limit]
        if not blockers:
            return 0.0
        return max(0.0, blockers[-1] + window - now)

    def wait_if_needed(self):
        """Block until a call is allowed, then record it.

        The lock is held while sleeping so that concurrent callers are
        admitted one at a time and cannot jointly exceed the budget.
        """
        with self.lock:
            budgets = [
                self._normalize(1.0, self.calls_per_second),
                self._normalize(60.0, self.calls_per_minute),
            ]
            horizon = max(window for window, _ in budgets)
            while True:
                now = time.time()
                # Drop entries too old to matter for any budget.
                stale_before = now - horizon
                while self.call_times and self.call_times[0] < stale_before:
                    self.call_times.popleft()
                delay = max(self._delay(now, window, limit) for window, limit in budgets)
                if delay <= 0:
                    break
                time.sleep(delay)
            self.call_times.append(now)

    def __call__(self, func: Callable) -> Callable:
        """Decorator to rate limit function calls."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            self.wait_if_needed()
            return func(*args, **kwargs)
        return wrapper


class AdaptiveRateLimiter(RateLimiter):
    """Rate limiter that adapts its per-second rate to API responses.

    Only ``calls_per_second`` is adapted; ``calls_per_minute`` keeps the
    base-class default of 60.
    """

    MAX_RATE = 20.0            # ceiling for automatic ramp-up
    MIN_RATE = 0.5             # floor after HTTP 429 responses
    SERVER_ERROR_FLOOR = 1.0   # floor after 5xx responses

    def __init__(self, initial_rate: float = 10.0):
        super().__init__(calls_per_second=initial_rate)
        self.error_count = 0    # total errors seen (never reset)
        self.success_count = 0  # consecutive successes since the last error

    def record_success(self):
        """Record a successful API call and ramp the rate up after a streak.

        Once more than 10 consecutive successes have been recorded, every
        further success raises the rate by 10% up to ``MAX_RATE``.  The
        streak restarts after each error, so the limiter can recover from
        earlier failures.  A rate already above ``MAX_RATE`` is left alone.
        """
        self.success_count += 1
        if self.success_count > 10:
            ramped = min(self.calls_per_second * 1.1, self.MAX_RATE)
            self.calls_per_second = max(self.calls_per_second, ramped)

    def record_error(self, status_code: int):
        """Record an API error and back off on 429 and 5xx responses.

        A 429 halves the rate (not below ``MIN_RATE``); a 5xx reduces it by
        20% (not below ``SERVER_ERROR_FLOOR``).  An error never increases
        the rate, even when it is already below the relevant floor.
        """
        self.error_count += 1
        self.success_count = 0
        if status_code == 429:  # Rate limited
            factor, floor = 0.5, self.MIN_RATE
        elif status_code >= 500:  # Server error
            factor, floor = 0.8, self.SERVER_ERROR_FLOOR
        else:
            return
        reduced = max(self.calls_per_second * factor, floor)
        self.calls_per_second = min(self.calls_per_second, reduced)
# Usage as decorator: at most 2 calls per second and 100 calls per minute.
rate_limiter = RateLimiter(calls_per_second=2, calls_per_minute=100)


@rate_limiter
def fetch_player_stats(player_id: int, timeout: float = 10.0):
    """Fetch player stats (rate limited).

    Args:
        player_id: Identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up; without
            one, a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response.
    """
    import requests

    response = requests.get(
        f"https://api.example.com/players/{player_id}", timeout=timeout
    )
    return response.json()
# Usage with adaptive limiter: starts at 5 calls per second.
adaptive_limiter = AdaptiveRateLimiter(initial_rate=5.0)


def fetch_with_adaptive_limit(url: str, timeout: float = 10.0):
    """Fetch with adaptive rate limiting.

    Waits for the shared ``adaptive_limiter``, performs the request and
    feeds the outcome back into the limiter.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up; without
            one, a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body on HTTP 200, otherwise None.

    Raises:
        Exception: whatever the request or JSON decoding raised; the failure
            is recorded on the limiter as a server error before re-raising.
    """
    import requests

    adaptive_limiter.wait_if_needed()
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            adaptive_limiter.record_success()
            return response.json()
        adaptive_limiter.record_error(response.status_code)
        return None
    except Exception:
        # No HTTP status is available here, so back off as for a 5xx.
        adaptive_limiter.record_error(500)
        raise
# Fetch players 1 through 99; each call waits on the decorator's rate limiter.
for player_id in range(1, 100):
    stats = fetch_player_stats(player_id)
    print(f"Fetched player {player_id}")
Team Performance Trends SQL
Analyze team performance trends over multiple seasons.
-- Team performance trends across seasons (2020 onward).
-- For every team-season: win percentage, run differential, the change in
-- both versus the team's previous season in the result set, and a rolling
-- average of win percentage over the current and two preceding seasons.
WITH season_stats AS (
    SELECT
        t.team_id,
        t.team_name,
        s.season,
        s.wins,
        s.losses,
        s.runs_scored,
        s.runs_allowed,
        -- NULLIF avoids a division-by-zero error for a season row with no
        -- decided games; win_pct is NULL for such a row.
        s.wins * 1.0 / NULLIF(s.wins + s.losses, 0) AS win_pct,
        s.runs_scored - s.runs_allowed AS run_diff,
        LAG(s.wins) OVER (PARTITION BY t.team_id ORDER BY s.season) AS prev_wins,
        LAG(s.runs_scored - s.runs_allowed) OVER (PARTITION BY t.team_id ORDER BY s.season) AS prev_run_diff
    FROM teams t
    JOIN team_season_stats s ON t.team_id = s.team_id
    WHERE s.season >= 2020
)
SELECT
    team_name,
    season,
    wins,
    win_pct,
    run_diff,
    -- The first season of each team has no predecessor; COALESCE makes its
    -- change columns 0 instead of NULL.
    wins - COALESCE(prev_wins, wins) AS win_change,
    run_diff - COALESCE(prev_run_diff, run_diff) AS run_diff_change,
    AVG(win_pct) OVER (PARTITION BY team_id ORDER BY season ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS rolling_3yr_win_pct
FROM season_stats
ORDER BY team_name, season;
Player Milestone Tracking SQL
Track players approaching career milestones.
-- Players approaching milestones.
-- A player qualifies when career hits are within 100 of 1000/2000/3000 or
-- career home runs are within 10 of 300/500.
WITH career_totals AS (
    SELECT
        p.player_id,
        p.name,
        SUM(s.hits) AS career_hits,
        SUM(s.home_runs) AS career_hr,
        SUM(s.rbi) AS career_rbi,
        SUM(s.strikeouts) AS career_k, -- For pitchers
        SUM(s.wins) AS career_wins
    FROM players p
    JOIN player_stats s ON p.player_id = s.player_id
    GROUP BY p.player_id, p.name
),
milestones AS (
    SELECT
        player_id,
        name,
        career_hits,
        career_hr,
        career_rbi,
        -- Hits still needed for the next hits milestone (NULL when not close)
        CASE
            WHEN career_hits >= 2900 AND career_hits < 3000 THEN 3000 - career_hits
            WHEN career_hits >= 1900 AND career_hits < 2000 THEN 2000 - career_hits
            WHEN career_hits >= 900 AND career_hits < 1000 THEN 1000 - career_hits
            ELSE NULL
        END AS hits_to_milestone,
        -- Home runs still needed for the next HR milestone (NULL when not close)
        CASE
            WHEN career_hr >= 490 AND career_hr < 500 THEN 500 - career_hr
            WHEN career_hr >= 290 AND career_hr < 300 THEN 300 - career_hr
            ELSE NULL
        END AS hr_to_milestone
    FROM career_totals
)
SELECT
    name,
    career_hits,
    career_hr,
    career_rbi,
    -- Build the label only when a milestone applies. Some engines' CONCAT
    -- skips NULL arguments and would otherwise emit a bare ' hits' / ' HR'.
    CASE
        WHEN hits_to_milestone IS NOT NULL
            THEN CONCAT(career_hits + hits_to_milestone, ' hits')
    END AS hits_milestone,
    hits_to_milestone AS hits_needed,
    CASE
        WHEN hr_to_milestone IS NOT NULL
            THEN CONCAT(career_hr + hr_to_milestone, ' HR')
    END AS hr_milestone,
    hr_to_milestone AS hr_needed
FROM milestones
WHERE hits_to_milestone IS NOT NULL OR hr_to_milestone IS NOT NULL
ORDER BY COALESCE(hits_to_milestone, hr_to_milestone);
Head-to-Head Record SQL
Calculate head-to-head records between teams.
-- Head-to-head records between teams for completed 2024 games.
-- Each pairing is normalised so the lower team id is always team1, which
-- folds home and away meetings of the same two teams into one row.
WITH matchups AS (
    SELECT
        LEAST(home_team_id, away_team_id) AS team1_id,
        GREATEST(home_team_id, away_team_id) AS team2_id,
        -- A game with equal scores has no winner (NULL) and is counted in
        -- games_played only, instead of being credited to the away team.
        CASE
            WHEN home_score > away_score THEN home_team_id
            WHEN away_score > home_score THEN away_team_id
            ELSE NULL
        END AS winner_id,
        home_score + away_score AS total_runs,
        ABS(home_score - away_score) AS margin
    FROM games
    WHERE season = 2024 AND status = 'Final'
)
SELECT
    t1.team_name AS team1,
    t2.team_name AS team2,
    COUNT(*) AS games_played,
    SUM(CASE WHEN m.winner_id = m.team1_id THEN 1 ELSE 0 END) AS team1_wins,
    SUM(CASE WHEN m.winner_id = m.team2_id THEN 1 ELSE 0 END) AS team2_wins,
    ROUND(AVG(m.total_runs), 1) AS avg_total_runs,
    ROUND(AVG(m.margin), 1) AS avg_margin,
    SUM(CASE WHEN m.margin <= 2 THEN 1 ELSE 0 END) AS close_games
FROM matchups m
JOIN teams t1 ON m.team1_id = t1.team_id
JOIN teams t2 ON m.team2_id = t2.team_id
-- Group on the ids as well so two teams sharing a name are not merged.
GROUP BY t1.team_id, t1.team_name, t2.team_id, t2.team_name
HAVING COUNT(*) >= 3
ORDER BY games_played DESC, avg_margin;
Expected Points Added (EPA)
Calculate EPA for football plays.
import pandas as pd
import numpy as np
def calculate_expected_points(down, distance, yard_line):
    """Calculate expected points based on game state.

    Args:
        down: Current down; 1-4 carry an adjustment, anything else adds 0.
        distance: Yards to go; 3 or fewer adds 0.3, 10 or more subtracts 0.3.
        yard_line: Field position; each yard beyond 50 is worth 0.05 points.

    Returns:
        The expected-points estimate for the team with the ball.
    """
    # Simplified EP model (would use ML model in production)
    expected = (yard_line - 50) * 0.05  # Field position value
    expected += {1: 0.5, 2: 0.2, 3: -0.3, 4: -1.0}.get(down, 0)
    if distance <= 3:
        expected += 0.3
    elif distance >= 10:
        expected -= 0.3
    return expected


def calculate_epa(plays_df):
    """Calculate EPA (expected points added) for each play.

    Expects the columns ``down``, ``distance``, ``yard_line``, ``next_down``,
    ``next_distance``, ``next_yard_line``, ``turnover``, ``touchdown``,
    ``field_goal`` and ``safety``.  Returns a copy of ``plays_df`` with
    ``ep_before``, ``ep_after`` and ``epa`` added; the input is not modified.

    All values are from the perspective of the team that had the ball before
    the play, so outcomes that score for the other side are negative.
    """
    plays = plays_df.copy()
    if plays.empty:
        # DataFrame.apply(axis=1) on an empty frame does not return a Series.
        for column in ("ep_before", "ep_after", "epa"):
            plays[column] = pd.Series(dtype=float)
        return plays

    plays["ep_before"] = plays.apply(
        lambda play: calculate_expected_points(
            play["down"], play["distance"], play["yard_line"]
        ),
        axis=1,
    )

    def _ep_after(play):
        if play["turnover"]:
            # The opponent takes over with 1st-and-10 at the mirrored yard
            # line; their expected points count against the offence.
            return -calculate_expected_points(1, 10, 100 - play["next_yard_line"])
        return calculate_expected_points(
            play["next_down"], play["next_distance"], play["next_yard_line"]
        )

    plays["ep_after"] = plays.apply(_ep_after, axis=1)

    # Scoring plays override the model estimate with the points scored.
    lost_ball = plays["turnover"].astype(bool)
    scored_td = plays["touchdown"] == 1
    plays.loc[scored_td, "ep_after"] = 7
    # A touchdown on a turnover play is a defensive return score.
    plays.loc[scored_td & lost_ball, "ep_after"] = -7
    plays.loc[plays["field_goal"] == 1, "ep_after"] = 3
    # A safety awards two points to the defence.
    plays.loc[plays["safety"] == 1, "ep_after"] = -2

    # EPA = EP after - EP before
    plays["epa"] = plays["ep_after"] - plays["ep_before"]
    return plays
def player_epa_summary(plays_df):
    """Aggregate EPA per passer and per rusher.

    Returns:
        A ``(passing, rushing)`` tuple of DataFrames.  ``passing`` is indexed
        by ``passer_id`` with total/mean EPA, attempt count, yards and
        touchdowns; ``rushing`` is indexed by ``rusher_id`` with total/mean
        EPA and carry count.
    """
    pass_plays = plays_df.loc[plays_df["play_type"] == "pass"]
    run_plays = plays_df.loc[plays_df["play_type"] == "run"]

    # Named aggregation yields the flat column names directly.
    passing = pass_plays.groupby("passer_id").agg(
        total_epa=("epa", "sum"),
        epa_per_play=("epa", "mean"),
        attempts=("epa", "count"),
        yards=("yards_gained", "sum"),
        tds=("touchdown", "sum"),
    )
    rushing = run_plays.groupby("rusher_id").agg(
        total_epa=("epa", "sum"),
        epa_per_carry=("epa", "mean"),
        carries=("epa", "count"),
    )
    return passing, rushing
# Usage: pbp_df is not defined in this snippet and is expected to be a
# play-by-play DataFrame supplied by the caller.
plays_with_epa = calculate_epa(pbp_df)
passing_epa, rushing_epa = player_epa_summary(plays_with_epa)
# Show the ten passers with the highest total EPA.
print("Top Passers by EPA:")
print(passing_epa.nlargest(10, "total_epa"))
Strokes Gained Calculator
Calculate strokes gained statistics for golf.
import pandas as pd
import numpy as np
class StrokesGainedCalculator:
    """Calculate strokes gained for golf shots."""

    # Baseline strokes to hole from distance (PGA Tour averages)
    BASELINE_STROKES = {
        "tee": {100: 2.92, 150: 2.99, 200: 3.05, 250: 3.15, 300: 3.30, 350: 3.45, 400: 3.65},
        "fairway": {50: 2.60, 100: 2.80, 150: 2.92, 200: 3.02},
        "rough": {50: 2.75, 100: 2.92, 150: 3.05, 200: 3.18},
        "sand": {20: 2.43, 40: 2.65, 60: 2.85},
        "green": {5: 1.50, 10: 1.61, 20: 1.78, 30: 1.95, 40: 2.10, 60: 2.30}
    }

    # Category boundaries used by player_summary().
    LONG_TEE_SHOT = 250     # tee shots longer than this count as off the tee
    APPROACH_DISTANCE = 100  # non-tee shots longer than this count as approach

    def get_baseline_strokes(self, lie: str, distance: int) -> float:
        """Get baseline strokes to hole for a lie and distance.

        Unknown lies use the fairway table.  Distances outside a table are
        clamped to its nearest entry; distances in between are linearly
        interpolated.
        """
        table = self.BASELINE_STROKES.get(lie, self.BASELINE_STROKES["fairway"])
        distances = sorted(table)
        if distance <= distances[0]:
            return table[distances[0]]
        if distance >= distances[-1]:
            return table[distances[-1]]
        for near, far in zip(distances, distances[1:]):
            if near <= distance <= far:
                ratio = (distance - near) / (far - near)
                return table[near] + ratio * (table[far] - table[near])
        # Only reached when every comparison above is False, e.g. for NaN.
        return 3.0  # Default

    def calculate_sg(self, shots_df):
        """Calculate strokes gained for each shot.

        Expects ``lie_before``, ``distance_before``, ``lie_after``,
        ``distance_after`` and ``holed`` columns.  Returns a copy with
        ``baseline_before``, ``baseline_after`` and ``strokes_gained`` added.
        """
        shots = shots_df.copy()
        shots["baseline_before"] = shots.apply(
            lambda shot: self.get_baseline_strokes(shot["lie_before"], shot["distance_before"]),
            axis=1
        )
        # A holed shot leaves no strokes remaining.
        shots["baseline_after"] = shots.apply(
            lambda shot: 0 if shot["holed"] else self.get_baseline_strokes(
                shot["lie_after"], shot["distance_after"]
            ),
            axis=1
        )
        # SG = baseline_before - baseline_after - 1 (for the stroke taken)
        shots["strokes_gained"] = shots["baseline_before"] - shots["baseline_after"] - 1
        return shots

    def _categorize_shot(self, shot) -> str:
        """Assign a shot to off_the_tee / approach / around_green / putting."""
        lie, distance = shot["lie_before"], shot["distance_before"]
        if lie == "tee":
            # Shorter tee shots are shots at the green, so they are scored as
            # approach shots rather than being dropped into "other".
            return "off_the_tee" if distance > self.LONG_TEE_SHOT else "approach"
        if lie in ("fairway", "rough", "sand"):
            return "approach" if distance > self.APPROACH_DISTANCE else "around_green"
        if lie == "green":
            return "putting"
        return "other"

    def player_summary(self, shots_df):
        """Summarize strokes gained by player and category.

        Returns a DataFrame indexed by ``player_id`` whose columns are a
        two-level index of (statistic, category) for ``total_sg``, ``avg_sg``
        and ``shots``, plus a ``total_sg_all`` column with the player's
        overall strokes gained.
        """
        shots = self.calculate_sg(shots_df)
        shots["category"] = shots.apply(self._categorize_shot, axis=1)
        summary = shots.groupby(["player_id", "category"])["strokes_gained"].agg([
            ("total_sg", "sum"),
            ("avg_sg", "mean"),
            ("shots", "count")
        ]).unstack(level=1)
        summary["total_sg_all"] = shots.groupby("player_id")["strokes_gained"].sum()
        return summary
# Usage: shots_df is not defined in this snippet and is expected to be a
# shot-level DataFrame supplied by the caller.
calculator = StrokesGainedCalculator()
sg_summary = calculator.player_summary(shots_df)
# Show the 20 players with the highest overall strokes gained.
print(sg_summary.sort_values("total_sg_all", ascending=False).head(20))
Game Simulation Engine
Monte Carlo simulation engine for game outcomes.
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Tuple
@dataclass
class TeamStats:
    """Team statistical profile."""
    name: str
    # Ratings enter the score model as (offense - opponent defense + 100).
    offense_rating: float
    defense_rating: float
    # Averaged with the opponent's pace and divided by 100 to scale the score.
    pace: float
    # Despite the name, this is passed to np.random.normal as the standard
    # deviation of the team's simulated score.
    variance: float = 5.0


class GameSimulator:
    """Monte Carlo game simulation engine."""

    def __init__(self, league_avg_pace=100):
        # Stored for callers; the score model below does not read it.
        self.league_avg_pace = league_avg_pace

    def simulate_game(self, home_team: TeamStats, away_team: TeamStats,
                      home_advantage: float = 3.0) -> Dict:
        """Simulate a single game.

        Returns:
            Dict with integer ``home_score`` and ``away_score`` (never
            negative), ``home_win`` (True only for a strict home win, so a
            tie reports False) and ``margin`` (home minus away).
        """
        # Expected pace (average of both teams)
        expected_pace = (home_team.pace + away_team.pace) / 2
        # Home: their offense vs opponent defense + home advantage
        home_expected = (
            (home_team.offense_rating - away_team.defense_rating + 100)
            * expected_pace / 100 + home_advantage
        )
        away_expected = (
            (away_team.offense_rating - home_team.defense_rating + 100)
            * expected_pace / 100
        )
        # Add variance, then round to whole non-negative scores
        home_score = max(0, int(round(np.random.normal(home_expected, home_team.variance))))
        away_score = max(0, int(round(np.random.normal(away_expected, away_team.variance))))
        return {
            "home_score": home_score,
            "away_score": away_score,
            "home_win": home_score > away_score,
            "margin": home_score - away_score
        }

    def simulate_series(self, home_team: TeamStats, away_team: TeamStats,
                        n_games: int = 7, wins_needed: int = 4) -> Dict:
        """Simulate a playoff series until one side has ``wins_needed`` wins.

        ``home_team`` hosts games 1, 2, 5 and 7; ``away_team`` hosts the
        others.  ``n_games`` is accepted for interface compatibility and is
        not used to bound the series.

        In each entry of ``game_results`` the scores and margin are from the
        host's point of view, while ``home_win`` says whether the series
        ``home_team`` won that game.  A game simulated as a tie is decided
        by a fair coin flip instead of always going to the visiting team.
        """
        home_wins = 0
        away_wins = 0
        games = []
        while home_wins < wins_needed and away_wins < wins_needed:
            game_num = len(games) + 1
            # Alternate home court (simplified 2-2-1-1-1)
            series_home_hosts = game_num in (1, 2, 5, 7)
            if series_home_hosts:
                result = self.simulate_game(home_team, away_team)
            else:
                result = self.simulate_game(away_team, home_team)
            if result["margin"] == 0:
                host_won = bool(np.random.random() < 0.5)
            else:
                host_won = result["home_win"]
            result["home_win"] = host_won if series_home_hosts else not host_won
            games.append(result)
            if result["home_win"]:
                home_wins += 1
            else:
                away_wins += 1
        return {
            "winner": home_team.name if home_wins >= wins_needed else away_team.name,
            "games": len(games),
            "home_wins": home_wins,
            "away_wins": away_wins,
            "game_results": games
        }

    def monte_carlo_prediction(self, home_team: TeamStats, away_team: TeamStats,
                               n_simulations: int = 10000) -> Dict:
        """Run Monte Carlo simulation for game prediction.

        Simulated ties are split evenly between the teams, so the two win
        probabilities always sum to 1 without ties being credited to the
        away team.  The share of tied simulations is reported as
        ``tie_prob``.
        """
        results = [
            self.simulate_game(home_team, away_team)
            for _ in range(n_simulations)
        ]
        margins = [r["margin"] for r in results]
        home_scores = [r["home_score"] for r in results]
        away_scores = [r["away_score"] for r in results]
        home_wins = sum(r["home_win"] for r in results)
        ties = sum(1 for margin in margins if margin == 0)
        home_win_prob = (home_wins + 0.5 * ties) / n_simulations
        return {
            "home_win_prob": home_win_prob,
            "away_win_prob": 1 - home_win_prob,
            "tie_prob": ties / n_simulations,
            "expected_margin": np.mean(margins),
            "margin_std": np.std(margins),
            "home_score_mean": np.mean(home_scores),
            "away_score_mean": np.mean(away_scores),
            "total_mean": np.mean(home_scores) + np.mean(away_scores)
        }
# Usage: two team profiles; the first argument to the prediction is the home team.
celtics = TeamStats("Celtics", offense_rating=118, defense_rating=106, pace=98)
lakers = TeamStats("Lakers", offense_rating=114, defense_rating=110, pace=100)
simulator = GameSimulator()
# Single game prediction from 10,000 simulated games
prediction = simulator.monte_carlo_prediction(celtics, lakers, n_simulations=10000)
print(f"Celtics Win Probability: {prediction['home_win_prob']:.1%}")
print(f"Expected Score: {prediction['home_score_mean']:.0f} - {prediction['away_score_mean']:.0f}")
print(f"Expected Margin: {prediction['expected_margin']:.1f} ± {prediction['margin_std']:.1f}")
Injury Prediction with Survival Analysis
Predict time until next injury using survival analysis.
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines.utils import concordance_index
import matplotlib.pyplot as plt
class InjuryPredictor:
    """Predict injury risk using survival analysis."""

    # Covariates used to fit the Cox model.
    FEATURES = [
        "age", "career_games", "workload_30d",
        "previous_injuries", "position_risk",
        "bmi", "sprint_speed_percentile"
    ]

    def __init__(self):
        self.cox_model = CoxPHFitter()
        self.km_fitter = KaplanMeierFitter()

    def prepare_data(self, player_history_df):
        """Prepare data for survival analysis.

        Each row is a player stint.  Adds ``duration`` (days between
        ``start_date`` and ``end_date``) and ``injured`` (1 when
        ``injury_type`` is present, else 0) to a copy of the input.
        """
        df = player_history_df.copy()
        # Time variable: days until injury (or end of observation)
        df["duration"] = (df["end_date"] - df["start_date"]).dt.days
        # Event variable: did injury occur?
        df["injured"] = df["injury_type"].notna().astype(int)
        return df

    def fit(self, df):
        """Fit Cox proportional hazards model on rows without missing values.

        Prints the fitted model summary and returns ``self``.
        """
        survival_df = df[self.FEATURES + ["duration", "injured"]].dropna()
        self.cox_model.fit(
            survival_df,
            duration_col="duration",
            event_col="injured"
        )
        print(self.cox_model.summary)
        return self

    def predict_risk(self, player_features, time_horizon=180):
        """Predict injury risk for a player.

        Args:
            player_features: One-row DataFrame of model covariates; only the
                first row is reported.
            time_horizon: Day at which the survival curve is evaluated.

        Returns:
            Dict with ``injury_risk_6mo`` (1 - survival probability at
            ``time_horizon``; the key name is fixed regardless of the horizon
            chosen), ``hazard_ratio`` (partial hazard for the player) and
            ``risk_category`` ("High" above 0.5, "Medium" above 0.25,
            otherwise "Low").
        """
        # Ask the model for the curve at the requested time.  Indexing the
        # default output by label fails whenever the horizon is not one of
        # the durations observed in training.
        survival_func = self.cox_model.predict_survival_function(
            player_features, times=[time_horizon]
        )
        # Risk = 1 - survival probability
        risk = 1 - survival_func.iloc[0, 0]
        # Hazard ratio compared to baseline
        hr = self.cox_model.predict_partial_hazard(player_features).values[0]
        return {
            "injury_risk_6mo": risk,
            "hazard_ratio": hr,
            "risk_category": "High" if risk > 0.5 else "Medium" if risk > 0.25 else "Low"
        }

    def plot_survival_curves(self, df, group_col):
        """Plot Kaplan-Meier survival curves, one per value of ``group_col``.

        Returns the matplotlib ``(fig, ax)`` pair.
        """
        fig, ax = plt.subplots(figsize=(10, 6))
        for group in df[group_col].unique():
            group_data = df[df[group_col] == group]
            self.km_fitter.fit(
                group_data["duration"],
                event_observed=group_data["injured"],
                label=group
            )
            self.km_fitter.plot_survival_function(ax=ax)
        ax.set_xlabel("Days Since Last Injury")
        ax.set_ylabel("Probability of Staying Healthy")
        ax.set_title(f"Injury-Free Survival by {group_col}")
        ax.legend()
        return fig, ax

    def risk_factors_report(self):
        """Generate risk factors report.

        Returns the model summary reduced to coefficient, hazard ratio,
        percentage change in hazard (``risk_increase``) and p-value, sorted
        by ``risk_increase`` in descending order.
        """
        summary = self.cox_model.summary.copy()
        summary["risk_increase"] = (np.exp(summary["coef"]) - 1) * 100
        return summary[["coef", "exp(coef)", "risk_increase", "p"]].sort_values(
            "risk_increase", ascending=False
        )
# Usage: player_injury_history is not defined in this snippet and is expected
# to be a stint-level DataFrame supplied by the caller.
predictor = InjuryPredictor()
df = predictor.prepare_data(player_injury_history)
predictor.fit(df)
# Predict for specific player: one row holding the covariates the model was fitted on
player = pd.DataFrame([{
"age": 28,
"career_games": 500,
"workload_30d": 450,
"previous_injuries": 3,
"position_risk": 0.7,
"bmi": 24.5,
"sprint_speed_percentile": 65
}])
# Uses the default 180-day horizon of predict_risk()
risk = predictor.predict_risk(player)
print(f"6-Month Injury Risk: {risk['injury_risk_6mo']:.1%}")
print(f"Risk Category: {risk['risk_category']}")
Contract Optimization Model
Optimize team salary cap allocation using linear programming.
import pandas as pd
import numpy as np
from scipy.optimize import linprog, milp, LinearConstraint, Bounds
def optimize_roster(players_df, salary_cap, roster_spots=15, min_by_position=None):
    """Optimize roster construction under salary cap.

    Chooses the subset of players that maximises total ``projected_war``
    subject to the salary cap, a maximum roster size and optional
    per-position minimums.  The selection variables are constrained to be
    integers in [0, 1], so the returned roster always satisfies the
    constraints; rounding a fractional LP solution can exceed the cap.

    Args:
        players_df: DataFrame with ``projected_war``, ``salary`` and (when
            ``min_by_position`` is given) ``position`` columns.
        salary_cap: Maximum total salary.
        roster_spots: Maximum number of players selected.
        min_by_position: Optional mapping of position to minimum count.

    Returns:
        Dict with ``roster``, ``total_war``, ``total_salary``, ``cap_space``
        and ``roster_size`` on success, otherwise a dict with ``error`` and
        ``message``.
    """
    n_players = len(players_df)
    if n_players == 0:
        return {"error": "Optimization failed", "message": "No players supplied"}

    # Objective: maximize total WAR (negated because linprog minimizes)
    c = -players_df["projected_war"].values

    # Upper-bound rows: total salary <= cap, number selected <= roster_spots
    rows = [players_df["salary"].values, np.ones(n_players)]
    limits = [salary_cap, roster_spots]

    # Minimum counts are ">=" constraints, expressed as negated "<=" rows
    if min_by_position:
        for pos, min_count in min_by_position.items():
            rows.append(-(players_df["position"] == pos).astype(int).values)
            limits.append(-min_count)

    A_ub = np.vstack(rows)
    b_ub = np.array(limits, dtype=float)

    # Binary selection: integer variables bounded to [0, 1]
    bounds = [(0, 1)] * n_players
    integrality = np.ones(n_players, dtype=int)

    result = linprog(
        c, A_ub=A_ub, b_ub=b_ub, bounds=bounds,
        integrality=integrality, method="highs"
    )
    if not result.success:
        return {"error": "Optimization failed", "message": result.message}

    # Values are 0 or 1 up to solver tolerance
    selected = result.x > 0.5
    roster = players_df[selected].copy()
    roster["selected"] = 1
    return {
        "roster": roster,
        "total_war": roster["projected_war"].sum(),
        "total_salary": roster["salary"].sum(),
        "cap_space": salary_cap - roster["salary"].sum(),
        "roster_size": len(roster)
    }
def trade_optimizer(team_roster, available_players, salary_cap):
    """Find one-for-one swaps that improve the team.

    A swap qualifies when both players share a position, the resulting
    payroll stays within ``salary_cap`` and the incoming player has a higher
    ``projected_war``.

    Returns:
        DataFrame with ``player_out``, ``player_in``, ``war_gain``,
        ``salary_change`` and ``new_cap_space``, sorted by ``war_gain``
        descending.  The frame is empty (with those columns) when no swap
        qualifies.
    """
    columns = ["player_out", "player_in", "war_gain", "salary_change", "new_cap_space"]
    current_salary = team_roster["salary"].sum()
    trade_options = []
    for _, player_out in team_roster.iterrows():
        for _, player_in in available_players.iterrows():
            # Check position match (simplified)
            if player_out["position"] != player_in["position"]:
                continue
            # Check salary works
            new_salary = current_salary - player_out["salary"] + player_in["salary"]
            if new_salary > salary_cap:
                continue
            war_change = player_in["projected_war"] - player_out["projected_war"]
            if war_change <= 0:
                continue
            trade_options.append({
                "player_out": player_out["name"],
                "player_in": player_in["name"],
                "war_gain": war_change,
                "salary_change": player_in["salary"] - player_out["salary"],
                "new_cap_space": salary_cap - new_salary
            })
    # Naming the columns keeps sort_values from raising on an empty result.
    trades = pd.DataFrame(trade_options, columns=columns)
    return trades.sort_values("war_gain", ascending=False)
# Usage: free_agents_df is not defined in this snippet and is expected to be
# supplied by the caller. Require at least two players at each position.
min_positions = {"C": 2, "PF": 2, "SF": 2, "SG": 2, "PG": 2}
result = optimize_roster(
free_agents_df,
salary_cap=140_000_000,
roster_spots=15,
min_by_position=min_positions
)
# NOTE(review): the lookups below assume the optimization succeeded; on
# failure optimize_roster returns a dict with only "error" and "message".
print(f"Optimal Roster ({result['roster_size']} players):")
print(result["roster"][["name", "position", "salary", "projected_war"]])
print(f"\nTotal WAR: {result['total_war']:.1f}")
print(f"Total Salary: ${result['total_salary']/1e6:.1f}M")
print(f"Cap Space: ${result['cap_space']/1e6:.1f}M")
Real-Time Score Prediction
Predict final score based on current game state.
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
import joblib
class ScorePredictor:
    """Predict final game score from current state.

    A game state is described by the number of completed periods and the
    score at that point, in both training and prediction.
    """

    # Feature order shared by train() and predict().
    FEATURES = ["period", "home_current", "away_current", "home_q1", "away_q1"]

    def __init__(self):
        self.home_model = None
        self.away_model = None

    def train(self, historical_games_df):
        """Train score prediction models.

        Each game contributes one training row per quarter whose
        ``home_q<N>`` column is present.  The row for period N holds the
        cumulative score after N completed quarters, which is the same
        meaning ``predict`` gives to ``period`` and the current score.
        """
        training_data = []
        for _, game in historical_games_df.iterrows():
            for period in range(1, 5):  # Quarters
                if f"home_q{period}" not in game:
                    continue
                completed = range(1, period + 1)
                training_data.append({
                    "period": period,
                    "home_current": sum(game[f"home_q{i}"] for i in completed),
                    "away_current": sum(game[f"away_q{i}"] for i in completed),
                    "home_q1": game.get("home_q1", 0),
                    "away_q1": game.get("away_q1", 0),
                    "home_final": game["home_score"],
                    "away_final": game["away_score"]
                })
        df = pd.DataFrame(training_data)
        X = df[self.FEATURES]
        # Train separate models for home/away
        self.home_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        self.home_model.fit(X, df["home_final"])
        self.away_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        self.away_model.fit(X, df["away_final"])

    def predict(self, current_state):
        """Predict final scores from current game state.

        Args:
            current_state: Mapping with ``period`` (completed periods),
                ``home_score`` and ``away_score``; ``home_q1`` / ``away_q1``
                default to the current scores when absent.

        Returns:
            Dict with rounded ``home_predicted``, ``away_predicted`` and
            ``predicted_margin``, plus ``home_win_likely``.

        Raises:
            RuntimeError: if ``train`` has not been called.
        """
        if self.home_model is None or self.away_model is None:
            raise RuntimeError("ScorePredictor.train() must be called before predict()")
        # Build a frame with the training column names so the models receive
        # the same feature layout they were fitted on.
        features = pd.DataFrame([[
            current_state["period"],
            current_state["home_score"],
            current_state["away_score"],
            current_state.get("home_q1", current_state["home_score"]),
            current_state.get("away_q1", current_state["away_score"])
        ]], columns=self.FEATURES)
        home_pred = self.home_model.predict(features)[0]
        away_pred = self.away_model.predict(features)[0]
        # Ensure predictions are at least current score
        home_pred = max(home_pred, current_state["home_score"])
        away_pred = max(away_pred, current_state["away_score"])
        return {
            "home_predicted": round(home_pred),
            "away_predicted": round(away_pred),
            "predicted_margin": round(home_pred - away_pred),
            "home_win_likely": home_pred > away_pred
        }

    def predict_with_uncertainty(self, current_state, n_simulations=1000):
        """Predict with uncertainty estimates.

        Draws ``n_simulations`` normally distributed final scores around the
        point prediction, with a standard deviation of 5 points per period
        still to be played.

        Returns:
            Dict with the point predictions, ``home_win_prob`` and 95%
            intervals ``home_95_ci`` / ``away_95_ci``.
        """
        base_pred = self.predict(current_state)
        remaining_periods = 4 - current_state["period"]
        noise_std = 5 * remaining_periods  # More uncertainty with more time
        home_sims = np.random.normal(
            base_pred["home_predicted"], noise_std, n_simulations
        )
        away_sims = np.random.normal(
            base_pred["away_predicted"], noise_std, n_simulations
        )
        home_wins = (home_sims > away_sims).mean()
        return {
            "home_predicted": base_pred["home_predicted"],
            "away_predicted": base_pred["away_predicted"],
            "home_win_prob": home_wins,
            "home_95_ci": np.percentile(home_sims, [2.5, 97.5]),
            "away_95_ci": np.percentile(away_sims, [2.5, 97.5])
        }
# Usage: historical_games_df is not defined in this snippet and is expected
# to be supplied by the caller.
predictor = ScorePredictor()
predictor.train(historical_games_df)
# Current game state: Home up 58-52 at halftime (two periods completed)
current = {
"period": 2,
"home_score": 58,
"away_score": 52,
"home_q1": 28,
"away_q1": 25
}
prediction = predictor.predict_with_uncertainty(current)
print(f"Predicted Final: {prediction['home_predicted']} - {prediction['away_predicted']}")
print(f"Home Win Probability: {prediction['home_win_prob']:.1%}")
Pace and Space Analysis
Analyze team pace and spacing metrics for basketball.
import pandas as pd
import numpy as np
def calculate_pace(team_stats):
    """Return team pace: estimated possessions scaled to 48 minutes.

    ``team_stats`` must provide ``fga``, ``fta``, ``orb``, ``tov`` and
    ``minutes``.  Possessions are estimated as FGA + 0.44*FTA - ORB + TOV.
    """
    field_goal_attempts = team_stats["fga"]
    free_throw_trips = 0.44 * team_stats["fta"]
    possessions = (
        field_goal_attempts + free_throw_trips - team_stats["orb"] + team_stats["tov"]
    )
    return possessions / team_stats["minutes"] * 48
def spacing_analysis(tracking_df, team_id):
    """Analyze team spacing from tracking data.

    For every (possession, frame) in which ``team_id`` is on offense and at
    least five player rows are present, computes statistics over the ten
    pairwise distances between the first five players, plus the number of
    players farther than 23.75 from the coordinate origin.

    Args:
        tracking_df: DataFrame with ``team_id``, ``on_offense``,
            ``possession_id``, ``frame_id``, ``x`` and ``y`` columns.
        team_id: Team whose offensive frames are analysed.

    Returns:
        DataFrame with one row per analysed frame; empty (but with all
        columns present) when no frame qualifies.
    """
    columns = [
        "possession_id", "frame_id", "avg_spacing", "min_spacing",
        "max_spacing", "spacing_std", "three_pt_spacing",
    ]
    team_possessions = tracking_df[
        (tracking_df["team_id"] == team_id) &
        (tracking_df["on_offense"] == True)  # noqa: E712 - element-wise comparison
    ]
    # Index pairs (i, j) with i < j among five players, in loop order.
    first, second = np.triu_indices(5, k=1)
    spacing_metrics = []
    # groupby partitions the rows once instead of re-filtering the whole
    # frame for every possession and every frame; sort=False keeps the
    # order of first appearance.
    for poss_id, poss_data in team_possessions.groupby("possession_id", sort=False):
        for frame, frame_data in poss_data.groupby("frame_id", sort=False):
            if len(frame_data) < 5:
                continue
            positions = frame_data[["x", "y"]].to_numpy()
            deltas = positions[first] - positions[second]
            distances = np.sqrt((deltas ** 2).sum(axis=1))
            from_origin = np.sqrt((positions ** 2).sum(axis=1))
            spacing_metrics.append({
                "possession_id": poss_id,
                "frame_id": frame,
                "avg_spacing": np.mean(distances),
                "min_spacing": np.min(distances),
                "max_spacing": np.max(distances),
                "spacing_std": np.std(distances),
                # Players farther than 23.75 from the origin
                "three_pt_spacing": int((from_origin > 23.75).sum())
            })
    return pd.DataFrame(spacing_metrics, columns=columns)
def analyze_pace_impact(game_df, team_id):
    """Summarise a team's scoring margin by game pace.

    Games involving ``team_id`` are labelled "Fast" / "Slow" when the team's
    pace is more than 2 above / below its median pace, otherwise "Normal".

    Returns:
        DataFrame indexed by ``pace_category`` with ``avg_margin``,
        ``std_margin`` and ``games``.
    """
    involved = (game_df["home_team_id"] == team_id) | (game_df["away_team_id"] == team_id)
    team_games = game_df.loc[involved].copy()

    at_home = team_games["home_team_id"] == team_id
    team_games["is_home"] = at_home
    # Take the home-side column when the team hosted, the away-side otherwise.
    team_games["team_pace"] = team_games["home_pace"].where(at_home, team_games["away_pace"])
    home_margin = team_games["home_score"] - team_games["away_score"]
    away_margin = team_games["away_score"] - team_games["home_score"]
    team_games["team_margin"] = home_margin.where(at_home, away_margin)

    # Categorize pace relative to the team's own median
    median_pace = team_games["team_pace"].median()
    is_fast = team_games["team_pace"] > median_pace + 2
    is_slow = team_games["team_pace"] < median_pace - 2
    team_games["pace_category"] = np.select([is_fast, is_slow], ["Fast", "Slow"], default="Normal")

    # Results by pace
    return team_games.groupby("pace_category").agg(
        avg_margin=("team_margin", "mean"),
        std_margin=("team_margin", "std"),
        games=("team_pace", "count"),
    )
# Calculate team spacing metrics for one team; tracking_df is not defined in
# this snippet and is expected to be supplied by the caller.
spacing_df = spacing_analysis(tracking_df, team_id=1610612744)
# Average the per-frame metrics over every analysed frame.
print("Average Team Spacing:", spacing_df["avg_spacing"].mean())
print("3PT Spacing (avg players beyond arc):", spacing_df["three_pt_spacing"].mean())
Player Value Calculator
Calculate player market value based on performance metrics.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
class PlayerValueCalculator:
    """Calculate player market value from performance.

    ``salary_df`` is expected to provide ``season``, ``war`` and ``salary``
    columns, plus ``name``, ``age`` and ``years`` for
    ``comparable_contracts``.
    """

    def __init__(self, salary_df):
        self.salary_data = salary_df
        self.dollars_per_war = None  # set by calculate_market_rate()
        self.model = None

    def calculate_market_rate(self, season):
        """Calculate the market $/WAR rate for a season and cache it.

        Only players with more than 0.5 WAR are considered, and the median
        is used to limit the influence of outliers.  The rate is NaN when no
        player qualifies.
        """
        in_season = self.salary_data["season"] == season
        # Filter to meaningful playing time
        qualified = self.salary_data[in_season & (self.salary_data["war"] > 0.5)]
        # Computed as a standalone Series so that nothing is written to a
        # filtered view of the caller's data.
        self.dollars_per_war = (qualified["salary"] / qualified["war"]).median()
        return self.dollars_per_war

    def war_to_dollars(self, war, season=None):
        """Convert WAR (scalar or Series) to dollar value.

        Computes the market rate on first use, for ``season`` or 2024 when
        no season is given.
        """
        if self.dollars_per_war is None:
            self.calculate_market_rate(season or 2024)
        return war * self.dollars_per_war

    def surplus_value(self, player_df):
        """Calculate surplus value (value produced - salary).

        Returns a copy of ``player_df`` with ``war_value``, ``surplus`` and
        ``surplus_pct`` (surplus as a percentage of salary) added.
        """
        player_df = player_df.copy()
        player_df["war_value"] = self.war_to_dollars(player_df["war"])
        player_df["surplus"] = player_df["war_value"] - player_df["salary"]
        player_df["surplus_pct"] = player_df["surplus"] / player_df["salary"] * 100
        return player_df

    def project_contract_value(self, player_projections_df, years):
        """Project total contract value from WAR projections.

        Applies a simple aging curve: projected WAR is reduced by 0.5 for
        every year of age beyond 30 and floored at 0.

        Raises:
            ValueError: if ``years`` is not positive.
        """
        if years <= 0:
            raise ValueError("years must be a positive number of contract years")
        columns = ["player_name", "age", "years", "total_value", "aav", "year_breakdown"]
        contracts = []
        for _, player in player_projections_df.iterrows():
            year_values = []
            for year in range(years):
                age = player["age"] + year
                war_decline = max(0, (age - 30) * 0.5)
                projected_war = max(0, player["projected_war"] - war_decline)
                year_values.append(self.war_to_dollars(projected_war))
            total_value = sum(year_values)
            contracts.append({
                "player_name": player["name"],
                "age": player["age"],
                "years": years,
                "total_value": total_value,
                "aav": total_value / years,
                "year_breakdown": year_values
            })
        return pd.DataFrame(contracts, columns=columns)

    def comparable_contracts(self, player_war, player_age, n=5):
        """Find comparable historical contracts.

        Candidates are within 1 WAR and 2 years of age of the target.  They
        are ranked by a similarity score that falls with the WAR gap
        (scaled by 5) and the age gap (scaled by 10).
        """
        war_gap = (self.salary_data["war"] - player_war).abs()
        age_gap = (self.salary_data["age"] - player_age).abs()
        comparables = self.salary_data[(war_gap < 1) & (age_gap < 2)].copy()
        comparables["similarity"] = (
            1 - abs(comparables["war"] - player_war) / 5 -
            abs(comparables["age"] - player_age) / 10
        )
        return comparables.nlargest(n, "similarity")[[
            "name", "age", "war", "salary", "years", "similarity"
        ]]
# Usage: salary_history_df and current_contracts_df are not defined in this
# snippet and are expected to be supplied by the caller.
calculator = PlayerValueCalculator(salary_history_df)
calculator.calculate_market_rate(2024)
print(f"Market rate: ${calculator.dollars_per_war/1e6:.2f}M per WAR")
# Calculate surplus value and list the ten contracts with the largest surplus
players_with_surplus = calculator.surplus_value(current_contracts_df)
print("\nBest Value Contracts:")
print(players_with_surplus.nlargest(10, "surplus")[
["name", "salary", "war", "war_value", "surplus"]
])
Clutch Performance Analysis
Analyze player performance in high-leverage situations.
import pandas as pd
import numpy as np
from scipy import stats
def calculate_leverage_index(game_state):
    """Calculate a simplified leverage index for a game situation.

    Starts from 1.0 and multiplies in bonuses for late innings, a close
    score and runners on base. The number of outs is not part of this
    simplified formula, so ``game_state`` does not need an ``outs`` entry.

    Args:
        game_state: Mapping (dict or DataFrame row) with ``inning``,
            ``home_score``, ``away_score`` and ``runners_on``.

    Returns:
        The leverage multiplier as a float.
    """
    inning = game_state["inning"]
    score_diff = abs(game_state["home_score"] - game_state["away_score"])
    runners = game_state["runners_on"]

    # Base LI calculation (simplified)
    base_li = 1.0

    # Late inning bonus; the 9th-inning factor stacks on the 7th-inning one.
    if inning >= 7:
        base_li *= 1.5
    if inning >= 9:
        base_li *= 1.3

    # Close game bonus
    if score_diff <= 1:
        base_li *= 2.0
    elif score_diff <= 3:
        base_li *= 1.5

    # Runners on bonus: +30% per runner
    base_li *= (1 + 0.3 * runners)
    return base_li
def clutch_performance(player_stats_df, plays_df):
    """Analyze clutch performance by player.

    Adds ``leverage`` and ``high_leverage`` columns to ``plays_df`` (in
    place), where high leverage means the top 20% of leverage values, then
    compares each batter's mean ``woba_value`` in high-leverage plays with
    the batter's overall mean.

    Args:
        player_stats_df: Currently unused; kept for interface compatibility.
        plays_df: Play-by-play DataFrame with ``batter_id``, ``woba_value``
            and the columns read by ``calculate_leverage_index``.

    Returns:
        DataFrame sorted by ``clutch_score`` (descending), one row per
        batter with at least 50 plays and 20 high-leverage plays. The
        result always has the same columns, even when no batter qualifies.
    """
    result_columns = [
        "player_id", "total_pa", "high_lev_pa",
        "overall_woba", "clutch_woba", "clutch_score",
    ]
    # A row-wise apply on an empty frame does not return a Series, so bail
    # out before touching the input.
    if len(plays_df) == 0:
        return pd.DataFrame(columns=result_columns)

    # Add leverage index to each play
    plays_df["leverage"] = plays_df.apply(calculate_leverage_index, axis=1)

    # Define high leverage (top 20%)
    li_threshold = plays_df["leverage"].quantile(0.80)
    plays_df["high_leverage"] = plays_df["leverage"] >= li_threshold

    clutch_stats = []
    for player_id in plays_df["batter_id"].unique():
        player_plays = plays_df[plays_df["batter_id"] == player_id]
        if len(player_plays) < 50:
            continue
        high_lev = player_plays[player_plays["high_leverage"]]
        if len(high_lev) < 20:
            continue

        overall_woba = player_plays["woba_value"].mean()
        clutch_woba = high_lev["woba_value"].mean()
        clutch_stats.append({
            "player_id": player_id,
            "total_pa": len(player_plays),
            "high_lev_pa": len(high_lev),
            "overall_woba": overall_woba,
            "clutch_woba": clutch_woba,
            # Clutch score = high leverage performance - overall
            "clutch_score": clutch_woba - overall_woba,
        })

    # The previous year-to-year step sorted by a "season" column that this
    # per-player summary never contains and discarded its result; it has
    # been removed because it could only raise.
    clutch_df = pd.DataFrame(clutch_stats, columns=result_columns)
    return clutch_df.sort_values("clutch_score", ascending=False)
def late_close_analysis(plays_df):
    """Analyze late and close game performance.

    A play is "late and close" when it happens in the 7th inning or later
    with the score within one run. A boolean ``late_close`` column is added
    to ``plays_df`` in place.

    Args:
        plays_df: DataFrame with ``batter_id``, ``inning``, ``home_score``,
            ``away_score``, ``woba_value`` and ``event`` columns.

    Returns:
        DataFrame indexed by ``batter_id`` with ``woba_normal``,
        ``woba_late_close``, ``pa_normal``, ``pa_late_close`` and
        ``lc_difference``, sorted by ``lc_difference`` descending. A
        situation a batter never faced is reported as NaN.
    """
    # Late and close: 7th inning or later, within 1 run
    plays_df["late_close"] = (
        (plays_df["inning"] >= 7) &
        ((plays_df["home_score"] - plays_df["away_score"]).abs() <= 1)
    )

    grouped = plays_df.groupby(["batter_id", "late_close"])

    # Reindex so both the normal (False) and late/close (True) columns
    # exist even when the data contains only one of the two situations.
    woba = grouped["woba_value"].mean().unstack("late_close")
    woba = woba.reindex(columns=[False, True])
    woba.columns = ["woba_normal", "woba_late_close"]

    pa = grouped["event"].count().unstack("late_close")
    pa = pa.reindex(columns=[False, True])
    pa.columns = ["pa_normal", "pa_late_close"]

    lc_stats = pd.concat([woba, pa], axis=1)
    lc_stats["lc_difference"] = (
        lc_stats["woba_late_close"] - lc_stats["woba_normal"]
    )
    return lc_stats.sort_values("lc_difference", ascending=False)
# NOTE(review): player_stats and pbp_df are not defined in this snippet;
# they must be loaded by the caller before running it.
clutch_rankings = clutch_performance(player_stats, pbp_df)
print("Most Clutch Players:")
# Rankings are already sorted by clutch_score, so head() shows the top 15.
print(clutch_rankings.head(15))
R Markdown Sports Report
Create an automated sports analytics report using R Markdown.
---
title: "Weekly MLB Analytics Report"
author: "Sports Analytics Team"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_float: true
    theme: flatly
params:
  week_num: 10
  season: 2024
---

```{r setup, include=FALSE}
# Hide code, messages and warnings in the rendered report.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
library(dplyr)
library(ggplot2)
library(knitr)
library(DT)
```

# Weekly Summary

```{r load-data}
# Load data for the specified week (week/season come from the YAML params)
batting <- read.csv("batting_weekly.csv") %>%
  filter(week == params$week_num, season == params$season)
pitching <- read.csv("pitching_weekly.csv") %>%
  filter(week == params$week_num, season == params$season)
```

## Top Performers

### Batting Leaders

```{r batting-table}
# Top 10 batters by WAR
batting %>%
  arrange(desc(war)) %>%
  head(10) %>%
  select(Name = name, Team = team, AVG = avg, HR = hr, RBI = rbi, WAR = war) %>%
  datatable(options = list(pageLength = 10))
```

### Pitching Leaders

```{r pitching-table}
# Top 10 pitchers by lowest ERA
pitching %>%
  arrange(era) %>%
  head(10) %>%
  select(Name = name, Team = team, W = wins, ERA = era, K = strikeouts, WAR = war) %>%
  datatable(options = list(pageLength = 10))
```

## Visualizations

```{r war-distribution, fig.width=10, fig.height=6}
# na.rm keeps the mean line visible when some WAR values are missing.
ggplot(batting, aes(x = war)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  geom_vline(xintercept = mean(batting$war, na.rm = TRUE), color = "red", linetype = "dashed") +
  labs(
    title = paste("WAR Distribution - Week", params$week_num),
    x = "Wins Above Replacement",
    y = "Count"
  ) +
  theme_minimal()
```

## Week-over-Week Trends

```{r trends, fig.width=12, fig.height=5}
# All weeks of the season up to and including the report week
all_weeks <- read.csv("batting_weekly.csv") %>%
  filter(season == params$season, week <= params$week_num)

# na.rm so a single missing value does not blank out a whole week
weekly_summary <- all_weeks %>%
  group_by(week) %>%
  summarize(
    avg_war = mean(war, na.rm = TRUE),
    total_hr = sum(hr, na.rm = TRUE),
    league_avg = mean(avg, na.rm = TRUE)
  )

ggplot(weekly_summary, aes(x = week, y = total_hr)) +
  geom_line(color = "steelblue", size = 1.5) +
  geom_point(size = 3) +
  labs(title = "Weekly Home Run Totals", x = "Week", y = "Home Runs") +
  theme_minimal()
```
ggplot2 Faceted Sports Charts
Create multi-panel visualizations using ggplot2 facets for comparing across groups.
library(ggplot2)
library(dplyr)
library(scales)
# NOTE(review): team_stats, player_stats, game_log, pitching_stats and
# defensive_stats are not created in this snippet; load them beforehand.

# Faceted bar chart by division
# One panel per division; free y scales so each panel lists only its teams.
ggplot(team_stats, aes(x = reorder(team, wins), y = wins, fill = above_500)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ division, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = c("FALSE" = "coral", "TRUE" = "steelblue")) +
  labs(title = "Wins by Team and Division", x = "", y = "Wins") +
  theme_minimal() +
  theme(legend.position = "none")

# Faceted scatter plot with trend lines
# Rows are positions, columns are leagues; a linear fit is drawn per panel.
ggplot(player_stats, aes(x = obp, y = slg)) +
  geom_point(aes(color = war), alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = TRUE, color = "red", linetype = "dashed") +
  facet_grid(position ~ league) +
  scale_color_viridis_c(option = "plasma") +
  labs(
    title = "OBP vs SLG by Position and League",
    x = "On-Base Percentage",
    y = "Slugging Percentage"
  ) +
  theme_bw()

# Time series faceted by player
# NOTE(review): scale_x_date() needs game_date to be a Date column - confirm.
ggplot(game_log, aes(x = game_date, y = rolling_avg)) +
  geom_line(color = "steelblue", size = 1) +
  geom_hline(aes(yintercept = season_avg), linetype = "dashed", color = "red") +
  facet_wrap(~ player_name, scales = "free_y", ncol = 3) +
  scale_x_date(date_labels = "%b") +
  labs(
    title = "Rolling Batting Average by Player",
    x = "Date",
    y = "20-Game Rolling Average"
  ) +
  theme_minimal()

# Distribution comparison with facets
# Histogram is drawn on the density scale so the density curve overlays it.
# NOTE(review): recent ggplot2 releases prefer after_stat(density) and
# `linewidth` for lines - confirm the target ggplot2 version before changing.
ggplot(pitching_stats, aes(x = era)) +
  geom_histogram(aes(y = ..density.., fill = role), bins = 30, alpha = 0.7) +
  geom_density(color = "black", size = 1) +
  facet_grid(league ~ role) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "ERA Distribution by Role and League") +
  theme_light()

# Custom facet labeller
# Maps position codes to the full names shown in the facet strips.
position_labels <- c(
  "C" = "Catcher", "1B" = "First Base", "2B" = "Second Base",
  "SS" = "Shortstop", "3B" = "Third Base", "OF" = "Outfield"
)

# Dashed identity line marks where UZR equals DRS.
ggplot(defensive_stats, aes(x = uzr, y = drs)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  facet_wrap(~ position, labeller = labeller(position = position_labels)) +
  labs(title = "UZR vs DRS by Position")
Batch Data Processor
Process large sports datasets in parallel batches.
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from typing import Callable, List, Any
import logging
from tqdm import tqdm
# Configure the root logger at import time so INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by BatchProcessor below.
logger = logging.getLogger(__name__)
class BatchProcessor:
    """Process large datasets in parallel batches.

    Attributes are set from the constructor arguments: ``n_workers`` is
    passed to the executor as ``max_workers`` and ``batch_size`` is the
    maximum number of rows per batch.
    """

    def __init__(self, n_workers=4, batch_size=1000):
        """Store the worker count and batch size.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.n_workers = n_workers
        self.batch_size = batch_size

    def split_batches(self, data: pd.DataFrame) -> List[pd.DataFrame]:
        """Split DataFrame into consecutive row slices of ``batch_size``.

        Returns an empty list for an empty DataFrame.
        """
        return [
            data.iloc[start:start + self.batch_size]
            for start in range(0, len(data), self.batch_size)
        ]

    def process_parallel(self, data: pd.DataFrame,
                         process_func: Callable,
                         use_processes: bool = True) -> pd.DataFrame:
        """Process data in parallel batches.

        Args:
            data: Rows to process.
            process_func: Called with one batch, must return a DataFrame.
                With ``use_processes=True`` it is sent to worker processes,
                so it has to be picklable (e.g. a module-level function).
            use_processes: Use a process pool when True, a thread pool
                otherwise.

        Returns:
            The batch results concatenated in input order with a fresh
            index. Empty input returns an empty copy of ``data`` without
            calling ``process_func``.
        """
        batches = self.split_batches(data)
        if not batches:
            # pd.concat([]) raises; an empty input has nothing to process.
            return data.iloc[0:0].copy()

        logger.info("Processing %d rows in %d batches", len(data), len(batches))
        Executor = ProcessPoolExecutor if use_processes else ThreadPoolExecutor
        results = []
        with Executor(max_workers=self.n_workers) as executor:
            futures = [
                executor.submit(process_func, batch)
                for batch in batches
            ]
            # Collect in submission order so output rows keep input order.
            for future in tqdm(futures, desc="Processing batches"):
                results.append(future.result())
        return pd.concat(results, ignore_index=True)

    def process_with_state(self, data: pd.DataFrame,
                           process_func: Callable,
                           state: dict) -> pd.DataFrame:
        """Process with shared state (single-threaded with batching).

        ``process_func(batch, state)`` is called once per batch, in order,
        with the same ``state`` object each time. Empty input returns an
        empty copy of ``data`` without calling ``process_func``.
        """
        batches = self.split_batches(data)
        if not batches:
            return data.iloc[0:0].copy()

        results = []
        for batch in tqdm(batches, desc="Processing"):
            results.append(process_func(batch, state))
        return pd.concat(results, ignore_index=True)
def calculate_war_batch(batch: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``batch`` with simplified WAR components added.

    Adds ``batting_runs``, ``base_running``, ``position_adj`` and ``war``.
    The input frame is left untouched.
    """
    # Full-season (162 game) positional run adjustments.
    position_runs = {
        "C": 12.5, "SS": 7.5, "CF": 2.5, "2B": 2.5, "3B": 2.5,
        "RF": -7.5, "LF": -7.5, "1B": -12.5, "DH": -17.5
    }
    scored = batch.copy()

    batting = (scored["woba"] - 0.320) / 1.25 * scored["pa"]
    running = scored["stolen_bases"] * 0.2 - scored["caught_stealing"] * 0.4
    positional = scored["position"].map(position_runs) * scored["games"] / 162
    replacement = scored["games"] * 0.1  # Replacement level

    scored["batting_runs"] = batting
    scored["base_running"] = running
    scored["position_adj"] = positional
    # Ten runs per win.
    scored["war"] = (batting + running + positional + replacement) / 10
    return scored
# Usage
# The __main__ guard is required with ProcessPoolExecutor: on platforms that
# start workers with "spawn" (Windows, macOS) each worker re-imports this
# module, and unguarded top-level code would try to start pools recursively.
if __name__ == "__main__":
    processor = BatchProcessor(n_workers=4, batch_size=5000)

    # Process 100k player records (synthetic data)
    n_players = 100000
    large_df = pd.DataFrame({
        "player_id": range(n_players),
        "woba": np.random.normal(0.320, 0.030, n_players),
        "pa": np.random.randint(100, 600, n_players),
        "stolen_bases": np.random.randint(0, 30, n_players),
        "caught_stealing": np.random.randint(0, 10, n_players),
        "games": np.random.randint(50, 162, n_players),
        "position": np.random.choice(
            ["C", "1B", "2B", "SS", "3B", "LF", "CF", "RF", "DH"],
            n_players
        )
    })

    result = processor.process_parallel(large_df, calculate_war_batch)
    print(f"Processed {len(result)} players")
    print(f"Average WAR: {result['war'].mean():.2f}")
Sports Data Joins in dplyr
Efficiently join multiple sports datasets using dplyr verbs.
library(dplyr)
library(tidyr)
# Load various data sources
batting <- read.csv("batting_stats.csv")
fielding <- read.csv("fielding_stats.csv")
players <- read.csv("players.csv")
teams <- read.csv("teams.csv")
salaries <- read.csv("salaries.csv")

# Inner join - only matching records
# Keeps player-seasons that appear in both batting and fielding.
batting_with_fielding <- batting %>%
  inner_join(fielding, by = c("player_id", "season"))

# Left join - keep all batting records
# Fielding, player and team columns are NA where no match exists.
# NOTE(review): the last join assumes team_id is available after the earlier
# joins - confirm which table supplies it.
full_stats <- batting %>%
  left_join(fielding, by = c("player_id", "season")) %>%
  left_join(players, by = "player_id") %>%
  left_join(teams, by = "team_id")

# Anti-join - find players without fielding stats
batters_no_fielding <- batting %>%
  anti_join(fielding, by = c("player_id", "season"))

# Multiple condition join
# Columns with the same name in both tables get the _salary / _stats suffix.
salary_comparison <- salaries %>%
  inner_join(
    batting,
    by = c("player_id", "season"),
    suffix = c("_salary", "_stats")
  )
# Fuzzy date join (within 7 days)
library(fuzzyjoin)

# read.csv() returns date columns as character strings, and subtracting
# strings is an error, so convert to Date before comparing.
injuries <- read.csv("injuries.csv") %>%
  mutate(injury_date = as.Date(injury_date))
game_log <- read.csv("game_log.csv") %>%
  mutate(game_date = as.Date(game_date))

# Match on the same player and a game date within 7 days (either side) of the
# injury date; games with no nearby injury are kept with NA injury columns.
games_near_injury <- game_log %>%
  fuzzy_left_join(
    injuries,
    by = c("player_id", "game_date" = "injury_date"),
    match_fun = list(
      `==`,
      function(x, y) abs(as.numeric(difftime(x, y, units = "days"))) <= 7
    )
  )
# Complex aggregation with multiple joins
# Collapse batting rows to one row per player-season, attach salary, then
# compute cost per win.
season_summary <- batting %>%
  group_by(player_id, season) %>%
  summarize(
    games = sum(games),
    avg = sum(hits) / sum(at_bats),
    war = sum(war),
    .groups = "drop"
  ) %>%
  left_join(
    salaries %>% select(player_id, season, salary),
    by = c("player_id", "season")
  ) %>%
  mutate(
    # pmax() floors WAR at 0.1 so zero or negative WAR cannot produce a
    # division by zero or a negative cost.
    dollars_per_war = salary / pmax(war, 0.1)
  ) %>%
  arrange(desc(war))
Player Aging Curve
Model player performance aging curves.
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
def delta_method_aging(player_seasons_df):
    """Build an aging curve from season-to-season WAR changes.

    Only pairs of consecutive seasons by the same player contribute. The
    result is indexed by age (of the later season) with the mean change
    (``delta_war``), its ``std`` and count ``n``, the running sum
    (``cumulative``) and that sum shifted so age 27 is zero
    (``relative_to_peak``; unshifted when age 27 is absent).
    """
    ordered = player_seasons_df.sort_values(["player_id", "season"]).copy()
    per_player = ordered.groupby("player_id")

    # Previous season's values for the same player.
    ordered["prev_war"] = per_player["war"].shift(1)
    previous_season = per_player["season"].shift(1)

    # Keep only rows whose predecessor is exactly one season earlier.
    is_consecutive = ordered["season"] == previous_season + 1
    pairs = ordered[is_consecutive]
    pairs = pairs.assign(war_change=pairs["war"] - pairs["prev_war"])

    # Average change at each age
    aging_curve = pairs.groupby("age")["war_change"].agg(
        delta_war="mean", std="std", n="count"
    )

    # Cumulative curve (set peak at age 27)
    aging_curve["cumulative"] = aging_curve["delta_war"].cumsum()
    if 27 in aging_curve.index:
        peak_value = aging_curve.loc[27, "cumulative"]
    else:
        peak_value = 0
    aging_curve["relative_to_peak"] = aging_curve["cumulative"] - peak_value
    return aging_curve
def fit_parametric_curve(aging_data):
    """Fit an asymmetric bell-shaped curve to the cumulative aging data.

    ``aging_data`` is indexed by age and has a ``cumulative`` column. The
    values are shifted up by 5 before fitting. Returns the fitted
    parameters (peak_age, peak_value, decline_rate, asymmetry) and the
    model function itself.
    """
    def aging_func(age, peak_age, peak_value, decline_rate, asymmetry):
        # Gaussian-like curve whose width differs on each side of the peak.
        offset = age - peak_age
        skew = 1 + asymmetry * np.sign(offset)
        exponent = -0.5 * (offset / decline_rate) ** 2 * skew
        return peak_value * np.exp(exponent)

    ages = aging_data.index.values
    shifted = aging_data["cumulative"].values + 5  # Shift for fitting

    initial_guess = [27, 5, 5, 0.3]
    lower_bounds = [22, 0, 1, -1]
    upper_bounds = [32, 10, 15, 1]
    params, _ = curve_fit(
        aging_func, ages, shifted,
        p0=initial_guess,
        bounds=(lower_bounds, upper_bounds)
    )
    return params, aging_func
def position_aging_curves(player_seasons_df):
    """Return a dict mapping each position to its delta-method aging curve."""
    by_position = player_seasons_df["position"]
    return {
        pos: delta_method_aging(player_seasons_df[by_position == pos])
        for pos in by_position.unique()
    }
def plot_aging_curves(curves_dict):
    """Draw one aging curve per position with a +/- one std band.

    ``curves_dict`` maps a label to a curve frame that has
    ``relative_to_peak`` and ``std`` columns. Returns ``(fig, ax)``.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    palette = plt.cm.Set1(np.linspace(0, 1, len(curves_dict)))

    for idx, pos in enumerate(curves_dict):
        if idx >= len(palette):
            break
        curve = curves_dict[pos]
        shade = palette[idx]
        center = curve["relative_to_peak"]
        spread = curve["std"]

        ax.plot(curve.index, center, label=pos, color=shade, linewidth=2)
        ax.fill_between(
            curve.index,
            center - spread,
            center + spread,
            alpha=0.2,
            color=shade
        )

    # Reference lines: peak level and peak age.
    ax.axhline(y=0, color="black", linestyle="--", alpha=0.5)
    ax.axvline(x=27, color="gray", linestyle=":", alpha=0.5)

    ax.set_xlabel("Age")
    ax.set_ylabel("WAR Relative to Peak")
    ax.set_title("Aging Curves by Position")
    ax.legend()
    ax.grid(True, alpha=0.3)
    return fig, ax
# Calculate aging curves
# NOTE(review): player_seasons_df is not defined in this snippet; it must be
# loaded by the caller.
overall_curve = delta_method_aging(player_seasons_df)
position_curves = position_aging_curves(player_seasons_df)
print("Peak Age Analysis:")
# Label-based slice on the age index: ages 24 through 32 inclusive.
print(overall_curve[["delta_war", "n"]].loc[24:32])
# Plot curves
fig, ax = plot_aging_curves(position_curves)
plt.show()
Configuration Management
Manage configuration for sports analytics projects.
import yaml
import os
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, Any, Optional
import json
@dataclass
class DatabaseConfig:
    """Connection settings for the analytics database."""

    host: str = "localhost"
    port: int = 3306
    database: str = "sports_analytics"
    user: str = "analyst"
    # Empty by default; ConfigManager can fill it from SPORTS_DB_PASSWORD.
    password: str = ""
@dataclass
class APIConfig:
    """Settings for one external API."""

    base_url: str = ""
    # Empty by default; ConfigManager can fill it from SPORTS_API_<NAME>.
    api_key: str = ""
    # NOTE(review): units of rate_limit and timeout are not shown here
    # (presumably requests per period and seconds) - confirm with the client.
    rate_limit: int = 100
    timeout: int = 30
@dataclass
class AnalyticsConfig:
    """Top-level configuration object built by ConfigManager."""

    # default_factory gives every instance its own DatabaseConfig / dict.
    database: DatabaseConfig = field(default_factory=DatabaseConfig)
    # API settings keyed by API name.
    apis: Dict[str, APIConfig] = field(default_factory=dict)
    cache_dir: str = "./cache"
    log_level: str = "INFO"
    parallel_workers: int = 4
class ConfigManager:
    """Manage application configuration.

    Configuration is layered: ``base.yaml``, then ``<env>.yaml``, then
    environment variables (``SPORTS_DB_HOST``, ``SPORTS_DB_PASSWORD`` and
    ``SPORTS_API_<NAME>`` for API keys), each overriding the previous layer.
    """

    _API_ENV_PREFIX = "SPORTS_API_"

    def __init__(self, config_dir: str = "./config"):
        """Remember the config directory, creating it (and parents) if needed."""
        self.config_dir = Path(config_dir)
        # parents=True so a nested directory such as "etc/app/config" works.
        self.config_dir.mkdir(parents=True, exist_ok=True)
        self.config: Optional["AnalyticsConfig"] = None

    def load(self, env: str = "development") -> "AnalyticsConfig":
        """Load configuration for environment.

        Missing ``base.yaml`` or ``<env>.yaml`` files are skipped. The
        result is also stored on ``self.config``.
        """
        # Load base config
        base_path = self.config_dir / "base.yaml"
        config_data = self._load_yaml(base_path) if base_path.exists() else {}

        # Load environment-specific config
        env_path = self.config_dir / f"{env}.yaml"
        if env_path.exists():
            env_data = self._load_yaml(env_path)
            config_data = self._deep_merge(config_data, env_data)

        # Override with environment variables
        config_data = self._apply_env_vars(config_data)

        # Build config object
        self.config = self._build_config(config_data)
        return self.config

    def _load_yaml(self, path: Path) -> dict:
        """Parse a YAML file; an empty file yields an empty dict."""
        with open(path, encoding="utf-8") as f:
            return yaml.safe_load(f) or {}

    def _deep_merge(self, base: dict, override: dict) -> dict:
        """Deep merge two dictionaries.

        Nested dicts present in both are merged recursively; any other
        value in ``override`` replaces the one in ``base``. Neither input
        is modified at the top level.
        """
        result = base.copy()
        for key, value in override.items():
            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                result[key] = self._deep_merge(result[key], value)
            else:
                result[key] = value
        return result

    @staticmethod
    def _section(config: dict, name: str) -> dict:
        """Return ``config[name]`` as a dict, creating it when missing or null."""
        section = config.get(name)
        if not isinstance(section, dict):
            # A YAML key with no value parses as None; treat it as empty.
            section = {}
            config[name] = section
        return section

    def _apply_env_vars(self, config: dict) -> dict:
        """Apply environment variable overrides (modifies ``config`` in place)."""
        # Database
        if "SPORTS_DB_HOST" in os.environ:
            self._section(config, "database")["host"] = os.environ["SPORTS_DB_HOST"]
        if "SPORTS_DB_PASSWORD" in os.environ:
            self._section(config, "database")["password"] = os.environ["SPORTS_DB_PASSWORD"]

        # API keys: SPORTS_API_MLB=... sets apis["mlb"]["api_key"]
        prefix = self._API_ENV_PREFIX
        for key, value in os.environ.items():
            if not key.startswith(prefix):
                continue
            # Strip only the leading prefix (replace() would also remove
            # later occurrences inside the name).
            api_name = key[len(prefix):].lower()
            if not api_name:
                continue
            apis = self._section(config, "apis")
            self._section(apis, api_name)["api_key"] = value
        return config

    def _build_config(self, data: dict) -> "AnalyticsConfig":
        """Build config object from dictionary.

        Null ``database`` / ``apis`` sections are treated as empty. Unknown
        keys inside a section still raise ``TypeError`` from the dataclass
        constructor.
        """
        db_config = DatabaseConfig(**(data.get("database") or {}))
        api_configs = {
            name: APIConfig(**(api_data or {}))
            for name, api_data in (data.get("apis") or {}).items()
        }
        return AnalyticsConfig(
            database=db_config,
            apis=api_configs,
            cache_dir=data.get("cache_dir", "./cache"),
            log_level=data.get("log_level", "INFO"),
            parallel_workers=data.get("parallel_workers", 4)
        )

    def save_template(self):
        """Write ``template.yaml`` with example settings to the config dir."""
        template = {
            "database": {
                "host": "localhost",
                "port": 3306,
                "database": "sports_analytics",
                "user": "analyst",
                "password": "CHANGE_ME"
            },
            "apis": {
                "mlb": {
                    "base_url": "https://statsapi.mlb.com/api/v1",
                    "api_key": "",
                    "rate_limit": 100
                },
                "nba": {
                    "base_url": "https://stats.nba.com/stats",
                    "api_key": "",
                    "rate_limit": 50
                }
            },
            "cache_dir": "./cache",
            "log_level": "INFO",
            "parallel_workers": 4
        }
        with open(self.config_dir / "template.yaml", "w", encoding="utf-8") as f:
            yaml.dump(template, f, default_flow_style=False)
# Usage
# Creates ./config if needed, then layers base.yaml, production.yaml and
# environment variables.
config_manager = ConfigManager()
config = config_manager.load(env="production")
print(f"Database: {config.database.host}:{config.database.port}")
print(f"APIs configured: {list(config.apis.keys())}")
Sports Date Utilities
Utility functions for handling sports-specific date logic.
from datetime import datetime, timedelta
from typing import List, Tuple, Optional
import pandas as pd
class SportsDateUtils:
    """Utilities for sports date handling."""

    # Season date ranges (approximate), as (month, day) tuples. A season
    # whose "end" month is earlier than its "start" month spans two
    # calendar years.
    SEASON_DATES = {
        "mlb": {"start": (3, 28), "end": (10, 1), "playoffs_end": (11, 5)},
        "nba": {"start": (10, 22), "end": (4, 14), "playoffs_end": (6, 20)},
        "nfl": {"start": (9, 5), "end": (1, 8), "playoffs_end": (2, 12)},
        "nhl": {"start": (10, 10), "end": (4, 13), "playoffs_end": (6, 25)}
    }

    @classmethod
    def get_season_year(cls, date: datetime, sport: str) -> int:
        """Get season year for a given date and sport.

        The season year is the calendar year in which the season starts.
        NBA/NHL dates before the season's start month, and NFL dates before
        March, belong to the previous year's season. Every other sport
        uses the calendar year.
        """
        season_info = cls.SEASON_DATES.get(sport, {})
        start_month = season_info.get("start", (1, 1))[0]

        # For sports that span calendar years (NBA, NHL, NFL)
        if sport in ("nba", "nhl"):
            belongs_to_previous = date.month < start_month
        elif sport == "nfl":
            belongs_to_previous = date.month < 3  # Before March = previous season
        else:  # MLB and others
            belongs_to_previous = False
        return date.year - 1 if belongs_to_previous else date.year

    @classmethod
    def is_regular_season(cls, date: datetime, sport: str) -> bool:
        """Check if date is during regular season.

        Unknown sports always return True. The season window is anchored
        on the season year (not the calendar year of ``date``), so the
        January-April part of an NBA/NHL/NFL season is recognised. The
        final day counts in full, whatever the time of day.
        """
        season_info = cls.SEASON_DATES.get(sport)
        if not season_info:
            return True

        season_year = cls.get_season_year(date, sport)
        start = datetime(season_year, *season_info["start"])
        end_month, end_day = season_info["end"]

        # Handle season spanning years
        spans_years = end_month < season_info["start"][0]
        end_year = season_year + 1 if spans_years else season_year
        # Exclusive upper bound at midnight after the last day.
        end_exclusive = datetime(end_year, end_month, end_day) + timedelta(days=1)
        return start <= date < end_exclusive

    @classmethod
    def get_game_week(cls, date: datetime, sport: str = "nfl") -> int:
        """Get NFL game week number.

        Week 1 starts on the first Thursday of September of the season
        year; dates before it return 0. ``sport`` only influences how the
        season year is derived.
        """
        season_year = cls.get_season_year(date, sport)
        season_start = datetime(season_year, 9, 1)

        # Find first Thursday in September
        while season_start.weekday() != 3:  # Thursday
            season_start += timedelta(days=1)

        if date < season_start:
            return 0
        days_since_start = (date - season_start).days
        return (days_since_start // 7) + 1

    @classmethod
    def get_rest_days(cls, game_dates: List[datetime],
                      current_date: datetime) -> int:
        """Calculate days of rest before current game.

        Returns the number of full days between the most recent earlier
        game and ``current_date`` (back-to-back games give 0), or 7 when
        there is no earlier game.
        """
        previous_games = [d for d in game_dates if d < current_date]
        if not previous_games:
            return 7  # Default to week rest
        last_game = max(previous_games)
        return (current_date - last_game).days - 1

    @classmethod
    def create_game_schedule_features(cls, games_df: pd.DataFrame,
                                      sport: str) -> pd.DataFrame:
        """Add schedule-based features to games DataFrame.

        Works on a copy. ``game_date`` is converted to datetime and the
        columns ``season``, ``is_regular_season``, ``day_of_week``,
        ``is_weekend`` and ``season_progress`` (0-1 position within the
        regular season) are added.
        """
        df = games_df.copy()
        df["game_date"] = pd.to_datetime(df["game_date"])

        # Season year
        df["season"] = df["game_date"].apply(
            lambda x: cls.get_season_year(x, sport)
        )

        # Regular season flag
        df["is_regular_season"] = df["game_date"].apply(
            lambda x: cls.is_regular_season(x, sport)
        )

        # Day of week (Monday=0), weekend = Saturday/Sunday
        df["day_of_week"] = df["game_date"].dt.dayofweek
        df["is_weekend"] = df["day_of_week"].isin([5, 6])

        # Season boundaries do not depend on the row, so look them up once.
        season_info = cls.SEASON_DATES.get(sport, {})
        start_month, start_day = season_info.get("start", (1, 1))
        end_month, end_day = season_info.get("end", (12, 31))
        spans_years = end_month < start_month

        # Time of season (0-1)
        def get_season_progress(row):
            start = datetime(row["season"], start_month, start_day)
            end_year = row["season"] + 1 if spans_years else row["season"]
            end = datetime(end_year, end_month, end_day)
            total_days = (end - start).days
            days_in = (row["game_date"] - start).days
            # Clamp so pre-season and playoff dates stay inside [0, 1].
            return max(0, min(1, days_in / total_days))

        df["season_progress"] = df.apply(get_season_progress, axis=1)
        return df
# Usage
utils = SportsDateUtils()
# Get current NFL week
today = datetime.now()
week = utils.get_game_week(today)
print(f"Current NFL Week: {week}")
# Add features to games
# NOTE(review): games_df is not defined in this snippet; it needs a
# "game_date" column.
games_with_features = utils.create_game_schedule_features(games_df, "mlb")
print(games_with_features[["game_date", "season", "is_regular_season", "season_progress"]].head())
Sports Data Cache Manager
Implement a caching layer for sports API data to reduce API calls.
import json
import hashlib
import time
from datetime import datetime, timedelta
from pathlib import Path
from functools import wraps
import pickle
import redis
class CacheManager:
    """Manage caching for sports data.

    Values are pickled and stored in Redis once ``connect_redis`` has been
    called; otherwise (or when Redis fails) they are stored as
    ``<cache_dir>/<key>.pkl`` files together with an expiry timestamp.

    NOTE(security): entries are read back with ``pickle``. Only use a cache
    directory and Redis instance that untrusted parties cannot write to.
    """

    def __init__(self, cache_dir="./cache", ttl_hours=1):
        """Create the cache directory and set the default TTL (in hours)."""
        self.cache_dir = Path(cache_dir)
        # parents=True so nested cache directories can be created.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.default_ttl = ttl_hours * 3600
        self.redis_client = None

    def connect_redis(self, host="localhost", port=6379):
        """Connect to Redis for distributed caching."""
        self.redis_client = redis.Redis(host=host, port=port, decode_responses=False)

    def _get_cache_key(self, func_name, args, kwargs):
        """Generate a cache key from the function name and call arguments.

        The key is the MD5 hex digest of the textual form of the arguments,
        so arguments need a stable ``repr``/``str``.
        """
        key_data = f"{func_name}:{args}:{sorted(kwargs.items())}"
        return hashlib.md5(key_data.encode()).hexdigest()

    def _file_cache_path(self, key):
        """Get file path for cache key."""
        return self.cache_dir / f"{key}.pkl"

    @staticmethod
    def _discard(path):
        """Delete a cache file, ignoring the case where it is already gone."""
        try:
            path.unlink()
        except FileNotFoundError:
            pass

    def get(self, key):
        """Get value from cache.

        Returns the cached value, or ``None`` when the key is missing,
        expired or unreadable. Expired and unreadable files are removed.
        """
        # Try Redis first
        if self.redis_client is not None:
            try:
                data = self.redis_client.get(key)
                if data is not None:
                    return pickle.loads(data)
            except Exception:
                # Redis is best-effort: on any failure fall back to files.
                pass

        # Fall back to file cache
        cache_path = self._file_cache_path(key)
        try:
            with open(cache_path, "rb") as f:
                cached = pickle.load(f)
            expires = cached["expires"]
            value = cached["data"]
        except FileNotFoundError:
            return None
        except (pickle.UnpicklingError, EOFError, AttributeError,
                ImportError, IndexError, KeyError, TypeError, ValueError):
            # Corrupt or foreign file: treat as a miss instead of crashing.
            self._discard(cache_path)
            return None

        if time.time() < expires:
            return value
        self._discard(cache_path)  # Remove expired
        return None

    def set(self, key, value, ttl=None):
        """Set value in cache.

        Args:
            key: Cache key.
            value: Any picklable object.
            ttl: Lifetime in seconds; ``None`` uses the default TTL. A TTL
                of 0 is honoured (the entry is expired immediately).
        """
        if ttl is None:
            ttl = self.default_ttl

        # Try Redis
        if self.redis_client is not None:
            try:
                # Redis expects an integer number of seconds.
                self.redis_client.setex(key, int(ttl), pickle.dumps(value))
                return
            except Exception:
                # Best-effort: fall through to the file cache.
                pass

        # File cache fallback
        cache_path = self._file_cache_path(key)
        cached = {
            "data": value,
            "expires": time.time() + ttl,
            "created": datetime.now().isoformat()
        }
        with open(cache_path, "wb") as f:
            pickle.dump(cached, f)

    def cached(self, ttl=None):
        """Decorator for caching function results.

        The key is built from the function's ``__name__`` and its
        arguments. A result of ``None`` is indistinguishable from a cache
        miss, so functions returning ``None`` are called every time.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                key = self._get_cache_key(func.__name__, args, kwargs)

                # Check cache
                cached_value = self.get(key)
                if cached_value is not None:
                    return cached_value

                # Call function and cache result
                result = func(*args, **kwargs)
                self.set(key, result, ttl)
                return result
            return wrapper
        return decorator

    def clear(self, pattern=None):
        """Clear cache entries.

        With ``pattern``: deletes matching Redis keys and cache files whose
        name contains ``pattern``. Without it: deletes every cache file;
        Redis entries are left untouched.
        """
        if self.redis_client is not None and pattern:
            keys = self.redis_client.keys(pattern)
            if keys:
                self.redis_client.delete(*keys)

        # Clear file cache
        for cache_file in self.cache_dir.glob("*.pkl"):
            if pattern is None or pattern in cache_file.name:
                self._discard(cache_file)
# Usage
cache = CacheManager(ttl_hours=4)


@cache.cached(ttl=3600)  # 1 hour cache
def get_player_stats(player_id, season):
    """Fetch player stats (cached)."""
    # Expensive API call
    # NOTE(review): api_client is not defined in this snippet; it must be
    # provided by the surrounding application.
    response = api_client.get(f"/players/{player_id}/stats/{season}")
    return response.json()


# First call - fetches from API
stats = get_player_stats(12345, 2024)
# Second call - returns cached data
stats = get_player_stats(12345, 2024)
Prospect Ranking System
Create a prospect ranking system that combines multiple data sources.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
class ProspectRanker:
    """Rank prospects using multiple evaluation methods.

    The scoring methods add their columns to the DataFrame they are given
    (in place) and return it.

    NOTE(review): the ``makeup`` weight is defined but no method applies
    it, so the weights used in ``calculate_final_rank`` sum to 0.9 with the
    defaults - confirm whether a makeup score should be added.
    """

    def __init__(self, weights=None):
        """Store component weights (defaults used when ``weights`` is falsy)."""
        self.weights = weights or {
            "tools": 0.3,
            "stats": 0.4,
            "projection": 0.2,
            "makeup": 0.1
        }
        self.scaler = MinMaxScaler()

    def score_tools(self, prospect_df):
        """Score raw tools on 20-80 scale.

        Each of hit/power/speed/arm/field is mapped from 20-80 to 0-1
        (``<tool>_norm``) and ``tools_score`` is their mean.
        """
        tool_cols = ["hit", "power", "speed", "arm", "field"]

        # Normalize to 0-1 scale
        for col in tool_cols:
            prospect_df[f"{col}_norm"] = (prospect_df[col] - 20) / 60

        # Overall tool score
        prospect_df["tools_score"] = prospect_df[
            [f"{c}_norm" for c in tool_cols]
        ].mean(axis=1)
        return prospect_df

    @staticmethod
    def _zscore(values):
        """Z-score a group; constant or single-value groups score 0."""
        # With fewer than two distinct values the standard deviation is 0
        # or NaN, and dividing by it would poison every downstream score.
        if values.nunique(dropna=True) <= 1:
            return values - values
        return (values - values.mean()) / values.std()

    def score_stats(self, prospect_df, position_adjustments):
        """Score statistical performance.

        Stats are z-scored within each ``level``, combined with fixed
        weights (strikeout rate counts negatively), adjusted by position
        and min-max scaled to 0-1 as ``stats_score``.

        Args:
            prospect_df: DataFrame with ``level``, ``position`` and the
                stat columns avg/obp/slg/hr_rate/k_rate/bb_rate.
            position_adjustments: Mapping of position to additive
                adjustment; positions missing from it get 0.
        """
        # Normalize stats by league and level
        stat_cols = ["avg", "obp", "slg", "hr_rate", "k_rate", "bb_rate"]
        by_level = prospect_df.groupby("level")
        for col in stat_cols:
            prospect_df[f"{col}_z"] = by_level[col].transform(self._zscore)

        # Position adjustments (unmapped positions are neutral, not NaN)
        prospect_df["pos_adj"] = (
            prospect_df["position"].map(position_adjustments).fillna(0.0)
        )

        # Combined stats score
        weights = {"avg_z": 0.15, "obp_z": 0.2, "slg_z": 0.2,
                   "hr_rate_z": 0.15, "k_rate_z": -0.15, "bb_rate_z": 0.15}
        prospect_df["stats_score"] = sum(
            prospect_df[col] * weight for col, weight in weights.items()
        ) + prospect_df["pos_adj"]

        # Scale to 0-1 (the scaler returns a 2-D array; flatten to a column)
        prospect_df["stats_score"] = self.scaler.fit_transform(
            prospect_df[["stats_score"]]
        ).ravel()
        return prospect_df

    def project_future_value(self, prospect_df):
        """Project future MLB value.

        ``age_factor`` falls linearly from 1 at age 18 to 0 at age 28;
        ``projection_score`` blends tools (50%), stats (30%) and age (20%).
        """
        prospect_df["age_factor"] = 1 - (prospect_df["age"] - 18) / 10

        # Simple projection model
        prospect_df["projection_score"] = (
            prospect_df["tools_score"] * 0.5 +
            prospect_df["stats_score"] * 0.3 +
            prospect_df["age_factor"] * 0.2
        )
        return prospect_df

    def calculate_final_rank(self, prospect_df):
        """Calculate final prospect ranking.

        Adds ``final_score`` (weighted tools, stats and projection scores)
        and ``rank`` (1 = best), and returns the rows sorted by rank.
        """
        prospect_df["final_score"] = (
            prospect_df["tools_score"] * self.weights["tools"] +
            prospect_df["stats_score"] * self.weights["stats"] +
            prospect_df["projection_score"] * self.weights["projection"]
        )
        prospect_df["rank"] = prospect_df["final_score"].rank(ascending=False)
        return prospect_df.sort_values("rank")

    def generate_report(self, prospect_df, top_n=100):
        """Generate prospect ranking report.

        Runs the full scoring pipeline with built-in position adjustments
        and returns the ``top_n`` ranked prospects.
        """
        df = self.score_tools(prospect_df)
        df = self.score_stats(df, {
            "C": 0.1, "SS": 0.08, "CF": 0.05,
            "2B": 0.02, "3B": 0.02, "RF": 0,
            "LF": -0.02, "1B": -0.05, "DH": -0.08
        })
        df = self.project_future_value(df)
        df = self.calculate_final_rank(df)

        report = df.head(top_n)[[
            "rank", "name", "age", "position", "organization",
            "tools_score", "stats_score", "projection_score", "final_score"
        ]]
        return report
# Usage
# NOTE(review): prospects_df is not defined in this snippet; it must be
# loaded by the caller. generate_report adds scoring columns to it in place.
ranker = ProspectRanker()
rankings = ranker.generate_report(prospects_df, top_n=50)
print("Top 50 Prospects:")
print(rankings)
Defensive Positioning Heatmap
Create defensive positioning heatmaps from tracking data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
import seaborn as sns
def create_defensive_heatmap(tracking_df, fielder_position, team_id=None):
    """Create defensive positioning heatmap.

    Draws three panels for one fielder position at the moment of ball
    contact: a raw scatter, a 2-D histogram and a Gaussian-smoothed
    histogram.

    Args:
        tracking_df: DataFrame with ``position``, ``on_defense``, ``event``,
            ``x`` and ``y`` columns (plus ``team_id`` when filtering).
        fielder_position: Position code to plot, e.g. ``"SS"``.
        team_id: Optional team filter. Any value other than ``None`` is
            applied, including falsy ids such as 0.

    Returns:
        The matplotlib Figure.
    """
    # Filter to defensive plays
    is_selected = (
        (tracking_df["position"] == fielder_position) &
        tracking_df["on_defense"].eq(True)
    )
    if team_id is not None:
        is_selected &= tracking_df["team_id"] == team_id
    defense = tracking_df[is_selected]

    # Get positions at contact
    contact_positions = defense[defense["event"] == "ball_contact"]
    xs = contact_positions["x"]
    ys = contact_positions["y"]

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # 1. Raw positioning scatter
    ax1 = axes[0]
    ax1.scatter(xs, ys, alpha=0.3, s=10)
    ax1.set_title(f"{fielder_position} - Raw Positions")
    ax1.set_xlabel("X (feet)")
    ax1.set_ylabel("Y (feet)")

    # 2. 2D Histogram heatmap
    ax2 = axes[1]
    h = ax2.hist2d(xs, ys, bins=50, cmap="YlOrRd")
    plt.colorbar(h[3], ax=ax2, label="Frequency")
    ax2.set_title(f"{fielder_position} - Positioning Density")

    # 3. Smoothed KDE heatmap
    ax3 = axes[2]
    # 100 evenly spaced bin edges per axis (99 bins) over the data range.
    x_range = np.linspace(xs.min(), xs.max(), 100)
    y_range = np.linspace(ys.min(), ys.max(), 100)
    heatmap, xedges, yedges = np.histogram2d(xs, ys, bins=[x_range, y_range])
    heatmap = gaussian_filter(heatmap, sigma=2)

    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    # histogram2d puts x on the first axis; transpose so x runs horizontally.
    im = ax3.imshow(heatmap.T, origin="lower", extent=extent,
                    cmap="YlOrRd", aspect="auto")
    plt.colorbar(im, ax=ax3, label="Density")
    ax3.set_title(f"{fielder_position} - Smoothed Heatmap")

    plt.tight_layout()
    return fig
def shift_analysis(tracking_df, batter_id):
    """Analyze defensive shifts against a specific batter.

    For every fielding position with tracking rows at ``"pitch"`` events
    against the batter, reports the mean and standard deviation of the
    fielder's x/y location and the offset from a fixed standard location.

    Args:
        tracking_df: DataFrame with ``batter_id``, ``event``, ``position``,
            ``x`` and ``y`` columns.
        batter_id: Batter to analyze.

    Returns:
        DataFrame with ``position``, ``avg_x``, ``avg_y``, ``std_x``,
        ``std_y``, ``shift_x`` and ``shift_y``. It has these columns and no
        rows when there is no data for the batter.
    """
    # Standard (unshifted) location per position, as (x, y).
    standard_positions = {
        "1B": (90, 30), "2B": (60, 90), "SS": (-60, 90),
        "3B": (-90, 30), "LF": (-180, 250), "CF": (0, 300), "RF": (180, 250)
    }
    positions = ["1B", "2B", "SS", "3B", "LF", "CF", "RF"]

    # Get defensive positions when this batter is up
    batter_plays = tracking_df[
        (tracking_df["batter_id"] == batter_id) &
        (tracking_df["event"] == "pitch")
    ]

    # Group by position and get average location
    avg_positions = []
    for pos in positions:
        pos_data = batter_plays[batter_plays["position"] == pos]
        if len(pos_data) > 0:
            avg_positions.append({
                "position": pos,
                "avg_x": pos_data["x"].mean(),
                "avg_y": pos_data["y"].mean(),
                "std_x": pos_data["x"].std(),
                "std_y": pos_data["y"].std()
            })

    # Explicit columns keep the frame well-formed when nothing matched.
    avg_df = pd.DataFrame(
        avg_positions,
        columns=["position", "avg_x", "avg_y", "std_x", "std_y"]
    )

    # Compare to standard positions (column-wise, so an empty frame is fine)
    standard_x = avg_df["position"].map(
        {pos: xy[0] for pos, xy in standard_positions.items()}
    )
    standard_y = avg_df["position"].map(
        {pos: xy[1] for pos, xy in standard_positions.items()}
    )
    avg_df["shift_x"] = avg_df["avg_x"] - standard_x
    avg_df["shift_y"] = avg_df["avg_y"] - standard_y
    return avg_df
def visualize_defensive_alignment(positions_df, title="Defensive Alignment"):
    """Plot fielder locations on a simple field diagram.

    ``positions_df`` needs ``position``, ``avg_x``, ``avg_y``, ``shift_x``
    and ``shift_y``. A red arrow from the standard spot is drawn for any
    fielder shifted more than 10 units on either axis. Returns ``(fig, ax)``.
    """
    fig, ax = plt.subplots(figsize=(12, 10))

    # Field: dirt circle around home plate on a green background.
    ax.add_patch(plt.Circle((0, 0), 95, color="peru", alpha=0.3))

    # Bases: home, first, second, third.
    base_locations = [(0, 0), (63.6, 63.6), (0, 127.3), (-63.6, 63.6)]
    for base_x, base_y in base_locations:
        ax.plot(base_x, base_y, "ws", markersize=15, markeredgecolor="black")

    ax.set_facecolor("forestgreen")

    # Fielders
    for fielder in positions_df.to_dict("records"):
        here = (fielder["avg_x"], fielder["avg_y"])
        ax.scatter(here[0], here[1], s=200, c="blue",
                   edgecolors="white", linewidth=2, zorder=5)
        ax.annotate(fielder["position"], here,
                    fontsize=10, ha="center", va="bottom",
                    color="white", fontweight="bold")

        moved_x = abs(fielder["shift_x"]) > 10
        moved_y = abs(fielder["shift_y"]) > 10
        if not (moved_x or moved_y):
            continue
        # Arrow from the standard position to the observed one.
        origin = (here[0] - fielder["shift_x"], here[1] - fielder["shift_y"])
        ax.annotate("",
                    xy=here,
                    xytext=origin,
                    arrowprops=dict(arrowstyle="->", color="red", lw=2))

    ax.set_xlim(-350, 350)
    ax.set_ylim(-50, 400)
    ax.set_aspect("equal")
    ax.set_title(title)
    return fig, ax
# Usage
# NOTE(review): tracking_df is not defined in this snippet; it must be
# loaded by the caller.
heatmap_fig = create_defensive_heatmap(tracking_df, "SS")
shift_df = shift_analysis(tracking_df, batter_id=12345)
# shift_df already carries the shift_x / shift_y columns the plot needs.
alignment_fig, ax = visualize_defensive_alignment(shift_df, "Shift vs Left-Handed Pull Hitter")
Shot Quality Analysis
Analyze shot quality metrics for basketball.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
def calculate_shot_quality(shots_df, corner_three_x=22.0, corner_three_max_y=7.8):
    """Add shot distance and a court-zone label to every shot.

    Coordinates are taken relative to the hoop (``distance`` is the
    Euclidean norm of ``loc_x``/``loc_y``) and the zone thresholds below are
    expressed in the same unit as those columns.

    Args:
        shots_df: Frame with ``loc_x`` and ``loc_y`` columns. It is not
            modified; a copy is returned.
        corner_three_x: Minimum absolute sideline offset for a corner three.
        corner_three_max_y: Largest ``loc_y`` still counted as the corner.

    Returns:
        A copy of ``shots_df`` with ``distance`` and ``zone`` columns added.
    """
    shots_df = shots_df.copy()
    x = shots_df["loc_x"]
    y = shots_df["loc_y"]

    # Distance from hoop
    shots_df["distance"] = np.sqrt(x ** 2 + y ** 2)
    dist = shots_df["distance"]

    # A corner three is wide AND low on the court. The previous test
    # (`y > 7.8` alone) labelled every long shot above the break as
    # "Corner 3" and never looked at x.
    in_corner = (x.abs() >= corner_three_x) & (y <= corner_three_max_y)

    # np.select takes the first matching condition, mirroring an if/elif chain.
    conditions = [dist < 4, dist < 8, dist < 14, in_corner, dist >= 23.75]
    labels = [
        "Restricted Area",
        "Paint (Non-RA)",
        "Mid-Range",
        "Corner 3",
        "Above Break 3",
    ]
    shots_df["zone"] = np.select(conditions, labels, default="Long 2")
    return shots_df
def train_shot_model(shots_df):
    """Fit a logistic-regression expected-FG% model and score the shots.

    Args:
        shots_df: Frame containing the feature columns listed below plus
            ``is_made``. Rows missing any of them are dropped.

    Returns:
        ``(model, df)`` where ``df`` is the cleaned copy of the input with an
        ``expected_fg`` column (predicted make probability). The predictions
        are made on the same rows the model was fitted on.
    """
    features = [
        "distance",
        "shot_clock",
        "touch_time",
        "defender_distance",
        "dribbles",
    ]
    # Explicit copy: a column is added below, and writing into the frame
    # returned by dropna() can trigger pandas' SettingWithCopyWarning.
    df = shots_df.dropna(subset=features + ["is_made"]).copy()
    X = df[features]
    y = df["is_made"].astype(int)

    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)

    # Column 1 of predict_proba is the probability of the positive class.
    df["expected_fg"] = model.predict_proba(X)[:, 1]
    return model, df
def shot_quality_report(shots_df, player_name=None, league_avg_fg=0.45):
    """Summarise shot selection and shot making, overall and by zone.

    Args:
        shots_df: Frame with ``zone``, ``is_made`` and ``expected_fg``
            columns (and ``player_name`` when filtering).
        player_name: If truthy, only this player's shots are used.
        league_avg_fg: Baseline FG% that expected FG% is compared against.

    Returns:
        Dict with ``shots``, ``fg_pct``, ``expected_fg``, ``shot_selection``
        (expected FG% minus the baseline), ``shot_making`` (actual minus
        expected FG%) and ``zone_breakdown`` (per-zone DataFrame).
    """
    df = shots_df.copy()
    if player_name:
        df = df[df["player_name"] == player_name]

    # Zone breakdown
    zone_stats = df.groupby("zone").agg(
        attempts=("is_made", "count"),
        makes=("is_made", "sum"),
        fg_pct=("is_made", "mean"),
        expected_fg=("expected_fg", "mean"),
    )
    # Zones whose label contains "3" are worth three points, the rest two.
    # Computed column-wise so it also works when no zone rows are present.
    point_values = [3 if "3" in str(zone) else 2 for zone in zone_stats.index]
    zone_stats["points_per_shot"] = zone_stats["fg_pct"] * point_values

    # Shot quality metrics
    avg_expected = df["expected_fg"].mean()
    actual_fg = df["is_made"].mean()

    return {
        "shots": len(df),
        "fg_pct": actual_fg,
        "expected_fg": avg_expected,
        "shot_selection": avg_expected - league_avg_fg,  # Positive = good shot selection
        "shot_making": actual_fg - avg_expected,  # Positive = makes tough shots
        "zone_breakdown": zone_stats,
    }
def visualize_shot_chart(shots_df, player_name):
    """Scatter a player's made and missed shots over a simple court outline.

    The outline (paint rectangle, restricted-area circle, axis limits) is
    drawn in tenths of feet with the hoop at the origin, so ``loc_x`` and
    ``loc_y`` are expected in that unit as well.

    Args:
        shots_df: Frame with ``player_name``, ``loc_x``, ``loc_y`` and
            ``is_made`` columns.
        player_name: Player whose shots are plotted.

    Returns:
        The ``(fig, ax)`` pair holding the chart.
    """
    player_shots = shots_df[shots_df["player_name"] == player_name]
    fig, ax = plt.subplots(figsize=(12, 11))

    # Three-point arc: 23.75 ft, expressed in tenths of feet like the rest of
    # the outline (it was previously drawn with radius 23.75, ten times too
    # small relative to the paint and restricted area).
    ax.add_patch(plt.Circle((0, 0), 237.5, fill=False, color="black"))
    # Paint
    ax.add_patch(plt.Rectangle((-80, -47.5), 160, 190, fill=False))
    # Restricted area
    ax.add_patch(plt.Circle((0, 0), 40, fill=False, color="black"))

    # Plot shots
    made = player_shots[player_shots["is_made"] == 1]
    missed = player_shots[player_shots["is_made"] == 0]
    ax.scatter(made["loc_x"], made["loc_y"], c="green", marker="o", s=30, alpha=0.6, label="Made")
    ax.scatter(missed["loc_x"], missed["loc_y"], c="red", marker="x", s=30, alpha=0.6, label="Missed")

    ax.set_xlim(-250, 250)
    ax.set_ylim(-50, 420)
    ax.set_aspect("equal")
    ax.legend()
    ax.set_title(f"{player_name} Shot Chart")
    return fig, ax
# Usage
# NOTE: `shots_df` must already exist; it is rebound to the zone-annotated copy.
shots_df = calculate_shot_quality(shots_df)
# The frame returned by training carries `expected_fg`, which the report reads.
model, shots_with_xfg = train_shot_model(shots_df)
report = shot_quality_report(shots_with_xfg, "Stephen Curry")
# Both metrics are fractions; they are scaled by 100 and printed with a sign.
print(f"Shot Selection: {report['shot_selection']*100:+.1f}% vs avg")
print(f"Shot Making: {report['shot_making']*100:+.1f}% vs expected")
Lineup Optimization
Optimize batting lineup order using genetic algorithms.
import numpy as np
import pandas as pd
from typing import List, Tuple
import random
from deap import base, creator, tools, algorithms
class LineupOptimizer:
    """Optimize batting lineup using genetic algorithm.

    A lineup is a list of nine row positions into ``players_df``. Its fitness
    is the average number of runs scored over ``simulation_games`` simulated
    nine-inning games.
    """

    # Order must match the probability list built by `_at_bat_outcomes`.
    _OUTCOMES = ("out", "single", "double", "triple", "hr", "walk")

    def __init__(self, players_df, simulation_games=1000):
        """Store the roster and the number of games simulated per evaluation.

        Args:
            players_df: Frame with ``avg``, ``hr_rate`` and ``bb_rate``
                columns, one row per player.
            simulation_games: Games simulated for each fitness evaluation.
        """
        self.players = players_df
        self.n_players = len(players_df)
        self.sim_games = simulation_games

    def simulate_game(self, lineup_order: List[int]) -> float:
        """Simulate one 27-out game and return the runs scored.

        Args:
            lineup_order: Row positions into the roster, in batting order.

        Returns:
            Total runs scored by the lineup in the simulated game.
        """
        slots = len(lineup_order)
        # Outcome probabilities depend only on the player, so compute them
        # once per lineup slot instead of on every plate appearance.
        slot_probs = [
            self._at_bat_outcomes(self.players.iloc[player_idx])
            for player_idx in lineup_order
        ]

        runs = 0
        outs = 0
        bases = [0, 0, 0]  # First, second, third
        batter_idx = 0
        while outs < 27:  # 9 innings * 3 outs
            outcome = np.random.choice(
                self._OUTCOMES, p=slot_probs[batter_idx % slots]
            )
            if outcome == "out":
                outs += 1
                if outs % 3 == 0:
                    # Third out ends the half-inning: runners are stranded.
                    bases = [0, 0, 0]
            else:
                # Advance runners and score runs
                runs_scored, bases = self._advance_runners(bases, outcome)
                runs += runs_scored
            batter_idx += 1
        return runs

    def _at_bat_outcomes(self, player) -> List[float]:
        """Get outcome probabilities for player.

        Returns probabilities for out, single, double, triple, home run and
        walk (in that order). Negative components are clipped to zero and the
        result is renormalised so it always sums to 1, which
        ``np.random.choice`` requires.
        """
        # Simplified model
        avg = player["avg"]
        hr_rate = player["hr_rate"]
        single_pct = (avg - hr_rate * 0.8) * 0.7
        double_pct = avg * 0.2
        triple_pct = avg * 0.03
        hr_pct = hr_rate
        walk_pct = player["bb_rate"]
        out_pct = 1 - single_pct - double_pct - triple_pct - hr_pct - walk_pct

        raw = [out_pct, single_pct, double_pct, triple_pct, hr_pct, walk_pct]
        clipped = [max(0.0, float(p)) for p in raw]
        total = sum(clipped)
        return [p / total for p in clipped]

    def _advance_runners(self, bases: List[int], outcome: str) -> Tuple[int, List[int]]:
        """Advance runners based on outcome.

        Args:
            bases: Occupancy flags for first, second and third base.
            outcome: One of ``"single"``, ``"double"``, ``"triple"``,
                ``"hr"`` or ``"walk"``.

        Returns:
            ``(runs_scored, new_bases)``. Any other outcome clears the bases
            and scores nothing.
        """
        first, second, third = bases
        if outcome == "hr":
            return 1 + first + second + third, [0, 0, 0]
        if outcome == "triple":
            return first + second + third, [0, 0, 1]
        if outcome == "double":
            # Runners on second and third score; runner on first takes third.
            return second + third, [0, 1, first]
        if outcome == "single":
            # Every runner moves up exactly one base.
            return third, [1, first, second]
        if outcome == "walk":
            # Only forced runners move on a walk.
            if not first:
                return 0, [1, second, third]
            if not second:
                return 0, [1, 1, third]
            if not third:
                return 0, [1, 1, 1]
            return 1, [1, 1, 1]  # Bases loaded: the runner on third is forced home.
        return 0, [0, 0, 0]

    def evaluate_lineup(self, lineup: List[int]) -> Tuple[float]:
        """Evaluate lineup fitness.

        Returns:
            One-element tuple (the shape DEAP expects) holding the mean runs
            per simulated game.
        """
        total_runs = sum(
            self.simulate_game(lineup)
            for _ in range(self.sim_games)
        )
        return (total_runs / self.sim_games,)

    def optimize(self, generations=100, population_size=50):
        """Run genetic algorithm optimization.

        Args:
            generations: Number of generations to evolve.
            population_size: Individuals per generation.

        Returns:
            ``(best_lineup, log)``: the best individual found and the DEAP
            logbook.

        Raises:
            ValueError: If the roster does not contain exactly nine players.
        """
        # tools.cxOrdered indexes by gene value and therefore needs each
        # individual to be a permutation of range(len(individual)). That only
        # holds when the roster is exactly the nine batters being ordered.
        if self.n_players != 9:
            raise ValueError(
                "LineupOptimizer.optimize requires exactly 9 players, "
                f"got {self.n_players}"
            )

        # Setup DEAP (creator classes are global; create them only once so
        # repeated calls do not re-register and warn).
        if not hasattr(creator, "FitnessMax"):
            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        if not hasattr(creator, "Individual"):
            creator.create("Individual", list, fitness=creator.FitnessMax)

        toolbox = base.Toolbox()
        toolbox.register("indices", random.sample, range(self.n_players), 9)
        toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.indices)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", self.evaluate_lineup)
        toolbox.register("mate", tools.cxOrdered)
        toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.2)
        toolbox.register("select", tools.selTournament, tournsize=3)

        pop = toolbox.population(n=population_size)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("max", np.max)

        pop, log = algorithms.eaSimple(
            pop, toolbox,
            cxpb=0.7, mutpb=0.2,
            ngen=generations,
            stats=stats,
            halloffame=hof,
            verbose=True,
        )
        best_lineup = hof[0]
        return best_lineup, log
# Usage
# NOTE: `players_df` is not defined in this snippet; the caller must supply it.
optimizer = LineupOptimizer(players_df, simulation_games=500)
best_lineup, log = optimizer.optimize(generations=50)

# Each entry of the best lineup is a row position into `players_df`.
print("Optimal Lineup Order:")
for i, idx in enumerate(best_lineup, start=1):
    player = players_df.iloc[idx]
    print(f"{i}. {player['name']} ({player['position']})")
Pitch Sequencing Analysis
Analyze pitch sequencing patterns and their effectiveness.
import pandas as pd
import numpy as np
from collections import Counter
from itertools import product
def analyze_pitch_sequences(pitches_df, pitcher_id):
    """Tabulate how often each pitch type follows another within an at-bat.

    Args:
        pitches_df: Frame with ``pitcher_id``, ``game_id``, ``at_bat_number``,
            ``pitch_number`` and ``pitch_type`` columns.
        pitcher_id: Pitcher whose pitches are analysed.

    Returns:
        ``(transition_counts, transition_probs)``: a previous-pitch by
        next-pitch count table, and the same table with each row divided by
        its total.
    """
    at_bat_keys = ["game_id", "at_bat_number"]
    is_pitcher = pitches_df["pitcher_id"] == pitcher_id
    pitcher_pitches = pitches_df[is_pitcher].sort_values(at_bat_keys + ["pitch_number"])

    # Previous pitch type inside the same at-bat (missing for the first pitch).
    types_by_at_bat = pitcher_pitches.groupby(at_bat_keys)["pitch_type"]
    pitcher_pitches["prev_pitch"] = types_by_at_bat.shift(1)

    # Keep only pitches that have a predecessor in their at-bat.
    has_prev = pitcher_pitches["prev_pitch"].notna()
    transitions = pitcher_pitches[has_prev].copy()

    pair_sizes = transitions.groupby(["prev_pitch", "pitch_type"]).size()
    transition_counts = pair_sizes.unstack(fill_value=0)

    row_totals = transition_counts.sum(axis=1)
    transition_probs = transition_counts.div(row_totals, axis=0)
    return transition_counts, transition_probs
def sequence_effectiveness(pitches_df, pitcher_id, min_count=20):
    """Measure effectiveness of two-pitch sequences within an at-bat.

    Args:
        pitches_df: Frame with ``pitcher_id``, ``game_id``, ``at_bat_number``,
            ``pitch_number``, ``pitch_type``, ``is_strike``, ``is_swing``,
            ``is_whiff`` and ``delta_run_exp`` columns.
        pitcher_id: Pitcher whose pitches are analysed.
        min_count: Sequences seen fewer than this many times are dropped.

    Returns:
        DataFrame indexed by ``"<prev> -> <pitch>"`` with ``count``,
        ``strike_pct``, ``swing_pct``, ``whiff_pct`` and ``run_value``
        columns, sorted by ``run_value`` ascending.
    """
    pitcher_pitches = pitches_df[
        pitches_df["pitcher_id"] == pitcher_id
    ].sort_values(["game_id", "at_bat_number", "pitch_number"])
    pitcher_pitches["prev_pitch"] = pitcher_pitches.groupby(
        ["game_id", "at_bat_number"]
    )["pitch_type"].shift(1)

    # Effectiveness metrics by sequence (first pitches of an at-bat have no
    # predecessor and are excluded).
    sequences = pitcher_pitches[pitcher_pitches["prev_pitch"].notna()].copy()
    sequences["sequence"] = sequences["prev_pitch"] + " -> " + sequences["pitch_type"]

    grouped = sequences.groupby("sequence")
    # Whiffs per swing; the denominator is floored at 1 so sequences without
    # any swing report 0 instead of dividing by zero.
    swings = grouped["is_swing"].sum().clip(lower=1)
    effectiveness = pd.DataFrame({
        "count": grouped["pitch_type"].count(),
        "strike_pct": grouped["is_strike"].mean(),
        "swing_pct": grouped["is_swing"].mean(),
        "whiff_pct": grouped["is_whiff"].sum() / swings,
        "run_value": grouped["delta_run_exp"].mean(),  # Run value
    })

    effectiveness = effectiveness[effectiveness["count"] >= min_count]
    return effectiveness.sort_values("run_value")
def pitch_tunneling_analysis(pitches_df, pitcher_id):
    """Analyze pitch tunneling (how similar pitches look at release).

    Args:
        pitches_df: Frame with ``pitcher_id``, ``pitch_type`` and the
            release/plate/movement columns aggregated below.
        pitcher_id: Pitcher whose arsenal is analysed.

    Returns:
        ``(pitch_types, tunnel_matrix)``. ``pitch_types`` holds per-pitch-type
        aggregates named ``<column>_<stat>``. ``tunnel_matrix`` is a square
        frame whose entry ``[p1, p2]`` is the distance between average plate
        locations minus the distance between average release points (larger
        means better tunneling).
    """
    pitcher = pitches_df[pitches_df["pitcher_id"] == pitcher_id]

    # Named aggregation yields flat "<column>_<stat>" names directly. The
    # previous dict-based agg produced the same names after flattening, but
    # the code then looked up "release_pos_x"/"release_pos_z" (without the
    # "_mean" suffix) and raised KeyError.
    pitch_types = pitcher.groupby("pitch_type").agg(
        release_pos_x_mean=("release_pos_x", "mean"),
        release_pos_z_mean=("release_pos_z", "mean"),
        release_extension_mean=("release_extension", "mean"),
        plate_x_mean=("plate_x", "mean"),
        plate_x_std=("plate_x", "std"),
        plate_z_mean=("plate_z", "mean"),
        plate_z_std=("plate_z", "std"),
        release_speed_mean=("release_speed", "mean"),
        pfx_x_mean=("pfx_x", "mean"),
        pfx_z_mean=("pfx_z", "mean"),
    )

    pitch_list = pitch_types.index.tolist()
    release_xz = pitch_types[["release_pos_x_mean", "release_pos_z_mean"]].to_numpy(dtype=float)
    plate_xz = pitch_types[["plate_x_mean", "plate_z_mean"]].to_numpy(dtype=float)

    # Pairwise Euclidean distances via broadcasting: entry [i, j] compares
    # pitch type i with pitch type j.
    release_dist = np.sqrt(
        ((release_xz[:, None, :] - release_xz[None, :, :]) ** 2).sum(axis=2)
    )
    plate_dist = np.sqrt(
        ((plate_xz[:, None, :] - plate_xz[None, :, :]) ** 2).sum(axis=2)
    )

    # Good tunneling = small release distance, large plate distance
    tunnel_matrix = pd.DataFrame(
        plate_dist - release_dist, index=pitch_list, columns=pitch_list, dtype=float
    )
    return pitch_types, tunnel_matrix
# Analyze a pitcher
# NOTE: `pitches_df` is not defined in this snippet; the caller must supply it.
transitions, probs = analyze_pitch_sequences(pitches_df, pitcher_id=12345)
print("Pitch Transition Probabilities:")
print(probs)
effectiveness = sequence_effectiveness(pitches_df, pitcher_id=12345)
# The frame is sorted by run value ascending, so head() is printed as "best"
# and tail() as "worst".
print("\nBest Sequences (by run value):")
print(effectiveness.head(10))
print("\nWorst Sequences:")
print(effectiveness.tail(5))
Game State Win Probability
Calculate real-time win probability based on current game state.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
class WinProbabilityModel:
    """Calculate win probability for different sports.

    A logistic regression on degree-2 polynomial features of
    ``(inning, score_diff)`` predicts the probability that the home team wins.
    """

    def __init__(self, sport="baseball"):
        """Create an untrained model.

        Args:
            sport: Label stored on the instance; only a baseball trainer is
                implemented in this class.
        """
        self.sport = sport
        self.model = None
        self.poly = PolynomialFeatures(degree=2, include_bias=False)

    def train_baseball_model(self, historical_games_df):
        """Train win probability model for baseball.

        Args:
            historical_games_df: One row per game with ``home_win`` and, for
                each inning 1-9, ``home_score_after_<n>`` and
                ``away_score_after_<n>`` columns.

        Returns:
            ``self``, to allow chaining.
        """
        # Create game states from historical data
        states = []
        for _, game in historical_games_df.iterrows():
            for inning in range(1, 10):
                # NOTE(review): the same after-inning score is used for both
                # halves of the inning; confirm this approximation is intended.
                score_diff = (
                    game[f"home_score_after_{inning}"]
                    - game[f"away_score_after_{inning}"]
                )
                for half in (0, 0.5):  # Top/bottom
                    states.append({
                        "inning": inning + half,
                        "score_diff": score_diff,
                        "home_win": game["home_win"],
                    })
        states_df = pd.DataFrame(states)

        # Features
        X = states_df[["inning", "score_diff"]]
        X_poly = self.poly.fit_transform(X)
        y = states_df["home_win"]

        self.model = LogisticRegression(max_iter=1000)
        self.model.fit(X_poly, y)
        return self

    def get_win_probability(self, inning, score_diff, is_home_batting=True):
        """Get current win probability.

        Args:
            inning: Current inning number.
            score_diff: Home score minus away score.
            is_home_batting: ``True`` adds 0.5 to the inning, matching the
                encoding used in training.

        Returns:
            Predicted probability that the home team wins.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained")
        # Adjust inning for half
        inning_val = inning + (0.5 if is_home_batting else 0)
        X = np.array([[inning_val, score_diff]])
        X_poly = self.poly.transform(X)
        return self.model.predict_proba(X_poly)[0, 1]

    def plot_win_probability_curve(self, play_by_play_df, game_id):
        """Plot win probability over course of game.

        Args:
            play_by_play_df: Frame with ``game_id``, ``inning``,
                ``home_score``, ``away_score``, ``is_home_batting`` and
                ``runs_scored`` columns.
            game_id: Game to plot.

        Returns:
            The ``(fig, ax)`` pair holding the chart.
        """
        game = play_by_play_df[play_by_play_df["game_id"] == game_id].copy()

        # Calculate WP at each play
        game["win_prob"] = [
            self.get_win_probability(
                play["inning"],
                play["home_score"] - play["away_score"],
                play["is_home_batting"],
            )
            for _, play in game.iterrows()
        ]

        # Everything is plotted against the positional play number so the
        # curve, the shading and the scoring markers share one x axis.
        play_number = np.arange(len(game))
        win_prob = game["win_prob"].to_numpy()

        # Plot
        fig, ax = plt.subplots(figsize=(14, 6))
        ax.plot(play_number, win_prob, "b-", linewidth=2)
        ax.axhline(y=0.5, color="gray", linestyle="--", alpha=0.5)
        ax.fill_between(play_number, 0.5, win_prob,
                        where=win_prob >= 0.5, alpha=0.3, color="blue")
        ax.fill_between(play_number, win_prob, 0.5,
                        where=win_prob < 0.5, alpha=0.3, color="red")
        ax.set_ylim(0, 1)
        ax.set_ylabel("Home Win Probability")
        ax.set_xlabel("Play Number")
        ax.set_title(f"Win Probability Chart - Game {game_id}")

        # Mark scoring plays. The filtered frame keeps the source index
        # labels, so using them as x would misplace the markers; use the
        # positions of the scoring rows instead.
        is_scoring = (game["runs_scored"] > 0).to_numpy()
        ax.scatter(play_number[is_scoring], win_prob[is_scoring], c="red", s=100, zorder=5)

        plt.tight_layout()
        return fig, ax
# Usage
# NOTE: `historical_df` is not defined in this snippet; the caller must supply it.
wp_model = WinProbabilityModel("baseball")
wp_model.train_baseball_model(historical_df)
# Current game state
# is_home_batting=False leaves the inning value at 7 (no half-inning offset).
current_wp = wp_model.get_win_probability(inning=7, score_diff=2, is_home_batting=False)
print(f"Home team win probability: {current_wp:.1%}")
Strain and Workload Monitoring
Monitor player workload and injury risk using training data.
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
class WorkloadMonitor:
    """Monitor player workload and injury risk.

    Rolling windows are counted in rows, so the input is assumed to contain
    one row per player per day — TODO confirm against the data source.
    """

    def __init__(self):
        """Set the chronic (28) and acute (7) rolling-window lengths."""
        self.chronic_window = 28  # days
        self.acute_window = 7  # days

    def calculate_workload(self, training_df, player_id):
        """Calculate acute and chronic workload.

        Args:
            training_df: Frame with ``player_id``, ``date`` and
                ``training_load`` columns.
            player_id: Player to select.

        Returns:
            The player's rows sorted by date with ``acute_load``,
            ``chronic_load`` and ``acwr`` columns added. ``acwr`` is NaN
            until seven rows are available or when the chronic load is zero.
        """
        player_data = training_df[
            training_df["player_id"] == player_id
        ].sort_values("date")
        load = player_data["training_load"]

        # Calculate rolling workloads
        player_data["acute_load"] = load.rolling(
            window=self.acute_window, min_periods=1
        ).sum()
        # Chronic load is the rolling daily mean scaled to the acute window.
        player_data["chronic_load"] = load.rolling(
            window=self.chronic_window, min_periods=7
        ).mean() * self.acute_window

        # Acute:Chronic Workload Ratio (ACWR)
        player_data["acwr"] = (
            player_data["acute_load"]
            / player_data["chronic_load"].replace(0, np.nan)
        )
        return player_data

    def calculate_monotony_strain(self, training_df, player_id):
        """Calculate training monotony and strain.

        Args:
            training_df: Frame with ``player_id``, ``date`` (datetime) and
                ``training_load`` columns.
            player_id: Player to select.

        Returns:
            Frame indexed by ISO week number with ``weekly_load``,
            ``daily_mean``, ``daily_std``, ``monotony`` and ``strain``.
        """
        player_data = training_df[
            training_df["player_id"] == player_id
        ].sort_values("date")

        # Weekly calculations
        # NOTE(review): grouping uses the ISO week number only, so the same
        # week number from different years is pooled together.
        player_data["week"] = player_data["date"].dt.isocalendar().week
        weekly = player_data.groupby("week").agg({
            "training_load": ["sum", "mean", "std"]
        })
        weekly.columns = ["weekly_load", "daily_mean", "daily_std"]

        # Monotony = mean / std (lower variation = higher monotony)
        weekly["monotony"] = weekly["daily_mean"] / weekly["daily_std"].replace(0, np.nan)
        # Strain = weekly_load * monotony
        weekly["strain"] = weekly["weekly_load"] * weekly["monotony"]
        return weekly

    def assess_injury_risk(self, player_data):
        """Assess injury risk based on workload.

        Args:
            player_data: Output of :meth:`calculate_workload`.

        Returns:
            Dict with ``risk_score`` (0-100), ``acwr``, ``risk_factors``
            (list of ``(label, detail)`` tuples) and ``recommendation``.
            Empty input or a missing ACWR is reported as "Insufficient data"
            instead of raising or being scored as a normal value.
        """
        if len(player_data) == 0:
            missing = float("nan")
            return {
                "risk_score": 0,
                "acwr": missing,
                "risk_factors": [("Insufficient data", "No workload history")],
                "recommendation": self._get_recommendation(0, missing),
            }

        latest = player_data.iloc[-1]
        risk_factors = []

        # ACWR risk zones
        acwr = latest.get("acwr", 1.0)
        if pd.isna(acwr):
            risk_factors.append(("Insufficient data", "ACWR unavailable"))
        elif acwr < 0.8:
            risk_factors.append(("Low fitness", "ACWR below 0.8"))
        elif acwr > 1.5:
            risk_factors.append(("Spike in load", "ACWR above 1.5"))
        elif 1.0 <= acwr <= 1.25:
            risk_factors.append(("Optimal zone", "ACWR in sweet spot"))

        # Workload spike detection
        recent_load = player_data["training_load"].tail(7).sum()
        avg_load = player_data["training_load"].tail(28).mean() * 7
        if recent_load > avg_load * 1.3:
            if avg_load > 0:
                detail = f"{(recent_load/avg_load-1)*100:.0f}% increase"
            else:
                # Avoid dividing by a zero baseline.
                detail = "increase from zero baseline"
            risk_factors.append(("Week-to-week spike", detail))

        # Overall risk score (0-100). Comparisons with NaN are False, so a
        # missing ACWR adds nothing here.
        risk_score = 0
        if acwr < 0.8 or acwr > 1.5:
            risk_score += 30
        if acwr > 1.75:
            risk_score += 20
        if recent_load > avg_load * 1.5:
            risk_score += 25

        return {
            "risk_score": min(risk_score, 100),
            "acwr": acwr,
            "risk_factors": risk_factors,
            "recommendation": self._get_recommendation(risk_score, acwr),
        }

    def _get_recommendation(self, risk_score, acwr):
        """Map a risk score and ACWR to a short training recommendation."""
        if risk_score >= 50:
            return "Reduce training load, consider rest day"
        if pd.isna(acwr):
            return "Insufficient data to assess workload"
        if acwr < 0.8:
            return "Gradually increase training load"
        if 1.0 <= acwr <= 1.25:
            return "Maintain current training plan"
        return "Monitor closely, avoid further increases"
# Usage
# NOTE: `training_df` is not defined in this snippet; the caller must supply it.
monitor = WorkloadMonitor()

# Report the workload-based risk assessment for every player in the data.
for player_id in training_df["player_id"].unique():
    workload = monitor.calculate_workload(training_df, player_id)
    risk = monitor.assess_injury_risk(workload)
    print(f"Player {player_id}: Risk Score={risk['risk_score']}, ACWR={risk['acwr']:.2f}")
    print(f" Recommendation: {risk['recommendation']}")
Route Running Analysis
Analyze NFL receiver routes using tracking data.
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
def extract_route_features(tracking_df, receiver_id, play_id):
    """Extract features from a single route.

    Args:
        tracking_df: Frame with ``nfl_id``, ``play_id``, ``frame_id``,
            ``event``, ``x``, ``y``, ``s``, ``a``, ``dis``, ``dir`` and
            ``separation`` columns.
        receiver_id: Value of ``nfl_id`` to select.
        play_id: Value of ``play_id`` to select.

    Returns:
        Dict of route features measured from the ``ball_snap`` frame onward,
        or ``None`` when the play has fewer than 10 frames for this receiver
        or no ``ball_snap`` event.
    """
    route = tracking_df[
        (tracking_df["nfl_id"] == receiver_id)
        & (tracking_df["play_id"] == play_id)
    ].sort_values("frame_id")
    if len(route) < 10:
        return None

    # Get route after snap; without a snap marker the route cannot be
    # delimited, so skip the play instead of raising IndexError.
    snap_frames = route.loc[route["event"] == "ball_snap", "frame_id"]
    if snap_frames.empty:
        return None
    route = route[route["frame_id"] >= snap_frames.iloc[0]]

    # Values at the first "pass_arrived" frame, when the play has one.
    arrival = route[route["event"] == "pass_arrived"]
    if arrival.empty:
        speed_at_catch = np.nan
        separation_at_target = np.nan
    else:
        speed_at_catch = arrival["s"].iloc[0]
        separation_at_target = arrival["separation"].iloc[0]

    # Frame-to-frame heading change folded into [-180, 180) so crossing the
    # 0/360 boundary (e.g. 355 -> 5) is a 10-degree turn, not a 350-degree
    # one. Assumes `dir` is in degrees — TODO confirm.
    heading_change = (route["dir"].diff() + 180) % 360 - 180

    max_acceleration = route["a"].max()
    # NOTE(review): depth is measured along `y` and lateral movement along
    # `x`; confirm this matches the coordinate convention of the tracking data.
    return {
        "play_id": play_id,
        "receiver_id": receiver_id,
        # Distance metrics
        "total_distance": route["dis"].sum(),
        "max_depth": route["y"].max() - route["y"].iloc[0],
        "lateral_movement": abs(route["x"].max() - route["x"].min()),
        # Speed metrics
        "max_speed": route["s"].max(),
        "avg_speed": route["s"].mean(),
        "speed_at_catch": speed_at_catch,
        # Acceleration (the value at the row of maximum acceleration is the
        # maximum itself, so both keys carry the same number)
        "max_acceleration": max_acceleration,
        "break_acceleration": max_acceleration,
        # Direction changes
        "direction_changes": (heading_change.abs() > 45).sum(),
        # Separation at key moments
        "separation_at_target": separation_at_target,
    }
def classify_routes(route_features_df, n_clusters=9):
    """Cluster routes into types with K-means on standardised features.

    Cluster ids are arbitrary K-means labels; they do not correspond to fixed
    route names (Go, Out, Slant, ...). Inspect ``cluster_profiles`` to name
    them.

    Args:
        route_features_df: Frame with the feature columns listed below. A
            ``route_cluster`` column is written into it in place.
        n_clusters: Requested number of clusters; capped at the number of
            complete rows.

    Returns:
        ``(route_features_df, cluster_profiles)`` where ``cluster_profiles``
        holds the mean of each feature per cluster.
    """
    features = [
        "max_depth", "lateral_movement", "direction_changes",
        "total_distance", "max_acceleration",
    ]
    X = route_features_df[features].dropna()

    if X.empty:
        # Nothing to cluster: expose the column so downstream code still works.
        route_features_df["route_cluster"] = np.nan
    else:
        # Constant (or single-row) features have zero/undefined spread;
        # divide by 1 so they become 0 instead of NaN.
        spread = X.std().replace(0, 1).fillna(1)
        X_scaled = (X - X.mean()) / spread

        # K-means clustering (cannot ask for more clusters than samples)
        kmeans = KMeans(
            n_clusters=min(n_clusters, len(X)), random_state=42, n_init=10
        )
        route_features_df.loc[X.index, "route_cluster"] = kmeans.fit_predict(X_scaled)

    # Analyze clusters
    cluster_profiles = route_features_df.groupby("route_cluster")[features].mean()
    return route_features_df, cluster_profiles
def analyze_receiver_routes(tracking_df, receiver_name):
    """Full route analysis for a receiver.

    Args:
        tracking_df: Tracking frame with ``display_name``, ``nfl_id`` and
            ``play_id`` columns plus those read by
            ``extract_route_features``.
        receiver_name: Value of ``display_name`` to analyse.

    Returns:
        ``(route_df, summary)``: per-route features with a ``route_cluster``
        column, and a dict of aggregate statistics. When no play yields
        usable features, ``route_df`` is empty and the summary reports zero
        routes.

    Raises:
        IndexError: If ``receiver_name`` does not appear in ``tracking_df``.
    """
    receiver_df = tracking_df[tracking_df["display_name"] == receiver_name]
    receiver_id = receiver_df["nfl_id"].iloc[0]

    features_list = []
    for play_id in receiver_df["play_id"].unique():
        features = extract_route_features(tracking_df, receiver_id, play_id)
        if features:
            features_list.append(features)

    route_df = pd.DataFrame(features_list)
    if route_df.empty:
        # No usable routes: clustering would fail on the missing columns.
        summary = {
            "receiver": receiver_name,
            "total_routes": 0,
            "avg_separation": np.nan,
            "avg_max_speed": np.nan,
            "route_distribution": {},
        }
        return route_df, summary

    route_df, _ = classify_routes(route_df)

    # Summary stats
    summary = {
        "receiver": receiver_name,
        "total_routes": len(route_df),
        "avg_separation": route_df["separation_at_target"].mean(),
        "avg_max_speed": route_df["max_speed"].mean(),
        "route_distribution": route_df["route_cluster"].value_counts().to_dict(),
    }
    return route_df, summary
# Example run. NOTE: `tracking_df` is not defined in this snippet; the caller
# must supply it.
route_df, summary = analyze_receiver_routes(tracking_df, "Justin Jefferson")
print(summary)
Draft Pick Value Calculator
Calculate and compare draft pick values across rounds.
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
class DraftValueCalculator:
    """Calculate draft pick values based on historical production.

    Pick value is modelled as ``a * exp(-b * pick) + c`` fitted to the mean
    ``career_war`` observed at each pick number.
    """

    def __init__(self, historical_draft_df):
        """Store the historical draft data; the curve is fitted lazily.

        Args:
            historical_draft_df: Frame with ``pick_number``, ``career_war``
                and ``player_id`` columns.
        """
        self.draft_data = historical_draft_df
        self.value_curve = None

    @staticmethod
    def _exp_decay(x, a, b, c):
        """Exponential-decay value model evaluated at pick ``x``."""
        return a * np.exp(-b * x) + c

    def fit_value_curve(self, min_sample_size=10):
        """Fit value curve to historical data.

        Args:
            min_sample_size: Picks with fewer historical players than this
                are excluded from the fit.

        Returns:
            Frame indexed by pick number with the mean ``career_war`` and the
            ``sample_size`` of every pick used in the fit.
        """
        # Group by pick and calculate average career value
        pick_values = self.draft_data.groupby("pick_number").agg({
            "career_war": "mean",
            "player_id": "count",
        }).rename(columns={"player_id": "sample_size"})

        # Filter to picks with enough sample
        pick_values = pick_values[pick_values["sample_size"] >= min_sample_size]

        # Fit exponential decay
        picks = pick_values.index.values
        values = pick_values["career_war"].values
        params, _ = curve_fit(self._exp_decay, picks, values, p0=[10, 0.05, 0])
        self.value_curve = lambda x: self._exp_decay(x, *params)
        return pick_values

    def get_pick_value(self, pick_number):
        """Get expected value for a pick (never negative).

        Fits the curve on first use if it has not been fitted yet.
        """
        if self.value_curve is None:
            self.fit_value_curve()
        return max(0, self.value_curve(pick_number))

    def compare_picks(self, pick1, pick2):
        """Compare value between two picks.

        Returns:
            Dict with both picks, their values, the ``difference``
            (pick1 - pick2) and the ``ratio`` (infinite when pick2 is
            worthless).
        """
        val1 = self.get_pick_value(pick1)
        val2 = self.get_pick_value(pick2)
        return {
            "pick1": pick1,
            "pick1_value": val1,
            "pick2": pick2,
            "pick2_value": val2,
            "difference": val1 - val2,
            "ratio": val1 / val2 if val2 > 0 else float("inf"),
        }

    def equivalent_picks(self, pick_number, target_value_pct=1.0):
        """Find two-pick combinations worth roughly as much as one pick.

        Args:
            pick_number: Pick whose value is to be matched.
            target_value_pct: Fraction of that pick's value to match.

        Returns:
            Up to five dicts with ``picks`` (``[p1, p2]``) and
            ``total_value``, in search order. A pair qualifies when its total
            is within 0.5 of the target.
        """
        target_value = self.get_pick_value(pick_number) * target_value_pct

        # Find two-pick combinations
        max_results = 5
        combinations = []
        for p1 in range(pick_number + 5, 100):
            p1_value = self.get_pick_value(p1)
            remaining = target_value - p1_value
            for p2 in range(p1 + 5, 200):
                p2_value = self.get_pick_value(p2)
                if abs(p2_value - remaining) < 0.5:
                    combinations.append({
                        "picks": [p1, p2],
                        "total_value": p1_value + p2_value,
                    })
                    # Only the first five matches are ever returned.
                    if len(combinations) == max_results:
                        return combinations
        return combinations

    def trade_analyzer(self, team1_picks, team2_picks):
        """Analyze a trade between two teams.

        Returns:
            Dict with each side's picks and summed value, the ``difference``
            (team 1 minus team 2) and the ``winner``: the side with the
            larger total, or ``"Even"`` on an exact tie.
        """
        team1_value = sum(self.get_pick_value(p) for p in team1_picks)
        team2_value = sum(self.get_pick_value(p) for p in team2_picks)

        if team1_value > team2_value:
            winner = "Team 1"
        elif team2_value > team1_value:
            winner = "Team 2"
        else:
            winner = "Even"

        return {
            "team1_picks": team1_picks,
            "team1_value": team1_value,
            "team2_picks": team2_picks,
            "team2_value": team2_value,
            "difference": team1_value - team2_value,
            "winner": winner,
        }
# Usage
# NOTE: `historical_drafts_df` is not defined in this snippet; the caller
# must supply it.
calculator = DraftValueCalculator(historical_drafts_df)
calculator.fit_value_curve()
# Compare picks
comparison = calculator.compare_picks(1, 10)
print(f"Pick 1 value: {comparison['pick1_value']:.2f} WAR")
print(f"Pick 10 value: {comparison['pick2_value']:.2f} WAR")
# Analyze trade
# One pick (5) on one side against three picks (15, 25, 50) on the other.
trade = calculator.trade_analyzer(
team1_picks=[5],
team2_picks=[15, 25, 50]
)
print(f"\nTrade Winner: {trade['winner']}")
# abs() makes the printed difference independent of which side is ahead.
print(f"Value difference: {abs(trade['difference']):.2f} WAR")
Plus-Minus Rating Calculator
Calculate plus-minus ratings for basketball players with adjustments.
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.linear_model import RidgeCV
def calculate_raw_plus_minus(play_by_play_df):
    """Calculate raw plus-minus from play-by-play data.

    Args:
        play_by_play_df: Frame with ``stint_id``, the ten lineup columns
            (``home_1``..``home_5``, ``away_1``..``away_5``),
            ``home_points``, ``away_points`` and ``possessions``.

    Returns:
        ``(stints, player_list, player_idx)``: one row per stint with the
        lineup, summed points and possessions, ``margin`` and
        ``margin_per_100``; the sorted list of all players; and a mapping
        from player to position in that list. Stints without possessions are
        dropped because their per-100 margin is undefined.
    """
    home_cols = [f"home_{slot}" for slot in range(1, 6)]
    away_cols = [f"away_{slot}" for slot in range(1, 6)]
    lineup_cols = home_cols + away_cols

    # Get all players
    all_players = set()
    for col in lineup_cols:
        all_players.update(play_by_play_df[col].dropna().unique())
    player_list = sorted(all_players)
    player_idx = {p: i for i, p in enumerate(player_list)}

    # Calculate per-stint stats: the lineup is taken from the first row of
    # the stint, points and possessions are summed.
    agg_spec = {col: "first" for col in lineup_cols}
    agg_spec.update({
        "home_points": "sum",
        "away_points": "sum",
        "possessions": "sum",
    })
    stints = play_by_play_df.groupby("stint_id").agg(agg_spec)

    # Dividing by zero possessions would yield inf/NaN margins, which break
    # the downstream regression.
    stints = stints[stints["possessions"] > 0].copy()

    stints["margin"] = stints["home_points"] - stints["away_points"]
    stints["margin_per_100"] = stints["margin"] / stints["possessions"] * 100
    return stints, player_list, player_idx
def calculate_rapm(stints_df, player_list, player_idx, alpha_range=(0.01, 100)):
    """Calculate Regularized Adjusted Plus-Minus (RAPM).

    Args:
        stints_df: Per-stint frame with the ten lineup columns,
            ``margin_per_100`` and ``possessions``.
        player_list: All players, in design-matrix column order.
        player_idx: Mapping from player to column position.
        alpha_range: ``(low, high)`` bounds of the ridge penalties searched
            (50 log-spaced values).

    Returns:
        Frame with ``player`` and ``rapm`` columns, best player first.
    """
    n_stints = len(stints_df)
    n_players = len(player_list)
    home_cols = [f"home_{slot}" for slot in range(1, 6)]
    away_cols = [f"away_{slot}" for slot in range(1, 6)]

    # Build sparse design matrix column-by-column: +1 for home players, -1
    # for away players. Rows are addressed by position, so a non-unique or
    # non-integer stint index is fine.
    row_parts, col_parts, val_parts = [], [], []
    for sign, lineup_cols in ((1.0, home_cols), (-1.0, away_cols)):
        for col in lineup_cols:
            # Players missing from player_idx (or NaN slots) map to NaN.
            codes = stints_df[col].map(player_idx).to_numpy(dtype=float)
            present = ~np.isnan(codes)
            row_parts.append(np.flatnonzero(present))
            col_parts.append(codes[present].astype(int))
            val_parts.append(np.full(int(present.sum()), sign))

    X = sparse.csr_matrix(
        (
            np.concatenate(val_parts),
            (np.concatenate(row_parts), np.concatenate(col_parts)),
        ),
        shape=(n_stints, n_players),
    )
    y = stints_df["margin_per_100"].to_numpy()

    # Weight by possessions. Passing sample_weight lets the estimator weight
    # the intercept consistently; pre-multiplying X and y by sqrt(weights)
    # while fitting an unweighted intercept does not.
    possessions = stints_df["possessions"].to_numpy()

    # Ridge regression with cross-validation
    alphas = np.logspace(np.log10(alpha_range[0]), np.log10(alpha_range[1]), 50)
    model = RidgeCV(alphas=alphas, fit_intercept=True)
    model.fit(X, y, sample_weight=possessions)
    print(f"Best alpha: {model.alpha_:.2f}")

    # Extract RAPM values
    rapm = pd.DataFrame({
        "player": player_list,
        "rapm": model.coef_,
    })
    return rapm.sort_values("rapm", ascending=False)
# Calculate RAPM
# NOTE: `pbp_df` is not defined in this snippet; the caller must supply it.
stints, player_list, player_idx = calculate_raw_plus_minus(pbp_df)
rapm_ratings = calculate_rapm(stints, player_list, player_idx)
# Ratings come back sorted descending, so head() is the top and tail() the bottom.
print("Top Players by RAPM:")
print(rapm_ratings.head(20))
print("\nBottom Players by RAPM:")
print(rapm_ratings.tail(10))
Catch Probability Model
Model catch probability for outfield fly balls using Statcast tracking data.
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
def calculate_catch_features(batted_balls_df, sprint_speed=27.0):
    """Calculate features for catch probability.

    Args:
        batted_balls_df: Frame with ``fielder_start_x``, ``fielder_start_y``,
            ``landing_x``, ``landing_y``, ``hit_time`` and ``pitch_time``
            columns. It is not modified; a copy is returned.
        sprint_speed: Fielder speed (distance units per second) used to
            estimate the time needed to reach the ball.

    Returns:
        Copy of the input with ``catch_distance``, ``hang_time``,
        ``time_needed``, ``time_margin``, ``angle_to_ball`` (degrees) and
        ``going_back`` (0/1) columns added.
    """
    df = batted_balls_df.copy()
    delta_x = df["landing_x"] - df["fielder_start_x"]
    delta_y = df["landing_y"] - df["fielder_start_y"]

    # Distance from fielder to ball landing spot
    df["catch_distance"] = np.hypot(delta_x, delta_y)

    # Hang time (time for ball to land)
    # NOTE(review): computed as hit_time - pitch_time; confirm these columns
    # really bracket the ball's flight.
    df["hang_time"] = df["hit_time"] - df["pitch_time"]

    # Time the fielder needs at the assumed sprint speed, and the slack left.
    df["time_needed"] = df["catch_distance"] / sprint_speed
    df["time_margin"] = df["hang_time"] - df["time_needed"]

    # Direction of ball relative to fielder (forward/back/lateral)
    df["angle_to_ball"] = np.degrees(np.arctan2(delta_y, delta_x))

    # Going back is harder
    df["going_back"] = (df["landing_y"] > df["fielder_start_y"]).astype(int)
    return df
def train_catch_probability_model(batted_balls_df):
    """Train catch probability model and rank fielders by Outs Above Average.

    Args:
        batted_balls_df: Frame accepted by ``calculate_catch_features`` that
            also has ``hit_type``, ``launch_angle``, ``exit_velocity``,
            ``fielder_sprint_speed``, ``was_caught`` and ``fielder_name``.

    Returns:
        ``(model, df, fielder_oaa)``: the fitted classifier, the fly-ball
        rows with ``catch_prob`` and ``oaa`` columns, and a per-fielder
        summary sorted by ``OAA`` descending.
    """
    df = calculate_catch_features(batted_balls_df)
    df = df[df["hit_type"] == "fly_ball"]

    features = [
        "catch_distance",
        "hang_time",
        "time_margin",
        "going_back",
        "launch_angle",
        "exit_velocity",
        "fielder_sprint_speed",
    ]
    # Explicit copy: columns are added below and the filtered frame may
    # otherwise trigger pandas' SettingWithCopyWarning.
    df = df.dropna(subset=features + ["was_caught"]).copy()
    X = df[features]
    y = df["was_caught"].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Add predictions for every row (training rows included, so the
    # resulting OAA is partly in-sample).
    df["catch_prob"] = model.predict_proba(X)[:, 1]
    print(f"Model Accuracy: {model.score(X_test, y_test):.3f}")

    # Outs Above Average
    df["oaa"] = df["was_caught"] - df["catch_prob"]

    # Aggregate by fielder
    fielder_oaa = df.groupby("fielder_name").agg(
        OAA=("oaa", "sum"),
        Expected_Catches=("catch_prob", "sum"),
        Opportunities=("catch_prob", "count"),
        Actual_Catches=("was_caught", "sum"),
    )
    fielder_oaa = fielder_oaa.sort_values("OAA", ascending=False)
    return model, df, fielder_oaa
# Example run. NOTE: `batted_balls_df` is not defined in this snippet; the
# caller must supply it.
model, catches_df, fielder_rankings = train_catch_probability_model(batted_balls_df)
# Rankings are already sorted by OAA descending.
print("\nTop Outfielders by Outs Above Average:")
print(fielder_rankings.head(15))
Player Embeddings with Neural Networks
Create player embeddings using neural networks to find similar players.
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
class PlayerEmbeddingNet(nn.Module):
    """Autoencoder to create player embeddings.

    The encoder compresses ``input_dim`` features to ``embedding_dim``
    values; the decoder reconstructs the input from that embedding.
    """

    def __init__(self, input_dim, embedding_dim=32):
        """Build the encoder and decoder stacks.

        Args:
            input_dim: Number of input features per player.
            embedding_dim: Size of the learned embedding.
        """
        super().__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, embedding_dim),
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
        )

    def forward(self, x):
        """Return ``(reconstruction, embedding)`` for the batch ``x``."""
        embedding = self.encoder(x)
        reconstruction = self.decoder(embedding)
        return reconstruction, embedding

    def get_embedding(self, x):
        """Return deterministic embeddings for ``x`` without tracking gradients.

        The module is switched to eval mode for the call so the encoder's
        dropout layer is inactive, then restored to its previous mode.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                return self.encoder(x)
        finally:
            self.train(was_training)
def train_embedding_model(player_df, stat_columns, epochs=100, embedding_dim=16, lr=0.001):
    """Train autoencoder for player embeddings.

    Args:
        player_df: Frame with the ``stat_columns`` and a ``name`` column.
        stat_columns: Numeric columns fed to the autoencoder. Missing values
            are replaced by the column median before scaling.
        epochs: Number of full-batch training steps.
        embedding_dim: Size of the learned embedding.
        lr: Learning rate of the Adam optimizer.

    Returns:
        ``(model, scaler, embedding_df)``: the trained network, the fitted
        scaler, and a frame (indexed like ``player_df``) with one
        ``emb_<i>`` column per embedding dimension plus ``player_name``.
    """
    # Prepare data
    stats = player_df[stat_columns]
    X = stats.fillna(stats.median())
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

    # Model
    model = PlayerEmbeddingNet(len(stat_columns), embedding_dim=embedding_dim)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training: full-batch reconstruction of the scaled stats.
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        reconstruction, _ = model(X_tensor)
        loss = criterion(reconstruction, X_tensor)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # Get embeddings (eval mode disables dropout)
    model.eval()
    embeddings = model.get_embedding(X_tensor).numpy()

    # Create embedding DataFrame
    embedding_df = pd.DataFrame(
        embeddings,
        columns=[f"emb_{i}" for i in range(embeddings.shape[1])],
        index=player_df.index,
    )
    embedding_df["player_name"] = player_df["name"]
    return model, scaler, embedding_df
def find_similar_players(embedding_df, player_name, n=10):
    """Find similar players using cosine similarity of their embeddings.

    Args:
        embedding_df: Frame with ``emb_*`` columns and ``player_name``.
        player_name: Player to look up (first match is used).
        n: Maximum number of similar players to return.

    Returns:
        Frame with ``player_name`` and ``similarity`` (1 - cosine distance)
        for up to ``n`` nearest players, the queried row excluded.

    Raises:
        IndexError: If ``player_name`` is not present.
    """
    emb_cols = [c for c in embedding_df.columns if c.startswith("emb_")]
    # Fit and query with plain arrays so both calls see the same input type.
    vectors = embedding_df[emb_cols].to_numpy()

    # Positional lookup works even when the frame's index has duplicates.
    matches = (embedding_df["player_name"] == player_name).to_numpy().nonzero()[0]
    query_pos = matches[0]

    # Ask for one extra neighbour (the player itself), capped at the number
    # of available rows.
    k = min(n + 1, len(embedding_df))
    # Named `knn` to avoid shadowing the module-level `nn` (torch.nn) alias.
    knn = NearestNeighbors(n_neighbors=k, metric="cosine")
    knn.fit(vectors)
    distances, indices = knn.kneighbors(vectors[query_pos].reshape(1, -1))

    # Exclude self by position instead of assuming it is the first hit
    # (identical embeddings can be returned in any order).
    not_self = indices[0] != query_pos
    neighbor_pos = indices[0][not_self][:n]
    neighbor_dist = distances[0][not_self][:n]

    similar = embedding_df.iloc[neighbor_pos].copy()
    similar["similarity"] = 1 - neighbor_dist
    return similar[["player_name", "similarity"]]
# Usage
# NOTE: `players_df` is not defined in this snippet; the caller must supply it.
stat_cols = ["avg", "obp", "slg", "hr_rate", "bb_rate", "k_rate", "sprint_speed", "war"]
model, scaler, embeddings = train_embedding_model(players_df, stat_cols)
# Find players similar to a specific player
similar = find_similar_players(embeddings, "Mike Trout")
print("Players similar to Mike Trout:")
print(similar)
Pitch Classification Model
Classify pitch types using Statcast data with machine learning.
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
def build_pitch_classifier(pitches_df):
    """Train and evaluate a gradient-boosting pitch-type classifier.

    Rows are restricted to seven common pitch types and to pitches with no
    missing feature values. The data is split 80/20 (stratified by pitch
    type), features are standardised using statistics from the training
    split only, and a ``GradientBoostingClassifier`` is fitted. Accuracy and
    a classification report are printed and a confusion-matrix heatmap is
    drawn on a new matplotlib figure.

    Args:
        pitches_df: DataFrame containing a ``pitch_type`` column and the
            numeric feature columns listed in ``features`` below.

    Returns:
        dict with keys ``model``, ``scaler``, ``encoder``, ``accuracy``,
        ``feature_importance`` (DataFrame sorted by importance, descending)
        and ``confusion_matrix`` (rows = actual, columns = predicted).
    """
    # Features for classification
    features = [
        "release_speed",
        "release_spin_rate",
        "release_extension",
        "release_pos_x",
        "release_pos_z",
        "pfx_x",  # Horizontal movement
        "pfx_z",  # Vertical movement
        "plate_x",
        "plate_z",
        "vx0", "vy0", "vz0",  # Initial velocities
        "ax", "ay", "az",  # Accelerations
    ]

    # Filter to common pitch types and drop incomplete rows
    pitch_types = ["FF", "SL", "CH", "CU", "SI", "FC", "KC"]
    df = pitches_df[pitches_df["pitch_type"].isin(pitch_types)].copy()
    df = df.dropna(subset=features + ["pitch_type"])

    X = df[features]

    # Encode labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(df["pitch_type"])

    # Split BEFORE scaling so that test-set statistics never influence the
    # preprocessing applied to the training data (avoids data leakage).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    # Scale features: fit on the training split, apply to both splits
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train model
    model = GradientBoostingClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    print(f"Accuracy: {accuracy:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Feature importance
    importance = pd.DataFrame({
        "feature": features,
        "importance": model.feature_importances_,
    }).sort_values("importance", ascending=False)

    # Confusion matrix (drawn on a new figure; the caller decides whether to
    # show or save it)
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title("Pitch Type Classification Confusion Matrix")
    plt.ylabel("Actual")
    plt.xlabel("Predicted")

    return {
        "model": model,
        "scaler": scaler,
        "encoder": le,
        "accuracy": accuracy,
        "feature_importance": importance,
        "confusion_matrix": cm,
    }
# Usage: statcast_df is not created in this snippet and must be supplied by
# the caller. The call returns a dict holding the fitted model, scaler,
# label encoder, test accuracy and feature-importance table.
result = build_pitch_classifier(statcast_df)
Expected Goals (xG) Model
Build expected goals model for soccer shot analysis using logistic regression.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
def calculate_shot_features(shots_df, goal_width=7.32):
    """Engineer features for the xG model.

    The goal centre is assumed to sit at ``x=100, y=50``.

    Args:
        shots_df: DataFrame with shot coordinates ``x`` and ``y`` plus
            ``body_part`` and ``situation`` columns. It is not modified.
        goal_width: Distance between the posts, in the same units as the
            coordinates. Defaults to 7.32 (the width in metres).
            NOTE(review): the coordinates look like a 0-100 scale while the
            default is in metres; confirm the units match the data.

    Returns:
        Copy of ``shots_df`` with added columns ``distance``, ``angle``
        (degrees off the goal's central axis), ``goal_angle`` (radians
        subtended by the goal mouth, in ``[0, pi]``), ``distance_sq``,
        ``is_header``, ``is_foot``, ``is_penalty``, ``is_free_kick`` and
        ``is_corner``.
    """
    df = shots_df.copy()

    # Offsets from the goal centre
    dx = 100 - df["x"]
    dy = df["y"] - 50

    # Distance to goal centre
    df["distance"] = np.sqrt(dx ** 2 + dy ** 2)

    # Angle to goal
    df["angle"] = np.abs(np.arctan2(dy, dx)) * 180 / np.pi

    # Goal mouth angle (visible goal width). The denominator is negative for
    # shots taken inside the circle whose diameter is the goal line between
    # the posts; a plain arctan of the ratio would then return a negative
    # angle. arctan2 keeps the result in the correct quadrant.
    df["goal_angle"] = np.arctan2(
        goal_width * dx,
        dx ** 2 + dy ** 2 - (goal_width / 2) ** 2,
    )

    # Distance squared (non-linear effect)
    df["distance_sq"] = df["distance"] ** 2

    # Shot type encoding
    df["is_header"] = (df["body_part"] == "head").astype(int)
    df["is_foot"] = (df["body_part"] == "foot").astype(int)

    # Situation encoding
    df["is_penalty"] = (df["situation"] == "penalty").astype(int)
    df["is_free_kick"] = (df["situation"] == "free_kick").astype(int)
    df["is_corner"] = (df["situation"] == "from_corner").astype(int)
    return df
def train_xg_model(shots_df):
    """Train an expected-goals (xG) model and attach predictions.

    A logistic regression wrapped in 5-fold isotonic calibration is fitted
    on engineered shot features. The cross-validated log loss of that
    calibrated estimator is printed, and a predicted-vs-actual goal-rate
    plot over ten distance bins is drawn on a new matplotlib figure.

    Args:
        shots_df: Shot-level DataFrame accepted by
            ``calculate_shot_features`` that also has an ``is_goal`` column.

    Returns:
        Tuple ``(model, df)``: the fitted ``CalibratedClassifierCV`` and the
        feature DataFrame with an added ``xg`` column. The ``xg`` values are
        in-sample predictions (the model saw these shots during fitting).
    """
    df = calculate_shot_features(shots_df)
    features = [
        "distance", "distance_sq", "angle", "goal_angle",
        "is_header", "is_penalty", "is_free_kick", "is_corner",
    ]
    X = df[features].fillna(0)
    y = df["is_goal"].astype(int)

    # Calibrated logistic regression for accurate probabilities
    base_model = LogisticRegression(max_iter=1000, C=1.0)
    model = CalibratedClassifierCV(base_model, cv=5, method="isotonic")

    # Cross-validate the calibrated estimator itself (cross_val_score works
    # on clones), so the reported loss describes the model that is returned
    # rather than the uncalibrated base model.
    cv_scores = cross_val_score(model, X, y, cv=5, scoring="neg_log_loss")
    print(f"CV Log Loss: {-cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

    model.fit(X, y)

    # Add predictions
    df["xg"] = model.predict_proba(X)[:, 1]

    # xG by distance visualization. observed=False keeps empty bins, which
    # is the historical default for categorical groupers, stated explicitly.
    bins = pd.cut(df["distance"], bins=10)
    xg_by_dist = df.groupby(bins, observed=False).agg({
        "xg": "mean",
        "is_goal": "mean",
    })
    bin_positions = range(len(xg_by_dist))
    plt.figure(figsize=(10, 6))
    plt.plot(bin_positions, xg_by_dist["xg"], "b-", label="Predicted xG")
    plt.plot(bin_positions, xg_by_dist["is_goal"], "r--", label="Actual Goal Rate")
    plt.xlabel("Distance Bin")
    plt.ylabel("Probability")
    plt.legend()
    plt.title("xG Calibration by Distance")
    return model, df
# Fit the model; shots_df is not created in this snippet
model, shots_with_xg = train_xg_model(shots_df)

# Per-player totals: expected goals next to goals actually scored
player_xg = shots_with_xg.groupby("player_name").agg(
    xg=("xg", "sum"),
    goals=("is_goal", "sum"),
)
player_xg["xg_diff"] = player_xg["goals"] - player_xg["xg"]

# Show the 20 players with the highest accumulated xG
print(player_xg.sort_values(by="xg", ascending=False).head(20))
Streak Detection Query
Find winning/losing streaks and hitting streaks.
-- Detect hitting streaks
-- Lists one player's runs of at least 5 consecutive games with a hit in the
-- 2024 season, longest first. Takes one positional parameter: the player_id
-- bound in game_hits. Games with zero at-bats are ignored entirely, so they
-- neither extend nor break a streak.
WITH game_hits AS (
-- Step 1: one row per qualifying game, flagged had_hit (1/0) and numbered
-- in date order per player.
SELECT
player_id,
game_date,
hits,
CASE WHEN hits > 0 THEN 1 ELSE 0 END AS had_hit,
ROW_NUMBER() OVER (PARTITION BY player_id ORDER BY game_date) AS game_num
FROM game_log
WHERE player_id = ?
AND season = 2024
AND at_bats > 0
),
streak_groups AS (
-- Step 2: gaps-and-islands. The overall game number minus the row number
-- within the same had_hit value stays constant across consecutive games
-- that share that value, so the difference labels each streak.
SELECT
*,
game_num - ROW_NUMBER() OVER (
PARTITION BY player_id, had_hit
ORDER BY game_date
) AS streak_group
FROM game_hits
),
streaks AS (
-- Step 3: collapse each labelled run into one row with its date range,
-- length and total hits.
SELECT
player_id,
MIN(game_date) AS streak_start,
MAX(game_date) AS streak_end,
COUNT(*) AS streak_length,
SUM(hits) AS total_hits,
had_hit
FROM streak_groups
GROUP BY player_id, streak_group, had_hit
)
-- Final projection. Because of the had_hit = 1 filter below, streak_type is
-- always 'Hit Streak' here; remove that filter to list hitless runs too.
SELECT
streak_start,
streak_end,
streak_length,
total_hits,
CASE WHEN had_hit = 1 THEN 'Hit Streak' ELSE 'Hitless Streak' END AS streak_type
FROM streaks
WHERE had_hit = 1 -- Only hit streaks
AND streak_length >= 5 -- Minimum 5 games
ORDER BY streak_length DESC;
-- Team winning/losing streak detection
-- Returns the 20 longest runs of at least 5 consecutive wins or consecutive
-- losses in the 2024 season. Only completed games are considered: a game
-- without a final score would otherwise be counted as a loss for both teams.
-- NOTE(review): games are ordered by game_date alone; if a team can play
-- twice on one date, add a tiebreaker column (e.g. a game id or start time)
-- to both ORDER BY clauses to keep the numbering deterministic.
WITH team_games AS (
    -- One row per team per game, numbered in date order for that team
    SELECT
        team_id,
        game_date,
        won,
        ROW_NUMBER() OVER (PARTITION BY team_id ORDER BY game_date) AS game_num
    FROM (
        -- Home side of every game
        SELECT
            home_team_id AS team_id,
            game_date,
            CASE WHEN home_score > away_score THEN 1 ELSE 0 END AS won
        FROM games
        WHERE season = 2024
          AND status = 'Final'
        UNION ALL
        -- Away side of every game
        SELECT
            away_team_id AS team_id,
            game_date,
            CASE WHEN away_score > home_score THEN 1 ELSE 0 END AS won
        FROM games
        WHERE season = 2024
          AND status = 'Final'
    ) all_games
),
streak_groups AS (
    -- Gaps-and-islands: the difference between the overall game number and
    -- the row number within the same result is constant inside one streak.
    SELECT
        *,
        game_num - ROW_NUMBER() OVER (
            PARTITION BY team_id, won ORDER BY game_date
        ) AS streak_group
    FROM team_games
)
SELECT
    t.team_name,
    MIN(sg.game_date) AS streak_start,
    MAX(sg.game_date) AS streak_end,
    COUNT(*) AS streak_length,
    CASE WHEN sg.won = 1 THEN 'Winning' ELSE 'Losing' END AS streak_type
FROM streak_groups sg
JOIN teams t ON sg.team_id = t.team_id
-- Group by team_id as well so two teams sharing a name can never be merged
GROUP BY sg.team_id, t.team_name, sg.streak_group, sg.won
HAVING COUNT(*) >= 5
ORDER BY streak_length DESC
LIMIT 20;
Split Statistics Query
Calculate player splits: home/away and vs. left-/right-handed pitchers.
-- Player performance splits
-- Returns up to four rows for one player in the 2024 season, one per split:
-- 'Home', 'Away', 'vs LHP', 'vs RHP'. Each row carries AB, H, AVG, HR, RBI.
-- The query has four positional parameters (one per UNION ALL branch); bind
-- the same player_id to all of them.
-- Column names of the combined result come from the first branch.
SELECT
p.name,
-- Home/Away splits
'Home' AS split_type,
SUM(CASE WHEN gl.is_home = 1 THEN gl.at_bats ELSE 0 END) AS ab,
SUM(CASE WHEN gl.is_home = 1 THEN gl.hits ELSE 0 END) AS h,
-- NULLIF turns a zero at-bat total into NULL so AVG is NULL, not an error
ROUND(
SUM(CASE WHEN gl.is_home = 1 THEN gl.hits ELSE 0 END) * 1.0 /
NULLIF(SUM(CASE WHEN gl.is_home = 1 THEN gl.at_bats ELSE 0 END), 0),
3
) AS avg,
SUM(CASE WHEN gl.is_home = 1 THEN gl.home_runs ELSE 0 END) AS hr,
SUM(CASE WHEN gl.is_home = 1 THEN gl.rbi ELSE 0 END) AS rbi
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name
UNION ALL
-- Away games (is_home = 0)
SELECT
p.name,
'Away' AS split_type,
SUM(CASE WHEN gl.is_home = 0 THEN gl.at_bats ELSE 0 END),
SUM(CASE WHEN gl.is_home = 0 THEN gl.hits ELSE 0 END),
ROUND(
SUM(CASE WHEN gl.is_home = 0 THEN gl.hits ELSE 0 END) * 1.0 /
NULLIF(SUM(CASE WHEN gl.is_home = 0 THEN gl.at_bats ELSE 0 END), 0),
3
),
SUM(CASE WHEN gl.is_home = 0 THEN gl.home_runs ELSE 0 END),
SUM(CASE WHEN gl.is_home = 0 THEN gl.rbi ELSE 0 END)
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name
UNION ALL
-- vs Left-handed pitchers
-- Handedness is read from the players row of gl.opposing_pitcher_id. The
-- inner join drops game_log rows whose opposing pitcher has no players row.
SELECT
p.name,
'vs LHP' AS split_type,
SUM(CASE WHEN opp.throws = 'L' THEN gl.at_bats ELSE 0 END),
SUM(CASE WHEN opp.throws = 'L' THEN gl.hits ELSE 0 END),
ROUND(
SUM(CASE WHEN opp.throws = 'L' THEN gl.hits ELSE 0 END) * 1.0 /
NULLIF(SUM(CASE WHEN opp.throws = 'L' THEN gl.at_bats ELSE 0 END), 0),
3
),
SUM(CASE WHEN opp.throws = 'L' THEN gl.home_runs ELSE 0 END),
SUM(CASE WHEN opp.throws = 'L' THEN gl.rbi ELSE 0 END)
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
JOIN players opp ON gl.opposing_pitcher_id = opp.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name
UNION ALL
-- vs Right-handed pitchers
SELECT
p.name,
'vs RHP' AS split_type,
SUM(CASE WHEN opp.throws = 'R' THEN gl.at_bats ELSE 0 END),
SUM(CASE WHEN opp.throws = 'R' THEN gl.hits ELSE 0 END),
ROUND(
SUM(CASE WHEN opp.throws = 'R' THEN gl.hits ELSE 0 END) * 1.0 /
NULLIF(SUM(CASE WHEN opp.throws = 'R' THEN gl.at_bats ELSE 0 END), 0),
3
),
SUM(CASE WHEN opp.throws = 'R' THEN gl.home_runs ELSE 0 END),
SUM(CASE WHEN opp.throws = 'R' THEN gl.rbi ELSE 0 END)
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
JOIN players opp ON gl.opposing_pitcher_id = opp.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name;
Fantasy Points Calculation
Calculate fantasy sports points based on scoring rules.
-- Calculate fantasy baseball points (standard scoring)
-- Returns the top 100 players of the 2024 season by total fantasy points,
-- with overall and per-position ranks.
-- Batting and pitching rows are both optional (LEFT JOIN). Each side's
-- points fall back to 0 when its row is missing, and games falls back to
-- pitching games, so a player without a batting row still gets a total and
-- can pass the games filter instead of silently dropping out with NULLs.
-- NOTE(review): a player with several rows in player_positions appears once
-- per position; confirm whether that is intended.
WITH fantasy_scoring AS (
    SELECT
        p.player_id,
        p.name,
        pos.position,
        t.team_name,
        COALESCE(s.games, ps.games) AS games,
        -- Batting points
        COALESCE(
            s.hits * 1 +               -- 1 pt per hit
            s.doubles * 1 +            -- +1 for 2B (total 2)
            s.triples * 2 +            -- +2 for 3B (total 3)
            s.home_runs * 3 +          -- +3 for HR (total 4)
            s.rbi * 1 +                -- 1 pt per RBI
            s.runs * 1 +               -- 1 pt per run
            s.walks * 1 +              -- 1 pt per walk
            s.stolen_bases * 2 +       -- 2 pts per SB
            s.caught_stealing * -1 +   -- -1 per CS
            s.strikeouts * -0.5,       -- -0.5 per K
            0
        ) AS batting_points,
        -- Pitching points
        COALESCE(ps.innings_pitched * 3, 0) +   -- 3 pts per IP
        COALESCE(ps.strikeouts * 1, 0) +        -- 1 pt per K
        COALESCE(ps.wins * 5, 0) +              -- 5 pts per W
        COALESCE(ps.saves * 5, 0) +             -- 5 pts per SV
        COALESCE(ps.earned_runs * -2, 0) +      -- -2 per ER
        COALESCE(ps.walks * -1, 0) +            -- -1 per BB
        COALESCE(ps.hits_allowed * -1, 0)       -- -1 per hit
        AS pitching_points
    FROM players p
    JOIN player_positions pos ON p.player_id = pos.player_id
    JOIN teams t ON p.team_id = t.team_id
    LEFT JOIN batting_stats s ON p.player_id = s.player_id AND s.season = 2024
    LEFT JOIN pitching_stats ps ON p.player_id = ps.player_id AND ps.season = 2024
)
SELECT
    player_id,
    name,
    position,
    team_name,
    games,
    ROUND(batting_points, 1) AS batting_pts,
    ROUND(pitching_points, 1) AS pitching_pts,
    ROUND(batting_points + pitching_points, 1) AS total_fantasy_pts,
    ROUND((batting_points + pitching_points) / NULLIF(games, 0), 2) AS pts_per_game,
    RANK() OVER (ORDER BY batting_points + pitching_points DESC) AS overall_rank,
    RANK() OVER (PARTITION BY position ORDER BY batting_points + pitching_points DESC) AS position_rank
FROM fantasy_scoring
WHERE games >= 20
ORDER BY total_fantasy_pts DESC
LIMIT 100;
Team Standings with Run Differential
Calculate team standings with Pythagorean win expectation.
-- Team standings with Pythagorean expectation
-- One row per team for completed 2024 games: record, run differential,
-- Pythagorean win percentage, expected wins, luck factor (actual minus
-- expected wins) and games behind the division leader.
WITH team_records AS (
    -- Each final game contributes one row per participating team
    SELECT
        t.team_id,
        t.team_name,
        t.division,
        t.league,
        SUM(CASE WHEN g.winner_id = t.team_id THEN 1 ELSE 0 END) AS wins,
        SUM(CASE WHEN g.loser_id = t.team_id THEN 1 ELSE 0 END) AS losses,
        SUM(CASE WHEN g.home_team_id = t.team_id THEN g.home_score ELSE g.away_score END) AS runs_scored,
        SUM(CASE WHEN g.home_team_id = t.team_id THEN g.away_score ELSE g.home_score END) AS runs_allowed
    FROM teams t
    JOIN games g ON t.team_id = g.home_team_id OR t.team_id = g.away_team_id
    WHERE g.season = 2024 AND g.status = 'Final'
    GROUP BY t.team_id, t.team_name, t.division, t.league
),
with_pythag AS (
    -- Pythagorean expectation (exponent 1.83 for baseball), computed once
    -- and reused below. NULLIF guards teams with no runs in either column.
    SELECT
        tr.*,
        POWER(tr.runs_scored, 1.83) /
            NULLIF(POWER(tr.runs_scored, 1.83) + POWER(tr.runs_allowed, 1.83), 0) AS pythag_raw
    FROM team_records tr
)
SELECT
    team_name,
    division,
    wins,
    losses,
    ROUND(wins * 1.0 / NULLIF(wins + losses, 0), 3) AS win_pct,
    runs_scored AS RS,
    runs_allowed AS RA,
    runs_scored - runs_allowed AS run_diff,
    ROUND(pythag_raw, 3) AS pythag_pct,
    ROUND((wins + losses) * pythag_raw, 0) AS expected_wins,
    wins - ROUND((wins + losses) * pythag_raw, 0) AS luck_factor,
    -- Games behind division leader:
    --   GB = ((leader_wins - wins) + (losses - leader_losses)) / 2
    --      = ((leader_wins - leader_losses) - (wins - losses)) / 2
    -- The leader is the team with the best (wins - losses) in the division,
    -- so a single window MAX gives a consistent leader. Taking MAX(wins)
    -- and MIN(losses) separately can mix two different teams.
    (MAX(wins - losses) OVER (PARTITION BY league, division) - (wins - losses)) / 2.0 AS games_behind
FROM with_pythag
ORDER BY division, wins DESC;
Park Factor Calculation
Calculate park factors to adjust for home ballpark effects.
-- Calculate park factors for each ballpark
-- For every (venue, home team) pair with completed 2024 games, compares the
-- combined per-game output (runs, hits, HR by both sides) in that team's
-- home games at the venue with the same team's road games. A factor above
-- 1.0 means more of that event per game at the park than on the road.
WITH park_stats AS (
    -- Combined totals of both teams in games hosted at each venue
    SELECT
        v.venue_id,
        v.venue_name,
        t.team_id AS home_team_id,
        t.team_name AS home_team,
        SUM(g.home_score + g.away_score) AS total_runs,
        SUM(g.home_hits + g.away_hits) AS total_hits,
        SUM(g.home_hr + g.away_hr) AS total_hr,
        COUNT(*) AS home_games
    FROM games g
    JOIN venues v ON g.venue_id = v.venue_id
    JOIN teams t ON g.home_team_id = t.team_id
    WHERE g.season = 2024
      AND g.status = 'Final'
    GROUP BY v.venue_id, v.venue_name, t.team_id, t.team_name
),
road_stats AS (
    -- Combined totals of both teams in each team's away games. The join is
    -- on away_team_id, so every joined row is a road game for that team.
    SELECT
        t.team_id,
        t.team_name,
        SUM(g.home_score + g.away_score) AS total_runs,
        SUM(g.home_hits + g.away_hits) AS total_hits,
        SUM(g.home_hr + g.away_hr) AS total_hr,
        COUNT(*) AS road_games
    FROM games g
    JOIN teams t ON g.away_team_id = t.team_id
    WHERE g.season = 2024
      AND g.status = 'Final'
    GROUP BY t.team_id, t.team_name
)
SELECT
    ps.venue_name,
    ps.home_team,
    ps.home_games,
    -- "* 1.0" forces decimal division; on engines with integer division the
    -- per-game averages would otherwise be truncated before the ratio.
    -- NULLIF yields NULL instead of an error when the road average is zero.
    -- Runs park factor
    ROUND(
        (ps.total_runs * 1.0 / ps.home_games) /
        NULLIF(rs.total_runs * 1.0 / rs.road_games, 0),
        3
    ) AS runs_pf,
    -- Hits park factor
    ROUND(
        (ps.total_hits * 1.0 / ps.home_games) /
        NULLIF(rs.total_hits * 1.0 / rs.road_games, 0),
        3
    ) AS hits_pf,
    -- HR park factor
    ROUND(
        (ps.total_hr * 1.0 / ps.home_games) /
        NULLIF(rs.total_hr * 1.0 / rs.road_games, 0),
        3
    ) AS hr_pf,
    -- Average runs / home runs per game at the park
    ROUND(ps.total_runs * 1.0 / ps.home_games, 2) AS runs_per_game,
    ROUND(ps.total_hr * 1.0 / ps.home_games, 2) AS hr_per_game
FROM park_stats ps
-- Join on the team id carried through park_stats rather than on team name
JOIN road_stats rs ON ps.home_team_id = rs.team_id
ORDER BY runs_pf DESC;
Advanced Pitching Metrics
Calculate FIP, xFIP, and other advanced pitching metrics.
-- Calculate FIP and advanced pitching metrics
-- Returns the 50 pitchers with the lowest FIP among those with at least 50
-- innings pitched in 2024, together with xFIP, rate stats, WHIP and BABIP.
-- NOTE(review): innings_pitched is used arithmetically; if it is stored in
-- box-score notation (e.g. 6.1 meaning 6 1/3) convert it before dividing.
WITH league_constants AS (
    SELECT
        season,
        -- FIP constant = lgERA - ((13*lgHR + 3*(lgBB + lgHBP) - 2*lgK) / lgIP)
        -- League ERA is derived from league totals (9 * ER / IP). Averaging
        -- the per-pitcher era column would give a low-inning pitcher the
        -- same weight as a full-time starter.
        9.0 * SUM(earned_runs) / NULLIF(SUM(innings_pitched), 0) - (
            (13 * SUM(home_runs) + 3 * (SUM(walks) + SUM(hbp)) - 2 * SUM(strikeouts)) * 1.0 /
            NULLIF(SUM(innings_pitched), 0)
        ) AS fip_constant,
        -- League HR/FB rate for xFIP
        SUM(home_runs) * 1.0 / NULLIF(SUM(fly_balls), 0) AS lg_hr_fb_rate
    FROM pitching_stats
    WHERE season = 2024
    GROUP BY season
),
pitcher_metrics AS (
    SELECT
        p.player_id,
        p.name,
        t.team_name,
        ps.season,
        ps.games,
        ps.games_started,
        ps.innings_pitched AS ip,
        ps.wins,
        ps.losses,
        ps.saves,
        ps.strikeouts AS k,
        ps.walks AS bb,
        ps.hbp,
        ps.home_runs AS hr,
        ps.earned_runs,
        ps.fly_balls,
        ps.era,
        -- FIP = ((13*HR + 3*(BB+HBP) - 2*K) / IP) + FIP_constant
        -- ("* 1.0" keeps the division decimal on engines that would
        -- otherwise perform integer division)
        ROUND(
            ((13 * ps.home_runs + 3 * (ps.walks + ps.hbp) - 2 * ps.strikeouts) * 1.0 /
             NULLIF(ps.innings_pitched, 0)) + lc.fip_constant,
            2
        ) AS fip,
        -- xFIP replaces actual HR with fly balls * league average HR/FB rate
        ROUND(
            ((13 * (ps.fly_balls * lc.lg_hr_fb_rate) + 3 * (ps.walks + ps.hbp) - 2 * ps.strikeouts) * 1.0 /
             NULLIF(ps.innings_pitched, 0)) + lc.fip_constant,
            2
        ) AS xfip,
        -- K/9
        ROUND(ps.strikeouts * 9.0 / NULLIF(ps.innings_pitched, 0), 2) AS k_9,
        -- BB/9
        ROUND(ps.walks * 9.0 / NULLIF(ps.innings_pitched, 0), 2) AS bb_9,
        -- K/BB ratio
        ROUND(ps.strikeouts * 1.0 / NULLIF(ps.walks, 0), 2) AS k_bb,
        -- HR/9
        ROUND(ps.home_runs * 9.0 / NULLIF(ps.innings_pitched, 0), 2) AS hr_9,
        -- WHIP = (BB + H) / IP
        -- NOTE(review): this query reads ps.hits while the fantasy-points
        -- query reads hits_allowed from the same table; confirm the column.
        ROUND((ps.walks + ps.hits) * 1.0 / NULLIF(ps.innings_pitched, 0), 3) AS whip,
        -- BABIP = (H - HR) / (AB - K - HR + SF)
        ROUND(
            (ps.hits - ps.home_runs) * 1.0 /
            NULLIF(ps.at_bats - ps.strikeouts - ps.home_runs + ps.sacrifice_flies, 0),
            3
        ) AS babip
    FROM pitching_stats ps
    JOIN players p ON ps.player_id = p.player_id
    JOIN teams t ON ps.team_id = t.team_id
    -- league_constants has a single row (one season), so this adds columns
    -- without multiplying rows
    CROSS JOIN league_constants lc
    WHERE ps.season = 2024
      AND ps.innings_pitched >= 50 -- Minimum IP qualifier
)
SELECT *
FROM pitcher_metrics
ORDER BY fip ASC
LIMIT 50;
Pitching Matchup Analysis
Analyze batter vs pitcher historical matchups.
-- Batter vs Pitcher matchup history
-- Summarises every recorded plate appearance between one batter and one
-- pitcher. Two positional parameters: batter_id first, then pitcher_id.
-- Produces a single row, and only when the pair has met at least 5 times.
-- Uses MySQL-style date functions (YEAR, CURDATE).
SELECT
b.name AS batter_name,
p.name AS pitcher_name,
COUNT(*) AS plate_appearances,
-- Outcome counts, one per event_type value
SUM(CASE WHEN pa.event_type = 'single' THEN 1 ELSE 0 END) AS singles,
SUM(CASE WHEN pa.event_type = 'double' THEN 1 ELSE 0 END) AS doubles,
SUM(CASE WHEN pa.event_type = 'triple' THEN 1 ELSE 0 END) AS triples,
SUM(CASE WHEN pa.event_type = 'home_run' THEN 1 ELSE 0 END) AS home_runs,
SUM(CASE WHEN pa.event_type IN ('single','double','triple','home_run') THEN 1 ELSE 0 END) AS hits,
SUM(CASE WHEN pa.event_type = 'strikeout' THEN 1 ELSE 0 END) AS strikeouts,
SUM(CASE WHEN pa.event_type = 'walk' THEN 1 ELSE 0 END) AS walks,
-- At bats (exclude walks, HBP, sac)
-- The exclusion itself is encoded upstream in the is_at_bat flag.
SUM(CASE WHEN pa.is_at_bat = 1 THEN 1 ELSE 0 END) AS at_bats,
-- Batting average
-- hits / at_bats; NULLIF makes the result NULL when there are no at-bats
ROUND(
SUM(CASE WHEN pa.event_type IN ('single','double','triple','home_run') THEN 1 ELSE 0 END) * 1.0 /
NULLIF(SUM(CASE WHEN pa.is_at_bat = 1 THEN 1 ELSE 0 END), 0),
3
) AS avg,
-- Slugging
-- total bases (1/2/3/4 per single/double/triple/home run) / at_bats
ROUND(
(SUM(CASE WHEN pa.event_type = 'single' THEN 1 ELSE 0 END) +
SUM(CASE WHEN pa.event_type = 'double' THEN 2 ELSE 0 END) +
SUM(CASE WHEN pa.event_type = 'triple' THEN 3 ELSE 0 END) +
SUM(CASE WHEN pa.event_type = 'home_run' THEN 4 ELSE 0 END)) * 1.0 /
NULLIF(SUM(CASE WHEN pa.is_at_bat = 1 THEN 1 ELSE 0 END), 0),
3
) AS slg,
-- Recent form (last 2 seasons)
-- i.e. plate appearances whose season is the current or previous calendar year
SUM(CASE WHEN pa.season >= YEAR(CURDATE()) - 1 THEN 1 ELSE 0 END) AS recent_pa
FROM plate_appearances pa
JOIN players b ON pa.batter_id = b.player_id
JOIN players p ON pa.pitcher_id = p.player_id
WHERE pa.batter_id = ?
AND pa.pitcher_id = ?
GROUP BY b.name, p.name
-- NOTE(review): HAVING refers to the select-list alias, which MySQL accepts;
-- other engines may require HAVING COUNT(*) >= 5 instead.
HAVING plate_appearances >= 5;
Rolling Statistics Window
Calculate rolling averages over game windows for trend analysis.
-- Rolling 20-game batting average
-- One row per game (with at least one at-bat) for a single player in 2024,
-- showing rolling 20-game totals and average next to the season-to-date
-- average. One positional parameter: the player_id.
WITH game_stats AS (
-- Qualifying games numbered 1..N in date order
SELECT
gl.player_id,
gl.game_date,
gl.at_bats,
gl.hits,
ROW_NUMBER() OVER (
PARTITION BY gl.player_id
ORDER BY gl.game_date
) AS game_num
FROM game_log gl
WHERE gl.player_id = ?
AND gl.season = 2024
AND gl.at_bats > 0
)
SELECT
game_date,
at_bats,
hits,
game_num,
-- Rolling 20-game totals
-- The frame is the current game plus up to 19 earlier ones, so the first
-- 19 rows cover fewer than 20 games.
SUM(hits) OVER (
ORDER BY game_num
ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
) AS rolling_hits,
SUM(at_bats) OVER (
ORDER BY game_num
ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
) AS rolling_ab,
-- Rolling 20-game average
-- rolling hits / rolling at-bats over the same frame as above
ROUND(
SUM(hits) OVER (
ORDER BY game_num
ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
) * 1.0 /
NULLIF(SUM(at_bats) OVER (
ORDER BY game_num
ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
), 0),
3
) AS rolling_avg,
-- Season-to-date average
-- No frame clause: with ORDER BY the default frame runs from the first row
-- through the current row, giving cumulative totals.
ROUND(
SUM(hits) OVER (ORDER BY game_num) * 1.0 /
NULLIF(SUM(at_bats) OVER (ORDER BY game_num), 0),
3
) AS season_avg
FROM game_stats
ORDER BY game_date;
Player Comparison Query
Compare two players head-to-head across multiple statistics.
-- Side-by-side player comparison
-- Produces one row per metric with the two players' 2024 values in the
-- player1 / player2 columns; the first row carries the players' names.
-- Parameters (14 positional placeholders): bind (player1_id, player2_id)
-- for the IN list, then the same pair again for each of the six result
-- rows plus the header row.
-- The CTE is named "compared" rather than "player_stats" so it does not
-- shadow the base table it reads from; a CTE sharing its source table's
-- name is ambiguous and may be rejected by some engines.
WITH compared AS (
    SELECT
        p.player_id,
        p.name,
        p.position,
        s.season,
        s.games,
        s.at_bats,
        s.hits,
        ROUND(s.hits * 1.0 / NULLIF(s.at_bats, 0), 3) AS avg,
        s.home_runs,
        s.rbi,
        s.stolen_bases,
        s.walks,
        s.strikeouts,
        -- NULLIF guards players with zero plate appearances, matching the
        -- guard already used for avg
        ROUND(s.walks * 1.0 / NULLIF(s.plate_appearances, 0) * 100, 1) AS bb_pct,
        ROUND(s.strikeouts * 1.0 / NULLIF(s.plate_appearances, 0) * 100, 1) AS k_pct,
        s.war
    FROM players p
    JOIN player_stats s ON p.player_id = s.player_id
    WHERE p.player_id IN (?, ?) -- Two player IDs to compare
      AND s.season = 2024
)
-- Header row: metric label plus each player's name.
-- MAX(CASE ...) pivots the (at most) two CTE rows into two columns.
SELECT
    'Statistic' AS metric,
    MAX(CASE WHEN player_id = ? THEN name END) AS player1,
    MAX(CASE WHEN player_id = ? THEN name END) AS player2
FROM compared
UNION ALL
-- Values are cast to CHAR so every row shares the header row's text type
SELECT 'Games',
    MAX(CASE WHEN player_id = ? THEN CAST(games AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(games AS CHAR) END)
FROM compared
UNION ALL
SELECT 'AVG',
    MAX(CASE WHEN player_id = ? THEN CAST(avg AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(avg AS CHAR) END)
FROM compared
UNION ALL
SELECT 'Home Runs',
    MAX(CASE WHEN player_id = ? THEN CAST(home_runs AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(home_runs AS CHAR) END)
FROM compared
UNION ALL
SELECT 'RBI',
    MAX(CASE WHEN player_id = ? THEN CAST(rbi AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(rbi AS CHAR) END)
FROM compared
UNION ALL
SELECT 'WAR',
    MAX(CASE WHEN player_id = ? THEN CAST(war AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(war AS CHAR) END)
FROM compared;
Player Career Statistics
Aggregate career statistics for a player across all seasons.
-- Career statistics with season-by-season breakdown
-- First branch: one career summary row for the player. Second branch: one
-- row per season, with the season label in the name column. Two positional
-- parameters; bind the same player_id to both. Uses MySQL-style functions
-- (TIMESTAMPDIFF, CURDATE, CONCAT).
-- OBP is computed as (H + BB + HBP) / (AB + BB + HBP + SF) in both branches,
-- the same formula the season-leaders query uses.
-- NOTE(review): the summary row and the first season row share the same
-- first_season value, so their relative order is not guaranteed.
SELECT
    p.player_id,
    p.name,
    p.birth_date,
    TIMESTAMPDIFF(YEAR, p.birth_date, CURDATE()) AS current_age,
    MIN(s.season) AS first_season,
    MAX(s.season) AS last_season,
    COUNT(DISTINCT s.season) AS seasons_played,
    SUM(s.games) AS total_games,
    SUM(s.plate_appearances) AS total_pa,
    SUM(s.at_bats) AS total_ab,
    SUM(s.hits) AS total_hits,
    SUM(s.doubles) AS total_2b,
    SUM(s.triples) AS total_3b,
    SUM(s.home_runs) AS total_hr,
    SUM(s.rbi) AS total_rbi,
    SUM(s.stolen_bases) AS total_sb,
    SUM(s.walks) AS total_bb,
    SUM(s.strikeouts) AS total_so,
    ROUND(SUM(s.hits) * 1.0 / NULLIF(SUM(s.at_bats), 0), 3) AS career_avg,
    ROUND((SUM(s.hits) + SUM(s.walks) + SUM(s.hbp)) * 1.0 /
        NULLIF(SUM(s.at_bats) + SUM(s.walks) + SUM(s.hbp) + SUM(s.sacrifice_flies), 0), 3) AS career_obp,
    -- Total bases = H + 2B + 2*3B + 3*HR (each hit already counts one base)
    ROUND((SUM(s.hits) + SUM(s.doubles) + 2*SUM(s.triples) + 3*SUM(s.home_runs)) * 1.0 /
        NULLIF(SUM(s.at_bats), 0), 3) AS career_slg,
    ROUND(SUM(s.war), 1) AS career_war
FROM players p
JOIN player_stats s ON p.player_id = s.player_id
WHERE p.player_id = ? -- Parameter for specific player
GROUP BY p.player_id, p.name, p.birth_date
UNION ALL
-- Season breakdown
-- Column positions must line up with the summary branch; the aliases used
-- here are informational only (result names come from the first branch).
SELECT
    NULL AS player_id,
    CONCAT(' ', s.season, ' Season') AS name,
    NULL AS birth_date,
    NULL AS current_age,
    s.season AS first_season,
    s.season AS last_season,
    1 AS seasons_played,
    s.games,
    s.plate_appearances,
    s.at_bats,
    s.hits,
    s.doubles,
    s.triples,
    s.home_runs,
    s.rbi,
    s.stolen_bases,
    s.walks,
    s.strikeouts,
    ROUND(s.hits * 1.0 / NULLIF(s.at_bats, 0), 3) AS season_avg,
    ROUND((s.hits + s.walks + s.hbp) * 1.0 /
        NULLIF(s.at_bats + s.walks + s.hbp + s.sacrifice_flies, 0), 3) AS season_obp,
    ROUND((s.hits + s.doubles + 2*s.triples + 3*s.home_runs) * 1.0 /
        NULLIF(s.at_bats, 0), 3) AS season_slg,
    ROUND(s.war, 1) AS season_war
FROM player_stats s
WHERE s.player_id = ?
ORDER BY first_season;
Season Leaders Query
SQL query to find statistical leaders across batting categories with minimum qualifications.
-- Find batting leaders with minimum PA qualification
-- Returns the top 20 qualified batters of 2024 by batting average, with
-- OBP, SLG, OPS and each batter's rank in AVG, HR and RBI.
WITH qualified_batters AS (
-- Season lines for batters meeting the plate-appearance threshold, with
-- rate stats rounded to three decimals
SELECT
p.player_id,
p.name,
t.team_name,
s.season,
s.games,
s.plate_appearances,
s.at_bats,
s.hits,
s.doubles,
s.triples,
s.home_runs,
s.rbi,
s.stolen_bases,
s.walks,
s.strikeouts,
ROUND(s.hits * 1.0 / NULLIF(s.at_bats, 0), 3) AS batting_avg,
-- OBP = (H + BB + HBP) / (AB + BB + HBP + SF)
ROUND((s.hits + s.walks + s.hbp) * 1.0 /
NULLIF(s.at_bats + s.walks + s.hbp + s.sacrifice_flies, 0), 3) AS obp,
-- SLG = total bases / AB, with total bases = H + 2B + 2*3B + 3*HR
ROUND((s.hits + s.doubles + 2*s.triples + 3*s.home_runs) * 1.0 /
NULLIF(s.at_bats, 0), 3) AS slg
FROM player_stats s
JOIN players p ON s.player_id = p.player_id
JOIN teams t ON s.team_id = t.team_id
WHERE s.season = 2024
-- Qualification: at least 3.1 plate appearances per team game, using the
-- largest team_games value recorded in team_standings for 2024
AND s.plate_appearances >= (SELECT MAX(team_games) * 3.1 FROM team_standings WHERE season = 2024)
)
SELECT
player_id,
name,
team_name,
batting_avg,
obp,
slg,
-- OPS is the sum of the already-rounded obp and slg values
ROUND(obp + slg, 3) AS ops,
home_runs,
rbi,
stolen_bases,
-- Ranks are computed over all qualified batters, before LIMIT is applied
RANK() OVER (ORDER BY batting_avg DESC) AS avg_rank,
RANK() OVER (ORDER BY home_runs DESC) AS hr_rank,
RANK() OVER (ORDER BY rbi DESC) AS rbi_rank
FROM qualified_batters
ORDER BY batting_avg DESC
LIMIT 20;
Performance Testing for Analytics
Performance and load testing for sports analytics functions and APIs.
import pytest
import time
import pandas as pd
import numpy as np
from functools import wraps
from concurrent.futures import ThreadPoolExecutor
import psutil
import tracemalloc
from sports_analytics import (
calculate_war_batch,
generate_projections,
simulate_season
)
def timing_decorator(func):
    """Wrap *func* so that every call prints how long it took.

    The wrapped function's return value is passed through unchanged and its
    metadata (name, docstring) is preserved via ``functools.wraps``.
    """
    @wraps(func)
    def timed(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        print(f"{func.__name__} took {elapsed:.4f} seconds")
        return outcome

    return timed
@pytest.fixture
def large_dataset():
    """Generate a large synthetic batting dataset (100,000 rows, fixed seed).

    Defined at module level so that every test class in this module can
    request it; a fixture defined as a method is only visible to tests of
    the class that owns it.
    """
    np.random.seed(42)
    n = 100000
    return pd.DataFrame({
        "player_id": range(n),
        "games": np.random.randint(1, 162, n),
        "at_bats": np.random.randint(100, 600, n),
        "hits": np.random.randint(20, 200, n),
        "home_runs": np.random.randint(0, 50, n),
        "rbi": np.random.randint(0, 130, n),
        "walks": np.random.randint(10, 100, n),
        "strikeouts": np.random.randint(30, 200, n),
    })


class TestPerformance:
    """Performance tests for analytics functions."""

    @pytest.mark.performance
    def test_war_calculation_performance(self, large_dataset):
        """WAR calculation should finish within 5 seconds and keep every row."""
        start = time.perf_counter()
        result = calculate_war_batch(large_dataset)
        duration = time.perf_counter() - start
        assert duration < 5.0, f"WAR calculation took {duration:.2f}s (max 5s)"
        assert len(result) == len(large_dataset)

    @pytest.mark.performance
    def test_projection_memory_usage(self, large_dataset):
        """Peak traced memory during projections must stay under 500 MB."""
        tracemalloc.start()
        try:
            generate_projections(large_dataset)
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing, even if the projection call raises, so
            # later tests do not run with allocation tracing still enabled.
            tracemalloc.stop()
        peak_mb = peak / 1024 / 1024
        assert peak_mb < 500, f"Peak memory {peak_mb:.1f}MB exceeds 500MB limit"
        print(f"Peak memory usage: {peak_mb:.1f}MB")

    @pytest.mark.performance
    def test_simulation_scalability(self):
        """Season simulation time should grow roughly linearly with n_games."""
        times = []
        for n_games in [100, 500, 1000, 2000]:
            start = time.perf_counter()
            simulate_season(n_games=n_games)
            times.append((n_games, time.perf_counter() - start))
        # Check roughly linear scaling: doubling the work (1000 -> 2000
        # games) should take less than 2.5x as long.
        ratio = times[-1][1] / times[-2][1]
        assert ratio < 2.5, f"Scaling ratio {ratio:.2f} suggests non-linear performance"

    @pytest.mark.performance
    def test_concurrent_requests(self):
        """50 requests across 10 worker threads should all succeed quickly."""
        from sports_api import APIClient

        client = APIClient()

        def make_request(player_id):
            """Return (elapsed seconds, response) for one player lookup."""
            start = time.perf_counter()
            result = client.get_player_stats(player_id)
            return time.perf_counter() - start, result

        player_ids = list(range(1, 51))  # 50 concurrent requests
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(make_request, player_ids))
        times = [r[0] for r in results]
        avg_time = sum(times) / len(times)
        assert avg_time < 1.0, f"Average request time {avg_time:.2f}s too slow"
        assert all(r[1] is not None for r in results), "Some requests failed"


@pytest.fixture(scope="session")
def benchmark_baseline():
    """Store baseline performance metrics."""
    return {
        "war_calculation": 2.0,  # seconds
        "projection_memory": 300,  # MB
        "api_response": 0.5,  # seconds
    }


class TestBenchmarks:
    """Benchmark tests against baseline."""

    @pytest.mark.benchmark
    def test_against_baseline(self, benchmark_baseline, large_dataset):
        """WAR calculation may be at most 20% slower than the baseline."""
        start = time.perf_counter()
        calculate_war_batch(large_dataset)
        duration = time.perf_counter() - start
        baseline = benchmark_baseline["war_calculation"]
        assert duration <= baseline * 1.2, (
            f"Performance regression: {duration:.2f}s vs baseline {baseline}s"
        )
Mock Testing for External Services
Test sports analytics code that depends on external APIs using mocks.
import pytest
from unittest.mock import Mock, patch, MagicMock
import pandas as pd
from datetime import datetime
from sports_service import (
PlayerStatsService,
GameDataService,
LeaderboardService
)
class TestPlayerStatsService:
    """Exercise PlayerStatsService against a mocked API client and cache."""

    @pytest.fixture
    def mock_api(self):
        """API double that returns one fixed player and one fixed stat line."""
        player_payload = {
            "id": 12345,
            "name": "Test Player",
            "team": "Test Team",
            "position": "OF"
        }
        stats_payload = {
            "batting_avg": 0.300,
            "home_runs": 25,
            "rbi": 80,
            "war": 4.5
        }
        api = Mock()
        api.get_player.return_value = player_payload
        api.get_stats.return_value = stats_payload
        return api

    @pytest.fixture
    def mock_cache(self):
        """Cache double whose lookups miss unless a test overrides them."""
        cache = Mock()
        cache.get.return_value = None
        return cache

    @pytest.fixture
    def stats_service(self, mock_api, mock_cache):
        """Service under test, wired to the two doubles above."""
        return PlayerStatsService(api=mock_api, cache=mock_cache)

    def test_get_player_stats(self, stats_service, mock_api):
        """A cache miss yields the API's player and stat fields."""
        fetched = stats_service.get_player_stats(player_id=12345)
        assert fetched["name"] == "Test Player"
        assert fetched["batting_avg"] == 0.300
        mock_api.get_player.assert_called_once_with(12345)

    def test_uses_cache_when_available(self, stats_service, mock_api, mock_cache):
        """A cache hit is returned as-is and the API is never contacted."""
        mock_cache.get.return_value = {"name": "Cached Player", "batting_avg": 0.280}
        fetched = stats_service.get_player_stats(player_id=12345)
        assert fetched["name"] == "Cached Player"
        mock_api.get_player.assert_not_called()

    def test_caches_api_response(self, stats_service, mock_api, mock_cache):
        """After a miss, the result is stored once under a key naming the player."""
        stats_service.get_player_stats(player_id=12345)
        mock_cache.set.assert_called_once()
        positional_args = mock_cache.set.call_args[0]
        # Expect exactly (key, value) as positional arguments
        cached_key, _cached_value = positional_args
        assert "12345" in cached_key

    def test_handles_api_error(self, stats_service, mock_api):
        """An API failure surfaces as the service's own ServiceError."""
        mock_api.get_player.side_effect = Exception("API Error")
        expected_error = stats_service.ServiceError
        with pytest.raises(expected_error):
            stats_service.get_player_stats(player_id=12345)
class TestGameDataService:
    """Exercise GameDataService against mocked schedule and odds clients."""

    @pytest.fixture
    def mock_schedule_api(self):
        """Schedule double returning two fixed games."""
        schedule = Mock()
        first_game = {"game_id": "g1", "home": "Team A", "away": "Team B", "time": "19:00"}
        second_game = {"game_id": "g2", "home": "Team C", "away": "Team D", "time": "20:00"}
        schedule.get_games.return_value = [first_game, second_game]
        return schedule

    @pytest.fixture
    def mock_odds_api(self):
        """Odds double returning moneylines keyed by game id."""
        odds = Mock()
        odds.get_odds.return_value = {
            "g1": {"home": -150, "away": 130},
            "g2": {"home": 110, "away": -120},
        }
        return odds

    @pytest.fixture
    def game_service(self, mock_schedule_api, mock_odds_api):
        """Service under test, wired to the two doubles above."""
        return GameDataService(schedule_api=mock_schedule_api, odds_api=mock_odds_api)

    def test_get_games_with_odds(self, game_service):
        """Each scheduled game comes back with its odds attached."""
        combined = game_service.get_todays_games_with_odds()
        assert len(combined) == 2
        first, second = combined[0], combined[1]
        assert first["odds"]["home"] == -150
        assert second["odds"]["away"] == -120

    @patch("sports_service.datetime")
    def test_filters_by_date(self, mock_datetime, game_service, mock_schedule_api):
        """The schedule is requested for the (patched) current date."""
        frozen_now = datetime(2024, 4, 15)
        mock_datetime.now.return_value = frozen_now
        game_service.get_todays_games_with_odds()
        mock_schedule_api.get_games.assert_called_with(date="2024-04-15")
class TestLeaderboardService:
    """Test leaderboard calculations with mocked data."""

    @pytest.fixture
    def mock_db(self):
        """Create a mock database whose query() returns three fixed rows."""
        mock = MagicMock()
        # Mock query results
        mock.query.return_value = pd.DataFrame({
            "player_id": [1, 2, 3],
            "name": ["Player A", "Player B", "Player C"],
            "stat_value": [0.320, 0.305, 0.298],
        })
        return mock

    def test_get_batting_leaders(self, mock_db):
        """The leaderboard returns the queried rows, best first, in one query."""
        service = LeaderboardService(db=mock_db)
        leaders = service.get_batting_leaders(stat="avg", limit=10)
        assert len(leaders) == 3
        assert leaders.iloc[0]["stat_value"] == 0.320
        mock_db.query.assert_called_once()

    def test_leaderboard_respects_limit(self, mock_db):
        """The limit reaches the database, in the SQL text or as a keyword."""
        service = LeaderboardService(db=mock_db)
        service.get_batting_leaders(stat="hr", limit=5)
        args, kwargs = mock_db.query.call_args
        # Only inspect the first positional argument when there is one.
        # Indexing it unconditionally raises IndexError for a keyword-only
        # call, which would make the keyword fallback unreachable.
        sql = args[0] if args else ""
        assert "LIMIT 5" in str(sql) or kwargs.get("limit") == 5
Property-Based Testing with Hypothesis
Use property-based testing to find edge cases in sports analytics functions.
import pytest
from hypothesis import given, strategies as st, assume, settings
from hypothesis.extra.pandas import columns, data_frames, column
import pandas as pd
import numpy as np
# Import functions to test
from sports_analytics import (
calculate_batting_avg,
calculate_obp,
calculate_slg,
calculate_ops,
normalize_stats
)
class TestBattingStatsProperties:
    """Property-based tests for batting statistics."""

    @given(
        hits=st.integers(min_value=0, max_value=300),
        at_bats=st.integers(min_value=1, max_value=700)
    )
    def test_batting_avg_bounds(self, hits, at_bats):
        """Batting average must be between 0 and 1."""
        assume(hits <= at_bats)  # Hits can't exceed at bats
        avg = calculate_batting_avg(hits, at_bats)
        assert 0 <= avg <= 1
        assert avg == hits / at_bats

    @given(
        hits=st.integers(min_value=0, max_value=200),
        walks=st.integers(min_value=0, max_value=150),
        hbp=st.integers(min_value=0, max_value=20),
        at_bats=st.integers(min_value=1, max_value=600),
        sf=st.integers(min_value=0, max_value=15)
    )
    def test_obp_greater_than_avg(self, hits, walks, hbp, at_bats, sf):
        """Walks and HBP never lower OBP; OBP >= AVG only without sacrifice flies.

        Sacrifice flies count in the OBP denominator but not in at-bats, so
        OBP can legitimately fall below AVG (H=1, AB=1, SF=1 gives OBP .500
        against AVG 1.000). The unconditional bound is therefore
        ``H / (AB + SF)``; the comparison with AVG is asserted only when
        ``sf == 0``.

        NOTE(review): assumes ``calculate_obp`` implements the standard
        ``(H + BB + HBP) / (AB + BB + HBP + SF)`` without rounding — confirm
        against ``sports_analytics``.
        """
        assume(hits <= at_bats)
        avg = calculate_batting_avg(hits, at_bats)
        obp = calculate_obp(hits, walks, hbp, at_bats, sf)

        tolerance = 1e-9  # absorbs float noise only, not a real violation
        floor_without_walks = hits / (at_bats + sf)
        assert obp >= floor_without_walks - tolerance
        if sf == 0:
            assert obp >= avg - tolerance

    @given(
        singles=st.integers(min_value=0, max_value=150),
        doubles=st.integers(min_value=0, max_value=50),
        triples=st.integers(min_value=0, max_value=15),
        home_runs=st.integers(min_value=0, max_value=60),
        at_bats=st.integers(min_value=1, max_value=600)
    )
    def test_slg_greater_than_avg(self, singles, doubles, triples, home_runs, at_bats):
        """SLG must be >= AVG (extra base hits add value)."""
        hits = singles + doubles + triples + home_runs
        assume(hits <= at_bats)
        avg = calculate_batting_avg(hits, at_bats)
        slg = calculate_slg(singles, doubles, triples, home_runs, at_bats)
        assert slg >= avg

    @given(
        obp=st.floats(min_value=0.200, max_value=0.500),
        slg=st.floats(min_value=0.250, max_value=0.800)
    )
    def test_ops_is_sum(self, obp, slg):
        """OPS should be sum of OBP and SLG."""
        ops = calculate_ops(obp, slg)
        assert ops == pytest.approx(obp + slg, rel=1e-6)
class TestDataFrameProperties:
    """Hypothesis-driven checks of the DataFrame helpers."""

    @given(
        df=data_frames(columns=[
            column("player_id", dtype=int),
            column("batting_avg", elements=st.floats(0.150, 0.400)),
            column("home_runs", elements=st.integers(0, 60)),
            column("rbi", elements=st.integers(0, 150))
        ])
    )
    @settings(max_examples=50)
    def test_normalize_preserves_rows(self, df):
        """normalize_stats returns as many rows as it was given."""
        row_count = len(df)
        assume(row_count > 0)
        stat_columns = ["batting_avg", "home_runs", "rbi"]
        result = normalize_stats(df, stat_columns)
        assert len(result) == row_count

    @given(
        df=data_frames(columns=[
            column("value", elements=st.floats(0, 100, allow_nan=False))
        ])
    )
    def test_normalize_bounds(self, df):
        """The normalized column stays inside [0, 1]."""
        assume(len(df) > 1)
        assume(df["value"].std() > 0)  # a constant column has nothing to scale
        scaled = normalize_stats(df, ["value"])["value"]
        lowest, highest = scaled.min(), scaled.max()
        assert lowest >= 0
        assert highest <= 1

    @given(
        values=st.lists(
            st.floats(min_value=-100, max_value=100, allow_nan=False),
            min_size=2, max_size=100
        )
    )
    def test_percentile_ranking(self, values):
        """percentile_rank yields values within 0-100."""
        from sports_analytics import percentile_rank

        frame = pd.DataFrame({"stat": values})
        frame["percentile"] = percentile_rank(frame["stat"])
        ranks = frame["percentile"]
        assert ranks.min() >= 0
        assert ranks.max() <= 100
@given(
    runs_for=st.integers(min_value=1, max_value=1500),
    runs_against=st.integers(min_value=1, max_value=1500)
)
def test_pythagorean_expectation_bounds(runs_for, runs_against):
    """The expected win percentage lies strictly inside (0, 1)."""
    from sports_analytics import pythagorean_expectation

    expected_pct = pythagorean_expectation(runs_for, runs_against)
    assert expected_pct > 0 and expected_pct < 1

    # A team that scores exactly what it allows should project to .500.
    evenly_matched = runs_for == runs_against
    if evenly_matched:
        assert expected_pct == pytest.approx(0.5, rel=1e-6)
Integration Testing for APIs
Integration tests for sports data API clients and database connections.
import pytest
import requests
from unittest.mock import Mock, patch
from sports_api import BaseballAPIClient, DatabaseConnection
class TestAPIIntegration:
    """Exercise BaseballAPIClient with ``requests.get`` patched out."""

    @pytest.fixture
    def api_client(self):
        """Provide a client configured with a dummy API key."""
        client = BaseballAPIClient(api_key="test_key")
        return client

    @pytest.fixture
    def mock_response(self):
        """Provide a canned 200 response carrying a single player."""
        payload = {
            "players": [
                {"id": 1, "name": "Test Player", "avg": 0.300}
            ]
        }
        response = Mock()
        response.status_code = 200
        response.json.return_value = payload
        return response

    def test_api_connection(self, api_client):
        """health_check reports True for a 200 response."""
        # Use a mock in CI, real connection in integration
        with patch("requests.get") as mock_get:
            reply = mock_get.return_value
            reply.status_code = 200
            reply.json.return_value = {"status": "ok"}
            healthy = api_client.health_check()
            assert healthy is True

    def test_get_player_data(self, api_client, mock_response):
        """get_players returns the players from the response payload."""
        with patch("requests.get", return_value=mock_response):
            roster = api_client.get_players()
            assert len(roster) > 0
            first = roster[0]
            assert first["name"] == "Test Player"

    def test_rate_limiting(self, api_client):
        """A 429 response surfaces as RateLimitError."""
        with patch("requests.get") as mock_get:
            reply = mock_get.return_value
            reply.status_code = 429
            reply.headers = {"Retry-After": "60"}
            with pytest.raises(api_client.RateLimitError):
                api_client.get_players()

    def test_api_error_handling(self, api_client):
        """A 500 response surfaces as APIError."""
        with patch("requests.get") as mock_get:
            reply = mock_get.return_value
            reply.status_code = 500
            reply.text = "Internal Server Error"
            with pytest.raises(api_client.APIError):
                api_client.get_players()
class TestDatabaseIntegration:
    """Round-trip tests against a test database connection."""

    @pytest.fixture
    def db_connection(self):
        """Open a connection to the test database and close it after the test."""
        settings_for_test_db = {
            "host": "localhost",
            "database": "sports_test",
            "user": "test_user",
            "password": "test_pass",
        }
        conn = DatabaseConnection(**settings_for_test_db)
        yield conn
        conn.close()

    @pytest.fixture
    def setup_test_data(self, db_connection):
        """Create and seed a temporary ``test_players`` table; drop it afterwards."""
        create_table_sql = (
            "\n"
            "CREATE TEMPORARY TABLE test_players (\n"
            "id INT PRIMARY KEY,\n"
            "name VARCHAR(100),\n"
            "batting_avg DECIMAL(4,3)\n"
            ")\n"
        )
        seed_rows_sql = (
            "\n"
            "INSERT INTO test_players VALUES\n"
            "(1, 'Test Player A', 0.300),\n"
            "(2, 'Test Player B', 0.275)\n"
        )
        db_connection.execute(create_table_sql)
        db_connection.execute(seed_rows_sql)
        yield
        db_connection.execute("DROP TEMPORARY TABLE test_players")

    def test_database_connection(self, db_connection):
        """A trivial SELECT returns something."""
        outcome = db_connection.execute("SELECT 1")
        assert outcome is not None

    def test_insert_and_retrieve(self, db_connection, setup_test_data):
        """A row that was just inserted can be read back."""
        insert_sql = "INSERT INTO test_players VALUES (3, 'New Player', 0.285)"
        select_sql = "SELECT * FROM test_players WHERE id = 3"

        db_connection.execute(insert_sql)
        rows = db_connection.query(select_sql)

        assert len(rows) == 1
        assert rows[0]["name"] == "New Player"

    def test_transaction_rollback(self, db_connection, setup_test_data):
        """A row inserted inside a rolled-back transaction is not visible."""
        insert_sql = "INSERT INTO test_players VALUES (4, 'Should Rollback', 0.250)"
        select_sql = "SELECT * FROM test_players WHERE id = 4"

        db_connection.begin_transaction()
        db_connection.execute(insert_sql)
        db_connection.rollback()

        rows = db_connection.query(select_sql)
        assert len(rows) == 0
# Markers for different test environments.
#
# ``pytest.config`` was removed in pytest 5.0, so reading it at import time
# makes the whole module fail to collect. A *string* skipif condition is
# evaluated lazily by pytest with ``config`` in its namespace, which keeps the
# same "skip unless --integration was given" behavior without the removed
# global. ``default=False`` keeps the tests skipped (rather than erroring) when
# the option has not been registered; ``--integration`` itself must still be
# declared via ``pytest_addoption`` in conftest.py to be usable on the command
# line.
pytestmark = [
    pytest.mark.integration,
    pytest.mark.skipif(
        "not config.getoption('--integration', default=False)",
        reason="Integration tests disabled"
    )
]
Data Validation Tests
Test data quality and validation rules for sports statistics datasets.
import pytest
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import great_expectations as gx
class TestDataQuality:
    """Row-level sanity checks on the player statistics file."""

    @pytest.fixture
    def player_stats(self):
        """Read ``player_stats.csv`` into a DataFrame."""
        frame = pd.read_csv("player_stats.csv")
        return frame

    def test_no_duplicate_player_seasons(self, player_stats):
        """Each (player_id, season) pair appears at most once."""
        key_columns = ["player_id", "season"]
        duplicates = player_stats.duplicated(subset=key_columns)
        assert not duplicates.any(), f"Found {duplicates.sum()} duplicates"

    def test_no_null_player_ids(self, player_stats):
        """No row is missing its player_id."""
        null_ids = player_stats["player_id"].isna()
        assert not null_ids.any(), f"Found {null_ids.sum()} null player_ids"

    def test_games_played_positive(self, player_stats):
        """Every row has a strictly positive games count."""
        games = player_stats["games"]
        assert (games > 0).all()

    def test_batting_avg_valid_range(self, player_stats):
        """Every batting average lies in [0, 1]."""
        avg_col = player_stats["batting_avg"]
        within_bounds = (avg_col >= 0) & (avg_col <= 1)
        assert within_bounds.all()

    def test_at_bats_greater_than_hits(self, player_stats):
        """No row records more hits than at bats."""
        at_bats, hits = player_stats["at_bats"], player_stats["hits"]
        assert (at_bats >= hits).all()

    def test_season_values_valid(self, player_stats):
        """Seasons fall between 1900 and the current calendar year."""
        seasons = player_stats["season"]
        latest_allowed = datetime.now().year
        assert (seasons >= 1900).all()
        assert (seasons <= latest_allowed).all()
class TestGreatExpectations:
    """Use Great Expectations for data validation."""

    def test_player_stats_expectations(self):
        """Validate ``player_stats.csv`` against a declared set of expectations.

        The expectation specs are registered on a validator, saved as
        ``player_stats_suite``, and run through ``player_stats_checkpoint``.
        Previously the spec list was built but never used, and the checkpoint
        and suite referenced at run time were never created.

        NOTE(review): written against the fluent (0.17/0.18-style) API that
        this snippet already relies on (``context.sources.add_pandas``);
        confirm against the installed Great Expectations version.
        """
        df = pd.read_csv("player_stats.csv")
        context = gx.get_context()

        # Declarative expectation specs: method name + keyword arguments.
        expectations = [
            # Column existence
            {"expectation_type": "expect_column_to_exist", "kwargs": {"column": "player_id"}},
            {"expectation_type": "expect_column_to_exist", "kwargs": {"column": "batting_avg"}},
            # Null checks
            {"expectation_type": "expect_column_values_to_not_be_null", "kwargs": {"column": "player_id"}},
            # Value ranges
            {
                "expectation_type": "expect_column_values_to_be_between",
                "kwargs": {"column": "batting_avg", "min_value": 0, "max_value": 1}
            },
            {
                "expectation_type": "expect_column_values_to_be_between",
                "kwargs": {"column": "era", "min_value": 0, "max_value": 50}
            },
            # Uniqueness
            {
                "expectation_type": "expect_compound_columns_to_be_unique",
                "kwargs": {"column_list": ["player_id", "season"]}
            }
        ]

        # Wrap the DataFrame in a batch request.
        datasource = context.sources.add_pandas("pandas_datasource")
        data_asset = datasource.add_dataframe_asset(name="player_stats")
        batch_request = data_asset.build_batch_request(dataframe=df)

        # Register every spec on a validator bound to the named suite.
        suite_name = "player_stats_suite"
        context.add_or_update_expectation_suite(expectation_suite_name=suite_name)
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=suite_name
        )
        for spec in expectations:
            expectation_method = getattr(validator, spec["expectation_type"])
            expectation_method(**spec["kwargs"])
        # Keep expectations that fail on this batch; the checkpoint must report them.
        validator.save_expectation_suite(discard_failed_expectations=False)

        # Validate
        checkpoint = context.add_or_update_checkpoint(
            name="player_stats_checkpoint",
            validations=[{
                "batch_request": batch_request,
                "expectation_suite_name": suite_name
            }]
        )
        results = checkpoint.run()
        assert results.success, "Data validation failed"
class TestStatisticalValidity:
    """Distribution-level checks on the batting statistics file."""

    @pytest.fixture
    def batting_data(self):
        """Read ``batting_stats.csv`` into a DataFrame."""
        frame = pd.read_csv("batting_stats.csv")
        return frame

    def test_avg_distribution_reasonable(self, batting_data):
        """Mean and spread of batting average fall in the expected bands."""
        avg = batting_data["batting_avg"]
        # Mean should be around .250-.270
        center = avg.mean()
        assert 0.200 < center < 0.300
        # Standard deviation should be reasonable
        spread = avg.std()
        assert 0.020 < spread < 0.050

    def test_outliers_flagged(self, batting_data):
        """Print rows whose average is more than 3 std from the mean (no assertion)."""
        avg = batting_data["batting_avg"]
        mean = avg.mean()
        std = avg.std()
        # Flag values > 3 std from mean
        is_outlier = abs(avg - mean) > 3 * std
        outliers = batting_data[is_outlier]
        # Log outliers for review
        if len(outliers) == 0:
            return
        print(f"Potential outliers found: {len(outliers)}")
        print(outliers[["player_id", "name", "batting_avg"]])

    def test_calculated_fields_match(self, batting_data):
        """Stored batting average agrees with hits / at_bats."""
        # Verify AVG = H / AB
        stored_avg = batting_data["batting_avg"]
        calculated_avg = batting_data["hits"] / batting_data["at_bats"]
        # Allow small floating point differences
        agrees = np.allclose(calculated_avg, stored_avg, rtol=1e-3, equal_nan=True)
        assert agrees
Unit Testing Sports Functions
Comprehensive unit tests for sports analytics functions using pytest.
import pytest
import pandas as pd
import numpy as np
from sports_analytics import (
calculate_batting_avg,
calculate_era,
calculate_war,
pythagorean_expectation
)
class TestBattingStats:
    """Unit tests for calculate_batting_avg."""

    def test_batting_avg_normal(self):
        """150 hits in 500 at bats is .300."""
        result = calculate_batting_avg(hits=150, at_bats=500)
        assert result == 0.300

    def test_batting_avg_zero_at_bats(self):
        """Zero at bats yields 0.0."""
        result = calculate_batting_avg(hits=0, at_bats=0)
        assert result == 0.0

    def test_batting_avg_perfect(self):
        """A hit in every at bat yields 1.0."""
        result = calculate_batting_avg(hits=100, at_bats=100)
        assert result == 1.0

    @pytest.mark.parametrize("hits,at_bats,expected", [
        (100, 400, 0.250),
        (175, 500, 0.350),
        (0, 300, 0.0),
        (1, 1, 1.0),
    ])
    def test_batting_avg_parametrized(self, hits, at_bats, expected):
        """Table-driven check across several hit/at-bat combinations."""
        target = pytest.approx(expected, rel=1e-3)
        assert calculate_batting_avg(hits, at_bats) == target
class TestPitchingStats:
    """Unit tests for calculate_era."""

    def test_era_normal(self):
        """45 earned runs over 180 innings is a 2.25 ERA."""
        # ERA = (ER / IP) * 9
        era = calculate_era(earned_runs=45, innings_pitched=180)
        assert era == pytest.approx(2.25, rel=1e-3)

    def test_era_zero_innings(self):
        """Zero innings pitched yields infinity or None."""
        era = calculate_era(earned_runs=5, innings_pitched=0)
        assert era is None or era == float("inf")
class TestAdvancedMetrics:
    """Unit tests for the advanced analytics helpers."""

    def test_pythagorean_expectation(self):
        """Even run totals project to .500; outscoring opponents projects above it."""
        # With equal runs scored and allowed
        even = pythagorean_expectation(runs_scored=700, runs_allowed=700)
        assert even == pytest.approx(0.5, rel=1e-3)

        # Team that scores more should have > 50%
        favorable = pythagorean_expectation(runs_scored=800, runs_allowed=600)
        assert favorable > 0.5

    def test_war_calculation(self):
        """WAR is within 10% of the summed run components divided by ten."""
        components = {
            "batting_runs": 15,
            "baserunning_runs": 3,
            "fielding_runs": 5,
            "positional_adjustment": 10,
            "league_adjustment": 2,
            "replacement_level": 20,
        }
        result = calculate_war(**components)
        # Sum of components divided by runs per win (~10)
        expected = sum(components.values()) / 10
        assert result == pytest.approx(expected, rel=0.1)
@pytest.fixture
def sample_player_data():
    """Provide a three-row player DataFrame (id, name, avg, hr, rbi)."""
    field_names = ["player_id", "name", "avg", "hr", "rbi"]
    records = [
        (1, "Player A", 0.300, 30, 100),
        (2, "Player B", 0.250, 15, 60),
        (3, "Player C", 0.275, 22, 80),
    ]
    return pd.DataFrame(records, columns=field_names)
class TestDataProcessing:
    """Sanity checks on the sample player fixture."""

    def test_dataframe_not_empty(self, sample_player_data):
        """The fixture contains at least one row."""
        row_count = len(sample_player_data)
        assert row_count > 0

    def test_required_columns_exist(self, sample_player_data):
        """All five expected columns are present."""
        present = sample_player_data.columns
        for name in ("player_id", "name", "avg", "hr", "rbi"):
            assert name in present

    def test_avg_in_valid_range(self, sample_player_data):
        """Every batting average lies in [0, 1]."""
        averages = sample_player_data["avg"]
        assert (averages >= 0).all()
        assert (averages <= 1).all()
# Run with: pytest test_sports_analytics.py -v
ESPN API Integration
Fetch live scores, standings, and player data from ESPN's public API endpoints.
import requests
import pandas as pd
from datetime import datetime, timedelta
class ESPNClient:
    """Client for ESPN API endpoints.

    League arguments are short keys of ``SPORT_LEAGUES`` (e.g. ``"mlb"``). A
    key that is not in the table is used verbatim as both the sport and the
    league path segment.

    Every request goes through ``_get_json``, which applies ``TIMEOUT`` and
    calls ``raise_for_status()`` so that HTTP errors surface as exceptions
    instead of being parsed as if they were data.
    """

    BASE_URL = "https://site.api.espn.com/apis/site/v2/sports"
    SPORT_LEAGUES = {
        "mlb": ("baseball", "mlb"),
        "nba": ("basketball", "nba"),
        "nfl": ("football", "nfl"),
        "nhl": ("hockey", "nhl"),
        "mls": ("soccer", "usa.1"),
        "epl": ("soccer", "eng.1")
    }
    # Seconds to wait for a response; requests waits indefinitely without one.
    TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()

    def _get_json(self, league: str, resource: str, params: dict = None) -> dict:
        """GET ``{BASE_URL}/{sport}/{league}/{resource}`` and return the decoded JSON.

        Raises whatever ``response.raise_for_status()`` raises for an error status.
        """
        sport, league_id = self.SPORT_LEAGUES.get(league, (league, league))
        url = f"{self.BASE_URL}/{sport}/{league_id}/{resource}"
        response = self.session.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()

    def get_scoreboard(self, league: str, date: str = None) -> dict:
        """Get live/daily scoreboard.

        ``date`` is optional; its dashes are stripped before it is sent as the
        ``dates`` query parameter (``"2024-04-15"`` becomes ``"20240415"``).
        """
        params = {}
        if date:
            params["dates"] = date.replace("-", "")
        return self._get_json(league, "scoreboard", params)

    def get_standings(self, league: str) -> pd.DataFrame:
        """Get current standings as one row per team.

        Each row has ``team_id``, ``name``, ``abbreviation`` plus one column
        per stat name found on the entry.
        """
        data = self._get_json(league, "standings")
        teams = []
        for group in data.get("children", []):
            for team_entry in group.get("standings", {}).get("entries", []):
                team = team_entry.get("team", {})
                # A stat entry without a "value" key becomes None instead of
                # raising KeyError and discarding the whole table.
                stats = {s["name"]: s.get("value") for s in team_entry.get("stats", [])}
                teams.append({
                    "team_id": team.get("id"),
                    "name": team.get("displayName"),
                    "abbreviation": team.get("abbreviation"),
                    **stats
                })
        return pd.DataFrame(teams)

    def get_team_roster(self, league: str, team_id: str) -> pd.DataFrame:
        """Get team roster as one row per athlete.

        Accepts both a flat ``athletes`` list and a grouped one where each
        entry carries its athletes under ``items``.
        NOTE(review): the grouped shape is believed to be what some leagues
        return — confirm against live responses.
        """
        data = self._get_json(league, f"teams/{team_id}/roster")
        players = []
        for entry in data.get("athletes", []):
            grouped = isinstance(entry.get("items"), list)
            for athlete in (entry["items"] if grouped else [entry]):
                # "position" is only read when it is a mapping; a missing or
                # non-dict value yields None rather than an AttributeError.
                position = athlete.get("position")
                abbreviation = position.get("abbreviation") if isinstance(position, dict) else None
                players.append({
                    "player_id": athlete.get("id"),
                    "name": athlete.get("fullName"),
                    "position": abbreviation,
                    "jersey": athlete.get("jersey"),
                    "age": athlete.get("age"),
                    "height": athlete.get("height"),
                    "weight": athlete.get("weight")
                })
        return pd.DataFrame(players)

    def get_player_stats(self, league: str, player_id: str) -> dict:
        """Get player statistics (the raw JSON of the athlete resource)."""
        return self._get_json(league, f"athletes/{player_id}")
# Usage
espn = ESPNClient()

# Print one "<team> <score> - <score> <team>" line per MLB game today
scores = espn.get_scoreboard("mlb")
for event in scores.get("events", []):
    competition = event["competitions"][0]
    teams = competition["competitors"]
    first_side, second_side = teams[0], teams[1]
    left = f"{first_side['team']['name']} {first_side.get('score', 0)}"
    right = f"{second_side.get('score', 0)} {second_side['team']['name']}"
    print(f"{left} - {right}")

# Show the top of the NBA standings table
nba_standings = espn.get_standings("nba")
print(nba_standings.head())
Odds API for Betting Lines
Fetch live betting odds from multiple sportsbooks using The Odds API.
import requests
import pandas as pd
from datetime import datetime
class OddsAPIClient:
    """Client for The Odds API.

    Sport arguments are short keys of ``SPORTS`` (e.g. ``"nfl"``); an unknown
    key is passed through unchanged as the API sport key.

    All requests go through ``_get``, which adds the API key, applies
    ``TIMEOUT`` and calls ``raise_for_status()``. Without the status check, an
    error payload (a JSON object) would be iterated as if it were a list of
    games.
    """

    BASE_URL = "https://api.the-odds-api.com/v4"
    SPORTS = {
        "mlb": "baseball_mlb",
        "nba": "basketball_nba",
        "nfl": "americanfootball_nfl",
        "nhl": "icehockey_nhl",
        "epl": "soccer_epl",
        "ncaab": "basketball_ncaab",
        "ncaaf": "americanfootball_ncaaf"
    }
    # Seconds to wait for a response; requests waits indefinitely without one.
    TIMEOUT = 30

    def __init__(self, api_key: str):
        self.api_key = api_key
        # A shared session, matching the other API clients in this library.
        self.session = requests.Session()

    def _get(self, path: str, params: dict = None):
        """GET ``{BASE_URL}/{path}`` with the API key added and return the decoded JSON."""
        query = {"apiKey": self.api_key, **(params or {})}
        response = self.session.get(
            f"{self.BASE_URL}/{path}", params=query, timeout=self.TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def get_sports(self) -> list:
        """Get list of available sports."""
        return self._get("sports")

    def get_odds(self, sport: str, regions: str = "us",
                 markets: str = "h2h,spreads,totals") -> pd.DataFrame:
        """Get odds for upcoming games, one row per game.

        Besides the game columns, each bookmaker/market/outcome contributes a
        ``{book}_{market}_{outcome_name}`` price column (American odds) and,
        when the outcome has a point, a matching ``..._line`` column.
        """
        sport_key = self.SPORTS.get(sport, sport)
        data = self._get(f"sports/{sport_key}/odds", {
            "regions": regions,
            "markets": markets,
            "oddsFormat": "american"
        })
        games = []
        for game in data:
            game_info = {
                "game_id": game.get("id"),
                "sport": game.get("sport_key"),
                "commence_time": game.get("commence_time"),
                "home_team": game.get("home_team"),
                "away_team": game.get("away_team")
            }
            # Extract odds from each bookmaker
            for bookmaker in game.get("bookmakers", []):
                book = bookmaker.get("key")
                for market in bookmaker.get("markets", []):
                    market_key = market.get("key")
                    for outcome in market.get("outcomes", []):
                        col_name = f"{book}_{market_key}_{outcome.get('name', '').lower().replace(' ', '_')}"
                        game_info[col_name] = outcome.get("price")
                        if "point" in outcome:
                            game_info[f"{col_name}_line"] = outcome.get("point")
            games.append(game_info)
        return pd.DataFrame(games)

    def get_scores(self, sport: str, days_from: int = 1) -> pd.DataFrame:
        """Get scores for completed games, one row per game.

        ``home_score`` / ``away_score`` are None when the game has no entry
        for that team, including when ``scores`` itself is null.
        """
        sport_key = self.SPORTS.get(sport, sport)
        data = self._get(f"sports/{sport_key}/scores", {"daysFrom": days_from})
        games = []
        for game in data:
            # ``scores`` may be present but null; ``.get(..., [])`` would hand
            # back None in that case and the lookups below would raise.
            scores = game.get("scores") or []
            home_score = next((s["score"] for s in scores if s["name"] == game["home_team"]), None)
            away_score = next((s["score"] for s in scores if s["name"] == game["away_team"]), None)
            games.append({
                "game_id": game.get("id"),
                "home_team": game.get("home_team"),
                "away_team": game.get("away_team"),
                "home_score": home_score,
                "away_score": away_score,
                "completed": game.get("completed")
            })
        return pd.DataFrame(games)

    def find_best_odds(self, odds_df: pd.DataFrame, team: str, market: str = "h2h") -> dict:
        """Find best odds for a team across all books.

        Args:
            odds_df: Frame in the column layout produced by ``get_odds``.
            team: Full or partial team name, in any case; spaces are treated
                as the underscores used in column names.
            market: Market key embedded in the column names.

        Returns:
            ``{"book", "odds", "all_odds"}`` where ``all_odds`` maps each
            bookmaker to the first non-null price found for the team and
            ``book`` is the one with the numerically highest price; ``{}``
            when nothing matches.
        """
        market_tag = f"_{market}_"
        # Column names are lower-cased with underscores, so normalize the same way.
        needle = team.lower().replace(" ", "_")

        best_odds = {}
        for col in odds_df.columns:
            if not isinstance(col, str) or market_tag not in col:
                continue
            # "_line" columns hold the point spread/total, not a price.
            if col.endswith("_line"):
                continue
            # Split on the market tag so bookmaker keys containing underscores stay whole.
            book, _, outcome = col.partition(market_tag)
            if needle not in outcome.lower():
                continue
            # The team's game is not necessarily the first row of the frame.
            prices = odds_df[col].dropna()
            if prices.empty:
                continue
            best_odds[book] = prices.iloc[0]

        if best_odds:
            best_book = max(best_odds, key=best_odds.get)
            return {"book": best_book, "odds": best_odds[best_book], "all_odds": best_odds}
        return {}
# Usage (requires API key from the-odds-api.com)
# odds_client = OddsAPIClient("YOUR_API_KEY")
# nfl_odds = odds_client.get_odds("nfl")
# print(nfl_odds.head())
NHL API Game Data
Access NHL game data, player stats, and play-by-play using the official NHL API.
import requests
import pandas as pd
from datetime import datetime
class NHLClient:
    """Client for NHL Stats API.

    Every request goes through ``_get_json``, which applies ``TIMEOUT`` and
    calls ``raise_for_status()`` so HTTP errors are raised rather than parsed.
    """

    BASE_URL = "https://api-web.nhle.com/v1"
    # Seconds to wait for a response; requests waits indefinitely without one.
    TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()

    def _get_json(self, path: str, params: dict = None):
        """GET ``{BASE_URL}/{path}`` and return the decoded JSON body."""
        response = self.session.get(
            f"{self.BASE_URL}/{path}", params=params, timeout=self.TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def get_schedule(self, date: str = None) -> dict:
        """Get schedule for a specific date (``YYYY-MM-DD``; defaults to today, local time)."""
        if date is None:
            date = datetime.now().strftime("%Y-%m-%d")
        return self._get_json(f"schedule/{date}")

    def get_standings(self, season: str = None) -> pd.DataFrame:
        """Get current standings, one row per team.

        ``season`` is accepted for interface compatibility but is not used:
        the request always targets ``standings/now``.
        """
        data = self._get_json("standings/now")
        teams = []
        for team in data.get("standings", []):
            teams.append({
                "team": team.get("teamName", {}).get("default"),
                "team_abbrev": team.get("teamAbbrev", {}).get("default"),
                "conference": team.get("conferenceName"),
                "division": team.get("divisionName"),
                "games_played": team.get("gamesPlayed"),
                "wins": team.get("wins"),
                "losses": team.get("losses"),
                "ot_losses": team.get("otLosses"),
                "points": team.get("points"),
                "goal_diff": team.get("goalDifferential"),
                "goals_for": team.get("goalFor"),
                "goals_against": team.get("goalAgainst")
            })
        return pd.DataFrame(teams)

    def get_player_stats(self, player_id: int, season: str = "20232024") -> dict:
        """Get player season statistics (the raw JSON of the player landing resource).

        ``season`` is accepted for interface compatibility but is not sent
        with the request.
        """
        return self._get_json(f"player/{player_id}/landing")

    def get_game_boxscore(self, game_id: int) -> dict:
        """Get game boxscore."""
        return self._get_json(f"gamecenter/{game_id}/boxscore")

    def get_game_playbyplay(self, game_id: int) -> pd.DataFrame:
        """Get play-by-play data for a game, one row per play."""
        data = self._get_json(f"gamecenter/{game_id}/play-by-play")
        plays = []
        for play in data.get("plays", []):
            details = play.get("details", {})
            plays.append({
                "period": play.get("periodDescriptor", {}).get("number"),
                "time": play.get("timeInPeriod"),
                "type": play.get("typeDescKey"),
                "description": details.get("reason", ""),
                "x_coord": details.get("xCoord"),
                "y_coord": details.get("yCoord")
            })
        return pd.DataFrame(plays)

    def search_players(self, query: str) -> list:
        """Search for players by name.

        The query is sent as a request parameter so that spaces, ``&``, ``#``
        and similar characters are URL-encoded instead of corrupting the URL.
        NOTE(review): confirm that this host actually serves ``search/player``.
        """
        return self._get_json("search/player", {"q": query})
# Usage
nhl = NHLClient()

# Show the ten teams with the most points
standings = nhl.get_standings()
print("NHL Standings:")
by_points = standings.sort_values("points", ascending=False)
print(by_points.head(10))

# Print every scheduled matchup as "AWAY @ HOME"
schedule = nhl.get_schedule()
for game_week in schedule.get("gameWeek", []):
    for game in game_week.get("games", []):
        home = game.get("homeTeam", {}).get("abbrev")
        away = game.get("awayTeam", {}).get("abbrev")
        matchup = f"{away} @ {home}"
        print(matchup)
Weather API for Game Conditions
Fetch weather data for outdoor sports games using the OpenWeatherMap API.
import requests
import pandas as pd
from datetime import datetime, timedelta
from typing import Dict, Optional
class SportWeatherClient:
    """Weather data client for sports analytics.

    Requests use imperial units and are bounded by ``TIMEOUT``.
    """

    BASE_URL = "https://api.openweathermap.org/data/2.5"
    # Seconds to wait for a response; requests waits indefinitely without one.
    TIMEOUT = 30
    # NFL stadium coordinates
    STADIUMS = {
        "lambeau_field": {"lat": 44.5013, "lon": -88.0622, "team": "Green Bay Packers"},
        "arrowhead": {"lat": 39.0489, "lon": -94.4839, "team": "Kansas City Chiefs"},
        "gillette": {"lat": 42.0909, "lon": -71.2643, "team": "New England Patriots"},
        "mile_high": {"lat": 39.7439, "lon": -105.0201, "team": "Denver Broncos"},
        "soldier_field": {"lat": 41.8623, "lon": -87.6167, "team": "Chicago Bears"},
        "heinz_field": {"lat": 40.4468, "lon": -80.0158, "team": "Pittsburgh Steelers"},
        "metlife": {"lat": 40.8128, "lon": -74.0742, "team": "Giants/Jets"},
    }
    # Lower-cased condition names that mean something is falling from the sky.
    PRECIP_KEYWORDS = ("rain", "snow", "drizzle", "thunderstorm")

    def __init__(self, api_key: str):
        self.api_key = api_key

    def get_current_weather(self, lat: float, lon: float) -> Dict:
        """Get current weather conditions.

        Returns a flat dict: temp, feels_like, humidity, wind_speed, wind_deg,
        conditions, description, visibility (km) and precipitation (the sum
        of the last-hour rain and snow amounts, 0 when absent).
        """
        url = f"{self.BASE_URL}/weather"
        params = {
            "lat": lat,
            "lon": lon,
            "appid": self.api_key,
            "units": "imperial"
        }
        response = requests.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        data = response.json()
        return {
            "temp": data["main"]["temp"],
            "feels_like": data["main"]["feels_like"],
            "humidity": data["main"]["humidity"],
            "wind_speed": data["wind"]["speed"],
            "wind_deg": data["wind"].get("deg", 0),
            "conditions": data["weather"][0]["main"],
            "description": data["weather"][0]["description"],
            "visibility": data.get("visibility", 10000) / 1000,  # km
            "precipitation": data.get("rain", {}).get("1h", 0) + data.get("snow", {}).get("1h", 0)
        }

    def get_forecast(self, lat: float, lon: float, hours: int = 24) -> pd.DataFrame:
        """Get the weather forecast covering the next ``hours`` hours.

        One forecast entry is taken per 3 hours requested, rounded up so that
        a request for fewer than 3 hours still returns the first entry; a
        non-positive ``hours`` returns an empty frame. ``datetime`` values
        come from ``datetime.fromtimestamp`` and are therefore naive local
        times.
        """
        url = f"{self.BASE_URL}/forecast"
        params = {
            "lat": lat,
            "lon": lon,
            "appid": self.api_key,
            "units": "imperial"
        }
        response = requests.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        data = response.json()
        # Ceiling division; clamp at 0 so a negative value cannot turn into a
        # "drop the tail" slice.
        steps = max(0, -(-hours // 3))
        forecasts = []
        for item in data["list"][:steps]:
            forecasts.append({
                "datetime": datetime.fromtimestamp(item["dt"]),
                "temp": item["main"]["temp"],
                "feels_like": item["main"]["feels_like"],
                "humidity": item["main"]["humidity"],
                "wind_speed": item["wind"]["speed"],
                "conditions": item["weather"][0]["main"],
                "pop": item.get("pop", 0) * 100  # Probability of precipitation
            })
        return pd.DataFrame(forecasts)

    def get_game_weather(self, stadium: str) -> Optional[Dict]:
        """Get weather for a specific stadium; None for a key not in ``STADIUMS``."""
        if stadium not in self.STADIUMS:
            return None
        coords = self.STADIUMS[stadium]
        weather = self.get_current_weather(coords["lat"], coords["lon"])
        weather["stadium"] = stadium
        weather["team"] = coords["team"]
        return weather

    def categorize_conditions(self, weather: Dict) -> Dict[str, str]:
        """Categorize weather impact on game.

        Reads ``temp``, ``wind_speed`` and ``conditions`` (and ``precipitation``
        when present) and returns ``temp_impact``, ``wind_impact`` and
        ``precip_impact`` labels.
        """
        impacts = {}
        # Temperature impact
        temp = weather["temp"]
        if temp < 20:
            impacts["temp_impact"] = "extreme_cold"
        elif temp < 40:
            impacts["temp_impact"] = "cold"
        elif temp > 90:
            impacts["temp_impact"] = "hot"
        else:
            impacts["temp_impact"] = "moderate"
        # Wind impact
        wind = weather["wind_speed"]
        if wind > 20:
            impacts["wind_impact"] = "high"
        elif wind > 10:
            impacts["wind_impact"] = "moderate"
        else:
            impacts["wind_impact"] = "low"
        # Precipitation: drizzle and thunderstorms count too, as does any
        # measured amount even when the headline condition is e.g. "Clouds".
        conditions = weather["conditions"].lower()
        named_precip = any(word in conditions for word in self.PRECIP_KEYWORDS)
        measured_precip = (weather.get("precipitation") or 0) > 0
        if named_precip or measured_precip:
            impacts["precip_impact"] = "active"
        else:
            impacts["precip_impact"] = "none"
        return impacts

    def all_stadiums_weather(self) -> pd.DataFrame:
        """Get weather plus impact labels for all tracked stadiums, one row each."""
        weather_data = []
        for stadium in self.STADIUMS:
            weather = self.get_game_weather(stadium)
            if weather:
                impacts = self.categorize_conditions(weather)
                weather_data.append({**weather, **impacts})
        return pd.DataFrame(weather_data)
# Usage (requires OpenWeatherMap API key)
# weather = SportWeatherClient("YOUR_API_KEY")
# lambeau = weather.get_game_weather("lambeau_field")
# print(f"Lambeau Field: {lambeau['temp']}°F, {lambeau['conditions']}")
Sportradar API Client
Professional-grade API client for Sportradar data including live feeds, stats, and odds.
import requests
import pandas as pd
from datetime import datetime
from typing import Optional
class SportradarClient:
    """Client for Sportradar API.

    ``league`` arguments must be keys of ``BASE_URLS``; anything else raises
    ``ValueError``. Requests are bounded by ``TIMEOUT`` and HTTP errors are
    raised via ``raise_for_status()``.
    """

    BASE_URLS = {
        "mlb": "https://api.sportradar.us/mlb/trial/v7/en",
        "nba": "https://api.sportradar.us/nba/trial/v8/en",
        "nfl": "https://api.sportradar.us/nfl/official/trial/v7/en",
        "nhl": "https://api.sportradar.us/nhl/trial/v7/en"
    }
    # Seconds to wait for a response; requests waits indefinitely without one.
    TIMEOUT = 30

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.session = requests.Session()

    def _request(self, league: str, endpoint: str) -> dict:
        """GET ``{base_url}/{endpoint}.json`` with the API key and return the JSON body.

        Raises:
            ValueError: if ``league`` is not in ``BASE_URLS``.
        """
        base_url = self.BASE_URLS.get(league)
        if not base_url:
            raise ValueError(f"Unknown league: {league}")
        url = f"{base_url}/{endpoint}.json"
        params = {"api_key": self.api_key}
        response = self.session.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()

    def get_schedule(self, league: str, year: int, season_type: str = "REG") -> pd.DataFrame:
        """Get season schedule, one row per game.

        Handles both a ``weeks`` list whose items hold ``games`` and a flat
        top-level ``games`` list.
        """
        endpoint = f"games/{year}/{season_type}/schedule"
        data = self._request(league, endpoint)
        games = []
        for week in data.get("weeks", data.get("games", [])):
            # An item without its own "games" list is itself treated as a game.
            game_list = week.get("games", [week]) if isinstance(week, dict) else [week]
            for game in game_list:
                games.append({
                    "game_id": game.get("id"),
                    "scheduled": game.get("scheduled"),
                    "home_team": game.get("home", {}).get("name"),
                    "away_team": game.get("away", {}).get("name"),
                    "venue": game.get("venue", {}).get("name"),
                    "status": game.get("status")
                })
        return pd.DataFrame(games)

    def get_standings(self, league: str, year: int, season_type: str = "REG") -> pd.DataFrame:
        """Get standings, one row per team (conferences > divisions > teams)."""
        endpoint = f"seasons/{year}/{season_type}/standings"
        data = self._request(league, endpoint)
        teams = []
        for conference in data.get("conferences", []):
            for division in conference.get("divisions", []):
                for team in division.get("teams", []):
                    teams.append({
                        "team_id": team.get("id"),
                        "name": team.get("name"),
                        "conference": conference.get("name"),
                        "division": division.get("name"),
                        "wins": team.get("wins"),
                        "losses": team.get("losses"),
                        "win_pct": team.get("win_pct"),
                        "games_behind": team.get("games_behind")
                    })
        return pd.DataFrame(teams)

    def get_player_profile(self, league: str, player_id: str) -> dict:
        """Get player profile and stats."""
        endpoint = f"players/{player_id}/profile"
        return self._request(league, endpoint)

    def get_team_profile(self, league: str, team_id: str) -> dict:
        """Get team profile with roster."""
        endpoint = f"teams/{team_id}/profile"
        return self._request(league, endpoint)

    def get_game_boxscore(self, league: str, game_id: str) -> dict:
        """Get game boxscore."""
        endpoint = f"games/{game_id}/boxscore"
        return self._request(league, endpoint)

    def get_play_by_play(self, league: str, game_id: str) -> dict:
        """Get play-by-play data."""
        endpoint = f"games/{game_id}/pbp"
        return self._request(league, endpoint)

    def get_daily_transfers(self, league: str, year: int, month: int, day: int) -> dict:
        """Get player transactions for a date (month and day are zero-padded)."""
        endpoint = f"league/{year}/{month:02d}/{day:02d}/transfers"
        return self._request(league, endpoint)
# Usage (requires Sportradar API key)
# sr = SportradarClient("YOUR_API_KEY")
# schedule = sr.get_schedule("nfl", 2024)
# standings = sr.get_standings("nba", 2024)
NBA Stats API (stats.nba.com)
Access official NBA statistics using the stats.nba.com API endpoints.
import requests
import pandas as pd
from typing import Optional
class NBAStatsClient:
    """Client for NBA Stats API (stats.nba.com).

    Every endpoint method returns the first result set of the response as a
    DataFrame whose columns are that result set's ``headers``.
    """

    BASE_URL = "https://stats.nba.com/stats"
    # Headers attached to every request made through the shared session.
    HEADERS = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.nba.com/",
        "x-nba-stats-origin": "stats",
        "x-nba-stats-token": "true",
        "Connection": "keep-alive",
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    def _request(self, endpoint: str, params: dict) -> pd.DataFrame:
        """Make API request and parse the first result set into a DataFrame.

        Returns an empty DataFrame when the response carries no result sets
        (a missing key or an empty list), instead of raising IndexError.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        response = self.session.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        result_sets = data.get("resultSets") or [{}]
        result_set = result_sets[0]
        headers = result_set.get("headers", [])
        rows = result_set.get("rowSet", [])
        return pd.DataFrame(rows, columns=headers)

    def get_player_stats(self, season: str = "2023-24",
                         per_mode: str = "PerGame",
                         season_type: str = "Regular Season") -> pd.DataFrame:
        """Get league-wide player statistics."""
        params = {
            "LeagueID": "00",
            "Season": season,
            "SeasonType": season_type,
            "PerMode": per_mode,
            "MeasureType": "Base",
            "PaceAdjust": "N",
            "Rank": "N",
            "PlusMinus": "N"
        }
        return self._request("leaguedashplayerstats", params)

    def get_player_game_log(self, player_id: int, season: str = "2023-24") -> pd.DataFrame:
        """Get player game-by-game stats."""
        params = {
            "PlayerID": player_id,
            "Season": season,
            "SeasonType": "Regular Season"
        }
        return self._request("playergamelog", params)

    def get_team_stats(self, season: str = "2023-24",
                       per_mode: str = "PerGame") -> pd.DataFrame:
        """Get team statistics."""
        params = {
            "LeagueID": "00",
            "Season": season,
            "SeasonType": "Regular Season",
            "PerMode": per_mode,
            "MeasureType": "Base"
        }
        return self._request("leaguedashteamstats", params)

    def get_shot_chart(self, player_id: int, season: str = "2023-24") -> pd.DataFrame:
        """Get player shot chart data."""
        params = {
            "PlayerID": player_id,
            "Season": season,
            "SeasonType": "Regular Season",
            "ContextMeasure": "FGA",
            "LeagueID": "00"
        }
        return self._request("shotchartdetail", params)

    def get_play_by_play(self, game_id: str) -> pd.DataFrame:
        """Get play-by-play data for a game."""
        params = {
            "GameID": game_id,
            "StartPeriod": 0,
            "EndPeriod": 10
        }
        return self._request("playbyplayv2", params)

    def search_player(self, name: str) -> Optional[int]:
        """Find player ID by name.

        Performs a case-insensitive *literal* substring match against
        ``DISPLAY_FIRST_LAST`` and returns the ``PERSON_ID`` of the first
        match as a plain ``int``, or None when nothing matches.
        """
        all_players = self._request("commonallplayers", {
            "LeagueID": "00",
            "Season": "2023-24",
            "IsOnlyCurrentSeason": 0
        })
        if "DISPLAY_FIRST_LAST" not in all_players.columns:
            return None
        # regex=False: names such as "P.J." or a stray "(" must be matched
        # literally, not interpreted as a pattern. na=False: rows with a
        # missing name count as non-matches so the mask stays boolean.
        is_match = all_players["DISPLAY_FIRST_LAST"].str.contains(
            name, case=False, regex=False, na=False
        )
        matches = all_players[is_match]
        if not matches.empty:
            # Convert the numpy integer to the built-in int promised by the signature.
            return int(matches.iloc[0]["PERSON_ID"])
        return None
# Usage
nba = NBAStatsClient()

# League-wide player stats: first 20 rows, headline columns only
player_stats = nba.get_player_stats(season="2023-24")
print(
    player_stats.head(20)[
        ["PLAYER_NAME", "TEAM_ABBREVIATION", "PTS", "REB", "AST"]
    ]
)

# Game log for one player: first 10 games
lebron_id = 2544  # LeBron James
game_log = nba.get_player_game_log(player_id=lebron_id, season="2023-24")
print(game_log.head(10)[["GAME_DATE", "MATCHUP", "PTS", "REB", "AST"]])
Soccer Data API (Football-Data.org)
Fetch comprehensive soccer data including matches, standings, and player stats.
import requests
import pandas as pd
from datetime import datetime, timedelta
class FootballDataClient:
    """Client for Football-Data.org API.

    Competition arguments accept either one of the short aliases in
    ``COMPETITIONS`` (e.g. ``"epl"``) or a raw competition code, which is
    passed through unchanged.
    """

    BASE_URL = "https://api.football-data.org/v4"
    COMPETITIONS = {
        "epl": "PL",          # Premier League
        "laliga": "PD",       # La Liga
        "bundesliga": "BL1",
        "seriea": "SA",
        "ligue1": "FL1",
        "ucl": "CL",          # Champions League
        "worldcup": "WC",
    }
    # Seconds to wait for a response; without it a stalled connection
    # would block the caller indefinitely.
    TIMEOUT = 30

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.headers = {"X-Auth-Token": api_key}

    @staticmethod
    def _dig(node, *keys):
        """Follow ``keys`` through nested dicts, returning None on any gap.

        Unlike ``d.get(k, {}).get(...)`` this also tolerates keys that are
        present with an explicit ``null`` value.
        """
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    def _request(self, endpoint: str, params: dict = None) -> dict:
        """Make API request.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        response = requests.get(
            url, headers=self.headers, params=params, timeout=self.TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def get_competitions(self) -> pd.DataFrame:
        """Get available competitions (one row per competition)."""
        data = self._request("competitions")
        competitions = [
            {
                "id": comp.get("id"),
                "code": comp.get("code"),
                "name": comp.get("name"),
                "area": self._dig(comp, "area", "name"),
                "type": comp.get("type"),
                "current_season": self._dig(comp, "currentSeason", "id"),
            }
            for comp in data.get("competitions") or []
        ]
        return pd.DataFrame(competitions)

    def get_standings(self, competition: str) -> pd.DataFrame:
        """Get competition standings (rows of the ``TOTAL`` table only)."""
        comp_code = self.COMPETITIONS.get(competition, competition)
        data = self._request(f"competitions/{comp_code}/standings")
        teams = []
        for standing in data.get("standings") or []:
            if standing.get("type") != "TOTAL":
                continue
            for team in standing.get("table") or []:
                teams.append({
                    "position": team.get("position"),
                    "team": self._dig(team, "team", "name"),
                    "played": team.get("playedGames"),
                    "won": team.get("won"),
                    "draw": team.get("draw"),
                    "lost": team.get("lost"),
                    "goals_for": team.get("goalsFor"),
                    "goals_against": team.get("goalsAgainst"),
                    "goal_diff": team.get("goalDifference"),
                    "points": team.get("points"),
                })
        return pd.DataFrame(teams)

    def get_matches(self, competition: str, status: str = None,
                    date_from: str = None, date_to: str = None) -> pd.DataFrame:
        """Get matches for a competition.

        Args:
            competition: Alias from ``COMPETITIONS`` or a raw code.
            status: Optional status filter (SCHEDULED, LIVE, FINISHED).
            date_from: Optional value for the ``dateFrom`` query parameter.
            date_to: Optional value for the ``dateTo`` query parameter.
        """
        comp_code = self.COMPETITIONS.get(competition, competition)
        params = {}
        if status:
            params["status"] = status  # SCHEDULED, LIVE, FINISHED
        if date_from:
            params["dateFrom"] = date_from
        if date_to:
            params["dateTo"] = date_to
        data = self._request(f"competitions/{comp_code}/matches", params)
        matches = [
            {
                "match_id": match.get("id"),
                "matchday": match.get("matchday"),
                "date": match.get("utcDate"),
                "status": match.get("status"),
                "home_team": self._dig(match, "homeTeam", "name"),
                "away_team": self._dig(match, "awayTeam", "name"),
                "home_score": self._dig(match, "score", "fullTime", "home"),
                "away_score": self._dig(match, "score", "fullTime", "away"),
                "winner": self._dig(match, "score", "winner"),
            }
            for match in data.get("matches") or []
        ]
        return pd.DataFrame(matches)

    def get_team(self, team_id: int) -> dict:
        """Get team details."""
        return self._request(f"teams/{team_id}")

    def get_top_scorers(self, competition: str) -> pd.DataFrame:
        """Get top scorers for a competition."""
        comp_code = self.COMPETITIONS.get(competition, competition)
        data = self._request(f"competitions/{comp_code}/scorers")
        scorers = [
            {
                "player": self._dig(scorer, "player", "name"),
                "team": self._dig(scorer, "team", "name"),
                "goals": scorer.get("goals"),
                "assists": scorer.get("assists"),
                "penalties": scorer.get("penalties"),
            }
            for scorer in data.get("scorers") or []
        ]
        return pd.DataFrame(scorers)
# Usage (requires API key from football-data.org)
# fd = FootballDataClient("YOUR_API_KEY")
# standings = fd.get_standings("epl")
# print(standings)
Fantasy Sports API Integration
Integrate with Yahoo Fantasy Sports API for leagues, rosters, and player data.
import requests
from requests_oauthlib import OAuth2Session
import pandas as pd
class YahooFantasyClient:
    """Client for Yahoo Fantasy Sports API.

    Typical flow: ``get_auth_url()`` -> user consents -> ``fetch_token()``
    with the redirect URL -> data methods.

    NOTE(review): the response parsing assumes Yahoo's JSON layout of
    lists of single-key dicts (sometimes with non-dict placeholders);
    confirm against a live response.
    """

    AUTH_URL = "https://api.login.yahoo.com/oauth2/request_auth"
    TOKEN_URL = "https://api.login.yahoo.com/oauth2/get_token"
    BASE_URL = "https://fantasysports.yahooapis.com/fantasy/v2"
    # Seconds to wait for a response before giving up.
    TIMEOUT = 30

    def __init__(self, client_id: str, client_secret: str, redirect_uri: str):
        self.client_id = client_id
        self.client_secret = client_secret
        self.redirect_uri = redirect_uri
        self.oauth = None
        self.token = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _ensure_session(self):
        """Create the OAuth2 session on first use and return it."""
        if self.oauth is None:
            self.oauth = OAuth2Session(
                self.client_id,
                redirect_uri=self.redirect_uri,
                scope=["fspt-r"],
            )
        return self.oauth

    @staticmethod
    def _dig(node, *path):
        """Walk ``path`` through nested dicts (str keys) and lists (int
        indices); return None as soon as a step is missing."""
        for step in path:
            if isinstance(step, int):
                if not isinstance(node, list) or len(node) <= step:
                    return None
                node = node[step]
            else:
                if not isinstance(node, dict):
                    return None
                node = node.get(step)
        return node

    @staticmethod
    def _merge_fields(entries) -> dict:
        """Collapse a list of single-key dicts into one dict.

        Non-dict entries (e.g. empty-list placeholders) are ignored, so a
        field is found whatever its position in the list.
        """
        merged = {}
        if isinstance(entries, list):
            for entry in entries:
                if isinstance(entry, dict):
                    merged.update(entry)
        return merged

    # ------------------------------------------------------------------
    # Authentication
    # ------------------------------------------------------------------
    def get_auth_url(self) -> str:
        """Get authorization URL for user consent."""
        self.oauth = None  # always start a fresh session, as before
        auth_url, _ = self._ensure_session().authorization_url(self.AUTH_URL)
        return auth_url

    def fetch_token(self, authorization_response: str):
        """Exchange authorization code for access token.

        Works even when ``get_auth_url`` was not called on this instance
        (previously that raised AttributeError on ``None``).
        """
        self.token = self._ensure_session().fetch_token(
            self.TOKEN_URL,
            authorization_response=authorization_response,
            client_secret=self.client_secret,
        )

    def _request(self, endpoint: str) -> dict:
        """Make authenticated API request.

        Raises:
            RuntimeError: If no access token has been fetched yet.
            requests.HTTPError: If the server answers with an error status.
        """
        if not self.token or "access_token" not in self.token:
            raise RuntimeError(
                "No access token available; call fetch_token() first."
            )
        url = f"{self.BASE_URL}/{endpoint}"
        headers = {"Authorization": f"Bearer {self.token['access_token']}"}
        params = {"format": "json"}
        response = requests.get(
            url, headers=headers, params=params, timeout=self.TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    # ------------------------------------------------------------------
    # Data access
    # ------------------------------------------------------------------
    def get_user_leagues(self, game_key: str = "nfl") -> pd.DataFrame:
        """Get user's fantasy leagues."""
        data = self._request(f"users;use_login=1/games;game_keys={game_key}/leagues")
        leagues_node = self._dig(
            data, "fantasy_content", "users", "0", "user", 1,
            "games", "0", "game", 1, "leagues",
        ) or {}
        leagues = []
        for league in leagues_node.values():
            # Non-dict values (e.g. a "count" entry) are skipped.
            if not isinstance(league, dict):
                continue
            info = self._dig(league, "league", 0)
            if not isinstance(info, dict):
                continue
            leagues.append({
                "league_key": info.get("league_key"),
                "league_id": info.get("league_id"),
                "name": info.get("name"),
                "num_teams": info.get("num_teams"),
                "scoring_type": info.get("scoring_type"),
            })
        return pd.DataFrame(leagues)

    def get_league_standings(self, league_key: str) -> pd.DataFrame:
        """Get league standings."""
        data = self._request(f"league/{league_key}/standings")
        standings = self._dig(
            data, "fantasy_content", "league", 1, "standings", 0, "teams"
        ) or {}
        teams = []
        for team_data in standings.values():
            if not isinstance(team_data, dict):
                continue
            team = team_data.get("team") or []
            if not team:
                continue
            meta = self._merge_fields(self._dig(team, 0))
            # team_standings is looked up in every trailing element, not
            # only team[1].
            extras = self._merge_fields(team[1:])
            team_standings = extras.get("team_standings") or {}
            outcome = team_standings.get("outcome_totals") or {}
            teams.append({
                "team_key": meta.get("team_key"),
                "name": meta.get("name"),
                "rank": team_standings.get("rank"),
                "wins": outcome.get("wins"),
                "losses": outcome.get("losses"),
                "points_for": team_standings.get("points_for"),
                "points_against": team_standings.get("points_against"),
            })
        return pd.DataFrame(teams)

    def get_roster(self, team_key: str) -> pd.DataFrame:
        """Get team roster."""
        data = self._request(f"team/{team_key}/roster")
        roster = self._dig(
            data, "fantasy_content", "team", 1, "roster", "0", "players"
        ) or {}
        players = []
        for player_data in roster.values():
            if not isinstance(player_data, dict):
                continue
            player = player_data.get("player") or []
            if not player:
                continue
            # Look fields up by key rather than by fixed list position.
            meta = self._merge_fields(self._dig(player, 0))
            players.append({
                "player_key": meta.get("player_key"),
                "name": self._dig(meta, "name", "full"),
                "position": meta.get("display_position"),
                "team": meta.get("editorial_team_abbr"),
                "status": meta.get("status"),
            })
        return pd.DataFrame(players)
# Usage (requires Yahoo Developer credentials)
# yahoo = YahooFantasyClient(
# client_id="YOUR_CLIENT_ID",
# client_secret="YOUR_CLIENT_SECRET",
# redirect_uri="https://localhost/callback"
# )
# auth_url = yahoo.get_auth_url()
# print(f"Visit: {auth_url}")
Bootstrap Confidence Intervals
Calculate bootstrap confidence intervals for sports statistics with the boot package.
library(boot)
library(dplyr)
library(ggplot2)
# Sample batting data
# Seed BEFORE any random draw so the simulated season is reproducible.
set.seed(42)
n_games <- 162
at_bats <- sample(3:5, n_games, replace = TRUE)

player_games <- data.frame(
  game = seq_len(n_games),
  ab = at_bats,
  # Hits are drawn out of each game's own at-bats, so hits can never
  # exceed ab (a fixed size of 4 allowed 4 hits in a 3-AB game).
  hits = rbinom(n_games, at_bats, 0.280)
)

# Per-game average and running season average
player_games <- player_games %>%
  mutate(
    avg = hits / ab,
    cum_avg = cumsum(hits) / cumsum(ab)
  )
# Statistic for boot(): pooled batting average (total hits / total at-bats)
# over the rows selected by `indices`.
batting_stat <- function(data, indices) {
  with(data[indices, ], sum(hits) / sum(ab))
}
# Run bootstrap: 10,000 resamples of the game log
set.seed(42)
boot_results <- boot(
  data      = player_games,
  statistic = batting_stat,
  R         = 10000
)

# View results
print(boot_results)

# Confidence intervals by four methods
# BCa (bias-corrected and accelerated) is preferred
ci_results <- boot.ci(
  boot_results,
  type = c("norm", "basic", "perc", "bca")
)
print(ci_results)

# Visualize bootstrap distribution
boot_df <- data.frame(avg = boot_results$t)
bca_limits <- ci_results$bca[4:5]  # lower / upper BCa end points

ggplot(boot_df, aes(x = avg)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 50, fill = "steelblue", alpha = 0.7
  ) +
  geom_density(color = "red", size = 1) +
  # Dashed line: statistic on the original sample
  geom_vline(
    xintercept = boot_results$t0,
    color = "black", linetype = "dashed", size = 1
  ) +
  # Dotted lines: BCa interval limits
  geom_vline(
    xintercept = bca_limits,
    color = "darkgreen", linetype = "dotted", size = 1
  ) +
  labs(
    title = "Bootstrap Distribution of Batting Average",
    subtitle = paste(
      "95% BCa CI: [", round(bca_limits[1], 3),
      ", ", round(bca_limits[2], 3), "]"
    ),
    x = "Batting Average",
    y = "Density"
  ) +
  theme_minimal()
# Bootstrap several statistics at once: boot() accepts a statistic that
# returns a vector.
multi_stat <- function(data, indices) {
  d <- data[indices, ]
  total_hits <- sum(d$hits)
  total_ab <- sum(d$ab)
  c(
    avg = total_hits / total_ab,
    obp = (total_hits + 10) / (total_ab + 20),  # Simplified
    games_above_300 = mean(d$avg > 0.300)
  )
}

multi_boot <- boot(player_games, multi_stat, R = 5000)
print(multi_boot)
Bayesian Analysis with brms
Perform Bayesian regression analysis for sports data using the brms package.
library(brms)
library(dplyr)
library(ggplot2)
library(bayesplot)
# Simulated player projection data (100 players)
set.seed(42)
players <- data.frame(
  player_id  = 1:100,
  age        = sample(23:38, 100, replace = TRUE),
  experience = pmax(1, sample(1:15, 100, replace = TRUE)),
  avg_3yr    = rnorm(100, 0.265, 0.025),
  war_3yr    = rnorm(100, 2.5, 1.5)
) %>%
  mutate(
    # Next-year WAR: carry-over of past WAR, an age curve centred on 27,
    # and Gaussian noise
    next_war = war_3yr * 0.6 +
      3 * exp(-(age - 27)^2 / 50) +
      rnorm(100, 0, 0.8)
  )

# Model specification
war_formula <- next_war ~ age + I(age^2) + experience + avg_3yr + war_3yr

# Priors: informative based on domain knowledge
war_priors <- c(
  prior(normal(0, 5), class = "Intercept"),
  prior(normal(0, 1), class = "b"),
  prior(exponential(1), class = "sigma")
)

# Fit Bayesian model with brms
model <- brm(
  formula = war_formula,
  data    = players,
  family  = gaussian(),
  prior   = war_priors,
  chains  = 4,
  iter    = 2000,
  warmup  = 1000,
  seed    = 42
)
# Summary
summary(model)

# Posterior distributions
# brms sanitises coefficient names: the term I(age^2) is stored as
# "b_IageE2", so the literal "b_Iage2" does not exist. Selecting with a
# regex picks up the quadratic term under either spelling.
mcmc_areas(
  as.array(model),
  regex_pars = "^b_(age|Iage.*|experience|war_3yr)$",
  prob = 0.95
)

# Posterior predictive check
pp_check(model, ndraws = 100)
# Predictions with uncertainty for one hypothetical player
new_player <- data.frame(age = 28, experience = 5, avg_3yr = 0.285, war_3yr = 3.5)

# summary = FALSE returns the raw predictive draws rather than a summary
pred <- predict(model, newdata = new_player, summary = FALSE)

# Prediction summary
pred_mean <- round(mean(pred), 2)
pred_interval <- round(quantile(pred, c(0.025, 0.975)), 2)
cat("Predicted WAR for new player:\n")
cat("Mean:", pred_mean, "\n")
cat("95% CI:", pred_interval, "\n")

# Probability of being above average (WAR > 2): share of draws above 2
prob_above_avg <- mean(pred > 2)
cat("P(WAR > 2):", round(prob_above_avg, 3), "\n")

# Conditional effects plot
conditional_effects(model, effects = "age")
Survival Analysis for Player Careers
Analyze player career longevity using survival analysis techniques in R.
library(survival)
library(survminer)
library(dplyr)
# Simulated career data
set.seed(42)
n_players <- 200
position_levels <- c("Pitcher", "Infielder", "Outfielder", "Catcher")

careers <- data.frame(
  player_id   = 1:n_players,
  draft_round = sample(1:10, n_players, replace = TRUE),
  position    = sample(position_levels, n_players, replace = TRUE),
  college     = sample(c("Yes", "No"), n_players, replace = TRUE, prob = c(0.6, 0.4)),
  debut_age   = sample(21:26, n_players, replace = TRUE)
) %>%
  mutate(
    # Career length: 6-year base, bonus for earlier draft rounds, penalty
    # for pitchers, bonus for college players, plus noise
    base_career = 6 +
      (11 - draft_round) * 0.3 +
      -1 * (position == "Pitcher") +
      0.5 * (college == "Yes") +
      rnorm(n_players, 0, 2),
    # Whole years, at least one
    career_years = pmax(1, round(base_career)),
    # Event indicator: 1 = retired, 0 = censored (still active)
    retired = rbinom(n_players, 1, 0.85)
  )
# Survival object: time = career length, event = retirement
surv_obj <- Surv(time = careers$career_years, event = careers$retired)

# --- Kaplan-Meier, all players -------------------------------------------
km_fit <- survfit(surv_obj ~ 1)
print(km_fit)

ggsurvplot(
  km_fit,
  data       = careers,
  conf.int   = TRUE,
  risk.table = TRUE,
  xlab       = "Years in MLB",
  ylab       = "Probability of Active Career",
  title      = "MLB Career Survival Curve"
)

# --- Kaplan-Meier, stratified by position --------------------------------
km_position <- survfit(surv_obj ~ position, data = careers)

ggsurvplot(
  km_position,
  data         = careers,
  pval         = TRUE,
  risk.table   = TRUE,
  legend.title = "Position",
  palette      = "jco"
)

# --- Cox proportional hazards model --------------------------------------
cox_model <- coxph(
  surv_obj ~ draft_round + position + college + debut_age,
  data = careers
)
summary(cox_model)

# Forest plot of hazard ratios
ggforest(cox_model, data = careers)

# Test proportional hazards assumption
ph_test <- cox.zph(cox_model)
print(ph_test)
# Predicted survival curves for different profiles:
# same position / college / debut age, three different draft rounds
new_players <- data.frame(
  draft_round = c(1, 5, 10),
  position = "Infielder",
  college = "Yes",
  debut_age = 23
)
pred_surv <- survfit(cox_model, newdata = new_players)

# ggsurvplot() cannot recover the data from a survfit(coxph, newdata = ...)
# object, so it has to be passed explicitly.
ggsurvplot(
  pred_surv,
  data = new_players,
  legend.labs = c("Round 1", "Round 5", "Round 10")
)
Lahman Database Analysis
Analyze historical baseball data using the Lahman package with dplyr for aggregations.
library(Lahman)
library(dplyr)
library(ggplot2)
# Season batting lines since 1950, one row per player-season
# (multiple rows for the same player and year are summed)
batting <- Lahman::Batting %>%
  filter(yearID >= 1950) %>%
  group_by(playerID, yearID) %>%
  summarize(
    across(c(AB, H, HR, RBI, BB, SO), ~ sum(.x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  # Keep seasons with at least 400 at-bats
  filter(AB >= 400) %>%
  mutate(
    AVG = H / AB,
    OBP = (H + BB) / (AB + BB),
    ISO = (HR * 4 + (H - HR)) / AB - AVG  # Simplified ISO
  )

# Player names, keyed by playerID
people <- Lahman::People %>%
  mutate(name = paste(nameFirst, nameLast)) %>%
  select(playerID, nameFirst, nameLast, name)

batting_named <- left_join(batting, people, by = "playerID")

# Top 20 seasons by HR
top_hr_seasons <- batting_named %>%
  arrange(desc(HR)) %>%
  head(20) %>%
  select(name, yearID, AB, HR, AVG, OBP)

print(top_hr_seasons)
# Yearly HR summary across the filtered batters
hr_trend <- batting %>%
  group_by(yearID) %>%
  summarize(
    avg_hr   = mean(HR),
    total_hr = sum(HR),
    players  = n()
  )

# Yearly average with a loess smoother on top
ggplot(hr_trend, aes(x = yearID, y = avg_hr)) +
  geom_line(color = "blue", size = 1) +
  geom_smooth(method = "loess", se = TRUE, alpha = 0.2) +
  labs(
    title = "Average Home Runs per Qualified Batter",
    x = "Year",
    y = "Average HR"
  ) +
  theme_minimal()
tidymodels Player Prediction
Build a predictive model using a tidymodels workflow for player performance forecasting.
library(tidymodels)
library(dplyr)
# Simulated player data
set.seed(42)
n_obs <- 500
player_data <- tibble(
  player_id     = seq_len(n_obs),
  age           = sample(22:38, n_obs, replace = TRUE),
  experience    = sample(1:15, n_obs, replace = TRUE),
  avg_3yr       = runif(n_obs, 0.220, 0.320),
  obp_3yr       = runif(n_obs, 0.280, 0.420),
  slg_3yr       = runif(n_obs, 0.350, 0.600),
  war_3yr       = runif(n_obs, -1, 8),
  # Target: 70% carry-over of past WAR plus unit-variance noise
  next_year_war = war_3yr * 0.7 + rnorm(n_obs, 0, 1)
)

# 80/20 train/test split, stratified on the outcome
data_split <- initial_split(player_data, prop = 0.8, strata = next_year_war)
train_data <- training(data_split)
test_data  <- testing(data_split)
# Define recipe (preprocessing)
# Step order matters: step_poly() replaces `age` with age_poly_1 and
# age_poly_2, so the age:experience interaction has to be created while
# `age` still exists. With the steps the other way round the recipe fails
# when it is prepped.
war_recipe <- recipe(
  next_year_war ~ age + experience + avg_3yr + obp_3yr + slg_3yr + war_3yr,
  data = train_data
) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_interact(terms = ~ age:experience) %>%
  step_poly(age, degree = 2)
# Random forest (ranger engine); mtry and min_n are left for tuning
rf_model <- rand_forest(mtry = tune(), trees = 500, min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Workflow = preprocessing recipe + model
war_workflow <- workflow() %>%
  add_recipe(war_recipe) %>%
  add_model(rf_model)

# 5-fold cross-validation on the training set
cv_folds <- vfold_cv(train_data, v = 5)

# Regular 5 x 5 grid over the two tuned hyperparameters
rf_grid <- grid_regular(
  mtry(range = c(2, 6)),
  min_n(range = c(5, 20)),
  levels = 5
)

tune_results <- tune_grid(
  war_workflow,
  resamples = cv_folds,
  grid      = rf_grid,
  metrics   = metric_set(rmse, rsq, mae)
)

# Keep the combination with the best RMSE and plug it into the workflow
best_params    <- select_best(tune_results, metric = "rmse")
final_workflow <- finalize_workflow(war_workflow, best_params)

# Fit final model on the full training set
final_fit <- fit(final_workflow, data = train_data)

# Evaluate on test set
predictions <- bind_cols(predict(final_fit, test_data), test_data)

metrics <- metrics(predictions, truth = next_year_war, estimate = .pred)
print(metrics)
Sports Regression Diagnostics
Comprehensive regression diagnostics for sports statistics models in R.
library(dplyr)
library(ggplot2)
library(broom)
library(car)
# Example: Predict ERA from various pitching metrics (10 sample pitchers)
pitching_data <- tibble(
  era      = c(3.2, 4.1, 3.8, 2.9, 5.2, 3.5, 4.8, 3.1, 4.4, 3.7),
  fip      = c(3.1, 4.3, 3.6, 2.8, 5.0, 3.4, 4.5, 3.0, 4.2, 3.5),
  whip     = c(1.10, 1.32, 1.21, 1.05, 1.45, 1.18, 1.38, 1.08, 1.28, 1.15),
  k_per_9  = c(9.2, 7.5, 8.8, 10.1, 6.5, 8.2, 7.0, 9.8, 7.8, 8.5),
  bb_per_9 = c(2.5, 3.2, 2.8, 2.1, 4.0, 2.9, 3.5, 2.3, 3.1, 2.7)
)

# Ordinary least squares fit
model <- lm(era ~ fip + whip + k_per_9 + bb_per_9, data = pitching_data)

# Base-R summary
summary(model)

# Coefficient table with confidence intervals (broom)
tidy_results <- tidy(model, conf.int = TRUE)
print(tidy_results)

# One-row model-level statistics (broom)
glance(model)

# Observation-level quantities (.fitted, .resid, .std.resid, ...) used by
# the diagnostic plots
augmented <- augment(model)
# 1. Residuals vs Fitted
p1 <- ggplot(augmented, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(se = FALSE) +
  labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals") +
  theme_minimal()

# 2. Q-Q Plot of standardized residuals
p2 <- ggplot(augmented, aes(sample = .std.resid)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "Normal Q-Q", x = "Theoretical", y = "Standardized Residuals") +
  theme_minimal()

# 3. Scale-Location
p3 <- ggplot(augmented, aes(x = .fitted, y = sqrt(abs(.std.resid)))) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(title = "Scale-Location", x = "Fitted", y = "sqrt(|Standardized Residuals|)") +
  theme_minimal()

# VIF for multicollinearity (car::vif)
vif_values <- vif(model)
print("Variance Inflation Factors:")
print(vif_values)

# Cook's Distance for influential points, flagged above 4 / n
cooks_d <- cooks.distance(model)
cooks_cutoff <- 4 / nrow(pitching_data)
influential <- which(cooks_d > cooks_cutoff)
print(paste("Influential observations:", paste(influential, collapse = ", ")))
Player Similarity with Clustering
Find similar players using hierarchical clustering and visualization in R.
library(dplyr)
library(ggplot2)
library(cluster)
library(factoextra)
# Sample player statistics (10 players, 7 metrics)
players <- tibble(
  name   = c("Player A", "Player B", "Player C", "Player D", "Player E",
             "Player F", "Player G", "Player H", "Player I", "Player J"),
  avg    = c(.310, .245, .298, .275, .320, .258, .288, .265, .302, .278),
  obp    = c(.380, .310, .365, .345, .395, .325, .355, .330, .375, .350),
  slg    = c(.520, .420, .485, .450, .550, .400, .470, .435, .505, .460),
  hr     = c(32, 15, 25, 20, 38, 12, 22, 18, 28, 21),
  sb     = c(5, 25, 12, 18, 3, 30, 8, 22, 6, 15),
  bb_pct = c(12, 8, 10, 9, 14, 7, 9, 8, 11, 10),
  k_pct  = c(18, 22, 16, 20, 15, 25, 17, 21, 14, 19)
)

# Standardise every metric so none dominates the distance; the result is
# a numeric matrix with the player names as row names
player_stats <- scale(select(players, -name))
rownames(player_stats) <- players$name

# Hierarchical clustering: Euclidean distance, Ward linkage
dist_matrix   <- dist(player_stats, method = "euclidean")
hclust_result <- hclust(dist_matrix, method = "ward.D2")

# Dendrogram coloured by a 4-cluster cut
fviz_dend(
  hclust_result,
  k       = 4,
  cex     = 0.8,
  main    = "Player Similarity Dendrogram",
  xlab    = "Players",
  palette = "jco"
)

# Attach the 4-cluster membership to the player table
clusters <- cutree(hclust_result, k = 4)
players$cluster <- as.factor(clusters)

# Project the players onto principal components, coloured by cluster
pca_result <- prcomp(player_stats)
fviz_pca_ind(
  pca_result,
  geom.ind     = "point",
  col.ind      = players$cluster,
  palette      = "jco",
  addEllipses  = TRUE,
  legend.title = "Cluster"
)
# Find the n players closest to `target_player`.
#
# Arguments:
#   target_player  name to look up in data$name
#   data           data frame with a `name` column whose rows are in the
#                  same order as the observations behind `dist_mat`
#   n              number of neighbours wanted (capped at nrow - 1)
#   dist_mat       a `dist` object or square distance matrix; defaults to
#                  the global `dist_matrix`
#
# Returns the matching rows of `data`, nearest first, with an added
# `similarity` column (1 - distance / largest distance from the target),
# or NULL when the player is not found.
find_similar <- function(target_player, data, n = 5, dist_mat = dist_matrix) {
  target_idx <- which(data$name == target_player)
  if (length(target_idx) == 0) return(NULL)
  target_idx <- target_idx[1]

  distances <- as.matrix(dist_mat)[target_idx, ]

  # Drop the target by index rather than assuming it sorts first (ties at
  # distance 0 could put another player ahead of it), and never ask for
  # more neighbours than exist (that produced NA rows).
  neighbours <- setdiff(order(distances), target_idx)
  similar_idx <- head(neighbours, max(n, 0))

  max_dist <- max(distances)
  result <- data[similar_idx, ]
  result$similarity <- if (max_dist > 0) {
    1 - unname(distances[similar_idx]) / max_dist
  } else {
    rep(1, length(similar_idx))
  }
  result
}
# Example: nearest neighbours of Player A (default n = 5)
similar_to_A <- find_similar(target_player = "Player A", data = players)
print(similar_to_A)
Time Series Forecasting with fable
Forecast seasonal sports metrics using the fable package for tidy time series.
library(fable)
library(tsibble)
library(feasts)
library(dplyr)
library(ggplot2)
# Simulated monthly attendance, Jan 2020 - Dec 2023
set.seed(42)
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")
month_idx <- seq_along(dates)

attendance <- tibble(
  date = dates,
  attendance = 30000 +
    10000 * sin(2 * pi * month_idx / 12) +  # Seasonality
    500 * month_idx +                       # Trend
    rnorm(length(dates), 0, 2000)           # Noise
)

# Convert to a tsibble indexed by year-month
attendance_ts <- attendance %>%
  mutate(month = yearmonth(date)) %>%
  as_tsibble(index = month)

# STL decomposition with a periodic seasonal window, then plot the components
attendance_ts %>%
  model(STL(attendance ~ season(window = "periodic"))) %>%
  components() %>%
  autoplot()
# Fit three candidate models
models <- model(
  attendance_ts,
  ets    = ETS(attendance),
  arima  = ARIMA(attendance),
  snaive = SNAIVE(attendance)
)

# Accuracy of the fitted models
accuracy(models)

# Forecast 12 months ahead
forecasts <- forecast(models, h = "12 months")

# One panel per model, with 80% and 95% intervals over the history
forecasts %>%
  autoplot(attendance_ts, level = c(80, 95)) +
  facet_wrap(~.model, ncol = 1) +
  labs(
    title = "Attendance Forecasts",
    y = "Monthly Attendance",
    x = "Date"
  ) +
  theme_minimal()

# Cross-validation with stretched windows: .init = 24, .step = 3,
# forecasting 3 steps ahead from each window
cv_results <- attendance_ts %>%
  stretch_tsibble(.init = 24, .step = 3) %>%
  model(
    ets   = ETS(attendance),
    arima = ARIMA(attendance)
  ) %>%
  forecast(h = 3) %>%
  accuracy(attendance_ts)

print(cv_results)
Shiny Dashboard for Sports Stats
Create an interactive Shiny dashboard for exploring player statistics.
library(shiny)
library(shinydashboard)
library(dplyr)
library(ggplot2)
library(DT)
# Sample data: 20 players, 5 per team (no seed is set, so values differ
# between runs)
team_names <- c("Team A", "Team B", "Team C", "Team D")
field_positions <- c("C", "1B", "2B", "SS", "3B", "OF")

players_data <- data.frame(
  name     = paste("Player", LETTERS[1:20]),
  team     = rep(team_names, each = 5),
  position = sample(field_positions, 20, replace = TRUE),
  avg      = runif(20, 0.220, 0.320),
  hr       = sample(5:40, 20),
  rbi      = sample(30:120, 20),
  war      = runif(20, -1, 7)
)
# Sidebar: team filter, statistic selector and minimum-WAR slider
sidebar <- dashboardSidebar(
  selectInput(
    "team", "Select Team:",
    choices = c("All", unique(players_data$team))
  ),
  selectInput(
    "stat", "Select Statistic:",
    choices = c("avg", "hr", "rbi", "war")
  ),
  sliderInput(
    "min_war", "Minimum WAR:",
    min = -1, max = 7, value = 0
  )
)

# Body: three value boxes, two plots side by side, then the data table
body <- dashboardBody(
  fluidRow(
    valueBoxOutput("total_players"),
    valueBoxOutput("avg_stat"),
    valueBoxOutput("top_player")
  ),
  fluidRow(
    box(title = "Distribution", status = "primary",
        plotOutput("hist_plot"), width = 6),
    box(title = "Comparison", status = "info",
        plotOutput("bar_plot"), width = 6)
  ),
  fluidRow(
    box(title = "Player Data", status = "success",
        DTOutput("player_table"), width = 12)
  )
)

ui <- dashboardPage(
  dashboardHeader(title = "Sports Stats Dashboard"),
  sidebar,
  body
)
# Server logic: every output is derived from the reactive `filtered_data()`.
server <- function(input, output) {
  # Players passing the minimum-WAR slider and, unless "All" is chosen,
  # the selected team.
  filtered_data <- reactive({
    data <- players_data %>%
      filter(war >= input$min_war)
    if (input$team != "All") {
      data <- data %>% filter(team == input$team)
    }
    data
  })

  output$total_players <- renderValueBox({
    valueBox(nrow(filtered_data()), "Players", icon = icon("users"))
  })

  output$avg_stat <- renderValueBox({
    data <- filtered_data()
    # An empty selection would show NaN; show a placeholder instead.
    shown <- if (nrow(data) > 0) {
      round(mean(data[[input$stat]], na.rm = TRUE), 3)
    } else {
      "N/A"
    }
    valueBox(shown, paste("Avg", input$stat), icon = icon("chart-line"))
  })

  output$top_player <- renderValueBox({
    data <- filtered_data()
    # slice(1) on an empty table yields a zero-length name; guard against it.
    top_name <- if (nrow(data) > 0) {
      data %>%
        arrange(desc(.data[[input$stat]])) %>%
        slice(1) %>%
        pull(name)
    } else {
      "N/A"
    }
    valueBox(top_name, "Top Player", icon = icon("trophy"))
  })

  output$hist_plot <- renderPlot({
    data <- filtered_data()
    validate(need(nrow(data) > 0, "No players match the current filters."))
    # aes_string() is deprecated; the .data pronoun selects the column
    # named by the input.
    ggplot(data, aes(x = .data[[input$stat]])) +
      geom_histogram(bins = 15, fill = "steelblue", color = "white") +
      labs(title = paste("Distribution of", input$stat), x = input$stat) +
      theme_minimal()
  })

  output$bar_plot <- renderPlot({
    data <- filtered_data()
    validate(need(nrow(data) > 0, "No players match the current filters."))
    data %>%
      arrange(desc(.data[[input$stat]])) %>%
      head(10) %>%
      ggplot(aes(x = reorder(name, -.data[[input$stat]]),
                 y = .data[[input$stat]])) +
      geom_col(fill = "coral") +
      coord_flip() +
      labs(title = paste("Top 10 by", input$stat), x = "", y = input$stat) +
      theme_minimal()
  })

  output$player_table <- renderDT({
    datatable(filtered_data(), options = list(pageLength = 10))
  })
}
# Run app
# shinyApp(ui, server)
Mixed Effects Model for Player Analysis
Use mixed effects models to account for team and park effects in player statistics.
library(lme4)
library(dplyr)
library(ggplot2)
library(broom.mixed)
# Sample data with player nested in team
set.seed(42)
n_teams <- 10
n_players_per_team <- 15
n_seasons <- 3

# expand.grid() varies its FIRST argument fastest. Season is therefore
# listed first and team last, so rows come out as
#   team A / player 1 / 2022, 2023, 2024, team A / player 2 / ...
# which is the layout the rep(..., each = ...) calls below rely on. With
# team listed first, the simulated team and player effects were attached
# to rows of different teams and players.
player_data <- expand.grid(
  season = seq(2022, length.out = n_seasons),
  player = 1:n_players_per_team,
  team = paste("Team", LETTERS[1:n_teams])
) %>%
  mutate(
    player_id = paste(team, player, sep = "_"),
    # Team random effect: one draw per team, constant over its rows
    team_effect = rep(rnorm(n_teams, 0, 0.02), each = n_players_per_team * n_seasons),
    # Player random effect: one draw per player, constant over seasons
    player_skill = rep(rnorm(n_teams * n_players_per_team, 0.265, 0.025), each = n_seasons),
    # Season fixed effect
    season_effect = (season - 2022) * 0.003,
    # Home park factor: one draw per team
    park_factor = rep(runif(n_teams, 0.95, 1.05), each = n_players_per_team * n_seasons),
    # Observed batting average
    avg = player_skill + team_effect + season_effect + rnorm(n(), 0, 0.015)
  ) %>%
  # Restore the original column order (team, player, season, ...)
  select(team, player, season, everything()) %>%
  filter(avg > 0.150 & avg < 0.400) # Reasonable bounds
# Mixed effects model
#   Fixed:  season
#   Random: intercept per team, and per player within team
model <- lmer(
  avg ~ season + (1 | team) + (1 | team:player_id),
  data = player_data,
  REML = TRUE
)

# Model summary and variance components
summary(model)
VarCorr(model)

# Estimated random effects for each grouping factor
ranef_team   <- ranef(model)$team
ranef_player <- ranef(model)$`team:player_id`

# Team effects, largest first
team_effects <- data.frame(
  team   = rownames(ranef_team),
  effect = ranef_team[[1]]
) %>%
  arrange(desc(effect))

print("Team Random Effects:")
print(team_effects)

# Fitted values for the modelled rows
player_data$predicted_avg <- predict(model)

# Plot random effects
ggplot(team_effects, aes(x = reorder(team, effect), y = effect)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Team Random Effects on Batting Average",
    x = "Team",
    y = "Effect on AVG"
  ) +
  theme_minimal()

# ICC - proportion of variance at each level
icc <- as.data.frame(VarCorr(model))
total_var <- sum(icc$vcov)
print("Intraclass Correlations:")
print(mutate(icc, ICC = vcov / total_var))
JSON Data Flattener
Flatten nested JSON structures from sports APIs into tabular format for database storage.
import pandas as pd
import json
from typing import List, Dict, Any
from collections import defaultdict
class JSONFlattener:
    """Flatten nested JSON to tabular format.

    Nested dictionaries become columns whose names join the key path with
    ``separator``; lists of dictionaries are kept as JSON strings and can
    be expanded into their own table with ``extract_nested_array``.
    """

    def __init__(self, separator: str = "_"):
        self.separator = separator

    def flatten_dict(self, d: dict, parent_key: str = "", sep: str = None) -> dict:
        """Recursively flatten nested dictionary.

        Args:
            d: Dictionary to flatten.
            parent_key: Prefix prepended to every produced key.
            sep: Separator between path components. ``None`` (default)
                uses the instance's ``separator``; previously the default
                was a hard-coded "_" that ignored the instance setting.

        Returns:
            A new single-level dict. Lists whose first element is a dict
            are serialized with ``json.dumps``; other values are kept.
        """
        if sep is None:
            sep = self.separator
        flat = {}
        for key, value in d.items():
            new_key = f"{parent_key}{sep}{key}" if parent_key else key
            if isinstance(value, dict):
                flat.update(self.flatten_dict(value, new_key, sep))
            elif isinstance(value, list) and value and isinstance(value[0], dict):
                # List of dicts - kept as a JSON string
                flat[new_key] = json.dumps(value)
            else:
                flat[new_key] = value
        return flat

    def flatten_json(self, data: Any) -> pd.DataFrame:
        """Flatten JSON data to DataFrame.

        A dict yields a one-row frame, a list yields one row per element
        (non-dict elements go into a ``value`` column), anything else an
        empty frame.
        """
        if isinstance(data, dict):
            return pd.DataFrame([self.flatten_dict(data)])
        if isinstance(data, list):
            return pd.DataFrame([
                self.flatten_dict(record) if isinstance(record, dict)
                else {"value": record}
                for record in data
            ])
        return pd.DataFrame()

    def extract_nested_array(self, data: List[dict],
                             array_key: str,
                             parent_keys: List[str] = None) -> pd.DataFrame:
        """Extract nested array and maintain parent context.

        Args:
            data: Records that may each contain a list under ``array_key``.
            array_key: Key of the nested list to expand into rows.
            parent_keys: Keys copied from each record onto its rows.

        Returns:
            One row per nested item. Dict items are flattened with the
            instance separator; scalar items go into a column named
            ``array_key``. Records that are not dicts, or whose array is
            missing or null, contribute no rows.
        """
        rows = []
        for record in data:
            if not isinstance(record, dict):
                continue
            parent_data = {
                pk: record[pk] for pk in (parent_keys or []) if pk in record
            }
            # "or []" also covers an explicit null array
            for item in record.get(array_key) or []:
                if isinstance(item, dict):
                    rows.append({**parent_data, **self.flatten_dict(item)})
                else:
                    rows.append({**parent_data, array_key: item})
        return pd.DataFrame(rows)
# Example: one game record with nested team objects and a list of plays
game_json = {
    "game_id": "2024001",
    "date": "2024-04-01",
    "home_team": {"id": 1, "name": "Yankees", "city": "New York"},
    "away_team": {"id": 2, "name": "Red Sox", "city": "Boston"},
    "final_score": {"home": 5, "away": 3},
    "plays": [
        {"inning": 1, "batter": "Judge", "result": "single"},
        {"inning": 1, "batter": "Soto", "result": "home_run"},
    ],
}

flattener = JSONFlattener()

# Game-level view: a single row whose nested keys became columns
game_df = flattener.flatten_json(game_json)
print("Flattened game data:")
print(list(game_df.columns))

# Play-level view: one row per play, tagged with the game id and date
plays_df = flattener.extract_nested_array(
    data=[game_json],
    array_key="plays",
    parent_keys=["game_id", "date"],
)
print("\nPlays data:")
print(plays_df)
CSV to Database Importer
Robust CSV importer with encoding detection, type inference, and batch loading.
import pandas as pd
import chardet
from sqlalchemy import create_engine, types
import numpy as np
import logging
logger = logging.getLogger(__name__)
class CSVImporter:
    """Import CSV files to database with automatic configuration.

    ``engine`` is handed unchanged to ``DataFrame.to_sql`` (presumably a
    SQLAlchemy engine - confirm against the caller).
    """

    # Inclusive bounds of 16-bit and 32-bit signed integers
    _SMALLINT_RANGE = (-32768, 32767)
    _INT_RANGE = (-2147483648, 2147483647)
    # A text column is converted to datetime only when at least this share
    # of its non-null values parses as a date.
    _MIN_DATE_PARSE_RATIO = 0.5

    def __init__(self, engine):
        self.engine = engine

    def detect_encoding(self, file_path: str) -> str:
        """Detect file encoding from the first 10,000 bytes.

        Falls back to "utf-8" when chardet reports no encoding.
        """
        with open(file_path, "rb") as f:
            result = chardet.detect(f.read(10000))
        return result.get("encoding") or "utf-8"

    def infer_sql_types(self, df: pd.DataFrame) -> dict:
        """Infer SQL types from DataFrame.

        Returns:
            Mapping of column name to a ``sqlalchemy.types`` instance.
            Booleans map to Boolean (previously String(50)); integers are
            sized from both their minimum and maximum; strings are sized
            from the longest non-null value.
        """
        type_map = {}
        for col in df.columns:
            series = df[col]
            dtype = series.dtype
            if pd.api.types.is_bool_dtype(dtype):
                type_map[col] = types.Boolean()
            elif pd.api.types.is_integer_dtype(dtype):
                type_map[col] = self._integer_type(series)
            elif pd.api.types.is_float_dtype(dtype):
                type_map[col] = types.Float()
            elif pd.api.types.is_datetime64_any_dtype(dtype):
                type_map[col] = types.DateTime()
            else:  # String/object
                lengths = series.dropna().astype(str).str.len()
                max_len = int(lengths.max()) if not lengths.empty else 0
                if max_len <= 50:
                    type_map[col] = types.String(50)
                elif max_len <= 255:
                    type_map[col] = types.String(255)
                else:
                    type_map[col] = types.Text()
        return type_map

    def _integer_type(self, series: pd.Series):
        """Return the smallest integer SQL type that holds every value."""
        lo, hi = series.min(), series.max()
        if pd.isna(lo) or pd.isna(hi):
            # Empty or all-null column: nothing to size from
            return types.Integer()
        if self._SMALLINT_RANGE[0] <= lo and hi <= self._SMALLINT_RANGE[1]:
            return types.SmallInteger()
        if self._INT_RANGE[0] <= lo and hi <= self._INT_RANGE[1]:
            return types.Integer()
        return types.BigInteger()

    @staticmethod
    def _clean_column_name(name) -> str:
        """Lower-case, strip, and replace spaces/hyphens with underscores."""
        return str(name).lower().strip().replace(" ", "_").replace("-", "_")

    def _parse_date_columns(self, df: pd.DataFrame) -> None:
        """Convert date-like text columns to datetime, in place.

        Only text columns whose name contains "date" are considered, and a
        column is converted only when most of its values parse. Otherwise
        names such as "candidate" or "update_count" would be wiped to NaT
        or turned into bogus timestamps.
        """
        for col in df.columns:
            if "date" not in col:
                continue
            series = df[col]
            is_text = (pd.api.types.is_object_dtype(series)
                       or pd.api.types.is_string_dtype(series))
            if not is_text:
                continue
            non_null = int(series.notna().sum())
            if non_null == 0:
                continue
            parsed = pd.to_datetime(series, errors="coerce")
            ratio = parsed.notna().sum() / non_null
            if ratio >= self._MIN_DATE_PARSE_RATIO:
                df[col] = parsed
            else:
                logger.warning(
                    "Column %r looks like a date by name but only %.0f%% of "
                    "values parse; left unchanged", col, ratio * 100
                )

    def import_csv(self, file_path: str, table_name: str,
                   chunk_size: int = 5000,
                   if_exists: str = "replace") -> dict:
        """Import CSV file to database table.

        Args:
            file_path: CSV file to read.
            table_name: Destination table.
            chunk_size: Rows written per ``to_sql`` call; must be positive.
            if_exists: ``to_sql`` mode for the first chunk; later chunks
                are always appended.

        Returns:
            Dict with ``rows_loaded``, ``columns``, ``table``, ``encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        # Detect encoding
        encoding = self.detect_encoding(file_path)
        logger.info("Detected encoding: %s", encoding)

        # Read CSV with detected encoding
        df = pd.read_csv(file_path, encoding=encoding, low_memory=False)
        logger.info("Read %d rows, %d columns", len(df), len(df.columns))

        # Clean column names, then parse dates and infer types
        df.columns = [self._clean_column_name(c) for c in df.columns]
        self._parse_date_columns(df)
        sql_types = self.infer_sql_types(df)

        total_loaded = 0
        if df.empty:
            # Header-only file: still create (or replace) the table so the
            # result matches the requested if_exists mode.
            df.to_sql(table_name, self.engine, if_exists=if_exists,
                      index=False, dtype=sql_types)

        # Import in chunks
        for start in range(0, len(df), chunk_size):
            chunk = df.iloc[start:start + chunk_size]
            first = start == 0
            chunk.to_sql(
                table_name,
                self.engine,
                if_exists=if_exists if first else "append",
                index=False,
                dtype=sql_types if first else None,
                chunksize=1000,
            )
            total_loaded += len(chunk)
            logger.info("Loaded %d/%d rows", total_loaded, len(df))

        return {
            "rows_loaded": total_loaded,
            "columns": list(df.columns),
            "table": table_name,
            "encoding": encoding,
        }
# Usage
# NOTE: the connection URL carries placeholder credentials; substitute real
# settings (ideally read from configuration) before running.
engine = create_engine("mysql://user:pass@localhost/sports_db")
importer = CSVImporter(engine)
# Replace the batting_stats table with the file contents, 5,000 rows per chunk.
result = importer.import_csv(
"batting_stats_2024.csv",
"batting_stats",
chunk_size=5000,
if_exists="replace"
)
print(f"Imported {result['rows_loaded']} rows to {result['table']}")
Data Quality Monitoring
Monitor data quality metrics over time and alert on anomalies in sports data pipelines.
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Callable
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class QualityMetric:
    """Outcome of a single data-quality check."""

    name: str  # "<table>_<check>" identifier
    value: float  # measured value (percentage, age in hours or row count)
    threshold: float  # limit the value was compared against
    status: str  # "pass", "warn", "fail"
    timestamp: datetime  # when the check was evaluated


class DataQualityMonitor:
    """Monitor data quality for sports statistics.

    Each ``check_*`` method runs one query through ``pd.read_sql`` on the
    supplied connection and returns ``QualityMetric`` objects.  Table and
    column names are interpolated into the SQL text, so they must come from
    trusted configuration, never from user input.
    """

    def __init__(self, db_connection):
        self.db = db_connection
        # Every metric produced by run_all_checks, in execution order.
        self.metrics_history: List[QualityMetric] = []

    def check_completeness(self, table: str, required_cols: List[str],
                           sample_size: int = 10000,
                           order_col: str = "created_at") -> QualityMetric:
        """Check percentage of non-null values in required columns.

        The most recent ``sample_size`` rows (ordered by ``order_col``
        descending; pass ``None`` to skip ordering) are sampled and the
        per-column percentages averaged.  A required column that is absent
        from the table counts as 0% complete, as does every column when the
        sample has no rows.  An empty ``required_cols`` list is trivially
        complete (100%).
        """
        query = f"SELECT * FROM {table}"
        if order_col:
            query += f" ORDER BY {order_col} DESC"
        query += f" LIMIT {int(sample_size)}"
        df = pd.read_sql(query, self.db)

        total = len(df)
        scores = []
        for col in required_cols:
            if col in df.columns and total > 0:
                scores.append(df[col].notna().sum() / total * 100)
            else:
                scores.append(0.0)
        avg_completeness = float(np.mean(scores)) if scores else 100.0

        return QualityMetric(
            name=f"{table}_completeness",
            value=avg_completeness,
            threshold=95.0,
            status="pass" if avg_completeness >= 95 else "fail",
            timestamp=datetime.now()
        )

    def check_freshness(self, table: str, timestamp_col: str,
                        max_age_hours: int = 24) -> QualityMetric:
        """Check if data is being updated regularly.

        The newest ``timestamp_col`` value is compared with the current time.
        Values returned as text are parsed first; timezone-aware values are
        compared against "now" in the same timezone, naive values against
        local time.  A missing or unparseable timestamp yields an infinite
        age and therefore a failure.
        """
        query = f"SELECT MAX({timestamp_col}) as latest FROM {table}"
        result = pd.read_sql(query, self.db)
        latest = pd.to_datetime(result["latest"].iloc[0], errors="coerce")

        if pd.isna(latest):
            age_hours = float("inf")
        else:
            now = pd.Timestamp.now(tz=latest.tz)
            age_hours = (now - latest).total_seconds() / 3600

        return QualityMetric(
            name=f"{table}_freshness",
            value=age_hours,
            threshold=max_age_hours,
            status="pass" if age_hours <= max_age_hours else "fail",
            timestamp=datetime.now()
        )

    def check_row_count(self, table: str, min_rows: int) -> QualityMetric:
        """Verify minimum expected row count."""
        query = f"SELECT COUNT(*) as cnt FROM {table}"
        count = int(pd.read_sql(query, self.db)["cnt"].iloc[0])
        return QualityMetric(
            name=f"{table}_row_count",
            value=count,
            threshold=min_rows,
            status="pass" if count >= min_rows else "fail",
            timestamp=datetime.now()
        )

    def check_stat_ranges(self, table: str,
                          range_checks: Dict[str, tuple]) -> List[QualityMetric]:
        """Verify statistics are within expected ranges.

        ``range_checks`` maps a column name to an inclusive ``(min, max)``
        pair.  Columns missing from the table are skipped.  Null values
        compare as in-range.  With zero rows nothing can be verified, so the
        metric reports 0% and fails.
        """
        df = pd.read_sql(f"SELECT * FROM {table}", self.db)
        total = len(df)
        metrics = []
        for col, (min_val, max_val) in range_checks.items():
            if col not in df.columns:
                continue
            if total > 0:
                out_of_range = int(((df[col] < min_val) | (df[col] > max_val)).sum())
                pct_valid = (total - out_of_range) / total * 100
            else:
                pct_valid = 0.0
            metrics.append(QualityMetric(
                name=f"{table}_{col}_range",
                value=pct_valid,
                threshold=99.0,
                status="pass" if pct_valid >= 99 else "warn" if pct_valid >= 95 else "fail",
                timestamp=datetime.now()
            ))
        return metrics

    def run_all_checks(self, config: dict) -> Dict:
        """Run all quality checks and return report.

        ``config`` maps a table name to a dict with any of the keys
        ``required_cols``, ``timestamp_col`` (optionally with
        ``max_age_hours``), ``min_rows`` and ``ranges``.  The result groups
        the metrics by status; every metric is also appended to
        ``metrics_history``.
        """
        results = {"pass": [], "warn": [], "fail": []}
        for table, checks in config.items():
            metrics = []
            # Completeness
            if "required_cols" in checks:
                metrics.append(self.check_completeness(table, checks["required_cols"]))
            # Freshness
            if "timestamp_col" in checks:
                metrics.append(self.check_freshness(
                    table, checks["timestamp_col"],
                    checks.get("max_age_hours", 24)
                ))
            # Row count
            if "min_rows" in checks:
                metrics.append(self.check_row_count(table, checks["min_rows"]))
            # Stat ranges
            if "ranges" in checks:
                metrics.extend(self.check_stat_ranges(table, checks["ranges"]))

            for metric in metrics:
                results[metric.status].append(metric)
            self.metrics_history.extend(metrics)
        return results
# Usage
# db_connection is not defined in this snippet; supply any connection object
# accepted by pd.read_sql.
monitor = DataQualityMonitor(db_connection)
# One entry per table; each key switches on the corresponding check in
# run_all_checks ("ranges" maps a column to its inclusive (min, max) bounds).
config = {
"player_stats": {
"required_cols": ["player_id", "team", "games", "at_bats"],
"timestamp_col": "updated_at",
"min_rows": 1000,
"ranges": {
"batting_avg": (0, 1),
"era": (0, 50),
"games": (0, 162)
}
}
}
report = monitor.run_all_checks(config)
# The report groups the resulting metrics by status.
print(f"Passed: {len(report['pass'])}")
print(f"Warnings: {len(report['warn'])}")
print(f"Failed: {len(report['fail'])}")
Data Validation Framework
Comprehensive data validation framework for sports statistics with customizable rules.
import pandas as pd
from typing import Callable, List, Dict, Any
from dataclasses import dataclass
from enum import Enum
class Severity(Enum):
    """How serious a failed validation rule is."""

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"


@dataclass
class ValidationResult:
    """Outcome of evaluating one rule against a DataFrame."""

    rule_name: str
    passed: bool
    severity: Severity
    message: str
    affected_rows: int = 0


class ValidationRule:
    """A named check function with an associated severity.

    ``check_fn`` receives the DataFrame and returns a
    ``(passed, affected_rows, message)`` tuple.
    """

    def __init__(self, name: str, check_fn: Callable, severity: Severity = Severity.ERROR):
        self.name = name
        self.check_fn = check_fn
        self.severity = severity

    def validate(self, df: pd.DataFrame) -> ValidationResult:
        """Evaluate the rule; never raises.

        pandas/numpy scalars returned by the check are converted to plain
        ``bool`` / ``int``.  If the check itself raises, the rule is reported
        as failed with ``Severity.ERROR`` regardless of its own severity,
        because the data could not be validated at all.
        """
        try:
            passed, affected, msg = self.check_fn(df)
            return ValidationResult(
                rule_name=self.name,
                passed=bool(passed),
                severity=self.severity,
                message=msg,
                affected_rows=int(affected)
            )
        except Exception as e:
            return ValidationResult(
                rule_name=self.name,
                passed=False,
                severity=Severity.ERROR,
                message=f"Validation error: {str(e)}"
            )


class SportsDataValidator:
    """Validate sports statistics data."""

    def __init__(self):
        self.rules: List[ValidationRule] = []
        self._add_default_rules()

    @staticmethod
    def _column_rule(column: str, violations: Callable, message: str) -> Callable:
        """Build a check function for a single optional column.

        ``violations`` maps the column to a boolean mask of offending rows.
        The rule passes when the column is absent or no row is flagged, so
        ``passed`` and ``affected_rows`` always agree.  Null values are not
        flagged by comparison masks; null checks belong in their own rule.
        """
        def check(df: pd.DataFrame):
            if column not in df.columns:
                return True, 0, message
            affected = int(violations(df[column]).sum())
            return affected == 0, affected, message
        return check

    @staticmethod
    def _no_duplicates(df: pd.DataFrame):
        """Check that no row is an exact duplicate of an earlier row."""
        dup_count = int(df.duplicated().sum())
        return dup_count == 0, dup_count, f"Found {dup_count} duplicate rows"

    def _add_default_rules(self):
        """Add common validation rules."""
        # No duplicate records
        self.add_rule("no_duplicates", self._no_duplicates, Severity.ERROR)
        # Batting average in valid range
        self.add_rule(
            "valid_batting_avg",
            self._column_rule(
                "avg",
                lambda s: (s < 0) | (s > 1),
                "Batting average must be between 0 and 1"
            ),
            Severity.ERROR
        )
        # ERA reasonable range
        self.add_rule(
            "valid_era",
            self._column_rule(
                "era",
                lambda s: (s < 0) | (s > 50),
                "ERA should be between 0 and 50"
            ),
            Severity.WARNING
        )
        # Games played positive
        self.add_rule(
            "positive_games",
            self._column_rule(
                "games",
                lambda s: s <= 0,
                "Games played must be positive"
            ),
            Severity.ERROR
        )

    def add_rule(self, name: str, check_fn: Callable, severity: Severity = Severity.ERROR):
        """Register a rule; ``check_fn(df)`` returns (passed, affected_rows, message)."""
        self.rules.append(ValidationRule(name, check_fn, severity))

    def validate(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Run every rule and summarise the outcome.

        ``passed`` is True when no ERROR-severity result failed; failed
        WARNING rules are listed separately and do not affect it.
        """
        results = [rule.validate(df) for rule in self.rules]
        failed = [r for r in results if not r.passed]
        errors = [r for r in failed if r.severity == Severity.ERROR]
        warnings = [r for r in failed if r.severity == Severity.WARNING]
        return {
            "passed": not errors,
            "results": results,
            "errors": errors,
            "warnings": warnings
        }
# Usage
validator = SportsDataValidator()

# Register an additional rule: every record needs a player_id.
validator.add_rule(
    name="valid_player_id",
    check_fn=lambda df: (
        df["player_id"].notna().all(),
        df["player_id"].isna().sum(),
        "All records must have player_id",
    ),
)

# stats_df is not defined in this snippet; pass the DataFrame to validate.
report = validator.validate(stats_df)
print(f"Validation passed: {report['passed']}")
for error in report["errors"]:
    print(f"ERROR: {error.rule_name} - {error.message}")
Schema Migration Tool
Handle database schema changes and data migrations for sports statistics tables.
from sqlalchemy import create_engine, text, inspect
from datetime import datetime
import pandas as pd
import logging
logger = logging.getLogger(__name__)
class SchemaMigration:
    """Apply versioned schema migrations and remember which ones ran.

    Applied versions are recorded in the ``schema_migrations`` table, which
    is created on construction if it does not exist yet.
    """

    def __init__(self, engine):
        self.engine = engine
        self.migrations_table = "schema_migrations"
        self._ensure_migrations_table()

    def _ensure_migrations_table(self):
        """Create the bookkeeping table when it is missing."""
        ddl = (
            "\n"
            f"CREATE TABLE IF NOT EXISTS {self.migrations_table} (\n"
            "id INT AUTO_INCREMENT PRIMARY KEY,\n"
            "version VARCHAR(50) NOT NULL UNIQUE,\n"
            "description VARCHAR(255),\n"
            "applied_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n"
            ")\n"
        )
        with self.engine.connect() as conn:
            conn.execute(text(ddl))
            conn.commit()

    def get_applied_migrations(self) -> list:
        """Return the recorded migration versions, ordered by insertion id."""
        select_versions = text(
            f"SELECT version FROM {self.migrations_table} ORDER BY id"
        )
        with self.engine.connect() as conn:
            rows = conn.execute(select_versions).fetchall()
        return [row[0] for row in rows]

    def apply_migration(self, version: str, description: str, up_sql: str):
        """Run ``up_sql`` and record ``version`` unless it was already applied.

        ``up_sql`` is split on ";" and every non-blank fragment is executed
        in order on one connection, followed by the bookkeeping insert and a
        single commit.  Because the split is purely textual, statements must
        not contain a literal semicolon.

        Returns:
            True if the migration was applied, False if it was skipped.
        """
        if version in self.get_applied_migrations():
            logger.info(f"Migration {version} already applied")
            return False

        logger.info(f"Applying migration {version}: {description}")
        statements = [part for part in up_sql.split(";") if part.strip()]
        record_sql = (
            "\n"
            f"INSERT INTO {self.migrations_table} (version, description)\n"
            "VALUES (:version, :description)\n"
        )
        bookkeeping = {"version": version, "description": description}

        with self.engine.connect() as conn:
            # Execute migration
            for part in statements:
                conn.execute(text(part))
            # Record migration
            conn.execute(text(record_sql), bookkeeping)
            conn.commit()

        logger.info(f"Migration {version} applied successfully")
        return True
# Define migrations
# Each entry supplies the three values passed to SchemaMigration.apply_migration:
# a unique "version", a human-readable "description" and the "up" SQL, which is
# split on ";" before execution.
MIGRATIONS = [
# 001: extend player_stats with advanced metrics and index the new war column.
{
"version": "001",
"description": "Add advanced batting stats columns",
"up": """
ALTER TABLE player_stats
ADD COLUMN wrc_plus DECIMAL(5,1) NULL,
ADD COLUMN war DECIMAL(4,2) NULL,
ADD COLUMN babip DECIMAL(4,3) NULL;
CREATE INDEX idx_player_stats_war ON player_stats(war)
"""
},
# 002: per-pitch tracking table, indexed by game and by pitcher.
{
"version": "002",
"description": "Add pitch tracking table",
"up": """
CREATE TABLE pitch_tracking (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
game_id VARCHAR(20) NOT NULL,
pitcher_id INT NOT NULL,
batter_id INT NOT NULL,
pitch_type VARCHAR(10),
velocity DECIMAL(4,1),
spin_rate INT,
release_x DECIMAL(4,2),
release_z DECIMAL(4,2),
plate_x DECIMAL(4,2),
plate_z DECIMAL(4,2),
is_strike BOOLEAN,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
INDEX idx_pitch_game (game_id),
INDEX idx_pitch_pitcher (pitcher_id)
)
"""
},
# 003: one standings row per (team_id, season), enforced by the unique key.
{
"version": "003",
"description": "Add team standings table",
"up": """
CREATE TABLE team_standings (
id INT AUTO_INCREMENT PRIMARY KEY,
team_id INT NOT NULL,
season YEAR NOT NULL,
wins INT DEFAULT 0,
losses INT DEFAULT 0,
run_diff INT DEFAULT 0,
pythag_wins DECIMAL(5,2),
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_team_season (team_id, season)
)
"""
}
]
# Apply migrations
# NOTE: the connection URL carries placeholder credentials.
engine = create_engine("mysql://user:pass@localhost/sports_db")
migrator = SchemaMigration(engine)

# Migrations run in list order; versions already recorded are skipped.
for migration in MIGRATIONS:
    migrator.apply_migration(
        version=migration["version"],
        description=migration["description"],
        up_sql=migration["up"],
    )

print("Applied migrations:", migrator.get_applied_migrations())
Data Deduplication
Identify and handle duplicate records in sports data using various strategies.
import pandas as pd
import numpy as np
from typing import List, Literal
from fuzzywuzzy import fuzz
import hashlib
class DataDeduplicator:
    """Handle duplicate records in sports data.

    The instance works on a private copy of the input frame (``self.df``);
    the removing/merging methods replace that copy and add the number of
    rows they eliminated to ``duplicates_found``.
    """

    # ASCII unit separator placed between fields before hashing so that
    # ("ab", "c") and ("a", "bc") do not produce the same digest.
    _HASH_FIELD_SEP = "\x1f"

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()
        self.duplicates_found = 0

    def exact_duplicates(self, subset: List[str] = None,
                         keep: Literal["first", "last", False] = "first") -> pd.DataFrame:
        """Remove exact duplicate rows.

        Args:
            subset: Columns that define a duplicate (all columns if None).
            keep: Which occurrence to keep, as in ``DataFrame.drop_duplicates``.
        """
        before = len(self.df)
        self.df = self.df.drop_duplicates(subset=subset, keep=keep)
        self.duplicates_found += before - len(self.df)
        return self.df

    def create_record_hash(self, columns: List[str]) -> pd.Series:
        """Create an MD5 hex digest per row over the given columns.

        Field values are converted with ``str`` and joined with a separator
        character before hashing.  Always returns a Series aligned with
        ``self.df`` (empty for an empty frame).  Digests differ from those of
        the earlier separator-less implementation, so do not compare them
        with previously stored hashes.
        """
        sep = self._HASH_FIELD_SEP
        digests = [
            hashlib.md5(sep.join(str(value) for value in row).encode()).hexdigest()
            for row in self.df[columns].itertuples(index=False, name=None)
        ]
        return pd.Series(digests, index=self.df.index, dtype=object)

    def fuzzy_player_match(self, name_col: str, threshold: int = 85) -> pd.DataFrame:
        """Find fuzzy duplicate player names.

        Compares every pair of distinct non-null names case-insensitively
        with ``fuzz.ratio`` and returns the pairs scoring at least
        ``threshold`` in columns ``name1``, ``name2`` and ``similarity``
        (an empty frame with those columns when nothing matches).  The
        comparison is quadratic in the number of distinct names.
        """
        names = self.df[name_col].dropna().astype(str).unique()
        lowered = [name.lower() for name in names]
        matches = []
        for i, name1 in enumerate(names):
            for j in range(i + 1, len(names)):
                score = fuzz.ratio(lowered[i], lowered[j])
                if score >= threshold:
                    matches.append({
                        "name1": name1,
                        "name2": names[j],
                        "similarity": score
                    })
        return pd.DataFrame(matches, columns=["name1", "name2", "similarity"])

    def merge_duplicates(self, group_cols: List[str],
                         agg_rules: dict = None) -> pd.DataFrame:
        """Merge duplicate records using aggregation.

        Args:
            group_cols: Columns identifying records that belong together.
            agg_rules: Column -> aggregation mapping.  When omitted, numeric
                columns are summed and all other columns keep their first
                value.

        Rows whose key contains nulls are kept as their own group instead of
        being dropped.
        """
        if agg_rules is None:
            # Default: sum numeric, first for others
            numeric_cols = self.df.select_dtypes(include=[np.number]).columns
            agg_rules = {c: "sum" for c in numeric_cols if c not in group_cols}
            for c in self.df.columns:
                if c not in group_cols and c not in numeric_cols:
                    agg_rules[c] = "first"

        before = len(self.df)
        if agg_rules:
            self.df = self.df.groupby(
                group_cols, as_index=False, dropna=False
            ).agg(agg_rules)
        else:
            # Only key columns exist: merging reduces to dropping repeats.
            self.df = self.df.drop_duplicates(subset=group_cols).reset_index(drop=True)
        self.duplicates_found += before - len(self.df)
        return self.df

    def flag_duplicates(self, subset: List[str],
                        flag_col: str = "is_duplicate") -> pd.DataFrame:
        """Flag duplicates instead of removing them.

        Adds ``flag_col`` (True for every row that shares its ``subset``
        values with another row) and a ``duplicate_group`` column holding the
        group number assigned by ``groupby(subset).ngroup()``.
        """
        self.df[flag_col] = self.df.duplicated(subset=subset, keep=False)
        self.df["duplicate_group"] = self.df.groupby(subset).ngroup()
        return self.df

    def report(self) -> dict:
        """Generate deduplication report.

        ``unique_players`` / ``unique_teams`` are None when the frame has no
        ``player_id`` / ``team`` column.
        """
        has_players = "player_id" in self.df.columns
        has_teams = "team" in self.df.columns
        return {
            "total_records": len(self.df),
            "duplicates_removed": self.duplicates_found,
            "unique_players": self.df["player_id"].nunique() if has_players else None,
            "unique_teams": self.df["team"].nunique() if has_teams else None
        }
# Usage
# raw_stats_df is not defined in this snippet; pass the frame to clean.
dedup = DataDeduplicator(raw_stats_df)
# Remove exact duplicates
clean_df = dedup.exact_duplicates(
subset=["player_id", "game_date", "team"],
keep="last" # Keep most recent
)
# Find similar player names
fuzzy_matches = dedup.fuzzy_player_match("player_name", threshold=90)
print("Potential duplicate players:")
print(fuzzy_matches)
# Merge duplicate game logs
# Explicit rules: counting stats are summed, avg takes the group's last value.
merged_df = dedup.merge_duplicates(
group_cols=["player_id", "season"],
agg_rules={
"games": "sum",
"at_bats": "sum",
"hits": "sum",
"avg": lambda x: x.iloc[-1] # Use last avg
}
)
print(dedup.report())
Incremental Data Loader
Load data incrementally based on timestamps to avoid reprocessing existing records.
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine, text
import logging
logger = logging.getLogger(__name__)
class IncrementalLoader:
    """Load data incrementally based on watermarks.

    The watermark (last processed timestamp) of each destination table is
    kept in the ``etl_watermarks`` table.  Table and column names are
    interpolated into SQL text and must come from trusted configuration.
    """

    def __init__(self, engine, table_name: str, timestamp_col: str = "updated_at"):
        self.engine = engine
        self.table_name = table_name
        self.timestamp_col = timestamp_col
        self.watermark_table = "etl_watermarks"

    def get_watermark(self) -> datetime:
        """Get last processed timestamp (1900-01-01 when none is stored)."""
        query = text(
            "\n"
            "SELECT last_processed\n"
            f"FROM {self.watermark_table}\n"
            "WHERE table_name = :table\n"
        )
        with self.engine.connect() as conn:
            result = conn.execute(query, {"table": self.table_name}).fetchone()
        if result:
            return result[0]
        return datetime(1900, 1, 1)  # Default to process all

    def set_watermark(self, timestamp: datetime):
        """Update watermark after successful load."""
        query = text(
            "\n"
            f"INSERT INTO {self.watermark_table} (table_name, last_processed, updated_at)\n"
            "VALUES (:table, :timestamp, NOW())\n"
            "ON DUPLICATE KEY UPDATE\n"
            "last_processed = :timestamp,\n"
            "updated_at = NOW()\n"
        )
        with self.engine.connect() as conn:
            conn.execute(query, {"table": self.table_name, "timestamp": timestamp})
            conn.commit()

    def extract_incremental(self, source_query: str) -> pd.DataFrame:
        """Extract only new/updated records.

        ``source_query`` is wrapped as a derived table so that it may contain
        its own WHERE / ORDER BY clauses; rows whose ``timestamp_col`` is
        strictly greater than the stored watermark are returned, oldest
        first.  The watermark is passed as a named bind parameter, like every
        other statement in this class.
        """
        watermark = self.get_watermark()
        logger.info(f"Extracting records since {watermark}")

        inner = source_query.strip().rstrip(";").rstrip()
        full_query = text(
            f"SELECT * FROM ({inner}) AS src "
            f"WHERE {self.timestamp_col} > :watermark "
            f"ORDER BY {self.timestamp_col}"
        )
        df = pd.read_sql(full_query, self.engine, params={"watermark": watermark})
        logger.info(f"Extracted {len(df)} new/updated records")
        return df

    def _build_upsert_sql(self, columns: list, key_cols: list) -> str:
        """Return the INSERT ... ON DUPLICATE KEY UPDATE statement text.

        Non-key columns are overwritten with the incoming values.  When
        every column is a key there is nothing to overwrite, so a no-op
        assignment keeps the statement syntactically valid.
        """
        placeholders = ", ".join(f":{c}" for c in columns)
        update_cols = [c for c in columns if c not in key_cols]
        if update_cols:
            update_clause = ", ".join(f"{c} = VALUES({c})" for c in update_cols)
        else:
            update_clause = f"{columns[0]} = {columns[0]}"
        return (
            f"INSERT INTO {self.table_name} ({', '.join(columns)}) "
            f"VALUES ({placeholders}) "
            f"ON DUPLICATE KEY UPDATE {update_clause}"
        )

    @staticmethod
    def _to_records(df: pd.DataFrame) -> list:
        """Convert rows to parameter dicts, mapping missing values to None."""
        return df.astype(object).where(df.notna(), None).to_dict("records")

    def upsert(self, df: pd.DataFrame, key_cols: list):
        """Insert or update records.

        All rows are sent in one executemany call and committed together.
        The watermark then advances to the largest ``timestamp_col`` value
        in ``df``; it is left unchanged (with a warning) when that column is
        missing or holds no usable value.

        Returns:
            Number of rows sent (0 for an empty frame).
        """
        if df.empty:
            logger.info("No records to upsert")
            return 0

        query = text(self._build_upsert_sql(df.columns.tolist(), key_cols))
        records = self._to_records(df)
        with self.engine.connect() as conn:
            conn.execute(query, records)
            conn.commit()

        # Update watermark
        max_timestamp = None
        if self.timestamp_col in df.columns:
            max_timestamp = df[self.timestamp_col].max()
        if max_timestamp is not None and pd.notna(max_timestamp):
            if isinstance(max_timestamp, pd.Timestamp):
                # Hand the driver a plain datetime.
                max_timestamp = max_timestamp.to_pydatetime()
            self.set_watermark(max_timestamp)
        else:
            logger.warning(
                f"Column {self.timestamp_col} missing or empty; watermark not updated"
            )

        logger.info(f"Upserted {len(df)} records")
        return len(df)

    def run(self, source_query: str, key_cols: list, transform_fn=None):
        """Execute incremental load: extract, optionally transform, upsert."""
        df = self.extract_incremental(source_query)
        if transform_fn:
            df = transform_fn(df)
        return self.upsert(df, key_cols)
# Usage
# NOTE: the connection URL carries placeholder credentials.
engine = create_engine("mysql://user:pass@localhost/sports_db")
# Destination table "player_stats"; "last_updated" is the watermark column.
loader = IncrementalLoader(engine, "player_stats", "last_updated")
# transform_fn stamps each extracted row with the processing time before the upsert.
loaded = loader.run(
source_query="SELECT * FROM raw_player_stats",
key_cols=["player_id", "season"],
transform_fn=lambda df: df.assign(processed_at=datetime.now())
)
Complete ETL Pipeline
Full ETL pipeline for sports data with extraction, transformation, validation, and loading stages.
import pandas as pd
import numpy as np
from datetime import datetime
import logging
from typing import Dict, List, Optional
from dataclasses import dataclass
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class ETLConfig:
    """Settings for one ETL run."""

    source_path: str  # .csv / .json path, or an http(s) URL read as CSV
    destination_table: str  # table the validated rows are appended to
    batch_size: int = 1000  # rows per to_sql call
    validate: bool = True  # set False to skip the validation stage


class SportsDataETL:
    """ETL pipeline for sports statistics."""

    # Identifier columns: checked for presence during validation and never
    # imputed, otherwise a missing id would be replaced by the median id and
    # slip through validation.
    REQUIRED_COLS = ("player_id", "team", "season")

    def __init__(self, config: ETLConfig, db_connection):
        self.config = config
        self.db = db_connection
        self.errors = []
        self.stats = {"extracted": 0, "transformed": 0, "loaded": 0, "errors": 0}

    def extract(self) -> pd.DataFrame:
        """Extract data from source.

        Raises:
            ValueError: If the source is neither a .csv/.json path nor an
                http(s) URL.
        """
        source = self.config.source_path
        logger.info(f"Extracting from {source}")
        if source.endswith(".csv"):
            df = pd.read_csv(source)
        elif source.endswith(".json"):
            df = pd.read_json(source)
        elif source.startswith("http"):
            df = pd.read_csv(source)
        else:
            raise ValueError(f"Unsupported source: {source}")
        self.stats["extracted"] = len(df)
        logger.info(f"Extracted {len(df)} records")
        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform and clean data.

        Drops exact duplicates, normalises column names, fills missing
        numeric values with the column median (identifier columns listed in
        ``REQUIRED_COLS`` are left untouched), parses date-like text columns
        and stamps the rows with ETL metadata.  The input frame is not
        modified.
        """
        logger.info("Transforming data")
        original_count = len(df)

        # Remove duplicates (copy so later assignments never touch the input)
        df = df.drop_duplicates().copy()

        # Standardize column names
        df.columns = [str(c).lower().replace(" ", "_") for c in df.columns]

        # Handle missing values
        numeric_cols = [
            c for c in df.select_dtypes(include=[np.number]).columns
            if c not in self.REQUIRED_COLS
        ]
        if numeric_cols:
            df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

        # Parse dates.  The substring test also matches names like
        # "updated_by", so numeric columns are skipped and a column is only
        # replaced when at least one of its values actually parses.
        for col in [c for c in df.columns if "date" in c.lower()]:
            if pd.api.types.is_numeric_dtype(df[col]):
                continue
            parsed = pd.to_datetime(df[col], errors="coerce")
            if df[col].notna().any() and not parsed.notna().any():
                continue
            df[col] = parsed

        # Add metadata
        df["etl_timestamp"] = datetime.now()
        df["etl_source"] = self.config.source_path

        self.stats["transformed"] = len(df)
        logger.info(f"Transformed {len(df)} records ({original_count - len(df)} removed)")
        return df

    def validate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Validate data quality and return only the valid rows.

        Rows are rejected when a present required column is null or empty,
        or when ``batting_avg`` lies outside [0, 1].  Every rejection reason
        is appended to ``self.errors``; the rejected-row count is stored in
        ``self.stats["errors"]``.  Returns ``df`` unchanged when validation
        is disabled in the config.
        """
        if not self.config.validate:
            return df
        logger.info("Validating data")
        valid_mask = pd.Series(True, index=df.index)

        # Check for required fields
        for col in self.REQUIRED_COLS:
            if col in df.columns:
                invalid = df[col].isna() | (df[col] == "")
                valid_mask &= ~invalid
                if invalid.any():
                    self.errors.append(f"Missing {col}: {invalid.sum()} rows")

        # Validate numeric ranges
        if "batting_avg" in df.columns:
            invalid = (df["batting_avg"] < 0) | (df["batting_avg"] > 1)
            valid_mask &= ~invalid
            if invalid.any():
                self.errors.append(f"Out-of-range batting_avg: {invalid.sum()} rows")

        df_valid = df[valid_mask].copy()
        self.stats["errors"] = len(df) - len(df_valid)
        logger.info(f"Validation complete: {len(df_valid)} valid, {self.stats['errors']} invalid")
        return df_valid

    def load(self, df: pd.DataFrame):
        """Load data to destination in batches.

        ``self.stats["loaded"]`` is advanced after each batch, so it reflects
        what was actually written if a later batch fails.

        Raises:
            ValueError: If the configured batch size is not positive.
        """
        batch_size = self.config.batch_size
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        logger.info(f"Loading to {self.config.destination_table}")
        loaded = 0
        # Batch insert
        for i in range(0, len(df), batch_size):
            batch = df.iloc[i:i + batch_size]
            batch.to_sql(
                self.config.destination_table,
                self.db,
                if_exists="append",
                index=False
            )
            loaded += len(batch)
            self.stats["loaded"] = loaded
            logger.info(f"Loaded batch {i//batch_size + 1}")
        self.stats["loaded"] = loaded

    def run(self) -> Dict:
        """Execute full ETL pipeline.

        Any exception is logged and recorded in ``errors`` instead of being
        raised, so callers must inspect the returned dict to detect failure.
        """
        try:
            df = self.extract()
            df = self.transform(df)
            df = self.validate(df)
            self.load(df)
            logger.info(f"ETL complete: {self.stats}")
        except Exception as e:
            # Top-level boundary: keep the best-effort contract but log the
            # traceback so the failure can be diagnosed.
            logger.exception(f"ETL failed: {e}")
            self.errors.append(str(e))
        return {"stats": self.stats, "errors": self.errors}
# Usage
# batch_size and validate keep their defaults (1000 rows, validation enabled).
config = ETLConfig(
source_path="player_stats_2024.csv",
destination_table="player_stats"
)
# db_connection is not defined in this snippet; it is handed to DataFrame.to_sql.
etl = SportsDataETL(config, db_connection)
# run() does not raise on failure: inspect results["errors"].
results = etl.run()
Heat Map Shot Chart
Create a basketball shot chart heat map with matplotlib, showing smoothed shot density over a half-court diagram.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.patches import Circle, Rectangle, Arc
from scipy.ndimage import gaussian_filter
def draw_court(ax=None, color="black", lw=2):
    """Draw basketball court lines on a matplotlib axes.

    Args:
        ax: Axes to draw on; the current axes are used when omitted.
        color: Line colour for every court element.
        lw: Line width for every court element.

    Returns:
        The axes that were drawn on, with x/y limits set to the court area.
    """
    court = plt.gca() if ax is None else ax
    stroke = {"linewidth": lw, "color": color}

    # Hoop
    court.add_patch(Circle((0, 0), radius=7.5, fill=False, **stroke))
    # Backboard
    court.plot([-30, 30], [-7.5, -7.5], **stroke)
    # Paint
    court.add_patch(Rectangle((-80, -47.5), 160, 190, fill=False, **stroke))
    # Free throw circle (upper half only)
    court.add_patch(Arc((0, 142.5), 120, 120, theta1=0, theta2=180, **stroke))
    # Three point line: arc plus the two straight corner segments
    court.add_patch(Arc((0, 0), 475, 475, theta1=22, theta2=158, **stroke))
    for corner_x in (-220, 220):
        court.plot([corner_x, corner_x], [-47.5, 92.5], **stroke)

    court.set_xlim(-250, 250)
    court.set_ylim(-47.5, 422.5)
    return court
def shot_chart_heatmap(shots_df, title="Shot Chart"):
    """Create shot chart heat map.

    Bins the ``loc_x`` / ``loc_y`` columns of ``shots_df`` into a 30x30 grid,
    smooths the counts with a Gaussian filter and draws them underneath the
    court outline.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    x_span = [-250, 250]
    y_span = [-50, 400]
    fig, ax = plt.subplots(figsize=(12, 11))

    # 2D histogram of shot locations, then Gaussian smoothing
    counts, _, _ = np.histogram2d(
        shots_df["loc_x"], shots_df["loc_y"],
        bins=30, range=[x_span, y_span]
    )
    smoothed = gaussian_filter(counts, sigma=1.5)

    # histogram2d puts x on the first axis; imshow expects rows = y, hence .T
    image = ax.imshow(
        smoothed.T, origin="lower",
        extent=x_span + y_span,
        cmap="YlOrRd", alpha=0.7
    )
    draw_court(ax, color="white", lw=2)

    ax.set_title(title, fontsize=16, fontweight="bold")
    ax.axis("off")
    colorbar = fig.colorbar(image, ax=ax, shrink=0.7)
    colorbar.set_label("Shot Frequency", fontsize=12)
    plt.tight_layout()
    return fig, ax
# Example usage: shots_df (not defined in this snippet) must provide the
# loc_x / loc_y columns read by shot_chart_heatmap.
fig, ax = shot_chart_heatmap(shots_df, "Player Shot Chart 2024")
Rolling Average Performance Chart
Plot rolling averages over time to visualize player performance trends, with a ±1 standard deviation band around the rolling mean.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
def plot_rolling_performance(df, date_col, value_col, window=20,
                             player_name="Player", stat_name="Stat"):
    """Plot rolling average with a +/-1 standard deviation band.

    Args:
        df: Game log; it is sorted by ``date_col`` on a copy, so the caller's
            frame is left unchanged.
        date_col: Column used for the x axis and for sorting.
        value_col: Column holding the per-game statistic.
        window: Rolling window size in games.
        player_name: Name used in the chart title.
        stat_name: Label for the y axis and the title.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    df = df.sort_values(date_col).copy()

    # Calculate rolling statistics.  At least five games are required before
    # a value is shown, but never more than the window itself: pandas raises
    # ValueError when min_periods exceeds the window size.
    min_periods = min(5, window)
    rolling = df[value_col].rolling(window=window, min_periods=min_periods)
    df["rolling_mean"] = rolling.mean()
    df["rolling_std"] = rolling.std()

    fig, ax = plt.subplots(figsize=(14, 6))

    # Individual games (faded)
    ax.scatter(df[date_col], df[value_col], alpha=0.3, s=30,
               color="gray", label="Individual Games")

    # Rolling average
    ax.plot(df[date_col], df["rolling_mean"], linewidth=2.5,
            color="#1f77b4", label=f"{window}-Game Rolling Avg")

    # Variability band (±1 std)
    ax.fill_between(
        df[date_col],
        df["rolling_mean"] - df["rolling_std"],
        df["rolling_mean"] + df["rolling_std"],
        alpha=0.2, color="#1f77b4"
    )

    # Season average line
    season_avg = df[value_col].mean()
    ax.axhline(y=season_avg, color="red", linestyle="--",
               linewidth=1.5, label=f"Season Avg: {season_avg:.3f}")

    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel(stat_name, fontsize=12)
    ax.set_title(f"{player_name} - {stat_name} Trend ({window}-Game Rolling)",
                 fontsize=14, fontweight="bold")
    ax.legend(loc="best")
    ax.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig, ax
# Example usage
# game_log_df is not defined in this snippet; it needs the "game_date" and
# "batting_avg" columns named below.
fig, ax = plot_rolling_performance(
game_log_df, "game_date", "batting_avg",
window=15, player_name="Mike Trout", stat_name="Batting Average"
)
Radar Chart Player Comparison
Create radar/spider chart to compare multiple players across different statistical categories.
import matplotlib.pyplot as plt
import numpy as np
def create_radar_chart(players_data, categories, title="Player Comparison"):
    """Create radar chart comparing players across categories.

    Args:
        players_data: Mapping of player name to a sequence of values, one per
            category and in the same order as ``categories``.  Any sequence
            type is accepted (list, tuple, numpy array, pandas Series).
        categories: Axis labels.
        title: Chart title.

    Returns:
        The ``(fig, ax)`` pair of the created figure.  The radial axis is
        fixed to 0-100.

    Raises:
        ValueError: If a player's value count differs from the number of
            categories.
    """
    # Number of categories
    n_categories = len(categories)

    # Angle for each category, repeating the first to close the polygon
    angles = [n / float(n_categories) * 2 * np.pi for n in range(n_categories)]
    angles += angles[:1]

    # Create figure
    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]

    for idx, (player_name, values) in enumerate(players_data.items()):
        # Coerce to a list first: "+" on a numpy array would add element-wise
        # instead of appending the closing point.
        values = list(values)
        if len(values) != n_categories:
            raise ValueError(
                f"{player_name!r} has {len(values)} values but "
                f"{n_categories} categories were given"
            )
        values = values + values[:1]  # Complete the loop
        color = colors[idx % len(colors)]
        ax.plot(angles, values, "o-", linewidth=2,
                label=player_name, color=color)
        ax.fill(angles, values, alpha=0.25, color=color)

    # Set category labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, fontsize=11)

    # Customize
    ax.set_ylim(0, 100)
    ax.set_title(title, size=16, fontweight="bold", y=1.08)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    plt.tight_layout()
    return fig, ax
# Example: Compare players (values are percentiles 0-100)
categories = ["Power", "Contact", "Speed", "Defense", "Plate Discipline", "WAR"]
# One value per category, in the same order as the categories list above.
players_data = {
"Player A": [85, 75, 60, 70, 80, 90],
"Player B": [70, 90, 80, 85, 65, 82],
"Player C": [95, 55, 40, 60, 70, 88]
}
fig, ax = create_radar_chart(players_data, categories)
plt.show()
Sankey Flow Diagram
Create Sankey diagram to visualize player movement, draft flows, or game state transitions.
import plotly.graph_objects as go
import pandas as pd
def create_trade_sankey(trades_df):
    """Create Sankey diagram for player trades/transactions.

    ``trades_df`` must provide the columns ``from_team``, ``to_team``,
    ``player_id`` and ``war``.  Each (from_team, to_team) pair becomes one
    link whose width is the number of players moved and whose colour
    reflects the total WAR moved.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    # Unique teams in first-seen order (sources first), so node indices and
    # therefore the layout are the same on every run.
    all_teams = list(dict.fromkeys(
        trades_df["from_team"].tolist() + trades_df["to_team"].tolist()
    ))
    team_idx = {team: i for i, team in enumerate(all_teams)}

    # Build links
    links = trades_df.groupby(["from_team", "to_team"]).agg({
        "player_id": "count",
        "war": "sum"
    }).reset_index()
    source = [team_idx[t] for t in links["from_team"]]
    target = [team_idx[t] for t in links["to_team"]]
    value = links["player_id"].tolist()

    # Color based on WAR traded
    colors = []
    for war in links["war"]:
        if war > 5:
            colors.append("rgba(255, 0, 0, 0.5)")  # High WAR = red
        elif war > 0:
            colors.append("rgba(255, 165, 0, 0.5)")  # Positive WAR = orange
        else:
            colors.append("rgba(128, 128, 128, 0.5)")  # Zero or negative WAR = gray

    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_teams,
            color="blue"
        ),
        link=dict(
            source=source,
            target=target,
            value=value,
            color=colors,
            label=[f"{v} players ({w:.1f} WAR)"
                   for v, w in zip(value, links["war"])]
        )
    ))
    fig.update_layout(
        title="Player Trade Flow Between Teams",
        font_size=12,
        height=600
    )
    return fig
# Example usage: trades_df (not defined in this snippet) needs the from_team,
# to_team, player_id and war columns read by create_trade_sankey.
fig = create_trade_sankey(trades_df)
fig.show()
Box Plot Comparison
Create box plots to compare statistical distributions across teams, positions, or seasons.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
def create_boxplot_comparison(df, value_col, group_col,
                              title=None, horizontal=False):
    """Create styled box plot comparison.

    Draws one box per ``group_col`` value, ordered by the group median of
    ``value_col``, overlays the individual observations as a swarm and marks
    each group's mean with a red diamond.

    Args:
        df: Source data.
        value_col: Numeric column to summarise.
        group_col: Column that defines the groups.
        title: Optional chart title.
        horizontal: Draw the boxes horizontally when True.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    fig, ax = plt.subplots(figsize=(14, 8))

    # Order groups by median value
    medians = df.groupby(group_col)[value_col].median()
    order = medians.sort_values(ascending=not horizontal).index.tolist()

    # The orientation only decides which column goes on which axis
    if horizontal:
        axis_map = {"y": group_col, "x": value_col}
    else:
        axis_map = {"x": group_col, "y": value_col}

    # Box plot with swarm overlay
    sns.boxplot(data=df, order=order, palette="Set2", width=0.6, ax=ax, **axis_map)
    sns.swarmplot(data=df, order=order, color="black", alpha=0.4, size=3, ax=ax,
                  **axis_map)

    # Add mean markers; only the first one carries the legend label
    group_means = df.groupby(group_col)[value_col].mean()
    for position, group in enumerate(order):
        mean_val = group_means[group]
        point = (mean_val, position) if horizontal else (position, mean_val)
        ax.scatter(point[0], point[1], color="red", s=100,
                   marker="D", zorder=5, label="Mean" if position == 0 else "")

    # Styling
    if title:
        ax.set_title(title, fontsize=14, fontweight="bold")
    if not horizontal:
        plt.xticks(rotation=45, ha="right")
    ax.grid(True, alpha=0.3)
    ax.legend(loc="upper right")
    plt.tight_layout()
    return fig, ax
# Example: Compare ERA across teams
# pitchers_df is not defined in this snippet; it needs "era" and "team" columns.
fig, ax = create_boxplot_comparison(
pitchers_df, "era", "team",
title="ERA Distribution by Team",
horizontal=True
)
Interactive Plotly Dashboard
Build interactive multi-chart dashboard using Plotly for exploring player statistics.
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
def create_player_dashboard(player_df, game_log_df, player_name):
    """Create interactive dashboard for player analysis.

    Args:
        player_df: Season totals; the first row's ``singles``, ``doubles``,
            ``triples`` and ``hr`` columns feed the pie chart.
        game_log_df: Per-game log with ``game_date``, ``avg``, ``hits`` and
            ``hr`` columns.  The caller's frame is not modified.
        player_name: Name shown in the dashboard title.

    Returns:
        A Plotly figure with four panels: rolling average trend, hits
        histogram, monthly average and hit-type breakdown.
    """
    import calendar  # local import: only needed for the month labels

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            "Season Stats Trend", "Stat Distribution",
            "Performance by Month", "Category Breakdown"
        ),
        specs=[
            [{"type": "scatter"}, {"type": "histogram"}],
            [{"type": "bar"}, {"type": "pie"}]
        ]
    )

    # 1. Rolling average trend (the first 19 games have no 20-game value)
    game_log_df = game_log_df.assign(
        game_date=pd.to_datetime(game_log_df["game_date"])
    ).sort_values("game_date")
    game_log_df["rolling_avg"] = game_log_df["avg"].rolling(20).mean()
    fig.add_trace(
        go.Scatter(
            x=game_log_df["game_date"],
            y=game_log_df["rolling_avg"],
            mode="lines", name="20-Game Avg",
            line=dict(color="blue", width=2)
        ),
        row=1, col=1
    )

    # 2. Hit distribution histogram
    fig.add_trace(
        go.Histogram(
            x=game_log_df["hits"], name="Hits/Game",
            marker_color="green", opacity=0.7
        ),
        row=1, col=2
    )

    # 3. Monthly performance.  Labels are derived from the months actually
    # present in the data, so logs with March or October games (or with a
    # missing month) are labelled correctly.
    monthly = game_log_df.groupby(
        game_log_df["game_date"].dt.month
    ).agg({"avg": "mean", "hr": "sum"})
    month_labels = [calendar.month_abbr[int(m)] for m in monthly.index]
    fig.add_trace(
        go.Bar(
            x=month_labels,
            y=monthly["avg"],
            name="Monthly AVG",
            marker_color="orange"
        ),
        row=2, col=1
    )

    # 4. Hit type breakdown (pie)
    hit_types = player_df[["singles", "doubles", "triples", "hr"]].iloc[0]
    fig.add_trace(
        go.Pie(
            labels=["Singles", "Doubles", "Triples", "HR"],
            values=hit_types.values,
            hole=0.4
        ),
        row=2, col=2
    )

    fig.update_layout(
        title=dict(text=f"{player_name} - Season Dashboard", font=dict(size=20)),
        height=700,
        showlegend=True,
        template="plotly_white"
    )
    return fig
# Create dashboard
# player_season_df and game_log_df are not defined in this snippet.
fig = create_player_dashboard(player_season_df, game_log_df, "Mike Trout")
fig.show()
Histogram Distribution Analysis
Create histogram with distribution fit and statistical annotations for analyzing stat distributions.
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd
def distribution_histogram(data, stat_name, bins=30, fit_dist="norm"):
    """Plot a density histogram with a fitted distribution and summary stats.

    Args:
        data: Numeric values as a pandas Series, NumPy array or plain
            sequence. NaN entries are dropped before plotting.
        stat_name: Label used for the x-axis and the title.
        bins: Passed through to ``Axes.hist``.
        fit_dist: Name of a ``scipy.stats`` distribution exposing ``fit`` and
            ``pdf`` (e.g. ``"norm"``, ``"gamma"``). Falsy or unrecognised
            names skip the fitted curve.

    Returns:
        ``(fig, ax)`` for further customisation.

    Raises:
        ValueError: If no non-NaN values remain.
    """
    # Normalise every accepted input type to a float Series so that lists
    # work too and all summary statistics come from one implementation
    # (note: Series.std() is the sample standard deviation, ddof=1).
    values = pd.Series(data, dtype=float).dropna()
    if values.empty:
        raise ValueError("data contains no non-NaN values to plot")

    fig, ax = plt.subplots(figsize=(12, 6))

    # Histogram
    ax.hist(
        values, bins=bins, density=True, alpha=0.7,
        color="#1f77b4", edgecolor="white", linewidth=0.5,
    )

    # Fit distribution
    dist = getattr(stats, fit_dist, None) if fit_dist else None
    if dist is not None and hasattr(dist, "fit") and hasattr(dist, "pdf"):
        params = dist.fit(values)
        x = np.linspace(values.min(), values.max(), 100)
        pdf = dist.pdf(x, *params)
        if fit_dist == "norm":
            mu, sigma = params
            fit_label = f"Normal fit (μ={mu:.3f}, σ={sigma:.3f})"
        else:
            fit_label = f"{fit_dist} fit"
        ax.plot(x, pdf, "r-", linewidth=2, label=fit_label)

    # Percentile lines
    percentiles = [25, 50, 75, 90]
    colors = ["green", "orange", "red", "purple"]
    for p, c in zip(percentiles, colors):
        pval = np.percentile(values, p)
        ax.axvline(x=pval, color=c, linestyle="--", linewidth=1.5,
                   label=f"{p}th percentile: {pval:.3f}")

    # Statistics box
    stats_text = (
        f"n = {len(values):,}\n"
        f"Mean: {values.mean():.3f}\n"
        f"Median: {np.median(values):.3f}\n"
        f"Std: {values.std():.3f}\n"
        f"Min: {values.min():.3f}\n"
        f"Max: {values.max():.3f}"
    )
    ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
            fontsize=10, verticalalignment="top", horizontalalignment="right",
            bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8))

    ax.set_xlabel(stat_name, fontsize=12)
    ax.set_ylabel("Density", fontsize=12)
    ax.set_title(f"Distribution of {stat_name}", fontsize=14, fontweight="bold")
    ax.legend(loc="upper left", fontsize=9)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig, ax
# Example: plot the distribution of the "war" column.
# NOTE: players_df is not defined in this snippet; load it first.
fig, ax = distribution_histogram(players_df["war"], "WAR (Wins Above Replacement)")
Animated Play Visualization
Create animated visualization of play tracking data using matplotlib animation.
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import pandas as pd
def animate_play(tracking_df, field_type="football"):
    """Animate player tracking data frame by frame.

    Args:
        tracking_df: DataFrame with ``frame_id``, ``team`` (``"offense"``,
            ``"defense"`` or ``"ball"``), ``x`` and ``y`` columns.
        field_type: ``"football"`` draws a 120 x 53.3 field with yard lines
            and end zones. Any other value gets a plain canvas whose limits
            are taken from the data.

    Returns:
        ``(fig, ani)``. Keep a reference to ``ani``; the animation stops if
        the object is garbage-collected.
    """
    # Split once by frame instead of re-filtering the whole frame on every
    # animation tick.
    frame_groups = dict(iter(tracking_df.groupby("frame_id")))
    frames = sorted(frame_groups)

    fig, ax = plt.subplots(figsize=(14, 6))
    if field_type == "football":
        # Draw football field
        ax.set_xlim(0, 120)
        ax.set_ylim(0, 53.3)
        ax.set_facecolor("#228B22")
        # Yard lines
        for yard in range(0, 121, 10):
            ax.axvline(x=yard, color="white", linewidth=1, alpha=0.5)
        # End zones
        ax.axvspan(0, 10, alpha=0.3, color="blue")
        ax.axvspan(110, 120, alpha=0.3, color="red")
    elif len(tracking_df):
        # set_offsets() does not autoscale, so without explicit limits the
        # markers would sit outside the default 0-1 view.
        pad = 1.0
        ax.set_xlim(tracking_df["x"].min() - pad, tracking_df["x"].max() + pad)
        ax.set_ylim(tracking_df["y"].min() - pad, tracking_df["y"].max() + pad)

    # Initialize scatter plots for teams
    offense = ax.scatter([], [], s=200, c="blue", edgecolors="white",
                         linewidth=2, label="Offense")
    defense = ax.scatter([], [], s=200, c="red", edgecolors="white",
                         linewidth=2, label="Defense")
    ball = ax.scatter([], [], s=100, c="brown", marker="o",
                      edgecolors="white", linewidth=2, label="Ball")
    ax.legend(loc="upper right")
    title = ax.set_title("", fontsize=12, fontweight="bold")

    def init():
        """Clear all markers before the first frame."""
        empty = np.empty((0, 2))
        offense.set_offsets(empty)
        defense.set_offsets(empty)
        ball.set_offsets(empty)
        return offense, defense, ball, title

    def update(frame):
        """Move the markers to their positions in ``frame``."""
        frame_data = frame_groups[frame]
        team = frame_data["team"]
        offense.set_offsets(frame_data.loc[team == "offense", ["x", "y"]].values)
        defense.set_offsets(frame_data.loc[team == "defense", ["x", "y"]].values)
        ball_data = frame_data.loc[team == "ball", ["x", "y"]]
        # When a frame has no ball row the previous ball position is kept.
        if not ball_data.empty:
            ball.set_offsets(ball_data.values)
        title.set_text(f"Frame: {frame}")
        return offense, defense, ball, title

    ani = animation.FuncAnimation(
        fig, update, frames=frames,
        init_func=init, blit=True, interval=100,
    )
    return fig, ani
# Example: build the animation and write it to a GIF at 10 frames per second
# using the "pillow" writer.
# NOTE: tracking_df is not defined in this snippet; load it first.
fig, ani = animate_play(tracking_df)
ani.save("play_animation.gif", writer="pillow", fps=10)
Bar Chart with Error Bars
Create grouped bar chart with error bars for comparing statistics across categories or groups.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def grouped_bar_chart(df, group_col, categories, values_cols,
                      errors_cols=None, title="Comparison"):
    """Create a grouped bar chart with optional error bars.

    One cluster of bars is drawn per distinct value of ``group_col``; within
    a cluster there is one bar per entry of ``categories``. If a group has
    several rows, only its first row is plotted.

    Args:
        df: Source DataFrame.
        group_col: Column whose distinct values become the x-axis groups.
        categories: Legend labels, one per plotted series.
        values_cols: Column names holding the bar heights, parallel to
            ``categories``.
        errors_cols: Optional column names holding error-bar sizes, parallel
            to ``values_cols``.
        title: Figure title.

    Returns:
        ``(fig, ax)``.

    Raises:
        ValueError: If ``categories`` is empty or the parallel lists differ
            in length.
    """
    n_categories = len(categories)
    if n_categories == 0:
        raise ValueError("categories must contain at least one entry")
    if len(values_cols) != n_categories:
        raise ValueError("categories and values_cols must have the same length")
    if errors_cols and len(errors_cols) != n_categories:
        raise ValueError("errors_cols must have the same length as values_cols")

    # One lookup table keyed by group (first row per group, in order of first
    # appearance) replaces a full-frame filter for every bar.
    first_rows = df.drop_duplicates(subset=group_col).set_index(group_col)
    groups = first_rows.index.to_numpy()
    n_groups = len(groups)

    fig, ax = plt.subplots(figsize=(12, 7))
    bar_width = 0.8 / n_categories
    x = np.arange(n_groups)
    colors = plt.cm.Set2(np.linspace(0, 1, n_categories))

    for i, (cat, val_col) in enumerate(zip(categories, values_cols)):
        values = first_rows[val_col].tolist()
        errors = first_rows[errors_cols[i]].tolist() if errors_cols else None
        bars = ax.bar(
            x + i * bar_width, values, bar_width,
            label=cat, color=colors[i],
            yerr=errors, capsize=4,
            edgecolor="white", linewidth=1,
        )
        # Add value labels on bars
        for bar, val in zip(bars, values):
            height = bar.get_height()
            ax.annotate(
                f"{val:.2f}",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points",
                ha="center", va="bottom", fontsize=9,
            )

    ax.set_xlabel(group_col, fontsize=12)
    ax.set_ylabel("Value", fontsize=12)
    ax.set_title(title, fontsize=14, fontweight="bold")
    # Centre each tick under its cluster of bars.
    ax.set_xticks(x + bar_width * (n_categories - 1) / 2)
    ax.set_xticklabels(groups, rotation=45, ha="right")
    ax.legend(loc="upper right")
    ax.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()
    return fig, ax
# Example: compare teams across three offensive stats (no error bars).
# NOTE: team_stats_df is not defined in this snippet; load it first.
fig, ax = grouped_bar_chart(
team_stats_df,
group_col="team",
categories=["OBP", "SLG", "wRC+"],
values_cols=["obp", "slg", "wrc_plus"],
title="Team Offensive Comparison"
)
Scatter Plot with Regression
Create scatter plot with regression line and annotations for analyzing stat correlations.
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd
def scatter_with_regression(df, x_col, y_col, label_col=None,
                            highlight_players=None):
    """Scatter plot with a least-squares regression line and player labels.

    Rows with a missing value in ``x_col`` or ``y_col`` are excluded; a
    single NaN would otherwise turn the slope, R² and p-value into NaN.

    Args:
        df: Source DataFrame.
        x_col: Column plotted on the x-axis.
        y_col: Column plotted on the y-axis.
        label_col: Column matched against ``highlight_players``.
        highlight_players: Values of ``label_col`` to highlight and annotate.
            Requires ``label_col``; names not present are ignored.

    Returns:
        ``(fig, ax)``.

    Raises:
        ValueError: If fewer than two complete (x, y) pairs are available.
    """
    points = df.dropna(subset=[x_col, y_col])
    if len(points) < 2:
        raise ValueError(
            f"Need at least two rows with both {x_col!r} and {y_col!r} present"
        )
    x = points[x_col]
    y = points[y_col]

    fig, ax = plt.subplots(figsize=(12, 8))

    # Regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    line_x = np.linspace(x.min(), x.max(), 100)
    line_y = slope * line_x + intercept

    # Scatter
    ax.scatter(x, y, alpha=0.6, s=80, c="#1f77b4",
               edgecolors="white", linewidth=0.5)

    # Regression line
    ax.plot(line_x, line_y, color="red", linewidth=2, linestyle="--",
            label=f"R² = {r_value**2:.3f}")

    # Highlight specific players (first matching row per name)
    if highlight_players and label_col:
        for player in highlight_players:
            player_data = points[points[label_col] == player]
            if player_data.empty:
                continue
            px = player_data[x_col].values[0]
            py = player_data[y_col].values[0]
            ax.scatter(px, py, s=150, c="orange", edgecolors="black",
                       linewidth=2, zorder=5)
            ax.annotate(
                player, (px, py), fontsize=10, fontweight="bold",
                xytext=(10, 10), textcoords="offset points",
                arrowprops=dict(arrowstyle="->", color="black"),
            )

    # Labels and styling
    ax.set_xlabel(x_col, fontsize=12)
    ax.set_ylabel(y_col, fontsize=12)
    ax.set_title(f"{y_col} vs {x_col}", fontsize=14, fontweight="bold")
    ax.legend(loc="best", fontsize=11)
    ax.grid(True, alpha=0.3)

    # Add correlation annotation
    ax.text(0.05, 0.95, f"Correlation: {r_value:.3f}\np-value: {p_value:.2e}",
            transform=ax.transAxes, fontsize=10, verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5))
    plt.tight_layout()
    return fig, ax
# Example: relate exit velocity to slugging and call out two players by name.
# NOTE: players_df is not defined in this snippet; load it first.
fig, ax = scatter_with_regression(
players_df, "exit_velocity", "slg",
label_col="player_name",
highlight_players=["Aaron Judge", "Shohei Ohtani"]
)
XGBoost Game Outcome Predictor
Predict game outcomes using XGBoost gradient boosting with feature engineering for team matchups.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
def build_game_predictor(games_df):
    """Fit an XGBoost classifier for home-team wins and report hold-out metrics.

    Args:
        games_df: DataFrame containing the twelve matchup feature columns
            listed below plus a ``home_win`` column castable to int.

    Returns:
        dict with the fitted ``model``, hold-out ``accuracy``, ``auc`` and
        ``log_loss``, and ``feature_importance`` mapping feature name to the
        model's importance score.
    """
    # Home/away pairs for each matchup statistic.
    features = [
        "home_win_pct", "away_win_pct",
        "home_pts_avg", "away_pts_avg",
        "home_pts_allowed_avg", "away_pts_allowed_avg",
        "home_streak", "away_streak",
        "home_rest_days", "away_rest_days",
        "home_elo", "away_elo",
    ]
    inputs = games_df[features]
    labels = games_df["home_win"].astype(int)

    # 80/20 random split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, labels, test_size=0.2, random_state=42
    )

    model = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    # The hold-out set is passed as eval_set; no early stopping is configured.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    predicted_labels = model.predict(X_test)
    home_win_prob = model.predict_proba(X_test)[:, 1]
    importance_by_feature = dict(zip(features, model.feature_importances_))

    return {
        "model": model,
        "accuracy": accuracy_score(y_test, predicted_labels),
        "auc": roc_auc_score(y_test, home_win_prob),
        "log_loss": log_loss(y_test, home_win_prob),
        "feature_importance": importance_by_feature,
    }
# Example: train on games_df and print two of the hold-out metrics.
# NOTE: games_df is not defined in this snippet; load it first.
results = build_game_predictor(games_df)
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"AUC: {results['auc']:.3f}")
Random Forest Player Prediction
Use Random Forest classifier to predict player performance categories (elite, above-average, average, below-average) based on historical stats.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
def train_player_classifier(df, features, target_col):
    """Train a Random Forest to classify player performance tiers.

    Imputation medians and the scaler are fitted on the training split only,
    and cross-validation runs the same preprocessing inside each fold, so no
    statistics from evaluation data leak into the model.

    Args:
        df: DataFrame holding the feature columns and the target column.
        features: List of feature column names.
        target_col: Name of the categorical target column.

    Returns:
        dict with the fitted ``model`` (expects scaled inputs), the fitted
        ``scaler``, ``imputation_values`` (training-set medians), hold-out
        ``accuracy``, ``cv_mean`` / ``cv_std`` from 5-fold cross-validation,
        a ``feature_importance`` DataFrame sorted descending, and the
        hold-out ``classification_report`` and ``confusion_matrix``.
    """
    from sklearn.base import clone
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import make_pipeline

    X = df[features]
    y = df[target_col]

    # Split first so every preprocessing statistic comes from training rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Handle missing values with training-set medians
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1,
    )
    rf.fit(X_train_scaled, y_train)

    # Evaluate on the hold-out split
    y_pred = rf.predict(X_test_scaled)

    # Cross-validate an unfitted copy wrapped with its preprocessing so each
    # fold imputes and scales using only its own training portion.
    cv_pipeline = make_pipeline(
        SimpleImputer(strategy="median"), StandardScaler(), clone(rf)
    )
    cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)

    # Feature importance
    importance = pd.DataFrame({
        "feature": features,
        "importance": rf.feature_importances_,
    }).sort_values("importance", ascending=False)

    return {
        "model": rf,
        "scaler": scaler,
        "imputation_values": train_medians,
        "accuracy": rf.score(X_test_scaled, y_test),
        "cv_mean": cv_scores.mean(),
        "cv_std": cv_scores.std(),
        "feature_importance": importance,
        "classification_report": classification_report(y_test, y_pred),
        "confusion_matrix": confusion_matrix(y_test, y_pred),
    }
# Example usage: classify players into the tiers stored in "performance_tier"
# and print accuracy, cross-validation score and the ten most important features.
# NOTE: player_df is not defined in this snippet; load it first.
features = ["avg", "obp", "slg", "hr", "rbi", "sb", "bb_pct", "k_pct"]
results = train_player_classifier(player_df, features, "performance_tier")
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"CV Score: {results['cv_mean']:.3f} (+/- {results['cv_std']:.3f})")
print("\nTop Features:")
print(results["feature_importance"].head(10))
PCA Dimensionality Reduction
Use Principal Component Analysis to reduce high-dimensional player statistics while preserving variance.
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
def perform_player_pca(df, stat_columns, n_components=None, variance_threshold=0.95):
    """Reduce player stats to principal components.

    Missing values are filled with column medians and the columns are
    standardised before PCA.

    Side effect: ``PC1``..``PCk`` columns are written into ``df`` in place.

    Args:
        df: DataFrame containing ``stat_columns``; modified in place.
        stat_columns: Columns to feed into PCA.
        n_components: Passed to ``sklearn.decomposition.PCA``. When ``None``,
            the smallest number of components whose cumulative explained
            variance reaches ``variance_threshold`` is used.
        variance_threshold: Target cumulative explained-variance ratio, only
            consulted when ``n_components`` is ``None``.

    Returns:
        dict with the fitted ``pca`` and ``scaler``, the resolved
        ``n_components``, per-component ``explained_variance`` ratios, the
        ``cumulative_variance`` up to the last kept component, and the
        ``loadings`` DataFrame (rows = stats, columns = components).
    """
    X = df[stat_columns].fillna(df[stat_columns].median())

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initial PCA to determine components
    pca_full = PCA()
    pca_full.fit(X_scaled)

    # Find components for variance threshold
    cumulative_var = np.cumsum(pca_full.explained_variance_ratio_)
    if n_components is None:
        reached = cumulative_var >= variance_threshold
        # argmax of an all-False mask is 0, which would silently select a
        # single component; keep every component when the threshold is never
        # reached (e.g. threshold 1.0 with floating-point round-off).
        if reached.any():
            n_components = int(np.argmax(reached)) + 1
        else:
            n_components = len(cumulative_var)

    # Final PCA
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X_scaled)
    # Use the fitted count so fractional n_components values work too.
    n_kept = int(pca.n_components_)
    pc_names = [f"PC{i+1}" for i in range(n_kept)]

    # Component loadings
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=pc_names,
        index=stat_columns,
    )

    # Add to dataframe
    for i, pc_name in enumerate(pc_names):
        df[pc_name] = X_reduced[:, i]

    return {
        "pca": pca,
        "scaler": scaler,
        "n_components": n_kept,
        "explained_variance": pca.explained_variance_ratio_,
        "cumulative_variance": cumulative_var[:n_kept],
        "loadings": loadings,
    }
# Example: reduce eleven batting stats and inspect the strongest PC1 loadings.
# NOTE: batters_df is not defined in this snippet; load it first. The call
# also adds PC columns to batters_df.
stat_cols = ["avg", "obp", "slg", "hr", "sb", "bb_pct", "k_pct",
"iso", "babip", "wrc_plus", "war"]
results = perform_player_pca(batters_df, stat_cols)
print(f"Reduced to {results['n_components']} components")
print(f"Variance explained: {results['cumulative_variance'][-1]:.1%}")
print("\nTop loadings for PC1:")
print(results["loadings"]["PC1"].abs().sort_values(ascending=False).head())
Bayesian Regression with PyMC
Bayesian linear regression for player projections with uncertainty quantification using PyMC.
import pymc as pm
import numpy as np
import pandas as pd
import arviz as az
def bayesian_projection_model(df, features, target):
    """Fit a Bayesian linear regression and return a prediction helper.

    Features and target are standardised before sampling; the returned
    predictor converts results back to the original target scale.

    Args:
        df: DataFrame containing the feature columns and the target column.
        features: List of feature column names.
        target: Name of the target column.

    Returns:
        ``(model, trace, predict_with_uncertainty)`` where the last item is a
        callable taking an array of shape ``(n, len(features))`` (or a single
        1-D row) in original feature units.
    """
    X = df[features].to_numpy(dtype=float)
    y = df[target].to_numpy(dtype=float)

    # Standardize
    X_mean, X_std = X.mean(axis=0), X.std(axis=0)
    y_mean, y_std = y.mean(), y.std()
    # A constant column has zero spread; dividing by it would fill the design
    # matrix with NaN. Scaling by 1 leaves such a column at zero instead.
    X_std = np.where(X_std == 0, 1.0, X_std)
    if y_std == 0:
        y_std = 1.0
    X_scaled = (X - X_mean) / X_std
    y_scaled = (y - y_mean) / y_std

    with pm.Model() as model:
        # Priors
        alpha = pm.Normal("alpha", mu=0, sigma=1)
        betas = pm.Normal("betas", mu=0, sigma=1, shape=len(features))
        sigma = pm.HalfNormal("sigma", sigma=1)
        # Linear model
        mu = alpha + pm.math.dot(X_scaled, betas)
        # Likelihood
        pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y_scaled)
        # Sample
        trace = pm.sample(2000, tune=1000, cores=2, return_inferencedata=True)

    def predict_with_uncertainty(new_X):
        """Predict with credible intervals for the regression mean.

        The spread reflects uncertainty in ``alpha`` and ``betas`` only; the
        observation noise ``sigma`` is not added.
        """
        new_X = np.atleast_2d(np.asarray(new_X, dtype=float))
        new_X_scaled = (new_X - X_mean) / X_std
        posterior = trace.posterior
        alpha_samples = posterior["alpha"].values.flatten()
        beta_samples = posterior["betas"].values.reshape(-1, len(features))
        # One row per posterior draw, one column per input row.
        predictions = alpha_samples[:, None] + np.dot(beta_samples, new_X_scaled.T)
        predictions = predictions * y_std + y_mean
        return {
            "mean": predictions.mean(axis=0),
            "std": predictions.std(axis=0),
            "ci_95": np.percentile(predictions, [2.5, 97.5], axis=0),
        }

    return model, trace, predict_with_uncertainty
# Example: fit the projection model, then predict for one hypothetical player.
# NOTE: player_df is not defined in this snippet; load it first.
features = ["age", "pa", "avg_3yr", "obp_3yr", "slg_3yr"]
model, trace, predict = bayesian_projection_model(
player_df, features, "next_year_war"
)
# Predict for new player (values are given in the same order as `features`)
pred = predict(np.array([[28, 600, .280, .350, .450]]))
print(f"Projected WAR: {pred['mean'][0]:.2f} ± {pred['std'][0]:.2f}")
Ensemble Model Stacking
Combine multiple models using stacking to improve prediction accuracy for player projections.
from sklearn.ensemble import (
RandomForestRegressor, GradientBoostingRegressor,
StackingRegressor
)
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import pandas as pd
def build_stacked_projection_model(df, features, target):
    """Fit a stacked regression ensemble and compare it with its base models.

    Args:
        df: DataFrame containing the feature columns and the target column.
        features: List of feature column names; missing values are filled
            with column medians.
        target: Name of the target column.

    Returns:
        dict with the fitted stacked ``model``, its 5-fold ``stacked_mae``,
        and ``individual_mae`` mapping each base-model name to its own
        5-fold mean absolute error.
    """
    mae_scoring = "neg_mean_absolute_error"
    feature_frame = df[features]
    X = feature_frame.fillna(feature_frame.median())
    y = df[target]

    # Level-0 learners: two tree ensembles and two regularised linear models.
    forest = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42)
    boosting = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
    base_models = [
        ("rf", forest),
        ("gb", boosting),
        ("ridge", Ridge(alpha=1.0)),
        ("elastic", ElasticNet(alpha=0.5, l1_ratio=0.5)),
    ]

    # Level-1 learner combining the base predictions.
    stacked = StackingRegressor(
        estimators=base_models,
        final_estimator=Ridge(alpha=0.5),
        cv=5,
        n_jobs=-1,
    )

    # Score the ensemble, then fit it on all rows.
    stacked_cv = cross_val_score(stacked, X, y, cv=5, scoring=mae_scoring)
    stacked.fit(X, y)

    # Scores are negated because the scorer reports negative MAE.
    individual_scores = {
        name: -cross_val_score(estimator, X, y, cv=5, scoring=mae_scoring).mean()
        for name, estimator in base_models
    }

    return {
        "model": stacked,
        "stacked_mae": -stacked_cv.mean(),
        "individual_mae": individual_scores,
    }
# Example: fit the stacked ensemble and compare its MAE with each base model.
# NOTE: projections_df is not defined in this snippet; load it first.
features = ["age", "pa", "avg_3yr", "obp_3yr", "slg_3yr", "war_3yr"]
results = build_stacked_projection_model(projections_df, features, "actual_war")
print(f"Stacked MAE: {results['stacked_mae']:.3f}")
print("Individual MAEs:", results["individual_mae"])
Logistic Regression Win Probability
Simple but interpretable logistic regression model for real-time win probability calculation.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.calibration import CalibratedClassifierCV
def train_win_probability_model(plays_df):
    """Train a calibrated logistic-regression win probability model.

    Args:
        plays_df: Play-level DataFrame with the six feature columns listed
            below and a ``home_win`` label column.

    Returns:
        ``(model, poly, predict_win_prob)``: the calibrated classifier, the
        fitted degree-2 polynomial transformer, and a convenience function
        that returns the home-team win probability for one game state.
    """
    # Features for win probability
    features = [
        "score_diff",       # Current score differential
        "time_remaining",   # Seconds remaining
        "possession",       # 1 if home has ball, 0 if away
        "yard_line",        # Field position (football)
        "down",             # Current down
        "distance",         # Yards to first down
    ]
    X = plays_df[features]
    y = plays_df["home_win"]

    # Add polynomial features for non-linear relationships
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Train with calibration for accurate probabilities
    base_model = LogisticRegression(max_iter=1000, C=0.1)
    model = CalibratedClassifierCV(base_model, cv=5, method="isotonic")
    model.fit(X_poly, y)

    def predict_win_prob(score_diff, time_remaining, possession,
                         yard_line=50, down=1, distance=10):
        """Get win probability for current game state."""
        # Build a one-row frame with the training column names: the
        # transformer was fitted on a named DataFrame, and a bare array
        # triggers scikit-learn's feature-name warning and relies on
        # positional order alone.
        input_data = pd.DataFrame(
            [[score_diff, time_remaining, possession,
              yard_line, down, distance]],
            columns=features,
        )
        input_poly = poly.transform(input_data)
        return model.predict_proba(input_poly)[0, 1]

    return model, poly, predict_win_prob
# Example: train on plays_df, then query one game state.
# NOTE: plays_df is not defined in this snippet; load it first.
model, poly, predict_wp = train_win_probability_model(plays_df)
# Example: Home team up 7, 5 min left, has ball
wp = predict_wp(score_diff=7, time_remaining=300, possession=1)
print(f"Home Win Probability: {wp:.1%}")
Gradient Boosting Injury Risk
Use LightGBM to predict player injury risk based on workload, age, and historical injury data.
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve
def build_injury_risk_model(df):
    """Build a LightGBM injury-risk classifier.

    The data is split three ways: training rows fit the trees, a validation
    slice of the training data drives early stopping, and the untouched test
    split produces the reported AUC. Missing feature values are left as NaN
    for LightGBM's own missing-value handling, because a ``-1`` sentinel is
    indistinguishable from a real value in the change/decline columns.

    Args:
        df: DataFrame with the ten feature columns listed below and an
            ``injured_next_season`` column castable to int.

    Returns:
        dict with the trained booster (``model``), test-set ``auc`` and an
        ``importance`` DataFrame sorted by gain, descending.
    """
    features = [
        "age", "career_games", "games_last_season",
        "workload_index", "previous_injuries",
        "days_since_last_injury", "position_risk_factor",
        "bmi", "sprint_speed_decline", "throwing_velocity_change",
    ]
    X = df[features]
    y = df["injured_next_season"].astype(int)

    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # Hold out part of the training data for early stopping so the test set
    # stays unseen until the final evaluation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full,
        test_size=0.2, random_state=42, stratify=y_train_full,
    )

    # LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "num_leaves": 31,
        "learning_rate": 0.05,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbose": -1,
        "is_unbalance": True,
    }
    model = lgb.train(
        params, train_data,
        num_boost_round=500,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50)],
    )

    # Predictions
    y_prob = model.predict(X_test)
    auc = roc_auc_score(y_test, y_prob)

    # Feature importance
    importance = pd.DataFrame({
        "feature": features,
        "importance": model.feature_importance(importance_type="gain"),
    }).sort_values("importance", ascending=False)

    return {"model": model, "auc": auc, "importance": importance}
# Example: train the injury model and list the five highest-gain features.
# NOTE: players_df is not defined in this snippet; load it first.
results = build_injury_risk_model(players_df)
print(f"AUC: {results['auc']:.3f}")
print("\nTop Risk Factors:")
print(results["importance"].head())
LSTM Sequence Prediction
Long Short-Term Memory network to predict player performance sequences over time, capturing temporal patterns.
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
class PlayerLSTM(nn.Module):
    """LSTM that maps a sequence of per-period features to one prediction.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and makes PyTorch warn.
        inter_layer_dropout = 0.2 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers,
            batch_first=True, dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions of shape (batch, output_size).

        ``x`` has shape (batch, seq_len, features).
        """
        lstm_out, _ = self.lstm(x)
        # Take last time step
        return self.fc(lstm_out[:, -1, :])
def create_sequences(df, player_col, features, target, seq_length=10,
                     sort_col="season"):
    """Create sliding-window sequences for LSTM training.

    For every player with more than ``seq_length`` rows, each window of
    ``seq_length`` consecutive rows (ordered by ``sort_col``) becomes one
    input sequence and the ``target`` value of the following row becomes
    its label.

    Args:
        df: Long-format DataFrame with one row per player and period.
        player_col: Column identifying the player.
        features: List of feature column names.
        target: Column to predict.
        seq_length: Number of time steps per input sequence.
        sort_col: Column that orders each player's rows chronologically.

    Returns:
        ``(X, y)`` with ``X`` of shape ``(n, seq_length, len(features))`` and
        ``y`` of shape ``(n,)``. Both keep these shapes (with ``n == 0``)
        when no player has enough rows.
    """
    sequences = []
    targets = []
    # groupby splits the frame once; sort=False keeps players in order of
    # first appearance.
    for _, player_data in df.groupby(player_col, sort=False):
        n_windows = len(player_data) - seq_length
        if n_windows < 1:
            continue
        player_data = player_data.sort_values(sort_col)
        feature_matrix = player_data[features].to_numpy()
        target_values = player_data[target].to_numpy()
        for start in range(n_windows):
            stop = start + seq_length
            sequences.append(feature_matrix[start:stop])
            targets.append(target_values[stop])

    if not sequences:
        # np.array([]) would lose the sequence and feature dimensions.
        return np.empty((0, seq_length, len(features))), np.empty((0,))
    return np.array(sequences), np.array(targets)
# Create sequences: five seasons of history predict the following season's WAR.
# NOTE: career_df is not defined in this snippet; load it first.
features = ["age", "pa", "avg", "obp", "slg", "war"]
X, y = create_sequences(career_df, "player_id", features, "war", seq_length=5)

# Train model (full-batch: every sequence is used in each optimisation step)
model = PlayerLSTM(input_size=len(features))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# torch.as_tensor with an explicit dtype replaces the legacy FloatTensor
# constructor; the target gets a trailing axis to match the model output.
X_tensor = torch.as_tensor(X, dtype=torch.float32)
y_tensor = torch.as_tensor(y, dtype=torch.float32).unsqueeze(1)

# Training mode only needs to be set once, not on every epoch.
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
K-Means Player Clustering
Cluster players into similar performance groups using K-Means algorithm with automatic optimal cluster selection.
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
def cluster_players(df, features, max_clusters=10):
    """Cluster players with K-Means, choosing k by silhouette score.

    Missing values are filled with column medians and features are
    standardised before clustering.

    Side effect: a ``cluster`` column is written into ``df`` in place.

    Args:
        df: DataFrame containing the feature columns; modified in place.
        features: List of feature column names.
        max_clusters: Largest k to try. Capped at ``len(df) - 1`` because
            the silhouette score is undefined beyond that.

    Returns:
        dict with ``optimal_k``, the ``labels`` Series, per-cluster
        ``profiles`` (feature means plus ``count``), the best ``silhouette``
        score, the fitted ``model`` and ``scaler``, and the ``inertias``
        list (one value per k tried, starting at k=2) for an elbow plot.

    Raises:
        ValueError: If there are too few rows to evaluate any clustering.
    """
    X = df[features].fillna(df[features].median())
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # silhouette_score needs 2 <= k <= n_samples - 1.
    largest_k = min(max_clusters, len(X_scaled) - 1)
    if largest_k < 2:
        raise ValueError(
            "Need at least 3 rows and max_clusters >= 2 to select a cluster count"
        )

    # Find optimal clusters using elbow method and silhouette
    inertias = []
    silhouettes = []
    best_model = None
    best_labels = None
    best_score = None
    for k in range(2, largest_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, labels)
        inertias.append(kmeans.inertia_)
        silhouettes.append(score)
        # Strict '>' keeps the smallest k on ties. The winning fit is kept
        # rather than refitted: with a fixed random_state a refit would
        # reproduce it.
        if best_score is None or score > best_score:
            best_model, best_labels, best_score = kmeans, labels, score

    optimal_k = best_model.n_clusters
    df["cluster"] = best_labels

    # Cluster profiles
    profiles = df.groupby("cluster")[features].mean()
    profiles["count"] = df.groupby("cluster").size()

    return {
        "optimal_k": optimal_k,
        "labels": df["cluster"],
        "profiles": profiles,
        "silhouette": best_score,
        "model": best_model,
        "scaler": scaler,
        "inertias": inertias,
    }
# Example: cluster hitters on seven rate/speed features and print the profiles.
# NOTE: hitters_df is not defined in this snippet; load it first. The call
# also adds a "cluster" column to hitters_df.
features = ["avg", "obp", "slg", "hr_rate", "bb_rate", "k_rate", "sprint_speed"]
results = cluster_players(hitters_df, features)
print(f"Optimal clusters: {results['optimal_k']}")
print("\nCluster Profiles:")
print(results["profiles"])
Neural Network Player Projections
Deep learning model using PyTorch to project future player statistics based on historical performance trends.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
class PlayerProjectionNet(nn.Module):
    """Feed-forward network for player stat projections.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.2),
    followed by a final Linear output layer.

    Args:
        input_size: Number of input features.
        hidden_sizes: Width of each hidden stage, in order.
        output_size: Number of projected values.
    """

    # The default is an immutable tuple: a list default would be one object
    # shared by every call.
    def __init__(self, input_size, hidden_sizes=(64, 32), output_size=1):
        super().__init__()
        layers = []
        prev_size = input_size
        for hidden in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Dropout(0.2),
            ])
            prev_size = hidden
        layers.append(nn.Linear(prev_size, output_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return projections of shape (batch, output_size)."""
        return self.network(x)
def train_projection_model(df, feature_cols, target_col, epochs=100,
                           batch_size=32, lr=0.001):
    """Train the projection network on min-max scaled data.

    Args:
        df: DataFrame containing the feature columns and the target column.
        feature_cols: List of feature column names.
        target_col: Name of the target column.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size.
        lr: Adam learning rate.

    Returns:
        ``(model, scaler_X, scaler_y)``. The model is returned in eval mode,
        so dropout is off and batch-norm uses its running statistics. Inputs
        must go through ``scaler_X.transform`` and outputs through
        ``scaler_y.inverse_transform``.

    Raises:
        ValueError: If ``df`` has fewer than two rows.
    """
    if len(df) < 2:
        raise ValueError("Need at least two rows to train with batch normalisation")

    # Prepare data
    X = df[feature_cols].values
    y = df[target_col].values.reshape(-1, 1)
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y)

    # Convert to tensors
    X_tensor = torch.as_tensor(X_scaled, dtype=torch.float32)
    y_tensor = torch.as_tensor(y_scaled, dtype=torch.float32)
    dataset = TensorDataset(X_tensor, y_tensor)
    # BatchNorm1d cannot compute batch statistics from one sample in train
    # mode, so drop the final batch when it would hold exactly one row.
    lone_tail = batch_size > 1 and len(dataset) % batch_size == 1
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                        drop_last=lone_tail)

    # Model
    model = PlayerProjectionNet(len(feature_cols))
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for X_batch, y_batch in loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

    # Leave the network ready for inference.
    model.eval()
    return model, scaler_X, scaler_y
# Usage: train a projection network for next-year WAR.
# NOTE: player_df is not defined in this snippet; load it first.
feature_cols = ["age", "games", "pa", "avg_3yr", "obp_3yr", "slg_3yr"]
model, scaler_X, scaler_y = train_projection_model(
player_df, feature_cols, "next_year_war"
)
Daily Data Pipeline
Automated pipeline for daily sports data updates.
"""Automated daily data pipeline for sports analytics."""
import pandas as pd
import requests
from datetime import datetime, timedelta
import schedule
import time
import logging
from pathlib import Path
# Configure the root logger at INFO level with a timestamped message format,
# and create the module-level logger used by the pipeline below.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)
class DailyPipeline:
    """Automated data collection and processing pipeline.

    The fetch, database and metric steps are placeholders that only log;
    replace their bodies with real implementations.
    """

    def __init__(self, data_dir: str = "./data"):
        """Create the pipeline and make sure ``data_dir`` exists."""
        self.data_dir = Path(data_dir)
        # parents=True lets nested paths such as "./data/raw/daily" be
        # created in one go instead of raising FileNotFoundError.
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def fetch_scores(self, sport: str, date: "str | None" = None) -> pd.DataFrame:
        """Fetch daily scores from API.

        ``date`` is a ``YYYY-MM-DD`` string and defaults to today (local
        time). Currently returns an empty DataFrame.
        """
        if date is None:
            date = datetime.now().strftime("%Y-%m-%d")
        # Example API call (replace with actual)
        # response = requests.get(f"https://api.example.com/{sport}/scores/{date}")
        # return pd.DataFrame(response.json())
        logger.info("Fetched %s scores for %s", sport, date)
        return pd.DataFrame()

    def update_database(self, df: pd.DataFrame, table: str):
        """Update database with new data (placeholder: logs the row count)."""
        # Example: df.to_sql(table, engine, if_exists="append")
        logger.info("Updated %s with %s rows", table, len(df))

    def calculate_daily_metrics(self, sport: str) -> pd.DataFrame:
        """Calculate daily summary metrics (placeholder: returns an empty frame)."""
        # Load recent data and calculate metrics
        logger.info("Calculated daily metrics for %s", sport)
        return pd.DataFrame()

    def send_alerts(self, alerts: list):
        """Send alerts for significant events (placeholder: logs each alert)."""
        for alert in alerts:
            logger.info("Alert: %s", alert)

    def run_daily_job(self):
        """Run the complete daily pipeline for every configured sport.

        A failure in one sport is logged with its traceback and does not
        stop the remaining sports.
        """
        logger.info("Starting daily pipeline...")
        sports = ["mlb", "nba", "nfl"]
        for sport in sports:
            try:
                # Fetch data
                scores = self.fetch_scores(sport)
                # Update database
                if not scores.empty:
                    self.update_database(scores, f"{sport}_scores")
                # Calculate metrics
                self.calculate_daily_metrics(sport)
            except Exception as e:
                # Deliberately broad: this is the per-sport isolation
                # boundary. logger.exception records the traceback too.
                logger.exception("Error processing %s: %s", sport, e)
        logger.info("Daily pipeline complete")
def main():
    """Run the pipeline once, then keep it scheduled for 06:00 every day.

    This function never returns; it polls the scheduler once a minute.
    """
    daily = DailyPipeline()

    # Register the recurring 6 AM job.
    schedule.every().day.at("06:00").do(daily.run_daily_job)

    # Immediate run, useful for testing without waiting for 6 AM.
    daily.run_daily_job()

    # Poll the scheduler forever.
    while True:
        schedule.run_pending()
        time.sleep(60)


if __name__ == "__main__":
    main()
MMA Fight Statistics
Calculate MMA fighter statistics and performance metrics.
"""MMA Fighter Statistics Calculator."""
import pandas as pd
import numpy as np
class MMAStats:
    """Calculate MMA fighter statistics."""

    @staticmethod
    def _safe_ratio(numerator: float, denominator: float, default: float = 0.0) -> float:
        """Return numerator / denominator, or ``default`` when the denominator is 0."""
        if denominator == 0:
            return float(default)
        return numerator / denominator

    @staticmethod
    def striking_accuracy(sig_strikes_landed: int, sig_strikes_attempted: int) -> float:
        """Calculate significant striking accuracy (0.0 when nothing was attempted)."""
        return MMAStats._safe_ratio(sig_strikes_landed, sig_strikes_attempted)

    @staticmethod
    def takedown_accuracy(takedowns_landed: int, takedowns_attempted: int) -> float:
        """Calculate takedown accuracy (0.0 when nothing was attempted)."""
        return MMAStats._safe_ratio(takedowns_landed, takedowns_attempted)

    @staticmethod
    def defense_rate(strikes_absorbed: int, strikes_attempted_against: int) -> float:
        """Calculate striking defense rate.

        This is the share of opponent strikes that were not absorbed; it is
        1.0 when no strikes were attempted against the fighter.
        """
        return 1 - MMAStats._safe_ratio(strikes_absorbed, strikes_attempted_against)

    @staticmethod
    def submission_rate(wins: int, sub_wins: int) -> float:
        """Calculate submission win percentage (0.0 for a fighter with no wins).

        Note the argument order: total wins first, submission wins second.
        """
        return MMAStats._safe_ratio(sub_wins, wins)

    @staticmethod
    def calculate_fight_iq(df: pd.DataFrame) -> pd.Series:
        """Calculate composite fight IQ score.

        Each component column is converted to its percentile rank within
        ``df`` and the ranks are combined as 40% striking accuracy, 30%
        takedown accuracy and 30% defense rate.
        """
        # Normalize components
        strike_acc = df["striking_accuracy"].rank(pct=True)
        td_acc = df["takedown_accuracy"].rank(pct=True)
        defense = df["defense_rate"].rank(pct=True)
        # Weighted average
        return 0.4 * strike_acc + 0.3 * td_acc + 0.3 * defense
# Example
# fighter_stats["fight_iq"] = MMAStats.calculate_fight_iq(fighter_stats)
Draft Value Analysis
Analyze draft pick value and player development.
"""Draft pick value analysis."""
import pandas as pd
import numpy as np
def calculate_draft_value_curve(historical_drafts: pd.DataFrame,
                                value_metric: str = "career_war") -> pd.DataFrame:
    """Summarise historical outcomes for each draft position.

    Groups by ``pick`` and reports the mean, standard deviation and count of
    ``value_metric`` together with the mean of ``all_star`` and
    ``years_played``. The result has two-level column labels and ``pick`` as
    a regular column.
    """
    aggregations = {
        value_metric: ["mean", "std", "count"],
        "all_star": "mean",
        "years_played": "mean",
    }
    by_pick = historical_drafts.groupby("pick")
    curve = by_pick.agg(aggregations)
    return curve.reset_index()
def surplus_value(player_value: float, contract_value: float,
                  years: int) -> float:
    """Return production over the contract term minus the contract's cost.

    ``player_value`` is multiplied by ``years``; ``contract_value`` is
    subtracted once, as a total.
    """
    total_production = player_value * years
    return total_production - contract_value
def draft_efficiency(team_drafts: pd.DataFrame,
                     expected_values: dict) -> pd.DataFrame:
    """Total each team's drafted value against the expectation for its picks.

    ``expected_values`` maps pick number to expected value. The input frame
    is copied, so the caller's data is unchanged. The result is indexed by
    ``team`` with summed ``value_over_expected``, ``actual_value`` and
    ``expected_value`` columns.
    """
    scored = team_drafts.copy()
    scored["expected_value"] = scored["pick"].map(expected_values)
    scored["value_over_expected"] = scored["actual_value"] - scored["expected_value"]

    summed_columns = ("value_over_expected", "actual_value", "expected_value")
    totals = dict.fromkeys(summed_columns, "sum")
    return scored.groupby("team").agg(totals)
def project_rookie_development(stats: pd.DataFrame,
                               similar_players: list) -> dict:
    """Project rookie development based on similar players.

    Args:
        stats: Career rows with ``player``, ``years_exp``, ``war`` and
            ``games`` columns.
        similar_players: Values of ``player`` to use as comparables.

    Returns:
        dict with ``year_projections`` (mean and standard deviation of WAR
        and mean games per ``years_exp``, two-level column labels) and
        ``peak_year``, the ``years_exp`` value with the highest mean WAR, or
        ``None`` when no comparable has usable WAR data.
    """
    similar_careers = stats[stats["player"].isin(similar_players)]
    by_year = similar_careers.groupby("years_exp").agg({
        "war": ["mean", "std"],
        "games": "mean",
    })

    # Read the peak while years_exp is still the index: idxmax then returns
    # the year itself as a scalar. After reset_index the columns are
    # two-level and a lookup by the bare "years_exp" label is a partial-key
    # selection rather than a single cell.
    mean_war = by_year[("war", "mean")]
    peak_year = mean_war.idxmax() if mean_war.notna().any() else None

    return {
        "year_projections": by_year.reset_index(),
        "peak_year": peak_year,
    }
Generate PDF Report
Create professional PDF reports for sports analytics.
"""Generate PDF reports for sports analytics."""
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
from reportlab.lib.units import inch
import pandas as pd
import matplotlib.pyplot as plt
import io
class SportsReport:
    """Generate professional PDF sports reports.

    Content is queued with the ``add_*`` methods in the order it should
    appear and written to disk by ``build()``.
    """

    def __init__(self, filename: str, title: str):
        """Start a letter-size document at ``filename`` headed by ``title``."""
        self.doc = SimpleDocTemplate(filename, pagesize=letter)
        self.elements = []
        self.styles = getSampleStyleSheet()
        # Add title. Leading is set explicitly: the ParagraphStyle default
        # line height is meant for body text, so a 24pt title that wraps
        # would overlap itself.
        title_style = ParagraphStyle("Title", fontSize=24, leading=28, spaceAfter=30)
        self.elements.append(Paragraph(title, title_style))

    def add_heading(self, text: str, level: int = 1):
        """Add a section heading.

        ``level`` is clamped to 1-6, the heading styles provided by the
        sample stylesheet.
        """
        level = min(max(int(level), 1), 6)
        style = self.styles[f"Heading{level}"]
        self.elements.append(Paragraph(text, style))
        self.elements.append(Spacer(1, 12))

    def add_paragraph(self, text: str):
        """Add a paragraph of text."""
        self.elements.append(Paragraph(text, self.styles["Normal"]))
        self.elements.append(Spacer(1, 12))

    def add_table(self, df: pd.DataFrame, title: str = None):
        """Add a data table with a styled header row.

        If ``title`` is given it is inserted above the table as a level-2
        heading.
        """
        if title:
            self.add_heading(title, level=2)
        # Convert DataFrame to list: header row followed by the data rows.
        data = [df.columns.tolist()] + df.values.tolist()
        table = Table(data)
        table.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.grey),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.whitesmoke),
            ("ALIGN", (0, 0), (-1, -1), "CENTER"),
            ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
            ("FONTSIZE", (0, 0), (-1, 0), 10),
            ("BOTTOMPADDING", (0, 0), (-1, 0), 12),
            ("GRID", (0, 0), (-1, -1), 1, colors.black),
        ]))
        self.elements.append(table)
        self.elements.append(Spacer(1, 20))

    def add_chart(self, fig, width: float = 6, height: float = 4):
        """Add a matplotlib figure, rendered to PNG, sized in inches."""
        # The buffer is intentionally left open: the Image flowable is
        # handed this buffer and is only drawn when build() runs.
        img_buffer = io.BytesIO()
        fig.savefig(img_buffer, format="png", dpi=150, bbox_inches="tight")
        img_buffer.seek(0)
        img = Image(img_buffer, width=width * inch, height=height * inch)
        self.elements.append(img)
        self.elements.append(Spacer(1, 20))

    def build(self):
        """Generate the PDF."""
        self.doc.build(self.elements)
# Example usage
# report = SportsReport("team_analysis.pdf", "2024 Season Analysis")
# report.add_heading("Performance Summary")
# report.add_table(stats_df, "Key Statistics")
# report.add_chart(performance_chart)
# report.build()
Trade Value Calculator
Calculate player trade values based on multiple factors.
"""Calculate player trade values."""
import pandas as pd
import numpy as np
class TradeValueCalculator:
    """Estimate player trade values and compare trade packages.

    Args:
        salary_cap: League salary cap in dollars. Stored on the instance; the
            valuation methods in this class do not read it.
        dollars_per_war: Dollar value assigned to one win above replacement.
    """

    def __init__(self, salary_cap: float = 150_000_000,
                 dollars_per_war: float = 8_000_000):
        self.salary_cap = salary_cap
        self.dollars_per_war = dollars_per_war

    def calculate_value(self, player: dict) -> float:
        """Return the total trade value of one player in dollars.

        Args:
            player: Mapping with required keys ``"war"`` and ``"age"`` and
                optional keys ``"contract_years"`` (default 1), ``"salary"``
                (default 0) and ``"service_time"`` (default 7).

        Returns:
            Age-adjusted production value plus contract surplus plus a
            control premium for players with under six years of service.
        """
        production_value = player["war"] * self.dollars_per_war

        # Linear 3%-per-year discount for every year away from age 27.
        age_factor = 1 - abs(player["age"] - 27) * 0.03

        years_left = player.get("contract_years", 1)
        salary = player.get("salary", 0)
        contract_value = (production_value - salary) * years_left

        # Extra value for team-controlled players (under 3 / under 6 years).
        service_time = player.get("service_time", 7)
        if service_time < 3:
            control_premium = production_value * 0.5
        elif service_time < 6:
            control_premium = production_value * 0.25
        else:
            control_premium = 0

        return (production_value * age_factor) + contract_value + control_premium

    def evaluate_trade(self, team_a_gives: list, team_b_gives: list) -> dict:
        """Compare the summed value of two trade packages.

        Args:
            team_a_gives: Player dicts sent by team A.
            team_b_gives: Player dicts sent by team B.

        Returns:
            Dict with both package values, their absolute ``"difference"``,
            ``"favors"`` and ``"fair"``. A trade is fair when the packages are
            exactly equal or differ by less than 10% of their combined value.

        NOTE(review): ``"favors"`` names the side whose *outgoing* package is
        valued higher; confirm that is the intended reading.
        """
        value_a = sum(self.calculate_value(p) for p in team_a_gives)
        value_b = sum(self.calculate_value(p) for p in team_b_gives)
        difference = value_a - value_b

        if difference > 0:
            favors = "Team A"
        elif difference < 0:
            favors = "Team B"
        else:
            favors = "Even"

        # An exactly even swap is fair even when the combined value is zero or
        # negative, where the strict 10% comparison alone could never hold.
        fair = difference == 0 or abs(difference) < (value_a + value_b) * 0.1

        return {
            "team_a_value": value_a,
            "team_b_value": value_b,
            "difference": abs(difference),
            "favors": favors,
            "fair": fair,
        }

    def find_matching_value(self, target_value: float,
                            available_players: pd.DataFrame,
                            max_players: int = 3) -> list:
        """Greedily pick players whose combined value approaches a target.

        Each step picks the player whose value is closest to the value still
        missing. Selection stops when the target is reached, ``max_players``
        have been chosen, or no candidates remain.

        Args:
            target_value: Trade value to match, in dollars.
            available_players: One row per candidate; needs ``player_name``
                plus the fields read by :meth:`calculate_value`.
            max_players: Maximum number of players to select.

        Returns:
            Selected ``player_name`` values in pick order (possibly empty).
        """
        pool = available_players.copy()
        if pool.empty:
            return []
        pool["trade_value"] = pool.apply(
            lambda row: self.calculate_value(row.to_dict()), axis=1
        )

        selected = []
        remaining_value = target_value
        for _ in range(max_players):
            # Stop when satisfied, or when the pool runs dry: argmin on an
            # empty selection would raise.
            if remaining_value <= 0 or pool.empty:
                break
            gaps = (pool["trade_value"] - remaining_value).abs()
            best_match = pool.iloc[gaps.argmin()]
            selected.append(best_match["player_name"])
            remaining_value -= best_match["trade_value"]
            pool = pool[pool["player_name"] != best_match["player_name"]]
        return selected
Injury Impact Analysis
Analyze team performance impact from player injuries.
"""Analyze injury impact on team performance."""
import pandas as pd
import numpy as np
from scipy import stats
def calculate_injury_impact(games: pd.DataFrame, player: str,
                            team_col: str, result_col: str) -> dict:
    """Compare team results in games with and without a given player.

    Args:
        games: One row per game; must contain an ``active_players`` string
            column and the ``result_col`` column.
        player: Player name, matched as a literal substring of
            ``active_players``.
        team_col: Accepted for interface compatibility; not used.
        result_col: Numeric result column whose mean is compared.

    Returns:
        ``{"error": "Insufficient data"}`` when either group has fewer than
        five games; otherwise a dict with group sizes, group means, their
        difference (``"impact"``) and a two-sample t-test p-value.
    """
    # regex=False: names such as "J. Smith (Jr.)" contain regex
    # metacharacters and must be matched literally.
    is_active = games["active_players"].str.contains(player, na=False, regex=False)
    with_player = games[is_active]
    without_player = games[~is_active]

    if len(with_player) < 5 or len(without_player) < 5:
        return {"error": "Insufficient data"}

    with_wins = with_player[result_col].mean()
    without_wins = without_player[result_col].mean()

    # Statistical significance of the difference between the two groups.
    _t_stat, p_value = stats.ttest_ind(
        with_player[result_col],
        without_player[result_col],
    )

    return {
        "player": player,
        "games_with": len(with_player),
        "games_without": len(without_player),
        "win_pct_with": with_wins,
        "win_pct_without": without_wins,
        "impact": with_wins - without_wins,
        "p_value": p_value,
        "significant": p_value < 0.05,
    }
def replacement_level_analysis(player_stats: pd.DataFrame,
                               injured_player: str,
                               replacement: str,
                               stat_cols: list) -> pd.DataFrame:
    """Build a stat-by-stat comparison of an injured player and a replacement.

    Args:
        player_stats: Table with a ``player`` column and the stat columns.
        injured_player: Value of ``player`` for the injured player.
        replacement: Value of ``player`` for the replacement.
        stat_cols: Stat columns to compare.

    Returns:
        DataFrame with columns ``Stat``, ``Injured``, ``Replacement`` and
        ``Difference`` (injured minus replacement). The first matching row
        is used for each player.
    """
    def first_stat_line(name):
        rows = player_stats[player_stats["player"] == name]
        return rows[stat_cols].iloc[0].values

    injured_line = first_stat_line(injured_player)
    replacement_line = first_stat_line(replacement)

    return pd.DataFrame({
        "Stat": stat_cols,
        "Injured": injured_line,
        "Replacement": replacement_line,
        "Difference": injured_line - replacement_line,
    })
Interactive Plotly Visualizations
Create interactive sports visualizations using Plotly.
"""Interactive sports visualizations with Plotly."""
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
def create_player_comparison(players_df: pd.DataFrame, metrics: list,
                             player_col: str = "player_name") -> go.Figure:
    """Draw one filled radar trace per player over the given metrics.

    Args:
        players_df: One row per player.
        metrics: Column names plotted around the radar.
        player_col: Column holding the legend label for each player.

    Returns:
        Figure whose radial axis is fixed to the range 0-100.
    """
    fig = go.Figure()
    for _, row in players_df.iterrows():
        radar_values = [row[metric] for metric in metrics]
        trace = go.Scatterpolar(
            r=radar_values,
            theta=metrics,
            fill="toself",
            name=row[player_col],
        )
        fig.add_trace(trace)

    radial_axis = dict(visible=True, range=[0, 100])
    fig.update_layout(
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
        title="Player Comparison",
    )
    return fig
def create_timeline(df: pd.DataFrame, date_col: str, value_col: str,
                    group_col: str = None) -> go.Figure:
    """Plot ``value_col`` against ``date_col`` as a line chart with a range slider.

    Args:
        df: Source data.
        date_col: Column for the x axis.
        value_col: Column for the y axis.
        group_col: Optional column; when truthy, one colored line per value.
    """
    line_options = {"x": date_col, "y": value_col}
    if group_col:
        line_options["color"] = group_col
    fig = px.line(df, title=f"{value_col} Over Time", **line_options)
    fig.update_xaxes(rangeslider_visible=True)
    return fig
def create_scatter_with_hover(df: pd.DataFrame, x: str, y: str,
                              hover_data: list = None) -> go.Figure:
    """Scatter ``y`` against ``x`` with an OLS trendline and extra hover fields.

    Args:
        df: Source data.
        x: Column for the x axis.
        y: Column for the y axis.
        hover_data: Optional columns shown in the hover tooltip.
    """
    chart_title = f"{y} vs {x}"
    return px.scatter(df, x=x, y=y, hover_data=hover_data,
                      trendline="ols", title=chart_title)
def create_heatmap(df: pd.DataFrame, x: str, y: str,
                   value: str) -> go.Figure:
    """Render ``value`` as an annotated heatmap over the ``x`` and ``y`` columns.

    The data is pivoted with ``y`` as rows and ``x`` as columns
    (``pivot_table`` aggregates duplicates with its default function), and
    each cell is labelled with its value rounded to two decimals.
    """
    grid = df.pivot_table(index=y, columns=x, values=value)
    cell_values = grid.values
    heatmap = go.Heatmap(
        z=cell_values,
        x=grid.columns,
        y=grid.index,
        colorscale="RdYlGn",
        text=cell_values.round(2),
        texttemplate="%{text}",
        textfont={"size": 10},
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(title=f"{value} by {x} and {y}")
    return fig
def create_dashboard(df: pd.DataFrame, metrics: list) -> go.Figure:
    """Create a 2x2 dashboard with gauges for the first two metrics.

    The top row holds one gauge per metric, showing the column mean on an
    axis spanning the column's min and max. The bottom row is declared as a
    bar panel and a scatter panel, but no traces are added to it here.
    """
    panel_specs = [
        [{"type": "indicator"}, {"type": "indicator"}],
        [{"type": "bar"}, {"type": "scatter"}],
    ]
    fig = make_subplots(rows=2, cols=2,
                        subplot_titles=metrics[:4],
                        specs=panel_specs)

    # One gauge per metric across the top row, left to right.
    for column, metric in enumerate(metrics[:2], start=1):
        series = df[metric]
        gauge = go.Indicator(
            mode="gauge+number",
            value=series.mean(),
            title={"text": metric},
            gauge={"axis": {"range": [series.min(), series.max()]}},
        )
        fig.add_trace(gauge, row=1, col=column)
    return fig
# Example usage:
# fig = create_player_comparison(top_players, ["Points", "Assists", "Rebounds"])
# fig.show()
R Data Wrangling for Sports
Common R data wrangling operations for sports data.
# Common R data wrangling operations for sports
library(dplyr)
library(tidyr)
library(lubridate)
# Calculate per-game stats.
# For every column named in `counting_stats`, adds a new column
# "<col>_per_game" equal to that column divided by the `games_col` column.
# The original columns are kept.
#   df             data frame with the counting stats and the games column
#   counting_stats character vector of column names to convert
#   games_col      name of the games-played column (default "G")
per_game_stats <- function(df, counting_stats, games_col = "G") {
df %>%
mutate(across(all_of(counting_stats), ~ . / .data[[games_col]], .names = "{col}_per_game"))
}
# Rolling averages.
# Sorts by `group_col` and then by the `date` column, and for every window
# size and stat column adds "<col>_MA<w>": a right-aligned rolling mean of
# width w computed within each group. Positions without a full window are NA.
#   df        data frame; must contain a `date` column
#   stat_cols character vector of columns to average
#   windows   window widths (default 5 and 10)
#   group_col grouping column name (default "player_id")
add_rolling_stats <- function(df, stat_cols, windows = c(5, 10), group_col = "player_id") {
df <- df %>% arrange(.data[[group_col]], date)
for (w in windows) {
for (col in stat_cols) {
new_col <- paste0(col, "_MA", w)
df <- df %>%
group_by(.data[[group_col]]) %>%
mutate(!!new_col := zoo::rollmean(.data[[col]], k = w, fill = NA, align = "right")) %>%
ungroup()
}
}
df
}
# Lag features for modeling.
# Sorts by `group_col` and then by the `date` column, and for every lag l and
# stat column adds "<col>_lag<l>": the value from l rows earlier within the
# same group (NA where no such row exists).
#   df        data frame; must contain a `date` column
#   stat_cols character vector of columns to lag
#   lags      lag distances in rows (default 1, 2, 3)
#   group_col grouping column name (default "player_id")
create_lag_features <- function(df, stat_cols, lags = 1:3, group_col = "player_id") {
df <- df %>% arrange(.data[[group_col]], date)
for (l in lags) {
for (col in stat_cols) {
new_col <- paste0(col, "_lag", l)
df <- df %>%
group_by(.data[[group_col]]) %>%
mutate(!!new_col := lag(.data[[col]], l)) %>%
ungroup()
}
}
df
}
# Pivot stats from long to wide.
# Thin wrapper around tidyr::pivot_wider: the values of `stat_col` become
# column names, filled from `value_col`, with `id_cols` identifying each row.
#   df        long-format data frame
#   stat_col  name of the column holding the stat names
#   value_col name of the column holding the stat values
#   id_cols   character vector of identifier columns
pivot_stats_wide <- function(df, stat_col, value_col, id_cols) {
df %>%
pivot_wider(
id_cols = all_of(id_cols),
names_from = all_of(stat_col),
values_from = all_of(value_col)
)
}
# Calculate year-over-year change.
# Sorts by `group_col` and `year_col`, then for each stat column adds
# "<col>_yoy_change": the value minus the previous row's value within the
# same group. The first row of each group is NA.
# Note: the comparison is with the previous *row*, so a gap in seasons is
# not detected.
#   df        data frame with one row per group and season
#   stat_cols character vector of columns to difference
#   year_col  season column name (default "season")
#   group_col grouping column name (default "player_id")
yoy_change <- function(df, stat_cols, year_col = "season", group_col = "player_id") {
df %>%
arrange(.data[[group_col]], .data[[year_col]]) %>%
group_by(.data[[group_col]]) %>%
mutate(across(all_of(stat_cols),
~ . - lag(.),
.names = "{col}_yoy_change")) %>%
ungroup()
}
Common Sports SQL Queries
Useful SQL queries for sports databases.
-- Common SQL queries for sports analytics
-- Get player career stats with rankings.
-- One output row per (player_id, player_name). Only players with at least
-- 100 rows in player_game_stats are kept; points_rank ranks the remaining
-- players by total points, highest first.
-- NOTE(review): ppg is the average over rows, which is per-game only if the
-- table holds one row per player per game - confirm.
SELECT
player_name,
SUM(points) as career_points,
AVG(points) as ppg,
COUNT(DISTINCT season) as seasons,
RANK() OVER (ORDER BY SUM(points) DESC) as points_rank
FROM player_game_stats
GROUP BY player_id, player_name
HAVING COUNT(*) >= 100
ORDER BY career_points DESC;
-- Calculate team win percentage by month.
-- One row per team and calendar month: games played, wins, and win
-- percentage rounded to three decimals. `won` is used as a boolean.
-- NOTE(review): DATE_TRUNC('month', ...) is PostgreSQL-style syntax; other
-- engines may need a different month-bucketing expression.
SELECT
team,
DATE_TRUNC('month', game_date) as month,
COUNT(*) as games,
SUM(CASE WHEN won THEN 1 ELSE 0 END) as wins,
ROUND(AVG(CASE WHEN won THEN 1.0 ELSE 0.0 END), 3) as win_pct
FROM games
GROUP BY team, DATE_TRUNC('month', game_date)
ORDER BY team, month;
-- Find players with hot streaks: 5+ consecutive games scoring more than
-- 20% above the player's own average (fantasy_points > avg_pts * 1.2).
-- player_avg: each player's average fantasy points over all of daily_stats.
WITH player_avg AS (
SELECT player_id, AVG(fantasy_points) as avg_pts
FROM daily_stats
GROUP BY player_id
),
-- streaks: flags each game as hot or not. streak_group is a running count of
-- the player's non-hot games in date order, so consecutive hot games share
-- one streak_group value (gaps-and-islands grouping).
streaks AS (
SELECT
d.player_id,
d.game_date,
d.fantasy_points,
d.fantasy_points > p.avg_pts * 1.2 as hot,
SUM(CASE WHEN d.fantasy_points > p.avg_pts * 1.2 THEN 0 ELSE 1 END)
OVER (PARTITION BY d.player_id ORDER BY d.game_date) as streak_group
FROM daily_stats d
JOIN player_avg p ON d.player_id = p.player_id
)
-- Keep only hot games, collapse each run into one row, and report runs of
-- at least five games with their first date and length.
SELECT player_id, MIN(game_date) as streak_start, COUNT(*) as streak_length
FROM streaks
WHERE hot
GROUP BY player_id, streak_group
HAVING COUNT(*) >= 5;
-- Head-to-head record between teams for the 2024 season.
-- Grouped by the ordered (home_team, away_team) pair, so "A hosting B" and
-- "B hosting A" appear as two separate rows. Games with equal scores count
-- toward `games` but toward neither win column.
SELECT
home_team,
away_team,
COUNT(*) as games,
SUM(CASE WHEN home_score > away_score THEN 1 ELSE 0 END) as home_wins,
SUM(CASE WHEN away_score > home_score THEN 1 ELSE 0 END) as away_wins,
AVG(home_score + away_score) as avg_total
FROM games
WHERE season = 2024
GROUP BY home_team, away_team;
Unit Conversion Utilities
Convert between different units commonly used in sports.
"""Sports unit conversion utilities."""
# Speed conversions
def mph_to_kph(mph: float) -> float:
    """Convert a speed in miles per hour to kilometers per hour."""
    km_per_mile = 1.60934
    return mph * km_per_mile
def kph_to_mph(kph: float) -> float:
    """Convert a speed in kilometers per hour to miles per hour."""
    km_per_mile = 1.60934
    return kph / km_per_mile
# Distance conversions
def feet_to_meters(feet: float) -> float:
    """Convert a length in feet to meters."""
    meters_per_foot = 0.3048
    return feet * meters_per_foot
def meters_to_feet(meters: float) -> float:
    """Convert a length in meters to feet."""
    meters_per_foot = 0.3048
    return meters / meters_per_foot
def yards_to_meters(yards: float) -> float:
    """Convert a length in yards to meters."""
    meters_per_yard = 0.9144
    return yards * meters_per_yard
# Weight conversions
def lbs_to_kg(lbs: float) -> float:
    """Convert a weight in pounds to kilograms."""
    kg_per_pound = 0.453592
    return lbs * kg_per_pound
def kg_to_lbs(kg: float) -> float:
    """Convert a weight in kilograms to pounds."""
    kg_per_pound = 0.453592
    return kg / kg_per_pound
# Height conversions
def inches_to_cm(inches: float) -> float:
    """Convert a length in inches to centimeters."""
    cm_per_inch = 2.54
    return inches * cm_per_inch
def height_string_to_inches(height: str) -> int:
    """Convert a feet/inches height string to total inches.

    Accepted forms are ``<feet>-<inches>`` and ``<feet>'<inches>``, with
    optional whitespace around each part, for example ``6-2``, ``6'2"`` and
    ``6' 2"``. Anything after the inches digits is ignored.

    Args:
        height: Height text to parse.

    Returns:
        Total inches, or 0 when ``height`` is not a string or does not match
        either form.
    """
    import re

    # Missing values (None, NaN) are treated like any other unparseable input.
    if not isinstance(height, str):
        return 0
    match = re.match(r"\s*(\d+)\s*['\-]\s*(\d+)", height)
    if match is None:
        return 0
    feet, inches = (int(part) for part in match.groups())
    return feet * 12 + inches
# Time conversions
def min_sec_to_decimal(minutes: int, seconds: int) -> float:
    """Express a minutes-and-seconds duration as decimal minutes."""
    fraction_of_minute = seconds / 60
    return minutes + fraction_of_minute
def pace_to_speed(pace_min_per_mile: float) -> float:
    """Convert a pace in minutes per mile to a speed in miles per hour."""
    minutes_per_hour = 60
    return minutes_per_hour / pace_min_per_mile
Date and Schedule Utilities
Utility functions for working with sports schedules and dates.
"""Sports schedule and date utilities."""
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Tuple
def get_week_of_season(date: datetime, season_start: datetime) -> int:
    """Return the 1-based season week containing ``date``.

    Week 1 covers the first seven days starting at ``season_start``; dates
    before the start yield zero or negative week numbers.
    """
    elapsed = date - season_start
    completed_weeks = elapsed.days // 7
    return completed_weeks + 1
def parse_game_time(time_str: str, timezone: str = "ET") -> datetime:
    """Parse a clock-time string into a timezone-aware datetime.

    Tries the formats ``%I:%M %p``, ``%H:%M`` and ``%I:%M%p`` in that order.
    Only the time is parsed, so the date part is ``strptime``'s default.

    Args:
        time_str: Time text such as ``"7:30 PM"`` or ``"19:30"``.
        timezone: One of ``ET``, ``CT``, ``MT``, ``PT``; any other value is
            treated as ``ET``.

    Raises:
        ValueError: If none of the formats match.
    """
    import pytz

    zone_names = {
        "ET": "America/New_York",
        "CT": "America/Chicago",
        "MT": "America/Denver",
        "PT": "America/Los_Angeles",
    }
    zone_name = zone_names.get(timezone, "America/New_York")

    # Try each known format; the first one that parses wins.
    for candidate_format in ("%I:%M %p", "%H:%M", "%I:%M%p"):
        try:
            naive_time = datetime.strptime(time_str, candidate_format)
            return pytz.timezone(zone_name).localize(naive_time)
        except ValueError:
            continue
    raise ValueError(f"Cannot parse time: {time_str}")
def calculate_rest_days(schedule: pd.DataFrame, team_col: str,
                        date_col: str) -> pd.DataFrame:
    """Add a ``rest_days`` column: days since each team's previous game.

    Returns a copy of ``schedule`` sorted by ``date_col``. ``rest_days`` is
    the whole-day difference to the same team's previous row, and NaN for
    each team's first game.
    """
    ordered = schedule.sort_values(date_col)
    gaps = ordered.groupby(team_col)[date_col].diff()
    ordered["rest_days"] = gaps.dt.days
    return ordered
def find_back_to_backs(schedule: pd.DataFrame, team: str) -> pd.DataFrame:
    """Return a team's games played exactly one day after its previous game.

    Args:
        schedule: Games with ``home_team``, ``away_team`` and a datetime
            ``date`` column.
        team: Team to look up on either side of the matchup.

    Returns:
        The second game of every back-to-back pair, with ``is_b2b`` True.
    """
    involves_team = (schedule["home_team"] == team) | (schedule["away_team"] == team)
    team_games = schedule[involves_team].sort_values("date")
    days_since_last = team_games["date"].diff().dt.days
    team_games["is_b2b"] = days_since_last == 1
    return team_games[team_games["is_b2b"]]
def generate_playoff_bracket(teams: List[str], format: str = "single") -> dict:
    """Generate a seeded playoff bracket structure.

    The first round pairs seeds from the outside in (first vs last, second vs
    second-to-last, ...). Later rounds hold ``"Winner G<n>"`` placeholders,
    where games are numbered consecutively across the whole bracket, so every
    placeholder refers to exactly one earlier game.

    Args:
        teams: Team names in seed order.
        format: Label copied into the result; it does not change the layout.

    Returns:
        ``{"rounds": [...], "format": format}`` where each round is a list of
        two-element matchups.

    Note:
        No byes are generated: an odd entrant in any round is left unpaired,
        so the layout is only complete for power-of-two team counts.
    """
    n_teams = len(teams)
    current_round = [[teams[i], teams[n_teams - 1 - i]] for i in range(n_teams // 2)]
    rounds = [current_round]

    first_game = 1  # bracket-wide number of the first game in current_round
    while len(current_round) > 1:
        next_round = [
            [f"Winner G{first_game + 2 * i}", f"Winner G{first_game + 2 * i + 1}"]
            for i in range(len(current_round) // 2)
        ]
        first_game += len(current_round)
        rounds.append(next_round)
        current_round = next_round

    return {"rounds": rounds, "format": format}
Volleyball Rally Analysis
Analyze volleyball rally patterns and point sequences.
"""Volleyball rally analysis."""
import pandas as pd
import numpy as np
class VolleyballAnalysis:
    """Stateless helpers for volleyball match and rally statistics."""

    @staticmethod
    def sideout_percentage(points_on_receive: int, opponent_serves: int) -> float:
        """Return points won on serve receive divided by opponent serves.

        Returns 0 when the opponent never served.
        """
        if opponent_serves == 0:
            return 0
        return points_on_receive / opponent_serves

    @staticmethod
    def kill_percentage(kills: int, errors: int, attempts: int) -> float:
        """Return kills divided by attack attempts.

        Unlike :meth:`efficiency`, errors are not subtracted. ``errors`` is
        kept in the signature for compatibility and is not used. Returns 0
        when there were no attempts.
        """
        if attempts == 0:
            return 0
        return kills / attempts

    @staticmethod
    def efficiency(kills: int, errors: int, total_attempts: int) -> float:
        """Return hitting efficiency: (kills - errors) / total attempts.

        Returns 0 when there were no attempts.
        """
        if total_attempts == 0:
            return 0
        return (kills - errors) / total_attempts

    @staticmethod
    def passing_rating(passes: pd.DataFrame) -> float:
        """Return the mean pass quality on a 0-3 scale.

        Args:
            passes: Table with a ``quality`` column holding ``"perfect"``
                (3), ``"good"`` (2), ``"ok"`` (1) or ``"error"`` (0). Other
                values map to NaN and are skipped by the mean.

        The input frame is not modified.
        """
        weights = {"perfect": 3, "good": 2, "ok": 1, "error": 0}
        return passes["quality"].map(weights).mean()

    @staticmethod
    def serve_receive_analysis(rallies: pd.DataFrame) -> dict:
        """Summarize rallies flagged as serve receive.

        Args:
            rallies: Table with boolean ``serve_receive``, ``point_won`` and
                a numeric ``touches`` column.

        Returns:
            Dict with the sideout rate, the mean touches per serve-receive
            rally, and the win rate of serve-receive rallies that had exactly
            three touches.
        """
        on_receive = rallies[rallies["serve_receive"]]
        three_touch = rallies[rallies["serve_receive"] & (rallies["touches"] == 3)]
        return {
            "sideout_pct": on_receive["point_won"].mean(),
            "avg_rally_length": on_receive["touches"].mean(),
            "first_ball_kill_pct": three_touch["point_won"].mean(),
        }
Player Archetype Clustering
Cluster players into archetypes based on statistical profiles.
"""Cluster players into archetypes."""
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
class PlayerArchetypes:
    """Group players into archetypes with k-means on standardized stats.

    Missing stat values are replaced with 0 before scaling. A two-component
    PCA is fitted alongside the clustering for plotting.
    """

    def __init__(self, n_clusters: int = 8):
        self.n_clusters = n_clusters
        self.scaler = StandardScaler()
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.pca = PCA(n_components=2)

    def _scaled_features(self, df: pd.DataFrame, stat_cols: list):
        """Return the stat columns (NaN as 0) transformed by the fitted scaler."""
        return self.scaler.transform(df[stat_cols].fillna(0))

    def fit(self, df: pd.DataFrame, stat_cols: list) -> "PlayerArchetypes":
        """Fit the scaler, the k-means model and the PCA; returns ``self``."""
        features = df[stat_cols].fillna(0)
        scaled = self.scaler.fit_transform(features)
        self.kmeans.fit(scaled)
        self.pca.fit(scaled)
        return self

    def assign_archetypes(self, df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
        """Return a copy of ``df`` with an ``archetype`` cluster-label column."""
        labelled = df.copy()
        labelled["archetype"] = self.kmeans.predict(
            self._scaled_features(labelled, stat_cols)
        )
        return labelled

    def describe_archetypes(self, df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
        """Return the mean of each stat per ``archetype`` value."""
        by_archetype = df.groupby("archetype")
        return by_archetype[stat_cols].mean()

    def plot_archetypes(self, df: pd.DataFrame, stat_cols: list):
        """Scatter players on the two principal components, colored by archetype.

        ``df`` must already contain the ``archetype`` column. Returns the
        ``matplotlib.pyplot`` module so the caller can show or save the figure.
        """
        components = self.pca.transform(self._scaled_features(df, stat_cols))
        plt.figure(figsize=(10, 8))
        points = plt.scatter(components[:, 0], components[:, 1],
                             c=df["archetype"], cmap="tab10", alpha=0.6)
        plt.colorbar(points, label="Archetype")
        plt.xlabel("PC1")
        plt.ylabel("PC2")
        plt.title("Player Archetypes")
        return plt
# Usage
# stat_cols = ["pts", "reb", "ast", "stl", "blk"]
# archetypes = PlayerArchetypes(n_clusters=6)
# archetypes.fit(players, stat_cols)
# players = archetypes.assign_archetypes(players, stat_cols)
Player Performance Trends
Analyze player performance trends over time.
"""Analyze player performance trends over time."""
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression
def calculate_rolling_stats(df: pd.DataFrame, stat_col: str,
                            windows: list = None) -> pd.DataFrame:
    """Add rolling mean and standard deviation columns for one statistic.

    Args:
        df: Source data in the order the windows should roll over.
        stat_col: Column to summarize.
        windows: Window sizes in rows; defaults to ``[5, 10, 20]``.

    Returns:
        A copy of ``df`` with ``<stat_col>_MA<w>`` and ``<stat_col>_STD<w>``
        for every window ``w``. Windows start producing values from the first
        row (``min_periods=1``), so the first STD value is NaN.
    """
    # None sentinel instead of a mutable list default shared across calls.
    if windows is None:
        windows = [5, 10, 20]

    df = df.copy()
    for w in windows:
        rolling_window = df[stat_col].rolling(w, min_periods=1)
        df[f"{stat_col}_MA{w}"] = rolling_window.mean()
        df[f"{stat_col}_STD{w}"] = rolling_window.std()
    return df
def detect_trend(values: np.ndarray) -> dict:
    """Classify a series as increasing, decreasing or stable.

    Fits a linear regression of the values against their position index and
    computes Kendall's tau between position and value.

    Args:
        values: One-dimensional sequence of observations in time order.

    Returns:
        Dict with the regression ``slope`` and ``r_squared``, the
        ``kendall_tau`` statistic and its ``p_value``, and ``trend``: the sign
        of the slope when the Kendall p-value is below 0.05, otherwise
        ``"stable"``.
    """
    positions = np.arange(len(values)).reshape(-1, 1)
    y = values

    model = LinearRegression()
    model.fit(positions, y)
    slope = model.coef_[0]
    r_squared = model.score(positions, y)

    # Rank correlation of value against time order (Mann-Kendall style test).
    tau, p_value = stats.kendalltau(positions.flatten(), y)

    if slope > 0 and p_value < 0.05:
        trend = "increasing"
    elif slope < 0 and p_value < 0.05:
        trend = "decreasing"
    else:
        trend = "stable"

    return {
        "slope": slope,
        "r_squared": r_squared,
        "kendall_tau": tau,
        "p_value": p_value,
        "trend": trend,
    }
def identify_hot_cold_streaks(df: pd.DataFrame, stat_col: str,
                              threshold: float = 1.5) -> pd.DataFrame:
    """Label each row as hot, cold or normal by z-score.

    Args:
        df: Source data.
        stat_col: Column to score.
        threshold: Absolute z-score beyond which a row is hot or cold.

    Returns:
        A copy of ``df`` with ``z_score`` (relative to the column's mean and
        sample standard deviation) and ``streak_type`` columns.
    """
    labelled = df.copy()
    column = labelled[stat_col]
    center = column.mean()
    spread = column.std()
    labelled["z_score"] = (labelled[stat_col] - center) / spread

    z = labelled["z_score"]
    cold_or_normal = np.where(z < -threshold, "cold", "normal")
    labelled["streak_type"] = np.where(z > threshold, "hot", cold_or_normal)
    return labelled
Seasonal Decomposition
Decompose sports statistics into trend, seasonal, and residual components.
"""Seasonal decomposition for sports analytics."""
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
def decompose_season_stats(df: pd.DataFrame, stat_col: str,
                           period: int = 7) -> dict:
    """Decompose a daily statistic into trend, seasonal and residual parts.

    Args:
        df: Data indexed by a ``DatetimeIndex``, or containing a ``date``
            column that is used as the index.
        stat_col: Numeric column to decompose.
        period: Length of the seasonal cycle in days.

    Returns:
        Dict with the ``trend``, ``seasonal``, ``residual`` and ``observed``
        series from an additive decomposition.
    """
    # Ensure datetime index.
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.set_index("date")

    # Resample only the requested column: averaging the whole frame raises on
    # non-numeric columns such as player or team names. Days with several rows
    # are averaged and missing days are filled by interpolation.
    daily = df[stat_col].resample("D").mean().interpolate()

    decomposition = seasonal_decompose(daily, model="additive", period=period)
    return {
        "trend": decomposition.trend,
        "seasonal": decomposition.seasonal,
        "residual": decomposition.resid,
        "observed": decomposition.observed,
    }
def test_stationarity(series: pd.Series) -> dict:
    """Run the Augmented Dickey-Fuller test on a series.

    Missing values are dropped first. The series is reported as stationary
    when the test's p-value is below 0.05.
    """
    adf_result = adfuller(series.dropna())
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    critical_values = adf_result[4]
    return {
        "adf_statistic": adf_statistic,
        "p_value": p_value,
        "is_stationary": p_value < 0.05,
        "critical_values": critical_values,
    }
def forecast_stat(df: pd.DataFrame, stat_col: str,
                  periods: int = 10, seasonal_periods: int = 7) -> pd.DataFrame:
    """Forecast a statistic with additive Holt-Winters exponential smoothing.

    Args:
        df: Historical data in time order.
        stat_col: Column to forecast.
        periods: Number of future steps to forecast.
        seasonal_periods: Length of the seasonal cycle in observations.

    Returns:
        DataFrame with ``forecast``, ``lower`` and ``upper`` columns. The band
        is an approximate 95% interval of constant width: the forecast plus or
        minus 1.96 times the in-sample root-mean-square error.
    """
    from statsmodels.tsa.holtwinters import ExponentialSmoothing

    history = df[stat_col]
    model = ExponentialSmoothing(history, trend="add", seasonal="add",
                                 seasonal_periods=seasonal_periods)
    fitted = model.fit()
    forecast = fitted.forecast(periods)

    # fitted.sse is a *sum* of squared errors; convert it to an RMSE before
    # using it as the spread of the interval.
    residual_std = np.sqrt(fitted.sse / len(history))
    margin = 1.96 * residual_std

    return pd.DataFrame({
        "forecast": forecast,
        "lower": forecast - margin,
        "upper": forecast + margin,
    })
Handle Missing Sports Data
Strategies for handling missing values in sports datasets.
"""Handle missing data in sports datasets."""
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
def analyze_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing values per column.

    Returns:
        DataFrame indexed by column name with ``Missing`` (null count) and
        ``Percent`` (share of rows that are null, 0-100), sorted with the
        most incomplete columns first.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        "Missing": null_counts,
        "Percent": (null_counts / len(df)) * 100,
    })
    return summary.sort_values("Percent", ascending=False)
def impute_stats(df: pd.DataFrame, method: str = "mean") -> pd.DataFrame:
    """Impute missing values in the numeric columns of ``df``.

    Args:
        df: Source data; it is not modified.
        method: ``"mean"`` or ``"median"`` (column-wise fill) or ``"knn"``
            (5-nearest-neighbours imputation).

    Returns:
        A copy of ``df`` with numeric columns imputed. Non-numeric columns
        are returned unchanged.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ("mean", "median", "knn"):
        raise ValueError(f"Unknown method: {method}")

    # Work on a copy so the caller's frame is left untouched, matching the
    # other helpers in this snippet.
    df = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return df  # nothing to impute

    if method == "knn":
        imputer = KNNImputer(n_neighbors=5)
    else:
        imputer = SimpleImputer(strategy=method)

    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    return df
def fill_with_career_avg(df: pd.DataFrame, player_col: str, stat_cols: list) -> pd.DataFrame:
    """Fill missing stats with the same player's mean for that stat.

    Returns a copy of ``df``; values stay missing for a player whose stat is
    missing in every row.
    """
    filled = df.copy()
    for stat in stat_cols:
        player_mean = filled.groupby(player_col)[stat].transform("mean")
        filled[stat] = filled[stat].fillna(player_mean)
    return filled
# Example
# df = analyze_missing(player_stats)
# df = impute_stats(player_stats, method="knn")
Merge Multiple Data Sources
Combine data from multiple sources with proper matching.
"""Merge sports data from multiple sources."""
import pandas as pd
from fuzzywuzzy import fuzz, process
from typing import Tuple
def fuzzy_merge(df1: pd.DataFrame, df2: pd.DataFrame,
                key1: str, key2: str, threshold: int = 85) -> pd.DataFrame:
    """Merge two DataFrames on approximately matching name columns.

    Each distinct ``df1[key1]`` value is paired with its best-scoring
    ``df2[key2]`` value, provided the score reaches ``threshold``.

    Args:
        df1: Left table.
        df2: Right table.
        key1: Name column in ``df1``.
        key2: Name column in ``df2``.
        threshold: Minimum match score for a pair to be accepted.

    Returns:
        Inner merge of the matched rows, including the helper columns
        ``name1``, ``name2`` and ``score``. Empty when nothing matches.
    """
    candidates = df2[key2].unique()  # loop-invariant, computed once
    matches = []
    for name in df1[key1].unique():
        match = process.extractOne(name, candidates, score_cutoff=threshold)
        if match:
            matches.append({"name1": name, "name2": match[0], "score": match[1]})

    # Explicit columns keep the merge keys present even with zero matches.
    match_df = pd.DataFrame(matches, columns=["name1", "name2", "score"])
    df1_matched = df1.merge(match_df, left_on=key1, right_on="name1")
    return df1_matched.merge(df2, left_on="name2", right_on=key2)
def standardize_team_names(df: pd.DataFrame, team_col: str,
                           mapping: dict) -> pd.DataFrame:
    """Replace team names via ``mapping``, keeping unmapped names as they are.

    Returns a copy of ``df``; the input is not modified.
    """
    standardized = df.copy()
    raw_names = standardized[team_col]
    standardized[team_col] = raw_names.map(mapping).fillna(raw_names)
    return standardized
def merge_with_validation(df1: pd.DataFrame, df2: pd.DataFrame,
                          on: list) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Outer-merge two tables and separate matched rows from unmatched ones.

    Returns:
        ``(matched, unmatched)``. ``matched`` holds rows present in both
        inputs. ``unmatched`` holds the rest and keeps the ``_merge``
        indicator column showing which side each row came from.
    """
    combined = df1.merge(df2, on=on, how="outer", indicator=True)
    in_both = combined["_merge"] == "both"
    matched = combined[in_both].drop(columns="_merge")
    unmatched = combined[~in_both]
    return matched, unmatched
Normalize Player Statistics
Normalize and standardize player statistics for comparison.
"""Normalize sports statistics for analysis."""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def per_game_normalize(df: pd.DataFrame, stat_cols: list, games_col: str = "G") -> pd.DataFrame:
    """Add ``<stat>_per_game`` columns: each counting stat divided by games.

    Returns a new DataFrame; the input is not modified.
    """
    per_game_columns = {
        f"{stat}_per_game": df[stat] / df[games_col]
        for stat in stat_cols
    }
    return df.assign(**per_game_columns)
def per_100_possessions(df: pd.DataFrame, stats: list, poss_col: str) -> pd.DataFrame:
    """Add ``<stat>_per100`` columns: each stat scaled to 100 possessions.

    Returns a copy of ``df``; the input is not modified.
    """
    scaled = df.copy()
    for stat in stats:
        per_possession = scaled[stat] / scaled[poss_col]
        scaled[f"{stat}_per100"] = per_possession * 100
    return scaled
def z_score_normalize(df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
    """Standardize the given columns with scikit-learn's ``StandardScaler``.

    Returns a copy of ``df`` in which ``stat_cols`` are replaced by their
    scaled values; other columns are unchanged.
    """
    normalized = df.copy()
    normalized[stat_cols] = StandardScaler().fit_transform(normalized[stat_cols])
    return normalized
def percentile_rank(df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
    """Add ``<stat>_pctl`` columns holding each value's percentile rank (0-100).

    Returns a copy of ``df``; the input is not modified.
    """
    ranked = df.copy()
    for stat in stat_cols:
        fractional_rank = ranked[stat].rank(pct=True)
        ranked[f"{stat}_pctl"] = fractional_rank * 100
    return ranked
Monte Carlo Game Simulator
Simulate game outcomes using Monte Carlo methods.
"""Monte Carlo simulation for game outcomes."""
import numpy as np
import pandas as pd
from typing import Tuple, Dict
class GameSimulator:
    """Monte Carlo simulator that draws game scores from normal distributions.

    Args:
        n_sims: Number of random draws per simulated game.
    """

    def __init__(self, n_sims: int = 10000):
        self.n_sims = n_sims

    def simulate_game(self, home_mean: float, away_mean: float,
                      home_std: float = None, away_std: float = None) -> Dict:
        """Simulate one matchup ``n_sims`` times.

        Args:
            home_mean: Expected home score (or rating).
            away_mean: Expected away score (or rating).
            home_std: Home score spread; defaults to 30% of ``|home_mean|``.
            away_std: Away score spread; defaults to 30% of ``|away_mean|``.

        Returns:
            Dict with home/away win probabilities, the tie probability, and
            the average home, away and combined scores.
        """
        # abs() keeps the default spread valid for zero-centred ratings, where
        # a negative mean would otherwise yield a negative (invalid) scale.
        if home_std is None:
            home_std = abs(home_mean) * 0.3
        if away_std is None:
            away_std = abs(away_mean) * 0.3

        home_scores = np.random.normal(home_mean, home_std, self.n_sims)
        away_scores = np.random.normal(away_mean, away_std, self.n_sims)

        home_wins = (home_scores > away_scores).sum()
        ties = (home_scores == away_scores).sum()
        away_wins = self.n_sims - home_wins - ties

        return {
            "home_win_prob": home_wins / self.n_sims,
            "away_win_prob": away_wins / self.n_sims,
            "tie_prob": ties / self.n_sims,
            "avg_home_score": home_scores.mean(),
            "avg_away_score": away_scores.mean(),
            "avg_total": (home_scores + away_scores).mean(),
        }

    def simulate_season(self, schedule: pd.DataFrame,
                        team_ratings: Dict[str, float]) -> pd.DataFrame:
        """Simulate one pass through a schedule and tally wins and losses.

        Args:
            schedule: One row per game with ``home`` and ``away`` columns.
            team_ratings: Rating per team; every scheduled team must be
                present. The home side gets a flat +3 bonus.

        Returns:
            DataFrame indexed by team with ``W`` and ``L`` columns.
        """
        results = {team: {"W": 0, "L": 0} for team in team_ratings}

        for _, game in schedule.iterrows():
            home, away = game["home"], game["away"]
            result = self.simulate_game(
                team_ratings[home] + 3,  # home advantage
                team_ratings[away],
            )
            # Decide this game with a single draw against the estimated
            # home win probability.
            if np.random.random() < result["home_win_prob"]:
                winner, loser = home, away
            else:
                winner, loser = away, home
            results[winner]["W"] += 1
            results[loser]["L"] += 1

        return pd.DataFrame(results).T
Bootstrap Confidence Intervals
Calculate confidence intervals for sports statistics using bootstrap.
"""Bootstrap confidence intervals for sports statistics."""
import numpy as np
import pandas as pd
from typing import Tuple, Callable
def bootstrap_ci(data: np.ndarray, stat_func: Callable = np.mean,
                 n_bootstrap: int = 10000, ci: float = 0.95) -> Tuple[float, float]:
    """Estimate a percentile bootstrap confidence interval for a statistic.

    Args:
        data: One-dimensional sample.
        stat_func: Statistic evaluated on each resample.
        n_bootstrap: Number of resamples drawn with replacement.
        ci: Confidence level, e.g. 0.95.

    Returns:
        ``(lower, upper)`` percentile bounds of the resampled statistic.
    """
    sample_size = len(data)
    resampled_stats = [
        stat_func(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(n_bootstrap)
    ]
    tail = (1 - ci) / 2
    lower = np.percentile(resampled_stats, tail * 100)
    upper = np.percentile(resampled_stats, (1 - tail) * 100)
    return lower, upper
def bootstrap_player_stat(df: pd.DataFrame, player: str, stat: str,
                          ci: float = 0.95) -> dict:
    """Bootstrap a confidence interval for one player's mean statistic.

    Args:
        df: Game logs with a ``player`` column and the ``stat`` column.
        player: Value of ``player`` to select.
        stat: Column to summarize.
        ci: Confidence level passed to ``bootstrap_ci``.

    Returns:
        ``{"error": "Insufficient data"}`` when the player has fewer than 10
        rows; otherwise a dict with the sample mean, the interval bounds and
        the sample size.
    """
    # Return type is the builtin ``dict``: this snippet imports only Tuple
    # and Callable from typing.
    player_data = df[df["player"] == player][stat].values
    if len(player_data) < 10:
        return {"error": "Insufficient data"}

    lower, upper = bootstrap_ci(player_data, np.mean, ci=ci)
    return {
        "player": player,
        "stat": stat,
        "mean": player_data.mean(),
        "ci_lower": lower,
        "ci_upper": upper,
        "sample_size": len(player_data),
    }
# Example: Calculate 95% CI for batting average
# result = bootstrap_player_stat(batting_logs, "Mike Trout", "AVG")
Correlation Analysis for Stacking
Analyze player correlations for DFS game stacking strategies.
"""DFS Correlation Analysis for Game Stacking."""
import pandas as pd
import numpy as np
from scipy import stats
from typing import Dict, List, Tuple
import itertools
class CorrelationAnalyzer:
"""
Analyze fantasy point correlations for stacking strategies.
Key correlations:
- QB-WR/TE (passing game)
- RB-DEF (game script)
- Bring-back (opposing players)
"""
# Historical correlation estimates by position pair
NFL_CORRELATIONS = {
("QB", "WR"): 0.35,
("QB", "TE"): 0.30,
("QB", "RB"): 0.15,
("WR", "WR"): 0.05,
("RB", "DST"): -0.10,
("QB", "opp_WR"): 0.10, # Bring-back
("QB", "opp_RB"): 0.05,
}
NBA_CORRELATIONS = {
("PG", "C"): 0.20, # Pick and roll
("SG", "SG"): -0.15, # Same position, compete for shots
("PF", "C"): 0.10,
}
def __init__(self, sport: str = "nfl"):
self.sport = sport
self.correlations = self.NFL_CORRELATIONS if sport == "nfl" else self.NBA_CORRELATIONS
def calculate_correlation(
self,
player1_scores: pd.Series,
player2_scores: pd.Series
) -> Dict:
"""Calculate correlation between two players' fantasy scores."""
# Align data
combined = pd.DataFrame({
"p1": player1_scores,
"p2": player2_scores
}).dropna()
if len(combined) < 5:
return {"correlation": np.nan, "p_value": np.nan, "sample_size": len(combined)}
corr, p_value = stats.pearsonr(combined["p1"], combined["p2"])
return {
"correlation": corr,
"p_value": p_value,
"sample_size": len(combined),
"significant": p_value < 0.05
}
def build_correlation_matrix(
self,
game_logs: pd.DataFrame,
players: List[str]
) -> pd.DataFrame:
"""Build correlation matrix for a set of players."""
# Pivot to get scores by game
pivot = game_logs.pivot_table(
index="game_id",
columns="player_name",
values="fantasy_points",
aggfunc="first"
)
# Filter to specified players
available = [p for p in players if p in pivot.columns]
pivot = pivot[available]
return pivot.corr()
def get_stack_correlation(
self,
qb: str,
pass_catchers: List[str],
player_positions: Dict[str, str]
) -> float:
"""
Calculate expected correlation for a stack.
Args:
qb: Quarterback name
pass_catchers: List of WR/TE names
player_positions: Dict mapping player names to positions
"""
total_corr = 0
for player in pass_catchers:
pos = player_positions.get(player, "WR")
corr = self.correlations.get(("QB", pos), 0.25)
total_corr += corr
# Average correlation
return total_corr / len(pass_catchers) if pass_catchers else 0
def find_optimal_stacks(
self,
players: pd.DataFrame,
correlation_matrix: pd.DataFrame = None,
stack_size: int = 3
) -> pd.DataFrame:
"""
Find optimal player stacks based on correlation and projection.
Args:
players: Player pool with projections
correlation_matrix: Pre-calculated correlations
stack_size: Number of players in stack
"""
stacks = []
# Group by team
teams = players.groupby("Team")
for team, team_players in teams:
if len(team_players) < stack_size:
continue
# Get QBs
qbs = team_players[team_players["Position"] == "QB"]
pass_catchers = team_players[team_players["Position"].isin(["WR", "TE"])]
if len(qbs) == 0:
continue
qb = qbs.iloc[0]
# Generate combinations of pass catchers
for combo in itertools.combinations(pass_catchers.index, min(stack_size - 1, len(pass_catchers))):
stack_players = [qb.name] + list(combo)
stack_df = players.loc[stack_players]
# Calculate stack metrics
total_projection = stack_df["Projection"].sum()
total_salary = stack_df["Salary"].sum()
avg_ownership = stack_df["Ownership"].mean()
# Estimate correlation boost
pos_dict = dict(zip(stack_df["Name"], stack_df["Position"]))
corr = self.get_stack_correlation(
qb["Name"],
[players.loc[i, "Name"] for i in combo],
pos_dict
)
# Stack ceiling (projection * (1 + correlation factor))
ceiling_boost = 1 + corr * 0.5
stack_ceiling = total_projection * ceiling_boost
stacks.append({
"Team": team,
"Players": ", ".join(stack_df["Name"].tolist()),
"Positions": ", ".join(stack_df["Position"].tolist()),
"Projection": total_projection,
"Salary": total_salary,
"Avg_Ownership": avg_ownership,
"Correlation": corr,
"Stack_Ceiling": stack_ceiling,
"Value": stack_ceiling / total_salary * 1000
})
return pd.DataFrame(stacks).sort_values("Stack_Ceiling", ascending=False)
def analyze_bring_back(
self,
primary_stack: List[str],
opponent_players: pd.DataFrame,
game_total: float = 48
) -> pd.DataFrame:
"""
Find optimal bring-back (opposing) players for a stack.
Higher game totals favor bring-backs.
"""
# Bring-back correlation increases with game total
base_corr = 0.08
game_factor = (game_total - 40) / 20 # Normalized around 50 total
adjusted_corr = base_corr * (1 + game_factor)
opponent_players = opponent_players.copy()
opponent_players["Bring_Back_Value"] = (
opponent_players["Projection"] *
(1 + adjusted_corr) /
opponent_players["Ownership"].clip(0.05)
)
return opponent_players.sort_values("Bring_Back_Value", ascending=False)
def simulate_stack_outcomes(
    self,
    stack_projection: float,
    stack_correlation: float,
    individual_std: float = 8,
    n_simulations: int = 10000,
    seed: int = None
) -> Dict:
    """
    Simulate stack outcomes to understand ceiling/floor.

    Scores are drawn from a normal distribution centred on
    ``stack_projection`` whose standard deviation is
    ``individual_std * sqrt(1 + stack_correlation)``, so higher correlation
    widens the distribution (more extreme outcomes).

    Args:
        stack_projection: Combined projection of the stack.
        stack_correlation: Correlation factor; must be >= -1.
        individual_std: Baseline standard deviation before the
            correlation adjustment.
        n_simulations: Number of draws; must be positive.
        seed: Optional seed. When given, draws come from a dedicated
            generator so results are reproducible; when omitted, the global
            NumPy random state is used as before.

    Returns:
        Dict with "mean", "std", "median", "25th_percentile",
        "75th_percentile", "90th_percentile", "ceiling" (95th percentile)
        and "floor" (5th percentile).

    Raises:
        ValueError: If ``n_simulations`` is not positive or
            ``stack_correlation`` is below -1.
    """
    if n_simulations <= 0:
        raise ValueError("n_simulations must be positive")
    if stack_correlation < -1:
        # sqrt of a negative number would silently turn every statistic into NaN.
        raise ValueError("stack_correlation must be >= -1")

    stack_std = individual_std * np.sqrt(1 + stack_correlation)

    if seed is None:
        simulations = np.random.normal(stack_projection, stack_std, n_simulations)
    else:
        rng = np.random.default_rng(seed)
        simulations = rng.normal(stack_projection, stack_std, n_simulations)

    p5, p25, p50, p75, p90, p95 = np.percentile(
        simulations, [5, 25, 50, 75, 90, 95]
    )
    return {
        "mean": float(np.mean(simulations)),
        "std": float(np.std(simulations)),
        "median": float(p50),
        "25th_percentile": float(p25),
        "75th_percentile": float(p75),
        "90th_percentile": float(p90),
        "ceiling": float(p95),
        "floor": float(p5)
    }
# Example usage
if __name__ == "__main__":
    analyzer = CorrelationAnalyzer("nfl")

    # Sample slate: one row per player (Chiefs first, then Bills).
    slate_columns = ["Name", "Position", "Team", "Salary", "Projection", "Ownership"]
    slate_rows = [
        ("Patrick Mahomes", "QB", "KC", 8200, 22.5, 0.22),
        ("Travis Kelce", "TE", "KC", 6800, 15.2, 0.18),
        ("Rashee Rice", "WR", "KC", 6200, 14.8, 0.15),
        ("Isiah Pacheco", "RB", "KC", 5800, 13.5, 0.12),
        ("Chiefs DST", "DST", "KC", 3500, 7.5, 0.08),
        ("Josh Allen", "QB", "BUF", 7800, 21.0, 0.20),
        ("Stefon Diggs", "WR", "BUF", 6500, 14.5, 0.14),
        ("Dalton Kincaid", "TE", "BUF", 5200, 11.2, 0.10),
        ("James Cook", "RB", "BUF", 6000, 14.0, 0.13),
        ("Bills DST", "DST", "BUF", 3200, 6.8, 0.05),
    ]
    players = pd.DataFrame(slate_rows, columns=slate_columns)

    # Rank three-player stacks and show the best five.
    stacks = analyzer.find_optimal_stacks(players, stack_size=3)
    report_columns = ["Team", "Players", "Projection", "Correlation", "Stack_Ceiling"]
    print("Top Stacks:")
    print(stacks[report_columns].head(5))

    # Simulate the outcome range of a QB + two pass catchers stack
    # (22.5 + 15.2 + 14.8 = 52.5 projected points).
    sim_results = analyzer.simulate_stack_outcomes(
        stack_projection=52.5,
        stack_correlation=0.32
    )
    print(f"\nStack Simulation:")
    print(f" Ceiling (95th): {sim_results['ceiling']:.1f}")
    print(f" Floor (5th): {sim_results['floor']:.1f}")
Bankroll Management System
Implement proper bankroll management for sports betting using the Kelly criterion.
"""Sports Betting Bankroll Management."""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
@dataclass
class Bet:
    """
    Represents a single bet.

    The first six fields describe the wager and are required; the remaining
    fields default to ``None`` until they are filled in.
    """
    id: str  # identifier matched against ``bet_id`` when a bet is settled
    sport: str
    bet_type: str
    selection: str
    odds: int  # American odds
    stake: float
    result: Optional[str] = None  # "win", "loss", "push"
    profit: Optional[float] = None  # profit/loss recorded at settlement
    date: Optional[datetime] = None  # timestamp recorded at settlement
    edge: Optional[float] = None  # estimated edge, if the caller supplies one
class BankrollManager:
    """
    Manage sports betting bankroll with Kelly criterion.

    Features:
    - Kelly and fractional Kelly staking
    - Unit-based tracking
    - Risk of ruin calculations
    - Performance analytics

    Accounting model: placing a bet only records it. The bankroll moves
    when the bet is settled: up by the profit on a win, down by the stake
    on a loss, unchanged on a push.
    """

    VALID_RESULTS = ("win", "loss", "push")

    def __init__(
        self,
        initial_bankroll: float,
        unit_size: float = None,
        kelly_fraction: float = 0.25,
        max_bet_pct: float = 0.05
    ):
        """
        Initialize bankroll manager.

        Args:
            initial_bankroll: Starting bankroll
            unit_size: Standard unit size (default: 1% of bankroll)
            kelly_fraction: Fraction of Kelly to use (0.25 = quarter Kelly)
            max_bet_pct: Maximum bet as percentage of bankroll
        """
        self.initial_bankroll = initial_bankroll
        self.current_bankroll = initial_bankroll
        self.unit_size = unit_size or (initial_bankroll * 0.01)
        self.kelly_fraction = kelly_fraction
        self.max_bet_pct = max_bet_pct
        self.bets: List["Bet"] = []
        self.history = [{"date": datetime.now(), "bankroll": initial_bankroll}]

    @staticmethod
    def _american_to_decimal(american_odds: int) -> float:
        """Convert American odds to decimal odds (payout per unit staked)."""
        if american_odds == 0:
            raise ValueError("American odds cannot be 0")
        if american_odds > 0:
            return (american_odds / 100) + 1
        return (100 / abs(american_odds)) + 1

    def kelly_criterion(
        self,
        probability: float,
        american_odds: int
    ) -> float:
        """
        Calculate Kelly criterion bet size.

        Kelly% = (p * b - q) / b
        where p = win prob, q = loss prob, b = decimal odds - 1

        Returns:
            Full-Kelly fraction of bankroll, floored at 0 (no bet) when the
            wager has no positive expectation.

        Raises:
            ValueError: If ``probability`` is outside [0, 1] or the odds
                are 0.
        """
        if not 0 <= probability <= 1:
            raise ValueError("probability must be between 0 and 1")
        b = self._american_to_decimal(american_odds) - 1
        q = 1 - probability
        kelly = (probability * b - q) / b
        return max(0, kelly)

    def calculate_stake(
        self,
        probability: float,
        odds: int,
        confidence: str = "normal"
    ) -> Dict:
        """
        Calculate recommended stake.

        Args:
            probability: Estimated win probability
            odds: American odds
            confidence: "low", "normal", "high" (unknown values are treated
                as "normal")

        Returns:
            Dict with "stake", "units", "kelly_pct" (full Kelly),
            "adjusted_pct" (after Kelly fraction, confidence and cap) and
            "pct_of_bankroll" (stake after rounding to tenths of a unit).
        """
        kelly_pct = self.kelly_criterion(probability, odds)

        # Fractional Kelly, then scale by confidence.
        confidence_mult = {"low": 0.5, "normal": 1.0, "high": 1.5}
        adjusted_kelly = kelly_pct * self.kelly_fraction
        adjusted_kelly *= confidence_mult.get(confidence, 1.0)

        # Never recommend more than the configured share of the bankroll.
        final_pct = min(adjusted_kelly, self.max_bet_pct)

        if self.current_bankroll <= 0:
            # Nothing left to stake; also avoids dividing by a zero bankroll/unit.
            return {
                "stake": 0.0,
                "units": 0.0,
                "kelly_pct": kelly_pct,
                "adjusted_pct": final_pct,
                "pct_of_bankroll": 0.0
            }

        # Round to tenths of a unit.
        units = round(self.current_bankroll * final_pct / self.unit_size, 1)
        stake = units * self.unit_size
        return {
            "stake": stake,
            "units": units,
            "kelly_pct": kelly_pct,
            "adjusted_pct": final_pct,
            "pct_of_bankroll": stake / self.current_bankroll
        }

    def place_bet(self, bet: "Bet") -> None:
        """Record a placed bet. The bankroll is not touched until settlement."""
        self.bets.append(bet)

    def settle_bet(self, bet_id: str, result: str) -> float:
        """
        Settle a bet and update bankroll.

        Args:
            bet_id: ID of bet to settle
            result: "win", "loss", or "push"

        Returns:
            Profit/loss amount

        Raises:
            ValueError: If the bet is unknown, already settled, or
                ``result`` is not one of the accepted values.
        """
        bet = next((b for b in self.bets if b.id == bet_id), None)
        if bet is None:
            raise ValueError(f"Bet {bet_id} not found")
        if result not in self.VALID_RESULTS:
            raise ValueError(
                f"Invalid result {result!r}; expected one of {self.VALID_RESULTS}"
            )
        if bet.result is not None:
            # Settling twice would apply the profit/loss to the bankroll twice.
            raise ValueError(f"Bet {bet_id} is already settled")

        settled_at = datetime.now()
        if result == "win":
            profit = bet.stake * (self._american_to_decimal(bet.odds) - 1)
        elif result == "loss":
            profit = -bet.stake
        else:  # push
            profit = 0

        # The stake was never deducted at placement, so the bankroll moves by
        # exactly the profit (which is -stake on a loss).
        self.current_bankroll += profit

        bet.result = result
        bet.date = settled_at
        bet.profit = profit
        self.history.append({
            "date": settled_at,
            "bankroll": self.current_bankroll
        })
        return profit

    def risk_of_ruin(
        self,
        win_rate: float,
        avg_odds: int,
        ruin_threshold: float = 0.1
    ) -> float:
        """
        Calculate risk of ruin.

        Probability of losing (1 - ruin_threshold) of bankroll when flat
        betting one unit per wager. Uses the drift/diffusion approximation

            RoR = exp(-2 * edge * units_at_risk / variance)

        where edge and variance are the per-bet mean and variance of the
        profit in units, and units_at_risk is the number of units that can
        be lost before the bankroll falls to ``ruin_threshold`` of its
        current value.

        Returns:
            Probability in [0, 1]. 1.0 when there is no positive edge or no
            bankroll left.
        """
        decimal_odds = self._american_to_decimal(avg_odds)

        # Per-unit expected profit.
        edge = (win_rate * decimal_odds) - 1
        if edge <= 0:
            return 1.0  # Guaranteed ruin with negative edge
        if self.current_bankroll <= 0:
            return 1.0

        # Outcomes are +b (win) or -1 (loss): they differ by decimal_odds.
        variance = win_rate * (1 - win_rate) * (decimal_odds ** 2)
        if variance <= 0 or self.unit_size <= 0:
            # A certain winner (or zero-size bets) can never lose the bankroll.
            return 0.0

        units_at_risk = (
            (1 - ruin_threshold) * self.current_bankroll / self.unit_size
        )
        ror = float(np.exp(-2 * edge * units_at_risk / variance))
        return min(max(ror, 0.0), 1.0)

    def get_performance_stats(self) -> Dict:
        """
        Calculate betting performance statistics.

        Returns:
            ``{"message": "No settled bets"}`` when nothing is settled,
            otherwise a dict of counts, totals, ROI, bankroll growth and the
            current streak. Pushes count towards "total_bets" and therefore
            towards the "win_rate" denominator.
        """
        settled = [b for b in self.bets if b.result is not None]
        if not settled:
            return {"message": "No settled bets"}

        wins = [b for b in settled if b.result == "win"]
        losses = [b for b in settled if b.result == "loss"]
        total_staked = sum(b.stake for b in settled)
        total_profit = sum((b.profit or 0) for b in settled)
        n_settled = len(settled)

        if self.initial_bankroll:
            growth = (
                (self.current_bankroll - self.initial_bankroll)
                / self.initial_bankroll
            )
        else:
            growth = 0  # growth relative to a zero starting bankroll is undefined

        return {
            "total_bets": n_settled,
            "wins": len(wins),
            "losses": len(losses),
            "win_rate": len(wins) / n_settled,
            "total_staked": total_staked,
            "total_profit": total_profit,
            "roi": total_profit / total_staked if total_staked > 0 else 0,
            "current_bankroll": self.current_bankroll,
            "bankroll_growth": growth,
            "avg_stake": total_staked / n_settled,
            "avg_profit_per_bet": total_profit / n_settled,
            "largest_win": max((b.profit for b in wins), default=0),
            "largest_loss": min((b.profit for b in losses), default=0),
            "current_streak": self._get_streak(settled)
        }

    def _get_streak(self, bets: List["Bet"]) -> Dict:
        """
        Calculate current winning/losing streak.

        Bets are ordered newest first by settlement date (undated bets sort
        last); the streak is the run of consecutive bets sharing the newest
        bet's result.
        """
        if not bets:
            return {"type": None, "length": 0}

        sorted_bets = sorted(bets, key=lambda x: x.date or datetime.min, reverse=True)
        streak_type = sorted_bets[0].result
        streak_length = 0
        for bet in sorted_bets:
            if bet.result != streak_type:
                break
            streak_length += 1
        return {"type": streak_type, "length": streak_length}

    def get_bankroll_history(self) -> pd.DataFrame:
        """Get bankroll history as DataFrame."""
        return pd.DataFrame(self.history)
# Example usage
if __name__ == "__main__":
    # $10,000 bankroll, quarter Kelly, no single bet above 3% of the bankroll.
    manager_settings = {
        "initial_bankroll": 10000,
        "kelly_fraction": 0.25,
        "max_bet_pct": 0.03,
    }
    manager = BankrollManager(**manager_settings)
    print(f"Starting Bankroll: ${manager.current_bankroll:,.2f}")
    print(f"Unit Size: ${manager.unit_size:,.2f}")

    # Size a wager with a 55% estimated win probability at -110.
    wager = {"probability": 0.55, "odds": -110, "confidence": "normal"}
    stake_info = manager.calculate_stake(**wager)
    print(f"\nRecommended Stake:")
    print(f" Amount: ${stake_info['stake']:,.2f}")
    print(f" Units: {stake_info['units']}")
    print(f" Kelly%: {stake_info['kelly_pct']:.2%}")
    print(f" % of Bankroll: {stake_info['pct_of_bankroll']:.2%}")

    # Risk of ruin for a 52% bettor at standard -110 pricing.
    ror = manager.risk_of_ruin(win_rate=0.52, avg_odds=-110)
    print(f"\nRisk of Ruin: {ror:.2%}")
Betting Value Finder
Identify value bets by comparing projections to betting lines.
"""Sports Betting Value Finder."""
import pandas as pd
import numpy as np
from scipy import stats
from typing import Dict, List, Tuple, Optional
class BettingValueFinder:
    """
    Find value bets by comparing model projections to market odds.

    Supports: Spreads, Totals, Moneylines, Player Props

    Every ``find_*`` method returns a dict that contains at least "edge"
    (model probability minus implied probability), "is_value" (edge at or
    above ``edge_threshold``) and "kelly_fraction" (full Kelly, capped at
    25% of bankroll).
    """

    KELLY_CAP = 0.25

    def __init__(self, edge_threshold: float = 0.03):
        """
        Initialize value finder.

        Args:
            edge_threshold: Minimum edge to flag a bet (default 3%)
        """
        self.edge_threshold = edge_threshold

    @staticmethod
    def american_to_decimal(american: int) -> float:
        """Convert American odds to decimal."""
        if american > 0:
            return (american / 100) + 1
        return (100 / abs(american)) + 1

    @staticmethod
    def decimal_to_american(decimal: float) -> int:
        """
        Convert decimal odds to American.

        The result is rounded to the nearest integer; truncating instead
        turns values such as 114.99999999999999 into 114.

        Raises:
            ValueError: If ``decimal`` is not greater than 1.
        """
        if decimal <= 1:
            raise ValueError("Decimal odds must be greater than 1")
        if decimal >= 2:
            return int(round((decimal - 1) * 100))
        return int(round(-100 / (decimal - 1)))

    @staticmethod
    def implied_probability(american: int) -> float:
        """Calculate implied probability from American odds."""
        if american > 0:
            return 100 / (american + 100)
        return abs(american) / (abs(american) + 100)

    @classmethod
    def _kelly_fraction(cls, probability: float, american: int) -> float:
        """Full-Kelly stake fraction for a win probability, clamped to [0, KELLY_CAP]."""
        decimal_odds = cls.american_to_decimal(american)
        kelly = (probability * decimal_odds - 1) / (decimal_odds - 1)
        return max(0.0, min(kelly, cls.KELLY_CAP))

    @staticmethod
    def _side_probability(side: str, line: float, mean: float, std: float) -> float:
        """Probability that a normal(mean, std) outcome lands over/under ``line``."""
        under = stats.norm.cdf(line, mean, std)
        side_key = side.lower()
        if side_key == "over":
            return 1 - under
        if side_key == "under":
            return under
        raise ValueError(f"side must be 'over' or 'under', got {side!r}")

    def find_spread_value(
        self,
        team: str,
        spread: float,
        odds: int,
        projected_margin: float,
        margin_std: float = 13.5  # NFL typical std dev
    ) -> Dict:
        """
        Find value on spread bets.

        Args:
            team: Team name
            spread: Betting spread (negative = favorite)
            odds: American odds
            projected_margin: Model projected margin
            margin_std: Standard deviation of margin

        Returns:
            Result dict; "rating" is "Strong" (edge > 8%), "Moderate"
            (> 5%), "Slight" (any other positive edge) or "None" when the
            edge is not positive.
        """
        # The bet covers when (actual margin + spread) > 0.
        cover_margin = projected_margin + spread
        cover_prob = 1 - stats.norm.cdf(0, cover_margin, margin_std)

        implied_prob = self.implied_probability(odds)
        edge = cover_prob - implied_prob

        if edge <= 0:
            rating = "None"
        elif edge > 0.08:
            rating = "Strong"
        elif edge > 0.05:
            rating = "Moderate"
        else:
            rating = "Slight"

        return {
            "bet_type": "spread",
            "team": team,
            "spread": spread,
            "odds": odds,
            "cover_probability": cover_prob,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": self._kelly_fraction(cover_prob, odds),
            "rating": rating
        }

    def find_total_value(
        self,
        game: str,
        total: float,
        side: str,  # "over" or "under"
        odds: int,
        projected_total: float,
        total_std: float = 10.0
    ) -> Dict:
        """
        Find value on totals.

        Raises:
            ValueError: If ``side`` is neither "over" nor "under"
                (case-insensitive).
        """
        prob = self._side_probability(side, total, projected_total, total_std)
        implied_prob = self.implied_probability(odds)
        edge = prob - implied_prob
        return {
            "bet_type": "total",
            "game": game,
            "total": total,
            "side": side,
            "odds": odds,
            "hit_probability": prob,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": self._kelly_fraction(prob, odds)
        }

    def find_moneyline_value(
        self,
        team: str,
        odds: int,
        win_probability: float
    ) -> Dict:
        """Find value on moneyline bets."""
        implied_prob = self.implied_probability(odds)
        edge = win_probability - implied_prob
        decimal_odds = self.american_to_decimal(odds)
        return {
            "bet_type": "moneyline",
            "team": team,
            "odds": odds,
            "win_probability": win_probability,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": self._kelly_fraction(win_probability, odds),
            # Expected profit per unit staked.
            "expected_value": win_probability * (decimal_odds - 1) - (1 - win_probability)
        }

    def find_prop_value(
        self,
        player: str,
        stat: str,
        line: float,
        side: str,
        odds: int,
        projection: float,
        std: float
    ) -> Dict:
        """
        Find value on player props.

        Raises:
            ValueError: If ``side`` is neither "over" nor "under"
                (case-insensitive).
        """
        prob = self._side_probability(side, line, projection, std)
        implied_prob = self.implied_probability(odds)
        edge = prob - implied_prob
        return {
            "bet_type": "prop",
            "player": player,
            "stat": stat,
            "line": line,
            "side": side,
            "odds": odds,
            "projection": projection,
            "hit_probability": prob,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": self._kelly_fraction(prob, odds)
        }

    def scan_market(
        self,
        odds_df: pd.DataFrame,
        projections: Dict[str, Dict]
    ) -> pd.DataFrame:
        """
        Scan entire market for value bets.

        Args:
            odds_df: DataFrame with current odds. Reads the columns
                away_team, home_team, home_spread, home_spread_odds,
                away_spread_odds, total, over_odds and under_odds.
            projections: Dict keyed by "AWAY@HOME" whose values provide
                "home_margin" and "total".

        Returns:
            DataFrame with one row per spread/total bet flagged as value;
            games without a projection are skipped.
        """
        value_bets = []
        for _, row in odds_df.iterrows():
            game_key = f"{row['away_team']}@{row['home_team']}"
            proj = projections.get(game_key)
            if proj is None:
                continue

            # Spread value: the away side gets the mirrored spread and margin.
            home_margin = proj["home_margin"]
            spread_sides = [
                (row["home_team"], row["home_spread"], row["home_spread_odds"], home_margin),
                (row["away_team"], -row["home_spread"], row["away_spread_odds"], -home_margin),
            ]
            for team, spread, odds, margin in spread_sides:
                result = self.find_spread_value(team, spread, odds, margin)
                if result["is_value"]:
                    result["game"] = game_key
                    value_bets.append(result)

            # Total value on both sides of the number.
            for side, odds in [("over", row["over_odds"]), ("under", row["under_odds"])]:
                result = self.find_total_value(
                    game_key, row["total"], side, odds, proj["total"]
                )
                if result["is_value"]:
                    value_bets.append(result)

        return pd.DataFrame(value_bets)
def calculate_clv(bet_odds: int, closing_odds: int) -> float:
    """
    Calculate Closing Line Value.

    CLV is one of the best predictors of long-term betting success.

    Args:
        bet_odds: American odds at which the bet was placed.
        closing_odds: American odds at close.

    Returns:
        Closing implied probability minus the implied probability of the
        odds taken; positive when the closing price implies a higher
        probability than the price that was bet.
    """
    to_probability = BettingValueFinder.implied_probability
    taken, closing = (to_probability(price) for price in (bet_odds, closing_odds))
    return closing - taken
# Example usage
if __name__ == "__main__":
    finder = BettingValueFinder(edge_threshold=0.03)

    # Spread example: the model has Kansas City winning by 6.5 against a -3.5 line.
    spread_bet = {
        "team": "Kansas City",
        "spread": -3.5,
        "odds": -110,
        "projected_margin": 6.5,
    }
    spread_value = finder.find_spread_value(**spread_bet)
    print("Spread Bet Analysis:")
    print(f" Team: {spread_value['team']} {spread_value['spread']}")
    print(f" Cover Prob: {spread_value['cover_probability']:.1%}")
    print(f" Implied Prob: {spread_value['implied_probability']:.1%}")
    print(f" Edge: {spread_value['edge']:.1%}")
    print(f" Is Value: {spread_value['is_value']}")

    # Player prop example: passing-yards over with a projection above the line.
    prop_bet = {
        "player": "Patrick Mahomes",
        "stat": "Passing Yards",
        "line": 275.5,
        "side": "over",
        "odds": -115,
        "projection": 295,
        "std": 45,
    }
    prop_value = finder.find_prop_value(**prop_bet)
    print(f"\nProp Bet Analysis:")
    print(f" {prop_value['player']} {prop_value['stat']} {prop_value['side']} {prop_value['line']}")
    print(f" Hit Prob: {prop_value['hit_probability']:.1%}")
    print(f" Edge: {prop_value['edge']:.1%}")
Player Projection Model
Build fantasy sports projection models using machine learning.
"""Fantasy Sports Projection Model."""
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from typing import Dict, List, Tuple
class ProjectionModel:
    """
    Build fantasy point projections using historical data.

    Workflow: ``train`` engineers lagged features, fits a scaled tree
    ensemble and reports fit metrics; ``predict`` / ``get_projections``
    apply the fitted pipeline to new rows.
    """

    # Per-game stat columns that receive lagged rolling/season features when
    # present in the input. Includes every base stat that select_features
    # asks for with an "_L3" suffix.
    ROLLING_COLUMNS = [
        "fantasy_points", "yards", "touchdowns", "receptions",
        "targets", "snap_pct",
        "passing_yards", "passing_tds", "rushing_yards", "interceptions",
    ]

    def __init__(self, sport: str = "nfl", position: str = None):
        self.sport = sport
        self.position = position
        self.model = None
        self.scaler = StandardScaler()
        self.feature_names = None
        # Training-set feature means, used to impute missing lags at predict time.
        self._fill_values = None

    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare features for modeling.

        Creates rolling averages, matchup adjustments, etc. All lag
        features are shifted by one game so a row never sees its own
        result; a player's first row therefore has NaN lags.

        Args:
            df: Game log with at least "player_id" and "game_date"
                (datetime) columns. Optional columns are used when present.

        Returns:
            A sorted copy of ``df`` with the engineered columns added.
        """
        df = df.copy()
        df = df.sort_values(["player_id", "game_date"])

        stat_cols = [col for col in self.ROLLING_COLUMNS if col in df.columns]

        # Rolling averages over the previous 3 and 5 games.
        for col in stat_cols:
            for window in (3, 5):
                df[f"{col}_L{window}"] = df.groupby("player_id")[col].transform(
                    lambda x, w=window: x.rolling(w, min_periods=1).mean().shift(1)
                )

        # Season-to-date averages (only when the season is known).
        if "season" in df.columns:
            for col in stat_cols:
                df[f"{col}_season_avg"] = df.groupby(["player_id", "season"])[col].transform(
                    lambda x: x.expanding().mean().shift(1)
                )

        # Opponent strength proxy: mean fantasy points in games against that
        # opponent. NOTE: averaged over every row of ``df``, current row included.
        if "opponent_def_rank" not in df.columns and "opponent" in df.columns:
            opp_avg = df.groupby("opponent")["fantasy_points"].mean()
            df["opponent_def_rank"] = df["opponent"].map(opp_avg)

        # Home/away indicator
        if "is_home" not in df.columns and "location" in df.columns:
            df["is_home"] = (df["location"] == "home").astype(int)

        # Rest days (simplified): a player's first game is assigned 7 days.
        df["days_rest"] = df.groupby("player_id")["game_date"].diff().dt.days.fillna(7)
        return df

    def select_features(self, df: pd.DataFrame) -> List[str]:
        """
        Select features for modeling.

        Returns the base and position-specific feature names that exist in
        ``df`` and stores them on ``self.feature_names``.
        """
        features = [
            "fantasy_points_L3", "fantasy_points_L5", "fantasy_points_season_avg",
            "opponent_def_rank", "is_home", "days_rest"
        ]

        if self.position == "QB":
            features.extend([
                "passing_yards_L3", "passing_tds_L3",
                "rushing_yards_L3", "interceptions_L3"
            ])
        elif self.position in ["RB", "WR", "TE"]:
            features.extend([
                "targets_L3", "receptions_L3", "yards_L3",
                "touchdowns_L3", "snap_pct_L3"
            ])

        available = [f for f in features if f in df.columns]
        self.feature_names = available
        return available

    def train(
        self,
        df: pd.DataFrame,
        target: str = "fantasy_points",
        model_type: str = "gbm"
    ) -> Dict:
        """
        Train projection model.

        Args:
            df: Historical data
            target: Target variable (fantasy points)
            model_type: "gbm" or "rf" (any value other than "gbm" selects
                the random forest)

        Returns:
            Dict with "train_r2", "test_r2", "cv_mean", "cv_std" and
            "feature_importance" (feature name -> importance).

        Raises:
            ValueError: If no features are available or no rows remain
                after dropping missing values.
        """
        df = self.prepare_features(df)
        features = self.select_features(df)
        if not features:
            raise ValueError("No usable feature columns found in training data")

        # Rows without complete lags (e.g. each player's first game) are dropped.
        df = df.dropna(subset=features + [target])
        if df.empty:
            raise ValueError("No training rows remain after dropping missing values")
        X = df[features]
        y = df[target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        if model_type == "gbm":
            estimator = GradientBoostingRegressor(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42
            )
        else:
            estimator = RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=42
            )

        self.model = Pipeline([
            ("scaler", StandardScaler()),
            ("model", estimator)
        ])
        self.model.fit(X_train, y_train)
        self._fill_values = X_train.mean()

        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)
        cv_scores = cross_val_score(self.model, X, y, cv=5)

        # Both supported estimators expose feature_importances_ once fitted.
        importance = self.model.named_steps["model"].feature_importances_
        return {
            "train_r2": train_score,
            "test_r2": test_score,
            "cv_mean": cv_scores.mean(),
            "cv_std": cv_scores.std(),
            "feature_importance": dict(zip(features, importance))
        }

    def predict(self, df: pd.DataFrame) -> pd.Series:
        """
        Generate projections for players.

        Missing feature values (such as the lags on a player's first row)
        are replaced with the training-set mean of that feature before
        prediction.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained")
        df = self.prepare_features(df)
        X = df[self.feature_names]
        fill_values = getattr(self, "_fill_values", None)
        if fill_values is not None:
            X = X.fillna(fill_values)
        return pd.Series(self.model.predict(X), index=df.index)

    def get_projections(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get projections with confidence intervals.

        Uncertainty is a fixed 15% of the projection; floor and ceiling sit
        1.5 uncertainties either side. Requires "player_name", "team" and
        "opponent" columns in ``df``.
        """
        predictions = self.predict(df)
        uncertainty = predictions * 0.15

        result = df[["player_name", "team", "opponent"]].copy()
        result["projection"] = predictions
        result["floor"] = predictions - 1.5 * uncertainty
        result["ceiling"] = predictions + 1.5 * uncertainty
        return result.sort_values("projection", ascending=False)
# Example usage
if __name__ == "__main__":
    # Synthetic history: 50 players x 10 weekly games each.
    np.random.seed(42)
    n_games = 500

    # Random columns are drawn in a fixed order so the seeded data is stable.
    team_draw = np.random.choice(["KC", "BUF", "PHI", "SF", "DAL"], n_games)
    opponent_draw = np.random.choice(["NYG", "WAS", "CHI", "DET", "MIN"], n_games)
    location_draw = np.random.choice(["home", "away"], n_games)
    points_draw = np.random.normal(15, 8, n_games).clip(0)
    yards_draw = np.random.normal(70, 30, n_games).clip(0)
    touchdown_draw = np.random.poisson(0.5, n_games)
    reception_draw = np.random.poisson(4, n_games)

    historical = pd.DataFrame({
        "player_id": np.repeat(range(50), 10),
        "player_name": [f"Player {i//10}" for i in range(n_games)],
        "game_date": pd.date_range("2023-01-01", periods=n_games, freq="W"),
        "season": [2023] * 250 + [2024] * 250,
        "team": team_draw,
        "opponent": opponent_draw,
        "location": location_draw,
        "fantasy_points": points_draw,
        "yards": yards_draw,
        "touchdowns": touchdown_draw,
        "receptions": reception_draw,
    })

    # Fit a wide-receiver model and report how it did.
    model = ProjectionModel("nfl", "WR")
    results = model.train(historical)
    print("Model Performance:")
    print(f" Test R2: {results['test_r2']:.3f}")
    print(f" CV Mean: {results['cv_mean']:.3f}")

    # Five most important features, highest first.
    ranked_features = sorted(
        results["feature_importance"].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("\nFeature Importance:")
    for feat, imp in ranked_features[:5]:
        print(f" {feat}: {imp:.3f}")
DFS Lineup Optimizer
Optimize daily fantasy sports lineups using linear programming.
"""DFS Lineup Optimizer using Linear Programming."""
import pandas as pd
import numpy as np
from scipy.optimize import linprog, milp, LinearConstraint, Bounds
from typing import List, Dict, Tuple, Optional
class DFSOptimizer:
    """
    Optimize DFS lineups using mixed integer linear programming.

    Supports: DraftKings, FanDuel salary structures

    The model uses one binary variable per (player, roster spot) pair.
    Every roster spot must be filled by exactly one eligible player and
    each player may fill at most one spot, so the positional requirements
    (including FLEX/UTIL/G/F spots) are enforced exactly.
    """
    SITE_CONFIGS = {
        "draftkings": {
            "nfl": {
                "positions": ["QB", "RB", "RB", "WR", "WR", "WR", "TE", "FLEX", "DST"],
                "salary_cap": 50000,
                "roster_size": 9
            },
            "nba": {
                "positions": ["PG", "SG", "SF", "PF", "C", "G", "F", "UTIL"],
                "salary_cap": 50000,
                "roster_size": 8
            },
            "mlb": {
                "positions": ["P", "P", "C", "1B", "2B", "3B", "SS", "OF", "OF", "OF"],
                "salary_cap": 50000,
                "roster_size": 10
            }
        },
        "fanduel": {
            "nfl": {
                "positions": ["QB", "RB", "RB", "WR", "WR", "WR", "TE", "FLEX", "DST"],
                "salary_cap": 60000,
                "roster_size": 9
            },
            "nba": {
                "positions": ["PG", "PG", "SG", "SG", "SF", "SF", "PF", "PF", "C"],
                "salary_cap": 60000,
                "roster_size": 9
            }
        }
    }

    def __init__(self, site: str = "draftkings", sport: str = "nfl"):
        self.site = site
        self.sport = sport
        self.config = self.SITE_CONFIGS[site][sport]
        self.players = None

    def load_players(self, df: pd.DataFrame):
        """
        Load player pool.

        Required columns: Name, Position, Salary, Projection
        Optional: Team, Opponent, Ownership

        Stores a copy of ``df`` with an added positional "idx" column and
        returns ``self`` so calls can be chained.
        """
        self.players = df.copy()
        self.players["idx"] = range(len(self.players))
        return self

    def _create_position_matrix(self) -> np.ndarray:
        """
        Create position eligibility matrix.

        Returns:
            Array of shape (roster spots, players) holding 1 where the
            player may fill the spot and 0 otherwise.
        """
        player_positions = list(self.players["Position"])
        position_matrix = np.zeros((self.config["roster_size"], len(player_positions)))
        for spot_idx, required_pos in enumerate(self.config["positions"]):
            for player_idx, player_pos in enumerate(player_positions):
                if self._is_eligible(player_pos, required_pos):
                    position_matrix[spot_idx, player_idx] = 1
        return position_matrix

    def _is_eligible(self, player_pos: str, roster_spot: str) -> bool:
        """Check if player position is eligible for roster spot."""
        if player_pos == roster_spot:
            return True
        if roster_spot == "UTIL":
            return True
        flexible_spots = {
            "FLEX": ("RB", "WR", "TE"),
            "G": ("PG", "SG"),
            "F": ("SF", "PF"),
        }
        return player_pos in flexible_spots.get(roster_spot, ())

    def _solve_positions(
        self,
        min_salary: float = None,
        max_from_team: int = 4,
        locked: List[str] = None,
        excluded: List[str] = None,
        max_ownership: float = None,
        previous_lineups: List[np.ndarray] = None
    ) -> np.ndarray:
        """
        Solve the lineup MILP and return the selected players' row positions.

        ``previous_lineups`` holds position arrays of lineups that must not
        be reproduced exactly; each adds a constraint that at most
        ``roster_size - 1`` of its players are chosen again.
        """
        if self.players is None:
            raise ValueError("No player pool loaded; call load_players() first")
        players = self.players
        n_players = len(players)
        if n_players == 0:
            raise ValueError("Optimization failed: player pool is empty")
        n_spots = self.config["roster_size"]
        n_vars = n_players * n_spots

        def per_player(values) -> np.ndarray:
            """Expand per-player coefficients to one constraint row over all variables."""
            return np.repeat(np.asarray(values, dtype=float), n_spots).reshape(1, -1)

        names = players["Name"].to_numpy()

        # Variable (p, s) lives at index p * n_spots + s. Its upper bound is 1
        # only when player p may fill spot s and has not been ruled out.
        upper = self._create_position_matrix().T.copy()
        if excluded:
            upper[np.isin(names, list(excluded))] = 0
        if max_ownership and "Ownership" in players.columns:
            upper[(players["Ownership"] > max_ownership).to_numpy()] = 0

        constraints = []

        # Every roster spot is filled exactly once.
        spot_rows = np.kron(np.ones((1, n_players)), np.eye(n_spots))
        constraints.append(LinearConstraint(spot_rows, 1, 1))

        # A player can occupy at most one spot.
        player_rows = np.kron(np.eye(n_players), np.ones((1, n_spots)))
        constraints.append(LinearConstraint(player_rows, 0, 1))

        # Salary window: [min_salary, cap].
        salary_floor = min_salary if min_salary else 0
        constraints.append(
            LinearConstraint(per_player(players["Salary"]), salary_floor,
                             self.config["salary_cap"])
        )

        # Team stacking limit (Team is an optional column).
        if max_from_team and "Team" in players.columns:
            teams = players["Team"]
            for team in teams.unique():
                on_team = (teams == team).to_numpy()
                constraints.append(LinearConstraint(per_player(on_team), 0, max_from_team))

        # Locked players; names not present in the pool are ignored.
        if locked:
            for name in locked:
                matches = np.flatnonzero(names == name)
                if len(matches) > 0:
                    lock_row = np.zeros(n_players)
                    lock_row[matches[0]] = 1
                    constraints.append(LinearConstraint(per_player(lock_row), 1, 1))

        # Forbid exact repeats of earlier lineups.
        for prior in previous_lineups or []:
            repeat_row = np.zeros(n_players)
            repeat_row[np.asarray(prior, dtype=int)] = 1
            constraints.append(LinearConstraint(per_player(repeat_row), 0, n_spots - 1))

        # Maximize projection (milp minimizes, hence the negation).
        c = -np.repeat(players["Projection"].to_numpy(dtype=float), n_spots)
        result = milp(
            c,
            constraints=constraints,
            integrality=np.ones(n_vars),
            bounds=Bounds(np.zeros(n_vars), upper.ravel())
        )
        if not result.success:
            raise ValueError(f"Optimization failed: {result.message}")

        assignment = result.x.reshape(n_players, n_spots)
        return np.flatnonzero(assignment.sum(axis=1) > 0.5)

    def optimize(
        self,
        min_salary: float = None,
        max_from_team: int = 4,
        locked: List[str] = None,
        excluded: List[str] = None,
        max_ownership: float = None
    ) -> pd.DataFrame:
        """
        Generate optimal lineup.

        Args:
            min_salary: Minimum salary to use
            max_from_team: Max players from same team
            locked: Players that must be in lineup
            excluded: Players to exclude
            max_ownership: Maximum ownership percentage

        Returns:
            DataFrame with optimal lineup (rows of the player pool, in pool
            order)

        Raises:
            ValueError: If no pool is loaded or no feasible lineup exists.
        """
        selected_idx = self._solve_positions(
            min_salary=min_salary,
            max_from_team=max_from_team,
            locked=locked,
            excluded=excluded,
            max_ownership=max_ownership
        )
        return self.players.iloc[selected_idx].copy()

    def generate_multiple_lineups(
        self,
        n_lineups: int,
        max_exposure: float = 0.6,
        **kwargs
    ) -> List[pd.DataFrame]:
        """
        Generate multiple unique lineups.

        Before building lineup number ``i + 1``, any player who already
        appears in at least ``max_exposure * (i + 1)`` lineups is excluded
        for that lineup only. Each new lineup must differ from every
        earlier one by at least one player. Iterations with no feasible
        lineup are skipped, so fewer than ``n_lineups`` may be returned.

        Args:
            n_lineups: Number of lineups to generate
            max_exposure: Maximum times a player can appear (as fraction)
            **kwargs: Forwarded to the optimizer (same options as
                ``optimize``). A caller-supplied ``excluded`` list is
                honoured for every lineup and is not modified.
        """
        if self.players is None:
            raise ValueError("No player pool loaded; call load_players() first")

        names = self.players["Name"].to_numpy()
        always_excluded = set(kwargs.pop("excluded", None) or [])
        player_counts = {name: 0 for name in names}
        lineups = []
        built_positions = []

        for i in range(n_lineups):
            max_count = max_exposure * (i + 1)
            overexposed = {
                name for name, count in player_counts.items() if count >= max_count
            }
            try:
                selected_idx = self._solve_positions(
                    excluded=list(always_excluded | overexposed),
                    previous_lineups=built_positions,
                    **kwargs
                )
            except ValueError:
                # No feasible lineup under the current exposure limits.
                continue

            lineup = self.players.iloc[selected_idx].copy()
            lineups.append(lineup)
            built_positions.append(selected_idx)
            for name in lineup["Name"]:
                player_counts[name] = player_counts.get(name, 0) + 1

        return lineups
# Example usage
if __name__ == "__main__":
    # Sample player pool: one row per player.
    pool_columns = ["Name", "Position", "Team", "Salary", "Projection", "Ownership"]
    pool_rows = [
        ("Player A", "QB", "KC", 8000, 22.5, 0.25),
        ("Player B", "RB", "KC", 7500, 18.2, 0.20),
        ("Player C", "RB", "BUF", 7200, 17.8, 0.18),
        ("Player D", "WR", "KC", 6800, 16.5, 0.15),
        ("Player E", "WR", "BUF", 6500, 15.2, 0.12),
        ("Player F", "WR", "PHI", 6200, 14.8, 0.10),
        ("Player G", "TE", "PHI", 5500, 12.5, 0.08),
        ("Player H", "RB", "PHI", 5000, 11.0, 0.05),
        ("Player I", "WR", "DAL", 4500, 10.5, 0.04),
        ("Player J", "DST", "DAL", 3500, 8.0, 0.03),
    ]
    players = pd.DataFrame(pool_rows, columns=pool_columns)

    optimizer = DFSOptimizer("draftkings", "nfl")
    optimizer.load_players(players)

    # Generate single lineup
    # NOTE: left disabled for this sample pool. Its nine cheapest players
    # cost 52,700, above the 50,000 cap, so no nine-player lineup is feasible.
    # lineup = optimizer.optimize(max_from_team=3)
    # print(lineup)
    print("DFS Optimizer loaded successfully")
Ownership Leverage Strategy
Analyze ownership leverage for GPP tournament strategy in DFS.
"""DFS Ownership Leverage Analysis."""
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
class OwnershipLeverage:
"""
Analyze ownership leverage for GPP (tournament) DFS strategy.
Key concepts:
- Leverage = being different from the field
- Positive leverage = overweight on high-upside, underowned players
- Game theory optimal (GTO) considerations
"""
def __init__(self, field_size: int = 10000):
self.field_size = field_size
def calculate_leverage(
self,
lineup_ownership: float,
field_ownership: float
) -> float:
"""
Calculate ownership leverage.
Leverage = (Lineup Ownership - Field Ownership) / Field Ownership
Positive = overweight relative to field
Negative = underweight relative to field
"""
if field_ownership == 0:
return float("inf") if lineup_ownership > 0 else 0
return (lineup_ownership - field_ownership) / field_ownership
def expected_duplicates(self, lineup_ownership: float) -> float:
"""
Estimate expected duplicate lineups.
Uses Poisson approximation.
"""
expected = (self.field_size - 1) * lineup_ownership
return expected
def duplication_probability(self, lineup_ownership: float) -> float:
"""Probability of at least one duplicate."""
expected = self.expected_duplicates(lineup_ownership)
return 1 - np.exp(-expected)
def optimal_exposure(
self,
player_projection: float,
player_ceiling: float,
field_ownership: float,
contest_type: str = "gpp"
) -> float:
"""
Calculate optimal player exposure.
For GPPs: Consider ceiling and correlation
For cash: Focus on floor and consistency
"""
if contest_type == "gpp":
# Higher exposure for high ceiling, low ownership
ceiling_factor = player_ceiling / player_projection
ownership_factor = 1 / (field_ownership + 0.05) # Avoid division by zero
# Base exposure on projection
base_exposure = min(player_projection / 20, 0.5)
# Adjust for ceiling and ownership
optimal = base_exposure * (ceiling_factor * 0.4 + ownership_factor * 0.1)
return min(optimal, 1.0)
else: # Cash game
# Focus on floor
floor_factor = (player_projection * 0.8) / player_projection
return min(floor_factor * 0.6, 1.0)
def correlation_leverage(
self,
player1_ownership: float,
player2_ownership: float,
correlation: float
) -> float:
"""
Calculate leverage from correlated players (stacking).
Stack leverage = how different your correlation is from the field.
"""
# Field stack probability (simplified)
field_stack_prob = player1_ownership * player2_ownership * 2
# Your stack probability (if you stack them)
your_stack_prob = 1.0
return self.calculate_leverage(your_stack_prob, field_stack_prob)
def analyze_lineup_leverage(
self,
lineup: pd.DataFrame,
field_ownership: pd.DataFrame
) -> Dict:
"""
Analyze total lineup leverage.
Args:
lineup: Your lineup with player names
field_ownership: Field ownership percentages
"""
merged = lineup.merge(
field_ownership[["Name", "Ownership"]],
on="Name", how="left"
)
# Overall ownership metrics
total_ownership = merged["Ownership"].sum()
avg_ownership = merged["Ownership"].mean()
max_ownership = merged["Ownership"].max()
min_ownership = merged["Ownership"].min()
# Leverage by player
player_leverage = []
for _, row in merged.iterrows():
leverage = self.calculate_leverage(1/len(merged), row["Ownership"])
player_leverage.append({
"Name": row["Name"],
"Ownership": row["Ownership"],
"Leverage": leverage
})
# Uniqueness score
uniqueness = 1 - self.duplication_probability(total_ownership / 100)
return {
"total_ownership": total_ownership,
"avg_ownership": avg_ownership,
"max_ownership": max_ownership,
"min_ownership": min_ownership,
"player_leverage": player_leverage,
"expected_duplicates": self.expected_duplicates(total_ownership / 100),
"uniqueness_score": uniqueness
}
def find_leverage_plays(
    self,
    players: pd.DataFrame,
    min_projection: float = 10,
    max_ownership: float = 0.15
) -> pd.DataFrame:
    """
    Select low-owned, well-projected players and rank them by leverage.

    Args:
        players: Table with "Projection", "Ownership" and "Salary" columns.
        min_projection: Lowest projection a player may have to qualify.
        max_ownership: Highest ownership a player may have to qualify.

    Returns:
        A copy of the qualifying rows with "Leverage_Score" and "Value"
        columns added, sorted by "Leverage_Score" descending.
    """
    projects_enough = players["Projection"] >= min_projection
    lightly_owned = players["Ownership"] <= max_ownership
    candidates = players.loc[projects_enough & lightly_owned].copy()

    # Floor ownership at 0.01 so a 0%-owned player cannot divide by zero.
    floored_ownership = candidates["Ownership"].clip(0.01)
    candidates["Leverage_Score"] = candidates["Projection"] / floored_ownership
    # Projected points per 1000 of salary.
    candidates["Value"] = candidates["Projection"] / candidates["Salary"] * 1000

    return candidates.sort_values("Leverage_Score", ascending=False)
def game_theory_exposure(
    self,
    players: pd.DataFrame,
    your_edge: float = 0.02
) -> pd.DataFrame:
    """
    Attach a simplified game-theory-optimal exposure to every player.

    Each player's ``self.optimal_exposure`` result is scaled by
    ``1 + your_edge * 10``. A missing "Ceiling" falls back to 1.5 times the
    projection.

    Args:
        players: Table with "Name", "Position", "Ownership" and
            "Projection" columns, optionally "Ceiling".
        your_edge: Perceived edge over the field.

    Returns:
        The Name/Position/Ownership/Projection columns plus "GTO_Exposure",
        taken from a copy so the caller's frame is not modified.
    """
    players = players.copy()

    def _scaled_exposure(row):
        ceiling = row.get("Ceiling", row["Projection"] * 1.5)
        base = self.optimal_exposure(row["Projection"], ceiling, row["Ownership"])
        # Larger edge -> deviate further from the baseline exposure.
        return base * (1 + your_edge * 10)

    players["GTO_Exposure"] = players.apply(_scaled_exposure, axis=1)
    output_columns = ["Name", "Position", "Ownership", "Projection", "GTO_Exposure"]
    return players[output_columns]
def simulate_tournament_roi(
    lineup_score: float,
    lineup_ownership: float,
    field_scores: np.ndarray,
    payout_structure: Dict[int, float]
) -> float:
    """
    Compute tournament ROI for one lineup score against a simulated field.

    The finish position is one more than the number of field scores that
    strictly beat the lineup, so a tie is resolved at the best tied
    position (no payout splitting is modelled). The payout is the
    multiplier of the smallest cutoff position that the finish falls
    within.

    Args:
        lineup_score: Your lineup's fantasy score.
        lineup_ownership: Estimated uniqueness factor; accepted for
            interface compatibility and not used by the calculation.
        field_scores: Simulated field scores (any array-like).
        payout_structure: Maps a cutoff finish position to the payout
            multiplier for finishing at or above it. Key order does not
            matter.

    Returns:
        Payout multiplier minus 1 (the entry fee); -1 when out of the money.
    """
    field_scores = np.asarray(field_scores)
    finish = int((field_scores > lineup_score).sum()) + 1

    payout = 0
    # Walk the cutoffs from best to worst so the tightest qualifying tier
    # wins regardless of the order the dict was built in.
    for cutoff in sorted(payout_structure):
        if finish <= cutoff:
            payout = payout_structure[cutoff]
            break

    return payout - 1  # ROI (subtract entry)
# Example usage
if __name__ == "__main__":
    leverage = OwnershipLeverage(field_size=10000)

    # Sample player pool, one tuple per player in column order.
    pool_columns = ["Name", "Position", "Salary", "Projection", "Ownership", "Ceiling"]
    pool_rows = [
        ("Star RB", "RB", 9000, 22.5, 0.35, 35),
        ("Popular WR", "WR", 7500, 17.5, 0.28, 28),
        ("Sneaky TE", "TE", 5000, 12.0, 0.08, 22),
        ("Chalk QB", "QB", 7800, 20.0, 0.42, 32),
        ("Value RB", "RB", 4500, 11.0, 0.05, 20),
    ]
    players = pd.DataFrame(pool_rows, columns=pool_columns)

    # Low-owned players that still project for at least 10 points.
    leverage_plays = leverage.find_leverage_plays(
        players, min_projection=10, max_ownership=0.15
    )
    print("Leverage Plays:")
    print(leverage_plays[["Name", "Projection", "Ownership", "Leverage_Score"]])

    # Exposure recommendation for every player in the pool.
    gto = leverage.game_theory_exposure(players)
    print("\nGTO Exposure:")
    print(gto)
Statistical Modeling in R
Build predictive models for sports using R statistical packages.
# Statistical Modeling for Sports Analytics in R
library(tidymodels)
library(dplyr)
library(ggplot2)
# =====================
# Data Preparation
# =====================
#' Prepare data for modeling
#'
#' Keeps only the outcome and predictor columns, then removes every row
#' with a missing value in any of them.
#'
#' @param data Data frame of raw observations.
#' @param outcome_var Name of the outcome column.
#' @param predictors Character vector of predictor column names.
#' @return `data` restricted to the requested columns, incomplete rows dropped.
prep_model_data <- function(data, outcome_var, predictors) {
  wanted_cols <- c(outcome_var, predictors)
  model_cols <- select(data, all_of(wanted_cols))
  drop_na(model_cols)
}
#' Create train/test split
#'
#' @param data Data frame to split.
#' @param prop Proportion of rows assigned to the training set.
#' @param strata Optional name of a column to stratify on; `NULL` for a
#'   plain random split.
#' @return The object returned by `initial_split()`.
create_splits <- function(data, prop = 0.8, strata = NULL) {
  # Unstratified split when no stratification column was requested.
  if (is.null(strata)) {
    return(initial_split(data, prop = prop))
  }
  initial_split(data, prop = prop, strata = all_of(strata))
}
# =====================
# Linear Models
# =====================
#' Fit linear regression model
#'
#' @param data Training data passed to `fit()`.
#' @param formula Model formula.
#' @return The fitted workflow.
fit_linear_model <- function(data, formula) {
  lm_spec <- set_engine(linear_reg(), "lm")
  wf <- add_formula(workflow(), formula)
  wf <- add_model(wf, lm_spec)
  fit(wf, data)
}
#' Fit ridge/lasso regression
#'
#' @param data Training data passed to `fit()`.
#' @param formula Model formula.
#' @param penalty Regularisation amount forwarded to `linear_reg()`.
#' @param mixture Penalty blend: 0 = ridge, 1 = lasso, values in between =
#'   elastic net.
#' @return The fitted workflow.
fit_regularized_model <- function(data, formula, penalty = 0.01, mixture = 0.5) {
  glmnet_spec <- set_engine(
    linear_reg(penalty = penalty, mixture = mixture),
    "glmnet"
  )
  wf <- add_formula(workflow(), formula)
  wf <- add_model(wf, glmnet_spec)
  fit(wf, data)
}
# =====================
# Classification Models
# =====================
#' Fit logistic regression (binary classification)
#'
#' @param data Training data passed to `fit()`.
#' @param formula Model formula.
#' @return The fitted workflow.
fit_logistic <- function(data, formula) {
  glm_spec <- set_engine(logistic_reg(), "glm")
  wf <- add_formula(workflow(), formula)
  wf <- add_model(wf, glm_spec)
  fit(wf, data)
}
#' Fit random forest classifier
#'
#' @param data Training data passed to `fit()`.
#' @param formula Model formula.
#' @param trees Number of trees forwarded to `rand_forest()`.
#' @param mtry Value forwarded to `rand_forest(mtry = )`; `NULL` by default.
#' @return The fitted workflow.
fit_rf_classifier <- function(data, formula, trees = 500, mtry = NULL) {
  forest <- rand_forest(trees = trees, mtry = mtry)
  # Impurity importance is requested from the ranger engine.
  forest <- set_engine(forest, "ranger", importance = "impurity")
  forest <- set_mode(forest, "classification")

  wf <- add_formula(workflow(), formula)
  wf <- add_model(wf, forest)
  fit(wf, data)
}
#' Fit XGBoost classifier
#'
#' @param data Training data passed to `fit()`.
#' @param formula Model formula.
#' @param trees Number of boosting rounds forwarded to `boost_tree()`.
#' @param tree_depth Tree depth forwarded to `boost_tree()`.
#' @param learn_rate Learning rate forwarded to `boost_tree()`.
#' @return The fitted workflow.
fit_xgb_classifier <- function(data, formula, trees = 100, tree_depth = 6,
                               learn_rate = 0.3) {
  booster <- boost_tree(
    trees = trees,
    tree_depth = tree_depth,
    learn_rate = learn_rate
  )
  booster <- set_mode(set_engine(booster, "xgboost"), "classification")

  wf <- add_formula(workflow(), formula)
  wf <- add_model(wf, booster)
  fit(wf, data)
}
# =====================
# Model Evaluation
# =====================
#' Evaluate regression model
#'
#' @param model Fitted model or workflow accepted by `predict()`.
#' @param test_data Hold-out data frame.
#' @param truth_col Name of the column holding the observed outcome.
#' @return List with `predictions` (predictions bound to `test_data`),
#'   `metrics` (the `metrics()` table), `rmse` and `r_squared`.
evaluate_regression <- function(model, test_data, truth_col) {
  scored <- bind_cols(predict(model, test_data), test_data)
  truth_sym <- sym(truth_col)
  perf <- metrics(scored, truth = !!truth_sym, estimate = .pred)

  # Pull one estimate out of the metrics table by its .metric label.
  estimate_for <- function(metric_label) {
    pull(filter(perf, .metric == metric_label), .estimate)
  }

  list(
    predictions = scored,
    metrics = perf,
    rmse = estimate_for("rmse"),
    r_squared = estimate_for("rsq")
  )
}
#' Evaluate classification model
#'
#' Computes accuracy, ROC AUC and a confusion matrix for a binary
#' classifier. The probability column used for the AUC is derived from the
#' factor level that `event_level` designates as the event, and the same
#' `event_level` is passed to `roc_auc()`. Previously `.pred_1` was scored
#' against yardstick's default event (the first level), which inverts the
#' AUC for outcomes coded `factor(levels = c(0, 1))`.
#'
#' @param model Fitted model or workflow accepted by `predict()`.
#' @param test_data Hold-out data frame; `truth_col` must be a two-level factor.
#' @param truth_col Name of the column holding the observed class.
#' @param event_level Which factor level is the event: "second" (default,
#'   e.g. "1" for levels 0/1) or "first".
#' @return List with `predictions`, `accuracy`, `auc` and `confusion`.
evaluate_classification <- function(model, test_data, truth_col,
                                    event_level = "second") {
  event_level <- match.arg(event_level, c("first", "second"))
  truth_sym <- sym(truth_col)

  # Probability columns from predict(type = "prob") are named .pred_<level>.
  truth_levels <- levels(test_data[[truth_col]])
  event_class <- if (event_level == "second") truth_levels[2] else truth_levels[1]
  prob_sym <- sym(paste0(".pred_", event_class))

  predictions <- predict(model, test_data, type = "prob") %>%
    bind_cols(predict(model, test_data)) %>%
    bind_cols(test_data)

  list(
    predictions = predictions,
    accuracy = predictions %>%
      accuracy(truth = !!truth_sym, estimate = .pred_class) %>%
      pull(.estimate),
    auc = predictions %>%
      roc_auc(truth = !!truth_sym, !!prob_sym, event_level = event_level) %>%
      pull(.estimate),
    confusion = predictions %>%
      conf_mat(truth = !!truth_sym, estimate = .pred_class)
  )
}
# =====================
# Cross-Validation
# =====================
#' Perform k-fold cross-validation
#'
#' @param data Data frame to resample.
#' @param formula Model formula.
#' @param model_spec Model specification added to the workflow.
#' @param folds Number of folds.
#' @return Collected RMSE, R-squared and MAE metrics across folds.
cross_validate <- function(data, formula, model_spec, folds = 10) {
  resamples <- vfold_cv(data, v = folds)

  wf <- add_formula(workflow(), formula)
  wf <- add_model(wf, model_spec)

  fold_fits <- fit_resamples(
    wf,
    resamples = resamples,
    metrics = metric_set(rmse, rsq, mae)
  )
  collect_metrics(fold_fits)
}
#' Tune hyperparameters
#'
#' Runs `tune_grid()` over cross-validation folds and picks the best
#' parameter set by `metric`.
#'
#' @param data Data frame to resample.
#' @param formula Model formula.
#' @param model_spec Model specification containing `tune()` placeholders.
#' @param grid_size Value passed to `tune_grid(grid = )`.
#' @param folds Number of cross-validation folds.
#' @param metric Metric name handed to `select_best()`. Defaults to "rmse";
#'   pass a classification metric (e.g. "roc_auc") for classifiers.
#' @return List with `results` (all collected metrics) and `best`
#'   (the selected parameter set).
tune_model <- function(data, formula, model_spec, grid_size = 20, folds = 5,
                       metric = "rmse") {
  cv_folds <- vfold_cv(data, v = folds)

  wf <- workflow() %>%
    add_formula(formula) %>%
    add_model(model_spec)

  tuned <- tune_grid(
    wf,
    resamples = cv_folds,
    grid = grid_size
  )

  list(
    results = collect_metrics(tuned),
    best = select_best(tuned, metric = metric)
  )
}
# =====================
# Example: Win Prediction Model
# =====================
#' Example: train and evaluate a random-forest win classifier
#'
#' @param team_data Data frame with `win` (0/1) plus the predictor columns
#'   selected below.
#' @return List with the fitted `model`, test `accuracy`, test `auc` and a
#'   `feature_importance` table from `vip::vi()`.
build_win_model <- function(team_data) {
  # Prepare data: outcome as a 0/1 factor, keep only the modelling columns.
  labelled <- mutate(team_data, win = factor(win, levels = c(0, 1)))
  model_data <- select(
    labelled,
    win, points_for, points_against, turnovers, yards, time_of_possession
  )

  # Stratified 80/20 split.
  splits <- initial_split(model_data, prop = 0.8, strata = win)
  train <- training(splits)
  test <- testing(splits)

  # Fit model.
  rf_spec <- rand_forest(trees = 500)
  rf_spec <- set_engine(rf_spec, "ranger", importance = "impurity")
  rf_spec <- set_mode(rf_spec, "classification")

  wf <- add_formula(workflow(), win ~ .)
  wf <- add_model(wf, rf_spec)
  model <- fit(wf, train)

  # Evaluate on the hold-out set.
  evaluation <- evaluate_classification(model, test, "win")
  importance <- vip::vi(extract_fit_parsnip(model))

  list(
    model = model,
    accuracy = evaluation$accuracy,
    auc = evaluation$auc,
    feature_importance = importance
  )
}
# Confirmation message printed once the definitions above have been evaluated.
print("Statistical modeling functions loaded")
Sports Visualization with ggplot2
Create professional sports visualizations using ggplot2.
# Professional sports visualizations with ggplot2
library(ggplot2)
library(dplyr)
library(scales)
library(patchwork) # For combining plots
# =====================
# Theme Setup
# =====================
#' Custom theme for sports analytics
#'
#' `theme_minimal()` with bold titles, muted subtitle/caption, no minor
#' grid lines and the legend at the bottom.
#'
#' @param base_size Base font size forwarded to `theme_minimal()`.
#' @return A ggplot2 theme object.
theme_sports <- function(base_size = 12) {
  overrides <- theme(
    plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0),
    plot.subtitle = element_text(color = "gray40", size = rel(0.9)),
    plot.caption = element_text(color = "gray60", size = rel(0.7)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "gray90"),
    axis.title = element_text(face = "bold", size = rel(0.9)),
    legend.position = "bottom",
    legend.title = element_text(face = "bold", size = rel(0.8))
  )
  theme_minimal(base_size = base_size) + overrides
}
# =====================
# Scatter Plots
# =====================
#' Create scatter plot with quadrants
#'
#' Points are coloured by which side of the x and y means they fall on, and
#' the `highlight_top` rows with the largest x * y product are labelled.
#'
#' @param data Data frame to plot.
#' @param x_var,y_var Names of the numeric columns for the axes.
#' @param label_var Name of the column used for point labels.
#' @param title Optional plot title.
#' @param highlight_top Number of rows to label.
#' @return A ggplot object.
plot_quadrant <- function(data, x_var, y_var, label_var,
                          title = NULL, highlight_top = 5) {
  x_mean <- mean(data[[x_var]], na.rm = TRUE)
  y_mean <- mean(data[[y_var]], na.rm = TRUE)

  quadrant_palette <- c(
    "Elite" = "#2ecc71", "Efficient" = "#3498db",
    "Volume" = "#f39c12", "Below Average" = "#95a5a6"
  )

  data <- mutate(
    data,
    quadrant = case_when(
      .data[[x_var]] >= x_mean & .data[[y_var]] >= y_mean ~ "Elite",
      .data[[x_var]] >= x_mean & .data[[y_var]] < y_mean ~ "Efficient",
      .data[[x_var]] < x_mean & .data[[y_var]] >= y_mean ~ "Volume",
      TRUE ~ "Below Average"
    ),
    # Rank on the negated product so the largest products rank first.
    highlight = rank(-.data[[x_var]] * .data[[y_var]]) <= highlight_top
  )
  labelled_rows <- filter(data, highlight)

  # Dashed reference lines at the two means.
  mean_guides <- list(
    geom_hline(yintercept = y_mean, linetype = "dashed", alpha = 0.5),
    geom_vline(xintercept = x_mean, linetype = "dashed", alpha = 0.5)
  )

  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    mean_guides +
    geom_point(aes(color = quadrant), size = 3, alpha = 0.7) +
    geom_text(
      data = labelled_rows,
      aes(label = .data[[label_var]]),
      vjust = -0.5, size = 3, check_overlap = TRUE
    ) +
    scale_color_manual(values = quadrant_palette) +
    labs(title = title, x = x_var, y = y_var, color = "Category") +
    theme_sports()
}
# =====================
# Bar Charts
# =====================
#' Create horizontal bar chart with team colors
#'
#' Bars are ordered by `stat_var` and labelled with the rounded value. The
#' bar layer is chosen up front so it is drawn exactly once and always sits
#' beneath the value labels (previously a second, team-coloured bar layer
#' was stacked on top of the default bars and the labels).
#'
#' @param data Data frame with one row per team.
#' @param stat_var Name of the numeric column to plot.
#' @param team_var Name of the team column.
#' @param title Optional plot title.
#' @param team_colors Optional named vector of fill colours keyed by team;
#'   when `NULL` every bar uses the default blue.
#' @return A ggplot object.
plot_team_bars <- function(data, stat_var, team_var, title = NULL,
                           team_colors = NULL) {
  use_team_fill <- !is.null(team_colors)

  bar_layer <- if (use_team_fill) {
    geom_col(aes(fill = .data[[team_var]]))
  } else {
    geom_col(fill = "#3498db", alpha = 0.8)
  }

  # reorder() puts the largest value at the top after coord_flip().
  p <- ggplot(data, aes(x = reorder(.data[[team_var]], .data[[stat_var]]),
                        y = .data[[stat_var]])) +
    bar_layer +
    geom_text(aes(label = round(.data[[stat_var]], 1)),
              hjust = -0.1, size = 3) +
    coord_flip() +
    labs(title = title, x = "", y = stat_var) +
    theme_sports() +
    theme(panel.grid.major.y = element_blank())

  if (use_team_fill) {
    # Team names are already on the axis, so the fill legend is redundant.
    p <- p +
      scale_fill_manual(values = team_colors) +
      guides(fill = "none")
  }
  p
}
# =====================
# Line Charts
# =====================
#' Create rolling average line chart
#'
#' @param data Data frame of observations.
#' @param date_var Name of the ordering column used on the x axis.
#' @param value_var Name of the numeric column to average.
#' @param window Width of the right-aligned rolling window.
#' @param title Optional plot title.
#' @param group_var Optional column name; when given, the average is
#'   computed and coloured per group.
#' @return A ggplot object.
plot_rolling_avg <- function(data, date_var, value_var, window = 10,
                             title = NULL, group_var = NULL) {
  by_group <- !is.null(group_var)

  series <- arrange(data, .data[[date_var]])
  if (by_group) {
    series <- group_by(series, .data[[group_var]])
  }
  # Right-aligned window: the first window - 1 rows of each series are NA.
  series <- mutate(
    series,
    rolling_avg = zoo::rollmean(.data[[value_var]], k = window,
                                fill = NA, align = "right")
  )
  if (by_group) {
    series <- ungroup(series)
  }

  line_layer <- if (by_group) {
    geom_line(aes(color = .data[[group_var]]), size = 1)
  } else {
    geom_line(color = "#3498db", size = 1)
  }

  ggplot(series, aes(x = .data[[date_var]], y = rolling_avg)) +
    line_layer +
    labs(
      title = title,
      subtitle = paste0(window, "-game rolling average"),
      x = "", y = value_var
    ) +
    theme_sports()
}
# =====================
# Distribution Plots
# =====================
#' Create density ridge plot
#'
#' @param data Data frame to plot.
#' @param value_var Name of the numeric column on the x axis.
#' @param group_var Name of the column defining one ridge per value.
#' @param title Optional plot title.
#' @return A ggplot object.
plot_density_ridges <- function(data, value_var, group_var, title = NULL) {
  # Attached here because only this function needs ggridges.
  library(ggridges)

  ridge_mapping <- aes(x = .data[[value_var]], y = .data[[group_var]],
                       fill = stat(x))
  ridges <- geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01)

  ggplot(data, ridge_mapping) +
    ridges +
    scale_fill_viridis_c(option = "C") +
    labs(title = title, x = value_var, y = "") +
    theme_sports() +
    theme(legend.position = "none")
}
#' Create box plot comparison
#'
#' Horizontal box plots ordered by group median, with the group mean
#' overlaid as a white diamond.
#'
#' @param data Data frame to plot.
#' @param value_var Name of the numeric column.
#' @param group_var Name of the grouping column.
#' @param title Optional plot title.
#' @return A ggplot object.
plot_box_comparison <- function(data, value_var, group_var, title = NULL) {
  box_mapping <- aes(
    x = reorder(.data[[group_var]], .data[[value_var]], median),
    y = .data[[value_var]],
    fill = .data[[group_var]]
  )
  mean_marker <- stat_summary(fun = mean, geom = "point", shape = 23, size = 3,
                              fill = "white")

  ggplot(data, box_mapping) +
    geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
    mean_marker +
    labs(title = title, x = "", y = value_var) +
    theme_sports() +
    theme(legend.position = "none") +
    coord_flip()
}
# =====================
# Heat Maps
# =====================
#' Create performance heat map
#'
#' Tiles are coloured on a red-white-green scale centred on the median of
#' `fill_var`.
#'
#' @param data Data frame to plot.
#' @param x_var,y_var Names of the columns defining the grid.
#' @param fill_var Name of the numeric column mapped to tile colour.
#' @param title Optional plot title.
#' @return A ggplot object.
plot_heatmap <- function(data, x_var, y_var, fill_var, title = NULL) {
  scale_centre <- median(data[[fill_var]], na.rm = TRUE)
  diverging_fill <- scale_fill_gradient2(
    low = "#e74c3c", mid = "white", high = "#2ecc71",
    midpoint = scale_centre
  )

  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]],
                   fill = .data[[fill_var]])) +
    geom_tile(color = "white", size = 0.5) +
    diverging_fill +
    labs(title = title, x = x_var, y = y_var, fill = fill_var) +
    theme_sports() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      panel.grid = element_blank()
    )
}
# =====================
# Composite Dashboards
# =====================
#' Create player comparison dashboard
#'
#' Left panel: offense vs defense quadrant plot for every player. Right
#' panel: the selected player's `stat1`-`stat3` values, one facet per stat.
#'
#' The stats are pivoted into explicit `stat`/`value` columns; the default
#' `names_to = "name"` would clash with the existing `name` column.
#' `pivot_longer()` is namespace-qualified because this script does not
#' attach tidyr.
#'
#' @param player_data Data frame with `name`, `offense`, `defense`,
#'   `stat1`, `stat2` and `stat3` columns.
#' @param player_name Value of `name` identifying the player to highlight.
#' @return A patchwork object combining both panels.
create_player_dashboard <- function(player_data, player_name) {
  p1 <- plot_quadrant(player_data, "offense", "defense", "name",
                      title = "Offense vs Defense")

  p2 <- player_data %>%
    filter(name == player_name) %>%
    select(name, stat1, stat2, stat3) %>%
    tidyr::pivot_longer(-name, names_to = "stat", values_to = "value") %>%
    ggplot(aes(x = stat, y = value, fill = stat)) +
    geom_col() +
    facet_wrap(~stat, scales = "free_y") +
    theme_sports()

  # Combine with patchwork
  p1 + p2 +
    plot_annotation(
      title = paste(player_name, "Performance Dashboard"),
      theme = theme_sports()
    )
}
# Confirmation message printed once the definitions above have been evaluated.
print("Sports visualization functions loaded")
Soccer Analysis with worldfootballR
European soccer data analysis using worldfootballR package.
# Soccer analysis with worldfootballR
library(worldfootballR)
library(dplyr)
library(ggplot2)
library(tidyr)
# =====================
# Data Loading
# =====================
#' Get player stats from FBref
#'
#' Thin wrapper around `fb_league_stats()` that always requests
#' player-level rows; the underlying function requires `team_or_player`
#' and it was previously omitted.
#'
#' @param country Country (e.g., "ENG" for England)
#' @param gender "M" or "F"
#' @param season_end_year End year of season
#' @param stat_type Type of stats
#' @return The data frame returned by `fb_league_stats()`.
get_player_stats <- function(country, gender = "M", season_end_year, stat_type = "standard") {
  fb_league_stats(
    country = country,
    gender = gender,
    season_end_year = season_end_year,
    stat_type = stat_type,
    team_or_player = "player"
  )
}
# Get Premier League player stats
# pl_standard <- get_player_stats("ENG", "M", 2024, "standard")
# pl_shooting <- get_player_stats("ENG", "M", 2024, "shooting")
# pl_passing <- get_player_stats("ENG", "M", 2024, "passing")
#' Get match results
#'
#' @param country Country code passed to `fb_match_results()`.
#' @param gender "M" or "F".
#' @param season_end_year End year of the season.
#' @return The data frame returned by `fb_match_results()`.
get_match_results <- function(country, gender = "M", season_end_year) {
  results <- fb_match_results(
    country = country,
    gender = gender,
    season_end_year = season_end_year
  )
  results
}
# =====================
# Player Analysis
# =====================
#' Calculate player per 90 stats
#'
#' Keeps players with at least 10 in `Mins_Per_90`, derives per-90 rates
#' and over/under-performance against xG/xA, and sorts by `xGA_per90`.
#' `GA_per90` and `xGA_per90` are now kept in the output; the sort column
#' used to be dropped by `select()` before `arrange()` could see it.
#'
#' @param player_stats Data frame with `Player`, `Squad`, `Pos`,
#'   `Mins_Per_90`, `Gls`, `Ast`, `G+A`, `xG`, `xA` and `xG+xA` columns.
#' @return One row per qualifying player, best `xGA_per90` first.
calc_per_90 <- function(player_stats) {
  player_stats %>%
    filter(Mins_Per_90 >= 10) %>%  # At least 10 90s played
    mutate(
      Goals_per90 = Gls / Mins_Per_90,
      Assists_per90 = Ast / Mins_Per_90,
      GA_per90 = `G+A` / Mins_Per_90,
      xG_per90 = xG / Mins_Per_90,
      xA_per90 = xA / Mins_Per_90,
      xGA_per90 = `xG+xA` / Mins_Per_90,
      GminusxG = Gls - xG,  # Overperformance
      AminusxA = Ast - xA
    ) %>%
    select(Player, Squad, Pos, Mins_Per_90, Goals_per90, Assists_per90,
           GA_per90, xG_per90, xA_per90, xGA_per90, GminusxG, AminusxA) %>%
    arrange(desc(xGA_per90))
}
#' Find similar players based on stats
#'
#' Standardises every numeric column (except those whose name matches
#' "Mins" or "90") and ranks the other players by Euclidean distance to the
#' target.
#'
#' Compared with the earlier version: a player listed more than once is
#' represented by their first row, columns that cannot be standardised
#' (all NA/NaN after scaling, e.g. zero variance) are ignored instead of
#' turning every distance into NaN, the target row is excluded by index
#' rather than by assuming it sorts first, and `n` is capped at the number
#' of available players.
#'
#' @param player_stats Data frame with a `Player` column and numeric stats.
#' @param target_player Name to look up in `Player`.
#' @param n Maximum number of similar players to return.
#' @return Tibble with `Player` and `Similarity` (1 = closest possible,
#'   0 = the least similar of the returned players).
find_similar_players <- function(player_stats, target_player, n = 10) {
  # Find target player
  target_idx <- which(player_stats$Player == target_player)
  if (length(target_idx) == 0) {
    stop("Player not found")
  }
  target_idx <- target_idx[1]

  # Select numeric columns for comparison
  numeric_cols <- player_stats %>%
    select(where(is.numeric)) %>%
    select(-matches("Mins|90"))

  # Scale the data and drop columns that scaling could not standardise
  scaled <- scale(numeric_cols)
  usable <- colSums(!is.na(scaled)) > 0
  scaled <- scaled[, usable, drop = FALSE]
  row.names(scaled) <- player_stats$Player

  # Euclidean distance of every row to the target row
  offsets <- sweep(scaled, 2, scaled[target_idx, ])
  distances <- sqrt(rowSums(offsets^2))

  # Closest other players; sort() drops rows whose distance is NA
  similar <- head(sort(distances[-target_idx]), n)
  if (length(similar) == 0) {
    return(tibble(Player = character(), Similarity = numeric()))
  }

  tibble(
    Player = names(similar),
    Similarity = 1 - (similar / max(similar))
  )
}
# =====================
# Team Analysis
# =====================
#' Calculate team expected points
#'
#' Heuristic expected points from the xG difference of each match. When
#' the xG gap exceeds 0.5 in either direction the home side receives
#' 3 * P(N(diff, 1) > 0); closer matches are treated as draw-like. The
#' "home team worse" branch previously used the complementary probability,
#' which handed the home side more than 2 points for being out-created.
#'
#' @param matches Data frame with `Home_xG` and `Away_xG` columns.
#' @return `matches` with `Home_xPts` and `Away_xPts` added.
calc_expected_points <- function(matches) {
  matches %>%
    mutate(
      Home_xPts = case_when(
        # Clear gap either way: win probability under a unit-variance normal
        abs(Home_xG - Away_xG) > 0.5 ~
          3 * (1 - pnorm(0, Home_xG - Away_xG, 1)),
        # Near-even match: one point plus a bonus that peaks at equal xG
        TRUE ~ 1 + 2 * dnorm(0, Home_xG - Away_xG, 1)
      ),
      Away_xPts = 3 - Home_xPts  # Simplified
    )
}
#' Create league table
#'
#' Tallies each team's home and away records separately, adds them
#' together, then derives goal difference and points (3 per win, 1 per
#' draw).
#'
#' @param matches Data frame with `Home`, `Away`, `HomeGoals` and
#'   `AwayGoals` columns.
#' @return One row per team sorted by points, goal difference, goals for.
create_league_table <- function(matches) {
  # Record of every team from one side of the fixture list.
  side_record <- function(team, scored, conceded) {
    matches %>%
      group_by(Team = {{ team }}) %>%
      summarise(
        P = n(),
        W = sum({{ scored }} > {{ conceded }}),
        D = sum({{ scored }} == {{ conceded }}),
        L = sum({{ scored }} < {{ conceded }}),
        GF = sum({{ scored }}),
        GA = sum({{ conceded }}),
        .groups = "drop"
      )
  }

  home <- side_record(Home, HomeGoals, AwayGoals)
  away <- side_record(Away, AwayGoals, HomeGoals)

  combined <- bind_rows(home, away) %>%
    group_by(Team) %>%
    summarise(across(everything(), sum))

  combined %>%
    mutate(
      GD = GF - GA,
      Pts = W * 3 + D
    ) %>%
    arrange(desc(Pts), desc(GD), desc(GF))
}
# =====================
# xG Analysis
# =====================
#' Plot xG timeline for a match
#'
#' @param match_shots Data frame with `minute`, `team` and `xG` columns.
#' @return A ggplot step chart of cumulative xG per team, with one point
#'   per shot sized by its xG.
plot_xg_timeline <- function(match_shots) {
  chronological <- arrange(match_shots, minute)
  running <- chronological %>%
    group_by(team) %>%
    mutate(cumulative_xG = cumsum(xG)) %>%
    ungroup()

  quarter_hour_breaks <- seq(0, 90, 15)

  ggplot(running, aes(x = minute, y = cumulative_xG, color = team)) +
    geom_step(size = 1.2) +
    geom_point(aes(size = xG), alpha = 0.6) +
    scale_x_continuous(breaks = quarter_hour_breaks) +
    labs(
      title = "Match xG Timeline",
      x = "Minute",
      y = "Cumulative xG"
    ) +
    theme_minimal()
}
#' Calculate shot quality distribution
#'
#' Buckets shots by xG and reports shots, goals and conversion rate per
#' team and bucket. `include.lowest = TRUE` places shots with an xG of
#' exactly 0 in the "Very Low" bucket instead of an NA bucket.
#'
#' @param shots Data frame with `team`, `xG` and `is_goal` columns.
#' @return One row per team and xG bucket.
analyze_shot_quality <- function(shots) {
  shots %>%
    mutate(
      xG_bucket = cut(xG,
        breaks = c(0, 0.05, 0.1, 0.2, 0.4, 1),
        labels = c("Very Low", "Low", "Medium", "High", "Very High"),
        include.lowest = TRUE
      )
    ) %>%
    group_by(team, xG_bucket) %>%
    summarise(
      Shots = n(),
      Goals = sum(is_goal, na.rm = TRUE),
      Conversion = Goals / Shots,
      .groups = "drop"
    )
}
# =====================
# Visualization
# =====================
#' Create player radar chart
#'
#' Returns the data behind a radar chart: each metric expressed as the
#' player's percentile (0-100) among all rows of `player_stats`. Drawing
#' the chart itself would need a dedicated package such as ggradar.
#'
#' Fails with a clear message when the player is missing, and uses the
#' first row when a player appears more than once.
#'
#' @param player_stats Data frame with a `Player` column and the metrics.
#' @param player_name Name to look up in `Player`.
#' @param metrics Character vector of metric column names.
#' @return Tibble with `Metric` and `Value` (percentile).
create_radar_chart <- function(player_stats, player_name, metrics) {
  player_row <- which(player_stats$Player == player_name)
  if (length(player_row) == 0) {
    stop("Player not found: ", player_name)
  }

  # Normalize metrics to 0-100 scale
  normalized <- player_stats %>%
    select(all_of(metrics)) %>%
    mutate(across(everything(), ~ percent_rank(.) * 100))

  player_normalized <- normalized[player_row[1], , drop = FALSE]

  tibble(
    Metric = metrics,
    Value = as.numeric(unlist(player_normalized, use.names = FALSE))
  )
}
# Confirmation message printed once the definitions above have been evaluated.
print("worldfootballR soccer analysis functions loaded")
NFL Analysis with nflfastR
Comprehensive NFL play-by-play analysis using nflfastR package.
# NFL analysis with nflfastR
library(nflfastR)
library(dplyr)
library(ggplot2)
library(tidyr)
# =====================
# Loading Data
# =====================
#' Load NFL play-by-play data
#'
#' @param seasons Vector of seasons to load
#' @return Play-by-play rows from `load_pbp()` whose `season_type` is "REG".
load_pbp_data <- function(seasons) {
  all_plays <- load_pbp(seasons)
  # Regular season only
  filter(all_plays, season_type == "REG")
}
#' Load roster data
#'
#' Delegates to `nflreadr::load_rosters()`. The call is namespace-qualified
#' because this wrapper has the same name as the package function; an
#' unqualified call would resolve to this wrapper and recurse forever.
#'
#' @param seasons Vector of seasons to load
#' @return The roster data frame returned by `nflreadr::load_rosters()`.
load_rosters <- function(seasons) {
  nflreadr::load_rosters(seasons)
}
# Load recent seasons
# pbp <- load_pbp_data(2020:2024)
# rosters <- load_rosters(2024)
# =====================
# EPA Analysis
# =====================
#' Calculate QB EPA metrics
#'
#' @param pbp Play-by-play data frame with `epa`, `passer_id`, `passer`,
#'   `game_id`, `cpoe`, `success`, `complete_pass`, `air_epa`, `yac_epa`.
#' @return One row per passer with at least 200 qualifying plays, sorted by
#'   EPA per play (best first).
calc_qb_epa <- function(pbp) {
  dropbacks <- filter(pbp, !is.na(epa), !is.na(passer_id))
  by_passer <- group_by(dropbacks, passer_id, passer)

  passer_totals <- summarise(
    by_passer,
    Games = n_distinct(game_id),
    Dropbacks = n(),
    EPA_Total = sum(epa),
    EPA_per_Play = mean(epa),
    CPOE = mean(cpoe, na.rm = TRUE),
    Success_Rate = mean(success, na.rm = TRUE),
    Comp_Pct = mean(complete_pass, na.rm = TRUE),
    Air_EPA = mean(air_epa, na.rm = TRUE),
    YAC_EPA = mean(yac_epa, na.rm = TRUE),
    .groups = "drop"
  )

  # Volume threshold removes small-sample passers.
  qualified <- filter(passer_totals, Dropbacks >= 200)
  arrange(qualified, desc(EPA_per_Play))
}
#' Calculate rushing EPA
#'
#' @param pbp Play-by-play data frame with `epa`, `rush`, `rusher_id`,
#'   `rusher`, `rushing_yards`, `rush_touchdown` and `success`.
#' @return One row per rusher with at least 100 carries, sorted by EPA per
#'   carry (best first).
calc_rush_epa <- function(pbp) {
  carries <- filter(pbp, !is.na(epa), rush == 1, !is.na(rusher_id))
  by_rusher <- group_by(carries, rusher_id, rusher)

  rusher_totals <- summarise(
    by_rusher,
    Carries = n(),
    Yards = sum(rushing_yards, na.rm = TRUE),
    TDs = sum(rush_touchdown, na.rm = TRUE),
    EPA_Total = sum(epa),
    EPA_per_Carry = mean(epa),
    Success_Rate = mean(success, na.rm = TRUE),
    YPC = mean(rushing_yards, na.rm = TRUE),
    .groups = "drop"
  )

  qualified <- filter(rusher_totals, Carries >= 100)
  arrange(qualified, desc(EPA_per_Carry))
}
#' Calculate receiver EPA
#'
#' Aggregates every play with a recorded receiver, complete or not, so
#' `Targets` counts all targets, and `EPA_per_Target` and `ADOT` are
#' averaged over them. The earlier filter on `complete_pass == 1` made
#' Targets identical to Receptions and biased the per-target figures.
#'
#' @param pbp Play-by-play data frame with `epa`, `receiver_id`,
#'   `receiver`, `complete_pass`, `receiving_yards`, `pass_touchdown`,
#'   `yards_after_catch` and `air_yards`.
#' @return One row per receiver with at least 50 targets, sorted by total
#'   EPA (best first).
calc_receiver_epa <- function(pbp) {
  pbp %>%
    filter(!is.na(epa), !is.na(receiver_id)) %>%
    group_by(receiver_id, receiver) %>%
    summarise(
      Targets = n(),
      Receptions = sum(complete_pass, na.rm = TRUE),
      Yards = sum(receiving_yards, na.rm = TRUE),
      TDs = sum(pass_touchdown, na.rm = TRUE),
      EPA_Total = sum(epa),
      EPA_per_Target = mean(epa),
      YAC = sum(yards_after_catch, na.rm = TRUE),
      ADOT = mean(air_yards, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    filter(Targets >= 50) %>%
    arrange(desc(EPA_Total))
}
# =====================
# Team Analysis
# =====================
#' Calculate team offensive efficiency
#'
#' @param pbp Play-by-play data frame with `epa`, `posteam`, `pass`, `rush`
#'   and `success`.
#' @return One row per possession team, sorted by EPA per play (best first).
calc_team_offense <- function(pbp) {
  offensive_plays <- filter(pbp, !is.na(epa), !is.na(posteam))
  by_offense <- group_by(offensive_plays, posteam)

  team_totals <- summarise(
    by_offense,
    Plays = n(),
    Pass_Plays = sum(pass, na.rm = TRUE),
    Rush_Plays = sum(rush, na.rm = TRUE),
    Pass_Rate = Pass_Plays / Plays,
    EPA_Total = sum(epa),
    EPA_per_Play = mean(epa),
    Pass_EPA = mean(epa[pass == 1], na.rm = TRUE),
    Rush_EPA = mean(epa[rush == 1], na.rm = TRUE),
    Success_Rate = mean(success, na.rm = TRUE),
    .groups = "drop"
  )

  arrange(team_totals, desc(EPA_per_Play))
}
#' Calculate team defensive efficiency
#'
#' @param pbp Play-by-play data frame with `epa`, `defteam`, `pass`, `rush`
#'   and `success`.
#' @return One row per defending team, sorted ascending by EPA allowed per
#'   play (lower is better).
calc_team_defense <- function(pbp) {
  defended_plays <- filter(pbp, !is.na(epa), !is.na(defteam))
  by_defense <- group_by(defended_plays, defteam)

  team_totals <- summarise(
    by_defense,
    Plays = n(),
    EPA_Allowed = sum(epa),
    EPA_per_Play_Allowed = mean(epa),
    Pass_EPA_Allowed = mean(epa[pass == 1], na.rm = TRUE),
    Rush_EPA_Allowed = mean(epa[rush == 1], na.rm = TRUE),
    Success_Rate_Allowed = mean(success, na.rm = TRUE),
    .groups = "drop"
  )

  arrange(team_totals, EPA_per_Play_Allowed)
}
# =====================
# Situational Analysis
# =====================
#' Analyze performance by down and distance
#'
#' @param pbp Play-by-play data frame with `epa`, `down`, `ydstogo`,
#'   `posteam`, `pass` and `success`.
#' @param team Optional possession-team value to restrict the analysis to.
#' @return One row per down and yards-to-go bucket with play count, pass
#'   rate, EPA per play and success rate.
analyze_situations <- function(pbp, team = NULL) {
  plays <- filter(pbp, !is.na(epa), !is.na(down))
  if (!is.null(team)) {
    plays <- filter(plays, posteam == team)
  }

  bucketed <- mutate(
    plays,
    ydstogo_bucket = case_when(
      ydstogo <= 3 ~ "Short (1-3)",
      ydstogo <= 7 ~ "Medium (4-7)",
      ydstogo <= 10 ~ "Long (8-10)",
      TRUE ~ "Very Long (11+)"
    )
  )

  summarise(
    group_by(bucketed, down, ydstogo_bucket),
    Plays = n(),
    Pass_Rate = mean(pass, na.rm = TRUE),
    EPA_per_Play = mean(epa),
    Success_Rate = mean(success, na.rm = TRUE),
    .groups = "drop"
  )
}
# =====================
# Visualization
# =====================
#' Plot EPA by team
#'
#' Scatter of offensive EPA/play against negated defensive EPA/play
#' allowed, so better teams sit towards the upper right.
#'
#' @param offense_stats Data frame with `posteam` and `EPA_per_Play`.
#' @param defense_stats Data frame with `defteam` and
#'   `EPA_per_Play_Allowed`.
#' @return A ggplot object.
plot_team_epa <- function(offense_stats, defense_stats) {
  offense <- select(offense_stats, team = posteam, Off_EPA = EPA_per_Play)
  defense <- select(defense_stats, team = defteam, Def_EPA = EPA_per_Play_Allowed)
  combined <- left_join(offense, defense, by = "team")

  # Dashed axes through zero EPA.
  zero_guides <- list(
    geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5),
    geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5)
  )

  ggplot(combined, aes(x = Off_EPA, y = -Def_EPA)) +
    geom_point(size = 3) +
    geom_text(aes(label = team), vjust = -0.5, size = 3) +
    zero_guides +
    labs(
      title = "Team Efficiency",
      x = "Offensive EPA/Play",
      y = "Defensive EPA/Play (inverted)"
    ) +
    theme_minimal()
}
#' Create passing chart
#'
#' @param pbp Play-by-play data frame with `passer`, `air_yards` and
#'   `complete_pass`.
#' @param passer_name Value of `passer` to plot.
#' @return A ggplot histogram of air yards (5-yard bins), split by
#'   completion.
plot_passing_chart <- function(pbp, passer_name) {
  passes <- filter(pbp, passer == passer_name, !is.na(air_yards))

  completion_fill <- scale_fill_manual(
    values = c("0" = "red", "1" = "green"),
    labels = c("Incomplete", "Complete")
  )

  ggplot(passes, aes(x = air_yards, fill = factor(complete_pass))) +
    geom_histogram(binwidth = 5, position = "dodge", alpha = 0.7) +
    completion_fill +
    labs(
      title = paste(passer_name, "Air Yards Distribution"),
      x = "Air Yards", y = "Count", fill = ""
    ) +
    theme_minimal()
}
# Confirmation message printed once the definitions above have been evaluated.
print("nflfastR NFL analysis functions loaded")
Basketball Analysis with hoopR
NBA data analysis using hoopR package for player and team statistics.
# NBA analysis with hoopR package
library(hoopR)
library(dplyr)
library(ggplot2)
library(tidyr)
# =====================
# Loading NBA Data
# =====================
#' Get NBA player box scores
#'
#' @param season NBA season (e.g., 2024 for 2023-24)
#' @return The data frame returned by `load_nba_player_box()`.
get_player_box <- function(season) {
  player_box <- load_nba_player_box(seasons = season)
  player_box
}
#' Get team box scores
#'
#' @param season Season value forwarded to `load_nba_team_box()`.
#' @return The data frame returned by `load_nba_team_box()`.
get_team_box <- function(season) {
  team_box <- load_nba_team_box(seasons = season)
  team_box
}
#' Get play-by-play data
#'
#' @param game_id Game identifier forwarded to `espn_nba_pbp()`.
#' @return Whatever `espn_nba_pbp()` returns for that game.
get_pbp <- function(game_id) {
  plays <- espn_nba_pbp(game_id)
  plays
}
# Load 2024 season data
# player_stats <- get_player_box(2024)
# team_stats <- get_team_box(2024)
# =====================
# Player Analysis
# =====================
#' Calculate advanced stats for players
#'
#' Per-game averages plus shooting percentages computed from season
#' totals, for players with at least 20 rows (games).
#'
#' @param player_box Player box-score data frame, one row per player-game.
#' @return One row per player sorted by points per game (highest first).
calc_advanced_stats <- function(player_box) {
  by_player <- group_by(player_box, athlete_id, athlete_display_name)

  season_lines <- summarise(
    by_player,
    Games = n(),
    MPG = mean(minutes, na.rm = TRUE),
    PPG = mean(points, na.rm = TRUE),
    RPG = mean(rebounds, na.rm = TRUE),
    APG = mean(assists, na.rm = TRUE),
    FG_Pct = sum(field_goals_made) / sum(field_goals_attempted),
    Three_Pct = sum(three_point_field_goals_made) / sum(three_point_field_goals_attempted),
    FT_Pct = sum(free_throws_made) / sum(free_throws_attempted),
    # True shooting: points per two "true" attempts (FGA + 0.44 * FTA).
    TS_Pct = sum(points) / (2 * (sum(field_goals_attempted) + 0.44 * sum(free_throws_attempted))),
    .groups = "drop"
  )

  regulars <- filter(season_lines, Games >= 20)
  arrange(regulars, desc(PPG))
}
#' Calculate usage rate
#'
#' Joins each team's game totals onto the player rows and computes
#' USG% = 100 * (FGA + 0.44 * FTA + TOV) * (team minutes / 5) /
#'        (minutes * (team FGA + 0.44 * team FTA + team TOV)).
#'
#' `team_minutes` was referenced but never created. It is now derived as
#' the sum of player minutes within each game and team, unless
#' `player_box` already supplies a `team_minutes` column.
#'
#' @param player_box Player box scores with `game_id`, `team_id`,
#'   `minutes`, `field_goals_attempted`, `free_throws_attempted` and
#'   `turnovers`.
#' @param team_box Team box scores with `game_id`, `team_id`,
#'   `field_goals_attempted`, `free_throws_attempted` and `turnovers`.
#' @return `player_box` with the team totals, `team_minutes` and `USG`
#'   added.
calc_usage <- function(player_box, team_box) {
  team_totals <- team_box %>%
    select(game_id, team_id, team_fga = field_goals_attempted,
           team_fta = free_throws_attempted, team_tov = turnovers)

  joined <- left_join(player_box, team_totals, by = c("game_id", "team_id"))

  if (!"team_minutes" %in% names(joined)) {
    # Total minutes played by the team in that game.
    joined <- joined %>%
      group_by(game_id, team_id) %>%
      mutate(team_minutes = sum(minutes, na.rm = TRUE)) %>%
      ungroup()
  }

  joined %>%
    mutate(
      USG = 100 * ((field_goals_attempted + 0.44 * free_throws_attempted + turnovers) *
        (team_minutes / 5)) /
        (minutes * (team_fga + 0.44 * team_fta + team_tov))
    )
}
# =====================
# Team Analysis
# =====================
#' Calculate team efficiency metrics
#'
#' @param team_box Team box scores with `team_id`, `team_display_name`,
#'   `team_winner`, `team_score`, `opponent_team_score`, `minutes` and the
#'   shooting/rebound/turnover columns used below.
#' @return One row per team with games, wins, scoring averages,
#'   `Net_Rating` (PPG minus opponent PPG), `Pace` and `Win_Pct`, sorted by
#'   `Net_Rating` descending.
calc_team_efficiency <- function(team_box) {
  by_team <- group_by(team_box, team_id, team_display_name)

  team_lines <- summarise(
    by_team,
    Games = n(),
    Wins = sum(team_winner, na.rm = TRUE),
    PPG = mean(team_score, na.rm = TRUE),
    OppPPG = mean(opponent_team_score, na.rm = TRUE),
    Net_Rating = PPG - OppPPG,
    # Estimated possessions per game row, scaled to 48 minutes.
    Pace = mean((field_goals_attempted + 0.44 * free_throws_attempted -
      offensive_rebounds + turnovers) * 48 / minutes, na.rm = TRUE),
    .groups = "drop"
  )

  with_win_pct <- mutate(team_lines, Win_Pct = Wins / Games)
  arrange(with_win_pct, desc(Net_Rating))
}
#' Four Factors analysis
#'
#' @param team_box Team box scores with `team_display_name` and the
#'   shooting, turnover, rebound and free-throw columns used below.
#' @return One row per team with `eFG`, `TOV_Rate`, `ORB_Rate` and
#'   `FT_Rate`, each computed from summed totals.
calc_four_factors <- function(team_box) {
  by_team <- group_by(team_box, team_display_name)

  summarise(
    by_team,
    # Effective FG%: made threes count as 1.5 field goals.
    eFG = (sum(field_goals_made) + 0.5 * sum(three_point_field_goals_made)) /
      sum(field_goals_attempted),
    # Turnovers per estimated possession-ending play.
    TOV_Rate = sum(turnovers) / (sum(field_goals_attempted) +
      0.44 * sum(free_throws_attempted) + sum(turnovers)),
    # Share of available offensive rebounds collected.
    ORB_Rate = sum(offensive_rebounds) /
      (sum(offensive_rebounds) + sum(opponent_defensive_rebounds)),
    # Free throws made per field-goal attempt.
    FT_Rate = sum(free_throws_made) / sum(field_goals_attempted),
    .groups = "drop"
  )
}
# =====================
# Shot Chart
# =====================
#' Create shot chart from play-by-play
#'
#' @param pbp_data Play-by-play data frame with `shooting_play`,
#'   `coordinate_x`, `coordinate_y`, `scoring_play` and `text`.
#' @param player_name Optional pattern; when given, only plays whose `text`
#'   matches it (case-insensitively) are kept.
#' @return A ggplot scatter of shot locations coloured by `scoring_play`.
create_shot_chart <- function(pbp_data, player_name = NULL) {
  shots <- filter(
    pbp_data,
    shooting_play == TRUE,
    !is.na(coordinate_x),
    !is.na(coordinate_y)
  )
  if (!is.null(player_name)) {
    shots <- filter(shots, grepl(player_name, text, ignore.case = TRUE))
  }

  made_colors <- c("TRUE" = "green", "FALSE" = "red")

  # Court outline would be added as an extra layer before the points.
  ggplot(shots, aes(x = coordinate_x, y = coordinate_y)) +
    geom_point(aes(color = scoring_play), alpha = 0.6, size = 2) +
    scale_color_manual(values = made_colors) +
    coord_fixed() +
    labs(title = "Shot Chart", color = "Made") +
    theme_minimal()
}
# =====================
# Win Probability
# =====================
#' Calculate win probability from game state
#'
#' Simplified logistic model (a real model would be trained on historical
#' data): the lead, plus two points per unit of `possession`, is scaled by
#' the square root of the minutes remaining.
#'
#' Time remaining is floored at one second so that a finished game (0) or
#' a negative clock value yields a finite probability instead of NaN.
#'
#' @param score_diff Score margin from the perspective being evaluated.
#' @param time_remaining_sec Seconds left in the game.
#' @param possession Possession adjustment multiplier (default 0).
#' @return Win probability between 0 and 1 (vectorised over the inputs).
calc_win_prob <- function(score_diff, time_remaining_sec, possession = 0) {
  minutes_left <- pmax(time_remaining_sec, 1) / 60
  z <- (score_diff + possession * 2) / sqrt(minutes_left)
  1 / (1 + exp(-0.15 * z))
}
#' Add win probability to play-by-play
#'
#' Remaining time counts 720 seconds for every regulation period still to
#' be played plus the current clock. The number of periods left is floored
#' at zero so overtime rows (period > 4) no longer produce a negative time
#' remaining.
#'
#' @param pbp_data Play-by-play data frame with `home_score`,
#'   `away_score`, `period`, `clock_minutes` and `clock_seconds`.
#' @return `pbp_data` with `score_diff`, `time_remaining` and
#'   `home_win_prob` added.
add_win_prob <- function(pbp_data) {
  pbp_data %>%
    mutate(
      score_diff = home_score - away_score,
      time_remaining = pmax(4 - period, 0) * 720 + clock_minutes * 60 + clock_seconds,
      home_win_prob = calc_win_prob(score_diff, time_remaining)
    )
}
# Confirmation message printed once the definitions above have been evaluated.
print("hoopR NBA analysis functions loaded")
Baseball Stats with baseballr
Fetch and analyze baseball statistics using the baseballr package.
# Baseball analysis with baseballr package
library(baseballr)
library(dplyr)
library(ggplot2)
# =====================
# Fetching Player Stats
# =====================
#' Get batting stats for a season
#'
#' @param year Season year
#' @param qual Minimum PA qualifier (default 100)
#' @return Selected batting columns from `fg_batter_leaders()`, with `wRC+`
#'   renamed to `wRC_plus`, sorted by WAR (highest first).
get_batting_stats <- function(year, qual = 100) {
  # Same year for both positional arguments, i.e. a single season.
  leaders <- fg_batter_leaders(year, year, qual = qual)

  trimmed <- select(
    leaders,
    Name, Team, G, PA, AB, H, HR, RBI, BB, SO,
    AVG, OBP, SLG, wOBA, wRC_plus = `wRC+`, WAR
  )
  arrange(trimmed, desc(WAR))
}
# Get 2024 batting leaders
# Runs whenever this script is evaluated: calls get_batting_stats(2024)
# and prints the first 20 rows of the result.
batting_2024 <- get_batting_stats(2024)
print(head(batting_2024, 20))
# =====================
# Statcast Data
# =====================
#' Get Statcast data for a date range
#'
#' @param start_date First date of the range, forwarded to
#'   `statcast_search()`.
#' @param end_date Last date of the range, forwarded to `statcast_search()`.
#' @return The result of `statcast_search()` with `player_type = "batter"`.
get_statcast <- function(start_date, end_date) {
  pitches <- statcast_search(
    start_date = start_date,
    end_date = end_date,
    player_type = "batter"
  )
  pitches
}
# Get recent Statcast data
# statcast_data <- get_statcast("2024-06-01", "2024-06-07")
#' Calculate Statcast metrics for a player
#'
#' @param player_data Statcast rows with `launch_speed`, `launch_angle` and
#'   `barrel` columns.
#' @return One-row summary over rows with a recorded launch speed: count,
#'   average/max exit velocity, average launch angle, and barrel, hard-hit
#'   (>= 95) and sweet-spot (launch angle 8-32) percentages.
calc_statcast_metrics <- function(player_data) {
  tracked <- filter(player_data, !is.na(launch_speed))

  summarise(
    tracked,
    Batted_Balls = n(),
    Avg_EV = mean(launch_speed, na.rm = TRUE),
    Max_EV = max(launch_speed, na.rm = TRUE),
    Avg_LA = mean(launch_angle, na.rm = TRUE),
    Barrel_Pct = mean(barrel == 1, na.rm = TRUE) * 100,
    HardHit_Pct = mean(launch_speed >= 95, na.rm = TRUE) * 100,
    Sweet_Spot_Pct = mean(launch_angle >= 8 & launch_angle <= 32, na.rm = TRUE) * 100
  )
}
# =====================
# Pitching Analysis
# =====================
#' Get pitching stats
#'
#' @param year Season year (used as both positional arguments).
#' @param qual Qualifier forwarded to `fg_pitcher_leaders()`.
#' @return Selected pitching columns with `K/9` and `BB/9` renamed to `K9`
#'   and `BB9`, sorted by WAR (highest first).
get_pitching_stats <- function(year, qual = 50) {
  leaders <- fg_pitcher_leaders(year, year, qual = qual)

  trimmed <- select(
    leaders,
    Name, Team, G, GS, IP, W, L, ERA, WHIP, K9 = `K/9`,
    BB9 = `BB/9`, FIP, xFIP, WAR
  )
  arrange(trimmed, desc(WAR))
}
#' Calculate pitch mix for a pitcher
#'
#' @param statcast_data Statcast rows with `player_name`, `pitch_type`,
#'   `release_speed`, `release_spin_rate` and `description`.
#' @param pitcher_name Value of `player_name` to keep.
#' @return One row per pitch type with count, average velocity and spin,
#'   share of pitches that were swinging strikes, and usage percentage,
#'   sorted by usage (highest first).
calc_pitch_mix <- function(statcast_data, pitcher_name) {
  thrown <- filter(statcast_data, player_name == pitcher_name)

  per_pitch_type <- summarise(
    group_by(thrown, pitch_type),
    Count = n(),
    Avg_Velo = mean(release_speed, na.rm = TRUE),
    Avg_Spin = mean(release_spin_rate, na.rm = TRUE),
    Whiff_Pct = mean(description %in% c("swinging_strike", "swinging_strike_blocked"), na.rm = TRUE) * 100
  )

  with_usage <- mutate(per_pitch_type, Usage_Pct = Count / sum(Count) * 100)
  arrange(with_usage, desc(Usage_Pct))
}
# =====================
# Visualization
# =====================
#' Create spray chart
#'
#' @param batted_balls Statcast rows with `hc_x`, `hc_y`, `events` and
#'   `player_name`.
#' @param player_name Optional value of `player_name` to restrict to.
#' @return A ggplot scatter of hit coordinates, shifted by 125 horizontally
#'   and flipped around 200 vertically, coloured by event.
create_spray_chart <- function(batted_balls, player_name = NULL) {
  if (!is.null(player_name)) {
    # !! forces the argument; the bare name refers to the data column.
    batted_balls <- filter(batted_balls, player_name == !!player_name)
  }

  event_colors <- c(
    "single" = "blue", "double" = "green",
    "triple" = "orange", "home_run" = "red",
    "field_out" = "gray"
  )

  ggplot(batted_balls, aes(x = hc_x - 125, y = 200 - hc_y)) +
    geom_point(aes(color = events), alpha = 0.6, size = 2) +
    scale_color_manual(values = event_colors) +
    coord_fixed() +
    labs(title = paste(player_name, "Spray Chart"),
         x = "Horizontal Position", y = "Distance") +
    theme_minimal()
}
#' Create pitch movement plot
#'
#' Scatter of horizontal vs vertical movement (`pfx_x`/`pfx_z` multiplied
#' by 12, labelled as inches) coloured by pitch type, with dashed zero
#' lines. The vertical guide used the non-existent `linestyle` parameter,
#' which ggplot2 ignores; it is now `linetype`, so both guides are dashed.
#'
#' @param pitches Statcast rows with `pfx_x`, `pfx_z` and `pitch_type`.
#' @return A ggplot object.
plot_pitch_movement <- function(pitches) {
  ggplot(pitches, aes(x = pfx_x * 12, y = pfx_z * 12, color = pitch_type)) +
    geom_point(alpha = 0.5, size = 2) +
    geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
    geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
    labs(title = "Pitch Movement Profile",
         x = "Horizontal Movement (inches)",
         y = "Vertical Movement (inches)") +
    theme_minimal() +
    coord_fixed()
}
# Example: WAR vs Salary analysis
# salary_data <- chadwick_player_lu() %>%
# left_join(batting_2024, by = c("name_last", "name_first"))
# ggplot(salary_data, aes(x = WAR, y = salary/1e6)) +
# geom_point() + geom_smooth(method = "lm") +
# labs(x = "WAR", y = "Salary (Millions)")
Hockey Analysis with hockeyR
NHL data analysis using hockeyR and related packages.
# NHL Hockey Analysis in R
library(hockeyR) # Or fastRhockey
library(dplyr)
library(ggplot2)
library(tidyr)
# =====================
# Data Loading
# =====================
#' Load NHL play-by-play data
#'
#' Thin wrapper that forwards `season` unchanged to `load_pbp()` and returns
#' whatever that call returns.
#'
#' @param season Season in YYYYYYYY format (e.g., 20232024)
#'   NOTE(review): the accepted season format is decided by `load_pbp()`, which
#'   is not defined in this file -- confirm against the installed package.
load_nhl_pbp <- function(season) {
load_pbp(season)
}
#' Load player stats
#'
#' Thin wrapper that forwards `season` unchanged to `get_skater_stats()`.
#'
#' @param season Season identifier passed straight through.
#'   NOTE(review): `get_skater_stats()` is not defined in this file; confirm it
#'   exists in the attached packages (or supply it) before relying on this.
load_player_stats <- function(season) {
# Using NHL API or other sources
get_skater_stats(season)
}
# =====================
# Shot Analysis
# =====================
#' Per-player shot attempt summary
#'
#' Keeps SHOT / GOAL / MISS / BLOCK events and, for every combination of
#' `event_player_1_name` and `event_team`, counts each attempt type and the
#' shooting percentage (goals divided by shots plus goals, as a fraction).
calc_shot_metrics <- function(pbp) {
  attempts <- filter(pbp, event_type %in% c("SHOT", "GOAL", "MISS", "BLOCK"))
  by_shooter <- group_by(attempts, event_player_1_name, event_team)

  summarise(
    by_shooter,
    Shots = sum(event_type == "SHOT"),
    Goals = sum(event_type == "GOAL"),
    Missed = sum(event_type == "MISS"),
    Blocked = sum(event_type == "BLOCK"),
    # Goals are not included in `Shots`, so shots on goal = Shots + Goals
    Sh_Pct = Goals / (Shots + Goals),
    .groups = "drop"
  )
}
#' Calculate expected goals (simplified model)
#'
#' Adds three columns to `shots`:
#'   * `distance` - straight-line distance from the shot to the net, taken to
#'     be at (89, 0) in the `x_fixed` / `y_fixed` coordinate system;
#'   * `angle`    - angle in degrees off the centre line, measured at the net;
#'   * `xG`       - a distance-bucket base value scaled by an angle adjustment.
#'
#' @param shots Data frame with `x_fixed`, `y_fixed`, `event_type` and a
#'   logical `penalty_shot` column.
#' @return `shots` with `distance`, `angle` and `xG` appended.
calc_xg <- function(shots) {
  # Distance-based xG (simplified)
  shots %>%
    mutate(
      # Measure from the net (x = 89), the same reference point the angle
      # uses. Measuring from centre ice (0, 0) would put shots taken right in
      # front of the net into the lowest-probability bucket.
      distance = sqrt((89 - x_fixed)^2 + y_fixed^2),
      angle = atan2(abs(y_fixed), 89 - x_fixed) * 180 / pi,
      xG = case_when(
        # NOTE(review): this branch only fires for penalty shots that scored,
        # i.e. it conditions on the outcome -- confirm this is intended.
        event_type == "GOAL" & penalty_shot ~ 0.33, # Penalty shot
        distance < 10 ~ 0.20,
        distance < 20 ~ 0.12,
        distance < 30 ~ 0.06,
        distance < 40 ~ 0.03,
        TRUE ~ 0.02
      ) * (1 + 0.01 * pmax(0, 45 - angle)) # Angle adjustment
    )
}
# =====================
# Corsi/Fenwick
# =====================
#' Team-level Corsi table
#'
#' Counts shot attempts (SHOT / GOAL / MISS / BLOCK) for and against each
#' team, then adds the differential and the Corsi-for percentage. Teams are
#' returned in descending order of `Corsi_Pct`.
calc_team_corsi <- function(pbp) {
  attempts <- filter(pbp, event_type %in% c("SHOT", "GOAL", "MISS", "BLOCK"))

  # Attempts taken by each team
  corsi_for <- attempts %>%
    group_by(event_team) %>%
    summarise(CF = n())

  # Attempts conceded: credit each attempt to the opponent of the shooting
  # team and label that opponent as `event_team` so the two tables line up
  corsi_against <- attempts %>%
    group_by(event_team = ifelse(event_team == home_team, away_team, home_team)) %>%
    summarise(CA = n())

  combined <- left_join(corsi_for, corsi_against, by = "event_team")
  combined <- mutate(
    combined,
    Corsi_Diff = CF - CA,
    Corsi_Pct = CF / (CF + CA) * 100
  )
  arrange(combined, desc(Corsi_Pct))
}
#' Individual Corsi-for per player (events only)
#'
#' Counts the shot attempts (SHOT / GOAL / MISS / BLOCK) credited to each
#' `event_player_1_name`. This is individual Corsi (iCF); true on-ice Corsi
#' needs on-ice player tracking, which this simplified version does not use.
#'
#' @param pbp Play-by-play data frame.
#' @param min_toi Accepted for interface compatibility; not used here.
calc_player_corsi <- function(pbp, min_toi = 200) {
  attempts <- filter(pbp, event_type %in% c("SHOT", "GOAL", "MISS", "BLOCK"))
  # One row per player with the number of attempts as `iCF`
  count(attempts, event_player_1_name, name = "iCF")
}
# =====================
# Game State Analysis
# =====================
#' Shooting summary split by strength state
#'
#' Keeps SHOT and GOAL events, maps `strength_code` (EV / PP / SH, anything
#' else becomes "Other") to a readable label, and reports shots, goals and
#' shooting percentage for every team / strength-state pair.
analyze_by_strength <- function(pbp) {
  on_net <- filter(pbp, event_type %in% c("SHOT", "GOAL"))

  labelled <- mutate(
    on_net,
    strength_state = case_when(
      strength_code == "EV" ~ "Even Strength",
      strength_code == "PP" ~ "Power Play",
      strength_code == "SH" ~ "Shorthanded",
      TRUE ~ "Other"
    )
  )

  summarise(
    group_by(labelled, event_team, strength_state),
    Shots = sum(event_type == "SHOT"),
    Goals = sum(event_type == "GOAL"),
    Sh_Pct = Goals / (Shots + Goals) * 100,
    .groups = "drop"
  )
}
#' Special-teams shot and goal totals
#'
#' Uses events with `strength_code == "PP"` twice: grouped by the acting team
#' for power-play output, and grouped by that team's opponent for what was
#' conceded on the penalty kill. The two tables are joined on the team.
calc_special_teams <- function(pbp) {
  pp_events <- filter(pbp, strength_code == "PP")

  # What each team produced on its own power play
  pp_data <- summarise(
    group_by(pp_events, event_team),
    PP_Goals = sum(event_type == "GOAL"),
    PP_Shots = sum(event_type %in% c("SHOT", "GOAL")),
    .groups = "drop"
  )

  # What each team allowed while the opponent was on the power play
  pk_data <- summarise(
    group_by(pp_events, defending_team = ifelse(event_team == home_team, away_team, home_team)),
    GA_on_PK = sum(event_type == "GOAL"),
    SA_on_PK = sum(event_type %in% c("SHOT", "GOAL")),
    .groups = "drop"
  )

  left_join(pp_data, pk_data, by = c("event_team" = "defending_team"))
}
# =====================
# Visualization
# =====================
#' Shot location scatter on rink coordinates
#'
#' Plots SHOT and GOAL events at (`x_fixed`, `y_fixed`), coloured by event
#' type, with goals mapped to a larger size value. When `team` is given, only
#' events whose `event_team` equals it are drawn.
plot_shots <- function(shots, team = NULL) {
  plot_data <- filter(shots, event_type %in% c("SHOT", "GOAL"))
  if (!is.null(team)) {
    plot_data <- filter(plot_data, event_team == team)
  }

  event_colors <- c("SHOT" = "blue", "GOAL" = "red")

  rink <- ggplot(plot_data, aes(x = x_fixed, y = y_fixed))
  # Add rink markings here
  rink <- rink + geom_point(
    aes(color = event_type, size = ifelse(event_type == "GOAL", 3, 1)),
    alpha = 0.6
  )
  rink <- rink + scale_color_manual(values = event_colors)
  rink <- rink + coord_fixed(xlim = c(-100, 100), ylim = c(-42.5, 42.5))
  rink <- rink + labs(title = "Shot Location Plot", x = "", y = "")
  rink + theme_minimal() + theme(legend.position = "bottom")
}
#' Create win probability chart
#'
#' Filters `pbp` to one game, orders its events by `period` and `time`, and
#' plots a simple home-team win probability derived from the score
#' differential and the time remaining.
#'
#' @param pbp Play-by-play data frame with `game_id`, `period`, `time`,
#'   `home_score` and `away_score` columns.
#' @param game_id Identifier of the game to plot.
#' @return A ggplot object.
plot_win_probability <- function(pbp, game_id) {
  game_data <- pbp %>%
    # `!!` forces the function argument, not the column of the same name
    filter(game_id == !!game_id) %>%
    arrange(period, time)
  # Calculate simple win probability based on score and time
  game_data <- game_data %>%
    mutate(
      score_diff = home_score - away_score,
      # NOTE(review): treats `time` as seconds and assumes three 20-minute
      # periods; the value goes negative for period > 3 -- confirm handling
      # of overtime with the data source.
      time_remaining = (3 - period) * 20 + time / 60,
      home_wp = pnorm(score_diff, sd = sqrt(time_remaining / 10)),
      # Compute the play index here: `row_number()` with no argument only
      # works inside a dplyr verb, so it cannot be called from `aes()`.
      play_number = row_number()
    )
  ggplot(game_data, aes(x = play_number, y = home_wp)) +
    geom_line(color = "blue", size = 1) +
    geom_hline(yintercept = 0.5, linetype = "dashed", alpha = 0.5) +
    scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
    labs(
      title = "Win Probability",
      x = "Play Number",
      y = "Home Team Win Probability"
    ) +
    theme_minimal()
}
print("NHL hockey analysis functions loaded")
Strokes Gained Analysis
Calculate strokes gained statistics for golf performance analysis.
"""Golf Strokes Gained Analysis."""
import pandas as pd
import numpy as np
class StrokesGained:
    """
    Calculate Strokes Gained statistics.

    Strokes Gained measures performance relative to a baseline of expected
    strokes to hole out from each position. Baselines are keyed by distance:
    yards for tee/fairway/rough/sand, feet for the green.
    """

    # Baseline expected strokes to hole out from various distances
    # Based on PGA Tour averages
    BASELINE_TEE = {
        # Distance in yards: Expected strokes
        100: 2.92, 125: 2.99, 150: 3.05, 175: 3.12, 200: 3.18,
        225: 3.25, 250: 3.33, 275: 3.42, 300: 3.51, 325: 3.61,
        350: 3.71, 375: 3.82, 400: 3.94, 425: 4.06, 450: 4.18,
        475: 4.31, 500: 4.44, 525: 4.58, 550: 4.73
    }
    BASELINE_FAIRWAY = {
        # Distance in yards: Expected strokes
        25: 2.40, 50: 2.65, 75: 2.77, 100: 2.87, 125: 2.96,
        150: 3.04, 175: 3.12, 200: 3.21, 225: 3.31, 250: 3.42,
        275: 3.55, 300: 3.70
    }
    BASELINE_ROUGH = {
        # Typically 0.1-0.2 strokes worse than fairway
        25: 2.50, 50: 2.75, 75: 2.90, 100: 3.00, 125: 3.10,
        150: 3.20, 175: 3.30, 200: 3.42, 225: 3.55, 250: 3.70
    }
    BASELINE_SAND = {
        # Greenside bunkers
        5: 2.40, 10: 2.50, 15: 2.60, 20: 2.70, 25: 2.80,
        30: 2.90, 40: 3.10, 50: 3.30
    }
    BASELINE_GREEN = {
        # Distance in feet: Expected putts
        2: 1.01, 3: 1.05, 4: 1.12, 5: 1.18, 6: 1.25,
        7: 1.32, 8: 1.39, 9: 1.46, 10: 1.53, 12: 1.61,
        15: 1.70, 20: 1.80, 25: 1.87, 30: 1.93, 35: 1.98,
        40: 2.02, 45: 2.06, 50: 2.09, 60: 2.15, 70: 2.20,
        80: 2.24, 90: 2.27, 100: 2.30
    }

    # Non-tee, non-green shots starting within this many yards of the hole
    # are classified as "around the green"; longer ones are approach shots.
    AROUND_GREEN_YARDS = 30

    @classmethod
    def _interpolate_baseline(cls, distance: float, baseline: dict) -> float:
        """
        Linearly interpolate expected strokes for ``distance``.

        Distances below the smallest or above the largest tabulated key are
        clamped to the nearest tabulated value. A NaN distance yields NaN.
        """
        knots = sorted(baseline)
        values = [baseline[k] for k in knots]
        # np.interp interpolates between knots and clamps outside the range
        return float(np.interp(distance, knots, values))

    @classmethod
    def expected_strokes(cls, distance: float, lie: str) -> float:
        """
        Get expected strokes from a position.

        Args:
            distance: Distance to hole (yards or feet for putting)
            lie: "tee", "fairway", "rough", "sand", "green" (case-insensitive);
                any other value falls back to the fairway baseline.

        Returns:
            Interpolated expected strokes to hole out.
        """
        baselines = {
            "tee": cls.BASELINE_TEE,
            "fairway": cls.BASELINE_FAIRWAY,
            "rough": cls.BASELINE_ROUGH,
            "sand": cls.BASELINE_SAND,
            "green": cls.BASELINE_GREEN
        }
        baseline = baselines.get(lie.lower(), cls.BASELINE_FAIRWAY)
        return cls._interpolate_baseline(distance, baseline)

    @classmethod
    def strokes_gained_shot(
        cls,
        start_distance: float,
        start_lie: str,
        end_distance: float,
        end_lie: str,
        strokes: int = 1
    ) -> float:
        """
        Calculate strokes gained for a single shot.

        SG = Expected_before - Expected_after - strokes_taken

        A shot ending with ``end_distance == 0`` and ``end_lie == "hole"`` is
        holed, so nothing is expected afterwards.
        """
        exp_before = cls.expected_strokes(start_distance, start_lie)
        if end_distance == 0 and end_lie == "hole":
            exp_after = 0
        else:
            exp_after = cls.expected_strokes(end_distance, end_lie)
        return exp_before - exp_after - strokes

    @classmethod
    def analyze_round(cls, shots: pd.DataFrame, par: int = 72) -> dict:
        """
        Analyze a complete round.

        Every shot is assigned to exactly one category by where it STARTS, so
        the four category totals always add up to ``SG_Total``:

        * off the tee   - start lie is "tee"
        * putting       - start lie is "green"
        * around green  - any other lie, starting within
          ``AROUND_GREEN_YARDS`` of the hole
        * approach      - any other lie, starting farther out

        Args:
            shots: One row per shot with columns hole, shot_num,
                start_distance, start_lie, end_distance, end_lie.
            par: Par for the round, used for ``Score_vs_Par`` (default 72).

        Returns:
            Dict with SG_Total, the four category totals, Total_Shots and
            Score_vs_Par. The input frame is not modified.
        """
        shots = shots.copy()
        # Build the column from plain Python values; row-wise DataFrame.apply
        # returns a DataFrame (not a Series) when the frame is empty.
        sg_values = [
            cls.strokes_gained_shot(start_d, start_l, end_d, end_l)
            for start_d, start_l, end_d, end_l in zip(
                shots["start_distance"], shots["start_lie"],
                shots["end_distance"], shots["end_lie"]
            )
        ]
        shots["sg"] = pd.Series(sg_values, index=shots.index, dtype=float)

        # Lower-case to match the case-insensitive lookup in expected_strokes
        start_lie = shots["start_lie"].astype(str).str.lower()
        from_tee = start_lie == "tee"
        on_green = start_lie == "green"
        through_green = ~(from_tee | on_green)
        near_green = through_green & (
            shots["start_distance"] <= cls.AROUND_GREEN_YARDS
        )
        long_game = through_green & ~near_green

        sg = shots["sg"]
        return {
            "SG_Total": float(sg.sum()),
            "SG_Off_Tee": float(sg[from_tee].sum()),
            "SG_Approach": float(sg[long_game].sum()),
            "SG_Around_Green": float(sg[near_green].sum()),
            "SG_Putting": float(sg[on_green].sum()),
            "Total_Shots": len(shots),
            "Score_vs_Par": len(shots) - par
        }
def simulate_round() -> pd.DataFrame:
    """
    Simulate 18 holes of golf for demonstration purposes.

    Every hole is played as a 450-yard hole: a drive, one shot onto the green,
    then putts until the ball is holed. The global NumPy random state is
    seeded with 42, so the output is the same on every call.

    Returns:
        DataFrame with one row per shot and columns hole, shot_num,
        start_distance, start_lie, end_distance, end_lie.
    """
    np.random.seed(42)
    hole_length = 450  # yards
    records = []

    def record(hole, shot_num, start_distance, start_lie, end_distance, end_lie):
        records.append({
            "hole": hole, "shot_num": shot_num,
            "start_distance": start_distance, "start_lie": start_lie,
            "end_distance": end_distance, "end_lie": end_lie
        })

    for hole in range(1, 19):
        # Tee shot: the drive lands in the fairway or the rough
        drive_dist = np.random.normal(280, 25)
        fairway_hit = np.random.random() > 0.35
        landing_lie = "fairway" if fairway_hit else "rough"
        remaining = hole_length - drive_dist
        record(hole, 1, hole_length, "tee", remaining, landing_lie)

        # Second shot: always recorded as finishing on the green, closer when
        # the green-in-regulation draw succeeds
        gir = np.random.random() > 0.3
        if gir:
            putt_dist = np.random.exponential(20)
        else:
            putt_dist = 50 + np.random.exponential(10)
        record(hole, 2, remaining, landing_lie, putt_dist, "green")

        # Putts: anything inside 3 ft is holed; inside 10 ft is a coin flip
        current_dist = putt_dist
        putt_num = 3
        while current_dist > 0:
            if current_dist < 3 or (current_dist < 10 and np.random.random() > 0.5):
                next_dist, next_lie = 0, "hole"
            else:
                # A miss leaves a shorter putt, never less than 2 ft
                next_dist = max(2, current_dist * 0.2 * np.random.random())
                next_lie = "green"
            record(hole, putt_num, current_dist, "green", next_dist, next_lie)
            current_dist = next_dist
            putt_num += 1

    return pd.DataFrame(records)
# Demo: simulate one round, print its strokes-gained summary, then a single shot
if __name__ == "__main__":
    demo_round = simulate_round()
    summary = StrokesGained().analyze_round(demo_round)

    print("Round Analysis:")
    for label, value in summary.items():
        # Floats are shown with two decimals, everything else verbatim
        if isinstance(value, float):
            print(f" {label}: {value:.2f}")
        else:
            print(f" {label}: {value}")

    # A 175-yard fairway approach that finishes 10 feet from the hole
    approach_sg = StrokesGained().strokes_gained_shot(
        start_distance=175, start_lie="fairway",
        end_distance=10, end_lie="green"
    )
    print(f"\nSample approach shot (175 yards to 10 feet): {approach_sg:.3f} SG")
Player Efficiency Rating (PER) Calculator
Calculate NBA Player Efficiency Rating from box score statistics.
"""Calculate Player Efficiency Rating (PER)."""
import pandas as pd
import numpy as np
def calculate_per(
    player_stats: pd.DataFrame,
    league_stats: dict = None
) -> pd.Series:
    """
    Calculate Player Efficiency Rating.

    PER formula by John Hollinger - summarizes player productivity
    in a single number. The result is scaled so that the mean of the
    players in ``player_stats`` is 15.

    Args:
        player_stats: DataFrame with per-player columns MP, 3P, AST, FG, FT,
            TOV, FGA, FTA, TRB, ORB, STL, BLK, PF and the team columns
            TM_AST, TM_FG, TM_PACE.
        league_stats: Optional dict of league averages with keys lg_AST,
            lg_FG, lg_FT, lg_PTS, lg_FGA, lg_FTA, lg_TRB, lg_ORB, lg_TOV,
            lg_PF and lg_PACE. Any key that is missing falls back to the
            built-in default. The dict is not modified.

    Returns:
        Series of PER values aligned to ``player_stats``' index.
    """
    defaults = {
        "lg_AST": 24.0,    # League assists per game
        "lg_FG": 41.0,     # League FG per game
        "lg_FT": 17.0,     # League FT made per game
        "lg_PTS": 110.0,   # League points per game
        "lg_FGA": 88.0,    # League FGA per game
        "lg_FTA": 22.0,    # League FTA per game
        "lg_TRB": 44.0,    # League rebounds per game
        "lg_ORB": 10.0,    # League offensive rebounds
        "lg_TOV": 14.0,    # League turnovers
        "lg_PF": 20.0,     # League personal fouls (required by the PF term)
        "lg_PACE": 100.0,  # League pace
    }
    # Caller-supplied values win; missing keys fall back to the defaults
    lg = {**defaults, **(league_stats or {})}

    # League-derived constants. Value of possession and defensive rebound
    # percentage are computed from the league totals rather than read from
    # the dict.
    factor = (2/3) - (0.5 * (lg["lg_AST"] / lg["lg_FG"])) / \
        (2 * (lg["lg_FG"] / lg["lg_FT"]))
    vop = lg["lg_PTS"] / (
        lg["lg_FGA"] - lg["lg_ORB"] + lg["lg_TOV"] + 0.44 * lg["lg_FTA"]
    )
    drbp = (lg["lg_TRB"] - lg["lg_ORB"]) / lg["lg_TRB"]

    df = player_stats
    team_ast_rate = df["TM_AST"] / df["TM_FG"]

    # uPER (unadjusted PER)
    uper = (1 / df["MP"]) * (
        df["3P"] +
        (2/3) * df["AST"] +
        (2 - factor * team_ast_rate) * df["FG"] +
        (df["FT"] * 0.5 * (1 + (1 - team_ast_rate) + (2/3) * team_ast_rate)) -
        vop * df["TOV"] -
        vop * drbp * (df["FGA"] - df["FG"]) -
        vop * 0.44 * (0.44 + (0.56 * drbp)) * (df["FTA"] - df["FT"]) +
        vop * (1 - drbp) * (df["TRB"] - df["ORB"]) +
        vop * drbp * df["ORB"] +
        vop * df["STL"] +
        vop * drbp * df["BLK"] -
        df["PF"] * ((lg["lg_FT"] / lg["lg_PF"]) -
                    0.44 * (lg["lg_FTA"] / lg["lg_PF"]) * vop)
    )

    # Pace adjustment, then scale so the pace-adjusted mean is exactly 15.
    # Normalising by the unadjusted mean would leave the average off 15
    # whenever team paces differ from league pace.
    aper = uper * (lg["lg_PACE"] / df["TM_PACE"])
    return aper * (15 / aper.mean())
def per_components(player_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Split a player's box-score production into per-48-minute components.

    Returns one row per player with the columns Player, Scoring, Rebounding,
    Assists, Steals, Blocks, Turnovers, Fouls and Total, where Total is the
    sum of the other numeric columns. Turnovers and Fouls are negative.
    """
    stats = player_stats.copy()
    minutes = stats["MP"]

    def per_48(values):
        # Scale a season total to a per-48-minute rate
        return values / minutes * 48

    shot_cost = stats["FGA"] * 0.44
    free_throw_cost = stats["FTA"] * 0.44

    breakdown = pd.DataFrame({
        "Player": stats["PLAYER_NAME"],
        "Scoring": per_48(stats["PTS"] - shot_cost - free_throw_cost),
        "Rebounding": per_48(stats["TRB"]),
        "Assists": per_48(stats["AST"]) * 0.67,
        "Steals": per_48(stats["STL"]),
        "Blocks": per_48(stats["BLK"]),
        "Turnovers": per_48(-stats["TOV"]),
        "Fouls": per_48(-stats["PF"]) * 0.2
    })
    numeric_part = breakdown.drop("Player", axis=1)
    breakdown["Total"] = numeric_part.sum(axis=1)
    return breakdown
def calculate_ts_pct(pts: int, fga: int, fta: int) -> float:
    """
    Return True Shooting Percentage as a fraction.

    Points divided by twice the true shooting attempts, where each free
    throw attempt counts as 0.44 of an attempt.
    """
    true_shooting_attempts = fga + 0.44 * fta
    return pts / (2 * true_shooting_attempts)
def calculate_efg_pct(fg: int, fg3: int, fga: int) -> float:
    """
    Return Effective Field Goal Percentage as a fraction.

    Made field goals with each made three counted as an extra half make,
    divided by field goal attempts.
    """
    weighted_makes = fg + 0.5 * fg3
    return weighted_makes / fga
def calculate_usg_pct(fga: int, fta: int, tov: int, mp: float,
                      tm_fga: int, tm_fta: int, tm_tov: int, tm_mp: float) -> float:
    """
    Return Usage Percentage on a 0-100 scale.

    The player's possessions used (field goal attempts, 0.44 per free throw
    attempt, turnovers) relative to the team's, scaled by the player's share
    of the team minutes (team minutes are split across five players).
    """
    player_plays = fga + 0.44 * fta + tov
    team_plays = tm_fga + 0.44 * tm_fta + tm_tov
    minutes_per_slot = tm_mp / 5
    return 100 * (player_plays * minutes_per_slot) / (mp * team_plays)
# Demo: shooting-efficiency columns for three sample players
if __name__ == "__main__":
    # Season totals; the TM_* columns are the team stats PER needs
    season_totals = {
        "PLAYER_NAME": ["Player A", "Player B", "Player C"],
        "GP": [72, 65, 70],
        "MP": [2400, 2000, 1800],
        "PTS": [1800, 1200, 900],
        "FG": [650, 450, 350],
        "FGA": [1400, 950, 750],
        "3P": [150, 100, 80],
        "FT": [350, 200, 120],
        "FTA": [420, 250, 150],
        "ORB": [60, 180, 50],
        "TRB": [360, 650, 200],
        "AST": [500, 200, 400],
        "STL": [90, 50, 80],
        "BLK": [40, 120, 20],
        "TOV": [200, 150, 120],
        "PF": [150, 180, 120],
        "TM_AST": [1800, 1800, 1800],
        "TM_FG": [3000, 3000, 3000],
        "TM_PACE": [100, 100, 100]
    }
    players = pd.DataFrame(season_totals)

    # One value per player, computed from that player's totals
    players["TS%"] = [
        calculate_ts_pct(pts, fga, fta)
        for pts, fga, fta in zip(players["PTS"], players["FGA"], players["FTA"])
    ]
    players["eFG%"] = [
        calculate_efg_pct(fg, fg3, fga)
        for fg, fg3, fga in zip(players["FG"], players["3P"], players["FGA"])
    ]

    report_columns = ["PLAYER_NAME", "PTS", "TRB", "AST", "TS%", "eFG%"]
    print("Player Efficiency Stats:")
    print(players[report_columns].round(3))
Tennis Match Statistics
Calculate comprehensive tennis match statistics including serve and return metrics.
"""Tennis Match Statistics Calculator."""
import pandas as pd
import numpy as np
class TennisStats:
    """
    Calculate tennis match and career statistics.

    All methods are static. Percentages are on a 0-100 scale and are reported
    as 0 when their denominator is not positive.
    """

    @staticmethod
    def _pct(numerator, denominator):
        """Return numerator / denominator * 100, or 0 if denominator <= 0."""
        return numerator / denominator * 100 if denominator > 0 else 0

    @staticmethod
    def serve_stats(
        aces: int, double_faults: int,
        first_serves_in: int, first_serves_total: int,
        first_serve_points_won: int, first_serve_points: int,
        second_serve_points_won: int, second_serve_points: int
    ) -> dict:
        """
        Calculate serve statistics.

        Returns a dict with the raw ace / double-fault counts and the first
        serve, first-serve-won, second-serve-won and overall service-points-won
        percentages. ``Service_Games_Won_Pct`` is always None because service
        game counts are not among the inputs.
        """
        pct = TennisStats._pct
        total_serve_points = first_serve_points + second_serve_points
        total_serve_points_won = first_serve_points_won + second_serve_points_won
        return {
            "Aces": aces,
            "Double_Faults": double_faults,
            "First_Serve_Pct": pct(first_serves_in, first_serves_total),
            "First_Serve_Won_Pct": pct(first_serve_points_won, first_serve_points),
            "Second_Serve_Won_Pct": pct(second_serve_points_won, second_serve_points),
            "Service_Points_Won_Pct": pct(total_serve_points_won, total_serve_points),
            "Service_Games_Won_Pct": None  # Need service games data
        }

    @staticmethod
    def return_stats(
        return_points_won: int, return_points: int,
        first_return_won: int, first_return_points: int,
        second_return_won: int, second_return_points: int,
        break_points_won: int, break_points: int
    ) -> dict:
        """
        Calculate return statistics.

        Returns a dict with overall, first-serve-return and second-serve-return
        points-won percentages and the break-point conversion percentage.
        ``Break_Points_Faced`` simply echoes the ``break_points`` argument.
        """
        pct = TennisStats._pct
        return {
            "Return_Points_Won_Pct": pct(return_points_won, return_points),
            "First_Return_Won_Pct": pct(first_return_won, first_return_points),
            "Second_Return_Won_Pct": pct(second_return_won, second_return_points),
            "Break_Points_Won_Pct": pct(break_points_won, break_points),
            # NOTE(review): the same count is the denominator of the break
            # points WON percentage above, so "faced" may be a misnomer; the
            # key is kept because callers may read it.
            "Break_Points_Faced": break_points
        }

    @staticmethod
    def dominance_ratio(serve_won_pct: float, return_won_pct: float) -> float:
        """
        Calculate dominance ratio.

        Computed as (serve points won % + return points won %) / 100, with
        both inputs on a 0-100 scale. Values > 1 indicate dominance.
        """
        return (serve_won_pct + return_won_pct) / 100

    @staticmethod
    def efficiency_stats(
        winners: int, unforced_errors: int,
        forced_errors: int, total_points: int
    ) -> dict:
        """
        Calculate efficiency and aggression metrics.

        ``W_UE_Ratio`` is infinity when there are no unforced errors. The
        three percentages are relative to ``total_points`` and are 0 when
        ``total_points`` is not positive.
        """
        pct = TennisStats._pct
        return {
            "Winners": winners,
            "Unforced_Errors": unforced_errors,
            "W_UE_Ratio": winners / unforced_errors if unforced_errors > 0 else float("inf"),
            "Winner_Pct": pct(winners, total_points),
            "Error_Pct": pct(unforced_errors, total_points),
            "Aggression_Index": pct(winners + forced_errors, total_points)
        }

    @staticmethod
    def tiebreak_stats(tiebreaks_won: int, tiebreaks_played: int) -> dict:
        """Calculate tiebreak wins, losses and win percentage."""
        return {
            "Tiebreaks_Won": tiebreaks_won,
            "Tiebreaks_Lost": tiebreaks_played - tiebreaks_won,
            "Tiebreak_Win_Pct": TennisStats._pct(tiebreaks_won, tiebreaks_played)
        }

    @staticmethod
    def surface_performance(matches_df: pd.DataFrame, surface: str) -> dict:
        """
        Calculate performance on a specific surface.

        Args:
            matches_df: DataFrame with a ``surface`` column and a ``won``
                column that sums to the number of wins.
            surface: Surface value to filter on.

        Returns:
            Dict with Surface, Matches, Wins, Losses and Win_Pct. The same
            keys are returned when no match was played on the surface, with
            all counts set to 0.
        """
        surface_matches = matches_df[matches_df["surface"] == surface]
        total = len(surface_matches)
        if total == 0:
            # Same keys as the populated case so callers can index uniformly
            return {
                "Surface": surface,
                "Matches": 0,
                "Wins": 0,
                "Losses": 0,
                "Win_Pct": 0
            }
        wins = surface_matches["won"].sum()
        return {
            "Surface": surface,
            "Matches": total,
            "Wins": wins,
            "Losses": total - wins,
            "Win_Pct": wins / total * 100
        }
def calculate_match_stats(match_data: dict) -> pd.DataFrame:
    """
    Build a one-row-per-player table of match statistics.

    Args:
        match_data: Mapping of player name to a dict of raw match counts
            (aces, double_faults, first_serve_*, second_serve_*, return_*,
            first_return_*, second_return_*, break_points*, winners,
            unforced_errors, total_points and optionally forced_errors,
            which defaults to 0).

    Returns:
        DataFrame whose columns are Player, every serve, return and
        efficiency statistic, and Dominance_Ratio.
    """
    calc = TennisStats()

    def player_row(player, raw):
        serve_part = calc.serve_stats(
            raw["aces"], raw["double_faults"],
            raw["first_serve_in"], raw["first_serve_total"],
            raw["first_serve_won"], raw["first_serve_points"],
            raw["second_serve_won"], raw["second_serve_points"]
        )
        return_part = calc.return_stats(
            raw["return_won"], raw["return_points"],
            raw["first_return_won"], raw["first_return_points"],
            raw["second_return_won"], raw["second_return_points"],
            raw["break_points_won"], raw["break_points"]
        )
        efficiency_part = calc.efficiency_stats(
            raw["winners"], raw["unforced_errors"],
            raw.get("forced_errors", 0), raw["total_points"]
        )
        dominance = calc.dominance_ratio(
            serve_part["Service_Points_Won_Pct"],
            return_part["Return_Points_Won_Pct"]
        )

        # Later groups overwrite earlier ones on key clashes, in this order
        row = {"Player": player}
        row.update(serve_part)
        row.update(return_part)
        row.update(efficiency_part)
        row["Dominance_Ratio"] = dominance
        return row

    return pd.DataFrame([player_row(name, raw) for name, raw in match_data.items()])
def expected_games_won(serve_hold_pct: float, return_break_pct: float,
                       total_games: int) -> float:
    """
    Estimate games won from hold and break rates.

    Assumes half of ``total_games`` are played on serve and half on return;
    both rates are fractions (not percentages).
    """
    games_per_side = total_games / 2
    held = games_per_side * serve_hold_pct
    broken = games_per_side * return_break_pct
    return held + broken
# Demo: headline statistics for a two-player sample match
if __name__ == "__main__":
    player_a_counts = {
        "aces": 12, "double_faults": 3,
        "first_serve_in": 48, "first_serve_total": 65,
        "first_serve_won": 38, "first_serve_points": 48,
        "second_serve_won": 10, "second_serve_points": 17,
        "return_won": 32, "return_points": 75,
        "first_return_won": 18, "first_return_points": 50,
        "second_return_won": 14, "second_return_points": 25,
        "break_points_won": 4, "break_points": 8,
        "winners": 35, "unforced_errors": 22,
        "total_points": 140
    }
    player_b_counts = {
        "aces": 8, "double_faults": 5,
        "first_serve_in": 50, "first_serve_total": 75,
        "first_serve_won": 35, "first_serve_points": 50,
        "second_serve_won": 8, "second_serve_points": 25,
        "return_won": 17, "return_points": 65,
        "first_return_won": 10, "first_return_points": 48,
        "second_return_won": 7, "second_return_points": 17,
        "break_points_won": 2, "break_points": 6,
        "winners": 28, "unforced_errors": 30,
        "total_points": 140
    }
    match_data = {"Player A": player_a_counts, "Player B": player_b_counts}

    results = calculate_match_stats(match_data)

    # Only the headline columns are printed, rounded to one decimal
    cols = ["Player", "Aces", "First_Serve_Pct", "Service_Points_Won_Pct",
            "Return_Points_Won_Pct", "W_UE_Ratio", "Dominance_Ratio"]
    headline = results[cols].round(1)
    print("Match Statistics:")
    print(headline.to_string(index=False))
Statcast Pitch Analysis
Analyze pitch characteristics using Statcast data including velocity, spin, and movement.
"""Statcast pitch analysis and classification."""
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
class PitchAnalyzer:
    """Analyze and classify pitches using Statcast data."""

    def __init__(self):
        # Reused (and refitted) by classify_pitches on every call
        self.scaler = StandardScaler()

    def analyze_repertoire(self, pitches: pd.DataFrame, pitcher_name: str = None) -> pd.DataFrame:
        """
        Analyze a pitcher's pitch repertoire.

        Expected columns: pitch_type, release_speed, release_spin_rate,
        pfx_x (horizontal movement), pfx_z (vertical movement), plus
        player_name when ``pitcher_name`` is given.

        Returns:
            One row per pitch type with Velocity, Velo_STD, Count, Spin_Rate,
            Spin_STD, Horiz_Break, Vert_Break (all rounded to one decimal) and
            Usage%, sorted by usage, most used first.
        """
        if pitcher_name:
            pitches = pitches[pitches["player_name"] == pitcher_name].copy()

        # Group by pitch type
        # NOTE(review): pfx_x / pfx_z are passed through in whatever unit the
        # input uses; nothing here converts feet to inches -- confirm units.
        repertoire = pitches.groupby("pitch_type").agg({
            "release_speed": ["mean", "std", "count"],
            "release_spin_rate": ["mean", "std"],
            "pfx_x": "mean",  # Horizontal break
            "pfx_z": "mean",  # Vertical break
        }).round(1)
        # Flatten the (column, statistic) MultiIndex in aggregation order
        repertoire.columns = [
            "Velocity", "Velo_STD", "Count",
            "Spin_Rate", "Spin_STD",
            "Horiz_Break", "Vert_Break"
        ]

        # Calculate usage percentage
        repertoire["Usage%"] = (repertoire["Count"] / repertoire["Count"].sum() * 100).round(1)
        return repertoire.sort_values("Usage%", ascending=False)

    def pitch_movement_plot(self, pitches: pd.DataFrame, ax=None):
        """
        Create pitch movement plot.

        Scatters pfx_x against pfx_z, one colour per pitch type (black for
        types without an assigned colour), on fixed -25..25 axes. A new
        10x10 figure is created when ``ax`` is not supplied.

        NOTE(review): the axes are labelled in inches but the values are
        plotted unconverted -- confirm the input is already in inches.

        Returns:
            The matplotlib Axes that was drawn on.
        """
        if ax is None:
            fig, ax = plt.subplots(figsize=(10, 10))

        pitch_colors = {
            "FF": "red",     # 4-seam fastball
            "SI": "orange",  # Sinker
            "FC": "purple",  # Cutter
            "SL": "blue",    # Slider
            "CU": "green",   # Curveball
            "CH": "gray",    # Changeup
            "FS": "brown",   # Splitter
        }
        for pitch_type in pitches["pitch_type"].unique():
            subset = pitches[pitches["pitch_type"] == pitch_type]
            color = pitch_colors.get(pitch_type, "black")
            ax.scatter(subset["pfx_x"], subset["pfx_z"],
                       label=pitch_type, alpha=0.5, c=color, s=20)

        ax.axhline(y=0, color="gray", linestyle="--", alpha=0.5)
        ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
        ax.set_xlabel("Horizontal Movement (inches)")
        ax.set_ylabel("Vertical Movement (inches)")
        ax.set_title("Pitch Movement Profile")
        ax.legend()
        ax.set_xlim(-25, 25)
        ax.set_ylim(-25, 25)
        return ax

    def stuff_plus(self, pitches: pd.DataFrame, league_avgs: dict) -> pd.Series:
        """
        Calculate Stuff+ style metric.

        Compares pitch characteristics to league average.
        100 = league average, higher = better.

        This is a simplified version; real Stuff+ uses ML models trained on
        outcomes.

        Args:
            pitches: DataFrame with pitch_type, release_speed and
                release_spin_rate columns.
            league_avgs: Mapping of pitch type to a dict with keys
                velocity, velocity_std, spin and spin_std.

        Returns:
            Series aligned to ``pitches.index``. Pitch types missing from
            ``league_avgs`` score exactly 100.
        """
        results = []
        # Walk the three needed columns directly instead of materialising a
        # Series per row with iterrows()
        for pitch_type, velo, spin in zip(
            pitches["pitch_type"],
            pitches["release_speed"],
            pitches["release_spin_rate"],
        ):
            if pitch_type not in league_avgs:
                results.append(100)
                continue
            la = league_avgs[pitch_type]
            # Compare to league average (simplified)
            velo_diff = (velo - la["velocity"]) / la["velocity_std"]
            spin_diff = (spin - la["spin"]) / la["spin_std"]
            # Weighted combination (higher velo/spin = better for most pitches)
            results.append(100 + (velo_diff * 5) + (spin_diff * 3))
        return pd.Series(results, index=pitches.index)

    def classify_pitches(self, pitches: pd.DataFrame, n_clusters: int = 6) -> pd.Series:
        """
        Classify pitches using clustering.

        Useful for finding misclassified pitches. Rows with a missing value
        in any feature are dropped before clustering, so the result may be
        shorter than ``pitches``.

        Args:
            pitches: DataFrame with release_speed, release_spin_rate, pfx_x
                and pfx_z columns.
            n_clusters: Requested number of clusters; reduced to the number
                of usable rows when fewer rows are available.

        Returns:
            Series named "cluster" indexed like the usable rows; empty when
            no row has all four features.
        """
        features = ["release_speed", "release_spin_rate", "pfx_x", "pfx_z"]
        X = pitches[features].dropna()
        if X.empty:
            # Nothing to scale or cluster
            return pd.Series([], index=X.index, name="cluster", dtype=int)

        # KMeans cannot form more clusters than there are samples
        n_clusters = min(n_clusters, len(X))
        X_scaled = self.scaler.fit_transform(X)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(X_scaled)
        return pd.Series(clusters, index=X.index, name="cluster")
# Demo: repertoire table for 500 randomly generated pitches
if __name__ == "__main__":
    np.random.seed(42)
    n_pitches = 500

    # Draw order matters for reproducibility under the fixed seed
    pitch_types = np.random.choice(["FF", "SL", "CH", "CU"], n_pitches, p=[0.5, 0.25, 0.15, 0.1])
    velocities = np.random.normal(93, 5, n_pitches)
    spin_rates = np.random.normal(2300, 300, n_pitches)
    horizontal = np.random.normal(0, 8, n_pitches)
    vertical = np.random.normal(10, 8, n_pitches)

    pitches = pd.DataFrame({
        "pitch_type": pitch_types,
        "release_speed": velocities,
        "release_spin_rate": spin_rates,
        "pfx_x": horizontal,
        "pfx_z": vertical,
        "player_name": "Sample Pitcher"
    })

    analyzer = PitchAnalyzer()
    repertoire = analyzer.analyze_repertoire(pitches)
    print(repertoire)
Spray Chart Generator
Create baseball spray charts showing batted ball locations and outcomes.
"""Baseball spray chart generator."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Polygon, Arc
def draw_field(ax, field_type="outfield"):
    """
    Draw baseball field outline.

    Home plate is at the origin with centre field along the positive y-axis;
    units are feet. Draws the infield diamond (90 ft base paths), the two
    foul lines and, when ``field_type == "outfield"``, an outfield wall arc
    300 ft from home plate running from foul line to foul line.

    Args:
        ax: Matplotlib Axes to draw on.
        field_type: "outfield" adds the outfield wall arc; any other value
            omits it.

    Returns:
        The same Axes, with equal aspect, fixed limits and ticks removed.
    """
    # Set equal aspect ratio
    ax.set_aspect("equal")

    wall_distance = 300  # feet to outfield wall
    foul_angle = np.pi / 4  # foul lines sit 45 degrees either side of centre

    if field_type == "outfield":
        # Outfield arc: angles are measured from the centre-field axis, so
        # the wall spans exactly foul line to foul line
        theta = np.linspace(-foul_angle, foul_angle, 100)
        ax.plot(wall_distance * np.sin(theta), wall_distance * np.cos(theta),
                "green", linewidth=2)

    # Infield: a square with 90 ft sides standing on one corner (home plate).
    # Its circumradius is half the diagonal, and it is centred that far up
    # the centre-field axis. orientation=0 keeps a vertex pointing up/down.
    # radius/orientation are passed by keyword, as newer matplotlib requires.
    infield_r = 90 / np.sqrt(2)
    ax.add_patch(patches.RegularPolygon(
        (0, infield_r), 4, radius=infield_r, orientation=0,
        fill=False, edgecolor="brown", linewidth=2
    ))

    # Foul lines, ending where they meet the outfield wall
    foul_end = wall_distance * np.sin(foul_angle)
    ax.plot([0, -foul_end], [0, foul_end], "white", linewidth=1)
    ax.plot([0, foul_end], [0, foul_end], "white", linewidth=1)

    # Home plate
    ax.scatter([0], [0], c="white", s=100, marker="^", zorder=5)

    ax.set_xlim(-350, 350)
    ax.set_ylim(-50, 400)
    ax.set_facecolor("darkgreen")
    ax.set_xticks([])
    ax.set_yticks([])
    return ax
def create_spray_chart(
    batted_balls: pd.DataFrame,
    player_name: str = None,
    color_by: str = "hit_outcome",
    ax=None
):
    """
    Create spray chart from batted ball data.

    Expected columns:
    - hc_x, hc_y: hit coordinates, converted here to feet from home plate
      (if absent, hit_x / hit_y are used as-is)
    - hit_outcome or events: single, double, triple, home_run, out, ...
      (``hit_outcome`` is preferred when both exist)
    - launch_speed: exit velocity (for color_by="exit_velocity")
    - launch_angle: launch angle (for color_by="launch_angle")
    - player_name: only needed when ``player_name`` is given

    Args:
        batted_balls: Batted-ball data, one row per ball in play.
        player_name: Optional filter; also used in the title.
        color_by: "hit_outcome", "exit_velocity" or "launch_angle"; any
            other value draws every point in blue.
        ax: Axes to draw on; a new 10x10 figure is created if omitted.

    Returns:
        The matplotlib Axes containing the chart.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 10))
    draw_field(ax)

    if player_name:
        batted_balls = batted_balls[batted_balls["player_name"] == player_name]

    # Convert Statcast coordinates (if necessary)
    if "hc_x" in batted_balls.columns:
        # Home plate is at (125.42, 199.27); y grows downward, hence the flip.
        # 2.5 is the scale factor this file uses to map the values to feet.
        x = (batted_balls["hc_x"] - 125.42) * 2.5
        y = (199.27 - batted_balls["hc_y"]) * 2.5
    else:
        x = batted_balls["hit_x"]
        y = batted_balls["hit_y"]

    # Color mapping
    color_map = {
        "single": "blue",
        "double": "green",
        "triple": "orange",
        "home_run": "red",
        "out": "gray",
        "field_out": "gray",
        "field_error": "yellow"
    }
    if color_by == "hit_outcome":
        # The documented column is hit_outcome, but Statcast-style frames
        # carry the outcome in `events`; accept either.
        outcome_col = "hit_outcome" if "hit_outcome" in batted_balls.columns else "events"
        colors = batted_balls[outcome_col].map(
            lambda outcome: color_map.get(outcome, "gray")
        )
    elif color_by == "exit_velocity":
        colors = batted_balls["launch_speed"]
    elif color_by == "launch_angle":
        colors = batted_balls["launch_angle"]
    else:
        colors = "blue"

    scatter = ax.scatter(x, y, c=colors, alpha=0.6, s=30, edgecolors="black", linewidth=0.5)

    # Add colorbar for continuous variables
    if color_by in ["exit_velocity", "launch_angle"]:
        plt.colorbar(scatter, ax=ax, label=color_by.replace("_", " ").title())

    # Add legend for categorical
    if color_by == "hit_outcome":
        for outcome, color in color_map.items():
            # Empty scatters exist only to create legend entries
            ax.scatter([], [], c=color, label=outcome.replace("_", " ").title())
        ax.legend(loc="upper right")

    title = "Spray Chart"
    if player_name:
        title = f"{player_name} Spray Chart"
    ax.set_title(title, fontsize=14, fontweight="bold")
    return ax
def calculate_pull_tendency(batted_balls: pd.DataFrame, batter_hand: str = "R") -> dict:
    """
    Calculate pull/center/opposite field tendencies.

    Balls are bucketed by horizontal landing position only: more than 40 ft
    to the batter's pull side, more than 40 ft to the opposite side, or in
    between. Rows with a missing ``hc_x`` fall in no bucket and are left out
    of the total.

    Args:
        batted_balls: DataFrame with an ``hc_x`` column.
        batter_hand: "L" for a left-handed batter (case-insensitive); any
            other value is treated as right-handed.

    Returns:
        Dict with Pull%, Center%, Oppo% (rounded to one decimal, 0 when no
        balls were counted) and Total_BIP.
    """
    # Horizontal offset from home plate in feet; negative = left-field side
    x = (batted_balls["hc_x"] - 125.42) * 2.5

    # Orient the axis so that positive always means the pull side. A
    # right-handed batter pulls toward left field (negative raw x), so the
    # axis is flipped for righties and left as-is for lefties.
    if batter_hand.upper() != "L":
        x = -x

    pull = (x > 40).sum()
    center = ((x >= -40) & (x <= 40)).sum()
    oppo = (x < -40).sum()
    total = pull + center + oppo

    return {
        "Pull%": round(pull / total * 100, 1) if total > 0 else 0,
        "Center%": round(center / total * 100, 1) if total > 0 else 0,
        "Oppo%": round(oppo / total * 100, 1) if total > 0 else 0,
        "Total_BIP": total
    }
# Demo: spray chart and pull tendency for 200 random batted balls
if __name__ == "__main__":
    np.random.seed(42)
    n = 200

    # Random landing spots in polar form (angle from centre field, distance)
    spray_angles = np.radians(np.random.uniform(-45, 45, n))
    carry = np.random.uniform(100, 350, n)
    feet_x = carry * np.sin(spray_angles)
    feet_y = carry * np.cos(spray_angles)

    # Remaining random columns, drawn in the same order as the frame lists them
    exit_velocity = np.random.normal(90, 10, n)
    launch_angle = np.random.normal(15, 15, n)
    outcomes = np.random.choice(
        ["single", "double", "triple", "home_run", "field_out"],
        n, p=[0.2, 0.05, 0.01, 0.04, 0.7]
    )

    batted_balls = pd.DataFrame({
        # Feet from home plate mapped back to Statcast-like coordinates
        "hc_x": feet_x / 2.5 + 125.42,
        "hc_y": 199.27 - feet_y / 2.5,
        "launch_speed": exit_velocity,
        "launch_angle": launch_angle,
        "events": outcomes,
        "player_name": "Sample Batter"
    })

    fig, ax = plt.subplots(figsize=(10, 10))
    create_spray_chart(batted_balls, color_by="hit_outcome", ax=ax)
    plt.tight_layout()
    # plt.savefig("spray_chart.png")
    # plt.show()

    tendency = calculate_pull_tendency(batted_balls)
    print("Pull Tendency:", tendency)
NBA Shot Chart Generator
Create basketball shot charts with zones and percentages.
"""NBA shot chart generator."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle, Arc
def draw_court(ax=None, color="black", lw=2, outer_lines=False):
    """
    Draw half-court line markings on ``ax`` (the current Axes if omitted).

    The hoop is centred on the origin. All markings share ``color`` and line
    width ``lw``; the half-court boundary is added only when ``outer_lines``
    is true. Axis limits are fixed, the aspect is equal and ticks are
    removed. Returns the Axes.
    """
    if ax is None:
        ax = plt.gca()

    stroke = {"linewidth": lw, "color": color}
    outline = {**stroke, "fill": False}

    markings = [
        # Hoop
        Circle((0, 0), radius=7.5, **outline),
        # Backboard
        Rectangle((-30, -7.5), 60, -1, **stroke),
        # Paint/Key: outer and inner boxes
        Rectangle((-80, -47.5), 160, 190, **outline),
        Rectangle((-60, -47.5), 120, 190, **outline),
        # Free throw circle: solid top half, dashed bottom half
        Arc((0, 142.5), 120, 120, theta1=0, theta2=180, **stroke),
        Arc((0, 142.5), 120, 120, theta1=180, theta2=360, linestyle="dashed", **stroke),
        # Restricted Area
        Arc((0, 0), 80, 80, theta1=0, theta2=180, **stroke),
        # Three point line: zero-width rectangles for the corners, then the arc
        Rectangle((-220, -47.5), 0, 140, **stroke),
        Rectangle((220, -47.5), 0, 140, **stroke),
        Arc((0, 0), 475, 475, theta1=22, theta2=158, **stroke),
        # Center court
        Arc((0, 422.5), 120, 120, theta1=180, theta2=0, **stroke),
        Arc((0, 422.5), 40, 40, theta1=180, theta2=0, **stroke),
    ]
    if outer_lines:
        markings.append(Rectangle((-250, -47.5), 500, 470, **outline))

    for marking in markings:
        ax.add_patch(marking)

    ax.set_xlim(-250, 250)
    ax.set_ylim(-47.5, 422.5)
    ax.set_aspect("equal")
    ax.set_xticks([])
    ax.set_yticks([])
    return ax
def create_shot_chart(
    shots: pd.DataFrame,
    player_name: str = None,
    mode: str = "scatter",
    ax=None
):
    """
    Plot shots on a half court.

    Args:
        shots: DataFrame with LOC_X, LOC_Y and SHOT_MADE_FLAG columns
            (plus PLAYER_NAME when ``player_name`` is given).
        player_name: If set, only rows whose PLAYER_NAME equals it are drawn.
        mode: "scatter" (one dot per shot), "hexbin" (mean make rate per
            hexagon) or "zone" (FG% and attempt count per shot zone).
            Any other value draws just the court and title.
        ax: Axes to draw on; a new 12x11 figure is created when None.

    Returns:
        The Axes containing the chart.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(12, 11))

    if player_name:
        shots = shots[shots["PLAYER_NAME"] == player_name]

    draw_court(ax, color="black", lw=1)

    loc_x = shots["LOC_X"]
    loc_y = shots["LOC_Y"]
    made_flags = shots["SHOT_MADE_FLAG"]

    if mode == "scatter":
        # Makes are green, everything else red.
        dot_colors = ["green" if flag == 1 else "red" for flag in made_flags]
        ax.scatter(loc_x, loc_y, c=dot_colors, alpha=0.5, s=20)
    elif mode == "hexbin":
        # Colour each hexagon by the mean of the make flag, i.e. FG%;
        # bins with fewer than 3 shots are left blank.
        hexes = ax.hexbin(
            loc_x, loc_y, C=made_flags, gridsize=30, cmap="RdYlGn",
            reduce_C_function=np.mean, mincnt=3,
            extent=[-250, 250, -47.5, 400],
        )
        plt.colorbar(hexes, ax=ax, label="FG%")
    elif mode == "zone":
        # Label each non-empty zone at the centroid of its shots.
        for zone_shots in define_shot_zones(shots).values():
            attempts = len(zone_shots)
            if not attempts > 0:
                continue
            fg_pct = zone_shots["SHOT_MADE_FLAG"].mean()
            label_x = zone_shots["LOC_X"].mean()
            label_y = zone_shots["LOC_Y"].mean()
            if fg_pct > 0.4:
                label_color = "green"
            elif fg_pct > 0.3:
                label_color = "orange"
            else:
                label_color = "red"
            ax.text(label_x, label_y, f"{fg_pct:.1%}\n({attempts})",
                    ha="center", va="center", fontsize=9,
                    fontweight="bold", color=label_color)

    title = f"{player_name} Shot Chart" if player_name else "Shot Chart"
    ax.set_title(title, fontsize=14, fontweight="bold")
    return ax
def define_shot_zones(shots: pd.DataFrame) -> dict:
    """Split shots into named court zones.

    Zones are derived from LOC_X / LOC_Y only, with the basket at (0, 0).
    The thresholds (40, 80, 220, 237.5, 92.5) presumably are tenths of a
    foot as in NBA shot-location data -- TODO confirm with the data source.

    Every shot with finite coordinates falls into exactly one zone. The
    paint test uses ``|x| <= 80`` so that shots exactly on the lane line
    at the basket's height, (+-80, 0), are not left unclassified.

    Args:
        shots: DataFrame with LOC_X and LOC_Y columns.

    Returns:
        Dict mapping zone name to the sub-DataFrame of shots in that zone.
        Empty zones are included as empty DataFrames.
    """
    x = shots["LOC_X"]
    y = shots["LOC_Y"]
    abs_x = np.abs(x)

    # Distance from basket
    dist = np.sqrt(x**2 + y**2)

    # A three is either beyond the arc radius (approx 237.5) or outside
    # the straight corner segment (|x| > 220 while y < 92.5).
    is_three = (dist > 237.5) | ((abs_x > 220) & (y < 92.5))
    is_two = ~is_three

    return {
        "Restricted Area": shots[dist <= 40],
        # Paint (non-restricted); inclusive lane boundary closes the gap
        # at (+-80, 0) that no other zone covers.
        "Paint": shots[(dist > 40) & (dist <= 80) & (abs_x <= 80)],
        "Mid-Range Left": shots[is_two & (x < -80) & (dist > 40)],
        "Mid-Range Center": shots[is_two & (abs_x <= 80) & (dist > 80)],
        "Mid-Range Right": shots[is_two & (x > 80) & (dist > 40)],
        "Corner 3 Left": shots[is_three & (x < -220)],
        "Corner 3 Right": shots[is_three & (x > 220)],
        "Above Break 3": shots[is_three & (abs_x <= 220)],
    }
def shot_zone_summary(shots: pd.DataFrame) -> pd.DataFrame:
    """Summarise shooting by zone.

    Args:
        shots: DataFrame with LOC_X, LOC_Y and SHOT_MADE_FLAG columns.

    Returns:
        DataFrame with one row per non-empty zone and the columns
        Zone, FGA, FGM, FG%, Pts/Shot, sorted by Pts/Shot descending and
        rounded to 3 decimals. When no zone contains a shot the result is
        an empty frame that still has those columns.
    """
    columns = ["Zone", "FGA", "FGM", "FG%", "Pts/Shot"]
    rows = []
    for zone_name, zone_shots in define_shot_zones(shots).items():
        if len(zone_shots) == 0:
            continue
        made = zone_shots["SHOT_MADE_FLAG"]
        fg_pct = made.mean()
        # Zones with "3" in their name are the three-point zones.
        shot_value = 3 if "3" in zone_name else 2
        rows.append({
            "Zone": zone_name,
            "FGA": len(zone_shots),
            "FGM": made.sum(),
            "FG%": fg_pct,
            "Pts/Shot": fg_pct * shot_value,
        })
    # Passing the column list keeps "Pts/Shot" present even with no rows,
    # so the sort below cannot raise KeyError on empty input.
    summary = pd.DataFrame(rows, columns=columns)
    return summary.sort_values("Pts/Shot", ascending=False).round(3)
# Example usage
if __name__ == "__main__":
    # Build a reproducible synthetic sample; the three random draws are
    # made in the same order as the columns are listed.
    np.random.seed(42)
    n = 500
    sample_x = np.random.normal(0, 100, n)
    sample_y = np.random.uniform(0, 300, n)
    sample_made = np.random.binomial(1, 0.45, n)
    shots = pd.DataFrame({
        "LOC_X": sample_x,
        "LOC_Y": sample_y,
        "SHOT_MADE_FLAG": sample_made,
        "PLAYER_NAME": "Sample Player"
    })

    # Side-by-side charts: raw scatter on the left, hexbin on the right
    fig, axes = plt.subplots(1, 2, figsize=(20, 10))
    for chart_mode, chart_ax in zip(("scatter", "hexbin"), axes):
        create_shot_chart(shots, mode=chart_mode, ax=chart_ax)
    plt.tight_layout()

    # Zone summary table
    print(shot_zone_summary(shots))
NFL Passing Stats Calculator
Calculate advanced NFL passing metrics including passer rating, AY/A, ANY/A, CPOE, air yards, and pressure splits.
"""NFL Advanced Passing Statistics Calculator."""
import pandas as pd
import numpy as np
class PassingMetrics:
    """Stateless collection of advanced NFL passing formulas.

    Every method is a static method; rate-style results fall back to 0
    when their denominator is not positive.
    """

    @staticmethod
    def passer_rating(comp: int, att: int, yards: int, td: int, int_: int) -> float:
        """
        NFL passer rating from completions, attempts, yards, TDs and INTs.

        Each of the four components is clamped to [0, 2.375], which makes
        the maximum rating 158.3. Returns 0 when there are no attempts.
        """
        if att == 0:
            return 0

        def clamp(component):
            # Keep a component inside the allowed 0..2.375 band.
            return max(0, min(component, 2.375))

        components = (
            clamp(((comp / att) - 0.3) * 5),          # completion rate
            clamp(((yards / att) - 3) * 0.25),        # yards per attempt
            clamp((td / att) * 20),                   # touchdown rate
            clamp(2.375 - ((int_ / att) * 25)),       # interception rate
        )
        return (sum(components) / 6) * 100

    @staticmethod
    def adjusted_yards_per_attempt(yards: int, td: int, int_: int, att: int) -> float:
        """
        Adjusted Yards per Attempt (AY/A).

        AY/A = (Yards + 20*TD - 45*INT) / Attempts; 0 with no attempts.
        """
        if att == 0:
            return 0
        adjusted_yards = yards + 20 * td - 45 * int_
        return adjusted_yards / att

    @staticmethod
    def adjusted_net_yards_per_attempt(
        yards: int, td: int, int_: int, att: int,
        sacks: int, sack_yards: int
    ) -> float:
        """
        Adjusted Net Yards per Attempt (ANY/A).

        Like AY/A, but sack yardage is subtracted from the numerator and
        sacks are added to the denominator. Returns 0 when attempts plus
        sacks is zero.
        """
        dropbacks = att + sacks
        if dropbacks == 0:
            return 0
        net_adjusted_yards = yards + 20 * td - 45 * int_ - sack_yards
        return net_adjusted_yards / dropbacks

    @staticmethod
    def completion_percentage_over_expected(
        actual_comp_pct: float,
        expected_comp_pct: float
    ) -> float:
        """
        Completion Percentage Over Expected (CPOE).

        Simply actual minus expected; the expected completion percentage
        must be computed by the caller.
        """
        difference = actual_comp_pct - expected_comp_pct
        return difference

    @staticmethod
    def air_yards_metrics(
        air_yards: int, att: int, comp: int,
        completed_air_yards: int, yards: int
    ) -> dict:
        """
        Air-yards metrics: per-attempt and per-completion air yards, yards
        after catch (total and per completion) and RACR (yards divided by
        intended air yards).
        """
        def per(total, count):
            # Guarded division: 0 when the denominator is not positive.
            return total / count if count > 0 else 0

        yards_after_catch = yards - completed_air_yards
        return {
            "Intended_Air_Yards_per_Att": per(air_yards, att),
            "Completed_Air_Yards_per_Comp": per(completed_air_yards, comp),
            "Air_Yards_per_Att": per(completed_air_yards, att),
            "YAC": yards_after_catch,
            "YAC_per_Comp": per(yards_after_catch, comp),
            "RACR": per(yards, air_yards),  # Receiver Air Conversion Ratio
        }

    @staticmethod
    def pressure_stats(
        pressures: int, dropbacks: int,
        pressured_comp: int, pressured_att: int, pressured_yards: int,
        clean_comp: int, clean_att: int, clean_yards: int
    ) -> dict:
        """
        Pressure rate plus completion rate and yards per attempt, split
        into pressured and clean-pocket attempts.
        """
        def per(total, count):
            # Guarded division: 0 when the denominator is not positive.
            return total / count if count > 0 else 0

        return {
            "Pressure_Rate": per(pressures, dropbacks),
            "Pressured_Comp%": per(pressured_comp, pressured_att),
            "Pressured_YPA": per(pressured_yards, pressured_att),
            "Clean_Comp%": per(clean_comp, clean_att),
            "Clean_YPA": per(clean_yards, clean_att),
        }
def calculate_qb_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add advanced and basic passing statistics to a QB table.

    Expected columns: completions, attempts, yards, td, int.
    Optional columns: sacks and sack_yards; ANY/A is added only when
    BOTH are present, because the formula needs both.

    Args:
        df: One row per quarterback (or per QB-season).

    Returns:
        Copy of ``df`` with Passer_Rating, AY/A, (ANY/A), Comp%, YPA, TD%
        and INT% columns appended. The rating/AY/A/ANY/A columns are 0 for
        rows with no attempts; the basic rate columns are plain pandas
        divisions and follow pandas' divide-by-zero behaviour (NaN/inf).
    """
    results = df.copy()

    comp, att = df["completions"], df["attempts"]
    yards, td, ints = df["yards"], df["td"], df["int"]

    # Row-wise values are built with zip rather than df.apply(axis=1):
    # apply() returns a DataFrame (not a Series) for an empty frame, which
    # cannot be assigned to a single column.
    results["Passer_Rating"] = np.array(
        [
            PassingMetrics.passer_rating(c, a, y, t, i)
            for c, a, y, t, i in zip(comp, att, yards, td, ints)
        ],
        dtype=float,
    )

    results["AY/A"] = np.array(
        [
            PassingMetrics.adjusted_yards_per_attempt(y, t, i, a)
            for y, t, i, a in zip(yards, td, ints, att)
        ],
        dtype=float,
    )

    # ANY/A reads both sack columns, so require both before computing it.
    if "sacks" in df.columns and "sack_yards" in df.columns:
        results["ANY/A"] = np.array(
            [
                PassingMetrics.adjusted_net_yards_per_attempt(y, t, i, a, s, sy)
                for y, t, i, a, s, sy in zip(
                    yards, td, ints, att, df["sacks"], df["sack_yards"]
                )
            ],
            dtype=float,
        )

    # Basic per-attempt rates
    results["Comp%"] = comp / att
    results["YPA"] = yards / att
    results["TD%"] = td / att
    results["INT%"] = ints / att
    return results
# Example usage
if __name__ == "__main__":
    # Sample QB season data, one tuple per quarterback
    stat_columns = [
        "player", "completions", "attempts", "yards", "td", "int",
        "sacks", "sack_yards", "air_yards",
    ]
    season_lines = [
        ("QB1", 380, 560, 4500, 35, 10, 25, 180, 4800),
        ("QB2", 360, 550, 4200, 30, 12, 35, 250, 4500),
        ("QB3", 340, 520, 3900, 28, 8, 20, 140, 4000),
    ]
    qb_stats = pd.DataFrame(season_lines, columns=stat_columns)

    results = calculate_qb_stats(qb_stats)

    print("QB Statistics:")
    shown = ["player", "Comp%", "YPA", "Passer_Rating", "AY/A", "ANY/A"]
    print(results[shown].round(2))
Soccer Pass Network Analysis
Analyze team passing networks and player connectivity in soccer matches.
"""Soccer Pass Network Analysis."""
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
class PassNetwork:
    """Build and analyse a team's passing network from match events.

    Typical use: construct with the event table, call ``build_network``
    for one team, then query the metric/plot methods. Those methods raise
    ValueError until a network has been built.
    """

    def __init__(self, events: pd.DataFrame):
        """
        Store the event data.

        Expected columns: player_name, pass_recipient, team, location, type
        (only player_name, pass_recipient, team and type are read here).
        """
        self.events = events
        # Directed pass graph; populated by build_network().
        self.graph = None

    def _require_graph(self) -> None:
        """Raise ValueError when build_network() has not been called yet."""
        if self.graph is None:
            raise ValueError("Build network first")

    def build_network(self, team: str) -> nx.DiGraph:
        """Build the directed pass graph for ``team``.

        Only events of type "Pass" with a non-null recipient are used.
        Each edge passer -> recipient carries a ``weight`` equal to the
        number of such passes. The graph is stored on ``self.graph`` and
        returned; it is empty when the team has no matching passes.
        """
        is_team_pass = (
            (self.events["team"] == team)
            & (self.events["type"] == "Pass")
            & (self.events["pass_recipient"].notna())
        )
        passes = self.events[is_team_pass]

        # Count passes for every (passer, recipient) pair.
        pass_counts = passes.groupby(["player_name", "pass_recipient"]).size()

        G = nx.DiGraph()
        for (passer, recipient), count in pass_counts.items():
            G.add_edge(passer, recipient, weight=int(count))

        self.graph = G
        return G

    def centrality_metrics(self) -> pd.DataFrame:
        """Per-player centrality and pass-volume metrics.

        Returns:
            DataFrame sorted by PageRank (descending) with degree,
            betweenness and closeness centrality, PageRank, and weighted
            passes received / made / total.

        Raises:
            ValueError: If no network has been built.
        """
        self._require_graph()
        graph = self.graph
        nodes = list(graph.nodes())

        degree = nx.degree_centrality(graph)
        betweenness = nx.betweenness_centrality(graph)
        closeness = nx.closeness_centrality(graph)
        pagerank = nx.pagerank(graph)
        received = dict(graph.in_degree(weight="weight"))
        made = dict(graph.out_degree(weight="weight"))

        # Look every metric up by node so the columns are aligned by
        # player, not by the iteration order of each result dict.
        metrics = pd.DataFrame({
            "Player": nodes,
            "Degree_Centrality": [degree[n] for n in nodes],
            "Betweenness": [betweenness[n] for n in nodes],
            "Closeness": [closeness[n] for n in nodes],
            "PageRank": [pagerank[n] for n in nodes],
            "Passes_Received": [received.get(n, 0) for n in nodes],
            "Passes_Made": [made.get(n, 0) for n in nodes],
        })
        metrics["Total_Involvement"] = (
            metrics["Passes_Received"] + metrics["Passes_Made"]
        )
        return metrics.sort_values("PageRank", ascending=False)

    def key_partnerships(self, top_n: int = 10) -> pd.DataFrame:
        """Most frequent passing partnerships, ignoring pass direction.

        Args:
            top_n: Maximum number of partnerships to return.

        Returns:
            DataFrame with "Partnership" ("A - B", names sorted) and
            "Passes" (passes in both directions combined), largest first.
            Empty (but with both columns) when the graph has no edges.

        Raises:
            ValueError: If no network has been built.
        """
        self._require_graph()

        # Fold A->B and B->A into one undirected pair total.
        pair_totals = {}
        for passer, recipient, data in self.graph.edges(data=True):
            pair = tuple(sorted([passer, recipient]))
            pair_totals[pair] = pair_totals.get(pair, 0) + data["weight"]

        # Largest total first; ties broken by pair name for determinism.
        ranked = sorted(pair_totals.items(), key=lambda kv: (-kv[1], kv[0]))
        ranked = ranked[:top_n]

        return pd.DataFrame({
            "Partnership": [f"{pair[0]} - {pair[1]}" for pair, _ in ranked],
            "Passes": [total for _, total in ranked],
        })

    def plot_network(self, ax=None, layout="spring"):
        """Draw the passing network.

        Args:
            ax: Axes to draw on; a new 12x10 figure is created when None.
            layout: "spring", "circular" or "kamada_kawai"; any other
                value falls back to a default spring layout.

        Returns:
            The Axes containing the drawing.

        Raises:
            ValueError: If no network has been built.
        """
        self._require_graph()
        graph = self.graph

        if ax is None:
            _, ax = plt.subplots(figsize=(12, 10))

        if layout == "spring":
            pos = nx.spring_layout(graph, k=2, iterations=50)
        elif layout == "circular":
            pos = nx.circular_layout(graph)
        elif layout == "kamada_kawai":
            pos = nx.kamada_kawai_layout(graph)
        else:
            pos = nx.spring_layout(graph)

        # Node size grows with total (received + made) weighted passes.
        received = dict(graph.in_degree(weight="weight"))
        made = dict(graph.out_degree(weight="weight"))
        node_sizes = [
            (received.get(n, 0) + made.get(n, 0)) * 10 + 100
            for n in graph.nodes()
        ]

        # Edge width is the pass count scaled so the busiest edge is 5.
        edge_weights = [graph[u][v]["weight"] for u, v in graph.edges()]
        max_weight = max(edge_weights) if edge_weights else 1
        edge_widths = [w / max_weight * 5 for w in edge_weights]

        nx.draw_networkx_nodes(graph, pos, node_size=node_sizes,
                               node_color="lightblue", alpha=0.8, ax=ax)
        nx.draw_networkx_labels(graph, pos, font_size=8, ax=ax)
        nx.draw_networkx_edges(graph, pos, width=edge_widths,
                               alpha=0.5, edge_color="gray",
                               connectionstyle="arc3,rad=0.1", ax=ax)
        ax.set_title("Team Passing Network", fontsize=14, fontweight="bold")
        ax.axis("off")
        return ax

    def network_stats(self) -> dict:
        """Whole-network summary statistics.

        Returns:
            Dict with Nodes, Edges, Density, Average_Clustering,
            Avg_Shortest_Path and Total_Passes. Clustering and shortest
            path use the undirected version of the graph.
            Avg_Shortest_Path is None when that graph is disconnected;
            for an empty graph both it and Average_Clustering are None
            because they are undefined without nodes.

        Raises:
            ValueError: If no network has been built.
        """
        self._require_graph()
        graph = self.graph
        node_count = graph.number_of_nodes()

        # networkx raises on several of these metrics for a graph with no
        # nodes, so report the empty network explicitly.
        if node_count == 0:
            return {
                "Nodes": 0,
                "Edges": 0,
                "Density": 0,
                "Average_Clustering": None,
                "Avg_Shortest_Path": None,
                "Total_Passes": 0,
            }

        undirected = graph.to_undirected()
        if nx.is_connected(undirected):
            avg_path = nx.average_shortest_path_length(undirected)
        else:
            avg_path = None

        return {
            "Nodes": node_count,
            "Edges": graph.number_of_edges(),
            "Density": nx.density(graph),
            "Average_Clustering": nx.average_clustering(undirected),
            "Avg_Shortest_Path": avg_path,
            "Total_Passes": sum(
                data["weight"] for _, _, data in graph.edges(data=True)
            ),
        }
# Example usage
if __name__ == "__main__":
    # Synthetic match: 300 random passes between the eleven positions.
    np.random.seed(42)
    players = ["GK", "LB", "CB1", "CB2", "RB", "CDM", "CM1", "CM2", "LW", "RW", "ST"]

    events = []
    for _ in range(300):  # 300 passes in a game
        # Draw order (passer, recipient, x, y) is kept so the seeded
        # random stream yields the same sample.
        passer = np.random.choice(players)
        teammates = [p for p in players if p != passer]
        recipient = np.random.choice(teammates)
        pitch_x = np.random.uniform(0, 120)
        pitch_y = np.random.uniform(0, 80)
        events.append({
            "player_name": passer,
            "pass_recipient": recipient,
            "team": "Home Team",
            "type": "Pass",
            "location": [pitch_x, pitch_y]
        })
    events_df = pd.DataFrame(events)

    # Build the home side's network, then report on it
    network = PassNetwork(events_df)
    G = network.build_network("Home Team")

    print("Network Stats:")
    for stat_name, stat_value in network.network_stats().items():
        print(f" {stat_name}: {stat_value}")

    print("\nPlayer Centrality:")
    print(network.centrality_metrics().head(5))

    print("\nKey Partnerships:")
    print(network.key_partnerships(5))
NHL Corsi and Fenwick Calculator
Calculate advanced hockey possession metrics including Corsi, Fenwick, and PDO.
"""NHL Advanced Possession Metrics."""
import pandas as pd
import numpy as np
class HockeyMetrics:
    """Stateless helpers for NHL possession and efficiency metrics."""

    @staticmethod
    def corsi(shots_for: int, shots_against: int, blocked_for: int,
              blocked_against: int, missed_for: int, missed_against: int) -> dict:
        """
        Corsi: all shot attempts (on goal + blocked + missed).

        Returns CF, CA, their difference and Corsi% = CF / (CF + CA) * 100.
        Corsi% is 50 when there are no attempts at all.
        """
        attempts_for = shots_for + blocked_for + missed_for
        attempts_against = shots_against + blocked_against + missed_against
        all_attempts = attempts_for + attempts_against
        if all_attempts > 0:
            share = attempts_for / all_attempts * 100
        else:
            share = 50
        return {
            "CF": attempts_for,
            "CA": attempts_against,
            "Corsi_Diff": attempts_for - attempts_against,
            "Corsi%": share,
        }

    @staticmethod
    def fenwick(shots_for: int, shots_against: int,
                missed_for: int, missed_against: int) -> dict:
        """
        Fenwick: unblocked shot attempts (on goal + missed).

        Returns FF, FA, their difference and Fenwick% = FF / (FF + FA) * 100.
        Fenwick% is 50 when there are no unblocked attempts at all.
        """
        unblocked_for = shots_for + missed_for
        unblocked_against = shots_against + missed_against
        all_unblocked = unblocked_for + unblocked_against
        if all_unblocked > 0:
            share = unblocked_for / all_unblocked * 100
        else:
            share = 50
        return {
            "FF": unblocked_for,
            "FA": unblocked_against,
            "Fenwick_Diff": unblocked_for - unblocked_against,
            "Fenwick%": share,
        }

    @staticmethod
    def pdo(shooting_pct: float, save_pct: float) -> float:
        """
        PDO = shooting % + save %.

        Note the mixed units: ``save_pct`` is multiplied by 100 here while
        ``shooting_pct`` is added as given, so pass shooting percentage on
        a 0-100 scale and save percentage as a 0-1 fraction.
        """
        save_on_percent_scale = save_pct * 100
        return shooting_pct + save_on_percent_scale

    @staticmethod
    def expected_goals(shots: pd.DataFrame) -> float:
        """
        Sum a simplified expected-goals value over ``shots``.

        Each shot contributes a base value looked up from its
        ``shot_type`` (0.05 for unknown or missing types) multiplied by a
        factor chosen from its ``distance`` (30 assumed when missing).
        """
        # Base xG by shot type (simplified)
        base_by_type = {
            "WRIST": 0.05,
            "SLAP": 0.04,
            "SNAP": 0.06,
            "BACKHAND": 0.08,
            "DEFLECTION": 0.15,
            "TIP-IN": 0.20,
            "WRAP": 0.10
        }
        # (exclusive upper distance bound, multiplier), nearest band first
        distance_bands = ((10, 2.0), (20, 1.5), (30, 1.0), (40, 0.7))

        def distance_multiplier(dist):
            for upper_bound, multiplier in distance_bands:
                if dist < upper_bound:
                    return multiplier
            return 0.3

        return sum(
            base_by_type.get(shot.get("shot_type", "WRIST"), 0.05)
            * distance_multiplier(shot.get("distance", 30))
            for _, shot in shots.iterrows()
        )

    @staticmethod
    def relative_metrics(player_on: dict, player_off: dict) -> dict:
        """
        On-ice minus off-ice values for Corsi%, Fenwick% and GF%.

        A metric missing from either dict is treated as 50. Positive means
        the value is higher with the player on the ice.
        """
        def on_minus_off(metric):
            return player_on.get(metric, 50) - player_off.get(metric, 50)

        return {
            "Rel_Corsi%": on_minus_off("Corsi%"),
            "Rel_Fenwick%": on_minus_off("Fenwick%"),
            "Rel_GF%": on_minus_off("GF%"),
        }

    @staticmethod
    def zone_starts(off_zone: int, def_zone: int, neutral: int) -> dict:
        """
        Zone-start percentages (offensive, defensive, neutral) and the
        offensive-minus-defensive differential, all as percentages of
        total starts. Everything is 0 when there are no starts.
        """
        starts = off_zone + def_zone + neutral

        def share(count):
            return count / starts * 100 if starts > 0 else 0

        return {
            "OZS%": share(off_zone),
            "DZS%": share(def_zone),
            "NZS%": share(neutral),
            "ZS_Diff": share(off_zone - def_zone),
        }
def calculate_team_metrics(team_stats: pd.DataFrame) -> pd.DataFrame:
    """Compute Corsi, Fenwick, PDO, Sh% and Sv% for every team row.

    Reads the columns Team, SOG, SOG_Against, Goals_For, Goals_Against,
    Blocked_For, Blocked_Against, Missed_For and Missed_Against, and
    returns one output row per input row.
    """
    def metrics_for(team):
        shots_for, shots_against = team["SOG"], team["SOG_Against"]
        missed_for, missed_against = team["Missed_For"], team["Missed_Against"]

        corsi = HockeyMetrics.corsi(
            shots_for, shots_against,
            team["Blocked_For"], team["Blocked_Against"],
            missed_for, missed_against
        )
        fenwick = HockeyMetrics.fenwick(
            shots_for, shots_against, missed_for, missed_against
        )

        # Shooting % is on a 0-100 scale; the save rate stays a fraction
        # because HockeyMetrics.pdo scales it by 100 itself.
        shooting_pct = team["Goals_For"] / shots_for * 100
        save_fraction = 1 - team["Goals_Against"] / shots_against

        return {
            "Team": team["Team"],
            **corsi,
            **fenwick,
            "PDO": HockeyMetrics.pdo(shooting_pct, save_fraction),
            "Sh%": shooting_pct,
            "Sv%": save_fraction * 100,
        }

    return pd.DataFrame([metrics_for(team) for _, team in team_stats.iterrows()])
def calculate_player_metrics(player_events: pd.DataFrame, player_name: str) -> dict:
    """
    Corsi metrics for one player, on-ice and relative to off-ice.

    An event counts as "on ice" when its ``player_on_ice`` text contains
    ``player_name`` as a literal substring; every other event (including
    rows with a missing ``player_on_ice``) counts as "off ice".

    Args:
        player_events: Events with ``player_on_ice`` and ``event_type``
            columns. Recognised event types are SHOT, SHOT_AGAINST,
            BLOCKED_SHOT, BLOCKED_SHOT_AGAINST, MISSED_SHOT and
            MISSED_SHOT_AGAINST.
        player_name: Name to look for in ``player_on_ice``.

    Returns:
        Dict with Player, the on-ice Corsi values, the relative metrics
        (on-ice minus off-ice) and TOI, a simplified estimate equal to
        the number of on-ice events divided by 60.
    """
    # regex=False: names such as "J. Smith" or "O'Neil (C)" contain regex
    # metacharacters and must be matched literally, not as a pattern.
    is_on_ice = player_events["player_on_ice"].str.contains(
        player_name, na=False, regex=False
    )
    on_ice = player_events[is_on_ice]
    off_ice = player_events[~is_on_ice]

    def corsi_for(events):
        event_types = events["event_type"]

        def count(kind):
            return int((event_types == kind).sum())

        return HockeyMetrics.corsi(
            count("SHOT"), count("SHOT_AGAINST"),
            count("BLOCKED_SHOT"), count("BLOCKED_SHOT_AGAINST"),
            count("MISSED_SHOT"), count("MISSED_SHOT_AGAINST"),
        )

    on_metrics = corsi_for(on_ice)
    off_metrics = corsi_for(off_ice)
    rel_metrics = HockeyMetrics.relative_metrics(on_metrics, off_metrics)

    return {
        "Player": player_name,
        **on_metrics,
        **rel_metrics,
        "TOI": len(on_ice) / 60  # Simplified TOI estimate
    }
# Example usage
if __name__ == "__main__":
    # Sample team data, one tuple per team
    team_columns = [
        "Team", "SOG", "SOG_Against", "Goals_For", "Goals_Against",
        "Blocked_For", "Blocked_Against", "Missed_For", "Missed_Against",
    ]
    team_lines = [
        ("TOR", 2500, 2300, 250, 220, 800, 720, 600, 550),
        ("BOS", 2450, 2200, 240, 200, 750, 680, 550, 500),
        ("TBL", 2600, 2400, 270, 235, 850, 780, 650, 600),
        ("FLA", 2400, 2500, 230, 245, 700, 750, 580, 620),
        ("NYR", 2550, 2350, 255, 225, 780, 700, 620, 570),
    ]
    teams = pd.DataFrame(team_lines, columns=team_columns)

    results = calculate_team_metrics(teams)

    print("Team Advanced Metrics:")
    shown = ["Team", "Corsi%", "Fenwick%", "PDO", "Sh%", "Sv%"]
    print(results[shown].round(2))
Elo Rating System
Implement Elo rating system for any sport with customizable K-factor and home advantage.
"""Elo Rating System for sports."""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional
from datetime import date
class EloRating:
    """
    Elo rating system for sports teams/players.

    Features:
    - Customizable K-factor
    - Home field advantage (added once, to the home side only)
    - Margin of victory adjustment
    - Season regression
    """

    def __init__(
        self,
        k_factor: float = 20,
        home_advantage: float = 100,
        initial_rating: float = 1500,
        regression_factor: float = 0.33
    ):
        """
        Initialize Elo system.

        Args:
            k_factor: Base rating change per game (before any margin of
                victory multiplier)
            home_advantage: Elo points added to the home team's rating
                when computing expectations
            initial_rating: Starting rating for new teams
            regression_factor: Fraction by which ratings move toward the
                mean between seasons
        """
        self.k = k_factor
        self.home_adv = home_advantage
        self.initial = initial_rating
        self.regression = regression_factor
        # Current rating per team; teams appear here after their first game.
        self.ratings: Dict[str, float] = {}
        # One record per processed game, in processing order.
        self.history: List[Dict] = []

    def get_rating(self, team: str) -> float:
        """Return the team's current rating, or the initial rating if unseen."""
        return self.ratings.get(team, self.initial)

    def expected_score(self, rating_a: float, rating_b: float) -> float:
        """
        Expected score (win probability) of A against B under the
        standard logistic Elo curve with a 400-point scale.
        """
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

    def margin_of_victory_mult(
        self,
        score_diff: float,
        winner_elo: float,
        loser_elo: float
    ) -> float:
        """
        Margin of victory multiplier for the K-factor.

        Grows with the log of the score margin and shrinks when the winner
        was already rated higher, so upsets move ratings more than
        expected wins.
        """
        elo_diff = winner_elo - loser_elo
        # FiveThirtyEight NFL formula
        return np.log(abs(score_diff) + 1) * (2.2 / ((elo_diff * 0.001) + 2.2))

    def _with_home_edge(
        self, rating_a: float, rating_b: float, is_home_a: bool
    ) -> Tuple[float, float]:
        """Return both ratings with the home edge applied to the home side.

        The edge is added once. Adding it to one side AND subtracting it
        from the other would make the effective advantage twice the
        configured ``home_advantage``.
        """
        if is_home_a:
            return rating_a + self.home_adv, rating_b
        return rating_a, rating_b + self.home_adv

    def update(
        self,
        team_a: str,
        team_b: str,
        score_a: float,
        score_b: float,
        is_home_a: bool = True,
        use_mov: bool = True
    ) -> Tuple[float, float]:
        """
        Update ratings after a game and append a history record.

        Args:
            team_a: First team
            team_b: Second team
            score_a: Team A score
            score_b: Team B score
            is_home_a: True if team A is the home team, otherwise team B
                is treated as the home team
            use_mov: Scale the K-factor by the margin of victory
                multiplier (ignored for ties)

        Returns:
            Tuple of (new_rating_a, new_rating_b)
        """
        rating_a = self.get_rating(team_a)
        rating_b = self.get_rating(team_b)

        # Expectations use home-adjusted ratings; stored ratings do not
        # include the home edge.
        adjusted_a, adjusted_b = self._with_home_edge(rating_a, rating_b, is_home_a)
        exp_a = self.expected_score(adjusted_a, adjusted_b)
        exp_b = 1 - exp_a

        # Actual scores (1 for win, 0.5 for tie, 0 for loss)
        if score_a > score_b:
            actual_a, actual_b = 1, 0
        elif score_a < score_b:
            actual_a, actual_b = 0, 1
        else:
            actual_a, actual_b = 0.5, 0.5

        # K-factor adjustment for margin of victory
        k_mult = 1
        if use_mov and score_a != score_b:
            a_won = score_a > score_b
            winner_elo = rating_a if a_won else rating_b
            loser_elo = rating_b if a_won else rating_a
            k_mult = self.margin_of_victory_mult(
                abs(score_a - score_b), winner_elo, loser_elo
            )

        k_adjusted = self.k * k_mult
        new_rating_a = rating_a + k_adjusted * (actual_a - exp_a)
        new_rating_b = rating_b + k_adjusted * (actual_b - exp_b)

        self.ratings[team_a] = new_rating_a
        self.ratings[team_b] = new_rating_b

        self.history.append({
            "team_a": team_a,
            "team_b": team_b,
            "score_a": score_a,
            "score_b": score_b,
            "rating_a_before": rating_a,
            "rating_b_before": rating_b,
            "rating_a_after": new_rating_a,
            "rating_b_after": new_rating_b,
            "expected_a": exp_a,
            "k_mult": k_mult
        })
        return new_rating_a, new_rating_b

    def new_season(self):
        """
        Regress every rating toward the current mean rating.

        Each rating becomes ``(1 - regression) * rating + regression * mean``.
        Does nothing when no team has been rated yet.
        """
        if not self.ratings:
            return
        mean_rating = np.mean(list(self.ratings.values()))
        keep = 1 - self.regression
        for team, rating in self.ratings.items():
            self.ratings[team] = rating * keep + mean_rating * self.regression

    def predict(self, team_a: str, team_b: str, is_home_a: bool = True) -> Dict:
        """
        Predict game outcome.

        Returns:
            Dict with ``prob_a`` / ``prob_b`` (win probabilities),
            ``spread`` (home-adjusted rating gap / 25, rounded to 0.1,
            positive when A is favoured) and the unadjusted ``rating_a``
            / ``rating_b``.
        """
        base_a = self.get_rating(team_a)
        base_b = self.get_rating(team_b)
        adjusted_a, adjusted_b = self._with_home_edge(base_a, base_b, is_home_a)

        prob_a = self.expected_score(adjusted_a, adjusted_b)
        # Elo to spread conversion (rough: 25 Elo = 1 point)
        spread = (adjusted_a - adjusted_b) / 25

        return {
            "prob_a": prob_a,
            "prob_b": 1 - prob_a,
            "spread": round(spread, 1),
            "rating_a": base_a,
            "rating_b": base_b
        }

    def get_rankings(self) -> pd.DataFrame:
        """Return teams ordered by rating (best first) with a 1-based Rank.

        The result has the columns Rank, Team, Rating and is empty (with
        those columns) when no games have been processed.
        """
        # Naming the columns up front keeps "Rating" available for the
        # sort even when there are no rated teams.
        df = pd.DataFrame(list(self.ratings.items()), columns=["Team", "Rating"])
        df = df.sort_values("Rating", ascending=False).reset_index(drop=True)
        df["Rank"] = df.index + 1
        return df[["Rank", "Team", "Rating"]]

    def process_games(self, games: pd.DataFrame) -> "EloRating":
        """
        Process multiple games in row order.

        Expected columns: team_a, team_b, score_a, score_b,
        is_home_a (optional; team A is treated as home when absent).

        Returns:
            This instance, so calls can be chained.
        """
        for _, row in games.iterrows():
            self.update(
                row["team_a"],
                row["team_b"],
                row["score_a"],
                row["score_b"],
                row.get("is_home_a", True)
            )
        return self
# Example usage
if __name__ == "__main__":
    # Initialize Elo system
    elo = EloRating(k_factor=20, home_advantage=65)

    # Sample NFL season games
    np.random.seed(42)
    teams = ["KC", "BUF", "PHI", "SF", "DAL", "MIA", "BAL", "DET",
             "CIN", "JAX", "NYJ", "LAC", "SEA", "MIN", "GB", "TB"]

    # Each week: shuffle the teams, pair neighbours (first of each pair is
    # home) and draw a Poisson score for home, then away.
    games = []
    for week in range(1, 18):
        np.random.shuffle(teams)
        for home, away in zip(teams[0::2], teams[1::2]):
            home_score = np.random.poisson(24)
            away_score = np.random.poisson(21)
            games.append({
                "week": week,
                "team_a": home,
                "team_b": away,
                "score_a": home_score,
                "score_b": away_score,
                "is_home_a": True
            })
    games_df = pd.DataFrame(games)

    # Process all games
    elo.process_games(games_df)

    # Show rankings
    print("Final Elo Rankings:")
    print(elo.get_rankings())

    # Make a prediction
    prediction = elo.predict("KC", "BUF", is_home_a=True)
    print("\nKC vs BUF prediction:")
    print(f" KC win prob: {prediction['prob_a']:.1%}")
    print(f" Spread: KC {prediction['spread']}")
Win Probability Model
Calculate real-time win probability for sports games based on game state.
"""Win Probability model for sports games."""
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier
from typing import Dict, List, Tuple
class WinProbability:
    """
    Calculate real-time win probability based on game state.

    Two estimators are available: a closed-form one based on score
    differential and time remaining (``analytical_wp``), and an optional
    gradient-boosting model trained on historical game states
    (``train_model`` / ``predict_wp``).
    """

    # Default scoring rate per time unit for each supported sport.
    _DEFAULT_TEMPO = {
        "basketball": 100 / 48,  # ~100 points per 48 min
        "football": 24 / 60,     # ~24 points per 60 min
        "soccer": 2.7 / 90,      # ~2.7 goals per 90 min
        "hockey": 6 / 60,        # ~6 goals per 60 min
        "baseball": 9 / 9        # ~9 runs per 9 innings
    }

    def __init__(self, sport: str = "basketball"):
        self.sport = sport
        # Trained classifier; None until train_model() is called.
        self.model = None
        # Feature column names used by the trained model.
        self.feature_cols = []

    @staticmethod
    def _decided_wp(score_diff: float) -> float:
        """Win probability when no further scoring can change the result."""
        if score_diff > 0:
            return 1.0
        if score_diff == 0:
            return 0.5
        return 0.0

    def analytical_wp(
        self,
        score_diff: float,
        time_remaining: float,
        total_time: float,
        tempo: float = None
    ) -> float:
        """
        Calculate win probability analytically.

        The remaining score swing is modelled as normal with standard
        deviation ``tempo * sqrt(2 * time_remaining)``, and the result is
        the normal CDF of ``score_diff`` over that deviation.

        Args:
            score_diff: Current lead (positive = winning)
            time_remaining: Time remaining in game
            total_time: Total game time (accepted for interface
                compatibility; the estimate depends only on
                ``time_remaining``)
            tempo: Scoring rate (points per time unit); defaults to a
                per-sport value, or 1 for an unknown sport

        Returns:
            Probability in [0, 1]. With no time left, or zero scoring
            volatility, the result is 1.0 / 0.5 / 0.0 for a lead / tie /
            deficit.
        """
        if time_remaining <= 0:
            return self._decided_wp(score_diff)

        if tempo is None:
            tempo = self._DEFAULT_TEMPO.get(self.sport, 1)

        # Standard deviation of the remaining score swing: grows with the
        # square root of time, doubled because both teams score.
        expected_std = tempo * np.sqrt(time_remaining * 2)
        if expected_std > 0:
            return stats.norm.cdf(score_diff / expected_std)

        # No volatility: the current score stands, so a trailing team
        # gets 0.0 rather than 0.5.
        return self._decided_wp(score_diff)

    def train_model(
        self,
        game_data: pd.DataFrame,
        feature_cols: List[str],
        target_col: str = "home_win"
    ) -> "WinProbability":
        """
        Train ML model for win probability.

        Args:
            game_data: DataFrame with game state features and outcomes
            feature_cols: Columns to use as features
            target_col: Binary win indicator column

        Returns:
            This instance, with ``model`` and ``feature_cols`` set.
        """
        self.feature_cols = feature_cols
        self.model = GradientBoostingClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42
        )
        self.model.fit(game_data[feature_cols], game_data[target_col])
        return self

    def predict_wp(self, game_state: Dict) -> float:
        """
        Predict win probability for a game state.

        Uses the trained model when one exists (features missing from
        ``game_state`` default to 0); otherwise falls back to
        ``analytical_wp`` with the ``score_diff``, ``time_remaining`` and
        ``total_time`` entries (defaults 0, 0 and 48).

        Args:
            game_state: Dict with feature values
        """
        if self.model is None:
            return self.analytical_wp(
                game_state.get("score_diff", 0),
                game_state.get("time_remaining", 0),
                game_state.get("total_time", 48)
            )
        features = pd.DataFrame(
            [{col: game_state.get(col, 0) for col in self.feature_cols}]
        )
        return self.model.predict_proba(features)[0, 1]

    def calculate_wpa(
        self,
        game_log: pd.DataFrame,
        time_col: str = "time_remaining",
        score_col: str = "score_diff"
    ) -> pd.DataFrame:
        """
        Calculate Win Probability Added for each play.

        WPA = WP(after) - WP(before), where WP(before) is the analytical
        win probability of the row's state and WP(after) is the next
        row's WP(before). The last row's WP(after) is the decided result
        of its score (1.0 lead, 0.5 tie, 0.0 deficit).

        Returns:
            Copy of ``game_log`` with wp_before, wp_after and wpa columns
            (empty float columns when the log has no rows).
        """
        game_log = game_log.copy()

        if len(game_log) == 0:
            for column in ("wp_before", "wp_after", "wpa"):
                game_log[column] = np.array([], dtype=float)
            return game_log

        # The largest time value in the log stands in for total game
        # time; computed once rather than per row.
        total_time = game_log[time_col].max()
        game_log["wp_before"] = np.array(
            [
                self.analytical_wp(diff, remaining, total_time)
                for diff, remaining in zip(game_log[score_col], game_log[time_col])
            ],
            dtype=float,
        )

        # WP after is WP before of next play
        game_log["wp_after"] = game_log["wp_before"].shift(-1)
        # Positional write so duplicate index labels cannot touch other rows.
        game_log.iloc[-1, game_log.columns.get_loc("wp_after")] = self._decided_wp(
            game_log[score_col].iloc[-1]
        )

        game_log["wpa"] = game_log["wp_after"] - game_log["wp_before"]
        return game_log

    def leverage_index(
        self,
        score_diff: float,
        time_remaining: float,
        total_time: float
    ) -> float:
        """
        Calculate Leverage Index (importance of situation).

        LI is the win-probability gap between scoring one more and one
        fewer point from the current state, divided by 0.04 (taken as the
        approximate average swing), floored at 0.
        Higher LI = more important moment.
        """
        wp_plus = self.analytical_wp(score_diff + 1, time_remaining, total_time)
        wp_minus = self.analytical_wp(score_diff - 1, time_remaining, total_time)
        # Normalized so average is ~1.0
        li = (wp_plus - wp_minus) / 0.04  # 0.04 is approximately average WP swing
        return max(li, 0)
def plot_win_probability(wp_series: pd.Series):
    """
    Draw a win probability chart on a new 12x6 pyplot figure.

    The series is plotted against play number, with the area between the
    curve and the 0.5 line shaded green where the value is at least 0.5
    and red where it is below.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can show or save
        the figure.
    """
    import matplotlib.pyplot as plt

    probabilities = wp_series.values
    play_numbers = range(len(wp_series))

    plt.figure(figsize=(12, 6))
    plt.plot(probabilities, linewidth=2)
    plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

    # Shade the favoured side first (green), then the trailing side (red).
    shaded_regions = (
        (probabilities >= 0.5, 'green'),
        (probabilities < 0.5, 'red'),
    )
    for region_mask, region_color in shaded_regions:
        plt.fill_between(play_numbers, probabilities, 0.5,
                         alpha=0.3, where=region_mask, color=region_color)

    plt.xlabel("Play Number")
    plt.ylabel("Home Team Win Probability")
    plt.title("Win Probability Chart")
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    return plt
# Example usage
if __name__ == "__main__":
    # Basketball example
    wp_model = WinProbability(sport="basketball")

    # Sample game situations: (score_diff, time_remaining, description)
    situations = [
        {"score_diff": diff, "time_remaining": minutes, "description": text}
        for diff, minutes, text in (
            (0, 48, "Start of game"),
            (10, 24, "Up 10 at half"),
            (5, 12, "Up 5, 4th quarter"),
            (-3, 2, "Down 3, 2 min left"),
            (2, 0.1, "Up 2, 6 seconds left"),
        )
    ]

    print("Basketball Win Probabilities:")
    for sit in situations:
        lead, minutes_left = sit["score_diff"], sit["time_remaining"]
        wp = wp_model.analytical_wp(lead, minutes_left, 48)
        li = wp_model.leverage_index(lead, minutes_left, 48)
        print(f" {sit['description']}: WP={wp:.1%}, LI={li:.2f}")

    # Simulate a game for WP chart: each step draws the gap until the
    # next score, then the score change (draw order matters for the seed).
    np.random.seed(42)
    plays = []
    score_diff = 0
    time_left = 48
    while time_left > 0:
        time_elapsed = np.random.exponential(0.5)  # Minutes between scores
        time_left = max(0, time_left - time_elapsed)
        score_diff += np.random.choice([-3, -2, 2, 3], p=[0.2, 0.3, 0.3, 0.2])
        plays.append({
            "time_remaining": time_left,
            "score_diff": score_diff
        })

    game_df = wp_model.calculate_wpa(pd.DataFrame(plays))

    print(f"\nSimulated game: Final score diff = {score_diff}")
    print("Highest WPA plays:")
    wpa_columns = ["time_remaining", "score_diff", "wp_before", "wp_after", "wpa"]
    print(game_df.nlargest(5, "wpa")[wpa_columns])
Bayesian Player True Talent Estimation
Estimate player true talent levels using Bayesian methods with population priors.
"""Bayesian True Talent Estimation for player statistics."""
import numpy as np
import pandas as pd
from scipy import stats
from typing import Tuple, Dict, Optional
class BayesianTrueTalent:
    """
    Estimate true talent using Bayesian methods.

    Combines observed performance with a league-wide prior to get better
    estimates, especially for small samples. Posteriors use a normal
    approximation: the estimate is the precision-weighted average of the
    prior mean and the observed value.
    """

    def __init__(self, prior_mean: Optional[float] = None, prior_var: Optional[float] = None):
        """
        Initialize with prior parameters.

        If not provided, priors must be estimated from data with fit_prior()
        before estimate() can be called.

        Args:
            prior_mean: Mean of the population (prior) distribution
            prior_var: Variance of the population (prior) distribution
        """
        self.prior_mean = prior_mean
        self.prior_var = prior_var
        # Not read or written by any other method of this class.
        self.binomial_n = None

    def fit_prior(self, observed: pd.Series, n: Optional[pd.Series] = None) -> "BayesianTrueTalent":
        """
        Estimate prior parameters from population data.

        With sample sizes, the prior mean/variance come from a
        method-of-moments fit on the implied success counts. Without them,
        the prior is the sample mean and a shrunken sample variance.

        Args:
            observed: Observed values (rates or means)
            n: Sample sizes (for rates)

        Returns:
            self, so calls can be chained
        """
        if n is not None:
            # For rates - convert back to success counts and fit by moments
            successes = observed * n
            self._fit_beta_binomial(successes.values, n.values)
        else:
            # For continuous stats - use normal model
            self.prior_mean = observed.mean()
            # Estimate true variance (observed variance minus sampling variance).
            # NOTE(review): the sampling-variance term is the binomial formula
            # with a hard-coded sample size of 500; that looks rate-specific
            # even though this branch is labelled "continuous" - confirm intent.
            sampling_var = observed.mean() * (1 - observed.mean()) / 500
            self.prior_var = max(observed.var() - sampling_var, 0.001)
        return self

    def _fit_beta_binomial(self, successes: np.ndarray, trials: np.ndarray):
        """
        Fit prior mean and variance using method of moments.

        Sets prior_mean to the trial-weighted mean rate and prior_var to the
        trial-weighted variance of rates minus the average binomial sampling
        variance, floored at 0.0001.
        """
        rates = successes / trials
        # Weighted mean and variance
        weights = trials / trials.sum()
        mean_rate = np.average(rates, weights=weights)
        var_rate = np.average((rates - mean_rate) ** 2, weights=weights)
        # Estimate within-player variance
        expected_binomial_var = np.mean(rates * (1 - rates) / trials)
        # Between-player variance (true talent variance)
        between_var = max(var_rate - expected_binomial_var, 0.0001)
        self.prior_mean = mean_rate
        self.prior_var = between_var

    def _observation_variance(self, observed: float, n: int) -> float:
        """Return a strictly positive sampling variance for one observation."""
        if 0 <= observed <= 1:  # Treated as a rate
            rate_var = observed * (1 - observed) / n
            if rate_var > 0:
                return rate_var
            # observed is exactly 0 or 1: the plug-in binomial variance is 0,
            # which would make the precision infinite. Use the prior mean as
            # the rate in the variance formula instead.
            if 0 < self.prior_mean < 1:
                return self.prior_mean * (1 - self.prior_mean) / n
        # Continuous stat (approximate)
        return self.prior_var / n

    def estimate(
        self,
        observed: float,
        n: int,
        return_interval: bool = False
    ) -> Dict:
        """
        Estimate true talent for a single observation.

        Args:
            observed: Observed rate or mean
            n: Sample size
            return_interval: Return credible interval?

        Returns:
            Dict with keys "observed", "estimated", "regression_pct", "n",
            plus "ci_lower"/"ci_upper" when return_interval is True

        Raises:
            ValueError: If the prior is missing or non-positive, or n <= 0
        """
        if self.prior_mean is None or self.prior_var is None:
            raise ValueError("Must fit prior first or provide prior parameters")
        if self.prior_var <= 0:
            raise ValueError("prior_var must be positive")
        if n <= 0:
            raise ValueError("Sample size n must be positive")

        # Calculate posterior parameters
        # Using normal approximation for simplicity
        obs_var = self._observation_variance(observed, n)

        # Posterior mean (precision-weighted average)
        total_precision = 1 / self.prior_var + 1 / obs_var
        posterior_mean = (
            (self.prior_mean / self.prior_var + observed / obs_var) /
            total_precision
        )
        # Posterior variance
        posterior_var = 1 / total_precision
        # Share of the estimate that comes from the prior
        regression_pct = obs_var / (self.prior_var + obs_var)

        result = {
            "observed": observed,
            "estimated": posterior_mean,
            "regression_pct": regression_pct,
            "n": n
        }
        if return_interval:
            # 95% credible interval
            posterior_std = np.sqrt(posterior_var)
            result["ci_lower"] = posterior_mean - 1.96 * posterior_std
            result["ci_upper"] = posterior_mean + 1.96 * posterior_std
        return result

    def estimate_population(
        self,
        df: pd.DataFrame,
        obs_col: str,
        n_col: str,
        name_col: str = None
    ) -> pd.DataFrame:
        """
        Estimate true talent for all players.

        Args:
            df: DataFrame with player data
            obs_col: Column with observed rates/means
            n_col: Column with sample sizes
            name_col: Column with player names

        Returns:
            One row per input row with the estimate() fields and interval
            (plus "player" when name_col is given), sorted by estimate,
            highest first. An empty input yields an empty frame with the
            same columns.
        """
        results = []
        for _, row in df.iterrows():
            est = self.estimate(row[obs_col], row[n_col], return_interval=True)
            if name_col:
                est["player"] = row[name_col]
            results.append(est)

        if not results:
            # Nothing to sort; keep the schema so callers can still select columns
            columns = ["observed", "estimated", "regression_pct", "n", "ci_lower", "ci_upper"]
            if name_col:
                columns.append("player")
            return pd.DataFrame(columns=columns)

        result_df = pd.DataFrame(results)
        # Sort by estimate
        return result_df.sort_values("estimated", ascending=False)
def regress_to_mean(
    observed: float,
    n: int,
    league_mean: float,
    regression_n: int = 1200
) -> float:
    """
    Shrink an observed rate toward the league average.

    The observation is trusted in proportion to its sample size: its share
    is n / (n + regression_n) and the league mean receives the remainder.

    Args:
        observed: Observed rate
        n: Sample size (e.g., PA for batting average)
        league_mean: League average
        regression_n: Sample size at which observation and league mean are
            weighted equally (~1200 PA for batting average)

    Returns:
        The blended estimate.
    """
    trust_in_sample = n / (n + regression_n)
    trust_in_league = 1 - trust_in_sample
    return trust_in_sample * observed + trust_in_league * league_mean
# Example usage
if __name__ == "__main__":
    # Simulate a league where, unlike real life, true talent is known
    np.random.seed(42)
    n_players = 100

    # Latent skill for every hitter
    true_talent = np.random.beta(80, 240, n_players)  # ~.250 average

    # What we would actually observe over a partial season
    pa = np.random.randint(100, 600, n_players)
    hits = np.array([np.random.binomial(p, t) for p, t in zip(pa, true_talent)])
    observed_avg = hits / pa

    player_names = [f"Player_{i}" for i in range(n_players)]
    players = pd.DataFrame({
        "Player": player_names,
        "PA": pa,
        "H": hits,
        "AVG": observed_avg,
        "True_Talent": true_talent  # Usually unknown
    })

    # Learn the population prior from the observed averages
    bayes = BayesianTrueTalent()
    bayes.fit_prior(players["AVG"], players["PA"])
    print(f"Prior Mean: {bayes.prior_mean:.3f}")
    print(f"Prior Std: {np.sqrt(bayes.prior_var):.3f}")

    # Shrink every player toward that prior
    estimates = bayes.estimate_population(players, "AVG", "PA", "Player")

    # Attach the simulated truth so the estimates can be scored
    truth = players[["Player", "True_Talent"]]
    estimates = estimates.merge(truth, left_on="player", right_on="Player")

    print("\nTop 10 Estimated Players:")
    report_cols = ["player", "observed", "estimated", "True_Talent", "n"]
    print(estimates[report_cols].head(10).round(3))

    # How much closer to the truth are the shrunken estimates?
    obs_error = np.abs(players["AVG"] - players["True_Talent"]).mean()
    est_error = np.abs(estimates["estimated"] - estimates["True_Talent"]).mean()
    print(f"\nMean Absolute Error:")
    print(f" Observed: {obs_error:.4f}")
    print(f" Estimated: {est_error:.4f}")
    print(f" Improvement: {(obs_error - est_error) / obs_error:.1%}")
Player Aging Curves
Model how player performance changes with age across different sports.
"""Player Aging Curve Analysis."""
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from sklearn.linear_model import Ridge
import warnings
class AgingCurve:
    """
    Model player performance aging curves.

    Methods:
    - Delta method (year-over-year changes)
    - Parametric curve fitting (quadratic or asymmetric)
    """

    def __init__(self, peak_age: int = None, sport: str = "baseball"):
        """
        Initialize aging curve model.

        Args:
            peak_age: Expected peak age (None = use the sport default)
            sport: Sport for default parameters
        """
        # Default peak ages by sport
        default_peaks = {
            "baseball": 27,
            "basketball": 27,
            "football": 27,
            "soccer": 26,
            "hockey": 25
        }
        self.peak_age = peak_age or default_peaks.get(sport, 27)
        self.sport = sport
        # Both are populated by fit_parametric()
        self.curve_params = None
        self.curve_func = None

    @staticmethod
    def quadratic_aging(age: np.ndarray, a: float, b: float, c: float) -> np.ndarray:
        """Quadratic aging curve: performance = a*age^2 + b*age + c"""
        return a * age**2 + b * age + c

    @staticmethod
    def asymmetric_aging(
        age: np.ndarray,
        peak: float,
        peak_age: float,
        growth_rate: float,
        decline_rate: float
    ) -> np.ndarray:
        """
        Asymmetric aging curve with different growth/decline rates.

        Up to and including peak_age the curve is
        peak * (1 - exp(-growth_rate * (age - 18))); after it the curve is
        peak * exp(-decline_rate * (age - peak_age)).

        Args:
            age: Age(s) to evaluate; scalars, lists and arrays are accepted
            peak: Scale of the curve
            peak_age: Age at which the curve switches from growth to decline
            growth_rate: Exponential rate for the growth phase
            decline_rate: Exponential rate for the decline phase

        Returns:
            Float array with the same shape as age.
        """
        age = np.asarray(age, dtype=float)
        # Growth phase
        growth = peak * (1 - np.exp(-growth_rate * (age - 18)))
        # Decline phase
        decline = peak * np.exp(-decline_rate * (age - peak_age))
        return np.where(age <= peak_age, growth, decline)

    def delta_method(
        self,
        df: pd.DataFrame,
        player_col: str = "Player",
        age_col: str = "Age",
        stat_col: str = "WAR",
        min_pa: int = 200,
        pa_col: str = "PA"
    ) -> pd.DataFrame:
        """
        Calculate aging curve using delta method.

        Compares same players across consecutive seasons.

        Args:
            df: DataFrame with player seasons
            player_col: Player identifier column
            age_col: Age column
            stat_col: Statistic to model
            min_pa: Minimum playing time threshold
            pa_col: Playing time column

        Returns:
            DataFrame with columns "Age" (the earlier age of each season
            pair), "Delta" (playing-time-weighted mean change) and
            "Cumulative" (running sum of Delta, shifted to 0 at the row for
            self.peak_age when that age is present). Empty, with the same
            columns, when no player has two consecutive qualified seasons.
        """
        # Filter to qualified seasons
        qualified = df[df[pa_col] >= min_pa].copy()
        qualified = qualified.sort_values([player_col, age_col])

        # Calculate year-over-year changes
        deltas = []
        for player, group in qualified.groupby(player_col):
            seasons = [season for _, season in group.iterrows()]
            for curr, next_yr in zip(seasons, seasons[1:]):
                # Only consecutive ages
                if next_yr[age_col] - curr[age_col] != 1:
                    continue
                deltas.append({
                    "player": player,
                    "age_from": curr[age_col],
                    "age_to": next_yr[age_col],
                    "mid_age": (curr[age_col] + next_yr[age_col]) / 2,
                    "delta": next_yr[stat_col] - curr[stat_col],
                    # The smaller of the two seasons limits how much to trust the pair
                    "weight": min(curr[pa_col], next_yr[pa_col])
                })

        if not deltas:
            # Without any pairs there is no "age_from" column to group on
            return pd.DataFrame(columns=["Age", "Delta", "Cumulative"])

        delta_df = pd.DataFrame(deltas)
        # Weight-adjusted average delta by age
        aging = pd.DataFrame(
            [
                {"Age": age, "Delta": np.average(grp["delta"], weights=grp["weight"])}
                for age, grp in delta_df.groupby("age_from")
            ],
            columns=["Age", "Delta"],
        )
        # Cumulative aging curve (relative to peak age)
        aging = aging.sort_values("Age")
        aging["Cumulative"] = aging["Delta"].cumsum()
        # Normalize to 0 at peak age
        peak_value = aging.loc[aging["Age"] == self.peak_age, "Cumulative"]
        if len(peak_value) > 0:
            aging["Cumulative"] = aging["Cumulative"] - peak_value.values[0]
        return aging

    def fit_parametric(
        self,
        ages: np.ndarray,
        values: np.ndarray,
        weights: np.ndarray = None,
        model: str = "quadratic"
    ) -> dict:
        """
        Fit parametric aging curve.

        On success the fitted parameters and function are stored on the
        instance and self.peak_age is replaced by the age (searched on a
        100-point grid from 20 to 40) with the highest predicted value.

        Args:
            ages: Array of ages
            values: Array of performance values
            weights: Optional weights; passed to the fit as sigma = 1/sqrt(weights)
            model: "quadratic"; any other value selects the asymmetric curve

        Returns:
            Dict with "params", "peak_age" and "peak_value", or None when the
            fit raises (the error is printed).
        """
        if model == "quadratic":
            func = self.quadratic_aging
            p0 = [-0.1, 5, -50]  # Initial guess
            bounds = ([-1, 0, -200], [0, 20, 0])
        else:
            func = self.asymmetric_aging
            p0 = [10, 27, 0.5, 0.1]  # peak, peak_age, growth, decline
            bounds = ([0, 22, 0, 0], [50, 32, 2, 1])

        fit_kwargs = {"p0": p0, "bounds": bounds, "maxfev": 5000}
        if weights is not None:
            fit_kwargs["sigma"] = 1 / np.sqrt(weights)

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                popt, pcov = curve_fit(func, ages, values, **fit_kwargs)
            self.curve_params = popt
            self.curve_func = func
            # Find peak age
            test_ages = np.linspace(20, 40, 100)
            pred_values = func(test_ages, *popt)
            self.peak_age = test_ages[np.argmax(pred_values)]
            return {
                "params": popt,
                "peak_age": self.peak_age,
                "peak_value": np.max(pred_values)
            }
        except Exception as e:
            # Best-effort API: report the failure and let the caller check for None
            print(f"Fitting failed: {e}")
            return None

    def predict(self, ages: np.ndarray) -> np.ndarray:
        """
        Predict performance at given ages.

        Raises:
            ValueError: If fit_parametric() has not succeeded yet
        """
        if self.curve_params is None:
            raise ValueError("Must fit curve first")
        return self.curve_func(ages, *self.curve_params)

    def age_adjust(
        self,
        stat: float,
        current_age: int,
        target_age: int = None
    ) -> float:
        """
        Age-adjust a statistic.

        Adds the fitted curve's difference between the target age and the
        current age to the statistic.

        Args:
            stat: Current statistic value
            current_age: Player's current age
            target_age: Age to adjust to (default: peak age)

        Raises:
            ValueError: If no curve has been fitted
        """
        if target_age is None:
            target_age = self.peak_age
        current_adj = self.predict(np.array([current_age]))[0]
        target_adj = self.predict(np.array([target_age]))[0]
        # Adjust stat to target age level
        adjustment = target_adj - current_adj
        return stat + adjustment
# Example usage
if __name__ == "__main__":
    # Build a synthetic league of 100 careers
    np.random.seed(42)
    players = []
    for player_id in range(100):
        # Per-player career shape
        peak = np.random.normal(3, 1.5)  # WAR peak
        peak_age = np.random.normal(27, 2)
        career_start = np.random.randint(22, 26)
        career_end = np.random.randint(33, 40)

        name = f"Player_{player_id}"
        for age in range(career_start, career_end + 1):
            # Quadratic aging around the personal peak, plus season noise
            war = peak - 0.05 * (age - peak_age)**2 + np.random.normal(0, 0.5)
            pa = np.random.randint(300, 650)
            season = {"Player": name, "Age": age, "WAR": max(war, -1), "PA": pa}
            players.append(season)

    df = pd.DataFrame(players)
    print(f"Generated {len(df)} player-seasons")

    # Year-over-year (delta method) curve
    aging = AgingCurve(sport="baseball")
    curve = aging.delta_method(df, stat_col="WAR")
    print("\nAging Curve (Delta Method):")
    print(curve)

    # Parametric fit on the mean WAR at each age
    ages = df.groupby("Age")["WAR"].mean()
    result = aging.fit_parametric(ages.index.values, ages.values, model="quadratic")
    print(f"\nParametric fit peak age: {result['peak_age']:.1f}")

    # Translate one player-season to its age-27 equivalent
    sample_war = 5.0
    sample_age = 32
    adjusted = aging.age_adjust(sample_war, sample_age, 27)
    print(f"\nAge {sample_age} WAR {sample_war:.1f} -> Age 27 equivalent: {adjusted:.1f}")
EPA (Expected Points Added) Calculator
Calculate Expected Points Added for football plays using play-by-play data.
"""EPA (Expected Points Added) calculator for football."""
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from typing import Dict, List, Tuple
class EPACalculator:
    """
    Expected Points Added for football plays.

    EPA = EP(end state) - EP(start state), where EP comes from a gradient
    boosting model over field position, down, distance and time.
    """

    def __init__(self):
        self.is_fitted = False
        self.ep_model = None

    def calculate_ep_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the model's feature frame.

        Expected columns: yardline_100, down, ydstogo, half_seconds_remaining
        """
        yardline = df["yardline_100"]
        down = df["down"]
        # Distance is capped to [1, 30]; the clock to [1, 1800] before the log
        distance = df["ydstogo"].clip(1, 30)
        clock = df["half_seconds_remaining"].clip(1, 1800)

        return pd.DataFrame({
            "yardline": yardline,
            "down": down,
            "ydstogo": distance,
            # Uses the uncapped distance
            "goal_to_go": (df["ydstogo"] >= yardline).astype(int),
            "log_time": np.log(clock + 1),
            # Interactions
            "down_yardline": down * yardline,
            "down_ydstogo": down * distance,
            # Field position zones
            "red_zone": (yardline <= 20).astype(int),
            "fg_range": ((yardline <= 35) & (yardline > 20)).astype(int),
            "own_territory": (yardline > 50).astype(int),
        })

    def train_ep_model(
        self,
        plays: pd.DataFrame,
        ep_col: str = "next_score_ep"
    ) -> "EPACalculator":
        """
        Fit the EP model on historical plays.

        Args:
            plays: DataFrame with play-by-play data
            ep_col: Column with actual next score value

        Returns:
            self
        """
        # Keep downs 1-4, yardlines 1-99, and rows that have a target
        usable = (
            plays["down"].between(1, 4)
            & plays["yardline_100"].between(1, 99)
            & plays[ep_col].notna()
        )
        valid = plays[usable].copy()

        regressor = GradientBoostingRegressor(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42
        )
        regressor.fit(self.calculate_ep_features(valid), valid[ep_col])

        self.ep_model = regressor
        self.is_fitted = True
        return self

    def predict_ep(self, plays: pd.DataFrame) -> np.ndarray:
        """Return the model's expected points for each row's game state."""
        if not self.is_fitted:
            raise ValueError("Model must be trained first")
        return self.ep_model.predict(self.calculate_ep_features(plays))

    def calculate_epa(self, plays: pd.DataFrame) -> pd.Series:
        """
        Calculate EPA for each play.

        The end state is derived from the same row as the start state: the
        yardline is mirrored and the EP negated where posteam_change == 1,
        and fixed values replace the prediction on touchdowns (7), made field
        goals (3) and safeties (-2). Each of those columns is optional.
        """
        frame = plays.copy()
        frame["ep_before"] = self.predict_ep(frame)

        after_state = frame.copy()
        if "posteam_change" in frame.columns:
            # Possession flipped: view the field from the other side ...
            flipped = frame["posteam_change"] == 1
            after_state.loc[flipped, "yardline_100"] = 100 - after_state.loc[flipped, "yardline_100"]
            ep_after = self.predict_ep(after_state)
            # ... and count the opponent's EP against the original offense
            ep_after[flipped] = -ep_after[flipped]
        else:
            ep_after = self.predict_ep(after_state)

        # Scoring plays override the prediction, applied in this order
        scoring_rules = (
            ("touchdown", 1, 7),
            ("field_goal_result", "made", 3),
            ("safety", 1, -2),
        )
        for column, marker, points in scoring_rules:
            if column in frame.columns:
                ep_after = np.where(frame[column] == marker, points, ep_after)

        return ep_after - frame["ep_before"]

    @staticmethod
    def get_base_ep_values() -> Dict[Tuple, float]:
        """
        Return a small demonstration table of base EP values.

        Keys are (down, ydstogo_bucket, yardline_bucket); values are
        approximate expected points.
        """
        base_values = {}
        base_values[(1, 10, 80)] = -0.5  # 1st & 10, own 20
        base_values[(1, 10, 50)] = 1.0   # 1st & 10, midfield
        base_values[(1, 10, 20)] = 4.0   # 1st & 10, opponent 20
        base_values[(1, 10, 5)] = 5.5    # 1st & goal from 5
        # ... would have many more entries
        return base_values
def aggregate_epa(plays: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """
    Aggregate EPA by a grouping column (player, team, etc.)

    Args:
        plays: Play-level data with an "epa" column and, optionally, a
            "success" column
        group_col: Column to group by

    Returns:
        DataFrame indexed by group with "total_epa", "epa_per_play" and
        "plays" (rounded to 3 decimals), plus "success_rate" when the input
        has a "success" column, sorted by total EPA, highest first.
    """
    spec = {"epa": ["sum", "mean", "count"]}
    names = ["total_epa", "epa_per_play", "plays"]
    # The success rate is only reported when the caller supplied the column
    if "success" in plays.columns:
        spec["success"] = "mean"
        names.append("success_rate")

    agg = plays.groupby(group_col).agg(spec).round(3)
    agg.columns = names
    return agg.sort_values("total_epa", ascending=False)
# Example usage
if __name__ == "__main__":
    # Random game states to stand in for play-by-play data
    np.random.seed(42)
    n_plays = 1000
    sample_columns = {
        "yardline_100": np.random.randint(1, 100, n_plays),
        "down": np.random.choice([1, 2, 3, 4], n_plays, p=[0.4, 0.3, 0.2, 0.1]),
        "ydstogo": np.random.randint(1, 20, n_plays),
        "half_seconds_remaining": np.random.randint(1, 1800, n_plays),
        "yards_gained": np.random.normal(5, 8, n_plays),
        "passer_player_name": np.random.choice(["QB1", "QB2", "QB3"], n_plays),
        "rusher_player_name": np.random.choice(["RB1", "RB2", None], n_plays)
    }
    plays = pd.DataFrame(sample_columns)

    # Simplified target: more points the closer the offense is to the end zone
    field_share = plays["yardline_100"] / 100
    plays["next_score_ep"] = (
        7 * (1 - field_share) -
        2 * field_share +
        np.random.normal(0, 1, n_plays)
    )

    # Fit the EP model, then score every play
    epa_calc = EPACalculator()
    epa_calc.train_ep_model(plays, "next_score_ep")
    plays["epa"] = epa_calc.calculate_epa(plays)
    plays["success"] = (plays["epa"] > 0).astype(int)

    print("EPA Statistics:")
    print(f"Mean EPA: {plays['epa'].mean():.3f}")
    print(f"Success Rate: {plays['success'].mean():.1%}")

    # Roll up by quarterback
    print("\nQB EPA Rankings:")
    with_passer = plays.dropna(subset=["passer_player_name"])
    qb_epa = aggregate_epa(with_passer, "passer_player_name")
    print(qb_epa)
RAPM (Regularized Adjusted Plus-Minus)
Calculate basketball player impact using regularized adjusted plus-minus regression.
"""RAPM (Regularized Adjusted Plus-Minus) for basketball."""
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from scipy import sparse
from typing import Dict, List, Tuple
class RAPM:
    """
    Calculate Regularized Adjusted Plus-Minus.

    RAPM estimates player impact by regressing point differential
    on player participation while controlling for teammates and opponents.
    """

    def __init__(self, lambda_values: List[float] = None):
        """
        Initialize RAPM model.

        Args:
            lambda_values: Ridge regularization values to try
        """
        if lambda_values is None:
            lambda_values = [0.01, 0.1, 1, 10, 100, 1000]
        self.model = RidgeCV(alphas=lambda_values, cv=5)
        self.player_ids = None
        self.player_names = None
        self.rapm_values = None

    def prepare_data(
        self,
        stints: pd.DataFrame,
        home_players: List[str],
        away_players: List[str],
        margin_col: str = "margin",
        possessions_col: str = "possessions"
    ) -> Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]:
        """
        Prepare stint data for RAPM calculation.

        Also stores the sorted list of player IDs on self.player_ids; column
        j of X corresponds to self.player_ids[j].

        Args:
            stints: DataFrame where each row is a stint (period with same 10 players)
            home_players: Column names for 5 home player IDs
            away_players: Column names for 5 away player IDs
            margin_col: Column with point margin (home - away)
            possessions_col: Column with possession count

        Returns:
            X: Sparse matrix of player participation (+1 home, -1 away)
            y: Point margin per 100 possessions
            weights: Square root of each stint's possessions

        Raises:
            ValueError: If any stint has a non-positive possession count
        """
        possessions = stints[possessions_col]
        if (possessions <= 0).any():
            # Dividing by these would put inf/NaN into the regression target
            raise ValueError("All stints must have a positive possession count")

        # Get unique players
        all_players = set()
        for col in home_players + away_players:
            all_players.update(stints[col].dropna().unique())
        self.player_ids = sorted(all_players)
        player_to_idx = {p: i for i, p in enumerate(self.player_ids)}
        n_players = len(self.player_ids)

        # Build sparse matrix. Row numbers are POSITIONS in the frame, not
        # index labels, so filtered or re-indexed stint frames work too.
        n_stints = len(stints)
        rows, cols, data = [], [], []
        for sign, columns in ((1, home_players), (-1, away_players)):
            for col in columns:
                for position, player in enumerate(stints[col].to_numpy()):
                    if pd.notna(player):
                        rows.append(position)
                        cols.append(player_to_idx[player])
                        data.append(sign)
        X = sparse.csr_matrix((data, (rows, cols)), shape=(n_stints, n_players))

        # Target: margin per 100 possessions
        y = (stints[margin_col] / possessions) * 100
        # Weights: sqrt of possessions
        weights = np.sqrt(possessions.values)
        return X, y.values, weights

    def fit(
        self,
        stints: pd.DataFrame,
        home_players: List[str] = None,
        away_players: List[str] = None,
        player_names: Dict[str, str] = None
    ) -> "RAPM":
        """
        Fit RAPM model.

        Args:
            stints: Stint data
            home_players: Column names for home players (default: H1-H5)
            away_players: Column names for away players (default: A1-A5)
            player_names: Dict mapping player IDs to names

        Returns:
            self, with rapm_values set to the coefficients sorted high to low
        """
        if home_players is None:
            home_players = ["H1", "H2", "H3", "H4", "H5"]
        if away_players is None:
            away_players = ["A1", "A2", "A3", "A4", "A5"]

        X, y, weights = self.prepare_data(stints, home_players, away_players)
        # Fit weighted ridge regression
        self.model.fit(X, y, sample_weight=weights)
        # Extract RAPM values
        self.rapm_values = pd.Series(
            self.model.coef_,
            index=self.player_ids,
            name="RAPM"
        ).sort_values(ascending=False)
        # Add names if provided
        if player_names:
            self.player_names = player_names
        return self

    def get_rankings(self, top_n: int = None) -> pd.DataFrame:
        """
        Get RAPM rankings.

        Args:
            top_n: Keep only this many top rows (None or 0 keeps all)

        Returns:
            DataFrame with "Rank", "Player" and "RAPM" columns. "Player" is
            the mapped name when names were supplied, otherwise the ID.

        Raises:
            ValueError: If the model has not been fitted
        """
        if self.rapm_values is None:
            raise ValueError("Model not fitted")
        df = self.rapm_values.reset_index()
        df.columns = ["player_id", "RAPM"]
        if self.player_names:
            df["Player"] = df["player_id"].map(self.player_names)
        else:
            df["Player"] = df["player_id"]
        df["Rank"] = range(1, len(df) + 1)
        if top_n:
            df = df.head(top_n)
        return df[["Rank", "Player", "RAPM"]]

    def get_player_rapm(self, player_id: str) -> float:
        """
        Get RAPM for a specific player (NaN when the player is unknown).

        Raises:
            ValueError: If the model has not been fitted
        """
        if self.rapm_values is None:
            raise ValueError("Model not fitted")
        return self.rapm_values.get(player_id, np.nan)
def create_sample_stints(n_stints: int = 5000, n_players: int = 100) -> pd.DataFrame:
    """
    Build a reproducible synthetic stint table for demonstrations.

    Seeds NumPy's global RNG with 42, draws one effect per player, then
    simulates n_stints stints of ten distinct players (five home, five away).

    Returns:
        DataFrame with columns H1-H5, A1-A5, "margin" and "possessions".
    """
    np.random.seed(42)

    # Latent per-player impact
    player_effects = {}
    for i in range(n_players):
        player_effects[f"P{i}"] = np.random.normal(0, 3)
    roster = list(player_effects.keys())

    records = []
    for _ in range(n_stints):
        # Ten distinct players: the first five are home, the rest away
        lineup = np.random.choice(roster, 10, replace=False)
        home, away = lineup[:5], lineup[5:]

        home_effect = sum(player_effects[p] for p in home)
        away_effect = sum(player_effects[p] for p in away)
        # Home advantage
        expected_margin = home_effect - away_effect + np.random.normal(2, 0)

        possessions = np.random.randint(5, 30)
        actual_margin = expected_margin * possessions / 100 + np.random.normal(0, 3)

        record = {f"H{slot}": player for slot, player in enumerate(home, start=1)}
        record.update({f"A{slot}": player for slot, player in enumerate(away, start=1)})
        record["margin"] = actual_margin
        record["possessions"] = possessions
        records.append(record)

    return pd.DataFrame(records)
# Example usage
if __name__ == "__main__":
    # Synthetic stints: 10,000 of them over a 50-player pool
    stints = create_sample_stints(10000, 50)
    print(f"Generated {len(stints)} stints with {50} players")

    # Fit the ridge model with default lineup columns
    rapm = RAPM()
    rapm.fit(stints)

    # Show the leaders and the regularization strength that was selected
    print("\nTop 15 Players by RAPM:")
    top_fifteen = rapm.get_rankings(15)
    print(top_fifteen)
    print(f"\nBest lambda: {rapm.model.alpha_:.2f}")
Player Similarity Analysis
Find similar players using statistical profiles and dimensionality reduction.
"""Player similarity analysis using statistical methods."""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.cluster import KMeans
from typing import List, Tuple, Dict
class PlayerSimilarity:
    """
    Find similar players based on statistical profiles.

    Methods:
    - Cosine similarity
    - Euclidean distance
    - PCA-based similarity
    """

    def __init__(self, stat_columns: List[str] = None):
        self.stat_columns = stat_columns
        self.scaler = StandardScaler()
        self.pca = None
        self.player_col = None
        self.player_data = None
        self.scaled_data = None

    def fit(self, df: pd.DataFrame, player_col: str = "Player") -> "PlayerSimilarity":
        """
        Fit the similarity model.

        Args:
            df: DataFrame with player statistics
            player_col: Column containing player names

        Returns:
            self
        """
        self.player_col = player_col
        self.player_data = df.copy()
        # Auto-detect numeric columns if not specified
        if self.stat_columns is None:
            self.stat_columns = df.select_dtypes(include=[np.number]).columns.tolist()
            # Remove common non-stat columns
            exclude = ["Age", "G", "GS", "Year", "Season"]
            self.stat_columns = [c for c in self.stat_columns if c not in exclude]
        # Scale the data (missing stats are treated as 0 before scaling)
        stats = df[self.stat_columns].fillna(0)
        self.scaled_data = self.scaler.fit_transform(stats)
        # A PCA fitted on earlier data no longer matches the new scaled matrix
        self.pca = None
        return self

    def _find_player_position(self, player_name: str) -> int:
        """
        Return the row POSITION of the first player matching player_name.

        A case-insensitive exact match wins; otherwise the first
        case-insensitive literal substring match is used.

        Raises:
            ValueError: If the model is not fitted or no player matches
        """
        if self.player_data is None:
            raise ValueError("Model not fitted")
        names = self.player_data[self.player_col].fillna("").astype(str).str.lower()
        needle = player_name.lower()
        # Exact first, so "Player_1" is never resolved to "Player_10"
        exact = np.flatnonzero((names == needle).to_numpy())
        if exact.size:
            return int(exact[0])
        # regex=False: names containing ".", "(" etc. are matched literally
        partial = np.flatnonzero(names.str.contains(needle, regex=False).to_numpy())
        if partial.size:
            return int(partial[0])
        raise ValueError(f"Player not found: {player_name}")

    def find_similar(
        self,
        player_name: str,
        n: int = 10,
        method: str = "cosine",
        exclude_self: bool = True
    ) -> pd.DataFrame:
        """
        Find most similar players.

        Args:
            player_name: Name of target player
            n: Number of similar players to return
            method: "cosine", "euclidean", or "pca"
            exclude_self: Whether to exclude the player from results

        Returns:
            Up to n rows with the player column, "Similarity" (higher is more
            similar; euclidean distances are negated) and the first five stat
            columns, most similar first.

        Raises:
            ValueError: If the player is not found, the model is not fitted,
                or method is unknown
        """
        # Positional index, so it lines up with the rows of scaled_data even
        # when the DataFrame index is not 0..n-1
        position = self._find_player_position(player_name)
        if self.scaled_data is None:
            raise ValueError("Model not fitted")
        target_vector = self.scaled_data[position].reshape(1, -1)

        # Calculate similarities
        if method == "cosine":
            similarities = cosine_similarity(target_vector, self.scaled_data)[0]
        elif method == "euclidean":
            similarities = -euclidean_distances(target_vector, self.scaled_data)[0]
        elif method == "pca":
            if self.pca is None:
                # PCA cannot extract more components than samples or features
                n_components = min(10, *self.scaled_data.shape)
                self.pca = PCA(n_components=n_components)
                pca_data = self.pca.fit_transform(self.scaled_data)
            else:
                pca_data = self.pca.transform(self.scaled_data)
            target_pca = pca_data[position].reshape(1, -1)
            similarities = cosine_similarity(target_pca, pca_data)[0]
        else:
            raise ValueError(f"Unknown method: {method}")

        # Create results DataFrame
        results = self.player_data.copy()
        results["Similarity"] = similarities
        if exclude_self:
            results = results.iloc[np.arange(len(results)) != position]
        # Sort and filter
        results = results.sort_values("Similarity", ascending=False)
        return results.head(n)[[self.player_col, "Similarity"] + self.stat_columns[:5]]

    def cluster_players(
        self,
        n_clusters: int = 8,
        method: str = "kmeans"
    ) -> pd.DataFrame:
        """
        Cluster players into groups.

        Args:
            n_clusters: Number of clusters
            method: Clustering algorithm; only "kmeans" is implemented

        Returns:
            Copy of the player data with an added "Cluster" column.

        Raises:
            ValueError: If method is unknown or the model is not fitted
        """
        if method != "kmeans":
            raise ValueError(f"Unknown method: {method}")
        if self.scaled_data is None:
            raise ValueError("Model not fitted")
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(self.scaled_data)
        results = self.player_data.copy()
        results["Cluster"] = clusters
        return results

    def get_player_percentiles(self, player_name: str) -> pd.Series:
        """
        Get percentile rankings for a player.

        Each value is the percentage of players with a strictly lower value
        in that stat column, rounded to one decimal.

        Raises:
            ValueError: If the player is not found or the model is not fitted
        """
        position = self._find_player_position(player_name)
        player_stats = self.player_data[self.stat_columns].iloc[position]
        percentiles = {}
        for col in self.stat_columns:
            pct = (self.player_data[col] < player_stats[col]).mean() * 100
            percentiles[col] = round(pct, 1)
        return pd.Series(percentiles)

    def compare_players(self, players: List[str]) -> pd.DataFrame:
        """
        Compare multiple players side by side with percentiles.

        Players that cannot be found are skipped with a printed warning.
        """
        comparison = []
        for player in players:
            try:
                percentiles = self.get_player_percentiles(player)
            except ValueError:
                print(f"Warning: Player not found: {player}")
                continue
            percentiles.name = player
            comparison.append(percentiles)
        return pd.DataFrame(comparison)
# Example with baseball data
if __name__ == "__main__":
    # Random but plausible batting lines for 200 hitters
    np.random.seed(42)
    n_players = 200
    batting_lines = {
        "Player": [f"Player_{i}" for i in range(n_players)],
        "Team": np.random.choice(["NYY", "BOS", "LAD", "CHC", "HOU"], n_players),
        "PA": np.random.randint(300, 700, n_players),
        "AVG": np.random.normal(0.260, 0.030, n_players).clip(0.180, 0.350),
        "OBP": np.random.normal(0.330, 0.040, n_players).clip(0.250, 0.450),
        "SLG": np.random.normal(0.420, 0.060, n_players).clip(0.300, 0.650),
        "HR": np.random.randint(5, 50, n_players),
        "SB": np.random.randint(0, 40, n_players),
        "BB%": np.random.normal(0.09, 0.03, n_players).clip(0.03, 0.20),
        "K%": np.random.normal(0.22, 0.05, n_players).clip(0.08, 0.35),
        "wRC+": np.random.normal(100, 25, n_players).clip(50, 180)
    }
    players = pd.DataFrame(batting_lines)

    # OPS is on-base plus slugging
    players["OPS"] = players["OBP"] + players["SLG"]

    # Fit the similarity model on an explicit stat profile
    profile = ["AVG", "OBP", "SLG", "HR", "SB", "BB%", "K%", "wRC+"]
    sim = PlayerSimilarity(stat_columns=profile)
    sim.fit(players)

    # Nearest neighbours of one player
    print("Players similar to Player_0:")
    similar = sim.find_similar("Player_0", n=5)
    print(similar)

    # Where that player ranks in each stat
    print("\nPlayer_0 Percentiles:")
    print(sim.get_player_percentiles("Player_0"))

    # Group the league into five archetypes
    clustered = sim.cluster_players(n_clusters=5)
    print("\nCluster distribution:")
    cluster_sizes = clustered["Cluster"].value_counts().sort_index()
    print(cluster_sizes)
Expected Goals (xG) Model
Build a simple expected goals model for soccer using shot location and other features.
"""Expected Goals (xG) model for soccer."""
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss
from typing import Tuple
class ExpectedGoalsModel:
    """
    Simple xG model based on shot characteristics.

    Features built by this class:
    - Distance to goal (plus its square and log)
    - Angle to goal
    - Body part (head/foot), when a body_part column is present
    - Shot type (penalty, free kick), when a shot_type column is present
    """

    def __init__(self):
        self.model = LogisticRegression(max_iter=1000)
        # Column order of the most recent prepare_features() call; fit() uses
        # it to pin the training schema for later predictions
        self.feature_names = None
        self.is_fitted = False

    def calculate_distance(self, x: float, y: float) -> float:
        """
        Calculate distance from shot location to goal center.

        Assumes pitch coordinates: (0-100) x (0-100)
        Goal center at (100, 50). Works element-wise on arrays/Series too.
        """
        goal_x, goal_y = 100, 50
        return np.sqrt((x - goal_x)**2 + (y - goal_y)**2)

    def calculate_angle(self, x: float, y: float) -> float:
        """
        Calculate angle to goal from shot location.

        Returns the angle, in degrees, between the lines from the shot to
        the two posts. Works element-wise on arrays/Series too.
        """
        goal_width = 7.32  # meters, scaled to pitch units
        goal_y_min = 50 - (goal_width / 2) * (100 / 68)  # Scale to pitch
        goal_y_max = 50 + (goal_width / 2) * (100 / 68)
        # Calculate angles to both posts
        angle_1 = np.arctan2(goal_y_min - y, 100 - x)
        angle_2 = np.arctan2(goal_y_max - y, 100 - x)
        return np.degrees(np.abs(angle_2 - angle_1))

    def _build_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the feature frame without touching any model state."""
        features = pd.DataFrame(index=df.index)
        # Distance and angle, computed column-wise rather than row by row
        features["distance"] = self.calculate_distance(df["x"], df["y"])
        features["angle"] = self.calculate_angle(df["x"], df["y"])
        # Derived features
        features["distance_sq"] = features["distance"] ** 2
        features["log_distance"] = np.log(features["distance"] + 1)
        # Body part (one-hot)
        if "body_part" in df.columns:
            features["is_header"] = (df["body_part"] == "head").astype(int)
            features["is_foot"] = (df["body_part"].isin(["left_foot", "right_foot"])).astype(int)
        # Shot type
        if "shot_type" in df.columns:
            features["is_penalty"] = (df["shot_type"] == "penalty").astype(int)
            features["is_free_kick"] = (df["shot_type"] == "free_kick").astype(int)
        return features

    def _features_for_model(self, shots_df: pd.DataFrame) -> pd.DataFrame:
        """Build features in the column order used at fit time.

        Indicator columns the model was trained with but that cannot be
        derived from shots_df are filled with 0; extra columns are dropped.
        """
        features = self._build_features(shots_df)
        if self.feature_names is not None:
            features = features.reindex(columns=self.feature_names, fill_value=0)
        return features

    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare features from shot data and record their names.

        Expected columns: x, y; optional: body_part, shot_type

        Side effect: self.feature_names is set to the returned columns.
        """
        features = self._build_features(df)
        self.feature_names = features.columns.tolist()
        return features

    def fit(self, shots_df: pd.DataFrame, target_col: str = "is_goal") -> "ExpectedGoalsModel":
        """
        Train the xG model.

        Args:
            shots_df: DataFrame with shot data
            target_col: Column name for goal (1) or no goal (0)

        Returns:
            self
        """
        X = self.prepare_features(shots_df)
        y = shots_df[target_col]
        self.model.fit(X, y)
        self.is_fitted = True
        return self

    def predict_xg(self, shots_df: pd.DataFrame) -> pd.Series:
        """
        Predict xG for shots.

        Returns:
            Goal probabilities indexed like shots_df.

        Raises:
            ValueError: If the model has not been fitted
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted first")
        X = self._features_for_model(shots_df)
        return pd.Series(self.model.predict_proba(X)[:, 1], index=shots_df.index)

    def evaluate(self, shots_df: pd.DataFrame, target_col: str = "is_goal") -> dict:
        """
        Evaluate model performance.

        Returns:
            Dict with "auc", "log_loss", "avg_xg" and "actual_rate".

        Raises:
            ValueError: If the model has not been fitted
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted first")
        X = self._features_for_model(shots_df)
        y = shots_df[target_col]
        y_pred = self.model.predict_proba(X)[:, 1]
        return {
            "auc": roc_auc_score(y, y_pred),
            "log_loss": log_loss(y, y_pred),
            "avg_xg": y_pred.mean(),
            "actual_rate": y.mean()
        }
def create_xg_features_statsbomb(events_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create features from StatsBomb event data.

    Reads the "type.name", "location", "shot.outcome.name" and
    "shot.body_part.name" columns of the event frame.

    Returns:
        Copy of the shot rows with added columns:
        - "x", "y": location rescaled from 120x80 to 0-100 (NaN when the
          location is missing or is not a sequence of at least two values)
        - "is_goal": 1 when the shot outcome is "Goal", else 0
        - "body_part": lower-cased body part with "right foot"/"left foot"
          rewritten to "right_foot"/"left_foot"
    """
    shots = events_df[events_df["type.name"] == "Shot"].copy()

    def _coordinate(loc, axis):
        # Accept any sequence type the loader may have produced
        if isinstance(loc, (list, tuple, np.ndarray)) and len(loc) > axis:
            return loc[axis]
        return np.nan

    # Extract coordinates as float arrays so the scaling below never meets None
    raw_x = np.array([_coordinate(loc, 0) for loc in shots["location"]], dtype=float)
    raw_y = np.array([_coordinate(loc, 1) for loc in shots["location"]], dtype=float)
    # Scale to 0-100
    shots["x"] = raw_x * (100 / 120)  # StatsBomb uses 120x80
    shots["y"] = raw_y * (100 / 80)
    # Outcome
    shots["is_goal"] = (shots["shot.outcome.name"] == "Goal").astype(int)
    # Body part
    shots["body_part"] = shots["shot.body_part.name"].str.lower().replace({
        "right foot": "right_foot",
        "left foot": "left_foot"
    })
    return shots
# Example usage
if __name__ == "__main__":
    # Synthetic shots, concentrated around the penalty area
    np.random.seed(42)
    n_shots = 1000
    x = np.clip(np.random.normal(85, 10, n_shots), 60, 99)
    y = np.clip(np.random.normal(50, 15, n_shots), 0, 100)

    # Scoring chance falls off logistically with distance to goal
    dist = np.sqrt((x - 100)**2 + (y - 50)**2)
    base_prob = 1 / (1 + np.exp(0.15 * dist - 2))

    # Attach categorical attributes and sample the outcomes
    body_parts = np.random.choice(["right_foot", "left_foot", "head"], n_shots, p=[0.5, 0.3, 0.2])
    shot_types = np.random.choice(["open_play", "free_kick", "corner", "penalty"], n_shots, p=[0.8, 0.1, 0.08, 0.02])
    goals = (np.random.random(n_shots) < base_prob).astype(int)
    shots_df = pd.DataFrame({
        "x": x,
        "y": y,
        "body_part": body_parts,
        "shot_type": shot_types,
        "is_goal": goals
    })

    # Hold out 20% of the shots
    train, test = train_test_split(shots_df, test_size=0.2, random_state=42)

    # Train on the rest
    xg_model = ExpectedGoalsModel()
    xg_model.fit(train)

    # Score the held-out shots
    metrics = xg_model.evaluate(test)
    print(f"Model Performance:")
    print(f" AUC: {metrics['auc']:.3f}")
    print(f" Log Loss: {metrics['log_loss']:.3f}")

    # Per-shot xG
    test["xG"] = xg_model.predict_xg(test)
    print(f"\nSample predictions:")
    preview_cols = ["x", "y", "is_goal", "xG"]
    print(test[preview_cols].head(10))
Calculate Pythagorean Win Expectation
Estimate team win percentage using Pythagorean expectation formula with various exponents.
"""Calculate Pythagorean Win Expectation for sports teams."""
import pandas as pd
import numpy as np
from typing import Union
def pythagorean_expectation(
    runs_scored: Union[int, pd.Series],
    runs_allowed: Union[int, pd.Series],
    exponent: Union[float, pd.Series, None] = None,
    sport: str = "baseball"
) -> Union[float, pd.Series]:
    """
    Calculate Pythagorean win expectation.

    The formula: Win% = RS^exp / (RS^exp + RA^exp)

    Args:
        runs_scored: Runs/points scored (scalar or Series)
        runs_allowed: Runs/points allowed (scalar or Series)
        exponent: Custom exponent, scalar or per-row Series
            (None uses the sport default)
        sport: Sport for default exponent (case-insensitive); an
            unrecognised sport falls back to 2.0

    Default exponents by sport:
        - Baseball: 1.83 (original), 2.0 (simplified)
        - Basketball: 13.91 (NBA), 10.25 (College)
        - Football: 2.37 (NFL)
        - Hockey: 2.05 (NHL)
        - Soccer: 1.3-1.5

    Returns:
        Expected win percentage. The result is NaN when both inputs are
        zero (0 / 0).
    """
    # Default exponents by sport
    default_exponents = {
        "baseball": 1.83,
        "basketball": 13.91,
        "football": 2.37,
        "hockey": 2.05,
        "soccer": 1.35
    }
    if exponent is None:
        exponent = default_exponents.get(sport.lower(), 2.0)

    # float_power always computes in floating point. np.power would stay in
    # int64 for integer inputs with an integer exponent and silently wrap
    # around for large totals (e.g. 9000 ** 14).
    rs_exp = np.float_power(runs_scored, exponent)
    ra_exp = np.float_power(runs_allowed, exponent)
    return rs_exp / (rs_exp + ra_exp)
def pythagenpat(
    runs_scored: Union[int, pd.Series],
    runs_allowed: Union[int, pd.Series],
    games: int = 162
) -> Union[float, pd.Series]:
    """
    Pythagorean expectation with a run-environment-dependent exponent.

    The exponent is derived from total runs per game:
        exponent = ((RS + RA) / G) ^ 0.287
    and then handed to pythagorean_expectation().
    Intended to be more accurate than a fixed exponent for extreme teams.
    """
    runs_per_game = (runs_scored + runs_allowed) / games
    return pythagorean_expectation(
        runs_scored,
        runs_allowed,
        np.power(runs_per_game, 0.287),
    )
def expected_wins(
    runs_scored: Union[int, pd.Series],
    runs_allowed: Union[int, pd.Series],
    games: int,
    method: str = "pythagenpat"
) -> Union[float, pd.Series]:
    """
    Convert an expected win percentage into expected wins.

    Args:
        runs_scored: Total runs/points scored
        runs_allowed: Total runs/points allowed
        games: Number of games
        method: "pythagenpat" selects the variable-exponent formula;
            any other value uses pythagorean_expectation() with its defaults
    """
    # Guard clause for the fixed-exponent path; everything else is pythagenpat.
    if method != "pythagenpat":
        return pythagorean_expectation(runs_scored, runs_allowed) * games
    return pythagenpat(runs_scored, runs_allowed, games) * games
def luck_factor(actual_wins: int, expected_wins: float) -> float:
    """
    Return wins above expectation (actual minus expected).

    A positive result means the team won more than expected ("lucky");
    a negative result means it won fewer ("unlucky").
    """
    wins_above_expected = actual_wins - expected_wins
    return wins_above_expected
# Example with MLB data
if __name__ == "__main__":
    # Sample team data: wins, losses, runs scored and runs allowed
    sample = {
        "Team": ["NYY", "BOS", "TOR", "BAL", "TBR"],
        "W": [95, 89, 84, 78, 73],
        "L": [67, 73, 78, 84, 89],
        "RS": [850, 820, 780, 750, 700],
        "RA": [700, 720, 760, 800, 820]
    }
    teams = pd.DataFrame(sample)

    # Derived columns, added in the same order as they depend on each other
    wins, losses = teams["W"], teams["L"]
    teams["G"] = wins + losses
    teams["Win%"] = wins / teams["G"]

    scored, allowed = teams["RS"], teams["RA"]
    teams["Pyth_Win%"] = pythagorean_expectation(scored, allowed)
    teams["ExpWins"] = expected_wins(scored, allowed, teams["G"])
    teams["Luck"] = luck_factor(wins, teams["ExpWins"])

    report_columns = ["Team", "W", "ExpWins", "Luck"]
    print(teams[report_columns].round(1))
R API Client with httr2
Build a reusable sports API client in R using the httr2 package, with optional API-key authentication, rate limiting, and retries.
# Sports API client in R using httr2
library(httr2)
library(jsonlite)
library(dplyr)
library(purrr)
#' Create a base API client
#'
#' Builds a plain list tagged with class "sports_api_client".
#'
#' @param base_url Base URL for the API
#' @param api_key Optional API key
#' @param rate_limit Requests per minute
#' @return API client object
create_api_client <- function(base_url, api_key = NULL, rate_limit = 60) {
  # Back-date the last request by one full interval so the very first
  # call is never delayed by the rate limiter.
  seconds_per_request <- 60 / rate_limit
  client <- list(
    base_url = base_url,
    api_key = api_key,
    rate_limit = rate_limit,
    last_request = Sys.time() - seconds_per_request
  )
  class(client) <- "sports_api_client"
  client
}
# Time of the most recent request, keyed by base URL. An environment is used
# because `client` is a plain list: R copies it on modification, so a
# timestamp written to it inside api_request() never reaches the caller.
.api_last_request <- new.env(parent = emptyenv())

#' Make API request with rate limiting
#'
#' Waits, if necessary, so that consecutive requests to the same base URL are
#' at least `60 / client$rate_limit` seconds apart, then performs the request
#' (up to 3 tries) and parses the JSON body.
#'
#' @param client API client object
#' @param endpoint API endpoint
#' @param params Query parameters
#' @param headers Additional headers
#' @return Parsed JSON response
api_request <- function(client, endpoint, params = list(), headers = list()) {
  # Rate limiting: prefer the shared timestamp, fall back to the client's
  # initial value on the first call for this base URL.
  last_request <- get0(client$base_url, envir = .api_last_request,
                       inherits = FALSE)
  if (is.null(last_request)) {
    last_request <- client$last_request
  }
  # Force seconds: a bare difftime picks its own unit (secs/mins/hours),
  # which would make the comparison below meaningless after a long pause.
  elapsed <- as.numeric(difftime(Sys.time(), last_request, units = "secs"))
  min_interval <- 60 / client$rate_limit
  if (elapsed < min_interval) {
    Sys.sleep(min_interval - elapsed)
  }

  # Build request
  req <- request(paste0(client$base_url, "/", endpoint))

  # Add API key if present
  if (!is.null(client$api_key)) {
    req <- req %>% req_headers("X-API-Key" = client$api_key)
  }

  # Add custom headers
  if (length(headers) > 0) {
    req <- req %>% req_headers(!!!headers)
  }

  # Add query parameters
  if (length(params) > 0) {
    req <- req %>% req_url_query(!!!params)
  }

  # Make request with retry
  resp <- req %>%
    req_retry(max_tries = 3, backoff = ~ 2) %>%
    req_perform()

  # Record the request time. The original `client$last_request <<- ...`
  # only worked if a global variable happened to be called `client`.
  assign(client$base_url, Sys.time(), envir = .api_last_request)

  # Parse response
  resp %>%
    resp_body_json()
}
# =====================
# MLB Stats API Example
# =====================
#' Create MLB API client
#'
#' @return API client for https://statsapi.mlb.com/api/v1 limited to
#'   60 requests per minute
mlb_client <- function() {
  create_api_client(
    base_url = "https://statsapi.mlb.com/api/v1",
    rate_limit = 60
  )
}
#' Get MLB teams
#'
#' Requests the "teams" endpoint with sportId = 1 and flattens each team
#' into one row.
#'
#' @param client MLB API client
#' @param season Season year
#' @return Data frame of teams
mlb_get_teams <- function(client, season = NULL) {
  query <- list(sportId = 1)
  if (!is.null(season)) {
    query$season <- season
  }

  response <- api_request(client, "teams", query)

  # Optional fields fall back to NA so every row has the same columns
  map_df(response$teams, function(team) {
    tibble(
      id = team$id,
      name = team$name,
      abbreviation = team$abbreviation %||% NA,
      team_name = team$teamName,
      location = team$locationName %||% NA,
      league = team$league$name %||% NA,
      division = team$division$name %||% NA
    )
  })
}
#' Get player stats
#'
#' Reads the splits of the first stats entry returned by
#' "people/<player_id>/stats" and returns one row per split.
#'
#' @param client MLB API client
#' @param player_id Player ID
#' @param stat_type Type of stats (season, career, yearByYear)
#' @param stat_group Stat group (hitting, pitching, fielding)
#' @param season Season year
#' @return Tibble of stat lines; empty tibble when nothing is returned
mlb_get_player_stats <- function(client, player_id, stat_type = "season",
                                 stat_group = "hitting", season = NULL) {
  query <- list(stats = stat_type, group = stat_group)
  if (!is.null(season)) {
    query$season <- season
  }

  endpoint <- paste0("people/", player_id, "/stats")
  response <- api_request(client, endpoint, query)

  # Extract stats from nested structure: stats[[1]]$splits
  splits <- if (length(response$stats) > 0) response$stats[[1]]$splits else NULL
  if (length(splits) == 0) {
    return(tibble())
  }

  map_df(splits, function(entry) {
    line <- entry$stat
    tibble(
      season = entry$season %||% NA,
      team = entry$team$name %||% NA,
      games = line$gamesPlayed %||% NA,
      at_bats = line$atBats %||% NA,
      hits = line$hits %||% NA,
      home_runs = line$homeRuns %||% NA,
      rbi = line$rbi %||% NA,
      avg = line$avg %||% NA,
      obp = line$obp %||% NA,
      slg = line$slg %||% NA,
      ops = line$ops %||% NA
    )
  })
}
#' Get schedule
#'
#' Returns the games listed under the first date entry of the "schedule"
#' response for the requested day.
#'
#' @param client MLB API client
#' @param date Date string (YYYY-MM-DD)
#' @return Tibble with one row per game; empty tibble when no dates come back
mlb_get_schedule <- function(client, date = Sys.Date()) {
  day <- format(as.Date(date), "%Y-%m-%d")
  response <- api_request(client, "schedule", list(sportId = 1, date = day))

  if (length(response$dates) == 0) {
    return(tibble())
  }

  map_df(response$dates[[1]]$games, function(game) {
    home <- game$teams$home
    away <- game$teams$away
    tibble(
      game_pk = game$gamePk,
      game_date = game$gameDate,
      status = game$status$detailedState,
      home_team = home$team$name,
      away_team = away$team$name,
      home_score = home$score %||% NA,
      away_score = away$score %||% NA,
      venue = game$venue$name %||% NA
    )
  })
}
# =====================
# Generic API Functions
# =====================
#' Null-default operator: return `x` unless it is NULL, otherwise `y`
`%||%` <- function(x, y) {
  if (!is.null(x)) {
    return(x)
  }
  y
}
#' Batch API requests with progress
#'
#' Calls `api_request()` once per ID, substituting the ID for the literal
#' "{id}" placeholder in `endpoint_template`. A failing request raises a
#' warning and is skipped; the remaining IDs are still processed.
#'
#' @param client API client
#' @param endpoint_template Template with {id} placeholder
#' @param ids Vector of IDs
#' @param parse_fn Function to parse each response
#' @return Row-bound results with a `request_id` column holding the ID
batch_requests <- function(client, endpoint_template, ids, parse_fn) {
  results <- list()

  # txtProgressBar() requires max > min, so handle "nothing to do" first
  if (length(ids) == 0) {
    return(bind_rows(results, .id = "request_id"))
  }

  pb <- txtProgressBar(min = 0, max = length(ids), style = 3)
  on.exit(close(pb), add = TRUE)

  for (i in seq_along(ids)) {
    # fixed = TRUE treats "{id}" as plain text. The previous pattern
    # "\{id\}" is not a valid R string literal and fails to parse.
    endpoint <- gsub("{id}", ids[i], endpoint_template, fixed = TRUE)
    tryCatch({
      data <- api_request(client, endpoint)
      results[[as.character(ids[i])]] <- parse_fn(data)
    }, error = function(e) {
      # Nothing was stored for this ID, so there is nothing to undo
      warning(paste("Error for ID", ids[i], ":", e$message))
    })
    setTxtProgressBar(pb, i)
  }

  bind_rows(results, .id = "request_id")
}
# Example usage
# client <- mlb_client()
# teams <- mlb_get_teams(client, 2024)
# schedule <- mlb_get_schedule(client, "2024-06-15")
# player_stats <- mlb_get_player_stats(client, 545361, "yearByYear", "hitting") # Mike Trout
Odds API for Sports Betting Data
Access live betting odds from multiple sportsbooks via The Odds API.
"""The Odds API client for sports betting data."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import datetime
class OddsAPI:
    """
    Client for The Odds API.

    Get API key at: https://the-odds-api.com/
    Free tier: 500 requests/month

    After each successful request, ``requests_remaining`` and
    ``requests_used`` hold the values of the ``x-requests-remaining`` and
    ``x-requests-used`` response headers (None if a header is absent).
    """

    BASE_URL = "https://api.the-odds-api.com/v4"

    # Short aliases accepted by the ``sport`` arguments -> API sport keys
    SPORTS = {
        # US Sports
        "nfl": "americanfootball_nfl",
        "nba": "basketball_nba",
        "mlb": "baseball_mlb",
        "nhl": "icehockey_nhl",
        "ncaaf": "americanfootball_ncaaf",
        "ncaab": "basketball_ncaab",
        "mls": "soccer_usa_mls",
        # Soccer
        "epl": "soccer_epl",
        "la_liga": "soccer_spain_la_liga",
        "bundesliga": "soccer_germany_bundesliga",
        "serie_a": "soccer_italy_serie_a",
        "ligue_1": "soccer_france_ligue_one",
        "champions_league": "soccer_uefa_champs_league",
        # Other
        "ufc": "mma_mixed_martial_arts",
        "pga": "golf_pga_championship",
        "atp": "tennis_atp_aus_open"
    }

    def __init__(self, api_key: str, timeout: float = 30):
        """
        Args:
            api_key: The Odds API key, sent as the ``apiKey`` query parameter
            timeout: Seconds to wait for a response before giving up
        """
        self.api_key = api_key
        self.timeout = timeout
        self.session = requests.Session()
        self.requests_remaining = None
        self.requests_used = None

    def _get(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """
        Make API request and track usage.

        Raises:
            requests.HTTPError: on a non-2xx response
        """
        url = f"{self.BASE_URL}/{endpoint}"
        # Build a fresh dict so the caller's params are never mutated
        query = {**(params or {}), "apiKey": self.api_key}
        response = self.session.get(url, params=query, timeout=self.timeout)
        response.raise_for_status()

        # Track API usage
        self.requests_remaining = response.headers.get("x-requests-remaining")
        self.requests_used = response.headers.get("x-requests-used")
        return response.json()

    def get_sports(self, all_sports: bool = False) -> pd.DataFrame:
        """Get list of available sports (``all_sports`` sends ``all=true``)."""
        params = {"all": "true"} if all_sports else {}
        return pd.DataFrame(self._get("sports", params))

    def get_odds(
        self,
        sport: str,
        regions: str = "us",
        markets: str = "h2h",
        odds_format: str = "american",
        bookmakers: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Get current odds for a sport.

        Args:
            sport: Sport key (use SPORTS dict or raw key)
            regions: us, uk, eu, au (comma-separated for multiple)
            markets: h2h (moneyline), spreads, totals
            odds_format: american, decimal, fractional
            bookmakers: List of specific bookmakers

        Returns:
            DataFrame with one row per game / bookmaker / market / outcome
        """
        sport_key = self.SPORTS.get(sport, sport)
        params = {
            "regions": regions,
            "markets": markets,
            "oddsFormat": odds_format
        }
        if bookmakers:
            params["bookmakers"] = ",".join(bookmakers)
        data = self._get(f"sports/{sport_key}/odds", params)

        # Flatten the nested game -> bookmaker -> market -> outcome structure
        rows = []
        for game in data:
            game_info = {
                "id": game.get("id"),
                "sport": game.get("sport_key"),
                "commence_time": game.get("commence_time"),
                "home_team": game.get("home_team"),
                "away_team": game.get("away_team")
            }
            for bookmaker in game.get("bookmakers", []):
                book_name = bookmaker.get("key")
                for market in bookmaker.get("markets", []):
                    market_key = market.get("key")
                    for outcome in market.get("outcomes", []):
                        rows.append({
                            **game_info,
                            "bookmaker": book_name,
                            "market": market_key,
                            "team": outcome.get("name"),
                            "price": outcome.get("price"),
                            "point": outcome.get("point")  # For spreads/totals
                        })
        return pd.DataFrame(rows)

    def get_best_odds(self, sport: str, market: str = "h2h") -> pd.DataFrame:
        """Get the highest price per game and outcome across bookmakers."""
        odds_df = self.get_odds(sport, markets=market)
        if odds_df.empty:
            return odds_df

        # Rows without a price cannot be "best" and would break idxmax
        priced = odds_df.dropna(subset=["price"])
        best_rows = priced.groupby(['id', 'team'])['price'].idxmax()
        best_odds = priced.loc[best_rows]
        return best_odds[['commence_time', 'home_team', 'away_team',
                          'team', 'price', 'bookmaker']]

    def find_arbitrage(self, sport: str) -> pd.DataFrame:
        """
        Find potential arbitrage opportunities.

        For each game the best decimal price of EVERY h2h outcome is used,
        including "Draw" on three-way markets; leaving the draw out would
        report arbitrage that does not exist.

        Returns:
            Games where combined implied probabilities < 100%. ``draw_odds``
            is NaN for two-way markets.
        """
        odds_df = self.get_odds(sport, markets="h2h", odds_format="decimal")
        if odds_df.empty:
            return odds_df

        arb_opportunities = []
        for _, game_odds in odds_df.groupby("id", sort=False):
            game_info = game_odds.iloc[0]

            # Best price for each outcome offered on this game
            best_by_outcome = game_odds.groupby("team")["price"].max()
            home_best = best_by_outcome.get(game_info["home_team"])
            away_best = best_by_outcome.get(game_info["away_team"])
            if pd.isna(home_best) or pd.isna(away_best):
                continue

            # skipna=False: an unpriced outcome makes the total NaN, which
            # fails the "< 100" check instead of being silently ignored
            implied_prob = (1 / best_by_outcome).sum(skipna=False) * 100
            if implied_prob < 100:
                profit_margin = 100 - implied_prob
                arb_opportunities.append({
                    "game": f"{game_info['away_team']} @ {game_info['home_team']}",
                    "commence_time": game_info['commence_time'],
                    "home_odds": home_best,
                    "away_odds": away_best,
                    "draw_odds": best_by_outcome.get("Draw", float("nan")),
                    "implied_prob": round(implied_prob, 2),
                    "profit_margin": round(profit_margin, 2)
                })
        return pd.DataFrame(arb_opportunities)

    def compare_bookmakers(self, sport: str, market: str = "h2h") -> pd.DataFrame:
        """Compare odds across all bookmakers for a sport."""
        odds_df = self.get_odds(sport, markets=market)
        if odds_df.empty:
            return odds_df

        # Pivot to show bookmakers as columns
        pivot = odds_df.pivot_table(
            index=['commence_time', 'home_team', 'away_team', 'team'],
            columns='bookmaker',
            values='price',
            aggfunc='first'
        ).reset_index()
        return pivot
# Example usage
if __name__ == "__main__":
    client = OddsAPI("YOUR_API_KEY")

    # Get available sports
    available = client.get_sports()
    summary_columns = ['key', 'title', 'active']
    print("Available Sports:")
    print(available[summary_columns].head(20))

    # Get NFL odds
    # nfl_odds = client.get_odds("nfl")
    # print("\nNFL Odds:")
    # print(nfl_odds.head())

    # Find best odds
    # best = client.get_best_odds("nfl")
    # print("\nBest NFL Odds:")
    # print(best)

    # Quota counters are filled in by the request made above
    used, remaining = client.requests_used, client.requests_remaining
    print(f"\nAPI Usage: {used} used, {remaining} remaining")
SportsDataIO API Wrapper
Universal wrapper for SportsDataIO APIs covering NFL, NBA, MLB, NHL, and more.
"""SportsDataIO API wrapper for multiple sports."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date
from enum import Enum
class Sport(Enum):
    """Sports selectable in the wrapper; each value is the lowercase sport code."""

    # Professional leagues
    NFL = "nfl"
    NBA = "nba"
    MLB = "mlb"
    NHL = "nhl"

    # College
    CFB = "cfb"  # College Football
    CBB = "cbb"  # College Basketball

    # Other sports
    NASCAR = "nascar"
    GOLF = "golf"
    MMA = "mma"
    SOCCER = "soccer"
class SportsDataIO:
    """
    Unified client for SportsDataIO APIs.

    Requires API key: https://sportsdata.io/

    Every request sends the sport's key in the
    ``Ocp-Apim-Subscription-Key`` header. A sport without an entry in
    BASE_URLS (e.g. Sport.NASCAR) or without a configured key raises
    ValueError.
    """

    BASE_URLS = {
        Sport.NFL: "https://api.sportsdata.io/v3/nfl",
        Sport.NBA: "https://api.sportsdata.io/v3/nba",
        Sport.MLB: "https://api.sportsdata.io/v3/mlb",
        Sport.NHL: "https://api.sportsdata.io/v3/nhl",
        Sport.CFB: "https://api.sportsdata.io/v3/cfb",
        Sport.CBB: "https://api.sportsdata.io/v3/cbb",
        Sport.GOLF: "https://api.sportsdata.io/golf/v2",
        Sport.MMA: "https://api.sportsdata.io/v3/mma",
        Sport.SOCCER: "https://api.sportsdata.io/v4/soccer"
    }

    def __init__(self, api_keys: Dict[Sport, str], timeout: float = 30):
        """
        Initialize with API keys for each sport.

        Args:
            api_keys: Dictionary mapping Sport enum to API key
            timeout: Seconds to wait for a response before giving up
        """
        self.api_keys = api_keys
        self.timeout = timeout
        self.session = requests.Session()

    def _get(self, sport: Sport, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """
        Make API request.

        Raises:
            ValueError: no base URL or no API key for ``sport``
            requests.HTTPError: on a non-2xx response
        """
        base_url = self.BASE_URLS.get(sport)
        if not base_url:
            raise ValueError(f"Unsupported sport: {sport}")
        api_key = self.api_keys.get(sport)
        if not api_key:
            raise ValueError(f"No API key for {sport}")

        url = f"{base_url}/{endpoint}"
        headers = {"Ocp-Apim-Subscription-Key": api_key}
        response = self.session.get(
            url, headers=headers, params=params or {}, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    # Universal endpoints
    def get_teams(self, sport: Sport) -> pd.DataFrame:
        """Get all teams for a sport."""
        # The path is the same for every sport, soccer included
        return pd.DataFrame(self._get(sport, "scores/json/Teams"))

    def get_players(self, sport: Sport) -> pd.DataFrame:
        """Get all active players."""
        # Golf is the only sport here whose path has no "scores/" prefix
        endpoint = "json/Players" if sport == Sport.GOLF else "scores/json/Players"
        return pd.DataFrame(self._get(sport, endpoint))

    def get_schedule(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get season schedule."""
        # NFL uses "Schedules"; every other sport uses "Games"
        resource = "Schedules" if sport == Sport.NFL else "Games"
        return pd.DataFrame(self._get(sport, f"scores/json/{resource}/{season}"))

    def get_standings(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get current standings."""
        data = self._get(sport, f"scores/json/Standings/{season}")
        return pd.DataFrame(data)

    def get_scores_by_date(self, sport: Sport, date_str: str) -> pd.DataFrame:
        """Get scores for a specific date (format: YYYY-MM-DD or YYYY-MON-DD)."""
        # NFL uses "ScoresByDate"; the other sports use "GamesByDate"
        resource = "ScoresByDate" if sport == Sport.NFL else "GamesByDate"
        return pd.DataFrame(self._get(sport, f"scores/json/{resource}/{date_str}"))

    # Sport-specific stats
    def get_player_season_stats(
        self,
        sport: Sport,
        season: int,
        player_id: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Get player season statistics.

        Args:
            sport: Sport to query
            season: Season year
            player_id: When given, only rows whose ``PlayerID`` equals this
                value are returned (filtering happens client-side)
        """
        df = pd.DataFrame(self._get(sport, f"stats/json/PlayerSeasonStats/{season}"))
        # `is None` rather than truthiness, so an id of 0 still filters
        if player_id is None:
            return df
        # An empty payload has no PlayerID column; return it as-is
        if "PlayerID" not in df.columns:
            return df
        return df[df['PlayerID'] == player_id]

    def get_team_season_stats(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get team season statistics."""
        data = self._get(sport, f"scores/json/TeamSeasonStats/{season}")
        return pd.DataFrame(data)

    # Projections (for fantasy/betting)
    def get_player_projections(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get player stat projections."""
        data = self._get(sport, f"projections/json/PlayerSeasonProjectionStats/{season}")
        return pd.DataFrame(data)

    # News
    def get_news(self, sport: Sport) -> pd.DataFrame:
        """Get latest news."""
        data = self._get(sport, "scores/json/News")
        return pd.DataFrame(data)

    def get_player_news(self, sport: Sport, player_id: int) -> pd.DataFrame:
        """Get news for a specific player."""
        data = self._get(sport, f"scores/json/NewsByPlayerID/{player_id}")
        return pd.DataFrame(data)
# Example usage
if __name__ == "__main__":
    # Initialize with your API keys (one key per sport you plan to query)
    keys_by_sport = {}
    keys_by_sport[Sport.NFL] = "YOUR_NFL_KEY"
    keys_by_sport[Sport.NBA] = "YOUR_NBA_KEY"
    keys_by_sport[Sport.MLB] = "YOUR_MLB_KEY"
    sportsdata = SportsDataIO(keys_by_sport)

    # Get NFL teams
    # nfl_teams = sportsdata.get_teams(Sport.NFL)
    # print(nfl_teams)

    # Get NBA standings
    # nba_standings = sportsdata.get_standings(Sport.NBA, 2024)
    # print(nba_standings)
NHL Stats API Client
Access NHL statistics through the official NHL Stats API.
"""NHL Stats API client."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date
class NHLStatsAPI:
    """
    Client for NHL Stats API.

    No authentication required.
    """

    BASE_URL = "https://api-web.nhle.com/v1"
    STATS_URL = "https://api.nhle.com/stats/rest/en"

    def __init__(self, timeout: float = 30):
        """
        Args:
            timeout: Seconds to wait for a response before giving up
        """
        self.session = requests.Session()
        self.timeout = timeout

    def _get(self, url: str, params: Optional[Dict] = None) -> Dict:
        """Make API request; raises requests.HTTPError on a non-2xx response."""
        response = self.session.get(url, params=params or {}, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _teams_and_venue(game: Dict) -> Dict:
        """Extract the team/score/venue fields shared by both schedule payloads."""
        home = game.get("homeTeam", {})
        away = game.get("awayTeam", {})
        return {
            "home_team": home.get("abbrev"),
            "away_team": away.get("abbrev"),
            "home_score": home.get("score"),
            "away_score": away.get("score"),
            "venue": game.get("venue", {}).get("default")
        }

    def _get_leaders(self, kind: str, category: str, limit: int) -> pd.DataFrame:
        """Fetch ``<kind>-stats-leaders/current`` and flatten one category."""
        data = self._get(f"{self.BASE_URL}/{kind}-stats-leaders/current", {
            "categories": category,
            "limit": limit
        })
        leaders = []
        for player in data.get(category, []):
            first = player.get('firstName', {}).get('default')
            last = player.get('lastName', {}).get('default')
            leaders.append({
                "rank": player.get("rank"),
                "player": f"{first} {last}",
                "team": player.get("teamAbbrev"),
                "value": player.get("value")
            })
        return pd.DataFrame(leaders)

    # Players
    def get_player(self, player_id: int) -> Dict:
        """Get player details."""
        return self._get(f"{self.BASE_URL}/player/{player_id}/landing")

    def get_player_stats(
        self,
        player_id: int,
        season: Optional[str] = None,
        game_type: int = 2
    ) -> Dict:
        """
        Get a player's game log.

        Args:
            player_id: NHL player ID
            season: Season id such as "20242025"; None requests the
                current log (``game-log/now``)
            game_type: Game type appended to the path when ``season`` is
                given (per the NHL API reference: 2 = regular season,
                3 = playoffs)
        """
        # The seasonal game-log route is .../game-log/{season}/{game-type};
        # only the "now" shortcut takes no game type.
        suffix = f"{season}/{game_type}" if season else "now"
        return self._get(f"{self.BASE_URL}/player/{player_id}/game-log/{suffix}")

    # Teams
    def get_teams(self) -> pd.DataFrame:
        """Get all NHL teams."""
        data = self._get(f"{self.STATS_URL}/team")
        return pd.DataFrame(data.get("data", []))

    def get_team_roster(self, team_abbrev: str, season: str = "20242025") -> Dict:
        """Get team roster."""
        return self._get(f"{self.BASE_URL}/roster/{team_abbrev}/{season}")

    def get_team_schedule(self, team_abbrev: str, season: str = "20242025") -> pd.DataFrame:
        """Get team schedule."""
        data = self._get(f"{self.BASE_URL}/club-schedule-season/{team_abbrev}/{season}")
        games = [
            {
                "game_id": game.get("id"),
                "date": game.get("gameDate"),
                "game_type": game.get("gameType"),
                **self._teams_and_venue(game)
            }
            for game in data.get("games", [])
        ]
        return pd.DataFrame(games)

    # Standings
    def get_standings(self, date_str: Optional[str] = None) -> pd.DataFrame:
        """Get league standings (``date_str`` defaults to "now")."""
        endpoint = f"{self.BASE_URL}/standings/{date_str or 'now'}"
        data = self._get(endpoint)
        standings = []
        for team in data.get("standings", []):
            standings.append({
                "team": team.get("teamName", {}).get("default"),
                "abbrev": team.get("teamAbbrev", {}).get("default"),
                "conference": team.get("conferenceName"),
                "division": team.get("divisionName"),
                "games_played": team.get("gamesPlayed"),
                "wins": team.get("wins"),
                "losses": team.get("losses"),
                "ot_losses": team.get("otLosses"),
                "points": team.get("points"),
                "points_pct": team.get("pointPctg"),
                "goals_for": team.get("goalFor"),
                "goals_against": team.get("goalAgainst"),
                "goal_diff": team.get("goalDifferential")
            })
        return pd.DataFrame(standings)

    # Schedule/Scores
    def get_schedule(self, date_str: Optional[str] = None) -> pd.DataFrame:
        """
        Get games for a date.

        Returns every game of every day in the response's ``gameWeek``
        list, not only the games of ``date_str`` itself.
        """
        endpoint = f"{self.BASE_URL}/schedule/{date_str or 'now'}"
        data = self._get(endpoint)
        games = [
            {
                "game_id": game.get("id"),
                "date": game.get("startTimeUTC"),
                "game_state": game.get("gameState"),
                **self._teams_and_venue(game)
            }
            for day in data.get("gameWeek", [])
            for game in day.get("games", [])
        ]
        return pd.DataFrame(games)

    # Leaders
    def get_skater_leaders(self, category: str = "points", limit: int = 10) -> pd.DataFrame:
        """
        Get league leaders for skaters.

        Categories: points, goals, assists, plusMinus, penaltyMins
        """
        return self._get_leaders("skater", category, limit)

    def get_goalie_leaders(self, category: str = "wins", limit: int = 10) -> pd.DataFrame:
        """
        Get league leaders for goalies.

        Categories: wins, savePctg, goalsAgainstAverage, shutouts
        """
        return self._get_leaders("goalie", category, limit)

    # Game details
    def get_game_boxscore(self, game_id: int) -> Dict:
        """Get game box score."""
        return self._get(f"{self.BASE_URL}/gamecenter/{game_id}/boxscore")

    def get_game_play_by_play(self, game_id: int) -> Dict:
        """Get play-by-play data."""
        return self._get(f"{self.BASE_URL}/gamecenter/{game_id}/play-by-play")
# Example usage
if __name__ == "__main__":
    nhl = NHLStatsAPI()

    # Get standings
    table = nhl.get_standings()
    table_columns = ['team', 'points', 'wins', 'losses']
    print("NHL Standings:")
    print(table[table_columns].head(10))

    # Get point leaders
    top_scorers = nhl.get_skater_leaders("points", 20)
    print("\nPoints Leaders:")
    print(top_scorers)

    # Get today's games
    todays_games = nhl.get_schedule()
    print("\nToday's Games:")
    print(todays_games)
StatsBomb Open Data API
Access free event-level soccer data from StatsBomb for detailed match analysis.
"""StatsBomb Open Data API client."""
import requests
import pandas as pd
from typing import Optional, List, Dict
class StatsBombAPI:
    """
    Client for StatsBomb Open Data.

    Free data includes:
    - FIFA World Cups
    - FA Women's Super League
    - NWSL
    - UEFA Euro 2020
    - Select club competitions

    All data is read as JSON files below BASE_URL.
    """

    BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"

    def __init__(self, timeout: float = 30):
        """
        Args:
            timeout: Seconds to wait for a response before giving up
        """
        self.session = requests.Session()
        self.timeout = timeout

    def _get_json(self, path: str) -> Dict:
        """Fetch JSON data; raises requests.HTTPError on a non-2xx response."""
        url = f"{self.BASE_URL}/{path}"
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def _events_of_type(self, match_id: int, type_name: str, columns: List[str]) -> pd.DataFrame:
        """Return events whose ``type.name`` equals ``type_name``, keeping only the ``columns`` present."""
        events = self.get_events(match_id)
        # An empty event list normalises to a frame without any columns
        if 'type.name' not in events.columns:
            return pd.DataFrame()
        selected = events[events['type.name'] == type_name].copy()
        available_cols = [c for c in columns if c in selected.columns]
        return selected[available_cols]

    # Competitions
    def get_competitions(self) -> pd.DataFrame:
        """Get available competitions."""
        return pd.DataFrame(self._get_json("competitions.json"))

    # Matches
    def get_matches(self, competition_id: int, season_id: int) -> pd.DataFrame:
        """Get matches for a competition season."""
        data = self._get_json(f"matches/{competition_id}/{season_id}.json")
        matches = []
        for match in data:
            # `or {}` covers both a missing key and an explicit null value
            matches.append({
                "match_id": match.get("match_id"),
                "match_date": match.get("match_date"),
                "kick_off": match.get("kick_off"),
                "competition": (match.get("competition") or {}).get("competition_name"),
                "season": (match.get("season") or {}).get("season_name"),
                "home_team": (match.get("home_team") or {}).get("home_team_name"),
                "away_team": (match.get("away_team") or {}).get("away_team_name"),
                "home_score": match.get("home_score"),
                "away_score": match.get("away_score"),
                "stadium": (match.get("stadium") or {}).get("name"),
                "referee": (match.get("referee") or {}).get("name")
            })
        return pd.DataFrame(matches)

    # Events (Play-by-play)
    def get_events(self, match_id: int) -> pd.DataFrame:
        """Get all events for a match (nested keys flattened to dotted columns)."""
        data = self._get_json(f"events/{match_id}.json")
        return pd.json_normalize(data)

    def get_shots(self, match_id: int) -> pd.DataFrame:
        """Get shot events with xG data."""
        # Only the columns that exist in this match's data are returned
        return self._events_of_type(match_id, 'Shot', [
            'id', 'minute', 'second', 'team.name', 'player.name',
            'location', 'shot.statsbomb_xg', 'shot.outcome.name',
            'shot.body_part.name', 'shot.technique.name',
            'shot.type.name', 'shot.end_location'
        ])

    def get_passes(self, match_id: int) -> pd.DataFrame:
        """Get pass events."""
        return self._events_of_type(match_id, 'Pass', [
            'id', 'minute', 'second', 'team.name', 'player.name',
            'location', 'pass.end_location', 'pass.length',
            'pass.angle', 'pass.height.name', 'pass.outcome.name',
            'pass.recipient.name', 'pass.body_part.name'
        ])

    # Lineups
    def get_lineups(self, match_id: int) -> Dict[str, pd.DataFrame]:
        """Get lineups for both teams, keyed by team name."""
        data = self._get_json(f"lineups/{match_id}.json")
        lineups = {}
        for team in data:
            players = [
                {
                    "player_id": player.get("player_id"),
                    "player_name": player.get("player_name"),
                    "player_nickname": player.get("player_nickname"),
                    "jersey_number": player.get("jersey_number"),
                    # country may be null, so do not chain .get() on it directly
                    "country": (player.get("country") or {}).get("name")
                }
                for player in team.get("lineup", [])
            ]
            lineups[team.get("team_name")] = pd.DataFrame(players)
        return lineups

    # 360 Data (freeze frames)
    def get_360_data(self, match_id: int) -> pd.DataFrame:
        """
        Get 360 freeze frame data (if available).

        Best effort: a failed request or an undecodable body yields an empty
        DataFrame. Unlike a bare ``except``, KeyboardInterrupt, SystemExit
        and programming errors still propagate.
        """
        try:
            data = self._get_json(f"three-sixty/{match_id}.json")
        except (requests.RequestException, ValueError):
            return pd.DataFrame()
        return pd.json_normalize(data)

    # Aggregated stats
    def calculate_xg(self, match_id: int) -> Dict:
        """
        Calculate total xG for each team.

        Returns:
            ``{team: {'xG': summed shot xG, 'Goals': shots with outcome "Goal"}}``,
            or an empty dict when the match has no xG column.
        """
        shots = self.get_shots(match_id)
        if 'shot.statsbomb_xg' not in shots.columns:
            return {}

        xg_by_team = shots.groupby('team.name')['shot.statsbomb_xg'].sum().to_dict()
        if 'shot.outcome.name' in shots.columns:
            scored = shots[shots['shot.outcome.name'] == 'Goal']
            goals_by_team = scored.groupby('team.name').size().to_dict()
        else:
            goals_by_team = {}

        # Dict union keeps each team once, in first-seen order
        teams = {**xg_by_team, **goals_by_team}
        return {
            team: {'xG': xg_by_team.get(team, 0), 'Goals': goals_by_team.get(team, 0)}
            for team in teams
        }
# Example usage
if __name__ == "__main__":
    sb = StatsBombAPI()

    # Get available competitions
    competitions = sb.get_competitions()
    competition_columns = ['competition_name', 'season_name']
    print("Available Competitions:")
    print(competitions[competition_columns].head(20))

    # Get World Cup 2022 matches (competition_id=43, season_id=106)
    matches = sb.get_matches(43, 106)
    score_columns = ['home_team', 'away_team', 'home_score', 'away_score']
    print("\nWorld Cup 2022 Matches:")
    print(matches[score_columns].head(10))

    # Compute team xG for the first listed match, if there is one
    if len(matches):
        first_match = matches.iloc[0]
        match_id = first_match['match_id']
        xg = sb.calculate_xg(match_id)
        print(f"\nxG for match {match_id}:")
        print(xg)
MLB Stats API Client
Python client for the official MLB Stats API with built-in rate limiting; the API requires no authentication.
"""MLB Stats API client with caching and rate limiting."""
import requests
from datetime import datetime, date
from typing import Optional, Dict, List, Any
import time
from functools import lru_cache
class MLBStatsAPI:
"""
Client for MLB Stats API (statsapi.mlb.com).
Features:
- No authentication required
- Automatic rate limiting
- Response caching
"""
BASE_URL = "https://statsapi.mlb.com/api/v1"
def __init__(self, requests_per_minute: int = 60):
self.session = requests.Session()
self.min_interval = 60.0 / requests_per_minute
self.last_request = 0
def _rate_limit(self):
"""Enforce rate limiting."""
elapsed = time.time() - self.last_request
if elapsed < self.min_interval:
time.sleep(self.min_interval - elapsed)
self.last_request = time.time()
def _get(self, endpoint: str, params: Dict = None) -> Dict:
"""Make GET request to API."""
self._rate_limit()
url = f"{self.BASE_URL}/{endpoint}"
response = self.session.get(url, params=params)
response.raise_for_status()
return response.json()
# Teams
def get_teams(self, season: int = None, sport_id: int = 1) -> List[Dict]:
"""Get all MLB teams."""
params = {"sportId": sport_id}
if season:
params["season"] = season
data = self._get("teams", params)
return data.get("teams", [])
def get_team(self, team_id: int) -> Dict:
"""Get team details."""
data = self._get(f"teams/{team_id}")
return data.get("teams", [{}])[0]
def get_team_roster(self, team_id: int, roster_type: str = "active") -> List[Dict]:
"""Get team roster."""
data = self._get(f"teams/{team_id}/roster", {"rosterType": roster_type})
return data.get("roster", [])
# Players
def get_player(self, player_id: int) -> Dict:
"""Get player details."""
data = self._get(f"people/{player_id}")
return data.get("people", [{}])[0]
def get_player_stats(
self,
player_id: int,
stats_type: str = "season",
group: str = "hitting",
season: int = None
) -> Dict:
"""
Get player statistics.
Args:
player_id: MLB player ID
stats_type: season, career, yearByYear, etc.
group: hitting, pitching, fielding
season: Season year (required for season stats)
"""
params = {
"stats": stats_type,
"group": group
}
if season:
params["season"] = season
data = self._get(f"people/{player_id}/stats", params)
return data.get("stats", [])
# Games/Schedule
def get_schedule(
self,
start_date: date,
end_date: date = None,
team_id: int = None
) -> List[Dict]:
"""Get game schedule."""
params = {
"sportId": 1,
"startDate": start_date.strftime("%Y-%m-%d"),
"endDate": (end_date or start_date).strftime("%Y-%m-%d")
}
if team_id:
params["teamId"] = team_id
data = self._get("schedule", params)
games = []
for date_data in data.get("dates", []):
games.extend(date_data.get("games", []))
return games
def get_game(self, game_pk: int) -> Dict:
"""Get game details."""
data = self._get(f"game/{game_pk}/feed/live")
return data
def get_game_boxscore(self, game_pk: int) -> Dict:
"""Get game box score."""
data = self._get(f"game/{game_pk}/boxscore")
return data
def get_game_play_by_play(self, game_pk: int) -> Dict:
"""Get play-by-play data."""
data = self._get(f"game/{game_pk}/playByPlay")
return data
# Standings
def get_standings(self, league_id: int = None, season: int = None) -> List[Dict]:
"""Get league standings."""
params = {}
if league_id:
params["leagueId"] = league_id
if season:
params["season"] = season
data = self._get("standings", params)
return data.get("records", [])
# Search
def search_player(self, name: str) -> List[Dict]:
    """
    Search for players by name, caching results per client instance.

    The cache lives on the instance instead of in a module-level
    ``lru_cache``: decorating a method with ``lru_cache`` keys on ``self``
    and keeps every client alive for the lifetime of the cache, and it
    hands the same mutable list to every caller.

    Args:
        name: Search string sent as the ``search`` query parameter

    Returns:
        A new list holding the ``people`` entries of the response (empty
        when the key is absent). The list is a shallow copy, so mutating
        it does not corrupt the cache; the contained dicts are shared.
    """
    max_entries = 1000  # same bound the previous lru_cache used
    cache = self.__dict__.setdefault("_search_player_cache", {})
    if name not in cache:
        if len(cache) >= max_entries:
            # Dicts preserve insertion order, so this evicts the oldest entry.
            cache.pop(next(iter(cache)))
        data = self._get("sports/1/players", {"search": name})
        cache[name] = data.get("people", [])
    return list(cache[name])
# Example usage
if __name__ == "__main__":
    # Demo: roster size, one player's season line, and today's slate.
    mlb = MLBStatsAPI()

    # Team 147 is labelled as the Yankees by this example.
    yankees_roster = mlb.get_team_roster(team_id=147)
    roster_size = len(yankees_roster)
    print(f"Yankees roster: {roster_size} players")

    # Player 592450 is labelled as Aaron Judge by this example.
    judge_stats = mlb.get_player_stats(
        592450, stats_type="season", group="hitting", season=2024
    )
    print("Aaron Judge 2024:", judge_stats)

    # A single-day schedule: only the start date is given.
    today = date.today()
    games = mlb.get_schedule(start_date=today)
    game_count = len(games)
    print(f"Games today: {game_count}")
Statcast Data API
Access MLB Statcast data through Baseball Savant API for pitch-level analytics.
"""Baseball Savant Statcast API client."""
import requests
import pandas as pd
from datetime import date, datetime
from typing import Optional, List
import io
class StatcastAPI:
    """
    Client for Baseball Savant Statcast data.

    Access pitch-level data including:
    - Exit velocity
    - Launch angle
    - Spin rate
    - Pitch movement

    All requests share one ``requests.Session`` with a browser-like
    User-Agent, are bounded by ``TIMEOUT`` and raise on HTTP error statuses.
    """

    BASE_URL = "https://baseballsavant.mlb.com"
    # Seconds to wait for a response; requests has no default timeout and
    # would otherwise wait indefinitely on a stalled connection.
    TIMEOUT = 60

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def _fetch_csv(self, url: str, params: dict) -> pd.DataFrame:
        """GET ``url`` and parse the body as CSV; an empty body yields an empty frame."""
        response = self.session.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        # read_csv raises EmptyDataError on a blank body; treat it as "no rows".
        if not response.text.strip():
            return pd.DataFrame()
        return pd.read_csv(io.StringIO(response.text))

    def get_statcast_data(
        self,
        start_date: date,
        end_date: date,
        player_type: str = "batter",
        team: str = None
    ) -> pd.DataFrame:
        """
        Get Statcast pitch-level data.

        Args:
            start_date: Start date
            end_date: End date
            player_type: 'batter' or 'pitcher'
            team: Team abbreviation filter

        Returns:
            DataFrame with Statcast data (empty if the response body is empty)

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        url = f"{self.BASE_URL}/statcast_search/csv"
        # The search form's filter fields are sent even when blank.
        params = {
            "all": "true",
            "hfPT": "",
            "hfAB": "",
            "hfBBT": "",
            "hfPR": "",
            "hfZ": "",
            "stadium": "",
            "hfBBL": "",
            "hfNewZones": "",
            "hfGT": "R|",
            "hfC": "",
            "hfSea": "",
            "hfSit": "",
            "player_type": player_type,
            "hfOuts": "",
            "opponent": "",
            "pitcher_throws": "",
            "batter_stands": "",
            "hfSA": "",
            "game_date_gt": start_date.strftime("%Y-%m-%d"),
            "game_date_lt": end_date.strftime("%Y-%m-%d"),
            "team": team or "",
            "position": "",
            "hfRO": "",
            "home_road": "",
            "hfFlag": "",
            "metric_1": "",
            "hfInn": "",
            "min_pitches": "0",
            "min_results": "0",
            "group_by": "name",
            "sort_col": "pitches",
            "player_event_sort": "h_launch_speed",
            "sort_order": "desc",
            "min_abs": "0",
            "type": "details"
        }
        return self._fetch_csv(url, params)

    def get_player_statcast(
        self,
        player_id: int,
        season: int,
        player_type: str = "batter"
    ) -> pd.DataFrame:
        """
        Get Statcast data for a specific player.

        Args:
            player_id: Player ID sent under the batter or pitcher lookup key
            season: Season year, sent as ``hfSea``
            player_type: 'batter' selects ``batters_lookup[]``; any other
                value selects ``pitchers_lookup[]``

        Returns:
            DataFrame with the player's Statcast rows

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        url = f"{self.BASE_URL}/statcast_search/csv"
        lookup_key = "batters_lookup[]" if player_type == "batter" else "pitchers_lookup[]"
        params = {
            "all": "true",
            "player_type": player_type,
            lookup_key: player_id,
            "hfSea": f"{season}|",
            "type": "details"
        }
        return self._fetch_csv(url, params)

    def get_leaderboard(
        self,
        stat: str = "exit_velocity",
        year: int = 2024,
        min_pa: int = 100
    ) -> pd.DataFrame:
        """
        Get Statcast leaderboard.

        Stats: exit_velocity, launch_angle, barrel, hard_hit, xba, xslg, xwoba

        Args:
            stat: Sent as the ``type`` query parameter
            year: Season year
            min_pa: Sent as the ``min`` query parameter

        Returns:
            DataFrame parsed from the CSV response

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        url = f"{self.BASE_URL}/leaderboard/expected_statistics"
        params = {
            "type": stat,
            "year": year,
            "position": "",
            "team": "",
            "min": min_pa,
            "csv": "true"
        }
        return self._fetch_csv(url, params)

    def get_pitch_arsenal(self, pitcher_id: int, season: int) -> pd.DataFrame:
        """
        Get pitcher's pitch arsenal breakdown.

        Args:
            pitcher_id: Sent as ``playerId``
            season: Season year

        Returns:
            DataFrame built from the ``pitchBreakdown`` list of the JSON
            response (empty when the key is absent)

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        api_url = f"{self.BASE_URL}/player-services/statcast-pitching-breakdown"
        params = {"playerId": pitcher_id, "season": season}
        response = self.session.get(api_url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        data = response.json()
        return pd.DataFrame(data.get("pitchBreakdown", []))
# Example usage
if __name__ == "__main__":
    from datetime import timedelta

    statcast = StatcastAPI()

    # Pull pitch-level rows for the seven days ending today.
    end = date.today()
    start = end - timedelta(days=7)
    data = statcast.get_statcast_data(start, end, player_type="batter")
    pitch_count = len(data)
    print(f"Retrieved {pitch_count} pitches")

    # Show the first ten rows of the exit-velocity leaderboard.
    leaders = statcast.get_leaderboard(stat="exit_velocity", year=2024, min_pa=200)
    print("Exit Velocity Leaders:")
    top_ten = leaders[['player_name', 'avg_hit_speed']].head(10)
    print(top_ten)
NBA Stats API Client
Python client for the official NBA Stats API with all major endpoints.
"""NBA Stats API client."""
import requests
import pandas as pd
from typing import Optional, Dict, List
import time
class NBAStatsAPI:
    """
    Client for NBA Stats API (stats.nba.com).

    Note: NBA API requires specific headers to work properly.

    Requests are throttled to one per second per client instance, and every
    endpoint method returns the selected result set as a DataFrame.
    """

    BASE_URL = "https://stats.nba.com/stats"
    HEADERS = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "x-nba-stats-origin": "stats",
        "x-nba-stats-token": "true",
        "Connection": "keep-alive",
        "Referer": "https://stats.nba.com/",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)
        # Timestamp of the most recent request attempt; 0 means "never".
        self.last_request = 0

    def _request(self, endpoint: str, params: Dict) -> Dict:
        """
        Make request with rate limiting.

        Args:
            endpoint: Path appended to ``BASE_URL``
            params: Query parameters

        Returns:
            The decoded JSON body

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        # Rate limit: 1 request per second
        elapsed = time.time() - self.last_request
        if elapsed < 1:
            time.sleep(1 - elapsed)
        url = f"{self.BASE_URL}/{endpoint}"
        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
        finally:
            # Record failed attempts too; otherwise retries after an error
            # would skip the throttle and hit the server back-to-back.
            self.last_request = time.time()
        return response.json()

    def _parse_response(self, data: Dict, result_index: int = 0) -> pd.DataFrame:
        """
        Parse NBA API response into DataFrame.

        Args:
            data: Decoded response containing ``resultSets`` (a list) or
                ``resultSet`` (a list, or a single result-set dict)
            result_index: Which result set to convert

        Returns:
            DataFrame built from the chosen set's ``rowSet`` and ``headers``;
            an empty DataFrame when no such set exists
        """
        result_sets = data.get("resultSets", data.get("resultSet", []))
        # Some endpoints return one result set as a bare dict under
        # "resultSet"; wrap it so it is indexed like the list form.
        if isinstance(result_sets, dict):
            result_sets = [result_sets]
        if isinstance(result_sets, list) and len(result_sets) > result_index:
            result = result_sets[result_index]
            return pd.DataFrame(
                result.get("rowSet", []),
                columns=result.get("headers", [])
            )
        return pd.DataFrame()

    # Player endpoints
    def get_player_info(self, player_id: int) -> pd.DataFrame:
        """Get player biographical info (``commonplayerinfo``)."""
        data = self._request("commonplayerinfo", {"PlayerID": player_id})
        return self._parse_response(data)

    def get_player_career_stats(self, player_id: int, per_mode: str = "PerGame") -> pd.DataFrame:
        """Get player career statistics (``playercareerstats``)."""
        data = self._request("playercareerstats", {
            "PlayerID": player_id,
            "PerMode": per_mode
        })
        return self._parse_response(data)

    def get_player_game_log(
        self,
        player_id: int,
        season: str = "2024-25",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """Get player game log (``playergamelog``) for one season and season type."""
        data = self._request("playergamelog", {
            "PlayerID": player_id,
            "Season": season,
            "SeasonType": season_type
        })
        return self._parse_response(data)

    # League endpoints
    def get_league_leaders(
        self,
        stat_category: str = "PTS",
        season: str = "2024-25",
        per_mode: str = "PerGame",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """
        Get league leaders for a stat category (``leagueleaders``).

        Args:
            stat_category: Sent as ``StatCategory``
            season: Season string such as "2024-25"
            per_mode: Sent as ``PerMode``
            season_type: Sent as ``SeasonType``
        """
        data = self._request("leagueleaders", {
            "LeagueID": "00",
            "PerMode": per_mode,
            "Scope": "S",
            "Season": season,
            "SeasonType": season_type,
            "StatCategory": stat_category
        })
        return self._parse_response(data)

    def get_all_players(self, season: str = "2024-25") -> pd.DataFrame:
        """Get all players for a season (``commonallplayers``)."""
        data = self._request("commonallplayers", {
            "LeagueID": "00",
            "Season": season,
            "IsOnlyCurrentSeason": 1
        })
        return self._parse_response(data)

    # Team endpoints
    def get_team_info(self, team_id: int) -> pd.DataFrame:
        """Get team information (``teamdetails``)."""
        data = self._request("teamdetails", {"TeamID": team_id})
        return self._parse_response(data)

    def get_team_roster(self, team_id: int, season: str = "2024-25") -> pd.DataFrame:
        """Get team roster (``commonteamroster``)."""
        data = self._request("commonteamroster", {
            "TeamID": team_id,
            "Season": season
        })
        return self._parse_response(data)

    def get_team_stats(
        self,
        season: str = "2024-25",
        per_mode: str = "PerGame",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """
        Get all team statistics (``leaguedashteamstats``).

        Args:
            season: Season string such as "2024-25"
            per_mode: Sent as ``PerMode``
            season_type: Sent as ``SeasonType``
        """
        data = self._request("leaguedashteamstats", {
            "Conference": "",
            "DateFrom": "",
            "DateTo": "",
            "Division": "",
            "GameScope": "",
            "GameSegment": "",
            "LastNGames": 0,
            "LeagueID": "00",
            "Location": "",
            "MeasureType": "Base",
            "Month": 0,
            "OpponentTeamID": 0,
            "Outcome": "",
            "PORound": 0,
            "PaceAdjust": "N",
            "PerMode": per_mode,
            "Period": 0,
            "PlayerExperience": "",
            "PlayerPosition": "",
            "PlusMinus": "N",
            "Rank": "N",
            "Season": season,
            "SeasonSegment": "",
            "SeasonType": season_type,
            "ShotClockRange": "",
            "StarterBench": "",
            "TeamID": 0,
            "TwoWay": 0,
            "VsConference": "",
            "VsDivision": ""
        })
        return self._parse_response(data)

    # Shot chart
    def get_shot_chart(
        self,
        player_id: int,
        season: str = "2024-25",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """
        Get player shot chart data (``shotchartdetail``).

        Args:
            player_id: Sent as ``PlayerID``
            season: Season string such as "2024-25"
            season_type: Sent as ``SeasonType``
        """
        data = self._request("shotchartdetail", {
            "ContextMeasure": "FGA",
            "DateFrom": "",
            "DateTo": "",
            "GameID": "",
            "GameSegment": "",
            "LastNGames": 0,
            "LeagueID": "00",
            "Location": "",
            "Month": 0,
            "OpponentTeamID": 0,
            "Outcome": "",
            "Period": 0,
            "PlayerID": player_id,
            "PlayerPosition": "",
            "RookieYear": "",
            "Season": season,
            "SeasonSegment": "",
            "SeasonType": season_type,
            "TeamID": 0,
            "VsConference": "",
            "VsDivision": ""
        })
        return self._parse_response(data)
# Example usage
if __name__ == "__main__":
    nba = NBAStatsAPI()

    # Top ten rows of the points leaderboard.
    leaders = nba.get_league_leaders(stat_category="PTS", season="2024-25")
    print("Scoring Leaders:")
    top_scorers = leaders[['PLAYER', 'TEAM', 'PTS']].head(10)
    print(top_scorers)

    # Career stats for player 2544 (labelled as LeBron by this example).
    lebron = nba.get_player_career_stats(player_id=2544)
    print("\nLeBron Career Stats:")
    preview = lebron.head()
    print(preview)
ESPN API Client
Access ESPN's public API for scores, standings, and team data across multiple sports.
"""ESPN API client for multiple sports."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date
class ESPNAPI:
    """
    Client for ESPN's public API endpoints.

    Supports: NFL, NBA, MLB, NHL, College Football, College Basketball

    Sport arguments are looked up in ``SPORTS``; unknown keys are used as
    the URL path segment as-is.
    """

    BASE_URL = "https://site.api.espn.com/apis/site/v2/sports"
    SPORTS = {
        "nfl": "football/nfl",
        "nba": "basketball/nba",
        "mlb": "baseball/mlb",
        "nhl": "hockey/nhl",
        "cfb": "football/college-football",
        "cbb": "basketball/mens-college-basketball",
        "wnba": "basketball/wnba",
        "mls": "soccer/usa.1"
    }
    # Seconds to wait for a response; requests has no default timeout.
    TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()

    @staticmethod
    def _to_int(value, default: int = 0) -> int:
        """Convert ``value`` to int, returning ``default`` for None, '' or junk."""
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _to_float(value, default: float = 0.0) -> float:
        """Convert ``value`` to float, returning ``default`` for None, '' or junk."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def _get(self, sport: str, endpoint: str, params: Dict = None) -> Dict:
        """
        Make API request.

        Args:
            sport: Key of ``SPORTS`` or a raw path such as "hockey/nhl"
            endpoint: Path below the sport, e.g. "scoreboard"
            params: Optional query parameters

        Returns:
            The decoded JSON body

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        sport_path = self.SPORTS.get(sport, sport)
        url = f"{self.BASE_URL}/{sport_path}/{endpoint}"
        response = self.session.get(url, params=params or {}, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()

    # Scoreboard
    def get_scoreboard(self, sport: str, date_str: str = None) -> Dict:
        """
        Get scoreboard/schedule.

        Args:
            sport: Sport key (nfl, nba, mlb, etc.)
            date_str: Date in YYYYMMDD format

        Returns:
            Scoreboard data with games
        """
        params = {}
        if date_str:
            params["dates"] = date_str
        return self._get(sport, "scoreboard", params)

    def get_scores_df(self, sport: str, date_str: str = None) -> pd.DataFrame:
        """
        Get scoreboard as DataFrame.

        Events with fewer than two competitors are skipped. Scores that are
        missing, empty or non-numeric are reported as 0.

        Returns:
            One row per game with columns game_id, date, status, home_team,
            home_score, away_team, away_score, venue
        """
        data = self.get_scoreboard(sport, date_str)
        games = []
        for event in data.get("events", []):
            # `or [{}]` also covers an empty competitions list, which
            # would otherwise raise IndexError on [0].
            competition = (event.get("competitions") or [{}])[0]
            competitors = competition.get("competitors", [])
            if len(competitors) < 2:
                continue
            # Prefer the explicit homeAway flag; fall back to list position.
            home = next((c for c in competitors if c.get("homeAway") == "home"), competitors[0])
            away = next((c for c in competitors if c.get("homeAway") == "away"), competitors[1])
            games.append({
                "game_id": event.get("id"),
                "date": event.get("date"),
                "status": event.get("status", {}).get("type", {}).get("description"),
                "home_team": home.get("team", {}).get("displayName"),
                "home_score": self._to_int(home.get("score", 0)),
                "away_team": away.get("team", {}).get("displayName"),
                "away_score": self._to_int(away.get("score", 0)),
                "venue": competition.get("venue", {}).get("fullName")
            })
        return pd.DataFrame(games)

    # Standings
    def get_standings(self, sport: str, season: int = None) -> Dict:
        """Get league standings, optionally for a specific ``season``."""
        params = {}
        if season:
            params["season"] = season
        return self._get(sport, "standings", params)

    def get_standings_df(self, sport: str, season: int = None) -> pd.DataFrame:
        """
        Get standings as DataFrame.

        Args:
            sport: Sport key
            season: Optional season, forwarded to ``get_standings``

        Returns:
            One row per team with columns team, wins, losses, pct, gb,
            division, where division is the name of the response group
            the team was listed under
        """
        data = self.get_standings(sport, season)
        teams = []
        for group in data.get("children", []):
            for team_entry in group.get("standings", {}).get("entries", []):
                team = team_entry.get("team", {})
                # Entries may lack "name" or "value"; .get avoids a
                # KeyError on such entries.
                stats = {
                    s.get("name"): s.get("value")
                    for s in team_entry.get("stats", [])
                    if s.get("name") is not None
                }
                teams.append({
                    "team": team.get("displayName"),
                    "wins": self._to_int(stats.get("wins", 0)),
                    "losses": self._to_int(stats.get("losses", 0)),
                    "pct": self._to_float(stats.get("winPercent", 0)),
                    "gb": stats.get("gamesBehind", "-"),
                    "division": group.get("name", "")
                })
        return pd.DataFrame(teams)

    # Teams
    def get_teams(self, sport: str) -> List[Dict]:
        """
        Get all teams for a sport.

        Returns:
            A list of dicts with id, name, abbreviation, location, color
            and logo (href of the first logo, or None); empty when the
            response has no sports/leagues entries
        """
        data = self._get(sport, "teams")
        # Guard each level: an empty list would raise IndexError on [0].
        first_sport = (data.get("sports") or [{}])[0]
        first_league = (first_sport.get("leagues") or [{}])[0]
        teams = []
        for group in first_league.get("teams", []):
            team = group.get("team", {})
            logos = team.get("logos") or []
            teams.append({
                "id": team.get("id"),
                "name": team.get("displayName"),
                "abbreviation": team.get("abbreviation"),
                "location": team.get("location"),
                "color": team.get("color"),
                "logo": logos[0].get("href") if logos else None
            })
        return teams

    def get_team_info(self, sport: str, team_id: str) -> Dict:
        """Get detailed team information (raw response of ``teams/{team_id}``)."""
        return self._get(sport, f"teams/{team_id}")

    def get_team_roster(self, sport: str, team_id: str) -> pd.DataFrame:
        """
        Get team roster.

        Returns:
            One row per athlete found in the ``items`` of each ``athletes``
            group, with columns id, name, position, jersey, height, weight, age
        """
        data = self._get(sport, f"teams/{team_id}/roster")
        roster = []
        for group in data.get("athletes", []):
            for athlete in group.get("items", []):
                roster.append({
                    "id": athlete.get("id"),
                    "name": athlete.get("displayName"),
                    "position": athlete.get("position", {}).get("abbreviation"),
                    "jersey": athlete.get("jersey"),
                    "height": athlete.get("displayHeight"),
                    "weight": athlete.get("displayWeight"),
                    "age": athlete.get("age")
                })
        return pd.DataFrame(roster)

    # News
    def get_news(self, sport: str, limit: int = 10) -> List[Dict]:
        """
        Get latest news for a sport.

        Args:
            sport: Sport key
            limit: Sent as the ``limit`` query parameter

        Returns:
            A list of dicts with headline, description, published and link
        """
        data = self._get(sport, "news", {"limit": limit})
        return [
            {
                "headline": article.get("headline"),
                "description": article.get("description"),
                "published": article.get("published"),
                "link": article.get("links", {}).get("web", {}).get("href")
            }
            for article in data.get("articles", [])
        ]
# Example usage
if __name__ == "__main__":
    espn = ESPNAPI()

    # Current NFL scoreboard as a table.
    nfl_scores = espn.get_scores_df(sport="nfl")
    print("NFL Scores:")
    print(nfl_scores)

    # First ten rows of the NBA standings table.
    nba_standings = espn.get_standings_df(sport="nba")
    print("\nNBA Standings:")
    standings_preview = nba_standings.head(10)
    print(standings_preview)

    # Number of MLB teams returned.
    mlb_teams = espn.get_teams(sport="mlb")
    team_count = len(mlb_teams)
    print(f"\nMLB Teams: {team_count}")
Football-Data.org API
Access European soccer data including Premier League, La Liga, Bundesliga, and more.
"""Football-Data.org API client for European soccer."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date, datetime
class FootballDataAPI:
    """
    Client for Football-Data.org API.

    Free tier includes:
    - Premier League
    - La Liga
    - Bundesliga
    - Serie A
    - Ligue 1
    - Champions League

    Competition arguments are looked up in ``COMPETITIONS``; unknown values
    are passed through unchanged, so raw codes such as "PL" also work.
    """

    BASE_URL = "https://api.football-data.org/v4"
    COMPETITIONS = {
        "premier_league": "PL",
        "la_liga": "PD",
        "bundesliga": "BL1",
        "serie_a": "SA",
        "ligue_1": "FL1",
        "champions_league": "CL",
        "world_cup": "WC"
    }
    # Seconds to wait for a response; requests has no default timeout.
    TIMEOUT = 30

    def __init__(self, api_key: str):
        """
        Initialize with API key.

        Get free key at: https://www.football-data.org/client/register
        """
        self.session = requests.Session()
        self.session.headers.update({"X-Auth-Token": api_key})

    @staticmethod
    def _full_time_score(match: Dict) -> Dict:
        """Return the match's ``score.fullTime`` dict, or {} when either level is missing/null."""
        return (match.get("score") or {}).get("fullTime") or {}

    def _get(self, endpoint: str, params: Dict = None) -> Dict:
        """
        Make API request.

        Returns:
            The decoded JSON body

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        url = f"{self.BASE_URL}/{endpoint}"
        response = self.session.get(url, params=params or {}, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()

    # Competitions
    def get_competitions(self) -> List[Dict]:
        """Get all available competitions (the response's ``competitions`` list)."""
        data = self._get("competitions")
        return data.get("competitions", [])

    def get_competition(self, code: str) -> Dict:
        """Get competition details (raw response of ``competitions/{code}``)."""
        return self._get(f"competitions/{code}")

    # Standings
    def get_standings(self, competition: str) -> pd.DataFrame:
        """
        Get league standings.

        Only tables whose ``type`` is "TOTAL" are included.

        Returns:
            One row per team with position, team, played, won, draw, lost,
            goals_for, goals_against, goal_diff, points
        """
        code = self.COMPETITIONS.get(competition, competition)
        data = self._get(f"competitions/{code}/standings")
        standings = []
        for table in data.get("standings", []):
            if table.get("type") != "TOTAL":
                continue
            for entry in table.get("table", []):
                standings.append({
                    "position": entry.get("position"),
                    "team": (entry.get("team") or {}).get("name"),
                    "played": entry.get("playedGames"),
                    "won": entry.get("won"),
                    "draw": entry.get("draw"),
                    "lost": entry.get("lost"),
                    "goals_for": entry.get("goalsFor"),
                    "goals_against": entry.get("goalsAgainst"),
                    "goal_diff": entry.get("goalDifference"),
                    "points": entry.get("points")
                })
        return pd.DataFrame(standings)

    # Matches
    def get_matches(
        self,
        competition: str = None,
        date_from: date = None,
        date_to: date = None,
        status: str = None
    ) -> pd.DataFrame:
        """
        Get matches.

        Args:
            competition: Competition code or name
            date_from: Start date
            date_to: End date
            status: SCHEDULED, LIVE, IN_PLAY, PAUSED, FINISHED

        Returns:
            One row per match; home_score/away_score are None when the
            response carries no full-time score
        """
        params = {}
        if date_from:
            params["dateFrom"] = date_from.strftime("%Y-%m-%d")
        if date_to:
            params["dateTo"] = date_to.strftime("%Y-%m-%d")
        if status:
            params["status"] = status
        if competition:
            code = self.COMPETITIONS.get(competition, competition)
            data = self._get(f"competitions/{code}/matches", params)
        else:
            data = self._get("matches", params)
        matches = []
        for match in data.get("matches", []):
            full_time = self._full_time_score(match)
            matches.append({
                "id": match.get("id"),
                "competition": (match.get("competition") or {}).get("name"),
                "date": match.get("utcDate"),
                "status": match.get("status"),
                "home_team": (match.get("homeTeam") or {}).get("name"),
                "away_team": (match.get("awayTeam") or {}).get("name"),
                "home_score": full_time.get("home"),
                "away_score": full_time.get("away"),
                "matchday": match.get("matchday")
            })
        return pd.DataFrame(matches)

    # Teams
    def get_teams(self, competition: str) -> pd.DataFrame:
        """
        Get teams in a competition.

        Returns:
            One row per team with id, name, short_name, tla, venue, founded
            and coach (None when the team has no coach object)
        """
        code = self.COMPETITIONS.get(competition, competition)
        data = self._get(f"competitions/{code}/teams")
        teams = []
        for team in data.get("teams", []):
            teams.append({
                "id": team.get("id"),
                "name": team.get("name"),
                "short_name": team.get("shortName"),
                "tla": team.get("tla"),
                "venue": team.get("venue"),
                "founded": team.get("founded"),
                # `or {}` also handles an explicit null coach.
                "coach": (team.get("coach") or {}).get("name")
            })
        return pd.DataFrame(teams)

    def get_team_matches(self, team_id: int, status: str = None) -> pd.DataFrame:
        """
        Get matches for a specific team.

        Args:
            team_id: Team ID inserted into ``teams/{team_id}/matches``
            status: Optional status filter

        Returns:
            One row per match with date, competition, home_team, away_team
            and score formatted as "home-away"; a side without a full-time
            score is shown as "-"
        """
        params = {}
        if status:
            params["status"] = status
        data = self._get(f"teams/{team_id}/matches", params)
        matches = []
        for match in data.get("matches", []):
            full_time = self._full_time_score(match)
            home = full_time.get("home")
            away = full_time.get("away")
            # A dict.get default only applies to missing keys; explicit
            # nulls need a None check to avoid "None-None".
            home_text = "-" if home is None else home
            away_text = "-" if away is None else away
            matches.append({
                "date": match.get("utcDate"),
                "competition": (match.get("competition") or {}).get("name"),
                "home_team": (match.get("homeTeam") or {}).get("name"),
                "away_team": (match.get("awayTeam") or {}).get("name"),
                "score": f"{home_text}-{away_text}"
            })
        return pd.DataFrame(matches)

    # Scorers
    def get_scorers(self, competition: str, limit: int = 10) -> pd.DataFrame:
        """
        Get top scorers.

        Args:
            competition: Competition code or name
            limit: Sent as the ``limit`` query parameter

        Returns:
            One row per scorer with player, team, goals, assists, penalties,
            nationality
        """
        code = self.COMPETITIONS.get(competition, competition)
        data = self._get(f"competitions/{code}/scorers", {"limit": limit})
        scorers = []
        for scorer in data.get("scorers", []):
            player = scorer.get("player") or {}
            team = scorer.get("team") or {}
            scorers.append({
                "player": player.get("name"),
                "team": team.get("name"),
                "goals": scorer.get("goals"),
                "assists": scorer.get("assists"),
                "penalties": scorer.get("penalties"),
                "nationality": player.get("nationality")
            })
        return pd.DataFrame(scorers)
# Example usage
if __name__ == "__main__":
    # Get API key from https://www.football-data.org/
    api = FootballDataAPI(api_key="YOUR_API_KEY")

    # Full Premier League table.
    standings = api.get_standings(competition="premier_league")
    print("Premier League Standings:")
    print(standings)

    # Up to twenty top scorers.
    scorers = api.get_scorers(competition="premier_league", limit=20)
    print("\nTop Scorers:")
    print(scorers)
Create Sports Database Schema (PostgreSQL)
Complete PostgreSQL schema for storing sports statistics with players, teams, games, and performance data.
-- Sports Analytics Database Schema (PostgreSQL)
-- Enable extensions
-- uuid-ossp provides uuid_generate_v4(), which every table below uses as
-- the DEFAULT for its UUID primary key. IF NOT EXISTS makes this re-runnable.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- Teams table
-- One row per team. Only name and sport are mandatory.
-- UNIQUE(abbreviation, sport) allows the same abbreviation to be reused
-- across different sports but not within one sport.
CREATE TABLE teams (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
name VARCHAR(100) NOT NULL,
abbreviation VARCHAR(10),
city VARCHAR(100),
conference VARCHAR(50),
division VARCHAR(50),
sport VARCHAR(50) NOT NULL,
active BOOLEAN DEFAULT true,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(abbreviation, sport)
);
-- Players table
-- One row per player. team_id is nullable, so a player can be stored
-- without a team. external_id is not declared UNIQUE; the index on it
-- below only speeds up lookups.
CREATE TABLE players (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
first_name VARCHAR(100) NOT NULL,
last_name VARCHAR(100) NOT NULL,
team_id UUID REFERENCES teams(id),
position VARCHAR(50),
jersey_number INTEGER,
birth_date DATE,
height_inches INTEGER,
weight_lbs INTEGER,
bats VARCHAR(10), -- Baseball specific
throws VARCHAR(10), -- Baseball specific
active BOOLEAN DEFAULT true,
external_id VARCHAR(50), -- ID from external APIs
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Lookup indexes: by team, by (last_name, first_name), and by external ID.
CREATE INDEX idx_players_team ON players(team_id);
CREATE INDEX idx_players_name ON players(last_name, first_name);
CREATE INDEX idx_players_external ON players(external_id);
-- Seasons table
-- One row per (sport, year, type); the UNIQUE constraint lets a regular
-- season and a postseason of the same year coexist as separate rows.
CREATE TABLE seasons (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
sport VARCHAR(50) NOT NULL,
year INTEGER NOT NULL,
start_date DATE,
end_date DATE,
type VARCHAR(20) DEFAULT 'regular', -- regular, postseason
UNIQUE(sport, year, type)
);
-- Games table
-- One row per game. Both team references are mandatory; season_id and the
-- scores are nullable. status defaults to 'scheduled'.
-- NOTE(review): nothing prevents home_team_id = away_team_id; confirm
-- whether a CHECK constraint is wanted.
CREATE TABLE games (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
season_id UUID REFERENCES seasons(id),
home_team_id UUID REFERENCES teams(id) NOT NULL,
away_team_id UUID REFERENCES teams(id) NOT NULL,
game_date DATE NOT NULL,
game_time TIME,
venue VARCHAR(200),
home_score INTEGER,
away_score INTEGER,
status VARCHAR(20) DEFAULT 'scheduled',
external_id VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Lookup indexes: by date, by (home, away) team pair, and by season.
CREATE INDEX idx_games_date ON games(game_date);
CREATE INDEX idx_games_teams ON games(home_team_id, away_team_id);
CREATE INDEX idx_games_season ON games(season_id);
-- Player game stats (normalized design)
-- One row per (player, game, stat_type); the stat values themselves live
-- in the JSONB column so different stat types can share one table.
CREATE TABLE player_game_stats (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
player_id UUID REFERENCES players(id) NOT NULL,
game_id UUID REFERENCES games(id) NOT NULL,
stat_type VARCHAR(50) NOT NULL, -- e.g., 'batting', 'pitching', 'passing'
stats JSONB NOT NULL, -- Flexible JSON for different stat types
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(player_id, game_id, stat_type)
);
-- B-tree indexes for the three filter columns, plus a GIN index on the
-- JSONB payload.
CREATE INDEX idx_player_stats_player ON player_game_stats(player_id);
CREATE INDEX idx_player_stats_game ON player_game_stats(game_id);
CREATE INDEX idx_player_stats_type ON player_game_stats(stat_type);
CREATE INDEX idx_player_stats_json ON player_game_stats USING GIN(stats);
-- Season aggregates for faster queries
-- One row per (player, season, stat_type) holding pre-aggregated totals.
-- NOTE(review): this table has updated_at, but none of the CREATE TRIGGER
-- statements in this schema target it, so the column is not refreshed
-- automatically on UPDATE.
CREATE TABLE player_season_stats (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
player_id UUID REFERENCES players(id) NOT NULL,
season_id UUID REFERENCES seasons(id) NOT NULL,
stat_type VARCHAR(50) NOT NULL,
games_played INTEGER DEFAULT 0,
stats JSONB NOT NULL,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(player_id, season_id, stat_type)
);
-- Trigger to update timestamps
-- Row-level trigger function: stamps updated_at with the current
-- transaction time on every UPDATE of a table it is attached to.
CREATE OR REPLACE FUNCTION update_timestamp()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Attach the function to every table that has an updated_at column.
CREATE TRIGGER update_teams_timestamp BEFORE UPDATE ON teams
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();
CREATE TRIGGER update_players_timestamp BEFORE UPDATE ON players
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();
CREATE TRIGGER update_games_timestamp BEFORE UPDATE ON games
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();
-- player_season_stats also carries updated_at; without this trigger the
-- column keeps its insert-time value after an update.
CREATE TRIGGER update_player_season_stats_timestamp BEFORE UPDATE ON player_season_stats
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();
SQLAlchemy ORM Models for Sports Data
Python SQLAlchemy ORM models for sports analytics database with relationships and query methods.
"""SQLAlchemy ORM models for sports analytics database."""
from datetime import date, datetime
from typing import List, Optional, Dict, Any
import uuid

from sqlalchemy import (
    create_engine, Column, Integer, String, Float, Date, DateTime, Time,
    ForeignKey, Boolean, JSON, Index, UniqueConstraint
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker, Session
from sqlalchemy.dialects.postgresql import UUID
# Shared declarative base: every model class below subclasses it, and
# Base.metadata is what the example script passes to create_all().
Base = declarative_base()
class Team(Base):
    """
    ORM mapping for the ``teams`` table.

    A team owns its players and is linked to games twice: once as the home
    side and once as the away side.
    """

    __tablename__ = 'teams'
    # An abbreviation may repeat across sports but not within one sport.
    __table_args__ = (
        UniqueConstraint('abbreviation', 'sport', name='uq_team_abbr_sport'),
    )

    # Identity and descriptive columns
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(100), nullable=False)
    abbreviation = Column(String(10))
    city = Column(String(100))
    conference = Column(String(50))
    division = Column(String(50))
    sport = Column(String(50), nullable=False)
    active = Column(Boolean, default=True)

    # Audit timestamps, set on the Python side at insert/update time
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationships; the two Game links must name their foreign key because
    # Game references teams.id twice.
    players = relationship("Player", back_populates="team")
    home_games = relationship(
        "Game", foreign_keys="Game.home_team_id", back_populates="home_team"
    )
    away_games = relationship(
        "Game", foreign_keys="Game.away_team_id", back_populates="away_team"
    )
class Player(Base):
    """
    Player model, mapped to the ``players`` table.

    Includes the baseball-specific ``bats``/``throws`` columns so the model
    matches the SQL schema's ``players`` definition.
    """

    __tablename__ = 'players'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    first_name = Column(String(100), nullable=False)
    last_name = Column(String(100), nullable=False)
    team_id = Column(UUID(as_uuid=True), ForeignKey('teams.id'))
    position = Column(String(50))
    jersey_number = Column(Integer)
    birth_date = Column(Date)
    height_inches = Column(Integer)
    weight_lbs = Column(Integer)
    bats = Column(String(10))  # Baseball specific
    throws = Column(String(10))  # Baseball specific
    active = Column(Boolean, default=True)
    external_id = Column(String(50))  # ID from external APIs
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    team = relationship("Team", back_populates="players")
    game_stats = relationship("PlayerGameStats", back_populates="player")
    season_stats = relationship("PlayerSeasonStats", back_populates="player")

    @property
    def full_name(self) -> str:
        """First and last name joined by a single space."""
        return f"{self.first_name} {self.last_name}"

    @property
    def height_formatted(self) -> str:
        """Height as feet'inches" (e.g. 6'7"), or '' when height is unset or 0."""
        if self.height_inches:
            feet, inches = divmod(self.height_inches, 12)
            return f"{feet}'{inches}\""
        return ""

    __table_args__ = (
        Index('idx_players_team', 'team_id'),
        Index('idx_players_name', 'last_name', 'first_name'),
        Index('idx_players_external', 'external_id'),
    )
class Season(Base):
    """
    ORM mapping for the ``seasons`` table.

    A season is identified by the combination of sport, year and type, and
    groups both games and per-player season aggregates.
    """

    __tablename__ = 'seasons'
    # Only one row may exist per (sport, year, type).
    __table_args__ = (
        UniqueConstraint('sport', 'year', 'type', name='uq_season'),
    )

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    sport = Column(String(50), nullable=False)
    year = Column(Integer, nullable=False)
    start_date = Column(Date)
    end_date = Column(Date)
    # Defaults to 'regular' when not supplied.
    type = Column(String(20), default='regular')

    games = relationship("Game", back_populates="season")
    player_stats = relationship("PlayerSeasonStats", back_populates="season")
class Game(Base):
    """
    Game model, mapped to the ``games`` table.

    Carries the same columns and indexes as the SQL schema's ``games``
    definition, including ``game_time`` and the audit timestamps.
    """

    __tablename__ = 'games'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    season_id = Column(UUID(as_uuid=True), ForeignKey('seasons.id'))
    home_team_id = Column(UUID(as_uuid=True), ForeignKey('teams.id'), nullable=False)
    away_team_id = Column(UUID(as_uuid=True), ForeignKey('teams.id'), nullable=False)
    game_date = Column(Date, nullable=False)
    game_time = Column(Time)
    venue = Column(String(200))
    home_score = Column(Integer)
    away_score = Column(Integer)
    status = Column(String(20), default='scheduled')
    external_id = Column(String(50))
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    season = relationship("Season", back_populates="games")
    # Two foreign keys point at teams.id, so each relationship names its own.
    home_team = relationship("Team", foreign_keys=[home_team_id], back_populates="home_games")
    away_team = relationship("Team", foreign_keys=[away_team_id], back_populates="away_games")
    player_stats = relationship("PlayerGameStats", back_populates="game")

    @property
    def winner(self) -> Optional["Team"]:
        """The team with the higher score; None for a tie or when a score is missing."""
        if self.home_score is not None and self.away_score is not None:
            if self.home_score > self.away_score:
                return self.home_team
            elif self.away_score > self.home_score:
                return self.away_team
        return None

    __table_args__ = (
        Index('idx_games_date', 'game_date'),
        Index('idx_games_teams', 'home_team_id', 'away_team_id'),
        Index('idx_games_season', 'season_id'),
    )
class PlayerGameStats(Base):
    """
    Player game statistics with JSON stats.

    One row per (player, game, stat_type). Declares the same b-tree indexes
    as the SQL schema's ``player_game_stats`` definition.
    """

    __tablename__ = 'player_game_stats'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    player_id = Column(UUID(as_uuid=True), ForeignKey('players.id'), nullable=False)
    game_id = Column(UUID(as_uuid=True), ForeignKey('games.id'), nullable=False)
    stat_type = Column(String(50), nullable=False)
    # NOTE(review): the SQL schema declares this column as JSONB with a GIN
    # index; the generic JSON type is kept here - confirm which is intended.
    stats = Column(JSON, nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow)

    player = relationship("Player", back_populates="game_stats")
    game = relationship("Game", back_populates="player_stats")

    __table_args__ = (
        UniqueConstraint('player_id', 'game_id', 'stat_type', name='uq_player_game_stat'),
        Index('idx_player_stats_player', 'player_id'),
        Index('idx_player_stats_game', 'game_id'),
        Index('idx_player_stats_type', 'stat_type'),
    )
class PlayerSeasonStats(Base):
    """
    Aggregated season statistics.

    One row per (player, season, stat_type); the unique constraint mirrors
    the SQL schema's ``player_season_stats`` definition, so tables created
    from this model reject duplicate aggregates as well.
    """

    __tablename__ = 'player_season_stats'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    player_id = Column(UUID(as_uuid=True), ForeignKey('players.id'), nullable=False)
    season_id = Column(UUID(as_uuid=True), ForeignKey('seasons.id'), nullable=False)
    stat_type = Column(String(50), nullable=False)
    games_played = Column(Integer, default=0)
    stats = Column(JSON, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    player = relationship("Player", back_populates="season_stats")
    season = relationship("Season", back_populates="player_stats")

    __table_args__ = (
        UniqueConstraint('player_id', 'season_id', 'stat_type', name='uq_player_season_stat'),
    )
# Database connection and session management
def get_engine(database_url: str):
    """
    Build a SQLAlchemy engine.

    Args:
        database_url: Connection URL handed to ``create_engine``

    Returns:
        The engine returned by ``create_engine``
    """
    engine = create_engine(database_url)
    return engine
def get_session(engine) -> Session:
    """
    Open a new ORM session bound to ``engine``.

    A fresh ``sessionmaker`` is created on every call.

    Args:
        engine: SQLAlchemy engine the session should use

    Returns:
        A new session instance
    """
    factory = sessionmaker(bind=engine)
    session = factory()
    return session
# Example usage
if __name__ == "__main__":
    DATABASE_URL = "postgresql://user:password@localhost/sports_db"
    engine = get_engine(DATABASE_URL)

    # Create a table for every model registered on Base.
    metadata = Base.metadata
    metadata.create_all(engine)

    # Insert one sample team; the session is closed when the block exits.
    with get_session(engine) as session:
        yankees = Team(name="New York Yankees", abbreviation="NYY", sport="baseball")
        session.add(yankees)
        session.commit()
Bulk Insert Sports Data with pandas
Efficiently bulk insert sports statistics from pandas DataFrames into a database using SQLAlchemy.
"""Bulk insert sports data from pandas to database."""
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session
from typing import Optional
import uuid
def bulk_insert_players(
    df: pd.DataFrame,
    engine,
    team_mapping: dict,
    chunk_size: int = 1000
) -> int:
    """
    Bulk insert player data from DataFrame.

    The input frame is copied, never modified. A full-name column
    (``player_name`` or ``name``) is split on the first space into
    ``first_name``/``last_name``; single-word names get an empty last name.

    Args:
        df: DataFrame with player data
        engine: SQLAlchemy engine
        team_mapping: Dict mapping team names to IDs
        chunk_size: Records per chunk

    Returns:
        Number of rows inserted
    """
    # Prepare data
    df = df.copy()
    # Map team names to IDs (teams missing from the mapping become NaN)
    if 'team' in df.columns:
        df['team_id'] = df['team'].map(team_mapping)
    # Generate UUIDs
    df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]
    # Split the full name BEFORE renaming columns. Renaming 'player_name'
    # to 'last_name' first would hide the column from this step, leaving
    # the whole name in last_name and first_name unset.
    name_col = next((c for c in ('player_name', 'name') if c in df.columns), None)
    if name_col is not None:
        names = df[name_col].str.split(' ', n=1, expand=True)
        df['first_name'] = names[0] if names.shape[1] > 0 else ''
        # Rows without a second token yield None; store '' instead.
        df['last_name'] = names[1].fillna('') if names.shape[1] > 1 else ''
    # Rename source columns to match schema
    column_mapping = {
        'pos': 'position',
        'number': 'jersey_number',
        'ht': 'height_inches',
        'wt': 'weight_lbs'
    }
    df = df.rename(columns=column_mapping)
    # Select only valid columns
    valid_columns = ['id', 'first_name', 'last_name', 'team_id', 'position',
                     'jersey_number', 'height_inches', 'weight_lbs', 'external_id']
    df = df[[c for c in valid_columns if c in df.columns]]
    # Bulk insert
    total_inserted = 0
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size]
        chunk.to_sql('players', engine, if_exists='append', index=False, method='multi')
        total_inserted += len(chunk)
        print(f"Inserted {total_inserted}/{len(df)} players")
    return total_inserted
def bulk_insert_game_stats(
    df: pd.DataFrame,
    engine,
    player_mapping: dict,
    game_mapping: dict,
    stat_type: str = 'batting',
    chunk_size: int = 500
) -> int:
    """
    Append per-game stat rows to ``player_game_stats``.

    Every column other than the key columns is packed into a single JSON
    string (``stats``); NaN values are left out of that JSON. Rows whose
    player or game could not be mapped are dropped before inserting.

    Args:
        df: DataFrame with game statistics; must contain ``player_id`` and
            ``game_id`` columns holding the source identifiers
        engine: SQLAlchemy engine
        player_mapping: Dict mapping player names/IDs to database IDs
        game_mapping: Dict mapping game identifiers to database IDs
        stat_type: Type of statistics
        chunk_size: Records per chunk

    Returns:
        Number of rows inserted
    """
    import json

    def _row_to_json(row):
        # Drop missing values so they do not appear as NaN in the JSON text.
        return json.dumps(row.dropna().to_dict())

    frame = df.copy()

    # Translate source identifiers to database IDs (misses become NaN).
    frame['player_id'] = frame['player_id'].map(player_mapping)
    frame['game_id'] = frame['game_id'].map(game_mapping)

    frame['id'] = [str(uuid.uuid4()) for _ in range(len(frame))]
    frame['stat_type'] = stat_type

    # Everything that is not a key column is treated as a statistic.
    key_columns = ['id', 'player_id', 'game_id', 'stat_type']
    stat_columns = [col for col in frame.columns if col not in key_columns]
    frame['stats'] = frame[stat_columns].apply(_row_to_json, axis=1)

    records = frame[key_columns + ['stats']].dropna(subset=['player_id', 'game_id'])

    total = len(records)
    inserted = 0
    for start in range(0, total, chunk_size):
        batch = records.iloc[start:start + chunk_size]
        batch.to_sql('player_game_stats', engine, if_exists='append', index=False)
        inserted += len(batch)
        print(f"Inserted {inserted}/{total} stat records")
    return inserted
def upsert_season_aggregates(
    engine,
    season_id: str,
    stat_type: str = 'batting'
):
    """
    Roll per-game stats up into per-season rows with one SQL upsert.

    For each player with rows in ``player_game_stats`` for games of the
    given season and stat type, one ``player_season_stats`` row is inserted;
    on a (player_id, season_id, stat_type) conflict the existing row's
    ``games_played``, ``stats`` and ``updated_at`` are overwritten.
    The statement relies on ``gen_random_uuid()`` and the ``jsonb_*``
    functions, so it targets PostgreSQL.

    Args:
        engine: SQLAlchemy engine
        season_id: Season ID bound to ``:season_id``
        stat_type: Stat type bound to ``:stat_type``

    Returns:
        ``rowcount`` reported by the executed statement
    """
    bind_values = {"season_id": season_id, "stat_type": stat_type}
    statement = text("""
INSERT INTO player_season_stats (id, player_id, season_id, stat_type, games_played, stats)
SELECT
gen_random_uuid() as id,
pgs.player_id,
:season_id as season_id,
:stat_type as stat_type,
COUNT(*) as games_played,
jsonb_build_object(
'games', COUNT(*),
'total_stats', jsonb_agg(pgs.stats)
) as stats
FROM player_game_stats pgs
JOIN games g ON pgs.game_id = g.id
WHERE g.season_id = :season_id AND pgs.stat_type = :stat_type
GROUP BY pgs.player_id
ON CONFLICT (player_id, season_id, stat_type)
DO UPDATE SET
games_played = EXCLUDED.games_played,
stats = EXCLUDED.stats,
updated_at = CURRENT_TIMESTAMP
""")
    # engine.begin() commits on success and rolls back if execute raises.
    with engine.begin() as connection:
        affected = connection.execute(statement, bind_values).rowcount
    return affected
# Example usage: fetch a season of batting data and prepare the inputs
# needed by the bulk-insert helpers.
if __name__ == "__main__":
    import pybaseball as pyb

    # Create team mapping (simplified; real values are the teams' DB IDs)
    team_mapping = {'NYY': 'uuid-here', 'BOS': 'uuid-here'}

    DATABASE_URL = "postgresql://user:password@localhost/sports_db"
    engine = create_engine(DATABASE_URL)

    # Get batting data
    batting = pyb.batting_stats(2024)

    # Insert (would need proper player/game mappings first)
    # inserted = bulk_insert_game_stats(batting, engine, player_map, game_map)
Query Sports Stats with pandas read_sql
Efficient SQL queries for sports analytics using pandas and SQLAlchemy with parameterized queries.
"""Query sports statistics database with pandas."""
import pandas as pd
from sqlalchemy import create_engine, text
from typing import Optional, List
class SportsStatsQuery:
    """Query builder for sports statistics.

    Every method runs one parameterized SQL query through
    ``pandas.read_sql`` and returns the result as a DataFrame.
    """

    def __init__(self, database_url: str):
        """Create the SQLAlchemy engine used by all queries."""
        self.engine = create_engine(database_url)

    @staticmethod
    def _expand_stats(df: pd.DataFrame) -> pd.DataFrame:
        """Replace the ``stats`` column with one column per key in it.

        The frame is returned unchanged when it has no ``stats`` column or
        no rows.
        """
        if 'stats' not in df.columns or len(df) == 0:
            return df
        stats_df = pd.json_normalize(df['stats'])
        return pd.concat([df.drop('stats', axis=1), stats_df], axis=1)

    def get_player_season_stats(
        self,
        season_year: int,
        sport: str,
        stat_type: str = 'batting',
        min_games: int = 0
    ) -> pd.DataFrame:
        """
        Get player statistics for a season.

        Args:
            season_year: Value matched against ``seasons.year``
            sport: Value matched against ``seasons.sport``
            stat_type: Value matched against ``player_season_stats.stat_type``
            min_games: Minimum ``games_played`` (inclusive)

        Returns:
            One row per player season, most games first, with the ``stats``
            column expanded into individual columns.
        """
        query = """
SELECT
p.first_name,
p.last_name,
t.name as team,
p.position,
pss.games_played,
pss.stats
FROM player_season_stats pss
JOIN players p ON pss.player_id = p.id
JOIN teams t ON p.team_id = t.id
JOIN seasons s ON pss.season_id = s.id
WHERE s.year = :year
AND s.sport = :sport
AND pss.stat_type = :stat_type
AND pss.games_played >= :min_games
ORDER BY pss.games_played DESC
"""
        df = pd.read_sql(
            text(query),
            self.engine,
            params={
                'year': season_year,
                'sport': sport,
                'stat_type': stat_type,
                'min_games': min_games,
            },
        )
        return self._expand_stats(df)

    def get_team_standings(
        self,
        season_year: int,
        sport: str,
        division: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Calculate team standings from game results.

        Only games with ``status = 'final'`` are counted. Tied games count
        as neither a win nor a loss.

        Args:
            season_year: Value matched against ``seasons.year``
            sport: Value matched against ``seasons.sport``
            division: If given, keep only teams in this division (the
                filter is applied to the returned frame, not in SQL)

        Returns:
            One row per team ordered by winning percentage, descending.
        """
        query = """
WITH game_results AS (
SELECT
t.id as team_id,
t.name as team,
t.division,
t.conference,
CASE
WHEN g.home_team_id = t.id AND g.home_score > g.away_score THEN 1
WHEN g.away_team_id = t.id AND g.away_score > g.home_score THEN 1
ELSE 0
END as win,
CASE
WHEN g.home_team_id = t.id AND g.home_score < g.away_score THEN 1
WHEN g.away_team_id = t.id AND g.away_score < g.home_score THEN 1
ELSE 0
END as loss,
CASE WHEN g.home_team_id = t.id THEN g.home_score ELSE g.away_score END as runs_scored,
CASE WHEN g.home_team_id = t.id THEN g.away_score ELSE g.home_score END as runs_allowed
FROM teams t
JOIN games g ON t.id = g.home_team_id OR t.id = g.away_team_id
JOIN seasons s ON g.season_id = s.id
WHERE s.year = :year
AND s.sport = :sport
AND g.status = 'final'
)
SELECT
team,
division,
conference,
SUM(win) as wins,
SUM(loss) as losses,
ROUND(SUM(win)::numeric / NULLIF(SUM(win) + SUM(loss), 0), 3) as win_pct,
SUM(runs_scored) as runs_scored,
SUM(runs_allowed) as runs_allowed,
SUM(runs_scored) - SUM(runs_allowed) as run_diff
FROM game_results
GROUP BY team_id, team, division, conference
ORDER BY win_pct DESC
"""
        df = pd.read_sql(
            text(query),
            self.engine,
            params={'year': season_year, 'sport': sport},
        )
        if division:
            df = df[df['division'] == division]
        return df

    def get_player_game_log(
        self,
        player_name: str,
        season_year: int,
        stat_type: str = 'batting'
    ) -> pd.DataFrame:
        """
        Get player's game-by-game statistics.

        Args:
            player_name: Case-insensitive substring matched against
                "first_name last_name" (wrapped in ``%`` for ILIKE, so more
                than one player can match)
            season_year: Value matched against ``seasons.year``
            stat_type: Value matched against ``player_game_stats.stat_type``

        Returns:
            One row per game in date order with the ``stats`` column
            expanded. Home/away and opponent are derived from the team the
            player is currently linked to via ``players.team_id``.
        """
        query = """
SELECT
g.game_date,
CASE
WHEN g.home_team_id = t.id THEN 'vs'
ELSE '@'
END as home_away,
opp.abbreviation as opponent,
pgs.stats
FROM player_game_stats pgs
JOIN players p ON pgs.player_id = p.id
JOIN teams t ON p.team_id = t.id
JOIN games g ON pgs.game_id = g.id
JOIN teams opp ON (
CASE
WHEN g.home_team_id = t.id THEN g.away_team_id
ELSE g.home_team_id
END = opp.id
)
JOIN seasons s ON g.season_id = s.id
WHERE CONCAT(p.first_name, ' ', p.last_name) ILIKE :player_name
AND s.year = :year
AND pgs.stat_type = :stat_type
ORDER BY g.game_date
"""
        df = pd.read_sql(
            text(query),
            self.engine,
            params={
                'player_name': f'%{player_name}%',
                'year': season_year,
                'stat_type': stat_type,
            },
        )
        return self._expand_stats(df)

    def compare_players(
        self,
        player_names: List[str],
        season_year: int,
        stats: List[str],
        stat_type: str = 'batting'
    ) -> pd.DataFrame:
        """
        Compare multiple players' statistics.

        Args:
            player_names: Exact "first_name last_name" strings to look up
            season_year: Value matched against ``seasons.year``
            stats: Stat keys to keep; keys missing from the data are ignored
            stat_type: Value matched against ``player_season_stats.stat_type``

        Returns:
            ``player``, ``team``, ``games_played`` plus the requested stats.
            When nothing matches (or ``player_names`` is empty) the frame is
            empty and still carries the raw ``stats`` column.
        """
        # An empty list would render as "IN ()", which is a SQL syntax
        # error; return the same shape a no-match query produces instead.
        if not player_names:
            return pd.DataFrame(columns=['player', 'team', 'games_played', 'stats'])

        # One named bind parameter per player keeps the query parameterized.
        placeholders = ', '.join([f':player_{i}' for i in range(len(player_names))])
        query = f"""
SELECT
CONCAT(p.first_name, ' ', p.last_name) as player,
t.name as team,
pss.games_played,
pss.stats
FROM player_season_stats pss
JOIN players p ON pss.player_id = p.id
JOIN teams t ON p.team_id = t.id
JOIN seasons s ON pss.season_id = s.id
WHERE CONCAT(p.first_name, ' ', p.last_name) IN ({placeholders})
AND s.year = :year
AND pss.stat_type = :stat_type
"""
        params = {f'player_{i}': name for i, name in enumerate(player_names)}
        params['year'] = season_year
        params['stat_type'] = stat_type
        df = pd.read_sql(text(query), self.engine, params=params)

        # Expand and select specific stats
        if 'stats' in df.columns and len(df) > 0:
            df = self._expand_stats(df)
            base_cols = ['player', 'team', 'games_played']
            available_stats = [s for s in stats if s in df.columns]
            df = df[base_cols + available_stats]
        return df
# Example usage: run three of the canned queries and print the results.
if __name__ == "__main__":
    stats_db = SportsStatsQuery("postgresql://user:password@localhost/sports_db")

    # Season batting stats for players with at least 100 games
    season_batting = stats_db.get_player_season_stats(
        2024, 'baseball', 'batting', min_games=100
    )
    print(season_batting.head())

    # Standings
    print(stats_db.get_team_standings(2024, 'baseball'))

    # Side-by-side player comparison
    players_to_compare = ['Mike Trout', 'Aaron Judge', 'Mookie Betts']
    stats_to_show = ['avg', 'hr', 'rbi', 'ops']
    print(stats_db.compare_players(players_to_compare, 2024, stats_to_show))
Redis Caching for Live Sports Data
Implement Redis caching for live sports scores and frequently accessed statistics.
"""Redis caching for live sports data."""
import redis
import json
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List
import hashlib
class SportsDataCache:
    """
    Redis cache for sports data with TTL management.

    Supports:
    - Live scores (short TTL)
    - Player stats (medium TTL)
    - Historical data (long TTL)

    Values are stored as JSON strings; getters return the decoded value or
    ``None`` when the key is missing.
    """

    def __init__(
        self,
        host: str = 'localhost',
        port: int = 6379,
        db: int = 0,
        password: Optional[str] = None
    ):
        # decode_responses=True makes the client return str instead of bytes.
        self.redis = redis.Redis(
            host=host,
            port=port,
            db=db,
            password=password,
            decode_responses=True
        )
        # TTL settings (seconds)
        self.ttl = {
            'live_scores': 30,      # 30 seconds
            'game_stats': 300,      # 5 minutes
            'player_stats': 3600,   # 1 hour
            'standings': 1800,      # 30 minutes
            'historical': 86400,    # 24 hours
        }

    def _make_key(self, prefix: str, *args) -> str:
        """Create a cache key: prefix and stringified arguments joined by ':'."""
        key_parts = [prefix] + [str(a) for a in args]
        return ':'.join(key_parts)

    def _hash_params(self, params: Dict) -> str:
        """Create an 8-hex-character digest of ``params`` for use in a key.

        Keys are sorted before hashing, so dict ordering does not matter.
        The digest is only a key shortener (not a security measure); being
        truncated, distinct parameter sets can in principle collide.
        """
        param_str = json.dumps(params, sort_keys=True)
        return hashlib.md5(param_str.encode()).hexdigest()[:8]

    # Live Scores
    def set_live_scores(self, sport: str, date: str, scores: List[Dict]):
        """Cache live scores for a sport and date."""
        key = self._make_key('live', sport, date)
        self.redis.setex(
            key,
            self.ttl['live_scores'],
            json.dumps(scores)
        )

    def get_live_scores(self, sport: str, date: str) -> Optional[List[Dict]]:
        """Get cached live scores, or None if not cached."""
        key = self._make_key('live', sport, date)
        data = self.redis.get(key)
        return json.loads(data) if data else None

    def update_single_score(self, sport: str, date: str, game_id: str, score_data: Dict):
        """Update a single game score in the cache.

        The read-modify-write runs as a WATCH/MULTI/EXEC transaction so a
        concurrent writer to the same key cannot be overwritten with stale
        data. If the game is not in the cached list (or nothing is cached
        yet) an entry ``{'game_id': game_id, **score_data}`` is added, so
        the update is never silently dropped. The key's TTL is reset.
        """
        key = self._make_key('live', sport, date)
        ttl = self.ttl['live_scores']

        def _apply(pipe):
            # While the key is WATCHed the pipeline executes immediately,
            # so this GET returns the current value.
            existing = pipe.get(key)
            scores = json.loads(existing) if existing else []
            for game in scores:
                if game.get('game_id') == game_id:
                    game.update(score_data)
                    break
            else:
                scores.append({'game_id': game_id, **score_data})
            # Commands issued after multi() are queued until EXEC.
            pipe.multi()
            pipe.setex(key, ttl, json.dumps(scores))

        # transaction() re-runs _apply if the watched key changed meanwhile.
        self.redis.transaction(_apply, key)

    # Player Stats
    def set_player_stats(
        self,
        player_id: str,
        season: int,
        stat_type: str,
        stats: Dict
    ):
        """Cache player statistics."""
        key = self._make_key('player', player_id, season, stat_type)
        self.redis.setex(
            key,
            self.ttl['player_stats'],
            json.dumps(stats)
        )

    def get_player_stats(
        self,
        player_id: str,
        season: int,
        stat_type: str
    ) -> Optional[Dict]:
        """Get cached player statistics, or None if not cached."""
        key = self._make_key('player', player_id, season, stat_type)
        data = self.redis.get(key)
        return json.loads(data) if data else None

    # Standings
    def set_standings(self, sport: str, season: int, standings: List[Dict]):
        """Cache league standings."""
        key = self._make_key('standings', sport, season)
        self.redis.setex(
            key,
            self.ttl['standings'],
            json.dumps(standings)
        )

    def get_standings(self, sport: str, season: int) -> Optional[List[Dict]]:
        """Get cached standings, or None if not cached."""
        key = self._make_key('standings', sport, season)
        data = self.redis.get(key)
        return json.loads(data) if data else None

    # Query Cache
    def cache_query_result(
        self,
        query_type: str,
        params: Dict,
        result: Any,
        ttl_type: str = 'player_stats'
    ):
        """Cache arbitrary (JSON-serializable) query result.

        ``ttl_type`` selects an entry of ``self.ttl``; unknown names fall
        back to 3600 seconds.
        """
        param_hash = self._hash_params(params)
        key = self._make_key('query', query_type, param_hash)
        self.redis.setex(
            key,
            self.ttl.get(ttl_type, 3600),
            json.dumps(result)
        )

    def get_cached_query(self, query_type: str, params: Dict) -> Optional[Any]:
        """Get cached query result, or None if not cached."""
        param_hash = self._hash_params(params)
        key = self._make_key('query', query_type, param_hash)
        data = self.redis.get(key)
        return json.loads(data) if data else None

    # Leaderboards using sorted sets
    def update_leaderboard(
        self,
        stat_name: str,
        season: int,
        player_id: str,
        value: float
    ):
        """Set a player's score in a statistical leaderboard (no TTL is set)."""
        key = self._make_key('leaderboard', stat_name, season)
        self.redis.zadd(key, {player_id: value})

    def get_leaderboard(
        self,
        stat_name: str,
        season: int,
        top_n: int = 10,
        descending: bool = True
    ) -> List[tuple]:
        """Get top N (member, score) pairs from a leaderboard."""
        key = self._make_key('leaderboard', stat_name, season)
        if descending:
            return self.redis.zrevrange(key, 0, top_n - 1, withscores=True)
        return self.redis.zrange(key, 0, top_n - 1, withscores=True)

    # Cache invalidation
    def invalidate_player(self, player_id: str):
        """Invalidate all cache entries for a player."""
        pattern = f'player:{player_id}:*'
        # SCAN walks the keyspace incrementally; KEYS would block the server
        # for the whole scan on a large database.
        keys = list(self.redis.scan_iter(match=pattern))
        if keys:
            self.redis.delete(*keys)

    def invalidate_sport_date(self, sport: str, date: str):
        """Invalidate live scores for a sport/date."""
        key = self._make_key('live', sport, date)
        self.redis.delete(key)
# Example usage: cache a day's scores, read them back, then build and
# query a home-run leaderboard.
if __name__ == "__main__":
    cache = SportsDataCache()
    game_day = '2024-06-15'

    # Cache live scores
    live_games = [
        {'game_id': '123', 'home': 'NYY', 'away': 'BOS', 'home_score': 5, 'away_score': 3},
        {'game_id': '124', 'home': 'LAD', 'away': 'SFG', 'home_score': 2, 'away_score': 2}
    ]
    cache.set_live_scores('mlb', game_day, live_games)

    # Retrieve
    print(cache.get_live_scores('mlb', game_day))

    # Update leaderboard
    for player, home_runs in (('player_1', 45), ('player_2', 42)):
        cache.update_leaderboard('hr', 2024, player, home_runs)

    # Get HR leaders
    print(cache.get_leaderboard('hr', 2024))
MongoDB Schema for Sports Events
NoSQL MongoDB schema design for storing sports events, play-by-play data, and flexible statistics.
"""MongoDB schema and operations for sports analytics."""
from pymongo import MongoClient, ASCENDING, DESCENDING
from pymongo.collection import Collection
from datetime import datetime
from typing import Dict, List, Optional, Any
from bson import ObjectId
class SportsMongoDB:
    """
    MongoDB operations for sports analytics.

    Collections:
    - games: Game metadata and scores
    - plays: Play-by-play data
    - players: Player information
    - stats: Aggregated statistics
    """

    def __init__(self, connection_string: str, db_name: str = 'sports_analytics'):
        """Connect, select the database and make sure the indexes exist."""
        self.client = MongoClient(connection_string)
        self.db = self.client[db_name]
        self._setup_indexes()

    def _setup_indexes(self):
        """Create necessary indexes."""
        # Games collection
        self.db.games.create_index([
            ('sport', ASCENDING),
            ('date', DESCENDING)
        ])
        self.db.games.create_index('external_id', unique=True)
        # Plays collection (for play-by-play)
        self.db.plays.create_index([
            ('game_id', ASCENDING),
            ('sequence', ASCENDING)
        ])
        self.db.plays.create_index('player_ids')
        # Players collection
        self.db.players.create_index('external_id', unique=True)
        self.db.players.create_index('name')
        # Stats collection
        self.db.stats.create_index([
            ('player_id', ASCENDING),
            ('season', ASCENDING),
            ('stat_type', ASCENDING)
        ], unique=True)

    @staticmethod
    def _day_bounds(date: datetime):
        """Return (midnight of ``date``, midnight of the following day)."""
        from datetime import timedelta

        start = datetime(date.year, date.month, date.day)
        return start, start + timedelta(days=1)

    # Game Operations
    def insert_game(self, game_data: Dict) -> str:
        """
        Insert a game document and return its new ``_id`` as a string.

        ``created_at`` / ``updated_at`` are added to ``game_data`` in place
        before it is inserted.

        Schema:
        {
            sport: "baseball",
            date: ISODate,
            home_team: { id, name, abbreviation },
            away_team: { id, name, abbreviation },
            venue: { name, city, state },
            weather: { temp, wind, conditions },
            score: { home: 5, away: 3 },
            innings: [...],  # Sport-specific
            status: "final",
            external_id: "mlb_123456"
        }
        """
        game_data['created_at'] = datetime.utcnow()
        game_data['updated_at'] = datetime.utcnow()
        result = self.db.games.insert_one(game_data)
        return str(result.inserted_id)

    def update_game_score(self, game_id: str, score: Dict, status: Optional[str] = None):
        """Update game score (and status, when one is given)."""
        update = {
            '$set': {
                'score': score,
                'updated_at': datetime.utcnow()
            }
        }
        if status:
            update['$set']['status'] = status
        self.db.games.update_one({'_id': ObjectId(game_id)}, update)

    def get_games_by_date(self, sport: str, date: datetime) -> List[Dict]:
        """Get all games for a sport on a date, ordered by start time."""
        # Half-open range [midnight, next midnight): an inclusive upper bound
        # of 23:59:59 would miss timestamps in the final second of the day.
        start, next_day = self._day_bounds(date)
        return list(self.db.games.find({
            'sport': sport,
            'date': {'$gte': start, '$lt': next_day}
        }).sort('date', ASCENDING))

    # Play-by-Play Operations
    def insert_plays(self, game_id: str, plays: List[Dict]):
        """
        Insert play-by-play data.

        Each play dict is stamped in place with ``game_id`` (as ObjectId)
        and ``created_at``. An empty ``plays`` list is a no-op.

        Play schema (baseball example):
        {
            game_id: ObjectId,
            sequence: 1,
            inning: 1,
            half: "top",
            outs_before: 0,
            outs_after: 1,
            runners_before: [],
            runners_after: [],
            batter_id: "player_123",
            pitcher_id: "player_456",
            player_ids: ["player_123", "player_456"],
            event_type: "strikeout",
            description: "Batter struck out swinging",
            pitch_data: {
                velocity: 95.2,
                spin_rate: 2400,
                pitch_type: "FF"
            },
            result: { runs: 0, hits: 0 }
        }
        """
        # insert_many rejects an empty document list, so bail out early.
        if not plays:
            return
        game_oid = ObjectId(game_id)
        for play in plays:
            play['game_id'] = game_oid
            play['created_at'] = datetime.utcnow()
        self.db.plays.insert_many(plays)

    def get_plays_for_game(self, game_id: str) -> List[Dict]:
        """Get all plays for a game in order."""
        return list(self.db.plays.find({
            'game_id': ObjectId(game_id)
        }).sort('sequence', ASCENDING))

    def get_player_plays(
        self,
        player_id: str,
        season: int,
        event_types: Optional[List[str]] = None
    ) -> List[Dict]:
        """Get all plays involving a player.

        NOTE(review): filters on a ``season`` field of the play document,
        which insert_plays does not set and the documented play schema does
        not list -- callers presumably have to include it themselves.
        """
        query = {
            'player_ids': player_id,
            'season': season
        }
        if event_types:
            query['event_type'] = {'$in': event_types}
        return list(self.db.plays.find(query))

    # Aggregation Pipelines
    def aggregate_batting_stats(self, season: int) -> List[Dict]:
        """
        Aggregate batting statistics from play-by-play data.

        Groups plays by ``batter_id`` and returns one document per batter,
        sorted by batting average (hits / at-bats, 0 when no at-bats).

        NOTE(review): matches on a ``season`` field of the play document;
        see get_player_plays.
        """
        hit_events = ['single', 'double', 'triple', 'home_run']
        at_bat_events = hit_events + ['strikeout', 'groundout', 'flyout']

        def count_if(condition):
            # Adds 1 for every play that satisfies the condition.
            return {'$sum': {'$cond': [condition, 1, 0]}}

        pipeline = [
            {'$match': {'season': season, 'event_type': {'$ne': None}}},
            {'$group': {
                '_id': '$batter_id',
                'games': {'$addToSet': '$game_id'},
                'at_bats': count_if({'$in': ['$event_type', at_bat_events]}),
                'hits': count_if({'$in': ['$event_type', hit_events]}),
                'home_runs': count_if({'$eq': ['$event_type', 'home_run']}),
                'walks': count_if({'$eq': ['$event_type', 'walk']}),
                'strikeouts': count_if({'$eq': ['$event_type', 'strikeout']})
            }},
            {'$addFields': {
                'games_played': {'$size': '$games'},
                'batting_avg': {
                    '$cond': [
                        {'$gt': ['$at_bats', 0]},
                        {'$divide': ['$hits', '$at_bats']},
                        0
                    ]
                }
            }},
            {'$sort': {'batting_avg': -1}}
        ]
        return list(self.db.plays.aggregate(pipeline))

    def get_hot_zones(self, player_id: str, pitch_type: Optional[str] = None) -> Dict:
        """
        Aggregate batting performance by pitch location zones.

        Returns a dict keyed by zone; each value holds ``pitches``,
        ``swing_rate``, ``ba`` and ``avg_exit_velo``. Note that ``ba`` here
        is hits divided by swings, not by at-bats.

        NOTE(review): the match requires ``pitch_data.location`` but the
        grouping key is ``pitch_data.zone`` -- confirm both fields are
        populated on stored plays.
        """
        match = {'batter_id': player_id, 'pitch_data.location': {'$exists': True}}
        if pitch_type:
            match['pitch_data.pitch_type'] = pitch_type
        pipeline = [
            {'$match': match},
            {'$group': {
                '_id': '$pitch_data.zone',
                'pitches': {'$sum': 1},
                'swings': {'$sum': {'$cond': ['$swing', 1, 0]}},
                'hits': {
                    '$sum': {
                        '$cond': [
                            {'$in': ['$event_type', ['single', 'double', 'triple', 'home_run']]},
                            1, 0
                        ]
                    }
                },
                'avg_exit_velo': {'$avg': '$batted_ball.exit_velocity'}
            }},
            {'$project': {
                'zone': '$_id',
                'pitches': 1,
                'swing_rate': {'$divide': ['$swings', '$pitches']},
                'ba': {
                    '$cond': [
                        {'$gt': ['$swings', 0]},
                        {'$divide': ['$hits', '$swings']},
                        0
                    ]
                },
                'avg_exit_velo': 1
            }}
        ]
        results = list(self.db.plays.aggregate(pipeline))
        return {r['zone']: r for r in results}
# Example usage: connect to a local MongoDB and store one game document.
if __name__ == "__main__":
    mongo = SportsMongoDB("mongodb://localhost:27017")

    # Build the game document piece by piece, then insert it
    home_team = {'id': 'nyy', 'name': 'New York Yankees', 'abbreviation': 'NYY'}
    away_team = {'id': 'bos', 'name': 'Boston Red Sox', 'abbreviation': 'BOS'}
    game = {
        'sport': 'baseball',
        'date': datetime(2024, 6, 15, 19, 5),
        'home_team': home_team,
        'away_team': away_team,
        'external_id': 'mlb_2024_06_15_nyy_bos'
    }
    game_id = mongo.insert_game(game)
DuckDB for Sports Analytics
Use DuckDB for fast analytics queries on sports data files and DataFrames without a database server.
"""DuckDB for fast sports analytics queries."""
import duckdb
import pandas as pd
from pathlib import Path
from typing import Optional
class SportsAnalyticsDuckDB:
    """
    Fast analytics on sports data using DuckDB.

    Benefits:
    - No server required
    - Direct query on Parquet/CSV files
    - SQL interface
    - Fast aggregations

    The pre-built queries expect tables/views named ``batting``,
    ``pitching`` and ``game_log`` to have been registered or loaded.
    """

    def __init__(self, db_path: str = ':memory:'):
        """
        Initialize DuckDB connection.

        Args:
            db_path: Path to database file or ':memory:' for in-memory
        """
        self.conn = duckdb.connect(db_path)
        self._setup_extensions()

    def _setup_extensions(self):
        """Install and load the httpfs extension."""
        self.conn.execute("INSTALL httpfs")
        self.conn.execute("LOAD httpfs")

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Return ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the value cannot terminate
        the literal early.
        """
        return "'" + str(value).replace("'", "''") + "'"

    def query(self, sql: str, params: Optional[list] = None) -> pd.DataFrame:
        """Execute query and return DataFrame.

        Args:
            sql: SQL text; may contain ``?`` placeholders
            params: Values bound to the placeholders, in order
        """
        if params is None:
            return self.conn.execute(sql).fetchdf()
        return self.conn.execute(sql, params).fetchdf()

    def register_dataframe(self, name: str, df: pd.DataFrame):
        """Register a DataFrame as a queryable table."""
        self.conn.register(name, df)

    def load_parquet(self, path: str, table_name: Optional[str] = None) -> pd.DataFrame:
        """Load Parquet file(s) into DuckDB.

        When ``table_name`` is given the data is also materialized as a
        table (left untouched if it already exists). The file contents are
        returned as a DataFrame in both cases.
        """
        source = f"parquet_scan({self._sql_literal(path)})"
        if table_name:
            self.conn.execute(f"""
CREATE TABLE IF NOT EXISTS {table_name} AS
SELECT * FROM {source}
""")
        return self.conn.execute(f"SELECT * FROM {source}").fetchdf()

    def load_csv(self, path: str, table_name: Optional[str] = None) -> pd.DataFrame:
        """Load CSV file(s) into DuckDB.

        When ``table_name`` is given the data is also materialized as a
        table (left untouched if it already exists). The file contents are
        returned as a DataFrame in both cases.
        """
        source = f"read_csv_auto({self._sql_literal(path)})"
        if table_name:
            self.conn.execute(f"""
CREATE TABLE IF NOT EXISTS {table_name} AS
SELECT * FROM {source}
""")
        return self.conn.execute(f"SELECT * FROM {source}").fetchdf()

    # Pre-built analytics queries
    def batting_leaders(
        self,
        stat: str,
        min_pa: int = 200,
        limit: int = 10
    ) -> pd.DataFrame:
        """Get batting leaders for a statistic.

        Args:
            stat: One of avg/obp/slg/ops/hr/rbi/sb (case-insensitive), or a
                raw column name/expression. It is placed into the SQL text
                as an identifier, so it must come from trusted code.
            min_pa: Minimum plate appearances (inclusive)
            limit: Number of rows to return
        """
        stat_calc = {
            'avg': 'H / NULLIF(AB, 0)',
            'obp': '(H + BB + HBP) / NULLIF(AB + BB + HBP + SF, 0)',
            'slg': '(H + "2B" + 2*"3B" + 3*HR) / NULLIF(AB, 0)',
            'ops': '((H + BB + HBP) / NULLIF(AB + BB + HBP + SF, 0)) + ((H + "2B" + 2*"3B" + 3*HR) / NULLIF(AB, 0))',
            'hr': 'HR',
            'rbi': 'RBI',
            'sb': 'SB'
        }
        calc = stat_calc.get(stat.lower(), stat)
        return self.query(f"""
SELECT
Name,
Team,
G,
PA,
AB,
H,
HR,
RBI,
ROUND({calc}, 3) as {stat}
FROM batting
WHERE PA >= {min_pa}
ORDER BY {stat} DESC
LIMIT {limit}
""")

    def pitching_leaders(
        self,
        stat: str,
        min_ip: float = 50.0,
        limit: int = 10
    ) -> pd.DataFrame:
        """Get pitching leaders for a statistic.

        Args:
            stat: One of era/whip/k9/bb9/wins/so (case-insensitive), or a
                raw column name/expression (trusted input only). ERA, WHIP
                and BB/9 are ranked ascending, everything else descending.
            min_ip: Minimum innings pitched (inclusive)
            limit: Number of rows to return
        """
        stat_calc = {
            'era': 'ER * 9.0 / NULLIF(IP, 0)',
            'whip': '(BB + H) / NULLIF(IP, 0)',
            'k9': 'SO * 9.0 / NULLIF(IP, 0)',
            'bb9': 'BB * 9.0 / NULLIF(IP, 0)',
            'wins': 'W',
            'so': 'SO'
        }
        calc = stat_calc.get(stat.lower(), stat)
        # Lower is better for these three stats.
        direction = "ASC" if stat.lower() in ['era', 'whip', 'bb9'] else "DESC"
        return self.query(f"""
SELECT
Name,
Team,
G,
GS,
IP,
W,
L,
SO,
BB,
ROUND({calc}, 2) as {stat}
FROM pitching
WHERE IP >= {min_ip}
ORDER BY {stat} {direction}
LIMIT {limit}
""")

    def team_stats_summary(self) -> pd.DataFrame:
        """Aggregate team statistics.

        ``TeamAVG`` is the unweighted mean of each player's H/AB, not team
        hits divided by team at-bats.
        """
        return self.query("""
SELECT
Team,
COUNT(*) as Players,
SUM(G) as TotalGames,
ROUND(AVG(H / NULLIF(AB, 0)), 3) as TeamAVG,
SUM(HR) as TotalHR,
SUM(RBI) as TotalRBI,
SUM(SB) as TotalSB
FROM batting
GROUP BY Team
ORDER BY TeamAVG DESC
""")

    def player_comparison(self, players: list, stats: list) -> pd.DataFrame:
        """Compare multiple players across stats.

        Args:
            players: Exact player names; bound as query parameters so
                names containing apostrophes work
            stats: Column names to select (placed into the SQL text as
                identifiers -- trusted input only)
        """
        stat_cols = ", ".join(stats)
        if not players:
            # "IN ()" is a syntax error; an always-false filter yields the
            # same empty result with the expected columns.
            return self.query(f"""
SELECT Name, Team, {stat_cols}
FROM batting
WHERE FALSE
""")
        placeholders = ", ".join("?" for _ in players)
        return self.query(f"""
SELECT Name, Team, {stat_cols}
FROM batting
WHERE Name IN ({placeholders})
""", list(players))

    def rolling_avg(
        self,
        player: str,
        stat: str,
        window: int = 10
    ) -> pd.DataFrame:
        """Calculate rolling average for a player's game log.

        Args:
            player: Exact player name (bound as a query parameter)
            stat: Column to average (identifier -- trusted input only)
            window: Number of games in the window, including the current one
        """
        return self.query(f"""
SELECT
Date,
{stat},
AVG({stat}) OVER (
ORDER BY Date
ROWS BETWEEN {window - 1} PRECEDING AND CURRENT ROW
) as Rolling{window}Avg
FROM game_log
WHERE Name = ?
ORDER BY Date
""", [player])

    def correlation_matrix(self, stats: list) -> pd.DataFrame:
        """Calculate each statistic's correlation with HR.

        Despite the name this returns a single row with one ``<stat>_HR``
        column per requested statistic (players with PA >= 200), not a
        full stat-by-stat matrix.
        """
        # This is simplified - real implementation would pivot
        return self.query(f"""
SELECT {", ".join([f'CORR({s}, HR) as {s}_HR' for s in stats])}
FROM batting
WHERE PA >= 200
""")
# Example usage: load a season from pybaseball, run the canned leaderboards
# and finish with a hand-written query.
if __name__ == "__main__":
    import pybaseball as pyb

    db = SportsAnalyticsDuckDB()

    # Load data from pybaseball
    batting = pyb.batting_stats(2024)
    pitching = pyb.pitching_stats(2024)

    # Register DataFrames under the names the canned queries expect
    for table_name, frame in (('batting', batting), ('pitching', pitching)):
        db.register_dataframe(table_name, frame)

    # Get batting leaders
    print("HR Leaders:")
    print(db.batting_leaders('HR', min_pa=300))

    print("\nERA Leaders:")
    print(db.pitching_leaders('ERA', min_ip=100))

    print("\nTeam Stats:")
    print(db.team_stats_summary())

    # Direct SQL on registered DataFrames
    power_speed_sql = """
SELECT Name, Team, HR, RBI, SB,
HR + SB as PowerSpeed
FROM batting
WHERE PA >= 400 AND HR >= 20 AND SB >= 15
ORDER BY PowerSpeed DESC
"""
    custom = db.query(power_speed_sql)
    print("\nPower-Speed:")
    print(custom)
R Database Operations with DBI
Connect to databases and query sports data using R's DBI package with parameterized queries.
# Database operations for sports analytics in R
library(DBI)
library(dplyr)
library(dbplyr)
#' Open a DBI connection to a sports database
#'
#' @param driver Backend name: "postgres", "mysql" or "sqlite"
#' @param ... Connection parameters, forwarded unchanged to dbConnect()
#' @return DBI connection object
connect_sports_db <- function(driver = "postgres", ...) {
  # Constructors are wrapped in functions so that only the package of the
  # requested backend is touched.
  backends <- list(
    postgres = function() RPostgres::Postgres(),
    mysql = function() RMariaDB::MariaDB(),
    sqlite = function() RSQLite::SQLite()
  )
  make_driver <- backends[[driver]]
  if (is.null(make_driver)) {
    stop("Unsupported driver")
  }
  dbConnect(make_driver(), ...)
}
#' Fetch season statistics for all players
#'
#' Runs one parameterized query joining season stats to players, teams and
#' seasons. The placeholders are positional ($1, $2, $3).
#'
#' @param con DBI connection
#' @param season Season year (matched against seasons.year)
#' @param stat_type Type of statistics
#' @param min_games Minimum games played (inclusive)
#' @return Data frame with one row per player, most games first
get_player_stats <- function(con, season, stat_type = "batting", min_games = 50) {
  # Order must follow the $1, $2, $3 placeholders below.
  bind_values <- list(season, stat_type, min_games)
  sql <- "
SELECT
p.first_name,
p.last_name,
t.name as team,
pss.games_played,
pss.stats
FROM player_season_stats pss
JOIN players p ON pss.player_id = p.id
JOIN teams t ON p.team_id = t.id
JOIN seasons s ON pss.season_id = s.id
WHERE s.year = $1
AND pss.stat_type = $2
AND pss.games_played >= $3
ORDER BY pss.games_played DESC
"
  dbGetQuery(con, sql, params = bind_values)
}
#' Insert game results
#'
#' Appends the rows of `games` to the "games" table.
#'
#' @param con DBI connection
#' @param games Data frame with game data; column names must match the table
#' @return Number of rows inserted
insert_games <- function(con, games) {
  # dbWriteTable() binds the values itself (no SQL is built from the data)
  # and raises an error on failure.
  dbWriteTable(con, "games", games, append = TRUE)
  # dbWriteTable() only returns a logical, so report the documented row
  # count explicitly.
  nrow(games)
}
#' Insert or update player season statistics, row by row
#'
#' All rows are processed inside one transaction: each row is looked up by
#' (player_id, season_id, stat_type) and then either updated or inserted.
#' Any error rolls the whole batch back and is re-raised.
#'
#' @param con DBI connection
#' @param stats Data frame with columns player_id, season_id, stat_type,
#'   games_played and stats
#' @return Number of rows processed
upsert_stats <- function(con, stats) {
  find_sql <- "SELECT id FROM player_season_stats
WHERE player_id = $1 AND season_id = $2 AND stat_type = $3"
  update_sql <- "UPDATE player_season_stats
SET games_played = $1, stats = $2, updated_at = NOW()
WHERE id = $3"
  insert_sql <- "INSERT INTO player_season_stats
(player_id, season_id, stat_type, games_played, stats)
VALUES ($1, $2, $3, $4, $5)"

  dbBegin(con)
  tryCatch({
    for (idx in seq_len(nrow(stats))) {
      rec <- stats[idx, ]
      found <- dbGetQuery(
        con, find_sql,
        params = list(rec$player_id, rec$season_id, rec$stat_type)
      )
      if (nrow(found) == 0) {
        # No row yet for this player/season/type: insert
        dbExecute(
          con, insert_sql,
          params = list(rec$player_id, rec$season_id, rec$stat_type,
                        rec$games_played, rec$stats)
        )
      } else {
        # Row exists: overwrite its values
        dbExecute(
          con, update_sql,
          params = list(rec$games_played, rec$stats, found$id[1])
        )
      }
    }
    dbCommit(con)
    return(nrow(stats))
  }, error = function(e) {
    dbRollback(con)
    stop(e)
  })
}
#' Use dplyr/dbplyr for database queries
#'
#' Builds the player-season-stats query lazily with dplyr verbs and then
#' collects the result.
#'
#' @param con DBI connection
#' @param season Season year to keep (default 2024)
#' @param stat_type Type of statistics to keep (default "batting")
#' @return Data frame with first_name, last_name, team, games_played and
#'   stats, most games first
query_with_dbplyr <- function(con, season = 2024, stat_type = "batting") {
  # Reduce each lookup table to the columns that are needed and give the
  # key and label columns unambiguous names up front. Joining the full
  # tables and then picking `name.y` only works when two joined tables
  # happen to share a `name` column, and then picks the wrong one.
  players <- tbl(con, "players") %>%
    select(player_id = id, first_name, last_name, team_id)
  teams <- tbl(con, "teams") %>%
    select(team_id = id, team = name)
  seasons <- tbl(con, "seasons") %>%
    select(season_id = id, year)

  # Local copies so `!!` injects the argument values rather than referring
  # to the table columns of the same name.
  wanted_type <- stat_type
  wanted_year <- season

  result <- tbl(con, "player_season_stats") %>%
    filter(stat_type == !!wanted_type) %>%
    inner_join(players, by = "player_id") %>%
    inner_join(teams, by = "team_id") %>%
    inner_join(seasons, by = "season_id") %>%
    filter(year == !!wanted_year) %>%
    select(first_name, last_name, team, games_played, stats) %>%
    arrange(desc(games_played))

  # Execute and collect
  collect(result)
}
#' Get standings using window functions
#'
#' Counts wins and losses per team from games with status 'final', then
#' computes the winning percentage and the games behind the division leader.
#' Games behind is ((leader wins - wins) + (losses - leader losses)) / 2,
#' where the leader is the team with the best wins-minus-losses in the
#' division, so the leader shows 0 and everyone else a non-negative value.
#'
#' @param con DBI connection
#' @param season Season year
#' @return Data frame with team, division, wins, losses, pct and gb,
#'   ordered by division and then by games behind
get_standings <- function(con, season) {
  query <- "
WITH game_results AS (
SELECT
t.id,
t.name as team,
t.division,
SUM(CASE
WHEN (g.home_team_id = t.id AND g.home_score > g.away_score) OR
(g.away_team_id = t.id AND g.away_score > g.home_score)
THEN 1 ELSE 0
END) as wins,
SUM(CASE
WHEN (g.home_team_id = t.id AND g.home_score < g.away_score) OR
(g.away_team_id = t.id AND g.away_score < g.home_score)
THEN 1 ELSE 0
END) as losses
FROM teams t
JOIN games g ON t.id IN (g.home_team_id, g.away_team_id)
JOIN seasons s ON g.season_id = s.id
WHERE s.year = $1 AND g.status = 'final'
GROUP BY t.id, t.name, t.division
)
SELECT
team,
division,
wins,
losses,
ROUND(wins::numeric / NULLIF(wins + losses, 0), 3) as pct,
((FIRST_VALUE(wins) OVER w - wins) + (losses - FIRST_VALUE(losses) OVER w)) / 2.0 as gb
FROM game_results
WINDOW w AS (PARTITION BY division ORDER BY (wins - losses) DESC)
ORDER BY division, gb, wins DESC
"
  dbGetQuery(con, query, params = list(season))
}
# Example usage
# con <- connect_sports_db("postgres",
# dbname = "sports_db",
# host = "localhost",
# user = "user",
# password = "password"
# )
#
# stats <- get_player_stats(con, 2024, "batting", 100)
# standings <- get_standings(con, 2024)
#
# dbDisconnect(con)
Scrape Baseball Reference Player Stats
Web scraping Baseball Reference for historical player statistics using BeautifulSoup and requests.
"""Scrape Baseball Reference for player batting statistics."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def scrape_player_batting(player_id: str, year: int = None, timeout: float = 10.0) -> pd.DataFrame:
    """
    Scrape batting stats from Baseball Reference.

    Args:
        player_id: Baseball Reference player ID (e.g., 'troutmi01')
        year: Optional specific year, None for career stats
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        DataFrame with batting statistics (one row per table row whose cell
        count matches the header)

    Raises:
        ValueError: If ``player_id`` is empty or the batting table is not
            found on the page
        requests.HTTPError: If the server answers with an error status
    """
    if not player_id:
        raise ValueError("player_id must be a non-empty string")

    # Player pages are grouped under the first letter of the ID.
    url = f"https://www.baseball-reference.com/players/{player_id[0]}/{player_id}.shtml"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the batting stats table
    batting_table = soup.find('table', {'id': 'batting_standard'})
    thead = batting_table.find('thead') if batting_table is not None else None
    tbody = batting_table.find('tbody') if batting_table is not None else None
    if thead is None or tbody is None:
        raise ValueError(f"Could not find batting table for {player_id}")

    # Column names come from the last header row, so an extra grouping row
    # above it cannot inflate the column count.
    header_rows = thead.find_all('tr')
    header_cells = header_rows[-1].find_all('th') if header_rows else thead.find_all('th')
    columns = [th.get_text(strip=True) for th in header_cells]

    rows = []
    for row in tbody.find_all('tr'):
        # Repeated in-body header rows carry the 'thead' class.
        if 'thead' in row.attrs.get('class', []):
            continue
        row_data = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        if len(row_data) == len(columns):
            rows.append(row_data)
    df = pd.DataFrame(rows, columns=columns)

    # Filter by year if specified; copy so the conversions below write to
    # an independent frame rather than a slice of ``df``.
    if year is not None and 'Year' in df.columns:
        df = df[df['Year'] == str(year)].copy()

    # Convert numeric columns (unparseable cells become NaN)
    numeric_cols = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
# Example usage: fetch one player's 2023 season.
if __name__ == "__main__":
    # Mike Trout's Baseball Reference ID
    player = 'troutmi01'
    print(scrape_player_batting(player, 2023))

    # Be respectful - pause before any further request
    time.sleep(3)
Scrape MLB Standings
Scrape current MLB standings from ESPN using Python requests and BeautifulSoup.
"""Scrape MLB standings from ESPN."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_mlb_standings(year: int = 2024, timeout: float = 30.0) -> dict:
    """
    Scrape MLB standings from ESPN.

    Args:
        year: Season year
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        Dictionary with AL and NL standings DataFrames
    Raises:
        requests.HTTPError: If ESPN answers with an error status
    """
    def to_int(text):
        # Blank or non-numeric cells count as 0
        text = text.strip()
        return int(text) if text.isdigit() else 0

    def to_float(text):
        # Blank or non-numeric cells (e.g. '-') count as 0 instead of crashing
        try:
            return float(text)
        except ValueError:
            return 0

    url = f"https://www.espn.com/mlb/standings/_/season/{year}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page rather than parsing it into empty standings
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    standings = {'AL': [], 'NL': []}
    # Find all standings tables
    tables = soup.find_all('table', class_='Table')
    # NOTE(review): assumes the first matching table is the AL and the second
    # the NL - confirm against the current page layout
    for league, table in zip(('AL', 'NL'), tables):
        for row in table.find_all('tr')[1:]:  # Skip header
            cells = row.find_all('td')
            if len(cells) < 6:
                continue
            standings[league].append({
                'Team': cells[0].text.strip(),
                'W': to_int(cells[1].text),
                'L': to_int(cells[2].text),
                'PCT': to_float(cells[3].text),
                'GB': cells[4].text.strip(),
                'Diff': cells[5].text.strip(),
            })
    return {
        'AL': pd.DataFrame(standings['AL']),
        'NL': pd.DataFrame(standings['NL'])
    }
# Example usage: fetch the 2024 standings and print the first 10 rows of
# each league's DataFrame
standings = scrape_mlb_standings(2024)
print("American League:")
print(standings['AL'].head(10))
print("\nNational League:")
print(standings['NL'].head(10))
Scrape Basketball Reference Box Scores
Scrape NBA game box scores from Basketball Reference for detailed game statistics.
"""Scrape NBA box scores from Basketball Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
def _mp_to_minutes(value) -> float:
    """Convert a minutes-played cell ('MM:SS' or a plain number) to decimal minutes; NaN otherwise."""
    minutes, sep, seconds = str(value).strip().partition(':')
    try:
        return float(minutes) + (float(seconds) / 60 if sep else 0.0)
    except ValueError:
        return float('nan')


def scrape_game_box_score(game_id: str, timeout: float = 30.0) -> dict:
    """
    Scrape box score from Basketball Reference.

    Args:
        game_id: Basketball Reference game ID (e.g., '202401150LAL')
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        Dictionary mapping each team code to its full-game basic box score
        DataFrame. 'MP' is returned as decimal minutes.
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    url = f"https://www.basketball-reference.com/boxscores/{game_id}.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # 'MP' is handled separately: it is a clock string, not a plain number
    numeric_cols = ['FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
                    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
    result = {}
    # Find all box score tables (basic and advanced for each team)
    for table in soup.find_all('table', {'class': 'sortable'}):
        table_id = table.get('id', '')
        # Only the full-game basic table yields a clean team key; other
        # "...-basic" ids (per-period tables) would leave a suffix in the key
        if not table_id.endswith('-game-basic'):
            continue
        team = table_id.replace('box-', '').replace('-game-basic', '')

        # The last header row carries the per-column labels
        header_cells = table.find('thead').find_all('tr')[-1].find_all('th')
        columns = [th.text for th in header_cells]

        rows = []
        for row in table.find('tbody').find_all('tr'):
            # Skip header rows repeated inside the body
            if 'thead' in (row.get('class') or []):
                continue
            row_data = [cell.text for cell in row.find_all(['td', 'th'])]
            if row_data:
                rows.append(row_data)
        df = pd.DataFrame(rows, columns=columns[:len(rows[0])] if rows else columns)

        # "35:12" would be coerced to NaN by to_numeric; convert it explicitly
        if 'MP' in df.columns:
            df['MP'] = df['MP'].map(_mp_to_minutes)
        # Convert numeric columns (unparseable cells become NaN)
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        result[team] = df
    return result
def scrape_games_on_date(date: str, timeout: float = 30.0) -> list:
    """
    Get list of game IDs for a specific date.

    Args:
        date: Date string in format 'YYYY-MM-DD'
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        List of unique game IDs, in the order they first appear on the page
    Raises:
        ValueError: If `date` is not in 'YYYY-MM-DD' format
        requests.HTTPError: If the server answers with an error status
    """
    dt = datetime.strptime(date, '%Y-%m-%d')
    url = f"https://www.basketball-reference.com/boxscores/?month={dt.month}&day={dt.day}&year={dt.year}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    game_ids = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/boxscores/' in href and href.endswith('.html'):
            game_id = href.split('/')[-1].replace('.html', '')
            if len(game_id) > 10:  # Valid game ID
                game_ids.append(game_id)
    # dict.fromkeys de-duplicates while keeping a stable, page-order result
    # (a set would return the IDs in arbitrary order)
    return list(dict.fromkeys(game_ids))
# Example usage: print a short preview of every box score table returned
box_scores = scrape_game_box_score('202401150LAL')
for team, df in box_scores.items():
    print(f"\n{team} Box Score:")
    if 'Starters' in df.columns:
        # Show just the headline columns when the player column is present
        print(df[['Starters', 'PTS', 'TRB', 'AST']].head())
    else:
        print(df.head())
Scrape NBA Draft Data
Scrape historical NBA draft data from Basketball Reference for draft analysis.
"""Scrape NBA draft data from Basketball Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_nba_draft(year: int, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape NBA draft results for a specific year.

    Args:
        year: Draft year
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with draft picks and player info, plus a 'Draft_Year' column.
        Columns are named from the table header only when the header and data
        widths agree; otherwise they keep their positional integer labels.
    Raises:
        requests.HTTPError: If the server answers with an error status
        ValueError: If the page has no draft table
    """
    url = f"https://www.basketball-reference.com/draft/NBA_{year}.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures directly instead of a misleading "table not found"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the draft table
    draft_table = soup.find('table', {'id': 'stats'})
    if draft_table is None:
        raise ValueError(f"Could not find draft table for {year}")

    # Get headers from the last header row
    header_row = draft_table.find('thead').find_all('tr')[-1]
    columns = [th.text.strip() for th in header_row.find_all('th')]

    # Parse rows
    rows = []
    for row in draft_table.find('tbody').find_all('tr'):
        # Skip header rows repeated inside the body
        if 'thead' in (row.get('class') or []):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        # Rows with five cells or fewer are not treated as picks
        if len(row_data) > 5:
            rows.append(row_data)
    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns

    # Clean and convert columns
    if 'Pk' in df.columns:
        df['Pk'] = pd.to_numeric(df['Pk'], errors='coerce')
    # Add draft year
    df['Draft_Year'] = year
    return df
def get_draft_history(start_year: int, end_year: int, delay: float = 3.0) -> pd.DataFrame:
    """
    Get draft history for multiple years.

    Years that fail to scrape are reported on stdout and skipped.

    Args:
        start_year: Starting year
        end_year: Ending year (inclusive)
        delay: Seconds to pause between consecutive requests
    Returns:
        Combined DataFrame of all drafts; an empty DataFrame when no year
        could be scraped (including when start_year > end_year)
    """
    import time
    all_drafts = []
    for position, year in enumerate(range(start_year, end_year + 1)):
        # Be respectful: pause before every request after the first, whether
        # or not the previous one succeeded
        if position:
            time.sleep(delay)
        try:
            draft_df = scrape_nba_draft(year)
        except Exception as e:
            # Best effort: one bad year should not abort the whole range
            print(f"Error scraping {year}: {e}")
        else:
            all_drafts.append(draft_df)
            print(f"Scraped {year} draft: {len(draft_df)} picks")
    # pd.concat raises on an empty list, so handle "nothing scraped" explicitly
    if not all_drafts:
        return pd.DataFrame()
    return pd.concat(all_drafts, ignore_index=True)
# Example usage: scrape the 2023 draft, report how many rows came back, and
# show the first 10 rows of the pick / player / team columns
draft_2023 = scrape_nba_draft(2023)
print(f"2023 NBA Draft - {len(draft_2023)} picks")
print(draft_2023[['Pk', 'Player', 'Tm']].head(10))
Scrape Pro Football Reference Stats
Scrape NFL player statistics from Pro Football Reference using BeautifulSoup.
"""Scrape NFL stats from Pro Football Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_passing_stats(year: int, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape NFL passing statistics for a season.

    Args:
        year: Season year
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with passing statistics. Columns are named from the table
        header only when the header and data widths agree.
    Raises:
        requests.HTTPError: If the server answers with an error status
        ValueError: If the page has no 'passing' table
    """
    url = f"https://www.pro-football-reference.com/years/{year}/passing.htm"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures directly instead of a misleading "table not found"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the passing table
    table = soup.find('table', {'id': 'passing'})
    if table is None:
        raise ValueError(f"Could not find passing table for {year}")

    # Get headers from the last header row
    header_row = table.find('thead').find_all('tr')[-1]
    columns = [th.text.strip() for th in header_row.find_all('th')]

    # Parse rows, skipping header rows repeated inside the body
    rows = []
    for row in table.find('tbody').find_all('tr'):
        if 'thead' in (row.get('class') or []):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if row_data:
            rows.append(row_data)
    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns

    # Convert numeric columns (unparseable cells become NaN)
    numeric_cols = ['G', 'GS', 'Cmp', 'Att', 'Yds', 'TD', 'Int', 'Sk']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
def scrape_rushing_stats(year: int, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape NFL rushing statistics for a season.

    Args:
        year: Season year
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with rushing statistics as text cells. Columns are named
        from the table header only when the header and data widths agree.
    Raises:
        requests.HTTPError: If the server answers with an error status
        ValueError: If the page has no 'rushing' table
    """
    url = f"https://www.pro-football-reference.com/years/{year}/rushing.htm"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures directly instead of a misleading "table not found"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'rushing'})
    if table is None:
        raise ValueError(f"Could not find rushing table for {year}")

    # The last header row carries the per-column labels
    header_row = table.find('thead').find_all('tr')[-1]
    columns = [th.text.strip() for th in header_row.find_all('th')]

    # Parse rows, skipping header rows repeated inside the body
    rows = []
    for row in table.find('tbody').find_all('tr'):
        if 'thead' in (row.get('class') or []):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if row_data:
            rows.append(row_data)
    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns
    return df
# Example usage: scrape the 2023 passing table and show the first 10 rows of
# the player / team / yards / touchdown columns
passing_2023 = scrape_passing_stats(2023)
print(f"2023 NFL Passing Leaders:")
print(passing_2023[['Player', 'Tm', 'Yds', 'TD']].head(10))
Scrape FBref Soccer Stats
Scrape soccer statistics from FBref (Football Reference) for player and team data.
"""Scrape soccer stats from FBref."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_league_stats(league_id: str, season: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape league-wide player statistics from FBref.

    Args:
        league_id: FBref league ID (e.g., '9' for Premier League)
        season: Season string (e.g., '2023-2024')
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with player statistics. Columns are named from the table
        header only when the header and data widths agree.
    Raises:
        requests.HTTPError: If the server answers with an error status
        ValueError: If no stats table is found on the page
    """
    # League IDs: 9=Premier League, 12=La Liga, 20=Bundesliga, 11=Serie A, 13=Ligue 1
    url = f"https://fbref.com/en/comps/{league_id}/{season}/stats/{season}-Stats"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures directly instead of a misleading "table not found"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the standard stats table, falling back to the first table that
    # carries the generic 'stats_table' class
    table = soup.find('table', {'id': 'stats_standard'})
    if table is None:
        table = soup.find('table', class_='stats_table')
    if table is None:
        raise ValueError("Could not find stats table")

    # Parse table headers (last header row holds the column labels)
    header_rows = table.find('thead').find_all('tr')
    columns = [th.text.strip() for th in header_rows[-1].find_all('th')]

    # Parse data rows
    rows = []
    for row in table.find('tbody').find_all('tr'):
        row_classes = row.get('class')
        # Skip repeated header rows and spacer rows inside the body
        if row_classes and any(c in str(row_classes) for c in ['thead', 'spacer']):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        # Rows with five cells or fewer are not treated as player rows
        if len(row_data) > 5:
            rows.append(row_data)
    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns

    # Convert numeric columns (unparseable cells become NaN)
    numeric_cols = ['MP', 'Starts', 'Min', 'Gls', 'Ast', 'G+A', 'PK', 'PKatt']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
def scrape_team_stats(team_id: str, season: str, timeout: float = 30.0) -> dict:
    """
    Scrape detailed team statistics.

    Args:
        team_id: FBref team ID
        season: Season string
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        Dictionary mapping each table ID found on the page to its DataFrame;
        table IDs that are absent from the page are simply left out
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    url = f"https://fbref.com/en/squads/{team_id}/{season}/all_comps"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # An error page would otherwise be indistinguishable from "no tables"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = {}
    table_ids = ['stats_standard_combined', 'stats_shooting_combined', 'stats_passing_combined']
    for table_id in table_ids:
        table = soup.find('table', {'id': table_id})
        if table is None:
            continue
        # Last header row holds the per-column labels
        header_rows = table.find('thead').find_all('tr')
        columns = [th.text.strip() for th in header_rows[-1].find_all('th')]

        rows = []
        for row in table.find('tbody').find_all('tr'):
            # Skip header rows repeated inside the body, matching
            # scrape_league_stats
            if 'thead' in (row.get('class') or []):
                continue
            row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            if row_data:
                rows.append(row_data)
        df = pd.DataFrame(rows)
        if len(df.columns) == len(columns):
            df.columns = columns
        tables[table_id] = df
    return tables
# Example usage: scrape league ID '9' for 2023-2024 and preview the result.
# Falls back to a plain head() when the header row could not be applied
# (i.e. there is no 'Player' column).
epl_stats = scrape_league_stats('9', '2023-2024')
print(f"Premier League 2023-24 Stats: {len(epl_stats)} players")
print(epl_stats[['Player', 'Squad', 'Gls', 'Ast']].head(10) if 'Player' in epl_stats.columns else epl_stats.head())
Scrape Transfermarkt Player Values
Scrape player market values and transfer data from Transfermarkt.
"""Scrape player market values from Transfermarkt."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_team_values(team_id: str, team_name: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape squad market values from Transfermarkt.

    Args:
        team_id: Transfermarkt team ID
        team_name: Team name slug (e.g., 'manchester-city')
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with 'Name', 'Position', 'Market_Value' (raw text) and
        'Value_EUR' (numeric, None when unparseable). Empty when the squad
        table is not found.
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    url = f"https://www.transfermarkt.com/{team_name}/kader/verein/{team_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    players = []
    # Find player rows in squad table
    table = soup.find('table', class_='items')
    squad_rows = table.find_all('tr', class_=['odd', 'even']) if table is not None else []
    for row in squad_rows:
        # Player name: rows without a linked, non-empty name are skipped
        name_cell = row.find('td', class_='hauptlink')
        name_link = name_cell.find('a') if name_cell is not None else None
        name = name_link.text.strip() if name_link is not None else None
        if not name:
            continue

        # Position: second row of the nested table inside the 'posrela' cell
        position = None
        for td in row.find_all('td'):
            if td.get('class') and 'posrela' in str(td.get('class')):
                inner_rows = td.find_all('tr')
                if len(inner_rows) > 1:
                    position = inner_rows[1].text.strip()

        # Market value (raw text, e.g. with currency sign and m/k suffix)
        value_cell = row.find('td', class_='rechts hauptlink')
        value = value_cell.text.strip() if value_cell else None

        players.append({
            'Name': name,
            'Position': position,
            'Market_Value': value
        })
    df = pd.DataFrame(players)

    # Parse market values to numeric
    def parse_value(val):
        if pd.isna(val) or val == '-':
            return None
        val = val.replace('€', '').strip()
        multiplier = 1
        if 'm' in val.lower():
            multiplier = 1_000_000
            val = val.lower().replace('m', '')
        elif 'k' in val.lower():
            multiplier = 1_000
            val = val.lower().replace('k', '')
        try:
            return float(val) * multiplier
        except ValueError:
            # Unrecognised format: leave the numeric value empty
            return None

    if 'Market_Value' in df.columns:
        df['Value_EUR'] = df['Market_Value'].apply(parse_value)
    return df
# Example: Manchester City (team_id=281)
# values = scrape_team_values('281', 'manchester-city')
# print(values.head(10))
Scrape Hockey Reference Stats
Scrape NHL statistics from Hockey Reference for player and team analysis.
"""Scrape NHL stats from Hockey Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_skater_stats(year: int, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape NHL skater statistics for a season.

    Args:
        year: Season ending year (e.g., 2024 for 2023-24 season)
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with skater statistics. Columns are named from the table
        header only when the header and data widths agree.
    Raises:
        requests.HTTPError: If the server answers with an error status
        ValueError: If the page has no 'stats' table
    """
    url = f"https://www.hockey-reference.com/leagues/NHL_{year}_skaters.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures directly instead of a misleading "table not found"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'stats'})
    if table is None:
        raise ValueError(f"Could not find skater stats table for {year}")

    # The last header row carries the per-column labels
    header_row = table.find('thead').find_all('tr')[-1]
    columns = [th.text.strip() for th in header_row.find_all('th')]

    # Parse rows, skipping header rows repeated inside the body
    rows = []
    for row in table.find('tbody').find_all('tr'):
        if 'thead' in (row.get('class') or []):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if row_data:
            rows.append(row_data)
    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns

    # Convert numeric columns (unparseable cells become NaN)
    numeric_cols = ['GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'EVG', 'PPG', 'SHG', 'S']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
def scrape_goalie_stats(year: int, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape NHL goalie statistics for a season.

    Args:
        year: Season year used in the page URL
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with goalie statistics as text cells. Columns are named
        from the table header only when the header and data widths agree.
    Raises:
        requests.HTTPError: If the server answers with an error status
        ValueError: If the page has no 'stats' table
    """
    url = f"https://www.hockey-reference.com/leagues/NHL_{year}_goalies.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures directly instead of a misleading "table not found"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'stats'})
    if table is None:
        raise ValueError(f"Could not find goalie stats table for {year}")

    # The last header row carries the per-column labels
    header_row = table.find('thead').find_all('tr')[-1]
    columns = [th.text.strip() for th in header_row.find_all('th')]

    # Parse rows, skipping header rows repeated inside the body
    rows = []
    for row in table.find('tbody').find_all('tr'):
        if 'thead' in (row.get('class') or []):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if row_data:
            rows.append(row_data)
    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns
    return df
# Example usage: scrape the 2024 skater table and preview the scoring columns
skaters_2024 = scrape_skater_stats(2024)
print(f"2023-24 NHL Skater Stats: {len(skaters_2024)} players")
# Column names are only available when the header row could be applied
if 'Player' in skaters_2024.columns:
    print(skaters_2024.loc[:, ['Player', 'Tm', 'G', 'A', 'PTS']].head(10))
Scrape PGA Tour Statistics
Scrape PGA Tour player statistics from the official website.
"""Scrape PGA Tour statistics."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
def scrape_pga_stats(stat_id: str = '02675', year: int = 2024, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape PGA Tour statistics.

    Common stat IDs:
    - 02675: Scoring Average
    - 02568: Driving Distance
    - 02567: Driving Accuracy
    - 02564: Greens in Regulation
    - 02428: Strokes Gained Total

    Args:
        stat_id: PGA Tour statistic ID
        year: Season year
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with player statistics
    Raises:
        requests.HTTPError: If the server answers with an error status
        ValueError: If neither a stats table nor usable embedded JSON is found
    """
    url = f"https://www.pgatour.com/stats/stat.{stat_id}.y{year}.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find stats table
    table = soup.find('table', class_='table-styled')
    if table is None:
        # Try alternate approach with JavaScript data
        script_data = soup.find('script', {'type': 'application/json'})
        if script_data is not None and script_data.string:
            try:
                data = json.loads(script_data.string)
            except json.JSONDecodeError:
                data = None
            # Parse JSON structure (varies by page); only a JSON object can
            # carry the 'rows' key this fallback reads
            if isinstance(data, dict):
                return pd.DataFrame(data.get('rows', []))
        raise ValueError("Could not find stats table")

    # Parse table
    headers_row = table.find('thead').find('tr')
    columns = [th.text.strip() for th in headers_row.find_all('th')]
    rows = []
    for row in table.find('tbody').find_all('tr'):
        row_data = [cell.text.strip() for cell in row.find_all('td')]
        if row_data:
            rows.append(row_data)
    df = pd.DataFrame(rows)
    # Only apply the header labels when they line up with the data width
    if len(df.columns) == len(columns):
        df.columns = columns
    return df
def scrape_tournament_leaderboard(tournament_id: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape tournament leaderboard from PGA Tour.

    Looks for a <script> whose text mentions 'leaderboardData', parses the
    span from its first '{' to its last '}' as JSON and returns that object's
    'players' entry.

    Args:
        tournament_id: PGA Tour tournament ID
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with leaderboard data; empty when no usable data is found
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    url = f"https://www.pgatour.com/tournaments/{tournament_id}/leaderboard.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # PGA Tour uses React, so data is often in JSON
    for script in soup.find_all('script'):
        text = script.string
        if not text or 'leaderboardData' not in text:
            continue
        # Extract JSON data: outermost brace-delimited span of the script
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            continue
        try:
            data = json.loads(text[start:end])
            if isinstance(data, dict):
                return pd.DataFrame(data.get('players', []))
        except (ValueError, TypeError):
            # Not valid JSON (JSONDecodeError is a ValueError) or not a
            # tabular payload: try the next script tag
            continue
    return pd.DataFrame()
# Example usage
# stats = scrape_pga_stats('02675', 2024) # Scoring average
# print(stats.head(10))
Scrape ATP Tennis Rankings
Scrape ATP tennis rankings and player statistics from the ATP Tour website.
"""Scrape ATP tennis rankings and stats."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
def scrape_atp_rankings(timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape current ATP singles rankings.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        DataFrame with 'Rank', 'Player', 'Country' and 'Points' columns;
        empty when the rankings table is not found
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    def to_int(text):
        # Thousands separators are dropped; anything non-numeric becomes None
        digits = text.replace(',', '')
        return int(digits) if digits.isdigit() else None

    url = "https://www.atptour.com/en/rankings/singles"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    players = []
    # Find ranking table
    table = soup.find('table', class_='mega-table')
    ranking_rows = table.find_all('tr')[1:] if table is not None else []  # Skip header
    for row in ranking_rows:
        cells = row.find_all('td')
        if len(cells) < 5:
            continue
        # Player name might be in a link
        name_cell = cells[2]
        name_link = name_cell.find('a')
        name = name_link.text.strip() if name_link else name_cell.text.strip()
        # A five-cell row has no sixth (points) cell
        points = cells[5].text.strip() if len(cells) > 5 else ''
        players.append({
            'Rank': to_int(cells[0].text.strip().replace(',', ',')) if cells[0].text.strip().isdigit() else None,
            'Player': name,
            'Country': cells[3].text.strip(),
            'Points': to_int(points)
        })
    return pd.DataFrame(players)
def scrape_player_stats(player_id: str, timeout: float = 30.0) -> dict:
    """
    Scrape individual player statistics from ATP.

    Args:
        player_id: ATP player ID slug (e.g., 'n409' for Novak Djokovic)
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        Dictionary mapping each stat label found on the page to its value
        text; empty when the page has no 'stat-item' blocks
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    url = f"https://www.atptour.com/en/players/-/{player_id}/overview"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # An error page would otherwise be indistinguishable from "no stats"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    stats = {}
    # Career stats: each 'stat-item' holds a label/value pair
    for item in soup.find_all('div', class_='stat-item'):
        label = item.find('div', class_='stat-label')
        value = item.find('div', class_='stat-value')
        # Items missing either half are ignored
        if label and value:
            stats[label.text.strip()] = value.text.strip()
    return stats
# Example usage: scrape the current rankings, report the row count, and show
# the first 20 rows
rankings = scrape_atp_rankings()
print(f"ATP Rankings: {len(rankings)} players")
print(rankings.head(20))
Generic Sports Table Scraper
A reusable class for scraping sports statistics tables from various websites.
"""Generic sports table scraper for various websites."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Optional, Dict, Any
import time
from urllib.parse import urljoin
class SportsTableScraper:
    """
    A generic scraper for sports statistics tables.

    Features:
    - Table lookup by id, class, or position on the page
    - Header parsing with colspan handling
    - Rate limiting
    - Error handling (HTTP error statuses raise, requests time out)
    """

    def __init__(self, base_url: str = None, delay: float = 2.0, timeout: float = 30.0):
        """
        Initialize the scraper.

        Args:
            base_url: Base URL for relative links
            delay: Delay between requests in seconds
            timeout: Seconds to wait for each HTTP response before giving up
        """
        self.base_url = base_url
        self.delay = delay
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        })
        # Timestamp (time.time()) of the most recent request; 0 = none yet
        self.last_request = 0

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least `delay` seconds apart."""
        elapsed = time.time() - self.last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request = time.time()

    def fetch_page(self, url: str) -> "BeautifulSoup":
        """
        Fetch and parse a page.

        Relative URLs are resolved against `base_url` when one was given.

        Raises:
            requests.HTTPError: If the server answers with an error status
        """
        self._rate_limit()
        if self.base_url and not url.startswith('http'):
            url = urljoin(self.base_url, url)
        # Without a timeout a stalled server would block the scraper forever
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    def parse_table(
        self,
        soup: "BeautifulSoup",
        table_id: str = None,
        table_class: str = None,
        table_index: int = 0
    ) -> pd.DataFrame:
        """
        Parse an HTML table into a DataFrame.

        Args:
            soup: BeautifulSoup object
            table_id: Table ID attribute
            table_class: Table class attribute
            table_index: Index if multiple tables match
        Returns:
            DataFrame with table data; empty when no table matches. Columns
            are named from the header only when header and data widths agree.
        """
        # Find table: id takes precedence over class, class over position
        if table_id:
            table = soup.find('table', {'id': table_id})
        elif table_class:
            tables = soup.find_all('table', class_=table_class)
            table = tables[table_index] if tables and len(tables) > table_index else None
        else:
            tables = soup.find_all('table')
            table = tables[table_index] if tables and len(tables) > table_index else None
        if table is None:
            return pd.DataFrame()

        # Parse headers
        columns = self._parse_headers(table)
        # Without a <thead> the first <tr> supplies the column names; exclude
        # that same row from the body so it is not repeated as data
        header_row = table.find('tr') if table.find('thead') is None else None
        # Parse body
        rows = self._parse_body(table, len(columns), skip_row=header_row)
        df = pd.DataFrame(rows)
        if len(df.columns) == len(columns):
            df.columns = columns
        return df

    @staticmethod
    def _expand_cells(cells) -> List[str]:
        """Return cell labels, repeating each one across the columns its colspan covers."""
        labels = []
        for cell in cells:
            try:
                span = int(cell.get('colspan', 1))
            except (TypeError, ValueError):
                span = 1
            labels.extend([cell.text.strip()] * max(span, 1))
        return labels

    def _parse_headers(self, table) -> List[str]:
        """Parse table headers, handling colspan."""
        thead = table.find('thead')
        if not thead:
            # Try first row of table
            first_row = table.find('tr')
            cells = first_row.find_all(['th', 'td']) if first_row else []
            return self._expand_cells(cells)
        header_rows = thead.find_all('tr')
        if not header_rows:
            return []
        # Use last header row (most specific)
        return self._expand_cells(header_rows[-1].find_all('th'))

    def _parse_body(self, table, expected_cols: int, skip_row=None) -> List[List[str]]:
        """
        Parse table body rows.

        Args:
            table: The table element to read rows from
            expected_cols: Number of header columns (currently not used to
                filter rows; kept for interface compatibility)
            skip_row: Optional row element to leave out (the row already
                consumed as the header)
        """
        tbody = table.find('tbody')
        rows_container = tbody if tbody else table
        rows = []
        for row in rows_container.find_all('tr'):
            if skip_row is not None and row is skip_row:
                continue
            # Skip header rows in body
            if row.get('class') and any(c in str(row.get('class')) for c in ['thead', 'header']):
                continue
            row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            if row_data:
                rows.append(row_data)
        return rows

    def scrape_table(
        self,
        url: str,
        table_id: str = None,
        table_class: str = None,
        table_index: int = 0
    ) -> pd.DataFrame:
        """
        Fetch URL and parse table in one call.
        """
        soup = self.fetch_page(url)
        return self.parse_table(soup, table_id, table_class, table_index)
# Example usage: one scraper instance that waits at least 3 seconds between
# requests
scraper = SportsTableScraper(delay=3.0)
# Scrape from any sports reference site (uncomment to run; performs a live
# HTTP request)
# df = scraper.scrape_table(
# "https://www.baseball-reference.com/leagues/majors/2024-standard-batting.shtml",
# table_id="players_standard_batting"
# )
# print(df.head())
Scrape ESPN Scoreboard
Scrape live scores and game information from ESPN for multiple sports.
"""Scrape ESPN scoreboard for live scores."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from typing import List, Dict
def scrape_espn_scores(sport: str, date: str = None, timeout: float = 30.0) -> List[Dict]:
    """
    Scrape scores from ESPN scoreboard.

    Args:
        sport: Sport code ('mlb', 'nba', 'nfl', 'nhl', 'soccer')
        date: Date string YYYYMMDD, None for today
        timeout: Seconds to wait for the HTTP response before giving up
    Returns:
        List of game dictionaries with 'away_team' / 'home_team' and, when
        present on the page, 'away_score' / 'home_score' / 'status'
    Raises:
        requests.HTTPError: If ESPN answers with an error status
    """
    if date is None:
        date = datetime.now().strftime('%Y%m%d')
    url = f"https://www.espn.com/{sport}/scoreboard/_/date/{date}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # An error page would otherwise be indistinguishable from "no games"
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    games = []
    # ESPN uses React, but some data is in the initial HTML
    # Look for scoreboard containers
    for game_section in soup.find_all('section', class_='Scoreboard'):
        game = {}
        # Teams: the first name is recorded as away, the second as home
        teams = game_section.find_all('div', class_='ScoreCell__TeamName')
        if len(teams) >= 2:
            game['away_team'] = teams[0].text.strip()
            game['home_team'] = teams[1].text.strip()
        # Scores (kept as text)
        scores = game_section.find_all('div', class_='ScoreCell__Score')
        if len(scores) >= 2:
            game['away_score'] = scores[0].text.strip()
            game['home_score'] = scores[1].text.strip()
        # Status
        status = game_section.find('div', class_='ScoreCell__Time')
        if status:
            game['status'] = status.text.strip()
        # Sections without a (non-empty) team name are not games
        if game.get('away_team'):
            games.append(game)
    return games
def get_mlb_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN scoreboard games for MLB on `date` as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('mlb', date))
def get_nba_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN scoreboard games for the NBA on `date` as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('nba', date))
def get_nfl_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN scoreboard games for the NFL on `date` as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('nfl', date))
def get_nhl_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN scoreboard games for the NHL on `date` as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('nhl', date))
# Example usage: print today's scoreboards (no date argument means today),
# one live request per league
print("Today's MLB Scores:")
mlb_games = get_mlb_scores()
print(mlb_games)
print("\nToday's NBA Scores:")
nba_games = get_nba_scores()
print(nba_games)
Web Scraping in R with rvest
Scrape sports statistics tables using R and the rvest package.
# Web scraping sports stats with rvest
library(rvest)
library(dplyr)
library(stringr)
#' Read one HTML table from a web page
#'
#' @param url Address of the page to download
#' @param table_selector CSS selector identifying the table (first match is used)
#' @return Data frame of the table, with syntactically valid, unique column names
scrape_sports_table <- function(url, table_selector = "table") {
  # Download and parse the page, then pick the first node matching the selector
  node <- html_element(read_html(url), table_selector)
  # Convert the node to a data frame
  result <- html_table(node, fill = TRUE)
  # Normalise column names
  names(result) <- make.names(names(result), unique = TRUE)
  result
}
#' Download the league-wide standard batting table from Baseball Reference
#'
#' @param year Season year
#' @return Data frame with batting statistics, or NULL (with a warning) on error
scrape_bbref_batting <- function(year) {
  url <- sprintf(
    "https://www.baseball-reference.com/leagues/majors/%s-standard-batting.shtml",
    year
  )
  tryCatch({
    # The main stats table
    raw <- html_table(html_element(read_html(url), "#players_standard_batting"))
    # Drop header rows repeated inside the table body
    kept <- filter(raw, Rk != "Rk")
    # Counting stats arrive as text when header rows are mixed in
    mutate(kept, across(c(G, AB, R, H, HR, RBI, BB, SO), as.numeric))
  }, error = function(e) {
    warning(paste("Error scraping:", e$message))
    NULL
  })
}
#' Download the season passing table from Pro Football Reference
#'
#' @param year Season year
#' @return Data frame with passing statistics, or NULL (with a warning) on error
scrape_pfr_passing <- function(year) {
  url <- sprintf(
    "https://www.pro-football-reference.com/years/%s/passing.htm",
    year
  )
  tryCatch({
    raw <- html_table(html_element(read_html(url), "#passing"))
    # Drop header rows repeated inside the table body
    kept <- filter(raw, Rk != "Rk")
    mutate(kept, across(c(G, GS, Cmp, Att, Yds, TD, Int), as.numeric))
  }, error = function(e) {
    warning(paste("Error scraping:", e$message))
    NULL
  })
}
#' Download the per-game player table from Basketball Reference
#'
#' @param year Season year
#' @return Data frame with player statistics, or NULL (with a warning) on error
scrape_bkref_players <- function(year) {
  url <- sprintf(
    "https://www.basketball-reference.com/leagues/NBA_%s_per_game.html",
    year
  )
  tryCatch({
    raw <- html_table(html_element(read_html(url), "#per_game_stats"))
    # Drop header rows repeated inside the table body
    kept <- filter(raw, Rk != "Rk")
    mutate(kept, across(c(G, GS, MP, PTS, TRB, AST), as.numeric))
  }, error = function(e) {
    warning(paste("Error scraping:", e$message))
    NULL
  })
}
# Example usage
# Be respectful with rate limiting: pause 3 seconds before issuing requests
Sys.sleep(3)
# Uncomment to scrape the 2024 batting table (performs a live request)
# batting <- scrape_bbref_batting(2024)
# print(head(batting))
# Generic scraping example: any page plus a CSS selector for its table
url <- "https://www.espn.com/mlb/standings"
# standings <- scrape_sports_table(url, "table.standings")
# print(standings)
Hypothesis Testing for Player Performance
Use t-tests to determine if performance differences between players or time periods are statistically significant.
import numpy as np
import pandas as pd
from scipy import stats
def compare_player_performance(player1_stats, player2_stats, metric, alpha=0.05, equal_var=True):
    """
    Compare two players using an independent two-sample t-test.

    Prints both sample means, the t-statistic, the p-value and a verdict.

    Args:
        player1_stats: Sequence of per-game values for the first player
        player2_stats: Sequence of per-game values for the second player
        metric: Label used in the printed report
        alpha: Significance level for the printed verdict
        equal_var: True (default) runs Student's t-test, which assumes equal
            population variances; False runs Welch's t-test, which does not
    Returns:
        Tuple (t_stat, p_value)
    """
    t_stat, p_value = stats.ttest_ind(player1_stats, player2_stats, equal_var=equal_var)
    print(f"Comparing {metric}:")
    print(f" Player 1 mean: {np.mean(player1_stats):.2f}")
    print(f" Player 2 mean: {np.mean(player2_stats):.2f}")
    print(f" t-statistic: {t_stat:.3f}")
    print(f" p-value: {p_value:.4f}")
    if p_value < alpha:
        print(f" Result: Significant difference (p < {alpha})")
    else:
        print(" Result: No significant difference")
    return t_stat, p_value
def before_after_analysis(before_stats, after_stats, metric, alpha=0.05):
    """
    Paired t-test for before/after comparison (e.g., injury, trade).

    Prints both means, the change in mean, the p-value and a verdict at the
    given significance level.

    Args:
        before_stats: Sequence of values observed before the event
        after_stats: Sequence of values observed after the event; must pair
            up one-to-one with `before_stats`
        metric: Label used in the printed report
        alpha: Significance level for the printed verdict
    Returns:
        Tuple (t_stat, p_value)
    Raises:
        ValueError: If the two samples differ in length
    """
    # A paired test needs one "after" value for every "before" value
    if len(before_stats) != len(after_stats):
        raise ValueError(
            f"Paired samples must have equal length "
            f"(got {len(before_stats)} and {len(after_stats)})"
        )
    t_stat, p_value = stats.ttest_rel(before_stats, after_stats)
    print(f"Before/After Analysis - {metric}:")
    print(f" Before mean: {np.mean(before_stats):.2f}")
    print(f" After mean: {np.mean(after_stats):.2f}")
    print(f" Change: {np.mean(after_stats) - np.mean(before_stats):+.2f}")
    print(f" p-value: {p_value:.4f}")
    if p_value < alpha:
        print(f" Result: Significant change (p < {alpha})")
    else:
        print(" Result: No significant change")
    return t_stat, p_value
# Example: Compare two players' scoring using synthetic, seeded data
np.random.seed(42)
player1_pts = np.random.normal(25, 5, 50) # 50 games
player2_pts = np.random.normal(22, 6, 50)
compare_player_performance(player1_pts, player2_pts, "Points Per Game")
# Visual separator between the two reports
print("\n" + "="*50 + "\n")
# Before/after trade: two synthetic samples of 30 values each
before = np.random.normal(18, 4, 30)
after = np.random.normal(23, 5, 30)
before_after_analysis(before, after, "PPG")
Comparing Points Per Game:
  Player 1 mean: 24.52
  Player 2 mean: 22.15
  t-statistic: 2.124
  p-value: 0.0362
  Result: Significant difference (p < 0.05)
Regression Analysis for Performance Prediction
Perform multiple linear regression to identify which factors predict performance outcomes.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_absolute_error
def regression_analysis(df, target, features):
    """
    Fit an ordinary least squares model and print its summary table.

    Args:
        df: DataFrame holding the target and feature columns.
        target: Name of the dependent-variable column.
        features: List of predictor column names.

    Returns:
        The fitted statsmodels OLS results object.
    """
    # statsmodels does not add an intercept on its own, so prepend a constant
    design_matrix = sm.add_constant(df[features])
    response = df[target]

    fitted = sm.OLS(response, design_matrix).fit()
    print(fitted.summary())
    return fitted
def feature_importance_regression(model, feature_names):
    """
    Tabulate the fitted coefficients, ordered by absolute size.

    The first entry of the model's parameter vectors is assumed to be the
    intercept and is skipped. Coefficients are reported as fitted (they are
    not standardized), so magnitudes depend on each feature's scale.

    Args:
        model: Fitted results object exposing ``params``, ``bse`` and
            ``pvalues`` Series.
        feature_names: Names for the non-intercept coefficients, in order.

    Returns:
        DataFrame with Feature, Coefficient, Std Error, p-value and a
        Significant flag (p < 0.05).
    """
    slopes = model.params[1:]
    slope_errors = model.bse[1:]
    slope_pvalues = model.pvalues[1:]

    table = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': slopes,
        'Std Error': slope_errors,
        'p-value': slope_pvalues,
        'Significant': slope_pvalues < 0.05
    })
    importance = table.sort_values('Coefficient', key=abs, ascending=False)
    return importance
# Demo: simulate 100 team-seasons and check which ratings drive wins
np.random.seed(42)
n = 100
data = pd.DataFrame({
    'offensive_rating': np.random.uniform(105, 120, n),
    'defensive_rating': np.random.uniform(100, 115, n),
    'pace': np.random.uniform(95, 105, n),
    'turnovers': np.random.uniform(10, 18, n)
})

# Wins are a known linear mix of the inputs plus Gaussian noise
noise = np.random.normal(0, 3, n)
data['wins'] = (
    0.5 * data['offensive_rating']
    - 0.4 * data['defensive_rating']
    + 0.1 * data['pace']
    - 0.2 * data['turnovers']
    + noise
)

features = ['offensive_rating', 'defensive_rating', 'pace', 'turnovers']
model = regression_analysis(data, 'wins', features)

print("\nFeature Importance:")
importance_table = feature_importance_regression(model, features)
print(importance_table)
OLS Regression Results
==============================================================================
R-squared: 0.842
Adj. R-squared: 0.835
coef std err t
offensive_rating 0.498 0.032 15.562
defensive_rating -0.395 0.031 -12.742
Calculate Correlation Matrix
Calculate correlation matrices and identify strongly related variables in sports data.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
def correlation_analysis(df, columns=None, method='pearson'):
    """
    Compute a correlation matrix and draw it as an annotated heatmap.

    Args:
        df: Source DataFrame.
        columns: Columns to correlate; when falsy, every numeric column
            of ``df`` is used.
        method: Correlation method passed to DataFrame.corr:
            'pearson', 'spearman' or 'kendall'.

    Returns:
        Tuple (corr_matrix, fig) with the matrix and the matplotlib figure.
    """
    df_subset = df[columns] if columns else df.select_dtypes(include=[np.number])
    corr_matrix = df_subset.corr(method=method)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Diverging palette centred on zero with a fixed [-1, 1] colour scale
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='RdBu_r',
        center=0,
        vmin=-1,
        vmax=1,
        ax=ax,
    )
    sns.heatmap(corr_matrix, **heatmap_options)
    ax.set_title(f'{method.capitalize()} Correlation Matrix')
    plt.tight_layout()
    return corr_matrix, fig
def find_strong_correlations(corr_matrix, threshold=0.7):
    """
    List variable pairs whose absolute correlation meets a threshold.

    Only the upper triangle of the matrix is scanned, so each pair appears
    once and the diagonal is ignored.

    Args:
        corr_matrix: Square correlation DataFrame (e.g. from DataFrame.corr).
        threshold: Minimum absolute correlation for a pair to be reported.

    Returns:
        DataFrame with columns var1, var2, correlation (rounded to 3
        decimals), sorted by absolute correlation descending. When no pair
        qualifies, an empty DataFrame with those same columns is returned.
    """
    names = list(corr_matrix.columns)
    strong = []
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            corr = corr_matrix.iloc[i, j]
            if abs(corr) >= threshold:
                strong.append({
                    'var1': first,
                    'var2': names[j],
                    'correlation': round(corr, 3)
                })
    # Naming the columns explicitly keeps the sort valid when nothing matched
    result = pd.DataFrame(strong, columns=['var1', 'var2', 'correlation'])
    return result.sort_values('correlation', key=abs, ascending=False)
# Demo: five simulated box-score columns for 100 players
np.random.seed(42)
stats = pd.DataFrame({
    'points': np.random.uniform(10, 30, 100),
    'minutes': np.random.uniform(20, 40, 100),
    'fg_attempts': np.random.uniform(8, 22, 100),
    'assists': np.random.uniform(2, 10, 100),
    'turnovers': np.random.uniform(1, 5, 100)
})

# Make points depend on minutes and shot attempts so there is something to find
minutes_effect = 0.5 * stats['minutes']
attempts_effect = 0.3 * stats['fg_attempts']
stats['points'] = stats['points'] + minutes_effect + attempts_effect

corr, fig = correlation_analysis(stats)
strong_corrs = find_strong_correlations(corr, threshold=0.5)

print("Strong correlations (|r| >= 0.5):")
print(strong_corrs)
Strong correlations (|r| >= 0.5):
var1 var2 correlation
0 points minutes 0.723
1 points fg_attempts 0.581
Bayesian Estimation for Player True Talent
Use Bayesian estimation to regress small-sample statistics toward the population mean.
import numpy as np
from scipy import stats
def bayesian_batting_average(hits, at_bats, prior_mean=0.260, prior_std=0.030):
    """
    Shrink an observed batting average toward a prior with a Beta-Binomial model.

    The prior mean/std are converted to Beta(alpha, beta) parameters by the
    method of moments; hits and outs are then added to obtain the posterior.

    Args:
        hits: Number of hits observed.
        at_bats: Number of at-bats observed.
        prior_mean: Mean of the prior batting-average distribution.
        prior_std: Standard deviation of the prior distribution.

    Returns:
        Dict with 'raw_avg', 'estimated_true_avg', '95_ci' (low, high) and
        'shrinkage' (raw minus estimate), each rounded to 3 decimals.
    """
    # Method of moments: alpha + beta = m(1 - m) / var - 1
    concentration = prior_mean * (1 - prior_mean) / prior_std ** 2 - 1
    prior_alpha = prior_mean * concentration
    prior_beta = (1 - prior_mean) * concentration

    # Conjugate update: successes go to alpha, failures to beta
    outs = at_bats - hits
    post_alpha = prior_alpha + hits
    post_beta = prior_beta + outs

    post_mean = post_alpha / (post_alpha + post_beta)

    # Equal-tailed 95% credible interval
    interval = tuple(
        round(stats.beta.ppf(q, post_alpha, post_beta), 3)
        for q in (0.025, 0.975)
    )

    if at_bats > 0:
        raw_avg = hits / at_bats
    else:
        raw_avg = 0

    return {
        'raw_avg': round(raw_avg, 3),
        'estimated_true_avg': round(post_mean, 3),
        '95_ci': interval,
        'shrinkage': round(raw_avg - post_mean, 3)
    }
# Demo: two small early-season samples and one full season
players = [
    {'name': 'Hot Start', 'hits': 15, 'ab': 40},     # .375 in 40 AB
    {'name': 'Slow Start', 'hits': 6, 'ab': 40},     # .150 in 40 AB
    {'name': 'Full Season', 'hits': 170, 'ab': 550}  # .309 in 550 AB
]

print("Bayesian Batting Average Estimates\n")
for p in players:
    result = bayesian_batting_average(p['hits'], p['ab'])
    raw = result['raw_avg']
    estimate = result['estimated_true_avg']
    print(f"{p['name']}:")
    print(f" Raw: {raw:.3f}, Estimated: {estimate:.3f}")
    print(f" 95% CI: {result['95_ci']}")
    print(f" Shrinkage: {result['shrinkage']}\n")
Bayesian Batting Average Estimates Hot Start: Raw: 0.375, Estimated: 0.312 95% CI: (0.238, 0.392) Shrinkage: 0.063 Slow Start: Raw: 0.150, Estimated: 0.214 95% CI: (0.148, 0.289) Shrinkage: -0.064
ANOVA for Group Comparisons
Use ANOVA to test if there are significant differences between groups (e.g., positions, teams).
library(dplyr)
library(broom)
# One-way ANOVA: Compare performance across positions
perform_anova <- function(data, value_col, group_col) {
  # Fit a one-way ANOVA of `value_col` on `group_col` (both given as column
  # name strings), print the ANOVA table and Tukey HSD comparisons, and
  # return the fitted aov model.
  formula <- as.formula(paste(value_col, "~", group_col))
  model <- aov(formula, data = data)

  cat("ANOVA Results:\n")
  anova_table <- summary(model)
  print(anova_table)

  # Pairwise group comparisons
  cat("\nTukey HSD Post-hoc Test:\n")
  print(TukeyHSD(model))

  model
}
# Effect size (eta-squared)
calculate_eta_squared <- function(aov_model) {
  # Eta-squared: the first term's sum of squares as a share of the total
  # sum of squares in the ANOVA table. Prints and returns the value.
  anova_table <- summary(aov_model)[[1]]
  sums_of_squares <- anova_table$`Sum Sq`
  eta_sq <- sums_of_squares[1] / sum(sums_of_squares)
  cat("\nEffect Size (eta-squared):", round(eta_sq, 3))
  eta_sq
}
# Demo: simulated points per game for 30 players at each of three positions
set.seed(42)
guard_ppg <- rnorm(30, 18, 5)
forward_ppg <- rnorm(30, 15, 4)
center_ppg <- rnorm(30, 12, 4)

players <- data.frame(
  position = rep(c("Guard", "Forward", "Center"), each = 30),
  ppg = c(guard_ppg, forward_ppg, center_ppg)
)

model <- perform_anova(players, "ppg", "position")
calculate_eta_squared(model)
ANOVA Results:
Df Sum Sq Mean Sq F value Pr(>F)
position 2 582.3 291.2 14.53 <0.001 ***
Residuals 87 1743.5 20.0
Effect Size (eta-squared): 0.250
Calculate Percentiles and Rankings
Calculate percentile rankings and create composite player rankings from multiple metrics.
import pandas as pd
import numpy as np
from scipy import stats
def calculate_percentiles(df, columns, method='rank'):
    """
    Append a ``<col>_pctl`` percentile column (0-100) for each requested column.

    Args:
        df: Source DataFrame (not modified).
        columns: Columns to convert to percentiles.
        method: 'rank' for rank-based percentiles, or 'distribution' for
            the normal CDF of each value's z-score. Any other value adds
            no columns.

    Returns:
        Copy of ``df`` with the percentile columns added.
    """
    result = df.copy()
    use_rank = method == 'rank'
    use_distribution = method == 'distribution'

    for col in columns:
        target = f'{col}_pctl'
        values = df[col]
        if use_rank:
            result[target] = values.rank(pct=True) * 100
        elif use_distribution:
            standardized = stats.zscore(values)
            result[target] = stats.norm.cdf(standardized) * 100
    return result
def create_player_rankings(df, metrics, weights=None, ascending=None):
    """
    Rank players by a weighted blend of per-metric percentiles.

    Args:
        df: DataFrame with one row per player.
        metrics: Metric columns that feed the composite score.
        weights: Dict of metric -> weight; defaults to equal weights.
        ascending: Dict of metric -> bool. True marks a metric where lower
            is better, so its percentile is inverted. Metrics missing from
            the dict are treated as higher-is-better.

    Returns:
        Copy of ``df`` with ``<metric>_pctl``, ``composite_score`` and
        ``overall_rank`` columns, sorted by ``overall_rank``.
    """
    if weights is None:
        equal_share = 1 / len(metrics)
        weights = {m: equal_share for m in metrics}
    if ascending is None:
        ascending = {m: False for m in metrics}

    df_pctl = df.copy()
    composite = 0
    for metric in metrics:
        pctl_col = f'{metric}_pctl'
        pctl = df[metric].rank(pct=True) * 100
        lower_is_better = ascending.get(metric, False)
        df_pctl[pctl_col] = 100 - pctl if lower_is_better else pctl
        composite = composite + df_pctl[pctl_col] * weights[metric]

    df_pctl['composite_score'] = composite
    ranks = df_pctl['composite_score'].rank(ascending=False)
    df_pctl['overall_rank'] = ranks.astype(int)
    return df_pctl.sort_values('overall_rank')
# Demo: 50 simulated players with four box-score metrics
np.random.seed(42)
players = pd.DataFrame({
    'player': [f'Player_{i}' for i in range(50)],
    'points': np.random.uniform(10, 28, 50),
    'rebounds': np.random.uniform(3, 12, 50),
    'assists': np.random.uniform(2, 10, 50),
    'turnovers': np.random.uniform(1, 5, 50)
})

# Scoring counts most; turnovers are the one metric where lower is better
metric_weights = {'points': 0.4, 'rebounds': 0.2, 'assists': 0.25, 'turnovers': 0.15}
ranked = create_player_rankings(
    players,
    metrics=['points', 'rebounds', 'assists', 'turnovers'],
    weights=metric_weights,
    ascending={'turnovers': True}
)

display_cols = ['player', 'points', 'rebounds', 'assists', 'composite_score', 'overall_rank']
print("Top 10 Players:")
print(ranked[display_cols].head(10))
Top 10 Players:
player points rebounds assists composite_score overall_rank
23 Player_23 26.85 9.23 8.95 89.42 1
8 Player_8 24.12 10.82 7.34 85.18 2
Time Series Decomposition
Decompose time series data to separate trend, seasonal patterns, and noise in performance.
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
def decompose_performance(data, period=10, model='additive'):
    """
    Split a performance series into trend, seasonal and residual parts and plot them.

    Args:
        data: Series of observations; if a DataFrame is given, its first
            column is used.
        period: Length of the repeating cycle, in observations.
        model: Decomposition model passed to seasonal_decompose.

    Returns:
        Tuple (result, fig) with the decomposition result and the
        four-panel matplotlib figure.
    """
    series = data.iloc[:, 0] if isinstance(data, pd.DataFrame) else data
    result = seasonal_decompose(series, model=model, period=period)

    fig, axes = plt.subplots(4, 1, figsize=(12, 10))
    panels = [
        (result.observed, 'Observed'),
        (result.trend, 'Trend'),
        (result.seasonal, 'Seasonal'),
        (result.resid, 'Residual'),
    ]
    for axis, (component, title) in zip(axes, panels):
        component.plot(ax=axis, title=title)
    plt.tight_layout()
    return result, fig
def detect_trend(data, window=20):
    """
    Classify a series as trending up or down from a line fitted to its rolling mean.

    Args:
        data: pandas Series of per-game values.
        window: Rolling-mean window size.

    Returns:
        Dict with 'direction' ('increasing' or 'decreasing'), 'slope'
        (rounded to 4 decimals), 'strength' (|slope| divided by the std of
        the rolling mean, rounded to 3 decimals) and a text 'interpretation'.
    """
    # The first window-1 rolling means are NaN and are dropped before fitting
    smoothed = data.rolling(window=window).mean().dropna()
    y = smoothed.values
    x = np.arange(len(smoothed))

    slope, intercept = np.polyfit(x, y, 1)
    magnitude = abs(slope)

    if slope > 0:
        trend = 'increasing'
    else:
        trend = 'decreasing'

    return {
        'direction': trend,
        'slope': round(slope, 4),
        'strength': round(magnitude / np.std(y), 3),
        'interpretation': f"Performance is {trend} at {magnitude:.3f} per game"
    }
# Demo: an 82-game season built from a baseline, a slow climb,
# a repeating cycle and random game-to-game noise
np.random.seed(42)
games = 82
base = 20
trend = np.linspace(0, 3, games)
seasonal = 2 * np.sin(np.linspace(0, 8 * np.pi, games))
noise = np.random.normal(0, 3, games)
performance = pd.Series(base + trend + seasonal + noise)

result, fig = decompose_performance(performance, period=10)
trend_info = detect_trend(performance)

print("Trend Analysis:")
for line in (
    f" Direction: {trend_info['direction']}",
    f" {trend_info['interpretation']}",
):
    print(line)
Trend Analysis: Direction: increasing Performance is increasing at 0.041 per game
Bootstrap Confidence Intervals
Use bootstrap resampling to calculate confidence intervals for any statistic.
import numpy as np
import pandas as pd
def bootstrap_mean_ci(data, n_bootstrap=10000, ci=0.95):
    """
    Estimate a percentile-bootstrap confidence interval for the mean.

    Args:
        data: 1-D array-like of observations.
        n_bootstrap: Number of resamples to draw.
        ci: Confidence level, e.g. 0.95.

    Returns:
        Dict with 'mean' (of the original data), 'ci_low', 'ci_high' and
        'ci_width'.
    """
    sample_size = len(data)
    # Each resample has the same size as the data and is drawn with replacement
    resampled_means = [
        np.mean(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(n_bootstrap)
    ]

    tail = (1 - ci) / 2
    lower = np.percentile(resampled_means, tail * 100)
    upper = np.percentile(resampled_means, (1 - tail) * 100)
    return {
        'mean': np.mean(data),
        'ci_low': lower,
        'ci_high': upper,
        'ci_width': upper - lower
    }
def bootstrap_statistic(data, statistic_func, n_bootstrap=10000, ci=0.95):
    """
    Percentile-bootstrap confidence interval for an arbitrary statistic.

    Args:
        data: Either a 1-D array, or a tuple of equal-length arrays that
            are resampled together with shared row indices (e.g. for a
            correlation).
        statistic_func: Callable applied to each resample (and to the
            original data) that returns a scalar.
        n_bootstrap: Number of resamples to draw.
        ci: Confidence level, e.g. 0.95.

    Returns:
        Dict with 'estimate' (statistic on the original data), 'ci_low'
        and 'ci_high'.
    """
    paired = isinstance(data, tuple)
    n = len(data[0]) if paired else len(data)

    boot_stats = []
    for _ in range(n_bootstrap):
        if not paired:
            sample = np.random.choice(data, size=n, replace=True)
        else:
            # One shared index draw keeps rows aligned across the arrays
            idx = np.random.choice(n, size=n, replace=True)
            sample = tuple(d[idx] for d in data)
        boot_stats.append(statistic_func(sample))

    tail = (1 - ci) / 2
    lower_q, upper_q = tail * 100, (1 - tail) * 100
    return {
        'estimate': statistic_func(data),
        'ci_low': np.percentile(boot_stats, lower_q),
        'ci_high': np.percentile(boot_stats, upper_q)
    }
# Demo: 50 simulated games of scoring for one player
np.random.seed(42)
player_pts = np.random.normal(22, 6, 50)

# Interval for the mean
mean_ci = bootstrap_mean_ci(player_pts)
mean_low, mean_high = mean_ci['ci_low'], mean_ci['ci_high']
print(f"Points Per Game:")
print(f" Mean: {mean_ci['mean']:.2f}")
print(f" 95% CI: [{mean_low:.2f}, {mean_high:.2f}]")

# Interval for the median via the generic helper
median_ci = bootstrap_statistic(player_pts, np.median)
median_low, median_high = median_ci['ci_low'], median_ci['ci_high']
print(f"\nMedian PPG:")
print(f" Estimate: {median_ci['estimate']:.2f}")
print(f" 95% CI: [{median_low:.2f}, {median_high:.2f}]")

# Assists that partly track scoring, for the correlation interval
assists = player_pts * 0.3 + np.random.normal(5, 2, 50)
def corr_func(data):
    """Return the Pearson correlation between the first two arrays in ``data``."""
    first, second = data[0], data[1]
    correlation_matrix = np.corrcoef(first, second)
    return correlation_matrix[0, 1]
# Resample points and assists together so each game's pair stays intact
paired_samples = (player_pts, assists)
corr_ci = bootstrap_statistic(paired_samples, corr_func)
corr_low, corr_high = corr_ci['ci_low'], corr_ci['ci_high']
print(f"\nPoints-Assists Correlation:")
print(f" r = {corr_ci['estimate']:.3f}")
print(f" 95% CI: [{corr_low:.3f}, {corr_high:.3f}]")
Points Per Game: Mean: 22.14 95% CI: [20.42, 23.89] Median PPG: Estimate: 21.87 95% CI: [20.08, 24.12] Points-Assists Correlation: r = 0.724 95% CI: [0.562, 0.841]
Mixed Effects Model for Repeated Measures
Fit mixed effects models to analyze repeated measurements (multiple games per player) while accounting for player-level variation.
library(lme4)
library(lmerTest)
library(dplyr)
# Mixed effects model for player performance
# Accounts for repeated measurements (games) within players
fit_mixed_model <- function(data) {
  # Fit points on minutes, home_game and rest_days with a random intercept
  # per player, print the summary and variance components, and return the
  # fitted model.
  model <- lmer(
    points ~ minutes + home_game + rest_days + (1 | player_id),
    data = data
  )

  cat("Mixed Effects Model Summary:\n")
  print(summary(model))

  # Row 1 is the player intercept variance, row 2 the residual variance
  variances <- as.data.frame(VarCorr(model))$vcov
  between_var <- variances[1]
  within_var <- variances[2]

  cat("\nVariance Components:\n")
  cat(" Between-player variance:", round(between_var, 2), "\n")
  cat(" Within-player (residual) variance:", round(within_var, 2), "\n")

  # ICC: share of total variance that sits between players
  cat(" ICC:", round(between_var / sum(variances), 3), "\n")

  model
}
# Example data: 20 players x 40 games, each player with a stable baseline
set.seed(42)
n_players <- 20
games_per_player <- 40

# One baseline ability per player, drawn once and looked up by player_id.
# expand.grid varies player_id fastest (1..20, 1..20, ...), so repeating the
# abilities with rep(..., each = games_per_player) would hand the same value
# to 40 consecutive rows that belong to different players.
ability_by_player <- rnorm(n_players, 20, 5)

data <- expand.grid(
  player_id = 1:n_players,
  game = 1:games_per_player
) %>%
  mutate(
    # Player-specific baseline
    player_ability = ability_by_player[player_id],
    minutes = round(runif(n(), 20, 38)),
    home_game = sample(0:1, n(), replace = TRUE),
    rest_days = sample(1:4, n(), replace = TRUE),
    # Performance with player random effect
    points = player_ability + 0.5 * minutes + 2 * home_game +
      0.5 * rest_days + rnorm(n(), 0, 4)
  )

model <- fit_mixed_model(data)
Mixed Effects Model Summary:
Fixed effects:
Estimate Std. Error t value
(Intercept) 8.234 1.152 7.147
minutes 0.498 0.028 17.786
home_game 1.982 0.318 6.233
Variance Components:
Between-player variance: 24.12
Within-player variance: 15.89
ICC: 0.603
Handle Missing Values in Sports Data
Implement smart missing value handling that chooses appropriate strategies based on column type.
import pandas as pd
import numpy as np
def handle_missing_values(df, strategy='smart'):
    """
    Handle missing values in sports statistics.

    Strategies:
        - 'smart': Per column, skip (with a printed warning) when more than
          50% is missing; otherwise fill numeric columns whose name ends in
          '_pct', '_rate' or '_avg' with the column median, other numeric
          columns with 0, and non-numeric columns with the most frequent
          value.
        - 'drop': Drop rows with any missing value.
        - 'fill_zero': Fill every missing value with 0.
    Any other strategy returns an unmodified copy.

    Args:
        df: Source DataFrame (never modified).
        strategy: One of the strategies above.

    Returns:
        A cleaned copy of ``df``.
    """
    df_clean = df.copy()

    if strategy == 'smart':
        n_rows = len(df_clean)
        if n_rows == 0:
            # Nothing to fill, and the missing percentage would divide by zero
            return df_clean

        for col in df_clean.columns:
            column = df_clean[col]
            n_missing = column.isna().sum()
            missing_pct = n_missing / n_rows * 100
            if missing_pct > 50:
                print(f"Warning: {col} has {missing_pct:.1f}% missing")
                continue
            if n_missing == 0:
                continue

            # Accept every numeric width (int32, float32, ...), not just the
            # 64-bit dtypes; booleans are treated as categorical.
            is_numeric = (
                pd.api.types.is_numeric_dtype(column)
                and not pd.api.types.is_bool_dtype(column)
            )
            if is_numeric:
                # Rate-style stats get the median; counting stats get 0
                if str(col).endswith(('_pct', '_rate', '_avg')):
                    fill_value = column.median()
                else:
                    fill_value = 0
            else:
                # Categorical: most frequent value
                fill_value = column.mode()[0]

            # Assign back rather than using fillna(inplace=True) on a column
            # selection, which does not update the frame under Copy-on-Write.
            df_clean[col] = column.fillna(fill_value)
    elif strategy == 'drop':
        df_clean = df_clean.dropna()
    elif strategy == 'fill_zero':
        df_clean = df_clean.fillna(0)

    return df_clean
# Demo: a counting stat, a rate stat and a categorical column, each with gaps
df = pd.DataFrame({
    'player': ['A', 'B', 'C', 'D', 'E'],
    'points': [20, np.nan, 15, 25, np.nan],
    'fg_pct': [0.45, 0.52, np.nan, 0.48, 0.50],
    'team': ['LAL', np.nan, 'BOS', 'MIA', 'GSW']
})

print("Before:")
print(df)

cleaned = handle_missing_values(df, 'smart')
print("\nAfter smart cleaning:")
print(cleaned)
Before: player points fg_pct team 0 A 20.0 0.45 LAL 1 B NaN 0.52 NaN After smart cleaning: player points fg_pct team 0 A 20.0 0.450 LAL 1 B 0.0 0.520 LAL
Standardize Player Names
Standardize player names across different formats and find potential duplicates.
import re
import pandas as pd
from difflib import SequenceMatcher
def standardize_name(name):
    """
    Standardize player name format.

    Handles "Last, First" -> "First Last" (any further comma-separated
    parts, such as a suffix, are kept after the last name), strips
    characters other than ASCII letters, whitespace, '.' and '-', collapses
    spacing, title-cases the result and normalizes Jr/Sr/II/III suffixes.

    Args:
        name: Raw name string, or a missing value.

    Returns:
        The standardized name; missing values are returned unchanged.
    """
    if pd.isna(name):
        return name

    # Handle "Last, First[, Suffix]" format
    if ',' in name:
        last, first, *extras = (part.strip() for part in name.split(','))
        name = ' '.join([first, last, *extras])

    # Keep only letters, whitespace, periods and hyphens
    name = re.sub(r'[^a-zA-Z\s.\-]', '', name)

    # Standardize spacing
    name = ' '.join(name.split())

    # Capitalize properly
    name = name.title()

    # Fix common suffixes
    name = name.replace('Jr.', 'Jr').replace('Sr.', 'Sr')
    # Title-casing turns roman numerals into "Iii"/"Ii". Restore them only
    # when they are a whole word, so names like "Iida" are left intact.
    name = re.sub(r'\bIii\b', 'III', name)
    name = re.sub(r'\bIi\b', 'II', name)
    return name
def find_similar_names(name, name_list, threshold=0.85):
    """
    Return candidates whose case-insensitive similarity to ``name`` meets a threshold.

    Args:
        name: Name to look up.
        name_list: Iterable of candidate names.
        threshold: Minimum SequenceMatcher ratio (0-1) to count as a match.

    Returns:
        List of (candidate, ratio) tuples, best match first.
    """
    target = name.lower()
    scored = [
        (candidate, SequenceMatcher(None, target, candidate.lower()).ratio())
        for candidate in name_list
    ]
    close_enough = [pair for pair in scored if pair[1] >= threshold]
    close_enough.sort(key=lambda pair: pair[1], reverse=True)
    return close_enough
def create_name_mapping(df, name_col, reference_names):
    """
    Create mapping from messy names to standardized names.

    Each distinct non-missing name is standardized and compared against
    ``reference_names``; the best fuzzy match wins, otherwise the
    standardized form itself is used.

    Args:
        df: DataFrame containing the raw names.
        name_col: Column of ``df`` holding the names.
        reference_names: Canonical names to match against.

    Returns:
        Dict of raw name -> canonical (or standardized) name. Missing
        values in the column are not included.
    """
    mapping = {}
    # Missing values cannot be fuzzy-matched (they have no .lower()), so
    # drop them before iterating instead of failing on the first NaN.
    for name in df[name_col].dropna().unique():
        std_name = standardize_name(name)
        matches = find_similar_names(std_name, reference_names)
        mapping[name] = matches[0][0] if matches else std_name
    return mapping
# Demo: the same two players written in several different ways
raw_names = [
    'James, LeBron',
    'LeBron James',
    'lebron james',
    'JAMES LEBRON',
    'Stephen Curry Jr.',
    'curry, stephen'
]
names = pd.Series(raw_names)

print("Standardized names:")
for name in names:
    cleaned = standardize_name(name)
    print(f" {name} -> {cleaned}")
Standardized names: James, LeBron -> Lebron James LeBron James -> Lebron James lebron james -> Lebron James JAMES LEBRON -> James Lebron Stephen Curry Jr. -> Stephen Curry Jr curry, stephen -> Stephen Curry
Detect and Handle Outliers
Detect and handle statistical outliers in sports data using IQR and Z-score methods.
import pandas as pd
import numpy as np
from scipy import stats
def detect_outliers(df, cols, method='iqr', threshold=1.5):
    """
    Detect outliers using IQR or Z-score method.

    Args:
        df: Source DataFrame.
        cols: Numeric columns to check.
        method: 'iqr' flags values beyond ``threshold`` * IQR outside the
            quartiles; 'zscore' flags values whose absolute z-score
            exceeds ``threshold``.
        threshold: Multiplier (IQR) or cutoff (z-score). The default of
            1.5 is the usual IQR fence; pass a larger value such as 3 when
            using 'zscore'.

    Returns:
        Boolean DataFrame indexed like ``df`` with one column per entry in
        ``cols``. Missing values are never flagged.

    Raises:
        ValueError: If ``method`` is not 'iqr' or 'zscore'.
    """
    if method not in ('iqr', 'zscore'):
        raise ValueError(f"Unknown method {method!r}; expected 'iqr' or 'zscore'")

    outliers = pd.DataFrame(index=df.index)
    for col in cols:
        series = df[col]
        if method == 'iqr':
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            outliers[col] = ((series < lower) | (series > upper)).to_numpy()
        else:
            # Score only the observed values, then write the flags back by
            # position so rows with missing data end up False instead of NaN
            # and the column keeps a boolean dtype.
            present = series.notna().to_numpy()
            flags = np.zeros(len(series), dtype=bool)
            if present.any():
                observed = series.to_numpy(dtype=float)[present]
                flags[present] = np.abs(stats.zscore(observed)) > threshold
            outliers[col] = flags
    return outliers
def handle_outliers(df, cols, method='cap', lower_pct=0.01, upper_pct=0.99):
    """
    Treat outliers in the given columns by capping, removing or transforming.

    Args:
        df: Source DataFrame (not modified).
        cols: Columns to process.
        method: 'cap' clips values to the lower/upper percentiles,
            'remove' drops rows flagged by detect_outliers with its
            default settings, 'log' applies log1p to the column.
        lower_pct: Lower quantile used by 'cap'.
        upper_pct: Upper quantile used by 'cap'.

    Returns:
        A processed copy of ``df``.
    """
    df_clean = df.copy()
    for col in cols:
        if method == 'log':
            # log1p keeps zeros finite
            df_clean[col] = np.log1p(df_clean[col])
            continue
        if method == 'remove':
            flagged = detect_outliers(df[[col]], [col])[col]
            df_clean = df_clean[~flagged]
            continue
        if method == 'cap':
            # Bounds come from the original data, not the running copy
            floor = df[col].quantile(lower_pct)
            ceiling = df[col].quantile(upper_pct)
            df_clean[col] = df_clean[col].clip(floor, ceiling)
    return df_clean
# Demo: 95 ordinary scoring lines plus five implausibly large ones
np.random.seed(42)
typical_points = np.random.normal(15, 5, 95)
extreme_points = [80, 85, 90, 95, 100]
df = pd.DataFrame({
    'player': [f'Player_{i}' for i in range(100)],
    'points': np.concatenate([typical_points, extreme_points]),
    'rebounds': np.random.normal(7, 2, 100)
})

print("Original stats:")
print(df['points'].describe())

# Flag values outside the IQR fences
outliers = detect_outliers(df, ['points'], method='iqr')
n_flagged = outliers['points'].sum()
print(f"\nOutliers detected: {n_flagged}")

# Clip to the 1st and 99th percentiles
df_capped = handle_outliers(df, ['points'], method='cap')
print("\nAfter capping:")
print(df_capped['points'].describe())
Original stats: count 100.000 mean 17.425 max 100.000 Outliers detected: 5 After capping: count 100.000 mean 15.892 max 26.123
Merge Data from Multiple Sources
Merge player data from multiple sources and resolve conflicting values.
import pandas as pd
def merge_player_data(primary_df, secondary_df, join_cols, how='left',
                      suffix=('_primary', '_secondary')):
    """
    Join two player tables on shared key columns.

    Non-key columns present in both tables are kept side by side and
    tagged with ``suffix`` so they can be reconciled afterwards.

    Args:
        primary_df: Left-hand DataFrame.
        secondary_df: Right-hand DataFrame.
        join_cols: Column name or list of names to join on.
        how: Join type passed to pandas merge.
        suffix: (left, right) suffixes for overlapping column names.

    Returns:
        The merged DataFrame.
    """
    return primary_df.merge(
        secondary_df,
        on=join_cols,
        how=how,
        suffixes=suffix,
    )
def resolve_conflicts(df, primary_suffix='_primary', secondary_suffix='_secondary',
                      strategy='primary'):
    """
    Resolve conflicting values from merged data.

    For every column ending in ``primary_suffix`` that has a counterpart
    ending in ``secondary_suffix``, a single column named after the shared
    base is written and the two suffixed columns are dropped.

    Strategies:
        - 'primary': Keep primary source
        - 'secondary': Keep secondary source
        - 'average': Average numeric values (missing values are skipped)
        - 'non_null': Use primary, falling back to secondary when missing

    Args:
        df: Merged DataFrame (not modified).
        primary_suffix: Suffix marking primary-source columns.
        secondary_suffix: Suffix marking secondary-source columns.
        strategy: One of the strategies above.

    Returns:
        A copy of ``df`` with each conflicting pair collapsed into one column.

    Raises:
        ValueError: If ``strategy`` is not recognized. Without this check
            both source columns would be dropped with no replacement.
    """
    valid_strategies = ('primary', 'secondary', 'average', 'non_null')
    if strategy not in valid_strategies:
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {valid_strategies}"
        )

    df_resolved = df.copy()

    # Find columns with conflicts
    primary_cols = [
        c for c in df.columns
        if isinstance(c, str) and c.endswith(primary_suffix)
    ]
    for pcol in primary_cols:
        # Strip only the trailing suffix; str.replace would also remove the
        # same text from the middle of a column name.
        base_name = pcol[:len(pcol) - len(primary_suffix)]
        scol = base_name + secondary_suffix
        if scol not in df.columns:
            continue

        if strategy == 'primary':
            df_resolved[base_name] = df[pcol]
        elif strategy == 'secondary':
            df_resolved[base_name] = df[scol]
        elif strategy == 'average':
            df_resolved[base_name] = df[[pcol, scol]].mean(axis=1)
        else:  # 'non_null'
            df_resolved[base_name] = df[pcol].fillna(df[scol])

        # Drop original columns
        df_resolved = df_resolved.drop([pcol, scol], axis=1)
    return df_resolved
# Demo: two sources that overlap on players 1-3 and disagree on points
source1 = pd.DataFrame({
    'player_id': [1, 2, 3, 4],
    'name': ['Player A', 'Player B', 'Player C', 'Player D'],
    'points': [25.0, 20.0, 15.0, None],
    'team': ['LAL', 'BOS', 'MIA', 'GSW']
})
source2 = pd.DataFrame({
    'player_id': [1, 2, 3, 5],
    'name': ['Player A', 'Player B', 'Player C', 'Player E'],
    'points': [24.5, 21.0, 15.5, 18.0],
    'assists': [8, 5, 3, 6]
})

# Join on the identifying columns, then average the disputed points
key_columns = ['player_id', 'name']
merged = merge_player_data(source1, source2, key_columns)
resolved = resolve_conflicts(merged, strategy='average')
print(resolved)
player_id name team points assists 0 1 Player A LAL 24.75 8.0 1 2 Player B BOS 20.50 5.0
Clean and Transform Data with dplyr
Use dplyr to clean and transform sports data with a comprehensive pipeline.
library(dplyr)
library(tidyr)
library(stringr)
# Clean sports data pipeline
clean_player_stats <- function(df) {
  # Clean a player stats table: normalize column names, keep one row per
  # player_id, fill missing values, add per-game scoring and efficiency,
  # drop impossible rows, and sort by points per game (highest first).
  #
  # Expects columns player_id, games_played, total_pts, reb, ast, tov and
  # fg_pct (after lower-casing). A separate `pts` column is used for the
  # efficiency formula when present; otherwise total_pts is used.
  cleaned <- df %>%
    # Standardize column names
    rename_with(tolower) %>%
    rename_with(~ str_replace_all(., " ", "_")) %>%
    # Remove duplicates
    distinct(player_id, .keep_all = TRUE) %>%
    # Handle missing values
    mutate(across(where(is.numeric), ~ replace_na(., 0))) %>%
    mutate(across(where(is.character), ~ replace_na(., "Unknown")))

  # The original formula referenced `pts`, which tables that only carry
  # `total_pts` do not have; fall back instead of failing on a missing column.
  scoring <- if ("pts" %in% names(cleaned)) cleaned$pts else cleaned$total_pts

  cleaned %>%
    # Calculate derived columns
    mutate(
      pts_per_game = total_pts / games_played,
      efficiency = (scoring + reb + ast - tov) / games_played
    ) %>%
    # Remove impossible values (also discards the Inf/NaN rows produced by
    # dividing by zero games)
    filter(
      games_played > 0,
      fg_pct >= 0 & fg_pct <= 1,
      pts_per_game >= 0
    ) %>%
    # Arrange by performance
    arrange(desc(pts_per_game))
}
# Pivot data from wide to long format
stats_to_long <- function(df, id_cols, stat_cols) {
  # Reshape the columns named in `stat_cols` into two columns:
  # `stat_type` (the former column name) and `value`.
  # `id_cols` is accepted but not used by the reshape.
  pivot_longer(
    df,
    cols = all_of(stat_cols),
    names_to = "stat_type",
    values_to = "value"
  )
}
# Demo: ten simulated players with season totals
set.seed(42)
n_rows <- 10
raw_data <- data.frame(
  player_id = 1:n_rows,
  player_name = paste("Player", LETTERS[1:n_rows]),
  games_played = sample(50:82, n_rows),
  total_pts = round(runif(n_rows, 500, 2000)),
  reb = round(runif(n_rows, 200, 600)),
  ast = round(runif(n_rows, 100, 500)),
  tov = round(runif(n_rows, 50, 200)),
  fg_pct = round(runif(n_rows, 0.4, 0.55), 3)
)

# Run the cleaning pipeline and show the top rows
clean_data <- clean_player_stats(raw_data)
top_rows <- head(clean_data)
print(top_rows)
player_id player_name games_played pts_per_game efficiency 1 3 Player C 75 25.33 8.42 2 7 Player G 68 23.88 7.91
Validate Data Quality
Implement a data quality validator to check for common issues in sports datasets.
import pandas as pd
import numpy as np
class DataValidator:
    """
    Validate sports data quality.

    Each ``check_*`` method appends any problems it finds to ``self.issues``
    (dicts with 'type', 'column' and 'detail' keys) and returns ``self`` so
    checks can be chained. ``report()`` prints and returns the findings.
    """

    # Maps a required relation to the comparison that detects a violation.
    # Comparisons involving NaN are False, so missing values never count.
    _VIOLATION_CHECKS = {
        '<=': lambda left, right: left > right,
        '<': lambda left, right: left >= right,
        '>=': lambda left, right: left < right,
        '>': lambda left, right: left <= right,
    }

    def __init__(self, df):
        self.df = df
        self.issues = []

    def check_missing(self, threshold=0.05):
        """Flag columns whose fraction of missing values exceeds ``threshold``."""
        for col in self.df.columns:
            # mean() of the null mask is the missing fraction; on an empty
            # frame it is NaN, which never exceeds the threshold.
            missing_pct = self.df[col].isna().mean()
            if missing_pct > threshold:
                self.issues.append({
                    'type': 'missing_values',
                    'column': col,
                    'detail': f'{missing_pct:.1%} missing'
                })
        return self

    def check_range(self, col, min_val, max_val):
        """Flag ``col`` if any value lies outside [min_val, max_val]."""
        values = self.df[col]
        out_of_range = ((values < min_val) | (values > max_val)).sum()
        if out_of_range > 0:
            self.issues.append({
                'type': 'out_of_range',
                'column': col,
                'detail': f'{out_of_range} values outside [{min_val}, {max_val}]'
            })
        return self

    def check_duplicates(self, cols):
        """Flag rows that repeat an earlier row on the ``cols`` subset."""
        dups = self.df.duplicated(subset=cols).sum()
        if dups > 0:
            self.issues.append({
                'type': 'duplicates',
                'column': str(cols),
                'detail': f'{dups} duplicate rows'
            })
        return self

    def check_consistency(self, col1, col2, relation):
        """
        Check that ``col1 <relation> col2`` holds on every row.

        Args:
            col1: Left-hand column.
            col2: Right-hand column.
            relation: One of '<=', '<', '>=', '>'.

        Raises:
            ValueError: If ``relation`` is not supported.
        """
        try:
            is_violation = self._VIOLATION_CHECKS[relation]
        except KeyError:
            supported = ', '.join(sorted(self._VIOLATION_CHECKS))
            raise ValueError(
                f"Unsupported relation {relation!r}; expected one of: {supported}"
            ) from None

        violations = is_violation(self.df[col1], self.df[col2]).sum()
        if violations > 0:
            self.issues.append({
                'type': 'inconsistency',
                'column': f'{col1} {relation} {col2}',
                'detail': f'{violations} violations'
            })
        return self

    def report(self):
        """Print a summary of recorded issues and return the issue list."""
        if not self.issues:
            print("✓ All validation checks passed!")
        else:
            print(f"Found {len(self.issues)} issues:")
            for issue in self.issues:
                print(f" - [{issue['type']}] {issue['column']}: {issue['detail']}")
        return self.issues
# Demo: a small table with one duplicate id, one impossible percentage
# and one missing minutes value
df = pd.DataFrame({
    'player_id': [1, 2, 3, 4, 4],
    'fg_made': [200, 150, 180, 250, 250],
    'fg_attempted': [400, 300, 350, 450, 450],
    'fg_pct': [0.50, 0.50, 0.51, 1.20, 0.56],
    'minutes': [2500, None, 2200, 2800, 2800]
})

validator = DataValidator(df)
(
    validator
    .check_missing(threshold=0.1)
    .check_range('fg_pct', 0, 1)
    .check_duplicates(['player_id'])
    .check_consistency('fg_made', 'fg_attempted', '<=')
    .report()
)
Found 3 issues: - [missing_values] minutes: 20.0% missing - [out_of_range] fg_pct: 1 values outside [0, 1] - [duplicates] ['player_id']: 1 duplicate rows
Convert Between Rate and Counting Stats
Convert between rate stats and counting stats with per-game, per-minute, and pace adjustments.
import pandas as pd
import numpy as np
def per_game_to_totals(df, per_game_cols, games_col='games_played'):
    """
    Scale per-game averages up to season totals.

    Each result column is named by replacing '_per_game' with '_total' in
    the source column name.

    Args:
        df: Source DataFrame (not modified).
        per_game_cols: Per-game columns to convert.
        games_col: Column holding games played.

    Returns:
        Copy of ``df`` with the total columns added.
    """
    games = df[games_col]
    totals = {
        col.replace('_per_game', '_total'): df[col] * games
        for col in per_game_cols
    }
    return df.copy().assign(**totals)
def totals_to_per_game(df, total_cols, games_col='games_played'):
    """
    Divide season totals by games played to get per-game averages.

    Each result column is named by replacing '_total' with '_per_game' in
    the source column name (e.g. 'points_total' -> 'points_per_game').

    Args:
        df: Source DataFrame (not modified).
        total_cols: Season-total columns to convert.
        games_col: Column holding games played.

    Returns:
        Copy of ``df`` with the per-game columns added.
    """
    games = df[games_col]
    averages = {
        col.replace('_total', '_per_game'): df[col] / games
        for col in total_cols
    }
    return df.copy().assign(**averages)
def per_minute_rates(df, stat_cols, minutes_col='minutes_played', per=36):
    """
    Express stats as a rate per ``per`` minutes played.

    Args:
        df: Source DataFrame (not modified).
        stat_cols: Stat columns to convert.
        minutes_col: Column holding minutes played.
        per: Minute basis for the rate (e.g. 36 or 48).

    Returns:
        Copy of ``df`` with a ``<col>_per_<per>`` column for each stat.
    """
    minutes = df[minutes_col]
    rates = {
        f'{col}_per_{per}': (df[col] / minutes) * per
        for col in stat_cols
    }
    return df.copy().assign(**rates)
def pace_adjust(df, stat_cols, pace_col='team_pace', league_avg_pace=100):
    """
    Rescale stats to a common pace.

    Each stat is multiplied by ``league_avg_pace / team pace``, so players
    on faster teams are scaled down and those on slower teams scaled up.

    Args:
        df: Source DataFrame (not modified).
        stat_cols: Stat columns to adjust.
        pace_col: Column holding each row's team pace.
        league_avg_pace: Reference pace to normalize to.

    Returns:
        Copy of ``df`` with a ``<col>_pace_adj`` column for each stat.
    """
    pace_factor = league_avg_pace / df[pace_col]
    adjusted = {
        f'{col}_pace_adj': df[col] * pace_factor
        for col in stat_cols
    }
    return df.copy().assign(**adjusted)
# Example: season totals for three players
df = pd.DataFrame({
    'player': ['Player A', 'Player B', 'Player C'],
    'games_played': [75, 70, 82],
    'minutes_played': [2500, 2100, 2800],
    'points_total': [1875, 1400, 2050],
    'assists_total': [600, 350, 425],
    'team_pace': [102, 98, 105]
})

# Convert totals to per-game.
# totals_to_per_game renames '<stat>_total' to '<stat>_per_game', so the
# new columns are 'points_per_game' and 'assists_per_game'.
per_game = totals_to_per_game(df, ['points_total', 'assists_total'])
print("Per-game stats:")
print(per_game[['player', 'points_per_game', 'assists_per_game']])

# Per 36 minutes
per_36 = per_minute_rates(df, ['points_total', 'assists_total'])
print("\nPer 36 minutes:")
print(per_36[['player', 'points_total_per_36', 'assists_total_per_36']].round(1))

# Pace adjusted
pace_adj = pace_adjust(df, ['points_total'])
print("\nPace adjusted:")
print(pace_adj[['player', 'points_total', 'points_total_pace_adj']].round(0))
Per-game stats:
player points_total_per_game assists_total_per_game
0 Player A 25.0 8.0
1 Player B 20.0 5.0
Per 36 minutes:
player points_total_per_36 assists_total_per_36
0 Player A 27.0 8.6
1 Player B 24.0 6.0
SQL Queries for Data Cleaning
SQL queries for common data cleaning tasks: deduplication, null handling, standardization, and merging.
-- 1. Deduplicate: keep only the latest row per (player_id, season),
--    where "latest" means the greatest updated_at.
WITH ranked AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY player_id, season
            ORDER BY updated_at DESC
        ) AS rn
    FROM player_stats
)
SELECT *
FROM ranked
WHERE rn = 1;

-- 2. Fill missing values: counting stats default to 0,
--    fg_pct defaults to the table-wide average.
SELECT
    player_id,
    player_name,
    COALESCE(points, 0) AS points,
    COALESCE(assists, 0) AS assists,
    COALESCE(fg_pct, (SELECT AVG(fg_pct) FROM player_stats)) AS fg_pct
FROM player_stats;

-- 3. Normalize team names to their abbreviation;
--    unrecognized values are left as they are.
UPDATE player_stats
SET team_abbr = CASE
    WHEN team_abbr IN ('LAL', 'Los Angeles Lakers', 'LA Lakers') THEN 'LAL'
    WHEN team_abbr IN ('BOS', 'Boston Celtics', 'Boston') THEN 'BOS'
    WHEN team_abbr IN ('GSW', 'Golden State Warriors', 'GS Warriors') THEN 'GSW'
    ELSE team_abbr
END;

-- 4. Flag outliers: points below the 1st or above the 99th percentile.
WITH percentiles AS (
    SELECT
        PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY points) AS p01,
        PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY points) AS p99
    FROM player_stats
)
SELECT
    ps.*,
    CASE
        WHEN ps.points < p.p01 OR ps.points > p.p99 THEN 1
        ELSE 0
    END AS is_outlier
FROM player_stats ps
CROSS JOIN percentiles p;

-- 5. Merge two sources, keeping players that appear in either one.
SELECT
    COALESCE(a.player_id, b.player_id) AS player_id,
    COALESCE(a.player_name, b.player_name) AS player_name,
    -- Prefer source A's points; use source B only when A has none
    COALESCE(a.points, b.points) AS points,
    -- The average is NULL unless both sides are present, in which case
    -- whichever single value exists is used instead
    COALESCE((a.fg_pct + b.fg_pct) / 2, a.fg_pct, b.fg_pct) AS fg_pct
FROM source_a a
FULL OUTER JOIN source_b b
    ON a.player_id = b.player_id;
[Query results for each cleaning operation]
Train XGBoost Model for Prediction
Train an XGBoost classifier for sports outcome prediction with feature importance analysis.
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
def train_xgboost_model(X, y, test_size=0.2):
    """
    Fit an XGBoost classifier on a train split and score it on a held-out split.

    Args:
        X: Feature DataFrame (its column names label the importances).
        y: Target labels.
        test_size: Fraction of rows held out for evaluation.

    Returns:
        Tuple (model, accuracy, importance): the fitted classifier, its
        accuracy on the held-out rows, and a DataFrame of feature
        importances sorted from most to least important.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    hyperparameters = dict(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    model = XGBClassifier(**hyperparameters)
    model.fit(X_train, y_train)

    # Score on the rows the model has not seen
    accuracy = accuracy_score(y_test, model.predict(X_test))

    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    })
    importance = importance_table.sort_values('importance', ascending=False)
    return model, accuracy, importance
# Demo: 500 simulated games where the higher-rated home team always wins
np.random.seed(42)
n_games = 500
X = pd.DataFrame({
    'home_elo': np.random.uniform(1400, 1700, n_games),
    'away_elo': np.random.uniform(1400, 1700, n_games),
    'home_rest_days': np.random.randint(1, 7, n_games),
    'away_rest_days': np.random.randint(1, 7, n_games)
})
home_is_stronger = X['home_elo'] > X['away_elo']
y = home_is_stronger.astype(int)

model, acc, importance = train_xgboost_model(X, y)
print(f"Accuracy: {acc:.3f}")
print("\nTop Features:")
print(importance.head())
Accuracy: 0.840
Top Features:
feature importance
0 home_elo 0.412
1 away_elo 0.398
Build Player Performance Projection Model
Build a Random Forest model to project player performance for the next season.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def build_projection_model(historical_data, features, target):
    """
    Fit a scaled Random Forest pipeline on historical player seasons.

    Args:
        historical_data: DataFrame with player seasons
        features: List of feature column names
        target: Target column to predict

    Returns:
        The fitted ``Pipeline`` (``StandardScaler`` followed by the forest).
    """
    forest = RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        min_samples_leaf=5,
        random_state=42,
    )
    # Scaling step first, regressor second
    pipeline = Pipeline([('scaler', StandardScaler()), ('model', forest)])
    pipeline.fit(historical_data[features], historical_data[target])
    return pipeline
def project_next_season(model, current_season_data, features):
    """
    Attach model projections to the players of the current season.

    Args:
        model: Fitted estimator exposing ``predict``.
        current_season_data: DataFrame holding ``player_name`` and the
            feature columns.
        features: Feature column names, in the order the model was fitted on.

    Returns:
        New DataFrame with ``player_name`` and ``projected`` columns; the
        input frame is left untouched.
    """
    predicted = model.predict(current_season_data[features])
    names_only = current_season_data[['player_name']]
    # assign() returns a fresh frame, so the caller's data is not mutated
    return names_only.assign(projected=predicted)
# Example: Project next season points
# NOTE: the target column is drawn independently of the feature columns, so
# this demonstrates the API only; there is no real signal for the model to learn.
np.random.seed(42)
data = pd.DataFrame({
    'player_name': [f'Player_{i}' for i in range(100)],
    'age': np.random.randint(22, 35, 100),
    'last_season_pts': np.random.uniform(10, 30, 100),
    'usage_rate': np.random.uniform(15, 35, 100),
    'next_season_pts': np.random.uniform(10, 30, 100)  # Target
})
features = ['age', 'last_season_pts', 'usage_rate']
model = build_projection_model(data, features, 'next_season_pts')
# Make projections (here for the first five training rows)
projections = project_next_season(model, data.head(5), features)
print(projections)
  player_name  projected
0    Player_0      22.45
1    Player_1      18.32
Implement Logistic Regression for Win Probability
Build a logistic regression model for in-game win probability calculation.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
def build_win_probability_model(games_data):
    """
    Fit a logistic regression that predicts a home win from game state.

    Args:
        games_data: DataFrame with ``score_diff``, ``time_remaining``,
            ``home_team`` and the ``home_win`` label.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    # Features: score differential, time remaining, etc.
    predictors = games_data[['score_diff', 'time_remaining', 'home_team']]
    outcome = games_data['home_win']

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(predictors, outcome)
    return classifier
def calculate_win_probability(model, score_diff, time_remaining, is_home=True):
    """
    Return the model's positive-class probability for one game state.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        score_diff: Current score differential.
        time_remaining: Time left in the game.
        is_home: Truthy to encode the ``home_team`` feature as 1, else 0.

    Returns:
        Probability taken from column 1 of ``predict_proba``.
    """
    home_flag = 1 if is_home else 0
    # Single-row frame with the same column names the model was trained on
    state = pd.DataFrame({
        'score_diff': [score_diff],
        'time_remaining': [time_remaining],
        'home_team': [home_flag],
    })
    probabilities = model.predict_proba(state)
    return probabilities[0, 1]
def plot_calibration(model, X_test, y_test):
    """
    Draw a 10-bin calibration curve for a fitted classifier.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Evaluation features.
        y_test: True binary labels for ``X_test``.

    Returns:
        The matplotlib Figure containing the plot.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    observed_rate, predicted_mean = calibration_curve(
        y_test, positive_scores, n_bins=10
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    # Diagonal reference line, then the model's curve
    ax.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated')
    ax.plot(predicted_mean, observed_rate, 's-', label='Model')
    ax.set(
        xlabel='Mean Predicted Probability',
        ylabel='Fraction of Positives',
        title='Win Probability Model Calibration',
    )
    ax.legend()
    return fig
# Example
# Simulate 1000 game states, then fit and query the model.
np.random.seed(42)
games = pd.DataFrame({
    'score_diff': np.random.randint(-20, 20, 1000),
    'time_remaining': np.random.uniform(0, 48, 1000),
    'home_team': np.random.choice([0, 1], 1000),
})
# Simulated outcome: score differential plus a 3-point bump when home_team
# is 1, plus Gaussian noise (std 10). time_remaining does not enter the outcome.
games['home_win'] = ((games['score_diff'] + games['home_team'] * 3 +
                      np.random.randn(1000) * 10) > 0).astype(int)
model = build_win_probability_model(games)
wp = calculate_win_probability(model, score_diff=5, time_remaining=2.0)
print(f"Win Probability (up 5, 2 min left): {wp:.1%}")
Win Probability (up 5, 2 min left): 78.2%
Build xBA Model with Neural Network
Build an Expected Batting Average (xBA) neural network model using exit velocity and launch angle.
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
def build_xba_model(batted_ball_data):
    """
    Train an Expected Batting Average (xBA) neural network.

    Hit probability is learned from exit velocity and launch angle only.

    Args:
        batted_ball_data: DataFrame with ``exit_velocity``, ``launch_angle``
            and the binary ``is_hit`` label.

    Returns:
        ``(model, scaler)``: the fitted ``MLPClassifier`` and the
        ``StandardScaler`` that must be applied to inputs before predicting.
    """
    inputs = batted_ball_data[['exit_velocity', 'launch_angle']]
    labels = batted_ball_data['is_hit']

    # Standardise the inputs before they reach the network
    scaler = StandardScaler()
    standardized = scaler.fit_transform(inputs)

    network = MLPClassifier(
        hidden_layer_sizes=(32, 16),
        activation='relu',
        max_iter=500,
        random_state=42,
    )
    network.fit(standardized, labels)
    return network, scaler
def predict_xba(model, scaler, exit_velocity, launch_angle):
    """
    Predict xBA for a single batted ball.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted scaler exposing ``transform``.
        exit_velocity: Exit velocity of the batted ball.
        launch_angle: Launch angle of the batted ball.

    Returns:
        Hit probability (column 1 of ``predict_proba``) rounded to 3 decimals.
    """
    sample = np.array([[exit_velocity, launch_angle]], dtype=float)

    # A scaler fitted on a DataFrame remembers its column names, and
    # scikit-learn warns when it is later given unnamed data. Hand the sample
    # over with the same names in that case; a scaler fitted on a plain array
    # has no such attribute and keeps receiving a plain array.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None:
        sample = pd.DataFrame(sample, columns=list(fitted_names))

    xba = model.predict_proba(scaler.transform(sample))[0, 1]
    return round(xba, 3)
# Generate sample data
np.random.seed(42)
n = 5000
ev = np.random.uniform(70, 115, n)
la = np.random.uniform(-30, 60, n)
# Simulated hit probability based on EV/LA: a logistic curve that rises with
# exit velocity and falls as launch angle moves away from 15 degrees. The
# squared launch-angle term must be ADDED inside exp(...): the exponent is the
# negated logit, so a positive term there lowers the probability.
hit_prob = 1 / (1 + np.exp(-0.15 * (ev - 95) + 0.02 * (la - 15)**2 / 100 + 0.5))
is_hit = np.random.binomial(1, hit_prob)
data = pd.DataFrame({'exit_velocity': ev, 'launch_angle': la, 'is_hit': is_hit})
model, scaler = build_xba_model(data)
# Test predictions
test_cases = [(95, 15), (105, 25), (80, 5), (110, 30)]
# Distinct loop names so the sample arrays `ev` / `la` above are not overwritten
for test_ev, test_la in test_cases:
    xba = predict_xba(model, scaler, test_ev, test_la)
    print(f"EV: {test_ev} mph, LA: {test_la}°, xBA: {xba}")
EV: 95 mph, LA: 15°, xBA: 0.512
EV: 105 mph, LA: 25°, xBA: 0.821
EV: 80 mph, LA: 5°, xBA: 0.234
EV: 110 mph, LA: 30°, xBA: 0.892
Build xG Model with Gradient Boosting
Build an Expected Goals (xG) model using Gradient Boosting with multiple shot features.
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
def build_xg_model(shots_data):
    """
    Train an Expected Goals (xG) gradient-boosting classifier.

    Prints the 5-fold cross-validated ROC AUC, then fits on all rows.

    Args:
        shots_data: DataFrame with the shot feature columns and the binary
            ``is_goal`` label.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    shot_features = ['distance', 'angle', 'is_header', 'is_free_kick',
                     'is_counter', 'num_defenders']
    inputs = shots_data[shot_features]
    goals = shots_data['is_goal']

    booster_settings = dict(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        min_samples_leaf=20,
        random_state=42,
    )
    model = GradientBoostingClassifier(**booster_settings)

    # Score the unfitted estimator out-of-fold before the final fit
    cv_scores = cross_val_score(model, inputs, goals, cv=5, scoring='roc_auc')
    print(f"CV AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")

    model.fit(inputs, goals)
    return model
def calculate_shot_xg(model, distance, angle, is_header=False,
                      is_free_kick=False, is_counter=False, num_defenders=2):
    """
    Return the xG of one shot, rounded to three decimals.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        distance: Shot distance.
        angle: Shot angle.
        is_header: Whether the shot was a header.
        is_free_kick: Whether the shot was a free kick.
        is_counter: Whether the shot came from a counter-attack.
        num_defenders: Value for the ``num_defenders`` feature.

    Returns:
        Probability from column 1 of ``predict_proba``, rounded to 3 decimals.
    """
    # Boolean flags are stored as 0/1 integers, matching the training data
    shot = {
        'distance': distance,
        'angle': angle,
        'is_header': int(is_header),
        'is_free_kick': int(is_free_kick),
        'is_counter': int(is_counter),
        'num_defenders': num_defenders,
    }
    single_row = pd.DataFrame([shot])
    goal_probability = model.predict_proba(single_row)[0, 1]
    return round(goal_probability, 3)
# Generate training data
np.random.seed(42)
n = 10000
shots = pd.DataFrame({
    'distance': np.random.uniform(5, 35, n),
    'angle': np.random.uniform(5, 80, n),
    'is_header': np.random.choice([0, 1], n, p=[0.85, 0.15]),
    'is_free_kick': np.random.choice([0, 1], n, p=[0.95, 0.05]),
    'is_counter': np.random.choice([0, 1], n, p=[0.80, 0.20]),
    'num_defenders': np.random.randint(0, 5, n)
})
# Simulated goal probability: decays with distance, grows with sin(angle),
# and is reduced for headers and for each defender. is_free_kick and
# is_counter do not enter the simulated probability.
goal_prob = np.exp(-0.08 * shots['distance']) * np.sin(np.radians(shots['angle'])) * \
    (1 - 0.3 * shots['is_header']) * (1 - 0.05 * shots['num_defenders'])
# Clip to [0.01, 0.95] before drawing the binary outcome
shots['is_goal'] = np.random.binomial(1, goal_prob.clip(0.01, 0.95))
model = build_xg_model(shots)
xg = calculate_shot_xg(model, distance=10, angle=45)
print(f"\nExample shot xG (10m, 45°): {xg}")
CV AUC: 0.812 (+/- 0.015)

Example shot xG (10m, 45°): 0.324
Clustering Players by Playing Style
Use K-Means clustering to identify player archetypes based on playing style.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
def cluster_players(player_stats, n_clusters=5):
    """
    Group players into playing-style clusters with K-Means.

    Rows with a missing value in any clustering column are dropped.

    Args:
        player_stats: DataFrame containing the eight style columns used below.
        n_clusters: Number of clusters to form.

    Returns:
        ``(result, kmeans, scaler)``: the surviving rows with added
        ``cluster``, ``pca1`` and ``pca2`` columns, the fitted ``KMeans``,
        and the fitted ``StandardScaler``.
    """
    style_columns = ['pts_per_36', 'ast_per_36', 'reb_per_36', 'stl_per_36',
                     'blk_per_36', 'usg_rate', 'ts_pct', 'ast_ratio']
    complete_rows = player_stats[style_columns].dropna()

    # Standardise the columns before clustering
    scaler = StandardScaler()
    standardized = scaler.fit_transform(complete_rows)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(standardized)

    # Two principal components, used only for plotting
    projection = PCA(n_components=2).fit_transform(standardized)

    result = player_stats.loc[complete_rows.index].assign(
        cluster=labels,
        pca1=projection[:, 0],
        pca2=projection[:, 1],
    )
    return result, kmeans, scaler
def describe_clusters(clustered_data, features):
    """
    Describe each cluster by its average stats and an archetype label.

    Args:
        clustered_data: DataFrame with a ``cluster`` column plus
            ``pts_per_36``, ``ast_per_36`` and ``blk_per_36`` (needed for
            naming) and every column listed in ``features``.
        features: Columns to average per cluster in the returned summary.
            They no longer have to include the three naming columns.

    Returns:
        DataFrame indexed by cluster with the per-cluster mean of each
        feature and an ``archetype`` column.
    """
    grouped = clustered_data.groupby('cluster')
    summary = grouped[features].mean()

    # Naming statistics are computed independently of `features`, so a
    # summary over other columns does not fail with a KeyError.
    naming_stats = grouped[['pts_per_36', 'ast_per_36', 'blk_per_36']].mean()
    # Baseline = mean of the cluster means, computed once instead of per row
    above_average = naming_stats.gt(naming_stats.mean())

    # Name clusters based on characteristics
    cluster_names = []
    for cluster_id in summary.index:
        flags = above_average.loc[cluster_id]
        if flags['ast_per_36']:
            name = "Playmaking Scorer" if flags['pts_per_36'] else "Pure Playmaker"
        elif flags['blk_per_36']:
            name = "Rim Protector"
        elif flags['pts_per_36']:
            name = "Volume Scorer"
        else:
            name = "Role Player"
        cluster_names.append(name)

    summary['archetype'] = cluster_names
    return summary
# Example
# 200 synthetic players; every stat is drawn independently and uniformly.
np.random.seed(42)
players = pd.DataFrame({
    'player': [f'Player_{i}' for i in range(200)],
    'pts_per_36': np.random.uniform(8, 28, 200),
    'ast_per_36': np.random.uniform(1, 10, 200),
    'reb_per_36': np.random.uniform(2, 12, 200),
    'stl_per_36': np.random.uniform(0.5, 2.5, 200),
    'blk_per_36': np.random.uniform(0.2, 3, 200),
    'usg_rate': np.random.uniform(12, 35, 200),
    'ts_pct': np.random.uniform(0.48, 0.65, 200),
    'ast_ratio': np.random.uniform(5, 35, 200)
})
clustered, model, scaler = cluster_players(players)
# Columns averaged per cluster; describe_clusters reads pts/ast/blk for naming
features = ['pts_per_36', 'ast_per_36', 'reb_per_36', 'blk_per_36']
summary = describe_clusters(clustered, features)
print(summary[['pts_per_36', 'ast_per_36', 'archetype']])
         pts_per_36  ast_per_36          archetype
cluster
0              18.2         3.1      Volume Scorer
1              15.5         7.2  Playmaking Scorer
2              12.1         2.8        Role Player
Build Random Forest Model in R
Build and evaluate a Random Forest model in R with feature importance analysis.
library(randomForest)
library(caret)
library(dplyr)
# Train and evaluate a Random Forest on an 80/20 split.
#
# Args:
#   data:       data frame holding the target and feature columns.
#   target_col: name of the target column (a factor selects classification,
#               anything else regression).
#   features:   character vector of predictor column names.
#
# Returns:
#   list(model = fitted randomForest, importance = data frame of importance
#   measures with a Feature column, sorted from most to least important).
build_rf_model <- function(data, target_col, features) {
  # Prepare data
  model_formula <- as.formula(paste(target_col, "~", paste(features, collapse = " + ")))
  is_classification <- is.factor(data[[target_col]])

  # Split data
  set.seed(42)
  train_idx <- createDataPartition(data[[target_col]], p = 0.8, list = FALSE)
  train_data <- data[train_idx, ]
  test_data <- data[-train_idx, ]

  # Train model
  rf_model <- randomForest(
    model_formula,
    data = train_data,
    ntree = 200,
    mtry = floor(sqrt(length(features))),
    importance = TRUE
  )

  # Evaluate
  predictions <- predict(rf_model, test_data)
  if (is_classification) {
    accuracy <- mean(predictions == test_data[[target_col]])
    cat("Accuracy:", round(accuracy, 3), "\n")
  } else {
    rmse <- sqrt(mean((predictions - test_data[[target_col]])^2))
    cat("RMSE:", round(rmse, 3), "\n")
  }

  # Feature importance. The impurity column is named MeanDecreaseGini for
  # classification forests but IncNodePurity for regression forests, so the
  # sort column has to follow the model type.
  sort_col <- if (is_classification) "MeanDecreaseGini" else "IncNodePurity"
  importance_tbl <- importance(rf_model) %>%
    as.data.frame() %>%
    mutate(Feature = rownames(.)) %>%
    arrange(desc(.data[[sort_col]]))

  return(list(model = rf_model, importance = importance_tbl))
}
# Example: Predict game outcome
# 500 synthetic games. The home side "wins" when its rating exceeds the away
# rating plus N(0, 100) noise; the rest-day columns do not enter the outcome.
set.seed(42)
games <- data.frame(
  home_rating = runif(500, 1400, 1700),
  away_rating = runif(500, 1400, 1700),
  home_rest = sample(1:7, 500, replace = TRUE),
  away_rest = sample(1:7, 500, replace = TRUE)
) %>%
  mutate(home_win = factor(ifelse(home_rating > away_rating + rnorm(n(), 0, 100), "Win", "Loss")))
result <- build_rf_model(games, "home_win", c("home_rating", "away_rating", "home_rest", "away_rest"))
print(head(result$importance))
Accuracy: 0.78
  MeanDecreaseGini     Feature
1            45.23 home_rating
2            42.18 away_rating
LSTM for Time Series Prediction
Build an LSTM neural network for predicting player performance time series.
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
def prepare_sequences(data, seq_length):
    """
    Slice a series into sliding windows and their next-step targets.

    Args:
        data: Sequence or array of observations in time order.
        seq_length: Number of consecutive observations per input window.

    Returns:
        ``(X, y)`` arrays: ``X[i]`` is ``data[i:i + seq_length]`` and
        ``y[i]`` is the observation that immediately follows that window.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [data[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)
def build_lstm_model(input_shape):
    """
    Assemble and compile a two-layer LSTM regressor.

    Args:
        input_shape: Shape passed to the first LSTM layer's ``input_shape``.

    Returns:
        Compiled Keras ``Sequential`` model (Adam optimizer, MSE loss).
    """
    layer_stack = [
        # First recurrent layer returns the full sequence for the second one
        LSTM(50, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(50, return_sequences=False),
        Dropout(0.2),
        Dense(25),
        Dense(1),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='mse')
    return network
def predict_future(model, last_sequence, scaler, n_steps=5):
    """
    Roll the model forward to predict the next ``n_steps`` values.

    Each prediction is appended to the input window (dropping the oldest
    value) and fed back in for the next step.

    Args:
        model: Fitted model whose ``predict`` accepts an array of shape
            ``(1, window_length, 1)`` and returns a ``(1, 1)`` array.
        last_sequence: Most recent scaled observations, either 1-D or shaped
            ``(window_length, 1)``. Not modified.
        scaler: Fitted scaler exposing ``inverse_transform``.
        n_steps: Number of future steps to predict.

    Returns:
        Array of shape ``(n_steps, 1)`` with predictions in original units.
    """
    # Work on a float copy so the caller's array is untouched and list input
    # is accepted as well.
    window = np.array(last_sequence, dtype=float)
    predictions = []
    for _ in range(n_steps):
        pred = model.predict(window.reshape(1, -1, 1), verbose=0)
        next_value = pred[0, 0]
        predictions.append(next_value)
        # Shift left by one and write the new value into the freed last slot.
        # Assigning the scalar (not the (1, 1) array) works for both 1-D and
        # column-shaped windows without an array-to-scalar conversion.
        window = np.roll(window, -1)
        window[-1] = next_value
    return scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
# Example: Predict player performance trend
np.random.seed(42)
n_games = 100
performance = 20 + np.cumsum(np.random.randn(n_games) * 0.5)  # Random walk
# Scale data (MinMaxScaler expects a 2-D column, hence the reshape)
scaler = MinMaxScaler()
scaled = scaler.fit_transform(performance.reshape(-1, 1))
# Prepare sequences
seq_length = 10
X, y = prepare_sequences(scaled, seq_length)
# Reshape to (samples, seq_length, 1) to match the model's input shape
X = X.reshape((X.shape[0], X.shape[1], 1))
# Build and train
model = build_lstm_model((seq_length, 1))
model.fit(X, y, epochs=50, batch_size=16, verbose=0)
# Predict next 5 games, starting from the last seq_length scaled observations
last_seq = scaled[-seq_length:]
future = predict_future(model, last_seq, scaler, 5)
print("Predicted next 5 games:")
print(future.flatten())
Predicted next 5 games: [21.34 21.52 21.67 21.78 21.89]
Build EPA Prediction Model
Build a Gradient Boosting model to predict Expected Points Added for play situations.
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
def build_epa_model(pbp_data):
    """
    Build model to predict Expected Points Added (EPA).

    Prints the 5-fold cross-validated R², then fits on all usable rows.

    Args:
        pbp_data: Play-by-play DataFrame with the feature columns listed
            below and the ``epa`` target.

    Returns:
        The fitted ``GradientBoostingRegressor``.
    """
    # Features for EPA prediction
    features = ['down', 'distance', 'yardline', 'seconds_remaining',
                'score_differential', 'is_pass']
    target = 'epa'

    # Drop rows missing ANY feature or the target in one pass. Filtering on
    # the features alone lets NaN targets reach fit(), and re-selecting the
    # target with .loc[X.index] misaligns when index labels repeat.
    complete = pbp_data[features + [target]].dropna()
    X = complete[features]
    y = complete[target]

    model = GradientBoostingRegressor(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    )

    # Cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"CV R²: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")

    model.fit(X, y)
    return model
def predict_play_epa(model, down, distance, yardline, seconds, score_diff, is_pass):
    """
    Predict EPA for one play situation, rounded to three decimals.

    Args:
        model: Fitted regressor exposing ``predict``.
        down: Value for the ``down`` feature.
        distance: Value for the ``distance`` feature.
        yardline: Value for the ``yardline`` feature.
        seconds: Value for the ``seconds_remaining`` feature.
        score_diff: Value for the ``score_differential`` feature.
        is_pass: Value for the ``is_pass`` feature.

    Returns:
        The model's first prediction, rounded to 3 decimals.
    """
    column_names = ('down', 'distance', 'yardline', 'seconds_remaining',
                    'score_differential', 'is_pass')
    values = (down, distance, yardline, seconds, score_diff, is_pass)
    # One-row frame keyed by the training column names
    situation = pd.DataFrame([dict(zip(column_names, values))])
    predicted = model.predict(situation)[0]
    return round(predicted, 3)
# Generate sample play-by-play data
np.random.seed(42)
pbp = pd.DataFrame({
    'down': np.random.choice([1, 2, 3, 4], 5000, p=[0.4, 0.3, 0.25, 0.05]),
    'distance': np.random.randint(1, 20, 5000),
    'yardline': np.random.randint(1, 100, 5000),
    'seconds_remaining': np.random.randint(0, 3600, 5000),
    'score_differential': np.random.randint(-21, 21, 5000),
    'is_pass': np.random.choice([0, 1], 5000, p=[0.4, 0.6])
})
# Simulated EPA based on situation: depends only on yardline and down, plus
# Gaussian noise (std 0.5); the other columns carry no signal here.
pbp['epa'] = (100 - pbp['yardline']) / 100 * 0.5 - pbp['down'] * 0.1 + \
    np.random.randn(5000) * 0.5
model = build_epa_model(pbp)
# Test predictions
print("\nSample Predictions:")
print(f"1st & 10 at 50, pass: {predict_play_epa(model, 1, 10, 50, 1800, 0, 1)}")
print(f"3rd & 15 at own 20, pass: {predict_play_epa(model, 3, 15, 80, 300, -7, 1)}")
CV R²: 0.423 (+/- 0.025)

Sample Predictions:
1st & 10 at 50, pass: 0.152
3rd & 15 at own 20, pass: -0.284
Build xG Model for Hockey Shots
Build a hockey Expected Goals (xG) model using shot location and situational features.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
def build_hockey_xg_model(shots_data):
    """
    Build Expected Goals model for hockey.

    Trains a Random Forest on an 80/20 split and prints the hold-out AUC-ROC.

    Args:
        shots_data: DataFrame with the feature columns listed below
            (``shot_type`` categorical) and the binary ``is_goal`` label.

    Returns:
        ``(model, feature_cols)``: the fitted classifier and the list of
        encoded feature column names it was trained on.
    """
    from sklearn.metrics import roc_auc_score

    features = ['distance', 'angle', 'shot_type', 'is_rebound',
                'time_since_event', 'is_rush']

    # Encode categorical. Only the declared feature columns are encoded:
    # selecting columns by an 'is_' prefix would also pick up the target
    # 'is_goal' and leak the label into the model.
    shots_encoded = pd.get_dummies(shots_data[features], columns=['shot_type'])
    feature_cols = list(shots_encoded.columns)

    X = shots_encoded[feature_cols]
    y = shots_data['is_goal']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    y_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    print(f"AUC-ROC: {auc:.3f}")

    return model, feature_cols
# Generate sample shot data
np.random.seed(42)
shots = pd.DataFrame({
    'distance': np.random.uniform(5, 60, 5000),
    'angle': np.random.uniform(0, 90, 5000),
    'shot_type': np.random.choice(['wrist', 'slap', 'snap', 'backhand'], 5000),
    'is_rebound': np.random.choice([0, 1], 5000, p=[0.85, 0.15]),
    'time_since_event': np.random.uniform(0, 20, 5000),
    'is_rush': np.random.choice([0, 1], 5000, p=[0.75, 0.25])
})
# Simulated goal probability: decays with distance, grows with sin(angle),
# boosted for rebounds and rushes. shot_type and time_since_event do not
# enter the simulated probability.
goal_prob = np.exp(-0.05 * shots['distance']) * np.sin(np.radians(shots['angle'])) * \
    (1 + 0.2 * shots['is_rebound']) * (1 + 0.1 * shots['is_rush'])
# Clip to [0.01, 0.5] before drawing the binary outcome
shots['is_goal'] = np.random.binomial(1, goal_prob.clip(0.01, 0.5))
model, features = build_hockey_xg_model(shots)
print(f"\nModel features: {len(features)}")
AUC-ROC: 0.782

Model features: 9
Create Baseball Spray Chart
Create a baseball spray chart showing batted ball locations colored by hit outcome.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Arc, Circle, Rectangle, Polygon
def draw_baseball_field(ax):
    """
    Draw a baseball field outline on ``ax``.

    Coordinates are in feet with home plate at the origin and second base
    straight up the y-axis.

    Args:
        ax: Matplotlib Axes to draw on; its limits, aspect and axis
            visibility are set by this function.
    """
    # Infield dirt: 95 ft radius measured from the pitcher's plate, which sits
    # 60.5 ft from home. Centring the circle on home plate would leave second
    # base (127.3 ft out) outside the "infield".
    infield = plt.Circle((0, 60.5), 95, fill=False, color='brown', linewidth=2)
    ax.add_patch(infield)

    # Basepaths: home -> first -> second -> third -> home (90 ft sides)
    bases = np.array([[0, 0], [63.6, 63.6], [0, 127.3],
                      [-63.6, 63.6], [0, 0]])
    ax.plot(bases[:, 0], bases[:, 1], 'k-', linewidth=2)

    # Outfield fence: arc between the two foul lines at a constant 330 ft
    theta = np.linspace(np.pi/4, 3*np.pi/4, 100)
    fence_r = 330
    ax.plot(fence_r * np.cos(theta), fence_r * np.sin(theta), 'g-', linewidth=3)

    ax.set_xlim(-350, 350)
    ax.set_ylim(-50, 400)
    ax.set_aspect('equal')
    ax.axis('off')
def plot_spray_chart(hit_data, player_name):
    """
    Plot batted-ball locations on a field outline, coloured by outcome.

    Args:
        hit_data: DataFrame with ``hc_x``, ``hc_y`` and ``events`` columns.
        player_name: Name used in the chart title.

    Returns:
        The matplotlib Figure.
    """
    outcome_colors = {
        'single': 'blue',
        'double': 'green',
        'triple': 'orange',
        'home_run': 'red',
        'out': 'gray',
    }

    fig, ax = plt.subplots(figsize=(10, 10))
    draw_baseball_field(ax)

    # One scatter layer per outcome so each gets its own legend entry
    for outcome in outcome_colors:
        subset = hit_data.loc[hit_data['events'] == outcome]
        legend_label = outcome.replace('_', ' ').title()
        ax.scatter(subset['hc_x'], subset['hc_y'],
                   c=outcome_colors[outcome], alpha=0.6, s=50,
                   label=legend_label)

    ax.legend(loc='upper right')
    ax.set_title(f'{player_name} Spray Chart', fontsize=16, fontweight='bold')
    plt.tight_layout()
    return fig
# Example with sample data
# NOTE: no random seed is set here, so the chart differs on every run.
import pandas as pd
sample_data = pd.DataFrame({
    'hc_x': np.random.uniform(-200, 200, 100),
    'hc_y': np.random.uniform(50, 350, 100),
    # 'triple' is never sampled, so that legend entry has no points
    'events': np.random.choice(['single', 'double', 'home_run', 'out'], 100, p=[0.25, 0.1, 0.05, 0.6])
})
fig = plot_spray_chart(sample_data, "Mike Trout")
plt.show()
[Spray chart visualization with hits plotted on baseball field]
Plot Exit Velocity vs Launch Angle
Create a heatmap showing the relationship between exit velocity and launch angle with Statcast outcome zones.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
def plot_ev_la_heatmap(statcast_data, player_name=None):
    """
    Plot a launch-angle / exit-velocity density map with outcome zones.

    Args:
        statcast_data: DataFrame with ``launch_angle`` and ``launch_speed``.
        player_name: Optional name prefixed to the title.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Filled 2-D kernel density estimate of the batted balls
    sns.kdeplot(
        data=statcast_data,
        x='launch_angle',
        y='launch_speed',
        cmap='YlOrRd',
        fill=True,
        levels=20,
        ax=ax,
    )

    # Launch-angle bands: (start, end, colour, legend label)
    angle_zones = (
        (-90, 10, 'brown', 'Ground Ball Zone'),
        (10, 25, 'green', 'Line Drive Zone'),
        (25, 50, 'blue', 'Fly Ball Zone'),
    )
    for start, end, shade, zone_label in angle_zones:
        ax.axvspan(start, end, alpha=0.1, color=shade, label=zone_label)

    # Barrel zone (optimal EV/LA combo) as (launch angle, exit velocity) vertices
    barrel_outline = np.array([[26, 98], [30, 98], [32, 100], [30, 102], [26, 102]])
    ax.fill(barrel_outline[:, 0], barrel_outline[:, 1],
            alpha=0.3, color='red', label='Barrel Zone')

    ax.set_xlabel('Launch Angle (degrees)', fontsize=12)
    ax.set_ylabel('Exit Velocity (mph)', fontsize=12)
    ax.set_xlim(-30, 60)
    ax.set_ylim(60, 120)
    ax.legend(loc='upper right')

    if player_name:
        title = f'{player_name} Exit Velocity vs Launch Angle'
    else:
        title = 'Exit Velocity vs Launch Angle'
    ax.set_title(title, fontsize=14, fontweight='bold')

    plt.tight_layout()
    return fig
# Example with sample data
# NOTE: no random seed is set here, so the heatmap differs on every run.
import pandas as pd
sample = pd.DataFrame({
    'launch_angle': np.random.normal(15, 15, 200),
    'launch_speed': np.random.normal(90, 8, 200)
})
fig = plot_ev_la_heatmap(sample, "Shohei Ohtani")
plt.show()
[Heatmap showing batted ball quality distribution]
Create NBA Shot Chart
Create an NBA shot chart showing made and missed shots plotted on a basketball court.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Circle, Rectangle, Arc
def draw_court(ax=None, color='black', lw=2):
    """
    Draw NBA basketball court (half court) with the hoop at the origin.

    Args:
        ax: Axes to draw on; defaults to the current Axes.
        color: Any matplotlib colour spec for the court lines. It is passed
            by keyword everywhere, so RGB tuples work as well as names.
        lw: Line width for all court markings.

    Returns:
        The Axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    outline = dict(linewidth=lw, color=color)

    court_patches = [
        # Hoop
        Circle((0, 0), radius=7.5, fill=False, **outline),
        # Paint (outer and inner box)
        Rectangle((-80, -47.5), 160, 190, fill=False, **outline),
        Rectangle((-60, -47.5), 120, 190, fill=False, **outline),
        # Free throw circle (upper half)
        Arc((0, 142.5), 120, 120, theta1=0, theta2=180, **outline),
        # Three point arc
        Arc((0, 0), 475, 475, theta1=22, theta2=158, **outline),
    ]
    for patch in court_patches:
        ax.add_patch(patch)

    straight_lines = [
        # Backboard
        ([-30, 30], [-7.5, -7.5]),
        # Corner three lines
        ([-220, -220], [-47.5, 92.5]),
        ([220, 220], [-47.5, 92.5]),
        # Court boundary
        ([-250, 250], [-47.5, -47.5]),
        ([-250, 250], [422.5, 422.5]),
        ([-250, -250], [-47.5, 422.5]),
        ([250, 250], [-47.5, 422.5]),
    ]
    for xs, ys in straight_lines:
        # color= keyword rather than a positional format string: the format
        # string only accepts string colour specs.
        ax.plot(xs, ys, **outline)

    ax.set_xlim(-250, 250)
    ax.set_ylim(-50, 450)
    ax.set_aspect('equal')
    ax.axis('off')
    return ax
def plot_shot_chart(shots_df, player_name):
    """
    Plot made and missed shots on a court drawing.

    Args:
        shots_df: DataFrame with ``x``, ``y`` and ``shot_made`` (1 = made,
            0 = missed) columns.
        player_name: Name used in the chart title.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(12, 11))
    draw_court(ax)

    # (shot_made value, colour, marker, legend label); makes are drawn first
    layers = (
        (1, 'green', 'o', 'Made'),
        (0, 'red', 'x', 'Missed'),
    )
    for made_flag, shade, symbol, legend_label in layers:
        subset = shots_df[shots_df['shot_made'] == made_flag]
        ax.scatter(subset['x'], subset['y'], c=shade, marker=symbol,
                   s=50, alpha=0.7, label=legend_label)

    ax.legend(loc='upper right')
    ax.set_title(f'{player_name} Shot Chart', fontsize=16, fontweight='bold')
    plt.tight_layout()
    return fig
# Example
# NOTE: no random seed is set here, so the chart differs on every run.
import pandas as pd
sample_shots = pd.DataFrame({
    'x': np.random.uniform(-200, 200, 100),
    'y': np.random.uniform(0, 300, 100),
    # 1 = made (45%), 0 = missed (55%)
    'shot_made': np.random.choice([0, 1], 100, p=[0.55, 0.45])
})
fig = plot_shot_chart(sample_shots, "Stephen Curry")
plt.show()
[Shot chart visualization with court and shot locations]
NBA Shot Heatmap with ggplot2
Create an NBA shot heatmap using ggplot2 with court overlay and density visualization.
library(ggplot2)
library(dplyr)
# Function to draw court
# Build the ggplot2 layers for a half court with the hoop at the origin.
#
# Returns a list of layers / scales / coords that can be added to a ggplot
# with `+`. Fixed court markings use annotate() rather than geom_*(aes(...))
# with constants: a constant inside aes() is evaluated against the plot's
# data and the same shape is drawn once per data row.
draw_court <- function() {
  arc_angles <- seq(0.38, pi - 0.38, length.out = 100)
  three_point_arc <- data.frame(
    x = 237.5 * cos(arc_angles),
    y = 237.5 * sin(arc_angles)
  )

  list(
    # Paint
    annotate("rect", xmin = -80, xmax = 80, ymin = -47.5, ymax = 142.5,
             fill = NA, color = "black"),
    # Three point arc (own data, so do not inherit the plot's aesthetics)
    geom_path(data = three_point_arc, aes(x, y), color = "black",
              inherit.aes = FALSE),
    # Corner threes
    annotate("segment", x = -220, xend = -220, y = -47.5, yend = 92.5, color = "black"),
    annotate("segment", x = 220, xend = 220, y = -47.5, yend = 92.5, color = "black"),
    # Hoop
    annotate("point", x = 0, y = 0, size = 3),
    # Limits
    coord_fixed(),
    xlim(-250, 250),
    ylim(-50, 300)
  )
}
# Create shot heatmap
# Build a shot-density heatmap over the court drawing.
#
# Args:
#   shots_df:    data frame with shot coordinates in columns x and y.
#   player_name: name used in the plot title.
#
# Returns the ggplot object.
create_shot_heatmap <- function(shots_df, player_name) {
  # Filled 2-D density contours, coloured by contour level
  density_layer <- stat_density_2d(
    aes(fill = after_stat(level)),
    geom = "polygon",
    alpha = 0.6
  )

  # Hide axis text, titles and grid lines so only the court remains
  blank_axes <- theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank()
  )

  heatmap <- ggplot(shots_df, aes(x = x, y = y)) +
    draw_court() +
    density_layer +
    scale_fill_gradient(low = "yellow", high = "red") +
    labs(title = paste(player_name, "Shot Heatmap"),
         fill = "Density") +
    theme_minimal() +
    blank_axes

  heatmap
}
# Example usage
# Three synthetic shot clusters: 50 shots centred at (0, 50) and 30 each
# centred at (-150, 50) and (150, 50).
set.seed(42)
shots <- data.frame(
  x = c(rnorm(50, 0, 50), rnorm(30, -150, 30), rnorm(30, 150, 30)),
  y = c(rnorm(50, 50, 50), rnorm(30, 50, 20), rnorm(30, 50, 20))
)
create_shot_heatmap(shots, "LeBron James")
[Heatmap showing shot frequency by court location]
Create Soccer Pitch with Passes
Draw a soccer pitch and visualize player passes as arrows with success/failure coloring.
import matplotlib.pyplot as plt
from matplotlib.patches import Arc, Circle, Rectangle
def draw_pitch(ax, pitch_length=120, pitch_width=80):
    """
    Draw a soccer pitch on ``ax`` with white lines on a green background.

    Args:
        ax: Matplotlib Axes to draw on.
        pitch_length: Pitch length along the x-axis.
        pitch_width: Pitch width along the y-axis.

    Returns:
        The Axes that was drawn on.
    """
    mid_x = pitch_length / 2
    mid_y = pitch_width / 2

    # Pitch outline
    ax.plot([0, pitch_length, pitch_length, 0, 0],
            [0, 0, pitch_width, pitch_width, 0], 'white', linewidth=2)

    # Center circle and halfway line
    ax.add_patch(Circle((mid_x, mid_y), 9.15,
                        fill=False, color='white', linewidth=2))
    ax.plot([mid_x, mid_x], [0, pitch_width], 'white', linewidth=2)

    # Boxes as (depth from the goal line, half-span across the pitch):
    # penalty box first, then goal box
    boxes = ((16.5, 20.15), (5.5, 9.15))
    for goal_line in (0, pitch_length):
        # +1 draws into the pitch from the left end, -1 from the right end
        inward = 1 if goal_line == 0 else -1
        for depth, half_span in boxes:
            edge = goal_line + inward * depth
            ax.plot([goal_line, edge, edge, goal_line],
                    [mid_y - half_span, mid_y - half_span,
                     mid_y + half_span, mid_y + half_span], 'white', linewidth=2)
        # Penalty spot
        ax.plot(goal_line + inward * 11, mid_y, 'wo', markersize=3)

    ax.set_facecolor('#228B22')
    ax.set_xlim(-5, pitch_length + 5)
    ax.set_ylim(-5, pitch_width + 5)
    ax.set_aspect('equal')
    ax.axis('off')
    return ax
def plot_passes(ax, passes_df, player_name):
    """
    Draw each pass as an arrow from its start to its end point.

    Args:
        ax: Axes that already holds the pitch.
        passes_df: DataFrame with ``start_x``, ``start_y``, ``end_x``,
            ``end_y`` and optionally ``successful``; a row without that
            value is treated as successful.
        player_name: Name used in the title.

    Returns:
        The Axes that was drawn on.
    """
    for _, row in passes_df.iterrows():
        # Yellow for completed passes, red for failed ones
        arrow_color = 'yellow' if row.get('successful', True) else 'red'
        arrow_style = dict(arrowstyle='->', color=arrow_color, lw=1.5, alpha=0.7)
        origin = (row['start_x'], row['start_y'])
        destination = (row['end_x'], row['end_y'])
        # Empty text: annotate() is used purely for its arrow
        ax.annotate('', xy=destination, xytext=origin, arrowprops=arrow_style)

    ax.set_title(f'{player_name} Passes', color='white', fontsize=14, fontweight='bold')
    return ax
# Example
# NOTE: no random seed is set here, so the passes differ on every run.
import pandas as pd
import numpy as np
fig, ax = plt.subplots(figsize=(12, 8))
draw_pitch(ax)
passes = pd.DataFrame({
    'start_x': np.random.uniform(30, 90, 20),
    'start_y': np.random.uniform(20, 60, 20),
    'end_x': np.random.uniform(40, 110, 20),
    'end_y': np.random.uniform(20, 60, 20),
    # 85% of the sampled passes are marked successful
    'successful': np.random.choice([True, False], 20, p=[0.85, 0.15])
})
plot_passes(ax, passes, "Kevin De Bruyne")
# Dark figure background so the white title stays readable
fig.patch.set_facecolor('#1a1a1a')
plt.show()
[Soccer pitch with pass arrows showing direction and outcome]
Create xG Shot Map
Create a shot map showing xG values as circle sizes and goals/non-goals as colors.
import matplotlib.pyplot as plt
import numpy as np
def plot_xg_shot_map(shots_df, team_name):
    """
    Create shot map with xG values.

    Circle size = xG value
    Color = goal (green) or no goal (red)

    The goal line lies on x = 0 and the goal is centred on y = 40.

    Args:
        shots_df: DataFrame with ``x``, ``y``, ``xg`` and ``is_goal`` columns.
        team_name: Name shown above the pitch.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Draw half pitch (attacking half only)
    ax.set_facecolor('#228B22')
    centre_y = 40
    # Half-widths around the goal centre, matching the dimensions used by
    # draw_pitch (penalty box 20.15, six-yard box 9.15); the goal mouth is
    # 7.32 wide, so it sits inside the six-yard box instead of overhanging it.
    half_goal, half_six_yard, half_penalty = 3.66, 9.15, 20.15
    # Goal
    ax.plot([0, 0], [centre_y - half_goal, centre_y + half_goal],
            'white', linewidth=5)
    # Penalty area
    ax.plot([0, 16.5, 16.5, 0],
            [centre_y - half_penalty, centre_y - half_penalty,
             centre_y + half_penalty, centre_y + half_penalty],
            'white', linewidth=2)
    # 6-yard box
    ax.plot([0, 5.5, 5.5, 0],
            [centre_y - half_six_yard, centre_y - half_six_yard,
             centre_y + half_six_yard, centre_y + half_six_yard],
            'white', linewidth=2)

    # Plot shots
    for _, shot in shots_df.iterrows():
        color = '#00ff00' if shot['is_goal'] else '#ff4444'
        size = shot['xg'] * 500  # Scale xG to marker size
        ax.scatter(shot['x'], shot['y'], s=size, c=color,
                   alpha=0.7, edgecolors='white', linewidths=1)

    # Calculate totals
    total_xg = shots_df['xg'].sum()
    total_goals = shots_df['is_goal'].sum()

    ax.set_xlim(-2, 60)
    ax.set_ylim(15, 65)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.text(30, 63, f'{team_name}', fontsize=16, ha='center',
            color='white', fontweight='bold')
    ax.text(30, 17, f'xG: {total_xg:.2f} | Goals: {total_goals}',
            fontsize=12, ha='center', color='white')
    fig.patch.set_facecolor('#1a1a1a')
    plt.tight_layout()
    return fig
# Example
# Eight hand-written shots; x is the distance from the goal line at x = 0.
import pandas as pd
shots = pd.DataFrame({
    'x': [8, 12, 20, 6, 25, 10, 15, 35],
    'y': [40, 45, 38, 42, 50, 35, 40, 40],
    'xg': [0.45, 0.22, 0.08, 0.55, 0.04, 0.35, 0.18, 0.02],
    'is_goal': [True, False, False, True, False, False, True, False]
})
fig = plot_xg_shot_map(shots, "Manchester City")
plt.show()
[Shot map with xG circles showing shot quality and outcomes]
Plot EPA by Play Type
Create EPA visualizations showing distribution by play type and weekly trends.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
def plot_epa_by_play_type(pbp_df, team_name):
    """
    Plot side-by-side EPA density curves for pass and run plays.

    Args:
        pbp_df: Play-by-play DataFrame with ``play_type`` and ``epa`` columns.
        team_name: Name used in both panel titles.

    Returns:
        The matplotlib Figure holding the two panels.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Filter for pass and run plays, ignoring plays without an EPA value
    passes = pbp_df[pbp_df['play_type'] == 'pass']['epa'].dropna()
    runs = pbp_df[pbp_df['play_type'] == 'run']['epa'].dropna()

    # (axes, EPA values, fill colour, panel title) for each panel
    panels = (
        (axes[0], passes, '#3498db', f'{team_name} Pass EPA Distribution'),
        (axes[1], runs, '#2ecc71', f'{team_name} Rush EPA Distribution'),
    )
    for panel, values, shade, panel_title in panels:
        sns.kdeplot(values, ax=panel, fill=True, color=shade, alpha=0.7)
        # Dashed vertical line at the mean, labelled with its value
        panel.axvline(values.mean(), color='red', linestyle='--',
                      label=f'Mean: {values.mean():.3f}')
        panel.set_title(panel_title, fontsize=12)
        panel.set_xlabel('EPA')
        panel.legend()

    plt.tight_layout()
    return fig
def plot_weekly_epa(pbp_df, team_name):
    """
    Plot total EPA per week as a bar chart, negative weeks in red.

    Args:
        pbp_df: Play-by-play DataFrame with ``week`` and ``epa`` columns.
        team_name: Name used in the chart title.

    Returns:
        The matplotlib Figure.
    """
    per_week = pbp_df.groupby('week')['epa']
    weekly = per_week.agg(['sum', 'mean', 'count']).reset_index()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(weekly['week'], weekly['sum'], color='steelblue', alpha=0.7)
    ax.axhline(0, color='black', linestyle='-', linewidth=0.5)

    # Color bars by positive/negative: bars were added in row order, so the
    # patches line up one-to-one with the weekly totals
    for bar, week_total in zip(ax.patches, weekly['sum']):
        if week_total < 0:
            bar.set_color('#e74c3c')

    ax.set_xlabel('Week', fontsize=12)
    ax.set_ylabel('Total EPA', fontsize=12)
    ax.set_title(f'{team_name} Weekly EPA', fontsize=14, fontweight='bold')
    plt.tight_layout()
    return fig
# Example
# 500 synthetic plays; EPA is drawn from N(0.05, 0.5) regardless of play type.
np.random.seed(42)
sample_pbp = pd.DataFrame({
    'play_type': np.random.choice(['pass', 'run'], 500, p=[0.6, 0.4]),
    'epa': np.random.normal(0.05, 0.5, 500),
    'week': np.random.randint(1, 18, 500)
})
# Only the play-type figure is drawn here; plot_weekly_epa is not called.
fig = plot_epa_by_play_type(sample_pbp, "Kansas City Chiefs")
plt.show()
[EPA distribution plots for pass/run plays and weekly bar chart]
Create Hockey Rink Shot Plot
Draw an NHL rink and plot shots with xG values as colors and sizes, highlighting goals.
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle, Arc
import numpy as np
def draw_rink(ax):
    """
    Draw NHL hockey rink (half rink for shots) on ``ax``.

    Args:
        ax: Matplotlib Axes to draw on.

    Returns:
        The Axes that was drawn on.
    """
    ax.set_facecolor('#FFFFFF')

    # Rink outline (half), left side open
    ax.plot([0, 100, 100, 0], [0, 0, 85, 85], 'black', linewidth=2)

    # Blue line, then goal line
    ax.axvline(x=25, color='blue', linewidth=3)
    ax.axvline(x=89, color='red', linewidth=2)

    # Goal crease: half circle on the goal line opening toward lower x
    ax.add_patch(Arc((89, 42.5), 12, 12, theta1=90, theta2=270,
                     color='blue', linewidth=2))

    # Goal
    ax.plot([89, 93, 93, 89], [40, 40, 45, 45], 'red', linewidth=3)

    # Face-off circles, each with its dot
    for spot_y in (20, 65):
        ax.add_patch(Circle((69, spot_y), 15, fill=False, color='red', linewidth=2))
        ax.plot(69, spot_y, 'ro', markersize=3)

    # Center zone circles
    ax.add_patch(Circle((25, 42.5), 15, fill=False, color='red', linewidth=2))

    ax.set_xlim(-5, 105)
    ax.set_ylim(-5, 90)
    ax.set_aspect('equal')
    ax.axis('off')
    return ax
def plot_shots(ax, shots_df, title):
    """Scatter shots on ``ax``, coloured and sized by xG, with goals starred.

    Args:
        ax: Matplotlib Axes (typically already holding the rink drawing).
        shots_df: DataFrame with ``x``, ``y``, ``xg`` and ``is_goal`` columns.
        title: Title placed above the plot.

    Returns:
        The same Axes.
    """
    xg = shots_df['xg']
    marker_sizes = xg * 200 + 20  # larger marker for higher xG

    shot_markers = ax.scatter(
        shots_df['x'], shots_df['y'],
        c=xg,
        cmap='RdYlGn',
        s=marker_sizes,
        alpha=0.7,
        edgecolors='black',
        linewidths=0.5
    )

    # Overlay a gold star on every shot flagged as a goal.
    scored = shots_df.loc[shots_df['is_goal'] == True]
    ax.scatter(scored['x'], scored['y'], marker='*', s=200,
               c='gold', edgecolors='black', linewidths=1, zorder=5)

    ax.set_title(title, fontsize=14, fontweight='bold')
    plt.colorbar(shot_markers, ax=ax, label='xG', shrink=0.6)
    return ax
# Example
import pandas as pd
fig, ax = plt.subplots(figsize=(12, 8))
draw_rink(ax)
# 30 synthetic shots; each has a 10% chance of being flagged as a goal.
# NOTE: no random seed is set here, so the plot differs on every run.
shots = pd.DataFrame({
    'x': np.random.uniform(50, 88, 30),
    'y': np.random.uniform(20, 65, 30),
    'xg': np.random.uniform(0.02, 0.25, 30),
    'is_goal': np.random.choice([True, False], 30, p=[0.1, 0.9])
})
plot_shots(ax, shots, "Colorado Avalanche Shots")
plt.tight_layout()
plt.show()
[Hockey rink with shot locations colored by xG and starred goals]
Create Player Comparison Radar Chart
Create a radar/spider chart for comparing two players across multiple statistical categories.
import matplotlib.pyplot as plt
import numpy as np
from math import pi
def create_radar_chart(categories, player1_values, player2_values,
                       player1_name, player2_name, title="Player Comparison"):
    """
    Draw a two-player radar (spider) chart.

    Args:
        categories: List of stat categories, one per axis
        player1_values: Normalized values (0-100) for player 1
        player2_values: Normalized values (0-100) for player 2
        player1_name: Legend label for player 1
        player2_name: Legend label for player 2
        title: Chart title

    Returns:
        The matplotlib Figure holding the polar plot.
    """
    axis_count = len(categories)

    # Evenly spaced spoke angles; the first is repeated to close the outline.
    spokes = [idx / float(axis_count) * 2 * pi for idx in range(axis_count)]
    spokes = spokes + spokes[:1]

    # Each series likewise repeats its first value to close the polygon.
    series = (
        (player1_name, list(player1_values) + [player1_values[0]], '#3498db'),
        (player2_name, list(player2_values) + [player2_values[0]], '#e74c3c'),
    )

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

    for label, values, colour in series:
        ax.plot(spokes, values, 'o-', linewidth=2, label=label, color=colour)
        ax.fill(spokes, values, alpha=0.25, color=colour)

    # One tick per category (skip the duplicated closing angle).
    ax.set_xticks(spokes[:-1])
    ax.set_xticklabels(categories, fontsize=11)

    # Fixed 0-100 radial scale.
    ax.set_ylim(0, 100)
    ax.set_yticks([20, 40, 60, 80, 100])
    ax.set_yticklabels(['20', '40', '60', '80', '100'], fontsize=9)

    ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    plt.tight_layout()
    return fig
# Example: NFL QB comparison
# Values are illustrative scores on the 0-100 scale the chart expects,
# listed in the same order as `categories`.
categories = ['Completion %', 'TD Rate', 'INT Avoid', 'Yards/Att',
              'Pressure Rate', 'Deep Ball', 'Red Zone', 'Clutch']
qb1 = [85, 90, 92, 78, 70, 82, 88, 95]  # Mahomes
qb2 = [80, 75, 85, 82, 65, 78, 75, 72]  # Comparable QB (labelled "Josh Allen" below)
fig = create_radar_chart(categories, qb1, qb2,
                         "Patrick Mahomes", "Josh Allen",
                         "2023 QB Comparison")
plt.show()
[Radar chart showing overlapping player performance areas]
Create Rolling Average Performance Chart
Create a rolling average chart to visualize performance trends over a season.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def plot_rolling_performance(df, metric_col, date_col, player_name,
                             window=10, title=None):
    """
    Create rolling average chart to show performance trends.

    Args:
        df: DataFrame with player game-by-game data
        metric_col: Column name of the metric to plot
        date_col: Column name with dates or game numbers
        player_name: Name used in the default chart title
        window: Rolling window size
        title: Optional chart title; defaults to
            "<player_name> <metric_col> Trend"

    Returns:
        The matplotlib Figure holding the chart.
    """
    fig, ax = plt.subplots(figsize=(14, 6))

    # Rolling mean; min_periods=1 lets the first games use a shorter window
    # instead of producing NaN.
    rolling_avg = df[metric_col].rolling(window=window, min_periods=1).mean()

    # Plot raw values
    ax.scatter(df[date_col], df[metric_col], alpha=0.4, s=50,
               c='#3498db', label='Game Value')

    # Plot rolling average
    ax.plot(df[date_col], rolling_avg, color='#e74c3c', linewidth=3,
            label=f'{window}-Game Rolling Avg')

    # Season average line
    season_avg = df[metric_col].mean()
    ax.axhline(season_avg, color='gray', linestyle='--', alpha=0.7,
               label=f'Season Avg: {season_avg:.2f}')

    # Highlight improvement/decline zones
    ax.fill_between(df[date_col], rolling_avg, season_avg,
                    where=(rolling_avg > season_avg),
                    alpha=0.2, color='green', label='Above Average')
    ax.fill_between(df[date_col], rolling_avg, season_avg,
                    where=(rolling_avg <= season_avg),
                    alpha=0.2, color='red', label='Below Average')

    # Styling
    ax.set_xlabel('Game', fontsize=12)
    ax.set_ylabel(metric_col, fontsize=12)
    ax.set_title(title or f'{player_name} {metric_col} Trend',
                 fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # The legend is built last so it also picks up the two shaded zones;
    # artists added after legend() is called are not included in it.
    ax.legend(loc='upper right')

    plt.tight_layout()
    return fig
# Example: Player scoring trend
# 82 synthetic games with points drawn around 25 and clipped to 5-50.
np.random.seed(42)
games = pd.DataFrame({
    'game': range(1, 83),
    'points': np.random.normal(25, 8, 82).clip(5, 50)
})
# Add some trend
# .loc slices by index label, so this raises rows 40-81 (games 41-82);
# it is applied after the clip, so values may exceed 50.
games.loc[40:, 'points'] += 5  # Player improved mid-season
fig = plot_rolling_performance(games, 'points', 'game', "Jayson Tatum", window=10)
plt.show()
[Line chart showing raw performance and rolling average with trend zones]
Create Team Rankings Bump Chart
Create a bump chart showing how team rankings change over time using ggbump.
library(ggplot2)
library(ggbump)
library(dplyr)
# Build a bump chart of rank by week, one coloured line per team.
#   rankings_df: data frame with `week`, `rank` and `team` columns
#   title:       plot title
# Returns the ggplot object.
create_bump_chart <- function(rankings_df, title = "Team Rankings Over Time") {
  ggplot(rankings_df, aes(x = week, y = rank, color = team)) +
    geom_bump(size = 2, smooth = 8) +
    geom_point(size = 4) +
    # Team labels left of the first week and right of the last week
    # (the legend is hidden below, so these identify the lines).
    geom_text(data = rankings_df %>% filter(week == min(week)),
              aes(label = team), x = 0.7, hjust = 1, fontface = "bold") +
    geom_text(data = rankings_df %>% filter(week == max(week)),
              aes(label = team), x = max(rankings_df$week) + 0.3, hjust = 0, fontface = "bold") +
    # Reverse the axis so rank 1 is on top; breaks are fixed at 1-10.
    scale_y_reverse(breaks = 1:10) +
    scale_x_continuous(breaks = 1:max(rankings_df$week)) +
    labs(title = title,
         x = "Week",
         y = "Ranking") +
    theme_minimal() +
    theme(
      legend.position = "none",
      panel.grid.major.y = element_blank(),
      panel.grid.minor = element_blank(),
      axis.text = element_text(size = 11),
      plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
    )
}
# Example data
# Five teams over ten weeks. Each team's weekly rank is its base rank plus a
# random offset in -2..2, then clamped to 1..5.
# NOTE: ranks are drawn independently per team, so two teams can share a rank
# in the same week.
set.seed(42)
teams <- c("Chiefs", "49ers", "Eagles", "Bills", "Cowboys")
rankings <- expand.grid(team = teams, week = 1:10) %>%
  group_by(team) %>%
  mutate(
    base_rank = case_when(
      team == "Chiefs" ~ 1,
      team == "49ers" ~ 2,
      team == "Eagles" ~ 3,
      team == "Bills" ~ 4,
      team == "Cowboys" ~ 5
    ),
    rank = base_rank + sample(-2:2, n(), replace = TRUE)
  ) %>%
  mutate(rank = pmin(pmax(rank, 1), 5)) %>%
  ungroup()
create_bump_chart(rankings, "NFL Power Rankings by Week")
[Bump chart showing team ranking trajectories across weeks]
Calculate WAR Components
Calculate Wins Above Replacement (WAR) from its component parts: batting, baserunning, fielding, and adjustments.
import pandas as pd
import numpy as np
def calculate_batting_runs(row, league_woba, woba_scale):
    """Return weighted runs above average (wRAA) for one batter.

    Args:
        row: Mapping or Series with ``wOBA`` and ``PA`` entries.
        league_woba: League-average wOBA.
        woba_scale: wOBA scale factor.
    """
    player_woba, plate_appearances = row['wOBA'], row['PA']
    # Runs above average per plate appearance, scaled up by playing time.
    runs_per_pa = (player_woba - league_woba) / woba_scale
    return runs_per_pa * plate_appearances
def calculate_war(batting_runs, baserunning_runs, fielding_runs,
                  positional_adj, league_adj, replacement_runs, rpw=10):
    """
    Convert run components into Wins Above Replacement.

    Args:
        batting_runs: Offensive runs above average
        baserunning_runs: Baserunning runs
        fielding_runs: Defensive runs above average
        positional_adj: Position adjustment
        league_adj: League adjustment
        replacement_runs: Runs vs replacement level
        rpw: Runs per win (default 10)

    Returns:
        WAR rounded to one decimal place
    """
    # Add the components in their listed order.
    total_runs = batting_runs
    for component in (baserunning_runs, fielding_runs, positional_adj,
                      league_adj, replacement_runs):
        total_runs = total_runs + component
    return round(total_runs / rpw, 1)
# Example
# The six components sum to 47.2 runs; at the default 10 runs per win
# that is 4.72, which calculate_war rounds to 4.7.
player_war = calculate_war(
    batting_runs=35.2,
    baserunning_runs=2.5,
    fielding_runs=-5.0,
    positional_adj=-7.5,
    league_adj=2.0,
    replacement_runs=20.0
)
print(f"Player WAR: {player_war}")
Player WAR: 4.7
Calculate wOBA and wRC+
Calculate wOBA (weighted On-Base Average) and wRC+ from raw batting statistics.
import pandas as pd
def calculate_woba(bb, hbp, singles, doubles, triples, hr, ab, sf, ibb=0):
    """
    Compute weighted On-Base Average from counting stats.

    Intentional walks (``ibb``) are removed from both the walk credit and
    the denominator. Weights are approximate (2023 values).

    Returns:
        wOBA, or 0 when the denominator is not positive.
    """
    unintentional_walks = bb - ibb
    weighted_events = (0.69 * unintentional_walks + 0.72 * hbp + 0.88 * singles +
                       1.24 * doubles + 1.56 * triples + 2.00 * hr)
    opportunities = ab + bb - ibb + sf + hbp
    if opportunities > 0:
        return weighted_events / opportunities
    return 0
def calculate_wrc_plus(woba, pa, league_woba=0.320, woba_scale=1.15,
                       park_factor=100, league_rppa=0.12):
    """
    Calculate wRC+ (park and league adjusted runs created).
    100 = league average.

    Args:
        woba: Player wOBA
        pa: Plate appearances. Kept for interface compatibility; wRC+ is a
            rate stat and the result does not depend on it.
        league_woba: League-average wOBA
        woba_scale: wOBA scale factor
        park_factor: Park factor where 100 is neutral
        league_rppa: League runs per plate appearance

    Returns:
        wRC+ rounded to the nearest integer.
    """
    # Runs above average per PA. This used to be multiplied by `pa` and then
    # divided by `pa` again, which crashed on pa == 0 for no benefit.
    runs_above_avg_per_pa = (woba - league_woba) / woba_scale
    # Park adjustment: zero for a neutral park, negative for hitter-friendly.
    park_adjustment = league_rppa - (park_factor / 100) * league_rppa
    wrc_per_pa = (runs_above_avg_per_pa + league_rppa) + park_adjustment
    return round((wrc_per_pa / league_rppa) * 100)
# Example player stats
player = {
    'BB': 60, 'HBP': 5, '1B': 100, '2B': 30,
    '3B': 5, 'HR': 25, 'AB': 500, 'SF': 4, 'PA': 600
}
# No IBB is passed, so calculate_woba uses its default of 0.
woba = calculate_woba(
    player['BB'], player['HBP'], player['1B'], player['2B'],
    player['3B'], player['HR'], player['AB'], player['SF']
)
# League constants are left at calculate_wrc_plus's defaults.
wrc_plus = calculate_wrc_plus(woba, player['PA'])
print(f"wOBA: {woba:.3f}")
print(f"wRC+: {wrc_plus}")
wOBA: 0.401 wRC+: 158
Calculate FIP and xFIP
Calculate FIP (Fielding Independent Pitching) and xFIP which regresses home runs to league average fly ball rate.
calculate_fip <- function(hr, bb, hbp, k, ip, fip_constant = 3.10) {
  # Fielding Independent Pitching: weights home runs, walks + hit batters and
  # strikeouts per inning, then shifts by the league constant.
  weighted_events <- 13 * hr + 3 * (bb + hbp) - 2 * k
  round(weighted_events / ip + fip_constant, 2)
}
calculate_xfip <- function(fb, lg_hr_fb_rate, bb, hbp, k, ip, fip_constant = 3.10) {
  # Expected FIP: same formula as FIP, but actual home runs are replaced by
  # fly balls times the league HR-per-fly-ball rate.
  hr_expected <- fb * lg_hr_fb_rate
  per_inning <- (13 * hr_expected + 3 * (bb + hbp) - 2 * k) / ip
  round(per_inning + fip_constant, 2)
}
# Example pitcher stats
pitcher <- list(
  HR = 20, BB = 50, HBP = 5, K = 180, IP = 180,
  FB = 200 # Fly balls allowed
)
fip <- calculate_fip(pitcher$HR, pitcher$BB, pitcher$HBP, pitcher$K, pitcher$IP)
# With a 10% league HR/FB rate, 200 fly balls give 20 expected home runs,
# the same as the actual HR total, so FIP and xFIP coincide here.
xfip <- calculate_xfip(pitcher$FB, 0.10, pitcher$BB, pitcher$HBP, pitcher$K, pitcher$IP)
cat("FIP:", fip, "\n")
cat("xFIP:", xfip, "\n")
FIP: 3.46 xFIP: 3.46
Calculate Advanced Shooting Metrics
Calculate True Shooting %, Effective FG%, and Points Per Shot for accurate shooting efficiency analysis.
def true_shooting_pct(points, fga, fta):
    """True Shooting Percentage - most accurate shooting efficiency.

    Args:
        points: Points scored
        fga: Field goal attempts
        fta: Free throw attempts (weighted 0.44 per attempt)

    Returns:
        TS% on a 0-100 scale, or 0.0 when there were no shooting attempts.
    """
    shooting_possessions = fga + 0.44 * fta
    # Guard the empty box score instead of raising ZeroDivisionError.
    if shooting_possessions <= 0:
        return 0.0
    return points / (2 * shooting_possessions) * 100
def effective_fg_pct(fgm, three_pm, fga):
    """eFG% - adjusts for 3-point value.

    Args:
        fgm: Field goals made (twos and threes)
        three_pm: Three-pointers made (each gets an extra half credit)
        fga: Field goal attempts

    Returns:
        eFG% on a 0-100 scale, or 0.0 when there were no attempts.
    """
    # Guard the no-attempt case instead of raising ZeroDivisionError.
    if fga <= 0:
        return 0.0
    return (fgm + 0.5 * three_pm) / fga * 100
def points_per_shot(points, fga, fta):
    """Points generated per shooting possession.

    A shooting possession is ``fga + 0.44 * fta``.

    Returns:
        Points per shooting possession, or 0.0 when there were none.
    """
    shooting_possessions = fga + 0.44 * fta
    # Guard the empty box score instead of raising ZeroDivisionError.
    if shooting_possessions <= 0:
        return 0.0
    return points / shooting_possessions
# Example player game
# 'ftm' is included for completeness; none of the three metrics below read it.
game = {
    'points': 35, 'fga': 22, 'fgm': 12,
    'three_pm': 5, 'fta': 8, 'ftm': 6
}
ts = true_shooting_pct(game['points'], game['fga'], game['fta'])
efg = effective_fg_pct(game['fgm'], game['three_pm'], game['fga'])
pps = points_per_shot(game['points'], game['fga'], game['fta'])
print(f"True Shooting %: {ts:.1f}%")
print(f"Effective FG %: {efg:.1f}%")
print(f"Points Per Shot: {pps:.2f}")
True Shooting %: 68.6% Effective FG %: 65.9% Points Per Shot: 1.37
Calculate PER (Player Efficiency Rating)
Player Efficiency Rating (PER) calculation framework. Full implementation requires team and league adjustment factors.
def calculate_per(stats, team_stats, league_stats):
    """
    Calculate Player Efficiency Rating (simplified version).
    Note: Full PER calculation involves many adjustments.
    This is a simplified approximation.

    Args:
        stats: Player totals; keys read are 'min', '3pm', 'ast', 'fg', 'ft',
            'tov', 'fga', 'fta', 'trb', 'orb', 'stl', 'blk', 'pf'.
        team_stats: Team totals; keys read are 'ast' and 'fg'.
        league_stats: League totals; keys read are 'ast', 'fg', 'ft', 'pts',
            'fga', 'orb', 'tov', 'fta', 'trb', 'pf'.

    Returns:
        The unadjusted per-minute rating (uPER) rounded to one decimal.
        No pace or league normalization is applied here.

    NOTE(review): uPER is a per-minute value, so rounding to one decimal
    discards most of its precision - confirm this is intended.
    """
    # Factor calculations
    # League-wide assist factor used to discount assisted field goals.
    factor = (2/3) - (0.5 * (league_stats['ast'] / league_stats['fg'])) / \
        (2 * (league_stats['fg'] / league_stats['ft']))
    # Value of possession: league points per possession estimate.
    vop = league_stats['pts'] / (league_stats['fga'] - league_stats['orb'] +
                                 league_stats['tov'] + 0.44 * league_stats['fta'])
    # Share of league rebounds that are defensive.
    drbp = (league_stats['trb'] - league_stats['orb']) / league_stats['trb']
    # uPER calculation (unadjusted)
    # Positive terms credit scoring, assists, rebounds, steals and blocks;
    # negative terms charge turnovers, missed shots and fouls.
    uper = (1 / stats['min']) * (
        stats['3pm'] +
        (2/3) * stats['ast'] +
        (2 - factor * (team_stats['ast'] / team_stats['fg'])) * stats['fg'] +
        stats['ft'] * 0.5 * (1 + (1 - (team_stats['ast'] / team_stats['fg'])) +
                             (2/3) * (team_stats['ast'] / team_stats['fg'])) -
        vop * stats['tov'] -
        vop * drbp * (stats['fga'] - stats['fg']) -
        vop * 0.44 * (0.44 + (0.56 * drbp)) * (stats['fta'] - stats['ft']) +
        vop * (1 - drbp) * (stats['trb'] - stats['orb']) +
        vop * drbp * stats['orb'] +
        vop * stats['stl'] +
        vop * drbp * stats['blk'] -
        stats['pf'] * (league_stats['ft'] / league_stats['pf'] - 0.44 *
                       (league_stats['fta'] / league_stats['pf']) * vop)
    )
    return round(uper, 1)
print("PER calculation requires full season/league context.")
PER calculation requires full season/league context.
Calculate Four Factors
Calculate Dean Oliver's Four Factors: eFG%, Turnover Rate, Offensive Rebounding Rate, and Free Throw Rate.
# Dean Oliver's Four Factors of Basketball Success
# Compute Dean Oliver's Four Factors for one team.
#   fgm, fga, three_pm: field goals made / attempted, three-pointers made
#   tov:                turnovers
#   orb, opp_drb:       own offensive rebounds, opponent defensive rebounds
#   ftm:                free throws made
#   pts_allowed, opp_pts: accepted for compatibility; not used
#   fta:                free throw attempts (optional). The standard turnover
#                       rate uses attempts; when fta is NULL the function
#                       falls back to ftm, as it always did before.
# Returns a list of percentages rounded to one decimal.
calculate_four_factors <- function(fgm, fga, three_pm, tov, orb, opp_drb,
                                   ftm, pts_allowed, opp_pts, fta = NULL) {
  # 1. Effective FG% (~40% weight)
  efg <- (fgm + 0.5 * three_pm) / fga
  # 2. Turnover Rate (~25% weight): TOV / (FGA + 0.44 * FTA + TOV)
  ft_trips <- if (is.null(fta)) ftm else fta
  tov_rate <- tov / (fga + 0.44 * ft_trips + tov)
  # 3. Offensive Rebounding Rate (~20% weight)
  orb_rate <- orb / (orb + opp_drb)
  # 4. Free Throw Rate (~15% weight): made free throws per field goal attempt
  ft_rate <- ftm / fga
  return(list(
    eFG_pct = round(efg * 100, 1),
    TOV_pct = round(tov_rate * 100, 1),
    ORB_pct = round(orb_rate * 100, 1),
    FT_rate = round(ft_rate * 100, 1)
  ))
}
# Example team game stats
factors <- calculate_four_factors(
  fgm = 42, fga = 88, three_pm = 12,
  tov = 12, orb = 10, opp_drb = 35,
  ftm = 18, pts_allowed = 105, opp_pts = 110
)
# cat() separates its arguments with a space, so each line prints as
# "label: value %".
cat("Four Factors Analysis:\n")
cat("eFG%:", factors$eFG_pct, "%\n")
cat("TOV%:", factors$TOV_pct, "%\n")
cat("ORB%:", factors$ORB_pct, "%\n")
cat("FT Rate:", factors$FT_rate, "%\n")
Four Factors Analysis: eFG%: 54.5% TOV%: 11.1% ORB%: 22.2% FT Rate: 20.5%
Calculate NFL Passer Rating
Calculate NFL Passer Rating from completions, attempts, yards, touchdowns, and interceptions.
def calculate_passer_rating(comp, att, yards, td, int_):
    """
    Compute the NFL passer rating (0-158.3 scale).

    Four components (completion rate, yards per attempt, touchdown rate and
    interception rate) are each clamped to the range 0-2.375, summed,
    divided by 6 and multiplied by 100.

    Returns:
        The rating rounded to one decimal place.
    """
    def bounded(component):
        # Clamp a raw component into [0, 2.375].
        return max(0, min(component, 2.375))

    completion_part = bounded(((comp / att) - 0.3) * 5)
    yardage_part = bounded(((yards / att) - 3) * 0.25)
    touchdown_part = bounded((td / att) * 20)
    # Interceptions count against the passer, so this one starts at the cap.
    interception_part = bounded(2.375 - ((int_ / att) * 25))

    total = completion_part + yardage_part + touchdown_part + interception_part
    return round((total / 6) * 100, 1)
# Example game
# The dict is unpacked with **, so its keys must match the function's
# parameter names; interceptions is `int_` (with `'int'` the call raised
# TypeError: unexpected keyword argument).
game = {'comp': 28, 'att': 35, 'yards': 350, 'td': 4, 'int_': 0}
rating = calculate_passer_rating(**game)
print(f"Passer Rating: {rating}")
# Perfect passer rating requirements
print("\nFor 158.3 rating, need:")
print("- 77.5%+ completion")
print("- 12.5+ yards/attempt")
print("- 11.875%+ TD rate")
print("- 0% INT rate")
Passer Rating: 146.4 For 158.3 rating, need: - 77.5%+ completion - 12.5+ yards/attempt - 11.875%+ TD rate - 0% INT rate
Calculate Expected Points Added (EPA)
Calculate Expected Points Added (EPA) using a simplified expected points model based on down, distance, and field position.
import numpy as np
def get_expected_points(down, distance, yard_line):
    """
    Toy expected-points model from down, distance and field position.

    Real models are fitted to historical play-by-play data; this one is a
    linear stand-in for demonstration.

    Returns:
        Expected points rounded to two decimals.
    """
    # Field position: 0 at own goal line rising linearly to 7 at the opponent's.
    field_position_value = (yard_line / 100) * 7

    # Later downs are worth less; unknown downs get no adjustment.
    penalty_by_down = {1: 0, 2: -0.5, 3: -1.0, 4: -2.0}
    expected = field_position_value + penalty_by_down.get(down, 0)

    # Before 4th down, every yard to go beyond 10 costs 0.05 points
    # (and every yard under 10 adds 0.05).
    if down < 4:
        expected -= (distance - 10) * 0.05

    return round(expected, 2)
def calculate_epa(ep_before, ep_after, points_scored=0):
    """Return Expected Points Added for one play, rounded to two decimals.

    EPA is the change in expected points plus any points scored on the play.
    """
    change = ep_after - ep_before
    return round(change + points_scored, 2)
# Example: 15-yard gain on 1st and 10 from own 25
# yard_line counts up from the offense's own goal line, so the ball moves
# from 25 to 40 and the next play is again 1st and 10.
ep_before = get_expected_points(down=1, distance=10, yard_line=25)
ep_after = get_expected_points(down=1, distance=10, yard_line=40)
epa = calculate_epa(ep_before, ep_after)
print(f"EP Before: {ep_before}")
print(f"EP After: {ep_after}")
print(f"EPA: {epa}")
EP Before: 1.75 EP After: 2.8 EPA: 1.05
Calculate xG from Shot Location
Simplified xG model based on shot distance and angle. Real models use machine learning with additional features.
import numpy as np
def calculate_xg(distance, angle, is_header=False, is_penalty=False):
    """
    Toy expected-goals estimate from shot distance and angle.

    Production xG models also use body part, assist type, game state and
    defender positions; this version only combines:
    - an exponential decay in distance,
    - the squared sine of the angle (given in degrees),
    - a 0.7 multiplier for headers.

    Returns:
        0.76 for penalties; otherwise the estimate capped at 0.95 and
        rounded to three decimals.
    """
    if is_penalty:
        return 0.76  # Historical penalty conversion rate

    distance_term = np.exp(-0.1 * distance)
    angle_term = np.sin(np.radians(angle)) ** 2
    body_part_term = 0.7 if is_header else 1.0

    raw_xg = distance_term * angle_term * body_part_term
    capped = min(raw_xg, 0.95)
    return round(capped, 3)
# Example shots
# Dict keys match calculate_xg's parameter names because each shot is
# unpacked with ** below.
shots = [
    {'distance': 6, 'angle': 45, 'is_header': False},  # Close range
    {'distance': 18, 'angle': 30, 'is_header': False},  # Edge of box
    {'distance': 25, 'angle': 20, 'is_header': False},  # Long range
    {'distance': 8, 'angle': 35, 'is_header': True},  # Header
]
for shot in shots:
    xg = calculate_xg(**shot)
    print(f"Distance: {shot['distance']}m, Angle: {shot['angle']}°, "
          f"Header: {shot['is_header']}, xG: {xg}")
Distance: 6m, Angle: 45°, Header: False, xG: 0.274 Distance: 18m, Angle: 30°, Header: False, xG: 0.041 Distance: 25m, Angle: 20°, Header: False, xG: 0.01 Distance: 8m, Angle: 35°, Header: True, xG: 0.103
Calculate PPDA (Pressing Intensity)
Calculate PPDA (pressing intensity) and Field Tilt (territorial dominance) from match event data.
def calculate_ppda(opponent_passes_def_third, defensive_actions_opp_third):
    """
    Passes Per Defensive Action (PPDA): opponent passes divided by your
    defensive actions. Lower values mean more aggressive pressing.

    Rough guide:
    - < 8: Very high press (e.g., Liverpool, Dortmund)
    - 8-10: High press
    - 10-12: Medium press
    - > 12: Low press

    Args:
        opponent_passes_def_third: Opponent passes in their defensive third
        defensive_actions_opp_third: Your defensive actions in opponent's def third

    Returns:
        PPDA rounded to two decimals, or infinity when no defensive
        actions were recorded.
    """
    if defensive_actions_opp_third != 0:
        return round(opponent_passes_def_third / defensive_actions_opp_third, 2)
    return float('inf')
def calculate_field_tilt(own_final_third_touches, opp_final_third_touches):
    """
    Field tilt: the team's share (in %) of all final-third touches.
    > 60% = dominant possession in attacking areas.

    Returns:
        Percentage rounded to one decimal; 50.0 when neither side has
        any final-third touches.
    """
    combined = own_final_third_touches + opp_final_third_touches
    if combined > 0:
        return round((own_final_third_touches / combined) * 100, 1)
    return 50.0
# Example match data
match = {
    'opp_passes_def_third': 85,
    'def_actions_opp_third': 12,
    'own_final_third_touches': 180,
    'opp_final_third_touches': 95
}
# 85 / 12 = 7.08 passes allowed per defensive action.
ppda = calculate_ppda(match['opp_passes_def_third'], match['def_actions_opp_third'])
# 180 of the 275 combined final-third touches belong to this team.
tilt = calculate_field_tilt(match['own_final_third_touches'], match['opp_final_third_touches'])
print(f"PPDA: {ppda} (lower = more pressing)")
print(f"Field Tilt: {tilt}%")
PPDA: 7.08 (lower = more pressing) Field Tilt: 65.5%
Calculate Corsi and Fenwick
Calculate Corsi, Fenwick, and PDO - foundational possession and luck metrics in hockey analytics.
def calculate_corsi(shots_for, shots_against, missed_for, missed_against,
                    blocked_for, blocked_against):
    """
    Corsi: all shot attempts (on goal + missed + blocked), for and against.
    CF% > 50% = controlling play

    Returns:
        Dict with 'CF', 'CA' and 'CF%' (one decimal; 50 when there were
        no attempts either way).
    """
    attempts_for = shots_for + missed_for + blocked_for
    attempts_against = shots_against + missed_against + blocked_against
    all_attempts = attempts_for + attempts_against

    if all_attempts > 0:
        share_for = (attempts_for / all_attempts) * 100
    else:
        share_for = 50

    return {'CF': attempts_for, 'CA': attempts_against, 'CF%': round(share_for, 1)}
def calculate_fenwick(shots_for, shots_against, missed_for, missed_against):
    """
    Fenwick: unblocked shot attempts (on goal + missed), for and against.
    Some prefer Fenwick as blocked shots are partially random.

    Returns:
        Dict with 'FF', 'FA' and 'FF%' (one decimal; 50 when there were
        no attempts either way).
    """
    unblocked_for = shots_for + missed_for
    unblocked_against = shots_against + missed_against
    all_unblocked = unblocked_for + unblocked_against

    if all_unblocked > 0:
        share_for = (unblocked_for / all_unblocked) * 100
    else:
        share_for = 50

    return {'FF': unblocked_for, 'FA': unblocked_against, 'FF%': round(share_for, 1)}
def calculate_pdo(sh_pct, sv_pct):
    """
    PDO: on-ice shooting percentage plus save percentage.
    Regresses strongly to 100 (or 1.000), so a high PDO often indicates luck.

    Returns:
        The sum rounded to one decimal, in the same units as the inputs.
    """
    combined = sh_pct + sv_pct
    return round(combined, 1)
# Example player on-ice stats
# sh_pct and sv_pct are on a 0-100 scale, so PDO comes out near 100.
on_ice = {
    'shots_for': 30, 'shots_against': 25,
    'missed_for': 12, 'missed_against': 10,
    'blocked_for': 8, 'blocked_against': 6,
    'sh_pct': 12.0, 'sv_pct': 92.5
}
corsi = calculate_corsi(
    on_ice['shots_for'], on_ice['shots_against'],
    on_ice['missed_for'], on_ice['missed_against'],
    on_ice['blocked_for'], on_ice['blocked_against']
)
# Fenwick takes the same inputs minus the blocked-shot counts.
fenwick = calculate_fenwick(
    on_ice['shots_for'], on_ice['shots_against'],
    on_ice['missed_for'], on_ice['missed_against']
)
pdo = calculate_pdo(on_ice['sh_pct'], on_ice['sv_pct'])
print(f"Corsi: {corsi}")
print(f"Fenwick: {fenwick}")
print(f"PDO: {pdo}")
Corsi: {'CF': 50, 'CA': 41, 'CF%': 54.9}
Fenwick: {'FF': 42, 'FA': 35, 'FF%': 54.5}
PDO: 104.5
Calculate Strokes Gained
Calculate Strokes Gained for individual shots using baseline expected strokes from each position.
import numpy as np
# Baseline expected strokes to hole out, keyed by lie and then by a
# distance breakpoint.
EXPECTED_STROKES = {
    'tee': {250: 4.1, 300: 3.9, 350: 3.8, 400: 4.0, 450: 4.2},
    'fairway': {50: 2.8, 100: 2.9, 150: 3.0, 200: 3.2, 250: 3.5},
    'rough': {50: 3.0, 100: 3.1, 150: 3.2, 200: 3.5, 250: 3.8},
    'green': {5: 1.5, 10: 1.8, 20: 2.0, 30: 2.1, 50: 2.3}
}


def get_expected_strokes(distance, lie):
    """Look up the baseline expected strokes for a position.

    The value comes from the smallest breakpoint that is >= ``distance``;
    beyond the largest breakpoint the last entry is used. Unknown lies
    return 3.0.
    """
    table = EXPECTED_STROKES.get(lie)
    if table is None:
        return 3.0

    breakpoints = sorted(table)
    bucket = next((b for b in breakpoints if distance <= b), breakpoints[-1])
    return table[bucket]
def calculate_strokes_gained(start_lie, start_dist, end_lie, end_dist, strokes_taken=1):
    """
    Strokes gained for a single shot.

    SG = Expected_before - Expected_after - Strokes_taken, where the
    expected value after a holed shot (``end_lie == 'hole'``) is 0.

    Returns:
        Strokes gained rounded to two decimals.
    """
    expected_from_start = get_expected_strokes(start_dist, start_lie)
    if end_lie == 'hole':
        expected_from_end = 0
    else:
        expected_from_end = get_expected_strokes(end_dist, end_lie)
    return round(expected_from_start - expected_from_end - strokes_taken, 2)
# Example: Drive from tee
# 450 from the tee (4.2 expected) to 150 in the fairway (3.0 expected).
drive_sg = calculate_strokes_gained('tee', 450, 'fairway', 150)
print(f"Drive SG: {drive_sg}")
# Example: Approach shot
# 15 on the green falls in the 20 bucket of the lookup table (2.0 expected).
approach_sg = calculate_strokes_gained('fairway', 150, 'green', 15)
print(f"Approach SG: {approach_sg}")
# Example: Holed putt
# A holed shot leaves 0 expected strokes.
putt_sg = calculate_strokes_gained('green', 15, 'hole', 0)
print(f"Putt SG: {putt_sg}")
Drive SG: 0.2 Approach SG: 0.0 Putt SG: 1.0
Calculate Tennis Performance Metrics
Calculate key tennis performance metrics: Service Points Won, Return Points Won, Dominance Ratio, and Performance Index.
def service_points_won(first_in, first_won, second_won, double_faults, total_serves):
    """Calculate overall service points won percentage.

    Args:
        first_in: First serves in (accepted for compatibility; not used)
        first_won: Points won on first serve
        second_won: Points won on second serve
        double_faults: Double faults (accepted for compatibility; not used)
        total_serves: Total service points played

    Returns:
        Percentage of service points won, rounded to one decimal;
        0 when no service points were played.
    """
    # The old `second_serves` local was computed but never read; removed.
    total_won = first_won + second_won
    spw = total_won / total_serves * 100 if total_serves > 0 else 0
    return round(spw, 1)
def return_points_won(first_ret_won, first_faced, second_ret_won, second_faced):
    """Percentage of return points won across first and second serves faced.

    Returns:
        Percentage rounded to one decimal; 0 when no serves were faced.
    """
    points_won = first_ret_won + second_ret_won
    serves_faced = first_faced + second_faced
    if serves_faced > 0:
        pct = points_won / serves_faced * 100
    else:
        pct = 0
    return round(pct, 1)
def dominance_ratio(spw_pct, rpw_pct):
    """
    Dominance Ratio = SPW% / (100 - RPW%)

    The denominator is the opponent's service points won percentage.
    > 1.0 means winning more on serve than opponent

    Returns:
        Ratio rounded to two decimals; 1.0 when the opponent won no
        service points.
    """
    opponent_spw = 100 - rpw_pct
    if opponent_spw > 0:
        return round(spw_pct / opponent_spw, 2)
    return 1.0
def performance_index(spw_pct, rpw_pct):
    """
    PI = SPW% + RPW% - 100
    Positive = outperforming opponent

    Returns:
        The index rounded to one decimal.
    """
    index = spw_pct + rpw_pct - 100
    return round(index, 1)
# Example match stats
match = {
    'first_in': 50, 'first_won': 38, 'second_won': 18,
    'double_faults': 3, 'total_serves': 75,
    'first_ret_won': 18, 'first_faced': 55,
    'second_ret_won': 14, 'second_faced': 25
}
spw = service_points_won(match['first_in'], match['first_won'],
                         match['second_won'], match['double_faults'],
                         match['total_serves'])
rpw = return_points_won(match['first_ret_won'], match['first_faced'],
                        match['second_ret_won'], match['second_faced'])
# The two derived metrics take the already-rounded percentages as input.
print(f"Service Points Won: {spw}%")
print(f"Return Points Won: {rpw}%")
print(f"Dominance Ratio: {dominance_ratio(spw, rpw)}")
print(f"Performance Index: {performance_index(spw, rpw)}")
Service Points Won: 74.7% Return Points Won: 40.0% Dominance Ratio: 1.25 Performance Index: 14.7
Get Statcast Data with pybaseball
Fetch granular Statcast tracking data including exit velocity, launch angle, and expected stats based on batted ball quality.
from pybaseball import statcast
import pandas as pd
# Get Statcast data for a date range
statcast_data = statcast(start_dt="2023-06-01", end_dt="2023-06-07")
# Filter for home runs
# NOTE: home_runs is not used below; the print shows the first rows of the
# full data set, not only home runs.
home_runs = statcast_data[statcast_data['events'] == 'home_run']
# Key Statcast metrics
print(statcast_data[['player_name', 'launch_speed', 'launch_angle',
                     'hit_distance_sc', 'estimated_ba_using_speedangle']].head())
player_name launch_speed launch_angle hit_distance_sc estimated_ba 0 Shohei Ohtani 108.5 28.0 425.0 0.940 1 Mike Trout 104.2 32.0 398.0 0.850
Fetch MLB Data with baseballr
The baseballr package provides access to FanGraphs, Baseball Reference, and MLB data in R.
library(baseballr)
library(dplyr)
# Get team batting stats
# NOTE(review): argument names and positions for the fg_* functions have
# changed between baseballr releases - confirm against the installed version.
team_batting <- fg_team_batter(2023)
# Top 5 teams by wRC+
team_batting %>%
  select(team_name, wRC_plus, BB_pct, K_pct, ISO) %>%
  arrange(desc(wRC_plus)) %>%
  head(5)
# Get individual player stats
fg_batters <- fg_batter_leaders(2023, qual = 400)
print(paste("Retrieved", nrow(fg_batters), "qualified batters"))
team_name wRC_plus BB_pct K_pct ISO 1 Dodgers 118 9.8% 20.1% .186 2 Braves 115 9.2% 22.3% .198
Fetch NBA Stats with nba_api
The nba_api library provides comprehensive access to NBA.com statistics including player stats, team data, and play-by-play.
from nba_api.stats.endpoints import leagueleaders, playercareerstats
from nba_api.stats.static import players
import pandas as pd
# Get league leaders for current season
leaders = leagueleaders.LeagueLeaders(season='2023-24')
# The first frame returned holds the leaders table.
df = leaders.get_data_frames()[0]
# Top scorers
top_scorers = df.nlargest(10, 'PTS')[['PLAYER', 'TEAM', 'GP', 'PTS', 'AST', 'REB']]
print(top_scorers)
# Get specific player career stats
# [0] takes the first name match and raises IndexError if nothing matches.
player_dict = players.find_players_by_full_name("LeBron James")[0]
career = playercareerstats.PlayerCareerStats(player_id=player_dict['id'])
print(career.get_data_frames()[0][['SEASON_ID', 'PTS', 'AST', 'REB']].tail())
PLAYER TEAM GP PTS AST REB 0 Luka Doncic DAL 70 33.9 9.8 9.2 1 Giannis A. MIL 73 30.4 6.5 11.5
Get NBA Play-by-Play Data
Fetch detailed play-by-play data for NBA games including shot locations, player actions, and game context.
from nba_api.stats.endpoints import playbyplayv2
import pandas as pd
# Get play-by-play for a specific game
game_id = "0022300001"  # Example game ID
pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
df = pbp.get_data_frames()[0]
# Filter for scoring plays
# Rows with a non-null SCOREMARGIN are treated as scoring plays here.
scoring = df[df['SCOREMARGIN'].notna()][[
    'PERIOD', 'PCTIMESTRING', 'HOMEDESCRIPTION',
    'VISITORDESCRIPTION', 'SCORE'
]]
print(f"Total plays: {len(df)}")
print(f"Scoring plays: {len(scoring)}")
Total plays: 478 Scoring plays: 212
Fetch NBA Data with hoopR
hoopR provides easy access to NBA and WNBA data in R with pre-loaded datasets for efficient analysis.
library(hoopR)
library(dplyr)
# Load NBA player box scores
nba_box <- load_nba_player_box(seasons = 2024)
# Get season averages
# One row per player/team pair, so a traded player gets one row per team.
player_avgs <- nba_box %>%
  group_by(athlete_display_name, team_short_display_name) %>%
  summarize(
    games = n(),
    ppg = mean(points, na.rm = TRUE),
    rpg = mean(rebounds, na.rm = TRUE),
    apg = mean(assists, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Keep players with at least 20 box-score rows, highest scorers first.
  filter(games >= 20) %>%
  arrange(desc(ppg))
head(player_avgs, 10)
# A tibble: 10 x 5 athlete_display_name team_short_display_name games ppg rpg 1 Luka Dončić DAL 70 33.9 9.2
Load NFL Play-by-Play with nfl_data_py
nfl_data_py provides access to NFL play-by-play data with pre-calculated EPA, WPA, and other advanced metrics.
import nfl_data_py as nfl
import pandas as pd
# Load play-by-play data
pbp = nfl.import_pbp_data([2023])
print(f"Total plays: {len(pbp)}")
# Filter for pass plays
pass_plays = pbp[pbp['play_type'] == 'pass']
# Get EPA leaders
# Aggregating with a list of functions yields a two-level column index,
# which is flattened to plain names on the next line.
qb_epa = pass_plays.groupby('passer_player_name').agg({
    'epa': ['sum', 'mean', 'count']
}).round(3)
qb_epa.columns = ['total_epa', 'epa_per_play', 'attempts']
# "attempts" is the count of non-null EPA values per passer.
qb_epa = qb_epa[qb_epa['attempts'] >= 200].sort_values('total_epa', ascending=False)
print(qb_epa.head(10))
total_epa epa_per_play attempts passer_player_name T.Tagovailoa 152.34 0.281 542 J.Goff 145.21 0.265 548
Load NFL Data with nflfastR
nflfastR is the premier NFL analytics package in R, providing play-by-play data with EPA, CPOE, and other advanced metrics.
library(nflfastR)
library(dplyr)
# Load play-by-play data
pbp <- load_pbp(2023)
# Calculate QB efficiency metrics
# Plays without an EPA value are dropped before aggregating; cpoe may still
# be NA on some plays, hence na.rm = TRUE.
qb_stats <- pbp %>%
  filter(!is.na(epa), play_type == "pass") %>%
  group_by(passer_player_name, posteam) %>%
  summarize(
    plays = n(),
    total_epa = sum(epa),
    epa_play = mean(epa),
    cpoe = mean(cpoe, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Minimum 200 pass plays, ranked by total EPA.
  filter(plays >= 200) %>%
  arrange(desc(total_epa))
head(qb_stats, 10)
# A tibble: 10 x 6 passer_player_name posteam plays total_epa epa_play cpoe 1 T.Tagovailoa MIA 542 152.3 0.281 4.21 2 J.Goff DET 548 145.2 0.265 3.89
Access StatsBomb Open Data
StatsBomb provides free open data for selected competitions. Their data includes detailed event data with xG values.
from statsbombpy import sb
import pandas as pd
# Get available competitions
comps = sb.competitions()
print(comps[['competition_name', 'season_name']].drop_duplicates().head(10))
# Get matches for a competition
matches = sb.matches(competition_id=43, season_id=106)  # World Cup 2022
print(f"Total matches: {len(matches)}")
# Get events for a specific match
# Uses the first row of the matches frame, whichever match that is.
events = sb.events(match_id=matches.iloc[0]['match_id'])
print(f"Events in match: {len(events)}")
# Filter for shots
shots = events[events['type'] == 'Shot']
print(f"Total shots: {len(shots)}")
competition_name season_name 0 FIFA World Cup 2022 1 La Liga 2020/2021 Total matches: 64 Events in match: 3247 Total shots: 28
Scrape FBref with worldfootballR
worldfootballR scrapes FBref, Transfermarkt, and other sources for comprehensive soccer data in R.
library(worldfootballR)
library(dplyr)
# Get Premier League player stats
# fb_big5_advanced_season_stats returns all five big European leagues at
# once; the Premier League is filtered out below.
pl_stats <- fb_big5_advanced_season_stats(
  season_end_year = 2024,
  stat_type = "standard",
  team_or_player = "player"
)
# Filter for Premier League
epl <- pl_stats %>%
  filter(Comp == "Premier League") %>%
  select(Player, Squad, MP, Gls, Ast, xG, xAG) %>%
  arrange(desc(xG))
head(epl, 10)
# Get team-level shooting data
# gender and season_end_year have no defaults in fb_season_team_stats, so
# they are passed explicitly (same season as the player query above).
team_shooting <- fb_season_team_stats(
  country = "ENG",
  gender = "M",
  season_end_year = 2024,
  tier = "1st",
  stat_type = "shooting"
)
# A tibble: 10 x 7 Player Squad MP Gls Ast xG xAG 1 Erling Haaland Man City 31 27 5 26.2 3.1 2 Cole Palmer Chelsea 33 22 11 17.8 9.4
Fetch NHL Data from the NHL API
Access NHL data through the official NHL API. Data includes player stats, game logs, and play-by-play.
import requests
import pandas as pd
# NHL API endpoint
base_url = "https://api-web.nhle.com/v1"
# Get team roster
team_abbr = "TOR"  # Toronto Maple Leafs
# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() surfaces HTTP errors instead of failing later while
# parsing an error page.
response = requests.get(f"{base_url}/roster/{team_abbr}/current", timeout=30)
response.raise_for_status()
roster = response.json()
# Get player stats
stats_url = "https://api.nhle.com/stats/rest/en/skater/summary"
params = {"cayenneExp": "seasonId=20232024", "limit": 100}
response = requests.get(stats_url, params=params, timeout=30)
response.raise_for_status()
stats = pd.DataFrame(response.json()['data'])
# Top scorers
# Only the rows returned by the request above (at most `limit`) are ranked.
top_scorers = stats.nlargest(10, 'points')[[
    'skaterFullName', 'teamAbbrevs', 'goals', 'assists', 'points'
]]
print(top_scorers)
skaterFullName teamAbbrevs goals assists points 0 Nikita Kucherov TBL 44 100 144 1 Nathan MacKinnon COL 51 89 140
Load NHL Data with hockeyR
hockeyR provides easy access to NHL play-by-play data with pre-calculated advanced stats like Corsi and xG.
library(hockeyR)
library(dplyr)
# Load play-by-play data
pbp <- load_pbp(2024)
# Get shot attempts (Corsi)
# Corsi counts every attempt: on goal, missed, blocked and goals.
shots <- pbp %>%
  filter(event_type %in% c("SHOT", "MISSED_SHOT", "BLOCKED_SHOT", "GOAL"))
# Calculate team Corsi
# Attempts are credited to the team recorded in event_team_abbr.
team_corsi <- shots %>%
  group_by(event_team_abbr) %>%
  summarize(
    CF = n(),
    goals = sum(event_type == "GOAL")
  ) %>%
  arrange(desc(CF))
print(team_corsi)
# Get player-level stats
player_stats <- get_skater_stats_hr(2024)
# A tibble: 32 x 3 event_team_abbr CF goals 1 COL 5234 285 2 FLA 5102 264
Scrape PGA Tour Stats
Access PGA Tour statistics through their public JSON API for strokes gained and other metrics.
import requests
import pandas as pd
# PGA Tour Stats API
base_url = "https://statdata.pgatour.com/r"

# Get Strokes Gained data
sg_url = f"{base_url}/stat/02675.json"  # Strokes Gained: Total
response = requests.get(sg_url, timeout=30)  # bound the wait on a stalled connection

if response.status_code == 200:
    data = response.json()
    players = pd.DataFrame(data['tours'][0]['years'][0]['stats'])

    # Clean and sort: statValue must be numeric before ranking
    players['statValue'] = pd.to_numeric(players['statValue'])
    top_sg = players.nlargest(10, 'statValue')[['playerName', 'statValue']]
    print("Top 10 - Strokes Gained: Total")
    print(top_sg)
else:
    # Report the failure instead of silently printing nothing
    print(f"Request failed with status {response.status_code}: {sg_url}")
Top 10 - Strokes Gained: Total
playerName statValue
0 Scottie Scheffler 2.82
1 Xander Schauffele 1.95
Load Tennis Match Data
Jeff Sackmann maintains comprehensive ATP and WTA match data on GitHub, free for analysis.
import pandas as pd
# Jeff Sackmann's tennis data (GitHub)
base_url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master"

# Read the 2023 ATP match file
atp_2023 = pd.read_csv(f"{base_url}/atp_matches_2023.csv")

# Rows whose tourney_level is 'G' (Grand Slams)
is_grand_slam = atp_2023['tourney_level'] == 'G'
grand_slams = atp_2023[is_grand_slam]

# Ten players with the most match wins
wins_per_player = atp_2023.groupby('winner_name').size()
top_winners = wins_per_player.nlargest(10)
print("Most wins in 2023:")
print(top_winners)

# Winner serve columns, keeping only matches where every one is present
serve_columns = ['winner_name', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_ace']
serve_stats = atp_2023[serve_columns].dropna()
print(f"\nMatches with serve data: {len(serve_stats)}")
Most wins in 2023: Novak Djokovic 73 Jannik Sinner 64 Carlos Alcaraz 61
Scrape UFC Fighter Stats
Scrape UFC fighter statistics from UFCStats.com for comprehensive MMA analytics.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# UFC Stats page
url = "http://ufcstats.com/statistics/fighters"
params = {"char": "a", "page": "all"}
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()  # do not try to parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')

# Parse fighter table
table = soup.find('table', class_='b-statistics__table')
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    raise RuntimeError("Fighter statistics table not found; the page layout may have changed")
rows = table.find_all('tr')[2:]  # Skip headers

fighters = []
for row in rows:
    cols = row.find_all('td')
    # The highest cell index read below is 6, so skip rows with fewer than 7 cells
    if len(cols) >= 7:
        fighters.append({
            'name': cols[0].text.strip() + " " + cols[1].text.strip(),
            'height': cols[3].text.strip(),
            'weight': cols[4].text.strip(),
            'reach': cols[5].text.strip(),
            'stance': cols[6].text.strip()
        })

df = pd.DataFrame(fighters)
print(f"Total fighters: {len(df)}")
Total fighters: 234
Load NCAA Volleyball Data
Calculate volleyball statistics like hitting efficiency and kill percentage from box score data.
import pandas as pd
import requests
# NCAA stats endpoint example
# Note: NCAA data often requires web scraping or specific APIs
# Example: Load volleyball box score data
# This would typically come from a CSV or database

# Sample box-score rows, one list entry per player
data = {
    'player': ['Player A', 'Player B', 'Player C'],
    'kills': [15, 12, 8],
    'errors': [3, 2, 4],
    'attempts': [35, 28, 22],
    'digs': [8, 5, 12],
    'blocks': [2, 4, 1]
}

# Build the frame and append the two derived columns:
#   efficiency = (kills - errors) / attempts
#   kill_pct   = kills / attempts * 100
df = pd.DataFrame(data).assign(
    efficiency=lambda d: (d['kills'] - d['errors']) / d['attempts'],
    kill_pct=lambda d: d['kills'] / d['attempts'] * 100,
)

report_columns = ['player', 'kills', 'efficiency', 'kill_pct']
print(df[report_columns].round(3))
player kills efficiency kill_pct 0 Player A 15 0.343 42.857 1 Player B 12 0.357 42.857 2 Player C 8 0.182 36.364
Generic Sports API Request
A reusable function for fetching data from various sports APIs with error handling.
import requests
import pandas as pd
def fetch_sports_api(url, params=None, headers=None, timeout=30):
    """
    Fetch JSON from a sports API and return it as a DataFrame.

    Args:
        url: API endpoint URL
        params: Query parameters
        headers: Request headers (for authentication)
        timeout: Seconds to wait for the server before giving up

    Returns:
        DataFrame with the response data. A JSON list becomes one row per
        element; a JSON object is unwrapped from its 'data' or 'results' key
        when present, otherwise it becomes a single row. An empty DataFrame
        is returned when the request fails or the body is not valid JSON.
    """
    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers invalid-JSON bodies on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"API Error: {e}")
        return pd.DataFrame()

    # Handle common response structures
    if isinstance(data, list):
        return pd.DataFrame(data)
    if isinstance(data, dict):
        # Only dicts support the key lookup; a bare number or string would
        # otherwise raise on the membership test.
        for key in ('data', 'results'):
            if key in data:
                return pd.DataFrame(data[key])
    # Plain object or scalar: wrap it so it forms a single row
    return pd.DataFrame([data])
# Example usage
df = fetch_sports_api("https://api.example.com/stats")
print(df.head())
id name value 0 1 Stat A 100 1 2 Stat B 200
Fetch MLB Player Stats with pybaseball
Use pybaseball to fetch MLB batting and pitching statistics. The library pulls from FanGraphs, Baseball Reference, and Statcast.
from pybaseball import batting_stats, pitching_stats
# Season batting table
batting_2023 = batting_stats(2023)
batter_count = len(batting_2023)
print(f"Retrieved {batter_count} batters")

# Season pitching table
pitching_2023 = pitching_stats(2023)
pitcher_count = len(pitching_2023)
print(f"Retrieved {pitcher_count} pitchers")

# Keep batters with at least 502 plate appearances (qualified)
has_enough_pa = batting_2023['PA'] >= 502
qualified = batting_2023[has_enough_pa]
print(f"Qualified batters: {len(qualified)}")
Retrieved 789 batters Retrieved 634 pitchers Qualified batters: 143
Calculate OBP
Python function to calculate On-Base Percentage with error handling
def calculate_obp(hits, walks, hbp, at_bats, sacrifice_flies):
    """Calculate On-Base Percentage, rounded to three decimals.

    OBP = (H + BB + HBP) / (AB + BB + HBP + SF)

    Returns 0.0 when the denominator is zero (no qualifying plate
    appearances), so the result is always a float.
    """
    numerator = hits + walks + hbp
    denominator = at_bats + walks + hbp + sacrifice_flies
    if denominator == 0:
        return 0.0
    return round(numerator / denominator, 3)


# Example
obp = calculate_obp(hits=145, walks=72, hbp=8, at_bats=502, sacrifice_flies=4)
print(f"OBP: {obp}")  # Output: OBP: 0.384  (225 / 586)
Calculate True Shooting %
Calculate True Shooting Percentage - measures overall shooting efficiency
def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, rounded to one decimal.

    TS% = 100 * PTS / (2 * (FGA + 0.44 * FTA)).
    Returns 0 when the denominator is zero.
    """
    true_shooting_attempts = 2 * (fga + 0.44 * fta)
    if true_shooting_attempts != 0:
        return round((points / true_shooting_attempts) * 100, 1)
    return 0


# Example: Player with 25 PPG, 18 FGA, 8 FTA
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%
Calculate True Shooting %
Calculate True Shooting Percentage - measures overall shooting efficiency
def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, rounded to one decimal.

    TS% = 100 * PTS / (2 * (FGA + 0.44 * FTA)).
    Returns 0 when the denominator is zero.
    """
    true_shooting_attempts = 2 * (fga + 0.44 * fta)
    if true_shooting_attempts != 0:
        return round((points / true_shooting_attempts) * 100, 1)
    return 0


# Example: Player with 25 PPG, 18 FGA, 8 FTA
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%
Calculate True Shooting %
Calculate True Shooting Percentage - measures overall shooting efficiency
def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, rounded to one decimal.

    TS% = 100 * PTS / (2 * (FGA + 0.44 * FTA)).
    Returns 0 when the denominator is zero.
    """
    true_shooting_attempts = 2 * (fga + 0.44 * fta)
    if true_shooting_attempts != 0:
        return round((points / true_shooting_attempts) * 100, 1)
    return 0


# Example: Player with 25 PPG, 18 FGA, 8 FTA
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%
Calculate True Shooting %
Calculate True Shooting Percentage - measures overall shooting efficiency
def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, rounded to one decimal.

    TS% = 100 * PTS / (2 * (FGA + 0.44 * FTA)).
    Returns 0 when the denominator is zero.
    """
    true_shooting_attempts = 2 * (fga + 0.44 * fta)
    if true_shooting_attempts != 0:
        return round((points / true_shooting_attempts) * 100, 1)
    return 0


# Example: Player with 25 PPG, 18 FGA, 8 FTA
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%
Calculate True Shooting %
Calculate True Shooting Percentage - measures overall shooting efficiency
def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, rounded to one decimal.

    TS% = 100 * PTS / (2 * (FGA + 0.44 * FTA)).
    Returns 0 when the denominator is zero.
    """
    true_shooting_attempts = 2 * (fga + 0.44 * fta)
    if true_shooting_attempts != 0:
        return round((points / true_shooting_attempts) * 100, 1)
    return 0


# Example: Player with 25 PPG, 18 FGA, 8 FTA
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%
EPA Calculation
Calculate Expected Points Added (EPA) for NFL plays
def calculate_epa(ep_before, ep_after, touchdown=False, turnover=False):
    """Return Expected Points Added for a play, rounded to two decimals.

    EPA = EP_after - EP_before. A touchdown replaces EP_after with 7.0
    (PAT assumed made); otherwise a turnover negates EP_after (flipped for
    the opponent). The touchdown flag takes precedence when both are set.
    """
    if touchdown:
        resulting_ep = 7.0
    else:
        resulting_ep = -ep_after if turnover else ep_after
    return round(resulting_ep - ep_before, 2)


# Example: 2nd & 8 from own 35, gain 12 yards for 1st down
epa = calculate_epa(ep_before=1.2, ep_after=2.8)
print(f"EPA: +{epa}")  # Positive EPA = good play
Calculate True Shooting %
Calculate True Shooting Percentage - measures overall shooting efficiency
def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, rounded to one decimal.

    TS% = 100 * PTS / (2 * (FGA + 0.44 * FTA)).
    Returns 0 when the denominator is zero.
    """
    true_shooting_attempts = 2 * (fga + 0.44 * fta)
    if true_shooting_attempts != 0:
        return round((points / true_shooting_attempts) * 100, 1)
    return 0


# Example: Player with 25 PPG, 18 FGA, 8 FTA
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%
Simple xG Model
Basic xG model using logistic regression with distance and angle features
import numpy as np
from sklearn.linear_model import LogisticRegression
def build_xg_model(shots_df):
    """Fit a logistic-regression Expected Goals model on distance and angle.

    Adds "distance" and "angle" columns to ``shots_df`` in place, then fits
    on those two columns against the "goal" column and returns the fitted
    model. Only distance and angle are used as features.
    """
    # Offsets from the point (100, 50) that distance is measured to
    dx = shots_df["x"] - 100
    dy = shots_df["y"] - 50
    shots_df["distance"] = np.sqrt(dx**2 + dy**2)

    # Twice the arctangent of (7.32 / 2) over the distance
    half_width = 7.32 / 2
    shots_df["angle"] = np.arctan2(half_width, shots_df["distance"]) * 2

    features = shots_df[["distance", "angle"]]
    target = shots_df["goal"]
    model = LogisticRegression()
    model.fit(features, target)
    return model
Calculate True Shooting %
Calculate True Shooting Percentage - measures overall shooting efficiency
def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, rounded to one decimal.

    TS% = 100 * PTS / (2 * (FGA + 0.44 * FTA)).
    Returns 0 when the denominator is zero.
    """
    true_shooting_attempts = 2 * (fga + 0.44 * fta)
    if true_shooting_attempts != 0:
        return round((points / true_shooting_attempts) * 100, 1)
    return 0


# Example: Player with 25 PPG, 18 FGA, 8 FTA
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%
Calculate Corsi
Calculate Corsi For Percentage (CF%) - a hockey possession metric
def calculate_corsi(shots_for, shots_against, goals_for, goals_against,
                    missed_for, missed_against, blocked_for, blocked_against):
    """Return Corsi For Percentage (CF%), rounded to one decimal.

    Each side's total is the sum of its shots, goals, missed and blocked
    counts. CF% = CF / (CF + CA) * 100. Returns 50.0 when both totals
    add up to zero.
    """
    attempts_for = shots_for + goals_for + missed_for + blocked_for
    attempts_against = shots_against + goals_against + missed_against + blocked_against
    total_attempts = attempts_for + attempts_against
    if total_attempts == 0:
        return 50.0
    return round((attempts_for / total_attempts) * 100, 1)


# Example: Team with 60 CF, 45 CA
corsi = calculate_corsi(30, 25, 3, 2, 15, 10, 12, 8)
print(f"CF%: {corsi}%")  # Should be above 50% (good)
Simple xG Model
Basic xG model using logistic regression with distance and angle features
import numpy as np
from sklearn.linear_model import LogisticRegression
def build_xg_model(shots_df):
    """Fit a logistic-regression Expected Goals model on distance and angle.

    Adds "distance" and "angle" columns to ``shots_df`` in place, then fits
    on those two columns against the "goal" column and returns the fitted
    model. Only distance and angle are used as features.
    """
    # Offsets from the point (100, 50) that distance is measured to
    dx = shots_df["x"] - 100
    dy = shots_df["y"] - 50
    shots_df["distance"] = np.sqrt(dx**2 + dy**2)

    # Twice the arctangent of (7.32 / 2) over the distance
    half_width = 7.32 / 2
    shots_df["angle"] = np.arctan2(half_width, shots_df["distance"]) * 2

    features = shots_df[["distance", "angle"]]
    target = shots_df["goal"]
    model = LogisticRegression()
    model.fit(features, target)
    return model
Loading Baseball Data with pybaseball
This code uses the pybaseball library to fetch Statcast data from Baseball Savant.
from pybaseball import statcast
import pandas as pd
# Query Statcast for the April 2023 date range
date_range = {'start_dt': '2023-04-01', 'end_dt': '2023-04-30'}
data = statcast(**date_range)

# Report the row count and preview the first rows
pitch_count = len(data)
print(f'Total pitches: {pitch_count}')
print(data.head())
Basic NBA API Query
Using nba_api to fetch player career statistics from NBA.com.
from nba_api.stats.endpoints import playercareerstats
# Request career stats for one player and take the first returned frame
career = playercareerstats.PlayerCareerStats(player_id='201566')
career_frames = career.get_data_frames()
df = career_frames[0]

# Show the first ten rows of the season/points/rebounds/assists columns
summary_columns = ['SEASON_ID', 'PTS', 'REB', 'AST']
print(df[summary_columns].head(10))
Loading NFL Data with nfl_data_py
Loading NFL play-by-play data and calculating EPA metrics.
import nfl_data_py as nfl
# Load 2023 play-by-play data
pbp = nfl.import_pbp_data([2023])

# Keep rows whose play_type is 'pass'
is_pass = pbp['play_type'] == 'pass'
pass_plays = pbp[is_pass]

# Mean EPA per possessing team, best first
mean_epa = pass_plays.groupby('posteam')['epa'].mean()
epa_by_team = mean_epa.sort_values(ascending=False)
print(epa_by_team.head(10))
Calculate True Shooting Percentage in R
R code to calculate and visualize True Shooting Percentage for NBA players
# Function to calculate True Shooting Percentage (0-100 scale).
# points, fga, fta: numeric values or equal-length numeric vectors.
# Returns 0 wherever the denominator is 0 (no shooting attempts) instead of
# NaN/Inf, so downstream sorting and plotting keep working.
calculate_ts_percentage <- function(points, fga, fta) {
  # TS% Formula: Points / (2 * (FGA + 0.44 * FTA)), scaled to a percentage
  tsa <- 2 * (fga + 0.44 * fta)
  ts_percentage <- ifelse(tsa == 0, 0, (points / tsa) * 100)
  return(ts_percentage)
}
# Example NBA player stats
players <- data.frame(
  player = c("Player A", "Player B", "Player C"),
  points = c(1832, 2140, 1654),
  fga = c(1420, 1680, 1510),
  fta = c(425, 380, 290)
)

# TS% for every row
players$ts_percent <- calculate_ts_percentage(
  players$points, players$fga, players$fta
)

# Show players ranked by TS%, rounded to one decimal for display only
library(dplyr)
print(
  mutate(
    arrange(players, desc(ts_percent)),
    ts_percent = round(ts_percent, 1)
  )
)

# Flag players above the league-average reference value
league_avg_ts <- 56.5
players$above_average <- players$ts_percent > league_avg_ts

# Bar chart of TS% per player with a dashed line at the league average
library(ggplot2)
ggplot(players, aes(x = player, y = ts_percent, fill = above_average)) +
  geom_col() +
  geom_hline(yintercept = league_avg_ts, linetype = "dashed", color = "red") +
  labs(
    title = "True Shooting Percentage Comparison",
    y = "TS%",
    x = ""
  ) +
  theme_minimal()
Loading Baseball Data with baseballr
R code using the baseballr package to access Statcast data.
library(baseballr)
library(dplyr)
# Query Statcast for the April 2023 date range
data <- statcast_search(start_date = '2023-04-01', end_date = '2023-04-30')

# Count the rows returned
data %>%
  summarise(total_pitches = n())
Quick Reference
Common patterns and formulas for sports analytics
Common Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
Common Metrics
# Basketball: True Shooting %
# As written this yields a fraction; multiply by 100 for a percentage.
TS_pct = points / (2 * (FGA + 0.44 * FTA))
# Baseball: OPS (on-base percentage plus slugging percentage)
OPS = on_base_pct + slugging_pct
# Soccer: Expected Goals (simplified)
# Illustrative product of two inputs, not a fitted model.
xG = shot_quality * shot_location_weight