Code Library

Ready-to-use code snippets for sports analytics projects

302

Total Snippets

Languages

python

Stolen Base Break-Even

Calculate stolen base break-even point.

import pandas as pd

def sb_breakeven(run_environment=None):
    """Return the stolen-base success rate at which attempts break even.

    The rate ``p`` solves ``sb_value * p + cs_value * (1 - p) = 0`` for the
    fixed run values below, i.e. ``p = -cs_value / (sb_value - cs_value)``.

    Args:
        run_environment: Accepted but currently ignored; the built-in run
            values are always used.

    Returns:
        The break-even success rate as a fraction.
    """
    gain_per_steal = 0.175   # run value credited for a successful steal
    loss_per_caught = -0.40  # run value charged for a caught stealing

    swing = gain_per_steal - loss_per_caught
    return -loss_per_caught / swing

def evaluate_sb_attempts(player_stats):
    """Summarise a player's base-stealing record against the break-even rate.

    Args:
        player_stats: Mapping-like object providing ``"sb"`` and ``"cs"``.

    Returns:
        dict with the raw counts, the observed success rate (0 when there
        were no attempts), the break-even rate from ``sb_breakeven()``,
        whether the player is above it, the net run value, and a text
        recommendation.
    """
    threshold = sb_breakeven()

    steals = player_stats["sb"]
    caught = player_stats["cs"]
    tries = steals + caught
    rate = steals / tries if tries > 0 else 0

    # A 0.05 band either side of break-even maps to "Maintain current rate".
    if rate > threshold + 0.05:
        advice = "Attempt more"
    elif rate < threshold - 0.05:
        advice = "Reduce attempts"
    else:
        advice = "Maintain current rate"

    return {
        "sb": steals,
        "cs": caught,
        "success_rate": rate,
        "breakeven": threshold,
        "above_breakeven": rate > threshold,
        "net_run_value": steals * 0.175 + caught * -0.40,
        "recommendation": advice,
    }

python

Trade Value Calculator

Calculate player trade values.

import pandas as pd
import numpy as np

def calculate_surplus_value(player_df, dollars_per_war=8_000_000):
    """Attach WAR-based dollar value and surplus columns to a player table.

    Args:
        player_df: DataFrame with ``player_id``, ``name``, ``age``,
            ``projected_war``, ``salary`` and ``contract_years_remaining``.
        dollars_per_war: Dollar value assigned to one unit of WAR.

    Returns:
        A new DataFrame (the input is not modified) with the identifying
        columns plus ``war_value``, ``surplus_value`` and ``total_surplus``.
    """
    report_columns = [
        "player_id", "name", "age", "projected_war",
        "salary", "war_value", "surplus_value", "total_surplus",
    ]

    valued = player_df.assign(
        # Dollar value of the projected WAR.
        war_value=lambda d: d["projected_war"] * dollars_per_war,
        # Value produced beyond what the player is paid.
        surplus_value=lambda d: d["war_value"] - d["salary"],
        # Surplus multiplied by the number of contract years remaining.
        total_surplus=lambda d: d["surplus_value"] * d["contract_years_remaining"],
    )
    return valued[report_columns]

def trade_analysis(team1_players, team2_players, dollars_per_war=8_000_000):
    """Compare the summed total surplus of two groups of players.

    Args:
        team1_players: Player table accepted by ``calculate_surplus_value``.
        team2_players: Player table accepted by ``calculate_surplus_value``.
        dollars_per_war: Dollar value assigned to one unit of WAR.

    Returns:
        dict with each side's summed ``total_surplus``, their difference
        (team 1 minus team 2) and a ``winner`` label. Equal totals are
        labelled ``"Team 2"``.
    """
    def package_value(players):
        valued = calculate_surplus_value(players, dollars_per_war)
        return valued["total_surplus"].sum()

    team1_value = package_value(team1_players)
    team2_value = package_value(team2_players)

    if team1_value > team2_value:
        winner = "Team 1"
    else:
        winner = "Team 2"

    return {
        "team1_value": team1_value,
        "team2_value": team2_value,
        "difference": team1_value - team2_value,
        "winner": winner,
    }

python

Player WAR Projection

Project future WAR from current stats.

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

def project_war(historical_df, current_season_df, years_forward=1):
    """Fit a gradient-boosting model on past seasons and project WAR.

    Args:
        historical_df: Training table holding the feature columns and a
            ``war_y{years_forward}`` target; rows with a missing target are
            dropped before fitting.
        current_season_df: Table of players to project; must hold the same
            feature columns plus ``player_id``, ``name`` and ``war``.
        years_forward: Selects which ``war_y{N}`` column is the target.

    Returns:
        DataFrame with ``player_id``, ``name``, ``age``, ``war``,
        ``projected_war`` (raw model output) and ``projected_war_adj``
        (scaled down by 5% per year of age above 32).
    """
    feature_cols = [
        "age", "pa", "war_ly", "war_2y_avg", "war_3y_avg",
        "obp", "slg", "k_rate", "bb_rate",
    ]
    target_col = f"war_y{years_forward}"

    # Only rows whose future WAR is known can be used for training.
    labelled = historical_df[historical_df[target_col].notna()].copy()

    regressor = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
    regressor.fit(labelled[feature_cols], labelled[target_col])

    projected = current_season_df.copy()
    projected["projected_war"] = regressor.predict(projected[feature_cols])

    def _aged(player):
        # Players older than 32 lose 5% of the projection per extra year.
        base = player["projected_war"]
        if player["age"] > 32:
            return base * (1 - 0.05 * (player["age"] - 32))
        return base

    projected["projected_war_adj"] = projected.apply(_aged, axis=1)

    output_cols = ["player_id", "name", "age", "war", "projected_war", "projected_war_adj"]
    return projected[output_cols]

python

Defensive Runs Saved Estimator

Estimate defensive runs saved from basic stats.

import pandas as pd
import numpy as np

def estimate_drs(fielding_stats, position):
    """Estimate defensive runs saved from a weighted stat comparison.

    For every stat weighted at ``position`` and present in
    ``fielding_stats``, the first row's value is compared with the column
    mean and the difference is multiplied by the stat's weight.

    Args:
        fielding_stats: Table supporting ``in``, column lookup, ``.mean()``
            and ``.iloc`` (presumably a pandas DataFrame whose first row is
            the player of interest - confirm with callers).
        position: Position code; unknown codes fall back to an errors-only
            weight of -0.5.

    Returns:
        The summed weighted differences (0 when no weighted stat is present).
    """
    weights_by_position = {
        "C": {"passed_balls": -0.25, "cs_pct": 0.15},
        "1B": {"errors": -0.5, "assists": 0.1},
        "2B": {"errors": -0.6, "range_factor": 0.3},
        "SS": {"errors": -0.6, "range_factor": 0.35},
        "3B": {"errors": -0.55, "range_factor": 0.25},
        "LF": {"errors": -0.45, "outfield_assists": 0.2},
        "CF": {"errors": -0.5, "outfield_assists": 0.25},
        "RF": {"errors": -0.45, "outfield_assists": 0.3},
    }
    applicable = weights_by_position.get(position, {"errors": -0.5})

    return sum(
        (fielding_stats[stat].iloc[0] - fielding_stats[stat].mean()) * weight
        for stat, weight in applicable.items()
        if stat in fielding_stats
    )

def calculate_uzr_components(zone_data, position):
    """Compute a simplified set of UZR components from per-play zone data.

    Only ``range_runs`` is actually calculated; ``error_runs`` and
    ``arm_runs`` are always returned as 0, whatever ``position`` is.

    Args:
        zone_data: Table with ``zone``, ``league_make_pct`` and
            ``plays_made`` columns, one row per opportunity.
        position: Position code (does not affect the result).

    Returns:
        dict with ``range_runs``, ``error_runs`` and ``arm_runs``.
    """
    range_total = 0

    for zone_id in zone_data["zone"].unique():
        in_zone = zone_data[zone_data["zone"] == zone_id]
        # The league rate is read from the zone's first row.
        baseline = in_zone["league_make_pct"].iloc[0]
        chances = len(in_zone)
        made_rate = in_zone["plays_made"].sum() / chances

        # Plays made above the league rate, scaled by 0.8 per play.
        range_total += (made_rate - baseline) * chances * 0.8

    return {
        "range_runs": range_total,
        "error_runs": 0,
        "arm_runs": 0,
    }

python

FIP Calculator

Calculate Fielding Independent Pitching.

import pandas as pd

def calculate_fip(pitcher_stats, fip_constant=3.10):
    """Compute Fielding Independent Pitching.

    ``FIP = (13*HR + 3*(BB + HBP) - 2*K) / IP + fip_constant``

    Args:
        pitcher_stats: Mapping or table with ``hr``, ``bb``, ``hbp``, ``so``
            and ``ip``; ``ip`` is used directly as the divisor.
        fip_constant: Additive constant applied after the division.

    Returns:
        The FIP value (element-wise if the inputs are columns).
    """
    free_passes = pitcher_stats["bb"] + pitcher_stats["hbp"]
    weighted_events = (
        13 * pitcher_stats["hr"] + 3 * free_passes - 2 * pitcher_stats["so"]
    )
    return weighted_events / pitcher_stats["ip"] + fip_constant

def calculate_xfip(pitcher_stats, league_hr_fb_rate=0.10, fip_constant=3.10):
    """Compute expected FIP (xFIP).

    Same formula as FIP, except that home runs are replaced by
    ``fb * league_hr_fb_rate``.

    Args:
        pitcher_stats: Mapping or table with ``fb``, ``bb``, ``hbp``, ``so``
            and ``ip``.
        league_hr_fb_rate: Home-run-per-fly-ball rate applied to ``fb``.
        fip_constant: Additive constant applied after the division.

    Returns:
        The xFIP value (element-wise if the inputs are columns).
    """
    # Home runs the pitcher would have allowed at the league HR/FB rate.
    expected_hr = pitcher_stats["fb"] * league_hr_fb_rate

    free_passes = pitcher_stats["bb"] + pitcher_stats["hbp"]
    weighted_events = 13 * expected_hr + 3 * free_passes - 2 * pitcher_stats["so"]
    return weighted_events / pitcher_stats["ip"] + fip_constant

def calculate_siera(pitcher_stats):
    """Compute a simplified, linear SIERA estimate.

    Args:
        pitcher_stats: Mapping or table with ``so``, ``bb``, ``bf``, ``gb``
            and ``fb``.

    Returns:
        ``6.145 - 16.986*K% + 11.434*BB% - 1.858*GB%``, where K% and BB% are
        per batter faced and GB% is ``gb / (gb + fb)``.
    """
    batters_faced = pitcher_stats["bf"]
    strikeout_rate = pitcher_stats["so"] / batters_faced
    walk_rate = pitcher_stats["bb"] / batters_faced

    ground_balls = pitcher_stats["gb"]
    ground_ball_rate = ground_balls / (ground_balls + pitcher_stats["fb"])

    return 6.145 - 16.986 * strikeout_rate + 11.434 * walk_rate - 1.858 * ground_ball_rate

python

Weighted OBA Calculator

Calculate weighted on-base average (wOBA).

import pandas as pd

# wOBA linear weights per event (2023 values - update annually)
WOBA_WEIGHTS = {
    "uBB": 0.690,
    "HBP": 0.722,
    "1B": 0.888,
    "2B": 1.271,
    "3B": 1.616,
    "HR": 2.101
}

def calculate_woba(player_stats):
    """Compute weighted on-base average from counting stats.

    Args:
        player_stats: Mapping or table with ``ubb``, ``hbp``, ``singles``,
            ``doubles``, ``triples``, ``hr``, ``ab`` and ``sf``.

    Returns:
        The ``WOBA_WEIGHTS``-weighted event total divided by
        ``ab + ubb + sf + hbp``.
    """
    # Pairs each WOBA_WEIGHTS key with the player_stats field it multiplies.
    event_fields = (
        ("uBB", "ubb"),
        ("HBP", "hbp"),
        ("1B", "singles"),
        ("2B", "doubles"),
        ("3B", "triples"),
        ("HR", "hr"),
    )
    weighted_events = sum(
        WOBA_WEIGHTS[event] * player_stats[field] for event, field in event_fields
    )
    opportunities = sum(player_stats[field] for field in ("ab", "ubb", "sf", "hbp"))

    return weighted_events / opportunities

def woba_to_wrc_plus(woba, league_woba=0.320, park_factor=1.0, league_runs_pa=0.12):
    """Convert a wOBA value to a wRC+-style index.

    Uses a fixed wOBA scale of 1.25 and a 600-PA basis.

    Args:
        woba: Player wOBA.
        league_woba: League-average wOBA.
        park_factor: Divisor applied to the player's runs created.
        league_runs_pa: League runs per plate appearance.

    Returns:
        The index value; a league-average wOBA with ``park_factor=1.0``
        gives 100.
    """
    pa_basis = 600
    woba_scale = 1.25
    league_runs = pa_basis * league_runs_pa

    runs_above_avg = ((woba - league_woba) / woba_scale) * pa_basis
    runs_created = runs_above_avg + league_runs
    return (runs_created / park_factor) / league_runs * 100

python

Streakiness Analysis

Analyze player performance streakiness.

import pandas as pd
import numpy as np
from scipy import stats

def calculate_streakiness(game_log_df, player_id, stat="hits"):
    """Measure how streaky a player's game-by-game results are.

    Combines a runs test (runs of consecutive games above / not above the
    player's own mean) with the lag-1 autocorrelation of the raw values.

    Args:
        game_log_df: Game log with ``player_id``, ``game_date`` and the
            ``stat`` column.
        player_id: Player whose games are analysed.
        stat: Column to analyse.

    Returns:
        ``None`` when the player has fewer than 20 games; otherwise a dict
        with the game count, observed and expected run counts, the runs
        z-score, the autocorrelation and an ``is_streaky`` flag (z-score
        below -2 or autocorrelation above 0.2).
    """
    log = game_log_df[game_log_df["player_id"] == player_id].sort_values("game_date")
    series = log[stat].values
    n_games = len(series)

    if n_games < 20:
        return None

    # Flag each game as above the player's mean or not.
    hot = series > series.mean()

    # A new run starts wherever consecutive flags differ.
    observed = 1 + int(np.count_nonzero(hot[1:] != hot[:-1]))

    # Expected run count and its spread if games were independent.
    n_hot = hot.sum()
    n_cold = n_games - n_hot
    expected = (2 * n_hot * n_cold) / (n_hot + n_cold) + 1
    spread = np.sqrt(
        (2 * n_hot * n_cold * (2 * n_hot * n_cold - n_hot - n_cold))
        / ((n_hot + n_cold)**2 * (n_hot + n_cold - 1))
    )

    # Fewer runs than expected (negative z) means longer streaks.
    z_score = (observed - expected) / spread

    # Correlation between each game and the one that follows it.
    lag_one = np.corrcoef(series[:-1], series[1:])[0, 1]

    return {
        "player_id": player_id,
        "games": n_games,
        "observed_runs": observed,
        "expected_runs": expected,
        "runs_z_score": z_score,
        "autocorrelation": lag_one,
        "is_streaky": z_score < -2 or lag_one > 0.2,
    }

python

Pythagorean Wins Calculator

Calculate expected wins using Pythagorean expectation.

import pandas as pd
import numpy as np

def pythagorean_wins(runs_scored, runs_allowed, games, exponent=None, sport="baseball"):
    """Compute Pythagorean expected wins and winning percentage.

    Args:
        runs_scored: Points/runs scored.
        runs_allowed: Points/runs allowed.
        games: Number of games the percentage is applied to.
        exponent: Pythagorean exponent; when ``None`` a sport-specific
            default is used.
        sport: Key into the default-exponent table; unknown sports use 2.0.

    Returns:
        Tuple ``(expected_wins, expected_pct)``.
    """
    if exponent is None:
        sport_exponents = {
            "baseball": 1.83,
            "basketball": 13.91,
            "football": 2.37,
            "hockey": 2.0,
        }
        exponent = sport_exponents.get(sport, 2.0)

    scored_pow = runs_scored ** exponent
    allowed_pow = runs_allowed ** exponent
    expected_pct = scored_pow / (scored_pow + allowed_pow)

    return expected_pct * games, expected_pct

def pythagenpat(runs_scored, runs_allowed, games):
    """Compute Pythagorean wins with a run-environment-dependent exponent.

    The exponent is ``((runs_scored + runs_allowed) / games) ** 0.287``.

    Args:
        runs_scored: Runs scored.
        runs_allowed: Runs allowed.
        games: Number of games played.

    Returns:
        Whatever ``pythagorean_wins`` returns for the derived exponent:
        ``(expected_wins, expected_pct)``.
    """
    runs_per_game = (runs_scored + runs_allowed) / games
    return pythagorean_wins(
        runs_scored, runs_allowed, games, exponent=runs_per_game ** 0.287
    )

def calculate_luck(actual_wins, expected_wins):
    """Return how many more wins were recorded than expected.

    Args:
        actual_wins: Wins actually recorded.
        expected_wins: Wins predicted by an expectation formula.

    Returns:
        ``actual_wins - expected_wins`` (positive means more than expected).
    """
    surplus_wins = actual_wins - expected_wins
    return surplus_wins

python

Regression to Mean Calculator

Calculate regressed statistics.

import pandas as pd
import numpy as np

def regress_to_mean(observed, sample_size, league_avg, reliability_denominator):
    """
    Blend an observed rate with the league average by sample size.

    The observed value gets weight
    ``sample_size / (sample_size + reliability_denominator)`` and the league
    average gets the remainder.

    reliability_denominator: sample size at which the two are weighted 50/50
    """
    weight_on_observed = sample_size / (sample_size + reliability_denominator)
    weight_on_league = 1 - weight_on_observed
    return weight_on_observed * observed + weight_on_league * league_avg

def regress_batting_stats(player_df, league_df):
    """Add regressed-to-the-mean versions of batting rate stats.

    For each supported stat present in ``player_df``, a ``{stat}_regressed``
    column is added by calling ``regress_to_mean`` with the player's value,
    the player's ``pa`` and the league mean of that stat.

    Args:
        player_df: Player table; needs ``pa`` plus any of ``avg``, ``babip``,
            ``hr_rate``, ``k_rate``, ``bb_rate``.
        league_df: League table whose numeric column means are used as the
            league averages. Non-numeric columns (names, teams) are ignored.

    Returns:
        A copy of ``player_df`` with the extra ``*_regressed`` columns.

    Raises:
        KeyError: If a stat present in ``player_df`` has no numeric column
            in ``league_df``, or ``pa`` is missing when a stat is regressed.
    """
    # Sample size (PA) at which observed and league values are weighted 50/50.
    denominators = {
        "avg": 500,
        "babip": 800,
        "hr_rate": 300,
        "k_rate": 200,
        "bb_rate": 400
    }

    regressed = player_df.copy()

    # numeric_only avoids a TypeError when the league table carries text
    # columns such as player or team names.
    league_avgs = league_df.mean(numeric_only=True)

    for stat, denom in denominators.items():
        if stat in regressed.columns:
            # regress_to_mean is plain arithmetic, so whole columns can be
            # passed in; this also behaves sensibly for an empty table.
            regressed[f"{stat}_regressed"] = regress_to_mean(
                regressed[stat], regressed["pa"], league_avgs[stat], denom
            )

    return regressed

# Usage: `player_stats_df` and `league_stats_df` are not defined in this
# snippet; they must already exist (presumably as pandas DataFrames loaded by
# the caller) before these lines run.
regressed_stats = regress_batting_stats(player_stats_df, league_stats_df)
print(regressed_stats[["name", "avg", "avg_regressed", "pa"]].head())

python

Park Factors Adjustment

Adjust statistics for park effects.

import pandas as pd

def calculate_park_factors(games_df):
    """Derive run and home-run park factors from game-level totals.

    Each park's per-game rate is divided by the per-game rate over all
    games in ``games_df``.

    Args:
        games_df: One row per game with ``park_id``, ``home_runs``,
            ``away_runs``, ``home_hr``, ``away_hr`` and ``game_id``.

    Returns:
        DataFrame indexed by ``park_id`` with ``runs_pf``, ``hr_pf`` and
        ``games``.
    """
    per_park = games_df.groupby("park_id").agg({
        "home_runs": "sum",
        "away_runs": "sum",
        "home_hr": "sum",
        "away_hr": "sum",
        "game_id": "count",
    })

    games = per_park["game_id"]
    runs = per_park["home_runs"] + per_park["away_runs"]
    homers = per_park["home_hr"] + per_park["away_hr"]

    # Baseline rates across every park combined.
    league_rpg = runs.sum() / games.sum()
    league_hrpg = homers.sum() / games.sum()

    factors = pd.DataFrame(index=per_park.index)
    factors["runs_pf"] = (runs / games) / league_rpg
    factors["hr_pf"] = (homers / games) / league_hrpg
    factors["games"] = games
    return factors

def park_adjust_stats(player_stats, park_factors):
    """Scale counting stats by the park factors of each player's park.

    Args:
        player_stats: Table with ``park_id``, ``hr``, ``runs`` and ``rbi``.
        park_factors: Table joinable on ``park_id`` that provides ``hr_pf``
            and ``runs_pf``.

    Returns:
        The merged table with ``adj_hr``, ``adj_runs`` and ``adj_rbi``
        added. Players whose park has no match in ``park_factors`` are
        dropped by the default inner merge.
    """
    merged = player_stats.merge(park_factors, on="park_id")

    # Home runs use the HR factor; runs and RBI both use the run factor.
    return merged.assign(
        adj_hr=merged["hr"] / merged["hr_pf"],
        adj_runs=merged["runs"] / merged["runs_pf"],
        adj_rbi=merged["rbi"] / merged["runs_pf"],
    )

python

Batting Order Optimizer

Optimize batting order using run expectancy.

import numpy as np
from itertools import permutations
import random

def expected_runs_lineup(lineup, players_df, innings=9, simulations=1000):
    """Estimate average runs for a batting order by simulation.

    Each plate appearance draws one random number and classifies it as a
    walk, home run, other hit, or out from the batter's ``bb_rate``,
    ``hr_rate`` and ``avg``. Only home runs add a run (one each); walks and
    other hits neither score nor record an out. A game ends after
    ``innings * 3`` outs.

    Args:
        lineup: Sequence of row positions into ``players_df``; the first
            nine entries are cycled through.
        players_df: Table with ``bb_rate``, ``hr_rate`` and ``avg`` columns.
        innings: Innings per simulated game.
        simulations: Number of games to simulate.

    Returns:
        Mean runs per simulated game.
    """
    outs_per_game = innings * 3
    run_total = 0

    for _ in range(simulations):
        scored = 0
        outs_made = 0
        slot = 0

        while outs_made < outs_per_game:
            batter = players_df.iloc[lineup[slot % 9]]
            slot += 1

            draw = random.random()

            # Cumulative cut-offs: walk, then home run, then other hits.
            walk_cut = batter["bb_rate"]
            if draw < walk_cut:
                continue

            homer_cut = walk_cut + batter["hr_rate"]
            if draw < homer_cut:
                scored += 1
                continue

            hit_cut = homer_cut + (batter["avg"] - batter["hr_rate"])
            if draw >= hit_cut:
                outs_made += 1

        run_total += scored

    return run_total / simulations

def optimize_lineup(players_df, n_iterations=1000):
    """Search for a high-scoring batting order with simulated annealing.

    Starts from a random order of row positions 0-8, repeatedly swaps two
    slots and scores the candidate with ``expected_runs_lineup``. Better
    candidates are always accepted; worse ones are accepted with
    probability ``exp(score_change / temperature)``. The temperature starts
    at 1.0 and is multiplied by 0.995 after each iteration.

    Args:
        players_df: Player table passed through to ``expected_runs_lineup``.
        n_iterations: Number of swap proposals to evaluate.

    Returns:
        Tuple ``(best_order, best_score)`` of the best accepted order seen.
    """
    order = list(range(9))
    random.shuffle(order)
    order_score = expected_runs_lineup(order, players_df)

    best, best_score = order.copy(), order_score
    temperature = 1.0

    for _ in range(n_iterations):
        # Propose a neighbour by exchanging two lineup slots.
        candidate = order.copy()
        a, b = random.sample(range(9), 2)
        candidate[a], candidate[b] = candidate[b], candidate[a]

        candidate_score = expected_runs_lineup(candidate, players_df)

        improved = candidate_score > order_score
        if improved or random.random() < np.exp((candidate_score - order_score) / temperature):
            order, order_score = candidate, candidate_score

            if order_score > best_score:
                best, best_score = order.copy(), order_score

        temperature *= 0.995

    return best, best_score

python

Player Comparison Tool

Compare players across multiple statistical dimensions.

import pandas as pd
import numpy as np
from scipy import stats

class PlayerComparator:
    """Compare pairs of players using a league-wide statistics table."""

    def __init__(self, league_stats_df):
        # Table holding one row per player; also the source of league means
        # and standard deviations.
        self.league_stats = league_stats_df

    def _rows_for(self, player_id):
        """Return the league rows whose ``player_id`` matches."""
        return self.league_stats[self.league_stats["player_id"] == player_id]

    def compare_players(self, player1_id, player2_id, stats_to_compare=None):
        """Build a stat-by-stat comparison of two players.

        Args:
            player1_id: ``player_id`` of the first player.
            player2_id: ``player_id`` of the second player.
            stats_to_compare: Columns to compare; defaults to
                ``avg``, ``obp``, ``slg``, ``hr`` and ``war``.

        Returns:
            DataFrame with one row per stat: both raw values, their
            difference, each player's league z-score, and the name of the
            player with the larger value (player 2 on a tie).
        """
        first = self._rows_for(player1_id).iloc[0]
        second = self._rows_for(player2_id).iloc[0]

        if stats_to_compare is None:
            stats_to_compare = ["avg", "obp", "slg", "hr", "war"]

        def describe(stat):
            column = self.league_stats[stat]
            center = column.mean()
            spread = column.std()
            value1, value2 = first[stat], second[stat]
            return {
                "stat": stat,
                "player1": value1,
                "player2": value2,
                "difference": value1 - value2,
                "player1_zscore": (value1 - center) / spread,
                "player2_zscore": (value2 - center) / spread,
                "advantage": first["name"] if value1 > value2 else second["name"],
            }

        return pd.DataFrame([describe(stat) for stat in stats_to_compare])

    def similarity_score(self, player1_id, player2_id, stats=None):
        """Return the cosine similarity of two players' z-scored stat lines.

        Args:
            player1_id: ``player_id`` of the first player.
            player2_id: ``player_id`` of the second player.
            stats: Columns to use; defaults to ``avg``, ``obp``, ``slg``,
                ``hr_rate``, ``k_rate`` and ``bb_rate``.

        Returns:
            Cosine of the angle between the two standardised vectors.
        """
        if stats is None:
            stats = ["avg", "obp", "slg", "hr_rate", "k_rate", "bb_rate"]

        vector1 = self._rows_for(player1_id)[stats].values[0]
        vector2 = self._rows_for(player2_id)[stats].values[0]

        # Standardise against the league so every stat is on the same scale.
        league = self.league_stats[stats]
        center = league.mean().values
        spread = league.std().values
        z1 = (vector1 - center) / spread
        z2 = (vector2 - center) / spread

        return np.dot(z1, z2) / (np.linalg.norm(z1) * np.linalg.norm(z2))

python

Schedule Strength Calculator

Calculate strength of schedule for teams.

import pandas as pd
import numpy as np

def calculate_sos(games_df, team_ratings):
    """Calculate strength of schedule for every team in a schedule.

    A team's opponents are taken from every game it appears in, home or
    away. A game counts as played when its own ``status`` is ``"Final"``;
    all other games count as remaining.

    Args:
        games_df: One row per game with ``home_team_id``, ``away_team_id``
            and ``status``.
        team_ratings: Mapping of team id to rating; teams without an entry
            are rated 0.5.

    Returns:
        DataFrame sorted by ``sos`` (descending) with ``team_id``,
        ``games_played``, ``games_remaining``, ``sos`` (mean rating of all
        opponents), ``past_sos`` (played games only; 0.5 if none) and
        ``future_sos`` (remaining games only; equal to ``sos`` if none).
        An empty schedule returns an empty frame with these columns.
    """
    columns = [
        "team_id", "games_played", "games_remaining",
        "sos", "past_sos", "future_sos",
    ]

    home_ids = games_df["home_team_id"].tolist()
    away_ids = games_df["away_team_id"].tolist()
    finals = (games_df["status"] == "Final").tolist()

    # Every team appearing on either side, in first-seen order. Teams that
    # only appear as visitors are included too.
    team_ids = list(dict.fromkeys(
        team for team in home_ids + away_ids if not pd.isna(team)
    ))

    results = []
    for team_id in team_ids:
        past, future = [], []
        for home, away, is_final in zip(home_ids, away_ids, finals):
            if home == team_id:
                opponent = away
            elif away == team_id:
                opponent = home
            else:
                continue
            # Classify each game by its own status, not by list position.
            bucket = past if is_final else future
            bucket.append(team_ratings.get(opponent, 0.5))

        sos = np.mean(past + future)
        results.append({
            "team_id": team_id,
            "games_played": len(past),
            "games_remaining": len(future),
            "sos": sos,
            "past_sos": np.mean(past) if past else 0.5,
            "future_sos": np.mean(future) if future else sos,
        })

    return pd.DataFrame(results, columns=columns).sort_values("sos", ascending=False)

python Basketball

Win Shares Calculator

Calculate basketball win shares.

import pandas as pd

def calculate_win_shares(player_stats, team_stats):
    """Compute simplified offensive and defensive win shares.

    Args:
        player_stats: Player table with ``player_id``, ``pts``, ``ast``,
            ``fga``, ``fta``, ``oreb``, ``tov``, ``drtg`` and ``mp``.
        team_stats: Team table with ``pts``, ``possessions``, ``pace`` and
            ``drtg``, used for the league baselines.

    Returns:
        DataFrame with ``player_id``, ``OWS``, ``DWS`` and ``WS`` (their sum).
    """
    # League baselines.
    league_ppp = team_stats["pts"].sum() / team_stats["possessions"].sum()
    marginal_pts_per_win = league_ppp * team_stats["pace"].mean() * 2 / 0.32
    league_drtg = team_stats["drtg"].mean()

    # Offence: points produced per possession used, credited above 92% of
    # the league rate.
    used_possessions = (
        player_stats["fga"] + 0.44 * player_stats["fta"]
        - player_stats["oreb"] + player_stats["tov"]
    )
    points_produced = player_stats["pts"] + player_stats["ast"] * 0.5
    player_ppp = points_produced / used_possessions
    offensive = (player_ppp - 0.92 * league_ppp) * used_possessions / marginal_pts_per_win

    # Defence (simplified): rating gap to the league, scaled by minutes.
    defensive = (league_drtg - player_stats["drtg"]) / 100 * player_stats["mp"] / 48 * 0.1

    return pd.DataFrame({
        "player_id": player_stats["player_id"],
        "OWS": offensive,
        "DWS": defensive,
        "WS": offensive + defensive,
    })

python

Player Projection Aggregator

Aggregate projections from multiple systems.

import pandas as pd
import numpy as np

def aggregate_projections(projection_systems: dict, weights: dict = None):
    """Combine several projection systems into weighted-average projections.

    Each system's columns are suffixed with ``_{system name}`` and the
    tables are outer-merged on ``player_id``. For every stat in ``pa``,
    ``avg``, ``hr``, ``rbi`` and ``war`` a ``{stat}_proj`` column is added
    holding the weighted average over the systems that actually have a
    value for that player, with weights re-scaled to those systems. If no
    system supplies a value, the projection is NaN.

    Args:
        projection_systems: Mapping of system name to a DataFrame that has a
            ``player_id`` column. Names may contain underscores.
        weights: Mapping of system name to relative weight; defaults to
            equal weights. Systems missing from ``weights`` get weight 0.

    Returns:
        The merged DataFrame including the ``*_proj`` columns.

    Raises:
        ValueError: If ``projection_systems`` is empty or the weights sum
            to zero.
    """
    if not projection_systems:
        raise ValueError("projection_systems must contain at least one system")

    if weights is None:
        weights = {name: 1 / len(projection_systems) for name in projection_systems}

    # Normalize weights
    total = sum(weights.values())
    if total == 0:
        raise ValueError("weights must not sum to zero")
    weights = {k: v / total for k, v in weights.items()}

    # Merge all projections, tagging every stat column with its system.
    combined = None
    for name, df in projection_systems.items():
        df = df.copy()
        df.columns = [f"{col}_{name}" if col != "player_id" else col for col in df.columns]
        if combined is None:
            combined = df
        else:
            combined = combined.merge(df, on="player_id", how="outer")

    # Calculate weighted averages
    stat_cols = ["pa", "avg", "hr", "rbi", "war"]
    for stat in stat_cols:
        weighted_sum = pd.Series(0.0, index=combined.index)
        weight_seen = pd.Series(0.0, index=combined.index)

        # Walk the systems by name rather than parsing the name back out of
        # the column label, which breaks for names containing "_".
        for name in projection_systems:
            col = f"{stat}_{name}"
            if col not in combined.columns:
                continue
            weight = weights.get(name, 0)
            values = combined[col]
            weighted_sum = weighted_sum + values.fillna(0) * weight
            weight_seen = weight_seen + values.notna() * weight

        # Divide by the weight that actually contributed; rows where nothing
        # contributed become NaN.
        combined[f"{stat}_proj"] = weighted_sum / weight_seen.where(weight_seen > 0)

    return combined

# Example usage: `steamer_df`, `zips_df` and `pecota_df` are not defined in
# this snippet; they must already exist (presumably as per-system projection
# DataFrames with a "player_id" column) before these lines run.
projections = {
    "steamer": steamer_df,
    "zips": zips_df,
    "pecota": pecota_df
}
# Relative weight per system; aggregate_projections normalises them itself.
weights = {"steamer": 0.4, "zips": 0.35, "pecota": 0.25}
combined = aggregate_projections(projections, weights)

python Baseball

MLB Spray Chart Generator

Generate spray charts from batted ball data.

import matplotlib.pyplot as plt
import numpy as np

def create_spray_chart(batted_balls_df, player_name):
    """Draw a spray chart of one batter's batted balls.

    Args:
        batted_balls_df: Table with ``batter_name``, ``events``, ``hc_x``
            and ``hc_y`` columns.
        player_name: Value of ``batter_name`` to plot.

    Returns:
        Tuple ``(fig, ax)`` of the matplotlib figure and axes.

    NOTE(review): ``hc_x`` / ``hc_y`` are plotted as-is against an outline
    whose foul lines start at (0, 0) and whose fence radius is 400; confirm
    the input coordinates are already expressed in that frame.
    """
    player_rows = batted_balls_df[batted_balls_df["batter_name"] == player_name]

    fig, ax = plt.subplots(figsize=(10, 10))

    # Outfield arc between the two foul lines, then the foul lines themselves.
    fence_radius = 400
    arc = np.linspace(np.pi/4, 3*np.pi/4, 100)
    ax.plot(fence_radius * np.cos(arc), fence_radius * np.sin(arc), "k-", lw=2)
    for foul_angle in (np.pi/4, 3*np.pi/4):
        ax.plot(
            [0, fence_radius * np.cos(foul_angle)],
            [0, fence_radius * np.sin(foul_angle)],
            "k-", lw=2,
        )

    # One scatter layer per event type so each gets its own legend entry.
    event_colors = {"single": "blue", "double": "green", "triple": "orange", "home_run": "red", "out": "gray"}
    for event, color in event_colors.items():
        hits = player_rows[player_rows["events"] == event]
        ax.scatter(hits["hc_x"], hits["hc_y"], c=color, s=30, alpha=0.6, label=event)

    ax.set_xlim(-250, 250)
    ax.set_ylim(-50, 450)
    ax.set_title(f"{player_name} Spray Chart")
    ax.legend()
    return fig, ax

python Basketball

NBA Shot Zones Analysis

Analyze shooting efficiency by court zones.

import pandas as pd
import numpy as np

def classify_shot_zone(x, y):
    """Assign a shot location to a named court zone.

    Zones are tested in order: restricted area (distance under 4), paint
    (under 8), corner three (``y <= 7.8`` and ``|x| > 22``), above-the-break
    three (distance over 23.75), mid-range (under 16), otherwise long two.

    Args:
        x: Horizontal shot coordinate, with the basket at the origin.
        y: Vertical shot coordinate, with the basket at the origin.

    Returns:
        The zone name as a string.
    """
    distance = np.sqrt(x**2 + y**2)

    if distance < 4:
        return "Restricted Area"
    if distance < 8:
        return "Paint"
    if y <= 7.8 and abs(x) > 22:
        return "Corner 3"
    if distance > 23.75:
        return "Above Break 3"
    return "Mid-Range" if distance < 16 else "Long 2"

def zone_efficiency(shots_df):
    """Summarise makes, attempts and field-goal percentage per shot zone.

    Args:
        shots_df: Shot table with ``loc_x``, ``loc_y`` and ``is_made``.

    Returns:
        DataFrame indexed by zone whose ``is_made`` aggregates (sum, count,
        mean) are relabelled ``makes``, ``attempts`` and ``fg_pct``.
    """
    def _zone_of(shot):
        return classify_shot_zone(shot["loc_x"], shot["loc_y"])

    labelled = shots_df.copy()
    labelled["zone"] = labelled.apply(_zone_of, axis=1)

    summary = labelled.groupby("zone").agg({"is_made": ["sum", "count", "mean"]})
    return summary.rename(columns={"sum": "makes", "count": "attempts", "mean": "fg_pct"})

# Example: `shots_df` is not defined in this snippet and must already exist
# before this line runs.
print(zone_efficiency(shots_df))

python

Batter vs Pitcher Matchup

Analyze batter vs pitcher historical matchups.

import pandas as pd
import numpy as np

def analyze_matchup(pa_df, batter_id, pitcher_id, min_pa=10):
    """Summarise the plate appearances between one batter and one pitcher.

    Args:
        pa_df: One row per plate appearance with ``batter_id``,
            ``pitcher_id``, ``is_ab``, ``is_hit``, ``event`` and
            ``woba_value``.
        batter_id: Batter to select.
        pitcher_id: Pitcher to select.
        min_pa: Minimum plate appearances needed for a full summary.

    Returns:
        With fewer than ``min_pa`` rows, ``{"sufficient_data": False,
        "pa": n}``. Otherwise a dict of counts (``pa``, ``ab``, ``hits``,
        ``hr``, ``so``, ``bb``), ``avg`` (0 when there are no at-bats),
        mean ``woba`` and ``sufficient_data`` set to True.
    """
    same_batter = pa_df["batter_id"] == batter_id
    same_pitcher = pa_df["pitcher_id"] == pitcher_id
    meetings = pa_df[same_batter & same_pitcher]
    n_pa = len(meetings)

    if n_pa < min_pa:
        return {"sufficient_data": False, "pa": n_pa}

    at_bats = meetings["is_ab"].sum()
    hits = meetings["is_hit"].sum()
    events = meetings["event"]

    return {
        "pa": n_pa,
        "ab": at_bats,
        "hits": hits,
        "hr": (events == "home_run").sum(),
        "so": (events == "strikeout").sum(),
        "bb": (events == "walk").sum(),
        "avg": hits / at_bats if at_bats > 0 else 0,
        "woba": meetings["woba_value"].mean(),
        "sufficient_data": True,
    }

def matchup_projection(batter_stats, pitcher_stats, batter_vs_pitcher=None, pa_weight=30):
    """Project a batter's wOBA against a specific pitcher.

    The baseline is the midpoint of the batter's ``woba`` and the pitcher's
    ``woba_against``. When head-to-head history is supplied and flagged as
    sufficient, it is blended in with weight ``pa / (pa + pa_weight)``.

    Args:
        batter_stats: Mapping with ``woba``.
        pitcher_stats: Mapping with ``woba_against``.
        batter_vs_pitcher: Optional mapping with ``sufficient_data``,
            ``pa`` and ``woba`` (the shape returned by ``analyze_matchup``).
        pa_weight: Plate appearances at which history and baseline are
            weighted equally.

    Returns:
        The projected wOBA.
    """
    baseline = (batter_stats["woba"] + pitcher_stats["woba_against"]) / 2

    has_history = batter_vs_pitcher and batter_vs_pitcher.get("sufficient_data")
    if not has_history:
        return baseline

    history_pa = batter_vs_pitcher["pa"]
    history_share = history_pa / (history_pa + pa_weight)
    return history_share * batter_vs_pitcher["woba"] + (1 - history_share) * baseline

python

Run Expectancy Matrix

Calculate run expectancy by base-out state.

import pandas as pd
import numpy as np

def calculate_re_matrix(pbp_df):
    """Calculate a run expectancy matrix from play-by-play data.

    Every play is labelled with a base-out state key of the form
    ``"{third}{second}{first}_{outs}"`` and the mean of
    ``runs_to_end_inning`` is taken per state.

    Args:
        pbp_df: Play-by-play table with ``first``, ``second``, ``third``
            (0/1 base occupancy), ``outs`` (0-2) and ``runs_to_end_inning``.
            The base/out columns may be int, float or bool; they are cast
            to int before the state key is built.

    Returns:
        DataFrame indexed by ``(third, second, first)`` with one column per
        ``outs`` value, holding the mean runs to the end of the inning.
        States never observed in ``pbp_df`` are expected to be missing or
        NaN, since ``pivot_table`` drops all-NaN entries by default.

    Raises:
        ValueError: If a base/out column contains missing values (raised by
            the integer cast).
    """
    # All 24 base-out states, with the key used to look up their average.
    states = [
        {
            "outs": outs,
            "first": first,
            "second": second,
            "third": third,
            "state": f"{third}{second}{first}_{outs}",
        }
        for outs in range(3)
        for first in (0, 1)
        for second in (0, 1)
        for third in (0, 1)
    ]

    def _digits(column):
        # Cast to int first so 1.0 or True become "1", matching the keys
        # built above; row-wise formatting would give "1.0" or "True".
        return pbp_df[column].astype(int).astype(str)

    observed_state = (
        _digits("third") + _digits("second") + _digits("first") + "_" + _digits("outs")
    )

    # Average runs remaining in the inning for each observed state.
    re_by_state = (
        pbp_df["runs_to_end_inning"].groupby(observed_state.to_numpy()).mean().to_dict()
    )

    re_df = pd.DataFrame(states)
    re_df["run_expectancy"] = re_df["state"].map(re_by_state)

    return re_df.pivot_table(
        index=["third", "second", "first"],
        columns="outs",
        values="run_expectancy"
    )

def calculate_re24(pbp_df, re_matrix):
    """Calculate RE24 for each play.

    ``RE24 = runs_on_play + (RE after - RE before)``, where states missing
    from the run-expectancy lookup (for example the end of an inning)
    count as 0.

    Args:
        pbp_df: Play-by-play table with ``state_before``, ``state_after``
            (keys of the form ``"{third}{second}{first}_{outs}"``) and
            ``runs_on_play``.
        re_matrix: Either a mapping-like object with ``.get`` keyed by
            state string, or the pivoted DataFrame returned by
            ``calculate_re_matrix`` (index ``(third, second, first)``,
            columns ``outs``), which is flattened into such a mapping.

    Returns:
        A copy of ``pbp_df`` with ``re_before``, ``re_after`` and ``re24``.

    Raises:
        ValueError: If ``re_matrix`` is a DataFrame without a three-level
            index.
    """
    if isinstance(re_matrix, pd.DataFrame):
        # DataFrame.get looks up columns, so a pivoted matrix would return
        # the default for every state key. Flatten it to state -> RE first.
        if re_matrix.index.nlevels != 3:
            raise ValueError(
                "re_matrix DataFrame must be indexed by (third, second, first)"
            )
        lookup = {}
        for (third, second, first), by_outs in re_matrix.iterrows():
            for outs, value in by_outs.items():
                if pd.notna(value):
                    key = f"{int(third)}{int(second)}{int(first)}_{int(outs)}"
                    lookup[key] = value
    else:
        lookup = re_matrix

    pbp = pbp_df.copy()

    # Get RE before and after each play
    pbp["re_before"] = pbp["state_before"].map(lambda state: lookup.get(state, 0))
    pbp["re_after"] = pbp["state_after"].map(lambda state: lookup.get(state, 0))

    # RE24 = runs scored + (RE after - RE before)
    pbp["re24"] = pbp["runs_on_play"] + (pbp["re_after"] - pbp["re_before"])

    return pbp

python

Quality Start Probability

Predict probability of a quality start.

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def train_qs_model(historical_starts_df):
    """Fit a logistic-regression model that predicts quality starts.

    A start is labelled a quality start when ``ip >= 6`` and ``er <= 3``.
    Features are standardised before fitting.

    Args:
        historical_starts_df: One row per past start with the feature
            columns listed below plus ``ip`` and ``er``.

    Returns:
        Tuple ``(model, scaler, features)``: the fitted classifier, the
        fitted scaler and the ordered feature-name list.
    """
    features = [
        "season_era", "season_fip", "season_k9", "season_bb9",
        "last_3_era", "home_game", "opp_wrc_plus",
        "rest_days", "park_factor",
    ]

    went_six = historical_starts_df["ip"] >= 6
    limited_runs = historical_starts_df["er"] <= 3
    is_quality_start = went_six & limited_runs

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(historical_starts_df[features])

    model = LogisticRegression()
    model.fit(scaled_features, is_quality_start)

    return model, scaler, features

def predict_qs_probability(model, scaler, features, game_data):
    """Return the predicted quality-start probability for an upcoming start.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted scaler exposing ``transform``.
        features: Ordered feature names to select from ``game_data``.
        game_data: Table holding the feature columns; only the first row's
            prediction is returned.

    Returns:
        The second-column (index 1) probability of the first row.
    """
    scaled = scaler.transform(game_data[features])
    class_probabilities = model.predict_proba(scaled)
    return class_probabilities[0, 1]

python

Plate Discipline Metrics

Calculate plate discipline statistics.

import pandas as pd

def calculate_plate_discipline(pitch_df, batter_id):
    """Compute swing and contact rates for one batter.

    Args:
        pitch_df: One row per pitch with ``batter_id`` and the flag columns
            ``in_zone``, ``swing`` and ``contact`` (used as boolean masks).
        batter_id: Batter whose pitches are analysed.

    Returns:
        dict of the pitch count and percentages: zone, swing, in-zone and
        out-of-zone swing, overall / in-zone / out-of-zone contact, and
        swinging-strike rate. Rates with a guarded denominator are 0 when
        that denominator is 0.
    """
    seen = pitch_df[pitch_df["batter_id"] == batter_id]
    zone = seen["in_zone"]
    swung = seen["swing"]

    def guarded_pct(part, whole):
        return part / whole * 100 if whole > 0 else 0

    total = len(seen)
    in_zone = zone.sum()
    out_zone = total - in_zone

    swings = swung.sum()
    swings_zone = seen[zone]["swing"].sum()
    swings_out = seen[~zone]["swing"].sum()

    contact = seen[swung]["contact"].sum()
    contact_zone = seen[zone & swung]["contact"].sum()
    contact_out = seen[~zone & swung]["contact"].sum()

    return {
        "pitches": total,
        "zone_pct": in_zone / total * 100,
        "swing_pct": swings / total * 100,
        "z_swing_pct": guarded_pct(swings_zone, in_zone),
        "o_swing_pct": guarded_pct(swings_out, out_zone),
        "contact_pct": guarded_pct(contact, swings),
        "z_contact_pct": guarded_pct(contact_zone, swings_zone),
        "o_contact_pct": guarded_pct(contact_out, swings_out),
        "swstr_pct": (swings - contact) / total * 100,  # swings without contact
    }

python Baseball

Exit Velocity Analysis

Analyze batted ball exit velocities.

import pandas as pd
import numpy as np

def analyze_exit_velocity(batted_balls_df, player_id):
    """Summarise exit-velocity and launch-angle metrics for one batter.

    Thresholds used: hard hit is ``launch_speed >= 95``; barrel is
    ``launch_speed >= 98`` with ``26 <= launch_angle <= 30``; sweet spot is
    ``8 <= launch_angle <= 32``; ground ball is ``launch_angle < 10``; fly
    ball is ``launch_angle > 25``.

    Args:
        batted_balls_df: Table with ``batter_id``, ``launch_speed`` and
            ``launch_angle``.
        player_id: Batter to select.

    Returns:
        dict with the batted-ball count, exit-velocity mean / max / 50th /
        90th percentile, average launch angle, and the percentage metrics
        above. Barrel and sweet-spot rates are 0 with no batted balls.
    """
    events = batted_balls_df[batted_balls_df["batter_id"] == player_id]
    n_events = len(events)

    ev = events["launch_speed"]
    la = events["launch_angle"]

    def share_of_events(mask):
        # Percentage of the batter's batted balls selected by the mask.
        return int(mask.sum()) / n_events * 100 if n_events > 0 else 0

    is_barrel = (ev >= 98) & (la >= 26) & (la <= 30)
    in_sweet_spot = (la >= 8) & (la <= 32)

    return {
        "batted_balls": n_events,
        "avg_ev": ev.mean(),
        "max_ev": ev.max(),
        "ev_50th": ev.quantile(0.5),
        "ev_90th": ev.quantile(0.9),
        "hard_hit_pct": (ev >= 95).mean() * 100,
        "barrel_pct": share_of_events(is_barrel),
        "sweet_spot_pct": share_of_events(in_sweet_spot),
        "avg_la": la.mean(),
        "gb_pct": (la < 10).mean() * 100,
        "fb_pct": (la > 25).mean() * 100,
    }

python Soccer

Soccer Pass Network Analysis

Analyze passing networks and identify key players in soccer.

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

def build_pass_network(events_df, team_id, match_id):
    """Build a directed, weighted passing graph for one team in one match.

    Args:
        events_df: Event table with ``team_id``, ``match_id``,
            ``event_type``, ``pass_outcome``, ``player_id`` and
            ``pass_recipient_id``.
        team_id: Team whose passes are used.
        match_id: Match whose passes are used.

    Returns:
        Tuple ``(G, passes)``: a ``networkx.DiGraph`` whose edge ``weight``
        is the number of successful passes from passer to recipient, and
        the filtered table of those passes.
    """
    wanted = (
        (events_df["team_id"] == team_id) &
        (events_df["match_id"] == match_id) &
        (events_df["event_type"] == "pass") &
        (events_df["pass_outcome"] == "successful")
    )
    completed = events_df[wanted]

    graph = nx.DiGraph()

    for _, event in completed.iterrows():
        source = event["player_id"]
        target = event["pass_recipient_id"]

        # Create the edge on first sight, then count every pass along it.
        if not graph.has_edge(source, target):
            graph.add_edge(source, target, weight=0)
        graph[source][target]["weight"] += 1

    return graph, completed

def calculate_network_metrics(G, players_df):
    """Compute centrality metrics for each player in a passing graph.

    Args:
        G: Passing graph with a ``weight`` attribute on its edges.
        players_df: Table with ``player_id``, ``name`` and ``position``.

    Returns:
        DataFrame with one row per player found in both the graph and
        ``players_df``, holding ``degree``, ``betweenness``, ``pagerank``
        and ``closeness`` plus ``name`` and ``position``, sorted by
        ``pagerank`` descending.

    NOTE(review): betweenness is computed with ``weight="weight"``;
    networkx is understood to treat that attribute as a path distance, so
    high pass counts may count as "long" links - confirm this is intended.
    """
    centrality = pd.DataFrame({
        # How connected each player is.
        "degree": nx.degree_centrality(G),
        # How often a player lies on shortest passing chains.
        "betweenness": nx.betweenness_centrality(G, weight="weight"),
        # Importance weighted by connections.
        "pagerank": nx.pagerank(G, weight="weight"),
        # How quickly a player can be reached from others.
        "closeness": nx.closeness_centrality(G),
    })
    centrality.index.name = "player_id"

    roster = players_df[["player_id", "name", "position"]]
    named = centrality.reset_index().merge(roster, on="player_id")

    return named.sort_values("pagerank", ascending=False)

def visualize_pass_network(G, players_df, avg_positions):
    """Draw the passing network on a pitch-coloured canvas.

    Args:
        G: Directed pass graph whose edges carry a ``weight`` attribute.
        players_df: DataFrame with ``player_id`` and ``name`` columns; the
            last word of each name is used as the node label.
        avg_positions: DataFrame with ``player_id``, ``avg_x`` and ``avg_y``
            columns giving where each node is drawn.

    Returns:
        The ``(fig, ax)`` pair holding the drawing.
    """
    fig, ax = plt.subplots(figsize=(14, 9))

    # Pitch: 120 x 80 axes on a green background
    ax.set_xlim(0, 120)
    ax.set_ylim(0, 80)
    ax.set_facecolor("#228B22")

    # Node coordinates come from each player's average position
    pos = {}
    for _, location in avg_positions.iterrows():
        pos[location["player_id"]] = (location["avg_x"], location["avg_y"])

    # Node size scales with degree (in + out connections)
    degree_by_node = dict(G.degree())
    node_sizes = [degree_by_node.get(node, 1) * 100 for node in G.nodes()]

    # Edge width scales with the number of passes on that edge
    edge_widths = [attrs["weight"] / 5 for _, _, attrs in G.edges(data=True)]

    # Nodes first, then the directed edges on top
    nx.draw_networkx_nodes(
        G, pos,
        node_size=node_sizes,
        node_color="white",
        edgecolors="black",
        ax=ax,
    )
    nx.draw_networkx_edges(
        G, pos,
        width=edge_widths,
        edge_color="white",
        alpha=0.6,
        arrows=True,
        arrowsize=15,
        ax=ax,
    )

    # Label only the players that actually appear in the graph
    labels = {}
    for _, player in players_df.iterrows():
        if player["player_id"] in G.nodes():
            labels[player["player_id"]] = player["name"].split()[-1]
    nx.draw_networkx_labels(G, pos, labels, font_size=8, font_color="black", ax=ax)

    ax.set_title("Team Passing Network", fontsize=14)

    return fig, ax

# Build and analyze network
# NOTE: `events_df` and `players_df` are not defined in this snippet; they
# must already be loaded before these lines run.
G, passes = build_pass_network(events_df, team_id=1, match_id=12345)
metrics = calculate_network_metrics(G, players_df)

# `metrics` comes back sorted by pagerank (descending), so head(10) is the
# ten highest-PageRank players.
print("Most Central Players:")
print(metrics[["name", "position", "pagerank", "betweenness"]].head(10))

python Hockey

Hockey Corsi and Fenwick

Calculate Corsi and Fenwick shot attempt metrics for hockey.

import pandas as pd
import numpy as np

def calculate_corsi_fenwick(events_df, player_id):
    """Compute on-ice Corsi and Fenwick counts for one player.

    Args:
        events_df: Event DataFrame with ``players_on_ice`` (a container of
            player ids per event), ``player_id``, ``team_id``,
            ``event_type`` and ``event_time`` columns.
        player_id: Player to evaluate. The player's team is taken from the
            first event row whose ``player_id`` matches, so an ``IndexError``
            is raised when no such row exists.

    Returns:
        Dict with ``player_id``, ``CF``/``CA`` (all shot attempts for and
        against), ``CF%``, ``CF_rel`` (attempt differential scaled by
        60 / TOI minutes), ``FF``/``FA`` (unblocked attempts), ``FF%`` and
        ``TOI``. ``TOI`` is the span between the first and last on-ice event
        time divided by 60. Percentages default to 50 when there are no
        attempts; ``CF_rel`` defaults to 0 when ``TOI`` is not positive.
    """
    # Corsi events: all shot attempts (goals, shots, missed, blocked)
    corsi_events = ["goal", "shot", "missed_shot", "blocked_shot"]
    # Fenwick events: unblocked shot attempts
    fenwick_events = ["goal", "shot", "missed_shot"]

    # Keep only the events that happened while the player was on the ice
    was_on_ice = events_df["players_on_ice"].apply(
        lambda skaters: player_id in skaters
    )
    on_ice = events_df[was_on_ice]

    own_events = events_df[events_df["player_id"] == player_id]
    player_team = own_events["team_id"].iloc[0]

    by_own_team = on_ice["team_id"] == player_team
    is_corsi = on_ice["event_type"].isin(corsi_events)
    is_fenwick = on_ice["event_type"].isin(fenwick_events)

    corsi_for = int((is_corsi & by_own_team).sum())
    corsi_against = int((is_corsi & ~by_own_team).sum())
    fenwick_for = int((is_fenwick & by_own_team).sum())
    fenwick_against = int((is_fenwick & ~by_own_team).sum())

    # Time on ice, approximated from the first and last on-ice event
    event_times = on_ice["event_time"]
    toi = event_times.max() - event_times.min()
    toi_minutes = toi / 60

    corsi_total = corsi_for + corsi_against
    if corsi_total > 0:
        corsi_pct = corsi_for / corsi_total * 100
    else:
        corsi_pct = 50

    fenwick_total = fenwick_for + fenwick_against
    if fenwick_total > 0:
        fenwick_pct = fenwick_for / fenwick_total * 100
    else:
        fenwick_pct = 50

    if toi_minutes > 0:
        corsi_rate = (corsi_for - corsi_against) / toi_minutes * 60
    else:
        corsi_rate = 0

    return {
        "player_id": player_id,
        "CF": corsi_for,
        "CA": corsi_against,
        "CF%": corsi_pct,
        "CF_rel": corsi_rate,
        "FF": fenwick_for,
        "FA": fenwick_against,
        "FF%": fenwick_pct,
        "TOI": toi_minutes
    }

def team_corsi_summary(events_df, team_id, game_state="5v5"):
    """Summarise a team's shot attempts for and against.

    Args:
        events_df: Event DataFrame with ``team_id`` and ``event_type``
            columns, plus ``home_skaters`` and ``away_skaters`` when
            ``game_state`` is ``"5v5"``.
        team_id: Team to summarise.
        game_state: ``"5v5"`` keeps only events with five skaters on each
            side; any other value uses every event.

    Returns:
        Dict with ``team_id``, ``game_state``, ``CF``, ``CA``, ``CF%``
        (50 when there are no attempts at all) and ``shot_diff``.
    """
    events = events_df
    if game_state == "5v5":
        even_strength = (
            (events_df["home_skaters"] == 5)
            & (events_df["away_skaters"] == 5)
        )
        events = events_df[even_strength]

    corsi_events = ["goal", "shot", "missed_shot", "blocked_shot"]
    attempts = events[events["event_type"].isin(corsi_events)]

    # Attempts by the team count "for"; attempts by anyone else "against"
    cf = int((attempts["team_id"] == team_id).sum())
    ca = int((attempts["team_id"] != team_id).sum())

    total_attempts = cf + ca
    if total_attempts > 0:
        cf_pct = cf / total_attempts * 100
    else:
        cf_pct = 50

    return {
        "team_id": team_id,
        "game_state": game_state,
        "CF": cf,
        "CA": ca,
        "CF%": cf_pct,
        "shot_diff": cf - ca
    }

# Calculate for all players
# NOTE: `events_df` is not defined in this snippet and must be loaded first.
# Each iteration re-filters the full events table for one player.
all_stats = []
for player_id in events_df["player_id"].unique():
    stats = calculate_corsi_fenwick(events_df, player_id)
    all_stats.append(stats)

# One row per player; show the ten highest CF% values
corsi_df = pd.DataFrame(all_stats)
print("Top Players by CF%:")
print(corsi_df.nlargest(10, "CF%")[["player_id", "CF%", "CF", "CA", "TOI"]])

python Tennis

Tennis Match Statistics

Analyze tennis match statistics and serve patterns.

import pandas as pd
import numpy as np

def analyze_serve_stats(points_df, player_id):
    """Summarise one player's serving numbers.

    Args:
        points_df: Point-level DataFrame with ``server_id`` and the boolean
            columns ``first_serve_in``, ``point_won``, ``double_fault``,
            ``ace`` and ``break_point``.
        player_id: Server to analyse.

    Returns:
        Dict of counts and percentages (0-100) over the points this player
        served. A percentage is NaN when its subset of points is empty.
    """
    serves = points_df[points_df["server_id"] == player_id]

    first_in = serves["first_serve_in"]
    on_first_serve = serves[first_in]
    on_second_serve = serves[~first_in]
    # Second-serve points that were actually played (no double fault)
    second_serve_in_play = serves[(~first_in) & (~serves["double_fault"])]
    on_break_point = serves[serves["break_point"]]

    stats = {}
    stats["total_service_points"] = len(serves)

    # First serve
    stats["first_serves_in"] = first_in.sum()
    stats["first_serve_pct"] = first_in.mean() * 100
    stats["first_serve_won"] = on_first_serve["point_won"].mean() * 100

    # Second serve
    stats["second_serves"] = len(on_second_serve)
    stats["double_faults"] = serves["double_fault"].sum()
    stats["second_serve_won"] = second_serve_in_play["point_won"].mean() * 100

    # Aces
    stats["aces"] = serves["ace"].sum()
    stats["ace_pct"] = serves["ace"].mean() * 100

    # Break points
    stats["break_points_faced"] = serves["break_point"].sum()
    stats["break_points_saved"] = on_break_point["point_won"].mean() * 100

    return stats

def analyze_return_stats(points_df, player_id):
    """Analyze return statistics for a player.

    ``point_won`` is read as "the server won the point", which is how
    ``analyze_serve_stats`` uses it (it averages the column over a server's
    own points to get serve-won percentages). A returner therefore wins a
    point exactly when ``point_won`` is false, so the column is inverted
    here; averaging it directly would report the opponent's serve success.

    Args:
        points_df: Point-level DataFrame with ``returner_id`` and the boolean
            columns ``first_serve_in``, ``point_won`` and ``break_point``.
        player_id: Returner to analyse.

    Returns:
        Dict with the number of return points, the percentage (0-100) of
        first- and second-serve return points won, the number of break
        points the player had, and the percentage of them converted.
        A percentage is NaN when its subset of points is empty.
    """
    returns = points_df[points_df["returner_id"] == player_id]

    # 1 where the returner won the point, 0 where the server did; missing
    # outcomes stay missing and are ignored by mean().
    returner_won = 1 - returns["point_won"]

    first_in = returns["first_serve_in"]
    on_break_point = returns["break_point"]

    stats = {
        "total_return_points": len(returns),
        "first_serve_return_won": returner_won[first_in].mean() * 100,
        "second_serve_return_won": returner_won[~first_in].mean() * 100,
        "break_points_created": on_break_point.sum(),
        "break_points_converted": returner_won[on_break_point].mean() * 100
    }

    return stats

def rally_analysis(points_df, player_id):
    """Analyze how a player's win rate changes with rally length.

    Every point the player took part in is used, whether serving or
    returning. ``point_won`` is read as "the server won the point" (the way
    ``analyze_serve_stats`` uses it), so it is flipped on the points where
    the player was the returner before win rates are computed.

    Args:
        points_df: Point-level DataFrame with ``server_id``,
            ``returner_id``, ``point_won`` and ``rally_length`` columns.
        player_id: Player to analyse.

    Returns:
        Dict with the player's win percentage (0-100) on short (<= 4 shots),
        medium (5-8) and long (> 8) rallies, plus the average rally length.
        A percentage is NaN when the player has no points in that bucket.
    """
    involved = (
        (points_df["server_id"] == player_id)
        | (points_df["returner_id"] == player_id)
    )
    player_points = points_df[involved]

    is_server = player_points["server_id"] == player_id

    # Outcome from the player's own side: keep the server's result on
    # service points, invert it on return points. Working with floats keeps
    # missing outcomes as NaN so mean() skips them.
    server_won = player_points["point_won"].astype(float)
    player_won = server_won.where(is_server, 1.0 - server_won)

    rally_length = player_points["rally_length"]
    short = rally_length <= 4
    medium = (rally_length > 4) & (rally_length <= 8)
    long_rally = rally_length > 8

    return {
        "short_rally_win_pct": player_won[short].mean() * 100,
        "medium_rally_win_pct": player_won[medium].mean() * 100,
        "long_rally_win_pct": player_won[long_rally].mean() * 100,
        "avg_rally_length": rally_length.mean()
    }

# Comprehensive match analysis
def match_summary(points_df, player1_id, player2_id):
    """Build a side-by-side summary table for the two players of a match.

    Args:
        points_df: Point-level DataFrame passed through to the serve, return
            and rally analysis functions.
        player1_id: Id shown in the "Player 1" column.
        player2_id: Id shown in the "Player 2" column.

    Returns:
        DataFrame with one column per player and one row per statistic
        (serve, return and rally statistics merged together).
    """
    columns = {}
    for label, pid in (("Player 1", player1_id), ("Player 2", player2_id)):
        serve = analyze_serve_stats(points_df, pid)
        ret = analyze_return_stats(points_df, pid)
        rally = rally_analysis(points_df, pid)
        # Later dicts win on duplicate keys, same as chained ** unpacking
        columns[label] = {**serve, **ret, **rally}

    return pd.DataFrame(columns)

# NOTE: `points_df` is not defined in this snippet and must be loaded first.
summary = match_summary(points_df, player1_id=1, player2_id=2)
print(summary)

Sports Linear Mixed Models in R

Fit linear mixed models for hierarchical sports data.

library(lme4)
library(lmerTest)
library(dplyr)
library(ggplot2)

# Load hierarchical data (players nested in teams)
# Model: Performance varies by player, team, and year
# NOTE: `player_seasons` is not created in this script; it must already exist
# with the columns used below (war, age, experience, team, player_id, season).

# Fit mixed model
# Fixed effects: age, experience
# Random effects: player (nested in team), team
# (1 | team/player_id) expands to a team intercept plus a player-within-team
# intercept; (1 | season) adds a separate intercept per season.
model <- lmer(
  war ~ age + I(age^2) + experience + (1 | team/player_id) + (1 | season),
  data = player_seasons,
  REML = TRUE
)

# Summary
summary(model)

# Extract variance components
VarCorr(model)

# Random effects
# The nested term is stored under the name `team:player_id`, hence the backticks.
ranef_team <- ranef(model)$team
ranef_player <- ranef(model)$`team:player_id`

# Best teams by random effect
# ranef_team[[1]] is the first (here: intercept) column of the team effects.
team_effects <- data.frame(
  team = rownames(ranef_team),
  effect = ranef_team[[1]]
) %>%
  arrange(desc(effect))

print("Top teams by random effect:")
print(head(team_effects, 10))

# Diagnostic plots
# NOTE(review): plot() on an lmer fit appears to be lattice-based, so the
# par(mfrow = ...) call may have no effect here - confirm.
par(mfrow = c(2, 2))
plot(model)

# Predictions with confidence intervals
# NOTE: the predict() calls below return point predictions only; no interval
# is computed anywhere in this script.
newdata <- data.frame(
  age = 28,
  experience = 5,
  team = "Yankees",
  player_id = "new_player",
  season = 2024
)

# Prediction (population average)
# re.form = NA sets every random effect to zero.
predict(model, newdata, re.form = NA)

# Prediction with team effect
# Only the team intercept is added; presumably "Yankees" must be a team that
# occurs in player_seasons - verify against the data.
predict(model, newdata, re.form = ~ (1 | team))

# Compare models with likelihood ratio test
# The reduced model drops both the quadratic age term and the season
# random intercept, so the test covers both changes at once.
model_reduced <- lmer(
  war ~ age + experience + (1 | team/player_id),
  data = player_seasons
)

# NOTE(review): anova() on lmer fits is expected to refit with ML before
# comparing - confirm for the installed lme4 version.
anova(model_reduced, model)

Sports GAM Models in R

Fit Generalized Additive Models for non-linear relationships.

library(mgcv)
library(ggplot2)
library(dplyr)

# Fit GAM for WAR prediction
# Allows non-linear relationships with age, experience, etc.
# NOTE: `player_stats` is not created in this script; it must already exist
# with war, age, experience, plate_appearances and position columns.
gam_model <- gam(
  war ~ s(age, k = 10) +  # Smooth function of age
        s(experience, k = 5) +
        s(plate_appearances, k = 5) +
        position +  # Categorical
        ti(age, experience, k = 5),  # Tensor interaction
  data = player_stats,
  family = gaussian(),
  method = "REML"
)

# Summary
summary(gam_model)

# Check effective degrees of freedom
gam.check(gam_model)

# Visualize smooth terms
par(mfrow = c(2, 2))
plot(gam_model, pages = 1, shade = TRUE)

# Get partial effects
library(gratia)
draw(gam_model)

# Predict aging curve
# Everything except age is held fixed, so the curve shows the age effect for
# this one profile (experience 5, 500 PA, position "OF").
age_pred <- data.frame(
  age = 20:40,
  experience = 5,
  plate_appearances = 500,
  position = "OF"
)

# predict() is run twice: once for the fitted values and once more to pull
# out the standard errors.
age_pred$predicted_war <- predict(gam_model, newdata = age_pred)
age_pred$se <- predict(gam_model, newdata = age_pred, se.fit = TRUE)$se.fit

# Ribbon = prediction +/- 1.96 standard errors; dashed line marks age 27.
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines - confirm against the installed version.
ggplot(age_pred, aes(x = age, y = predicted_war)) +
  geom_ribbon(aes(ymin = predicted_war - 1.96 * se,
                  ymax = predicted_war + 1.96 * se),
              fill = "lightblue", alpha = 0.5) +
  geom_line(color = "blue", size = 1.5) +
  geom_vline(xintercept = 27, linetype = "dashed") +
  labs(title = "GAM Aging Curve",
       x = "Age", y = "Predicted WAR") +
  theme_minimal()

# Compare to linear model
# The linear model uses a quadratic in age and has no age x experience term.
lm_model <- lm(war ~ poly(age, 2) + experience + plate_appearances + position,
               data = player_stats)

# NOTE(review): the GAM was fitted with REML while lm() is least squares;
# check that the AIC values are comparable before drawing conclusions.
AIC(gam_model, lm_model)

Sports Data Imputation in R

Handle missing data in sports datasets using multiple imputation.

library(mice)
library(dplyr)
library(VIM)

# Visualize missing data patterns
# NOTE: `player_stats` is not created in this script; it must already exist.
md.pattern(player_stats)
aggr(player_stats, col = c("navyblue", "red"),
     numbers = TRUE, sortVars = TRUE)

# Multiple imputation with MICE
# Predictive mean matching for numeric, logistic for binary
#
# mice() needs one method entry per column of the data. The models below use
# `war` in addition to the five imputed variables, so a fixed 5-element
# vector cannot match the data frame. Start from the per-column defaults and
# override by name instead; columns that need no imputation keep "".
n_imputations <- 5
imp_methods <- make.method(player_stats)
needs_imputation <- names(imp_methods)[imp_methods != ""]
imp_methods[intersect(c("age", "experience", "avg", "hr"), needs_imputation)] <- "pmm"
# presumably all_star is stored as a two-level factor, which "logreg" expects
# - verify against the data
imp_methods[intersect("all_star", needs_imputation)] <- "logreg"

imp <- mice(
  player_stats,
  m = n_imputations,  # Number of imputations
  method = imp_methods,
  maxit = 20,
  seed = 42
)

# Check convergence
plot(imp)

# Pool results from imputed datasets
# Fit model on each imputed dataset
model_imp <- with(imp, lm(war ~ age + I(age^2) + avg + hr))

# Pool estimates
pooled <- pool(model_imp)
summary(pooled)

# Get complete datasets
# Long format stacks all imputations; include = TRUE also keeps the
# original (un-imputed) data.
complete_data <- complete(imp, action = "long", include = TRUE)

# Compare distributions
library(lattice)
densityplot(imp, ~ avg | .imp)

# Sensitivity analysis: compare results across imputations
# Loop over however many imputations were actually generated rather than a
# hard-coded 1:5.
results <- sapply(seq_len(imp$m), function(i) {
  df <- complete(imp, i)
  coef(lm(war ~ age + avg + hr, data = df))
})

# Show variation across imputations
print(round(results, 4))

python

Websocket Live Data Feed

Connect to live sports data websocket feeds.

import asyncio
import websockets
import json
from datetime import datetime
from collections import deque

class LiveDataFeed:
    """Connect to a live sports data websocket feed and buffer its messages.

    Each received frame is parsed as JSON, stored (with a receive timestamp)
    in a bounded buffer of the 1000 most recent messages, and passed to a
    callback.
    """

    def __init__(self, url, on_message_callback=None):
        """Store the feed URL and the message callback.

        Args:
            url: Websocket URL to connect to.
            on_message_callback: Called with each decoded message. May be a
                plain function or a coroutine function. Defaults to
                ``default_handler``, which prints the message.
        """
        self.url = url
        self.callback = on_message_callback or self.default_handler
        self.messages = deque(maxlen=1000)
        self.connected = False

    async def connect(self):
        """Open the connection and process messages until it closes.

        ``connected`` is True while the socket is open and is always reset
        to False on the way out, including when an error propagates.
        """
        try:
            async with websockets.connect(self.url) as ws:
                self.connected = True
                print(f"Connected to {self.url}")

                while True:
                    try:
                        message = await ws.recv()
                    except websockets.ConnectionClosed:
                        print("Connection closed")
                        break
                    await self._handle_message(message)
        finally:
            self.connected = False

    async def _handle_message(self, message):
        """Decode one raw frame, buffer it and hand it to the callback."""
        try:
            data = json.loads(message)
        except (TypeError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError. One malformed frame
            # should not take the whole feed down, so report and move on.
            print(f"Skipping malformed message: {exc}")
            return

        self.messages.append({
            "timestamp": datetime.now(),
            "data": data
        })

        # Support both plain and async callbacks: only await when the
        # callback actually returned something awaitable.
        result = self.callback(data)
        if hasattr(result, "__await__"):
            await result

    async def default_handler(self, data):
        """Default message handler: print the event type and its message."""
        if isinstance(data, dict):
            event_type = data.get("type", "unknown")
            payload = data.get("message", data)
        else:
            # Valid JSON that is not an object (list, number, string, ...)
            event_type = "unknown"
            payload = data
        print(f"[{datetime.now()}] {event_type}: {payload}")

    def get_recent_messages(self, n=10):
        """Return up to the ``n`` most recent buffered messages, oldest first.

        A non-positive ``n`` returns an empty list (a plain ``[-n:]`` slice
        would return the whole buffer for ``n == 0``).
        """
        if n <= 0:
            return []
        return list(self.messages)[-n:]

class SportsScoreTracker:
    """Track live sports scores from a ``LiveDataFeed``.

    ``games`` maps each game id to a dict with ``home_team``, ``away_team``,
    ``home_score`` and ``away_score``.
    """

    def __init__(self):
        self.games = {}
        self.feed = None

    async def on_score_update(self, data):
        """Handle one feed message; only ``score_update`` messages are used.

        The first update for a game must carry ``home_team`` and
        ``away_team``. A score missing from an update keeps its last known
        value instead of being reset to 0, so partial updates (for example
        only the scoring side) do not wipe the other score.
        """
        if data.get("type") != "score_update":
            return

        game_id = data["game_id"]

        game = self.games.get(game_id)
        if game is None:
            game = {
                "home_team": data["home_team"],
                "away_team": data["away_team"],
                "home_score": 0,
                "away_score": 0
            }
            self.games[game_id] = game

        game["home_score"] = data.get("home_score", game["home_score"])
        game["away_score"] = data.get("away_score", game["away_score"])

        print(f"{game['away_team']} {game['away_score']} @ "
              f"{game['home_team']} {game['home_score']}")

    async def start(self, feed_url):
        """Start tracking scores: connect to the feed and run until it closes."""
        self.feed = LiveDataFeed(feed_url, self.on_score_update)
        await self.feed.connect()

# Usage
async def main():
    """Run a score tracker against the example live-scores feed."""
    feed_url = "wss://live-scores.example.com/feed"
    await SportsScoreTracker().start(feed_url)

# asyncio.run(main())

python

Sports Data Logger

Structured logging for sports analytics pipelines.

import logging
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict
import sys

class SportsDataLogger:
    """Structured logging for sports analytics.

    Messages go to stdout (INFO and above, plain text) and to a dated
    JSON-lines file ``<log_dir>/<name>_<YYYYMMDD>.jsonl`` (DEBUG and above).
    The structured fields are attached to each record through ``extra``.
    """

    def __init__(self, name: str, log_dir: str = "./logs"):
        """Create the log directory and set up the console and file handlers.

        Args:
            name: Logger name; also the prefix of the log file name.
            log_dir: Directory for the ``.jsonl`` file. Missing parent
                directories are created.
        """
        self.name = name
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        self.logger = logging.getLogger(name)
        self.logger.setLevel(logging.DEBUG)

        # logging.getLogger returns the same object for the same name, so a
        # second SportsDataLogger with this name would otherwise stack a
        # second pair of handlers and every line would be written twice.
        if not self.logger.handlers:
            # Console handler
            console = logging.StreamHandler(sys.stdout)
            console.setLevel(logging.INFO)
            console.setFormatter(logging.Formatter(
                "%(asctime)s | %(levelname)s | %(message)s"
            ))
            self.logger.addHandler(console)

            # File handler (JSON lines)
            file_path = self.log_dir / f"{name}_{datetime.now():%Y%m%d}.jsonl"
            file_handler = logging.FileHandler(file_path)
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(JsonFormatter())
            self.logger.addHandler(file_handler)

    def log_data_load(self, source: str, records: int, duration: float):
        """Log a data loading operation (INFO)."""
        self.logger.info(
            f"Loaded {records} records from {source}",
            extra={
                "event_type": "data_load",
                "source": source,
                "records": records,
                "duration_seconds": duration
            }
        )

    def log_model_train(self, model_name: str, metrics: Dict[str, float]):
        """Log a model training run with its metrics (INFO)."""
        self.logger.info(
            f"Trained model {model_name}",
            extra={
                "event_type": "model_train",
                "model": model_name,
                "metrics": metrics
            }
        )

    def log_prediction(self, model: str, input_data: Dict, prediction: Any):
        """Log a single prediction (DEBUG, so it reaches the file only)."""
        self.logger.debug(
            f"Prediction from {model}",
            extra={
                "event_type": "prediction",
                "model": model,
                "input": input_data,
                "prediction": prediction
            }
        )

    def log_error(self, error: Exception, context: Dict = None):
        """Log an error together with its traceback and optional context.

        The exception object itself is passed as ``exc_info`` so the
        traceback is recorded even when this is called outside an ``except``
        block; ``exc_info=True`` would log "NoneType: None" in that case.
        """
        self.logger.error(
            str(error),
            extra={
                "event_type": "error",
                "error_type": type(error).__name__,
                "context": context or {}
            },
            exc_info=error
        )

class JsonFormatter(logging.Formatter):
    """Format log records as single-line JSON objects."""

    # Record attributes (set through ``extra=``) that are copied into the
    # JSON output when present.
    _EXTRA_FIELDS = (
        "event_type", "source", "records", "duration_seconds",
        "model", "metrics", "input", "prediction", "error_type", "context",
    )

    def format(self, record):
        """Return ``record`` serialised as a JSON string.

        The output always has ``timestamp`` (UTC, naive ISO 8601), ``level``,
        ``logger`` and ``message``; the known extra fields are added when the
        record carries them, and ``exception`` holds the formatted traceback
        when the record has exception info.
        """
        from datetime import timezone

        # Use the time the record was created rather than the time it is
        # formatted, and avoid the deprecated datetime.utcnow(). tzinfo is
        # stripped so the string keeps its offset-free shape.
        created = datetime.fromtimestamp(record.created, tz=timezone.utc)
        log_data = {
            "timestamp": created.replace(tzinfo=None).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage()
        }

        # Add extra fields
        for key in self._EXTRA_FIELDS:
            if hasattr(record, key):
                log_data[key] = getattr(record, key)

        # Keep the traceback; exc_info can be a (None, None, None) tuple when
        # exc_info=True was passed outside an except block.
        if record.exc_info and record.exc_info[0] is not None:
            log_data["exception"] = self.formatException(record.exc_info)

        # default=str is deliberate: extras such as predictions or inputs may
        # hold values json cannot encode (datetimes, numpy scalars, ...), and
        # a stringified value is better than losing the whole log line.
        return json.dumps(log_data, default=str)

# Usage
# Creating the logger also creates ./logs (the default log_dir) and opens
# today's mlb_analytics_<YYYYMMDD>.jsonl file inside it.
logger = SportsDataLogger("mlb_analytics")

# Log data operations
import time
start = time.time()
# ... load data ...
# `records` is a fixed example value here; `duration` is the elapsed wall time.
logger.log_data_load("statcast_api", records=50000, duration=time.time() - start)

# Log model training
# Example metric values only; nothing is trained in this snippet.
logger.log_model_train("war_predictor", {
    "rmse": 0.85,
    "r2": 0.78,
    "mae": 0.62
})

python

Sports Alert System

Generate alerts for notable sports events and statistics.

from dataclasses import dataclass
from datetime import datetime
from typing import List, Callable, Dict, Any
from enum import Enum
import json

class AlertPriority(Enum):
    """Alert severity levels, numbered from LOW (1) up to CRITICAL (4)."""
    # This is a plain Enum, so members are compared by identity/equality;
    # use `.value` when an ordering between priorities is needed.
    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CRITICAL = 4

@dataclass
class Alert:
    """Sports alert produced when an AlertRule matches a data payload."""
    # "<rule name>_<unix timestamp>" as built by AlertRule.check
    id: str
    # Short headline; AlertRule.check falls back to the rule name
    title: str
    # Human-readable description; may be an empty string
    message: str
    # Severity taken from the rule that fired
    priority: AlertPriority
    # When the alert was created
    timestamp: datetime
    # The payload that was checked against the rule
    data: Dict[str, Any]
    # Free-form grouping label taken from the rule
    category: str

class AlertRule:
    """Rule for generating alerts.

    A rule wraps a ``condition`` callable. The callable receives the data
    payload and returns a falsy value when nothing should fire, or a dict
    with optional ``title`` and ``message`` keys when an alert should be
    raised.
    """

    def __init__(self, name: str, condition: Callable, priority: AlertPriority, category: str):
        """Store the rule's name, condition, priority and category."""
        self.name = name
        self.condition = condition
        self.priority = priority
        self.category = category

    def check(self, data: Dict) -> "Alert | None":
        """Check if the rule triggers an alert for ``data``.

        Returns:
            An ``Alert`` when the condition returns a truthy value,
            otherwise ``None``. A truthy result that is not a dict (for
            example ``True``) still fires, using the rule name as the title
            and an empty message.
        """
        result = self.condition(data)
        if not result:
            return None

        details = result if isinstance(result, dict) else {}

        # Read the clock once so the id and the timestamp field agree.
        now = datetime.now()
        return Alert(
            id=f"{self.name}_{now.timestamp()}",
            title=details.get("title", self.name),
            message=details.get("message", ""),
            priority=self.priority,
            timestamp=now,
            data=data,
            category=self.category
        )

class SportsAlertSystem:
    """Registry of alert rules and notification handlers.

    Triggered alerts are kept in ``alerts`` and pushed to every registered
    handler as they fire.
    """

    def __init__(self):
        self.rules: List[AlertRule] = []
        self.alerts: List[Alert] = []
        self.handlers: List[Callable] = []

    def add_rule(self, rule: AlertRule):
        """Register a rule to be evaluated by ``check_all``."""
        self.rules.append(rule)

    def add_handler(self, handler: Callable):
        """Register a notification function that receives each alert."""
        self.handlers.append(handler)

    def check_all(self, data: Dict) -> List[Alert]:
        """Run every rule against ``data`` and return the alerts that fired.

        Each alert is also stored in ``alerts`` and dispatched to the
        handlers before the next rule is evaluated.
        """
        fired = []
        for rule in self.rules:
            alert = rule.check(data)
            if not alert:
                continue
            fired.append(alert)
            self.alerts.append(alert)
            self._dispatch(alert)
        return fired

    def _dispatch(self, alert: Alert):
        """Send ``alert`` to all handlers; a failing handler is reported and skipped."""
        for notify in self.handlers:
            try:
                notify(alert)
            except Exception as e:
                print(f"Handler error: {e}")

# Example rules
def milestone_rule(data):
    """Fire when ``career_hr`` equals one of the listed home-run totals.

    Returns a title/message dict on a match, otherwise ``None``. A matching
    payload must also contain ``player_name``.
    """
    milestones = [500, 600, 700, 714, 755, 762]
    if data.get("career_hr", 0) not in milestones:
        return None

    message = f"{data['player_name']} hit career HR #{data['career_hr']}"
    return {"title": "Career HR Milestone!", "message": message}

def injury_rule(data):
    """Fire when ``injury_status`` is ``"out"``.

    Returns a title/message dict on a match, otherwise ``None``. The message
    uses ``injury_type`` when present and ``"Unknown"`` otherwise.
    """
    if data.get("injury_status") != "out":
        return None

    message = f"{data['player_name']} placed on IL: {data.get('injury_type', 'Unknown')}"
    return {"title": "Player Injury", "message": message}

def blowout_rule(data):
    """Fire when the score margin is at least 15 in the 7th inning or later.

    Missing scores and a missing inning are treated as 0. Returns a
    title/message dict on a match, otherwise ``None``.
    """
    home = data.get("home_score", 0)
    away = data.get("away_score", 0)
    margin = abs(home - away)

    is_blowout = margin >= 15 and data.get("inning", 0) >= 7
    if not is_blowout:
        return None

    message = f"Large margin ({margin}) in {data['away_team']} @ {data['home_team']}"
    return {"title": "Blowout Alert", "message": message}

# Setup system
# Rules are evaluated in the order they are added here.
alert_system = SportsAlertSystem()
alert_system.add_rule(AlertRule("milestone", milestone_rule, AlertPriority.HIGH, "player"))
alert_system.add_rule(AlertRule("injury", injury_rule, AlertPriority.CRITICAL, "health"))
alert_system.add_rule(AlertRule("blowout", blowout_rule, AlertPriority.LOW, "game"))

# Add handlers
# A single handler that prints "[PRIORITY] title: message" for each alert.
alert_system.add_handler(lambda a: print(f"[{a.priority.name}] {a.title}: {a.message}"))

# Check data
# Only the milestone rule matches this payload (career_hr is 700); the
# injury and blowout rules find none of their keys and return None.
alerts = alert_system.check_all({
    "player_name": "Albert Pujols",
    "career_hr": 700
})

python

WAR Components Breakdown

Calculate and visualize WAR component breakdown.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def calculate_war_components(player_stats, runs_per_win=10, lg_woba=0.320,
                             woba_scale=1.25, replacement_runs_per_600_pa=20):
    """Calculate individual (simplified) WAR components.

    Args:
        player_stats: DataFrame with ``woba``, ``pa``, ``stolen_bases``,
            ``caught_stealing``, ``extra_bases_taken``, ``position`` and
            ``games`` columns, and optionally ``uzr`` or ``drs``.
        runs_per_win: Runs that equal one win.
        lg_woba: League-average wOBA used as the batting baseline.
        woba_scale: wOBA scale used to convert wOBA points into runs.
        replacement_runs_per_600_pa: Replacement-level credit per 600 PA.

    Returns:
        A copy of ``player_stats`` with ``batting_runs``,
        ``baserunning_runs``, ``fielding_runs``, ``position_adj``,
        ``league_adj``, ``replacement_runs`` and ``war`` columns added.
        Positions that are not in the adjustment table map to NaN, which
        makes ``war`` NaN for those rows.
    """
    stats = player_stats.copy()

    # Batting Runs (wRAA)
    stats["batting_runs"] = (
        (stats["woba"] - lg_woba) / woba_scale * stats["pa"]
    )

    # Baserunning Runs
    stats["baserunning_runs"] = (
        stats["stolen_bases"] * 0.2 -
        stats["caught_stealing"] * 0.4 +
        stats["extra_bases_taken"] * 0.15
    )

    # Fielding Runs (using UZR or DRS): prefer UZR, fall back to DRS, and
    # use 0 when neither column is available.
    if "uzr" in stats.columns:
        stats["fielding_runs"] = stats["uzr"]
    elif "drs" in stats.columns:
        stats["fielding_runs"] = stats["drs"]
    else:
        stats["fielding_runs"] = 0

    # Positional Adjustment: full-season run values, prorated by games played
    position_adj = {
        "C": 12.5, "SS": 7.5, "2B": 2.5, "CF": 2.5, "3B": 2.5,
        "RF": -7.5, "LF": -7.5, "1B": -12.5, "DH": -17.5
    }
    stats["position_adj"] = (
        stats["position"].map(position_adj) * stats["games"] / 162
    )

    # League Adjustment (simplified)
    stats["league_adj"] = 0

    # Replacement Level (default: about 20 runs per 600 PA)
    stats["replacement_runs"] = stats["pa"] / 600 * replacement_runs_per_600_pa

    # Total WAR: sum of all run components converted to wins
    run_components = [
        "batting_runs", "baserunning_runs", "fielding_runs",
        "position_adj", "league_adj", "replacement_runs",
    ]
    total_runs = stats[run_components[0]]
    for column in run_components[1:]:
        total_runs = total_runs + stats[column]
    stats["war"] = total_runs / runs_per_win

    return stats

def visualize_war_breakdown(player_stats, player_name):
    """Plot one player's WAR run components as a bar chart and a waterfall.

    Args:
        player_stats: DataFrame with a ``name`` column and the run-component
            columns ``batting_runs``, ``baserunning_runs``,
            ``fielding_runs``, ``position_adj`` and ``replacement_runs``.
        player_name: Name to look up; the first matching row is used and an
            ``IndexError`` is raised when there is none.

    Returns:
        The matplotlib figure holding both charts.
    """
    row = player_stats[player_stats["name"] == player_name].iloc[0]

    component_columns = [
        ("Batting", "batting_runs"),
        ("Baserunning", "baserunning_runs"),
        ("Fielding", "fielding_runs"),
        ("Position", "position_adj"),
        ("Replacement", "replacement_runs"),
    ]
    components = {label: row[column] for label, column in component_columns}
    labels = list(components.keys())
    values = list(components.values())

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left: horizontal bars, green for positive and red for negative runs
    bar_colors = ["green" if v >= 0 else "red" for v in values]
    ax1.barh(labels, values, color=bar_colors)
    ax1.axvline(x=0, color="black", linewidth=0.5)
    ax1.set_xlabel("Runs")
    ax1.set_title(f"{player_name} WAR Components")

    # Right: waterfall, each bar starting where the previous one ended
    running_total = 0
    for label, val in zip(labels, values):
        bar_color = "green" if val >= 0 else "red"
        ax2.bar(label, val, bottom=running_total, color=bar_color, edgecolor="black")
        running_total = running_total + val

    # Final bar: total runs divided by 10, i.e. expressed in wins
    total = sum(values)
    ax2.bar("Total WAR", total / 10, color="navy")
    ax2.axhline(y=0, color="black", linewidth=0.5)
    ax2.set_ylabel("Runs")
    ax2.set_title(f"{player_name} WAR Waterfall")

    plt.tight_layout()
    return fig

# Calculate and visualize
# NOTE: `raw_stats_df` is not defined in this snippet and must be loaded first.
player_stats = calculate_war_components(raw_stats_df)
# Looks the player up by exact match on the `name` column.
fig = visualize_war_breakdown(player_stats, "Mike Trout")

python

Rolling Stats Calculator

Calculate various rolling statistics for time series analysis.

import pandas as pd
import numpy as np
from typing import List, Dict

class RollingStatsCalculator:
    """Calculate rolling statistics for sports data.

    The calculator keeps its own copy of the input, sorted by ``date_col``.
    Returned Series keep the original index labels. Windows are counted in
    rows, not calendar days.
    """

    def __init__(self, df: pd.DataFrame, date_col: str = "date"):
        """Store a date-sorted copy of ``df``."""
        self.df = df.sort_values(date_col).copy()
        self.date_col = date_col

    def rolling_avg(self, value_col: str, window: int, min_periods: int = 1) -> pd.Series:
        """Simple rolling average; partial windows are allowed by default."""
        return self.df[value_col].rolling(window=window, min_periods=min_periods).mean()

    def weighted_rolling_avg(self, value_col: str, window: int) -> pd.Series:
        """Exponentially weighted average with ``span=window``."""
        return self.df[value_col].ewm(span=window, adjust=False).mean()

    def rolling_sum(self, value_col: str, window: int) -> pd.Series:
        """Rolling sum over full windows (NaN until ``window`` rows exist)."""
        return self.df[value_col].rolling(window=window).sum()

    def rolling_percentile(self, value_col: str, window: int, percentile: float) -> pd.Series:
        """Rolling quantile; ``percentile`` is a fraction between 0 and 1."""
        return self.df[value_col].rolling(window=window).quantile(percentile)

    def rolling_zscore(self, value_col: str, window: int) -> pd.Series:
        """Rolling z-score (how many std from rolling mean)."""
        rolling_mean = self.df[value_col].rolling(window=window).mean()
        rolling_std = self.df[value_col].rolling(window=window).std()
        return (self.df[value_col] - rolling_mean) / rolling_std

    def calculate_all_rolling(self, value_cols: List[str], windows: List[int]) -> pd.DataFrame:
        """Return a copy of the data with ``<col>_roll<window>_avg`` and
        ``<col>_roll<window>_sum`` columns for every column/window pair."""
        result = self.df.copy()

        for col in value_cols:
            for window in windows:
                result[f"{col}_roll{window}_avg"] = self.rolling_avg(col, window)
                result[f"{col}_roll{window}_sum"] = self.rolling_sum(col, window)

        return result

    def pace_adjusted_rolling(self, value_col: str, attempts_col: str, window: int) -> pd.Series:
        """Calculate pace-adjusted rolling (e.g., per-PA or per-100-possessions).

        This is the rolling sum of ``value_col`` divided by the rolling sum
        of ``attempts_col`` over the same window.
        """
        rolling_value = self.df[value_col].rolling(window=window).sum()
        rolling_attempts = self.df[attempts_col].rolling(window=window).sum()
        return rolling_value / rolling_attempts

    def hot_cold_streak(self, value_col: str, threshold_pct: float = 0.75, min_streak: int = 5) -> pd.DataFrame:
        """Identify hot and cold streaks.

        A row is "hot" when its value is at or above the ``threshold_pct``
        quantile of the column and "cold" when it is at or below the
        ``1 - threshold_pct`` quantile.

        Adds these columns to the calculator's internal DataFrame and
        returns that DataFrame (not a copy):

        * ``above_threshold`` / ``below_threshold``: per-row flags.
        * ``hot_streak`` / ``cold_streak``: running length of the current
          run of consecutive hot/cold rows (0 on a non-matching row).
        * ``is_hot_streak`` / ``is_cold_streak``: True once the current run
          has reached ``min_streak`` rows.
        """
        threshold_high = self.df[value_col].quantile(threshold_pct)
        threshold_low = self.df[value_col].quantile(1 - threshold_pct)

        self.df["above_threshold"] = self.df[value_col] >= threshold_high
        self.df["below_threshold"] = self.df[value_col] <= threshold_low

        # Find consecutive streaks: every non-matching row starts a new
        # group, and the cumulative sum of the flag inside a group is the
        # running streak length.
        self.df["hot_streak"] = (
            self.df["above_threshold"]
            .groupby((~self.df["above_threshold"]).cumsum())
            .cumsum()
        )
        self.df["cold_streak"] = (
            self.df["below_threshold"]
            .groupby((~self.df["below_threshold"]).cumsum())
            .cumsum()
        )

        # Apply the minimum length so short runs are not reported as streaks
        self.df["is_hot_streak"] = self.df["hot_streak"] >= min_streak
        self.df["is_cold_streak"] = self.df["cold_streak"] >= min_streak

        return self.df

# Usage
# NOTE: `game_log_df` is not defined in this snippet and must be loaded first.
# The calculator works on its own date-sorted copy of the frame.
calc = RollingStatsCalculator(game_log_df, date_col="game_date")

# Add rolling stats
# The returned Series keep the original index labels, so assigning them back
# to game_log_df lines the values up by index rather than by position.
game_log_df["avg_roll20"] = calc.rolling_avg("batting_avg", window=20)
game_log_df["avg_roll50_ewm"] = calc.weighted_rolling_avg("batting_avg", window=50)
game_log_df["hr_roll30_sum"] = calc.rolling_sum("home_runs", window=30)
game_log_df["avg_zscore"] = calc.rolling_zscore("batting_avg", window=50)

# Pace-adjusted (batting avg = hits/AB)
game_log_df["avg_roll20_calc"] = calc.pace_adjusted_rolling("hits", "at_bats", window=20)

print(game_log_df[["game_date", "batting_avg", "avg_roll20", "avg_zscore"]].tail(20))

python

Sports Percentile Ranking

Calculate percentile rankings across various statistics.

import pandas as pd
import numpy as np
from scipy import stats

def percentile_rank(series: pd.Series) -> pd.Series:
    """Return each value's percentile rank within ``series`` on a 0-100 scale.

    Ties share their average rank.
    """
    fractional_rank = series.rank(pct=True)
    return fractional_rank * 100

def percentile_rank_grouped(df: pd.DataFrame, value_col: str, group_col: str) -> pd.Series:
    """Return each row's 0-100 percentile rank of ``value_col`` within its
    ``group_col`` group, aligned to ``df``'s index."""
    def _pct_rank(values):
        return values.rank(pct=True) * 100

    grouped_values = df.groupby(group_col)[value_col]
    return grouped_values.transform(_pct_rank)

class PlayerPercentileProfile:
    """Calculate comprehensive percentile profile for players.

    Percentiles are oriented so that a higher number is better. Stats listed
    in ``LOWER_IS_BETTER`` are ranked in reverse by both
    ``calculate_percentiles`` and ``create_profile``.
    """

    # Stats where a smaller raw value is the better outcome.
    LOWER_IS_BETTER = frozenset({"k_pct"})

    def __init__(self, league_stats_df: pd.DataFrame):
        """Store the league-wide reference table (one row per player)."""
        self.league_stats = league_stats_df

    def calculate_percentiles(self, player_stats: dict) -> dict:
        """Calculate the league percentile (0-100) for each stat of one player.

        Args:
            player_stats: Mapping of stat name to the player's value.

        Returns:
            Mapping of stat name to percentile, rounded to one decimal.
            Stats that are not league columns, or whose league column has no
            non-missing values, are left out. For lower-is-better stats the
            percentile is computed on the negated values, so a low raw value
            gets a high percentile.
        """
        percentiles = {}

        for stat, value in player_stats.items():
            if stat not in self.league_stats.columns:
                continue

            league_values = self.league_stats[stat].dropna()
            if league_values.empty:
                # Nothing to compare against
                continue

            if stat in self.LOWER_IS_BETTER:
                percentile = stats.percentileofscore(-league_values, -value)
            else:
                percentile = stats.percentileofscore(league_values, value)
            percentiles[stat] = round(percentile, 1)

        return percentiles

    def create_profile(self, player_df: pd.DataFrame) -> pd.DataFrame:
        """Create percentile profile for all players.

        Returns a copy of ``player_df`` with a ``<stat>_pct`` column (0-100,
        ranked within ``player_df`` itself) for every known stat column that
        is present.
        """
        stats_cols = ["avg", "obp", "slg", "hr", "rbi", "sb", "war",
                      "k_pct", "bb_pct", "iso", "babip", "wrc_plus"]

        result = player_df.copy()

        for col in stats_cols:
            if col in result.columns:
                # Higher is better for most stats
                higher_better = col not in self.LOWER_IS_BETTER
                result[f"{col}_pct"] = result[col].rank(pct=True, ascending=higher_better) * 100

        return result

    def compare_to_position(self, player_stats: dict, position: str) -> dict:
        """Compare a player to the league players at one position.

        Returns:
            Mapping of stat name to a dict with the player's ``value``, the
            ``position_avg``, the difference ``vs_avg``, the ``z_score``
            (0 when the standard deviation is zero or undefined) and the raw
            ``percentile`` within the position group. The percentile here is
            not flipped for lower-is-better stats.
        """
        pos_stats = self.league_stats[self.league_stats["position"] == position]

        comparison = {}
        for stat, value in player_stats.items():
            if stat in pos_stats.columns:
                pos_avg = pos_stats[stat].mean()
                pos_std = pos_stats[stat].std()

                comparison[stat] = {
                    "value": value,
                    "position_avg": pos_avg,
                    "vs_avg": value - pos_avg,
                    "z_score": (value - pos_avg) / pos_std if pos_std > 0 else 0,
                    "percentile": stats.percentileofscore(pos_stats[stat].dropna(), value)
                }

        return comparison

def visualize_percentile_profile(percentiles: dict, player_name: str):
    """Draw a horizontal bar chart of a player's percentile ranks.

    Bars are green at or above the 70th percentile, orange from the 30th up
    to the 70th, and red below that. Dashed/dotted guides mark the 50th, 75th
    and 25th percentiles, and each bar is labelled with its rounded value.

    Args:
        percentiles: Mapping of stat name to percentile (0-100).
        player_name: Name used in the chart title.

    Returns:
        The matplotlib figure containing the chart.
    """
    import matplotlib.pyplot as plt

    def bar_color(score):
        if score >= 70:
            return "green"
        if score >= 30:
            return "orange"
        return "red"

    labels = list(percentiles)
    scores = [percentiles[label] for label in labels]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(labels, scores, color=[bar_color(s) for s in scores], edgecolor="black")

    # Reference guides: median first, then the upper and lower quartiles.
    guides = (
        (50, "gray", "--", 0.5),
        (75, "blue", ":", 0.3),
        (25, "blue", ":", 0.3),
    )
    for position, color, style, opacity in guides:
        ax.axvline(x=position, color=color, linestyle=style, alpha=opacity)

    ax.set_xlim(0, 100)
    ax.set_xlabel("Percentile")
    ax.set_title(f"{player_name} Percentile Profile")

    # Value label just to the right of each bar's end.
    for row, score in enumerate(scores):
        ax.text(score + 2, row, f"{score:.0f}", va="center")

    return fig

# Usage
# NOTE: `league_wide_stats_df` is not defined in this snippet; it must be a
# DataFrame supplied by the caller before this code runs.
profiler = PlayerPercentileProfile(league_wide_stats_df)

# Get player percentiles
player = {"avg": 0.295, "obp": 0.380, "slg": 0.540, "hr": 35, "war": 6.5}
percentiles = profiler.calculate_percentiles(player)

# Print the stats from highest to lowest percentile.
# NOTE(review): the "th" suffix is appended to every value, so 91 prints as
# "91th percentile".
print("Player Percentile Rankings:")
for stat, pct in sorted(percentiles.items(), key=lambda x: -x[1]):
    print(f"  {stat}: {pct:.0f}th percentile")

python

Sports Data Export Utilities

Export sports data to various formats.

import pandas as pd
import json
from pathlib import Path
from datetime import datetime
from typing import Optional
import xlsxwriter

class SportsDataExporter:
    """Export sports data to CSV, Excel, JSON, Parquet and HTML files.

    Every export is written into ``output_dir`` under a timestamped file name
    (``<name>_<YYYYmmdd_HHMMSS>.<extension>``) and the written path is
    returned to the caller.
    """

    # Worksheet names are truncated to this many characters.
    _MAX_SHEET_NAME = 31
    # Upper bound applied to auto-fitted column widths.
    _MAX_COLUMN_WIDTH = 50

    def __init__(self, output_dir: str = "./exports"):
        """Create the exporter and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory that receives all exported files. Missing
                parent directories are created as well.
        """
        self.output_dir = Path(output_dir)
        # parents=True so nested targets such as "out/2024/exports" work.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _get_filename(self, base_name: str, extension: str) -> Path:
        """Return ``output_dir/<base_name>_<timestamp>.<extension>``."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return self.output_dir / f"{base_name}_{timestamp}.{extension}"

    @classmethod
    def _column_width(cls, df: pd.DataFrame, column) -> int:
        """Return a display width for ``column``: longest cell or header plus padding."""
        header_len = len(str(column))  # headers are not necessarily strings
        if df.empty:
            # .max() of an empty column is NaN, which is not a usable width.
            longest_cell = 0
        else:
            longest_cell = int(df[column].astype(str).str.len().max())
        return min(max(longest_cell, header_len) + 2, cls._MAX_COLUMN_WIDTH)

    def to_csv(self, df: pd.DataFrame, name: str, **kwargs) -> Path:
        """Write ``df`` to a CSV file without the index.

        Args:
            df: Data to export.
            name: Base file name (timestamp and extension are appended).
            **kwargs: Forwarded to ``DataFrame.to_csv``.

        Returns:
            Path of the written file.
        """
        path = self._get_filename(name, "csv")
        df.to_csv(path, index=False, **kwargs)
        return path

    def to_excel(self, data: dict, name: str, include_summary: bool = True) -> Path:
        """Write several DataFrames to one Excel workbook, one sheet each.

        Args:
            data: Mapping of sheet name to DataFrame. Sheet names are
                truncated to 31 characters.
            name: Base file name (timestamp and extension are appended).
            include_summary: When true, append a "Summary" sheet listing the
                row and column count of every exported sheet.

        Returns:
            Path of the written workbook.
        """
        path = self._get_filename(name, "xlsx")

        with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
            header_fmt = writer.book.add_format({
                "bold": True,
                "bg_color": "#4472C4",
                "font_color": "white"
            })

            for sheet_name, df in data.items():
                title = sheet_name[:self._MAX_SHEET_NAME]
                df.to_excel(writer, sheet_name=title, index=False)
                worksheet = writer.sheets[title]

                for col_num, column in enumerate(df.columns):
                    # Re-write the header cell so it picks up the custom format.
                    worksheet.write(0, col_num, column, header_fmt)
                    worksheet.set_column(col_num, col_num, self._column_width(df, column))

            if include_summary:
                # Distinct loop names: the `name` parameter must not be rebound.
                summary_rows = [
                    {"Sheet": sheet_name, "Rows": len(df), "Columns": len(df.columns)}
                    for sheet_name, df in data.items()
                ]
                pd.DataFrame(summary_rows, columns=["Sheet", "Rows", "Columns"]).to_excel(
                    writer, sheet_name="Summary", index=False
                )

        return path

    def to_json(self, df: pd.DataFrame, name: str, orient: str = "records") -> Path:
        """Write ``df`` to an indented JSON file with ISO-formatted dates.

        Args:
            df: Data to export.
            name: Base file name (timestamp and extension are appended).
            orient: Layout passed to ``DataFrame.to_json``.

        Returns:
            Path of the written file.
        """
        path = self._get_filename(name, "json")
        df.to_json(path, orient=orient, indent=2, date_format="iso")
        return path

    def to_parquet(self, df: pd.DataFrame, name: str) -> Path:
        """Write ``df`` to a Parquet file without the index.

        Returns:
            Path of the written file.
        """
        path = self._get_filename(name, "parquet")
        df.to_parquet(path, index=False)
        return path

    def to_html_report(self, df: pd.DataFrame, name: str, title: str = "Report") -> Path:
        """Write ``df`` as a styled, standalone HTML page.

        The title is HTML-escaped before being embedded and the file is
        written as UTF-8 (declared in the page) so non-ASCII names survive on
        every platform.

        Args:
            df: Data rendered as the report table.
            name: Base file name (timestamp and extension are appended).
            title: Page title and heading.

        Returns:
            Path of the written file.
        """
        from html import escape  # function-scope import keeps the file header unchanged

        path = self._get_filename(name, "html")

        safe_title = escape(title)
        generated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        table = df.to_html(index=False, classes="data-table")

        document = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>{safe_title}</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                h1 {{ color: #333; }}
                table {{ border-collapse: collapse; width: 100%; }}
                th {{ background-color: #4472C4; color: white; padding: 10px; text-align: left; }}
                td {{ border: 1px solid #ddd; padding: 8px; }}
                tr:nth-child(even) {{ background-color: #f2f2f2; }}
                tr:hover {{ background-color: #ddd; }}
            </style>
        </head>
        <body>
            <h1>{safe_title}</h1>
            <p>Generated: {generated}</p>
            <p>Records: {len(df)}</p>
            {table}
        </body>
        </html>
        """

        path.write_text(document, encoding="utf-8")
        return path

# Usage
# NOTE: player_stats_df, batting_df, pitching_df, fielding_df and
# large_dataset_df are not defined in this snippet; the caller must provide
# them as DataFrames.
exporter = SportsDataExporter("./exports")

# Export player stats
csv_path = exporter.to_csv(player_stats_df, "player_stats")
print(f"CSV exported to: {csv_path}")

# Export multiple sheets to Excel
# (dict keys become the worksheet names)
excel_path = exporter.to_excel({
    "Batting": batting_df,
    "Pitching": pitching_df,
    "Fielding": fielding_df
}, "full_stats_export")
print(f"Excel exported to: {excel_path}")

# Export to Parquet for big data
parquet_path = exporter.to_parquet(large_dataset_df, "historical_data")

python

Team Roster Constructor

Build and manage team rosters with salary cap considerations.

from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import date

@dataclass
class Player:
    """Player on roster."""
    id: int                  # identifier matched by Roster.remove_player
    name: str
    position: str            # matched exactly by Roster.get_by_position
    salary: float            # counted against the cap while status is "active"
    war: float               # summed into the roster's total WAR while active
    contract_years: int = 1
    status: str = "active"   # only "active" players count in roster totals


@dataclass
class Roster:
    """Team roster with salary-cap bookkeeping.

    Only players whose ``status`` is ``"active"`` count toward salary, WAR,
    positional and head-count totals; players with any other status stay on
    the roster list but are excluded from those figures.
    """
    team_name: str
    salary_cap: float
    players: List[Player] = field(default_factory=list)

    def _active(self) -> List[Player]:
        """Return the players that currently count toward roster totals."""
        return [p for p in self.players if p.status == "active"]

    @property
    def total_salary(self) -> float:
        """Combined salary of all active players."""
        return sum(p.salary for p in self._active())

    @property
    def cap_space(self) -> float:
        """Salary cap minus the active payroll (negative when over the cap)."""
        return self.salary_cap - self.total_salary

    @property
    def total_war(self) -> float:
        """Combined WAR of all active players."""
        return sum(p.war for p in self._active())

    def add_player(self, player: Player) -> bool:
        """Add ``player`` if their salary fits in the remaining cap space.

        Returns:
            True when the player was added, False when the salary exceeds the
            available cap space (the roster is left unchanged).
        """
        if player.salary <= self.cap_space:
            self.players.append(player)
            return True
        return False

    def remove_player(self, player_id: int) -> Optional[Player]:
        """Remove and return the first player with ``player_id``, or None."""
        for i, p in enumerate(self.players):
            if p.id == player_id:
                return self.players.pop(i)
        return None

    def get_by_position(self, position: str) -> List[Player]:
        """Return the active players listed at ``position``."""
        return [p for p in self._active() if p.position == position]

    def to_dataframe(self):
        """Convert the roster (active and inactive players) to a DataFrame.

        The column set is fixed, so an empty roster still yields a frame with
        the name/position/salary/war/status columns instead of no columns.
        """
        import pandas as pd
        columns = ["name", "position", "salary", "war", "status"]
        rows = [
            {"name": p.name, "position": p.position, "salary": p.salary,
             "war": p.war, "status": p.status}
            for p in self.players
        ]
        return pd.DataFrame(rows, columns=columns)

    def summary(self) -> Dict:
        """Return headline roster figures.

        ``dollars_per_war`` is 0 when total WAR is not positive.
        ``by_position`` counts active players for a fixed list of baseball
        positions; players at other positions are not counted there.
        """
        total_salary = self.total_salary
        total_war = self.total_war
        return {
            "team": self.team_name,
            "players": len(self._active()),
            "total_salary": total_salary,
            "cap_space": self.salary_cap - total_salary,
            "total_war": total_war,
            "dollars_per_war": total_salary / total_war if total_war > 0 else 0,
            "by_position": {
                pos: len(self.get_by_position(pos))
                for pos in ["C", "1B", "2B", "SS", "3B", "LF", "CF", "RF", "DH", "SP", "RP"]
            }
        }

# Usage
roster = Roster("Yankees", salary_cap=250_000_000)

# Add players
# Positional Player arguments: id, name, position, salary, war, contract_years
roster.add_player(Player(1, "Aaron Judge", "RF", 40_000_000, 8.0, 9))
roster.add_player(Player(2, "Gerrit Cole", "SP", 36_000_000, 5.5, 5))
roster.add_player(Player(3, "Anthony Rizzo", "1B", 17_000_000, 2.0, 2))

# Dollar figures are printed in millions.
print(f"Total Salary: ${roster.total_salary/1e6:.1f}M")
print(f"Cap Space: ${roster.cap_space/1e6:.1f}M")
print(f"Total WAR: {roster.total_war:.1f}")
print(f"\nRoster Summary:")
print(roster.summary())

python

Sports Calendar Utilities

Manage sports schedules and calendar operations.

from datetime import datetime, timedelta, date
from typing import List, Dict, Optional
import calendar

class SportsCalendar:
    """Sports calendar and schedule utilities.

    Season boundaries come from the fixed (month, day) entries in
    ``SEASON_DATES``. All range checks are inclusive of the whole last day and
    accept either ``date`` or ``datetime`` values.
    """

    SEASON_DATES = {
        "MLB": {
            "spring_training": (2, 20),
            "opening_day": (3, 28),
            "all_star_break": (7, 14, 7, 18),  # Start and end
            "regular_season_end": (9, 30),
            "postseason_end": (11, 5),
            "games_per_season": 162
        },
        "NBA": {
            "preseason_start": (10, 1),
            "regular_season_start": (10, 22),
            "all_star_break": (2, 14, 2, 20),
            "regular_season_end": (4, 14),
            "playoffs_end": (6, 20),
            "games_per_season": 82
        },
        "NFL": {
            "preseason_start": (8, 3),
            "regular_season_start": (9, 7),
            "bye_weeks": (5, 14),
            "regular_season_end": (1, 8),
            "super_bowl": (2, 11),
            "games_per_season": 17
        }
    }

    # Leagues whose season starts in one calendar year and ends in the next.
    _CROSS_YEAR_LEAGUES = ("NBA", "NFL")

    def __init__(self, league: str):
        """Select the season table for ``league`` (case-insensitive).

        Unknown leagues get an empty season table.
        """
        self.league = league.upper()
        self.season_info = self.SEASON_DATES.get(self.league, {})

    @staticmethod
    def _as_date(dt) -> date:
        """Drop the time-of-day part so range checks cover the whole day."""
        return dt.date() if isinstance(dt, datetime) else dt

    def _regular_season_bounds(self, year: int) -> Optional[tuple]:
        """Return (first_day, last_day) of the regular season starting in ``year``.

        Returns None when the league has no start or end entry.
        """
        info = self.season_info
        start_md = info.get("opening_day", info.get("regular_season_start"))
        end_md = info.get("regular_season_end")
        if start_md is None or end_md is None:
            return None

        start_md, end_md = tuple(start_md[:2]), tuple(end_md[:2])
        # An end (month, day) earlier than the start means the season rolls
        # over into the next calendar year.
        end_year = year + 1 if end_md < start_md else year
        return date(year, *start_md), date(end_year, *end_md)

    def get_season_year(self, dt: datetime) -> int:
        """Return the year in which the season containing ``dt`` started.

        For cross-year leagues (NBA, NFL) dates before July belong to the
        season that began the previous calendar year. All other leagues,
        including unknown ones, use the calendar year.
        """
        if self.league in self._CROSS_YEAR_LEAGUES and dt.month < 7:
            return dt.year - 1
        return dt.year

    def is_regular_season(self, dt: datetime) -> bool:
        """Check whether ``dt`` falls within the regular season (inclusive).

        Leagues without season dates are treated as always in season.
        """
        bounds = self._regular_season_bounds(self.get_season_year(dt))
        if bounds is None:
            return True

        start, end = bounds
        return start <= self._as_date(dt) <= end

    def is_all_star_break(self, dt: datetime) -> bool:
        """Check whether ``dt`` falls within the All-Star break (inclusive).

        Returns False for leagues without an ``all_star_break`` entry.
        """
        asb = self.season_info.get("all_star_break")
        if asb is None:
            return False

        year = self.get_season_year(dt)
        if self.league in self._CROSS_YEAR_LEAGUES:
            # The break of a cross-year season is in the season's second year.
            year += 1

        start = date(year, asb[0], asb[1])
        end = date(year, asb[2], asb[3])
        return start <= self._as_date(dt) <= end

    def games_remaining(self, dt: datetime, current_games: int) -> int:
        """Return scheduled games minus ``current_games``.

        ``dt`` is currently unused. Leagues without a ``games_per_season``
        entry fall back to 162.
        """
        total = self.season_info.get("games_per_season", 162)
        return total - current_games

    def generate_schedule_dates(self, year: int) -> List[date]:
        """List every regular-season day of the season starting in ``year``.

        All-Star break days are excluded. For cross-year leagues the range
        runs into ``year + 1``. Leagues without season dates fall back to
        March 28 - September 30.
        """
        bounds = self._regular_season_bounds(year)
        if bounds is None:
            bounds = (date(year, 3, 28), date(year, 9, 30))

        start, end = bounds
        span_days = (end - start).days
        candidates = (start + timedelta(days=offset) for offset in range(span_days + 1))
        return [day for day in candidates if not self.is_all_star_break(day)]

    def get_rest_days(self, game_dates: List[date], current: date) -> int:
        """Return the number of full days off before ``current``.

        Back-to-back games give 0. When no earlier game exists, 7 is returned.
        """
        past_games = [d for d in game_dates if d < current]
        if not past_games:
            return 7
        return (current - max(past_games)).days - 1

# Usage
mlb_cal = SportsCalendar("MLB")
nba_cal = SportsCalendar("NBA")

# The checks below run against the current local time.
today = datetime.now()
print(f"MLB Season Year: {mlb_cal.get_season_year(today)}")
print(f"Is Regular Season: {mlb_cal.is_regular_season(today)}")
print(f"Is All-Star Break: {mlb_cal.is_all_star_break(today)}")

# Generate schedule
schedule_dates = mlb_cal.generate_schedule_dates(2024)
print(f"\n2024 MLB potential game dates: {len(schedule_dates)}")

python

API Rate Limiter

Rate limit API calls for sports data fetching.

import time
from datetime import datetime
from functools import wraps
from typing import Optional, Callable
import threading
from collections import deque

class RateLimiter:
    """Sliding-window rate limiter for API calls.

    Two limits are enforced together: calls per second and calls per minute.
    A rate of at least 1 allows ``int(rate)`` calls in any window; a rate
    below 1 (for example 0.5 calls per second) allows one call per
    proportionally longer window, so slow rates are honoured too.

    The limiter can be used directly via ``wait_if_needed()`` or as a
    decorator.
    """

    def __init__(self, calls_per_second: float = 1.0, calls_per_minute: float = 60.0,
                 *, clock: Optional[Callable[[], float]] = None,
                 sleep: Optional[Callable[[float], None]] = None):
        """Configure the limiter.

        Args:
            calls_per_second: Allowed calls per second (must be positive when
                ``wait_if_needed`` runs; may be changed later).
            calls_per_minute: Allowed calls per minute (same rules).
            clock: Function returning the current time in seconds. Defaults to
                ``time.monotonic`` so wall-clock adjustments cannot distort
                the computed waits.
            sleep: Function used to wait. Defaults to ``time.sleep``.
        """
        self.calls_per_second = calls_per_second
        self.calls_per_minute = calls_per_minute
        self.call_times = deque()  # clock readings of admitted calls, oldest first
        self.lock = threading.Lock()
        self._clock = clock if clock is not None else time.monotonic
        self._sleep = sleep if sleep is not None else time.sleep

    @staticmethod
    def _window(rate: float, base_span: float):
        """Translate ``rate`` calls per ``base_span`` seconds into (limit, span).

        Raises:
            ValueError: If ``rate`` is not positive.
        """
        if rate <= 0:
            raise ValueError(f"rate limit must be positive, got {rate!r}")
        limit = max(1, int(rate))
        # For whole-number rates this is exactly base_span; otherwise the
        # window is scaled so limit / span equals the requested rate.
        return limit, base_span * limit / rate

    def _delay_for(self, now: float, limit: int, span: float) -> float:
        """Return how long to wait until fewer than ``limit`` calls lie in the window."""
        in_window = [t for t in self.call_times if t > now - span]
        if len(in_window) < limit:
            return 0.0
        # The call that has to leave the window is the limit-th most recent one.
        return max(0.0, in_window[-limit] + span - now)

    def wait_if_needed(self):
        """Block until one more call is allowed, then record it.

        The lock is held while sleeping, so concurrent callers are admitted
        one at a time.

        Raises:
            ValueError: If either configured rate is not positive.
        """
        with self.lock:
            windows = (
                self._window(self.calls_per_second, 1.0),
                self._window(self.calls_per_minute, 60.0),
            )
            now = self._clock()

            # Clean old entries (keep enough history for the longest window)
            horizon = max(60.0, *(span for _, span in windows))
            while self.call_times and self.call_times[0] < now - horizon:
                self.call_times.popleft()

            # Per-second limit first, then per-minute limit.
            for limit, span in windows:
                delay = self._delay_for(now, limit, span)
                if delay > 0:
                    self._sleep(delay)
                    now = self._clock()

            self.call_times.append(now)

    def __call__(self, func: Callable) -> Callable:
        """Decorator to rate limit function calls."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            self.wait_if_needed()
            return func(*args, **kwargs)
        return wrapper

class AdaptiveRateLimiter(RateLimiter):
    """Rate limiter that adapts its per-second rate to API responses.

    Errors lower ``calls_per_second``; a streak of more than ten consecutive
    successes raises it again by 10% per success, up to 20 calls per second.
    A success never lowers the rate and an error never raises it.
    """

    def __init__(self, initial_rate: float = 10.0):
        """Start at ``initial_rate`` calls per second."""
        super().__init__(calls_per_second=initial_rate)
        self.error_count = 0    # errors since the last recovery streak
        self.success_count = 0  # consecutive successes since the last error

    def record_success(self):
        """Record successful API call."""
        self.success_count += 1
        # success_count is reset on every error, so exceeding 10 means the API
        # has been healthy for a while: clear the error tally (otherwise one
        # error would block any future increase) and speed up gradually.
        if self.success_count > 10:
            self.error_count = 0
            grown = min(self.calls_per_second * 1.1, 20.0)
            # max() keeps a rate that already exceeds the 20/s ceiling
            # (e.g. a high initial_rate) from being cut by a success.
            self.calls_per_second = max(self.calls_per_second, grown)

    def record_error(self, status_code: int):
        """Record API error and slow down for 429 and 5xx responses.

        Args:
            status_code: HTTP status of the failed call. 429 halves the rate
                (floor 0.5/s); 500 and above reduce it by 20% (floor 1.0/s).
                Other codes only reset the success streak.
        """
        self.error_count += 1
        self.success_count = 0

        if status_code == 429:  # Rate limited
            reduced = max(self.calls_per_second * 0.5, 0.5)
        elif status_code >= 500:  # Server error
            reduced = max(self.calls_per_second * 0.8, 1.0)
        else:
            return

        # min() so a floor above the current rate can never speed things up.
        self.calls_per_second = min(self.calls_per_second, reduced)

# Usage as decorator
rate_limiter = RateLimiter(calls_per_second=2, calls_per_minute=100)

@rate_limiter
def fetch_player_stats(player_id: int):
    """Fetch one player's stats as parsed JSON (rate limited).

    Args:
        player_id: Identifier inserted into the request URL.

    Returns:
        The decoded JSON body of the response.
    """
    import requests
    # Without a timeout a stalled connection would block forever.
    response = requests.get(
        f"https://api.example.com/players/{player_id}", timeout=10
    )
    return response.json()

# Usage with adaptive limiter
adaptive_limiter = AdaptiveRateLimiter(initial_rate=5.0)

def fetch_with_adaptive_limit(url: str):
    """Fetch ``url`` and feed the outcome back into the adaptive limiter.

    Args:
        url: Address to request.

    Returns:
        The decoded JSON body for a 200 response, otherwise None.

    Raises:
        requests.RequestException: When the request itself fails; the failure
            is recorded as a server error before being re-raised.
    """
    import requests

    adaptive_limiter.wait_if_needed()

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Transport-level failure: treat it like a 5xx for rate adaptation.
        adaptive_limiter.record_error(500)
        raise

    if response.status_code != 200:
        adaptive_limiter.record_error(response.status_code)
        return None

    # Record the success once; a JSON decoding problem below propagates
    # without also being counted as a server error.
    adaptive_limiter.record_success()
    return response.json()

# Fetch multiple players
# Requests ids 1 through 99; each call passes through the rate limiter.
# NOTE(review): the result is bound to the module-level name `stats` and is
# not used afterwards.
for player_id in range(1, 100):
    stats = fetch_player_stats(player_id)
    print(f"Fetched player {player_id}")

sql

Team Performance Trends SQL

Analyze team performance trends over multiple seasons.

-- Team performance trends across seasons
-- One row per team and season (2020 onward) with year-over-year changes in
-- wins and run differential plus a rolling three-season winning percentage.
WITH season_stats AS (
    SELECT
        t.team_id,
        t.team_name,
        s.season,
        s.wins,
        s.losses,
        s.runs_scored,
        s.runs_allowed,
        -- NULLIF guards against division by zero for a season row with no
        -- decided games; win_pct is NULL in that case.
        s.wins * 1.0 / NULLIF(s.wins + s.losses, 0) AS win_pct,
        s.runs_scored - s.runs_allowed AS run_diff,
        LAG(s.wins) OVER (PARTITION BY t.team_id ORDER BY s.season) AS prev_wins,
        LAG(s.runs_scored - s.runs_allowed) OVER (PARTITION BY t.team_id ORDER BY s.season) AS prev_run_diff
    FROM teams t
    JOIN team_season_stats s ON t.team_id = s.team_id
    WHERE s.season >= 2020
)
SELECT
    team_name,
    season,
    wins,
    win_pct,
    run_diff,
    -- A team's first season in the window has no previous row, so its change
    -- is reported as 0.
    wins - COALESCE(prev_wins, wins) AS win_change,
    run_diff - COALESCE(prev_run_diff, run_diff) AS run_diff_change,
    -- Average over the current and up to two preceding seasons.
    AVG(win_pct) OVER (PARTITION BY team_id ORDER BY season ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS rolling_3yr_win_pct
FROM season_stats
ORDER BY team_name, season;

sql

Player Milestone Tracking SQL

Track players approaching career milestones.

-- Players approaching milestones
-- Lists players within 100 hits of 1000/2000/3000 career hits or within
-- 10 home runs of 300/500 career home runs.
WITH career_totals AS (
    SELECT
        p.player_id,
        p.name,
        SUM(s.hits) AS career_hits,
        SUM(s.home_runs) AS career_hr,
        SUM(s.rbi) AS career_rbi,
        SUM(s.strikeouts) AS career_k,  -- For pitchers
        SUM(s.wins) AS career_wins
    FROM players p
    JOIN player_stats s ON p.player_id = s.player_id
    GROUP BY p.player_id, p.name
),
milestones AS (
    SELECT
        player_id,
        name,
        career_hits,
        career_hr,
        career_rbi,
        -- Hits milestones
        CASE
            WHEN career_hits >= 2900 AND career_hits < 3000 THEN 3000 - career_hits
            WHEN career_hits >= 1900 AND career_hits < 2000 THEN 2000 - career_hits
            WHEN career_hits >= 900 AND career_hits < 1000 THEN 1000 - career_hits
            ELSE NULL
        END AS hits_to_milestone,
        -- HR milestones
        CASE
            WHEN career_hr >= 490 AND career_hr < 500 THEN 500 - career_hr
            WHEN career_hr >= 290 AND career_hr < 300 THEN 300 - career_hr
            ELSE NULL
        END AS hr_to_milestone
    FROM career_totals
)
SELECT
    name,
    career_hits,
    career_hr,
    career_rbi,
    -- Build each label only when that milestone applies. CONCAT with a NULL
    -- operand is dialect-dependent: some engines return NULL, others would
    -- emit a bare ' hits' / ' HR' label.
    CASE
        WHEN hits_to_milestone IS NOT NULL
        THEN CONCAT(career_hits + hits_to_milestone, ' hits')
    END AS hits_milestone,
    hits_to_milestone AS hits_needed,
    CASE
        WHEN hr_to_milestone IS NOT NULL
        THEN CONCAT(career_hr + hr_to_milestone, ' HR')
    END AS hr_milestone,
    hr_to_milestone AS hr_needed
FROM milestones
WHERE hits_to_milestone IS NOT NULL OR hr_to_milestone IS NOT NULL
-- Fewest remaining first; the hits distance takes precedence when both apply.
ORDER BY COALESCE(hits_to_milestone, hr_to_milestone);

sql

Head-to-Head Record SQL

Calculate head-to-head records between teams.

-- Head-to-head records between teams
-- Each pair of teams is normalised so the lower team id is team1, which puts
-- home and away meetings of the same two clubs into a single row.
WITH matchups AS (
    SELECT
        LEAST(home_team_id, away_team_id) AS team1_id,
        GREATEST(home_team_id, away_team_id) AS team2_id,
        CASE
            WHEN home_score > away_score THEN home_team_id
            WHEN away_score > home_score THEN away_team_id
            ELSE NULL  -- tied final: credit neither side with a win
        END AS winner_id,
        home_score + away_score AS total_runs,
        ABS(home_score - away_score) AS margin
    FROM games
    WHERE season = 2024 AND status = 'Final'
)
SELECT
    t1.team_name AS team1,
    t2.team_name AS team2,
    COUNT(*) AS games_played,
    SUM(CASE WHEN m.winner_id = m.team1_id THEN 1 ELSE 0 END) AS team1_wins,
    SUM(CASE WHEN m.winner_id = m.team2_id THEN 1 ELSE 0 END) AS team2_wins,
    ROUND(AVG(m.total_runs), 1) AS avg_total_runs,
    ROUND(AVG(m.margin), 1) AS avg_margin,
    -- Games decided by two runs or fewer.
    SUM(CASE WHEN m.margin <= 2 THEN 1 ELSE 0 END) AS close_games
FROM matchups m
JOIN teams t1 ON m.team1_id = t1.team_id
JOIN teams t2 ON m.team2_id = t2.team_id
-- Group by ids as well so two clubs sharing a name are never merged.
GROUP BY m.team1_id, m.team2_id, t1.team_name, t2.team_name
HAVING COUNT(*) >= 3
ORDER BY games_played DESC, avg_margin;

python Football

Expected Points Added (EPA)

Calculate EPA for football plays.

import pandas as pd
import numpy as np

def calculate_expected_points(down, distance, yard_line):
    """Estimate expected points for a down, distance and field position.

    The estimate is a field-position term (0.05 points per yard relative to
    the 50) plus a per-down adjustment, with a bonus for 3 or fewer yards to
    go and a penalty for 10 or more.
    """
    # Simplified EP model (would use ML model in production)
    down_bonus = {1: 0.5, 2: 0.2, 3: -0.3, 4: -1.0}

    ep = (yard_line - 50) * 0.05
    ep += down_bonus.get(down, 0)  # unknown downs get no adjustment

    if distance <= 3:
        return ep + 0.3
    if distance >= 10:
        return ep - 0.3
    return ep

def calculate_epa(plays_df):
    """Add expected-points columns and EPA to a play-by-play table.

    All values are from the point of view of the team that had the ball
    before the play:

    * ``ep_before`` - expected points at the snap.
    * ``ep_after`` - expected points after the play. After a turnover the new
      state is valued for the opponent (1st and 10 from the flipped yard
      line) and negated. Scoring plays override it: touchdown +7 (-7 when it
      came on a turnover, i.e. the defense scored), field goal +3, safety -2
      (a safety awards its points to the defense).
    * ``epa`` - ``ep_after - ep_before``.

    Args:
        plays_df: One row per play with the columns down, distance,
            yard_line, next_down, next_distance, next_yard_line, turnover,
            touchdown, field_goal and safety.

    Returns:
        A copy of ``plays_df`` with the three columns added.
    """
    plays = plays_df.copy()

    def ep_before(row):
        return calculate_expected_points(row["down"], row["distance"], row["yard_line"])

    def ep_after(row):
        if row["turnover"]:
            # Opponent takes over: value their fresh set of downs, sign-flipped.
            return -calculate_expected_points(1, 10, 100 - row["next_yard_line"])
        return calculate_expected_points(
            row["next_down"], row["next_distance"], row["next_yard_line"]
        )

    # Calculate EP before and after each play
    plays["ep_before"] = plays.apply(ep_before, axis=1)
    plays["ep_after"] = plays.apply(ep_after, axis=1)

    # Handle scoring plays (later assignments win when flags overlap)
    turnover = plays["turnover"].astype(bool)
    touchdown = plays["touchdown"] == 1
    plays.loc[touchdown & ~turnover, "ep_after"] = 7
    plays.loc[touchdown & turnover, "ep_after"] = -7
    plays.loc[plays["field_goal"] == 1, "ep_after"] = 3
    plays.loc[plays["safety"] == 1, "ep_after"] = -2

    # EPA = EP after - EP before
    plays["epa"] = plays["ep_after"] - plays["ep_before"]

    return plays

def player_epa_summary(plays_df):
    """Aggregate EPA per passer and per rusher.

    Args:
        plays_df: Play-by-play rows with play_type, passer_id, rusher_id,
            epa, yards_gained and touchdown columns.

    Returns:
        A ``(passing, rushing)`` tuple. ``passing`` is indexed by passer_id
        with total_epa, epa_per_play, attempts, yards and tds; ``rushing`` is
        indexed by rusher_id with total_epa, epa_per_carry and carries.
    """
    is_pass = plays_df["play_type"] == "pass"
    is_run = plays_df["play_type"] == "run"

    # Passing EPA
    passing = plays_df.loc[is_pass].groupby("passer_id").agg(
        total_epa=("epa", "sum"),
        epa_per_play=("epa", "mean"),
        attempts=("epa", "count"),
        yards=("yards_gained", "sum"),
        tds=("touchdown", "sum"),
    )

    # Rushing EPA
    rushing = plays_df.loc[is_run].groupby("rusher_id").agg(
        total_epa=("epa", "sum"),
        epa_per_carry=("epa", "mean"),
        carries=("epa", "count"),
    )

    return passing, rushing

# NOTE: `pbp_df` (play-by-play DataFrame) is not defined in this snippet and
# must be provided by the caller.
plays_with_epa = calculate_epa(pbp_df)
passing_epa, rushing_epa = player_epa_summary(plays_with_epa)
# Show the ten passers with the highest total EPA.
print("Top Passers by EPA:")
print(passing_epa.nlargest(10, "total_epa"))

python Golf

Strokes Gained Calculator

Calculate strokes gained statistics for golf.

import pandas as pd
import numpy as np

class StrokesGainedCalculator:
    """Compute strokes gained for golf shots against a baseline table."""

    # Baseline strokes to hole from distance (PGA Tour averages)
    BASELINE_STROKES = {
        "tee": {100: 2.92, 150: 2.99, 200: 3.05, 250: 3.15, 300: 3.30, 350: 3.45, 400: 3.65},
        "fairway": {50: 2.60, 100: 2.80, 150: 2.92, 200: 3.02},
        "rough": {50: 2.75, 100: 2.92, 150: 3.05, 200: 3.18},
        "sand": {20: 2.43, 40: 2.65, 60: 2.85},
        "green": {5: 1.50, 10: 1.61, 20: 1.78, 30: 1.95, 40: 2.10, 60: 2.30}
    }

    def get_baseline_strokes(self, lie: str, distance: int) -> float:
        """Look up the baseline strokes to hole out from ``lie`` at ``distance``.

        Values between table distances are linearly interpolated; distances
        outside the table use the nearest end value. An unknown lie falls
        back to the fairway table.
        """
        table = self.BASELINE_STROKES.get(lie, self.BASELINE_STROKES["fairway"])
        knots = sorted(table.keys())
        shortest, longest = knots[0], knots[-1]

        # Clamp to the ends of the table
        if distance <= shortest:
            return table[shortest]
        if distance >= longest:
            return table[longest]

        # Interpolate inside the first bracket that contains the distance
        for near, far in zip(knots, knots[1:]):
            if near <= distance <= far:
                ratio = (distance - near) / (far - near)
                return table[near] + ratio * (table[far] - table[near])

        return 3.0  # Default

    def calculate_sg(self, shots_df):
        """Return a copy of ``shots_df`` with baseline and strokes-gained columns.

        Adds ``baseline_before``, ``baseline_after`` (0 for a holed shot) and
        ``strokes_gained = baseline_before - baseline_after - 1``, where the
        1 accounts for the stroke just played.
        """
        shots = shots_df.copy()

        def before(shot):
            return self.get_baseline_strokes(shot["lie_before"], shot["distance_before"])

        def after(shot):
            if shot["holed"]:
                return 0
            return self.get_baseline_strokes(shot["lie_after"], shot["distance_after"])

        shots["baseline_before"] = shots.apply(before, axis=1)
        shots["baseline_after"] = shots.apply(after, axis=1)
        shots["strokes_gained"] = shots["baseline_before"] - shots["baseline_after"] - 1

        return shots

    def player_summary(self, shots_df):
        """Summarise strokes gained per player and shot category.

        Returns a frame indexed by player_id whose columns are
        (total_sg | avg_sg | shots) x category, plus ``total_sg_all`` with the
        player's strokes gained over every shot.
        """
        shots = self.calculate_sg(shots_df)
        off_green_lies = ("fairway", "rough", "sand")

        def categorize_shot(row):
            lie = row["lie_before"]
            distance = row["distance_before"]

            if lie == "tee" and distance > 250:
                return "off_the_tee"
            if lie in off_green_lies:
                if distance > 100:
                    return "approach"
                if distance <= 100:
                    return "around_green"
            if lie == "green":
                return "putting"
            return "other"

        shots["category"] = shots.apply(categorize_shot, axis=1)

        # One row per player, one column group per statistic
        per_category = shots.groupby(["player_id", "category"])["strokes_gained"]
        summary = per_category.agg([
            ("total_sg", "sum"),
            ("avg_sg", "mean"),
            ("shots", "count")
        ]).unstack(level=1)

        per_player = shots.groupby("player_id")["strokes_gained"]
        summary["total_sg_all"] = per_player.sum()

        return summary

# NOTE: `shots_df` (shot-level DataFrame) is not defined in this snippet and
# must be provided by the caller.
calculator = StrokesGainedCalculator()
sg_summary = calculator.player_summary(shots_df)
# Show the 20 players with the highest overall strokes gained.
print(sg_summary.sort_values("total_sg_all", ascending=False).head(20))

python

Game Simulation Engine

Monte Carlo simulation engine for game outcomes.

import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Tuple

@dataclass
class TeamStats:
    """Team statistical profile."""
    name: str
    offense_rating: float   # added to the team's own expected score
    defense_rating: float   # subtracted from the opponent's expected score
    pace: float             # averaged with the opponent's pace per game
    variance: float = 5.0   # passed to np.random.normal as the score spread


class GameSimulator:
    """Monte Carlo game simulation engine.

    Scores are drawn from normal distributions around each team's expected
    score. Every simulated game has a winner: a tie after rounding is settled
    as if by overtime.
    """

    def __init__(self, league_avg_pace=100):
        """Store the league-average pace (kept for callers; not used in the formulas)."""
        self.league_avg_pace = league_avg_pace

    def simulate_game(self, home_team: TeamStats, away_team: TeamStats,
                     home_advantage: float = 3.0) -> Dict:
        """Simulate single game.

        Args:
            home_team: Profile of the team playing at home.
            away_team: Profile of the visiting team.
            home_advantage: Points added to the home team's expected score.

        Returns:
            Dict with ``home_score``, ``away_score``, ``home_win`` and
            ``margin`` (home minus away; never 0).
        """
        # Expected pace (average of both teams)
        expected_pace = (home_team.pace + away_team.pace) / 2

        # Expected scores
        # Home: their offense vs opponent defense + home advantage
        home_expected = (
            (home_team.offense_rating - away_team.defense_rating + 100)
            * expected_pace / 100 + home_advantage
        )

        away_expected = (
            (away_team.offense_rating - home_team.defense_rating + 100)
            * expected_pace / 100
        )

        # Add variance
        home_raw = np.random.normal(home_expected, home_team.variance)
        away_raw = np.random.normal(away_expected, away_team.variance)

        # Round to integers
        home_score = max(0, round(home_raw))
        away_score = max(0, round(away_raw))

        # Counting a rounded tie as an away win would bias every win
        # probability against the home side. Settle it with one extra point
        # for the side that was ahead before rounding, or a coin flip when
        # even that is level.
        if home_score == away_score:
            if home_raw > away_raw:
                home_score += 1
            elif away_raw > home_raw:
                away_score += 1
            elif np.random.random() < 0.5:
                home_score += 1
            else:
                away_score += 1

        return {
            "home_score": home_score,
            "away_score": away_score,
            "home_win": home_score > away_score,
            "margin": home_score - away_score
        }

    def simulate_series(self, home_team: TeamStats, away_team: TeamStats,
                       n_games: int = 7, wins_needed: int = 4) -> Dict:
        """Simulate playoff series.

        ``home_team`` is the side with home-court advantage: it hosts games
        1, 2, 5 and 7, while ``away_team`` hosts all others. Every entry in
        ``game_results`` is reported from ``home_team``'s point of view, so
        ``home_score``, ``margin`` and ``home_win`` always agree.

        Args:
            home_team: Team holding home-court advantage.
            away_team: Opponent.
            n_games: Accepted for interface compatibility; the series length
                is governed by ``wins_needed``.
            wins_needed: Wins required to take the series.

        Returns:
            Dict with ``winner`` (team name), ``games`` (number played),
            ``home_wins``, ``away_wins`` and ``game_results``.
        """
        home_court_games = (1, 2, 5, 7)  # simplified 2-2-1-1-1 format
        home_wins = 0
        away_wins = 0
        games = []

        while home_wins < wins_needed and away_wins < wins_needed:
            game_num = len(games) + 1

            if game_num in home_court_games:
                result = self.simulate_game(home_team, away_team)
            else:
                # Played at the other venue: simulate with the roles swapped,
                # then translate back to the series-home team's perspective.
                at_road_venue = self.simulate_game(away_team, home_team)
                result = {
                    "home_score": at_road_venue["away_score"],
                    "away_score": at_road_venue["home_score"],
                    "home_win": not at_road_venue["home_win"],
                    "margin": -at_road_venue["margin"]
                }

            games.append(result)

            if result["home_win"]:
                home_wins += 1
            else:
                away_wins += 1

        return {
            "winner": home_team.name if home_wins >= wins_needed else away_team.name,
            "games": len(games),
            "home_wins": home_wins,
            "away_wins": away_wins,
            "game_results": games
        }

    def monte_carlo_prediction(self, home_team: TeamStats, away_team: TeamStats,
                              n_simulations: int = 10000) -> Dict:
        """Run Monte Carlo simulation for game prediction.

        Args:
            home_team: Profile of the team playing at home.
            away_team: Profile of the visiting team.
            n_simulations: Number of games to simulate (at least 1).

        Returns:
            Dict with win probabilities for both sides, the mean and standard
            deviation of the margin, mean scores and the mean total.

        Raises:
            ValueError: If ``n_simulations`` is less than 1.
        """
        if n_simulations < 1:
            raise ValueError("n_simulations must be at least 1")

        results = [
            self.simulate_game(home_team, away_team)
            for _ in range(n_simulations)
        ]

        home_wins = sum(r["home_win"] for r in results)
        margins = [r["margin"] for r in results]
        home_scores = [r["home_score"] for r in results]
        away_scores = [r["away_score"] for r in results]

        home_win_prob = home_wins / n_simulations
        home_mean = np.mean(home_scores)
        away_mean = np.mean(away_scores)

        return {
            "home_win_prob": home_win_prob,
            "away_win_prob": 1 - home_win_prob,
            "expected_margin": np.mean(margins),
            "margin_std": np.std(margins),
            "home_score_mean": home_mean,
            "away_score_mean": away_mean,
            "total_mean": home_mean + away_mean
        }

# Usage
celtics = TeamStats("Celtics", offense_rating=118, defense_rating=106, pace=98)
lakers = TeamStats("Lakers", offense_rating=114, defense_rating=110, pace=100)

simulator = GameSimulator()

# Single game prediction
# The first team passed is treated as the home team.
prediction = simulator.monte_carlo_prediction(celtics, lakers, n_simulations=10000)
print(f"Celtics Win Probability: {prediction['home_win_prob']:.1%}")
print(f"Expected Score: {prediction['home_score_mean']:.0f} - {prediction['away_score_mean']:.0f}")
print(f"Expected Margin: {prediction['expected_margin']:.1f} ± {prediction['margin_std']:.1f}")

python

Injury Prediction with Survival Analysis

Predict time until next injury using survival analysis.

import pandas as pd
import numpy as np
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines.utils import concordance_index
import matplotlib.pyplot as plt

class InjuryPredictor:
    """Predict injury risk using survival analysis.

    Combines a Cox proportional hazards model (per-player risk from
    covariates) with a Kaplan-Meier estimator (group-level survival curves).
    """

    # Covariates the Cox model is fitted on; ``fit`` requires all of them
    # as columns of the prepared frame.
    FEATURES = [
        "age", "career_games", "workload_30d",
        "previous_injuries", "position_risk",
        "bmi", "sprint_speed_percentile"
    ]

    def __init__(self):
        self.cox_model = CoxPHFitter()
        self.km_fitter = KaplanMeierFitter()

    def prepare_data(self, player_history_df):
        """Add the survival-analysis columns to a copy of the history frame.

        Each input row is one player stint. The result gains:

        * ``duration`` - whole days between ``start_date`` and ``end_date``
          (both must be datetime columns).
        * ``injured`` - 1 when ``injury_type`` is present, 0 when it is
          missing (i.e. the stint is censored).

        The input frame is not modified.
        """
        df = player_history_df.copy()

        # Time variable: days until injury (or end of observation)
        df["duration"] = (df["end_date"] - df["start_date"]).dt.days

        # Event variable: did injury occur?
        df["injured"] = df["injury_type"].notna().astype(int)

        return df

    def fit(self, df):
        """Fit the Cox model on ``FEATURES`` and print its summary.

        Rows with a missing value in any feature, ``duration`` or
        ``injured`` are dropped before fitting. Returns ``self`` so calls
        can be chained.
        """
        survival_df = df[self.FEATURES + ["duration", "injured"]].dropna()

        self.cox_model.fit(
            survival_df,
            duration_col="duration",
            event_col="injured"
        )

        print(self.cox_model.summary)

        return self

    def predict_risk(self, player_features, time_horizon=180):
        """Predict injury risk for the first player in ``player_features``.

        Args:
            player_features: DataFrame holding the model covariates; only
                the first row is reported.
            time_horizon: Number of days ahead at which to evaluate risk.

        Returns:
            Dict with ``injury_risk_6mo`` (probability of injury within
            ``time_horizon`` days; the key name is kept for compatibility
            even when a different horizon is requested), ``hazard_ratio``
            and ``risk_category`` ("High" > 0.5, "Medium" > 0.25, else
            "Low").
        """
        # Ask the model for the survival probability at the horizon itself.
        # Without ``times`` the result is indexed only by durations seen in
        # training, so looking up an arbitrary horizon raised KeyError.
        survival_func = self.cox_model.predict_survival_function(
            player_features, times=[time_horizon]
        )

        # Risk = 1 - survival probability
        risk = 1 - survival_func.iloc[0].values[0]

        # Hazard ratio compared to baseline
        hr = self.cox_model.predict_partial_hazard(player_features).values[0]

        return {
            "injury_risk_6mo": risk,
            "hazard_ratio": hr,
            "risk_category": "High" if risk > 0.5 else "Medium" if risk > 0.25 else "Low"
        }

    def plot_survival_curves(self, df, group_col):
        """Plot one Kaplan-Meier curve per distinct value of ``group_col``.

        ``df`` must contain ``duration`` and ``injured`` columns (see
        ``prepare_data``). Returns the ``(fig, ax)`` pair.
        """
        fig, ax = plt.subplots(figsize=(10, 6))

        for group in df[group_col].unique():
            group_data = df[df[group_col] == group]

            # The fitter is re-fitted per group; each fit is drawn
            # immediately onto the shared axes.
            self.km_fitter.fit(
                group_data["duration"],
                event_observed=group_data["injured"],
                label=group
            )
            self.km_fitter.plot_survival_function(ax=ax)

        ax.set_xlabel("Days Since Last Injury")
        ax.set_ylabel("Probability of Staying Healthy")
        ax.set_title(f"Injury-Free Survival by {group_col}")
        ax.legend()

        return fig, ax

    def risk_factors_report(self):
        """Return fitted coefficients ranked by percentage change in hazard.

        ``risk_increase`` is ``(exp(coef) - 1) * 100``: the percent change
        in hazard per one-unit increase of the covariate.
        """
        summary = self.cox_model.summary.copy()
        summary["risk_increase"] = (np.exp(summary["coef"]) - 1) * 100

        return summary[["coef", "exp(coef)", "risk_increase", "p"]].sort_values(
            "risk_increase", ascending=False
        )

# Usage
# `player_injury_history` must be supplied by the caller; prepare_data reads
# its start_date, end_date and injury_type columns.
predictor = InjuryPredictor()
df = predictor.prepare_data(player_injury_history)
predictor.fit(df)

# Predict for specific player
# One-row frame carrying every covariate the model was fitted on.
player = pd.DataFrame([{
    "age": 28,
    "career_games": 500,
    "workload_30d": 450,
    "previous_injuries": 3,
    "position_risk": 0.7,
    "bmi": 24.5,
    "sprint_speed_percentile": 65
}])

# Uses predict_risk's default time_horizon of 180 days.
risk = predictor.predict_risk(player)
print(f"6-Month Injury Risk: {risk['injury_risk_6mo']:.1%}")
print(f"Risk Category: {risk['risk_category']}")

python

Contract Optimization Model

Optimize team salary cap allocation using linear programming.

import pandas as pd
import numpy as np
from scipy.optimize import linprog, milp, LinearConstraint, Bounds

def optimize_roster(players_df, salary_cap, roster_spots=15, min_by_position=None):
    """Pick the roster that maximizes total projected WAR under the cap.

    The selection is solved as a 0/1 integer program, so every player is
    either fully selected or not at all and the returned roster always
    satisfies the constraints. (Solving the continuous relaxation and
    rounding, as was done previously, can exceed the cap or miss the
    optimum.)

    Args:
        players_df: Candidates with ``projected_war``, ``salary`` and, when
            ``min_by_position`` is given, ``position`` columns.
        salary_cap: Maximum total salary of the selected roster.
        roster_spots: Maximum number of players to select.
        min_by_position: Optional mapping of position -> minimum number of
            selected players at that position.

    Returns:
        On success a dict with ``roster`` (selected rows plus a ``selected``
        column), ``total_war``, ``total_salary``, ``cap_space`` and
        ``roster_size``. On failure (e.g. infeasible constraints) a dict
        with ``error`` and ``message``.
    """
    n_players = len(players_df)
    if n_players == 0:
        return {"error": "Optimization failed", "message": "No players to choose from"}

    # Objective: maximize total WAR (the solver minimizes, hence the sign).
    c = -players_df["projected_war"].to_numpy(dtype=float)

    # Every constraint is expressed as ``row @ x <= upper``.
    rows = [
        players_df["salary"].to_numpy(dtype=float),  # salary cap
        np.ones(n_players),                          # roster size
    ]
    upper = [float(salary_cap), float(roster_spots)]

    if min_by_position:
        for pos, min_count in min_by_position.items():
            # count(pos) >= min_count  <=>  -count(pos) <= -min_count
            rows.append(-(players_df["position"] == pos).to_numpy(dtype=float))
            upper.append(-float(min_count))

    constraints = LinearConstraint(np.vstack(rows), lb=-np.inf, ub=np.array(upper))

    # Binary decision per player: integer-valued and bounded to [0, 1].
    result = milp(
        c,
        constraints=constraints,
        integrality=np.ones(n_players),
        bounds=Bounds(0, 1),
    )

    if not result.success:
        return {"error": "Optimization failed", "message": result.message}

    # The solver returns floats that are 0/1 up to tolerance.
    selected = np.round(result.x).astype(bool)
    roster = players_df[selected].copy()
    roster["selected"] = 1

    total_salary = roster["salary"].sum()
    return {
        "roster": roster,
        "total_war": roster["projected_war"].sum(),
        "total_salary": total_salary,
        "cap_space": salary_cap - total_salary,
        "roster_size": len(roster)
    }

def trade_optimizer(team_roster, available_players, salary_cap):
    """List one-for-one swaps that raise projected WAR and fit the cap.

    A swap qualifies when both players share a position, the team's salary
    after the swap does not exceed ``salary_cap``, and the incoming player
    has a strictly higher ``projected_war``.

    Args:
        team_roster: Current roster with ``name``, ``position``, ``salary``
            and ``projected_war`` columns.
        available_players: Acquisition candidates with the same columns.
        salary_cap: Maximum allowed team salary after the swap.

    Returns:
        DataFrame with ``player_out``, ``player_in``, ``war_gain``,
        ``salary_change`` and ``new_cap_space`` columns, sorted by
        ``war_gain`` descending. Empty (but with those columns) when no
        swap qualifies.
    """
    columns = ["player_out", "player_in", "war_gain", "salary_change", "new_cap_space"]
    current_salary = team_roster["salary"].sum()

    trade_options = []

    # For each player on roster, find beneficial swaps
    for _, player_out in team_roster.iterrows():
        for _, player_in in available_players.iterrows():
            # Check salary works
            new_salary = current_salary - player_out["salary"] + player_in["salary"]
            if new_salary > salary_cap:
                continue

            # Check position match (simplified)
            if player_out["position"] != player_in["position"]:
                continue

            war_change = player_in["projected_war"] - player_out["projected_war"]
            if war_change <= 0:
                continue

            trade_options.append({
                "player_out": player_out["name"],
                "player_in": player_in["name"],
                "war_gain": war_change,
                "salary_change": player_in["salary"] - player_out["salary"],
                "new_cap_space": salary_cap - new_salary
            })

    # Passing ``columns`` keeps the schema (and the sort key) present even
    # when no trade qualified; previously that case raised KeyError.
    return pd.DataFrame(trade_options, columns=columns).sort_values(
        "war_gain", ascending=False
    )

# Usage
# Require at least two players at each of the five positions.
min_positions = {"C": 2, "PF": 2, "SF": 2, "SG": 2, "PG": 2}

result = optimize_roster(
    free_agents_df,
    salary_cap=140_000_000,
    roster_spots=15,
    min_by_position=min_positions
)

if "error" in result:
    # optimize_roster signals failure (e.g. infeasible constraints) with an
    # error dict that has none of the roster keys used below.
    print(f"{result['error']}: {result['message']}")
else:
    print(f"Optimal Roster ({result['roster_size']} players):")
    print(result["roster"][["name", "position", "salary", "projected_war"]])
    print(f"\nTotal WAR: {result['total_war']:.1f}")
    print(f"Total Salary: ${result['total_salary']/1e6:.1f}M")
    print(f"Cap Space: ${result['cap_space']/1e6:.1f}M")

python

Real-Time Score Prediction

Predict final score based on current game state.

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
import joblib

class ScorePredictor:
    """Predict final game score from current state.

    A game state is the number of completed periods, both teams' scores at
    that point and their first-quarter scores. Two gradient-boosting
    regressors (home and away) map a state to the final score.
    """

    # Model inputs, in the column order used for both training and prediction.
    FEATURES = ["period", "home_current", "away_current", "home_q1", "away_q1"]
    REGULATION_PERIODS = 4

    def __init__(self):
        self.home_model = None
        self.away_model = None

    @staticmethod
    def _build_training_frame(historical_games_df):
        """Expand each game into one training row per completed quarter."""
        rows = []

        for _, game in historical_games_df.iterrows():
            for period in range(1, 5):  # Quarters
                if f"home_q{period}" not in game:
                    continue

                rows.append({
                    "period": period,
                    # Score at the END of `period`, matching how predict()
                    # is called (e.g. period=2 with the halftime score).
                    # Summing only range(1, period) trained the model on
                    # the score one quarter earlier than it is queried with.
                    "home_current": sum(game[f"home_q{i}"] for i in range(1, period + 1)),
                    "away_current": sum(game[f"away_q{i}"] for i in range(1, period + 1)),
                    "home_q1": game.get("home_q1", 0),
                    "away_q1": game.get("away_q1", 0),
                    "home_final": game["home_score"],
                    "away_final": game["away_score"]
                })

        return pd.DataFrame(
            rows, columns=ScorePredictor.FEATURES + ["home_final", "away_final"]
        )

    def train(self, historical_games_df):
        """Train the home and away score models.

        ``historical_games_df`` needs per-quarter columns ``home_q1..4`` and
        ``away_q1..4`` plus final ``home_score`` and ``away_score``.
        """
        df = self._build_training_frame(historical_games_df)
        X = df[self.FEATURES]

        # Train separate models for home/away
        self.home_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        self.home_model.fit(X, df["home_final"])

        self.away_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        self.away_model.fit(X, df["away_final"])

    def predict(self, current_state):
        """Predict final scores from the current game state.

        Args:
            current_state: Mapping with ``period`` (completed periods),
                ``home_score`` and ``away_score``; optional ``home_q1`` /
                ``away_q1`` default to the current score.

        Returns:
            Dict with rounded ``home_predicted``, ``away_predicted`` and
            ``predicted_margin``, plus ``home_win_likely``.

        Raises:
            RuntimeError: If ``train`` has not been called.
        """
        if self.home_model is None or self.away_model is None:
            raise RuntimeError("ScorePredictor.train() must be called before predict()")

        home_now = current_state["home_score"]
        away_now = current_state["away_score"]

        # Same column names as the training frame, so the fitted models see
        # consistently named features.
        features = pd.DataFrame(
            [[
                current_state["period"],
                home_now,
                away_now,
                current_state.get("home_q1", home_now),
                current_state.get("away_q1", away_now),
            ]],
            columns=self.FEATURES,
        )

        home_pred = self.home_model.predict(features)[0]
        away_pred = self.away_model.predict(features)[0]

        # Ensure predictions are at least current score
        home_pred = max(home_pred, home_now)
        away_pred = max(away_pred, away_now)

        return {
            "home_predicted": round(home_pred),
            "away_predicted": round(away_pred),
            "predicted_margin": round(home_pred - away_pred),
            "home_win_likely": home_pred > away_pred
        }

    def predict_with_uncertainty(self, current_state, n_simulations=1000):
        """Predict with uncertainty estimates.

        Draws ``n_simulations`` normally distributed final scores around
        the point prediction, with a standard deviation of 5 points per
        remaining regulation period, and reports the home win share and
        95% intervals.
        """
        base_pred = self.predict(current_state)

        # More uncertainty with more time left. Clamp at zero so overtime
        # states (period > 4) do not produce a negative standard deviation.
        remaining_periods = max(0, self.REGULATION_PERIODS - current_state["period"])
        noise_std = 5 * remaining_periods

        home_sims = np.random.normal(
            base_pred["home_predicted"], noise_std, n_simulations
        )
        away_sims = np.random.normal(
            base_pred["away_predicted"], noise_std, n_simulations
        )

        home_wins = (home_sims > away_sims).mean()

        return {
            "home_predicted": base_pred["home_predicted"],
            "away_predicted": base_pred["away_predicted"],
            "home_win_prob": home_wins,
            "home_95_ci": np.percentile(home_sims, [2.5, 97.5]),
            "away_95_ci": np.percentile(away_sims, [2.5, 97.5])
        }

# Usage
# `historical_games_df` must be supplied by the caller; train() reads its
# per-quarter and final score columns.
predictor = ScorePredictor()
predictor.train(historical_games_df)

# Current game state: Home up 58-52 at halftime
# "period" is 2 here, and predict_with_uncertainty treats 4 - period as the
# number of periods still to play.
current = {
    "period": 2,
    "home_score": 58,
    "away_score": 52,
    "home_q1": 28,
    "away_q1": 25
}

prediction = predictor.predict_with_uncertainty(current)
print(f"Predicted Final: {prediction['home_predicted']} - {prediction['away_predicted']}")
print(f"Home Win Probability: {prediction['home_win_prob']:.1%}")

python Basketball

Pace and Space Analysis

Analyze team pace and spacing metrics for basketball.

import pandas as pd
import numpy as np

def calculate_pace(team_stats):
    """Return possessions per 48 minutes for ``team_stats``.

    Possessions are estimated as ``FGA + 0.44 * FTA - ORB + TOV`` and then
    scaled from ``team_stats["minutes"]`` to 48 minutes.
    """
    fga, fta, orb, tov = (team_stats[key] for key in ("fga", "fta", "orb", "tov"))
    possessions = fga + 0.44 * fta - orb + tov
    return possessions / team_stats["minutes"] * 48

def spacing_analysis(tracking_df, team_id):
    """Compute per-frame spacing metrics for a team's offensive possessions.

    Args:
        tracking_df: Tracking rows with ``team_id``, ``on_offense``,
            ``possession_id``, ``frame_id``, ``x`` and ``y`` columns (one
            row per player per frame).
        team_id: Team whose offensive frames are analysed.

    Returns:
        DataFrame with one row per frame that has at least five player
        rows: ``possession_id``, ``frame_id``, ``avg_spacing``,
        ``min_spacing``, ``max_spacing``, ``spacing_std`` (statistics of
        the 10 pairwise distances among the first five rows of the frame)
        and ``three_pt_spacing`` (number of rows farther than 23.75 from
        the coordinate origin). The columns are present even when no frame
        qualifies.
    """
    columns = [
        "possession_id", "frame_id", "avg_spacing", "min_spacing",
        "max_spacing", "spacing_std", "three_pt_spacing",
    ]

    offense = tracking_df[
        (tracking_df["team_id"] == team_id) &
        tracking_df["on_offense"].eq(True)
    ]

    # Index pairs (0,1), (0,2), ..., (3,4) of the five players, built once.
    first, second = np.triu_indices(5, k=1)

    spacing_metrics = []

    # Grouping visits each possession/frame once instead of re-filtering the
    # whole frame per key; sort=False keeps first-appearance order.
    for poss_id, poss_data in offense.groupby("possession_id", sort=False):
        for frame, frame_data in poss_data.groupby("frame_id", sort=False):
            if len(frame_data) < 5:
                continue

            positions = frame_data[["x", "y"]].to_numpy(dtype=float)

            # Pairwise distances among the first five rows of the frame.
            deltas = positions[first] - positions[second]
            distances = np.hypot(deltas[:, 0], deltas[:, 1])

            # NOTE(review): 23.75 looks like the NBA three-point distance in
            # feet measured from the origin; this presumes x/y are in feet
            # and centred on the basket - confirm against the data source.
            beyond_arc = np.hypot(positions[:, 0], positions[:, 1]) > 23.75

            spacing_metrics.append({
                "possession_id": poss_id,
                "frame_id": frame,
                "avg_spacing": np.mean(distances),
                "min_spacing": np.min(distances),
                "max_spacing": np.max(distances),
                "spacing_std": np.std(distances),
                "three_pt_spacing": int(beyond_arc.sum()),
            })

    return pd.DataFrame(spacing_metrics, columns=columns)

def analyze_pace_impact(game_df, team_id):
    """Summarise a team's scoring margin by game pace.

    Games are labelled "Fast" or "Slow" when the team's pace is more than
    two possessions above or below its own median pace, otherwise
    "Normal". Returns a frame indexed by ``pace_category`` with
    ``avg_margin``, ``std_margin`` and ``games`` columns.
    """
    involved = (
        (game_df["home_team_id"] == team_id) |
        (game_df["away_team_id"] == team_id)
    )
    team_games = game_df[involved].copy()

    # View every game from the team's perspective.
    at_home = team_games["home_team_id"] == team_id
    team_games["is_home"] = at_home
    team_games["team_pace"] = np.where(
        at_home, team_games["home_pace"], team_games["away_pace"]
    )
    team_games["team_margin"] = np.where(
        at_home,
        team_games["home_score"] - team_games["away_score"],
        team_games["away_score"] - team_games["home_score"],
    )

    # Bucket games relative to the team's own median pace.
    typical_pace = team_games["team_pace"].median()
    is_fast = team_games["team_pace"] > typical_pace + 2
    is_slow = team_games["team_pace"] < typical_pace - 2
    team_games["pace_category"] = np.select(
        [is_fast, is_slow], ["Fast", "Slow"], default="Normal"
    )

    # Margin statistics and game counts per bucket.
    return team_games.groupby("pace_category").agg(
        avg_margin=("team_margin", "mean"),
        std_margin=("team_margin", "std"),
        games=("team_pace", "count"),
    )

# Calculate team spacing metrics
# `tracking_df` must be supplied by the caller; the means below are taken
# over every frame row returned by spacing_analysis.
spacing_df = spacing_analysis(tracking_df, team_id=1610612744)
print("Average Team Spacing:", spacing_df["avg_spacing"].mean())
print("3PT Spacing (avg players beyond arc):", spacing_df["three_pt_spacing"].mean())

python

Player Value Calculator

Calculate player market value based on performance metrics.

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

class PlayerValueCalculator:
    """Calculate player market value from performance.

    ``salary_df`` is the historical salary table; the methods read its
    ``season``, ``war``, ``salary``, ``age``, ``name`` and ``years``
    columns.
    """

    def __init__(self, salary_df):
        self.salary_data = salary_df
        self.dollars_per_war = None
        self.model = None

    def calculate_market_rate(self, season):
        """Compute, store and return the median $/WAR for ``season``.

        Only players with more than 0.5 WAR are considered, so tiny
        denominators do not distort the rate. The median limits the
        influence of outlier contracts.
        """
        in_season = self.salary_data["season"] == season
        # Filter to meaningful playing time
        meaningful = self.salary_data["war"] > 0.5
        season_data = self.salary_data[in_season & meaningful]

        # Computed as a standalone Series: assigning a new column onto the
        # filtered slice triggered pandas' chained-assignment warning.
        self.dollars_per_war = (season_data["salary"] / season_data["war"]).median()

        return self.dollars_per_war

    def war_to_dollars(self, war, season=None):
        """Convert WAR (scalar or Series) to dollar value.

        The market rate is computed lazily for ``season`` (default 2024)
        the first time it is needed; an already computed rate is reused
        regardless of ``season``.
        """
        if self.dollars_per_war is None:
            self.calculate_market_rate(season or 2024)

        return war * self.dollars_per_war

    def surplus_value(self, player_df):
        """Return a copy of ``player_df`` with surplus-value columns.

        Adds ``war_value`` (WAR in dollars), ``surplus`` (value produced
        minus salary) and ``surplus_pct`` (surplus as a percentage of
        salary).
        """
        player_df = player_df.copy()

        # war_to_dollars is plain arithmetic, so apply it to the whole
        # column at once rather than row by row.
        player_df["war_value"] = self.war_to_dollars(player_df["war"])
        player_df["surplus"] = player_df["war_value"] - player_df["salary"]
        player_df["surplus_pct"] = player_df["surplus"] / player_df["salary"] * 100

        return player_df

    def project_contract_value(self, player_projections_df, years):
        """Project total contract value from WAR projections.

        For each contract year the projected WAR is reduced by 0.5 for
        every year of age beyond 30 (never below zero) and converted to
        dollars.

        Returns:
            DataFrame with ``player_name``, ``age``, ``years``,
            ``total_value``, ``aav`` and ``year_breakdown`` columns. ``aav``
            is 0.0 when ``years`` is not positive.
        """
        columns = ["player_name", "age", "years", "total_value", "aav", "year_breakdown"]
        contracts = []

        for _, player in player_projections_df.iterrows():
            year_values = []

            for year in range(years):
                # Apply aging curve (decline ~0.5 WAR/year after 30)
                age = player["age"] + year
                war_decline = max(0, age - 30) * 0.5
                projected_war = max(0, player["projected_war"] - war_decline)
                year_values.append(self.war_to_dollars(projected_war))

            total_value = sum(year_values)

            contracts.append({
                "player_name": player["name"],
                "age": player["age"],
                "years": years,
                "total_value": total_value,
                # A zero-length contract has no annual value; avoid 0/0.
                "aav": total_value / years if years > 0 else 0.0,
                "year_breakdown": year_values
            })

        return pd.DataFrame(contracts, columns=columns)

    def comparable_contracts(self, player_war, player_age, n=5):
        """Find the ``n`` most similar historical contracts.

        Candidates are within 1 WAR and 2 years of age of the target.
        Similarity is ``1 - |dWAR| / 5 - |dAge| / 10`` (higher is closer).
        """
        war_gap = (self.salary_data["war"] - player_war).abs()
        age_gap = (self.salary_data["age"] - player_age).abs()

        comparables = self.salary_data[(war_gap < 1) & (age_gap < 2)].copy()
        comparables["similarity"] = (
            1 - (comparables["war"] - player_war).abs() / 5 -
            (comparables["age"] - player_age).abs() / 10
        )

        return comparables.nlargest(n, "similarity")[[
            "name", "age", "war", "salary", "years", "similarity"
        ]]

# Usage
# `salary_history_df` and `current_contracts_df` must be supplied by the caller.
calculator = PlayerValueCalculator(salary_history_df)
calculator.calculate_market_rate(2024)

print(f"Market rate: ${calculator.dollars_per_war/1e6:.2f}M per WAR")

# Calculate surplus value
players_with_surplus = calculator.surplus_value(current_contracts_df)
print("\nBest Value Contracts:")
# Ten largest surpluses, i.e. the most production per salary dollar.
print(players_with_surplus.nlargest(10, "surplus")[
    ["name", "salary", "war", "war_value", "surplus"]
])

python

Clutch Performance Analysis

Analyze player performance in high-leverage situations.

import pandas as pd
import numpy as np
from scipy import stats

def calculate_leverage_index(game_state):
    """Calculate a simplified leverage index for a game situation.

    Starts from 1.0 and multiplies in bonuses for late innings (x1.5 from
    the 7th, a further x1.3 from the 9th), close scores (x2.0 within one
    run, x1.5 within three) and runners on base (+30% per runner).

    Args:
        game_state: Mapping or row with ``inning``, ``home_score``,
            ``away_score`` and ``runners_on``.
    """
    inning = game_state["inning"]
    score_diff = abs(game_state["home_score"] - game_state["away_score"])
    runners = game_state["runners_on"]

    # Base LI calculation (simplified)
    base_li = 1.0

    # Late inning bonus
    if inning >= 7:
        base_li *= 1.5
    if inning >= 9:
        base_li *= 1.3

    # Close game bonus
    if score_diff <= 1:
        base_li *= 2.0
    elif score_diff <= 3:
        base_li *= 1.5

    # Runners on bonus
    base_li *= (1 + 0.3 * runners)

    return base_li

def clutch_performance(player_stats_df, plays_df):
    """Rank batters by performance in high-leverage plate appearances.

    High leverage means a play whose leverage index is at or above the
    80th percentile of all plays. Batters need at least 50 plays overall
    and 20 high-leverage plays to be included.

    Args:
        player_stats_df: Unused; kept for interface compatibility.
        plays_df: One row per play with ``batter_id``, ``woba_value`` and
            the columns read by ``calculate_leverage_index``. It is not
            modified.

    Returns:
        DataFrame with ``player_id``, ``total_pa``, ``high_lev_pa``,
        ``overall_woba``, ``clutch_woba`` and ``clutch_score`` (clutch
        minus overall wOBA), sorted by ``clutch_score`` descending. Empty
        (with those columns) when no batter qualifies.
    """
    columns = [
        "player_id", "total_pa", "high_lev_pa",
        "overall_woba", "clutch_woba", "clutch_score",
    ]
    if len(plays_df) == 0:
        return pd.DataFrame(columns=columns)

    # Work on a copy so the caller's frame does not gain helper columns.
    leverage = plays_df.apply(calculate_leverage_index, axis=1)
    plays = plays_df.assign(
        leverage=leverage,
        # Define high leverage (top 20%)
        high_leverage=leverage >= leverage.quantile(0.80),
    )

    clutch_stats = []

    for player_id, player_plays in plays.groupby("batter_id", sort=False):
        if len(player_plays) < 50:
            continue

        high_lev = player_plays[player_plays["high_leverage"]]
        if len(high_lev) < 20:
            continue

        overall_woba = player_plays["woba_value"].mean()
        clutch_woba = high_lev["woba_value"].mean()

        clutch_stats.append({
            "player_id": player_id,
            "total_pa": len(player_plays),
            "high_lev_pa": len(high_lev),
            "overall_woba": overall_woba,
            "clutch_woba": clutch_woba,
            # Clutch score = high leverage performance - overall
            "clutch_score": clutch_woba - overall_woba
        })

    # The former year-to-year step grouped this table by a "season" column
    # it never contains, so it always raised KeyError; it was dead code
    # (its result was unused) and has been removed.
    return pd.DataFrame(clutch_stats, columns=columns).sort_values(
        "clutch_score", ascending=False
    )

def late_close_analysis(plays_df):
    """Compare each batter's wOBA in late-and-close situations vs the rest.

    "Late and close" means the 7th inning or later with the score within
    one run.

    Args:
        plays_df: One row per play with ``batter_id``, ``inning``,
            ``home_score``, ``away_score``, ``woba_value`` and ``event``
            columns. It is not modified.

    Returns:
        DataFrame indexed by ``batter_id`` with ``woba_normal``,
        ``woba_late_close``, ``pa_normal``, ``pa_late_close`` and
        ``lc_difference`` (late/close minus normal wOBA), sorted by
        ``lc_difference`` descending. Entries are NaN where a batter has
        no plays in a category.
    """
    # Late and close: 7th inning or later, within 1 run
    late_close = (
        (plays_df["inning"] >= 7) &
        ((plays_df["home_score"] - plays_df["away_score"]).abs() <= 1)
    )

    # assign() returns a new frame, leaving the caller's data untouched.
    lc_stats = (
        plays_df.assign(late_close=late_close)
        .groupby(["batter_id", "late_close"])
        .agg({"woba_value": "mean", "event": "count"})
        .unstack()
    )

    # Force all four (metric, late_close) columns to exist in a fixed order.
    # When a category never occurs, unstack() yields only two columns and
    # the rename below would fail with a length mismatch.
    expected = pd.MultiIndex.from_product(
        [["woba_value", "event"], [False, True]]
    )
    lc_stats = lc_stats.reindex(columns=expected)

    lc_stats.columns = ["woba_normal", "woba_late_close", "pa_normal", "pa_late_close"]
    lc_stats["lc_difference"] = lc_stats["woba_late_close"] - lc_stats["woba_normal"]

    return lc_stats.sort_values("lc_difference", ascending=False)

# `player_stats` and `pbp_df` (play-by-play rows) must be supplied by the caller.
clutch_rankings = clutch_performance(player_stats, pbp_df)
print("Most Clutch Players:")
# The result is sorted by clutch_score descending, so head() is the top 15.
print(clutch_rankings.head(15))

R Markdown Sports Report

Create automated sports analytics report using R Markdown.

---
title: "Weekly MLB Analytics Report"
author: "Sports Analytics Team"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_float: true
    theme: flatly
params:
  week_num: 10
  season: 2024
---

```{r setup, include=FALSE}
# Hide code, messages and warnings in every chunk of the rendered report.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
library(dplyr)    # filter/arrange/select/summarize and the %>% pipe
library(ggplot2)  # plots
library(knitr)
library(DT)       # datatable() for the leader tables
```

# Weekly Summary

```{r load-data}
# Load data for the specified week
# Both CSVs are narrowed to the week/season given by the document params.
batting <- read.csv("batting_weekly.csv") %>%
  filter(week == params$week_num, season == params$season)

pitching <- read.csv("pitching_weekly.csv") %>%
  filter(week == params$week_num, season == params$season)
```

## Top Performers

### Batting Leaders

```{r batting-table}
# Top 10 batters of the week by WAR, shown as an interactive table.
batting %>%
  arrange(desc(war)) %>%
  head(10) %>%
  select(Name = name, Team = team, AVG = avg, HR = hr, RBI = rbi, WAR = war) %>%
  datatable(options = list(pageLength = 10))
```

### Pitching Leaders

```{r pitching-table}
# Top 10 pitchers of the week; sorted ascending because a lower ERA is better.
pitching %>%
  arrange(era) %>%
  head(10) %>%
  select(Name = name, Team = team, W = wins, ERA = era, K = strikeouts, WAR = war) %>%
  datatable(options = list(pageLength = 10))
```

## Visualizations

```{r war-distribution, fig.width=10, fig.height=6}
# Histogram of weekly batting WAR with a dashed red line at the mean.
# NOTE(review): mean() returns NA if `war` has missing values, which would
# drop the reference line - confirm the CSV has no NAs or add na.rm = TRUE.
ggplot(batting, aes(x = war)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  geom_vline(xintercept = mean(batting$war), color = "red", linetype = "dashed") +
  labs(
    title = paste("WAR Distribution - Week", params$week_num),
    x = "Wins Above Replacement",
    y = "Count"
  ) +
  theme_minimal()
```

## Week-over-Week Trends

```{r trends, fig.width=12, fig.height=5}
# Re-read the batting file to get every week of the season up to and
# including the report week (the `batting` object holds one week only).
all_weeks <- read.csv("batting_weekly.csv") %>%
  filter(season == params$season, week <= params$week_num)

# One row per week: mean WAR, total home runs, mean batting average.
weekly_summary <- all_weeks %>%
  group_by(week) %>%
  summarize(
    avg_war = mean(war),
    total_hr = sum(hr),
    league_avg = mean(avg)
  )

# NOTE(review): `size` for line geoms is deprecated in ggplot2 >= 3.4 in
# favour of `linewidth`; switch once the minimum ggplot2 version is confirmed.
ggplot(weekly_summary, aes(x = week, y = total_hr)) +
  geom_line(color = "steelblue", size = 1.5) +
  geom_point(size = 3) +
  labs(title = "Weekly Home Run Totals", x = "Week", y = "Home Runs") +
  theme_minimal()
```

ggplot2 Faceted Sports Charts

Create multi-panel visualizations using ggplot2 facets for comparing across groups.

library(ggplot2)
library(dplyr)
library(scales)

# The data frames used below (team_stats, player_stats, game_log,
# pitching_stats, defensive_stats) must already exist in the session.

# Faceted bar chart by division
# scales = "free_y" lets each division panel list only its own teams.
ggplot(team_stats, aes(x = reorder(team, wins), y = wins, fill = above_500)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ division, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = c("FALSE" = "coral", "TRUE" = "steelblue")) +
  labs(title = "Wins by Team and Division", x = "", y = "Wins") +
  theme_minimal() +
  theme(legend.position = "none")

# Faceted scatter plot with trend lines
# facet_grid(position ~ league): rows are positions, columns are leagues;
# a separate linear fit is drawn in every panel.
ggplot(player_stats, aes(x = obp, y = slg)) +
  geom_point(aes(color = war), alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = TRUE, color = "red", linetype = "dashed") +
  facet_grid(position ~ league) +
  scale_color_viridis_c(option = "plasma") +
  labs(
    title = "OBP vs SLG by Position and League",
    x = "On-Base Percentage",
    y = "Slugging Percentage"
  ) +
  theme_bw()

# Time series faceted by player
# NOTE(review): scale_x_date() requires game_date to be of class Date -
# confirm it is converted before plotting.
# NOTE(review): `size` for line geoms is deprecated in ggplot2 >= 3.4
# (use `linewidth`); applies to the geom_line/geom_density calls below.
ggplot(game_log, aes(x = game_date, y = rolling_avg)) +
  geom_line(color = "steelblue", size = 1) +
  geom_hline(aes(yintercept = season_avg), linetype = "dashed", color = "red") +
  facet_wrap(~ player_name, scales = "free_y", ncol = 3) +
  scale_x_date(date_labels = "%b") +
  labs(
    title = "Rolling Batting Average by Player",
    x = "Date",
    y = "20-Game Rolling Average"
  ) +
  theme_minimal()

# Distribution comparison with facets
# The histogram is drawn on the density scale so the density curve overlays it.
# NOTE(review): the `..density..` notation is deprecated in ggplot2 >= 3.4;
# after_stat(density) is the replacement.
ggplot(pitching_stats, aes(x = era)) +
  geom_histogram(aes(y = ..density.., fill = role), bins = 30, alpha = 0.7) +
  geom_density(color = "black", size = 1) +
  facet_grid(league ~ role) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "ERA Distribution by Role and League") +
  theme_light()

# Custom facet labeller
# Maps position codes to the full names shown in the facet strips.
position_labels <- c(
  "C" = "Catcher", "1B" = "First Base", "2B" = "Second Base",
  "SS" = "Shortstop", "3B" = "Third Base", "OF" = "Outfield"
)

# The dashed y = x line marks where the two defensive metrics agree.
ggplot(defensive_stats, aes(x = uzr, y = drs)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  facet_wrap(~ position, labeller = labeller(position = position_labels)) +
  labs(title = "UZR vs DRS by Position")

python

Batch Data Processor

Process large sports datasets in parallel batches.

import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from typing import Callable, List, Any
import logging
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class BatchProcessor:
    """Process large datasets in parallel batches.

    Args:
        n_workers: Maximum number of worker processes/threads.
        batch_size: Number of rows per batch.

    Raises:
        ValueError: If ``n_workers`` or ``batch_size`` is less than 1.
    """

    def __init__(self, n_workers=4, batch_size=1000):
        # Fail early: a zero or negative batch size would otherwise surface
        # later as an unrelated range()/concat error.
        if n_workers < 1:
            raise ValueError("n_workers must be at least 1")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.n_workers = n_workers
        self.batch_size = batch_size

    def split_batches(self, data: pd.DataFrame) -> List[pd.DataFrame]:
        """Split DataFrame into consecutive slices of ``batch_size`` rows.

        The last batch may be shorter. An empty frame yields no batches.
        """
        return [
            data.iloc[i:i+self.batch_size]
            for i in range(0, len(data), self.batch_size)
        ]

    def process_parallel(self, data: pd.DataFrame,
                        process_func: Callable,
                        use_processes: bool = True) -> pd.DataFrame:
        """Apply ``process_func`` to every batch in parallel.

        Args:
            data: Frame to process.
            process_func: Callable taking one batch and returning a
                DataFrame. With ``use_processes=True`` it must be
                picklable (e.g. a module-level function).
            use_processes: Use a process pool when True, a thread pool
                otherwise.

        Returns:
            The batch results concatenated in input order with a fresh
            index.
        """
        if len(data) == 0:
            # No batches would make pd.concat raise; run the function once
            # in-process so the result still has the processed schema.
            return process_func(data).reset_index(drop=True)

        batches = self.split_batches(data)
        logger.info("Processing %d rows in %d batches", len(data), len(batches))

        Executor = ProcessPoolExecutor if use_processes else ThreadPoolExecutor

        results = []
        with Executor(max_workers=self.n_workers) as executor:
            futures = [
                executor.submit(process_func, batch)
                for batch in batches
            ]

            # Collect in submission order so the output row order matches
            # the input.
            for future in tqdm(futures, desc="Processing batches"):
                results.append(future.result())

        return pd.concat(results, ignore_index=True)

    def process_with_state(self, data: pd.DataFrame,
                          process_func: Callable,
                          state: dict) -> pd.DataFrame:
        """Process batches sequentially, passing the same ``state`` to each.

        ``process_func(batch, state)`` is called once per batch in order, so
        it may read and update ``state`` between batches.
        """
        if len(data) == 0:
            # Same empty-input handling as process_parallel.
            return process_func(data, state).reset_index(drop=True)

        batches = self.split_batches(data)
        results = []

        for batch in tqdm(batches, desc="Processing"):
            results.append(process_func(batch, state))

        return pd.concat(results, ignore_index=True)

def calculate_war_batch(batch: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``batch`` with simplified WAR components added.

    Adds ``batting_runs``, ``base_running``, ``position_adj`` and ``war``.
    Positions missing from the adjustment table produce NaN for
    ``position_adj`` and therefore for ``war``.
    """
    # Full-season positional adjustment per position.
    positional_runs = {
        "C": 12.5, "SS": 7.5, "CF": 2.5, "2B": 2.5, "3B": 2.5,
        "RF": -7.5, "LF": -7.5, "1B": -12.5, "DH": -17.5
    }

    scored = batch.assign(
        batting_runs=lambda d: (d["woba"] - 0.320) / 1.25 * d["pa"],
        base_running=lambda d: d["stolen_bases"] * 0.2 - d["caught_stealing"] * 0.4,
        # Scaled by the share of a 162-game season actually played.
        position_adj=lambda d: d["position"].map(positional_runs) * d["games"] / 162,
    )

    run_total = (
        scored["batting_runs"] +
        scored["base_running"] +
        scored["position_adj"] +
        scored["games"] * 0.1  # Replacement level
    )
    scored["war"] = run_total / 10

    return scored

# Usage
# The guard is required for process-based parallelism: under the "spawn"
# start method (the default on Windows and macOS) every worker re-imports
# this module, and unguarded top-level code would start the pool again.
if __name__ == "__main__":
    processor = BatchProcessor(n_workers=4, batch_size=5000)

    # Process 100k player records
    # Synthetic data covering every column calculate_war_batch reads.
    large_df = pd.DataFrame({
        "player_id": range(100000),
        "woba": np.random.normal(0.320, 0.030, 100000),
        "pa": np.random.randint(100, 600, 100000),
        "stolen_bases": np.random.randint(0, 30, 100000),
        "caught_stealing": np.random.randint(0, 10, 100000),
        "games": np.random.randint(50, 162, 100000),
        "position": np.random.choice(
            ["C", "1B", "2B", "SS", "3B", "LF", "CF", "RF", "DH"],
            100000
        )
    })

    result = processor.process_parallel(large_df, calculate_war_batch)
    print(f"Processed {len(result)} players")
    print(f"Average WAR: {result['war'].mean():.2f}")

Sports Data Joins in dplyr

Efficiently join multiple sports datasets using dplyr verbs.

library(dplyr)
library(tidyr)

# Load various data sources
batting <- read.csv("batting_stats.csv")
fielding <- read.csv("fielding_stats.csv")
players <- read.csv("players.csv")
teams <- read.csv("teams.csv")
salaries <- read.csv("salaries.csv")

# Inner join - only matching records
batting_with_fielding <- batting %>%
  inner_join(fielding, by = c("player_id", "season"))

# Left join - keep all batting records
full_stats <- batting %>%
  left_join(fielding, by = c("player_id", "season")) %>%
  left_join(players, by = "player_id") %>%
  left_join(teams, by = "team_id")

# Anti-join - find players without fielding stats
batters_no_fielding <- batting %>%
  anti_join(fielding, by = c("player_id", "season"))

# Multiple condition join
# Columns present in both tables (other than the keys) get the suffixes.
salary_comparison <- salaries %>%
  inner_join(
    batting,
    by = c("player_id", "season"),
    suffix = c("_salary", "_stats")
  )

# Fuzzy date join (within 7 days)
library(fuzzyjoin)
# read.csv() returns date columns as character, and subtracting character
# vectors is an error, so convert to Date before the join.
# NOTE(review): as.Date() without a format assumes ISO "YYYY-MM-DD" strings -
# confirm against the CSV files.
injuries <- read.csv("injuries.csv") %>%
  mutate(injury_date = as.Date(injury_date))
game_log <- read.csv("game_log.csv") %>%
  mutate(game_date = as.Date(game_date))

# Match rows for the same player whose game and injury dates are at most
# 7 days apart (in either direction).
games_near_injury <- game_log %>%
  fuzzy_left_join(
    injuries,
    by = c("player_id", "game_date" = "injury_date"),
    match_fun = list(
      `==`,
      function(x, y) abs(as.numeric(difftime(x, y, units = "days"))) <= 7
    )
  )

# Complex aggregation with multiple joins
season_summary <- batting %>%
  group_by(player_id, season) %>%
  summarize(
    games = sum(games),
    avg = sum(hits) / sum(at_bats),
    war = sum(war),
    .groups = "drop"
  ) %>%
  left_join(
    salaries %>% select(player_id, season, salary),
    by = c("player_id", "season")
  ) %>%
  mutate(
    # Floor WAR at 0.1 so near-zero or negative WAR cannot produce
    # infinite or negative dollar rates.
    dollars_per_war = salary / pmax(war, 0.1)
  ) %>%
  arrange(desc(war))

python

Player Aging Curve

Model player performance aging curves.

import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

def delta_method_aging(player_seasons_df, peak_age=27):
    """Calculate an aging curve using the delta method.

    For every player, the WAR of each season is compared with the WAR of the
    immediately preceding season; the changes are then averaged by the age of
    the later season and accumulated into a curve.

    Args:
        player_seasons_df: DataFrame with ``player_id``, ``season``, ``age``
            and ``war`` columns. It is not modified.
        peak_age: Age whose cumulative value is used as the zero reference
            for ``relative_to_peak``. If that age is absent from the curve,
            the reference value is 0.

    Returns:
        DataFrame indexed by age with columns ``delta_war`` (mean change),
        ``std``, ``n`` (number of season pairs), ``cumulative`` and
        ``relative_to_peak``.
    """
    df = player_seasons_df.sort_values(["player_id", "season"]).copy()

    # Previous-season values within each player's own history.
    by_player = df.groupby("player_id")
    prev_war = by_player["war"].shift(1)
    prev_season = by_player["season"].shift(1)
    df["prev_war"] = prev_war

    # Only consecutive seasons. The explicit copy makes the filtered frame
    # independent, so the assignment below does not write into a slice.
    df = df[df["season"] == prev_season + 1].copy()
    df["war_change"] = df["war"] - df["prev_war"]

    # Average change at each age
    aging_curve = df.groupby("age").agg({
        "war_change": ["mean", "std", "count"]
    })
    aging_curve.columns = ["delta_war", "std", "n"]

    # Cumulative curve, expressed relative to the value at peak_age
    aging_curve["cumulative"] = aging_curve["delta_war"].cumsum()
    if peak_age in aging_curve.index:
        peak_value = aging_curve.loc[peak_age, "cumulative"]
    else:
        peak_value = 0
    aging_curve["relative_to_peak"] = aging_curve["cumulative"] - peak_value

    return aging_curve

def fit_parametric_curve(aging_data):
    """Fit a skewed, Gaussian-shaped curve to a cumulative aging curve.

    Args:
        aging_data: DataFrame indexed by age with a ``cumulative`` column.

    Returns:
        Tuple ``(params, aging_func)`` where ``params`` holds the fitted
        ``peak_age, peak_value, decline_rate, asymmetry`` and ``aging_func``
        is the model function they apply to.
    """
    def aging_func(age, peak_age, peak_value, decline_rate, asymmetry):
        # The sign term makes the curve steeper on one side of the peak
        # than on the other.
        offset = age - peak_age
        skew = 1 + asymmetry * np.sign(offset)
        return peak_value * np.exp(-0.5 * (offset / decline_rate) ** 2 * skew)

    # Order: peak_age, peak_value, decline_rate, asymmetry
    initial_guess = [27, 5, 5, 0.3]
    lower_bounds = [22, 0, 1, -1]
    upper_bounds = [32, 10, 15, 1]

    x_ages = aging_data.index.values
    y_values = aging_data["cumulative"].values + 5  # Shift for fitting

    fitted, _covariance = curve_fit(
        aging_func,
        x_ages,
        y_values,
        p0=initial_guess,
        bounds=(lower_bounds, upper_bounds),
    )

    return fitted, aging_func

def position_aging_curves(player_seasons_df):
    """Build one delta-method aging curve per distinct ``position`` value.

    Returns:
        Dict mapping each position found in ``player_seasons_df`` to the
        result of ``delta_method_aging`` for that position's rows.
    """
    position_col = player_seasons_df["position"]
    return {
        pos: delta_method_aging(player_seasons_df[position_col == pos])
        for pos in position_col.unique()
    }

def plot_aging_curves(curves_dict):
    """Draw one aging curve per entry of ``curves_dict`` on a shared axis.

    Args:
        curves_dict: Mapping of label -> DataFrame indexed by age with
            ``relative_to_peak`` and ``std`` columns.

    Returns:
        The ``(fig, ax)`` pair holding the plot.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # One evenly spaced Set1 colour per curve.
    palette = plt.cm.Set1(np.linspace(0, 1, len(curves_dict)))

    for shade, (label, curve) in zip(palette, curves_dict.items()):
        ages = curve.index
        center = curve["relative_to_peak"]
        spread = curve["std"]

        ax.plot(ages, center, label=label, color=shade, linewidth=2)
        # Shaded band of +/- one standard deviation around the curve.
        ax.fill_between(
            ages, center - spread, center + spread, alpha=0.2, color=shade
        )

    # Reference lines: zero level and age 27.
    ax.axhline(y=0, color="black", linestyle="--", alpha=0.5)
    ax.axvline(x=27, color="gray", linestyle=":", alpha=0.5)

    ax.set_xlabel("Age")
    ax.set_ylabel("WAR Relative to Peak")
    ax.set_title("Aging Curves by Position")
    ax.legend()
    ax.grid(True, alpha=0.3)

    return fig, ax

# Calculate aging curves
# NOTE(review): player_seasons_df is not defined in this snippet and must be
# supplied by the caller; the functions above read its player_id, season, age,
# war and position columns.
overall_curve = delta_method_aging(player_seasons_df)
position_curves = position_aging_curves(player_seasons_df)

print("Peak Age Analysis:")
# Label-based slice on the age index: ages 24 through 32 inclusive.
print(overall_curve[["delta_war", "n"]].loc[24:32])

# Plot curves
fig, ax = plot_aging_curves(position_curves)
plt.show()

python

Configuration Management

Manage configuration for sports analytics projects.

import yaml
import os
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, Any, Optional
import json

@dataclass
class DatabaseConfig:
    """Connection settings for the analytics database."""

    host: str = "localhost"
    port: int = 3306
    database: str = "sports_analytics"
    user: str = "analyst"
    # repr=False keeps the secret out of repr()/print()/log output; the field
    # still takes part in __init__ and equality exactly as before.
    password: str = field(default="", repr=False)

@dataclass
class APIConfig:
    """Settings for one external API."""

    base_url: str = ""
    # repr=False keeps the key out of repr()/print()/log output; the field
    # still takes part in __init__ and equality exactly as before.
    api_key: str = field(default="", repr=False)
    rate_limit: int = 100
    timeout: int = 30

@dataclass
class AnalyticsConfig:
    """Top-level application settings.

    Mutable members use ``default_factory`` so every instance gets its own
    ``DatabaseConfig`` object and its own ``apis`` dict.
    """

    # Database connection settings.
    database: DatabaseConfig = field(default_factory=DatabaseConfig)
    # API settings keyed by API name.
    apis: Dict[str, APIConfig] = field(default_factory=dict)
    cache_dir: str = "./cache"
    log_level: str = "INFO"
    parallel_workers: int = 4

class ConfigManager:
    """Manage application configuration.

    Settings are layered, later layers overriding earlier ones:
    ``base.yaml``, then ``<env>.yaml``, then ``SPORTS_*`` environment
    variables. All files live in ``config_dir``.
    """

    # Environment variables named SPORTS_API_<NAME> carry the key for API <name>.
    _API_ENV_PREFIX = "SPORTS_API_"

    def __init__(self, config_dir: str = "./config"):
        """Remember ``config_dir`` and create it if it does not exist."""
        self.config_dir = Path(config_dir)
        # parents=True so a nested directory such as "deploy/config" works.
        self.config_dir.mkdir(parents=True, exist_ok=True)
        self.config: Optional[AnalyticsConfig] = None

    def load(self, env: str = "development") -> AnalyticsConfig:
        """Load configuration for environment.

        Args:
            env: Name of the environment file (``<env>.yaml``) merged over
                ``base.yaml``. Either file may be absent.

        Returns:
            The assembled config, also stored on ``self.config``.
        """
        # Load base config
        base_path = self.config_dir / "base.yaml"
        config_data = self._load_yaml(base_path) if base_path.exists() else {}

        # Load environment-specific config
        env_path = self.config_dir / f"{env}.yaml"
        if env_path.exists():
            config_data = self._deep_merge(config_data, self._load_yaml(env_path))

        # Override with environment variables
        config_data = self._apply_env_vars(config_data)

        # Build config object
        self.config = self._build_config(config_data)
        return self.config

    def _load_yaml(self, path: Path) -> dict:
        """Parse a YAML file; an empty file yields an empty dict."""
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            return yaml.safe_load(f) or {}

    def _deep_merge(self, base: dict, override: dict) -> dict:
        """Deep merge two dictionaries.

        Nested dicts are merged key by key; any other value in ``override``
        replaces the one in ``base``. Neither argument's top level is modified.
        """
        result = base.copy()
        for key, value in override.items():
            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                result[key] = self._deep_merge(result[key], value)
            else:
                result[key] = value
        return result

    def _apply_env_vars(self, config: dict) -> dict:
        """Apply environment variable overrides (modifies and returns ``config``)."""
        # Database
        if "SPORTS_DB_HOST" in os.environ:
            config.setdefault("database", {})["host"] = os.environ["SPORTS_DB_HOST"]
        if "SPORTS_DB_PASSWORD" in os.environ:
            config.setdefault("database", {})["password"] = os.environ["SPORTS_DB_PASSWORD"]

        # API keys
        prefix = self._API_ENV_PREFIX
        for key, value in os.environ.items():
            if not key.startswith(prefix):
                continue
            # Slice off only the leading prefix; str.replace would also remove
            # the prefix text if it appeared again later in the name.
            api_name = key[len(prefix):].lower()
            if not api_name:
                continue  # a bare "SPORTS_API_" names no API
            config.setdefault("apis", {}).setdefault(api_name, {})["api_key"] = value

        return config

    def _build_config(self, data: dict) -> AnalyticsConfig:
        """Build config object from dictionary.

        A section that is present but empty in YAML parses to ``None``; such
        sections are treated like missing ones instead of crashing the
        ``**`` unpacking.
        """
        db_config = DatabaseConfig(**(data.get("database") or {}))

        api_configs = {
            name: APIConfig(**(api_data or {}))
            for name, api_data in (data.get("apis") or {}).items()
        }

        return AnalyticsConfig(
            database=db_config,
            apis=api_configs,
            cache_dir=data.get("cache_dir", "./cache"),
            log_level=data.get("log_level", "INFO"),
            parallel_workers=data.get("parallel_workers", 4)
        )

    def save_template(self):
        """Save configuration template to ``template.yaml`` in ``config_dir``."""
        template = {
            "database": {
                "host": "localhost",
                "port": 3306,
                "database": "sports_analytics",
                "user": "analyst",
                "password": "CHANGE_ME"
            },
            "apis": {
                "mlb": {
                    "base_url": "https://statsapi.mlb.com/api/v1",
                    "api_key": "",
                    "rate_limit": 100
                },
                "nba": {
                    "base_url": "https://stats.nba.com/stats",
                    "api_key": "",
                    "rate_limit": 50
                }
            },
            "cache_dir": "./cache",
            "log_level": "INFO",
            "parallel_workers": 4
        }

        with open(self.config_dir / "template.yaml", "w", encoding="utf-8") as f:
            yaml.dump(template, f, default_flow_style=False)

# Usage
# Constructing the manager creates the default "./config" directory if needed.
config_manager = ConfigManager()
# Merges base.yaml, production.yaml (whichever exist) and SPORTS_* env vars.
config = config_manager.load(env="production")

print(f"Database: {config.database.host}:{config.database.port}")
print(f"APIs configured: {list(config.apis.keys())}")

python

Sports Date Utilities

Utility functions for handling sports-specific date logic.

from datetime import datetime, timedelta
from typing import List, Tuple, Optional
import pandas as pd

class SportsDateUtils:
    """Utilities for sports date handling.

    All methods are classmethods; dates are expected to be naive
    ``datetime`` objects (pandas ``Timestamp`` values also work).
    """

    # Season date ranges (approximate), as (month, day) tuples. An "end" month
    # earlier than the "start" month means the season ends in the next
    # calendar year.
    SEASON_DATES = {
        "mlb": {"start": (3, 28), "end": (10, 1), "playoffs_end": (11, 5)},
        "nba": {"start": (10, 22), "end": (4, 14), "playoffs_end": (6, 20)},
        "nfl": {"start": (9, 5), "end": (1, 8), "playoffs_end": (2, 12)},
        "nhl": {"start": (10, 10), "end": (4, 13), "playoffs_end": (6, 25)}
    }

    @classmethod
    def get_season_year(cls, date: datetime, sport: str) -> int:
        """Get season year for a given date and sport.

        The season year is the calendar year in which the season starts.
        """
        season_info = cls.SEASON_DATES.get(sport, {})
        start_month = season_info.get("start", (1, 1))[0]

        # For sports that span calendar years (NBA, NHL, NFL)
        if sport in ("nba", "nhl"):
            # If before start month, use previous year
            return date.year - 1 if date.month < start_month else date.year
        if sport == "nfl":
            # Before March = previous season
            return date.year - 1 if date.month < 3 else date.year
        # MLB and others
        return date.year

    @classmethod
    def is_regular_season(cls, date: datetime, sport: str) -> bool:
        """Check if date is during regular season.

        Sports without an entry in ``SEASON_DATES`` always return True.
        """
        season_info = cls.SEASON_DATES.get(sport)
        if not season_info:
            return True

        # Anchor the window on the season the date belongs to, not on the
        # date's calendar year; otherwise the January-April part of a season
        # that started the previous autumn would never match.
        season_year = cls.get_season_year(date, sport)
        start_month, start_day = season_info["start"]
        end_month, end_day = season_info["end"]

        start = datetime(season_year, start_month, start_day)
        # Handle season spanning years
        end_year = season_year + 1 if end_month < start_month else season_year
        end = datetime(end_year, end_month, end_day)

        # Half-open upper bound so the whole final day counts, whatever the
        # time-of-day component of ``date`` is.
        return start <= date < end + timedelta(days=1)

    @classmethod
    def get_game_week(cls, date: datetime, sport: str = "nfl") -> int:
        """Get NFL game week number.

        Week 1 starts on the Thursday after Labor Day (the first Monday in
        September) of the season year; each later week starts seven days
        after the previous one. Dates before that Thursday return 0.
        """
        season_year = cls.get_season_year(date, sport)

        # Find Labor Day: the first Monday in September.
        labor_day = datetime(season_year, 9, 1)
        while labor_day.weekday() != 0:  # Monday
            labor_day += timedelta(days=1)

        # The season opens on the following Thursday. Using "first Thursday
        # in September" instead is a week early whenever September 1st falls
        # on Tuesday, Wednesday or Thursday.
        season_start = labor_day + timedelta(days=3)

        if date < season_start:
            return 0

        days_since_start = (date - season_start).days
        return (days_since_start // 7) + 1

    @classmethod
    def get_rest_days(cls, game_dates: List[datetime],
                      current_date: datetime) -> int:
        """Calculate days of rest before current game.

        Returns the number of full days between the most recent earlier game
        and ``current_date`` (back-to-back days give 0), or 7 when there is
        no earlier game.
        """
        previous_games = [d for d in game_dates if d < current_date]
        if not previous_games:
            return 7  # Default to week rest

        last_game = max(previous_games)
        return (current_date - last_game).days - 1

    @classmethod
    def create_game_schedule_features(cls, games_df: pd.DataFrame,
                                      sport: str) -> pd.DataFrame:
        """Add schedule-based features to games DataFrame.

        Works on a copy of ``games_df`` (which needs a ``game_date`` column)
        and adds ``season``, ``is_regular_season``, ``day_of_week``,
        ``is_weekend`` and ``season_progress``.
        """
        df = games_df.copy()
        df["game_date"] = pd.to_datetime(df["game_date"])

        # Season year
        df["season"] = df["game_date"].apply(
            lambda x: cls.get_season_year(x, sport)
        )

        # Regular season flag
        df["is_regular_season"] = df["game_date"].apply(
            lambda x: cls.is_regular_season(x, sport)
        )

        # Day of week (Monday=0 ... Sunday=6)
        df["day_of_week"] = df["game_date"].dt.dayofweek
        df["is_weekend"] = df["day_of_week"].isin([5, 6])

        # The season window depends only on the sport, so look it up once.
        season_info = cls.SEASON_DATES.get(sport, {})
        start_month, start_day = season_info.get("start", (1, 1))
        end_month, end_day = season_info.get("end", (12, 31))
        spans_years = end_month < start_month

        # Time of season (0-1)
        def get_season_progress(row):
            season = int(row["season"])
            start = datetime(season, start_month, start_day)
            end = datetime(season + 1 if spans_years else season, end_month, end_day)
            total_days = (end - start).days
            days_in = (row["game_date"] - start).days
            # Clamp: preseason dates map to 0, postseason dates to 1.
            return max(0, min(1, days_in / total_days))

        df["season_progress"] = df.apply(get_season_progress, axis=1)

        return df

# Usage
# All SportsDateUtils methods are classmethods, so the instance is optional.
utils = SportsDateUtils()

# Get current NFL week
today = datetime.now()
week = utils.get_game_week(today)
print(f"Current NFL Week: {week}")

# Add features to games
# NOTE(review): games_df is not defined in this snippet; it must be supplied by
# the caller and contain a game_date column.
games_with_features = utils.create_game_schedule_features(games_df, "mlb")
print(games_with_features[["game_date", "season", "is_regular_season", "season_progress"]].head())

python

Sports Data Cache Manager

Implement a caching layer for sports API data to reduce API calls.

import json
import hashlib
import time
from datetime import datetime, timedelta
from pathlib import Path
from functools import wraps
import pickle
import redis

class CacheManager:
    """Manage caching for sports data.

    Values are pickled into one file per key under ``cache_dir``. If
    ``connect_redis`` has been called, Redis is tried first and the file
    cache is the fallback.

    SECURITY NOTE: entries are read back with ``pickle``, which can execute
    arbitrary code while loading. Only point this at a cache directory and a
    Redis instance you trust.
    """

    def __init__(self, cache_dir="./cache", ttl_hours=1):
        """Create the cache directory and set the default TTL.

        Args:
            cache_dir: Directory for the file cache (created if missing).
            ttl_hours: Default lifetime of an entry, in hours.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache_dir does not raise.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.default_ttl = ttl_hours * 3600
        self.redis_client = None

    def connect_redis(self, host="localhost", port=6379):
        """Connect to Redis for distributed caching."""
        # decode_responses=False: values are pickled bytes, not text.
        self.redis_client = redis.Redis(host=host, port=port, decode_responses=False)

    def _get_cache_key(self, func_name, args, kwargs):
        """Generate unique cache key from the function name and arguments."""
        key_data = f"{func_name}:{args}:{sorted(kwargs.items())}"
        # md5 is used only as a compact fingerprint, not for security.
        return hashlib.md5(key_data.encode()).hexdigest()

    def _file_cache_path(self, key):
        """Get file path for cache key."""
        return self.cache_dir / f"{key}.pkl"

    def _discard_file(self, cache_path):
        """Delete a cache file, ignoring the case where it is already gone."""
        try:
            cache_path.unlink()
        except FileNotFoundError:
            pass

    def get(self, key):
        """Get value from cache.

        Returns:
            The cached value, or ``None`` when the key is missing, expired or
            its cache file is unreadable.
        """
        # Try Redis first
        if self.redis_client is not None:
            try:
                data = self.redis_client.get(key)
                if data:
                    return pickle.loads(data)
            except Exception:
                # Best effort: any Redis or decoding failure falls through to
                # the file cache. ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                pass

        # Fall back to file cache
        cache_path = self._file_cache_path(key)
        try:
            with open(cache_path, "rb") as f:
                cached = pickle.load(f)
        except FileNotFoundError:
            return None
        except (pickle.UnpicklingError, EOFError):
            # Truncated or corrupt entry (e.g. an interrupted write): treat it
            # as a miss and remove it so it can be rebuilt.
            self._discard_file(cache_path)
            return None

        if time.time() < cached["expires"]:
            return cached["data"]

        # Remove expired. Done after the file is closed, because deleting an
        # open file fails on some platforms.
        self._discard_file(cache_path)
        return None

    def set(self, key, value, ttl=None):
        """Set value in cache.

        Args:
            key: Cache key.
            value: Any picklable object.
            ttl: Lifetime in seconds; ``None`` (or 0) uses the default TTL.
        """
        ttl = ttl or self.default_ttl

        # Try Redis
        if self.redis_client is not None:
            try:
                # Whole seconds: a fractional ttl_hours makes default_ttl a float.
                self.redis_client.setex(key, int(ttl), pickle.dumps(value))
                return
            except Exception:
                # Best effort: fall through to the file cache.
                pass

        # File cache fallback
        cache_path = self._file_cache_path(key)
        cached = {
            "data": value,
            "expires": time.time() + ttl,
            "created": datetime.now().isoformat()
        }
        with open(cache_path, "wb") as f:
            pickle.dump(cached, f)

    def cached(self, ttl=None):
        """Decorator for caching function results.

        The key is built from the function's ``__name__`` and its call
        arguments. A result of ``None`` is indistinguishable from a cache
        miss, so functions returning ``None`` are called every time.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                key = self._get_cache_key(func.__name__, args, kwargs)

                # Check cache
                cached_value = self.get(key)
                if cached_value is not None:
                    return cached_value

                # Call function and cache result
                result = func(*args, **kwargs)
                self.set(key, result, ttl)
                return result
            return wrapper
        return decorator

    def clear(self, pattern=None):
        """Clear cache entries.

        Args:
            pattern: With Redis connected, a key pattern whose matches are
                deleted there. For the file cache, a substring that must
                occur in the file name. ``None`` clears every cache file but
                leaves Redis untouched.
        """
        if self.redis_client is not None and pattern:
            keys = self.redis_client.keys(pattern)
            if keys:
                self.redis_client.delete(*keys)

        # Clear file cache
        for cache_file in self.cache_dir.glob("*.pkl"):
            if pattern is None or pattern in cache_file.name:
                self._discard_file(cache_file)

# Usage
cache = CacheManager(ttl_hours=4)

# The decorator's ttl is in seconds and overrides the manager's 4-hour default.
@cache.cached(ttl=3600)  # 1 hour cache
def get_player_stats(player_id, season):
    """Fetch player stats (cached)."""
    # Expensive API call
    # NOTE(review): api_client is not defined in this snippet; it must be
    # provided by the surrounding application.
    response = api_client.get(f"/players/{player_id}/stats/{season}")
    return response.json()

# First call - fetches from API
stats = get_player_stats(12345, 2024)

# Second call - returns cached data
stats = get_player_stats(12345, 2024)

python

Prospect Ranking System

Create a prospect ranking system that combines multiple data sources.

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

class ProspectRanker:
    """Rank prospects using multiple evaluation methods.

    The scoring steps (``score_tools``, ``score_stats``,
    ``project_future_value``, ``calculate_final_rank``) each add columns to
    the DataFrame they are given and return it. ``generate_report`` runs them
    in order on a copy of its input.
    """

    def __init__(self, weights=None):
        """Store the component weights and create the score scaler.

        Args:
            weights: Optional dict with ``tools``, ``stats`` and
                ``projection`` weights used by ``calculate_final_rank``.
        """
        # NOTE(review): the "makeup" weight is defined but no method in this
        # class reads it.
        self.weights = weights or {
            "tools": 0.3,
            "stats": 0.4,
            "projection": 0.2,
            "makeup": 0.1
        }
        self.scaler = MinMaxScaler()

    @staticmethod
    def _zscore(values):
        """Standardize one group; a group with no usable spread scores 0.

        A level with a single prospect (std is NaN) or identical values
        (std is 0) would otherwise produce NaN/inf and poison every score
        derived from it.
        """
        std = values.std()
        if not np.isfinite(std) or np.isclose(std, 0.0):
            return values * 0.0
        return (values - values.mean()) / std

    def score_tools(self, prospect_df):
        """Score raw tools on 20-80 scale.

        Adds a ``<tool>_norm`` column per tool (0-1) and their mean as
        ``tools_score``. Modifies and returns ``prospect_df``.
        """
        tool_cols = ["hit", "power", "speed", "arm", "field"]

        # Normalize to 0-1 scale: grade 20 -> 0, grade 80 -> 1
        for col in tool_cols:
            prospect_df[f"{col}_norm"] = (prospect_df[col] - 20) / 60

        # Overall tool score
        prospect_df["tools_score"] = prospect_df[
            [f"{c}_norm" for c in tool_cols]
        ].mean(axis=1)

        return prospect_df

    def score_stats(self, prospect_df, position_adjustments):
        """Score statistical performance.

        Each stat is standardized within its ``level``, combined with fixed
        weights, shifted by a positional adjustment and min-max scaled to
        0-1 as ``stats_score``. Modifies and returns ``prospect_df``.

        Args:
            prospect_df: DataFrame with the stat columns plus ``level`` and
                ``position``.
            position_adjustments: Dict mapping position -> additive bonus.
                Positions missing from the dict get no adjustment.
        """
        # Normalize stats by level
        stat_cols = ["avg", "obp", "slg", "hr_rate", "k_rate", "bb_rate"]

        for col in stat_cols:
            prospect_df[f"{col}_z"] = prospect_df.groupby("level")[col].transform(
                self._zscore
            )

        # Position adjustments. fillna(0) so an unlisted position does not
        # turn the whole score into NaN.
        prospect_df["pos_adj"] = (
            prospect_df["position"].map(position_adjustments).fillna(0.0)
        )

        # Combined stats score (strikeout rate counts against the prospect)
        weights = {"avg_z": 0.15, "obp_z": 0.2, "slg_z": 0.2,
                   "hr_rate_z": 0.15, "k_rate_z": -0.15, "bb_rate_z": 0.15}

        prospect_df["stats_score"] = sum(
            prospect_df[col] * weight for col, weight in weights.items()
        ) + prospect_df["pos_adj"]

        # Scale to 0-1. The scaler returns a single-column 2-D result;
        # flatten it so the column holds plain scalars.
        scaled = self.scaler.fit_transform(prospect_df[["stats_score"]])
        prospect_df["stats_score"] = np.asarray(scaled).ravel()

        return prospect_df

    def project_future_value(self, prospect_df):
        """Project future MLB value.

        Adds ``age_factor`` (1.0 at age 18, falling by 0.1 per year) and
        ``projection_score``. Requires ``tools_score`` and ``stats_score``.
        Modifies and returns ``prospect_df``.
        """
        prospect_df["age_factor"] = 1 - (prospect_df["age"] - 18) / 10

        # Simple projection model
        prospect_df["projection_score"] = (
            prospect_df["tools_score"] * 0.5 +
            prospect_df["stats_score"] * 0.3 +
            prospect_df["age_factor"] * 0.2
        )

        return prospect_df

    def calculate_final_rank(self, prospect_df):
        """Calculate final prospect ranking.

        Adds ``final_score`` (weighted sum of the three component scores)
        and ``rank`` (1 = best; ties share the average rank), and returns
        the rows sorted by rank.
        """
        prospect_df["final_score"] = (
            prospect_df["tools_score"] * self.weights["tools"] +
            prospect_df["stats_score"] * self.weights["stats"] +
            prospect_df["projection_score"] * self.weights["projection"]
        )

        prospect_df["rank"] = prospect_df["final_score"].rank(ascending=False)

        return prospect_df.sort_values("rank")

    def generate_report(self, prospect_df, top_n=100):
        """Generate prospect ranking report.

        Args:
            prospect_df: Raw prospect data. It is left unmodified.
            top_n: Number of top-ranked prospects to include.

        Returns:
            DataFrame of the ``top_n`` best prospects with identity columns
            and all component scores.
        """
        # Work on a copy so the caller's frame does not silently gain the
        # intermediate scoring columns.
        df = self.score_tools(prospect_df.copy())
        df = self.score_stats(df, {
            "C": 0.1, "SS": 0.08, "CF": 0.05,
            "2B": 0.02, "3B": 0.02, "RF": 0,
            "LF": -0.02, "1B": -0.05, "DH": -0.08
        })
        df = self.project_future_value(df)
        df = self.calculate_final_rank(df)

        report = df.head(top_n)[[
            "rank", "name", "age", "position", "organization",
            "tools_score", "stats_score", "projection_score", "final_score"
        ]]

        return report

# Usage
ranker = ProspectRanker()
# NOTE(review): prospects_df is not defined in this snippet; it must be
# supplied by the caller with the tool, stat, level, position, age, name and
# organization columns that ProspectRanker reads.
rankings = ranker.generate_report(prospects_df, top_n=50)
print("Top 50 Prospects:")
print(rankings)

python

Defensive Positioning Heatmap

Create defensive positioning heatmaps from tracking data.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
import seaborn as sns

def create_defensive_heatmap(tracking_df, fielder_position, team_id=None):
    """Create defensive positioning heatmap.

    Draws three panels for one fielding position at the moment of ball
    contact: a raw scatter, a 2-D histogram and a Gaussian-smoothed density.

    Args:
        tracking_df: DataFrame with ``position``, ``on_defense``, ``event``,
            ``x`` and ``y`` columns (and ``team_id`` when filtering by team).
        fielder_position: Value of ``position`` to plot.
        team_id: Optional team filter; ``None`` means all teams.

    Returns:
        The matplotlib figure holding the three panels.

    Raises:
        ValueError: If no ``ball_contact`` rows match the filters.
    """

    # Filter to defensive plays
    defense = tracking_df[
        (tracking_df["position"] == fielder_position) &
        (tracking_df["on_defense"] == True)
    ]

    # Compare against None so a legitimate team id of 0 is not ignored.
    if team_id is not None:
        defense = defense[defense["team_id"] == team_id]

    # Get positions at contact
    contact_positions = defense[defense["event"] == "ball_contact"]

    # Without data the bin edges below would be NaN and numpy would fail with
    # an unrelated-looking message; report the real cause instead.
    if contact_positions.empty:
        raise ValueError(
            f"No ball_contact rows found for position {fielder_position!r}"
        )

    xs = contact_positions["x"]
    ys = contact_positions["y"]

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # 1. Raw positioning scatter
    ax1 = axes[0]
    ax1.scatter(xs, ys, alpha=0.3, s=10)
    ax1.set_title(f"{fielder_position} - Raw Positions")
    ax1.set_xlabel("X (feet)")
    ax1.set_ylabel("Y (feet)")

    # 2. 2D Histogram heatmap
    ax2 = axes[1]
    h = ax2.hist2d(xs, ys, bins=50, cmap="YlOrRd")
    # hist2d returns (counts, xedges, yedges, image); the image feeds the colorbar.
    plt.colorbar(h[3], ax=ax2, label="Frequency")
    ax2.set_title(f"{fielder_position} - Positioning Density")

    # 3. Smoothed heatmap: histogram on a 100-edge grid, then Gaussian blur
    ax3 = axes[2]
    x_range = np.linspace(xs.min(), xs.max(), 100)
    y_range = np.linspace(ys.min(), ys.max(), 100)

    heatmap, xedges, yedges = np.histogram2d(xs, ys, bins=[x_range, y_range])
    heatmap = gaussian_filter(heatmap, sigma=2)

    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    # histogram2d puts x on the first axis; transpose so x runs horizontally.
    im = ax3.imshow(heatmap.T, origin="lower", extent=extent,
                    cmap="YlOrRd", aspect="auto")
    plt.colorbar(im, ax=ax3, label="Density")
    ax3.set_title(f"{fielder_position} - Smoothed Heatmap")

    plt.tight_layout()
    return fig

def shift_analysis(tracking_df, batter_id):
    """Analyze defensive shifts against a specific batter.

    Args:
        tracking_df: DataFrame with ``batter_id``, ``event``, ``position``,
            ``x`` and ``y`` columns.
        batter_id: Batter whose ``pitch`` events are analyzed.

    Returns:
        DataFrame with one row per fielding position that has data, with
        columns ``position``, ``avg_x``, ``avg_y``, ``std_x``, ``std_y``,
        ``shift_x`` and ``shift_y`` (average location minus the standard
        location). Empty, but with the same columns, when the batter has no
        matching rows.
    """

    # Get defensive positions when this batter is up
    batter_plays = tracking_df[
        (tracking_df["batter_id"] == batter_id) &
        (tracking_df["event"] == "pitch")
    ]

    # Standard (unshifted) location of each fielder, as (x, y)
    standard_positions = {
        "1B": (90, 30), "2B": (60, 90), "SS": (-60, 90),
        "3B": (-90, 30), "LF": (-180, 250), "CF": (0, 300), "RF": (180, 250)
    }

    columns = ["position", "avg_x", "avg_y", "std_x", "std_y",
               "shift_x", "shift_y"]

    rows = []
    for pos, (std_x, std_y) in standard_positions.items():
        pos_data = batter_plays[batter_plays["position"] == pos]
        if len(pos_data) == 0:
            continue
        avg_x = pos_data["x"].mean()
        avg_y = pos_data["y"].mean()
        rows.append({
            "position": pos,
            "avg_x": avg_x,
            "avg_y": avg_y,
            "std_x": pos_data["x"].std(),
            "std_y": pos_data["y"].std(),
            # Compare to standard positions
            "shift_x": avg_x - std_x,
            "shift_y": avg_y - std_y,
        })

    # Passing ``columns`` keeps the schema stable even when ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns)

def visualize_defensive_alignment(positions_df, title="Defensive Alignment"):
    """Draw fielder locations on a schematic field.

    Args:
        positions_df: DataFrame with ``position``, ``avg_x``, ``avg_y``,
            ``shift_x`` and ``shift_y`` columns, one row per fielder.
        title: Title placed above the plot.

    Returns:
        The ``(fig, ax)`` pair holding the drawing.
    """

    fig, ax = plt.subplots(figsize=(12, 10))

    # Field: infield dirt disc, the four bases, then the grass background.
    ax.add_patch(plt.Circle((0, 0), 95, color="peru", alpha=0.3))

    base_coords = ((0, 0), (63.6, 63.6), (0, 127.3), (-63.6, 63.6))
    for base_x, base_y in base_coords:
        ax.plot(base_x, base_y, "ws", markersize=15, markeredgecolor="black")

    ax.set_facecolor("forestgreen")

    # One marker and label per fielder.
    for _, fielder in positions_df.iterrows():
        here_x = fielder["avg_x"]
        here_y = fielder["avg_y"]

        ax.scatter(here_x, here_y, s=200, c="blue",
                   edgecolors="white", linewidth=2, zorder=5)
        ax.annotate(fielder["position"], (here_x, here_y),
                    fontsize=10, ha="center", va="bottom",
                    color="white", fontweight="bold")

        dx = fielder["shift_x"]
        dy = fielder["shift_y"]
        # Only shifts larger than 10 units on either axis get an arrow, drawn
        # from the standard spot (current minus shift) to the current spot.
        if abs(dx) > 10 or abs(dy) > 10:
            ax.annotate("",
                        xy=(here_x, here_y),
                        xytext=(here_x - dx, here_y - dy),
                        arrowprops=dict(arrowstyle="->", color="red", lw=2))

    ax.set_xlim(-350, 350)
    ax.set_ylim(-50, 400)
    ax.set_aspect("equal")
    ax.set_title(title)

    return fig, ax

# Usage
# NOTE(review): tracking_df is not defined in this snippet; it must be supplied
# by the caller with the columns the functions above read.
heatmap_fig = create_defensive_heatmap(tracking_df, "SS")
shift_df = shift_analysis(tracking_df, batter_id=12345)
# shift_df already carries the shift_x/shift_y columns the plot needs.
alignment_fig, ax = visualize_defensive_alignment(shift_df, "Shift vs Left-Handed Pull Hitter")

python Basketball

Shot Quality Analysis

Analyze shot quality metrics for basketball.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

def calculate_shot_quality(shots_df):
    """Calculate shot quality metrics.

    Adds two columns to a copy of ``shots_df``:

    * ``distance`` - straight-line distance from the hoop, computed from
      ``loc_x`` and ``loc_y``.
    * ``zone`` - one of "Restricted Area", "Paint (Non-RA)", "Mid-Range",
      "Corner 3", "Above Break 3" or "Long 2".

    NOTE(review): the thresholds (4, 8, 14, 22, 23.75) only make sense if
    ``loc_x``/``loc_y`` are in feet with the hoop at the origin - confirm
    against the data source.

    Returns:
        The copied DataFrame with the new columns.
    """
    shots_df = shots_df.copy()

    # Distance from hoop
    shots_df["distance"] = np.sqrt(
        shots_df["loc_x"] ** 2 + shots_df["loc_y"] ** 2
    )

    def get_zone(x, y, dist):
        if dist < 4:
            return "Restricted Area"
        if dist < 8:
            return "Paint (Non-RA)"
        if dist < 14:
            return "Mid-Range"
        # A corner three is taken from beside the basket: far out along the
        # sideline (|x| >= 22) but still low on the floor (y <= 7.8). Testing
        # only "y > 7.8" would label every longer shot from above that line,
        # including ordinary two-pointers, as a corner three.
        if abs(x) >= 22 and y <= 7.8:
            return "Corner 3"
        if dist >= 23.75:
            return "Above Break 3"
        return "Long 2"

    # A plain loop over the three columns also works for an empty frame,
    # where a row-wise DataFrame.apply does not yield a column.
    shots_df["zone"] = [
        get_zone(x, y, dist)
        for x, y, dist in zip(
            shots_df["loc_x"], shots_df["loc_y"], shots_df["distance"]
        )
    ]

    return shots_df

def train_shot_model(shots_df):
    """Train expected FG% model.

    Fits a logistic regression of ``is_made`` on five shot-context features
    and scores the training rows with it.

    Args:
        shots_df: DataFrame with ``distance``, ``shot_clock``, ``touch_time``,
            ``defender_distance``, ``dribbles`` and ``is_made`` columns. It is
            not modified.

    Returns:
        Tuple ``(model, df)`` where ``df`` holds the rows that had no missing
        feature or target, plus an ``expected_fg`` column with the predicted
        make probability.
    """
    features = [
        "distance",
        "shot_clock",
        "touch_time",
        "defender_distance",
        "dribbles"
    ]

    # Explicit copy so adding expected_fg below writes to an independent
    # frame rather than to a slice of the caller's data.
    df = shots_df.dropna(subset=features + ["is_made"]).copy()

    X = df[features]
    y = df["is_made"].astype(int)

    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)

    # Column 1 of predict_proba is the probability of the positive class.
    df["expected_fg"] = model.predict_proba(X)[:, 1]

    return model, df

def shot_quality_report(shots_df, player_name=None, league_avg_fg=0.45):
    """Generate shot quality report.

    Args:
        shots_df: DataFrame with ``zone``, ``is_made`` and ``expected_fg``
            columns (and ``player_name`` when filtering).
        player_name: Optional player to restrict the report to.
        league_avg_fg: Baseline field-goal percentage that the average
            expected FG% is compared against for ``shot_selection``.

    Returns:
        Dict with ``shots``, ``fg_pct``, ``expected_fg``, ``shot_selection``
        (expected FG% minus the baseline), ``shot_making`` (actual minus
        expected FG%) and ``zone_breakdown`` (per-zone DataFrame).
    """
    df = shots_df.copy()

    if player_name:
        df = df[df["player_name"] == player_name]

    # Zone breakdown
    zone_stats = df.groupby("zone").agg({
        "is_made": ["count", "sum", "mean"],
        "expected_fg": "mean"
    })
    zone_stats.columns = ["attempts", "makes", "fg_pct", "expected_fg"]

    # A zone is worth 3 points when its label contains "3", otherwise 2.
    # Built from the index directly so it also works when no shots match.
    shot_values = [3 if "3" in zone else 2 for zone in zone_stats.index]
    zone_stats["points_per_shot"] = zone_stats["fg_pct"] * shot_values

    # Shot quality metrics
    total_shots = len(df)
    avg_expected = df["expected_fg"].mean()
    actual_fg = df["is_made"].mean()

    # Shot making vs shot selection
    selection_value = avg_expected - league_avg_fg  # vs league avg
    making_value = actual_fg - avg_expected

    report = {
        "shots": total_shots,
        "fg_pct": actual_fg,
        "expected_fg": avg_expected,
        "shot_selection": selection_value,  # Positive = good shot selection
        "shot_making": making_value,        # Positive = makes tough shots
        "zone_breakdown": zone_stats
    }

    return report

def visualize_shot_chart(shots_df, player_name):
    """Create shot chart visualization.

    Plots one player's made shots (green circles) and missed shots (red
    crosses) over a simple court outline.

    NOTE(review): the paint rectangle, restricted-area circle and axis limits
    are all sized in tenths of a foot, so ``loc_x``/``loc_y`` are presumably
    expected in that unit here - confirm against the data source.

    Args:
        shots_df: DataFrame with ``player_name``, ``loc_x``, ``loc_y`` and
            ``is_made`` columns.
        player_name: Player whose shots are drawn.

    Returns:
        The ``(fig, ax)`` pair holding the chart.
    """
    player_shots = shots_df[shots_df["player_name"] == player_name]

    fig, ax = plt.subplots(figsize=(12, 11))

    # Draw court
    # Three-point circle: 23.75 ft expressed in the same tenths-of-a-foot
    # units as every other court element below. A radius of 23.75 would be
    # drawn smaller than the restricted area.
    court = plt.Circle((0, 0), 237.5, fill=False, color="black")
    ax.add_patch(court)

    # Paint
    ax.add_patch(plt.Rectangle((-80, -47.5), 160, 190, fill=False))

    # Restricted area
    ax.add_patch(plt.Circle((0, 0), 40, fill=False, color="black"))

    # Plot shots
    made = player_shots[player_shots["is_made"] == 1]
    missed = player_shots[player_shots["is_made"] == 0]

    ax.scatter(made["loc_x"], made["loc_y"], c="green", marker="o", s=30, alpha=0.6, label="Made")
    ax.scatter(missed["loc_x"], missed["loc_y"], c="red", marker="x", s=30, alpha=0.6, label="Missed")

    ax.set_xlim(-250, 250)
    ax.set_ylim(-50, 420)
    ax.set_aspect("equal")
    ax.legend()
    ax.set_title(f"{player_name} Shot Chart")

    return fig, ax

# Usage
# NOTE(review): shots_df must already exist before this line; it is replaced
# by a copy that also carries the distance and zone columns.
shots_df = calculate_shot_quality(shots_df)
model, shots_with_xfg = train_shot_model(shots_df)
report = shot_quality_report(shots_with_xfg, "Stephen Curry")

# Both values are fractions; multiply by 100 to print percentage points.
print(f"Shot Selection: {report['shot_selection']*100:+.1f}% vs avg")
print(f"Shot Making: {report['shot_making']*100:+.1f}% vs expected")

python

Lineup Optimization

Optimize batting lineup order using genetic algorithms.

import numpy as np
import pandas as pd
from typing import List, Tuple
import random
from deap import base, creator, tools, algorithms

class LineupOptimizer:
    """Optimize batting lineup using genetic algorithm.

    A lineup is a list of row positions into ``players_df``. Its fitness is
    the average number of runs scored over a number of simulated games.
    """

    # Order must match the probability list returned by _at_bat_outcomes.
    _OUTCOMES = ["out", "single", "double", "triple", "hr", "walk"]

    def __init__(self, players_df, simulation_games=1000):
        """Store the player pool and the number of games per evaluation.

        Args:
            players_df: DataFrame with ``avg``, ``hr_rate`` and ``bb_rate``
                columns, one row per player.
            simulation_games: Games simulated for each lineup evaluation.
        """
        self.players = players_df
        self.n_players = len(players_df)
        self.sim_games = simulation_games

    def simulate_game(self, lineup_order: List[int]) -> float:
        """Simulate game with given lineup order and return runs scored.

        Nine innings of three outs are played. The batting order carries
        over from inning to inning, but the bases are cleared at the start
        of every inning.
        """
        lineup_size = len(lineup_order)

        # Outcome probabilities depend only on the player, so compute them
        # once per lineup slot instead of once per plate appearance.
        slot_probs = [
            self._at_bat_outcomes(self.players.iloc[player_idx])
            for player_idx in lineup_order
        ]

        runs = 0
        batter_idx = 0

        for _ in range(9):
            outs = 0
            bases = [0, 0, 0]  # First, second, third - stranded runners do not carry over

            while outs < 3:
                outcome = np.random.choice(
                    self._OUTCOMES,
                    p=slot_probs[batter_idx % lineup_size]
                )

                if outcome == "out":
                    outs += 1
                else:
                    # Advance runners and score runs
                    runs_scored, bases = self._advance_runners(bases, outcome)
                    runs += runs_scored

                batter_idx += 1

        return runs

    def _at_bat_outcomes(self, player) -> List[float]:
        """Get outcome probabilities for player.

        Returns probabilities in the order out, single, double, triple, hr,
        walk. Every entry is non-negative and the list sums to 1, which
        ``np.random.choice`` requires.

        NOTE(review): a player whose non-out rates add up to 1 or more can
        never make an out, so a lineup made only of such players would never
        finish an inning.
        """
        # Simplified model
        raw = [
            (player["avg"] - player["hr_rate"] * 0.8) * 0.7,  # single
            player["avg"] * 0.2,                              # double
            player["avg"] * 0.03,                             # triple
            player["hr_rate"],                                # home run
            player["bb_rate"],                                # walk
        ]
        # Extreme stat lines can push an individual rate below zero.
        non_out = [max(0.0, p) for p in raw]
        out_pct = max(0.0, 1 - sum(non_out))

        probs = [out_pct] + non_out
        total = sum(probs)
        return [p / total for p in probs]

    def _advance_runners(self, bases: List[int], outcome: str) -> Tuple[int, List[int]]:
        """Advance runners based on outcome.

        Args:
            bases: Occupancy flags for first, second and third base.
            outcome: One of "single", "double", "triple", "hr", "walk".

        Returns:
            Tuple ``(runs_scored, new_bases)``.
        """
        runs = 0
        new_bases = [0, 0, 0]

        if outcome == "hr":
            runs = 1 + sum(bases)
        elif outcome == "triple":
            runs = sum(bases)
            new_bases[2] = 1
        elif outcome == "double":
            # Runners on second and third score; runner on first reaches third.
            runs = bases[1] + bases[2]
            new_bases[2] = bases[0]
            new_bases[1] = 1
        elif outcome == "single":
            # Every runner moves up exactly one base.
            runs = bases[2]
            new_bases[2] = bases[1]
            new_bases[1] = bases[0]
            new_bases[0] = 1
        elif outcome == "walk":
            # Runners advance only when forced by the runner behind them, so
            # a run scores only with the bases loaded.
            if bases[0] and bases[1] and bases[2]:
                runs = 1
            new_bases[2] = 1 if (bases[0] and bases[1]) else bases[2]
            new_bases[1] = 1 if bases[0] else bases[1]
            new_bases[0] = 1

        return runs, new_bases

    def evaluate_lineup(self, lineup: List[int]) -> Tuple[float]:
        """Evaluate lineup fitness.

        Returns a 1-tuple holding the mean runs per simulated game.
        """
        total_runs = sum(
            self.simulate_game(lineup)
            for _ in range(self.sim_games)
        )
        return (total_runs / self.sim_games,)

    def optimize(self, generations=100, population_size=50):
        """Run genetic algorithm optimization.

        Args:
            generations: Number of generations to evolve.
            population_size: Number of lineups per generation.

        Returns:
            Tuple ``(best_lineup, log)``: the best lineup found and the
            per-generation statistics logbook.
        """
        # Setup DEAP. creator.create registers classes globally, so only do
        # it the first time optimize() runs.
        if not hasattr(creator, "FitnessMax"):
            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        if not hasattr(creator, "Individual"):
            creator.create("Individual", list, fitness=creator.FitnessMax)

        toolbox = base.Toolbox()
        # NOTE(review): tools.cxOrdered appears to assume every individual is
        # a permutation of range(len(individual)); confirm before using a
        # player pool with more than 9 rows.
        toolbox.register("indices", random.sample, range(self.n_players), 9)
        toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.indices)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        toolbox.register("evaluate", self.evaluate_lineup)
        toolbox.register("mate", tools.cxOrdered)
        toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.2)
        toolbox.register("select", tools.selTournament, tournsize=3)

        pop = toolbox.population(n=population_size)
        hof = tools.HallOfFame(1)

        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("max", np.max)

        pop, log = algorithms.eaSimple(
            pop, toolbox,
            cxpb=0.7, mutpb=0.2,
            ngen=generations,
            stats=stats,
            halloffame=hof,
            verbose=True
        )

        best_lineup = hof[0]
        return best_lineup, log

# Usage
# NOTE(review): players_df is not defined in this snippet; it must be supplied
# by the caller with avg, hr_rate, bb_rate, name and position columns.
optimizer = LineupOptimizer(players_df, simulation_games=500)
best_lineup, log = optimizer.optimize(generations=50)

print("Optimal Lineup Order:")
# best_lineup holds row positions into players_df, in batting order.
for i, idx in enumerate(best_lineup, 1):
    player = players_df.iloc[idx]
    print(f"{i}. {player['name']} ({player['position']})")

python Baseball

Pitch Sequencing Analysis

Analyze pitch sequencing patterns and their effectiveness.

import pandas as pd
import numpy as np
from collections import Counter
from itertools import product

def analyze_pitch_sequences(pitches_df, pitcher_id):
    """Build pitch-to-pitch transition tables for one pitcher.

    Pitches are ordered by game, at-bat and pitch number, and each pitch is
    paired with the previous pitch of the same at-bat. The first pitch of an
    at-bat has no predecessor and is excluded.

    Returns:
        ``(transition_counts, transition_probs)``: rows are the previous
        pitch type, columns the following pitch type. Probabilities are the
        counts divided by their row totals.
    """
    at_bat_keys = ["game_id", "at_bat_number"]

    is_pitcher = pitches_df["pitcher_id"] == pitcher_id
    ordered = pitches_df.loc[is_pitcher].sort_values(at_bat_keys + ["pitch_number"])

    # Previous pitch type within the same at-bat (NaN for the first pitch).
    previous = ordered.groupby(at_bat_keys)["pitch_type"].shift(1)
    ordered = ordered.assign(prev_pitch=previous)

    followups = ordered.loc[ordered["prev_pitch"].notna()]

    pair_sizes = followups.groupby(["prev_pitch", "pitch_type"]).size()
    transition_counts = pair_sizes.unstack(fill_value=0)

    row_totals = transition_counts.sum(axis=1)
    transition_probs = transition_counts.div(row_totals, axis=0)

    return transition_counts, transition_probs

def sequence_effectiveness(pitches_df, pitcher_id, min_count=20):
    """Summarize outcomes of two-pitch sequences thrown by one pitcher.

    Each pitch (except the first of an at-bat) is labelled
    ``"<previous type> -> <current type>"`` and outcomes of the current pitch
    are aggregated per label.

    Args:
        pitches_df: Pitch-level frame with ``pitcher_id``, ``game_id``,
            ``at_bat_number``, ``pitch_number``, ``pitch_type``,
            ``is_strike``, ``is_swing``, ``is_whiff`` and ``delta_run_exp``.
        pitcher_id: Pitcher to analyse.
        min_count: Minimum number of occurrences a sequence needs to be kept.

    Returns:
        DataFrame indexed by sequence with columns ``count``, ``strike_pct``,
        ``swing_pct``, ``whiff_pct`` (whiffs per swing) and ``run_value``
        (mean ``delta_run_exp``), sorted by ascending ``run_value``.
    """
    at_bat_keys = ["game_id", "at_bat_number"]
    pitcher_pitches = pitches_df[
        pitches_df["pitcher_id"] == pitcher_id
    ].sort_values(at_bat_keys + ["pitch_number"])

    prev_pitch = pitcher_pitches.groupby(at_bat_keys)["pitch_type"].shift(1)
    sequences = pitcher_pitches.assign(prev_pitch=prev_pitch)
    sequences = sequences[sequences["prev_pitch"].notna()].copy()
    sequences["sequence"] = sequences["prev_pitch"] + " -> " + sequences["pitch_type"]

    # Whiffs and swings are summed per group and divided afterwards, so the
    # result does not depend on the frame having a unique index.
    summary = sequences.groupby("sequence").agg(
        count=("pitch_type", "count"),
        strike_pct=("is_strike", "mean"),
        swing_pct=("is_swing", "mean"),
        whiffs=("is_whiff", "sum"),
        swings=("is_swing", "sum"),
        run_value=("delta_run_exp", "mean"),
    )
    # Clip the denominator at 1 so sequences with no swings give 0, not NaN.
    summary["whiff_pct"] = summary["whiffs"] / summary["swings"].clip(lower=1)

    effectiveness = summary[
        ["count", "strike_pct", "swing_pct", "whiff_pct", "run_value"]
    ]
    effectiveness = effectiveness[effectiveness["count"] >= min_count]

    return effectiveness.sort_values("run_value")

def pitch_tunneling_analysis(pitches_df, pitcher_id):
    """Analyze pitch tunneling (how similar pitches look at release).

    Per pitch type, averages release point, plate location, speed and
    movement, then scores every ordered pair of pitch types as
    ``plate distance - release distance`` between their average locations.
    A larger score means the pair leaves the hand from a similar spot but
    arrives further apart.

    Returns:
        ``(pitch_types, tunnel_matrix)``: the per-pitch-type summary with
        flattened ``<column>_<stat>`` names, and a square DataFrame of
        tunnel scores indexed by pitch type on both axes.
    """
    pitcher = pitches_df[pitches_df["pitcher_id"] == pitcher_id]

    # Group by pitch type
    pitch_types = pitcher.groupby("pitch_type").agg({
        "release_pos_x": "mean",
        "release_pos_z": "mean",
        "release_extension": "mean",
        "plate_x": ["mean", "std"],
        "plate_z": ["mean", "std"],
        "release_speed": "mean",
        "pfx_x": "mean",
        "pfx_z": "mean"
    })

    # Because some entries request a list of stats, every column comes back
    # as a (column, stat) pair; flattening yields e.g. "release_pos_x_mean".
    pitch_types.columns = ["_".join(col).strip("_") for col in pitch_types.columns]

    pitch_list = pitch_types.index.tolist()
    tunnel_matrix = pd.DataFrame(index=pitch_list, columns=pitch_list, dtype=float)

    def _gap(first, second, x_col, z_col):
        # Straight-line distance between two pitch types' average (x, z).
        dx = pitch_types.loc[first, x_col] - pitch_types.loc[second, x_col]
        dz = pitch_types.loc[first, z_col] - pitch_types.loc[second, z_col]
        return np.sqrt(dx ** 2 + dz ** 2)

    for p1, p2 in product(pitch_list, pitch_list):
        # The flattened names carry the "_mean" suffix; the un-suffixed
        # names no longer exist after flattening.
        release_dist = _gap(p1, p2, "release_pos_x_mean", "release_pos_z_mean")
        plate_dist = _gap(p1, p2, "plate_x_mean", "plate_z_mean")

        # Good tunneling = small release distance, large plate distance
        tunnel_matrix.loc[p1, p2] = plate_dist - release_dist

    return pitch_types, tunnel_matrix

# Analyze a pitcher
# NOTE: `pitches_df` is not defined in this snippet; it must exist beforehand.
transitions, probs = analyze_pitch_sequences(pitches_df, pitcher_id=12345)
print("Pitch Transition Probabilities:")
print(probs)

# The result is sorted by ascending run_value, so head() shows the lowest
# mean delta_run_exp and tail() the highest.
effectiveness = sequence_effectiveness(pitches_df, pitcher_id=12345)
print("\nBest Sequences (by run value):")
print(effectiveness.head(10))

print("\nWorst Sequences:")
print(effectiveness.tail(5))

python

Game State Win Probability

Calculate real-time win probability based on current game state.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

class WinProbabilityModel:
    """Home-team win probability from inning and score differential.

    A logistic regression is fitted on degree-2 polynomial features of
    ``(inning, score_diff)``. Only a baseball trainer is implemented;
    ``sport`` is stored but not read by any method of this class.
    """

    def __init__(self, sport="baseball"):
        self.sport = sport
        self.model = None  # set by train_baseball_model
        self.poly = PolynomialFeatures(degree=2, include_bias=False)

    def train_baseball_model(self, historical_games_df):
        """Fit the model on per-inning game states and return ``self``.

        Expects one row per game with ``home_score_after_<n>`` and
        ``away_score_after_<n>`` for innings 1-9 plus a ``home_win`` label.
        """
        states = []

        for _, game in historical_games_df.iterrows():
            for inning in range(1, 10):
                score_diff = (
                    game[f"home_score_after_{inning}"]
                    - game[f"away_score_after_{inning}"]
                )
                # NOTE(review): both halves (top = +0, bottom = +0.5) reuse
                # the same end-of-inning score; confirm this is intended.
                for half in (0, 0.5):
                    states.append({
                        "inning": inning + half,
                        "score_diff": score_diff,
                        "home_win": game["home_win"],
                    })

        states_df = pd.DataFrame(states)

        X = states_df[["inning", "score_diff"]]
        X_poly = self.poly.fit_transform(X)
        y = states_df["home_win"]

        self.model = LogisticRegression(max_iter=1000)
        self.model.fit(X_poly, y)

        return self

    def get_win_probability(self, inning, score_diff, is_home_batting=True):
        """Return the predicted probability that the home team wins.

        Args:
            inning: Inning number.
            score_diff: Home score minus away score.
            is_home_batting: When true, 0.5 is added to the inning, matching
                the bottom-half encoding used during training.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained")

        inning_val = inning + (0.5 if is_home_batting else 0)

        X = np.array([[inning_val, score_diff]])
        X_poly = self.poly.transform(X)

        # Column 1 of predict_proba is the positive (home win) class.
        return self.model.predict_proba(X_poly)[0, 1]

    def plot_win_probability_curve(self, play_by_play_df, game_id):
        """Plot home win probability by play number for one game.

        Returns:
            The ``(fig, ax)`` pair of the created matplotlib figure.
        """
        game = play_by_play_df[play_by_play_df["game_id"] == game_id].copy()

        game["win_prob"] = [
            self.get_win_probability(
                play["inning"],
                play["home_score"] - play["away_score"],
                play["is_home_batting"]
            )
            for _, play in game.iterrows()
        ]

        # Everything is plotted against the positional play number.
        play_numbers = np.arange(len(game))
        win_prob = game["win_prob"].to_numpy()

        fig, ax = plt.subplots(figsize=(14, 6))

        ax.plot(play_numbers, win_prob, "b-", linewidth=2)
        ax.axhline(y=0.5, color="gray", linestyle="--", alpha=0.5)
        ax.fill_between(play_numbers, 0.5, win_prob,
                        where=win_prob >= 0.5, alpha=0.3, color="blue")
        ax.fill_between(play_numbers, win_prob, 0.5,
                        where=win_prob < 0.5, alpha=0.3, color="red")

        ax.set_ylim(0, 1)
        ax.set_ylabel("Home Win Probability")
        ax.set_xlabel("Play Number")
        ax.set_title(f"Win Probability Chart - Game {game_id}")

        # Mark scoring plays. Use positions, not the frame's index labels:
        # the filtered frame keeps the labels of the full play-by-play
        # table, which do not line up with the 0..n-1 x-axis.
        is_scoring = (game["runs_scored"] > 0).to_numpy()
        ax.scatter(play_numbers[is_scoring], win_prob[is_scoring],
                   c="red", s=100, zorder=5)

        plt.tight_layout()
        return fig, ax

# Usage
# NOTE: `historical_df` is not defined in this snippet; it must exist beforehand.
wp_model = WinProbabilityModel("baseball")
wp_model.train_baseball_model(historical_df)

# Current game state
# score_diff is home minus away; is_home_batting=False adds no half-inning offset.
current_wp = wp_model.get_win_probability(inning=7, score_diff=2, is_home_batting=False)
print(f"Home team win probability: {current_wp:.1%}")

python

Strain and Workload Monitoring

Monitor player workload and injury risk using training data.

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class WorkloadMonitor:
    """Track training-load trends and flag elevated injury risk."""

    def __init__(self):
        # NOTE(review): the rolling windows below count rows, so these are
        # days only if the data has exactly one row per day -- confirm.
        self.chronic_window = 28  # days
        self.acute_window = 7     # days

    def calculate_workload(self, training_df, player_id):
        """Return one player's rows, date-sorted, with workload columns added.

        Adds ``acute_load`` (rolling sum over the acute window),
        ``chronic_load`` (rolling mean over the chronic window, scaled to the
        acute window length; NaN until 7 rows exist) and ``acwr`` (acute
        divided by chronic, NaN where chronic is zero).
        """
        is_player = training_df["player_id"] == player_id
        history = training_df.loc[is_player].sort_values("date")
        load = history["training_load"]

        acute = load.rolling(window=self.acute_window, min_periods=1).sum()
        chronic_mean = load.rolling(window=self.chronic_window, min_periods=7).mean()
        chronic = chronic_mean * self.acute_window

        # A zero chronic load would divide by zero; report NaN instead.
        ratio = acute / chronic.replace(0, np.nan)

        return history.assign(acute_load=acute, chronic_load=chronic, acwr=ratio)

    def calculate_monotony_strain(self, training_df, player_id):
        """Return weekly load, monotony and strain for one player.

        Rows are grouped by ISO week number. Monotony is the daily mean
        divided by the daily standard deviation (NaN when the deviation is
        zero); strain is the weekly load times monotony.
        """
        is_player = training_df["player_id"] == player_id
        history = training_df.loc[is_player].sort_values("date")

        # NOTE(review): only the week number is used, so the same week of
        # different years falls into one group -- confirm single-season data.
        iso_week = history["date"].dt.isocalendar().week

        weekly = history.assign(week=iso_week).groupby("week").agg(
            weekly_load=("training_load", "sum"),
            daily_mean=("training_load", "mean"),
            daily_std=("training_load", "std"),
        )

        spread = weekly["daily_std"].replace(0, np.nan)
        weekly["monotony"] = weekly["daily_mean"] / spread
        weekly["strain"] = weekly["weekly_load"] * weekly["monotony"]

        return weekly

    def assess_injury_risk(self, player_data):
        """Score injury risk from the most recent row of workload data.

        Returns:
            Dict with ``risk_score`` (capped at 100), ``acwr``,
            ``risk_factors`` (list of ``(label, detail)`` tuples) and a
            textual ``recommendation``.
        """
        newest = player_data.iloc[-1]
        acwr = newest.get("acwr", 1.0)

        loads = player_data["training_load"]
        recent_load = loads.tail(7).sum()
        avg_load = loads.tail(28).mean() * 7

        under_loaded = acwr < 0.8
        over_loaded = acwr > 1.5

        risk_factors = []

        # ACWR zone label (ratios between the named zones get no label).
        if under_loaded:
            risk_factors.append(("Low fitness", "ACWR below 0.8"))
        elif over_loaded:
            risk_factors.append(("Spike in load", "ACWR above 1.5"))
        elif 1.0 <= acwr <= 1.25:
            risk_factors.append(("Optimal zone", "ACWR in sweet spot"))

        # Last 7 rows compared with the 28-row average scaled to 7 rows.
        if recent_load > avg_load * 1.3:
            risk_factors.append(("Week-to-week spike", f"{(recent_load/avg_load-1)*100:.0f}% increase"))

        score_rules = (
            (under_loaded or over_loaded, 30),
            (acwr > 1.75, 20),
            (recent_load > avg_load * 1.5, 25),
        )
        risk_score = 0
        for triggered, points in score_rules:
            if triggered:
                risk_score += points

        return {
            "risk_score": min(risk_score, 100),
            "acwr": acwr,
            "risk_factors": risk_factors,
            "recommendation": self._get_recommendation(risk_score, acwr)
        }

    def _get_recommendation(self, risk_score, acwr):
        """Map a risk score and ACWR to a one-line training recommendation."""
        if risk_score >= 50:
            return "Reduce training load, consider rest day"
        if acwr < 0.8:
            return "Gradually increase training load"
        if 1.0 <= acwr <= 1.25:
            return "Maintain current training plan"
        return "Monitor closely, avoid further increases"

# Usage
monitor = WorkloadMonitor()

# Calculate for each player
# NOTE: `training_df` is not defined in this snippet; it must exist beforehand.
for player_id in training_df["player_id"].unique():
    workload = monitor.calculate_workload(training_df, player_id)
    risk = monitor.assess_injury_risk(workload)

    # ACWR is NaN (printed as "nan") while the player has fewer than 7 rows,
    # because the chronic load needs at least 7 observations.
    print(f"Player {player_id}: Risk Score={risk['risk_score']}, ACWR={risk['acwr']:.2f}")
    print(f"  Recommendation: {risk['recommendation']}")

python Football

Route Running Analysis

Analyze NFL receiver routes using tracking data.

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

def extract_route_features(tracking_df, receiver_id, play_id):
    """Extract features from a single route.

    Selects the receiver's frames for one play, keeps the frames from the
    ``ball_snap`` event onward and summarizes distance, speed, acceleration
    and direction changes.

    Returns:
        A dict of features, or ``None`` when the play has fewer than 10
        frames for this receiver or contains no ``ball_snap`` event.
        ``speed_at_catch`` and ``separation_at_target`` are NaN when there is
        no ``pass_arrived`` event after the snap.
    """
    route = tracking_df[
        (tracking_df["nfl_id"] == receiver_id) &
        (tracking_df["play_id"] == play_id)
    ].sort_values("frame_id")

    if len(route) < 10:
        return None

    # Without a snap event there is no route start; skip the play.
    snap_frames = route.loc[route["event"] == "ball_snap", "frame_id"]
    if snap_frames.empty:
        return None
    route = route[route["frame_id"] >= snap_frames.iloc[0]]

    # Values at the first pass_arrived frame, if the pass ever arrived.
    arrival = route[route["event"] == "pass_arrived"]
    if arrival.empty:
        speed_at_catch = np.nan
        separation_at_target = np.nan
    else:
        speed_at_catch = arrival["s"].iloc[0]
        separation_at_target = arrival["separation"].iloc[0]

    max_acceleration = route["a"].max()

    # NOTE(review): depth is measured along "y" and lateral movement along
    # "x"; confirm this matches the coordinate convention of the data.
    return {
        "play_id": play_id,
        "receiver_id": receiver_id,

        # Distance metrics
        "total_distance": route["dis"].sum(),
        "max_depth": route["y"].max() - route["y"].iloc[0],
        "lateral_movement": abs(route["x"].max() - route["x"].min()),

        # Speed metrics
        "max_speed": route["s"].max(),
        "avg_speed": route["s"].mean(),
        "speed_at_catch": speed_at_catch,

        # Acceleration: the value at the frame of maximum acceleration is by
        # construction the maximum itself, so both keys carry the same number.
        "max_acceleration": max_acceleration,
        "break_acceleration": max_acceleration,

        # Frame-to-frame heading changes larger than 45 degrees
        "direction_changes": (route["dir"].diff().abs() > 45).sum(),

        # Separation at key moments
        "separation_at_target": separation_at_target,
    }

def classify_routes(route_features_df, n_clusters=9):
    """Cluster routes into types with k-means.

    Rows with complete values for the clustering features are standardized
    and clustered; their cluster id is written to a ``route_cluster`` column
    of ``route_features_df`` (the input frame is modified in place). Cluster
    ids are arbitrary k-means labels and carry no route-type meaning.

    Args:
        route_features_df: One row per route, as produced by
            ``extract_route_features``.
        n_clusters: Requested number of clusters; capped at the number of
            usable rows.

    Returns:
        ``(route_features_df, cluster_profiles)`` where ``cluster_profiles``
        holds the mean of each clustering feature per cluster.

    Raises:
        ValueError: If no row has complete clustering features.
    """
    features = [
        "max_depth", "lateral_movement", "direction_changes",
        "total_distance", "max_acceleration"
    ]

    usable = route_features_df[features].dropna()
    if usable.empty:
        raise ValueError("No routes with complete features to cluster")

    # A constant column (std 0) or a single row (std NaN) would turn the
    # whole column into NaN/inf; scale such columns by 1 instead.
    spread = usable.std()
    spread = spread.where(spread > 0, 1.0)
    scaled = (usable - usable.mean()) / spread

    # K-means cannot form more clusters than there are samples.
    k = min(n_clusters, len(usable))
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    route_features_df.loc[usable.index, "route_cluster"] = kmeans.fit_predict(scaled)

    # Analyze clusters
    cluster_profiles = route_features_df.groupby("route_cluster")[features].mean()

    return route_features_df, cluster_profiles

def analyze_receiver_routes(tracking_df, receiver_name):
    """Extract, cluster and summarize every route run by one receiver.

    The receiver is looked up by ``display_name``; features are extracted
    for each of their plays (plays for which extraction returns nothing are
    skipped) and the routes are then clustered.

    Returns:
        ``(route_df, summary)``: the per-route feature frame including
        ``route_cluster``, and a dict with route count, mean separation,
        mean top speed and the number of routes per cluster.
    """
    receiver_rows = tracking_df[tracking_df["display_name"] == receiver_name]
    receiver_id = receiver_rows["nfl_id"].iloc[0]

    extracted = (
        extract_route_features(tracking_df, receiver_id, play_id)
        for play_id in receiver_rows["play_id"].unique()
    )
    usable_features = [features for features in extracted if features]

    route_df, _cluster_profiles = classify_routes(pd.DataFrame(usable_features))

    cluster_sizes = route_df["route_cluster"].value_counts().to_dict()
    summary = {
        "receiver": receiver_name,
        "total_routes": len(route_df),
        "avg_separation": route_df["separation_at_target"].mean(),
        "avg_max_speed": route_df["max_speed"].mean(),
        "route_distribution": cluster_sizes
    }

    return route_df, summary

# Example run. NOTE: `tracking_df` is not defined in this snippet; it must
# exist beforehand and contain rows whose display_name matches the receiver.
route_df, summary = analyze_receiver_routes(tracking_df, "Justin Jefferson")
print(summary)

python

Draft Pick Value Calculator

Calculate and compare draft pick values across rounds.

import pandas as pd
import numpy as np
from scipy.optimize import curve_fit

class DraftValueCalculator:
    """Calculate draft pick values based on historical production.

    Pick value is an exponential-decay curve ``a * exp(-b * pick) + c``
    fitted to the mean ``career_war`` observed at each pick number.
    """

    def __init__(self, historical_draft_df):
        self.draft_data = historical_draft_df
        self.value_curve = None  # callable pick -> value; set by fit_value_curve

    def fit_value_curve(self):
        """Fit the value curve and return the per-pick table it was fitted on.

        Only pick numbers with at least 10 drafted players are used.

        Returns:
            DataFrame indexed by ``pick_number`` with mean ``career_war`` and
            ``sample_size``.
        """
        pick_values = self.draft_data.groupby("pick_number").agg({
            "career_war": "mean",
            "player_id": "count"
        }).rename(columns={"player_id": "sample_size"})

        # Filter to picks with enough sample
        pick_values = pick_values[pick_values["sample_size"] >= 10]

        def value_func(x, a, b, c):
            return a * np.exp(-b * x) + c

        params, _ = curve_fit(
            value_func,
            pick_values.index.values,
            pick_values["career_war"].values,
            p0=[10, 0.05, 0],
        )
        self.value_curve = lambda x: value_func(x, *params)

        return pick_values

    def get_pick_value(self, pick_number):
        """Return the expected value of a pick, never below zero.

        Fits the curve first if that has not been done yet.
        """
        if self.value_curve is None:
            self.fit_value_curve()
        return max(0, self.value_curve(pick_number))

    def compare_picks(self, pick1, pick2):
        """Return both pick values, their difference and their ratio.

        The ratio is infinite when the second pick's value is zero.
        """
        val1 = self.get_pick_value(pick1)
        val2 = self.get_pick_value(pick2)
        return {
            "pick1": pick1,
            "pick1_value": val1,
            "pick2": pick2,
            "pick2_value": val2,
            "difference": val1 - val2,
            "ratio": val1 / val2 if val2 > 0 else float("inf")
        }

    def equivalent_picks(self, pick_number, target_value_pct=1.0):
        """Find two-pick packages worth about as much as ``pick_number``.

        Searches first picks from ``pick_number + 5`` up to 99 and second
        picks at least 5 slots later, up to 199, keeping pairs whose combined
        value is within 0.5 of the target.

        Args:
            pick_number: The pick to match.
            target_value_pct: Fraction of that pick's value to aim for.

        Returns:
            Up to five dicts with ``picks`` (``[p1, p2]``) and
            ``total_value``, in search order.
        """
        target_value = self.get_pick_value(pick_number) * target_value_pct

        combinations = []
        for p1 in range(pick_number + 5, 100):
            # The first pick's value does not change across the inner loop.
            first_value = self.get_pick_value(p1)
            remaining = target_value - first_value
            for p2 in range(p1 + 5, 200):
                second_value = self.get_pick_value(p2)
                if abs(second_value - remaining) < 0.5:
                    combinations.append({
                        "picks": [p1, p2],
                        "total_value": first_value + second_value
                    })
                    # Only the first five matches are ever returned.
                    if len(combinations) == 5:
                        return combinations

        return combinations

    def trade_analyzer(self, team1_picks, team2_picks):
        """Compare the summed value of two lists of picks.

        Returns:
            Dict with both pick lists and their total values, the difference
            (team 1 minus team 2) and ``winner``: ``"Team 1"``, ``"Team 2"``
            or ``"Even"`` when the totals are equal.
        """
        team1_value = sum(self.get_pick_value(p) for p in team1_picks)
        team2_value = sum(self.get_pick_value(p) for p in team2_picks)

        if team1_value > team2_value:
            winner = "Team 1"
        elif team2_value > team1_value:
            winner = "Team 2"
        else:
            # Equal totals (e.g. both clamped to zero) favour neither side.
            winner = "Even"

        return {
            "team1_picks": team1_picks,
            "team1_value": team1_value,
            "team2_picks": team2_picks,
            "team2_value": team2_value,
            "difference": team1_value - team2_value,
            "winner": winner
        }

# Usage
# NOTE: `historical_drafts_df` is not defined in this snippet; it must exist
# beforehand with pick_number, career_war and player_id columns.
calculator = DraftValueCalculator(historical_drafts_df)
calculator.fit_value_curve()

# Compare picks
comparison = calculator.compare_picks(1, 10)
print(f"Pick 1 value: {comparison['pick1_value']:.2f} WAR")
print(f"Pick 10 value: {comparison['pick2_value']:.2f} WAR")

# Analyze trade
# The "winner" is whichever list of picks has the larger summed value.
trade = calculator.trade_analyzer(
    team1_picks=[5],
    team2_picks=[15, 25, 50]
)
print(f"\nTrade Winner: {trade['winner']}")
print(f"Value difference: {abs(trade['difference']):.2f} WAR")

python Basketball

Plus-Minus Rating Calculator

Calculate plus-minus ratings for basketball players with adjustments.

import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.linear_model import RidgeCV

def calculate_raw_plus_minus(play_by_play_df):
    """Collapse play-by-play rows into stints and list every player seen.

    Returns:
        ``(stints, player_list, player_idx)``: one row per ``stint_id`` with
        the first recorded lineup, summed points and possessions, the home
        ``margin`` and ``margin_per_100`` possessions; the sorted list of all
        players appearing in any lineup slot; and a player -> position map
        into that list.
    """
    home_slots = [f"home_{slot}" for slot in range(1, 6)]
    away_slots = [f"away_{slot}" for slot in range(1, 6)]
    lineup_slots = home_slots + away_slots

    # Every distinct, non-missing player across the ten lineup columns.
    roster = set()
    for slot in lineup_slots:
        roster.update(play_by_play_df[slot].dropna().unique())

    player_list = sorted(roster)
    player_idx = dict(zip(player_list, range(len(player_list))))

    # Lineups are constant within a stint, so the first row is taken;
    # points and possessions are totalled.
    how = {slot: "first" for slot in lineup_slots}
    how["home_points"] = "sum"
    how["away_points"] = "sum"
    how["possessions"] = "sum"
    stints = play_by_play_df.groupby("stint_id").agg(how)

    margin = stints["home_points"] - stints["away_points"]
    stints["margin"] = margin
    stints["margin_per_100"] = margin / stints["possessions"] * 100

    return stints, player_list, player_idx

def calculate_rapm(stints_df, player_list, player_idx, alpha_range=(0.01, 100)):
    """Estimate Regularized Adjusted Plus-Minus with ridge regression.

    Builds a sparse stint-by-player matrix (+1 for home players, -1 for away
    players on the floor), scales rows and targets by the square root of
    possessions, and fits ``RidgeCV`` over 50 log-spaced alphas within
    ``alpha_range``.

    Returns:
        DataFrame with ``player`` and ``rapm`` columns, best rating first.
    """
    signed_slots = (
        [(f"home_{slot}", 1) for slot in range(1, 6)]
        + [(f"away_{slot}", -1) for slot in range(1, 6)]
    )

    # COO-style triplets for the design matrix.
    row_ids, col_ids, values = [], [], []
    for row_pos, (_, stint) in enumerate(stints_df.iterrows()):
        for slot, sign in signed_slots:
            player = stint[slot]
            if pd.isna(player) or player not in player_idx:
                continue
            row_ids.append(row_pos)
            col_ids.append(player_idx[player])
            values.append(sign)

    shape = (len(stints_df), len(player_list))
    X = sparse.csr_matrix((values, (row_ids, col_ids)), shape=shape)
    y = stints_df["margin_per_100"].values

    # Longer stints count for more: scale rows and targets by sqrt(possessions).
    weights = np.sqrt(stints_df["possessions"].values)
    X_weighted = X.multiply(weights[:, np.newaxis])
    y_weighted = y * weights

    low, high = alpha_range[0], alpha_range[1]
    alphas = np.logspace(np.log10(low), np.log10(high), 50)
    model = RidgeCV(alphas=alphas, fit_intercept=True)
    model.fit(X_weighted, y_weighted)

    print(f"Best alpha: {model.alpha_:.2f}")

    rapm = pd.DataFrame({
        "player": player_list,
        "rapm": model.coef_
    })

    return rapm.sort_values("rapm", ascending=False)

# Calculate RAPM
# NOTE: `pbp_df` is not defined in this snippet; it must exist beforehand.
stints, player_list, player_idx = calculate_raw_plus_minus(pbp_df)
rapm_ratings = calculate_rapm(stints, player_list, player_idx)

# rapm_ratings is sorted by descending rapm, so head() is the top of the
# ranking and tail() the bottom.
print("Top Players by RAPM:")
print(rapm_ratings.head(20))

print("\nBottom Players by RAPM:")
print(rapm_ratings.tail(10))

python Baseball

Catch Probability Model

Model catch probability for outfield fly balls using Statcast tracking data.

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def calculate_catch_features(batted_balls_df, avg_sprint_speed=27.0):
    """Calculate features for catch probability.

    Works on a copy of the input and adds ``catch_distance``, ``hang_time``,
    ``time_needed``, ``time_margin``, ``angle_to_ball`` (degrees) and
    ``going_back`` (1 when the landing spot's y is beyond the fielder's).

    Args:
        batted_balls_df: Frame with ``fielder_start_x/y``, ``landing_x/y``,
            ``hit_time`` and ``pitch_time`` columns.
        avg_sprint_speed: Speed used to convert distance into time needed,
            in distance units per time unit (default 27, described as ft/s).

    Returns:
        The copied frame with the feature columns added.
    """
    df = batted_balls_df.copy()

    dx = df["landing_x"] - df["fielder_start_x"]
    dy = df["landing_y"] - df["fielder_start_y"]

    # Distance from fielder to ball landing spot
    df["catch_distance"] = np.sqrt(dx ** 2 + dy ** 2)

    # NOTE(review): computed as hit_time - pitch_time; confirm these columns
    # really bracket the ball's time in the air.
    df["hang_time"] = df["hit_time"] - df["pitch_time"]

    # Time the fielder needs at the assumed speed, and the slack left over
    df["time_needed"] = df["catch_distance"] / avg_sprint_speed
    df["time_margin"] = df["hang_time"] - df["time_needed"]

    # Direction of ball relative to fielder (forward/back/lateral)
    df["angle_to_ball"] = np.arctan2(dy, dx) * 180 / np.pi

    # Going back is harder
    df["going_back"] = (df["landing_y"] > df["fielder_start_y"]).astype(int)

    return df

def train_catch_probability_model(batted_balls_df):
    """Fit a fly-ball catch model and rank fielders by outs above average.

    Features are computed with ``calculate_catch_features``; only rows with
    ``hit_type == "fly_ball"`` and complete features are used. The model is
    trained on an 80% split, its accuracy on the held-out 20% is printed,
    and catch probabilities are then predicted for every usable row.

    Returns:
        ``(model, df, fielder_oaa)``: the fitted classifier, the row-level
        frame with ``catch_prob`` and ``oaa`` added, and a per-fielder table
        sorted by descending ``OAA``.
    """
    features = [
        "catch_distance",
        "hang_time",
        "time_margin",
        "going_back",
        "launch_angle",
        "exit_velocity",
        "fielder_sprint_speed"
    ]

    enriched = calculate_catch_features(batted_balls_df)
    fly_balls = enriched[enriched["hit_type"] == "fly_ball"]
    df = fly_balls.dropna(subset=features + ["was_caught"])

    X = df[features]
    y = df["was_caught"].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Probability of the positive class for every usable row, including
    # the rows the model was trained on.
    df["catch_prob"] = model.predict_proba(X)[:, 1]

    print(f"Model Accuracy: {model.score(X_test, y_test):.3f}")

    # Outs Above Average: actual outcome minus expected catch probability.
    df["oaa"] = df["was_caught"] - df["catch_prob"]

    fielder_oaa = df.groupby("fielder_name").agg(
        OAA=("oaa", "sum"),
        Expected_Catches=("catch_prob", "sum"),
        Opportunities=("catch_prob", "count"),
        Actual_Catches=("was_caught", "sum"),
    )
    fielder_oaa = fielder_oaa.sort_values("OAA", ascending=False)

    return model, df, fielder_oaa

# Example run. NOTE: `batted_balls_df` is not defined in this snippet; it
# must exist beforehand. The rankings come back sorted by descending OAA.
model, catches_df, fielder_rankings = train_catch_probability_model(batted_balls_df)
print("\nTop Outfielders by Outs Above Average:")
print(fielder_rankings.head(15))

python

Player Embeddings with Neural Networks

Create player embeddings using neural networks to find similar players.

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

class PlayerEmbeddingNet(nn.Module):
    """Autoencoder whose bottleneck activations serve as player embeddings.

    The encoder maps ``input_dim`` -> 64 -> 32 -> ``embedding_dim`` (with
    dropout after the first hidden layer); the decoder mirrors it back to
    ``input_dim``.
    """

    def __init__(self, input_dim, embedding_dim=32):
        super().__init__()

        encoder_layers = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, embedding_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(embedding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, embedding)`` for the input batch."""
        latent = self.encoder(x)
        return self.decoder(latent), latent

    def get_embedding(self, x):
        """Return the encoder output for ``x`` without tracking gradients."""
        with torch.no_grad():
            latent = self.encoder(x)
        return latent

def train_embedding_model(player_df, stat_columns, epochs=100):
    """Fit the autoencoder on player stats and return 16-d embeddings.

    Missing stats are filled with the column median, the stats are
    standardized, and the network is trained full-batch with Adam on the
    reconstruction MSE. The loss is printed every 20 epochs.

    Returns:
        ``(model, scaler, embedding_df)`` where ``embedding_df`` shares
        ``player_df``'s index and holds ``emb_<i>`` columns plus
        ``player_name``.
    """
    stats = player_df[stat_columns]
    filled = stats.fillna(stats.median())
    scaler = StandardScaler()
    inputs = torch.FloatTensor(scaler.fit_transform(filled))

    model = PlayerEmbeddingNet(len(stat_columns), embedding_dim=16)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        reconstructed, _latent = model(inputs)
        loss = loss_fn(reconstructed, inputs)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # Switch to eval mode so dropout is disabled when embedding.
    model.eval()
    embeddings = model.get_embedding(inputs).numpy()

    column_names = [f"emb_{i}" for i in range(embeddings.shape[1])]
    embedding_df = pd.DataFrame(embeddings, columns=column_names, index=player_df.index)
    embedding_df["player_name"] = player_df["name"]

    return model, scaler, embedding_df

def find_similar_players(embedding_df, player_name, n=10):
    """Return the ``n`` nearest players to ``player_name`` by cosine distance.

    The first neighbour returned by the search is dropped on the assumption
    that it is the queried player itself.

    Returns:
        DataFrame with ``player_name`` and ``similarity`` (1 - cosine
        distance) for the remaining neighbours, closest first.
    """
    emb_cols = [col for col in embedding_df.columns if col.startswith("emb_")]

    # Named `neighbor_search` rather than `nn` to avoid shadowing torch.nn.
    neighbor_search = NearestNeighbors(n_neighbors=n+1, metric="cosine")
    neighbor_search.fit(embedding_df[emb_cols])

    matching_rows = embedding_df[embedding_df["player_name"] == player_name]
    player_idx = matching_rows.index[0]
    query = embedding_df.loc[player_idx, emb_cols].values.reshape(1, -1)

    distances, indices = neighbor_search.kneighbors(query)
    neighbor_positions = indices[0][1:]   # skip the first hit (self)
    neighbor_distances = distances[0][1:]

    similar = embedding_df.iloc[neighbor_positions].copy()
    similar["similarity"] = 1 - neighbor_distances

    return similar[["player_name", "similarity"]]

# Usage
# NOTE: `players_df` is not defined in this snippet; it must exist beforehand
# with the stat columns below and a "name" column.
stat_cols = ["avg", "obp", "slg", "hr_rate", "bb_rate", "k_rate", "sprint_speed", "war"]
model, scaler, embeddings = train_embedding_model(players_df, stat_cols)

# Find players similar to a specific player
# The name must match a value in the player_name column exactly.
similar = find_similar_players(embeddings, "Mike Trout")
print("Players similar to Mike Trout:")
print(similar)

python Baseball

Pitch Classification Model

Classify pitch types using Statcast data with machine learning.

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def build_pitch_classifier(pitches_df):
    """Train and evaluate a gradient-boosting pitch-type classifier.

    Keeps seven common pitch types with complete features, standardizes the
    features, trains on a stratified 80% split and reports accuracy, a
    classification report and a confusion-matrix heatmap for the other 20%.

    Returns:
        Dict with the fitted ``model``, ``scaler`` and label ``encoder``,
        the test ``accuracy`` and a ``feature_importance`` table sorted by
        descending importance.
    """
    features = [
        "release_speed",
        "release_spin_rate",
        "release_extension",
        "release_pos_x",
        "release_pos_z",
        "pfx_x",      # Horizontal movement
        "pfx_z",      # Vertical movement
        "plate_x",
        "plate_z",
        "vx0", "vy0", "vz0",  # Initial velocities
        "ax", "ay", "az"      # Accelerations
    ]

    common_types = ["FF", "SL", "CH", "CU", "SI", "FC", "KC"]
    is_common = pitches_df["pitch_type"].isin(common_types)
    sample = pitches_df[is_common].copy()
    sample = sample.dropna(subset=features + ["pitch_type"])

    # Integer-encode the labels; classes_ keeps the original names.
    encoder = LabelEncoder()
    labels = encoder.fit_transform(sample["pitch_type"])

    scaler = StandardScaler()
    scaled = scaler.fit_transform(sample[features])

    X_train, X_test, y_train, y_test = train_test_split(
        scaled, labels, test_size=0.2, random_state=42, stratify=labels
    )

    model = GradientBoostingClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    )
    model.fit(X_train, y_train)

    predicted = model.predict(X_test)
    accuracy = model.score(X_test, y_test)

    print(f"Accuracy: {accuracy:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predicted, target_names=encoder.classes_))

    importance_table = pd.DataFrame({
        "feature": features,
        "importance": model.feature_importances_
    })
    importance = importance_table.sort_values("importance", ascending=False)

    # Confusion matrix heatmap with pitch-type names on both axes.
    matrix = confusion_matrix(y_test, predicted)
    class_names = encoder.classes_
    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
    plt.title("Pitch Type Classification Confusion Matrix")
    plt.ylabel("Actual")
    plt.xlabel("Predicted")

    return {
        "model": model,
        "scaler": scaler,
        "encoder": encoder,
        "accuracy": accuracy,
        "feature_importance": importance
    }

# Example run. NOTE: `statcast_df` is not defined in this snippet; it must
# exist beforehand. `result` is the dict returned by build_pitch_classifier.
result = build_pitch_classifier(statcast_df)

python Soccer

Expected Goals (xG) Model

Build expected goals model for soccer shot analysis using logistic regression.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

def calculate_shot_features(shots_df):
    """Engineer features for xG model.

    Works on a copy of ``shots_df`` (the input is not modified) and adds
    geometric and categorical features derived from the ``x``, ``y``,
    ``body_part`` and ``situation`` columns.

    Added columns:
        distance      Euclidean distance from the shot to the goal centre.
        angle         Absolute angle (degrees) between the shot and the
                      line through the goal centre.
        goal_angle    Angle (radians, in [0, pi]) subtended by the goal mouth.
        distance_sq   ``distance`` squared.
        is_header, is_foot, is_penalty, is_free_kick, is_corner
                      0/1 indicator columns.

    Returns:
        The augmented copy of ``shots_df``.
    """
    df = shots_df.copy()

    # Offsets from the goal centre, which is assumed to sit at (x=100, y=50).
    dx = 100 - df["x"]
    dy = df["y"] - 50

    # Distance to goal center
    df["distance"] = np.sqrt(dx ** 2 + dy ** 2)

    # Angle to goal
    df["angle"] = np.abs(np.arctan2(dy, dx)) * 180 / np.pi

    # Goal mouth angle (visible goal width).
    # NOTE(review): goal_width is in meters while x/y look like a 0-100
    # scale -- confirm the coordinates are in compatible units.
    goal_width = 7.32  # meters
    # arctan2 instead of arctan(numerator / denominator): for shots close to
    # the goal the denominator is negative, where plain arctan returns a
    # negative angle instead of one above 90 degrees, and it yields NaN/inf
    # when the denominator is exactly zero.
    df["goal_angle"] = np.arctan2(
        goal_width * dx,
        dx ** 2 + dy ** 2 - (goal_width / 2) ** 2,
    )

    # Distance squared (non-linear effect)
    df["distance_sq"] = df["distance"] ** 2

    # Shot type encoding
    df["is_header"] = (df["body_part"] == "head").astype(int)
    df["is_foot"] = (df["body_part"] == "foot").astype(int)

    # Situation encoding
    df["is_penalty"] = (df["situation"] == "penalty").astype(int)
    df["is_free_kick"] = (df["situation"] == "free_kick").astype(int)
    df["is_corner"] = (df["situation"] == "from_corner").astype(int)

    return df

def train_xg_model(shots_df):
    """Train expected goals model.

    Builds features with ``calculate_shot_features``, fits an isotonic-
    calibrated logistic regression on all shots, prints a 5-fold
    cross-validated log loss, attaches an ``xg`` column and draws a
    predicted-vs-actual plot by distance bin.

    Args:
        shots_df: Shot-level DataFrame; must contain the columns required by
            ``calculate_shot_features`` plus ``is_goal``.

    Returns:
        Tuple ``(model, df)``: the fitted ``CalibratedClassifierCV`` and the
        feature DataFrame with the added ``xg`` column.
    """
    df = calculate_shot_features(shots_df)

    features = [
        "distance", "distance_sq", "angle", "goal_angle",
        "is_header", "is_penalty", "is_free_kick", "is_corner"
    ]

    # Missing feature values are treated as 0.
    X = df[features].fillna(0)
    y = df["is_goal"].astype(int)

    # Calibrated logistic regression for accurate probabilities
    base_model = LogisticRegression(max_iter=1000, C=1.0)
    model = CalibratedClassifierCV(base_model, cv=5, method="isotonic")
    model.fit(X, y)

    # Cross-validation score.
    # NOTE: this scores the uncalibrated base estimator, not the calibrated
    # model returned below.
    cv_scores = cross_val_score(base_model, X, y, cv=5, scoring="neg_log_loss")
    print(f"CV Log Loss: {-cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

    # Add predictions (in-sample: the model was fitted on these same rows)
    df["xg"] = model.predict_proba(X)[:, 1]

    # xG by distance visualization
    bins = pd.cut(df["distance"], bins=10)
    # observed=False is explicit so empty distance bins keep their slot on
    # the x-axis regardless of the pandas version's default, and so the
    # categorical-groupby deprecation warning is not triggered.
    xg_by_dist = df.groupby(bins, observed=False).agg({
        "xg": "mean",
        "is_goal": "mean"
    })

    plt.figure(figsize=(10, 6))
    plt.plot(range(len(xg_by_dist)), xg_by_dist["xg"], "b-", label="Predicted xG")
    plt.plot(range(len(xg_by_dist)), xg_by_dist["is_goal"], "r--", label="Actual Goal Rate")
    plt.xlabel("Distance Bin")
    plt.ylabel("Probability")
    plt.legend()
    plt.title("xG Calibration by Distance")

    return model, df

model, shots_with_xg = train_xg_model(shots_df)

# Per-player totals: summed xG next to goals actually scored.
player_xg = shots_with_xg.groupby("player_name").agg(
    xg=("xg", "sum"),
    goals=("is_goal", "sum"),
)
# Positive xg_diff means the player scored more than the model expected.
player_xg["xg_diff"] = player_xg["goals"] - player_xg["xg"]
print(player_xg.sort_values("xg", ascending=False).head(20))

sql

Streak Detection Query

Find winning/losing streaks and hitting streaks.

-- Detect hitting streaks
-- Gaps-and-islands: number the player's games in date order, number them
-- again within each had_hit value, and use the difference of the two
-- sequences as an identifier that stays constant across consecutive games
-- sharing the same had_hit value.
WITH batter_games AS (
    -- One row per 2024 game with at least one at-bat for the requested player
    SELECT
        player_id,
        game_date,
        hits,
        CASE WHEN hits > 0 THEN 1 ELSE 0 END AS had_hit,
        ROW_NUMBER() OVER (PARTITION BY player_id ORDER BY game_date) AS season_seq
    FROM game_log
    WHERE player_id = ?
      AND season = 2024
      AND at_bats > 0
),
islands AS (
    SELECT
        bg.*,
        bg.season_seq - ROW_NUMBER() OVER (
            PARTITION BY bg.player_id, bg.had_hit
            ORDER BY bg.game_date
        ) AS island_id
    FROM batter_games bg
),
runs AS (
    -- Collapse each island into a single streak row
    SELECT
        player_id,
        MIN(game_date) AS streak_start,
        MAX(game_date) AS streak_end,
        COUNT(*) AS streak_length,
        SUM(hits) AS total_hits,
        had_hit
    FROM islands
    GROUP BY player_id, island_id, had_hit
)
SELECT
    streak_start,
    streak_end,
    streak_length,
    total_hits,
    CASE WHEN had_hit = 1 THEN 'Hit Streak' ELSE 'Hitless Streak' END AS streak_type
FROM runs
WHERE streak_length >= 5  -- Minimum 5 games
  AND had_hit = 1  -- Only hit streaks
ORDER BY streak_length DESC;

-- Team winning streak detection
-- Lists the 20 longest winning or losing streaks (5+ games) of the 2024
-- season, using the gaps-and-islands technique on each team's game sequence.
WITH team_games AS (
    SELECT
        team_id,
        game_date,
        CASE WHEN won = 1 THEN 1 ELSE 0 END AS won,
        -- Position of the game in the team's date-ordered season
        ROW_NUMBER() OVER (PARTITION BY team_id ORDER BY game_date) AS game_num
    FROM (
        -- Each game contributes two rows: one from the home team's
        -- perspective and one from the away team's.
        SELECT
            home_team_id AS team_id,
            game_date,
            CASE WHEN home_score > away_score THEN 1 ELSE 0 END AS won
        FROM games WHERE season = 2024
        UNION ALL
        SELECT
            away_team_id AS team_id,
            game_date,
            -- A tied score yields won = 0 for both sides
            CASE WHEN away_score > home_score THEN 1 ELSE 0 END AS won
        FROM games WHERE season = 2024
    ) all_games
),
streak_groups AS (
    SELECT
        *,
        -- Overall sequence number minus the sequence number within the same
        -- result: constant across a run of consecutive identical results.
        game_num - ROW_NUMBER() OVER (
            PARTITION BY team_id, won ORDER BY game_date
        ) AS streak_group
    FROM team_games
)
SELECT
    t.team_name,
    MIN(sg.game_date) AS streak_start,
    MAX(sg.game_date) AS streak_end,
    COUNT(*) AS streak_length,
    CASE WHEN sg.won = 1 THEN 'Winning' ELSE 'Losing' END AS streak_type
FROM streak_groups sg
JOIN teams t ON sg.team_id = t.team_id
-- NOTE(review): groups by team_name rather than team_id; assumes team names
-- are unique -- confirm against the teams table.
GROUP BY t.team_name, sg.streak_group, sg.won
HAVING COUNT(*) >= 5
ORDER BY streak_length DESC
LIMIT 20;

sql

Split Statistics Query

Calculate player splits: home/away and vs. left-/right-handed pitchers.

-- Player performance splits
-- Returns one row per split (Home, Away, vs LHP, vs RHP) for a single player
-- in the 2024 season. The same player id must be bound to all four `?`
-- placeholders. Output column names come from the first SELECT; the later
-- SELECTs supply their values positionally.
SELECT
    p.name,
    -- Home/Away splits
    'Home' AS split_type,
    SUM(CASE WHEN gl.is_home = 1 THEN gl.at_bats ELSE 0 END) AS ab,
    SUM(CASE WHEN gl.is_home = 1 THEN gl.hits ELSE 0 END) AS h,
    -- Batting average; NULLIF turns a zero at-bat total into NULL instead of
    -- a division-by-zero error.
    ROUND(
        SUM(CASE WHEN gl.is_home = 1 THEN gl.hits ELSE 0 END) * 1.0 /
        NULLIF(SUM(CASE WHEN gl.is_home = 1 THEN gl.at_bats ELSE 0 END), 0),
        3
    ) AS avg,
    SUM(CASE WHEN gl.is_home = 1 THEN gl.home_runs ELSE 0 END) AS hr,
    SUM(CASE WHEN gl.is_home = 1 THEN gl.rbi ELSE 0 END) AS rbi
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name

UNION ALL

-- Away games
SELECT
    p.name,
    'Away' AS split_type,
    SUM(CASE WHEN gl.is_home = 0 THEN gl.at_bats ELSE 0 END),
    SUM(CASE WHEN gl.is_home = 0 THEN gl.hits ELSE 0 END),
    ROUND(
        SUM(CASE WHEN gl.is_home = 0 THEN gl.hits ELSE 0 END) * 1.0 /
        NULLIF(SUM(CASE WHEN gl.is_home = 0 THEN gl.at_bats ELSE 0 END), 0),
        3
    ),
    SUM(CASE WHEN gl.is_home = 0 THEN gl.home_runs ELSE 0 END),
    SUM(CASE WHEN gl.is_home = 0 THEN gl.rbi ELSE 0 END)
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name

UNION ALL

-- vs Left-handed pitchers
-- The inner join on opposing_pitcher_id drops game rows without a matching
-- pitcher in players.
-- NOTE(review): game_log appears to hold one opposing pitcher per game row;
-- confirm whether that is the starter only.
SELECT
    p.name,
    'vs LHP' AS split_type,
    SUM(CASE WHEN opp.throws = 'L' THEN gl.at_bats ELSE 0 END),
    SUM(CASE WHEN opp.throws = 'L' THEN gl.hits ELSE 0 END),
    ROUND(
        SUM(CASE WHEN opp.throws = 'L' THEN gl.hits ELSE 0 END) * 1.0 /
        NULLIF(SUM(CASE WHEN opp.throws = 'L' THEN gl.at_bats ELSE 0 END), 0),
        3
    ),
    SUM(CASE WHEN opp.throws = 'L' THEN gl.home_runs ELSE 0 END),
    SUM(CASE WHEN opp.throws = 'L' THEN gl.rbi ELSE 0 END)
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
JOIN players opp ON gl.opposing_pitcher_id = opp.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name

UNION ALL

-- vs Right-handed pitchers
SELECT
    p.name,
    'vs RHP' AS split_type,
    SUM(CASE WHEN opp.throws = 'R' THEN gl.at_bats ELSE 0 END),
    SUM(CASE WHEN opp.throws = 'R' THEN gl.hits ELSE 0 END),
    ROUND(
        SUM(CASE WHEN opp.throws = 'R' THEN gl.hits ELSE 0 END) * 1.0 /
        NULLIF(SUM(CASE WHEN opp.throws = 'R' THEN gl.at_bats ELSE 0 END), 0),
        3
    ),
    SUM(CASE WHEN opp.throws = 'R' THEN gl.home_runs ELSE 0 END),
    SUM(CASE WHEN opp.throws = 'R' THEN gl.rbi ELSE 0 END)
FROM players p
JOIN game_log gl ON p.player_id = gl.player_id
JOIN players opp ON gl.opposing_pitcher_id = opp.player_id
WHERE p.player_id = ? AND gl.season = 2024
GROUP BY p.name;

sql

Fantasy Points Calculation

Calculate fantasy sports points based on scoring rules.

-- Calculate fantasy baseball points (standard scoring)
-- Both stat tables are LEFT JOINed, so a player may have batting stats,
-- pitching stats, both, or neither. Each points total falls back to 0 when
-- its stat row is missing; without that, a pitcher with no batting row would
-- get NULL batting points, a NULL total, and be removed by the games filter.
WITH fantasy_scoring AS (
    SELECT
        p.player_id,
        p.name,
        pos.position,
        t.team_name,
        -- Use batting games when available, otherwise pitching games, so
        -- pitchers without a batting row can still meet the games minimum.
        COALESCE(s.games, ps.games) AS games,
        -- Batting points (0 when the player has no 2024 batting row)
        COALESCE(
            s.hits * 1 +                    -- 1 pt per hit
            s.doubles * 1 +                 -- +1 for 2B (total 2)
            s.triples * 2 +                 -- +2 for 3B (total 3)
            s.home_runs * 3 +               -- +3 for HR (total 4)
            s.rbi * 1 +                     -- 1 pt per RBI
            s.runs * 1 +                    -- 1 pt per run
            s.walks * 1 +                   -- 1 pt per walk
            s.stolen_bases * 2 +            -- 2 pts per SB
            s.caught_stealing * -1 +        -- -1 per CS
            s.strikeouts * -0.5,            -- -0.5 per K
            0
        ) AS batting_points,
        -- Pitching points (each term is 0 when the stat is missing)
        COALESCE(ps.innings_pitched * 3, 0) +    -- 3 pts per IP
        COALESCE(ps.strikeouts * 1, 0) +         -- 1 pt per K
        COALESCE(ps.wins * 5, 0) +               -- 5 pts per W
        COALESCE(ps.saves * 5, 0) +              -- 5 pts per SV
        COALESCE(ps.earned_runs * -2, 0) +       -- -2 per ER
        COALESCE(ps.walks * -1, 0) +             -- -1 per BB
        COALESCE(ps.hits_allowed * -1, 0)        -- -1 per hit
        AS pitching_points
    FROM players p
    -- NOTE(review): a player with several rows in player_positions will
    -- appear once per position -- confirm that is intended.
    JOIN player_positions pos ON p.player_id = pos.player_id
    JOIN teams t ON p.team_id = t.team_id
    LEFT JOIN batting_stats s ON p.player_id = s.player_id AND s.season = 2024
    LEFT JOIN pitching_stats ps ON p.player_id = ps.player_id AND ps.season = 2024
)
SELECT
    player_id,
    name,
    position,
    team_name,
    games,
    ROUND(batting_points, 1) AS batting_pts,
    ROUND(pitching_points, 1) AS pitching_pts,
    ROUND(batting_points + pitching_points, 1) AS total_fantasy_pts,
    ROUND((batting_points + pitching_points) / NULLIF(games, 0), 2) AS pts_per_game,
    RANK() OVER (ORDER BY batting_points + pitching_points DESC) AS overall_rank,
    RANK() OVER (PARTITION BY position ORDER BY batting_points + pitching_points DESC) AS position_rank
FROM fantasy_scoring
WHERE games >= 20
ORDER BY total_fantasy_pts DESC
LIMIT 100;

sql

Team Standings with Run Differential

Calculate team standings with Pythagorean win expectation.

-- Team standings with Pythagorean expectation
WITH team_records AS (
    -- One row per team: record and run totals over completed 2024 games
    SELECT
        t.team_id,
        t.team_name,
        t.division,
        t.league,
        SUM(CASE WHEN g.winner_id = t.team_id THEN 1 ELSE 0 END) AS wins,
        SUM(CASE WHEN g.loser_id = t.team_id THEN 1 ELSE 0 END) AS losses,
        SUM(CASE WHEN g.home_team_id = t.team_id THEN g.home_score ELSE g.away_score END) AS runs_scored,
        SUM(CASE WHEN g.home_team_id = t.team_id THEN g.away_score ELSE g.home_score END) AS runs_allowed
    FROM teams t
    JOIN games g ON t.team_id = g.home_team_id OR t.team_id = g.away_team_id
    WHERE g.season = 2024 AND g.status = 'Final'
    GROUP BY t.team_id, t.team_name, t.division, t.league
),
team_metrics AS (
    SELECT
        tr.*,
        -- Pythagorean expectation (exponent 1.83 for baseball); NULL when a
        -- team has neither scored nor allowed a run.
        POWER(tr.runs_scored, 1.83) /
        NULLIF(POWER(tr.runs_scored, 1.83) + POWER(tr.runs_allowed, 1.83), 0) AS pythag,
        -- Division leader = team with the most games above .500. The leader's
        -- wins and losses come from the same team, which independent
        -- MAX(wins) / MIN(losses) would not guarantee.
        -- League is part of the partition in case division names repeat
        -- across leagues.
        FIRST_VALUE(tr.wins) OVER (
            PARTITION BY tr.league, tr.division
            ORDER BY tr.wins - tr.losses DESC
        ) AS leader_wins,
        FIRST_VALUE(tr.losses) OVER (
            PARTITION BY tr.league, tr.division
            ORDER BY tr.wins - tr.losses DESC
        ) AS leader_losses
    FROM team_records tr
)
SELECT
    team_name,
    division,
    wins,
    losses,
    ROUND(wins * 1.0 / NULLIF(wins + losses, 0), 3) AS win_pct,
    runs_scored AS RS,
    runs_allowed AS RA,
    runs_scored - runs_allowed AS run_diff,
    ROUND(pythag, 3) AS pythag_pct,
    ROUND((wins + losses) * pythag, 0) AS expected_wins,
    -- Actual wins minus Pythagorean expected wins
    wins - ROUND((wins + losses) * pythag, 0) AS luck_factor,
    -- Games behind division leader:
    -- ((leader wins - wins) + (losses - leader losses)) / 2
    ((leader_wins - wins) + (losses - leader_losses)) / 2.0 AS games_behind
FROM team_metrics
ORDER BY division, wins DESC;

sql

Park Factor Calculation

Calculate park factors to adjust for home ballpark effects.

-- Calculate park factors for each ballpark
-- Park factor = (combined per-game rate in the team's home games at the
-- venue) / (combined per-game rate in the team's road games). Values above
-- 1 mean more of that event happens at the park than in the team's road games.
WITH park_stats AS (
    -- Totals for both teams in completed 2024 games, per venue and home team
    SELECT
        v.venue_id,
        v.venue_name,
        t.team_id AS home_team_id,
        t.team_name AS home_team,
        SUM(g.home_score + g.away_score) AS total_runs,
        SUM(g.home_hits + g.away_hits) AS total_hits,
        SUM(g.home_hr + g.away_hr) AS total_hr,
        COUNT(*) AS home_games
    FROM games g
    JOIN venues v ON g.venue_id = v.venue_id
    JOIN teams t ON g.home_team_id = t.team_id
    WHERE g.season = 2024
      AND g.status = 'Final'
    GROUP BY v.venue_id, v.venue_name, t.team_id, t.team_name
),
road_stats AS (
    -- Totals for both teams in games where the team was the visitor.
    -- The join already restricts rows to the team's away games, so plain
    -- sums and COUNT(*) are sufficient.
    SELECT
        t.team_id,
        t.team_name,
        SUM(g.home_score + g.away_score) AS total_runs,
        SUM(g.home_hits + g.away_hits) AS total_hits,
        SUM(g.home_hr + g.away_hr) AS total_hr,
        COUNT(*) AS road_games
    FROM games g
    JOIN teams t ON g.away_team_id = t.team_id
    WHERE g.season = 2024
      AND g.status = 'Final'
    GROUP BY t.team_id, t.team_name
)
SELECT
    ps.venue_name,
    ps.home_team,
    ps.home_games,
    -- Multiplying by 1.0 forces decimal division on engines where
    -- integer / integer truncates; NULLIF guards zero denominators.
    -- Runs park factor
    ROUND(
        (ps.total_runs * 1.0 / NULLIF(ps.home_games, 0)) /
        NULLIF(rs.total_runs * 1.0 / NULLIF(rs.road_games, 0), 0),
        3
    ) AS runs_pf,
    -- Hits park factor
    ROUND(
        (ps.total_hits * 1.0 / NULLIF(ps.home_games, 0)) /
        NULLIF(rs.total_hits * 1.0 / NULLIF(rs.road_games, 0), 0),
        3
    ) AS hits_pf,
    -- HR park factor
    ROUND(
        (ps.total_hr * 1.0 / NULLIF(ps.home_games, 0)) /
        NULLIF(rs.total_hr * 1.0 / NULLIF(rs.road_games, 0), 0),
        3
    ) AS hr_pf,
    -- Average runs per game at park
    ROUND(ps.total_runs * 1.0 / ps.home_games, 2) AS runs_per_game,
    ROUND(ps.total_hr * 1.0 / ps.home_games, 2) AS hr_per_game
FROM park_stats ps
-- Match home and road totals by team id rather than by team name
JOIN road_stats rs ON ps.home_team_id = rs.team_id
ORDER BY runs_pf DESC;

sql

Advanced Pitching Metrics

Calculate FIP, xFIP, and other advanced pitching metrics.

-- Calculate FIP and advanced pitching metrics
WITH league_constants AS (
    SELECT
        season,
        -- FIP constant = lgERA - ((13*lgHR + 3*(lgBB + lgHBP) - 2*lgK) / lgIP)
        -- lgERA is computed from league totals (9 * ER / IP). An unweighted
        -- AVG(era) would give a pitcher with a handful of innings the same
        -- weight as a full-season starter.
        -- NOTE(review): assumes innings_pitched stores true fractional
        -- innings, not the x.1 / x.2 box-score notation -- confirm.
        9.0 * SUM(earned_runs) / NULLIF(SUM(innings_pitched), 0) - (
            (13.0 * SUM(home_runs) + 3 * (SUM(walks) + SUM(hbp)) - 2 * SUM(strikeouts)) /
            NULLIF(SUM(innings_pitched), 0)
        ) AS fip_constant,
        -- League HR/FB rate for xFIP
        SUM(home_runs) * 1.0 / NULLIF(SUM(fly_balls), 0) AS lg_hr_fb_rate
    FROM pitching_stats
    WHERE season = 2024
    GROUP BY season
),
pitcher_metrics AS (
    SELECT
        p.player_id,
        p.name,
        t.team_name,
        ps.season,
        ps.games,
        ps.games_started,
        ps.innings_pitched AS ip,
        ps.wins,
        ps.losses,
        ps.saves,
        ps.strikeouts AS k,
        ps.walks AS bb,
        ps.hbp,
        ps.home_runs AS hr,
        ps.earned_runs,
        ps.fly_balls,
        ps.era,
        -- FIP = ((13*HR + 3*(BB+HBP) - 2*K) / IP) + FIP_constant
        -- The 13.0 literal keeps the division in decimal arithmetic.
        ROUND(
            ((13.0 * ps.home_runs + 3 * (ps.walks + ps.hbp) - 2 * ps.strikeouts) /
             NULLIF(ps.innings_pitched, 0)) + lc.fip_constant,
            2
        ) AS fip,
        -- xFIP uses league average HR/FB rate
        ROUND(
            ((13.0 * (ps.fly_balls * lc.lg_hr_fb_rate) + 3 * (ps.walks + ps.hbp) - 2 * ps.strikeouts) /
             NULLIF(ps.innings_pitched, 0)) + lc.fip_constant,
            2
        ) AS xfip,
        -- K/9
        ROUND(ps.strikeouts * 9.0 / NULLIF(ps.innings_pitched, 0), 2) AS k_9,
        -- BB/9
        ROUND(ps.walks * 9.0 / NULLIF(ps.innings_pitched, 0), 2) AS bb_9,
        -- K/BB ratio
        ROUND(ps.strikeouts * 1.0 / NULLIF(ps.walks, 0), 2) AS k_bb,
        -- HR/9
        ROUND(ps.home_runs * 9.0 / NULLIF(ps.innings_pitched, 0), 2) AS hr_9,
        -- WHIP = (BB + H) / IP
        ROUND((ps.walks + ps.hits) * 1.0 / NULLIF(ps.innings_pitched, 0), 3) AS whip,
        -- BABIP = (H - HR) / (AB - K - HR + SF)
        ROUND(
            (ps.hits - ps.home_runs) * 1.0 /
            NULLIF(ps.at_bats - ps.strikeouts - ps.home_runs + ps.sacrifice_flies, 0),
            3
        ) AS babip
    FROM pitching_stats ps
    JOIN players p ON ps.player_id = p.player_id
    JOIN teams t ON ps.team_id = t.team_id
    -- league_constants has a single row (one season), so this adds columns only
    CROSS JOIN league_constants lc
    WHERE ps.season = 2024
      AND ps.innings_pitched >= 50  -- Minimum IP qualifier
)
SELECT *
FROM pitcher_metrics
ORDER BY fip ASC
LIMIT 50;

sql

Pitching Matchup Analysis

Analyze batter vs pitcher historical matchups.

-- Batter vs Pitcher matchup history
-- Aggregates every recorded plate appearance between one batter (first `?`)
-- and one pitcher (second `?`). Returns a single row, or no row when the
-- pair has fewer than 5 plate appearances.
-- Uses MySQL syntax: YEAR(CURDATE()) and a SELECT alias in HAVING.
SELECT
    b.name AS batter_name,
    p.name AS pitcher_name,
    COUNT(*) AS plate_appearances,
    SUM(CASE WHEN pa.event_type = 'single' THEN 1 ELSE 0 END) AS singles,
    SUM(CASE WHEN pa.event_type = 'double' THEN 1 ELSE 0 END) AS doubles,
    SUM(CASE WHEN pa.event_type = 'triple' THEN 1 ELSE 0 END) AS triples,
    SUM(CASE WHEN pa.event_type = 'home_run' THEN 1 ELSE 0 END) AS home_runs,
    SUM(CASE WHEN pa.event_type IN ('single','double','triple','home_run') THEN 1 ELSE 0 END) AS hits,
    SUM(CASE WHEN pa.event_type = 'strikeout' THEN 1 ELSE 0 END) AS strikeouts,
    SUM(CASE WHEN pa.event_type = 'walk' THEN 1 ELSE 0 END) AS walks,
    -- At bats (exclude walks, HBP, sac)
    SUM(CASE WHEN pa.is_at_bat = 1 THEN 1 ELSE 0 END) AS at_bats,
    -- Batting average = hits / at bats (NULL when there are no at bats)
    ROUND(
        SUM(CASE WHEN pa.event_type IN ('single','double','triple','home_run') THEN 1 ELSE 0 END) * 1.0 /
        NULLIF(SUM(CASE WHEN pa.is_at_bat = 1 THEN 1 ELSE 0 END), 0),
        3
    ) AS avg,
    -- Slugging = total bases / at bats (1B=1, 2B=2, 3B=3, HR=4)
    ROUND(
        (SUM(CASE WHEN pa.event_type = 'single' THEN 1 ELSE 0 END) +
         SUM(CASE WHEN pa.event_type = 'double' THEN 2 ELSE 0 END) +
         SUM(CASE WHEN pa.event_type = 'triple' THEN 3 ELSE 0 END) +
         SUM(CASE WHEN pa.event_type = 'home_run' THEN 4 ELSE 0 END)) * 1.0 /
        NULLIF(SUM(CASE WHEN pa.is_at_bat = 1 THEN 1 ELSE 0 END), 0),
        3
    ) AS slg,
    -- Recent form (last 2 seasons): plate appearances in the current and
    -- previous calendar year
    SUM(CASE WHEN pa.season >= YEAR(CURDATE()) - 1 THEN 1 ELSE 0 END) AS recent_pa
FROM plate_appearances pa
JOIN players b ON pa.batter_id = b.player_id
JOIN players p ON pa.pitcher_id = p.player_id
WHERE pa.batter_id = ?
  AND pa.pitcher_id = ?
GROUP BY b.name, p.name
-- Minimum sample size
HAVING plate_appearances >= 5;

sql

Rolling Statistics Window

Calculate rolling averages over game windows for trend analysis.

-- Rolling 20-game batting average
-- The two window frames are declared once in the WINDOW clause and reused.
WITH player_games AS (
    -- The player's 2024 games with at least one at-bat, numbered by date
    SELECT
        gl.player_id,
        gl.game_date,
        gl.at_bats,
        gl.hits,
        ROW_NUMBER() OVER (
            PARTITION BY gl.player_id
            ORDER BY gl.game_date
        ) AS game_num
    FROM game_log gl
    WHERE gl.player_id = ?
      AND gl.season = 2024
      AND gl.at_bats > 0
)
SELECT
    game_date,
    at_bats,
    hits,
    game_num,
    -- Rolling 20-game totals
    SUM(hits) OVER last_20_games AS rolling_hits,
    SUM(at_bats) OVER last_20_games AS rolling_ab,
    -- Rolling 20-game average
    ROUND(
        SUM(hits) OVER last_20_games * 1.0 /
        NULLIF(SUM(at_bats) OVER last_20_games, 0),
        3
    ) AS rolling_avg,
    -- Season-to-date average
    ROUND(
        SUM(hits) OVER season_to_date * 1.0 /
        NULLIF(SUM(at_bats) OVER season_to_date, 0),
        3
    ) AS season_avg
FROM player_games
WINDOW
    -- Current game plus the 19 before it
    last_20_games AS (
        ORDER BY game_num
        ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
    ),
    -- Default frame: everything up to the current game
    season_to_date AS (ORDER BY game_num)
ORDER BY game_date;

sql

Player Comparison Query

Compare two players head-to-head across multiple statistics.

-- Side-by-side player comparison
-- Bind the two player ids to IN (?, ?), then repeat them in the same order
-- (player 1, player 2) for each of the six result rows: 14 placeholders.
-- The CTE is named `compared` rather than `player_stats` so it does not
-- shadow the base table it reads from (a same-named CTE is rejected as a
-- circular reference by some engines, e.g. SQLite).
WITH compared AS (
    SELECT
        p.player_id,
        p.name,
        p.position,
        s.season,
        s.games,
        s.at_bats,
        s.hits,
        ROUND(s.hits * 1.0 / NULLIF(s.at_bats, 0), 3) AS avg,
        s.home_runs,
        s.rbi,
        s.stolen_bases,
        s.walks,
        s.strikeouts,
        -- Walk and strikeout rates as a percentage of plate appearances;
        -- NULLIF avoids a division-by-zero error, as in the avg column.
        ROUND(s.walks * 1.0 / NULLIF(s.plate_appearances, 0) * 100, 1) AS bb_pct,
        ROUND(s.strikeouts * 1.0 / NULLIF(s.plate_appearances, 0) * 100, 1) AS k_pct,
        s.war
    FROM players p
    JOIN player_stats s ON p.player_id = s.player_id
    WHERE p.player_id IN (?, ?)  -- Two player IDs to compare
      AND s.season = 2024
)
-- Header row: player names. MAX(CASE ...) pivots the two CTE rows into
-- two columns.
SELECT
    'Statistic' AS metric,
    MAX(CASE WHEN player_id = ? THEN name END) AS player1,
    MAX(CASE WHEN player_id = ? THEN name END) AS player2
FROM compared

UNION ALL

-- Values are cast to CHAR so every row has the same column types.
SELECT 'Games',
    MAX(CASE WHEN player_id = ? THEN CAST(games AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(games AS CHAR) END)
FROM compared

UNION ALL

SELECT 'AVG',
    MAX(CASE WHEN player_id = ? THEN CAST(avg AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(avg AS CHAR) END)
FROM compared

UNION ALL

SELECT 'Home Runs',
    MAX(CASE WHEN player_id = ? THEN CAST(home_runs AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(home_runs AS CHAR) END)
FROM compared

UNION ALL

SELECT 'RBI',
    MAX(CASE WHEN player_id = ? THEN CAST(rbi AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(rbi AS CHAR) END)
FROM compared

UNION ALL

SELECT 'WAR',
    MAX(CASE WHEN player_id = ? THEN CAST(war AS CHAR) END),
    MAX(CASE WHEN player_id = ? THEN CAST(war AS CHAR) END)
FROM compared;

sql

Player Career Statistics

Aggregate career statistics for a player across all seasons.

-- Career statistics with season-by-season breakdown
-- First SELECT: one career-total row. Second SELECT: one row per season,
-- aligned positionally with the career row's columns. Bind the same player
-- id to both `?` placeholders.
SELECT
    p.player_id,
    p.name,
    p.birth_date,
    TIMESTAMPDIFF(YEAR, p.birth_date, CURDATE()) AS current_age,
    MIN(s.season) AS first_season,
    MAX(s.season) AS last_season,
    COUNT(DISTINCT s.season) AS seasons_played,
    SUM(s.games) AS total_games,
    SUM(s.plate_appearances) AS total_pa,
    SUM(s.at_bats) AS total_ab,
    SUM(s.hits) AS total_hits,
    SUM(s.doubles) AS total_2b,
    SUM(s.triples) AS total_3b,
    SUM(s.home_runs) AS total_hr,
    SUM(s.rbi) AS total_rbi,
    SUM(s.stolen_bases) AS total_sb,
    SUM(s.walks) AS total_bb,
    SUM(s.strikeouts) AS total_so,
    ROUND(SUM(s.hits) * 1.0 / NULLIF(SUM(s.at_bats), 0), 3) AS career_avg,
    -- OBP = (H + BB + HBP) / (AB + BB + HBP + SF), computed from career
    -- totals (not an average of season OBPs).
    ROUND((SUM(s.hits) + SUM(s.walks) + SUM(s.hbp)) * 1.0 /
          NULLIF(SUM(s.at_bats) + SUM(s.walks) + SUM(s.hbp) + SUM(s.sacrifice_flies), 0), 3) AS career_obp,
    -- SLG = total bases / AB; hits already counts one base for every hit
    ROUND((SUM(s.hits) + SUM(s.doubles) + 2*SUM(s.triples) + 3*SUM(s.home_runs)) * 1.0 /
          NULLIF(SUM(s.at_bats), 0), 3) AS career_slg,
    ROUND(SUM(s.war), 1) AS career_war
FROM players p
JOIN player_stats s ON p.player_id = s.player_id
WHERE p.player_id = ?  -- Parameter for specific player
GROUP BY p.player_id, p.name, p.birth_date

UNION ALL

-- Season breakdown
SELECT
    NULL AS player_id,
    CONCAT('  ', s.season, ' Season') AS name,
    NULL AS birth_date,
    NULL AS current_age,
    s.season AS first_season,
    s.season AS last_season,
    1 AS seasons_played,
    s.games,
    s.plate_appearances,
    s.at_bats,
    s.hits,
    s.doubles,
    s.triples,
    s.home_runs,
    s.rbi,
    s.stolen_bases,
    s.walks,
    s.strikeouts,
    ROUND(s.hits * 1.0 / NULLIF(s.at_bats, 0), 3) AS season_avg,
    -- Same OBP formula as the career row
    ROUND((s.hits + s.walks + s.hbp) * 1.0 /
          NULLIF(s.at_bats + s.walks + s.hbp + s.sacrifice_flies, 0), 3) AS season_obp,
    ROUND((s.hits + s.doubles + 2*s.triples + 3*s.home_runs) * 1.0 /
          NULLIF(s.at_bats, 0), 3) AS season_slg,
    ROUND(s.war, 1) AS season_war
FROM player_stats s
WHERE s.player_id = ?
-- The career row shares first_season with the player's first season row;
-- the seasons_played tie-break lists a multi-season career total first.
ORDER BY first_season, seasons_played DESC;

sql

Season Leaders Query

SQL query to find statistical leaders across batting categories with minimum qualifications.

-- Find batting leaders with minimum PA qualification
WITH pa_threshold AS (
    -- Qualification bar: 3.1 plate appearances per team game, based on the
    -- largest team_games value recorded for 2024. Always exactly one row.
    SELECT MAX(team_games) * 3.1 AS min_pa
    FROM team_standings
    WHERE season = 2024
),
eligible AS (
    SELECT
        pl.player_id,
        pl.name,
        tm.team_name,
        bs.season,
        bs.games,
        bs.plate_appearances,
        bs.at_bats,
        bs.hits,
        bs.doubles,
        bs.triples,
        bs.home_runs,
        bs.rbi,
        bs.stolen_bases,
        bs.walks,
        bs.strikeouts,
        -- AVG = H / AB
        ROUND(bs.hits * 1.0 / NULLIF(bs.at_bats, 0), 3) AS batting_avg,
        -- OBP = (H + BB + HBP) / (AB + BB + HBP + SF)
        ROUND((bs.hits + bs.walks + bs.hbp) * 1.0 /
              NULLIF(bs.at_bats + bs.walks + bs.hbp + bs.sacrifice_flies, 0), 3) AS obp,
        -- SLG = total bases / AB
        ROUND((bs.hits + bs.doubles + 2*bs.triples + 3*bs.home_runs) * 1.0 /
              NULLIF(bs.at_bats, 0), 3) AS slg
    FROM player_stats bs
    JOIN players pl ON bs.player_id = pl.player_id
    JOIN teams tm ON bs.team_id = tm.team_id
    CROSS JOIN pa_threshold th
    WHERE bs.season = 2024
      AND bs.plate_appearances >= th.min_pa
)
SELECT
    player_id,
    name,
    team_name,
    batting_avg,
    obp,
    slg,
    ROUND(obp + slg, 3) AS ops,
    home_runs,
    rbi,
    stolen_bases,
    -- Ranks are computed over all qualified batters, before LIMIT applies
    RANK() OVER (ORDER BY batting_avg DESC) AS avg_rank,
    RANK() OVER (ORDER BY home_runs DESC) AS hr_rank,
    RANK() OVER (ORDER BY rbi DESC) AS rbi_rank
FROM eligible
ORDER BY batting_avg DESC
LIMIT 20;

python

Performance Testing for Analytics

Performance and load testing for sports analytics functions and APIs.

import pytest
import time
import pandas as pd
import numpy as np
from functools import wraps
from concurrent.futures import ThreadPoolExecutor
import psutil
import tracemalloc

from sports_analytics import (
    calculate_war_batch,
    generate_projections,
    simulate_season
)


def timing_decorator(func):
    """Wrap ``func`` so each successful call prints its wall-clock duration.

    The wrapper forwards all arguments, returns the wrapped function's result
    unchanged and keeps its metadata via ``functools.wraps``. Nothing is
    printed when ``func`` raises.
    """
    @wraps(func)
    def timed(*args, **kwargs):
        started_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started_at
        print(f"{func.__name__} took {elapsed:.4f} seconds")
        return outcome

    return timed


class TestPerformance:
    """Performance tests for analytics functions."""

    @pytest.fixture
    def large_dataset(self):
        """Generate a reproducible 100,000-row synthetic batting dataset."""
        # A local RandomState seeded with 42 yields the same values as
        # np.random.seed(42) followed by np.random.randint, without
        # reseeding the process-wide generator for every other test.
        rng = np.random.RandomState(42)
        n = 100000

        return pd.DataFrame({
            "player_id": range(n),
            "games": rng.randint(1, 162, n),
            "at_bats": rng.randint(100, 600, n),
            "hits": rng.randint(20, 200, n),
            "home_runs": rng.randint(0, 50, n),
            "rbi": rng.randint(0, 130, n),
            "walks": rng.randint(10, 100, n),
            "strikeouts": rng.randint(30, 200, n)
        })

    @pytest.mark.performance
    def test_war_calculation_performance(self, large_dataset):
        """WAR calculation should complete in reasonable time."""
        start = time.perf_counter()
        result = calculate_war_batch(large_dataset)
        duration = time.perf_counter() - start

        assert duration < 5.0, f"WAR calculation took {duration:.2f}s (max 5s)"
        assert len(result) == len(large_dataset)

    @pytest.mark.performance
    def test_projection_memory_usage(self, large_dataset):
        """Peak traced memory during projections must stay under 500 MB."""
        tracemalloc.start()
        try:
            generate_projections(large_dataset)
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing, even if the call above raises; otherwise
            # tracing stays enabled and slows every later test.
            tracemalloc.stop()

        peak_mb = peak / 1024 / 1024
        assert peak_mb < 500, f"Peak memory {peak_mb:.1f}MB exceeds 500MB limit"
        print(f"Peak memory usage: {peak_mb:.1f}MB")

    @pytest.mark.performance
    def test_simulation_scalability(self):
        """Test season simulation scales linearly."""
        times = []

        for n_games in [100, 500, 1000, 2000]:
            start = time.perf_counter()
            simulate_season(n_games=n_games)
            times.append((n_games, time.perf_counter() - start))

        # Check roughly linear scaling
        # Time for 2000 should be < 2.5x time for 1000
        ratio = times[-1][1] / times[-2][1]
        assert ratio < 2.5, f"Scaling ratio {ratio:.2f} suggests non-linear performance"

    @pytest.mark.performance
    def test_concurrent_requests(self):
        """Test handling concurrent requests."""
        # Imported here so the rest of the class can be collected when
        # sports_api is not installed.
        from sports_api import APIClient
        client = APIClient()

        def make_request(player_id):
            start = time.perf_counter()
            result = client.get_player_stats(player_id)
            return time.perf_counter() - start, result

        # 50 requests in total, at most 10 in flight at a time
        player_ids = list(range(1, 51))

        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(make_request, player_ids))

        times = [r[0] for r in results]
        avg_time = sum(times) / len(times)

        assert avg_time < 1.0, f"Average request time {avg_time:.2f}s too slow"
        assert all(r[1] is not None for r in results), "Some requests failed"


@pytest.fixture(scope="session")
def benchmark_baseline():
    """Provide the reference performance budgets, created once per session."""
    baseline = dict(
        war_calculation=2.0,  # seconds
        projection_memory=300,  # MB
        api_response=0.5,  # seconds
    )
    return baseline


class TestBenchmarks:
    """Benchmark tests against baseline."""

    @pytest.fixture
    def large_dataset(self):
        """Generate a reproducible 100,000-row synthetic batting dataset.

        Defined on this class because a fixture declared inside another test
        class is not visible here; without it pytest fails with
        "fixture 'large_dataset' not found".
        """
        # Local generator: reproducible without reseeding the global RNG.
        rng = np.random.RandomState(42)
        n = 100000

        return pd.DataFrame({
            "player_id": range(n),
            "games": rng.randint(1, 162, n),
            "at_bats": rng.randint(100, 600, n),
            "hits": rng.randint(20, 200, n),
            "home_runs": rng.randint(0, 50, n),
            "rbi": rng.randint(0, 130, n),
            "walks": rng.randint(10, 100, n),
            "strikeouts": rng.randint(30, 200, n)
        })

    @pytest.mark.benchmark
    def test_against_baseline(self, benchmark_baseline, large_dataset):
        """Fail when WAR calculation is more than 20% slower than baseline."""
        start = time.perf_counter()
        calculate_war_batch(large_dataset)
        duration = time.perf_counter() - start

        baseline = benchmark_baseline["war_calculation"]
        # 1.2 = 20% tolerance over the stored baseline
        assert duration <= baseline * 1.2, (
            f"Performance regression: {duration:.2f}s vs baseline {baseline}s"
        )

python

Mock Testing for External Services

Test sports analytics code that depends on external APIs using mocks.

import pytest
from unittest.mock import Mock, patch, MagicMock
import pandas as pd
from datetime import datetime

from sports_service import (
    PlayerStatsService,
    GameDataService,
    LeaderboardService
)


class TestPlayerStatsService:
    """Exercise PlayerStatsService against a mocked API client and cache."""

    @pytest.fixture
    def mock_api(self):
        """Mock API client returning one canned player and stat line."""
        api = Mock()
        api.get_player.return_value = dict(
            id=12345,
            name="Test Player",
            team="Test Team",
            position="OF",
        )
        api.get_stats.return_value = dict(
            batting_avg=0.300,
            home_runs=25,
            rbi=80,
            war=4.5,
        )
        return api

    @pytest.fixture
    def mock_cache(self):
        """Mock cache whose lookups miss unless a test overrides them."""
        cache = Mock()
        cache.get.return_value = None
        return cache

    @pytest.fixture
    def stats_service(self, mock_api, mock_cache):
        """Service under test, wired to the two mocks above."""
        service = PlayerStatsService(api=mock_api, cache=mock_cache)
        return service

    def test_get_player_stats(self, stats_service, mock_api):
        """Player info and stats both appear in the returned mapping."""
        fetched = stats_service.get_player_stats(player_id=12345)

        assert fetched["name"] == "Test Player"
        assert fetched["batting_avg"] == 0.300
        mock_api.get_player.assert_called_once_with(12345)

    def test_uses_cache_when_available(self, stats_service, mock_api, mock_cache):
        """A cache hit is returned without touching the API."""
        mock_cache.get.return_value = {"name": "Cached Player", "batting_avg": 0.280}

        fetched = stats_service.get_player_stats(player_id=12345)

        assert fetched["name"] == "Cached Player"
        mock_api.get_player.assert_not_called()

    def test_caches_api_response(self, stats_service, mock_api, mock_cache):
        """After a cache miss, the result is stored under a key naming the player."""
        stats_service.get_player_stats(player_id=12345)

        mock_cache.set.assert_called_once()
        # Expects exactly two positional arguments: (key, value)
        positional_args = mock_cache.set.call_args[0]
        cached_key, cached_value = positional_args
        assert "12345" in cached_key

    def test_handles_api_error(self, stats_service, mock_api):
        """An API failure surfaces as the service's ServiceError."""
        mock_api.get_player.side_effect = Exception("API Error")

        with pytest.raises(stats_service.ServiceError):
            stats_service.get_player_stats(player_id=12345)


class TestGameDataService:
    """Exercise GameDataService against mocked schedule and odds APIs."""

    @pytest.fixture
    def mock_schedule_api(self):
        """Schedule API double whose ``get_games`` returns two fixed games."""
        games = [
            {"game_id": "g1", "home": "Team A", "away": "Team B", "time": "19:00"},
            {"game_id": "g2", "home": "Team C", "away": "Team D", "time": "20:00"},
        ]
        schedule = Mock()
        schedule.get_games.return_value = games
        return schedule

    @pytest.fixture
    def mock_odds_api(self):
        """Odds API double whose ``get_odds`` returns home/away odds per game id."""
        odds_by_game = {
            "g1": {"home": -150, "away": +130},
            "g2": {"home": +110, "away": -120},
        }
        odds = Mock()
        odds.get_odds.return_value = odds_by_game
        return odds

    @pytest.fixture
    def game_service(self, mock_schedule_api, mock_odds_api):
        """Build the service under test from the two API doubles."""
        service = GameDataService(
            odds_api=mock_odds_api,
            schedule_api=mock_schedule_api,
        )
        return service

    def test_get_games_with_odds(self, game_service):
        """Each returned game should carry the odds for its game id."""
        games = game_service.get_todays_games_with_odds()

        assert len(games) == 2
        first_game, second_game = games[0], games[1]
        assert first_game["odds"]["home"] == -150
        assert second_game["odds"]["away"] == -120

    @patch("sports_service.datetime")
    def test_filters_by_date(self, mock_datetime, game_service, mock_schedule_api):
        """The schedule API should be queried with the patched current date."""
        frozen_now = datetime(2024, 4, 15)
        mock_datetime.now.return_value = frozen_now

        game_service.get_todays_games_with_odds()

        mock_schedule_api.get_games.assert_called_with(date="2024-04-15")


class TestLeaderboardService:
    """Test leaderboard calculations with mocked data."""

    @pytest.fixture
    def mock_db(self):
        """Create a mock database whose ``query`` returns three fixed rows."""
        mock = MagicMock()

        # Mock query results
        mock.query.return_value = pd.DataFrame({
            "player_id": [1, 2, 3],
            "name": ["Player A", "Player B", "Player C"],
            "stat_value": [0.320, 0.305, 0.298]
        })
        return mock

    def test_get_batting_leaders(self, mock_db):
        """Leaders come back in query order after a single database call."""
        service = LeaderboardService(db=mock_db)
        leaders = service.get_batting_leaders(stat="avg", limit=10)

        assert len(leaders) == 3
        assert leaders.iloc[0]["stat_value"] == 0.320
        mock_db.query.assert_called_once()

    def test_leaderboard_respects_limit(self, mock_db):
        """The limit must reach the database, in the SQL text or as a keyword.

        The positional arguments are inspected only when present, so a
        keyword-only call such as ``query(sql=..., limit=5)`` is judged by its
        ``limit`` keyword instead of failing with IndexError.
        """
        service = LeaderboardService(db=mock_db)
        service.get_batting_leaders(stat="hr", limit=5)

        # A mock call object unpacks into (positional args, keyword args).
        positional, keyword = mock_db.query.call_args
        limit_in_sql = bool(positional) and "LIMIT 5" in positional[0]
        assert limit_in_sql or keyword.get("limit") == 5

python

Property-Based Testing with Hypothesis

Use property-based testing to find edge cases in sports analytics functions.

import pytest
from hypothesis import given, strategies as st, assume, settings
from hypothesis.extra.pandas import columns, data_frames, column
import pandas as pd
import numpy as np

# Import functions to test
from sports_analytics import (
    calculate_batting_avg,
    calculate_obp,
    calculate_slg,
    calculate_ops,
    normalize_stats
)


class TestBattingStatsProperties:
    """Property-based tests for batting statistics."""

    @given(
        hits=st.integers(min_value=0, max_value=300),
        at_bats=st.integers(min_value=1, max_value=700)
    )
    def test_batting_avg_bounds(self, hits, at_bats):
        """Batting average must be between 0 and 1."""
        assume(hits <= at_bats)  # Hits can't exceed at bats

        avg = calculate_batting_avg(hits, at_bats)

        assert 0 <= avg <= 1
        assert avg == hits / at_bats

    @given(
        hits=st.integers(min_value=0, max_value=200),
        walks=st.integers(min_value=0, max_value=150),
        hbp=st.integers(min_value=0, max_value=20),
        at_bats=st.integers(min_value=1, max_value=600),
        sf=st.integers(min_value=0, max_value=15)
    )
    def test_obp_greater_than_avg(self, hits, walks, hbp, at_bats, sf):
        """OBP is bounded below by H / (AB + SF), and by AVG when SF is 0.

        "OBP >= AVG" does not hold in general: sacrifice flies enlarge the
        OBP denominator but are not at-bats, so H=1, AB=1, SF=1 gives
        AVG 1.000 and OBP .500. Walks and hit-by-pitches add equally to the
        numerator and denominator, which can only raise the ratio, so
        H / (AB + SF) is the bound that always holds.

        NOTE(review): assumes calculate_obp implements the standard formula
        (H + BB + HBP) / (AB + BB + HBP + SF) - confirm against its source.
        """
        assume(hits <= at_bats)

        avg = calculate_batting_avg(hits, at_bats)
        obp = calculate_obp(hits, walks, hbp, at_bats, sf)

        assert obp >= hits / (at_bats + sf)
        if sf == 0:
            # Without sacrifice flies the lower bound is the batting average.
            assert obp >= avg

    @given(
        singles=st.integers(min_value=0, max_value=150),
        doubles=st.integers(min_value=0, max_value=50),
        triples=st.integers(min_value=0, max_value=15),
        home_runs=st.integers(min_value=0, max_value=60),
        at_bats=st.integers(min_value=1, max_value=600)
    )
    def test_slg_greater_than_avg(self, singles, doubles, triples, home_runs, at_bats):
        """SLG must be >= AVG (extra base hits add value)."""
        hits = singles + doubles + triples + home_runs
        assume(hits <= at_bats)

        avg = calculate_batting_avg(hits, at_bats)
        slg = calculate_slg(singles, doubles, triples, home_runs, at_bats)

        assert slg >= avg

    @given(
        obp=st.floats(min_value=0.200, max_value=0.500),
        slg=st.floats(min_value=0.250, max_value=0.800)
    )
    def test_ops_is_sum(self, obp, slg):
        """OPS should be sum of OBP and SLG."""
        ops = calculate_ops(obp, slg)
        assert ops == pytest.approx(obp + slg, rel=1e-6)


class TestDataFrameProperties:
    """Hypothesis-driven checks on DataFrame-level helpers."""

    @given(
        df=data_frames(columns=[
            column("player_id", dtype=int),
            column("batting_avg", elements=st.floats(0.150, 0.400)),
            column("home_runs", elements=st.integers(0, 60)),
            column("rbi", elements=st.integers(0, 150)),
        ])
    )
    @settings(max_examples=50)
    def test_normalize_preserves_rows(self, df):
        """normalize_stats must return as many rows as it was given."""
        assume(len(df) >= 1)
        stat_columns = ["batting_avg", "home_runs", "rbi"]

        result = normalize_stats(df, stat_columns)

        assert len(result) == len(df)

    @given(
        df=data_frames(columns=[
            column("value", elements=st.floats(0, 100, allow_nan=False)),
        ])
    )
    def test_normalize_bounds(self, df):
        """Every normalized value must land in the closed interval [0, 1]."""
        assume(len(df) >= 2)
        # Skip frames with zero standard deviation (no variation to scale).
        assume(df["value"].std() > 0)

        scaled = normalize_stats(df, ["value"])["value"]

        assert scaled.min() >= 0
        assert scaled.max() <= 1

    @given(
        values=st.lists(
            st.floats(min_value=-100, max_value=100, allow_nan=False),
            min_size=2, max_size=100,
        )
    )
    def test_percentile_ranking(self, values):
        """percentile_rank must stay within the 0-100 range."""
        from sports_analytics import percentile_rank

        frame = pd.DataFrame({"stat": values})
        frame["percentile"] = percentile_rank(frame["stat"])
        ranks = frame["percentile"]

        assert ranks.min() >= 0
        assert ranks.max() <= 100


@given(
    runs_for=st.integers(min_value=1, max_value=1500),
    runs_against=st.integers(min_value=1, max_value=1500),
)
def test_pythagorean_expectation_bounds(runs_for, runs_against):
    """Win expectation lies strictly inside (0, 1) and is 0.5 for equal runs."""
    from sports_analytics import pythagorean_expectation

    expected_win_rate = pythagorean_expectation(runs_for, runs_against)

    assert expected_win_rate > 0
    assert expected_win_rate < 1

    # Equal runs scored and allowed should give 50%.
    evenly_matched = runs_for == runs_against
    if evenly_matched:
        assert expected_win_rate == pytest.approx(0.5, rel=1e-6)

python

Integration Testing for APIs

Integration tests for sports data API clients and database connections.

import pytest
import requests
from unittest.mock import Mock, patch
from sports_api import BaseballAPIClient, DatabaseConnection

class TestAPIIntegration:
    """Exercise BaseballAPIClient with ``requests.get`` patched out."""

    @pytest.fixture
    def api_client(self):
        """Client under test, built with a placeholder API key."""
        client = BaseballAPIClient(api_key="test_key")
        return client

    @pytest.fixture
    def mock_response(self):
        """Successful (200) response double carrying a single player."""
        payload = {
            "players": [
                {"id": 1, "name": "Test Player", "avg": 0.300},
            ],
        }
        response = Mock()
        response.status_code = 200
        response.json.return_value = payload
        return response

    def test_api_connection(self, api_client):
        """health_check should return True for a 200 response."""
        # Use a mock in CI, real connection in integration
        with patch("requests.get") as fake_get:
            fake_response = fake_get.return_value
            fake_response.status_code = 200
            fake_response.json.return_value = {"status": "ok"}

            healthy = api_client.health_check()
            assert healthy is True

    def test_get_player_data(self, api_client, mock_response):
        """get_players should return the players from the response payload."""
        with patch("requests.get", return_value=mock_response):
            roster = api_client.get_players()
            assert len(roster) > 0
            first_player = roster[0]
            assert first_player["name"] == "Test Player"

    def test_rate_limiting(self, api_client):
        """A 429 response should raise the client's RateLimitError."""
        with patch("requests.get") as fake_get:
            throttled = fake_get.return_value
            throttled.status_code = 429
            throttled.headers = {"Retry-After": "60"}

            with pytest.raises(api_client.RateLimitError):
                api_client.get_players()

    def test_api_error_handling(self, api_client):
        """A 500 response should raise the client's APIError."""
        with patch("requests.get") as fake_get:
            failed = fake_get.return_value
            failed.status_code = 500
            failed.text = "Internal Server Error"

            with pytest.raises(api_client.APIError):
                api_client.get_players()


class TestDatabaseIntegration:
    """Exercise DatabaseConnection against a local test database."""

    @pytest.fixture
    def db_connection(self):
        """Open a connection to the test database and close it afterwards."""
        settings_for_test_db = {
            "host": "localhost",
            "database": "sports_test",
            "user": "test_user",
            "password": "test_pass",
        }
        connection = DatabaseConnection(**settings_for_test_db)
        yield connection
        connection.close()

    @pytest.fixture
    def setup_test_data(self, db_connection):
        """Create and seed a temporary players table; drop it after the test."""
        db_connection.execute("""
            CREATE TEMPORARY TABLE test_players (
                id INT PRIMARY KEY,
                name VARCHAR(100),
                batting_avg DECIMAL(4,3)
            )
        """)
        db_connection.execute("""
            INSERT INTO test_players VALUES
            (1, 'Test Player A', 0.300),
            (2, 'Test Player B', 0.275)
        """)
        yield
        db_connection.execute("DROP TEMPORARY TABLE test_players")

    def test_database_connection(self, db_connection):
        """A trivial SELECT should return something other than None."""
        outcome = db_connection.execute("SELECT 1")
        assert outcome is not None

    def test_insert_and_retrieve(self, db_connection, setup_test_data):
        """A freshly inserted row should be readable by its id."""
        insert_sql = "INSERT INTO test_players VALUES (3, 'New Player', 0.285)"
        db_connection.execute(insert_sql)

        select_sql = "SELECT * FROM test_players WHERE id = 3"
        rows = db_connection.query(select_sql)

        assert len(rows) == 1
        assert rows[0]["name"] == "New Player"

    def test_transaction_rollback(self, db_connection, setup_test_data):
        """A row inserted inside a rolled-back transaction must not persist."""
        insert_sql = "INSERT INTO test_players VALUES (4, 'Should Rollback', 0.250)"
        db_connection.begin_transaction()
        db_connection.execute(insert_sql)
        db_connection.rollback()

        leftover = db_connection.query("SELECT * FROM test_players WHERE id = 4")
        assert len(leftover) == 0


# Markers for different test environments.
# The skip condition is a string on purpose: pytest evaluates string
# conditions at test setup with the active ``config`` object in scope. The
# module-level ``pytest.config`` global was removed in pytest 5.0, so reading
# it here would raise AttributeError as soon as the module is imported.
# ``default=False`` keeps the tests skipped (instead of erroring) when no
# conftest.py has registered the --integration option.
pytestmark = [
    pytest.mark.integration,
    pytest.mark.skipif(
        'not config.getoption("--integration", default=False)',
        reason="Integration tests disabled"
    )
]

python

Data Validation Tests

Test data quality and validation rules for sports statistics datasets.

import pytest
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import great_expectations as gx

class TestDataQuality:
    """Row-level quality checks for the player statistics dataset."""

    @pytest.fixture
    def player_stats(self):
        """Read player_stats.csv from the working directory."""
        frame = pd.read_csv("player_stats.csv")
        return frame

    def test_no_duplicate_player_seasons(self, player_stats):
        """Each (player_id, season) pair may appear only once."""
        repeated = player_stats.duplicated(subset=["player_id", "season"])
        assert not repeated.any(), f"Found {repeated.sum()} duplicates"

    def test_no_null_player_ids(self, player_stats):
        """No row may be missing its player_id."""
        missing = player_stats["player_id"].isna()
        assert not missing.any(), f"Found {missing.sum()} null player_ids"

    def test_games_played_positive(self, player_stats):
        """Every row must report more than zero games."""
        games = player_stats["games"]
        assert games.gt(0).all()

    def test_batting_avg_valid_range(self, player_stats):
        """Batting average must lie in the closed interval [0, 1]."""
        averages = player_stats["batting_avg"]
        at_least_zero = averages.ge(0).all()
        at_most_one = averages.le(1).all()
        assert at_least_zero and at_most_one

    def test_at_bats_greater_than_hits(self, player_stats):
        """Hits can never exceed at bats."""
        at_bats, hits = player_stats["at_bats"], player_stats["hits"]
        assert at_bats.ge(hits).all()

    def test_season_values_valid(self, player_stats):
        """Season must fall between 1900 and the current calendar year."""
        this_year = datetime.now().year
        seasons = player_stats["season"]
        assert seasons.ge(1900).all()
        assert seasons.le(this_year).all()


class TestGreatExpectations:
    """Use Great Expectations for data validation."""

    def test_player_stats_expectations(self):
        """Validate player_stats.csv against a suite of column expectations.

        The expectation list is registered as ``player_stats_suite`` and the
        checkpoint is created before it is run, so both names the validation
        refers to actually exist in the context.

        NOTE(review): written against the Great Expectations 0.16-0.18
        fluent API (``context.sources``, ``add_or_update_*``) - confirm the
        installed version before relying on it.
        """
        df = pd.read_csv("player_stats.csv")
        context = gx.get_context()

        # Create expectation suite
        expectations = [
            # Column existence
            {"expectation_type": "expect_column_to_exist", "kwargs": {"column": "player_id"}},
            {"expectation_type": "expect_column_to_exist", "kwargs": {"column": "batting_avg"}},

            # Null checks
            {"expectation_type": "expect_column_values_to_not_be_null", "kwargs": {"column": "player_id"}},

            # Value ranges
            {
                "expectation_type": "expect_column_values_to_be_between",
                "kwargs": {"column": "batting_avg", "min_value": 0, "max_value": 1}
            },
            {
                "expectation_type": "expect_column_values_to_be_between",
                "kwargs": {"column": "era", "min_value": 0, "max_value": 50}
            },

            # Uniqueness
            {
                "expectation_type": "expect_compound_columns_to_be_unique",
                "kwargs": {"column_list": ["player_id", "season"]}
            }
        ]

        # Register the suite under the name the checkpoint validates against;
        # add_or_update keeps the test re-runnable on a persistent context.
        context.add_or_update_expectation_suite(
            expectation_suite_name="player_stats_suite",
            expectations=expectations,
        )

        # Validate
        datasource = context.sources.add_or_update_pandas("pandas_datasource")
        data_asset = datasource.add_dataframe_asset(name="player_stats")
        batch_request = data_asset.build_batch_request(dataframe=df)

        checkpoint = context.add_or_update_checkpoint(
            name="player_stats_checkpoint",
            validations=[{
                "batch_request": batch_request,
                "expectation_suite_name": "player_stats_suite"
            }]
        )
        results = checkpoint.run()

        assert results.success, "Data validation failed"


class TestStatisticalValidity:
    """Sanity checks on the statistical shape of the batting data."""

    @pytest.fixture
    def batting_data(self):
        """Read batting_stats.csv from the working directory."""
        frame = pd.read_csv("batting_stats.csv")
        return frame

    def test_avg_distribution_reasonable(self, batting_data):
        """Mean and standard deviation of batting average stay in fixed bands."""
        averages = batting_data["batting_avg"]

        # Mean should be around .250-.270
        mean_avg = averages.mean()
        assert 0.200 < mean_avg < 0.300

        # Standard deviation should be reasonable
        spread = averages.std()
        assert 0.020 < spread < 0.050

    def test_outliers_flagged(self, batting_data):
        """Print rows whose average is more than 3 std devs from the mean.

        This test only reports; it contains no assertion.
        """
        averages = batting_data["batting_avg"]
        center = averages.mean()
        spread = averages.std()

        # Flag values > 3 std from mean
        is_outlier = (averages - center).abs() > 3 * spread
        outliers = batting_data[is_outlier]

        # Log outliers for review
        if len(outliers):
            print(f"Potential outliers found: {len(outliers)}")
            print(outliers[["player_id", "name", "batting_avg"]])

    def test_calculated_fields_match(self, batting_data):
        """Stored batting_avg should equal hits / at_bats within tolerance."""
        hits = batting_data["hits"]
        at_bats = batting_data["at_bats"]
        recomputed = hits / at_bats

        # Allow small floating point differences; NaN pairs compare equal.
        matches = np.allclose(
            recomputed, batting_data["batting_avg"], rtol=1e-3, equal_nan=True
        )
        assert matches

python

Unit Testing Sports Functions

Comprehensive unit tests for sports analytics functions using pytest.

import pytest
import pandas as pd
import numpy as np
from sports_analytics import (
    calculate_batting_avg,
    calculate_era,
    calculate_war,
    pythagorean_expectation
)

class TestBattingStats:
    """Unit tests for calculate_batting_avg."""

    def test_batting_avg_normal(self):
        """150 hits in 500 at bats is a .300 average."""
        average = calculate_batting_avg(hits=150, at_bats=500)
        assert average == 0.300

    def test_batting_avg_zero_at_bats(self):
        """Zero at bats should yield 0.0."""
        average = calculate_batting_avg(hits=0, at_bats=0)
        assert average == 0.0

    def test_batting_avg_perfect(self):
        """A hit in every at bat is a 1.000 average."""
        average = calculate_batting_avg(hits=100, at_bats=100)
        assert average == 1.0

    @pytest.mark.parametrize(
        ("hits", "at_bats", "expected"),
        [
            (100, 400, 0.250),
            (175, 500, 0.350),
            (0, 300, 0.0),
            (1, 1, 1.0),
        ],
    )
    def test_batting_avg_parametrized(self, hits, at_bats, expected):
        """Table-driven check of several hits/at-bats combinations."""
        assert calculate_batting_avg(hits, at_bats) == pytest.approx(expected, rel=1e-3)


class TestPitchingStats:
    """Unit tests for calculate_era."""

    def test_era_normal(self):
        """45 earned runs over 180 innings is a 2.25 ERA."""
        # ERA = (ER / IP) * 9
        era = calculate_era(earned_runs=45, innings_pitched=180)
        assert era == pytest.approx(2.25, rel=1e-3)

    def test_era_zero_innings(self):
        """Zero innings pitched should give infinity or None."""
        era = calculate_era(earned_runs=5, innings_pitched=0)
        acceptable = era is None or era == float("inf")
        assert acceptable


class TestAdvancedMetrics:
    """Unit tests for the advanced analytics helpers."""

    def test_pythagorean_expectation(self):
        """Equal runs give 50%; outscoring the opponent gives more than 50%."""
        # With equal runs scored and allowed
        even = pythagorean_expectation(runs_scored=700, runs_allowed=700)
        assert even == pytest.approx(0.5, rel=1e-3)

        # Team that scores more should have > 50%
        favourable = pythagorean_expectation(runs_scored=800, runs_allowed=600)
        assert favourable > 0.5

    def test_war_calculation(self):
        """WAR should be within 10% of the summed run components over 10."""
        components = {
            "batting_runs": 15,
            "baserunning_runs": 3,
            "fielding_runs": 5,
            "positional_adjustment": 10,
            "league_adjustment": 2,
            "replacement_level": 20,
        }
        war = calculate_war(**components)

        # Sum of components divided by runs per win (~10)
        expected = sum(components.values()) / 10
        assert war == pytest.approx(expected, rel=0.1)


@pytest.fixture
def sample_player_data():
    """Return a three-row DataFrame of sample player stats."""
    header = ["player_id", "name", "avg", "hr", "rbi"]
    rows = [
        (1, "Player A", 0.300, 30, 100),
        (2, "Player B", 0.250, 15, 60),
        (3, "Player C", 0.275, 22, 80),
    ]
    return pd.DataFrame(rows, columns=header)


class TestDataProcessing:
    """Checks on the sample player DataFrame fixture."""

    def test_dataframe_not_empty(self, sample_player_data):
        """The fixture should contain at least one row."""
        row_count = len(sample_player_data)
        assert row_count > 0

    def test_required_columns_exist(self, sample_player_data):
        """All required columns should be present."""
        available = sample_player_data.columns
        missing = [
            name
            for name in ("player_id", "name", "avg", "hr", "rbi")
            if name not in available
        ]
        assert not missing

    def test_avg_in_valid_range(self, sample_player_data):
        """Batting averages should lie between 0 and 1 inclusive."""
        averages = sample_player_data["avg"]
        assert averages.ge(0).all()
        assert averages.le(1).all()


# Run with: pytest test_sports_analytics.py -v

python

ESPN API Integration

Fetch live scores, standings, and player data from ESPN's public API endpoints.

import requests
import pandas as pd
from datetime import datetime, timedelta

class ESPNClient:
    """Client for ESPN API endpoints.

    All requests go through one ``requests.Session``. Every call has a
    timeout and raises ``requests.HTTPError`` on a non-2xx response instead
    of trying to parse an error page as data.
    """

    BASE_URL = "https://site.api.espn.com/apis/site/v2/sports"

    # Short league code -> (sport, league id) path segments.
    SPORT_LEAGUES = {
        "mlb": ("baseball", "mlb"),
        "nba": ("basketball", "nba"),
        "nfl": ("football", "nfl"),
        "nhl": ("hockey", "nhl"),
        "mls": ("soccer", "usa.1"),
        "epl": ("soccer", "eng.1")
    }

    # Seconds to wait for a response; requests itself never times out.
    TIMEOUT = 10

    def __init__(self):
        self.session = requests.Session()

    def _get_json(self, league: str, resource: str, params: dict = None):
        """Fetch ``resource`` for ``league`` and return the decoded JSON body.

        Unknown league codes are used verbatim as both the sport and the
        league path segment.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        sport, league_id = self.SPORT_LEAGUES.get(league, (league, league))
        url = f"{self.BASE_URL}/{sport}/{league_id}/{resource}"

        response = self.session.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()

    def get_scoreboard(self, league: str, date: str = None) -> dict:
        """Get live/daily scoreboard.

        Args:
            league: Short league code (e.g. "mlb") or a raw path segment.
            date: Optional date string; dashes are removed before it is sent
                as the ``dates`` query parameter. When omitted, no date
                filter is sent.

        Returns:
            The decoded JSON response.
        """
        params = {}
        if date:
            params["dates"] = date.replace("-", "")

        return self._get_json(league, "scoreboard", params)

    def get_standings(self, league: str) -> pd.DataFrame:
        """Get current standings.

        Returns:
            One row per team with ``team_id``, ``name``, ``abbreviation``
            and one column per stat reported for that team.
        """
        data = self._get_json(league, "standings")

        teams = []
        for group in data.get("children", []):
            for team_entry in group.get("standings", {}).get("entries", []):
                team = team_entry.get("team", {})
                # Stat entries without a "value" become None rather than
                # aborting the whole table with a KeyError.
                stats = {
                    s["name"]: s.get("value")
                    for s in team_entry.get("stats", [])
                    if "name" in s
                }

                teams.append({
                    "team_id": team.get("id"),
                    "name": team.get("displayName"),
                    "abbreviation": team.get("abbreviation"),
                    **stats
                })

        return pd.DataFrame(teams)

    def get_team_roster(self, league: str, team_id: str) -> pd.DataFrame:
        """Get team roster.

        Returns:
            One row per entry of the response's ``athletes`` list; fields
            missing from an entry are left as None.
        """
        data = self._get_json(league, f"teams/{team_id}/roster")

        players = [
            {
                "player_id": athlete.get("id"),
                "name": athlete.get("fullName"),
                "position": athlete.get("position", {}).get("abbreviation"),
                "jersey": athlete.get("jersey"),
                "age": athlete.get("age"),
                "height": athlete.get("height"),
                "weight": athlete.get("weight")
            }
            for athlete in data.get("athletes", [])
        ]

        return pd.DataFrame(players)

    def get_player_stats(self, league: str, player_id: str) -> dict:
        """Get player statistics as the decoded JSON response."""
        return self._get_json(league, f"athletes/{player_id}")

# Usage: runs live requests against ESPN when this snippet is executed.
espn = ESPNClient()

# Get today's MLB scores and print one line per event.
scores = espn.get_scoreboard("mlb")
for event in scores.get("events", []):
    competition = event["competitions"][0]
    teams = competition["competitors"]
    first_side = teams[0]
    first_label = f"{first_side['team']['name']} {first_side.get('score', 0)}"
    second_side = teams[1]
    second_label = f"{second_side.get('score', 0)} {second_side['team']['name']}"
    print(f"{first_label} - {second_label}")

# Get NBA standings and show the first rows.
nba_standings = espn.get_standings("nba")
standings_preview = nba_standings.head()
print(standings_preview)

python

Odds API for Betting Lines

Fetch live betting odds from multiple sportsbooks using The Odds API.

import requests
import pandas as pd
from datetime import datetime

class OddsAPIClient:
    """Client for The Odds API.

    Every request carries the API key, has a timeout, and raises
    ``requests.HTTPError`` on a non-2xx response.
    """

    BASE_URL = "https://api.the-odds-api.com/v4"

    # Short sport code -> sport key used in the URL path.
    SPORTS = {
        "mlb": "baseball_mlb",
        "nba": "basketball_nba",
        "nfl": "americanfootball_nfl",
        "nhl": "icehockey_nhl",
        "epl": "soccer_epl",
        "ncaab": "basketball_ncaab",
        "ncaaf": "americanfootball_ncaaf"
    }

    # Seconds to wait for a response; requests itself never times out.
    TIMEOUT = 10

    def __init__(self, api_key: str):
        self.api_key = api_key

    def _get_json(self, path: str, **params):
        """GET ``BASE_URL + path`` with the API key and return decoded JSON.

        Raises:
            requests.HTTPError: if the server answers with an error status,
                so an error body is never iterated as if it were game data.
        """
        query = {"apiKey": self.api_key, **params}
        response = requests.get(
            f"{self.BASE_URL}{path}", params=query, timeout=self.TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def get_sports(self) -> list:
        """Get list of available sports."""
        return self._get_json("/sports")

    def get_odds(self, sport: str, regions: str = "us",
                 markets: str = "h2h,spreads,totals") -> pd.DataFrame:
        """Get odds for upcoming games.

        Returns:
            One row per game. Besides the game fields there is one column
            per bookmaker/market/outcome named
            ``{book}_{market}_{outcome_name}`` (outcome name lower-cased,
            spaces replaced by underscores) holding the price, plus a
            matching ``..._line`` column when the outcome has a ``point``.
        """
        sport_key = self.SPORTS.get(sport, sport)
        data = self._get_json(
            f"/sports/{sport_key}/odds",
            regions=regions,
            markets=markets,
            oddsFormat="american",
        )

        games = []
        for game in data:
            game_info = {
                "game_id": game.get("id"),
                "sport": game.get("sport_key"),
                "commence_time": game.get("commence_time"),
                "home_team": game.get("home_team"),
                "away_team": game.get("away_team")
            }

            # Extract odds from each bookmaker
            for bookmaker in game.get("bookmakers", []):
                book = bookmaker.get("key")

                for market in bookmaker.get("markets", []):
                    market_key = market.get("key")

                    for outcome in market.get("outcomes", []):
                        col_name = f"{book}_{market_key}_{outcome.get('name', '').lower().replace(' ', '_')}"
                        game_info[col_name] = outcome.get("price")

                        if "point" in outcome:
                            game_info[f"{col_name}_line"] = outcome.get("point")

            games.append(game_info)

        return pd.DataFrame(games)

    def get_scores(self, sport: str, days_from: int = 1) -> pd.DataFrame:
        """Get scores for completed games.

        Games whose ``scores`` field is missing or null get None for both
        scores instead of raising.
        """
        sport_key = self.SPORTS.get(sport, sport)
        data = self._get_json(f"/sports/{sport_key}/scores", daysFrom=days_from)

        games = []
        for game in data:
            # "scores" may be present but null, so .get's default is not enough.
            scores = game.get("scores") or []
            home_team = game.get("home_team")
            away_team = game.get("away_team")
            home_score = next((s["score"] for s in scores if s["name"] == home_team), None)
            away_score = next((s["score"] for s in scores if s["name"] == away_team), None)

            games.append({
                "game_id": game.get("id"),
                "home_team": home_team,
                "away_team": away_team,
                "home_score": home_score,
                "away_score": away_score,
                "completed": game.get("completed")
            })

        return pd.DataFrame(games)

    def find_best_odds(self, odds_df: pd.DataFrame, team: str, market: str = "h2h") -> dict:
        """Find best odds for a team across all books.

        Columns are expected in the ``{book}_{market}_{outcome}`` form that
        ``get_odds`` produces. The team name is normalised the same way as
        the column names (lower-cased, spaces to underscores), so
        "Kansas City Chiefs" matches ``..._kansas_city_chiefs``.

        Args:
            odds_df: Frame as returned by ``get_odds``.
            team: Full or partial team name.
            market: Market key to look in (e.g. "h2h", "spreads").

        Returns:
            ``{"book", "odds", "all_odds"}`` for the numerically highest
            price, or an empty dict when no price is found. Each book's
            price is the first non-null value in its column.
        """
        team_key = team.lower().replace(" ", "_")
        separator = f"_{market}_"

        best_odds = {}
        for col in odds_df.columns:
            # Split on the market so bookmaker keys that themselves contain
            # underscores (e.g. "williamhill_us") stay intact.
            book, found, outcome = str(col).partition(separator)
            if not found or team_key not in outcome.lower():
                continue
            if outcome.endswith("_line"):
                # Point/line columns hold the handicap, not a price.
                continue

            prices = odds_df[col].dropna()
            if prices.empty:
                continue
            best_odds[book] = prices.iloc[0]

        if best_odds:
            best_book = max(best_odds, key=best_odds.get)
            return {"book": best_book, "odds": best_odds[best_book], "all_odds": best_odds}

        return {}

# Usage (requires API key from the-odds-api.com)
# odds_client = OddsAPIClient("YOUR_API_KEY")
# nfl_odds = odds_client.get_odds("nfl")
# print(nfl_odds.head())

python

NHL API Game Data

Access NHL game data, player stats, and play-by-play using the official NHL API.

import requests
import pandas as pd
from datetime import datetime

class NHLClient:
    """Client for NHL Stats API.

    All requests go through one ``requests.Session``. Every call has a
    timeout and raises ``requests.HTTPError`` on a non-2xx response.
    """

    BASE_URL = "https://api-web.nhle.com/v1"

    # Seconds to wait for a response; requests itself never times out.
    TIMEOUT = 10

    def __init__(self):
        self.session = requests.Session()

    def _get_json(self, path: str, params: dict = None):
        """GET ``BASE_URL + path`` and return the decoded JSON body.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = self.session.get(
            f"{self.BASE_URL}{path}", params=params, timeout=self.TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def get_schedule(self, date: str = None) -> dict:
        """Get schedule for a specific date.

        Args:
            date: Date in ``YYYY-MM-DD`` form; defaults to today's local date.
        """
        if date is None:
            date = datetime.now().strftime("%Y-%m-%d")

        return self._get_json(f"/schedule/{date}")

    def get_standings(self, season: str = None) -> pd.DataFrame:
        """Get current standings.

        Args:
            season: Accepted for interface compatibility but not used; the
                request always goes to the ``standings/now`` endpoint.

        Returns:
            One row per entry of the response's ``standings`` list.
        """
        data = self._get_json("/standings/now")

        teams = [
            {
                "team": team.get("teamName", {}).get("default"),
                "team_abbrev": team.get("teamAbbrev", {}).get("default"),
                "conference": team.get("conferenceName"),
                "division": team.get("divisionName"),
                "games_played": team.get("gamesPlayed"),
                "wins": team.get("wins"),
                "losses": team.get("losses"),
                "ot_losses": team.get("otLosses"),
                "points": team.get("points"),
                "goal_diff": team.get("goalDifferential"),
                "goals_for": team.get("goalFor"),
                "goals_against": team.get("goalAgainst")
            }
            for team in data.get("standings", [])
        ]

        return pd.DataFrame(teams)

    def get_player_stats(self, player_id: int, season: str = "20232024") -> dict:
        """Get player season statistics.

        Args:
            player_id: Player id used in the ``player/{id}/landing`` path.
            season: Accepted for interface compatibility but not used; the
                full landing payload is returned as-is.
        """
        return self._get_json(f"/player/{player_id}/landing")

    def get_game_boxscore(self, game_id: int) -> dict:
        """Get game boxscore."""
        return self._get_json(f"/gamecenter/{game_id}/boxscore")

    def get_game_playbyplay(self, game_id: int) -> pd.DataFrame:
        """Get play-by-play data for a game.

        Returns:
            One row per entry of the response's ``plays`` list; detail
            fields missing from a play are left as None (``description``
            defaults to an empty string).
        """
        data = self._get_json(f"/gamecenter/{game_id}/play-by-play")

        plays = []
        for play in data.get("plays", []):
            details = play.get("details", {})
            plays.append({
                "period": play.get("periodDescriptor", {}).get("number"),
                "time": play.get("timeInPeriod"),
                "type": play.get("typeDescKey"),
                "description": details.get("reason", ""),
                "x_coord": details.get("xCoord"),
                "y_coord": details.get("yCoord")
            })

        return pd.DataFrame(plays)

    def search_players(self, query: str) -> list:
        """Search for players by name.

        The query is sent as the ``q`` parameter through ``params`` so that
        spaces, ``&`` and other reserved characters are URL-encoded instead
        of corrupting the query string.
        """
        return self._get_json("/search/player", params={"q": query})

# Usage: runs live requests against the NHL API when this snippet is executed.
nhl = NHLClient()

# Get standings and show the ten teams with the most points.
standings = nhl.get_standings()
print("NHL Standings:")
top_by_points = standings.sort_values("points", ascending=False).head(10)
print(top_by_points)

# Get schedule and print every game as "AWAY @ HOME".
schedule = nhl.get_schedule()
for game_week in schedule.get("gameWeek", []):
    for game in game_week.get("games", []):
        home = game.get("homeTeam", {}).get("abbrev")
        away = game.get("awayTeam", {}).get("abbrev")
        matchup = f"{away} @ {home}"
        print(matchup)

python

Weather API for Game Conditions

Fetch weather data for outdoor sports games using OpenWeatherMap API.

import requests
import pandas as pd
from datetime import datetime, timedelta
from typing import Dict, Optional

class SportWeatherClient:
    """Weather data client for sports analytics.

    Thin wrapper around the OpenWeatherMap 2.5 ``weather`` and ``forecast``
    endpoints plus a small table of NFL stadium coordinates.  Every request
    is sent with ``units=imperial``.
    """

    BASE_URL = "https://api.openweathermap.org/data/2.5"

    # Seconds to wait on the API.  requests applies no timeout by default,
    # so a stalled connection would otherwise block the caller forever.
    REQUEST_TIMEOUT = 30

    # Lower-cased condition groups that count as active precipitation.
    PRECIP_CONDITIONS = ("rain", "snow", "drizzle", "thunderstorm")

    # NFL stadium coordinates
    STADIUMS = {
        "lambeau_field": {"lat": 44.5013, "lon": -88.0622, "team": "Green Bay Packers"},
        "arrowhead": {"lat": 39.0489, "lon": -94.4839, "team": "Kansas City Chiefs"},
        "gillette": {"lat": 42.0909, "lon": -71.2643, "team": "New England Patriots"},
        "mile_high": {"lat": 39.7439, "lon": -105.0201, "team": "Denver Broncos"},
        "soldier_field": {"lat": 41.8623, "lon": -87.6167, "team": "Chicago Bears"},
        "heinz_field": {"lat": 40.4468, "lon": -80.0158, "team": "Pittsburgh Steelers"},
        "metlife": {"lat": 40.8128, "lon": -74.0742, "team": "Giants/Jets"},
    }

    def __init__(self, api_key: str):
        """Store the OpenWeatherMap API key sent as ``appid`` on each request."""
        self.api_key = api_key

    def _fetch(self, endpoint: str, lat: float, lon: float) -> Dict:
        """GET ``endpoint`` for a coordinate pair and return the decoded JSON.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        params = {
            "lat": lat,
            "lon": lon,
            "appid": self.api_key,
            "units": "imperial",
        }
        response = requests.get(
            f"{self.BASE_URL}/{endpoint}",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_current_weather(self, lat: float, lon: float) -> Dict:
        """Get current weather conditions.

        Returns:
            A dict with temp, feels_like, humidity, wind_speed, wind_deg,
            conditions, description, visibility (km) and precipitation
            (the sum of the last-hour rain and snow amounts, 0 when absent).
        """
        data = self._fetch("weather", lat, lon)
        main = data["main"]
        wind = data["wind"]
        sky = data["weather"][0]

        return {
            "temp": main["temp"],
            "feels_like": main["feels_like"],
            "humidity": main["humidity"],
            "wind_speed": wind["speed"],
            "wind_deg": wind.get("deg", 0),
            "conditions": sky["main"],
            "description": sky["description"],
            "visibility": data.get("visibility", 10000) / 1000,  # km
            "precipitation": data.get("rain", {}).get("1h", 0) + data.get("snow", {}).get("1h", 0),
        }

    def get_forecast(self, lat: float, lon: float, hours: int = 24) -> pd.DataFrame:
        """Get the weather forecast for roughly the next ``hours`` hours.

        The slice below keeps ``hours // 3`` entries, i.e. it assumes the
        forecast list is spaced in 3-hour steps; ``hours`` below 3 therefore
        yields an empty frame.

        Returns:
            A DataFrame with datetime, temp, feels_like, humidity,
            wind_speed, conditions and pop (percent) columns.
        """
        data = self._fetch("forecast", lat, lon)

        forecasts = [
            {
                # Naive timestamp in the local timezone of the machine
                "datetime": datetime.fromtimestamp(item["dt"]),
                "temp": item["main"]["temp"],
                "feels_like": item["main"]["feels_like"],
                "humidity": item["main"]["humidity"],
                "wind_speed": item["wind"]["speed"],
                "conditions": item["weather"][0]["main"],
                "pop": item.get("pop", 0) * 100,  # Probability of precipitation
            }
            for item in data["list"][:hours // 3]
        ]

        return pd.DataFrame(forecasts)

    def get_game_weather(self, stadium: str) -> Optional[Dict]:
        """Get current weather for a stadium listed in ``STADIUMS``.

        Returns:
            The ``get_current_weather`` dict extended with "stadium" and
            "team", or None when ``stadium`` is not a known key.
        """
        coords = self.STADIUMS.get(stadium)
        if coords is None:
            return None

        weather = self.get_current_weather(coords["lat"], coords["lon"])
        weather["stadium"] = stadium
        weather["team"] = coords["team"]

        return weather

    def categorize_conditions(self, weather: Dict) -> Dict[str, str]:
        """Categorize weather impact on game.

        Args:
            weather: Dict with at least "temp", "wind_speed" and
                "conditions"; "precipitation" is used when present.

        Returns:
            A dict with temp_impact (extreme_cold/cold/hot/moderate),
            wind_impact (high/moderate/low) and precip_impact (active/none).
        """
        impacts = {}

        # Temperature impact
        temp = weather["temp"]
        if temp < 20:
            impacts["temp_impact"] = "extreme_cold"
        elif temp < 40:
            impacts["temp_impact"] = "cold"
        elif temp > 90:
            impacts["temp_impact"] = "hot"
        else:
            impacts["temp_impact"] = "moderate"

        # Wind impact
        wind = weather["wind_speed"]
        if wind > 20:
            impacts["wind_impact"] = "high"
        elif wind > 10:
            impacts["wind_impact"] = "moderate"
        else:
            impacts["wind_impact"] = "low"

        # Precipitation: drizzle and thunderstorms are reported as their own
        # condition groups rather than as "Rain", and a measured amount can
        # accompany any label, so check both the label and the amount.
        conditions = weather["conditions"].lower()
        measured = weather.get("precipitation") or 0
        label_is_wet = any(kind in conditions for kind in self.PRECIP_CONDITIONS)
        impacts["precip_impact"] = "active" if (label_is_wet or measured > 0) else "none"

        return impacts

    def all_stadiums_weather(self) -> pd.DataFrame:
        """Get weather plus impact categories for all tracked stadiums.

        Issues one current-weather request per entry in ``STADIUMS``.
        """
        weather_data = []

        for stadium in self.STADIUMS:
            weather = self.get_game_weather(stadium)
            if weather:
                impacts = self.categorize_conditions(weather)
                weather_data.append({**weather, **impacts})

        return pd.DataFrame(weather_data)

# Usage (requires OpenWeatherMap API key)
# weather = SportWeatherClient("YOUR_API_KEY")
# lambeau = weather.get_game_weather("lambeau_field")
# print(f"Lambeau Field: {lambeau['temp']}°F, {lambeau['conditions']}")

python

Sportradar API Client

Professional-grade API client for Sportradar data including live feeds, stats, and odds.

import requests
import pandas as pd
from datetime import datetime
from typing import Optional

class SportradarClient:
    """Client for Sportradar API.

    Each public method takes a league key from ``BASE_URLS`` and requests
    ``<base>/<endpoint>.json`` with the API key as a query parameter.
    """

    BASE_URLS = {
        "mlb": "https://api.sportradar.us/mlb/trial/v7/en",
        "nba": "https://api.sportradar.us/nba/trial/v8/en",
        "nfl": "https://api.sportradar.us/nfl/official/trial/v7/en",
        "nhl": "https://api.sportradar.us/nhl/trial/v7/en"
    }

    # Seconds to wait on the API.  requests applies no timeout by default,
    # so a stalled connection would otherwise block the caller forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key: str):
        """Store the API key and open a reusable HTTP session."""
        self.api_key = api_key
        self.session = requests.Session()

    def _request(self, league: str, endpoint: str) -> dict:
        """Make API request.

        Args:
            league: Key of ``BASE_URLS``; matched case-insensitively.
            endpoint: Path below the league base URL, without ".json".

        Returns:
            The decoded JSON body.

        Raises:
            ValueError: If ``league`` is not a known key.
            requests.HTTPError: If the API answers with an error status.
        """
        key = league.lower() if isinstance(league, str) else league
        base_url = self.BASE_URLS.get(key)
        if not base_url:
            raise ValueError(f"Unknown league: {league}")

        url = f"{base_url}/{endpoint}.json"
        params = {"api_key": self.api_key}

        response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    def get_schedule(self, league: str, year: int, season_type: str = "REG") -> pd.DataFrame:
        """Get season schedule.

        Returns:
            A DataFrame with game_id, scheduled, home_team, away_team, venue
            and status columns, one row per game.
        """
        data = self._request(league, f"games/{year}/{season_type}/schedule")

        # A payload with "weeks" nests each week's games under "games";
        # otherwise the top-level "games" list is read, and each of its
        # dict entries is treated as a game itself.
        containers = data.get("weeks", data.get("games", []))

        games = []
        for container in containers:
            if isinstance(container, dict):
                game_list = container.get("games", [container])
            else:
                game_list = [container]
            for game in game_list:
                games.append({
                    "game_id": game.get("id"),
                    "scheduled": game.get("scheduled"),
                    "home_team": game.get("home", {}).get("name"),
                    "away_team": game.get("away", {}).get("name"),
                    "venue": game.get("venue", {}).get("name"),
                    "status": game.get("status")
                })

        return pd.DataFrame(games)

    def get_standings(self, league: str, year: int, season_type: str = "REG") -> pd.DataFrame:
        """Get standings.

        Only the conferences -> divisions -> teams nesting is read; a payload
        without a top-level "conferences" list yields an empty DataFrame.
        """
        data = self._request(league, f"seasons/{year}/{season_type}/standings")

        teams = [
            {
                "team_id": team.get("id"),
                "name": team.get("name"),
                "conference": conference.get("name"),
                "division": division.get("name"),
                "wins": team.get("wins"),
                "losses": team.get("losses"),
                "win_pct": team.get("win_pct"),
                "games_behind": team.get("games_behind")
            }
            for conference in data.get("conferences", [])
            for division in conference.get("divisions", [])
            for team in division.get("teams", [])
        ]

        return pd.DataFrame(teams)

    def get_player_profile(self, league: str, player_id: str) -> dict:
        """Get player profile and stats."""
        return self._request(league, f"players/{player_id}/profile")

    def get_team_profile(self, league: str, team_id: str) -> dict:
        """Get team profile with roster."""
        return self._request(league, f"teams/{team_id}/profile")

    def get_game_boxscore(self, league: str, game_id: str) -> dict:
        """Get game boxscore."""
        return self._request(league, f"games/{game_id}/boxscore")

    def get_play_by_play(self, league: str, game_id: str) -> dict:
        """Get play-by-play data."""
        return self._request(league, f"games/{game_id}/pbp")

    def get_daily_transfers(self, league: str, year: int, month: int, day: int) -> dict:
        """Get player transactions for a date (month and day are zero-padded)."""
        return self._request(league, f"league/{year}/{month:02d}/{day:02d}/transfers")

# Usage (requires Sportradar API key)
# sr = SportradarClient("YOUR_API_KEY")
# schedule = sr.get_schedule("nfl", 2024)
# standings = sr.get_standings("nba", 2024)

python

NBA Stats API (stats.nba.com)

Access official NBA statistics using the stats.nba.com API endpoints.

import requests
import pandas as pd
from typing import Optional

class NBAStatsClient:
    """Client for NBA Stats API (stats.nba.com).

    Every endpoint helper returns the first result set of the response as a
    DataFrame whose columns are the result set's ``headers``.
    """

    BASE_URL = "https://stats.nba.com/stats"

    # Browser-like headers installed on the session for every request.
    HEADERS = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.nba.com/",
        "x-nba-stats-origin": "stats",
        "x-nba-stats-token": "true",
        "Connection": "keep-alive",
    }

    def __init__(self):
        """Open a session pre-loaded with ``HEADERS``."""
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    def _request(self, endpoint: str, params: dict) -> pd.DataFrame:
        """Make API request and parse response.

        Returns:
            The first entry of ``resultSets`` as a DataFrame.  A response
            with a missing or empty ``resultSets`` yields an empty frame
            instead of raising IndexError.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        response = self.session.get(url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        result_sets = data.get("resultSets") or [{}]
        result_set = result_sets[0]

        headers = result_set.get("headers", [])
        rows = result_set.get("rowSet", [])

        return pd.DataFrame(rows, columns=headers)

    def get_player_stats(self, season: str = "2023-24",
                         per_mode: str = "PerGame",
                         season_type: str = "Regular Season") -> pd.DataFrame:
        """Get league-wide player statistics."""
        params = {
            "LeagueID": "00",
            "Season": season,
            "SeasonType": season_type,
            "PerMode": per_mode,
            "MeasureType": "Base",
            "PaceAdjust": "N",
            "Rank": "N",
            "PlusMinus": "N"
        }

        return self._request("leaguedashplayerstats", params)

    def get_player_game_log(self, player_id: int, season: str = "2023-24") -> pd.DataFrame:
        """Get player game-by-game stats (regular season only)."""
        params = {
            "PlayerID": player_id,
            "Season": season,
            "SeasonType": "Regular Season"
        }

        return self._request("playergamelog", params)

    def get_team_stats(self, season: str = "2023-24",
                      per_mode: str = "PerGame") -> pd.DataFrame:
        """Get team statistics (regular season only)."""
        params = {
            "LeagueID": "00",
            "Season": season,
            "SeasonType": "Regular Season",
            "PerMode": per_mode,
            "MeasureType": "Base"
        }

        return self._request("leaguedashteamstats", params)

    def get_shot_chart(self, player_id: int, season: str = "2023-24") -> pd.DataFrame:
        """Get player shot chart data (regular season field-goal attempts)."""
        params = {
            "PlayerID": player_id,
            "Season": season,
            "SeasonType": "Regular Season",
            "ContextMeasure": "FGA",
            "LeagueID": "00"
        }

        return self._request("shotchartdetail", params)

    def get_play_by_play(self, game_id: str) -> pd.DataFrame:
        """Get play-by-play data for a game (periods 0 through 10)."""
        params = {
            "GameID": game_id,
            "StartPeriod": 0,
            "EndPeriod": 10
        }

        return self._request("playbyplayv2", params)

    def search_player(self, name: str) -> Optional[int]:
        """Find player ID by name.

        Args:
            name: Text to look for inside the player's display name.  It is
                matched literally and case-insensitively, so punctuation
                such as "." or "(" has no special meaning.

        Returns:
            The PERSON_ID of the first matching row as a plain int, or None
            when nothing matches or the player list could not be parsed.
        """
        all_players = self._request("commonallplayers", {
            "LeagueID": "00",
            "Season": "2023-24",
            "IsOnlyCurrentSeason": 0
        })

        if "DISPLAY_FIRST_LAST" not in all_players.columns:
            return None

        # regex=False: a name is plain text, not a pattern.
        # na=False: rows without a name simply do not match.
        mask = all_players["DISPLAY_FIRST_LAST"].str.contains(
            name, case=False, regex=False, na=False
        )
        matches = all_players[mask]

        if matches.empty:
            return None
        return int(matches.iloc[0]["PERSON_ID"])

# Usage: league-wide averages first, then one player's game log.
nba = NBAStatsClient()

# Get league player stats
player_stats = nba.get_player_stats(season="2023-24")
leader_columns = ["PLAYER_NAME", "TEAM_ABBREVIATION", "PTS", "REB", "AST"]
print(player_stats[leader_columns].head(20))

# Get specific player game log
lebron_id = 2544  # LeBron James
game_log = nba.get_player_game_log(lebron_id, "2023-24")
log_columns = ["GAME_DATE", "MATCHUP", "PTS", "REB", "AST"]
print(game_log[log_columns].head(10))

python

Soccer Data API (Football-Data.org)

Fetch comprehensive soccer data including matches, standings, and player stats.

import requests
import pandas as pd
from datetime import datetime, timedelta

class FootballDataClient:
    """Client for Football-Data.org API.

    Competitions can be given either as one of the short aliases in
    ``COMPETITIONS`` (case-insensitive) or as a competition code/id, which
    is passed through unchanged.
    """

    BASE_URL = "https://api.football-data.org/v4"

    # Seconds to wait on the API.  requests applies no timeout by default,
    # so a stalled connection would otherwise block the caller forever.
    REQUEST_TIMEOUT = 30

    COMPETITIONS = {
        "epl": "PL",       # Premier League
        "laliga": "PD",    # La Liga
        "bundesliga": "BL1",
        "seriea": "SA",
        "ligue1": "FL1",
        "ucl": "CL",       # Champions League
        "worldcup": "WC"
    }

    def __init__(self, api_key: str):
        """Store the API key and build the auth header sent on each request."""
        self.api_key = api_key
        self.headers = {"X-Auth-Token": api_key}

    def _request(self, endpoint: str, params: dict = None) -> dict:
        """Make API request.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        response = requests.get(
            url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def _competition_code(self, competition):
        """Translate an alias such as "epl" or "EPL" to its competition code.

        Anything that is not a known alias is returned unchanged so callers
        can keep passing codes or ids directly.
        """
        alias = str(competition).strip().lower()
        return self.COMPETITIONS.get(alias, competition)

    def get_competitions(self) -> pd.DataFrame:
        """Get available competitions."""
        data = self._request("competitions")

        competitions = [
            {
                "id": comp.get("id"),
                "code": comp.get("code"),
                "name": comp.get("name"),
                "area": comp.get("area", {}).get("name"),
                "type": comp.get("type"),
                "current_season": comp.get("currentSeason", {}).get("id")
            }
            for comp in data.get("competitions", [])
        ]

        return pd.DataFrame(competitions)

    def get_standings(self, competition: str) -> pd.DataFrame:
        """Get competition standings.

        Only tables whose type is "TOTAL" are included.
        """
        comp_code = self._competition_code(competition)
        data = self._request(f"competitions/{comp_code}/standings")

        teams = []
        for standing in data.get("standings", []):
            if standing.get("type") != "TOTAL":
                continue
            for team in standing.get("table", []):
                teams.append({
                    "position": team.get("position"),
                    "team": team.get("team", {}).get("name"),
                    "played": team.get("playedGames"),
                    "won": team.get("won"),
                    "draw": team.get("draw"),
                    "lost": team.get("lost"),
                    "goals_for": team.get("goalsFor"),
                    "goals_against": team.get("goalsAgainst"),
                    "goal_diff": team.get("goalDifference"),
                    "points": team.get("points")
                })

        return pd.DataFrame(teams)

    def get_matches(self, competition: str, status: str = None,
                   date_from: str = None, date_to: str = None) -> pd.DataFrame:
        """Get matches for a competition.

        Args:
            competition: Alias or competition code.
            status: Optional status filter (SCHEDULED, LIVE, FINISHED).
            date_from: Optional lower date bound, sent as ``dateFrom``.
            date_to: Optional upper date bound, sent as ``dateTo``.
        """
        comp_code = self._competition_code(competition)

        # Only filters that were actually supplied are sent.
        params = {}
        if status:
            params["status"] = status  # SCHEDULED, LIVE, FINISHED
        if date_from:
            params["dateFrom"] = date_from
        if date_to:
            params["dateTo"] = date_to

        data = self._request(f"competitions/{comp_code}/matches", params)

        matches = []
        for match in data.get("matches", []):
            score = match.get("score", {})
            full_time = score.get("fullTime", {})
            matches.append({
                "match_id": match.get("id"),
                "matchday": match.get("matchday"),
                "date": match.get("utcDate"),
                "status": match.get("status"),
                "home_team": match.get("homeTeam", {}).get("name"),
                "away_team": match.get("awayTeam", {}).get("name"),
                "home_score": full_time.get("home"),
                "away_score": full_time.get("away"),
                "winner": score.get("winner")
            })

        return pd.DataFrame(matches)

    def get_team(self, team_id: int) -> dict:
        """Get team details."""
        return self._request(f"teams/{team_id}")

    def get_top_scorers(self, competition: str) -> pd.DataFrame:
        """Get top scorers for a competition."""
        comp_code = self._competition_code(competition)
        data = self._request(f"competitions/{comp_code}/scorers")

        scorers = [
            {
                "player": scorer.get("player", {}).get("name"),
                "team": scorer.get("team", {}).get("name"),
                "goals": scorer.get("goals"),
                "assists": scorer.get("assists"),
                "penalties": scorer.get("penalties")
            }
            for scorer in data.get("scorers", [])
        ]

        return pd.DataFrame(scorers)

# Usage (requires API key from football-data.org)
# fd = FootballDataClient("YOUR_API_KEY")
# standings = fd.get_standings("epl")
# print(standings)

python

Fantasy Sports API Integration

Integrate with Yahoo Fantasy Sports API for leagues, rosters, and player data.

import requests
from requests_oauthlib import OAuth2Session
import pandas as pd

class YahooFantasyClient:
    """Client for Yahoo Fantasy Sports API.

    Typical flow: ``get_auth_url()`` -> user consents -> ``fetch_token()``
    with the redirect URL -> data methods.  The parsing helpers expect the
    JSON layout this class was written against, in which records are lists
    of small single-key dicts; those fragments are merged by key rather
    than read by position, because the position of a field shifts when an
    optional fragment is present, absent or replaced by an empty list.
    """

    AUTH_URL = "https://api.login.yahoo.com/oauth2/request_auth"
    TOKEN_URL = "https://api.login.yahoo.com/oauth2/get_token"
    BASE_URL = "https://fantasysports.yahooapis.com/fantasy/v2"

    # Seconds to wait on the API.  requests applies no timeout by default,
    # so a stalled connection would otherwise block the caller forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, client_id: str, client_secret: str, redirect_uri: str):
        """Store the OAuth application credentials; no network access here."""
        self.client_id = client_id
        self.client_secret = client_secret
        self.redirect_uri = redirect_uri
        self.oauth = None
        self.token = None

    def get_auth_url(self) -> str:
        """Get authorization URL for user consent.

        Also creates the OAuth session that ``fetch_token`` reuses.
        """
        self.oauth = OAuth2Session(
            self.client_id,
            redirect_uri=self.redirect_uri,
            scope=["fspt-r"]
        )
        auth_url, _ = self.oauth.authorization_url(self.AUTH_URL)
        return auth_url

    def fetch_token(self, authorization_response: str):
        """Exchange authorization code for access token.

        Raises:
            RuntimeError: If ``get_auth_url`` has not been called first.
        """
        if self.oauth is None:
            raise RuntimeError("No OAuth session: call get_auth_url() before fetch_token().")
        self.token = self.oauth.fetch_token(
            self.TOKEN_URL,
            authorization_response=authorization_response,
            client_secret=self.client_secret
        )

    def _request(self, endpoint: str) -> dict:
        """Make authenticated API request.

        Raises:
            RuntimeError: If no access token has been fetched yet.
            requests.HTTPError: If the API answers with an error status.
        """
        if not self.token:
            raise RuntimeError("Not authenticated: call fetch_token() before requesting data.")

        url = f"{self.BASE_URL}/{endpoint}"
        headers = {"Authorization": f"Bearer {self.token['access_token']}"}
        params = {"format": "json"}

        response = requests.get(
            url, headers=headers, params=params, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _flatten(fragments) -> dict:
        """Merge a list of single-key dict fragments into one dict.

        Non-dict entries (for example empty lists) are skipped.
        """
        merged = {}
        for fragment in fragments or []:
            if isinstance(fragment, dict):
                merged.update(fragment)
        return merged

    @staticmethod
    def _records(container) -> list:
        """Return the dict-valued entries of a numbered container.

        Non-dict values inside the container are skipped, and anything that
        is not a dict at all yields no records.
        """
        if not isinstance(container, dict):
            return []
        return [value for value in container.values() if isinstance(value, dict)]

    def get_user_leagues(self, game_key: str = "nfl") -> pd.DataFrame:
        """Get user's fantasy leagues.

        Returns:
            A DataFrame with league_key, league_id, name, num_teams and
            scoring_type; empty when the response holds no leagues.
        """
        data = self._request(f"users;use_login=1/games;game_keys={game_key}/leagues")

        content = data.get("fantasy_content", {})
        user = content.get("users", {}).get("0", {}).get("user", [])
        games = user[1].get("games", {}) if len(user) > 1 else {}
        game = games.get("0", {}).get("game", []) if isinstance(games, dict) else []
        league_map = game[1].get("leagues", {}) if len(game) > 1 else {}

        leagues = []
        for entry in self._records(league_map):
            league_data = entry.get("league", [])
            if league_data:
                summary = league_data[0]
                leagues.append({
                    "league_key": summary.get("league_key"),
                    "league_id": summary.get("league_id"),
                    "name": summary.get("name"),
                    "num_teams": summary.get("num_teams"),
                    "scoring_type": summary.get("scoring_type")
                })

        return pd.DataFrame(leagues)

    def get_league_standings(self, league_key: str) -> pd.DataFrame:
        """Get league standings.

        Returns:
            A DataFrame with team_key, name, rank, wins, losses, points_for
            and points_against; empty when the response holds no teams.
        """
        data = self._request(f"league/{league_key}/standings")

        league = data.get("fantasy_content", {}).get("league", [])
        standings = league[1].get("standings", []) if len(league) > 1 else []
        team_map = standings[0].get("teams", {}) if standings else {}

        teams = []
        for entry in self._records(team_map):
            team = entry.get("team", [])
            if not team:
                continue
            info = self._flatten(team[0])
            # "team_standings" is looked up in every trailing fragment, not
            # only the one at index 1.
            team_standings = self._flatten(team[1:]).get("team_standings", {})
            totals = team_standings.get("outcome_totals", {})

            teams.append({
                "team_key": info.get("team_key"),
                "name": info.get("name"),
                "rank": team_standings.get("rank"),
                "wins": totals.get("wins"),
                "losses": totals.get("losses"),
                "points_for": team_standings.get("points_for"),
                "points_against": team_standings.get("points_against")
            })

        return pd.DataFrame(teams)

    def get_roster(self, team_key: str) -> pd.DataFrame:
        """Get team roster.

        Returns:
            A DataFrame with player_key, name, position, team and status
            (None when the player record carries no status); empty when the
            response holds no players.
        """
        data = self._request(f"team/{team_key}/roster")

        team = data.get("fantasy_content", {}).get("team", [])
        roster = team[1].get("roster", {}) if len(team) > 1 else {}
        player_map = roster.get("0", {}).get("players", {}) if isinstance(roster, dict) else {}

        players = []
        for entry in self._records(player_map):
            player = entry.get("player", [])
            if not player:
                continue
            info = self._flatten(player[0])
            name = info.get("name")
            players.append({
                "player_key": info.get("player_key"),
                "name": name.get("full") if isinstance(name, dict) else None,
                "position": info.get("display_position"),
                "team": info.get("editorial_team_abbr"),
                "status": info.get("status")
            })

        return pd.DataFrame(players)

# Usage (requires Yahoo Developer credentials)
# yahoo = YahooFantasyClient(
#     client_id="YOUR_CLIENT_ID",
#     client_secret="YOUR_CLIENT_SECRET",
#     redirect_uri="https://localhost/callback"
# )
# auth_url = yahoo.get_auth_url()
# print(f"Visit: {auth_url}")

Bootstrap Confidence Intervals

Calculate bootstrap confidence intervals for sports statistics with the boot package.

library(boot)
library(dplyr)
library(ggplot2)

# Sample batting data
# One row per game of a 162-game season. `ab` and `hits` are drawn
# independently (hits ~ Binomial(4, 0.280)), so a simulated game can have
# more hits than at-bats. These draws happen before set.seed(42) below, so
# the sample itself differs from run to run.
player_games <- data.frame(
  game = 1:162,
  ab = sample(3:5, 162, replace = TRUE),
  hits = rbinom(162, 4, 0.280)
)

# Per-game average and running season average
player_games <- player_games %>%
  mutate(
    avg = hits / ab,
    cum_avg = cumsum(hits) / cumsum(ab)
  )

# Define statistic function for bootstrap
# boot() calls this with the data and a vector of resampled row indices;
# the statistic is the pooled average (total hits / total at-bats) of the
# resampled games.
batting_stat <- function(data, indices) {
  d <- data[indices, ]
  return(sum(d$hits) / sum(d$ab))
}

# Run bootstrap
# Seeding here makes the resampling reproducible for a given player_games.
set.seed(42)
boot_results <- boot(
  data = player_games,
  statistic = batting_stat,
  R = 10000
)

# View results
print(boot_results)

# Calculate confidence intervals
# BCa (bias-corrected and accelerated) is preferred
ci_results <- boot.ci(
  boot_results,
  type = c("norm", "basic", "perc", "bca")
)
print(ci_results)

# Visualize bootstrap distribution
# boot_results$t holds the R replicate values; boot_results$t0 is the
# statistic on the original sample (dashed line). Elements 4 and 5 of
# ci_results$bca are used as the interval's lower and upper endpoints
# (dotted lines and subtitle).
boot_df <- data.frame(avg = boot_results$t)

ggplot(boot_df, aes(x = avg)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50,
                 fill = "steelblue", alpha = 0.7) +
  geom_density(color = "red", size = 1) +
  geom_vline(xintercept = boot_results$t0, color = "black",
             linetype = "dashed", size = 1) +
  geom_vline(xintercept = ci_results$bca[4:5], color = "darkgreen",
             linetype = "dotted", size = 1) +
  labs(
    title = "Bootstrap Distribution of Batting Average",
    subtitle = paste("95% BCa CI: [", round(ci_results$bca[4], 3),
                     ", ", round(ci_results$bca[5], 3), "]"),
    x = "Batting Average",
    y = "Density"
  ) +
  theme_minimal()

# Bootstrap multiple statistics
# Returns a named vector, so boot() tracks three statistics at once:
#   avg             - pooled batting average
#   obp             - placeholder: the average shifted by the constants 10
#                     and 20, not a real on-base percentage
#   games_above_300 - share of resampled games with a per-game avg > .300
multi_stat <- function(data, indices) {
  d <- data[indices, ]
  c(
    avg = sum(d$hits) / sum(d$ab),
    obp = (sum(d$hits) + 10) / (sum(d$ab) + 20),  # Simplified
    games_above_300 = mean(d$avg > 0.300)
  )
}

# No seed is set before this second run, so it continues the RNG stream
# left by the code above.
multi_boot <- boot(player_games, multi_stat, R = 5000)
print(multi_boot)

Bayesian Analysis with brms

Perform Bayesian regression analysis for sports data using brms package.

library(brms)
library(dplyr)
library(ggplot2)
library(bayesplot)

# Sample player projection data
# 100 simulated players; next_war is generated from war_3yr (weight 0.6),
# a bell-shaped age term centred on age 27, and Gaussian noise (sd 0.8).
# avg_3yr and experience do not enter the simulated outcome.
set.seed(42)
players <- data.frame(
  player_id = 1:100,
  age = sample(23:38, 100, replace = TRUE),
  experience = pmax(1, sample(1:15, 100, replace = TRUE)),
  avg_3yr = rnorm(100, 0.265, 0.025),
  war_3yr = rnorm(100, 2.5, 1.5)
) %>%
  mutate(
    # Simulate next year WAR with age curve
    next_war = war_3yr * 0.6 +
               3 * exp(-(age - 27)^2 / 50) +
               rnorm(100, 0, 0.8)
  )

# Fit Bayesian model with brms
# Priors: informative based on domain knowledge
# The normal(0, 1) prior applies to every coefficient in class "b",
# including avg_3yr and I(age^2), whose predictors are on very different
# scales (predictors are not standardised here).
model <- brm(
  next_war ~ age + I(age^2) + experience + avg_3yr + war_3yr,
  data = players,
  family = gaussian(),
  prior = c(
    prior(normal(0, 5), class = "Intercept"),
    prior(normal(0, 1), class = "b"),
    prior(exponential(1), class = "sigma")
  ),
  chains = 4,
  iter = 2000,
  warmup = 1000,
  seed = 42
)

# Summary
summary(model)

# Posterior distributions
# NOTE(review): brms may name the I(age^2) coefficient differently from
# "b_Iage2" (e.g. "b_IageE2"); confirm against variables(model) before
# relying on this call.
mcmc_areas(model, pars = c("b_age", "b_Iage2", "b_experience", "b_war_3yr"),
           prob = 0.95)

# Posterior predictive check
pp_check(model, ndraws = 100)

# Predictions with uncertainty
new_player <- data.frame(
  age = 28,
  experience = 5,
  avg_3yr = 0.285,
  war_3yr = 3.5
)

# Get posterior predictions
# summary = FALSE returns the individual draws instead of a summary table,
# which is what the mean/quantile/probability calculations below need.
pred <- predict(model, newdata = new_player, summary = FALSE)

# Prediction summary
cat("Predicted WAR for new player:\n")
cat("Mean:", round(mean(pred), 2), "\n")
cat("95% CI:", round(quantile(pred, c(0.025, 0.975)), 2), "\n")

# Probability of being above average (WAR > 2)
# Share of predictive draws that exceed 2
prob_above_avg <- mean(pred > 2)
cat("P(WAR > 2):", round(prob_above_avg, 3), "\n")

# Conditional effects plot
conditional_effects(model, effects = "age")

Survival Analysis for Player Careers

Analyze player career longevity using survival analysis techniques in R.

library(survival)
library(survminer)
library(dplyr)

# Sample career data
# 200 simulated players. Career length starts from 6 years and is shifted
# by draft round (earlier rounds add more), position (pitchers -1), college
# (+0.5) and noise, then rounded and floored at 1 year.
set.seed(42)
n_players <- 200

careers <- data.frame(
  player_id = 1:n_players,
  draft_round = sample(1:10, n_players, replace = TRUE),
  position = sample(c("Pitcher", "Infielder", "Outfielder", "Catcher"),
                   n_players, replace = TRUE),
  college = sample(c("Yes", "No"), n_players, replace = TRUE, prob = c(0.6, 0.4)),
  debut_age = sample(21:26, n_players, replace = TRUE)
) %>%
  mutate(
    # Career length with position and draft effects
    base_career = 6 +
                  (11 - draft_round) * 0.3 +
                  ifelse(position == "Pitcher", -1, 0) +
                  ifelse(college == "Yes", 0.5, 0) +
                  rnorm(n_players, 0, 2),
    career_years = pmax(1, round(base_career)),
    # Censoring (still active)
    # retired = 1 is the observed event; retired = 0 (about 15% of rows)
    # marks a censored, still-active career. Censoring is drawn
    # independently of career length.
    retired = rbinom(n_players, 1, 0.85)
  )

# Create survival object
# Built from the columns directly, so it lives outside `careers`; the
# formulas below pick it up from the calling environment and rely on its
# rows staying aligned with `careers`.
surv_obj <- Surv(time = careers$career_years, event = careers$retired)

# Kaplan-Meier overall
km_fit <- survfit(surv_obj ~ 1)
print(km_fit)

# Plot survival curve
ggsurvplot(
  km_fit,
  data = careers,
  conf.int = TRUE,
  risk.table = TRUE,
  xlab = "Years in MLB",
  ylab = "Probability of Active Career",
  title = "MLB Career Survival Curve"
)

# Kaplan-Meier by position
km_position <- survfit(surv_obj ~ position, data = careers)

# pval = TRUE adds a p-value comparing the position curves to the plot
ggsurvplot(
  km_position,
  data = careers,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Position",
  palette = "jco"
)

# Cox proportional hazards model
cox_model <- coxph(
  surv_obj ~ draft_round + position + college + debut_age,
  data = careers
)

summary(cox_model)

# Forest plot of hazard ratios
ggforest(cox_model, data = careers)

# Test proportional hazards assumption
ph_test <- cox.zph(cox_model)
print(ph_test)

# Predicted survival curves for different profiles
# Three profiles that differ only in draft round; position, college and
# debut_age are recycled across the rows.
new_players <- data.frame(
  draft_round = c(1, 5, 10),
  position = "Infielder",
  college = "Yes",
  debut_age = 23
)

pred_surv <- survfit(cox_model, newdata = new_players)
# NOTE(review): ggsurvplot() may require an explicit `data =` argument for
# curves derived from a Cox model - confirm this call runs as written.
ggsurvplot(pred_surv, legend.labs = c("Round 1", "Round 5", "Round 10"))

Lahman Database Analysis

Analyze historical baseball data using the Lahman package with dplyr for aggregations.

library(Lahman)
library(dplyr)
library(ggplot2)

# Load batting data
# Seasons from 1950 on, collapsed to one row per player-season (rows for
# the same playerID and yearID are summed), then restricted to seasons
# with at least 400 at-bats.
batting <- Lahman::Batting %>%
  filter(yearID >= 1950) %>%
  group_by(playerID, yearID) %>%
  summarize(
    AB = sum(AB, na.rm = TRUE),
    H = sum(H, na.rm = TRUE),
    HR = sum(HR, na.rm = TRUE),
    RBI = sum(RBI, na.rm = TRUE),
    BB = sum(BB, na.rm = TRUE),
    SO = sum(SO, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(AB >= 400) %>%
  mutate(
    AVG = H / AB,
    # Simplified OBP: built from H, BB and AB only
    OBP = (H + BB) / (AB + BB),
    # Every non-HR hit is counted as a single, so this expression reduces
    # algebraically to 3 * HR / AB; doubles and triples are not reflected.
    ISO = (HR * 4 + (H - HR)) / AB - AVG  # Simplified ISO
  )

# Add player names
people <- Lahman::People %>%
  select(playerID, nameFirst, nameLast) %>%
  mutate(name = paste(nameFirst, nameLast))

# left_join keeps every batting row even if a playerID has no People entry
batting_named <- batting %>%
  left_join(people, by = "playerID")

# Top seasons by HR
top_hr_seasons <- batting_named %>%
  arrange(desc(HR)) %>%
  head(20) %>%
  select(name, yearID, AB, HR, AVG, OBP)

print(top_hr_seasons)

# Visualize HR trend over time
# Averages are over the 400+ AB player-seasons kept above, so `players`
# is the number of such seasons in each year.
hr_trend <- batting %>%
  group_by(yearID) %>%
  summarize(
    avg_hr = mean(HR),
    total_hr = sum(HR),
    players = n()
  )

ggplot(hr_trend, aes(x = yearID, y = avg_hr)) +
  geom_line(color = "blue", size = 1) +
  geom_smooth(method = "loess", se = TRUE, alpha = 0.2) +
  labs(
    title = "Average Home Runs per Qualified Batter",
    x = "Year",
    y = "Average HR"
  ) +
  theme_minimal()

tidymodels Player Prediction

Build predictive model using tidymodels workflow for player performance forecasting.

library(tidymodels)
library(dplyr)

# Sample player data
# 500 simulated players; the target is 0.7 * war_3yr plus Gaussian noise,
# so war_3yr is the only predictor that carries signal.
set.seed(42)
player_data <- tibble(
  player_id = 1:500,
  age = sample(22:38, 500, replace = TRUE),
  experience = sample(1:15, 500, replace = TRUE),
  avg_3yr = runif(500, 0.220, 0.320),
  obp_3yr = runif(500, 0.280, 0.420),
  slg_3yr = runif(500, 0.350, 0.600),
  war_3yr = runif(500, -1, 8),
  next_year_war = war_3yr * 0.7 + rnorm(500, 0, 1)  # Target
)

# Split data
# 80/20 split, stratified on the outcome so both parts cover its range
data_split <- initial_split(player_data, prop = 0.8, strata = next_year_war)
train_data <- training(data_split)
test_data <- testing(data_split)

# Define recipe (preprocessing)
# Step order matters: step_poly() replaces `age` with polynomial columns,
# so the age:experience interaction has to be created while `age` still
# exists. With the interaction placed after step_poly() the recipe refers
# to a column that is no longer there.
war_recipe <- recipe(next_year_war ~ age + experience + avg_3yr + obp_3yr + slg_3yr + war_3yr,
                     data = train_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_interact(terms = ~ age:experience) %>%
  step_poly(age, degree = 2)

# Define model
# Random forest with 500 trees; mtry and min_n are left to be tuned
rf_model <- rand_forest(
  mtry = tune(),
  trees = 500,
  min_n = tune()
) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Create workflow
war_workflow <- workflow() %>%
  add_recipe(war_recipe) %>%
  add_model(rf_model)

# Cross-validation
cv_folds <- vfold_cv(train_data, v = 5)

# Tune hyperparameters
# Regular 5 x 5 grid: 25 candidate settings, each evaluated on the 5 folds
rf_grid <- grid_regular(
  mtry(range = c(2, 6)),
  min_n(range = c(5, 20)),
  levels = 5
)

tune_results <- tune_grid(
  war_workflow,
  resamples = cv_folds,
  grid = rf_grid,
  metrics = metric_set(rmse, rsq, mae)
)

# Best model
# Pick the candidate with the lowest cross-validated RMSE
best_params <- select_best(tune_results, metric = "rmse")
final_workflow <- finalize_workflow(war_workflow, best_params)

# Fit final model
final_fit <- fit(final_workflow, data = train_data)

# Evaluate on test set
predictions <- predict(final_fit, test_data) %>%
  bind_cols(test_data)

# The result is stored under the name `metrics`, the same name as the
# function being called; the call on the right-hand side still resolves to
# the function.
metrics <- predictions %>%
  metrics(truth = next_year_war, estimate = .pred)

print(metrics)

Sports Regression Diagnostics

Comprehensive regression diagnostics for sports statistics models in R.

library(dplyr)
library(ggplot2)
library(broom)
library(car)

# Example: Predict ERA from various pitching metrics
# Ten hand-entered pitcher lines; each vector position is one pitcher.
pitching_data <- tibble(
  era = c(3.2, 4.1, 3.8, 2.9, 5.2, 3.5, 4.8, 3.1, 4.4, 3.7),
  fip = c(3.1, 4.3, 3.6, 2.8, 5.0, 3.4, 4.5, 3.0, 4.2, 3.5),
  whip = c(1.10, 1.32, 1.21, 1.05, 1.45, 1.18, 1.38, 1.08, 1.28, 1.15),
  k_per_9 = c(9.2, 7.5, 8.8, 10.1, 6.5, 8.2, 7.0, 9.8, 7.8, 8.5),
  bb_per_9 = c(2.5, 3.2, 2.8, 2.1, 4.0, 2.9, 3.5, 2.3, 3.1, 2.7)
)

# Fit model
# Ordinary least squares with four predictors on ten rows, so only five
# residual degrees of freedom remain — treat the output as illustrative.
model <- lm(era ~ fip + whip + k_per_9 + bb_per_9, data = pitching_data)

# Summary
summary(model)

# Get tidy output
# One row per coefficient, with confidence-interval columns added.
tidy_results <- tidy(model, conf.int = TRUE)
print(tidy_results)

# Model statistics
# One-row model-level summary.
glance(model)

# Residual diagnostics
# augment() adds per-observation columns such as .fitted, .resid and
# .std.resid, which the plots below rely on.
augmented <- augment(model)

# NOTE: p1, p2 and p3 are only assigned here; nothing in this snippet prints
# them, so call print(p1) etc. to display the plots.

# 1. Residuals vs Fitted
# Residuals against fitted values, with a zero reference line and a smoother.
p1 <- ggplot(augmented, aes(.fitted, .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(se = FALSE) +
  labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals") +
  theme_minimal()

# 2. Q-Q Plot
# Standardized residuals against normal quantiles, with a reference line.
p2 <- ggplot(augmented, aes(sample = .std.resid)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "Normal Q-Q", x = "Theoretical", y = "Standardized Residuals") +
  theme_minimal()

# 3. Scale-Location
# sqrt(|standardized residual|) against fitted values, with a smoother.
p3 <- ggplot(augmented, aes(.fitted, sqrt(abs(.std.resid)))) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(title = "Scale-Location", x = "Fitted", y = "sqrt(|Standardized Residuals|)") +
  theme_minimal()

# VIF for multicollinearity
# vif() comes from the car package loaded above; one value per predictor.
vif_values <- vif(model)
print("Variance Inflation Factors:")
print(vif_values)

# Cook's Distance for influential points
# Flags observations whose Cook's distance exceeds 4 / n, the cutoff used here.
cooks_d <- cooks.distance(model)
influential <- which(cooks_d > 4 / nrow(pitching_data))
print(paste("Influential observations:", paste(influential, collapse = ", ")))

Player Similarity with Clustering

Find similar players using hierarchical clustering and visualization in R.

library(dplyr)
library(ggplot2)
library(cluster)
library(factoextra)

# Sample player statistics
# Ten illustrative hitters; each vector position is one player.
players <- tibble(
  name = c("Player A", "Player B", "Player C", "Player D", "Player E",
           "Player F", "Player G", "Player H", "Player I", "Player J"),
  avg = c(.310, .245, .298, .275, .320, .258, .288, .265, .302, .278),
  obp = c(.380, .310, .365, .345, .395, .325, .355, .330, .375, .350),
  slg = c(.520, .420, .485, .450, .550, .400, .470, .435, .505, .460),
  hr = c(32, 15, 25, 20, 38, 12, 22, 18, 28, 21),
  sb = c(5, 25, 12, 18, 3, 30, 8, 22, 6, 15),
  bb_pct = c(12, 8, 10, 9, 14, 7, 9, 8, 11, 10),
  k_pct = c(18, 22, 16, 20, 15, 25, 17, 21, 14, 19)
)

# Prepare data for clustering
# Drop the label column and standardize every stat with scale() so that
# counting stats (hr, sb) do not dominate the rate stats in the distances.
# scale() returns a numeric matrix, not a tibble.
player_stats <- players %>%
  select(-name) %>%
  scale()

# Label matrix rows with player names so plots and distances are readable.
rownames(player_stats) <- players$name

# Hierarchical clustering
# Pairwise Euclidean distances on the standardized stats, clustered with the
# "ward.D2" linkage. `dist_matrix` is also read by find_similar() below.
dist_matrix <- dist(player_stats, method = "euclidean")
hclust_result <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
# Branches are coloured for a 4-group cut, matching cutree() below.
fviz_dend(hclust_result,
          k = 4,
          cex = 0.8,
          main = "Player Similarity Dendrogram",
          xlab = "Players",
          palette = "jco")

# Cut into clusters
# Store the 4-cluster assignment on the original tibble as a factor.
clusters <- cutree(hclust_result, k = 4)
players$cluster <- as.factor(clusters)

# Visualize clusters with PCA
# player_stats is already standardized, so prcomp() is run with its defaults.
pca_result <- prcomp(player_stats)

# Individuals plot of the PCA result, coloured by the hierarchical cluster.
fviz_pca_ind(pca_result,
             geom.ind = "point",
             col.ind = players$cluster,
             palette = "jco",
             addEllipses = TRUE,
             legend.title = "Cluster")

# Find similar players function
# Return the `n` players closest to `target_player`.
#
# Arguments:
#   target_player  name to look up in `data$name`
#   data           data frame whose row order matches the rows used to build
#                  the global `dist_matrix`
#   n              maximum number of neighbours to return
#
# Returns NULL when the name is not found; otherwise the neighbour rows,
# nearest first, with a `similarity` column (1 = identical, 0 = as far away
# as the most distant player from the target).
find_similar <- function(target_player, data, n = 5) {
  target_idx <- which(data$name == target_player)
  if (length(target_idx) == 0) return(NULL)
  # Duplicate names would otherwise select several matrix rows at once.
  target_idx <- target_idx[1]

  distances <- as.matrix(dist_matrix)[target_idx, ]

  # Drop the target explicitly instead of assuming it sorts first: a player
  # with identical stats ties at distance 0 and could take that slot.
  candidates <- setdiff(seq_along(distances), target_idx)
  ranked <- candidates[order(distances[candidates])]
  # head() caps the result at the number of other players, so asking for more
  # neighbours than exist no longer yields NA rows.
  similar_idx <- head(ranked, n)

  data[similar_idx, ] %>%
    mutate(similarity = 1 - distances[similar_idx] / max(distances))
}

# Example: Find players similar to Player A
similar_to_A <- find_similar("Player A", players)
print(similar_to_A)

Time Series Forecasting with fable

Forecast seasonal sports metrics using the fable package for tidy time series.

library(fable)
library(tsibble)
library(feasts)
library(dplyr)
library(ggplot2)

# Create sample attendance data
# 48 monthly points (Jan 2020 - Dec 2023) built from a base level, a
# 12-month sine wave, a linear trend and Gaussian noise.
set.seed(42)
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")
attendance <- tibble(
  date = dates,
  attendance = 30000 +
    10000 * sin(2 * pi * (1:length(dates)) / 12) +  # Seasonality
    500 * (1:length(dates)) +  # Trend
    rnorm(length(dates), 0, 2000)  # Noise
)

# Convert to tsibble
# The index is a yearmonth column derived from `date`, giving the series a
# monthly interval.
attendance_ts <- attendance %>%
  mutate(month = yearmonth(date)) %>%
  as_tsibble(index = month)

# Visualize decomposition
# STL with a periodic seasonal window; components() extracts the
# decomposition from the fitted model and autoplot() draws it.
attendance_ts %>%
  model(STL(attendance ~ season(window = "periodic"))) %>%
  components() %>%
  autoplot()

# Fit multiple models
# Automatic ETS and ARIMA selection, plus a seasonal-naive benchmark.
models <- attendance_ts %>%
  model(
    ets = ETS(attendance),
    arima = ARIMA(attendance),
    snaive = SNAIVE(attendance)
  )

# Compare accuracy
# Called on the fitted models only, so these are in-sample (training) errors.
accuracy(models)

# Generate forecasts
forecasts <- models %>%
  forecast(h = "12 months")

# Plot forecasts
# History plus 80% and 95% prediction intervals, one panel per model.
forecasts %>%
  autoplot(attendance_ts, level = c(80, 95)) +
  facet_wrap(~.model, ncol = 1) +
  labs(
    title = "Attendance Forecasts",
    y = "Monthly Attendance",
    x = "Date"
  ) +
  theme_minimal()

# Cross-validation
# Expanding-window evaluation: training sets start at 24 observations and
# grow by 3; each fold forecasts 3 steps ahead and is scored against the
# full series.
# NOTE(review): the last fold(s) presumably forecast past the end of
# `attendance_ts`, which accuracy() can only partially score — confirm
# whether those folds should be dropped.
cv_results <- attendance_ts %>%
  stretch_tsibble(.init = 24, .step = 3) %>%
  model(
    ets = ETS(attendance),
    arima = ARIMA(attendance)
  ) %>%
  forecast(h = 3) %>%
  accuracy(attendance_ts)

print(cv_results)

Shiny Dashboard for Sports Stats

Create an interactive Shiny dashboard for exploring player statistics.

library(shiny)
library(shinydashboard)
library(dplyr)
library(ggplot2)
library(DT)

# Sample data
# 20 fake players, five per team. No seed is set, so positions and stats
# differ on every run.
players_data <- data.frame(
  name = paste("Player", LETTERS[1:20]),
  team = rep(c("Team A", "Team B", "Team C", "Team D"), each = 5),
  position = sample(c("C", "1B", "2B", "SS", "3B", "OF"), 20, replace = TRUE),
  avg = runif(20, 0.220, 0.320),
  hr = sample(5:40, 20),
  rbi = sample(30:120, 20),
  war = runif(20, -1, 7)
)

# Dashboard layout: sidebar filters on the left; the body shows a row of
# three summary boxes, a row of two plots and a full-width data table.
ui <- dashboardPage(
  dashboardHeader(title = "Sports Stats Dashboard"),

  # Inputs read by the server as input$team, input$stat and input$min_war.
  dashboardSidebar(
    selectInput("team", "Select Team:",
                choices = c("All", unique(players_data$team))),
    selectInput("stat", "Select Statistic:",
                choices = c("avg", "hr", "rbi", "war")),
    # Slider bounds match the runif(-1, 7) range used for `war` above.
    sliderInput("min_war", "Minimum WAR:",
                min = -1, max = 7, value = 0)
  ),

  # Output ids here must match the output$<id> assignments in `server`.
  dashboardBody(
    fluidRow(
      valueBoxOutput("total_players"),
      valueBoxOutput("avg_stat"),
      valueBoxOutput("top_player")
    ),
    fluidRow(
      box(title = "Distribution", status = "primary",
          plotOutput("hist_plot"), width = 6),
      box(title = "Comparison", status = "info",
          plotOutput("bar_plot"), width = 6)
    ),
    fluidRow(
      box(title = "Player Data", status = "success",
          DTOutput("player_table"), width = 12)
    )
  )
)

# Server logic: one shared reactive produces the filtered rows, and every
# output (value boxes, plots, table) is rendered from it.
server <- function(input, output) {

  # Players at or above the WAR slider value, narrowed to a single team
  # unless "All" is selected.
  filtered_data <- reactive({
    above_min <- filter(players_data, war >= input$min_war)

    if (input$team == "All") {
      above_min
    } else {
      filter(above_min, team == input$team)
    }
  })

  # Number of players that survive the filters.
  output$total_players <- renderValueBox({
    n_players <- nrow(filtered_data())
    valueBox(n_players, "Players", icon = icon("users"))
  })

  # Mean of the selected statistic, rounded to three decimals.
  output$avg_stat <- renderValueBox({
    stat_values <- filtered_data()[[input$stat]]
    stat_mean <- mean(stat_values, na.rm = TRUE)
    valueBox(round(stat_mean, 3), paste("Avg", input$stat), icon = icon("chart-line"))
  })

  # Name of the player ranked first on the selected statistic.
  output$top_player <- renderValueBox({
    ranked <- arrange(filtered_data(), desc(!!sym(input$stat)))
    leader <- slice(ranked, 1)
    valueBox(leader$name, "Top Player", icon = icon("trophy"))
  })

  # Histogram of the selected statistic.
  output$hist_plot <- renderPlot({
    hist_title <- paste("Distribution of", input$stat)

    ggplot(filtered_data(), aes_string(x = input$stat)) +
      geom_histogram(bins = 15, fill = "steelblue", color = "white") +
      labs(title = hist_title) +
      theme_minimal()
  })

  # Horizontal bars for the ten highest values of the selected statistic.
  output$bar_plot <- renderPlot({
    top_ten <- filtered_data() %>%
      arrange(desc(!!sym(input$stat))) %>%
      head(10)
    bar_title <- paste("Top 10 by", input$stat)

    ggplot(top_ten, aes_string(x = "reorder(name, -get(input$stat))", y = input$stat)) +
      geom_col(fill = "coral") +
      coord_flip() +
      labs(title = bar_title, x = "") +
      theme_minimal()
  })

  # Interactive table of the filtered rows, ten per page.
  output$player_table <- renderDT({
    table_options <- list(pageLength = 10)
    datatable(filtered_data(), options = table_options)
  })
}

# Run app
# shinyApp(ui, server)

Mixed Effects Model for Player Analysis

Use mixed effects models to account for team and park effects in player statistics.

library(lme4)
library(dplyr)
library(ggplot2)
library(broom.mixed)

# Sample data with player nested in team
# Simulates 10 teams x 15 players x 3 seasons of batting averages built from
# a team effect, a player skill level, a small season trend and noise.
set.seed(42)
n_teams <- 10
n_players_per_team <- 15
n_seasons <- 3

player_data <- expand.grid(
  team = paste("Team", LETTERS[1:n_teams]),
  player = 1:n_players_per_team,
  season = 2022:2024
) %>%
  # expand.grid() varies its FIRST argument fastest, so consecutive rows
  # cycle through teams. Sort into team > player > season blocks first;
  # otherwise the rep(..., each = ) vectors below hand each "team" effect to
  # an arbitrary mix of teams and the simulated data has no team structure.
  arrange(team, player, season) %>%
  mutate(
    player_id = paste(team, player, sep = "_"),
    # Team random effect
    # One draw per team, repeated over that team's players and seasons.
    team_effect = rep(rnorm(n_teams, 0, 0.02), each = n_players_per_team * n_seasons),
    # Player random effect
    # One draw per player, repeated over that player's seasons.
    player_skill = rep(rnorm(n_teams * n_players_per_team, 0.265, 0.025), each = n_seasons),
    # Season fixed effect
    season_effect = (season - 2022) * 0.003,
    # Home park factor
    # Generated per team but not used in `avg` or in the model below.
    park_factor = rep(runif(n_teams, 0.95, 1.05), each = n_players_per_team * n_seasons),
    # Observed batting average
    avg = player_skill + team_effect + season_effect + rnorm(n(), 0, 0.015)
  ) %>%
  filter(avg > 0.150 & avg < 0.400)  # Reasonable bounds

# Fit mixed effects model
# Fixed: season
# Random: player nested in team
model <- lmer(
  avg ~ season + (1 | team) + (1 | team:player_id),
  data = player_data,
  REML = TRUE
)

# Summary
summary(model)

# Extract variance components
VarCorr(model)

# Random effects
# ranef() returns one data frame per grouping factor, keyed by its name.
ranef_team <- ranef(model)$team
ranef_player <- ranef(model)$`team:player_id`

# Team effects
# Column 1 of the team table is the random intercept; sort largest first.
team_effects <- data.frame(
  team = rownames(ranef_team),
  effect = ranef_team[[1]]
) %>%
  arrange(desc(effect))

print("Team Random Effects:")
print(team_effects)

# Predictions
# Fitted values for the rows used in the fit; the filter() above already ran,
# so the vector length matches player_data.
player_data$predicted_avg <- predict(model)

# Plot random effects
ggplot(team_effects, aes(x = reorder(team, effect), y = effect)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Team Random Effects on Batting Average",
    x = "Team",
    y = "Effect on AVG"
  ) +
  theme_minimal()

# ICC - proportion of variance at each level
# Each variance component (player-within-team, team, residual) as a share of
# their sum.
icc <- as.data.frame(VarCorr(model))
total_var <- sum(icc$vcov)
print("Intraclass Correlations:")
print(icc %>% mutate(ICC = vcov / total_var))

python

JSON Data Flattener

Flatten nested JSON structures from sports APIs into tabular format for database storage.

import pandas as pd
import json
from typing import List, Dict, Any
from collections import defaultdict

class JSONFlattener:
    """Flatten nested JSON to tabular format.

    Nested dictionaries become columns whose names join the key path with a
    separator. Lists of dictionaries are kept in one column as a JSON string;
    use ``extract_nested_array`` to turn them into their own table.
    """

    def __init__(self, separator: str = "_"):
        """Store the separator used by ``flatten_json`` and
        ``extract_nested_array`` when joining nested key names."""
        self.separator = separator

    def flatten_dict(self, d: dict, parent_key: str = "", sep: str = "_") -> dict:
        """Recursively flatten nested dictionary.

        Args:
            d: Dictionary to flatten.
            parent_key: Prefix for every key produced from ``d``.
            sep: String placed between the prefix and each key. This default
                is independent of ``self.separator``; the other methods pass
                the instance separator explicitly.

        Returns:
            A single-level dict. Empty nested dicts contribute no keys; when
            two paths collapse to the same name the later one wins.
        """
        flat = {}

        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k

            if isinstance(v, dict):
                flat.update(self.flatten_dict(v, new_key, sep))
            elif isinstance(v, list) and v and isinstance(v[0], dict):
                # List of dicts cannot become scalar columns; keep it
                # serialized so no data is lost.
                flat[new_key] = json.dumps(v)
            else:
                # Scalars and plain lists are stored unchanged.
                flat[new_key] = v

        return flat

    def flatten_json(self, data: Any) -> pd.DataFrame:
        """Flatten JSON data to DataFrame.

        A dict yields a one-row frame and a list yields one row per element
        (non-dict elements go into a ``value`` column). Any other input
        returns an empty frame.
        """
        if isinstance(data, dict):
            # Single record
            return pd.DataFrame([self.flatten_dict(data, sep=self.separator)])

        if isinstance(data, list):
            # List of records
            flattened_records = [
                self.flatten_dict(record, sep=self.separator)
                if isinstance(record, dict)
                else {"value": record}
                for record in data
            ]
            return pd.DataFrame(flattened_records)

        return pd.DataFrame()

    def extract_nested_array(self, data: List[dict],
                            array_key: str,
                            parent_keys: List[str] = None) -> pd.DataFrame:
        """Extract nested array and maintain parent context.

        Args:
            data: Records that each may hold a list under ``array_key``.
            array_key: Key of the nested list to explode into rows.
            parent_keys: Keys copied from the parent record onto every row;
                keys missing from a record are skipped.

        Returns:
            One row per nested item. Dict items are flattened with the
            instance separator, so column names agree with ``flatten_json``.
            Records whose ``array_key`` is missing, ``None`` or empty add no
            rows.
        """
        rows = []
        for record in data:
            parent_data = {
                pk: record[pk] for pk in (parent_keys or []) if pk in record
            }

            # APIs often send an explicit null instead of omitting the key.
            nested_items = record.get(array_key) or []
            for item in nested_items:
                if isinstance(item, dict):
                    row = {
                        **parent_data,
                        **self.flatten_dict(item, sep=self.separator),
                    }
                else:
                    row = {**parent_data, array_key: item}
                rows.append(row)

        return pd.DataFrame(rows)

# Example: Flatten game data with nested plays
# One game record: nested team/score objects plus a list of play dicts.
game_json = dict(
    game_id="2024001",
    date="2024-04-01",
    home_team=dict(id=1, name="Yankees", city="New York"),
    away_team=dict(id=2, name="Red Sox", city="Boston"),
    final_score=dict(home=5, away=3),
    plays=[
        dict(inning=1, batter="Judge", result="single"),
        dict(inning=1, batter="Soto", result="home_run"),
    ],
)

flattener = JSONFlattener()

# Flatten game info
# Nested objects become prefixed columns; the plays list stays in one column.
game_df = flattener.flatten_json(game_json)
print("Flattened game data:")
print(game_df.columns.tolist())

# Extract plays with game context
# One row per play, each tagged with the parent game's id and date.
plays_df = flattener.extract_nested_array(
    data=[game_json],
    array_key="plays",
    parent_keys=["game_id", "date"],
)
print("\nPlays data:")
print(plays_df)

python

CSV to Database Importer

Robust CSV importer with encoding detection, type inference, and batch loading.

import pandas as pd
import chardet
from sqlalchemy import create_engine, types
import numpy as np
import logging

logger = logging.getLogger(__name__)

class CSVImporter:
    """Import CSV files to database with automatic configuration.

    The importer detects the file encoding, normalizes column names, parses
    date-like columns, infers SQL column types from the loaded data and
    writes the rows to the target table in chunks.
    """

    def __init__(self, engine):
        """Keep the SQLAlchemy engine (or connection) passed to ``to_sql``."""
        self.engine = engine

    def detect_encoding(self, file_path: str) -> str:
        """Detect file encoding from the first 10,000 bytes.

        Falls back to UTF-8 when chardet cannot decide (it reports ``None``,
        e.g. for an empty file) and when the sample looks like pure ASCII.
        UTF-8 decodes ASCII identically, and the part of the file beyond the
        sample may still contain non-ASCII bytes.
        """
        with open(file_path, "rb") as f:
            result = chardet.detect(f.read(10000))

        encoding = result.get("encoding")
        if not encoding or encoding.lower() == "ascii":
            return "utf-8"
        return encoding

    @staticmethod
    def _integer_sql_type(series: pd.Series):
        """Pick the narrowest SQL integer type covering the column's range."""
        min_val, max_val = series.min(), series.max()

        # An empty column has no observed range; use the widest type.
        if pd.isna(min_val) or pd.isna(max_val):
            return types.BigInteger()
        # Check the minimum as well: a large negative value overflows a small
        # type even when the maximum is tiny.
        if min_val >= -32768 and max_val < 32767:
            return types.SmallInteger()
        if min_val >= -2147483648 and max_val < 2147483647:
            return types.Integer()
        return types.BigInteger()

    def infer_sql_types(self, df: pd.DataFrame) -> dict:
        """Infer SQL types from DataFrame.

        Returns:
            Mapping of column name to SQLAlchemy type: integer columns are
            sized by their value range, floats map to ``Float``, datetimes to
            ``DateTime``, and everything else to ``String(50)``,
            ``String(255)`` or ``Text`` depending on the longest value.
        """
        type_map = {}

        for col in df.columns:
            dtype = df[col].dtype

            if pd.api.types.is_integer_dtype(dtype):
                type_map[col] = self._integer_sql_type(df[col])

            elif pd.api.types.is_float_dtype(dtype):
                type_map[col] = types.Float()

            elif pd.api.types.is_datetime64_any_dtype(dtype):
                type_map[col] = types.DateTime()

            else:  # String/object
                # Missing values are stringified too, so they count as "nan".
                max_len = df[col].astype(str).str.len().max()
                if max_len < 50:
                    type_map[col] = types.String(50)
                elif max_len < 255:
                    type_map[col] = types.String(255)
                else:
                    # Also reached for an empty column (max_len is NaN).
                    type_map[col] = types.Text()

        return type_map

    def import_csv(self, file_path: str, table_name: str,
                   chunk_size: int = 5000,
                   if_exists: str = "replace") -> dict:
        """Import CSV file to database table.

        Args:
            file_path: Path of the CSV file to load.
            table_name: Destination table.
            chunk_size: Rows written per ``to_sql`` call; must be positive.
            if_exists: pandas ``to_sql`` policy applied to the first chunk;
                later chunks always append.

        Returns:
            Dict with ``rows_loaded``, ``columns``, ``table`` and
            ``encoding``.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        # Detect encoding
        encoding = self.detect_encoding(file_path)
        logger.info("Detected encoding: %s", encoding)

        # Read CSV with detected encoding
        df = pd.read_csv(file_path, encoding=encoding, low_memory=False)
        logger.info("Read %d rows, %d columns", len(df), len(df.columns))

        # Clean column names
        df.columns = [c.lower().strip().replace(" ", "_").replace("-", "_")
                     for c in df.columns]

        # Parse dates
        for col in df.columns:
            if "date" not in col.lower():
                continue
            parsed = pd.to_datetime(df[col], errors="coerce")
            # The substring test also matches names such as "updated_by".
            # Keep the original values when nothing parsed, rather than
            # replacing the whole column with NaT.
            if parsed.notna().any() or df[col].isna().all():
                df[col] = parsed
            else:
                logger.warning(
                    "Column %s looks like a date by name but no value parsed; "
                    "leaving it unchanged", col
                )

        # Infer types
        sql_types = self.infer_sql_types(df)

        # Import in chunks
        # max(..., 1) forces one pass for an empty file so the (empty) table
        # is still created with the inferred schema.
        total_loaded = 0
        for start in range(0, max(len(df), 1), chunk_size):
            chunk = df.iloc[start:start + chunk_size]
            first_chunk = start == 0

            chunk.to_sql(
                table_name,
                self.engine,
                if_exists=if_exists if first_chunk else "append",
                index=False,
                # Column types only matter when the table is created.
                dtype=sql_types if first_chunk else None,
                chunksize=1000
            )

            total_loaded += len(chunk)
            logger.info("Loaded %d/%d rows", total_loaded, len(df))

        return {
            "rows_loaded": total_loaded,
            "columns": list(df.columns),
            "table": table_name,
            "encoding": encoding
        }

# Usage
# Load one season of batting stats into MySQL, replacing any existing table.
# The credentials in the connection URL are placeholders.
engine = create_engine("mysql://user:pass@localhost/sports_db")
importer = CSVImporter(engine)

result = importer.import_csv(
    file_path="batting_stats_2024.csv",
    table_name="batting_stats",
    chunk_size=5000,
    if_exists="replace",
)
print(f"Imported {result['rows_loaded']} rows to {result['table']}")

python

Data Quality Monitoring

Monitor data quality metrics over time and alert on anomalies in sports data pipelines.

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Callable
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)

@dataclass
class QualityMetric:
    """Outcome of a single data-quality check."""

    name: str  # "<table>_<check>" identifier
    value: float  # measured value (percentage, hours or row count)
    threshold: float  # limit the value was compared against
    status: str  # "pass", "warn", "fail"
    timestamp: datetime  # when the check ran


class DataQualityMonitor:
    """Monitor data quality for sports statistics.

    Each ``check_*`` method runs a query through ``pd.read_sql`` on the
    supplied connection and returns ``QualityMetric`` objects.
    ``run_all_checks`` runs the checks named in a per-table config and groups
    the results by status.
    """

    def __init__(self, db_connection):
        """Store the connection/engine that is handed to ``pd.read_sql``."""
        self.db = db_connection
        self.metrics_history: List[QualityMetric] = []

    def check_completeness(self, table: str, required_cols: List[str]) -> QualityMetric:
        """Check percentage of non-null values in required columns.

        Looks at the 10,000 most recent rows (ordered by ``created_at``). A
        required column that does not exist scores 0% instead of being
        ignored, so a dropped column cannot pass silently. An empty table or
        an empty ``required_cols`` list yields 0.0.
        """
        df = pd.read_sql(f"SELECT * FROM {table} ORDER BY created_at DESC LIMIT 10000", self.db)

        if df.empty or not required_cols:
            avg_completeness = 0.0
        else:
            completeness_scores = [
                (df[col].notna().sum() / len(df)) * 100 if col in df.columns else 0.0
                for col in required_cols
            ]
            avg_completeness = float(np.mean(completeness_scores))

        return QualityMetric(
            name=f"{table}_completeness",
            value=avg_completeness,
            threshold=95.0,
            status="pass" if avg_completeness >= 95 else "fail",
            timestamp=datetime.now()
        )

    def check_freshness(self, table: str, timestamp_col: str,
                       max_age_hours: int = 24) -> QualityMetric:
        """Check if data is being updated regularly.

        The age is the time since ``MAX(timestamp_col)``, in hours. An empty
        table (NULL maximum) has infinite age and fails.
        """
        query = f"SELECT MAX({timestamp_col}) as latest FROM {table}"
        result = pd.read_sql(query, self.db)

        latest = result["latest"].iloc[0]
        if pd.isna(latest):
            age_hours = float("inf")
        else:
            # Some drivers return timestamps as text, so normalize first.
            latest_ts = pd.Timestamp(latest)
            # Compare in the timestamp's own zone: subtracting a tz-aware
            # value from a naive "now" would raise.
            now = pd.Timestamp.now(tz=latest_ts.tz)
            age_hours = (now - latest_ts).total_seconds() / 3600

        return QualityMetric(
            name=f"{table}_freshness",
            value=age_hours,
            threshold=max_age_hours,
            status="pass" if age_hours <= max_age_hours else "fail",
            timestamp=datetime.now()
        )

    def check_row_count(self, table: str, min_rows: int) -> QualityMetric:
        """Verify minimum expected row count."""
        query = f"SELECT COUNT(*) as cnt FROM {table}"
        count = int(pd.read_sql(query, self.db)["cnt"].iloc[0])

        return QualityMetric(
            name=f"{table}_row_count",
            value=count,
            threshold=min_rows,
            status="pass" if count >= min_rows else "fail",
            timestamp=datetime.now()
        )

    def check_stat_ranges(self, table: str,
                         range_checks: Dict[str, tuple]) -> List[QualityMetric]:
        """Verify statistics are within expected ranges.

        Args:
            table: Table to read in full.
            range_checks: Mapping of column name to ``(min, max)`` inclusive
                bounds. Columns missing from the table are skipped.

        Returns:
            One metric per checked column holding the percentage of rows in
            range ("pass" at >= 99, "warn" at >= 95, otherwise "fail"). Null
            values never compare as out of range. An empty table reports 0.0.
        """
        df = pd.read_sql(f"SELECT * FROM {table}", self.db)
        metrics = []

        for col, (min_val, max_val) in range_checks.items():
            if col not in df.columns:
                continue

            if df.empty:
                # Avoid 0/0; no rows means nothing could be validated.
                pct_valid = 0.0
            else:
                out_of_range = ((df[col] < min_val) | (df[col] > max_val)).sum()
                pct_valid = float(((len(df) - out_of_range) / len(df)) * 100)

            metrics.append(QualityMetric(
                name=f"{table}_{col}_range",
                value=pct_valid,
                threshold=99.0,
                status="pass" if pct_valid >= 99 else "warn" if pct_valid >= 95 else "fail",
                timestamp=datetime.now()
            ))

        return metrics

    def run_all_checks(self, config: dict) -> Dict:
        """Run all quality checks and return report.

        Args:
            config: Mapping of table name to a dict that may contain
                ``required_cols``, ``timestamp_col`` (with optional
                ``max_age_hours``, default 24), ``min_rows`` and ``ranges``.

        Returns:
            Dict with ``pass``, ``warn`` and ``fail`` lists of metrics.
        """
        results = {"pass": [], "warn": [], "fail": []}

        for table, checks in config.items():
            table_metrics = []

            # Completeness
            if "required_cols" in checks:
                table_metrics.append(
                    self.check_completeness(table, checks["required_cols"])
                )

            # Freshness
            if "timestamp_col" in checks:
                table_metrics.append(self.check_freshness(
                    table,
                    checks["timestamp_col"],
                    checks.get("max_age_hours", 24),
                ))

            # Row count
            if "min_rows" in checks:
                table_metrics.append(self.check_row_count(table, checks["min_rows"]))

            # Stat ranges
            if "ranges" in checks:
                table_metrics.extend(self.check_stat_ranges(table, checks["ranges"]))

            for m in table_metrics:
                results[m.status].append(m)

        return results

# Usage
# `db_connection` must be created beforehand; anything pd.read_sql accepts.
monitor = DataQualityMonitor(db_connection)

# Per-table check configuration: which checks run is decided by which keys
# are present.
config = dict(
    player_stats=dict(
        required_cols=["player_id", "team", "games", "at_bats"],
        timestamp_col="updated_at",
        min_rows=1000,
        ranges=dict(
            batting_avg=(0, 1),
            era=(0, 50),
            games=(0, 162),
        ),
    ),
)

report = monitor.run_all_checks(config)
print(f"Passed: {len(report['pass'])}")
print(f"Warnings: {len(report['warn'])}")
print(f"Failed: {len(report['fail'])}")

python

Data Validation Framework

Comprehensive data validation framework for sports statistics with customizable rules.

import pandas as pd
from typing import Callable, List, Dict, Any
from dataclasses import dataclass
from enum import Enum

class Severity(Enum):
    """How serious a failed validation rule is."""

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"


@dataclass
class ValidationResult:
    """Outcome of running one rule against a DataFrame."""

    rule_name: str
    passed: bool
    severity: Severity
    message: str
    affected_rows: int = 0  # number of rows that violated the rule


class ValidationRule:
    """A named check function with an attached severity.

    ``check_fn`` receives the DataFrame and returns a
    ``(passed, affected_rows, message)`` tuple.
    """

    def __init__(self, name: str, check_fn: Callable, severity: Severity = Severity.ERROR):
        self.name = name
        self.check_fn = check_fn
        self.severity = severity

    def validate(self, df: pd.DataFrame) -> ValidationResult:
        """Run the check and wrap its outcome.

        If the check (or the conversion of its result) raises, the rule is
        reported as failed with ``Severity.ERROR`` regardless of its own
        severity, and the exception text becomes the message.
        """
        try:
            passed, affected, msg = self.check_fn(df)
            return ValidationResult(
                rule_name=self.name,
                # pandas reductions return numpy scalars; store plain Python
                # values so results compare and serialize predictably.
                passed=bool(passed),
                severity=self.severity,
                message=msg,
                affected_rows=0 if affected is None else int(affected)
            )
        except Exception as e:
            return ValidationResult(
                rule_name=self.name,
                passed=False,
                severity=Severity.ERROR,
                message=f"Validation error: {str(e)}"
            )


class SportsDataValidator:
    """Validate sports statistics data.

    Comes with default rules for duplicates, batting average, ERA and games
    played; more can be registered with ``add_rule``. Column-based default
    rules pass trivially when their column is absent.
    """

    def __init__(self):
        self.rules: List[ValidationRule] = []
        self._add_default_rules()

    @staticmethod
    def _check_no_duplicates(df: pd.DataFrame):
        """Count fully duplicated rows (computed once, not per tuple field)."""
        dup_count = int(df.duplicated().sum())
        return dup_count == 0, dup_count, f"Found {dup_count} duplicate rows"

    @staticmethod
    def _range_check(column: str, low, high, message: str) -> Callable:
        """Build a check requiring ``low <= df[column] <= high``."""
        def check(df: pd.DataFrame):
            if column not in df.columns:
                return True, 0, message
            # Derive pass/fail and the count from one mask so they cannot
            # disagree: a NaN value fails the rule and is counted as affected.
            invalid = ~df[column].between(low, high)
            return not invalid.any(), int(invalid.sum()), message
        return check

    @staticmethod
    def _positive_check(column: str, message: str) -> Callable:
        """Build a check requiring ``df[column] > 0`` (NaN counts as invalid)."""
        def check(df: pd.DataFrame):
            if column not in df.columns:
                return True, 0, message
            invalid = ~(df[column] > 0)
            return not invalid.any(), int(invalid.sum()), message
        return check

    def _add_default_rules(self):
        """Add common validation rules."""

        # No duplicate records
        self.add_rule("no_duplicates", self._check_no_duplicates, Severity.ERROR)

        # Batting average in valid range
        self.add_rule(
            "valid_batting_avg",
            self._range_check("avg", 0, 1, "Batting average must be between 0 and 1"),
            Severity.ERROR
        )

        # ERA reasonable range
        self.add_rule(
            "valid_era",
            self._range_check("era", 0, 50, "ERA should be between 0 and 50"),
            Severity.WARNING
        )

        # Games played positive
        self.add_rule(
            "positive_games",
            self._positive_check("games", "Games played must be positive"),
            Severity.ERROR
        )

    def add_rule(self, name: str, check_fn: Callable, severity: Severity = Severity.ERROR):
        """Register a rule; rules run in registration order."""
        self.rules.append(ValidationRule(name, check_fn, severity))

    def validate(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Run every rule against ``df``.

        Returns:
            Dict with ``passed`` (True when no ERROR-severity result failed),
            ``results`` (all results), ``errors`` and ``warnings`` (failed
            results of the respective severity).
        """
        results = [rule.validate(df) for rule in self.rules]
        failed = [r for r in results if not r.passed]

        return {
            "passed": all(r.passed for r in results if r.severity == Severity.ERROR),
            "results": results,
            "errors": [r for r in failed if r.severity == Severity.ERROR],
            "warnings": [r for r in failed if r.severity == Severity.WARNING]
        }

# Usage
validator = SportsDataValidator()

# Add custom rule
# The check returns (passed, affected_rows, message); here every record must
# carry a player_id.
validator.add_rule(
    name="valid_player_id",
    check_fn=lambda df: (
        df["player_id"].notna().all(),
        df["player_id"].isna().sum(),
        "All records must have player_id",
    ),
    severity=Severity.ERROR,
)

# `stats_df` is the DataFrame under test and must be defined beforehand.
report = validator.validate(stats_df)
print(f"Validation passed: {report['passed']}")
for error in report["errors"]:
    print(f"ERROR: {error.rule_name} - {error.message}")

python

Schema Migration Tool

Handle database schema changes and data migrations for sports statistics tables.

from sqlalchemy import create_engine, text, inspect
from datetime import datetime
import pandas as pd
import logging

logger = logging.getLogger(__name__)

class SchemaMigration:
    """Manage database schema migrations.

    Applied versions are recorded in a tracking table so that each migration
    runs at most once per database.
    """

    def __init__(self, engine):
        """Keep the engine and make sure the tracking table exists."""
        self.engine = engine
        self.migrations_table = "schema_migrations"
        self._ensure_migrations_table()

    def _ensure_migrations_table(self):
        """Create migrations tracking table if not exists."""
        ddl = text(f"""
            CREATE TABLE IF NOT EXISTS {self.migrations_table} (
                id INT AUTO_INCREMENT PRIMARY KEY,
                version VARCHAR(50) NOT NULL UNIQUE,
                description VARCHAR(255),
                applied_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        with self.engine.connect() as connection:
            connection.execute(ddl)
            connection.commit()

    def get_applied_migrations(self) -> list:
        """Get list of applied migration versions, oldest first."""
        select_versions = text(f"SELECT version FROM {self.migrations_table} ORDER BY id")
        with self.engine.connect() as connection:
            rows = connection.execute(select_versions).fetchall()
        return [row[0] for row in rows]

    def apply_migration(self, version: str, description: str, up_sql: str):
        """Apply a migration.

        Returns False without touching the database when ``version`` is
        already recorded; otherwise runs every statement of ``up_sql``,
        records the version and returns True.
        """
        if version in self.get_applied_migrations():
            logger.info(f"Migration {version} already applied")
            return False

        logger.info(f"Applying migration {version}: {description}")

        # Statements are separated on every ";" (including any inside string
        # literals); whitespace-only fragments are dropped.
        statements = [part for part in up_sql.split(";") if part.strip()]

        with self.engine.connect() as connection:
            # Execute migration
            for statement in statements:
                connection.execute(text(statement))

            # Record migration
            record_sql = text(f"""
                INSERT INTO {self.migrations_table} (version, description)
                VALUES (:version, :description)
            """)
            connection.execute(record_sql, {"version": version, "description": description})

            connection.commit()

        logger.info(f"Migration {version} applied successfully")
        return True

# Define migrations
# Ordered list of schema changes. Each entry has a unique "version", a
# human-readable "description", and an "up" script of one or more SQL
# statements separated by ";". The SQL uses MySQL syntax (AUTO_INCREMENT,
# inline INDEX / UNIQUE KEY, YEAR, ON UPDATE CURRENT_TIMESTAMP).
MIGRATIONS = [
    # 001: extend player_stats with advanced metrics and index WAR.
    {
        "version": "001",
        "description": "Add advanced batting stats columns",
        "up": """
            ALTER TABLE player_stats
            ADD COLUMN wrc_plus DECIMAL(5,1) NULL,
            ADD COLUMN war DECIMAL(4,2) NULL,
            ADD COLUMN babip DECIMAL(4,3) NULL;

            CREATE INDEX idx_player_stats_war ON player_stats(war)
        """
    },
    # 002: per-pitch tracking table, indexed by game and by pitcher.
    {
        "version": "002",
        "description": "Add pitch tracking table",
        "up": """
            CREATE TABLE pitch_tracking (
                id BIGINT AUTO_INCREMENT PRIMARY KEY,
                game_id VARCHAR(20) NOT NULL,
                pitcher_id INT NOT NULL,
                batter_id INT NOT NULL,
                pitch_type VARCHAR(10),
                velocity DECIMAL(4,1),
                spin_rate INT,
                release_x DECIMAL(4,2),
                release_z DECIMAL(4,2),
                plate_x DECIMAL(4,2),
                plate_z DECIMAL(4,2),
                is_strike BOOLEAN,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                INDEX idx_pitch_game (game_id),
                INDEX idx_pitch_pitcher (pitcher_id)
            )
        """
    },
    # 003: one standings row per (team_id, season), enforced by a unique key.
    {
        "version": "003",
        "description": "Add team standings table",
        "up": """
            CREATE TABLE team_standings (
                id INT AUTO_INCREMENT PRIMARY KEY,
                team_id INT NOT NULL,
                season YEAR NOT NULL,
                wins INT DEFAULT 0,
                losses INT DEFAULT 0,
                run_diff INT DEFAULT 0,
                pythag_wins DECIMAL(5,2),
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
                UNIQUE KEY uk_team_season (team_id, season)
            )
        """
    }
]

# Apply migrations
# Creating the migrator also creates the tracking table if it is missing.
# The credentials in the connection URL are placeholders.
engine = create_engine("mysql://user:pass@localhost/sports_db")
migrator = SchemaMigration(engine)

# Run in list order; versions already recorded are skipped by the migrator.
for migration in MIGRATIONS:
    migrator.apply_migration(
        version=migration["version"],
        description=migration["description"],
        up_sql=migration["up"],
    )

print("Applied migrations:", migrator.get_applied_migrations())

python

Data Deduplication

Identify and handle duplicate records in sports data using various strategies.

import pandas as pd
import numpy as np
from typing import List, Literal
from fuzzywuzzy import fuzz
import hashlib

class DataDeduplicator:
    """Handle duplicate records in sports data.

    The instance works on a private copy of the DataFrame given to the
    constructor. Methods that change the data (``exact_duplicates``,
    ``merge_duplicates``, ``flag_duplicates``) replace or modify ``self.df``
    and return it, so calls can be made one after another on the same object.
    """

    # ASCII "unit separator": placed between column values before hashing so
    # that ("ab", "c") and ("a", "bc") no longer produce the same digest.
    _HASH_FIELD_SEP = "\x1f"

    def __init__(self, df: pd.DataFrame):
        # Copy so the caller's frame is never modified.
        self.df = df.copy()
        # Rows removed by the most recent exact_duplicates() call.
        self.duplicates_found = 0

    def exact_duplicates(self, subset: List[str] = None,
                         keep: Literal["first", "last", False] = "first") -> pd.DataFrame:
        """Remove exact duplicate rows.

        Args:
            subset: Columns that define a duplicate; ``None`` compares all columns.
            keep: Which occurrence survives ("first", "last") or ``False`` to
                drop every member of a duplicate set.

        Returns:
            The de-duplicated frame (also stored in ``self.df``).
        """
        before = len(self.df)
        self.df = self.df.drop_duplicates(subset=subset, keep=keep)
        self.duplicates_found = before - len(self.df)
        return self.df

    def create_record_hash(self, columns: List[str]) -> pd.Series:
        """Create an MD5 hex digest per row from the given columns.

        Each value is converted with ``str()`` and the values are joined with
        a separator before hashing, so adjacent fields cannot run into each
        other. For a single column the digest equals ``md5(str(value))``.

        Args:
            columns: Column names whose values make up the record key.

        Returns:
            Series of hex digests aligned with ``self.df.index``.
        """
        sep = self._HASH_FIELD_SEP
        # itertuples keeps each column's own dtype (row-wise apply would upcast
        # mixed numeric rows) and still returns a Series for an empty frame.
        digests = [
            hashlib.md5(sep.join(str(v) for v in values).encode()).hexdigest()
            for values in self.df[list(columns)].itertuples(index=False, name=None)
        ]
        return pd.Series(digests, index=self.df.index, dtype=object)

    def fuzzy_player_match(self, name_col: str, threshold: int = 85) -> pd.DataFrame:
        """Find pairs of distinct names whose fuzzy similarity meets a threshold.

        Missing names are ignored. Comparison is case-insensitive and every
        unordered pair is scored once (quadratic in the number of unique names).

        Args:
            name_col: Column holding the player names.
            threshold: Minimum ``fuzz.ratio`` score for a pair to be reported.

        Returns:
            DataFrame with columns ``name1``, ``name2``, ``similarity``; it has
            those columns even when no pair matches.
        """
        # dropna: NaN has no .lower(); astype(str) guards against non-string labels.
        names = self.df[name_col].dropna().astype(str).unique()
        lowered = [name.lower() for name in names]  # lower-case once, not per pair
        matches = []

        for i, name1 in enumerate(names):
            for j in range(i + 1, len(names)):
                score = fuzz.ratio(lowered[i], lowered[j])
                if score >= threshold:
                    matches.append({
                        "name1": name1,
                        "name2": names[j],
                        "similarity": score
                    })

        return pd.DataFrame(matches, columns=["name1", "name2", "similarity"])

    def merge_duplicates(self, group_cols: List[str],
                        agg_rules: dict = None) -> pd.DataFrame:
        """Merge duplicate records by grouping and aggregating.

        Args:
            group_cols: Columns that identify one logical record.
            agg_rules: Mapping of column -> aggregation passed to ``agg``. When
                omitted, numeric columns are summed and every other column
                keeps its first value.

        Returns:
            The aggregated frame (also stored in ``self.df``). Note that
            ``duplicates_found`` is not updated by this method.
        """
        if agg_rules is None:
            # Default: sum numeric, first for others
            numeric_cols = self.df.select_dtypes(include=[np.number]).columns
            agg_rules = {c: "sum" for c in numeric_cols if c not in group_cols}

            other_cols = [c for c in self.df.columns
                         if c not in group_cols and c not in numeric_cols]
            for c in other_cols:
                agg_rules[c] = "first"

        self.df = self.df.groupby(group_cols, as_index=False).agg(agg_rules)
        return self.df

    def flag_duplicates(self, subset: List[str],
                       flag_col: str = "is_duplicate") -> pd.DataFrame:
        """Flag duplicates instead of removing them.

        Adds ``flag_col`` (True for every member of a duplicate set) and a
        ``duplicate_group`` column numbering the groups formed by ``subset``.

        NOTE(review): the grouping uses pandas' default handling of missing
        keys, so rows with NaN in ``subset`` may not get a regular group
        number even when they are flagged - confirm if NaN keys can occur.
        """
        self.df[flag_col] = self.df.duplicated(subset=subset, keep=False)
        self.df["duplicate_group"] = self.df.groupby(subset).ngroup()
        return self.df

    def report(self) -> dict:
        """Summarise the current state of the data.

        Returns:
            Dict with the current row count, the number of rows removed by the
            last ``exact_duplicates`` call, and the number of unique
            ``player_id`` / ``team`` values (``None`` when the column is absent).
        """
        return {
            "total_records": len(self.df),
            "duplicates_removed": self.duplicates_found,
            "unique_players": self.df["player_id"].nunique() if "player_id" in self.df.columns else None,
            "unique_teams": self.df["team"].nunique() if "team" in self.df.columns else None
        }

# Usage
# NOTE: `raw_stats_df` is not defined in this snippet; it has to be supplied
# by the surrounding code. The deduplicator works on its own copy of it.
dedup = DataDeduplicator(raw_stats_df)

# Remove exact duplicates
clean_df = dedup.exact_duplicates(
    subset=["player_id", "game_date", "team"],
    keep="last"  # Keep the last occurrence (most recent if rows are in chronological order)
)

# Find similar player names
fuzzy_matches = dedup.fuzzy_player_match("player_name", threshold=90)
print("Potential duplicate players:")
print(fuzzy_matches)

# Merge duplicate game logs
# Only the columns listed in agg_rules (plus the group columns) are aggregated.
merged_df = dedup.merge_duplicates(
    group_cols=["player_id", "season"],
    agg_rules={
        "games": "sum",
        "at_bats": "sum",
        "hits": "sum",
        "avg": lambda x: x.iloc[-1]  # Use last avg
    }
)

# Summary of the data as it stands after the steps above.
print(dedup.report())

python

Incremental Data Loader

Load data incrementally based on timestamps to avoid reprocessing existing records.

import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine, text
import logging

logger = logging.getLogger(__name__)

class IncrementalLoader:
    """Load data incrementally based on watermarks.

    A watermark is the largest timestamp that has been loaded for a target
    table. It is stored in the ``etl_watermarks`` table and used to extract
    only rows that are newer than the previous run.

    The generated SQL uses MySQL syntax (``ON DUPLICATE KEY UPDATE``,
    ``NOW()``). Table and column names are interpolated into the SQL text, so
    they must come from trusted configuration, never from user input.

    NOTE(review): the watermark table is assumed to have columns
    ``table_name``, ``last_processed`` and ``updated_at`` with a unique key on
    ``table_name`` - confirm against the schema.
    """

    def __init__(self, engine, table_name: str, timestamp_col: str = "updated_at"):
        """Store the SQLAlchemy engine, the target table and its timestamp column."""
        self.engine = engine
        self.table_name = table_name
        self.timestamp_col = timestamp_col
        self.watermark_table = "etl_watermarks"

    def get_watermark(self) -> datetime:
        """Get last processed timestamp.

        Returns:
            The stored watermark, or ``datetime(1900, 1, 1)`` when no row (or
            a NULL value) exists so that the first run processes everything.
        """
        query = text(f"""
            SELECT last_processed
            FROM {self.watermark_table}
            WHERE table_name = :table
        """)

        with self.engine.connect() as conn:
            result = conn.execute(query, {"table": self.table_name}).fetchone()

        # A stored NULL must not leak out as None: the extract filter needs a date.
        if result is not None and result[0] is not None:
            return result[0]
        return datetime(1900, 1, 1)  # Default to process all

    def set_watermark(self, timestamp: datetime):
        """Update watermark after successful load."""
        query = text(f"""
            INSERT INTO {self.watermark_table} (table_name, last_processed, updated_at)
            VALUES (:table, :timestamp, NOW())
            ON DUPLICATE KEY UPDATE
                last_processed = :timestamp,
                updated_at = NOW()
        """)

        # engine.begin() commits on success and rolls back on error.
        with self.engine.begin() as conn:
            conn.execute(query, {"table": self.table_name, "timestamp": timestamp})

    def _build_incremental_query(self, source_query: str) -> str:
        """Return ``source_query`` restricted to rows newer than ``:watermark``.

        A plain ``SELECT ... FROM table`` gets the filter appended directly.
        A query that already contains WHERE / GROUP BY / HAVING / ORDER BY /
        LIMIT is wrapped as a derived table first, so the added filter stays
        valid SQL. A trailing semicolon is removed.
        """
        import re  # local import: only needed for this keyword check

        base = source_query.strip().rstrip(";").rstrip()
        has_clause = re.search(
            r"\b(?:where|group\s+by|having|order\s+by|limit)\b", base, re.IGNORECASE
        )
        if has_clause:
            base = f"SELECT * FROM ({base}) AS incremental_source"

        return f"""
            {base}
            WHERE {self.timestamp_col} > :watermark
            ORDER BY {self.timestamp_col}
        """

    @staticmethod
    def _to_records(df: pd.DataFrame) -> list:
        """Convert a frame to a list of row dicts with NaN/NaT replaced by None."""
        return [
            {col: (None if pd.isna(val) else val) for col, val in record.items()}
            for record in df.to_dict("records")
        ]

    def extract_incremental(self, source_query: str) -> pd.DataFrame:
        """Extract only new/updated records.

        Args:
            source_query: SELECT statement that exposes ``timestamp_col``.

        Returns:
            Rows whose ``timestamp_col`` is strictly greater than the stored
            watermark, ordered by that column.
        """
        watermark = self.get_watermark()
        logger.info(f"Extracting records since {watermark}")

        # Named bind parameter via text(), like the other queries in this class.
        full_query = text(self._build_incremental_query(source_query))

        df = pd.read_sql(full_query, self.engine, params={"watermark": watermark})
        logger.info(f"Extracted {len(df)} new/updated records")
        return df

    def upsert(self, df: pd.DataFrame, key_cols: list):
        """Insert or update records, then advance the watermark.

        Args:
            df: Rows to write; column names must match the target table.
            key_cols: Columns of the table's unique key. They are excluded
                from the UPDATE part of the statement.

        Returns:
            Number of rows sent to the database (0 for an empty frame).
        """
        if df.empty:
            logger.info("No records to upsert")
            return 0

        # Build upsert query
        columns = df.columns.tolist()
        placeholders = ", ".join(f":{c}" for c in columns)

        update_cols = [c for c in columns if c not in key_cols]
        if update_cols:
            update_clause = ", ".join(f"{c} = VALUES({c})" for c in update_cols)
        else:
            # Only key columns present: use a no-op assignment so the
            # statement stays syntactically valid.
            update_clause = f"{columns[0]} = {columns[0]}"

        query = text(f"""
            INSERT INTO {self.table_name} ({", ".join(columns)})
            VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """)

        # One executemany call inside a single transaction instead of a
        # round trip per row.
        with self.engine.begin() as conn:
            conn.execute(query, self._to_records(df))

        # Update watermark
        if self.timestamp_col in df.columns:
            max_timestamp = df[self.timestamp_col].max()
            if pd.notna(max_timestamp):
                self.set_watermark(max_timestamp)
            else:
                logger.warning(
                    f"No valid {self.timestamp_col} values; watermark not updated"
                )
        else:
            # e.g. a transform dropped the column: data is loaded, but the next
            # run will re-extract from the old watermark.
            logger.warning(
                f"Column {self.timestamp_col} missing from data; watermark not updated"
            )

        logger.info(f"Upserted {len(df)} records")
        return len(df)

    def run(self, source_query: str, key_cols: list, transform_fn=None):
        """Execute incremental load: extract, optionally transform, upsert.

        Args:
            source_query: SELECT statement passed to ``extract_incremental``.
            key_cols: Unique-key columns passed to ``upsert``.
            transform_fn: Optional callable ``DataFrame -> DataFrame`` applied
                between extract and upsert.

        Returns:
            Number of rows upserted.
        """
        df = self.extract_incremental(source_query)

        if transform_fn:
            df = transform_fn(df)

        return self.upsert(df, key_cols)

# Usage
# NOTE: runs at module level; the connection URL contains placeholder credentials.
engine = create_engine("mysql://user:pass@localhost/sports_db")
# Target table "player_stats"; rows are filtered on its "last_updated" column.
loader = IncrementalLoader(engine, "player_stats", "last_updated")

# Extract rows newer than the stored watermark, stamp them with the current
# time in a "processed_at" column, and upsert them.
loaded = loader.run(
    source_query="SELECT * FROM raw_player_stats",
    key_cols=["player_id", "season"],
    transform_fn=lambda df: df.assign(processed_at=datetime.now())
)

python

Complete ETL Pipeline

Full ETL pipeline for sports data with extraction, transformation, validation, and loading stages.

import pandas as pd
import numpy as np
from datetime import datetime
import logging
from typing import Dict, List, Optional
from dataclasses import dataclass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class ETLConfig:
    """Settings for one ETL run."""

    # File path or URL to read from (.csv, .json, or an http(s) URL).
    source_path: str
    # Name of the table that receives the rows.
    destination_table: str
    # Number of rows written per to_sql call.
    batch_size: int = 1000
    # When False, validate() returns its input unchanged.
    validate: bool = True

class SportsDataETL:
    """ETL pipeline for sports statistics.

    Stages: ``extract`` -> ``transform`` -> ``validate`` -> ``load``; ``run``
    chains them and reports counters and error messages.
    """

    # Columns that must be present and non-empty when they exist in the data.
    # They are never imputed: a missing identifier must stay missing so that
    # validate() can reject the row.
    REQUIRED_COLS = ("player_id", "team", "season")

    def __init__(self, config: ETLConfig, db_connection):
        """Store the configuration and the connection handed to ``to_sql``."""
        self.config = config
        self.db = db_connection
        self.errors = []
        self.stats = {"extracted": 0, "transformed": 0, "loaded": 0, "errors": 0}

    def extract(self) -> pd.DataFrame:
        """Extract data from source.

        Paths ending in ``.csv`` are read as CSV and ``.json`` as JSON; any
        other path starting with ``http`` is read as CSV.

        Raises:
            ValueError: If the source path matches none of the above.
        """
        logger.info(f"Extracting from {self.config.source_path}")

        if self.config.source_path.endswith(".csv"):
            df = pd.read_csv(self.config.source_path)
        elif self.config.source_path.endswith(".json"):
            df = pd.read_json(self.config.source_path)
        elif self.config.source_path.startswith("http"):
            df = pd.read_csv(self.config.source_path)
        else:
            raise ValueError(f"Unsupported source: {self.config.source_path}")

        self.stats["extracted"] = len(df)
        logger.info(f"Extracted {len(df)} records")
        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform and clean data.

        Drops exact duplicate rows, normalises column names to lower snake
        case, fills missing numeric values with the column median (required
        identifier columns excluded), parses columns whose name contains
        "date", and adds ``etl_timestamp`` / ``etl_source`` metadata.

        Returns:
            A new DataFrame; the input frame is not modified.
        """
        logger.info("Transforming data")
        original_count = len(df)

        # Remove duplicates (copy so later assignments never touch the input)
        df = df.drop_duplicates().copy()

        # Standardize column names
        df.columns = [c.lower().replace(" ", "_") for c in df.columns]

        # Handle missing values - but never invent values for required fields
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        fill_cols = [c for c in numeric_cols if c not in self.REQUIRED_COLS]
        if fill_cols:
            df[fill_cols] = df[fill_cols].fillna(df[fill_cols].median())

        # Parse dates (unparseable values become NaT)
        date_cols = [c for c in df.columns if "date" in c.lower()]
        for col in date_cols:
            df[col] = pd.to_datetime(df[col], errors="coerce")

        # Add metadata
        df["etl_timestamp"] = datetime.now()
        df["etl_source"] = self.config.source_path

        self.stats["transformed"] = len(df)
        logger.info(f"Transformed {len(df)} records ({original_count - len(df)} removed)")
        return df

    def validate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Validate data quality.

        Rows fail when a required column that exists in the data is missing or
        empty, or when ``batting_avg`` lies outside [0, 1]. Every failed check
        adds a message to ``self.errors``; ``self.stats["errors"]`` holds the
        number of rejected rows.

        Returns:
            Copy of the valid rows, or ``df`` unchanged when validation is off.
        """
        if not self.config.validate:
            return df

        logger.info("Validating data")
        valid_mask = pd.Series(True, index=df.index)

        # Check for required fields
        for col in self.REQUIRED_COLS:
            if col in df.columns:
                invalid = df[col].isna() | (df[col] == "")
                valid_mask &= ~invalid
                if invalid.any():
                    self.errors.append(f"Missing {col}: {invalid.sum()} rows")

        # Validate numeric ranges
        if "batting_avg" in df.columns:
            invalid = (df["batting_avg"] < 0) | (df["batting_avg"] > 1)
            valid_mask &= ~invalid
            if invalid.any():
                self.errors.append(f"Out-of-range batting_avg: {invalid.sum()} rows")

        df_valid = df[valid_mask].copy()
        self.stats["errors"] = len(df) - len(df_valid)

        logger.info(f"Validation complete: {len(df_valid)} valid, {self.stats['errors']} invalid")
        return df_valid

    def load(self, df: pd.DataFrame):
        """Load data to destination in batches of ``config.batch_size`` rows.

        ``self.stats["loaded"]`` grows after every successful batch, so it is
        still accurate when a later batch raises.
        """
        logger.info(f"Loading to {self.config.destination_table}")

        batch_size = self.config.batch_size
        self.stats["loaded"] = 0

        # Batch insert
        for batch_no, start in enumerate(range(0, len(df), batch_size), start=1):
            batch = df.iloc[start:start + batch_size]
            batch.to_sql(
                self.config.destination_table,
                self.db,
                if_exists="append",
                index=False
            )
            self.stats["loaded"] += len(batch)
            logger.info(f"Loaded batch {batch_no}")

    def run(self) -> Dict:
        """Execute full ETL pipeline.

        Exceptions are caught here on purpose: this is the pipeline boundary
        and callers receive failures through the returned ``errors`` list.

        Returns:
            ``{"stats": <counters>, "errors": <messages>}``.
        """
        try:
            df = self.extract()
            df = self.transform(df)
            df = self.validate(df)
            self.load(df)
            logger.info(f"ETL complete: {self.stats}")
        except Exception as e:
            # logger.exception keeps the traceback in the log.
            logger.exception(f"ETL failed: {e}")
            self.errors.append(str(e))

        return {"stats": self.stats, "errors": self.errors}

# Usage
# batch_size and validate keep their defaults (1000 rows, validation on).
config = ETLConfig(
    source_path="player_stats_2024.csv",
    destination_table="player_stats"
)
# NOTE: `db_connection` is not defined in this snippet; it is handed to
# DataFrame.to_sql by SportsDataETL.load.
etl = SportsDataETL(config, db_connection)
# run() returns {"stats": ..., "errors": ...}.
results = etl.run()

python

Heat Map Shot Chart

Create a basketball shot chart heat map with matplotlib, showing smoothed shot density over a drawn half court.

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.patches import Circle, Rectangle, Arc
from scipy.ndimage import gaussian_filter

def draw_court(ax=None, color="black", lw=2):
    """Draw the lines of a basketball half court on a matplotlib axes.

    The hoop is centred at the origin.

    Args:
        ax: Axes to draw on; the current axes are used when omitted.
        color: Line colour for every court element.
        lw: Line width for every court element.

    Returns:
        The axes that were drawn on, with x/y limits set to the half court.
    """
    ax = plt.gca() if ax is None else ax
    stroke = dict(linewidth=lw, color=color)

    # Hoop
    ax.add_patch(Circle((0, 0), radius=7.5, fill=False, **stroke))

    # Backboard
    ax.plot([-30, 30], [-7.5, -7.5], **stroke)

    # Paint
    ax.add_patch(Rectangle((-80, -47.5), 160, 190, fill=False, **stroke))

    # Free throw circle (upper half only)
    ax.add_patch(Arc((0, 142.5), 120, 120, theta1=0, theta2=180, **stroke))

    # Three point line: arc plus the two straight corner segments
    ax.add_patch(Arc((0, 0), 475, 475, theta1=22, theta2=158, **stroke))
    for corner_x in (-220, 220):
        ax.plot([corner_x, corner_x], [-47.5, 92.5], **stroke)

    ax.set_xlim(-250, 250)
    ax.set_ylim(-47.5, 422.5)
    return ax

def shot_chart_heatmap(shots_df, title="Shot Chart"):
    """Render a smoothed shot-frequency heat map with court lines on top.

    Args:
        shots_df: Shot data with ``loc_x`` and ``loc_y`` columns.
        title: Title of the figure.

    Returns:
        Tuple ``(fig, ax)`` of the created figure and axes.
    """
    fig, ax = plt.subplots(figsize=(12, 11))

    # Area covered by both the histogram and the image.
    x_span = [-250, 250]
    y_span = [-50, 400]

    # Bin the shots on a 30x30 grid, then blur the counts.
    counts, xedges, yedges = np.histogram2d(
        shots_df["loc_x"], shots_df["loc_y"],
        bins=30, range=[x_span, y_span]
    )
    smoothed = gaussian_filter(counts, sigma=1.5)

    # histogram2d puts x on the first axis, imshow expects rows = y.
    im = ax.imshow(
        smoothed.T, origin="lower",
        extent=x_span + y_span,
        cmap="YlOrRd", alpha=0.7
    )

    draw_court(ax, color="white", lw=2)

    ax.set_title(title, fontsize=16, fontweight="bold")
    ax.axis("off")

    colorbar = fig.colorbar(im, ax=ax, shrink=0.7)
    colorbar.set_label("Shot Frequency", fontsize=12)

    plt.tight_layout()
    return fig, ax

# Example usage: `shots_df` is not defined in this snippet; shot_chart_heatmap
# reads its "loc_x" and "loc_y" columns.
fig, ax = shot_chart_heatmap(shots_df, "Player Shot Chart 2024")

python

Rolling Average Performance Chart

Plot rolling averages over time to visualize player performance trends with confidence bands.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_rolling_performance(df, date_col, value_col, window=20,
                             player_name="Player", stat_name="Stat"):
    """Plot rolling average with confidence bands.

    Draws the individual observations, their rolling mean, a band of one
    rolling standard deviation either side of the mean, and a horizontal line
    at the overall mean.

    Args:
        df: Game-level data; it is sorted by ``date_col`` on a copy.
        date_col: Column used for ordering and as the x axis.
        value_col: Column holding the statistic to plot.
        window: Rolling window size in rows.
        player_name: Name shown in the title.
        stat_name: Statistic name shown in the title and on the y axis.

    Returns:
        Tuple ``(fig, ax)``.
    """
    df = df.sort_values(date_col).copy()

    # The rolling value is shown once 5 observations exist, but min_periods may
    # not exceed the window itself - pandas raises ValueError otherwise.
    min_periods = min(5, window) if isinstance(window, int) else 5

    # Calculate rolling statistics
    rolling = df[value_col].rolling(window=window, min_periods=min_periods)
    df["rolling_mean"] = rolling.mean()
    df["rolling_std"] = rolling.std()

    fig, ax = plt.subplots(figsize=(14, 6))

    # Individual games (faded)
    ax.scatter(df[date_col], df[value_col], alpha=0.3, s=30,
               color="gray", label="Individual Games")

    # Rolling average
    ax.plot(df[date_col], df["rolling_mean"], linewidth=2.5,
            color="#1f77b4", label=f"{window}-Game Rolling Avg")

    # Confidence band (±1 std)
    ax.fill_between(
        df[date_col],
        df["rolling_mean"] - df["rolling_std"],
        df["rolling_mean"] + df["rolling_std"],
        alpha=0.2, color="#1f77b4"
    )

    # Season average line (mean over all rows passed in)
    season_avg = df[value_col].mean()
    ax.axhline(y=season_avg, color="red", linestyle="--",
               linewidth=1.5, label=f"Season Avg: {season_avg:.3f}")

    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel(stat_name, fontsize=12)
    ax.set_title(f"{player_name} - {stat_name} Trend ({window}-Game Rolling)",
                 fontsize=14, fontweight="bold")
    ax.legend(loc="best")
    ax.grid(True, alpha=0.3)

    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig, ax

# Example usage
# NOTE: `game_log_df` is not defined in this snippet; it needs the
# "game_date" and "batting_avg" columns named below.
fig, ax = plot_rolling_performance(
    game_log_df, "game_date", "batting_avg",
    window=15, player_name="Mike Trout", stat_name="Batting Average"
)

python

Radar Chart Player Comparison

Create radar/spider chart to compare multiple players across different statistical categories.

import matplotlib.pyplot as plt
import numpy as np

def create_radar_chart(players_data, categories, title="Player Comparison"):
    """Create radar chart comparing players across categories.

    Args:
        players_data: Mapping of player name -> sequence of values, one per
            category and in the same order as ``categories``. Any sequence
            type is accepted (list, tuple, numpy array, pandas Series).
        categories: Category labels placed around the chart.
        title: Chart title.

    Returns:
        Tuple ``(fig, ax)``; the radial axis is fixed to 0-100.

    Raises:
        ValueError: If a player's value count differs from the number of
            categories.
    """
    # Number of categories
    N = len(categories)

    # Angle for each category
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # Complete the loop

    # Create figure
    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]

    for idx, (player_name, values) in enumerate(players_data.items()):
        # Convert to a list first: "+" on a numpy array adds element-wise
        # instead of appending the closing point.
        values = list(values)
        if len(values) != N:
            raise ValueError(
                f"{player_name}: expected {N} values (one per category), got {len(values)}"
            )
        values = values + values[:1]  # Complete the loop

        line_color = colors[idx % len(colors)]  # palette repeats after 5 players
        ax.plot(angles, values, "o-", linewidth=2,
                label=player_name, color=line_color)
        ax.fill(angles, values, alpha=0.25, color=line_color)

    # Set category labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, fontsize=11)

    # Customize
    ax.set_ylim(0, 100)
    ax.set_title(title, size=16, fontweight="bold", y=1.08)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))

    plt.tight_layout()
    return fig, ax

# Example: Compare players (values are percentiles 0-100)
# Each player's list has one value per entry of `categories`, in the same order.
categories = ["Power", "Contact", "Speed", "Defense", "Plate Discipline", "WAR"]
players_data = {
    "Player A": [85, 75, 60, 70, 80, 90],
    "Player B": [70, 90, 80, 85, 65, 82],
    "Player C": [95, 55, 40, 60, 70, 88]
}

# Build the chart with the default title and display it.
fig, ax = create_radar_chart(players_data, categories)
plt.show()

python

Sankey Flow Diagram

Create Sankey diagram to visualize player movement, draft flows, or game state transitions.

import plotly.graph_objects as go
import pandas as pd

def create_trade_sankey(trades_df):
    """Build a Plotly Sankey figure of player movement between teams.

    Args:
        trades_df: One row per traded player with ``from_team``, ``to_team``,
            ``player_id`` and ``war`` columns.

    Returns:
        A ``plotly.graph_objects.Figure``. Link width is the number of players
        moved between two teams; link colour reflects the summed WAR.
    """
    def _war_color(total_war):
        # Above 5 -> red, above 0 -> orange, everything else -> gray.
        if total_war > 5:
            return "rgba(255, 0, 0, 0.5)"
        if total_war > 0:
            return "rgba(255, 165, 0, 0.5)"
        return "rgba(128, 128, 128, 0.5)"

    # Every team that appears on either side of a trade becomes one node.
    origin_teams = trades_df["from_team"].tolist()
    destination_teams = trades_df["to_team"].tolist()
    all_teams = list(set(origin_teams + destination_teams))
    team_idx = {}
    for position, team in enumerate(all_teams):
        team_idx[team] = position

    # One link per (from, to) pair: player count and total WAR.
    grouped = trades_df.groupby(["from_team", "to_team"])
    links = grouped.agg({"player_id": "count", "war": "sum"}).reset_index()

    source = list(map(team_idx.__getitem__, links["from_team"]))
    target = list(map(team_idx.__getitem__, links["to_team"]))
    value = links["player_id"].tolist()
    colors = [_war_color(total_war) for total_war in links["war"]]
    link_labels = [f"{v} players ({w:.1f} WAR)"
                   for v, w in zip(value, links["war"])]

    node_spec = dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_teams,
        color="blue"
    )
    link_spec = dict(
        source=source,
        target=target,
        value=value,
        color=colors,
        label=link_labels
    )
    fig = go.Figure(go.Sankey(node=node_spec, link=link_spec))

    fig.update_layout(
        title="Player Trade Flow Between Teams",
        font_size=12,
        height=600
    )

    return fig

# Example usage: `trades_df` is not defined in this snippet; create_trade_sankey
# reads its "from_team", "to_team", "player_id" and "war" columns.
fig = create_trade_sankey(trades_df)
fig.show()

python

Box Plot Comparison

Create box plots to compare statistical distributions across teams, positions, or seasons.

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def create_boxplot_comparison(df, value_col, group_col,
                              title=None, horizontal=False):
    """Draw a box plot per group with a swarm overlay and mean markers.

    Args:
        df: Data with one row per observation.
        value_col: Numeric column whose distribution is shown.
        group_col: Column that defines the groups.
        title: Optional plot title.
        horizontal: Put groups on the y axis instead of the x axis.

    Returns:
        Tuple ``(fig, ax)``.
    """
    fig, ax = plt.subplots(figsize=(14, 8))

    # Groups are ordered by their median; the direction flips with orientation.
    medians = df.groupby(group_col)[value_col].median()
    order = medians.sort_values(ascending=not horizontal).index.tolist()

    # Which column goes on which axis.
    if horizontal:
        axis_map = dict(y=group_col, x=value_col)
    else:
        axis_map = dict(x=group_col, y=value_col)

    # Boxes first, then the individual observations on top.
    sns.boxplot(
        data=df, order=order,
        palette="Set2", width=0.6, ax=ax, **axis_map
    )
    sns.swarmplot(
        data=df, order=order,
        color="black", alpha=0.4, size=3, ax=ax, **axis_map
    )

    # Red diamond at each group's mean; only the first one gets a legend label.
    means = df.groupby(group_col)[value_col].mean()
    for position, group in enumerate(order):
        mean_val = means[group]
        marker_label = "Mean" if position == 0 else ""
        point = (mean_val, position) if horizontal else (position, mean_val)
        ax.scatter(*point, color="red", s=100,
                   marker="D", zorder=5, label=marker_label)

    # Styling
    if title:
        ax.set_title(title, fontsize=14, fontweight="bold")

    if not horizontal:
        plt.xticks(rotation=45, ha="right")

    ax.grid(True, alpha=0.3)
    ax.legend(loc="upper right")

    plt.tight_layout()
    return fig, ax

# Example: Compare ERA across teams
# NOTE: `pitchers_df` is not defined in this snippet; it needs the "era" and
# "team" columns named below. horizontal=True puts teams on the y axis.
fig, ax = create_boxplot_comparison(
    pitchers_df, "era", "team",
    title="ERA Distribution by Team",
    horizontal=True
)

python

Interactive Plotly Dashboard

Build interactive multi-chart dashboard using Plotly for exploring player statistics.

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

def create_player_dashboard(player_df, game_log_df, player_name):
    """Create interactive dashboard for player analysis.

    The 2x2 figure holds a 20-game rolling average line, a histogram of hits
    per game, a bar chart of the mean average per calendar month, and a donut
    chart of hit types.

    Args:
        player_df: Season totals; the first row's ``singles``, ``doubles``,
            ``triples`` and ``hr`` columns feed the donut chart.
        game_log_df: Game log with ``game_date`` (datetime), ``avg``, ``hits``
            and ``hr`` columns. The caller's frame is not modified.
        player_name: Name shown in the dashboard title.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    import calendar  # local import: only used for month abbreviations

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            "Season Stats Trend", "Stat Distribution",
            "Performance by Month", "Category Breakdown"
        ),
        specs=[
            [{"type": "scatter"}, {"type": "histogram"}],
            [{"type": "bar"}, {"type": "pie"}]
        ]
    )

    # 1. Rolling average trend
    # sort_values returns a new frame, so the column added below stays local.
    game_log_df = game_log_df.sort_values("game_date")
    game_log_df["rolling_avg"] = game_log_df["avg"].rolling(20).mean()

    fig.add_trace(
        go.Scatter(
            x=game_log_df["game_date"],
            y=game_log_df["rolling_avg"],
            mode="lines", name="20-Game Avg",
            line=dict(color="blue", width=2)
        ),
        row=1, col=1
    )

    # 2. Hit distribution histogram
    fig.add_trace(
        go.Histogram(
            x=game_log_df["hits"], name="Hits/Game",
            marker_color="green", opacity=0.7
        ),
        row=1, col=2
    )

    # 3. Monthly performance
    month_numbers = game_log_df["game_date"].dt.month.rename("month")
    monthly = game_log_df.groupby(month_numbers).agg({"avg": "mean", "hr": "sum"})

    # Labels come from the months actually present, so they always line up
    # with the bars (a fixed Apr-Sep list breaks for any other set of months).
    month_labels = [calendar.month_abbr[int(m)] for m in monthly.index]

    fig.add_trace(
        go.Bar(
            x=month_labels,
            y=monthly["avg"],
            name="Monthly AVG",
            marker_color="orange"
        ),
        row=2, col=1
    )

    # 4. Hit type breakdown (pie)
    hit_types = player_df[["singles", "doubles", "triples", "hr"]].iloc[0]
    fig.add_trace(
        go.Pie(
            labels=["Singles", "Doubles", "Triples", "HR"],
            values=hit_types.values,
            hole=0.4
        ),
        row=2, col=2
    )

    fig.update_layout(
        title=dict(text=f"{player_name} - Season Dashboard", font=dict(size=20)),
        height=700,
        showlegend=True,
        template="plotly_white"
    )

    return fig

# Create dashboard
# NOTE: `player_season_df` and `game_log_df` are not defined in this snippet;
# they must be provided by the surrounding code.
fig = create_player_dashboard(player_season_df, game_log_df, "Mike Trout")
fig.show()

python

Histogram Distribution Analysis

Create histogram with distribution fit and statistical annotations for analyzing stat distributions.

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd

def distribution_histogram(data, stat_name, bins=30, fit_dist="norm"):
    """Histogram with distribution fit and statistics.

    Args:
        data: Values to plot: a pandas Series or any sequence/array of numbers.
            Missing values (NaN) are removed first.
        stat_name: Name of the statistic for axis label and title.
        bins: Bin specification passed to ``Axes.hist``.
        fit_dist: Name of a ``scipy.stats`` continuous distribution to fit and
            overlay (for example "norm", "lognorm", "gamma"). A falsy value or
            a name that is not a fittable scipy distribution draws no curve.

    Returns:
        Tuple ``(fig, ax)``.

    Raises:
        ValueError: If no values remain after removing NaN.
    """
    # Remove NaN values. Non-pandas input is converted to an array first, so
    # plain lists and tuples work too.
    if hasattr(data, "dropna"):
        data = data.dropna()
    else:
        data = np.asarray(data, dtype=float)
        data = data[~np.isnan(data)]

    # Fail before a figure is created instead of deep inside numpy.
    if len(data) == 0:
        raise ValueError("distribution_histogram requires at least one non-NaN value")

    fig, ax = plt.subplots(figsize=(12, 6))

    # Histogram (density=True so the fitted pdf shares the y scale)
    ax.hist(
        data, bins=bins, density=True, alpha=0.7,
        color="#1f77b4", edgecolor="white", linewidth=0.5
    )

    # Fit distribution
    dist = getattr(stats, fit_dist, None) if fit_dist else None
    if dist is not None and hasattr(dist, "fit") and hasattr(dist, "pdf"):
        params = dist.fit(data)
        x = np.linspace(data.min(), data.max(), 100)
        pdf = dist.pdf(x, *params)
        if fit_dist == "norm":
            mu, sigma = params
            fit_label = f"Normal fit (μ={mu:.3f}, σ={sigma:.3f})"
        else:
            fit_label = f"{fit_dist} fit"
        ax.plot(x, pdf, "r-", linewidth=2, label=fit_label)

    # Percentile lines
    percentiles = [25, 50, 75, 90]
    colors = ["green", "orange", "red", "purple"]

    for p, c in zip(percentiles, colors):
        pval = np.percentile(data, p)
        ax.axvline(x=pval, color=c, linestyle="--", linewidth=1.5,
                   label=f"{p}th percentile: {pval:.3f}")

    # Statistics box (top-right corner, in axes coordinates)
    stats_text = (
        f"n = {len(data):,}\n"
        f"Mean: {data.mean():.3f}\n"
        f"Median: {np.median(data):.3f}\n"
        f"Std: {data.std():.3f}\n"
        f"Min: {data.min():.3f}\n"
        f"Max: {data.max():.3f}"
    )
    ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
            fontsize=10, verticalalignment="top", horizontalalignment="right",
            bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8))

    ax.set_xlabel(stat_name, fontsize=12)
    ax.set_ylabel("Density", fontsize=12)
    ax.set_title(f"Distribution of {stat_name}", fontsize=14, fontweight="bold")
    ax.legend(loc="upper left", fontsize=9)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig, ax

# Example
# NOTE: `players_df` is not defined in this snippet; its "war" column is plotted.
fig, ax = distribution_histogram(players_df["war"], "WAR (Wins Above Replacement)")

python

Animated Play Visualization

Create animated visualization of play tracking data using matplotlib animation.

import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import pandas as pd

def animate_play(tracking_df, field_type="football"):
    """Build a matplotlib animation from frame-by-frame tracking data.

    Args:
        tracking_df: Tracking rows with ``frame_id``, ``team`` ("offense",
            "defense" or "ball"), ``x`` and ``y`` columns.
        field_type: Only "football" draws a field background; any other value
            leaves the axes unstyled.

    Returns:
        Tuple ``(fig, ani)``. Keep a reference to ``ani`` for as long as the
        animation is needed.
    """
    # One animation step per distinct frame id, in ascending order.
    frames = sorted(tracking_df["frame_id"].unique())

    fig, ax = plt.subplots(figsize=(14, 6))

    if field_type == "football":
        # Football field: 120 x 53.3 including both end zones
        ax.set_xlim(0, 120)
        ax.set_ylim(0, 53.3)
        ax.set_facecolor("#228B22")

        # A line every 10 yards
        for yard_x in range(0, 121, 10):
            ax.axvline(x=yard_x, color="white", linewidth=1, alpha=0.5)

        # End zones
        ax.axvspan(0, 10, alpha=0.3, color="blue")
        ax.axvspan(110, 120, alpha=0.3, color="red")

    # Empty scatter artists; their positions are filled in per frame.
    offense = ax.scatter([], [], s=200, c="blue", edgecolors="white",
                         linewidth=2, label="Offense")
    defense = ax.scatter([], [], s=200, c="red", edgecolors="white",
                         linewidth=2, label="Defense")
    ball = ax.scatter([], [], s=100, c="brown", marker="o",
                      edgecolors="white", linewidth=2, label="Ball")

    ax.legend(loc="upper right")
    title = ax.set_title("", fontsize=12, fontweight="bold")

    artists = (offense, defense, ball, title)

    def init():
        # Start with no markers on the field.
        for marker_set in (offense, defense, ball):
            marker_set.set_offsets(np.empty((0, 2)))
        return artists

    def update(frame):
        in_frame = tracking_df[tracking_df["frame_id"] == frame]

        def xy_of(team_label):
            return in_frame.loc[in_frame["team"] == team_label, ["x", "y"]]

        offense_xy = xy_of("offense")
        defense_xy = xy_of("defense")
        ball_xy = xy_of("ball")

        offense.set_offsets(offense_xy.values)
        defense.set_offsets(defense_xy.values)

        # The ball keeps its previous position when a frame has no ball row.
        if not ball_xy.empty:
            ball.set_offsets(ball_xy.values)

        title.set_text(f"Frame: {frame}")
        return artists

    ani = animation.FuncAnimation(
        fig, update, frames=frames,
        init_func=init, blit=True, interval=100
    )

    return fig, ani

# Save animation
# NOTE: `tracking_df` is not defined in this snippet. The GIF is written to the
# current working directory with matplotlib's "pillow" writer at 10 frames/s.
fig, ani = animate_play(tracking_df)
ani.save("play_animation.gif", writer="pillow", fps=10)

python

Bar Chart with Error Bars

Create grouped bar chart with error bars for comparing statistics across categories or groups.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def grouped_bar_chart(df, group_col, categories, values_cols,
                      errors_cols=None, title="Comparison"):
    """Draw one cluster of bars per group and one bar per category.

    Args:
        df: Data with (at least) one row per group.
        group_col: Column that identifies the groups; its unique values become
            the x tick labels.
        categories: Legend labels, paired by position with ``values_cols``.
        values_cols: Columns holding the bar heights.
        errors_cols: Optional columns holding error-bar sizes, by position.
        title: Chart title.

    Returns:
        Tuple ``(fig, ax)``.
    """
    groups = df[group_col].unique()
    n_groups = len(groups)
    n_categories = len(categories)

    fig, ax = plt.subplots(figsize=(12, 7))

    # The bars of one cluster share 80% of the slot width.
    bar_width = 0.8 / n_categories
    x = np.arange(n_groups)
    colors = plt.cm.Set2(np.linspace(0, 1, n_categories))

    def first_per_group(column):
        # The first matching row's value is used for each group.
        return [df[df[group_col] == g][column].values[0] for g in groups]

    for i, (cat, val_col) in enumerate(zip(categories, values_cols)):
        values = first_per_group(val_col)
        errors = first_per_group(errors_cols[i]) if errors_cols else None

        bars = ax.bar(
            x + i * bar_width, values, bar_width,
            label=cat, color=colors[i],
            yerr=errors, capsize=4,
            edgecolor="white", linewidth=1
        )

        # Value label just above the top of each bar
        for bar, val in zip(bars, values):
            bar_top = bar.get_height()
            bar_center = bar.get_x() + bar.get_width() / 2
            ax.annotate(
                f"{val:.2f}",
                xy=(bar_center, bar_top),
                xytext=(0, 3), textcoords="offset points",
                ha="center", va="bottom", fontsize=9
            )

    ax.set_xlabel(group_col, fontsize=12)
    ax.set_ylabel("Value", fontsize=12)
    ax.set_title(title, fontsize=14, fontweight="bold")
    # Centre each tick under its cluster.
    cluster_offset = bar_width * (n_categories - 1) / 2
    ax.set_xticks(x + cluster_offset)
    ax.set_xticklabels(groups, rotation=45, ha="right")
    ax.legend(loc="upper right")
    ax.grid(True, axis="y", alpha=0.3)

    plt.tight_layout()
    return fig, ax

# Example: Compare teams across stats
# NOTE: `team_stats_df` is not defined in this snippet; it needs a "team"
# column plus the value columns listed below. `categories` are the legend
# labels, matched by position with `values_cols`.
fig, ax = grouped_bar_chart(
    team_stats_df,
    group_col="team",
    categories=["OBP", "SLG", "wRC+"],
    values_cols=["obp", "slg", "wrc_plus"],
    title="Team Offensive Comparison"
)

python

Scatter Plot with Regression

Create scatter plot with regression line and annotations for analyzing stat correlations.

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd

def scatter_with_regression(df, x_col, y_col, label_col=None,
                           highlight_players=None):
    """Scatter ``y_col`` against ``x_col`` and overlay a least-squares line.

    The fit uses only rows where both columns are present, so missing values
    no longer turn the slope, R² and p-value into NaN. The scatter itself is
    still drawn from the full columns.

    Args:
        df: DataFrame holding the data to plot.
        x_col: Column plotted on the x-axis.
        y_col: Column plotted on the y-axis.
        label_col: Column matched against ``highlight_players``.
        highlight_players: Values of ``label_col`` to emphasise and annotate.
            Ignored unless ``label_col`` is also given. Values with no
            matching row are skipped; if several rows match, the first is used.

    Returns:
        tuple: ``(fig, ax)`` of the created figure.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    x = df[x_col]
    y = df[y_col]

    # linregress has no NaN handling: one missing value makes every fit
    # statistic NaN, so restrict the regression to complete pairs.
    complete = x.notna() & y.notna()
    x_fit = x[complete]
    fit = stats.linregress(x_fit, y[complete])

    line_x = np.linspace(x_fit.min(), x_fit.max(), 100)
    line_y = fit.slope * line_x + fit.intercept

    # Scatter
    ax.scatter(x, y, alpha=0.6, s=80, c="#1f77b4",
               edgecolors="white", linewidth=0.5)

    # Regression line
    ax.plot(line_x, line_y, color="red", linewidth=2, linestyle="--",
            label=f"R² = {fit.rvalue**2:.3f}")

    # Highlight specific players
    if highlight_players and label_col:
        for player in highlight_players:
            player_data = df[df[label_col] == player]
            if player_data.empty:
                continue
            px = player_data[x_col].values[0]
            py = player_data[y_col].values[0]
            # zorder=5 keeps the highlighted marker above the base scatter.
            ax.scatter(px, py, s=150, c="orange", edgecolors="black",
                       linewidth=2, zorder=5)
            ax.annotate(
                player, (px, py), fontsize=10, fontweight="bold",
                xytext=(10, 10), textcoords="offset points",
                arrowprops=dict(arrowstyle="->", color="black")
            )

    # Labels and styling
    ax.set_xlabel(x_col, fontsize=12)
    ax.set_ylabel(y_col, fontsize=12)
    ax.set_title(f"{y_col} vs {x_col}", fontsize=14, fontweight="bold")
    ax.legend(loc="best", fontsize=11)
    ax.grid(True, alpha=0.3)

    # Correlation box, positioned in axes coordinates (top-left corner).
    ax.text(0.05, 0.95,
            f"Correlation: {fit.rvalue:.3f}\np-value: {fit.pvalue:.2e}",
            transform=ax.transAxes, fontsize=10, verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5))

    plt.tight_layout()
    return fig, ax

# Example
# NOTE(review): `players_df` is not defined in this snippet; the call reads its
# "exit_velocity", "slg" and "player_name" columns.
# Names in `highlight_players` that do not occur in "player_name" are skipped.
fig, ax = scatter_with_regression(
    players_df, "exit_velocity", "slg",
    label_col="player_name",
    highlight_players=["Aaron Judge", "Shohei Ohtani"]
)

python

XGBoost Game Outcome Predictor

Predict game outcomes using XGBoost gradient boosting with feature engineering for team matchups.

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

def build_game_predictor(games_df):
    """Fit an XGBoost classifier that predicts whether the home team wins.

    Twelve matchup columns (a home and an away version of six team stats) are
    the inputs and ``home_win`` is the label. A random 20% of the rows is held
    out; it is passed to XGBoost as the evaluation set and is also the data
    the returned scores are computed on.

    Args:
        games_df: DataFrame with the twelve feature columns and ``home_win``.

    Returns:
        dict: ``model``, hold-out ``accuracy``, ``auc`` and ``log_loss``, and
        ``feature_importance`` mapping each feature name to its importance.
    """
    # One home/away pair per team stat, in this order.
    team_stats = (
        "win_pct", "pts_avg", "pts_allowed_avg",
        "streak", "rest_days", "elo",
    )
    feature_names = [
        f"{side}_{stat}" for stat in team_stats for side in ("home", "away")
    ]

    inputs = games_df[feature_names]
    labels = games_df["home_win"].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        inputs, labels, test_size=0.2, random_state=42
    )

    classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    classifier.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False,
    )

    # Hard labels feed accuracy; home-win probabilities feed AUC and log loss.
    predicted_labels = classifier.predict(X_test)
    home_win_prob = classifier.predict_proba(X_test)[:, 1]

    importance_by_feature = dict(
        zip(feature_names, classifier.feature_importances_)
    )

    return {
        "model": classifier,
        "accuracy": accuracy_score(y_test, predicted_labels),
        "auc": roc_auc_score(y_test, home_win_prob),
        "log_loss": log_loss(y_test, home_win_prob),
        "feature_importance": importance_by_feature,
    }

# NOTE(review): `games_df` is not defined in this snippet; it must contain the
# twelve feature columns listed in build_game_predictor plus "home_win".
results = build_game_predictor(games_df)
# Both scores are computed on the 20% hold-out split made inside the function.
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"AUC: {results['auc']:.3f}")

python

Random Forest Player Prediction

Use a Random Forest classifier to predict player performance categories (elite, above-average, average, below-average) based on historical stats.

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

def train_player_classifier(df, features, target_col):
    """Fit a Random Forest that assigns players to performance tiers.

    Missing feature values are replaced by the column median, the features are
    standardised, and a stratified 80/20 split is made. The forest is trained
    on the 80% part and scored on the rest; a separate 5-fold cross-validation
    is run over all rows.

    NOTE(review): the medians and the scaler are fit on every row before the
    split, so the hold-out rows contribute to the preprocessing statistics.

    Args:
        df: DataFrame containing ``features`` and ``target_col``.
        features: Names of the input columns.
        target_col: Name of the label column.

    Returns:
        dict: ``model``, ``scaler``, hold-out ``accuracy``, ``cv_mean`` and
        ``cv_std`` of the cross-validation scores, ``feature_importance``
        (DataFrame sorted from most to least important),
        ``classification_report`` and ``confusion_matrix``.
    """
    labels = df[target_col]

    # Median-impute on a copy so the caller's frame is left untouched.
    inputs = df[features].copy()
    inputs = inputs.fillna(inputs.median())

    scaler = StandardScaler()
    scaled_inputs = scaler.fit_transform(inputs)

    X_train, X_test, y_train, y_test = train_test_split(
        scaled_inputs, labels,
        test_size=0.2, random_state=42, stratify=labels,
    )

    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)

    holdout_pred = forest.predict(X_test)
    fold_scores = cross_val_score(forest, scaled_inputs, labels, cv=5)

    ranked_importance = pd.DataFrame(
        {"feature": features, "importance": forest.feature_importances_}
    ).sort_values("importance", ascending=False)

    return {
        "model": forest,
        "scaler": scaler,
        "accuracy": forest.score(X_test, y_test),
        "cv_mean": fold_scores.mean(),
        "cv_std": fold_scores.std(),
        "feature_importance": ranked_importance,
        "classification_report": classification_report(y_test, holdout_pred),
        "confusion_matrix": confusion_matrix(y_test, holdout_pred),
    }

# Example usage
# NOTE(review): `player_df` is not defined in this snippet; it must contain the
# feature columns below and a "performance_tier" label column.
features = ["avg", "obp", "slg", "hr", "rbi", "sb", "bb_pct", "k_pct"]
results = train_player_classifier(player_df, features, "performance_tier")
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"CV Score: {results['cv_mean']:.3f} (+/- {results['cv_std']:.3f})")
print("\nTop Features:")
# The importance table is already sorted in descending order.
print(results["feature_importance"].head(10))

python

PCA Dimensionality Reduction

Use Principal Component Analysis to reduce high-dimensional player statistics while preserving variance.

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def perform_player_pca(df, stat_columns, n_components=None, variance_threshold=0.95):
    """Reduce player stat columns to principal components.

    Missing values are filled with the column median and the columns are
    standardised before PCA. When ``n_components`` is not given, the smallest
    number of components whose cumulative explained variance reaches
    ``variance_threshold`` is used; if no prefix reaches it (for example a
    threshold of 1.0 missed through rounding), every component is kept.

    Side effect: the component scores are written into ``df`` as new columns
    ``PC1``, ``PC2``, ... (existing columns of that name are overwritten).

    Args:
        df: DataFrame containing ``stat_columns``; modified in place.
        stat_columns: Names of the numeric columns to reduce.
        n_components: Passed to ``sklearn.decomposition.PCA``; ``None``
            selects the count from ``variance_threshold``.
        variance_threshold: Target cumulative explained-variance ratio.

    Returns:
        dict: ``pca``, ``scaler``, ``n_components`` (components actually
        kept), ``explained_variance`` (per-component ratio),
        ``cumulative_variance`` and ``loadings`` (stats x components).
    """
    X = df[stat_columns].fillna(df[stat_columns].median())

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Full decomposition, used only to read the cumulative variance curve.
    pca_full = PCA()
    pca_full.fit(X_scaled)
    cumulative_var = np.cumsum(pca_full.explained_variance_ratio_)

    if n_components is None:
        reached = cumulative_var >= variance_threshold
        if reached.any():
            n_components = int(np.argmax(reached)) + 1
        else:
            # np.argmax on an all-False mask returns 0, which would silently
            # keep a single component; keep all of them instead.
            n_components = len(cumulative_var)

    # Final PCA
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X_scaled)

    # Label by the fitted count so a fractional ``n_components`` also works.
    n_kept = int(pca.n_components_)
    component_names = [f"PC{i + 1}" for i in range(n_kept)]

    # Component loadings
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=stat_columns
    )

    # Add to dataframe
    for position, name in enumerate(component_names):
        df[name] = X_reduced[:, position]

    return {
        "pca": pca,
        "scaler": scaler,
        "n_components": n_kept,
        "explained_variance": pca.explained_variance_ratio_,
        "cumulative_variance": cumulative_var[:n_kept],
        "loadings": loadings
    }

# Reduce batting stats
# NOTE(review): `batters_df` is not defined in this snippet. perform_player_pca
# also writes PC1, PC2, ... columns into it as a side effect.
stat_cols = ["avg", "obp", "slg", "hr", "sb", "bb_pct", "k_pct",
             "iso", "babip", "wrc_plus", "war"]
results = perform_player_pca(batters_df, stat_cols)
print(f"Reduced to {results['n_components']} components")
# The last cumulative value is the total variance kept by the chosen components.
print(f"Variance explained: {results['cumulative_variance'][-1]:.1%}")
print("\nTop loadings for PC1:")
# Rank stats by the absolute size of their PC1 loading, ignoring the sign.
print(results["loadings"]["PC1"].abs().sort_values(ascending=False).head())

python

Bayesian Regression with PyMC

Bayesian linear regression for player projections with uncertainty quantification using PyMC.

import pymc as pm
import numpy as np
import pandas as pd
import arviz as az

def bayesian_projection_model(df, features, target):
    """Fit a Bayesian linear regression and return a posterior predictor.

    Features and target are z-scored, a linear model with Normal(0, 1) priors
    on the intercept and coefficients and a HalfNormal(1) prior on the noise
    scale is sampled with PyMC, and a closure is returned that maps new
    feature rows to predictions on the original target scale.

    Args:
        df: DataFrame containing ``features`` and ``target``.
        features: Names of the predictor columns.
        target: Name of the column to project.

    Returns:
        tuple: ``(model, trace, predict_with_uncertainty)``.
    """
    raw_X = df[features].values
    raw_y = df[target].values
    n_features = len(features)

    # Standardisation statistics; the predictor closure reuses them.
    X_mean = raw_X.mean(axis=0)
    X_std = raw_X.std(axis=0)
    y_mean = raw_y.mean()
    y_std = raw_y.std()

    X_scaled = (raw_X - X_mean) / X_std
    y_scaled = (raw_y - y_mean) / y_std

    with pm.Model() as model:
        intercept = pm.Normal("alpha", mu=0, sigma=1)
        coefficients = pm.Normal("betas", mu=0, sigma=1, shape=n_features)
        noise_scale = pm.HalfNormal("sigma", sigma=1)

        expected = intercept + pm.math.dot(X_scaled, coefficients)

        # Registering the observed node is enough; its handle is not needed.
        pm.Normal("y_obs", mu=expected, sigma=noise_scale, observed=y_scaled)

        trace = pm.sample(2000, tune=1000, cores=2, return_inferencedata=True)

    # Posterior summary (computed here but not part of the return value).
    summary = az.summary(trace, var_names=["alpha", "betas", "sigma"])

    def predict_with_uncertainty(new_X):
        """Return posterior mean, std and 95% interval for each row of new_X.

        Only the intercept and coefficient draws are used, so the spread
        describes the expected value and excludes the ``sigma`` noise term.
        """
        scaled_rows = (new_X - X_mean) / X_std

        draws = trace.posterior
        intercept_draws = draws["alpha"].values.flatten()
        coefficient_draws = draws["betas"].values.reshape(-1, n_features)

        # One row per posterior draw, one column per input row.
        linear_part = np.dot(coefficient_draws, scaled_rows.T)
        scaled_predictions = intercept_draws[:, None] + linear_part
        predictions = scaled_predictions * y_std + y_mean

        return {
            "mean": predictions.mean(axis=0),
            "std": predictions.std(axis=0),
            "ci_95": np.percentile(predictions, [2.5, 97.5], axis=0),
        }

    return model, trace, predict_with_uncertainty

# NOTE(review): `player_df` is not defined in this snippet; it must contain the
# feature columns below and "next_year_war". The call runs MCMC sampling.
features = ["age", "pa", "avg_3yr", "obp_3yr", "slg_3yr"]
model, trace, predict = bayesian_projection_model(
    player_df, features, "next_year_war"
)

# Predict for new player
# Values are given in the same order as `features`; the outer brackets make it
# a single-row 2-D array, so index [0] below selects that one player.
pred = predict(np.array([[28, 600, .280, .350, .450]]))
print(f"Projected WAR: {pred['mean'][0]:.2f} ± {pred['std'][0]:.2f}")

python

Ensemble Model Stacking

Combine multiple models using stacking to improve prediction accuracy for player projections.

from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    StackingRegressor
)
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import pandas as pd

def build_stacked_projection_model(df, features, target):
    """Fit a stacked regression ensemble and compare it with its base models.

    A random forest, gradient boosting, ridge and elastic-net regressor are
    combined by a ridge meta-learner. The ensemble and every base model are
    each scored with 5-fold cross-validated mean absolute error.

    Args:
        df: DataFrame containing ``features`` and ``target``.
        features: Names of the predictor columns; missing values are filled
            with the column median.
        target: Name of the column to predict.

    Returns:
        dict: ``model`` (the ensemble fitted on all rows), ``stacked_mae`` and
        ``individual_mae`` (base-model name -> MAE).
    """
    feature_frame = df[features]
    X = feature_frame.fillna(feature_frame.median())
    y = df[target]

    forest = RandomForestRegressor(
        n_estimators=100, max_depth=8, random_state=42
    )
    boosting = GradientBoostingRegressor(
        n_estimators=100, max_depth=5, random_state=42
    )
    base_models = [
        ("rf", forest),
        ("gb", boosting),
        ("ridge", Ridge(alpha=1.0)),
        ("elastic", ElasticNet(alpha=0.5, l1_ratio=0.5)),
    ]

    stacked = StackingRegressor(
        estimators=base_models,
        final_estimator=Ridge(alpha=0.5),
        cv=5,
        n_jobs=-1,
    )

    # scikit-learn reports MAE negated (higher is better); flip the sign back.
    scoring = "neg_mean_absolute_error"
    stacked_scores = cross_val_score(stacked, X, y, cv=5, scoring=scoring)

    stacked.fit(X, y)

    individual_mae = {
        name: -cross_val_score(estimator, X, y, cv=5, scoring=scoring).mean()
        for name, estimator in base_models
    }

    return {
        "model": stacked,
        "stacked_mae": -stacked_scores.mean(),
        "individual_mae": individual_mae,
    }

# NOTE(review): `projections_df` is not defined in this snippet; it must
# contain the feature columns below and "actual_war".
features = ["age", "pa", "avg_3yr", "obp_3yr", "slg_3yr", "war_3yr"]
results = build_stacked_projection_model(projections_df, features, "actual_war")
# Both figures are 5-fold cross-validated mean absolute errors (lower is better).
print(f"Stacked MAE: {results['stacked_mae']:.3f}")
print("Individual MAEs:", results["individual_mae"])

python

Logistic Regression Win Probability

Simple but interpretable logistic regression model for real-time win probability calculation.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.calibration import CalibratedClassifierCV

def train_win_probability_model(plays_df):
    """Train a calibrated logistic-regression win probability model.

    The six game-state columns are expanded with degree-2 polynomial terms,
    then a logistic regression wrapped in 5-fold isotonic calibration is fit
    against ``home_win``.

    Args:
        plays_df: DataFrame with the six feature columns listed below and a
            ``home_win`` label column.

    Returns:
        tuple: ``(model, poly, predict_win_prob)`` where ``predict_win_prob``
        maps a single game state to the home team's win probability.
    """
    # Features for win probability
    features = [
        "score_diff",      # Current score differential
        "time_remaining",  # Seconds remaining
        "possession",      # 1 if home has ball, 0 if away
        "yard_line",       # Field position (football)
        "down",            # Current down
        "distance"         # Yards to first down
    ]

    X = plays_df[features]
    y = plays_df["home_win"]

    # Add polynomial features for non-linear relationships
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Train with calibration for accurate probabilities
    base_model = LogisticRegression(max_iter=1000, C=0.1)
    model = CalibratedClassifierCV(base_model, cv=5, method="isotonic")
    model.fit(X_poly, y)

    def predict_win_prob(score_diff, time_remaining, possession,
                         yard_line=50, down=1, distance=10):
        """Get win probability for current game state."""
        # `poly` was fitted on a DataFrame and remembers its column names.
        # Passing a bare array makes scikit-learn warn on every call and skips
        # its column-order check, so build a named one-row frame instead.
        input_data = pd.DataFrame(
            [[score_diff, time_remaining, possession,
              yard_line, down, distance]],
            columns=features,
        )
        input_poly = poly.transform(input_data)
        # Column 1 of predict_proba is the probability of class 1 (home win).
        return model.predict_proba(input_poly)[0, 1]

    return model, poly, predict_win_prob

# NOTE(review): `plays_df` is not defined in this snippet; it must contain the
# six game-state columns used by train_win_probability_model plus "home_win".
model, poly, predict_wp = train_win_probability_model(plays_df)

# Example: Home team up 7, 5 min left, has ball
# time_remaining is in seconds (300 s = 5 min); yard_line, down and distance
# fall back to their defaults of 50, 1 and 10.
wp = predict_wp(score_diff=7, time_remaining=300, possession=1)
print(f"Home Win Probability: {wp:.1%}")

python

Gradient Boosting Injury Risk

Use LightGBM to predict player injury risk based on workload, age, and historical injury data.

import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve

def build_injury_risk_model(df):
    """Train a LightGBM binary classifier for next-season injury risk.

    Missing feature values are replaced with -1, a stratified 80/20 split is
    made, and boosting runs for up to 500 rounds with early stopping (patience
    50) monitored on the 20% part. The returned AUC is computed on that same
    20% part.

    Args:
        df: DataFrame with the ten feature columns listed below and an
            ``injured_next_season`` label column.

    Returns:
        dict: ``model`` (the trained booster), ``auc`` and ``importance``
        (gain-based feature importance, sorted in descending order).
    """
    feature_names = [
        "age", "career_games", "games_last_season",
        "workload_index", "previous_injuries",
        "days_since_last_injury", "position_risk_factor",
        "bmi", "sprint_speed_decline", "throwing_velocity_change",
    ]

    inputs = df[feature_names].fillna(-1)
    labels = df["injured_next_season"].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        inputs, labels, test_size=0.2, random_state=42, stratify=labels
    )

    train_set = lgb.Dataset(X_train, label=y_train)
    # `reference` ties the validation set to the training set's binning.
    holdout_set = lgb.Dataset(X_test, label=y_test, reference=train_set)

    booster_params = dict(
        objective="binary",
        metric="auc",
        boosting_type="gbdt",
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=-1,
        is_unbalance=True,
    )

    booster = lgb.train(
        booster_params, train_set,
        num_boost_round=500,
        valid_sets=[holdout_set],
        callbacks=[lgb.early_stopping(50)],
    )

    holdout_scores = booster.predict(X_test)

    gain_table = pd.DataFrame({
        "feature": feature_names,
        "importance": booster.feature_importance(importance_type="gain"),
    }).sort_values("importance", ascending=False)

    return {
        "model": booster,
        "auc": roc_auc_score(y_test, holdout_scores),
        "importance": gain_table,
    }

# NOTE(review): `players_df` is not defined in this snippet; it must contain
# the ten feature columns of build_injury_risk_model and "injured_next_season".
results = build_injury_risk_model(players_df)
print(f"AUC: {results['auc']:.3f}")
print("\nTop Risk Factors:")
# The importance table is sorted by gain, so head() shows the strongest features.
print(results["importance"].head())

python

LSTM Sequence Prediction

Long Short-Term Memory network to predict player performance sequences over time, capturing temporal patterns.

import torch
import torch.nn as nn
import numpy as np
import pandas as pd

class PlayerLSTM(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step."""

    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=1):
        """Build the recurrent stack and the output layer.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of each LSTM layer.
            num_layers: Number of stacked LSTM layers.
            output_size: Number of values predicted per sequence.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True: inputs are laid out as (batch, seq_len, features).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, seq_len, features) tensor to (batch, output_size)."""
        per_step_output, _final_state = self.lstm(x)
        # Only the hidden output of the final time step feeds the head.
        last_step = per_step_output[:, -1, :]
        return self.fc(last_step)

def create_sequences(df, player_col, features, target, seq_length=10,
                     time_col="season"):
    """Build sliding-window training samples from per-player histories.

    Each player's rows are ordered by ``time_col``. Every run of
    ``seq_length`` consecutive rows becomes one input window, and the
    ``target`` value of the row that follows it becomes the label. Players
    with fewer than ``seq_length + 1`` rows contribute nothing.

    Args:
        df: Long-format DataFrame with one row per player and period.
        player_col: Column identifying the player.
        features: Columns that make up each time step of a window.
        target: Column supplying the label.
        seq_length: Number of rows per input window.
        time_col: Column the rows are ordered by within a player.

    Returns:
        tuple: ``(sequences, targets)`` as NumPy arrays. With at least one
        window, ``sequences`` has shape
        ``(n_windows, seq_length, len(features))``; with none, both arrays
        are empty.
    """
    sequences = []
    targets = []

    # One pass over the groups instead of re-filtering the whole frame per
    # player. sort=False keeps players in order of first appearance.
    for _, history in df.groupby(player_col, sort=False):
        n_windows = len(history) - seq_length
        if n_windows < 1:
            continue

        history = history.sort_values(time_col)
        feature_rows = history[features].to_numpy()
        target_rows = history[target].to_numpy()

        for start in range(n_windows):
            stop = start + seq_length
            sequences.append(feature_rows[start:stop])
            targets.append(target_rows[stop])

    return np.array(sequences), np.array(targets)

# Create sequences
# NOTE(review): `career_df` is not defined in this snippet; create_sequences
# reads its "player_id" and "season" columns plus the feature columns below.
# "war" is both an input feature and the target: each sample is 5 consecutive
# rows of a player and the label is the "war" of the row that follows them.
features = ["age", "pa", "avg", "obp", "slg", "war"]
X, y = create_sequences(career_df, "player_id", features, "war", seq_length=5)

# Train model
model = PlayerLSTM(input_size=len(features))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# unsqueeze(1) adds a trailing dimension so the targets line up with the
# model's (batch, 1) output.
X_tensor = torch.FloatTensor(X)
y_tensor = torch.FloatTensor(y).unsqueeze(1)

# Full-batch training: each epoch is one optimizer step over all sequences.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()

    # Report the training loss every 20 epochs.
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

python

K-Means Player Clustering

Cluster players into similar performance groups using K-Means algorithm with automatic optimal cluster selection.

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

def cluster_players(df, features, max_clusters=10):
    """Group players with K-Means, choosing k by silhouette score.

    Features are median-imputed and standardised. K-Means is fitted for every
    k from 2 to ``max_clusters`` and the k with the highest silhouette score
    is refitted as the final model.

    Side effect: the final assignment is written into ``df`` as a ``cluster``
    column.

    Args:
        df: DataFrame containing ``features``; modified in place.
        features: Names of the numeric columns to cluster on.
        max_clusters: Largest k to try.

    Returns:
        dict: ``optimal_k``, ``labels`` (the new ``cluster`` column),
        ``profiles`` (per-cluster feature means plus ``count``),
        ``silhouette`` (best score), ``model`` and ``scaler``.
    """
    raw_features = df[features]
    scaler = StandardScaler()
    scaled = scaler.fit_transform(raw_features.fillna(raw_features.median()))

    candidate_ks = range(2, max_clusters + 1)
    inertias = []      # collected per k; not part of the return value
    silhouettes = []

    for n_clusters in candidate_ks:
        trial = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        trial_labels = trial.fit_predict(scaled)
        inertias.append(trial.inertia_)
        silhouettes.append(silhouette_score(scaled, trial_labels))

    # argmax returns the first maximum, so ties go to the smaller k.
    best_position = np.argmax(silhouettes)
    optimal_k = candidate_ks[best_position]

    final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    df["cluster"] = final_kmeans.fit_predict(scaled)

    by_cluster = df.groupby("cluster")
    profiles = by_cluster[features].mean()
    profiles["count"] = by_cluster.size()

    return {
        "optimal_k": optimal_k,
        "labels": df["cluster"],
        "profiles": profiles,
        "silhouette": max(silhouettes),
        "model": final_kmeans,
        "scaler": scaler,
    }

# Cluster hitters
# NOTE(review): `hitters_df` is not defined in this snippet. cluster_players
# also adds a "cluster" column to it as a side effect.
features = ["avg", "obp", "slg", "hr_rate", "bb_rate", "k_rate", "sprint_speed"]
results = cluster_players(hitters_df, features)
print(f"Optimal clusters: {results['optimal_k']}")
print("\nCluster Profiles:")
# One row per cluster: the mean of every feature plus the member count.
print(results["profiles"])

python

Neural Network Player Projections

Deep learning model using PyTorch to project future player statistics based on historical performance trends.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

class PlayerProjectionNet(nn.Module):
    """Feed-forward regressor built from Linear/BatchNorm/ReLU/Dropout blocks."""

    def __init__(self, input_size, hidden_sizes=(64, 32), output_size=1):
        """Assemble the network.

        Args:
            input_size: Number of input features.
            hidden_sizes: Width of each hidden block, in order. Any iterable
                of ints is accepted; the default is a tuple so no mutable
                object is shared between calls.
            output_size: Number of values produced per sample.
        """
        super().__init__()
        layers = []
        prev_size = input_size

        for hidden in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Dropout(0.2)
            ])
            prev_size = hidden

        # Final projection has no activation: the output is a raw regression value.
        layers.append(nn.Linear(prev_size, output_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, output_size)."""
        return self.network(x)

def train_projection_model(df, feature_cols, target_col, epochs=100):
    """Train a PlayerProjectionNet on min-max scaled features and target.

    Args:
        df: DataFrame containing ``feature_cols`` and ``target_col``.
        feature_cols: Names of the input columns.
        target_col: Name of the column to predict.
        epochs: Number of passes over the data.

    Returns:
        tuple: ``(model, scaler_X, scaler_y)``. The model is returned in
        evaluation mode, so Dropout is off and BatchNorm uses its running
        statistics; call ``model.train()`` before any further training.
        Predictions are on the scaled target and must be mapped back with
        ``scaler_y.inverse_transform``.

    Raises:
        ValueError: If ``df`` has fewer than two rows.
    """
    # Prepare data
    X = df[feature_cols].values
    y = df[target_col].values.reshape(-1, 1)

    if len(X) < 2:
        raise ValueError(
            "train_projection_model needs at least 2 rows; "
            "BatchNorm1d cannot be trained on a single sample"
        )

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y)

    # Convert to tensors
    X_tensor = torch.FloatTensor(X_scaled)
    y_tensor = torch.FloatTensor(y_scaled)

    dataset = TensorDataset(X_tensor, y_tensor)
    batch_size = 32
    # BatchNorm1d raises in training mode on a batch of one row. Drop the
    # trailing batch only when it would be that lone row; because the data is
    # reshuffled each epoch, a different row is left out each time.
    lone_trailing_row = len(dataset) % batch_size == 1
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                        drop_last=lone_trailing_row)

    # Model
    model = PlayerProjectionNet(len(feature_cols))
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for X_batch, y_batch in loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Report the mean batch loss every 20 epochs.
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

    # Hand back an inference-ready model.
    model.eval()
    return model, scaler_X, scaler_y

# Usage
# NOTE(review): `player_df` is not defined in this snippet; it must contain the
# feature columns below and "next_year_war".
# The returned scalers are needed at prediction time: scaler_X to transform new
# inputs and scaler_y to map model outputs back to the original target scale.
feature_cols = ["age", "games", "pa", "avg_3yr", "obp_3yr", "slg_3yr"]
model, scaler_X, scaler_y = train_projection_model(
    player_df, feature_cols, "next_year_war"
)

python Baseball

Daily Data Pipeline

Automated pipeline for daily sports data updates.

"""Automated daily data pipeline for sports analytics."""
import pandas as pd
import requests
from datetime import datetime, timedelta
import schedule
import time
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)

class DailyPipeline:
    """Automated data collection and processing pipeline.

    The fetch, database and metric steps are placeholders: they log what they
    would do and return empty DataFrames until real implementations are
    plugged in.
    """

    def __init__(self, data_dir: str = "./data"):
        """Create the pipeline and make sure its data directory exists.

        Args:
            data_dir: Directory for pipeline data. Missing parent directories
                are created as well.
        """
        self.data_dir = Path(data_dir)
        # parents=True lets nested paths such as "out/2024/data" work on a
        # fresh checkout instead of raising FileNotFoundError.
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def fetch_scores(self, sport: str, date: str = None) -> pd.DataFrame:
        """Fetch daily scores from API.

        Args:
            sport: Sport identifier used in the request.
            date: Day to fetch as ``YYYY-MM-DD``; defaults to today.

        Returns:
            The fetched scores (currently always an empty DataFrame).
        """
        if date is None:
            date = datetime.now().strftime("%Y-%m-%d")

        # Example API call (replace with actual)
        # response = requests.get(f"https://api.example.com/{sport}/scores/{date}")
        # return pd.DataFrame(response.json())

        logger.info(f"Fetched {sport} scores for {date}")
        return pd.DataFrame()

    def update_database(self, df: pd.DataFrame, table: str):
        """Update database with new data (placeholder: only logs the row count)."""
        # Example: df.to_sql(table, engine, if_exists="append")
        logger.info(f"Updated {table} with {len(df)} rows")

    def calculate_daily_metrics(self, sport: str) -> pd.DataFrame:
        """Calculate daily summary metrics (currently returns an empty DataFrame)."""
        # Load recent data and calculate metrics
        logger.info(f"Calculated daily metrics for {sport}")
        return pd.DataFrame()

    def send_alerts(self, alerts: list):
        """Send alerts for significant events (placeholder: logs each alert)."""
        for alert in alerts:
            logger.info(f"Alert: {alert}")

    def run_daily_job(self):
        """Run the complete daily pipeline.

        Each sport is processed independently; a failure in one is logged with
        its traceback and does not stop the others.
        """
        logger.info("Starting daily pipeline...")

        sports = ["mlb", "nba", "nfl"]

        for sport in sports:
            try:
                # Fetch data
                scores = self.fetch_scores(sport)

                # Update database
                if not scores.empty:
                    self.update_database(scores, f"{sport}_scores")

                # Calculate metrics (result is not used yet)
                self.calculate_daily_metrics(sport)

            except Exception as e:
                # logger.exception keeps the traceback, which logger.error dropped.
                logger.exception(f"Error processing {sport}: {e}")

        logger.info("Daily pipeline complete")

def main():
    """Run the daily job once immediately, then every day at 06:00.

    Never returns: after the first run it polls the scheduler once a minute.
    """
    daily = DailyPipeline()
    job = daily.run_daily_job

    # Register the recurring 6 AM run before doing the immediate one.
    schedule.every().day.at("06:00").do(job)

    # Immediate run, useful for testing.
    job()

    while True:
        schedule.run_pending()
        time.sleep(60)


if __name__ == "__main__":
    main()

python MMA

MMA Fight Statistics

Calculate MMA fighter statistics and performance metrics.

"""MMA Fighter Statistics Calculator."""
import pandas as pd
import numpy as np

class MMAStats:
    """Calculate MMA fighter statistics."""

    @staticmethod
    def _safe_ratio(numerator: float, denominator: float) -> float:
        """Return numerator / denominator, or 0.0 when the denominator is 0."""
        if denominator == 0:
            return 0.0
        return numerator / denominator

    @staticmethod
    def striking_accuracy(sig_strikes_landed: int, sig_strikes_attempted: int) -> float:
        """Return landed / attempted significant strikes (0.0 with no attempts)."""
        return MMAStats._safe_ratio(sig_strikes_landed, sig_strikes_attempted)

    @staticmethod
    def takedown_accuracy(takedowns_landed: int, takedowns_attempted: int) -> float:
        """Return landed / attempted takedowns (0.0 with no attempts)."""
        return MMAStats._safe_ratio(takedowns_landed, takedowns_attempted)

    @staticmethod
    def defense_rate(strikes_absorbed: int, strikes_attempted_against: int) -> float:
        """Return the share of opponent strikes that were not absorbed.

        With no strikes attempted against the fighter the rate is 1.0.
        """
        absorbed_share = MMAStats._safe_ratio(
            strikes_absorbed, strikes_attempted_against
        )
        return 1 - absorbed_share

    @staticmethod
    def submission_rate(wins: int, sub_wins: int) -> float:
        """Return the share of wins that came by submission (0.0 with no wins)."""
        return MMAStats._safe_ratio(sub_wins, wins)

    @staticmethod
    def calculate_fight_iq(df: pd.DataFrame) -> pd.Series:
        """Return a composite score from three percentile-ranked columns.

        Args:
            df: DataFrame with ``striking_accuracy``, ``takedown_accuracy``
                and ``defense_rate`` columns.

        Returns:
            Series aligned with ``df``: 0.4 x striking rank + 0.3 x takedown
            rank + 0.3 x defense rank, each rank being a percentile in (0, 1].
        """
        # Percentile ranks put the three components on a common scale.
        strike_acc = df["striking_accuracy"].rank(pct=True)
        td_acc = df["takedown_accuracy"].rank(pct=True)
        defense = df["defense_rate"].rank(pct=True)

        # Weighted average
        return 0.4 * strike_acc + 0.3 * td_acc + 0.3 * defense

# Example
# fighter_stats["fight_iq"] = MMAStats.calculate_fight_iq(fighter_stats)

python Baseball

Draft Value Analysis

Analyze draft pick value and player development.

"""Draft pick value analysis."""
import pandas as pd
import numpy as np

def calculate_draft_value_curve(historical_drafts: pd.DataFrame,
                               value_metric: str = "career_war") -> pd.DataFrame:
    """Summarise historical outcomes for each draft position.

    Args:
        historical_drafts: One row per drafted player with ``pick``,
            ``all_star``, ``years_played`` and the ``value_metric`` column.
        value_metric: Column whose mean, standard deviation and count are
            reported per pick.

    Returns:
        DataFrame with one row per pick. Columns are two-level: ``pick``, the
        three ``value_metric`` statistics, and the mean of ``all_star`` and
        of ``years_played``.
    """
    aggregations = {
        value_metric: ["mean", "std", "count"],
        "all_star": "mean",
        "years_played": "mean",
    }
    per_pick = historical_drafts.groupby("pick").agg(aggregations)
    # Turn the pick index back into an ordinary column.
    return per_pick.reset_index()

def surplus_value(player_value: float, contract_value: float,
                 years: int) -> float:
    """Return production over the contract term minus the contract's cost.

    Args:
        player_value: Value produced per year.
        contract_value: Total cost of the contract.
        years: Number of contract years.
    """
    total_production = player_value * years
    return total_production - contract_value

def draft_efficiency(team_drafts: pd.DataFrame,
                    expected_values: dict) -> pd.DataFrame:
    """Total each team's drafted value against the value expected per pick.

    Args:
        team_drafts: One row per selection with ``team``, ``pick`` and
            ``actual_value`` columns. The input is not modified.
        expected_values: Mapping from pick number to expected value. Picks
            missing from the mapping get NaN, which the sums skip.

    Returns:
        DataFrame indexed by team with summed ``value_over_expected``,
        ``actual_value`` and ``expected_value``.
    """
    expected = team_drafts["pick"].map(expected_values)
    # assign() returns a new frame, leaving the caller's data untouched.
    scored = team_drafts.assign(
        expected_value=expected,
        value_over_expected=team_drafts["actual_value"] - expected,
    )

    totals = {
        "value_over_expected": "sum",
        "actual_value": "sum",
        "expected_value": "sum",
    }
    return scored.groupby("team").agg(totals)

def project_rookie_development(stats: pd.DataFrame,
                              similar_players: list) -> dict:
    """Project rookie development based on similar players.

    Args:
        stats: One row per player-season with ``player``, ``years_exp``,
            ``war`` and ``games`` columns.
        similar_players: Values of ``player`` whose careers form the
            comparison group.

    Returns:
        dict: ``year_projections`` is a DataFrame with one row per
        ``years_exp`` holding the mean and standard deviation of ``war`` and
        the mean of ``games`` (two-level columns). ``peak_year`` is the
        ``years_exp`` value with the highest mean ``war`` as a scalar, or
        ``None`` when no comparison rows exist or every mean is NaN.
    """
    similar_careers = stats[stats["player"].isin(similar_players)]

    by_year = similar_careers.groupby("years_exp").agg({
        "war": ["mean", "std"],
        "games": "mean"
    })

    # While `years_exp` is still the index, idxmax returns the year itself as
    # a scalar. Looking it up after reset_index means selecting through the
    # two-level columns with a partial key, which yields a Series instead.
    mean_war = by_year[("war", "mean")].dropna()
    peak_year = mean_war.idxmax() if not mean_war.empty else None

    return {
        "year_projections": by_year.reset_index(),
        "peak_year": peak_year
    }

python Baseball

Generate PDF Report

Create professional PDF reports for sports analytics.

"""Generate PDF reports for sports analytics."""
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
from reportlab.lib.units import inch
import pandas as pd
import matplotlib.pyplot as plt
import io

class SportsReport:
    """Collect headings, text, tables and charts, then render them to a PDF."""

    def __init__(self, filename: str, title: str):
        """Start a letter-size document whose first element is the title.

        Args:
            filename: Path of the PDF that ``build`` writes.
            title: Text shown at the top of the report.
        """
        self.doc = SimpleDocTemplate(filename, pagesize=letter)
        self.styles = getSampleStyleSheet()

        # The title is the first flowable; everything else is appended after it.
        title_style = ParagraphStyle("Title", fontSize=24, spaceAfter=30)
        self.elements = [Paragraph(title, title_style)]

    def add_heading(self, text: str, level: int = 1):
        """Append a heading using the stylesheet entry ``Heading<level>``."""
        heading_style = self.styles[f"Heading{level}"]
        self.elements += [Paragraph(text, heading_style), Spacer(1, 12)]

    def add_paragraph(self, text: str):
        """Append body text in the ``Normal`` style."""
        body_style = self.styles["Normal"]
        self.elements += [Paragraph(text, body_style), Spacer(1, 12)]

    def add_table(self, df: pd.DataFrame, title: str = None):
        """Append a DataFrame as a gridded table with a styled header row.

        Args:
            df: Data to render; column names become the header row.
            title: Optional level-2 heading placed above the table.
        """
        if title:
            self.add_heading(title, level=2)

        # Header row followed by the data rows.
        rows = [df.columns.tolist(), *df.values.tolist()]

        header_row = ((0, 0), (-1, 0))
        whole_table = ((0, 0), (-1, -1))
        style_commands = [
            ("BACKGROUND", *header_row, colors.grey),
            ("TEXTCOLOR", *header_row, colors.whitesmoke),
            ("ALIGN", *whole_table, "CENTER"),
            ("FONTNAME", *header_row, "Helvetica-Bold"),
            ("FONTSIZE", *header_row, 10),
            ("BOTTOMPADDING", *header_row, 12),
            ("GRID", *whole_table, 1, colors.black),
        ]

        table = Table(rows)
        table.setStyle(TableStyle(style_commands))

        self.elements += [table, Spacer(1, 20)]

    def add_chart(self, fig, width: float = 6, height: float = 4):
        """Append a matplotlib figure, rendered to PNG in memory.

        Args:
            fig: Figure to embed.
            width: Image width in inches.
            height: Image height in inches.
        """
        png_buffer = io.BytesIO()
        fig.savefig(png_buffer, format="png", dpi=150, bbox_inches="tight")
        # Rewind so the image is read from the start of the buffer.
        png_buffer.seek(0)

        chart = Image(png_buffer, width=width * inch, height=height * inch)
        self.elements += [chart, Spacer(1, 20)]

    def build(self):
        """Render all collected elements to the PDF file."""
        self.doc.build(self.elements)

# Example usage
# report = SportsReport("team_analysis.pdf", "2024 Season Analysis")
# report.add_heading("Performance Summary")
# report.add_table(stats_df, "Key Statistics")
# report.add_chart(performance_chart)
# report.build()

python Baseball

Trade Value Calculator

Calculate player trade values based on multiple factors.

"""Calculate player trade values."""
import pandas as pd
import numpy as np

class TradeValueCalculator:
    """Calculate and compare player trade values (in dollars)."""

    def __init__(self, salary_cap: float = 150_000_000,
                 dollars_per_war: float = 8_000_000):
        """Store the valuation settings.

        Args:
            salary_cap: Kept on the instance; none of the methods in this
                class read it.
            dollars_per_war: Dollar value assigned to one WAR.
        """
        self.salary_cap = salary_cap
        self.dollars_per_war = dollars_per_war

    def calculate_value(self, player: dict) -> float:
        """Calculate total trade value for a player.

        Args:
            player: Mapping with required keys ``"war"`` and ``"age"`` and
                optional keys ``"contract_years"`` (default 1), ``"salary"``
                (default 0) and ``"service_time"`` (default 7).

        Returns:
            Age-adjusted production value plus contract surplus plus a
            control premium for players with under 6 years of service time.

        Raises:
            KeyError: If ``"war"`` or ``"age"`` is missing.
        """
        # Base value from production
        production_value = player["war"] * self.dollars_per_war

        # Age adjustment: 3% penalty per year away from the peak age of 27
        age_factor = 1 - abs(player["age"] - 27) * 0.03

        # Contract surplus: yearly (value - salary) over the remaining years
        years_left = player.get("contract_years", 1)
        salary = player.get("salary", 0)
        contract_value = (production_value - salary) * years_left

        # Control premium: +50% under 3 years of service, +25% under 6
        service_time = player.get("service_time", 7)
        if service_time < 3:
            control_premium = production_value * 0.5
        elif service_time < 6:
            control_premium = production_value * 0.25
        else:
            control_premium = 0

        return (production_value * age_factor) + contract_value + control_premium

    def evaluate_trade(self, team_a_gives: list, team_b_gives: list) -> dict:
        """Evaluate fairness of a proposed trade.

        Args:
            team_a_gives: Player dicts on Team A's side of the deal.
            team_b_gives: Player dicts on Team B's side of the deal.

        Returns:
            Dict with each side's summed value, the absolute difference,
            which side has the higher total (``"favors"``), and ``"fair"``,
            true when the difference is under 10% of the combined value.
        """
        value_a = sum(self.calculate_value(p) for p in team_a_gives)
        value_b = sum(self.calculate_value(p) for p in team_b_gives)

        difference = value_a - value_b
        gap = abs(difference)

        if difference > 0:
            favors = "Team A"
        elif difference < 0:
            favors = "Team B"
        else:
            favors = "Even"

        return {
            "team_a_value": value_a,
            "team_b_value": value_b,
            "difference": gap,
            "favors": favors,
            "fair": gap < (value_a + value_b) * 0.1
        }

    def find_matching_value(self, target_value: float,
                           available_players: pd.DataFrame,
                           max_players: int = 3) -> list:
        """Greedily pick players whose combined value approaches a target.

        Each round selects the remaining player whose trade value is closest
        to the still-unmatched amount, so the total may overshoot the target.

        Args:
            target_value: Dollar value to match.
            available_players: One row per player; needs a ``player_name``
                column plus the fields read by ``calculate_value``.
            max_players: Maximum number of players to select.

        Returns:
            Selected player names in pick order (possibly fewer than
            ``max_players``; empty when no players are available).
        """
        if available_players.empty:
            return []

        pool = available_players.copy()
        pool["trade_value"] = pool.apply(
            lambda row: self.calculate_value(row.to_dict()), axis=1
        )

        selected = []
        remaining_value = target_value

        for _ in range(max_players):
            # Stop once the target is covered or nobody is left to pick;
            # argmin on an empty pool would raise ValueError.
            if remaining_value <= 0 or pool.empty:
                break

            closest_pos = (pool["trade_value"] - remaining_value).abs().argmin()
            best_match = pool.iloc[closest_pos]

            selected.append(best_match["player_name"])
            remaining_value -= best_match["trade_value"]
            pool = pool[pool["player_name"] != best_match["player_name"]]

        return selected

python Basketball

Injury Impact Analysis

Analyze team performance impact from player injuries.

"""Analyze injury impact on team performance."""
import pandas as pd
import numpy as np
from scipy import stats

def calculate_injury_impact(games: pd.DataFrame, player: str,
                           team_col: str, result_col: str) -> dict:
    """Compare team results in games with and without a player.

    Args:
        games: One row per game; must contain an ``active_players`` string
            column and the ``result_col`` column.
        player: Player name, matched as a literal substring of
            ``active_players`` (not as a regular expression).
        team_col: Not used by the calculation; kept so existing callers
            keep working.
        result_col: Numeric result column (e.g. 1 for a win, 0 for a loss).

    Returns:
        ``{"error": "Insufficient data"}`` when either group has fewer than
        5 games; otherwise the game counts, the mean result with and without
        the player, their difference, and the p-value of an independent
        two-sample t-test with a ``significant`` flag at the 0.05 level.
    """
    # regex=False: names such as "P.J. Tucker" or "Mo Bamba (C)" contain
    # regex metacharacters and must be matched literally.
    played = games["active_players"].str.contains(player, na=False, regex=False)
    with_player = games[played]
    without_player = games[~played]

    if len(with_player) < 5 or len(without_player) < 5:
        return {"error": "Insufficient data"}

    with_wins = with_player[result_col].mean()
    without_wins = without_player[result_col].mean()

    # Statistical significance
    t_stat, p_value = stats.ttest_ind(
        with_player[result_col],
        without_player[result_col]
    )

    return {
        "player": player,
        "games_with": len(with_player),
        "games_without": len(without_player),
        "win_pct_with": with_wins,
        "win_pct_without": without_wins,
        "impact": with_wins - without_wins,
        "p_value": p_value,
        "significant": p_value < 0.05
    }

def replacement_level_analysis(player_stats: pd.DataFrame,
                              injured_player: str,
                              replacement: str,
                              stat_cols: list) -> pd.DataFrame:
    """Build a stat-by-stat comparison of an injured player and a replacement.

    Args:
        player_stats: Table with a ``player`` column and the stat columns.
        injured_player: Name looked up in ``player``; the first matching row
            is used.
        replacement: Name of the replacement, looked up the same way.
        stat_cols: Stat columns to compare.

    Returns:
        DataFrame with columns ``Stat``, ``Injured``, ``Replacement`` and
        ``Difference`` (injured minus replacement).

    Raises:
        IndexError: If either player has no row in ``player_stats``.
    """
    def _first_row_values(name: str):
        rows = player_stats.loc[player_stats["player"] == name, stat_cols]
        return rows.iloc[0].values

    injured_values = _first_row_values(injured_player)
    replacement_values = _first_row_values(replacement)

    return pd.DataFrame({
        "Stat": stat_cols,
        "Injured": injured_values,
        "Replacement": replacement_values,
        "Difference": injured_values - replacement_values,
    })

python Basketball

Interactive Plotly Visualizations

Create interactive sports visualizations using Plotly.

"""Interactive sports visualizations with Plotly."""
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

def create_player_comparison(players_df: pd.DataFrame, metrics: list,
                            player_col: str = "player_name") -> go.Figure:
    """Draw one filled radar (polar) trace per player.

    Args:
        players_df: One row per player, holding ``player_col`` and every
            column named in ``metrics``.
        metrics: Columns plotted around the radar, in order.
        player_col: Column supplying each trace's legend name.

    Returns:
        Figure whose radial axis is fixed to the range 0-100.
    """
    fig = go.Figure()

    for _, row in players_df.iterrows():
        radar_trace = go.Scatterpolar(
            r=[row[metric] for metric in metrics],
            theta=metrics,
            fill="toself",
            name=row[player_col],
        )
        fig.add_trace(radar_trace)

    radial_axis = dict(visible=True, range=[0, 100])
    fig.update_layout(
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
        title="Player Comparison",
    )
    return fig

def create_timeline(df: pd.DataFrame, date_col: str, value_col: str,
                   group_col: str = None) -> go.Figure:
    """Plot a value over time as a line chart with a range slider.

    Args:
        df: Source data.
        date_col: Column used for the x-axis.
        value_col: Column used for the y-axis (also used in the title).
        group_col: Optional column; when truthy, one coloured line is drawn
            per distinct value.
    """
    line_kwargs = {"x": date_col, "y": value_col, "title": f"{value_col} Over Time"}
    if group_col:
        line_kwargs["color"] = group_col

    fig = px.line(df, **line_kwargs)
    fig.update_xaxes(rangeslider_visible=True)
    return fig

def create_scatter_with_hover(df: pd.DataFrame, x: str, y: str,
                             hover_data: list = None) -> go.Figure:
    """Build a scatter plot with an OLS trendline and extra hover fields.

    Args:
        df: Source data.
        x: Column for the horizontal axis.
        y: Column for the vertical axis.
        hover_data: Optional columns passed to Plotly as ``hover_data``.
    """
    chart_title = f"{y} vs {x}"
    return px.scatter(
        df,
        x=x,
        y=y,
        hover_data=hover_data,
        trendline="ols",
        title=chart_title,
    )

def create_heatmap(df: pd.DataFrame, x: str, y: str,
                  value: str) -> go.Figure:
    """Build an annotated heatmap of ``value`` pivoted by ``y`` and ``x``.

    Args:
        df: Long-format data.
        x: Column whose values become the heatmap columns.
        y: Column whose values become the heatmap rows.
        value: Column aggregated into each cell by ``pivot_table`` and shown
            rounded to two decimals.
    """
    grid = df.pivot_table(index=y, columns=x, values=value)
    cell_values = grid.values

    heatmap = go.Heatmap(
        z=cell_values,
        x=grid.columns,
        y=grid.index,
        colorscale="RdYlGn",
        text=cell_values.round(2),
        texttemplate="%{text}",
        textfont={"size": 10},
    )

    fig = go.Figure(data=heatmap)
    fig.update_layout(title=f"{value} by {x} and {y}")
    return fig

def create_dashboard(df: pd.DataFrame, metrics: list) -> go.Figure:
    """Create a 2x2 subplot grid with gauge indicators in the top row.

    Only the two top-row panels are populated: each shows the mean of one of
    the first two metrics on a gauge spanning that column's min..max. The
    bottom-row bar and scatter panels are created but receive no traces here.

    Args:
        df: Source data containing the metric columns.
        metrics: Metric column names; the first four become subplot titles.
    """
    panel_specs = [
        [{"type": "indicator"}, {"type": "indicator"}],
        [{"type": "bar"}, {"type": "scatter"}],
    ]
    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=metrics[:4],
        specs=panel_specs,
    )

    for grid_col, metric in enumerate(metrics[:2], start=1):
        gauge = go.Indicator(
            mode="gauge+number",
            value=df[metric].mean(),
            title={"text": metric},
            gauge={"axis": {"range": [df[metric].min(), df[metric].max()]}},
        )
        fig.add_trace(gauge, row=1, col=grid_col)

    return fig

# Example usage:
# fig = create_player_comparison(top_players, ["Points", "Assists", "Rebounds"])
# fig.show()

r Baseball

R Data Wrangling for Sports

Common R data wrangling operations for sports data.

# Common R data wrangling operations for sports
library(dplyr)
library(tidyr)
library(lubridate)

# Calculate per-game stats
#
# Adds one "<stat>_per_game" column for every column named in
# `counting_stats`, each computed as the stat divided by the games column.
#
#   df              data frame holding the counting stats and the games column
#   counting_stats  character vector of column names to convert
#   games_col       name of the games-played column (default "G")
per_game_stats <- function(df, counting_stats, games_col = "G") {
  df %>%
    # .names controls the output column names, e.g. "HR" -> "HR_per_game"
    mutate(across(all_of(counting_stats), ~ . / .data[[games_col]], .names = "{col}_per_game"))
}

# Rolling averages
#
# Adds a "<stat>_MA<w>" column for every combination of stat column and
# window size, computed within each group.
#
#   df         data frame; must contain a `date` column (used for ordering)
#   stat_cols  character vector of columns to average
#   windows    window sizes (default 5 and 10)
#   group_col  name of the grouping column (default "player_id")
#
# Requires the zoo package to be installed (called as zoo::rollmean).
add_rolling_stats <- function(df, stat_cols, windows = c(5, 10), group_col = "player_id") {
  # Order rows by group, then date, before computing the rolling windows
  df <- df %>% arrange(.data[[group_col]], date)

  for (w in windows) {
    for (col in stat_cols) {
      new_col <- paste0(col, "_MA", w)
      df <- df %>%
        group_by(.data[[group_col]]) %>%
        # align = "right" with fill = NA: presumably a trailing window, with NA
        # where a full window is not available - confirm against zoo docs
        mutate(!!new_col := zoo::rollmean(.data[[col]], k = w, fill = NA, align = "right")) %>%
        ungroup()
    }
  }
  df
}

# Lag features for modeling
#
# Adds a "<stat>_lag<l>" column for every combination of stat column and lag,
# computed within each group.
#
#   df         data frame; must contain a `date` column (used for ordering)
#   stat_cols  character vector of columns to lag
#   lags       lag distances in rows (default 1, 2, 3)
#   group_col  name of the grouping column (default "player_id")
create_lag_features <- function(df, stat_cols, lags = 1:3, group_col = "player_id") {
  # Order rows by group, then date, so a lag of l means "l rows earlier"
  df <- df %>% arrange(.data[[group_col]], date)

  for (l in lags) {
    for (col in stat_cols) {
      new_col <- paste0(col, "_lag", l)
      df <- df %>%
        group_by(.data[[group_col]]) %>%
        # NOTE(review): `lag` is presumably dplyr::lag (dplyr is attached
        # above); stats::lag would behave differently - confirm load order
        mutate(!!new_col := lag(.data[[col]], l)) %>%
        ungroup()
    }
  }
  df
}

# Pivot stats long to wide
#
# Thin wrapper around tidyr::pivot_wider that takes column names as strings.
#
#   df         long-format data frame
#   stat_col   name of the column whose values become the new column names
#   value_col  name of the column supplying the cell values
#   id_cols    character vector of columns identifying each output row
pivot_stats_wide <- function(df, stat_col, value_col, id_cols) {
  df %>%
    pivot_wider(
      id_cols = all_of(id_cols),
      names_from = all_of(stat_col),
      values_from = all_of(value_col)
    )
}

# Calculate year-over-year change
#
# Adds a "<stat>_yoy_change" column for every column in `stat_cols`: the
# value minus the value on the previous row of the same group, after sorting
# by group and year. The first row of each group is NA.
# NOTE(review): the difference is against the previous *row*, so a gap in
# seasons is not detected - confirm this is acceptable for the data.
#
#   df         data frame with one row per group and season
#   stat_cols  character vector of columns to difference
#   year_col   name of the season column (default "season")
#   group_col  name of the grouping column (default "player_id")
yoy_change <- function(df, stat_cols, year_col = "season", group_col = "player_id") {
  df %>%
    arrange(.data[[group_col]], .data[[year_col]]) %>%
    group_by(.data[[group_col]]) %>%
    mutate(across(all_of(stat_cols),
                  ~ . - lag(.),
                  .names = "{col}_yoy_change")) %>%
    ungroup()
}

sql Baseball

Common Sports SQL Queries

Useful SQL queries for sports databases.

-- Common SQL queries for sports analytics

-- Get player career stats with rankings
-- One output row per player (grouped by player_id and player_name).
-- NOTE(review): ppg is the average over rows of player_game_stats, which is
-- points per game only if the table holds one row per player per game - confirm.
SELECT
    player_name,
    SUM(points) as career_points,
    AVG(points) as ppg,
    COUNT(DISTINCT season) as seasons,
    -- Window functions run after GROUP BY/HAVING, so the rank is among the
    -- players that pass the 100-row filter below
    RANK() OVER (ORDER BY SUM(points) DESC) as points_rank
FROM player_game_stats
GROUP BY player_id, player_name
-- Keep only players with at least 100 rows
HAVING COUNT(*) >= 100
ORDER BY career_points DESC;

-- Calculate team win percentage by month
-- One output row per team and calendar month of game_date.
-- NOTE(review): DATE_TRUNC is PostgreSQL-style syntax and `won` is used as a
-- boolean column - confirm both against the target database.
SELECT
    team,
    DATE_TRUNC('month', game_date) as month,
    COUNT(*) as games,
    SUM(CASE WHEN won THEN 1 ELSE 0 END) as wins,
    -- 1.0/0.0 keep the average in decimal arithmetic; rounded to 3 places
    ROUND(AVG(CASE WHEN won THEN 1.0 ELSE 0.0 END), 3) as win_pct
FROM games
GROUP BY team, DATE_TRUNC('month', game_date)
ORDER BY team, month;

-- Find players with hot streaks: 5+ consecutive games scoring more than
-- 20% above the player's own average fantasy points
WITH player_avg AS (
    -- Per-player baseline over every row in daily_stats
    SELECT player_id, AVG(fantasy_points) as avg_pts
    FROM daily_stats
    GROUP BY player_id
),
streaks AS (
    SELECT
        d.player_id,
        d.game_date,
        d.fantasy_points,
        d.fantasy_points > p.avg_pts * 1.2 as hot,
        -- Running count of non-hot games per player, in date order. The count
        -- stays constant across a run of consecutive hot games, so it serves
        -- as an identifier for that run.
        SUM(CASE WHEN d.fantasy_points > p.avg_pts * 1.2 THEN 0 ELSE 1 END)
            OVER (PARTITION BY d.player_id ORDER BY d.game_date) as streak_group
    FROM daily_stats d
    JOIN player_avg p ON d.player_id = p.player_id
)
-- Only hot games are counted; each (player, streak_group) is one streak
SELECT player_id, MIN(game_date) as streak_start, COUNT(*) as streak_length
FROM streaks
WHERE hot
GROUP BY player_id, streak_group
HAVING COUNT(*) >= 5;

-- Head-to-head record between teams
-- Results are grouped by (home_team, away_team), so each pairing appears once
-- per venue: A hosting B and B hosting A are separate rows.
-- Games with equal scores count toward `games` but toward neither win column.
SELECT
    home_team,
    away_team,
    COUNT(*) as games,
    SUM(CASE WHEN home_score > away_score THEN 1 ELSE 0 END) as home_wins,
    SUM(CASE WHEN away_score > home_score THEN 1 ELSE 0 END) as away_wins,
    AVG(home_score + away_score) as avg_total
FROM games
-- Restricted to the 2024 season
WHERE season = 2024
GROUP BY home_team, away_team;

python Baseball

Unit Conversion Utilities

Convert between different units commonly used in sports.

"""Sports unit conversion utilities."""

# Speed conversions
def mph_to_kph(mph: float) -> float:
    """Convert a speed from miles per hour to kilometers per hour."""
    km_per_mile = 1.60934
    return mph * km_per_mile

def kph_to_mph(kph: float) -> float:
    """Convert a speed from kilometers per hour to miles per hour."""
    km_per_mile = 1.60934
    return kph / km_per_mile

# Distance conversions
def feet_to_meters(feet: float) -> float:
    """Convert a length from feet to meters."""
    meters_per_foot = 0.3048
    return feet * meters_per_foot

def meters_to_feet(meters: float) -> float:
    """Convert a length from meters to feet."""
    meters_per_foot = 0.3048
    return meters / meters_per_foot

def yards_to_meters(yards: float) -> float:
    """Convert a length from yards to meters."""
    meters_per_yard = 0.9144
    return yards * meters_per_yard

# Weight conversions
def lbs_to_kg(lbs: float) -> float:
    """Convert a weight from pounds to kilograms."""
    kg_per_pound = 0.453592
    return lbs * kg_per_pound

def kg_to_lbs(kg: float) -> float:
    """Convert a weight from kilograms to pounds."""
    kg_per_pound = 0.453592
    return kg / kg_per_pound

# Height conversions
def inches_to_cm(inches: float) -> float:
    """Convert a length from inches to centimeters."""
    cm_per_inch = 2.54
    return inches * cm_per_inch

def height_string_to_inches(height: str) -> int:
    """Convert a feet/inches height string to total inches.

    Accepts a feet value, a ``'`` or ``-`` separator and an optional inches
    value, with optional whitespace around each part: ``6-2``, ``6'2``,
    ``6' 2`` and ``6'`` are all understood. Anything after the inches digits
    (such as a trailing double quote) is ignored.

    Args:
        height: Height text to parse.

    Returns:
        Total inches, or 0 when the input is not a string or does not start
        with a recognisable height.
    """
    import re

    # Missing values (None, NaN) would make re.match raise TypeError.
    if not isinstance(height, str):
        return 0

    match = re.match(r"\s*(\d+)\s*['\-]\s*(\d+)?", height)
    if match is None:
        return 0

    feet = int(match.group(1))
    # The inches group is optional so that feet-only heights still parse.
    inches = int(match.group(2)) if match.group(2) else 0
    return feet * 12 + inches

# Time conversions
def min_sec_to_decimal(minutes: int, seconds: int) -> float:
    """Express a minutes:seconds duration as decimal minutes."""
    fractional_minutes = seconds / 60
    return minutes + fractional_minutes

def pace_to_speed(pace_min_per_mile: float) -> float:
    """Convert a running pace in minutes per mile to a speed in mph.

    Raises:
        ZeroDivisionError: If the pace is 0.
    """
    minutes_per_hour = 60
    return minutes_per_hour / pace_min_per_mile

python Baseball

Date and Schedule Utilities

Utility functions for working with sports schedules and dates.

"""Sports schedule and date utilities."""
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Tuple

def get_week_of_season(date: datetime, season_start: datetime) -> int:
    """Return the 1-based week of the season that ``date`` falls in.

    The first seven days from ``season_start`` are week 1. Dates before the
    start produce values of 0 or below.
    """
    elapsed = date - season_start
    full_weeks, _ = divmod(elapsed.days, 7)
    return full_weeks + 1

def parse_game_time(time_str: str, timezone: str = "ET") -> datetime:
    """Parse a clock-time string into a timezone-aware datetime.

    Accepted formats are ``"7:05 PM"``, ``"19:05"`` and ``"7:05PM"``. Only
    the time is parsed, so the date part is ``strptime``'s default.

    Args:
        time_str: Time text to parse.
        timezone: One of "ET", "CT", "MT" or "PT"; any other value falls
            back to Eastern time.

    Raises:
        ValueError: If ``time_str`` matches none of the accepted formats.
    """
    import pytz

    zone_names = {
        "ET": "America/New_York",
        "CT": "America/Chicago",
        "MT": "America/Denver",
        "PT": "America/Los_Angeles"
    }
    zone = pytz.timezone(zone_names.get(timezone, "America/New_York"))

    # Try each known format in turn; the first that parses wins.
    for pattern in ("%I:%M %p", "%H:%M", "%I:%M%p"):
        try:
            naive_time = datetime.strptime(time_str, pattern)
        except ValueError:
            continue
        return zone.localize(naive_time)

    raise ValueError(f"Cannot parse time: {time_str}")

def calculate_rest_days(schedule: pd.DataFrame, team_col: str,
                       date_col: str) -> pd.DataFrame:
    """Add a ``rest_days`` column: days since the same team's previous game.

    Args:
        schedule: One row per team-game; ``date_col`` must be datetime-typed.
        team_col: Column identifying the team.
        date_col: Column holding the game date.

    Returns:
        A date-sorted copy of ``schedule`` with ``rest_days`` added. A team's
        first game has NaN; games on consecutive days have 1.
    """
    ordered = schedule.sort_values(date_col)
    gap_since_previous = ordered.groupby(team_col)[date_col].diff()
    ordered["rest_days"] = gap_since_previous.dt.days
    return ordered

def find_back_to_backs(schedule: pd.DataFrame, team: str) -> pd.DataFrame:
    """Return the games a team played exactly one day after its previous game.

    Args:
        schedule: Games with ``home_team``, ``away_team`` and a
            datetime-typed ``date`` column.
        team: Team to look up on either side of the matchup.

    Returns:
        The second game of each back-to-back pair, sorted by date, with an
        ``is_b2b`` column that is True on every returned row.
    """
    is_home = schedule["home_team"].eq(team)
    is_away = schedule["away_team"].eq(team)
    team_games = schedule.loc[is_home | is_away].sort_values("date")

    days_since_previous = team_games["date"].diff().dt.days
    team_games["is_b2b"] = days_since_previous == 1
    return team_games.loc[team_games["is_b2b"]]

def generate_playoff_bracket(teams: List[str], format: str = "single") -> dict:
    """Lay out a seeded bracket as a list of rounds.

    The first round pairs the list from the outside in (first vs last,
    second vs second-to-last, ...). Each later round pairs placeholder
    labels "Winner G1" vs "Winner G2", etc., with numbering restarting every
    round. With an odd number of teams or games, the unpaired middle entry
    is left out.

    Args:
        teams: Team names in seed order.
        format: Stored unchanged in the result; it does not alter the layout.

    Returns:
        ``{"rounds": [...], "format": format}`` where each round is a list
        of two-element matchups.
    """
    half = len(teams) // 2
    opening_round = [
        [high_seed, low_seed]
        for high_seed, low_seed in zip(teams[:half], reversed(teams))
    ]

    rounds = [opening_round]
    games_in_round = len(opening_round)
    while games_in_round > 1:
        games_in_round //= 2
        rounds.append([
            [f"Winner G{2 * game + 1}", f"Winner G{2 * game + 2}"]
            for game in range(games_in_round)
        ])

    return {"rounds": rounds, "format": format}

python Volleyball

Volleyball Rally Analysis

Analyze volleyball rally patterns and point sequences.

"""Volleyball rally analysis."""
import pandas as pd
import numpy as np

class VolleyballAnalysis:
    """Analyze volleyball match and rally statistics."""

    @staticmethod
    def sideout_percentage(points_on_receive: int, opponent_serves: int) -> float:
        """Return the share of opponent serves on which the point was won.

        Returns 0 when there were no opponent serves.
        """
        if opponent_serves == 0:
            return 0
        return points_on_receive / opponent_serves

    @staticmethod
    def kill_percentage(kills: int, errors: int, attempts: int) -> float:
        """Return kills as a share of attack attempts.

        Args:
            kills: Number of kills.
            errors: Not used in this statistic (errors only enter hitting
                efficiency); kept so existing callers keep working.
            attempts: Total attack attempts.

        Returns:
            ``kills / attempts``, or 0 when there were no attempts.
        """
        if attempts == 0:
            return 0
        return kills / attempts

    @staticmethod
    def efficiency(kills: int, errors: int, total_attempts: int) -> float:
        """Return hitting efficiency: ``(kills - errors) / total_attempts``.

        Returns 0 when there were no attempts.
        """
        if total_attempts == 0:
            return 0
        return (kills - errors) / total_attempts

    @staticmethod
    def passing_rating(passes: pd.DataFrame) -> float:
        """Return the mean pass quality on a 0-3 scale.

        Args:
            passes: Table with a ``quality`` column holding "perfect" (3),
                "good" (2), "ok" (1) or "error" (0). Other labels are
                ignored by the mean.

        The input frame is not modified.
        """
        weights = {"perfect": 3, "good": 2, "ok": 1, "error": 0}
        # Map into a separate Series so the caller's DataFrame keeps its columns.
        return passes["quality"].map(weights).mean()

    @staticmethod
    def serve_receive_analysis(rallies: pd.DataFrame) -> dict:
        """Summarize rallies flagged as serve-receive.

        Args:
            rallies: Table with a boolean ``serve_receive`` column plus
                ``point_won`` and ``touches`` columns.

        Returns:
            Dict with the mean of ``point_won`` on serve-receive rallies
            (``sideout_pct``), their mean ``touches`` (``avg_rally_length``)
            and the mean of ``point_won`` on serve-receive rallies with
            exactly three touches (``first_ball_kill_pct``).
        """
        on_receive = rallies["serve_receive"]
        receive_rallies = rallies[on_receive]
        three_touch = rallies[on_receive & (rallies["touches"] == 3)]
        return {
            "sideout_pct": receive_rallies["point_won"].mean(),
            "avg_rally_length": receive_rallies["touches"].mean(),
            "first_ball_kill_pct": three_touch["point_won"].mean()
        }

python Basketball

Player Archetype Clustering

Cluster players into archetypes based on statistical profiles.

"""Cluster players into archetypes."""
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

class PlayerArchetypes:
    """Group players into archetypes with k-means on standardized stats."""

    def __init__(self, n_clusters: int = 8):
        """Create the scaler, the k-means model and a 2-component PCA.

        Args:
            n_clusters: Number of archetypes (k-means clusters).
        """
        self.n_clusters = n_clusters
        self.scaler = StandardScaler()
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.pca = PCA(n_components=2)

    def _scaled_features(self, df: pd.DataFrame, stat_cols: list):
        """Return the stat columns (NaN replaced by 0) through the fitted scaler."""
        return self.scaler.transform(df[stat_cols].fillna(0))

    def fit(self, df: pd.DataFrame, stat_cols: list) -> "PlayerArchetypes":
        """Fit the scaler, then k-means and PCA on the scaled stats.

        Missing stat values are treated as 0. Returns ``self``.
        """
        features = df[stat_cols].fillna(0)
        scaled = self.scaler.fit_transform(features)

        for estimator in (self.kmeans, self.pca):
            estimator.fit(scaled)

        return self

    def assign_archetypes(self, df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
        """Return a copy of ``df`` with an ``archetype`` cluster-label column.

        Must be called after ``fit``.
        """
        labelled = df.copy()
        labelled["archetype"] = self.kmeans.predict(
            self._scaled_features(labelled, stat_cols)
        )
        return labelled

    def describe_archetypes(self, df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
        """Return the mean of each stat per archetype.

        ``df`` must already carry the ``archetype`` column.
        """
        per_archetype = df.groupby("archetype")
        return per_archetype[stat_cols].mean()

    def plot_archetypes(self, df: pd.DataFrame, stat_cols: list):
        """Scatter the players on the two fitted principal components.

        Points are coloured by the ``archetype`` column of ``df``. Returns
        the ``matplotlib.pyplot`` module so the caller can show or save.
        """
        projected = self.pca.transform(self._scaled_features(df, stat_cols))
        pc1, pc2 = projected[:, 0], projected[:, 1]

        plt.figure(figsize=(10, 8))
        points = plt.scatter(pc1, pc2,
                             c=df["archetype"], cmap="tab10", alpha=0.6)
        plt.colorbar(points, label="Archetype")
        plt.xlabel("PC1")
        plt.ylabel("PC2")
        plt.title("Player Archetypes")
        return plt

# Usage
# stat_cols = ["pts", "reb", "ast", "stl", "blk"]
# archetypes = PlayerArchetypes(n_clusters=6)
# archetypes.fit(players, stat_cols)
# players = archetypes.assign_archetypes(players, stat_cols)

python Baseball

Player Performance Trends

Analyze player performance trends over time.

"""Analyze player performance trends over time."""
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression

def calculate_rolling_stats(df: pd.DataFrame, stat_col: str,
                           windows: list = None) -> pd.DataFrame:
    """Add rolling mean and standard deviation columns for a statistic.

    Args:
        df: Data in the order the rolling windows should follow.
        stat_col: Column to smooth.
        windows: Window sizes in rows; defaults to ``[5, 10, 20]``.

    Returns:
        A copy of ``df`` with ``<stat_col>_MA<w>`` and ``<stat_col>_STD<w>``
        columns for each window. Windows use ``min_periods=1``, so early
        rows are averaged over the rows available so far.
    """
    # None sentinel instead of a mutable list default.
    if windows is None:
        windows = [5, 10, 20]

    df = df.copy()
    for w in windows:
        rolling = df[stat_col].rolling(w, min_periods=1)
        df[f"{stat_col}_MA{w}"] = rolling.mean()
        df[f"{stat_col}_STD{w}"] = rolling.std()
    return df

def detect_trend(values: np.ndarray) -> dict:
    """Detect a trend in an ordered series of values.

    Fits a straight line against the observation index and runs Kendall's
    tau between the index and the values.

    Args:
        values: Observations in time order.

    Returns:
        Dict with the fitted ``slope``, the fit's ``r_squared``,
        ``kendall_tau`` with its ``p_value``, and ``trend``: "increasing" or
        "decreasing" when the slope has that sign and the p-value is below
        0.05, otherwise "stable".
    """
    # The return annotation is the builtin ``dict``: ``typing.Dict`` is not
    # imported by this snippet.
    y = np.asarray(values)
    x = np.arange(len(y)).reshape(-1, 1)

    model = LinearRegression()
    model.fit(x, y)

    slope = model.coef_[0]
    r_squared = model.score(x, y)

    # Rank-based trend test: Kendall's tau of the values against time order
    tau, p_value = stats.kendalltau(x.flatten(), y)

    significant = p_value < 0.05
    if slope > 0 and significant:
        trend = "increasing"
    elif slope < 0 and significant:
        trend = "decreasing"
    else:
        trend = "stable"

    return {
        "slope": slope,
        "r_squared": r_squared,
        "kendall_tau": tau,
        "p_value": p_value,
        "trend": trend
    }

def identify_hot_cold_streaks(df: pd.DataFrame, stat_col: str,
                              threshold: float = 1.5) -> pd.DataFrame:
    """Label each row "hot", "cold" or "normal" by its z-score.

    Args:
        df: Source data.
        stat_col: Column to score against its own mean and standard deviation.
        threshold: Z-score beyond which a row is "hot" (above) or "cold"
            (below the negative threshold).

    Returns:
        A copy of ``df`` with ``z_score`` and ``streak_type`` columns.
    """
    result = df.copy()
    column = result[stat_col]
    z_scores = (column - column.mean()) / column.std()
    result["z_score"] = z_scores

    # Apply "cold" first, then let "hot" take precedence where it applies.
    labels = np.where(result["z_score"] < -threshold, "cold", "normal")
    labels = np.where(result["z_score"] > threshold, "hot", labels)
    result["streak_type"] = labels
    return result

python Baseball

Seasonal Decomposition

Decompose sports statistics into trend, seasonal, and residual components.

"""Seasonal decomposition for sports analytics."""
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

def decompose_season_stats(df: pd.DataFrame, stat_col: str,
                          period: int = 7) -> dict:
    """Decompose a statistic into trend, seasonal and residual components.

    Args:
        df: Data indexed by date, or carrying a ``date`` column.
        stat_col: Numeric column to decompose.
        period: Length of one seasonal cycle in days.

    Returns:
        Dict with the ``trend``, ``seasonal``, ``residual`` and ``observed``
        series of an additive decomposition.
    """
    # Ensure datetime index (also converts string dates so resampling works)
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.set_index("date")
        df.index = pd.to_datetime(df.index)

    # Resample only the requested column: averaging the whole frame fails
    # when other columns (names, teams, ...) are not numeric.
    # Daily resampling plus interpolation fills in dates without a value.
    series = df[stat_col].resample("D").mean().interpolate()

    decomposition = seasonal_decompose(series, model="additive", period=period)

    return {
        "trend": decomposition.trend,
        "seasonal": decomposition.seasonal,
        "residual": decomposition.resid,
        "observed": decomposition.observed
    }

def test_stationarity(series: pd.Series) -> dict:
    """Run the Augmented Dickey-Fuller test on a series.

    Missing values are dropped before testing.

    Returns:
        Dict with the test statistic, its p-value, ``is_stationary`` (true
        when the p-value is below 0.05) and the critical values.
    """
    observed = series.dropna()
    adf_output = adfuller(observed)
    p_value = adf_output[1]

    return {
        "adf_statistic": adf_output[0],
        "p_value": p_value,
        "is_stationary": p_value < 0.05,
        "critical_values": adf_output[4],
    }

def forecast_stat(df: pd.DataFrame, stat_col: str,
                 periods: int = 10, seasonal_periods: int = 7) -> pd.DataFrame:
    """Forecast a statistic with additive Holt-Winters exponential smoothing.

    Args:
        df: Historical data in time order.
        stat_col: Column to forecast.
        periods: Number of future steps to forecast.
        seasonal_periods: Length of one seasonal cycle in rows.

    Returns:
        DataFrame with ``forecast``, ``lower`` and ``upper`` columns. The
        band is forecast +/- 1.96 in-sample residual standard deviations; it
        has constant width and is only a rough approximation.
    """
    from statsmodels.tsa.holtwinters import ExponentialSmoothing

    series = df[stat_col]
    model = ExponentialSmoothing(series, trend="add", seasonal="add",
                                 seasonal_periods=seasonal_periods)
    fitted = model.fit()
    forecast = fitted.forecast(periods)

    # ``sse`` is a sum of squared errors; turn it into a standard deviation
    # before scaling, otherwise the band grows with the sample size.
    residual_std = np.sqrt(fitted.sse / len(series))
    margin = 1.96 * residual_std

    return pd.DataFrame({
        "forecast": forecast,
        "lower": forecast - margin,
        "upper": forecast + margin
    })

python Baseball

Handle Missing Sports Data

Strategies for handling missing values in sports datasets.

"""Handle missing data in sports datasets."""
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

def analyze_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing values per column.

    Returns:
        DataFrame indexed by column name with the count (``Missing``) and
        percentage (``Percent``) of null values, most-missing first.
    """
    null_counts = df.isna().sum()
    summary = pd.DataFrame({
        "Missing": null_counts,
        "Percent": (null_counts / len(df)) * 100,
    })
    return summary.sort_values("Percent", ascending=False)

def impute_stats(df: pd.DataFrame, method: str = "mean") -> pd.DataFrame:
    """Impute missing values in the numeric columns.

    Args:
        df: Data to impute; it is not modified.
        method: "mean", "median" or "knn" (5 nearest neighbours).

    Returns:
        A copy of ``df`` whose numeric columns have been imputed. Non-numeric
        columns are left untouched.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ("mean", "median", "knn"):
        raise ValueError(f"Unknown method: {method}")

    # Work on a copy so the caller's frame keeps its missing values.
    df = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Nothing to impute; the imputers reject an empty column set.
        return df

    if method == "knn":
        imputer = KNNImputer(n_neighbors=5)
    else:
        imputer = SimpleImputer(strategy=method)

    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    return df

def fill_with_career_avg(df: pd.DataFrame, player_col: str, stat_cols: list) -> pd.DataFrame:
    """Replace missing stats with the same player's mean for that stat.

    Args:
        df: Data with one or more rows per player; it is not modified.
        player_col: Column identifying the player.
        stat_cols: Stat columns to fill.

    Returns:
        A copy of ``df`` with the gaps filled. Values stay missing when a
        player has no observed value for the stat.
    """
    result = df.copy()
    for stat in stat_cols:
        per_player_mean = result.groupby(player_col)[stat].transform("mean")
        result[stat] = result[stat].fillna(per_player_mean)
    return result

# Example
# df = analyze_missing(player_stats)
# df = impute_stats(player_stats, method="knn")

python Baseball

Merge Multiple Data Sources

Combine data from multiple sources with proper matching.

"""Merge sports data from multiple sources."""
import pandas as pd
from fuzzywuzzy import fuzz, process
from typing import Tuple

def fuzzy_merge(df1: pd.DataFrame, df2: pd.DataFrame,
                key1: str, key2: str, threshold: int = 85) -> pd.DataFrame:
    """Merge DataFrames using fuzzy string matching on names.

    Each distinct name in ``df1[key1]`` is paired with its best-scoring
    candidate from ``df2[key2]``; pairs scoring below ``threshold`` are
    dropped.

    Args:
        df1: Left table.
        df2: Right table.
        key1: Name column in ``df1``.
        key2: Name column in ``df2``.
        threshold: Minimum match score for a pair to be kept.

    Returns:
        The rows of ``df1`` that found a match, joined to their ``df2`` rows,
        with ``name1``, ``name2`` and ``score`` columns describing the match.
        Empty when nothing matches.
    """
    # Build the candidate list once; missing names cannot be matched.
    candidates = df2[key2].dropna().unique().tolist()

    matches = []
    for name in df1[key1].dropna().unique():
        match = process.extractOne(name, candidates, score_cutoff=threshold)
        if match:
            matches.append({"name1": name, "name2": match[0], "score": match[1]})

    # Explicit columns keep the merges below valid when nothing matched.
    match_df = pd.DataFrame(matches, columns=["name1", "name2", "score"])
    df1_matched = df1.merge(match_df, left_on=key1, right_on="name1")
    return df1_matched.merge(df2, left_on="name2", right_on=key2)

def standardize_team_names(df: pd.DataFrame, team_col: str,
                          mapping: dict) -> pd.DataFrame:
    """Rewrite team names through a lookup table.

    Args:
        df: Source data; it is not modified.
        team_col: Column holding the team names.
        mapping: Source name -> standard name. Names without an entry are
            kept as they are.

    Returns:
        A copy of ``df`` with ``team_col`` standardized.
    """
    result = df.copy()
    current_names = result[team_col]
    translated = current_names.map(mapping)
    result[team_col] = translated.fillna(current_names)
    return result

def merge_with_validation(df1: pd.DataFrame, df2: pd.DataFrame,
                         on: list) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Outer-merge two tables and split the result by match status.

    Args:
        df1: Left table.
        df2: Right table.
        on: Key column(s) to merge on.

    Returns:
        ``(matched, unmatched)``: rows present in both inputs (without the
        ``_merge`` indicator), and rows present in only one input (with
        ``_merge`` kept to show which side they came from).
    """
    combined = df1.merge(df2, on=on, how="outer", indicator=True)
    in_both = combined["_merge"] == "both"

    matched = combined[in_both].drop(columns="_merge")
    unmatched = combined[~in_both]
    return matched, unmatched

python Basketball

Normalize Player Statistics

Normalize and standardize player statistics for comparison.

"""Normalize sports statistics for analysis."""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def per_game_normalize(df: pd.DataFrame, stat_cols: list, games_col: str = "G") -> pd.DataFrame:
    """Add per-game versions of counting stats.

    Args:
        df: Source data; it is not modified.
        stat_cols: Counting-stat columns to convert.
        games_col: Column holding games played.

    Returns:
        A copy of ``df`` with a ``<stat>_per_game`` column for each stat.
    """
    result = df.copy()
    for stat in stat_cols:
        result[f"{stat}_per_game"] = result[stat].div(result[games_col])
    return result

def per_100_possessions(df: pd.DataFrame, stats: list, poss_col: str) -> pd.DataFrame:
    """Add per-100-possession versions of stats.

    Args:
        df: Source data; it is not modified.
        stats: Stat columns to convert.
        poss_col: Column holding the possession count.

    Returns:
        A copy of ``df`` with a ``<stat>_per100`` column for each stat.
    """
    result = df.copy()
    for stat in stats:
        per_possession = result[stat].div(result[poss_col])
        result[f"{stat}_per100"] = per_possession.mul(100)
    return result

def z_score_normalize(df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
    """Standardize stat columns with ``StandardScaler``.

    Args:
        df: Source data; it is not modified.
        stat_cols: Columns to standardize in place of their raw values.

    Returns:
        A copy of ``df`` with the listed columns replaced by scaled values.
    """
    result = df.copy()
    standardized = StandardScaler().fit_transform(result[stat_cols])
    result[stat_cols] = standardized
    return result

def percentile_rank(df: pd.DataFrame, stat_cols: list) -> pd.DataFrame:
    """Add percentile-rank columns (0-100] for each stat.

    Args:
        df: Source data; it is not modified.
        stat_cols: Columns to rank.

    Returns:
        A copy of ``df`` with a ``<stat>_pctl`` column for each stat, where
        the largest value ranks 100.
    """
    result = df.copy()
    for stat in stat_cols:
        fractional_rank = result[stat].rank(pct=True)
        result[f"{stat}_pctl"] = fractional_rank * 100
    return result

python Football

Monte Carlo Game Simulator

Simulate game outcomes using Monte Carlo methods.

"""Monte Carlo simulation for game outcomes."""
import numpy as np
import pandas as pd
from typing import Tuple, Dict

class GameSimulator:
    """Simulate sports game outcomes using Monte Carlo."""

    def __init__(self, n_sims: int = 10000):
        """Store the number of simulated games drawn per matchup."""
        self.n_sims = n_sims

    def simulate_game(self, home_mean: float, away_mean: float,
                     home_std: float = None, away_std: float = None) -> Dict:
        """Simulate one matchup ``n_sims`` times with normal score draws.

        Args:
            home_mean: Expected home score (or rating).
            away_mean: Expected away score (or rating).
            home_std: Standard deviation of the home score; defaults to 30%
                of the magnitude of ``home_mean``.
            away_std: Standard deviation of the away score; defaults to 30%
                of the magnitude of ``away_mean``.

        Returns:
            Dict with home and away win probabilities and the average home,
            away and combined scores across the simulations.
        """
        # abs(): ratings can be negative, and a negative standard deviation
        # makes np.random.normal raise ValueError.
        if home_std is None:
            home_std = abs(home_mean) * 0.3
        if away_std is None:
            away_std = abs(away_mean) * 0.3

        home_scores = np.random.normal(home_mean, home_std, self.n_sims)
        away_scores = np.random.normal(away_mean, away_std, self.n_sims)

        home_wins = (home_scores > away_scores).sum()
        ties = (home_scores == away_scores).sum()

        return {
            "home_win_prob": home_wins / self.n_sims,
            "away_win_prob": (self.n_sims - home_wins - ties) / self.n_sims,
            "avg_home_score": home_scores.mean(),
            "avg_away_score": away_scores.mean(),
            "avg_total": (home_scores + away_scores).mean()
        }

    def simulate_season(self, schedule: pd.DataFrame,
                       team_ratings: Dict[str, float]) -> pd.DataFrame:
        """Simulate one pass through a schedule and tally the standings.

        Args:
            schedule: One row per game with ``home`` and ``away`` columns.
            team_ratings: Rating per team; every scheduled team must have an
                entry. The home side gets 3 points added as home advantage.

        Returns:
            DataFrame indexed by team with ``W`` and ``L`` columns.
        """
        results = {team: {"W": 0, "L": 0} for team in team_ratings}

        for _, game in schedule.iterrows():
            home, away = game["home"], game["away"]
            result = self.simulate_game(
                team_ratings[home] + 3,  # Home advantage
                team_ratings[away]
            )
            # One random draw against the simulated probability picks the winner
            if np.random.random() < result["home_win_prob"]:
                winner, loser = home, away
            else:
                winner, loser = away, home
            results[winner]["W"] += 1
            results[loser]["L"] += 1

        return pd.DataFrame(results).T

python Baseball

Bootstrap Confidence Intervals

Calculate confidence intervals for sports statistics using bootstrap.

"""Bootstrap confidence intervals for sports statistics."""
import numpy as np
import pandas as pd
from typing import Tuple, Callable

def bootstrap_ci(data: np.ndarray, stat_func: Callable = np.mean,
                n_bootstrap: int = 10000, ci: float = 0.95) -> Tuple[float, float]:
    """Estimate a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: Observed sample.
        stat_func: Statistic computed on every resample.
        n_bootstrap: Number of resamples drawn with replacement.
        ci: Confidence level, e.g. 0.95.

    Returns:
        ``(lower, upper)`` percentile bounds of the resampled statistic.
    """
    sample_size = len(data)
    resampled_stats = [
        stat_func(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(n_bootstrap)
    ]

    # Split the excluded probability mass evenly between both tails.
    tail = (1 - ci) / 2
    lower = np.percentile(resampled_stats, tail * 100)
    upper = np.percentile(resampled_stats, (1 - tail) * 100)
    return lower, upper

def bootstrap_player_stat(df: pd.DataFrame, player: str, stat: str,
                         ci: float = 0.95) -> dict:
    """Bootstrap a confidence interval for one player's mean stat.

    Args:
        df: Game logs with a ``player`` column and the ``stat`` column.
        player: Player to select.
        stat: Column to analyse.
        ci: Confidence level passed to ``bootstrap_ci``.

    Returns:
        ``{"error": "Insufficient data"}`` when the player has fewer than 10
        rows; otherwise a dict with the player, stat, sample mean, interval
        bounds and sample size.
    """
    # The return annotation is the builtin ``dict``: ``typing.Dict`` is not
    # imported by this snippet.
    player_data = df[df["player"] == player][stat].values
    sample_size = len(player_data)

    if sample_size < 10:
        return {"error": "Insufficient data"}

    mean = player_data.mean()
    lower, upper = bootstrap_ci(player_data, np.mean, ci=ci)

    return {
        "player": player,
        "stat": stat,
        "mean": mean,
        "ci_lower": lower,
        "ci_upper": upper,
        "sample_size": sample_size
    }

# Example: Calculate 95% CI for batting average
# result = bootstrap_player_stat(batting_logs, "Mike Trout", "AVG")

python Football

Correlation Analysis for Stacking

Analyze player correlations for DFS game stacking strategies.

"""DFS Correlation Analysis for Game Stacking."""
import pandas as pd
import numpy as np
from scipy import stats
from typing import Dict, List, Tuple
import itertools

class CorrelationAnalyzer:
    """
    Analyze fantasy point correlations for stacking strategies.

    Key correlations:
    - QB-WR/TE (passing game)
    - RB-DEF (game script)
    - Bring-back (opposing players)

    Position-pair estimates come from the class-level tables below:
    ``sport="nfl"`` selects ``NFL_CORRELATIONS``; any other value selects
    ``NBA_CORRELATIONS``.
    """

    # Historical correlation estimates by position pair
    NFL_CORRELATIONS = {
        ("QB", "WR"): 0.35,
        ("QB", "TE"): 0.30,
        ("QB", "RB"): 0.15,
        ("WR", "WR"): 0.05,
        ("RB", "DST"): -0.10,
        ("QB", "opp_WR"): 0.10,  # Bring-back
        ("QB", "opp_RB"): 0.05,
    }

    NBA_CORRELATIONS = {
        ("PG", "C"): 0.20,  # Pick and roll
        ("SG", "SG"): -0.15,  # Same position, compete for shots
        ("PF", "C"): 0.10,
    }

    # Columns of the frame returned by find_optimal_stacks; used to build a
    # well-formed empty result when no stack can be assembled.
    _STACK_COLUMNS = [
        "Team", "Players", "Positions", "Projection", "Salary",
        "Avg_Ownership", "Correlation", "Stack_Ceiling", "Value",
    ]

    def __init__(self, sport: str = "nfl"):
        self.sport = sport
        self.correlations = self.NFL_CORRELATIONS if sport == "nfl" else self.NBA_CORRELATIONS

    def calculate_correlation(
        self,
        player1_scores: pd.Series,
        player2_scores: pd.Series
    ) -> Dict:
        """
        Calculate the Pearson correlation between two players' fantasy scores.

        The two series are aligned on their index and rows where either
        value is missing are dropped. With fewer than 5 paired rows the
        correlation and p-value are reported as NaN (and no "significant"
        key is included).

        Returns:
            Dict with "correlation", "p_value", "sample_size" and, when
            computed, "significant" (p-value below 0.05).
        """
        # Align data
        combined = pd.DataFrame({
            "p1": player1_scores,
            "p2": player2_scores
        }).dropna()

        if len(combined) < 5:
            return {"correlation": np.nan, "p_value": np.nan, "sample_size": len(combined)}

        corr, p_value = stats.pearsonr(combined["p1"], combined["p2"])

        return {
            "correlation": corr,
            "p_value": p_value,
            "sample_size": len(combined),
            "significant": p_value < 0.05
        }

    def build_correlation_matrix(
        self,
        game_logs: pd.DataFrame,
        players: List[str]
    ) -> pd.DataFrame:
        """
        Build a correlation matrix for a set of players.

        Args:
            game_logs: Long-format logs with "game_id", "player_name" and
                "fantasy_points" columns.
            players: Player names to include; names absent from the logs
                are silently skipped.

        Returns:
            Pairwise correlation matrix of per-game fantasy points.
        """
        # Pivot to get scores by game (first entry wins on duplicates)
        pivot = game_logs.pivot_table(
            index="game_id",
            columns="player_name",
            values="fantasy_points",
            aggfunc="first"
        )

        # Filter to specified players
        available = [p for p in players if p in pivot.columns]
        pivot = pivot[available]

        return pivot.corr()

    def get_stack_correlation(
        self,
        qb: str,
        pass_catchers: List[str],
        player_positions: Dict[str, str]
    ) -> float:
        """
        Calculate expected correlation for a stack.

        Args:
            qb: Quarterback name (not used in the lookup itself)
            pass_catchers: List of WR/TE names
            player_positions: Dict mapping player names to positions

        Returns:
            Mean of the ("QB", position) table entries over the pass
            catchers, or 0 when the list is empty. Unknown players are
            treated as "WR"; unknown position pairs fall back to 0.25.
        """
        total_corr = 0

        for player in pass_catchers:
            pos = player_positions.get(player, "WR")
            corr = self.correlations.get(("QB", pos), 0.25)
            total_corr += corr

        # Average correlation
        return total_corr / len(pass_catchers) if pass_catchers else 0

    def find_optimal_stacks(
        self,
        players: pd.DataFrame,
        correlation_matrix: pd.DataFrame = None,
        stack_size: int = 3
    ) -> pd.DataFrame:
        """
        Find optimal player stacks based on correlation and projection.

        For every team with at least ``stack_size`` players and a QB, the
        first listed QB is combined with each set of WR/TE teammates.

        Args:
            players: Player pool with "Name", "Position", "Team", "Salary",
                "Projection" and "Ownership" columns. Rows are looked up by
                index label, so the index is assumed to be unique.
            correlation_matrix: Pre-calculated correlations (currently
                unused; position-pair estimates are used instead)
            stack_size: Number of players in stack

        Returns:
            One row per stack, sorted by "Stack_Ceiling" descending. When
            no stack can be formed an empty frame with the same columns is
            returned.
        """
        stacks = []

        # Group by team
        teams = players.groupby("Team")

        for team, team_players in teams:
            if len(team_players) < stack_size:
                continue

            # Get QBs
            qbs = team_players[team_players["Position"] == "QB"]
            pass_catchers = team_players[team_players["Position"].isin(["WR", "TE"])]

            if len(qbs) == 0:
                continue

            qb = qbs.iloc[0]

            # Use fewer catchers when the team does not have enough of them
            combo_size = min(stack_size - 1, len(pass_catchers))

            # Generate combinations of pass catchers
            for combo in itertools.combinations(pass_catchers.index, combo_size):
                stack_players = [qb.name] + list(combo)
                stack_df = players.loc[stack_players]

                # Calculate stack metrics
                total_projection = stack_df["Projection"].sum()
                total_salary = stack_df["Salary"].sum()
                avg_ownership = stack_df["Ownership"].mean()

                # Estimate correlation boost
                pos_dict = dict(zip(stack_df["Name"], stack_df["Position"]))
                corr = self.get_stack_correlation(
                    qb["Name"],
                    [players.loc[i, "Name"] for i in combo],
                    pos_dict
                )

                # Stack ceiling (projection * (1 + correlation factor))
                ceiling_boost = 1 + corr * 0.5
                stack_ceiling = total_projection * ceiling_boost

                stacks.append({
                    "Team": team,
                    "Players": ", ".join(stack_df["Name"].tolist()),
                    "Positions": ", ".join(stack_df["Position"].tolist()),
                    "Projection": total_projection,
                    "Salary": total_salary,
                    "Avg_Ownership": avg_ownership,
                    "Correlation": corr,
                    "Stack_Ceiling": stack_ceiling,
                    "Value": stack_ceiling / total_salary * 1000
                })

        # An empty list would yield a column-less frame and make the sort
        # below raise KeyError, so return a typed-empty result instead.
        if not stacks:
            return pd.DataFrame(columns=self._STACK_COLUMNS)

        return pd.DataFrame(stacks).sort_values("Stack_Ceiling", ascending=False)

    def analyze_bring_back(
        self,
        primary_stack: List[str],
        opponent_players: pd.DataFrame,
        game_total: float = 48
    ) -> pd.DataFrame:
        """
        Find optimal bring-back (opposing) players for a stack.

        Higher game totals favor bring-backs.

        Args:
            primary_stack: Names in the primary stack (not used in the
                calculation).
            opponent_players: Frame with "Projection" and "Ownership".
            game_total: Over/under for the game.

        Returns:
            Copy of ``opponent_players`` with a "Bring_Back_Value" column,
            sorted by that column descending.
        """
        # Bring-back correlation increases with game total
        base_corr = 0.08
        # 0 at a 40-point total, growing by 1 for every additional 20 points
        game_factor = (game_total - 40) / 20
        adjusted_corr = base_corr * (1 + game_factor)

        opponent_players = opponent_players.copy()
        # Ownership is floored at 0.05 so tiny values cannot blow up the ratio
        opponent_players["Bring_Back_Value"] = (
            opponent_players["Projection"] *
            (1 + adjusted_corr) /
            opponent_players["Ownership"].clip(0.05)
        )

        return opponent_players.sort_values("Bring_Back_Value", ascending=False)

    def simulate_stack_outcomes(
        self,
        stack_projection: float,
        stack_correlation: float,
        individual_std: float = 8,
        n_simulations: int = 10000
    ) -> Dict:
        """
        Simulate stack outcomes to understand ceiling/floor.

        Draws ``n_simulations`` normal samples centred on
        ``stack_projection`` using NumPy's global random state.

        Returns:
            Summary statistics of the draws: mean, std, median, the 25th,
            75th and 90th percentiles, plus "ceiling" (95th percentile)
            and "floor" (5th percentile).
        """
        # Correlation affects how scores move together
        # Higher correlation = more extreme outcomes
        stack_std = individual_std * np.sqrt(1 + stack_correlation)

        simulations = np.random.normal(stack_projection, stack_std, n_simulations)

        return {
            "mean": np.mean(simulations),
            "std": np.std(simulations),
            "median": np.median(simulations),
            "25th_percentile": np.percentile(simulations, 25),
            "75th_percentile": np.percentile(simulations, 75),
            "90th_percentile": np.percentile(simulations, 90),
            "ceiling": np.percentile(simulations, 95),
            "floor": np.percentile(simulations, 5)
        }


# Example usage
if __name__ == "__main__":
    # Demo slate: five players each from KC and BUF
    player_pool = pd.DataFrame({
        "Name": ["Patrick Mahomes", "Travis Kelce", "Rashee Rice", "Isiah Pacheco", "Chiefs DST",
                "Josh Allen", "Stefon Diggs", "Dalton Kincaid", "James Cook", "Bills DST"],
        "Position": ["QB", "TE", "WR", "RB", "DST", "QB", "WR", "TE", "RB", "DST"],
        "Team": ["KC", "KC", "KC", "KC", "KC", "BUF", "BUF", "BUF", "BUF", "BUF"],
        "Salary": [8200, 6800, 6200, 5800, 3500, 7800, 6500, 5200, 6000, 3200],
        "Projection": [22.5, 15.2, 14.8, 13.5, 7.5, 21.0, 14.5, 11.2, 14.0, 6.8],
        "Ownership": [0.22, 0.18, 0.15, 0.12, 0.08, 0.20, 0.14, 0.10, 0.13, 0.05]
    })

    stack_analyzer = CorrelationAnalyzer("nfl")

    # Rank QB + pass-catcher stacks and show the five highest ceilings
    stacks = stack_analyzer.find_optimal_stacks(player_pool, stack_size=3)
    summary_columns = ["Team", "Players", "Projection", "Correlation", "Stack_Ceiling"]
    print("Top Stacks:")
    print(stacks[summary_columns].head(5))

    # Monte Carlo view of a 52.5-point stack (QB + 2 pass catchers)
    sim_results = stack_analyzer.simulate_stack_outcomes(
        stack_correlation=0.32,
        stack_projection=52.5
    )
    print(f"\nStack Simulation:")
    print(f"  Ceiling (95th): {sim_results['ceiling']:.1f}")
    print(f"  Floor (5th): {sim_results['floor']:.1f}")

python Football

Bankroll Management System

Implement proper bankroll management for sports betting using Kelly criterion.

"""Sports Betting Bankroll Management."""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Bet:
    """Represents a single bet."""
    id: str
    sport: str
    bet_type: str
    selection: str
    odds: int  # American odds
    stake: float
    result: Optional[str] = None  # "win", "loss", "push"
    profit: Optional[float] = None  # Set when the bet is settled
    date: Optional[datetime] = None  # Set to the settlement time by settle_bet
    edge: Optional[float] = None

class BankrollManager:
    """
    Manage sports betting bankroll with Kelly criterion.

    Features:
    - Kelly and fractional Kelly staking
    - Unit-based tracking
    - Risk of ruin calculations
    - Performance analytics

    Stakes are NOT deducted from the bankroll when a bet is placed; the
    bankroll only moves when a bet is settled.
    """

    def __init__(
        self,
        initial_bankroll: float,
        unit_size: float = None,
        kelly_fraction: float = 0.25,
        max_bet_pct: float = 0.05
    ):
        """
        Initialize bankroll manager.

        Args:
            initial_bankroll: Starting bankroll
            unit_size: Standard unit size (default: 1% of bankroll)
            kelly_fraction: Fraction of Kelly to use (0.25 = quarter Kelly)
            max_bet_pct: Maximum bet as percentage of bankroll
        """
        self.initial_bankroll = initial_bankroll
        self.current_bankroll = initial_bankroll
        # A falsy unit_size (None or 0) falls back to 1% of the bankroll
        self.unit_size = unit_size or (initial_bankroll * 0.01)
        self.kelly_fraction = kelly_fraction
        self.max_bet_pct = max_bet_pct
        self.bets: List[Bet] = []
        self.history = [{"date": datetime.now(), "bankroll": initial_bankroll}]

    def kelly_criterion(
        self,
        probability: float,
        american_odds: int
    ) -> float:
        """
        Calculate Kelly criterion bet size.

        Kelly% = (p * b - q) / b
        where p = win prob, q = loss prob, b = decimal odds - 1

        Returns:
            Fraction of bankroll to stake, floored at 0 for bets with no
            positive expectation.
        """
        if american_odds > 0:
            decimal_odds = (american_odds / 100) + 1
        else:
            decimal_odds = (100 / abs(american_odds)) + 1

        b = decimal_odds - 1
        q = 1 - probability

        kelly = (probability * b - q) / b

        return max(0, kelly)

    def calculate_stake(
        self,
        probability: float,
        odds: int,
        confidence: str = "normal"
    ) -> Dict:
        """
        Calculate recommended stake.

        Args:
            probability: Estimated win probability
            odds: American odds
            confidence: "low", "normal", "high" (unknown values are
                treated as "normal")

        Returns:
            Dict with the rounded "stake" and "units", the raw "kelly_pct",
            the capped "adjusted_pct" and the stake as "pct_of_bankroll".
        """
        # Base Kelly
        kelly_pct = self.kelly_criterion(probability, odds)

        # Apply Kelly fraction
        adjusted_kelly = kelly_pct * self.kelly_fraction

        # Confidence adjustment
        confidence_mult = {"low": 0.5, "normal": 1.0, "high": 1.5}
        adjusted_kelly *= confidence_mult.get(confidence, 1.0)

        # Apply max bet limit
        final_pct = min(adjusted_kelly, self.max_bet_pct)

        # Calculate stake
        stake = self.current_bankroll * final_pct

        # Round to unit size (nearest tenth of a unit)
        units = round(stake / self.unit_size, 1)
        stake = units * self.unit_size

        return {
            "stake": stake,
            "units": units,
            "kelly_pct": kelly_pct,
            "adjusted_pct": final_pct,
            "pct_of_bankroll": stake / self.current_bankroll
        }

    def place_bet(self, bet: Bet) -> None:
        """Record a placed bet. The bankroll is left untouched until settlement."""
        self.bets.append(bet)

    def settle_bet(self, bet_id: str, result: str) -> float:
        """
        Settle a bet and update bankroll.

        Args:
            bet_id: ID of bet to settle
            result: "win", "loss", or "push" (any other value is handled
                like a push)

        Returns:
            Profit/loss amount

        Raises:
            ValueError: If no recorded bet has the given ID.
        """
        bet = next((b for b in self.bets if b.id == bet_id), None)
        if not bet:
            raise ValueError(f"Bet {bet_id} not found")

        bet.result = result
        bet.date = datetime.now()

        if result == "win":
            if bet.odds > 0:
                profit = bet.stake * (bet.odds / 100)
            else:
                profit = bet.stake * (100 / abs(bet.odds))
            # The stake was never removed from the bankroll at placement,
            # so only the winnings are added (adding the stake as well
            # would inflate the bankroll by the stake on every win).
            self.current_bankroll += profit
        elif result == "loss":
            profit = -bet.stake
            self.current_bankroll -= bet.stake
        else:  # push
            profit = 0

        bet.profit = profit

        self.history.append({
            "date": datetime.now(),
            "bankroll": self.current_bankroll
        })

        return profit

    def risk_of_ruin(
        self,
        win_rate: float,
        avg_odds: int,
        ruin_threshold: float = 0.1
    ) -> float:
        """
        Calculate risk of ruin.

        Probability of losing (1 - ruin_threshold) of bankroll, using a
        simplified fixed-bet-size approximation.

        Returns:
            A value in [0, 1]; 1.0 whenever the edge is not positive.
        """
        # Convert odds to decimal
        if avg_odds > 0:
            decimal_odds = (avg_odds / 100) + 1
        else:
            decimal_odds = (100 / abs(avg_odds)) + 1

        # Calculate edge
        edge = (win_rate * decimal_odds) - 1

        if edge <= 0:
            return 1.0  # Guaranteed ruin with negative edge

        # Simplified RoR formula
        # Assumes fixed bet size relative to bankroll
        bet_size = self.unit_size / self.current_bankroll
        variance = win_rate * (1 - win_rate) * (decimal_odds ** 2)

        # With a positive edge, zero variance means the bet never loses.
        if variance == 0:
            return 0.0

        # When the edge is at least as large as the variance the base is
        # non-positive; raising it to large powers would overflow or turn
        # complex, so treat the risk as negligible.
        base = 1 - edge / variance
        if base <= 0:
            return 0.0

        # Risk of ruin approximation
        ror = (base ** (1 / bet_size)) ** (
            (1 - ruin_threshold) * self.current_bankroll / self.unit_size
        )

        return min(ror, 1.0)

    def get_performance_stats(self) -> Dict:
        """
        Calculate betting performance statistics.

        Returns:
            ``{"message": "No settled bets"}`` when nothing is settled,
            otherwise a dict of counts, totals, ROI, bankroll growth,
            extremes and the current streak. Pushes count toward
            "total_bets" (and therefore the "win_rate" denominator).
        """
        settled = [b for b in self.bets if b.result is not None]

        if not settled:
            return {"message": "No settled bets"}

        wins = [b for b in settled if b.result == "win"]
        losses = [b for b in settled if b.result == "loss"]

        total_staked = sum(b.stake for b in settled)
        total_profit = sum(b.profit for b in settled)

        return {
            "total_bets": len(settled),
            "wins": len(wins),
            "losses": len(losses),
            "win_rate": len(wins) / len(settled),
            "total_staked": total_staked,
            "total_profit": total_profit,
            "roi": total_profit / total_staked if total_staked > 0 else 0,
            "current_bankroll": self.current_bankroll,
            "bankroll_growth": (self.current_bankroll - self.initial_bankroll) / self.initial_bankroll,
            "avg_stake": total_staked / len(settled),
            "avg_profit_per_bet": total_profit / len(settled),
            "largest_win": max((b.profit for b in wins), default=0),
            "largest_loss": min((b.profit for b in losses), default=0),
            "current_streak": self._get_streak(settled)
        }

    def _get_streak(self, bets: List[Bet]) -> Dict:
        """
        Calculate current winning/losing streak.

        Bets are ordered newest first by ``date`` (undated bets sort last)
        and the run of results equal to the newest one is counted.
        """
        if not bets:
            return {"type": None, "length": 0}

        sorted_bets = sorted(bets, key=lambda x: x.date or datetime.min, reverse=True)
        streak_type = sorted_bets[0].result
        streak_length = 0

        for bet in sorted_bets:
            if bet.result == streak_type:
                streak_length += 1
            else:
                break

        return {"type": streak_type, "length": streak_length}

    def get_bankroll_history(self) -> pd.DataFrame:
        """Get bankroll history as DataFrame (one row per recorded snapshot)."""
        return pd.DataFrame(self.history)


# Example usage
if __name__ == "__main__":
    # $10,000 bankroll, quarter-Kelly staking, 3% cap per bet
    manager = BankrollManager(initial_bankroll=10000, kelly_fraction=0.25, max_bet_pct=0.03)

    print(f"Starting Bankroll: ${manager.current_bankroll:,.2f}")
    print(f"Unit Size: ${manager.unit_size:,.2f}")

    # Size a standard -110 wager with a 55% estimated win probability
    wager_inputs = {"probability": 0.55, "odds": -110, "confidence": "normal"}
    stake_info = manager.calculate_stake(**wager_inputs)

    print(f"\nRecommended Stake:")
    print(f"  Amount: ${stake_info['stake']:,.2f}")
    print(f"  Units: {stake_info['units']}")
    print(f"  Kelly%: {stake_info['kelly_pct']:.2%}")
    print(f"  % of Bankroll: {stake_info['pct_of_bankroll']:.2%}")

    # Long-run ruin estimate for a 52% bettor at -110
    ror = manager.risk_of_ruin(avg_odds=-110, win_rate=0.52)
    print(f"\nRisk of Ruin: {ror:.2%}")

python Football

Betting Value Finder

Identify value bets by comparing projections to betting lines.

"""Sports Betting Value Finder."""
import pandas as pd
import numpy as np
from scipy import stats
from typing import Dict, List, Tuple, Optional

class BettingValueFinder:
    """
    Find value bets by comparing model projections to market odds.

    Supports: Spreads, Totals, Moneylines, Player Props
    """

    def __init__(self, edge_threshold: float = 0.03):
        """
        Initialize value finder.

        Args:
            edge_threshold: Minimum edge to flag a bet (default 3%)
        """
        self.edge_threshold = edge_threshold

    @staticmethod
    def american_to_decimal(american: int) -> float:
        """Convert American odds to decimal."""
        if american > 0:
            return (american / 100) + 1
        else:
            return (100 / abs(american)) + 1

    @staticmethod
    def decimal_to_american(decimal: float) -> int:
        """
        Convert decimal odds to American, rounded to the nearest integer.

        Rounding (rather than truncating) keeps floating-point noise from
        shifting the result by one, e.g. 2.15 -> +115 instead of +114.
        """
        if decimal >= 2:
            return int(round((decimal - 1) * 100))
        else:
            return int(round(-100 / (decimal - 1)))

    @staticmethod
    def implied_probability(american: int) -> float:
        """Calculate implied probability from American odds."""
        if american > 0:
            return 100 / (american + 100)
        else:
            return abs(american) / (abs(american) + 100)

    @staticmethod
    def _capped_kelly(probability: float, decimal_odds: float, edge: float,
                      cap: float = 0.25) -> float:
        """Kelly fraction for a bet, 0 without a positive edge and capped at ``cap``."""
        if not edge > 0:
            return 0
        full_kelly = (probability * decimal_odds - 1) / (decimal_odds - 1)
        return max(0, min(full_kelly, cap))

    def find_spread_value(
        self,
        team: str,
        spread: float,
        odds: int,
        projected_margin: float,
        margin_std: float = 13.5  # NFL typical std dev
    ) -> Dict:
        """
        Find value on spread bets.

        The final margin is modelled as normal around ``projected_margin``
        and the bet covers when margin + spread is positive.

        Args:
            team: Team name
            spread: Betting spread (negative = favorite)
            odds: American odds
            projected_margin: Model projected margin
            margin_std: Standard deviation of margin

        Returns:
            Dict describing the bet, its cover/implied probabilities, edge,
            "is_value" flag, capped "kelly_fraction" and a "rating"
            ("Strong" above 8% edge, "Moderate" above 5%, else "Slight" --
            including bets with no edge).
        """
        # Calculate cover probability
        cover_margin = projected_margin + spread  # Adjusted margin needed to cover
        cover_prob = 1 - stats.norm.cdf(0, cover_margin, margin_std)

        # Compare to implied probability
        implied_prob = self.implied_probability(odds)
        edge = cover_prob - implied_prob

        # Calculate Kelly criterion bet size (capped at 25%)
        decimal_odds = self.american_to_decimal(odds)
        kelly = self._capped_kelly(cover_prob, decimal_odds, edge)

        return {
            "bet_type": "spread",
            "team": team,
            "spread": spread,
            "odds": odds,
            "cover_probability": cover_prob,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": kelly,
            "rating": "Strong" if edge > 0.08 else "Moderate" if edge > 0.05 else "Slight"
        }

    def find_total_value(
        self,
        game: str,
        total: float,
        side: str,  # "over" or "under"
        odds: int,
        projected_total: float,
        total_std: float = 10.0
    ) -> Dict:
        """
        Find value on totals.

        The game total is modelled as normal around ``projected_total``.
        Any ``side`` other than "over" (case-insensitive) is priced as the
        under.
        """
        if side.lower() == "over":
            prob = 1 - stats.norm.cdf(total, projected_total, total_std)
        else:
            prob = stats.norm.cdf(total, projected_total, total_std)

        implied_prob = self.implied_probability(odds)
        edge = prob - implied_prob

        decimal_odds = self.american_to_decimal(odds)
        kelly = self._capped_kelly(prob, decimal_odds, edge)

        return {
            "bet_type": "total",
            "game": game,
            "total": total,
            "side": side,
            "odds": odds,
            "hit_probability": prob,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": kelly
        }

    def find_moneyline_value(
        self,
        team: str,
        odds: int,
        win_probability: float
    ) -> Dict:
        """
        Find value on moneyline bets.

        "expected_value" is the expected profit per unit staked.
        """
        implied_prob = self.implied_probability(odds)
        edge = win_probability - implied_prob

        decimal_odds = self.american_to_decimal(odds)
        kelly = self._capped_kelly(win_probability, decimal_odds, edge)

        return {
            "bet_type": "moneyline",
            "team": team,
            "odds": odds,
            "win_probability": win_probability,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": kelly,
            "expected_value": win_probability * (decimal_odds - 1) - (1 - win_probability)
        }

    def find_prop_value(
        self,
        player: str,
        stat: str,
        line: float,
        side: str,
        odds: int,
        projection: float,
        std: float
    ) -> Dict:
        """
        Find value on player props.

        The stat is modelled as normal around ``projection`` with standard
        deviation ``std``. Any ``side`` other than "over"
        (case-insensitive) is priced as the under.
        """
        if side.lower() == "over":
            prob = 1 - stats.norm.cdf(line, projection, std)
        else:
            prob = stats.norm.cdf(line, projection, std)

        implied_prob = self.implied_probability(odds)
        edge = prob - implied_prob

        # Reported for parity with the spread/total/moneyline results
        decimal_odds = self.american_to_decimal(odds)
        kelly = self._capped_kelly(prob, decimal_odds, edge)

        return {
            "bet_type": "prop",
            "player": player,
            "stat": stat,
            "line": line,
            "side": side,
            "odds": odds,
            "projection": projection,
            "hit_probability": prob,
            "implied_probability": implied_prob,
            "edge": edge,
            "is_value": edge >= self.edge_threshold,
            "kelly_fraction": kelly
        }

    def scan_market(
        self,
        odds_df: pd.DataFrame,
        projections: Dict[str, Dict]
    ) -> pd.DataFrame:
        """
        Scan entire market for value bets.

        Args:
            odds_df: DataFrame with current odds. Reads "away_team",
                "home_team", "home_spread", "home_spread_odds",
                "away_spread_odds", "total", "over_odds", "under_odds".
            projections: Dict keyed by "AWAY@HOME" whose values hold
                "home_margin" and "total". Games without an entry are
                skipped.

        Returns:
            One row per spread/total bet whose edge meets the threshold.
        """
        value_bets = []

        for _, row in odds_df.iterrows():
            game_key = f"{row['away_team']}@{row['home_team']}"

            if game_key not in projections:
                continue

            proj = projections[game_key]

            # Check spread value; the away side mirrors the home spread
            # and the projected home margin.
            for team, spread, odds in [
                (row["home_team"], row["home_spread"], row["home_spread_odds"]),
                (row["away_team"], -row["home_spread"], row["away_spread_odds"])
            ]:
                if team == row["home_team"]:
                    margin = proj["home_margin"]
                else:
                    margin = -proj["home_margin"]

                result = self.find_spread_value(team, spread, odds, margin)
                if result["is_value"]:
                    result["game"] = game_key
                    value_bets.append(result)

            # Check total value
            for side, odds in [("over", row["over_odds"]), ("under", row["under_odds"])]:
                result = self.find_total_value(
                    game_key, row["total"], side, odds, proj["total"]
                )
                if result["is_value"]:
                    value_bets.append(result)

        return pd.DataFrame(value_bets)


def calculate_clv(bet_odds: int, closing_odds: int) -> float:
    """
    Calculate Closing Line Value (CLV) as an implied-probability difference.

    Both prices are American odds. The result is positive when the closing
    line implies a higher probability than the price the bet was taken at,
    i.e. when the bettor beat the close.

    CLV is one of the best predictors of long-term betting success.
    """
    to_probability = BettingValueFinder.implied_probability
    taken_at = to_probability(bet_odds)
    closed_at = to_probability(closing_odds)
    return closed_at - taken_at


# Example usage
if __name__ == "__main__":
    finder = BettingValueFinder(edge_threshold=0.03)

    # Spread example: the model has Kansas City winning by 6.5 against a -3.5 line
    spread_inputs = {
        "team": "Kansas City",
        "spread": -3.5,
        "odds": -110,
        "projected_margin": 6.5,
    }
    spread_value = finder.find_spread_value(**spread_inputs)

    print("Spread Bet Analysis:")
    print(f"  Team: {spread_value['team']} {spread_value['spread']}")
    print(f"  Cover Prob: {spread_value['cover_probability']:.1%}")
    print(f"  Implied Prob: {spread_value['implied_probability']:.1%}")
    print(f"  Edge: {spread_value['edge']:.1%}")
    print(f"  Is Value: {spread_value['is_value']}")

    # Prop example: passing-yards over with a projection above the line
    prop_inputs = {
        "player": "Patrick Mahomes",
        "stat": "Passing Yards",
        "line": 275.5,
        "side": "over",
        "odds": -115,
        "projection": 295,
        "std": 45,
    }
    prop_value = finder.find_prop_value(**prop_inputs)

    print(f"\nProp Bet Analysis:")
    print(f"  {prop_value['player']} {prop_value['stat']} {prop_value['side']} {prop_value['line']}")
    print(f"  Hit Prob: {prop_value['hit_probability']:.1%}")
    print(f"  Edge: {prop_value['edge']:.1%}")

python Football

Player Projection Model

Build fantasy sports projection models using machine learning.

"""Fantasy Sports Projection Model."""
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from typing import Dict, List, Tuple

class ProjectionModel:
    """
    Build fantasy point projections using historical data.

    Typical flow: ``train`` on historical game logs, then ``predict`` or
    ``get_projections`` on logs that include the games to project.
    """

    # Raw per-game stats that receive rolling (L3/L5) and season-to-date
    # averages when the column is present. The list covers every lagged
    # feature that select_features can request.
    ROLLING_COLUMNS = [
        "fantasy_points", "yards", "touchdowns", "receptions",
        "targets", "snap_pct",
        "passing_yards", "passing_tds", "rushing_yards", "interceptions",
    ]

    def __init__(self, sport: str = "nfl", position: str = None):
        self.sport = sport
        self.position = position
        self.model = None  # Fitted sklearn Pipeline, set by train()
        self.scaler = StandardScaler()
        self.feature_names = None  # Feature columns chosen by select_features()

    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare features for modeling.

        Creates rolling averages, matchup adjustments, etc. Expects
        "player_id", "game_date" (datetime) and "season" columns. All
        lagged averages are shifted by one game so a row never sees its
        own outcome; a player's first game therefore has NaN lag features.

        Returns:
            A sorted copy of ``df`` with the engineered columns added.
        """
        df = df.copy()

        # Sort by player and date
        df = df.sort_values(["player_id", "game_date"])

        # Rolling averages over the previous 3 and 5 games
        rolling_cols = self.ROLLING_COLUMNS
        for col in rolling_cols:
            if col in df.columns:
                df[f"{col}_L3"] = df.groupby("player_id")[col].transform(
                    lambda x: x.rolling(3, min_periods=1).mean().shift(1)
                )
                df[f"{col}_L5"] = df.groupby("player_id")[col].transform(
                    lambda x: x.rolling(5, min_periods=1).mean().shift(1)
                )

        # Season averages
        for col in rolling_cols:
            if col in df.columns:
                df[f"{col}_season_avg"] = df.groupby(["player_id", "season"])[col].transform(
                    lambda x: x.expanding().mean().shift(1)
                )

        # Opponent defense ranking
        if "opponent_def_rank" not in df.columns and "opponent" in df.columns:
            # Calculate opponent strength from data (mean points allowed
            # over every row of df, not a true rank)
            opp_avg = df.groupby("opponent")["fantasy_points"].mean()
            df["opponent_def_rank"] = df["opponent"].map(opp_avg)

        # Home/away indicator
        if "is_home" not in df.columns and "location" in df.columns:
            df["is_home"] = (df["location"] == "home").astype(int)

        # Rest days (simplified): first game per player defaults to 7
        df["days_rest"] = df.groupby("player_id")["game_date"].diff().dt.days.fillna(7)

        return df

    def select_features(self, df: pd.DataFrame) -> List[str]:
        """
        Select features for modeling.

        Starts from a base list, adds position-specific lag features, then
        keeps only those present in ``df``. The result is also stored on
        ``self.feature_names``.
        """
        # Base features
        features = [
            "fantasy_points_L3", "fantasy_points_L5", "fantasy_points_season_avg",
            "opponent_def_rank", "is_home", "days_rest"
        ]

        # Position-specific features
        if self.position == "QB":
            features.extend([
                "passing_yards_L3", "passing_tds_L3",
                "rushing_yards_L3", "interceptions_L3"
            ])
        elif self.position in ["RB", "WR", "TE"]:
            features.extend([
                "targets_L3", "receptions_L3", "yards_L3",
                "touchdowns_L3", "snap_pct_L3"
            ])

        # Filter to available features
        available = [f for f in features if f in df.columns]
        self.feature_names = available

        return available

    def train(
        self,
        df: pd.DataFrame,
        target: str = "fantasy_points",
        model_type: str = "gbm"
    ) -> Dict:
        """
        Train projection model.

        Args:
            df: Historical data
            target: Target variable (fantasy points)
            model_type: "gbm" for gradient boosting; any other value
                selects a random forest

        Returns:
            Dict with train/test R2, 5-fold CV mean and std, and a
            feature -> importance mapping.
        """
        # Prepare features
        df = self.prepare_features(df)
        features = self.select_features(df)

        # Remove rows with missing values
        df = df.dropna(subset=features + [target])

        X = df[features]
        y = df[target]

        # Train/test split (random rows, not chronological)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Select model
        if model_type == "gbm":
            model = GradientBoostingRegressor(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42
            )
        else:
            model = RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=42
            )

        # Create pipeline with scaling
        self.model = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])

        # Fit
        self.model.fit(X_train, y_train)

        # Evaluate
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)
        cv_scores = cross_val_score(self.model, X, y, cv=5)

        # Feature importance, read from the fitted estimator in the
        # pipeline (both supported estimators expose the attribute)
        importance = self.model.named_steps["model"].feature_importances_

        return {
            "train_r2": train_score,
            "test_r2": test_score,
            "cv_mean": cv_scores.mean(),
            "cv_std": cv_scores.std(),
            "feature_importance": dict(zip(features, importance))
        }

    def predict(self, df: pd.DataFrame) -> pd.Series:
        """
        Generate projections for players.

        Rows whose features are incomplete (for example a player's first
        game, which has no lagged averages) are not sent to the model and
        receive NaN.

        Returns:
            Series of projections indexed like the prepared (sorted) frame.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained")

        df = self.prepare_features(df)
        X = df[self.feature_names]

        # Only score rows with every feature present; the estimator is not
        # expected to handle missing values.
        complete = X.notna().all(axis=1).to_numpy()
        projected = np.full(len(X), np.nan)
        if complete.any():
            projected[complete] = self.model.predict(X[complete])

        return pd.Series(projected, index=df.index)

    def get_projections(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get projections with confidence intervals.

        Floor and ceiling sit 1.5 x 15% of the projection below and above
        it. Requires "player_name", "team" and "opponent" columns.

        Returns:
            Frame sorted by projection, highest first.
        """
        predictions = self.predict(df)

        # Estimate uncertainty from CV or ensemble
        # Simplified: use fixed percentage
        uncertainty = predictions * 0.15

        result = df[["player_name", "team", "opponent"]].copy()
        result["projection"] = predictions
        result["floor"] = predictions - 1.5 * uncertainty
        result["ceiling"] = predictions + 1.5 * uncertainty

        return result.sort_values("projection", ascending=False)


# Example usage
if __name__ == "__main__":
    # Synthetic history: 50 players with 10 games each, seeded so the
    # random columns are repeatable
    np.random.seed(42)
    n_games = 500
    player_ids = np.repeat(range(50), 10)
    weekly_dates = pd.date_range("2023-01-01", periods=n_games, freq="W")
    seasons = [2023] * 250 + [2024] * 250

    # Random columns are generated in the same order as they are listed
    historical = pd.DataFrame({
        "player_id": player_ids,
        "player_name": [f"Player {i//10}" for i in range(n_games)],
        "game_date": weekly_dates,
        "season": seasons,
        "team": np.random.choice(["KC", "BUF", "PHI", "SF", "DAL"], n_games),
        "opponent": np.random.choice(["NYG", "WAS", "CHI", "DET", "MIN"], n_games),
        "location": np.random.choice(["home", "away"], n_games),
        "fantasy_points": np.random.normal(15, 8, n_games).clip(0),
        "yards": np.random.normal(70, 30, n_games).clip(0),
        "touchdowns": np.random.poisson(0.5, n_games),
        "receptions": np.random.poisson(4, n_games)
    })

    # Fit a wide-receiver model on the synthetic logs
    wr_model = ProjectionModel("nfl", "WR")
    results = wr_model.train(historical)

    print("Model Performance:")
    print(f"  Test R2: {results['test_r2']:.3f}")
    print(f"  CV Mean: {results['cv_mean']:.3f}")

    # Five most important features, largest first
    ranked_features = sorted(results["feature_importance"].items(), key=lambda pair: -pair[1])
    print("\nFeature Importance:")
    for feat, imp in ranked_features[:5]:
        print(f"  {feat}: {imp:.3f}")

python Football

DFS Lineup Optimizer

Optimize daily fantasy sports lineups using linear programming.

"""DFS Lineup Optimizer using Linear Programming."""
import pandas as pd
import numpy as np
from scipy.optimize import linprog, milp, LinearConstraint, Bounds
from typing import List, Dict, Tuple, Optional

class DFSOptimizer:
    """
    Optimize DFS lineups using mixed integer linear programming.

    Every player in the pool gets one binary decision variable (1 = rostered).
    The solver maximizes total projection subject to the salary cap, the
    roster size, roster-slot eligibility and any optional user constraints.

    Supports: DraftKings, FanDuel salary structures
    """

    SITE_CONFIGS = {
        "draftkings": {
            "nfl": {
                "positions": ["QB", "RB", "RB", "WR", "WR", "WR", "TE", "FLEX", "DST"],
                "salary_cap": 50000,
                "roster_size": 9
            },
            "nba": {
                "positions": ["PG", "SG", "SF", "PF", "C", "G", "F", "UTIL"],
                "salary_cap": 50000,
                "roster_size": 8
            },
            "mlb": {
                "positions": ["P", "P", "C", "1B", "2B", "3B", "SS", "OF", "OF", "OF"],
                "salary_cap": 50000,
                "roster_size": 10
            }
        },
        "fanduel": {
            "nfl": {
                "positions": ["QB", "RB", "RB", "WR", "WR", "WR", "TE", "FLEX", "DST"],
                "salary_cap": 60000,
                "roster_size": 9
            },
            "nba": {
                "positions": ["PG", "PG", "SG", "SG", "SF", "SF", "PF", "PF", "C"],
                "salary_cap": 60000,
                "roster_size": 9
            }
        }
    }

    # Columns every player pool must provide (see load_players).
    REQUIRED_COLUMNS = ("Name", "Position", "Salary", "Projection")

    def __init__(self, site: str = "draftkings", sport: str = "nfl"):
        """
        Args:
            site: Key into SITE_CONFIGS ("draftkings" or "fanduel").
            sport: Sport key configured for that site.

        Raises:
            KeyError: If the site/sport combination is not in SITE_CONFIGS.
        """
        self.site = site
        self.sport = sport
        self.config = self.SITE_CONFIGS[site][sport]
        self.players = None

    def load_players(self, df: pd.DataFrame):
        """
        Load player pool.

        Required columns: Name, Position, Salary, Projection
        Optional: Team, Opponent, Ownership

        A copy of ``df`` is stored and an "idx" column holding each row's
        position in the pool is added.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: If a required column is missing.
        """
        missing = [col for col in self.REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise ValueError(f"Player pool is missing required columns: {missing}")

        self.players = df.copy()
        self.players["idx"] = range(len(self.players))
        return self

    def _create_position_matrix(self) -> np.ndarray:
        """
        Create position eligibility matrix.

        Returns:
            Array of shape (roster_size, n_players); entry [s, p] is 1 when
            player p may fill roster slot s, else 0.
        """
        n_players = len(self.players)
        n_roster_spots = self.config["roster_size"]

        position_matrix = np.zeros((n_roster_spots, n_players))

        for spot_idx, required_pos in enumerate(self.config["positions"]):
            for player_idx, player_pos in enumerate(self.players["Position"]):
                if self._is_eligible(player_pos, required_pos):
                    position_matrix[spot_idx, player_idx] = 1

        return position_matrix

    def _is_eligible(self, player_pos: str, roster_spot: str) -> bool:
        """Check if player position is eligible for roster spot."""
        # Direct match
        if player_pos == roster_spot:
            return True

        # Flexible slots accept several positions
        if roster_spot == "FLEX" and player_pos in ["RB", "WR", "TE"]:
            return True
        if roster_spot == "UTIL":
            return True
        if roster_spot == "G" and player_pos in ["PG", "SG"]:
            return True
        if roster_spot == "F" and player_pos in ["SF", "PF"]:
            return True

        return False

    def _position_constraint_rows(self) -> Tuple[List[np.ndarray], List[float]]:
        """
        Build the rows that make the selected players fit the roster slots.

        Players are grouped by the set of slots they may fill. For every
        combination of groups, the number of selected players from those
        groups may not exceed the number of slots open to them (Hall's
        marriage condition). Together with the roster-size equality this
        guarantees each selected player can be given a distinct slot.

        Returns:
            (rows, caps): one player-indicator row per constraint and the
            matching upper bound on the number of selected players.
        """
        eligibility = self._create_position_matrix() > 0
        n_slots, n_players = eligibility.shape
        roster_size = self.config["roster_size"]

        # signature (tuple of per-slot booleans) -> indicator of its players
        groups: Dict[Tuple[bool, ...], np.ndarray] = {}
        for player_idx in range(n_players):
            signature = tuple(eligibility[:, player_idx].tolist())
            if signature not in groups:
                groups[signature] = np.zeros(n_players)
            groups[signature][player_idx] = 1.0

        rows: List[np.ndarray] = []
        caps: List[float] = []

        # Players that fit no slot at all can never be rostered.
        unusable = groups.pop((False,) * n_slots, None)
        if unusable is not None:
            rows.append(unusable)
            caps.append(0.0)

        # Real roster configs produce only a handful of distinct signatures,
        # so enumerating every non-empty subset stays cheap.
        signatures = list(groups)
        for subset in range(1, 1 << len(signatures)):
            open_slots = np.zeros(n_slots, dtype=bool)
            members = np.zeros(n_players)
            for bit, signature in enumerate(signatures):
                if (subset >> bit) & 1:
                    open_slots |= np.array(signature, dtype=bool)
                    members += groups[signature]

            cap = int(open_slots.sum())
            if cap >= roster_size:
                # Already implied by the roster-size equality.
                continue
            rows.append(members)
            caps.append(float(cap))

        return rows, caps

    def _solve(
        self,
        min_salary: Optional[float] = None,
        max_from_team: Optional[int] = 4,
        locked: Optional[List[str]] = None,
        excluded: Optional[List[str]] = None,
        max_ownership: Optional[float] = None,
        forbidden_lineups: Optional[List[List[int]]] = None
    ) -> np.ndarray:
        """
        Build and solve the MILP; return the positions of the chosen players.

        Takes the same arguments as ``optimize`` plus ``forbidden_lineups``:
        each entry is a collection of pool positions that must not all be
        selected together (used to keep generated lineups distinct).

        Raises:
            ValueError: If no players are loaded or no feasible lineup exists.
        """
        if self.players is None or len(self.players) == 0:
            raise ValueError("No players loaded; call load_players() first")

        n_players = len(self.players)
        salary_cap = self.config["salary_cap"]
        roster_size = self.config["roster_size"]
        names = self.players["Name"]

        # Objective: maximize projections (milp minimizes, hence the sign).
        c = -self.players["Projection"].to_numpy(dtype=float)

        rows: List[np.ndarray] = []
        lower: List[float] = []
        upper: List[float] = []

        def add(row: np.ndarray, lb: float, ub: float) -> None:
            rows.append(row)
            lower.append(float(lb))
            upper.append(float(ub))

        # Salary: min_salary (or 0) <= sum(salary * x) <= cap
        salaries = self.players["Salary"].to_numpy(dtype=float)
        add(salaries, min_salary if min_salary else 0.0, salary_cap)

        # Roster size: sum(x) == roster_size
        add(np.ones(n_players), roster_size, roster_size)

        # Roster-slot eligibility
        position_rows, position_caps = self._position_constraint_rows()
        for row, cap in zip(position_rows, position_caps):
            add(row, 0.0, cap)

        # Team stacking limit (Team is an optional column)
        if max_from_team and "Team" in self.players.columns:
            teams = self.players["Team"]
            for team in teams.dropna().unique():
                add((teams == team).to_numpy(dtype=float), 0.0, max_from_team)

        # Locked players: the first pool row with that name must be selected.
        # Names are matched by position in the pool, not by index label, so a
        # non-default DataFrame index is handled correctly.
        for name in locked or []:
            hits = np.flatnonzero((names == name).to_numpy())
            if hits.size > 0:
                row = np.zeros(n_players)
                row[hits[0]] = 1.0
                add(row, 1.0, 1.0)

        # Excluded players: every pool row with that name is forced out.
        for name in excluded or []:
            mask = (names == name).to_numpy(dtype=float)
            if mask.any():
                add(mask, 0.0, 0.0)

        # Ownership ceiling: drop everyone above the threshold.
        if max_ownership and "Ownership" in self.players.columns:
            too_popular = (self.players["Ownership"] > max_ownership).to_numpy(dtype=float)
            if too_popular.any():
                add(too_popular, 0.0, 0.0)

        # Previously produced lineups may not be selected again in full.
        for previous in forbidden_lineups or []:
            members = np.unique(np.asarray(previous, dtype=int))
            if members.size == 0:
                continue
            row = np.zeros(n_players)
            row[members] = 1.0
            add(row, 0.0, members.size - 1)

        constraints = LinearConstraint(
            np.vstack(rows), np.asarray(lower), np.asarray(upper)
        )
        result = milp(
            c,
            constraints=constraints,
            integrality=np.ones(n_players),  # all variables are integer
            bounds=Bounds(0, 1),             # ... and restricted to {0, 1}
        )

        if not result.success:
            raise ValueError(f"Optimization failed: {result.message}")

        return np.where(result.x > 0.5)[0]

    def optimize(
        self,
        min_salary: float = None,
        max_from_team: int = 4,
        locked: List[str] = None,
        excluded: List[str] = None,
        max_ownership: float = None
    ) -> pd.DataFrame:
        """
        Generate optimal lineup.

        Args:
            min_salary: Minimum salary to use
            max_from_team: Max players from same team (ignored when the pool
                has no "Team" column)
            locked: Players that must be in lineup (unknown names are ignored)
            excluded: Players to exclude (unknown names are ignored)
            max_ownership: Maximum ownership percentage (ignored when the pool
                has no "Ownership" column)

        Returns:
            DataFrame with optimal lineup (rows copied from the player pool)

        Raises:
            ValueError: If no players are loaded or no lineup satisfies the
                constraints.
        """
        selected_idx = self._solve(
            min_salary=min_salary,
            max_from_team=max_from_team,
            locked=locked,
            excluded=excluded,
            max_ownership=max_ownership,
        )
        return self.players.iloc[selected_idx].copy()

    def generate_multiple_lineups(
        self,
        n_lineups: int,
        max_exposure: float = 0.6,
        **kwargs
    ) -> List[pd.DataFrame]:
        """
        Generate multiple unique lineups.

        Lineups are built one after another. Before each solve, players whose
        appearance count has reached ``max_exposure * (attempt number)`` are
        left out for that attempt only, and every lineup produced so far is
        forbidden so no lineup is returned twice. Attempts with no feasible
        lineup are skipped, so fewer than ``n_lineups`` may be returned.

        Args:
            n_lineups: Number of lineups to generate
            max_exposure: Maximum times a player can appear (as fraction)
            **kwargs: Forwarded to the solver (same options as ``optimize``).
                A caller-supplied ``excluded`` list is never modified.

        Raises:
            ValueError: If no players are loaded.
        """
        if self.players is None:
            raise ValueError("No players loaded; call load_players() first")

        # Copy so the caller's list is left untouched and exposure-based
        # exclusions do not leak from one attempt into the next.
        always_excluded = set(kwargs.pop("excluded", None) or [])

        lineups = []
        seen_lineups: List[List[int]] = []
        player_counts = {name: 0 for name in self.players["Name"]}

        for i in range(n_lineups):
            max_count = max_exposure * (i + 1)
            overexposed = {
                name for name, count in player_counts.items() if count >= max_count
            }

            try:
                selected_idx = self._solve(
                    excluded=list(always_excluded | overexposed),
                    forbidden_lineups=seen_lineups,
                    **kwargs
                )
            except ValueError:
                # Infeasible under the current exposure limits; the cap
                # loosens on the next attempt, so keep going.
                continue

            lineup = self.players.iloc[selected_idx].copy()
            lineups.append(lineup)
            seen_lineups.append(selected_idx.tolist())

            for name in lineup["Name"]:
                player_counts[name] = player_counts.get(name, 0) + 1

        return lineups


# Example usage
if __name__ == "__main__":
    # Sample player pool, one record per player.
    pool_columns = ["Name", "Position", "Team", "Salary", "Projection", "Ownership"]
    pool_rows = [
        ("Player A", "QB", "KC", 8000, 22.5, 0.25),
        ("Player B", "RB", "KC", 7500, 18.2, 0.20),
        ("Player C", "RB", "BUF", 7200, 17.8, 0.18),
        ("Player D", "WR", "KC", 6800, 16.5, 0.15),
        ("Player E", "WR", "BUF", 6500, 15.2, 0.12),
        ("Player F", "WR", "PHI", 6200, 14.8, 0.10),
        ("Player G", "TE", "PHI", 5500, 12.5, 0.08),
        ("Player H", "RB", "PHI", 5000, 11.0, 0.05),
        ("Player I", "WR", "DAL", 4500, 10.5, 0.04),
        ("Player J", "DST", "DAL", 3500, 8.0, 0.03),
    ]
    players = pd.DataFrame(pool_rows, columns=pool_columns)

    optimizer = DFSOptimizer("draftkings", "nfl")
    optimizer.load_players(players)

    # No lineup is solved here: the nine cheapest of these ten players cost
    # 52,700, which is above the 50,000 DraftKings cap, so
    # optimizer.optimize(max_from_team=3) would fail on this tiny pool.
    # With a realistic pool, call it and print the returned DataFrame.

    print("DFS Optimizer loaded successfully")

python Football

Ownership Leverage Strategy

Analyze ownership leverage for GPP tournament strategy in DFS.

"""DFS Ownership Leverage Analysis."""
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple

class OwnershipLeverage:
    """
    Analyze ownership leverage for GPP (tournament) DFS strategy.

    Key concepts:
    - Leverage = being different from the field
    - Positive leverage = overweight on high-upside, underowned players
    - Game theory optimal (GTO) considerations
    """

    def __init__(self, field_size: int = 10000):
        """
        Args:
            field_size: Number of entries in the contest.
        """
        self.field_size = field_size

    def calculate_leverage(
        self,
        lineup_ownership: float,
        field_ownership: float
    ) -> float:
        """
        Calculate ownership leverage.

        Leverage = (Lineup Ownership - Field Ownership) / Field Ownership

        Positive = overweight relative to field
        Negative = underweight relative to field

        When the field ownership is exactly 0 the result is ``inf`` for any
        positive lineup ownership and 0 otherwise.
        """
        if field_ownership == 0:
            return float("inf") if lineup_ownership > 0 else 0
        return (lineup_ownership - field_ownership) / field_ownership

    def expected_duplicates(self, lineup_ownership: float) -> float:
        """
        Estimate expected duplicate lineups.

        Returns ``(field_size - 1) * lineup_ownership``, i.e. the mean used by
        the Poisson approximation in ``duplication_probability``.
        """
        return (self.field_size - 1) * lineup_ownership

    def duplication_probability(self, lineup_ownership: float) -> float:
        """Probability of at least one duplicate (Poisson approximation)."""
        expected = self.expected_duplicates(lineup_ownership)
        return 1 - np.exp(-expected)

    def optimal_exposure(
        self,
        player_projection: float,
        player_ceiling: float,
        field_ownership: float,
        contest_type: str = "gpp"
    ) -> float:
        """
        Calculate optimal player exposure.

        For GPPs: Consider ceiling and ownership
        For cash: Focus on floor and consistency

        Returns:
            Exposure capped at 1.0. A player with a non-positive projection
            gets 0.0 (this also avoids dividing by a zero projection).
        """
        if player_projection <= 0:
            return 0.0

        if contest_type == "gpp":
            # Higher exposure for high ceiling, low ownership
            ceiling_factor = player_ceiling / player_projection
            ownership_factor = 1 / (field_ownership + 0.05)  # +0.05 avoids division by zero

            # Base exposure on projection
            base_exposure = min(player_projection / 20, 0.5)

            # Adjust for ceiling and ownership
            optimal = base_exposure * (ceiling_factor * 0.4 + ownership_factor * 0.1)
            return min(optimal, 1.0)

        # Cash game: floor is modeled as 80% of the projection, so the ratio
        # (and therefore the result) does not depend on the player.
        floor_factor = (player_projection * 0.8) / player_projection
        return min(floor_factor * 0.6, 1.0)

    def correlation_leverage(
        self,
        player1_ownership: float,
        player2_ownership: float,
        correlation: float
    ) -> float:
        """
        Calculate leverage from correlated players (stacking).

        Stack leverage = how different your correlation is from the field.
        ``correlation`` is accepted for interface completeness but is not used
        by this simplified estimate.
        """
        # Field stack probability (simplified)
        field_stack_prob = player1_ownership * player2_ownership * 2

        # Your stack probability (if you stack them)
        your_stack_prob = 1.0

        return self.calculate_leverage(your_stack_prob, field_stack_prob)

    def analyze_lineup_leverage(
        self,
        lineup: pd.DataFrame,
        field_ownership: pd.DataFrame
    ) -> Dict:
        """
        Analyze total lineup leverage.

        Args:
            lineup: Your lineup with player names ("Name" column)
            field_ownership: Field ownership ("Name" and "Ownership" columns)

        Returns:
            Dict with total/avg/max/min ownership, per-player leverage,
            expected duplicates and a uniqueness score.
        """
        # A lineup taken from the player pool usually carries its own
        # "Ownership" column; drop it so the merge does not produce
        # Ownership_x / Ownership_y and the field values are used.
        own_lineup = lineup.drop(columns=["Ownership"], errors="ignore")
        merged = own_lineup.merge(
            field_ownership[["Name", "Ownership"]],
            on="Name", how="left"
        )

        # Overall ownership metrics
        total_ownership = merged["Ownership"].sum()
        avg_ownership = merged["Ownership"].mean()
        max_ownership = merged["Ownership"].max()
        min_ownership = merged["Ownership"].min()

        # Leverage by player, relative to an equal 1/roster share
        equal_share = 1 / len(merged) if len(merged) else 0
        player_leverage = [
            {
                "Name": row["Name"],
                "Ownership": row["Ownership"],
                "Leverage": self.calculate_leverage(equal_share, row["Ownership"]),
            }
            for _, row in merged.iterrows()
        ]

        # Uniqueness score
        # NOTE(review): total ownership is divided by 100 here, which treats
        # the summed values as percentages - confirm the intended unit.
        lineup_share = total_ownership / 100
        uniqueness = 1 - self.duplication_probability(lineup_share)

        return {
            "total_ownership": total_ownership,
            "avg_ownership": avg_ownership,
            "max_ownership": max_ownership,
            "min_ownership": min_ownership,
            "player_leverage": player_leverage,
            "expected_duplicates": self.expected_duplicates(lineup_share),
            "uniqueness_score": uniqueness
        }

    def find_leverage_plays(
        self,
        players: pd.DataFrame,
        min_projection: float = 10,
        max_ownership: float = 0.15
    ) -> pd.DataFrame:
        """
        Find high-leverage plays (low owned, high projection).

        Returns the qualifying rows with two added columns, sorted by
        Leverage_Score (highest first):
        - Leverage_Score: Projection / Ownership (ownership floored at 0.01)
        - Value: Projection per 1000 of Salary
        """
        leverage_plays = players[
            (players["Projection"] >= min_projection) &
            (players["Ownership"] <= max_ownership)
        ].copy()

        leverage_plays["Leverage_Score"] = (
            leverage_plays["Projection"] / leverage_plays["Ownership"].clip(lower=0.01)
        )

        leverage_plays["Value"] = (
            leverage_plays["Projection"] / leverage_plays["Salary"] * 1000
        )

        return leverage_plays.sort_values("Leverage_Score", ascending=False)

    def game_theory_exposure(
        self,
        players: pd.DataFrame,
        your_edge: float = 0.02
    ) -> pd.DataFrame:
        """
        Calculate game theory optimal exposure.

        Based on your perceived edge over the field. Uses the "Ceiling" column
        when present, otherwise 1.5x the projection. The result is capped at
        1.0 because an exposure is a share of lineups.
        """
        players = players.copy()

        # GTO exposure formula (simplified)
        # If you have edge, deviate from chalk toward your edge
        edge_multiplier = 1 + your_edge * 10
        raw_exposure = players.apply(
            lambda row: self.optimal_exposure(
                row["Projection"],
                row.get("Ceiling", row["Projection"] * 1.5),
                row["Ownership"]
            ) * edge_multiplier,
            axis=1
        )
        players["GTO_Exposure"] = raw_exposure.clip(upper=1.0)

        return players[["Name", "Position", "Ownership", "Projection", "GTO_Exposure"]]


def simulate_tournament_roi(
    lineup_score: float,
    lineup_ownership: float,
    field_scores: np.ndarray,
    payout_structure: Dict[int, float]
) -> float:
    """
    Simulate tournament ROI given a lineup score.

    Args:
        lineup_score: Your lineup's fantasy score
        lineup_ownership: Estimated uniqueness factor (currently unused)
        field_scores: Array of simulated field scores
        payout_structure: Dict mapping finish positions to payout multipliers.
            Each key is the last finishing position paid at that multiplier;
            positions beyond the largest key pay nothing. Key order in the
            dict does not matter.

    Returns:
        Payout multiplier minus 1 (the entry fee). When the lineup ties with
        field entries, the payouts of all positions covered by the tied group
        are averaged.
    """
    field_scores = np.asarray(field_scores)

    # Best possible finish: one place behind every strictly better score.
    finish = int((field_scores > lineup_score).sum()) + 1
    ties = int((field_scores == lineup_score).sum())

    # Sort so the tightest bracket containing a position is found first.
    brackets = sorted(payout_structure.items())

    def payout_at(position: int) -> float:
        for last_paid_position, multiplier in brackets:
            if position <= last_paid_position:
                return multiplier
        return 0

    # The tied group (this lineup plus `ties` field entries) occupies
    # positions finish .. finish + ties and splits their payouts evenly.
    shared = [payout_at(pos) for pos in range(finish, finish + ties + 1)]
    payout = sum(shared) / len(shared)

    return payout - 1  # ROI (subtract entry)


# Example usage
if __name__ == "__main__":
    leverage = OwnershipLeverage(field_size=10000)

    # Sample player pool, one record per player.
    pool_columns = ["Name", "Position", "Salary", "Projection", "Ownership", "Ceiling"]
    pool_rows = [
        ("Star RB", "RB", 9000, 22.5, 0.35, 35),
        ("Popular WR", "WR", 7500, 17.5, 0.28, 28),
        ("Sneaky TE", "TE", 5000, 12.0, 0.08, 22),
        ("Chalk QB", "QB", 7800, 20.0, 0.42, 32),
        ("Value RB", "RB", 4500, 11.0, 0.05, 20),
    ]
    players = pd.DataFrame(pool_rows, columns=pool_columns)

    # Low-owned players that still project for at least 10 points.
    leverage_plays = leverage.find_leverage_plays(
        players, min_projection=10, max_ownership=0.15
    )
    shown_columns = ["Name", "Projection", "Ownership", "Leverage_Score"]
    print("Leverage Plays:")
    print(leverage_plays[shown_columns])

    # Suggested exposure per player with the default edge.
    gto = leverage.game_theory_exposure(players)
    print("\nGTO Exposure:")
    print(gto)

r Football

Statistical Modeling in R

Build predictive models for sports using R statistical packages.

# Statistical Modeling for Sports Analytics in R
library(tidymodels)
library(dplyr)
library(ggplot2)

# =====================
# Data Preparation
# =====================

#' Keep only the modeling columns and drop incomplete rows
#'
#' @param data Data frame holding the outcome and predictor columns
#' @param outcome_var Name of the outcome column (string)
#' @param predictors Character vector of predictor column names
#' @return `data` reduced to those columns, without rows containing NA
prep_model_data <- function(data, outcome_var, predictors) {
  wanted_cols <- c(outcome_var, predictors)
  model_cols <- select(data, all_of(wanted_cols))
  drop_na(model_cols)
}

#' Split data into training and testing partitions
#'
#' @param data Data frame to split
#' @param prop Share of rows assigned to the training set
#' @param strata Optional column name (string) to stratify on
#' @return An rsample split object
create_splits <- function(data, prop = 0.8, strata = NULL) {
  # Plain random split unless a stratification column was named
  if (is.null(strata)) {
    return(initial_split(data, prop = prop))
  }
  initial_split(data, prop = prop, strata = all_of(strata))
}

# =====================
# Linear Models
# =====================

#' Fit an ordinary least squares regression
#'
#' @param data Training data
#' @param formula Model formula
#' @return A fitted workflow using the "lm" engine
fit_linear_model <- function(data, formula) {
  lm_spec <- set_engine(linear_reg(), "lm")

  wf <- workflow()
  wf <- add_formula(wf, formula)
  wf <- add_model(wf, lm_spec)
  fit(wf, data)
}

#' Fit a penalized (ridge / lasso / elastic net) regression
#'
#' @param data Training data
#' @param formula Model formula
#' @param penalty Amount of regularization
#' @param mixture 0 = ridge, 1 = lasso, values in between = elastic net
#' @return A fitted workflow using the "glmnet" engine
fit_regularized_model <- function(data, formula, penalty = 0.01, mixture = 0.5) {
  glmnet_spec <- set_engine(
    linear_reg(penalty = penalty, mixture = mixture),
    "glmnet"
  )

  wf <- workflow()
  wf <- add_formula(wf, formula)
  wf <- add_model(wf, glmnet_spec)
  fit(wf, data)
}

# =====================
# Classification Models
# =====================

#' Fit a logistic regression for a binary outcome
#'
#' @param data Training data
#' @param formula Model formula
#' @return A fitted workflow using the "glm" engine
fit_logistic <- function(data, formula) {
  glm_spec <- set_engine(logistic_reg(), "glm")

  wf <- workflow()
  wf <- add_formula(wf, formula)
  wf <- add_model(wf, glm_spec)
  fit(wf, data)
}

#' Fit a random forest classifier
#'
#' @param data Training data
#' @param formula Model formula
#' @param trees Number of trees
#' @param mtry Predictors sampled per split (NULL keeps the engine default)
#' @return A fitted workflow using the "ranger" engine with impurity importance
fit_rf_classifier <- function(data, formula, trees = 500, mtry = NULL) {
  forest_spec <- rand_forest(trees = trees, mtry = mtry)
  forest_spec <- set_engine(forest_spec, "ranger", importance = "impurity")
  forest_spec <- set_mode(forest_spec, "classification")

  wf <- workflow()
  wf <- add_formula(wf, formula)
  wf <- add_model(wf, forest_spec)
  fit(wf, data)
}

#' Fit a gradient boosted tree classifier
#'
#' @param data Training data
#' @param formula Model formula
#' @param trees Number of boosting rounds
#' @param tree_depth Maximum depth of each tree
#' @param learn_rate Learning rate
#' @return A fitted workflow using the "xgboost" engine
fit_xgb_classifier <- function(data, formula, trees = 100, tree_depth = 6,
                               learn_rate = 0.3) {
  boost_spec <- boost_tree(
    trees = trees,
    tree_depth = tree_depth,
    learn_rate = learn_rate
  )
  boost_spec <- set_engine(boost_spec, "xgboost")
  boost_spec <- set_mode(boost_spec, "classification")

  wf <- workflow()
  wf <- add_formula(wf, formula)
  wf <- add_model(wf, boost_spec)
  fit(wf, data)
}

# =====================
# Model Evaluation
# =====================

#' Score a fitted regression model on held-out data
#'
#' @param model Fitted workflow / model
#' @param test_data Held-out data including the outcome column
#' @param truth_col Name of the outcome column (string)
#' @return List with the predictions, the metric table, RMSE and R-squared
evaluate_regression <- function(model, test_data, truth_col) {
  scored <- bind_cols(predict(model, test_data), test_data)

  truth_sym <- sym(truth_col)
  metric_tbl <- metrics(scored, truth = !!truth_sym, estimate = .pred)

  # Pull a single metric's estimate out of the metric table
  estimate_of <- function(metric_name) {
    pull(filter(metric_tbl, .metric == metric_name), .estimate)
  }

  list(
    predictions = scored,
    metrics = metric_tbl,
    rmse = estimate_of("rmse"),
    r_squared = estimate_of("rsq")
  )
}

#' Evaluate a binary classification model
#'
#' @param model Fitted workflow / model
#' @param test_data Held-out data including the outcome column
#' @param truth_col Name of the outcome column (string); must be a two-level
#'   factor
#' @param event_level Which factor level is the event of interest: "second"
#'   (default, e.g. "1" for levels 0/1) or "first"
#' @return List with the predictions, accuracy, ROC AUC and confusion matrix
evaluate_classification <- function(model, test_data, truth_col,
                                    event_level = "second") {
  event_level <- match.arg(event_level, c("first", "second"))

  truth_levels <- levels(test_data[[truth_col]])
  if (length(truth_levels) != 2) {
    stop("evaluate_classification() expects a two-level factor outcome")
  }

  # The probability column must belong to the same level that roc_auc()
  # treats as the event; yardstick defaults to the FIRST level, so passing
  # the second level's probabilities without event_level inverts the AUC.
  event <- if (event_level == "first") truth_levels[1] else truth_levels[2]
  prob_sym <- sym(paste0(".pred_", event))
  truth_sym <- sym(truth_col)

  predictions <- predict(model, test_data, type = "prob") %>%
    bind_cols(predict(model, test_data)) %>%
    bind_cols(test_data)

  list(
    predictions = predictions,
    accuracy = predictions %>%
      accuracy(truth = !!truth_sym, estimate = .pred_class) %>%
      pull(.estimate),
    auc = predictions %>%
      roc_auc(truth = !!truth_sym, !!prob_sym, event_level = event_level) %>%
      pull(.estimate),
    confusion = predictions %>%
      conf_mat(truth = !!truth_sym, estimate = .pred_class)
  )
}

# =====================
# Cross-Validation
# =====================

#' Estimate regression performance with k-fold cross-validation
#'
#' @param data Data to resample
#' @param formula Model formula
#' @param model_spec parsnip model specification
#' @param folds Number of folds
#' @return Tibble of resampled RMSE, R-squared and MAE
cross_validate <- function(data, formula, model_spec, folds = 10) {
  resamples <- vfold_cv(data, v = folds)

  wf <- add_model(add_formula(workflow(), formula), model_spec)

  resampled_fits <- fit_resamples(
    wf,
    resamples = resamples,
    metrics = metric_set(rmse, rsq, mae)
  )

  collect_metrics(resampled_fits)
}

#' Grid-search hyperparameters with cross-validation
#'
#' @param data Data to resample
#' @param formula Model formula
#' @param model_spec parsnip model specification with tunable parameters
#' @param grid_size Number of candidate parameter sets
#' @param folds Number of folds
#' @return List with all tuning metrics and the best parameters by RMSE
tune_model <- function(data, formula, model_spec, grid_size = 20, folds = 5) {
  resamples <- vfold_cv(data, v = folds)

  wf <- add_model(add_formula(workflow(), formula), model_spec)

  tuning <- tune_grid(
    wf,
    resamples = resamples,
    grid = grid_size
  )

  all_metrics <- collect_metrics(tuning)
  best_params <- select_best(tuning, metric = "rmse")

  list(
    results = all_metrics,
    best = best_params
  )
}

# =====================
# Example: Win Prediction Model
# =====================

#' Example: train and evaluate a random forest win classifier
#'
#' @param team_data Data frame with win (0/1), points_for, points_against,
#'   turnovers, yards and time_of_possession columns
#' @return List with the fitted model, test accuracy, test AUC and variable
#'   importance
build_win_model <- function(team_data) {
  # Outcome as a factor plus the predictor columns
  with_outcome <- mutate(team_data, win = factor(win, levels = c(0, 1)))
  model_data <- select(
    with_outcome,
    win, points_for, points_against, turnovers, yards, time_of_possession
  )

  # Stratified 80/20 split
  splits <- initial_split(model_data, prop = 0.8, strata = win)
  train <- training(splits)
  test <- testing(splits)

  # Random forest specification
  rf_spec <- rand_forest(trees = 500)
  rf_spec <- set_engine(rf_spec, "ranger", importance = "impurity")
  rf_spec <- set_mode(rf_spec, "classification")

  wf <- add_model(add_formula(workflow(), win ~ .), rf_spec)
  model <- fit(wf, train)

  # Hold-out evaluation and variable importance
  evaluation <- evaluate_classification(model, test, "win")
  importance <- vip::vi(extract_fit_parsnip(model))

  list(
    model = model,
    accuracy = evaluation$accuracy,
    auc = evaluation$auc,
    feature_importance = importance
  )
}

# Confirmation message printed once the script has been sourced
print("Statistical modeling functions loaded")

r Baseball

Sports Visualization with ggplot2

Create professional sports visualizations using ggplot2.

# Professional sports visualizations with ggplot2
library(ggplot2)
library(dplyr)
library(scales)
library(patchwork)  # For combining plots

# =====================
# Theme Setup
# =====================

#' Shared ggplot2 theme for the sports charts
#'
#' @param base_size Base font size passed to theme_minimal()
#' @return A ggplot2 theme object
theme_sports <- function(base_size = 12) {
  # Overrides applied on top of theme_minimal()
  overrides <- theme(
    plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0),
    plot.subtitle = element_text(color = "gray40", size = rel(0.9)),
    plot.caption = element_text(color = "gray60", size = rel(0.7)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "gray90"),
    axis.title = element_text(face = "bold", size = rel(0.9)),
    legend.position = "bottom",
    legend.title = element_text(face = "bold", size = rel(0.8))
  )

  theme_minimal(base_size = base_size) + overrides
}

# =====================
# Scatter Plots
# =====================

#' Scatter plot split into four quadrants at the variable means
#'
#' @param data Data frame to plot
#' @param x_var,y_var Names of the numeric columns (strings)
#' @param label_var Name of the column used to label highlighted points
#' @param title Optional plot title
#' @param highlight_top Number of points to label, ranked by x * y
#' @return A ggplot object
plot_quadrant <- function(data, x_var, y_var, label_var,
                          title = NULL, highlight_top = 5) {
  x_mean <- mean(data[[x_var]], na.rm = TRUE)
  y_mean <- mean(data[[y_var]], na.rm = TRUE)

  quadrant_colors <- c(
    "Elite" = "#2ecc71", "Efficient" = "#3498db",
    "Volume" = "#f39c12", "Below Average" = "#95a5a6"
  )

  # Tag every row with its quadrant and whether it gets a label
  data <- mutate(
    data,
    quadrant = case_when(
      .data[[x_var]] >= x_mean & .data[[y_var]] >= y_mean ~ "Elite",
      .data[[x_var]] >= x_mean & .data[[y_var]] < y_mean ~ "Efficient",
      .data[[x_var]] < x_mean & .data[[y_var]] >= y_mean ~ "Volume",
      TRUE ~ "Below Average"
    ),
    highlight = rank(-.data[[x_var]] * .data[[y_var]]) <= highlight_top
  )
  labelled <- filter(data, highlight)

  base <- ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]]))

  base +
    geom_hline(yintercept = y_mean, linetype = "dashed", alpha = 0.5) +
    geom_vline(xintercept = x_mean, linetype = "dashed", alpha = 0.5) +
    geom_point(aes(color = quadrant), size = 3, alpha = 0.7) +
    geom_text(
      data = labelled,
      aes(label = .data[[label_var]]),
      vjust = -0.5, size = 3, check_overlap = TRUE
    ) +
    scale_color_manual(values = quadrant_colors) +
    labs(title = title, x = x_var, y = y_var, color = "Category") +
    theme_sports()
}

# =====================
# Bar Charts
# =====================

#' Create horizontal bar chart, optionally filled with team colors
#'
#' @param data Data frame with one row per team
#' @param stat_var Name of the numeric column to plot (string)
#' @param team_var Name of the team column (string)
#' @param title Optional plot title
#' @param team_colors Optional named vector mapping team -> fill color; when
#'   NULL all bars use a single default color
#' @return A ggplot object
plot_team_bars <- function(data, stat_var, team_var, title = NULL,
                          team_colors = NULL) {
  p <- ggplot(data, aes(x = reorder(.data[[team_var]], .data[[stat_var]]),
                        y = .data[[stat_var]]))

  # Draw the bars exactly once. Adding a second bar layer on top of the text
  # layer would hide the value labels.
  if (is.null(team_colors)) {
    p <- p + geom_col(fill = "#3498db", alpha = 0.8)
  } else {
    p <- p +
      geom_col(aes(fill = .data[[team_var]])) +
      scale_fill_manual(values = team_colors) +
      guides(fill = "none")
  }

  p +
    geom_text(aes(label = round(.data[[stat_var]], 1)),
              hjust = -0.1, size = 3) +
    coord_flip() +
    labs(title = title, x = "", y = stat_var) +
    theme_sports() +
    theme(panel.grid.major.y = element_blank())
}

# =====================
# Line Charts
# =====================

#' Line chart of a right-aligned rolling mean
#'
#' @param data Data frame to plot
#' @param date_var Name of the ordering / x-axis column (string)
#' @param value_var Name of the numeric column to average (string)
#' @param window Rolling window length in rows
#' @param title Optional plot title
#' @param group_var Optional grouping column (string); one line per group
#' @return A ggplot object
plot_rolling_avg <- function(data, date_var, value_var, window = 10,
                            title = NULL, group_var = NULL) {
  by_group <- !is.null(group_var)

  # Right-aligned rolling mean; positions without a full window are NA
  roll <- function(values) {
    zoo::rollmean(values, k = window, fill = NA, align = "right")
  }

  data <- arrange(data, .data[[date_var]])

  if (by_group) {
    data <- group_by(data, .data[[group_var]])
    data <- mutate(data, rolling_avg = roll(.data[[value_var]]))
    data <- ungroup(data)
  } else {
    data <- mutate(data, rolling_avg = roll(.data[[value_var]]))
  }

  line_layer <- if (by_group) {
    geom_line(aes(color = .data[[group_var]]), size = 1)
  } else {
    geom_line(color = "#3498db", size = 1)
  }

  ggplot(data, aes(x = .data[[date_var]], y = rolling_avg)) +
    line_layer +
    labs(
      title = title,
      subtitle = paste0(window, "-game rolling average"),
      x = "", y = value_var
    ) +
    theme_sports()
}

# =====================
# Distribution Plots
# =====================

#' Create density ridge plot
#'
#' @param data Data frame to plot
#' @param value_var Name of the numeric column (string)
#' @param group_var Name of the grouping column, one ridge per group (string)
#' @param title Optional plot title
#' @return A ggplot object
plot_density_ridges <- function(data, value_var, group_var, title = NULL) {
  # Use ggridges through its namespace instead of attaching it as a side
  # effect of calling this function.
  if (!requireNamespace("ggridges", quietly = TRUE)) {
    stop("Package 'ggridges' is required for plot_density_ridges()")
  }

  # after_stat() replaces the deprecated stat() helper
  ggplot(data, aes(x = .data[[value_var]], y = .data[[group_var]],
                   fill = after_stat(x))) +
    ggridges::geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
    scale_fill_viridis_c(option = "C") +
    labs(title = title, x = value_var, y = "") +
    theme_sports() +
    theme(legend.position = "none")
}

#' Horizontal box plots ordered by group median
#'
#' @param data Data frame to plot
#' @param value_var Name of the numeric column (string)
#' @param group_var Name of the grouping column (string)
#' @param title Optional plot title
#' @return A ggplot object; the white diamond marks each group's mean
plot_box_comparison <- function(data, value_var, group_var, title = NULL) {
  mapping <- aes(
    x = reorder(.data[[group_var]], .data[[value_var]], median),
    y = .data[[value_var]],
    fill = .data[[group_var]]
  )

  mean_marker <- stat_summary(fun = mean, geom = "point", shape = 23, size = 3,
                              fill = "white")

  ggplot(data, mapping) +
    geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
    mean_marker +
    labs(title = title, x = "", y = value_var) +
    theme_sports() +
    theme(legend.position = "none") +
    coord_flip()
}

# =====================
# Heat Maps
# =====================

#' Tile heat map with a diverging scale centered on the median
#'
#' @param data Data frame to plot
#' @param x_var,y_var Names of the columns defining the grid (strings)
#' @param fill_var Name of the numeric column mapped to color (string)
#' @param title Optional plot title
#' @return A ggplot object
plot_heatmap <- function(data, x_var, y_var, fill_var, title = NULL) {
  # Values below the median shade red, values above shade green
  center <- median(data[[fill_var]], na.rm = TRUE)
  fill_scale <- scale_fill_gradient2(low = "#e74c3c", mid = "white",
                                     high = "#2ecc71", midpoint = center)

  mapping <- aes(x = .data[[x_var]], y = .data[[y_var]],
                 fill = .data[[fill_var]])

  ggplot(data, mapping) +
    geom_tile(color = "white", size = 0.5) +
    fill_scale +
    labs(title = title, x = x_var, y = y_var, fill = fill_var) +
    theme_sports() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      panel.grid = element_blank()
    )
}

# =====================
# Composite Dashboards
# =====================

#' Create player comparison dashboard
#'
#' Combines a league-wide offense/defense quadrant plot with a bar chart of
#' the selected player's stat1-stat3 values.
#'
#' @param player_data Data frame with name, offense, defense, stat1, stat2
#'   and stat3 columns
#' @param player_name Name of the player to feature
#' @return A patchwork object
create_player_dashboard <- function(player_data, player_name) {
  # .env$ makes sure the function argument is used even if player_data has a
  # column called player_name
  player <- player_data %>% filter(.data$name == .env$player_name)

  # League context
  p1 <- plot_quadrant(player_data, "offense", "defense", "name",
                     title = "Offense vs Defense")

  # Selected player's stats in long form. The stat labels go into their own
  # "stat" column: pivot_longer()'s default names_to = "name" would collide
  # with the existing player name column.
  p2 <- player %>%
    select(name, stat1, stat2, stat3) %>%
    tidyr::pivot_longer(-name, names_to = "stat", values_to = "value") %>%
    ggplot(aes(x = stat, y = value, fill = stat)) +
    geom_col() +
    facet_wrap(~stat, scales = "free") +
    theme_sports()

  # Combine with patchwork
  p1 + p2 +
    plot_annotation(
      title = paste(player_name, "Performance Dashboard"),
      theme = theme_sports()
    )
}

# Confirmation message printed once the script has been sourced
print("Sports visualization functions loaded")

r Soccer

Soccer Analysis with worldfootballR

European soccer data analysis using worldfootballR package.

# Soccer analysis with worldfootballR
library(worldfootballR)
library(dplyr)
library(ggplot2)
library(tidyr)

# =====================
# Data Loading
# =====================

#' Get player stats from FBref
#'
#' Thin wrapper around worldfootballR::fb_league_stats() that always requests
#' player-level (not team-level) rows.
#'
#' @param country Country (e.g., "ENG" for England)
#' @param gender "M" or "F"
#' @param season_end_year End year of season
#' @param stat_type Type of stats
#' @return Data frame of player statistics as returned by fb_league_stats()
get_player_stats <- function(country, gender = "M", season_end_year, stat_type = "standard") {
  fb_league_stats(
    country = country,
    gender = gender,
    season_end_year = season_end_year,
    stat_type = stat_type,
    # fb_league_stats() needs to be told whether to return team or player rows
    team_or_player = "player"
  )
}

# Get Premier League player stats
# pl_standard <- get_player_stats("ENG", "M", 2024, "standard")
# pl_shooting <- get_player_stats("ENG", "M", 2024, "shooting")
# pl_passing <- get_player_stats("ENG", "M", 2024, "passing")

#' Fetch a season's match results from FBref
#'
#' @param country Country code (e.g., "ENG")
#' @param gender "M" or "F"
#' @param season_end_year End year of the season
#' @return Whatever worldfootballR::fb_match_results() returns for the request
get_match_results <- function(country, gender = "M", season_end_year) {
  results <- fb_match_results(country = country,
                              gender = gender,
                              season_end_year = season_end_year)
  results
}

# =====================
# Player Analysis
# =====================

#' Calculate player per 90 stats
#'
#' @param player_stats Data frame with Player, Squad, Pos, Mins_Per_90, Gls,
#'   Ast, `G+A`, xG, xA and `xG+xA` columns
#' @return One row per player with at least 10 full 90s played, per-90 rates
#'   and over/under-performance versus expected values, sorted by expected
#'   goals + assists per 90 (highest first)
calc_per_90 <- function(player_stats) {
  player_stats %>%
    filter(Mins_Per_90 >= 10) %>%  # At least 10 90s played
    mutate(
      Goals_per90 = Gls / Mins_Per_90,
      Assists_per90 = Ast / Mins_Per_90,
      GA_per90 = `G+A` / Mins_Per_90,
      xG_per90 = xG / Mins_Per_90,
      xA_per90 = xA / Mins_Per_90,
      xGA_per90 = `xG+xA` / Mins_Per_90,
      GminusxG = Gls - xG,  # Overperformance
      AminusxA = Ast - xA
    ) %>%
    # The sort column has to be part of the selection, otherwise arrange()
    # fails because xGA_per90 no longer exists.
    select(Player, Squad, Pos, Mins_Per_90, Goals_per90, Assists_per90,
           GA_per90, xG_per90, xA_per90, xGA_per90, GminusxG, AminusxA) %>%
    arrange(desc(xGA_per90))
}

#' Find similar players based on stats
#'
#' Standardizes every numeric column (except minutes / per-90 columns) and
#' ranks the other players by Euclidean distance to the target player.
#'
#' @param player_stats Data frame with a Player column and numeric stats
#' @param target_player Name of the player to compare against
#' @param n Maximum number of similar players to return
#' @return Tibble with Player and Similarity (1 = closest possible, 0 = the
#'   farthest of the returned players)
find_similar_players <- function(player_stats, target_player, n = 10) {
  # Find target player (first row if the name appears more than once)
  target_idx <- which(player_stats$Player == target_player)

  if (length(target_idx) == 0) {
    stop("Player not found")
  }
  target_idx <- target_idx[1]

  # Select numeric columns for comparison
  numeric_cols <- player_stats %>%
    select(where(is.numeric)) %>%
    select(-matches("Mins|90"))

  # Scale the data; constant columns become all NaN and would turn every
  # distance into NaN, so they are dropped
  scaled <- scale(numeric_cols)
  scaled <- scaled[, colSums(!is.na(scaled)) > 0, drop = FALSE]

  # Euclidean distance from every player to the target
  offsets <- sweep(scaled, 2, scaled[target_idx, ])
  distances <- sqrt(rowSums(offsets^2))
  names(distances) <- player_stats$Player

  # Remove the target by row index (not by assuming it sorts first) and keep
  # at most n of the remaining players
  similar <- head(sort(distances[-target_idx]), n)

  if (length(similar) == 0) {
    return(tibble(Player = character(), Similarity = numeric()))
  }

  tibble(
    Player = names(similar),
    Similarity = 1 - (similar / max(similar))
  )
}

# =====================
# Team Analysis
# =====================

#' Calculate team expected points
#'
#' Heuristic based on the xG difference d = Home_xG - Away_xG, modeled as a
#' normal variable with mean d and standard deviation 1.
#'
#' @param matches Data frame with Home_xG and Away_xG columns
#' @return `matches` with Home_xPts and Away_xPts added
calc_expected_points <- function(matches) {
  matches %>%
    mutate(
      Home_xPts = case_when(
        # Clear xG edge either way: 3 points times the probability that the
        # home side comes out ahead. For an underdog home side this
        # probability is small, so it must not be replaced by the away side's
        # probability.
        Home_xG > Away_xG + 0.5 ~ 3 * (1 - pnorm(0, Home_xG - Away_xG, 1)),
        Home_xG < Away_xG - 0.5 ~ 3 * (1 - pnorm(0, Home_xG - Away_xG, 1)),
        # Close game: one point for the draw plus a bonus from the density at 0
        TRUE ~ 1 + 2 * dnorm(0, Home_xG - Away_xG, 1)
      ),
      Away_xPts = 3 - Home_xPts  # Simplified
    )
}

#' Build a league table from match results
#'
#' Every match contributes two rows (one from the home side's point of view,
#' one from the away side's). Those rows are tallied per team into played,
#' won, drawn, lost, goals for and goals against; goal difference and points
#' (3 per win, 1 per draw) are then added.
#'
#' @param matches Data frame with `Home`, `Away`, `HomeGoals`, `AwayGoals`
#' @return One row per team ordered by points, goal difference, goals for
create_league_table <- function(matches) {
  # Same fixtures seen from each side, expressed as goals for / against
  as_home <- matches %>% select(Team = Home, GF = HomeGoals, GA = AwayGoals)
  as_away <- matches %>% select(Team = Away, GF = AwayGoals, GA = HomeGoals)

  tally_side <- function(side_rows) {
    side_rows %>%
      group_by(Team) %>%
      summarise(
        P = n(),
        W = sum(GF > GA),
        D = sum(GF == GA),
        L = sum(GF < GA),
        GF = sum(GF),
        GA = sum(GA),
        .groups = "drop"
      )
  }

  combined <- bind_rows(tally_side(as_home), tally_side(as_away))

  standings <- combined %>%
    group_by(Team) %>%
    summarise(across(everything(), sum))

  standings %>%
    mutate(
      GD = GF - GA,
      Pts = W * 3 + D
    ) %>%
    arrange(desc(Pts), desc(GD), desc(GF))
}

# =====================
# xG Analysis
# =====================

#' Draw the cumulative xG race for a single match
#'
#' Orders the shots by minute, accumulates xG separately for each team, and
#' plots one step line per team with a point per shot sized by that shot's xG.
#'
#' @param match_shots Data frame with `minute`, `team` and `xG` columns
#' @return A ggplot object
plot_xg_timeline <- function(match_shots) {
  running_totals <- match_shots %>%
    arrange(minute) %>%
    group_by(team) %>%
    mutate(cumulative_xG = cumsum(xG)) %>%
    ungroup()

  # Axis ticks every quarter of an hour of match time
  minute_ticks <- seq(0, 90, 15)

  timeline <- ggplot(
    running_totals,
    aes(x = minute, y = cumulative_xG, color = team)
  )

  timeline +
    geom_step(size = 1.2) +
    geom_point(aes(size = xG), alpha = 0.6) +
    scale_x_continuous(breaks = minute_ticks) +
    labs(
      title = "Match xG Timeline",
      x = "Minute",
      y = "Cumulative xG"
    ) +
    theme_minimal()
}

#' Calculate shot quality distribution
#'
#' Buckets each shot by its xG and reports, per team and bucket, the number
#' of shots, the number of goals and the conversion rate.
#'
#' @param shots Data frame with `team`, `xG` and `is_goal` columns
#' @return One row per team / xG bucket with `Shots`, `Goals`, `Conversion`
analyze_shot_quality <- function(shots) {
  shots %>%
    mutate(
      xG_bucket = cut(xG,
        breaks = c(0, 0.05, 0.1, 0.2, 0.4, 1),
        labels = c("Very Low", "Low", "Medium", "High", "Very High"),
        # Intervals are (lo, hi]; without this a shot with xG exactly 0
        # falls outside the first interval and lands in an NA bucket.
        include.lowest = TRUE
      )
    ) %>%
    group_by(team, xG_bucket) %>%
    summarise(
      Shots = n(),
      Goals = sum(is_goal, na.rm = TRUE),
      Conversion = Goals / Shots,
      .groups = "drop"
    )
}

# =====================
# Visualization
# =====================

#' Create player radar chart data
#'
#' Converts each requested metric to a 0-100 percentile rank across all rows
#' of `player_stats` and returns the selected player's percentiles in long
#' form. Drawing the radar itself needs a specialised package such as
#' ggradar; this function only prepares the values.
#'
#' @param player_stats Data frame with a `Player` column and the metrics
#' @param player_name Name to look up in `player_stats$Player`
#' @param metrics Character vector of column names to include
#' @return Tibble with one row per metric: `Metric` and `Value` (0-100)
create_radar_chart <- function(player_stats, player_name, metrics) {
  player_idx <- which(player_stats$Player == player_name)

  if (length(player_idx) == 0) {
    stop("Player not found")
  }
  if (length(player_idx) > 1) {
    # A multi-row selection cannot be flattened into one value per metric
    warning("Multiple rows match player_name; using the first")
    player_idx <- player_idx[1]
  }

  # Normalize metrics to 0-100 scale
  normalized <- player_stats %>%
    select(all_of(metrics)) %>%
    mutate(across(everything(), ~ percent_rank(.) * 100))

  tibble(
    Metric = metrics,
    Value = as.numeric(unlist(normalized[player_idx, ], use.names = FALSE))
  )
}

# Confirmation message printed once the definitions above have been evaluated
print("worldfootballR soccer analysis functions loaded")

r Football

NFL Analysis with nflfastR

Comprehensive NFL play-by-play analysis using the nflfastR package.

# NFL analysis with nflfastR
library(nflfastR)
library(dplyr)
library(ggplot2)
library(tidyr)

# =====================
# Loading Data
# =====================

#' Load regular-season NFL play-by-play data
#'
#' Calls `load_pbp()` for the requested seasons and keeps only the rows whose
#' `season_type` is "REG".
#'
#' @param seasons Vector of seasons to load
#' @return Play-by-play rows restricted to the regular season
load_pbp_data <- function(seasons) {
  all_plays <- load_pbp(seasons)

  # Regular season only
  filter(all_plays, season_type == "REG")
}

#' Load roster data
#'
#' Wrapper around `nflreadr::load_rosters()`.
#'
#' @param seasons Vector of seasons to load
#' @return Roster data as returned by `nflreadr::load_rosters()`
load_rosters <- function(seasons) {
  # The call must be namespace-qualified: this wrapper has the same name as
  # the function it delegates to, so an unqualified call would resolve to the
  # wrapper itself and recurse until the stack overflows.
  nflreadr::load_rosters(seasons)
}

# Load recent seasons
# pbp <- load_pbp_data(2020:2024)
# rosters <- load_rosters(2024)

# =====================
# EPA Analysis
# =====================

#' Summarise quarterback efficiency from play-by-play
#'
#' Uses every play that has both an `epa` value and a `passer_id`, aggregates
#' per passer, keeps passers with at least 200 such plays, and ranks them by
#' EPA per play.
#'
#' @param pbp Play-by-play data frame
#' @return One row per qualifying passer, best EPA per play first
calc_qb_epa <- function(pbp) {
  passer_plays <- filter(pbp, !is.na(epa), !is.na(passer_id))

  per_passer <- passer_plays %>%
    group_by(passer_id, passer) %>%
    summarise(
      Games = n_distinct(game_id),
      Dropbacks = n(),
      EPA_Total = sum(epa),
      EPA_per_Play = mean(epa),
      CPOE = mean(cpoe, na.rm = TRUE),
      Success_Rate = mean(success, na.rm = TRUE),
      Comp_Pct = mean(complete_pass, na.rm = TRUE),
      Air_EPA = mean(air_epa, na.rm = TRUE),
      YAC_EPA = mean(yac_epa, na.rm = TRUE),
      .groups = "drop"
    )

  # Volume threshold, then rank
  qualified <- filter(per_passer, Dropbacks >= 200)
  arrange(qualified, desc(EPA_per_Play))
}

#' Summarise rusher efficiency from play-by-play
#'
#' Uses rushing plays (`rush == 1`) that have an `epa` value and a
#' `rusher_id`, aggregates per rusher, keeps rushers with at least 100
#' carries, and ranks them by EPA per carry.
#'
#' @param pbp Play-by-play data frame
#' @return One row per qualifying rusher, best EPA per carry first
calc_rush_epa <- function(pbp) {
  rushing_plays <- filter(pbp, !is.na(epa), rush == 1, !is.na(rusher_id))

  per_rusher <- rushing_plays %>%
    group_by(rusher_id, rusher) %>%
    summarise(
      Carries = n(),
      Yards = sum(rushing_yards, na.rm = TRUE),
      TDs = sum(rush_touchdown, na.rm = TRUE),
      EPA_Total = sum(epa),
      EPA_per_Carry = mean(epa),
      Success_Rate = mean(success, na.rm = TRUE),
      YPC = mean(rushing_yards, na.rm = TRUE),
      .groups = "drop"
    )

  # Volume threshold, then rank
  qualified <- filter(per_rusher, Carries >= 100)
  arrange(qualified, desc(EPA_per_Carry))
}

#' Calculate receiver EPA
#'
#' Aggregates every pass play that names a receiver (caught or not), so
#' `Targets` counts all throws to the player and `EPA_per_Target` / `ADOT`
#' are true per-target figures. Receivers with fewer than 50 targets are
#' dropped and the rest are ranked by total EPA.
#'
#' @param pbp Play-by-play data frame
#' @return One row per qualifying receiver, highest total EPA first
calc_receiver_epa <- function(pbp) {
  pbp %>%
    # Do not restrict to completions here: that would make Targets equal to
    # Receptions and turn every "per target" figure into a per-catch figure.
    filter(!is.na(epa), !is.na(receiver_id)) %>%
    group_by(receiver_id, receiver) %>%
    summarise(
      Targets = n(),
      Receptions = sum(complete_pass, na.rm = TRUE),
      Yards = sum(receiving_yards, na.rm = TRUE),
      TDs = sum(pass_touchdown, na.rm = TRUE),
      EPA_Total = sum(epa),
      EPA_per_Target = mean(epa),
      YAC = sum(yards_after_catch, na.rm = TRUE),
      ADOT = mean(air_yards, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    filter(Targets >= 50) %>%
    arrange(desc(EPA_Total))
}

# =====================
# Team Analysis
# =====================

#' Summarise offensive efficiency per team
#'
#' Uses every play with an `epa` value and a possession team, and reports
#' play counts, pass rate, EPA totals/averages (overall, pass-only,
#' rush-only) and success rate for each `posteam`.
#'
#' @param pbp Play-by-play data frame
#' @return One row per offense, best EPA per play first
calc_team_offense <- function(pbp) {
  offensive_plays <- filter(pbp, !is.na(epa), !is.na(posteam))

  per_offense <- summarise(
    group_by(offensive_plays, posteam),
    Plays = n(),
    Pass_Plays = sum(pass, na.rm = TRUE),
    Rush_Plays = sum(rush, na.rm = TRUE),
    Pass_Rate = Pass_Plays / Plays,
    EPA_Total = sum(epa),
    EPA_per_Play = mean(epa),
    Pass_EPA = mean(epa[pass == 1], na.rm = TRUE),
    Rush_EPA = mean(epa[rush == 1], na.rm = TRUE),
    Success_Rate = mean(success, na.rm = TRUE),
    .groups = "drop"
  )

  arrange(per_offense, desc(EPA_per_Play))
}

#' Summarise defensive efficiency per team
#'
#' Uses every play with an `epa` value and a defending team, and reports the
#' EPA and success rate each `defteam` allowed (overall, against the pass,
#' against the run).
#'
#' @param pbp Play-by-play data frame
#' @return One row per defense, lowest EPA allowed per play first
calc_team_defense <- function(pbp) {
  defensive_plays <- filter(pbp, !is.na(epa), !is.na(defteam))

  per_defense <- summarise(
    group_by(defensive_plays, defteam),
    Plays = n(),
    EPA_Allowed = sum(epa),
    EPA_per_Play_Allowed = mean(epa),
    Pass_EPA_Allowed = mean(epa[pass == 1], na.rm = TRUE),
    Rush_EPA_Allowed = mean(epa[rush == 1], na.rm = TRUE),
    Success_Rate_Allowed = mean(success, na.rm = TRUE),
    .groups = "drop"
  )

  # Ascending: allowing less EPA is better
  arrange(per_defense, EPA_per_Play_Allowed)
}

# =====================
# Situational Analysis
# =====================

#' Break down offensive results by down and distance
#'
#' Keeps plays with an `epa` value and a `down`, optionally restricts them to
#' one possession team, labels each play with a yards-to-go bucket, and
#' reports play count, pass rate, EPA per play and success rate for every
#' down / bucket combination.
#'
#' @param pbp Play-by-play data frame
#' @param team Optional `posteam` value to restrict to (NULL = all teams)
#' @return One row per down and yards-to-go bucket
analyze_situations <- function(pbp, team = NULL) {
  plays <- filter(pbp, !is.na(epa), !is.na(down))

  if (!is.null(team)) {
    plays <- filter(plays, posteam == team)
  }

  bucketed <- mutate(
    plays,
    ydstogo_bucket = case_when(
      ydstogo <= 3 ~ "Short (1-3)",
      ydstogo <= 7 ~ "Medium (4-7)",
      ydstogo <= 10 ~ "Long (8-10)",
      TRUE ~ "Very Long (11+)"
    )
  )

  summarise(
    group_by(bucketed, down, ydstogo_bucket),
    Plays = n(),
    Pass_Rate = mean(pass, na.rm = TRUE),
    EPA_per_Play = mean(epa),
    Success_Rate = mean(success, na.rm = TRUE),
    .groups = "drop"
  )
}

# =====================
# Visualization
# =====================

#' Scatter offensive against defensive EPA per play for every team
#'
#' Joins the two summaries on team and plots offense on x against the
#' negated defensive EPA allowed on y, so up and to the right is good on both
#' axes. Dashed lines mark zero on each axis.
#'
#' @param offense_stats Data frame with `posteam` and `EPA_per_Play`
#' @param defense_stats Data frame with `defteam` and `EPA_per_Play_Allowed`
#' @return A ggplot object
plot_team_epa <- function(offense_stats, defense_stats) {
  offense_side <- select(offense_stats, team = posteam, Off_EPA = EPA_per_Play)
  defense_side <- select(defense_stats, team = defteam,
                         Def_EPA = EPA_per_Play_Allowed)

  combined <- left_join(offense_side, defense_side, by = "team")

  # Zero reference lines shared by both axes
  zero_h <- geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5)
  zero_v <- geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5)

  ggplot(combined, aes(x = Off_EPA, y = -Def_EPA)) +
    geom_point(size = 3) +
    geom_text(aes(label = team), vjust = -0.5, size = 3) +
    zero_h +
    zero_v +
    labs(
      title = "Team Efficiency",
      x = "Offensive EPA/Play",
      y = "Defensive EPA/Play (inverted)"
    ) +
    theme_minimal()
}

#' Histogram of a passer's air yards, split by completion
#'
#' Keeps the named passer's plays that have an `air_yards` value and draws
#' side-by-side 5-yard bins for incomplete (red) and complete (green) passes.
#'
#' @param pbp Play-by-play data frame
#' @param passer_name Value to match against the `passer` column
#' @return A ggplot object
plot_passing_chart <- function(pbp, passer_name) {
  passes <- filter(pbp, passer == passer_name, !is.na(air_yards))

  outcome_colours <- c("0" = "red", "1" = "green")
  outcome_labels <- c("Incomplete", "Complete")
  chart_title <- paste(passer_name, "Air Yards Distribution")

  ggplot(passes, aes(x = air_yards, fill = factor(complete_pass))) +
    geom_histogram(binwidth = 5, position = "dodge", alpha = 0.7) +
    scale_fill_manual(values = outcome_colours, labels = outcome_labels) +
    labs(title = chart_title, x = "Air Yards", y = "Count", fill = "") +
    theme_minimal()
}

# Confirmation message printed once the definitions above have been evaluated
print("nflfastR NFL analysis functions loaded")

r Basketball

Basketball Analysis with hoopR

NBA data analysis using the hoopR package for player and team statistics.

# NBA analysis with hoopR package
library(hoopR)
library(dplyr)
library(ggplot2)
library(tidyr)

# =====================
# Loading NBA Data
# =====================

#' Load NBA player box scores
#'
#' Thin wrapper: forwards `season` to `load_nba_player_box()` as `seasons`.
#'
#' @param season NBA season (e.g., 2024 for 2023-24)
#' @return Whatever `load_nba_player_box()` returns
get_player_box <- function(season) {
  player_box <- load_nba_player_box(seasons = season)
  player_box
}

#' Load NBA team box scores
#'
#' Thin wrapper: forwards `season` to `load_nba_team_box()` as `seasons`.
#'
#' @param season Season value passed through unchanged
#' @return Whatever `load_nba_team_box()` returns
get_team_box <- function(season) {
  team_box <- load_nba_team_box(seasons = season)
  team_box
}

#' Fetch play-by-play for a single game
#'
#' Thin wrapper: passes `game_id` straight to `espn_nba_pbp()`.
#'
#' @param game_id Game identifier passed through unchanged
#' @return Whatever `espn_nba_pbp()` returns
get_pbp <- function(game_id) {
  game_pbp <- espn_nba_pbp(game_id)
  game_pbp
}

# Load 2024 season data
# player_stats <- get_player_box(2024)
# team_stats <- get_team_box(2024)

# =====================
# Player Analysis
# =====================

#' Calculate advanced stats for players
#'
#' Aggregates game-level box scores per player into per-game averages and
#' season shooting percentages (FG%, 3P%, FT%, true shooting), keeps players
#' with at least 20 rows, and ranks them by points per game.
#'
#' @param player_box Player box score data frame, one row per player-game
#' @return One row per qualifying player, highest PPG first
calc_advanced_stats <- function(player_box) {
  player_box %>%
    group_by(athlete_id, athlete_display_name) %>%
    summarise(
      Games = n(),
      MPG = mean(minutes, na.rm = TRUE),
      PPG = mean(points, na.rm = TRUE),
      RPG = mean(rebounds, na.rm = TRUE),
      APG = mean(assists, na.rm = TRUE),
      # The sums skip missing rows just like the means above; otherwise a
      # single NA game would turn the player's season percentages into NA.
      FG_Pct = sum(field_goals_made, na.rm = TRUE) /
        sum(field_goals_attempted, na.rm = TRUE),
      Three_Pct = sum(three_point_field_goals_made, na.rm = TRUE) /
        sum(three_point_field_goals_attempted, na.rm = TRUE),
      FT_Pct = sum(free_throws_made, na.rm = TRUE) /
        sum(free_throws_attempted, na.rm = TRUE),
      # True shooting: points per two "true" shot attempts (FGA + 0.44 * FTA)
      TS_Pct = sum(points, na.rm = TRUE) /
        (2 * (sum(field_goals_attempted, na.rm = TRUE) +
                0.44 * sum(free_throws_attempted, na.rm = TRUE))),
      .groups = "drop"
    ) %>%
    filter(Games >= 20) %>%
    arrange(desc(PPG))
}

#' Calculate usage rate
#'
#' Adds a per-game usage percentage to each player row:
#' `100 * (FGA + 0.44 * FTA + TOV) * (team minutes / 5) /
#'  (minutes * (team FGA + 0.44 * team FTA + team TOV))`.
#'
#' @param player_box Player box scores with `game_id`, `team_id`, `minutes`,
#'   `field_goals_attempted`, `free_throws_attempted`, `turnovers`
#' @param team_box Team box scores with `game_id`, `team_id`,
#'   `field_goals_attempted`, `free_throws_attempted`, `turnovers`
#' @return `player_box` with the team totals, `team_minutes` and `USG` added;
#'   `USG` is NA for rows with missing or zero minutes
calc_usage <- function(player_box, team_box) {
  team_totals <- team_box %>%
    select(game_id, team_id, team_fga = field_goals_attempted,
           team_fta = free_throws_attempted, team_tov = turnovers)

  # The formula needs total team minutes, which the team totals above do not
  # carry. Unless the caller already supplies a team_minutes column, derive
  # it by summing the minutes of every player on that team in that game.
  if (!"team_minutes" %in% names(player_box)) {
    player_box <- player_box %>%
      group_by(game_id, team_id) %>%
      mutate(team_minutes = sum(minutes, na.rm = TRUE)) %>%
      ungroup()
  }

  player_box %>%
    left_join(team_totals, by = c("game_id", "team_id")) %>%
    mutate(
      USG = ifelse(
        # Players who did not play would otherwise divide by zero
        !is.na(minutes) & minutes > 0,
        100 * ((field_goals_attempted + 0.44 * free_throws_attempted + turnovers) *
                 (team_minutes / 5)) /
          (minutes * (team_fga + 0.44 * team_fta + team_tov)),
        NA_real_
      )
    )
}

# =====================
# Team Analysis
# =====================

#' Summarise scoring margin and pace per team
#'
#' Per team: games, wins, average points for and against, their difference
#' (`Net_Rating`), and an average of the per-game pace estimate
#' `(FGA + 0.44 * FTA - ORB + TOV) * 48 / minutes`. Win percentage is added
#' afterwards and teams are ranked by `Net_Rating`.
#'
#' @param team_box Team box score data frame, one row per team-game
#' @return One row per team, best net rating first
calc_team_efficiency <- function(team_box) {
  by_team <- group_by(team_box, team_id, team_display_name)

  per_team <- summarise(
    by_team,
    Games = n(),
    Wins = sum(team_winner, na.rm = TRUE),
    PPG = mean(team_score, na.rm = TRUE),
    OppPPG = mean(opponent_team_score, na.rm = TRUE),
    Net_Rating = PPG - OppPPG,
    Pace = mean((field_goals_attempted + 0.44 * free_throws_attempted -
                 offensive_rebounds + turnovers) * 48 / minutes, na.rm = TRUE),
    .groups = "drop"
  )

  with_win_pct <- mutate(per_team, Win_Pct = Wins / Games)
  arrange(with_win_pct, desc(Net_Rating))
}

#' Compute the Four Factors for every team
#'
#' Season-level ratios built from summed box score columns:
#' effective FG%, turnover rate, offensive rebound rate and free throw rate.
#'
#' @param team_box Team box score data frame, one row per team-game
#' @return One row per `team_display_name` with `eFG`, `TOV_Rate`,
#'   `ORB_Rate`, `FT_Rate`
calc_four_factors <- function(team_box) {
  by_team <- group_by(team_box, team_display_name)

  summarise(
    by_team,
    # Effective FG%: made threes are worth an extra half field goal
    eFG = (sum(field_goals_made) + 0.5 * sum(three_point_field_goals_made)) /
          sum(field_goals_attempted),
    # Turnover rate: turnovers per estimated possession-ending event
    TOV_Rate = sum(turnovers) / (sum(field_goals_attempted) +
                                  0.44 * sum(free_throws_attempted) + sum(turnovers)),
    # Offensive rebound rate: share of available offensive boards collected
    ORB_Rate = sum(offensive_rebounds) /
               (sum(offensive_rebounds) + sum(opponent_defensive_rebounds)),
    # Free throw rate: made free throws per field goal attempt
    FT_Rate = sum(free_throws_made) / sum(field_goals_attempted),
    .groups = "drop"
  )
}

# =====================
# Shot Chart
# =====================

#' Plot shot locations from play-by-play
#'
#' Keeps shooting plays that have both coordinates, optionally restricts
#' them to plays whose `text` matches `player_name` (case-insensitive
#' pattern match), and plots each shot coloured by `scoring_play`.
#'
#' @param pbp_data Play-by-play data frame
#' @param player_name Optional pattern to match in `text` (NULL = all shots)
#' @return A ggplot object
create_shot_chart <- function(pbp_data, player_name = NULL) {
  shots <- pbp_data %>%
    filter(shooting_play == TRUE) %>%
    filter(!is.na(coordinate_x), !is.na(coordinate_y))

  if (!is.null(player_name)) {
    shots <- filter(shots, grepl(player_name, text, ignore.case = TRUE))
  }

  made_colours <- c("TRUE" = "green", "FALSE" = "red")

  # Court outline would be added as an extra layer before the points
  ggplot(shots, aes(x = coordinate_x, y = coordinate_y)) +
    geom_point(aes(color = scoring_play), alpha = 0.6, size = 2) +
    scale_color_manual(values = made_colours) +
    coord_fixed() +
    labs(title = "Shot Chart", color = "Made") +
    theme_minimal()
}

# =====================
# Win Probability
# =====================

#' Calculate win probability from game state
#'
#' Simplified logistic model (a real model would be trained on historical
#' data): the lead, plus 2 points per unit of `possession`, is divided by the
#' square root of the minutes remaining and passed through a logistic curve
#' with slope 0.15. All arguments are vectorised.
#'
#' @param score_diff Score margin from the perspective being evaluated
#' @param time_remaining_sec Seconds left; negative values are treated as 0
#' @param possession Possession indicator added as `possession * 2` points
#' @return Probability in [0, 1]; exactly 0.5 for a tie with no time left
calc_win_prob <- function(score_diff, time_remaining_sec, possession = 0) {
  lead <- score_diff + possession * 2

  # Clamp at zero so a clock that has run past the end cannot reach sqrt()
  # as a negative number.
  minutes_left <- pmax(time_remaining_sec, 0) / 60

  z <- lead / sqrt(minutes_left)
  # With no time left a non-zero lead gives +/-Inf (probability 1 or 0),
  # but a tie gives 0 / 0 = NaN; treat that as a coin flip.
  z[is.nan(z)] <- 0

  1 / (1 + exp(-0.15 * z))
}

#' Add win probability to play-by-play
#'
#' Adds the home score margin, the seconds remaining in the game and the
#' resulting home win probability from `calc_win_prob()`.
#'
#' @param pbp_data Play-by-play data frame with `home_score`, `away_score`,
#'   `period`, `clock_minutes` and `clock_seconds`
#' @return `pbp_data` with `score_diff`, `time_remaining`, `home_win_prob`
add_win_prob <- function(pbp_data) {
  pbp_data %>%
    mutate(
      score_diff = home_score - away_score,
      # Full 12-minute quarters still to come, plus the current period clock.
      # pmax() keeps overtime periods (period > 4) from contributing negative
      # time: in overtime only the current clock remains.
      time_remaining = pmax(4 - period, 0) * 720 + clock_minutes * 60 + clock_seconds,
      home_win_prob = calc_win_prob(score_diff, time_remaining)
    )
}

# Confirmation message printed once the definitions above have been evaluated
print("hoopR NBA analysis functions loaded")

r Baseball

Baseball Stats with baseballr

Fetch and analyze baseball statistics using the baseballr package.

# Baseball analysis with baseballr package
library(baseballr)
library(dplyr)
library(ggplot2)

# =====================
# Fetching Player Stats
# =====================

#' Fetch a season's batting leaderboard
#'
#' Calls `fg_batter_leaders()` with `year` as both the first and second
#' argument, keeps a fixed set of columns (renaming `wRC+` to `wRC_plus`),
#' and sorts by WAR.
#'
#' @param year Season year
#' @param qual Minimum PA qualifier (default 100)
#' @return Data frame of batters, highest WAR first
get_batting_stats <- function(year, qual = 100) {
  leaders <- fg_batter_leaders(year, year, qual = qual)

  trimmed <- select(
    leaders,
    Name, Team, G, PA, AB, H, HR, RBI, BB, SO,
    AVG, OBP, SLG, wOBA, wRC_plus = `wRC+`, WAR
  )

  arrange(trimmed, desc(WAR))
}

# Get 2024 batting leaders
# NOTE: unlike the commented-out examples elsewhere in this script, these two
# statements are live, so sourcing the file calls get_batting_stats() (and
# through it fg_batter_leaders()) immediately. Comment them out if only the
# function definitions are wanted.
batting_2024 <- get_batting_stats(2024)
print(head(batting_2024, 20))

# =====================
# Statcast Data
# =====================

#' Fetch Statcast data for a date range
#'
#' Thin wrapper around `statcast_search()` with `player_type` fixed to
#' "batter".
#'
#' @param start_date First date of the range, forwarded as `start_date`
#' @param end_date Last date of the range, forwarded as `end_date`
#' @return Whatever `statcast_search()` returns
get_statcast <- function(start_date, end_date) {
  pitches <- statcast_search(
    start_date = start_date, end_date = end_date, player_type = "batter"
  )
  pitches
}

# Get recent Statcast data
# statcast_data <- get_statcast("2024-06-01", "2024-06-07")

#' Summarise batted-ball quality
#'
#' Uses only rows with a recorded `launch_speed` and returns a single-row
#' summary: batted-ball count, average and maximum exit velocity, average
#' launch angle, and the percentage of barrels, hard-hit balls (95+ exit
#' velocity) and sweet-spot contact (launch angle from 8 to 32).
#'
#' @param player_data Statcast rows for the player(s) of interest
#' @return One-row data frame of batted-ball metrics
calc_statcast_metrics <- function(player_data) {
  batted_balls <- filter(player_data, !is.na(launch_speed))

  summarise(
    batted_balls,
    Batted_Balls = n(),
    Avg_EV = mean(launch_speed, na.rm = TRUE),
    Max_EV = max(launch_speed, na.rm = TRUE),
    Avg_LA = mean(launch_angle, na.rm = TRUE),
    Barrel_Pct = mean(barrel == 1, na.rm = TRUE) * 100,
    HardHit_Pct = mean(launch_speed >= 95, na.rm = TRUE) * 100,
    Sweet_Spot_Pct = mean(launch_angle >= 8 & launch_angle <= 32, na.rm = TRUE) * 100
  )
}

# =====================
# Pitching Analysis
# =====================

#' Fetch a season's pitching leaderboard
#'
#' Calls `fg_pitcher_leaders()` with `year` as both the first and second
#' argument, keeps a fixed set of columns (renaming `K/9` and `BB/9`), and
#' sorts by WAR.
#'
#' @param year Season year
#' @param qual Qualifier forwarded as `qual` (default 50)
#' @return Data frame of pitchers, highest WAR first
get_pitching_stats <- function(year, qual = 50) {
  leaders <- fg_pitcher_leaders(year, year, qual = qual)

  trimmed <- select(
    leaders,
    Name, Team, G, GS, IP, W, L, ERA, WHIP, K9 = `K/9`,
    BB9 = `BB/9`, FIP, xFIP, WAR
  )

  arrange(trimmed, desc(WAR))
}

#' Summarise pitch usage for one player
#'
#' Keeps rows whose `player_name` equals `pitcher_name`, then reports per
#' `pitch_type` the pitch count, average velocity, average spin rate, the
#' percentage of pitches described as a swinging strike, and the share of
#' all kept pitches.
#'
#' @param statcast_data Statcast pitch-level data frame
#' @param pitcher_name Value to match against `player_name`
#' @return One row per pitch type, most used first
calc_pitch_mix <- function(statcast_data, pitcher_name) {
  whiff_descriptions <- c("swinging_strike", "swinging_strike_blocked")

  thrown <- filter(statcast_data, player_name == pitcher_name)

  per_pitch <- summarise(
    group_by(thrown, pitch_type),
    Count = n(),
    Avg_Velo = mean(release_speed, na.rm = TRUE),
    Avg_Spin = mean(release_spin_rate, na.rm = TRUE),
    Whiff_Pct = mean(description %in% whiff_descriptions, na.rm = TRUE) * 100
  )

  with_usage <- mutate(per_pitch, Usage_Pct = Count / sum(Count) * 100)
  arrange(with_usage, desc(Usage_Pct))
}

# =====================
# Visualization
# =====================

#' Plot batted-ball locations
#'
#' Optionally restricts the data to one `player_name`, then plots each
#' batted ball at (`hc_x - 125`, `200 - hc_y`) coloured by its `events`
#' value.
#'
#' @param batted_balls Data frame with `hc_x`, `hc_y`, `events` and, when
#'   filtering, `player_name`
#' @param player_name Optional player to restrict to (NULL = everyone)
#' @return A ggplot object
create_spray_chart <- function(batted_balls, player_name = NULL) {
  if (!is.null(player_name)) {
    # `!!` forces the function argument; the bare name is the data column
    batted_balls <- filter(batted_balls, player_name == !!player_name)
  }

  outcome_colours <- c(
    "single" = "blue", "double" = "green",
    "triple" = "orange", "home_run" = "red",
    "field_out" = "gray"
  )
  chart_title <- paste(player_name, "Spray Chart")

  ggplot(batted_balls, aes(x = hc_x - 125, y = 200 - hc_y)) +
    geom_point(aes(color = events), alpha = 0.6, size = 2) +
    scale_color_manual(values = outcome_colours) +
    coord_fixed() +
    labs(title = chart_title, x = "Horizontal Position", y = "Distance") +
    theme_minimal()
}

#' Create pitch movement plot
#'
#' Scatter of horizontal against vertical movement (`pfx_x`, `pfx_z`, each
#' multiplied by 12 for the inch-labelled axes), coloured by pitch type, with
#' dashed reference lines through zero on both axes.
#'
#' @param pitches Data frame with `pfx_x`, `pfx_z` and `pitch_type`
#' @return A ggplot object
plot_pitch_movement <- function(pitches) {
  ggplot(pitches, aes(x = pfx_x * 12, y = pfx_z * 12, color = pitch_type)) +
    geom_point(alpha = 0.5, size = 2) +
    geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
    # ggplot2's parameter is `linetype`; any other spelling is ignored with a
    # warning and the line is drawn solid.
    geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
    labs(title = "Pitch Movement Profile",
         x = "Horizontal Movement (inches)",
         y = "Vertical Movement (inches)") +
    theme_minimal() +
    coord_fixed()
}

# Example: WAR vs Salary analysis
# salary_data <- chadwick_player_lu() %>%
#   left_join(batting_2024, by = c("name_last", "name_first"))
# ggplot(salary_data, aes(x = WAR, y = salary/1e6)) +
#   geom_point() + geom_smooth(method = "lm") +
#   labs(x = "WAR", y = "Salary (Millions)")

r Hockey

Hockey Analysis with hockeyR

NHL data analysis using hockeyR and related packages.

# NHL Hockey Analysis in R
library(hockeyR)  # Or fastRhockey
library(dplyr)
library(ggplot2)
library(tidyr)

# =====================
# Data Loading
# =====================

#' Load NHL play-by-play for a season
#'
#' Thin wrapper: passes `season` straight to `load_pbp()`.
#'
#' @param season Season in YYYYYYYY format (e.g., 20232024)
#' @return Whatever `load_pbp()` returns
load_nhl_pbp <- function(season) {
  season_pbp <- load_pbp(season)
  season_pbp
}

#' Load skater statistics for a season
#'
#' Thin wrapper: passes `season` straight to `get_skater_stats()` (NHL API
#' or other sources).
#'
#' @param season Season value passed through unchanged
#' @return Whatever `get_skater_stats()` returns
load_player_stats <- function(season) {
  skaters <- get_skater_stats(season)
  skaters
}

# =====================
# Shot Analysis
# =====================

#' Count shot attempts per player
#'
#' Keeps SHOT / GOAL / MISS / BLOCK events and reports, per
#' `event_player_1_name` and `event_team`, how many of each occurred plus a
#' shooting percentage of goals over shots-plus-goals.
#'
#' @param pbp Play-by-play data frame with an `event_type` column
#' @return One row per player / team combination
calc_shot_metrics <- function(pbp) {
  attempt_types <- c("SHOT", "GOAL", "MISS", "BLOCK")
  attempts <- filter(pbp, event_type %in% attempt_types)

  summarise(
    group_by(attempts, event_player_1_name, event_team),
    Shots = sum(event_type == "SHOT"),
    Goals = sum(event_type == "GOAL"),
    Missed = sum(event_type == "MISS"),
    Blocked = sum(event_type == "BLOCK"),
    Sh_Pct = Goals / (Shots + Goals),
    .groups = "drop"
  )
}

#' Calculate expected goals (simplified model)
#'
#' Distance-bucket xG with an angle bonus: shots get a base value from their
#' distance to the net and up to +45% for shots taken from straight in front
#' (the bonus shrinks to nothing at 45 degrees off-centre and beyond).
#'
#' Assumes coordinates are oriented so the attacked net sits at x = 89 on the
#' centre line (y = 0), which is what the angle formula encodes
#' (confirm against the data source).
#'
#' @param shots Data frame with `x_fixed`, `y_fixed`, `event_type` and a
#'   logical `penalty_shot` column
#' @return `shots` with `distance`, `angle` (degrees) and `xG` added
calc_xg <- function(shots) {
  shots %>%
    mutate(
      # Distance to the net, using the same net position as the angle below.
      # Measuring from the origin (centre ice) instead would rate shots from
      # the crease as the lowest-quality chances.
      distance = sqrt((89 - x_fixed)^2 + y_fixed^2),
      angle = atan2(abs(y_fixed), 89 - x_fixed) * 180 / pi,
      xG = case_when(
        # NOTE(review): this rule only fires for penalty shots that scored,
        # so it depends on the outcome; confirm whether all penalty shots
        # were meant.
        event_type == "GOAL" & penalty_shot ~ 0.33,  # Penalty shot
        distance < 10 ~ 0.20,
        distance < 20 ~ 0.12,
        distance < 30 ~ 0.06,
        distance < 40 ~ 0.03,
        TRUE ~ 0.02
      ) * (1 + 0.01 * pmax(0, 45 - angle))  # Angle adjustment
    )
}

# =====================
# Corsi/Fenwick
# =====================

#' Team shot-attempt (Corsi) totals
#'
#' Counts SHOT / GOAL / MISS / BLOCK events for each team (`CF`) and against
#' each team (`CA`, the same events credited to the other side of the
#' matchup), then adds the differential and the percentage share.
#'
#' @param pbp Play-by-play data frame with `event_type`, `event_team`,
#'   `home_team` and `away_team`
#' @return One row per team, highest Corsi percentage first
calc_team_corsi <- function(pbp) {
  attempt_types <- c("SHOT", "GOAL", "MISS", "BLOCK")
  shot_events <- filter(pbp, event_type %in% attempt_types)

  # Attempts taken by each team
  corsi_for <- shot_events %>%
    group_by(event_team) %>%
    summarise(CF = n())

  # Attempts conceded: re-key every event to the opponent of the shooting team
  corsi_against <- shot_events %>%
    group_by(event_team = ifelse(event_team == home_team, away_team, home_team)) %>%
    summarise(CA = n())

  joined <- left_join(corsi_for, corsi_against, by = "event_team")

  joined %>%
    mutate(
      Corsi_Diff = CF - CA,
      Corsi_Pct = CF / (CF + CA) * 100
    ) %>%
    arrange(desc(Corsi_Pct))
}

#' Individual shot-attempt counts per player
#'
#' Simplified, event-only version: counts the SHOT / GOAL / MISS / BLOCK
#' events credited to each `event_player_1_name`. A full on-ice Corsi would
#' need to know which players were on the ice for every attempt.
#'
#' @param pbp Play-by-play data frame
#' @param min_toi Accepted for interface compatibility; not used by this
#'   simplified version
#' @return One row per player with `iCF` (individual Corsi for)
calc_player_corsi <- function(pbp, min_toi = 200) {
  attempt_types <- c("SHOT", "GOAL", "MISS", "BLOCK")
  attempts <- filter(pbp, event_type %in% attempt_types)

  summarise(
    group_by(attempts, event_player_1_name),
    iCF = n(),  # Individual Corsi For
    .groups = "drop"
  )
}

# =====================
# Game State Analysis
# =====================

#' Shooting results split by manpower situation
#'
#' Keeps SHOT and GOAL events, translates `strength_code` into a readable
#' label (anything other than EV / PP / SH becomes "Other"), and reports
#' shots, goals and shooting percentage per team and situation.
#'
#' @param pbp Play-by-play data frame with `event_type`, `event_team` and
#'   `strength_code`
#' @return One row per team / strength state
analyze_by_strength <- function(pbp) {
  on_net <- filter(pbp, event_type %in% c("SHOT", "GOAL"))

  labelled <- mutate(
    on_net,
    strength_state = case_when(
      strength_code == "EV" ~ "Even Strength",
      strength_code == "PP" ~ "Power Play",
      strength_code == "SH" ~ "Shorthanded",
      TRUE ~ "Other"
    )
  )

  summarise(
    group_by(labelled, event_team, strength_state),
    Shots = sum(event_type == "SHOT"),
    Goals = sum(event_type == "GOAL"),
    Sh_Pct = Goals / (Shots + Goals) * 100,
    .groups = "drop"
  )
}

#' Power-play and penalty-kill totals per team
#'
#' Looks only at events whose `strength_code` is "PP". Grouped by the event
#' team they give power-play goals and shots; grouped by the other side of
#' the matchup they give goals and shots against on the penalty kill.
#'
#' @param pbp Play-by-play data frame with `strength_code`, `event_type`,
#'   `event_team`, `home_team` and `away_team`
#' @return One row per team with `PP_Goals`, `PP_Shots`, `GA_on_PK`,
#'   `SA_on_PK`
calc_special_teams <- function(pbp) {
  pp_events <- filter(pbp, strength_code == "PP")
  on_net <- c("SHOT", "GOAL")

  # Power play: credited to the team that generated the event
  pp_data <- summarise(
    group_by(pp_events, event_team),
    PP_Goals = sum(event_type == "GOAL"),
    PP_Shots = sum(event_type %in% on_net),
    .groups = "drop"
  )

  # Penalty kill: the same events, charged to the opposing team
  pk_data <- summarise(
    group_by(
      pp_events,
      defending_team = ifelse(event_team == home_team, away_team, home_team)
    ),
    GA_on_PK = sum(event_type == "GOAL"),
    SA_on_PK = sum(event_type %in% on_net),
    .groups = "drop"
  )

  left_join(pp_data, pk_data, by = c("event_team" = "defending_team"))
}

# =====================
# Visualization
# =====================

#' Plot shot and goal locations on rink coordinates
#'
#' Keeps SHOT and GOAL events, optionally for a single team, and plots them
#' at (`x_fixed`, `y_fixed`) with goals drawn in red and larger than shots.
#'
#' @param shots Data frame with `event_type`, `event_team`, `x_fixed`,
#'   `y_fixed`
#' @param team Optional `event_team` value to restrict to (NULL = all)
#' @return A ggplot object
plot_shots <- function(shots, team = NULL) {
  plot_data <- filter(shots, event_type %in% c("SHOT", "GOAL"))

  if (!is.null(team)) {
    plot_data <- filter(plot_data, event_team == team)
  }

  event_colours <- c("SHOT" = "blue", "GOAL" = "red")

  # Rink markings would be added as extra layers before the points
  ggplot(plot_data, aes(x = x_fixed, y = y_fixed)) +
    geom_point(
      aes(color = event_type, size = ifelse(event_type == "GOAL", 3, 1)),
      alpha = 0.6
    ) +
    scale_color_manual(values = event_colours) +
    coord_fixed(xlim = c(-100, 100), ylim = c(-42.5, 42.5)) +
    labs(title = "Shot Location Plot", x = "", y = "") +
    theme_minimal() +
    theme(legend.position = "bottom")
}

#' Create win probability chart
#'
#' Plots a simple home win probability for one game, play by play. The
#' probability is `pnorm(score_diff, sd = sqrt(time_remaining / 10))`, i.e.
#' a lead matters more as less time remains.
#'
#' @param pbp Play-by-play data frame with `game_id`, `period`, `time`,
#'   `home_score` and `away_score`
#' @param game_id Game to plot
#' @return A ggplot object
plot_win_probability <- function(pbp, game_id) {
  game_data <- pbp %>%
    # `!!` forces the function argument; the bare name is the data column
    filter(game_id == !!game_id) %>%
    arrange(period, time)

  # Calculate simple win probability based on score and time
  game_data <- game_data %>%
    mutate(
      # Number the plays here: row_number() only works inside a dplyr verb,
      # so it cannot be called from within aes().
      play_number = row_number(),
      score_diff = home_score - away_score,
      # Minutes left in regulation, floored at one second so overtime rows
      # (period > 3) do not produce a negative value and a NaN probability.
      time_remaining = pmax((3 - period) * 20 + time / 60, 1 / 60),
      home_wp = pnorm(score_diff, sd = sqrt(time_remaining / 10))
    )

  ggplot(game_data, aes(x = play_number, y = home_wp)) +
    geom_line(color = "blue", size = 1) +
    geom_hline(yintercept = 0.5, linetype = "dashed", alpha = 0.5) +
    scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
    labs(
      title = "Win Probability",
      x = "Play Number",
      y = "Home Team Win Probability"
    ) +
    theme_minimal()
}

# Confirmation message printed once the definitions above have been evaluated
print("NHL hockey analysis functions loaded")

python Golf

Strokes Gained Analysis

Calculate strokes gained statistics for golf performance analysis.

"""Golf Strokes Gained Analysis."""
import pandas as pd
import numpy as np

class StrokesGained:
    """
    Calculate Strokes Gained statistics.

    Strokes Gained measures performance relative to baseline
    expected strokes from each position. Every method is a classmethod;
    the class holds only the baseline lookup tables.
    """

    # Baseline expected strokes to hole out from various distances
    # Based on PGA Tour averages
    BASELINE_TEE = {
        # Distance in yards: Expected strokes
        100: 2.92, 125: 2.99, 150: 3.05, 175: 3.12, 200: 3.18,
        225: 3.25, 250: 3.33, 275: 3.42, 300: 3.51, 325: 3.61,
        350: 3.71, 375: 3.82, 400: 3.94, 425: 4.06, 450: 4.18,
        475: 4.31, 500: 4.44, 525: 4.58, 550: 4.73
    }

    BASELINE_FAIRWAY = {
        # Distance in yards: Expected strokes
        25: 2.40, 50: 2.65, 75: 2.77, 100: 2.87, 125: 2.96,
        150: 3.04, 175: 3.12, 200: 3.21, 225: 3.31, 250: 3.42,
        275: 3.55, 300: 3.70
    }

    BASELINE_ROUGH = {
        # Typically 0.1-0.2 strokes worse than fairway
        25: 2.50, 50: 2.75, 75: 2.90, 100: 3.00, 125: 3.10,
        150: 3.20, 175: 3.30, 200: 3.42, 225: 3.55, 250: 3.70
    }

    BASELINE_SAND = {
        # Greenside bunkers
        5: 2.40, 10: 2.50, 15: 2.60, 20: 2.70, 25: 2.80,
        30: 2.90, 40: 3.10, 50: 3.30
    }

    BASELINE_GREEN = {
        # Distance in feet: Expected putts
        2: 1.01, 3: 1.05, 4: 1.12, 5: 1.18, 6: 1.25,
        7: 1.32, 8: 1.39, 9: 1.46, 10: 1.53, 12: 1.61,
        15: 1.70, 20: 1.80, 25: 1.87, 30: 1.93, 35: 1.98,
        40: 2.02, 45: 2.06, 50: 2.09, 60: 2.15, 70: 2.20,
        80: 2.24, 90: 2.27, 100: 2.30
    }

    # Shots that start off the tee and off the green, at or inside this many
    # yards, are classed as "around the green"; longer ones are approaches.
    AROUND_GREEN_YARDS = 30

    @classmethod
    def _interpolate_baseline(cls, distance: float, baseline: dict) -> float:
        """Interpolate baseline strokes for a given distance.

        Distances outside the table are clamped to the nearest end; distances
        inside it are linearly interpolated between the two neighbouring
        entries. A distance that matches a table entry returns that entry
        exactly.
        """
        distances = sorted(baseline)

        if distance <= distances[0]:
            return baseline[distances[0]]
        if distance >= distances[-1]:
            return baseline[distances[-1]]

        # Half-open intervals [d1, d2) so a table distance hits its own entry
        # with a zero interpolation term instead of a rounded s1 + (s2 - s1).
        for d1, d2 in zip(distances, distances[1:]):
            if d1 <= distance < d2:
                s1, s2 = baseline[d1], baseline[d2]
                return s1 + (s2 - s1) * (distance - d1) / (d2 - d1)

        # Only reachable when no comparison succeeds (e.g. NaN distance)
        return baseline[distances[-1]]

    @classmethod
    def expected_strokes(cls, distance: float, lie: str) -> float:
        """
        Get expected strokes from a position.

        Args:
            distance: Distance to hole (yards or feet for putting)
            lie: "tee", "fairway", "rough", "sand", "green" (case-insensitive);
                any other value falls back to the fairway baseline

        Returns:
            Baseline strokes to hole out from that position.
        """
        baselines = {
            "tee": cls.BASELINE_TEE,
            "fairway": cls.BASELINE_FAIRWAY,
            "rough": cls.BASELINE_ROUGH,
            "sand": cls.BASELINE_SAND,
            "green": cls.BASELINE_GREEN
        }

        baseline = baselines.get(lie.lower(), cls.BASELINE_FAIRWAY)
        return cls._interpolate_baseline(distance, baseline)

    @classmethod
    def strokes_gained_shot(
        cls,
        start_distance: float,
        start_lie: str,
        end_distance: float,
        end_lie: str,
        strokes: int = 1
    ) -> float:
        """
        Calculate strokes gained for a single shot.

        SG = Expected_before - Expected_after - strokes_taken

        A shot that ends with distance 0 and lie "hole" has holed out, so
        nothing is expected afterwards.
        """
        exp_before = cls.expected_strokes(start_distance, start_lie)

        # Lies are matched case-insensitively everywhere else, so do the same
        # for the holed-out marker.
        if end_distance == 0 and end_lie.lower() == "hole":
            exp_after = 0
        else:
            exp_after = cls.expected_strokes(end_distance, end_lie)

        return exp_before - exp_after - strokes

    @classmethod
    def analyze_round(cls, shots: pd.DataFrame, par: int = 72) -> dict:
        """
        Analyze a complete round.

        Every shot is assigned to exactly one category by where it STARTS, so
        the four category totals always add up to SG_Total:

        * Off the tee: start lie "tee"
        * Putting: start lie "green"
        * Around the green: any other lie, AROUND_GREEN_YARDS or closer
        * Approach: any other lie, farther than AROUND_GREEN_YARDS

        Args:
            shots: One row per shot. Expected columns: hole, shot_num,
                start_distance, start_lie, end_distance, end_lie
            par: Course par used for Score_vs_Par (default 72)

        Returns:
            Dict with SG_Total, the four category totals, Total_Shots and
            Score_vs_Par.
        """
        shots = shots.copy()

        # A plain comprehension (rather than DataFrame.apply) also works when
        # the frame has no rows.
        sg_values = [
            cls.strokes_gained_shot(start_d, start_l, end_d, end_l)
            for start_d, start_l, end_d, end_l in zip(
                shots["start_distance"], shots["start_lie"],
                shots["end_distance"], shots["end_lie"],
            )
        ]
        shots["sg"] = pd.Series(sg_values, index=shots.index, dtype=float)

        start_lie = shots["start_lie"].astype(str).str.lower()
        start_distance = pd.to_numeric(shots["start_distance"])

        from_tee = start_lie == "tee"
        on_green = start_lie == "green"
        through_the_green = ~from_tee & ~on_green
        short_game = through_the_green & (start_distance <= cls.AROUND_GREEN_YARDS)
        long_game = through_the_green & ~short_game

        def category_total(mask) -> float:
            return float(shots.loc[mask, "sg"].sum())

        return {
            "SG_Total": float(shots["sg"].sum()),
            "SG_Off_Tee": category_total(from_tee),
            "SG_Approach": category_total(long_game),
            "SG_Around_Green": category_total(short_game),
            "SG_Putting": category_total(on_green),
            "Total_Shots": len(shots),
            "Score_vs_Par": len(shots) - par
        }


def simulate_round(seed: int = 42) -> pd.DataFrame:
    """Simulate a round of golf for demonstration.

    Every hole is a 450-yard par 4 played as: a drive, one approach that
    finishes on the green, then putts until the ball is holed.

    Args:
        seed: Seed for the private random generator. The default reproduces
            the same round on every call.

    Returns:
        DataFrame with one row per shot and the columns hole, shot_num,
        start_distance, start_lie, end_distance, end_lie.
    """
    # A private generator keeps the simulation reproducible without reseeding
    # NumPy's global random state as a side effect. The legacy RandomState
    # yields the same stream that np.random.seed(seed) would.
    rng = np.random.RandomState(seed)
    hole_length = 450  # yards
    shots = []

    for hole in range(1, 19):
        # Tee shot
        drive_dist = rng.normal(280, 25)
        fairway_hit = rng.random_sample() > 0.35
        remaining = hole_length - drive_dist
        landing_lie = "fairway" if fairway_hit else "rough"

        shots.append({
            "hole": hole, "shot_num": 1,
            "start_distance": hole_length, "start_lie": "tee",
            "end_distance": remaining, "end_lie": landing_lie
        })

        # Approach shot: a green in regulation leaves a shorter first putt
        gir = rng.random_sample() > 0.3
        putt_dist = rng.exponential(20) if gir else 50 + rng.exponential(10)

        shots.append({
            "hole": hole, "shot_num": 2,
            "start_distance": remaining, "start_lie": landing_lie,
            "end_distance": putt_dist, "end_lie": "green"
        })

        # Putting
        current_dist = putt_dist
        putt_num = 3
        while current_dist > 0:
            # Inside 3 feet always drops; inside 10 feet is a coin flip
            if current_dist < 3 or (current_dist < 10 and rng.random_sample() > 0.5):
                # Hole it
                shots.append({
                    "hole": hole, "shot_num": putt_num,
                    "start_distance": current_dist, "start_lie": "green",
                    "end_distance": 0, "end_lie": "hole"
                })
                current_dist = 0
            else:
                # Leave a shorter putt (never closer than 2 feet)
                miss_dist = max(2, current_dist * 0.2 * rng.random_sample())
                shots.append({
                    "hole": hole, "shot_num": putt_num,
                    "start_distance": current_dist, "start_lie": "green",
                    "end_distance": miss_dist, "end_lie": "green"
                })
                current_dist = miss_dist
                putt_num += 1

    return pd.DataFrame(shots)


# Example usage
if __name__ == "__main__":
    # Build a demonstration round and score it
    demo_round = simulate_round()
    analyzer = StrokesGained()
    summary = analyzer.analyze_round(demo_round)

    print("Round Analysis:")
    for label, value in summary.items():
        # Floats are shown to two decimals; counts are printed as-is
        if isinstance(value, float):
            print(f"  {label}: {value:.2f}")
        else:
            print(f"  {label}: {value}")

    # Single shot example: 175 yards from the fairway to 10 feet on the green
    shot_sg = analyzer.strokes_gained_shot(
        start_distance=175,
        start_lie="fairway",
        end_distance=10,
        end_lie="green",
    )
    print(f"\nSample approach shot (175 yards to 10 feet): {shot_sg:.3f} SG")

python Basketball

Player Efficiency Rating (PER) Calculator

Calculate NBA Player Efficiency Rating from box score statistics.

"""Calculate Player Efficiency Rating (PER)."""
import pandas as pd
import numpy as np

def calculate_per(
    player_stats: pd.DataFrame,
    league_stats: dict = None
) -> pd.Series:
    """
    Calculate Player Efficiency Rating.

    PER formula by John Hollinger - summarizes player productivity
    in a single number. League average is ~15.

    Args:
        player_stats: DataFrame with one row per player. Columns read:
            MP, 3P, AST, FG, FGA, FT, FTA, TOV, TRB, ORB, STL, BLK, PF,
            plus the team-level TM_AST, TM_FG and TM_PACE.
        league_stats: Dict with league averages (pace, scoring, etc.).
            Any key that is missing (or the whole dict, when None) falls
            back to the built-in defaults, so a partial dict is accepted.

    Returns:
        Series aligned with ``player_stats.index`` holding each player's
        PER, scaled so the unweighted mean over the supplied rows is 15.
    """
    default_league_stats = {
        "lg_AST": 24.0,   # League assists per game
        "lg_FG": 41.0,    # League FG per game
        "lg_FT": 17.0,    # League FT made per game
        "lg_PTS": 110.0,  # League points per game
        "lg_FGA": 88.0,   # League FGA per game
        "lg_FTA": 22.0,   # League FTA per game
        "lg_TRB": 44.0,   # League rebounds per game
        "lg_ORB": 10.0,   # League offensive rebounds
        "lg_TOV": 14.0,   # League turnovers
        "lg_PF": 20.0,    # League personal fouls (required by the PF term)
        "lg_PACE": 100.0, # League pace
        "lg_VOP": 1.0,    # Value of possession (unused; VOP is derived below)
        "lg_DRBP": 0.77   # Defensive rebound percentage (unused; derived below)
    }
    # Caller-supplied values win; anything they omit comes from the defaults.
    league = {**default_league_stats, **(league_stats or {})}

    # Calculate factor and VOP
    factor = (2/3) - (0.5 * (league["lg_AST"] / league["lg_FG"])) / \
             (2 * (league["lg_FG"] / league["lg_FT"]))

    VOP = league["lg_PTS"] / (
        league["lg_FGA"] -
        league["lg_ORB"] +
        league["lg_TOV"] +
        0.44 * league["lg_FTA"]
    )

    DRBP = (league["lg_TRB"] - league["lg_ORB"]) / league["lg_TRB"]

    df = player_stats

    # Share of the team's field goals that were assisted.
    team_ast_ratio = df["TM_AST"] / df["TM_FG"]

    # League value of a personal foul: free throws conceded minus the
    # possession value those free-throw attempts consume.
    foul_cost = (league["lg_FT"] / league["lg_PF"]) - \
        0.44 * (league["lg_FTA"] / league["lg_PF"]) * VOP

    # uPER (unadjusted PER)
    uPER = (1 / df["MP"]) * (
        df["3P"] +
        (2/3) * df["AST"] +
        (2 - factor * team_ast_ratio) * df["FG"] +
        (df["FT"] * 0.5 * (1 + (1 - team_ast_ratio) + (2/3) * team_ast_ratio)) -
        VOP * df["TOV"] -
        VOP * DRBP * (df["FGA"] - df["FG"]) -
        VOP * 0.44 * (0.44 + (0.56 * DRBP)) * (df["FTA"] - df["FT"]) +
        VOP * (1 - DRBP) * (df["TRB"] - df["ORB"]) +
        VOP * DRBP * df["ORB"] +
        VOP * df["STL"] +
        VOP * DRBP * df["BLK"] -
        df["PF"] * foul_cost
    )

    # Pace adjustment
    pace_adj = league["lg_PACE"] / df["TM_PACE"]
    aPER = uPER * pace_adj

    # Final PER: normalise by the mean of the pace-adjusted rating so the
    # average of the returned values is exactly 15 even when team paces differ.
    PER = aPER * (15 / aPER.mean())

    return PER

def per_components(player_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Split a player's box score into per-48-minute contribution columns.

    Reads PLAYER_NAME, MP, PTS, FGA, FTA, TRB, AST, STL, BLK, TOV and PF
    from ``player_stats`` and returns one row per player with the positive
    components (scoring, rebounding, assists, steals, blocks), the negative
    ones (turnovers, fouls) and their row-wise ``Total``.
    """
    minutes = player_stats["MP"]

    def per_48(stat):
        # Scale a counting stat to a 48-minute rate.
        return stat / minutes * 48

    # Points net of a 0.44 charge per field-goal and free-throw attempt.
    net_points = (
        player_stats["PTS"]
        - player_stats["FGA"] * 0.44
        - player_stats["FTA"] * 0.44
    )

    breakdown = pd.DataFrame({
        "Player": player_stats["PLAYER_NAME"],
        "Scoring": per_48(net_points),
        "Rebounding": per_48(player_stats["TRB"]),
        "Assists": per_48(player_stats["AST"]) * 0.67,
        "Steals": per_48(player_stats["STL"]),
        "Blocks": per_48(player_stats["BLK"]),
        "Turnovers": per_48(-player_stats["TOV"]),
        "Fouls": per_48(-player_stats["PF"]) * 0.2,
    })

    # Sum every numeric component; the name column is excluded.
    breakdown["Total"] = breakdown.drop(columns="Player").sum(axis=1)

    return breakdown

def calculate_ts_pct(pts: int, fga: int, fta: int) -> float:
    """Return True Shooting Percentage: points per two shooting attempts.

    Free-throw attempts count as 0.44 of an attempt. The result is a
    fraction (e.g. 0.58), not a value on a 0-100 scale.
    """
    shooting_attempts = fga + 0.44 * fta
    return pts / (2 * shooting_attempts)

def calculate_efg_pct(fg: int, fg3: int, fga: int) -> float:
    """Return Effective Field Goal Percentage as a fraction.

    Each made three-pointer is credited as 1.5 made field goals.
    """
    weighted_makes = fg + 0.5 * fg3
    return weighted_makes / fga

def calculate_usg_pct(fga: int, fta: int, tov: int, mp: float,
                      tm_fga: int, tm_fta: int, tm_tov: int, tm_mp: float) -> float:
    """Return Usage Percentage on a 0-100 scale.

    Compares the player's possession-ending events (FGA, 0.44 * FTA, TOV),
    scaled to one-fifth of the team's minutes, with the team's own total of
    those events over the player's minutes.
    """
    player_load = (fga + 0.44 * fta + tov) * (tm_mp / 5)
    team_load = mp * (tm_fga + 0.44 * tm_fta + tm_tov)
    return 100 * player_load / team_load


# Example usage
if __name__ == "__main__":
    # Season totals for three sample players; each list holds one value per player.
    season_totals = {
        "PLAYER_NAME": ["Player A", "Player B", "Player C"],
        "GP": [72, 65, 70],
        "MP": [2400, 2000, 1800],
        "PTS": [1800, 1200, 900],
        "FG": [650, 450, 350],
        "FGA": [1400, 950, 750],
        "3P": [150, 100, 80],
        "FT": [350, 200, 120],
        "FTA": [420, 250, 150],
        "ORB": [60, 180, 50],
        "TRB": [360, 650, 200],
        "AST": [500, 200, 400],
        "STL": [90, 50, 80],
        "BLK": [40, 120, 20],
        "TOV": [200, 150, 120],
        "PF": [150, 180, 120],
        # Team-level columns read by calculate_per
        "TM_AST": [1800, 1800, 1800],
        "TM_FG": [3000, 3000, 3000],
        "TM_PACE": [100, 100, 100],
    }
    players = pd.DataFrame(season_totals)

    def _row_ts(row):
        # True shooting for one player row.
        return calculate_ts_pct(row["PTS"], row["FGA"], row["FTA"])

    def _row_efg(row):
        # Effective field-goal percentage for one player row.
        return calculate_efg_pct(row["FG"], row["3P"], row["FGA"])

    # Add the shooting-efficiency columns row by row.
    players["TS%"] = players.apply(_row_ts, axis=1)
    players["eFG%"] = players.apply(_row_efg, axis=1)

    print("Player Efficiency Stats:")
    report_columns = ["PLAYER_NAME", "PTS", "TRB", "AST", "TS%", "eFG%"]
    print(players[report_columns].round(3))

python Tennis

Tennis Match Statistics

Calculate comprehensive tennis match statistics including serve and return metrics.

"""Tennis Match Statistics Calculator."""
import pandas as pd
import numpy as np

class TennisStats:
    """Calculate tennis match and career statistics.

    Every method is a static helper that turns raw counts into a dict of
    derived metrics. Percentages are on a 0-100 scale and fall back to 0
    whenever their denominator is not positive.
    """

    @staticmethod
    def _pct(numerator: float, denominator: float) -> float:
        """Return numerator/denominator as a 0-100 percentage, or 0 if the denominator is not positive."""
        return numerator / denominator * 100 if denominator > 0 else 0

    @staticmethod
    def serve_stats(
        aces: int, double_faults: int,
        first_serves_in: int, first_serves_total: int,
        first_serve_points_won: int, first_serve_points: int,
        second_serve_points_won: int, second_serve_points: int
    ) -> dict:
        """Calculate serve statistics.

        Returns a dict with the raw ace / double-fault counts and the
        first-serve-in, first-serve-won, second-serve-won and overall
        service-points-won percentages. ``Service_Games_Won_Pct`` is always
        None because no service-game counts are passed in.
        """
        total_serve_points = first_serve_points + second_serve_points
        total_serve_points_won = first_serve_points_won + second_serve_points_won

        return {
            "Aces": aces,
            "Double_Faults": double_faults,
            "First_Serve_Pct": TennisStats._pct(first_serves_in, first_serves_total),
            "First_Serve_Won_Pct": TennisStats._pct(first_serve_points_won, first_serve_points),
            "Second_Serve_Won_Pct": TennisStats._pct(second_serve_points_won, second_serve_points),
            "Service_Points_Won_Pct": TennisStats._pct(total_serve_points_won, total_serve_points),
            "Service_Games_Won_Pct": None  # Need service games data
        }

    @staticmethod
    def return_stats(
        return_points_won: int, return_points: int,
        first_return_won: int, first_return_points: int,
        second_return_won: int, second_return_points: int,
        break_points_won: int, break_points: int
    ) -> dict:
        """Calculate return statistics.

        Returns won-percentages for all return points, first-serve returns,
        second-serve returns and break points. ``Break_Points_Faced`` echoes
        the ``break_points`` argument, i.e. the same count used as the
        denominator of ``Break_Points_Won_Pct``.
        """
        return {
            "Return_Points_Won_Pct": TennisStats._pct(return_points_won, return_points),
            "First_Return_Won_Pct": TennisStats._pct(first_return_won, first_return_points),
            "Second_Return_Won_Pct": TennisStats._pct(second_return_won, second_return_points),
            "Break_Points_Won_Pct": TennisStats._pct(break_points_won, break_points),
            "Break_Points_Faced": break_points
        }

    @staticmethod
    def dominance_ratio(serve_won_pct: float, return_won_pct: float) -> float:
        """
        Calculate a dominance score from two 0-100 percentages.

        The value is (serve points won % + return points won %) / 100, so it
        exceeds 1 exactly when the two percentages add up to more than 100.

        NOTE(review): the commonly cited dominance ratio divides return
        points won % by the opponent's return points won %; this additive
        form crosses 1 at the same point but has a different scale.
        Confirm which definition callers expect.
        """
        return (serve_won_pct + return_won_pct) / 100

    @staticmethod
    def efficiency_stats(
        winners: int, unforced_errors: int,
        forced_errors: int, total_points: int
    ) -> dict:
        """Calculate efficiency and aggression metrics.

        ``W_UE_Ratio`` is infinity when there are no unforced errors. The
        three percentages are relative to ``total_points`` and are 0 when
        no points were played, instead of raising ZeroDivisionError.
        """
        return {
            "Winners": winners,
            "Unforced_Errors": unforced_errors,
            "W_UE_Ratio": winners / unforced_errors if unforced_errors > 0 else float("inf"),
            "Winner_Pct": TennisStats._pct(winners, total_points),
            "Error_Pct": TennisStats._pct(unforced_errors, total_points),
            "Aggression_Index": TennisStats._pct(winners + forced_errors, total_points)
        }

    @staticmethod
    def tiebreak_stats(tiebreaks_won: int, tiebreaks_played: int) -> dict:
        """Calculate tiebreak statistics (won, lost and win percentage)."""
        return {
            "Tiebreaks_Won": tiebreaks_won,
            "Tiebreaks_Lost": tiebreaks_played - tiebreaks_won,
            "Tiebreak_Win_Pct": TennisStats._pct(tiebreaks_won, tiebreaks_played)
        }

    @staticmethod
    def surface_performance(matches_df: pd.DataFrame, surface: str) -> dict:
        """Calculate performance on a specific surface.

        Filters ``matches_df`` on its ``surface`` column and sums its ``won``
        column to count wins.

        Returns:
            A dict with Surface, Matches, Wins, Losses and Win_Pct. When no
            row matches the surface, the result is the two-key dict
            ``{"surface": ..., "matches": 0}``; note the lowercase keys,
            which are kept for backward compatibility.
        """
        surface_matches = matches_df[matches_df["surface"] == surface]

        if len(surface_matches) == 0:
            return {"surface": surface, "matches": 0}

        wins = surface_matches["won"].sum()
        total = len(surface_matches)

        return {
            "Surface": surface,
            "Matches": total,
            "Wins": wins,
            "Losses": total - wins,
            "Win_Pct": wins / total * 100
        }


def calculate_match_stats(match_data: dict) -> pd.DataFrame:
    """
    Build a one-row-per-player table of serve, return and efficiency stats.

    Args:
        match_data: Dict mapping a player label to that player's raw match
            counts. ``forced_errors`` is optional and defaults to 0; every
            other count read below is required.

    Returns:
        DataFrame with a ``Player`` column, every key produced by
        TennisStats.serve_stats / return_stats / efficiency_stats, and a
        final ``Dominance_Ratio`` column.
    """
    rows = []
    for name, counts in match_data.items():
        serve_part = TennisStats.serve_stats(
            counts["aces"],
            counts["double_faults"],
            counts["first_serve_in"],
            counts["first_serve_total"],
            counts["first_serve_won"],
            counts["first_serve_points"],
            counts["second_serve_won"],
            counts["second_serve_points"],
        )
        return_part = TennisStats.return_stats(
            counts["return_won"],
            counts["return_points"],
            counts["first_return_won"],
            counts["first_return_points"],
            counts["second_return_won"],
            counts["second_return_points"],
            counts["break_points_won"],
            counts["break_points"],
        )
        efficiency_part = TennisStats.efficiency_stats(
            counts["winners"],
            counts["unforced_errors"],
            counts.get("forced_errors", 0),
            counts["total_points"],
        )

        # Assemble the row in the same key order: player, serve, return,
        # efficiency, then the combined dominance figure.
        row = {"Player": name}
        row.update(serve_part)
        row.update(return_part)
        row.update(efficiency_part)
        row["Dominance_Ratio"] = TennisStats.dominance_ratio(
            serve_part["Service_Points_Won_Pct"],
            return_part["Return_Points_Won_Pct"],
        )
        rows.append(row)

    return pd.DataFrame(rows)


def expected_games_won(serve_hold_pct: float, return_break_pct: float,
                       total_games: int) -> float:
    """
    Estimate games won from hold and break rates.

    The match is assumed to split evenly between service and return games;
    each half is multiplied by its win rate and the two are summed.
    """
    games_per_side = total_games / 2
    held = games_per_side * serve_hold_pct
    broken = games_per_side * return_break_pct
    return held + broken


# Example usage
if __name__ == "__main__":
    # Raw counts for a sample two-player match, keyed by player label.
    sample_match = {
        "Player A": {
            "aces": 12,
            "double_faults": 3,
            "first_serve_in": 48,
            "first_serve_total": 65,
            "first_serve_won": 38,
            "first_serve_points": 48,
            "second_serve_won": 10,
            "second_serve_points": 17,
            "return_won": 32,
            "return_points": 75,
            "first_return_won": 18,
            "first_return_points": 50,
            "second_return_won": 14,
            "second_return_points": 25,
            "break_points_won": 4,
            "break_points": 8,
            "winners": 35,
            "unforced_errors": 22,
            "total_points": 140,
        },
        "Player B": {
            "aces": 8,
            "double_faults": 5,
            "first_serve_in": 50,
            "first_serve_total": 75,
            "first_serve_won": 35,
            "first_serve_points": 50,
            "second_serve_won": 8,
            "second_serve_points": 25,
            "return_won": 17,
            "return_points": 65,
            "first_return_won": 10,
            "first_return_points": 48,
            "second_return_won": 7,
            "second_return_points": 17,
            "break_points_won": 2,
            "break_points": 6,
            "winners": 28,
            "unforced_errors": 30,
            "total_points": 140,
        },
    }

    stats_table = calculate_match_stats(sample_match)

    print("Match Statistics:")
    # Show only the headline columns, rounded to one decimal.
    display_columns = [
        "Player",
        "Aces",
        "First_Serve_Pct",
        "Service_Points_Won_Pct",
        "Return_Points_Won_Pct",
        "W_UE_Ratio",
        "Dominance_Ratio",
    ]
    print(stats_table[display_columns].round(1).to_string(index=False))

python Baseball

Statcast Pitch Analysis

Analyze pitch characteristics using Statcast data including velocity, spin, and movement.

"""Statcast pitch analysis and classification."""
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

class PitchAnalyzer:
    """Summarise, plot, score and cluster pitch-level Statcast data."""

    def __init__(self):
        # Scaler reused (and re-fitted) by classify_pitches.
        self.scaler = StandardScaler()

    def analyze_repertoire(self, pitches: pd.DataFrame, pitcher_name: str = None) -> pd.DataFrame:
        """
        Summarise a pitcher's arsenal, one row per pitch type.

        Expected columns: pitch_type, release_speed, release_spin_rate,
                         pfx_x (horizontal movement), pfx_z (vertical movement).
        ``player_name`` is also required when ``pitcher_name`` is given.

        Returns a frame indexed by pitch type with mean/std velocity, pitch
        count, mean/std spin, mean breaks and usage share, sorted by usage
        (highest first). All figures are rounded to one decimal.
        """
        if pitcher_name:
            pitches = pitches[pitches["player_name"] == pitcher_name].copy()

        # Named aggregation yields the final column labels directly.
        summary = pitches.groupby("pitch_type").agg(
            Velocity=("release_speed", "mean"),
            Velo_STD=("release_speed", "std"),
            Count=("release_speed", "count"),
            Spin_Rate=("release_spin_rate", "mean"),
            Spin_STD=("release_spin_rate", "std"),
            Horiz_Break=("pfx_x", "mean"),  # Horizontal break (inches)
            Vert_Break=("pfx_z", "mean"),   # Vertical break (inches)
        ).round(1)

        # Share of all counted pitches thrown as each type.
        total_pitches = summary["Count"].sum()
        summary["Usage%"] = (summary["Count"] / total_pitches * 100).round(1)

        return summary.sort_values("Usage%", ascending=False)

    def pitch_movement_plot(self, pitches: pd.DataFrame, ax=None):
        """Scatter horizontal vs vertical movement, coloured by pitch type.

        A new 10x10 figure is created when ``ax`` is not supplied. Pitch
        types without an entry in the palette are drawn in black. Returns
        the axes that were drawn on.
        """
        if ax is None:
            _, ax = plt.subplots(figsize=(10, 10))

        palette = {
            "FF": "red",      # 4-seam fastball
            "SI": "orange",   # Sinker
            "FC": "purple",   # Cutter
            "SL": "blue",     # Slider
            "CU": "green",    # Curveball
            "CH": "gray",     # Changeup
            "FS": "brown",    # Splitter
        }

        type_column = pitches["pitch_type"]
        for label in type_column.unique():
            of_type = pitches[type_column == label]
            ax.scatter(
                of_type["pfx_x"],
                of_type["pfx_z"],
                label=label,
                alpha=0.5,
                c=palette.get(label, "black"),
                s=20,
            )

        # Zero-movement reference lines.
        ax.axhline(y=0, color="gray", linestyle="--", alpha=0.5)
        ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
        ax.set_xlabel("Horizontal Movement (inches)")
        ax.set_ylabel("Vertical Movement (inches)")
        ax.set_title("Pitch Movement Profile")
        ax.legend()
        ax.set_xlim(-25, 25)
        ax.set_ylim(-25, 25)

        return ax

    def stuff_plus(self, pitches: pd.DataFrame, league_avgs: dict) -> pd.Series:
        """
        Score each pitch against league averages for its type.

        100 = league average, higher = better. This is a simplified
        stand-in: velocity and spin z-scores are weighted 5 and 3 points
        respectively. Real Stuff+ uses ML models trained on outcomes.

        ``league_avgs`` maps pitch type to a dict with ``velocity``,
        ``velocity_std``, ``spin`` and ``spin_std``. Pitch types missing
        from it score exactly 100.
        """
        def score(pitch):
            baseline = league_avgs.get(pitch["pitch_type"]) \
                if pitch["pitch_type"] in league_avgs else None
            if baseline is None:
                return 100

            # Standardised distance from the league mean for this pitch type.
            velo_z = (pitch["release_speed"] - baseline["velocity"]) / baseline["velocity_std"]
            spin_z = (pitch["release_spin_rate"] - baseline["spin"]) / baseline["spin_std"]

            # Weighted combination (higher velo/spin = better for most pitches)
            return 100 + (velo_z * 5) + (spin_z * 3)

        scores = [score(pitch) for _, pitch in pitches.iterrows()]
        return pd.Series(scores, index=pitches.index)

    def classify_pitches(self, pitches: pd.DataFrame, n_clusters: int = 6) -> pd.Series:
        """
        Group pitches into ``n_clusters`` K-means clusters.

        Clusters on standardised release speed, spin rate and both movement
        components. Rows with a missing feature are dropped, so the returned
        Series (named "cluster") is indexed by the surviving rows only.
        Useful for finding misclassified pitches.
        """
        feature_columns = ["release_speed", "release_spin_rate", "pfx_x", "pfx_z"]
        complete_rows = pitches[feature_columns].dropna()

        standardized = self.scaler.fit_transform(complete_rows)

        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = model.fit_predict(standardized)

        return pd.Series(labels, index=complete_rows.index, name="cluster")


# Example usage
if __name__ == "__main__":
    # Seed the generator so the synthetic sample is reproducible.
    np.random.seed(42)
    n_pitches = 500

    # Draw each column in a fixed order so the seeded stream is consumed
    # identically on every run.
    pitch_types = np.random.choice(
        ["FF", "SL", "CH", "CU"], n_pitches, p=[0.5, 0.25, 0.15, 0.1]
    )
    velocities = np.random.normal(93, 5, n_pitches)
    spin_rates = np.random.normal(2300, 300, n_pitches)
    horizontal_break = np.random.normal(0, 8, n_pitches)
    vertical_break = np.random.normal(10, 8, n_pitches)

    pitches = pd.DataFrame({
        "pitch_type": pitch_types,
        "release_speed": velocities,
        "release_spin_rate": spin_rates,
        "pfx_x": horizontal_break,
        "pfx_z": vertical_break,
        "player_name": "Sample Pitcher",
    })

    analyzer = PitchAnalyzer()
    repertoire = analyzer.analyze_repertoire(pitches)
    print(repertoire)

python Baseball

Spray Chart Generator

Create baseball spray charts showing batted ball locations and outcomes.

"""Baseball spray chart generator."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Polygon, Arc

def draw_field(ax, field_type="outfield"):
    """Draw a baseball field outline on ``ax`` and return the axes.

    Coordinates are in feet with home plate at (0, 0) and centre field
    along the positive y axis, so the foul lines run at +/-45 degrees.

    Args:
        ax: Matplotlib axes to draw on.
        field_type: Only "outfield" draws the wall, infield, foul lines and
            home plate; any other value just applies the background colour
            and hides the ticks.
    """
    # Equal aspect keeps the 45-degree foul lines and the diamond undistorted.
    ax.set_aspect("equal")

    if field_type == "outfield":
        wall_distance = 300      # feet to outfield wall
        base_path = 90           # feet between bases
        foul_angle = np.pi / 4   # foul lines sit 45 degrees either side of centre

        # Outfield arc: theta is measured from the centre-field axis, so the
        # wall runs from the left foul line to the right foul line only.
        theta = np.linspace(-foul_angle, foul_angle, 100)
        ax.plot(wall_distance * np.sin(theta), wall_distance * np.cos(theta),
                "green", linewidth=2)

        # Infield: a square with 90 ft sides standing on one corner at home
        # plate. Its circumradius is half the diagonal (90 / sqrt(2)), and
        # orientation=0 makes RegularPolygon point a vertex straight up,
        # which gives the diamond shape. radius/orientation are passed by
        # keyword because newer matplotlib makes them keyword-only.
        half_diagonal = base_path / np.sqrt(2)
        ax.add_patch(patches.RegularPolygon(
            (0, half_diagonal), 4, radius=half_diagonal, orientation=0,
            fill=False, edgecolor="brown", linewidth=2
        ))

        # Foul lines, ending where they meet the outfield wall.
        line_end = wall_distance * np.sin(foul_angle)
        ax.plot([0, -line_end], [0, line_end], "white", linewidth=1)
        ax.plot([0, line_end], [0, line_end], "white", linewidth=1)

        # Home plate
        ax.scatter([0], [0], c="white", s=100, marker="^", zorder=5)

        ax.set_xlim(-350, 350)
        ax.set_ylim(-50, 400)

    ax.set_facecolor("darkgreen")
    ax.set_xticks([])
    ax.set_yticks([])

    return ax

def create_spray_chart(
    batted_balls: pd.DataFrame,
    player_name: str = None,
    color_by: str = "hit_outcome",
    ax=None
):
    """
    Create spray chart from batted ball data.

    Expected columns:
    - hc_x, hc_y: hit coordinates (Statcast uses 0-250 scale). If hc_x is
      absent, hit_x / hit_y are used as-is and assumed to already be in
      feet from home plate.
    - events (or hit_outcome): outcome label such as single, double,
      triple, home_run, field_out. Needed when color_by="hit_outcome";
      ``events`` is preferred when both columns exist.
    - launch_speed: exit velocity (needed for color_by="exit_velocity")
    - launch_angle: launch angle (needed for color_by="launch_angle")
    - player_name: needed only when ``player_name`` is passed

    Args:
        batted_balls: Batted-ball rows to plot.
        player_name: If given, keep only rows for this player and use the
            name in the title.
        color_by: "hit_outcome", "exit_velocity" or "launch_angle"; any
            other value draws every point in blue.
        ax: Axes to draw on; a new 10x10 figure is created when None.

    Returns:
        The axes containing the field and the scatter.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 10))

    draw_field(ax)

    if player_name:
        batted_balls = batted_balls[batted_balls["player_name"] == player_name]

    # Convert Statcast coordinates (if necessary)
    # Statcast: 0-250 scale, need to convert to feet from home plate
    if "hc_x" in batted_balls.columns:
        # (125.42, 199.27) is home plate; y grows downward in Statcast
        # space, hence the flipped subtraction.
        x = (batted_balls["hc_x"] - 125.42) * 2.5
        y = (199.27 - batted_balls["hc_y"]) * 2.5
    else:
        x = batted_balls["hit_x"]
        y = batted_balls["hit_y"]

    # Outcome palette; unknown outcomes fall back to gray.
    outcome_colors = {
        "single": "blue",
        "double": "green",
        "triple": "orange",
        "home_run": "red",
        "out": "gray",
        "field_out": "gray",
        "field_error": "yellow"
    }

    # Color mapping
    if color_by == "hit_outcome":
        # Accept either the Statcast column name or the documented alias.
        outcome_column = "events" if "events" in batted_balls.columns else "hit_outcome"
        colors = batted_balls[outcome_column].map(
            lambda outcome: outcome_colors.get(outcome, "gray")
        )
    elif color_by == "exit_velocity":
        colors = batted_balls["launch_speed"]
    elif color_by == "launch_angle":
        colors = batted_balls["launch_angle"]
    else:
        colors = "blue"

    scatter = ax.scatter(x, y, c=colors, alpha=0.6, s=30, edgecolors="black", linewidth=0.5)

    # Add colorbar for continuous variables
    if color_by in ["exit_velocity", "launch_angle"]:
        plt.colorbar(scatter, ax=ax, label=color_by.replace("_", " ").title())

    # Add legend for categorical: empty scatters act as legend handles.
    if color_by == "hit_outcome":
        for outcome, color in outcome_colors.items():
            ax.scatter([], [], c=color, label=outcome.replace("_", " ").title())
        ax.legend(loc="upper right")

    title = "Spray Chart"
    if player_name:
        title = f"{player_name} Spray Chart"
    ax.set_title(title, fontsize=14, fontweight="bold")

    return ax

def calculate_pull_tendency(batted_balls: pd.DataFrame, batter_hand: str = "R") -> dict:
    """
    Calculate pull/center/opposite field tendencies.

    Balls are bucketed by horizontal offset from home plate: more than
    40 ft toward the batter's pull side is "pull", more than 40 ft the
    other way is "oppo", anything in between is "center". Rows with a
    missing hc_x fall in no bucket and are left out of the total.

    Args:
        batted_balls: DataFrame with an ``hc_x`` column (Statcast scale).
        batter_hand: "L" for a left-handed batter; any other value is
            treated as right-handed.

    Returns:
        Dict with Pull%, Center% and Oppo% (0-100, one decimal) and
        Total_BIP, the number of balls that were bucketed.
    """
    # Horizontal offset in feet; negative values are the left-field side.
    x = (batted_balls["hc_x"] - 125.42) * 2.5

    # Re-orient so positive x always means the batter's pull side.
    # A right-handed batter pulls to left field (negative x), so flip;
    # a left-handed batter already pulls toward positive x (right field).
    if batter_hand != "L":
        x = -x

    pull = (x > 40).sum()
    center = ((x >= -40) & (x <= 40)).sum()
    oppo = (x < -40).sum()

    total = pull + center + oppo

    return {
        "Pull%": round(pull / total * 100, 1) if total > 0 else 0,
        "Center%": round(center / total * 100, 1) if total > 0 else 0,
        "Oppo%": round(oppo / total * 100, 1) if total > 0 else 0,
        "Total_BIP": total
    }


# Example usage
if __name__ == "__main__":
    # Reproducible synthetic batted balls.
    np.random.seed(42)
    n_balls = 200

    # Polar positions: angle in degrees off centre field, distance in feet.
    spray_angles = np.random.uniform(-45, 45, n_balls)
    hit_distances = np.random.uniform(100, 350, n_balls)

    angle_rad = np.radians(spray_angles)
    feet_x = hit_distances * np.sin(angle_rad)
    feet_y = hit_distances * np.cos(angle_rad)

    # Inverse of the conversion applied by create_spray_chart, so the
    # sample data is on the Statcast-like scale.
    statcast_x = feet_x / 2.5 + 125.42
    statcast_y = 199.27 - feet_y / 2.5

    # Remaining random columns are drawn in this fixed order.
    exit_velocities = np.random.normal(90, 10, n_balls)
    launch_angles = np.random.normal(15, 15, n_balls)
    outcomes = np.random.choice(
        ["single", "double", "triple", "home_run", "field_out"],
        n_balls,
        p=[0.2, 0.05, 0.01, 0.04, 0.7],
    )

    batted_balls = pd.DataFrame({
        "hc_x": statcast_x,
        "hc_y": statcast_y,
        "launch_speed": exit_velocities,
        "launch_angle": launch_angles,
        "events": outcomes,
        "player_name": "Sample Batter",
    })

    # Draw the chart coloured by outcome.
    fig, ax = plt.subplots(figsize=(10, 10))
    create_spray_chart(batted_balls, color_by="hit_outcome", ax=ax)
    plt.tight_layout()
    # plt.savefig("spray_chart.png")
    # plt.show()

    pull_summary = calculate_pull_tendency(batted_balls)
    print("Pull Tendency:", pull_summary)

python Basketball

NBA Shot Chart Generator

Create basketball shot charts with zones and percentages.

"""NBA shot chart generator."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle, Arc

def draw_court(ax=None, color="black", lw=2, outer_lines=False):
    """Draw half-court line markings on ``ax`` and return the axes.

    The hoop is centred at (0, 0). When ``ax`` is None the current axes
    are used. ``outer_lines`` adds the boundary rectangle of the half court.
    Axis limits are fixed to the half court, the aspect is set to equal
    and ticks are removed.
    """
    if ax is None:
        ax = plt.gca()

    line_style = {"linewidth": lw, "color": color}

    # Patches are listed in the order they are added to the axes.
    court_elements = [
        # Hoop
        Circle((0, 0), radius=7.5, fill=False, **line_style),
        # Backboard
        Rectangle((-30, -7.5), 60, -1, **line_style),
        # Paint/Key: outer and inner boxes
        Rectangle((-80, -47.5), 160, 190, fill=False, **line_style),
        Rectangle((-60, -47.5), 120, 190, fill=False, **line_style),
        # Free throw circle: solid top half, dashed bottom half
        Arc((0, 142.5), 120, 120, theta1=0, theta2=180, **line_style),
        Arc((0, 142.5), 120, 120, theta1=180, theta2=360,
            linestyle="dashed", **line_style),
        # Restricted Area
        Arc((0, 0), 80, 80, theta1=0, theta2=180, **line_style),
        # Three point line: zero-width rectangles form the corner segments
        Rectangle((-220, -47.5), 0, 140, **line_style),
        Rectangle((220, -47.5), 0, 140, **line_style),
        Arc((0, 0), 475, 475, theta1=22, theta2=158, **line_style),
        # Center court
        Arc((0, 422.5), 120, 120, theta1=180, theta2=0, **line_style),
        Arc((0, 422.5), 40, 40, theta1=180, theta2=0, **line_style),
    ]

    if outer_lines:
        court_elements.append(
            Rectangle((-250, -47.5), 500, 470, fill=False, **line_style)
        )

    for element in court_elements:
        ax.add_patch(element)

    ax.set_xlim(-250, 250)
    ax.set_ylim(-47.5, 422.5)
    ax.set_aspect("equal")
    ax.set_xticks([])
    ax.set_yticks([])

    return ax

def create_shot_chart(
    shots: pd.DataFrame,
    player_name: str = None,
    mode: str = "scatter",
    ax=None
):
    """
    Plot shots on a half court and return the axes.

    Args:
        shots: DataFrame with LOC_X, LOC_Y, SHOT_MADE_FLAG columns
            (plus PLAYER_NAME when ``player_name`` is given)
        player_name: Filter to specific player; also used in the title
        mode: "scatter" (green makes / red misses), "hexbin" (mean make
            rate per hexagon with at least 3 shots) or "zone" (FG% and
            shot count printed at each zone's centroid). Any other value
            draws the court only.
        ax: Axes to draw on; a new 12x11 figure is created when None
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(12, 11))

    if player_name:
        shots = shots[shots["PLAYER_NAME"] == player_name]

    draw_court(ax, color="black", lw=1)

    loc_x = shots["LOC_X"]
    loc_y = shots["LOC_Y"]
    made_flags = shots["SHOT_MADE_FLAG"]

    if mode == "scatter":
        # One colour per shot: green for makes, red for misses.
        point_colors = ["green" if flag == 1 else "red" for flag in made_flags]
        ax.scatter(loc_x, loc_y, c=point_colors, alpha=0.5, s=20)

    elif mode == "hexbin":
        # Each hexagon is coloured by the mean of the made flag inside it.
        hexes = ax.hexbin(
            loc_x, loc_y,
            C=made_flags,
            gridsize=30,
            cmap="RdYlGn",
            reduce_C_function=np.mean,
            mincnt=3,
            extent=[-250, 250, -47.5, 400],
        )
        plt.colorbar(hexes, ax=ax, label="FG%")

    elif mode == "zone":
        # Label every non-empty zone at the centroid of its shots.
        for zone_name, zone_shots in define_shot_zones(shots).items():
            attempts = len(zone_shots)
            if attempts == 0:
                continue

            fg_pct = zone_shots["SHOT_MADE_FLAG"].mean()
            center_x = zone_shots["LOC_X"].mean()
            center_y = zone_shots["LOC_Y"].mean()

            if fg_pct > 0.4:
                label_color = "green"
            elif fg_pct > 0.3:
                label_color = "orange"
            else:
                label_color = "red"

            ax.text(center_x, center_y, f"{fg_pct:.1%}\n({attempts})",
                    ha="center", va="center", fontsize=9,
                    fontweight="bold", color=label_color)

    chart_title = f"{player_name} Shot Chart" if player_name else "Shot Chart"
    ax.set_title(chart_title, fontsize=14, fontweight="bold")

    return ax

def define_shot_zones(shots: pd.DataFrame) -> dict:
    """Split shots into named court zones.

    Uses LOC_X / LOC_Y (hoop at the origin) and returns a dict mapping each
    zone name to the subset of ``shots`` that falls inside it.
    """
    loc_x = shots["LOC_X"]
    loc_y = shots["LOC_Y"]
    side_offset = np.abs(loc_x)

    # Straight-line distance from the basket.
    dist = np.sqrt(loc_x**2 + loc_y**2)

    # A three is anything beyond the arc (approx 237.5) or out wide in
    # the corner below the break.
    beyond_arc = dist > 237.5
    in_corner = (side_offset > 220) & (loc_y < 92.5)
    is_three = beyond_arc | in_corner
    is_two = ~is_three

    outside_restricted = dist > 40

    return {
        # Restricted area
        "Restricted Area": shots[dist <= 40],
        # Paint (non-restricted)
        "Paint": shots[outside_restricted & (dist <= 80) & (side_offset < 80)],
        # Mid-range
        "Mid-Range Left": shots[is_two & (loc_x < -80) & outside_restricted],
        "Mid-Range Center": shots[is_two & (side_offset <= 80) & (dist > 80)],
        "Mid-Range Right": shots[is_two & (loc_x > 80) & outside_restricted],
        # Three pointers
        "Corner 3 Left": shots[is_three & (loc_x < -220)],
        "Corner 3 Right": shots[is_three & (loc_x > 220)],
        "Above Break 3": shots[is_three & (side_offset <= 220)],
    }

def shot_zone_summary(shots: pd.DataFrame) -> pd.DataFrame:
    """Get shooting summary by zone.

    Returns one row per non-empty zone with attempts (FGA), makes (FGM),
    make rate (FG%) and points per shot, sorted by points per shot
    (highest first) and rounded to three decimals. A zone is worth 3
    points when its name contains "3", otherwise 2.

    If no zone contains a shot, an empty frame with the same columns is
    returned instead of raising on the missing sort column.
    """
    summary_columns = ["Zone", "FGA", "FGM", "FG%", "Pts/Shot"]

    summary = []
    for zone_name, zone_shots in define_shot_zones(shots).items():
        if len(zone_shots) == 0:
            continue

        made = zone_shots["SHOT_MADE_FLAG"]
        fg_pct = made.mean()
        shot_value = 3 if "3" in zone_name else 2
        summary.append({
            "Zone": zone_name,
            "FGA": len(zone_shots),
            "FGM": made.sum(),
            "FG%": fg_pct,
            "Pts/Shot": fg_pct * shot_value
        })

    # Declaring the columns keeps "Pts/Shot" present even with zero rows,
    # so the sort below is safe for empty input.
    table = pd.DataFrame(summary, columns=summary_columns)
    return table.sort_values("Pts/Shot", ascending=False).round(3)


# Example usage
if __name__ == "__main__":
    # Reproducible synthetic shots.
    np.random.seed(42)
    n_shots = 500

    # Columns are drawn in a fixed order so the seeded stream is stable.
    shot_x = np.random.normal(0, 100, n_shots)
    shot_y = np.random.uniform(0, 300, n_shots)
    shot_made = np.random.binomial(1, 0.45, n_shots)

    shots = pd.DataFrame({
        "LOC_X": shot_x,
        "LOC_Y": shot_y,
        "SHOT_MADE_FLAG": shot_made,
        "PLAYER_NAME": "Sample Player",
    })

    # Scatter and hexbin views side by side.
    fig, (scatter_ax, hexbin_ax) = plt.subplots(1, 2, figsize=(20, 10))
    create_shot_chart(shots, mode="scatter", ax=scatter_ax)
    create_shot_chart(shots, mode="hexbin", ax=hexbin_ax)
    plt.tight_layout()

    # Per-zone shooting table.
    zone_table = shot_zone_summary(shots)
    print(zone_table)

python Football

NFL Passing Stats Calculator

Calculate advanced NFL passing metrics including passer rating, AY/A, and ANY/A.

"""NFL Advanced Passing Statistics Calculator."""
import pandas as pd
import numpy as np

class PassingMetrics:
    """Static helpers for NFL passing statistics.

    Rate-style results fall back to 0 when their denominator is zero
    (or not positive, for the dict-returning helpers).
    """

    @staticmethod
    def passer_rating(comp: int, att: int, yards: int, td: int, int_: int) -> float:
        """
        Calculate NFL Passer Rating.

        Four components (completion rate, yards per attempt, touchdown
        rate, interception rate) are each limited to the range 0-2.375,
        summed, divided by 6 and scaled by 100. Perfect rating is 158.3.
        Returns 0 when there are no attempts.
        """
        if att == 0:
            return 0

        ceiling = 2.375

        def clamp(component):
            # Keep a component inside [0, ceiling].
            return max(0, min(component, ceiling))

        completion_part = clamp(((comp / att) - 0.3) * 5)
        yardage_part = clamp(((yards / att) - 3) * 0.25)
        touchdown_part = clamp((td / att) * 20)
        interception_part = clamp(ceiling - ((int_ / att) * 25))

        component_sum = (
            completion_part + yardage_part + touchdown_part + interception_part
        )
        return (component_sum / 6) * 100

    @staticmethod
    def adjusted_yards_per_attempt(yards: int, td: int, int_: int, att: int) -> float:
        """
        Adjusted Yards per Attempt (AY/A).
        AY/A = (Yards + 20*TD - 45*INT) / Attempts
        Returns 0 when there are no attempts.
        """
        if att == 0:
            return 0
        adjusted_yards = yards + 20 * td - 45 * int_
        return adjusted_yards / att

    @staticmethod
    def adjusted_net_yards_per_attempt(
        yards: int, td: int, int_: int, att: int,
        sacks: int, sack_yards: int
    ) -> float:
        """
        Adjusted Net Yards per Attempt (ANY/A).

        Same numerator as AY/A minus sack yardage, divided by attempts
        plus sacks. Returns 0 when attempts plus sacks is zero.
        """
        dropbacks = att + sacks
        if dropbacks == 0:
            return 0
        net_adjusted_yards = yards + 20 * td - 45 * int_ - sack_yards
        return net_adjusted_yards / dropbacks

    @staticmethod
    def completion_percentage_over_expected(
        actual_comp_pct: float,
        expected_comp_pct: float
    ) -> float:
        """
        Completion Percentage Over Expected (CPOE).

        Plain difference between the actual and the expected completion
        percentage; the expected value must be computed by the caller.
        """
        return actual_comp_pct - expected_comp_pct

    @staticmethod
    def air_yards_metrics(
        air_yards: int, att: int, comp: int,
        completed_air_yards: int, yards: int
    ) -> dict:
        """
        Calculate air yards related metrics.

        Returns intended and completed air yards on a per-attempt /
        per-completion basis, yards after catch (total and per
        completion) and RACR (yards divided by intended air yards).
        """
        has_attempts = att > 0
        has_completions = comp > 0
        yards_after_catch = yards - completed_air_yards

        return {
            "Intended_Air_Yards_per_Att": air_yards / att if has_attempts else 0,
            "Completed_Air_Yards_per_Comp": completed_air_yards / comp if has_completions else 0,
            "Air_Yards_per_Att": completed_air_yards / att if has_attempts else 0,
            "YAC": yards_after_catch,
            "YAC_per_Comp": yards_after_catch / comp if has_completions else 0,
            "RACR": yards / air_yards if air_yards > 0 else 0  # Receiver Air Conversion Ratio
        }

    @staticmethod
    def pressure_stats(
        pressures: int, dropbacks: int,
        pressured_comp: int, pressured_att: int, pressured_yards: int,
        clean_comp: int, clean_att: int, clean_yards: int
    ) -> dict:
        """
        Calculate passing stats under pressure vs clean pocket.

        All values are plain fractions / per-attempt averages, not
        0-100 percentages.
        """
        was_pressured = pressured_att > 0
        was_clean = clean_att > 0

        return {
            "Pressure_Rate": pressures / dropbacks if dropbacks > 0 else 0,
            "Pressured_Comp%": pressured_comp / pressured_att if was_pressured else 0,
            "Pressured_YPA": pressured_yards / pressured_att if was_pressured else 0,
            "Clean_Comp%": clean_comp / clean_att if was_clean else 0,
            "Clean_YPA": clean_yards / clean_att if was_clean else 0
        }


def calculate_qb_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate comprehensive QB statistics.

    Required columns: completions, attempts, yards, td, int
    Optional columns: sacks, sack_yards (ANY/A is added only when BOTH exist)
    Any other columns (e.g. player, air_yards) are carried through unchanged.

    Added columns: Passer_Rating, AY/A, ANY/A (optional), Comp%, YPA, TD%,
    INT%. The four basic rates are plain fractions of attempts (not
    multiplied by 100); rows with zero attempts get NaN/inf from the
    pandas division.

    Args:
        df: Passing totals, one row per quarterback.

    Returns:
        A copy of ``df`` with the metric columns appended; ``df`` itself is
        not modified.
    """
    results = df.copy()

    # Passer Rating
    results["Passer_Rating"] = df.apply(
        lambda r: PassingMetrics.passer_rating(
            r["completions"], r["attempts"], r["yards"], r["td"], r["int"]
        ), axis=1
    )

    # AY/A
    results["AY/A"] = df.apply(
        lambda r: PassingMetrics.adjusted_yards_per_attempt(
            r["yards"], r["td"], r["int"], r["attempts"]
        ), axis=1
    )

    # ANY/A reads both sack columns, so require both. Checking only "sacks"
    # would raise KeyError on a frame that lacks "sack_yards".
    if {"sacks", "sack_yards"}.issubset(df.columns):
        results["ANY/A"] = df.apply(
            lambda r: PassingMetrics.adjusted_net_yards_per_attempt(
                r["yards"], r["td"], r["int"], r["attempts"],
                r["sacks"], r["sack_yards"]
            ), axis=1
        )

    # Basic per-attempt rates
    attempts = df["attempts"]
    results["Comp%"] = df["completions"] / attempts
    results["YPA"] = df["yards"] / attempts
    results["TD%"] = df["td"] / attempts
    results["INT%"] = df["int"] / attempts

    return results


# Example usage: run the QB metrics over three sample seasons.
if __name__ == "__main__":
    # Sample QB season data, one tuple per quarterback.
    sample_columns = [
        "player", "completions", "attempts", "yards", "td", "int",
        "sacks", "sack_yards", "air_yards",
    ]
    sample_rows = [
        ("QB1", 380, 560, 4500, 35, 10, 25, 180, 4800),
        ("QB2", 360, 550, 4200, 30, 12, 35, 250, 4500),
        ("QB3", 340, 520, 3900, 28, 8, 20, 140, 4000),
    ]
    qb_stats = pd.DataFrame(sample_rows, columns=sample_columns)

    results = calculate_qb_stats(qb_stats)

    # Show only the headline metrics, rounded for readability.
    display_columns = ["player", "Comp%", "YPA", "Passer_Rating", "AY/A", "ANY/A"]
    print("QB Statistics:")
    print(results[display_columns].round(2))

python Soccer

Soccer Pass Network Analysis

Analyze team passing networks and player connectivity in soccer matches.

"""Soccer Pass Network Analysis."""
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

class PassNetwork:
    """Analyze passing networks in soccer matches.

    Players are nodes; a directed edge A -> B carries a ``weight`` attribute
    equal to the number of passes from A to B. Call :meth:`build_network`
    before any analysis or plotting method; they raise ``ValueError``
    otherwise.
    """

    def __init__(self, events: pd.DataFrame):
        """
        Initialize with event data.

        Expected columns: player_name, pass_recipient, team, location, type
        """
        self.events = events
        # Populated by build_network(); None until then.
        self.graph = None

    def _require_graph(self) -> nx.DiGraph:
        """Return the built graph, or raise ValueError if none exists yet."""
        if self.graph is None:
            raise ValueError("Build network first")
        return self.graph

    def build_network(self, team: str) -> nx.DiGraph:
        """Build directed graph of passes for a team.

        Keeps only rows where ``team`` matches, ``type`` is "Pass" and a
        recipient is recorded. The graph is stored on ``self.graph`` and
        returned; it is empty when no rows match.
        """
        events = self.events
        is_team_pass = (
            (events["team"] == team)
            & (events["type"] == "Pass")
            & events["pass_recipient"].notna()
        )

        # Count passes for every (passer, recipient) combination.
        pass_counts = (
            events.loc[is_team_pass]
            .groupby(["player_name", "pass_recipient"])
            .size()
        )

        G = nx.DiGraph()
        for (passer, recipient), count in pass_counts.items():
            # Store a plain int so edge weights are ordinary Python numbers.
            G.add_edge(passer, recipient, weight=int(count))

        self.graph = G
        return G

    def centrality_metrics(self) -> pd.DataFrame:
        """Calculate network centrality metrics for each player.

        Returns one row per player with Degree_Centrality, Betweenness,
        Closeness, PageRank, Passes_Received, Passes_Made and
        Total_Involvement, sorted by PageRank (highest first).

        Raises:
            ValueError: if build_network() has not been called.
        """
        graph = self._require_graph()
        players = list(graph.nodes())

        centralities = {
            "Degree_Centrality": nx.degree_centrality(graph),
            "Betweenness": nx.betweenness_centrality(graph),
            "Closeness": nx.closeness_centrality(graph),
            "PageRank": nx.pagerank(graph),
        }

        metrics = pd.DataFrame({"Player": players})
        for column, scores in centralities.items():
            # Look scores up by player rather than relying on each result
            # dict happening to iterate in node order.
            metrics[column] = [scores[player] for player in players]

        # Weighted degrees are the pass totals in each direction.
        received = dict(graph.in_degree(weight="weight"))
        made = dict(graph.out_degree(weight="weight"))

        metrics["Passes_Received"] = [received.get(p, 0) for p in players]
        metrics["Passes_Made"] = [made.get(p, 0) for p in players]
        metrics["Total_Involvement"] = metrics["Passes_Received"] + metrics["Passes_Made"]

        return metrics.sort_values("PageRank", ascending=False)

    def key_partnerships(self, top_n: int = 10) -> pd.DataFrame:
        """Find most frequent passing partnerships.

        Passes in both directions between two players are summed into one
        undirected partnership. Returns up to ``top_n`` rows with columns
        Partnership ("A - B", names sorted) and Passes. A graph without
        edges yields an empty frame with those columns.

        Raises:
            ValueError: if build_network() has not been called.
        """
        graph = self._require_graph()

        # Sorting the endpoints makes A->B and B->A share one key.
        edges = [
            {"Pair": tuple(sorted([u, v])), "Passes": data["weight"]}
            for u, v, data in graph.edges(data=True)
        ]
        if not edges:
            # An empty edge list would produce a frame with no "Pair" column.
            return pd.DataFrame(columns=["Partnership", "Passes"])

        partnerships = (
            pd.DataFrame(edges)
            .groupby("Pair")["Passes"]
            .sum()
            .sort_values(ascending=False)
            .head(top_n)
        )

        return pd.DataFrame({
            "Partnership": [f"{a} - {b}" for a, b in partnerships.index],
            "Passes": partnerships.values
        })

    def plot_network(self, ax=None, layout="spring"):
        """Visualize the passing network.

        Args:
            ax: Matplotlib axes to draw on; a new 12x10 figure is created
                when omitted.
            layout: "spring", "circular" or "kamada_kawai"; any other value
                falls back to a default spring layout.

        Returns:
            The axes the network was drawn on.

        Raises:
            ValueError: if build_network() has not been called.
        """
        graph = self._require_graph()

        if ax is None:
            _, ax = plt.subplots(figsize=(12, 10))

        # Layout
        if layout == "spring":
            pos = nx.spring_layout(graph, k=2, iterations=50)
        elif layout == "circular":
            pos = nx.circular_layout(graph)
        elif layout == "kamada_kawai":
            pos = nx.kamada_kawai_layout(graph)
        else:
            pos = nx.spring_layout(graph)

        # Node sizes grow with total passes made plus received.
        in_degree = dict(graph.in_degree(weight="weight"))
        out_degree = dict(graph.out_degree(weight="weight"))
        node_sizes = [
            (in_degree.get(n, 0) + out_degree.get(n, 0)) * 10 + 100
            for n in graph.nodes()
        ]

        # Edge widths are scaled so the busiest connection has width 5.
        edge_weights = [weight for _, _, weight in graph.edges(data="weight")]
        max_weight = max(edge_weights, default=1)
        edge_widths = [w / max_weight * 5 for w in edge_weights]

        # Draw
        nx.draw_networkx_nodes(graph, pos, node_size=node_sizes,
                              node_color="lightblue", alpha=0.8, ax=ax)
        nx.draw_networkx_labels(graph, pos, font_size=8, ax=ax)
        nx.draw_networkx_edges(graph, pos, width=edge_widths,
                              alpha=0.5, edge_color="gray",
                              connectionstyle="arc3,rad=0.1", ax=ax)

        ax.set_title("Team Passing Network", fontsize=14, fontweight="bold")
        ax.axis("off")

        return ax

    def network_stats(self) -> dict:
        """Calculate overall network statistics.

        Returns Nodes, Edges, Density, Average_Clustering,
        Avg_Shortest_Path and Total_Passes. Clustering and path length are
        computed on the undirected view. Avg_Shortest_Path is None when
        that view is not connected or the graph has no nodes.

        Raises:
            ValueError: if build_network() has not been called.
        """
        graph = self._require_graph()

        # Convert to undirected for clustering and path-length metrics.
        undirected = graph.to_undirected()

        # networkx raises on a graph without nodes (division by zero in
        # average_clustering, NetworkXPointlessConcept in is_connected), so
        # report neutral values for that case instead.
        has_nodes = graph.number_of_nodes() > 0
        avg_clustering = nx.average_clustering(undirected) if has_nodes else 0.0
        if has_nodes and nx.is_connected(undirected):
            avg_path = nx.average_shortest_path_length(undirected)
        else:
            avg_path = None

        return {
            "Nodes": graph.number_of_nodes(),
            "Edges": graph.number_of_edges(),
            "Density": nx.density(graph),
            "Average_Clustering": avg_clustering,
            "Avg_Shortest_Path": avg_path,
            "Total_Passes": sum(weight for _, _, weight in graph.edges(data="weight"))
        }


# Example usage
# Builds a synthetic set of pass events for one team and prints the network
# summary, the top players by PageRank and the busiest partnerships.
if __name__ == "__main__":
    # Create sample event data
    # Fixed seed so the random events (and the printed results) are repeatable.
    np.random.seed(42)
    players = ["GK", "LB", "CB1", "CB2", "RB", "CDM", "CM1", "CM2", "LW", "RW", "ST"]

    events = []
    for _ in range(300):  # 300 passes in a game
        passer = np.random.choice(players)
        # The recipient is drawn from everyone except the passer (no self-passes).
        recipient = np.random.choice([p for p in players if p != passer])
        events.append({
            "player_name": passer,
            "pass_recipient": recipient,
            "team": "Home Team",
            "type": "Pass",
            # Random (x, y) drawn from [0, 120) x [0, 80); not used by PassNetwork.
            "location": [np.random.uniform(0, 120), np.random.uniform(0, 80)]
        })

    events_df = pd.DataFrame(events)

    # Build and analyze network
    network = PassNetwork(events_df)
    # The graph is also stored on the network object; G is kept for inspection.
    G = network.build_network("Home Team")

    print("Network Stats:")
    for k, v in network.network_stats().items():
        print(f"  {k}: {v}")

    print("\nPlayer Centrality:")
    # centrality_metrics() is sorted by PageRank, so head(5) is the top five.
    print(network.centrality_metrics().head(5))

    print("\nKey Partnerships:")
    print(network.key_partnerships(5))

python Hockey

NHL Corsi and Fenwick Calculator

Calculate advanced hockey possession metrics including Corsi, Fenwick, and PDO.

"""NHL Advanced Possession Metrics."""
import pandas as pd
import numpy as np

class HockeyMetrics:
    """Calculate advanced NHL possession and efficiency metrics."""

    @staticmethod
    def corsi(shots_for: int, shots_against: int, blocked_for: int,
              blocked_against: int, missed_for: int, missed_against: int) -> dict:
        """
        Calculate Corsi metrics (all shot attempts).

        Corsi For (CF) = Shots + Blocked + Missed
        Corsi% = CF / (CF + CA)
        """
        cf = shots_for + blocked_for + missed_for
        ca = shots_against + blocked_against + missed_against

        return {
            "CF": cf,
            "CA": ca,
            "Corsi_Diff": cf - ca,
            "Corsi%": cf / (cf + ca) * 100 if (cf + ca) > 0 else 50
        }

    @staticmethod
    def fenwick(shots_for: int, shots_against: int,
                missed_for: int, missed_against: int) -> dict:
        """
        Calculate Fenwick metrics (unblocked shot attempts).

        Fenwick excludes blocked shots as they are somewhat random.
        """
        ff = shots_for + missed_for
        fa = shots_against + missed_against

        return {
            "FF": ff,
            "FA": fa,
            "Fenwick_Diff": ff - fa,
            "Fenwick%": ff / (ff + fa) * 100 if (ff + fa) > 0 else 50
        }

    @staticmethod
    def pdo(shooting_pct: float, save_pct: float) -> float:
        """
        Calculate PDO (shooting % + save %).

        PDO around 100 is sustainable, extreme values tend to regress.
        """
        return shooting_pct + save_pct * 100

    @staticmethod
    def expected_goals(shots: pd.DataFrame) -> float:
        """
        Calculate expected goals based on shot quality.

        Simplified model - real xG uses ML with many features.
        """
        # Base xG by shot type (simplified)
        xg_by_type = {
            "WRIST": 0.05,
            "SLAP": 0.04,
            "SNAP": 0.06,
            "BACKHAND": 0.08,
            "DEFLECTION": 0.15,
            "TIP-IN": 0.20,
            "WRAP": 0.10
        }

        # Distance adjustment
        def distance_factor(dist):
            if dist < 10:
                return 2.0
            elif dist < 20:
                return 1.5
            elif dist < 30:
                return 1.0
            elif dist < 40:
                return 0.7
            else:
                return 0.3

        total_xg = 0
        for _, shot in shots.iterrows():
            base = xg_by_type.get(shot.get("shot_type", "WRIST"), 0.05)
            dist_mult = distance_factor(shot.get("distance", 30))
            total_xg += base * dist_mult

        return total_xg

    @staticmethod
    def relative_metrics(player_on: dict, player_off: dict) -> dict:
        """
        Calculate relative metrics (player on ice vs off ice).

        Positive = team is better with player on ice.
        """
        return {
            "Rel_Corsi%": player_on.get("Corsi%", 50) - player_off.get("Corsi%", 50),
            "Rel_Fenwick%": player_on.get("Fenwick%", 50) - player_off.get("Fenwick%", 50),
            "Rel_GF%": player_on.get("GF%", 50) - player_off.get("GF%", 50)
        }

    @staticmethod
    def zone_starts(off_zone: int, def_zone: int, neutral: int) -> dict:
        """
        Calculate zone start percentages.

        High offensive zone starts = easier deployment.
        """
        total = off_zone + def_zone + neutral
        return {
            "OZS%": off_zone / total * 100 if total > 0 else 0,
            "DZS%": def_zone / total * 100 if total > 0 else 0,
            "NZS%": neutral / total * 100 if total > 0 else 0,
            "ZS_Diff": (off_zone - def_zone) / total * 100 if total > 0 else 0
        }


def calculate_team_metrics(team_stats: pd.DataFrame) -> pd.DataFrame:
    """Calculate advanced metrics for teams.

    Args:
        team_stats: One row per team with columns Team, SOG, SOG_Against,
            Goals_For, Goals_Against, Blocked_For, Blocked_Against,
            Missed_For and Missed_Against.

    Returns:
        DataFrame with one row per team: Team, the Corsi keys (CF, CA,
        Corsi_Diff, Corsi%), the Fenwick keys (FF, FA, Fenwick_Diff,
        Fenwick%), PDO, Sh% and Sv%. Sh%, Sv% and PDO are NaN where the
        relevant shot total is zero, since the rate is undefined.
    """
    undefined = float("nan")
    results = []

    for _, row in team_stats.iterrows():
        corsi = HockeyMetrics.corsi(
            row["SOG"], row["SOG_Against"],
            row["Blocked_For"], row["Blocked_Against"],
            row["Missed_For"], row["Missed_Against"]
        )

        fenwick = HockeyMetrics.fenwick(
            row["SOG"], row["SOG_Against"],
            row["Missed_For"], row["Missed_Against"]
        )

        # Guard the divisions: a team with no shots for/against would
        # otherwise divide by zero.
        if row["SOG"] > 0:
            shooting_pct = row["Goals_For"] / row["SOG"] * 100
        else:
            shooting_pct = undefined
        if row["SOG_Against"] > 0:
            save_fraction = 1 - row["Goals_Against"] / row["SOG_Against"]
        else:
            save_fraction = undefined

        results.append({
            "Team": row["Team"],
            **corsi,
            **fenwick,
            # pdo() takes shooting as a percentage and save as a fraction.
            "PDO": HockeyMetrics.pdo(shooting_pct, save_fraction),
            "Sh%": shooting_pct,
            "Sv%": save_fraction * 100
        })

    return pd.DataFrame(results)


def calculate_player_metrics(player_events: pd.DataFrame, player_name: str) -> dict:
    """
    Calculate metrics for a specific player.

    Events are split by whether ``player_name`` appears in the
    ``player_on_ice`` text (rows with a missing value count as off-ice).
    Corsi and Fenwick are computed for the on-ice rows and compared with
    the off-ice rows.

    Args:
        player_events: Events with ``player_on_ice`` and ``event_type``
            columns. Event types counted: SHOT, SHOT_AGAINST, BLOCKED_SHOT,
            BLOCKED_SHOT_AGAINST, MISSED_SHOT, MISSED_SHOT_AGAINST.
        player_name: Name to look for, matched as a literal substring.
            NOTE(review): a name that is a prefix of another player's name
            (e.g. "Smith" / "Smithson") still matches both; the delimiter
            used inside ``player_on_ice`` is not visible here.

    Returns:
        Dict with Player, the on-ice Corsi and Fenwick keys, Rel_Corsi%,
        Rel_Fenwick%, Rel_GF% and TOI. Rel_GF% is always 0 because no GF%
        is computed for either split.
    """
    # regex=False: the name is data, not a pattern. As a regex, "." in
    # "J.T. Miller" would match any character and "(" would raise.
    is_on_ice = player_events["player_on_ice"].str.contains(
        player_name, na=False, regex=False
    )
    on_ice = player_events[is_on_ice]
    off_ice = player_events[~is_on_ice]

    def get_metrics(events):
        counts = events["event_type"].value_counts()

        def count(event_type):
            return int(counts.get(event_type, 0))

        shots_for = count("SHOT")
        shots_against = count("SHOT_AGAINST")
        missed = count("MISSED_SHOT")
        missed_against = count("MISSED_SHOT_AGAINST")

        # Include Fenwick so Rel_Fenwick% reflects the data instead of
        # always falling back to 50 - 50.
        return {
            **HockeyMetrics.corsi(
                shots_for, shots_against,
                count("BLOCKED_SHOT"), count("BLOCKED_SHOT_AGAINST"),
                missed, missed_against
            ),
            **HockeyMetrics.fenwick(shots_for, shots_against, missed, missed_against),
        }

    on_metrics = get_metrics(on_ice)
    off_metrics = get_metrics(off_ice)
    rel_metrics = HockeyMetrics.relative_metrics(on_metrics, off_metrics)

    return {
        "Player": player_name,
        **on_metrics,
        **rel_metrics,
        "TOI": len(on_ice) / 60  # Simplified TOI estimate
    }


# Example usage: team-level possession metrics for five sample clubs.
if __name__ == "__main__":
    # Sample team data, one tuple per team in the column order below.
    stat_columns = [
        "Team", "SOG", "SOG_Against", "Goals_For", "Goals_Against",
        "Blocked_For", "Blocked_Against", "Missed_For", "Missed_Against",
    ]
    team_rows = [
        ("TOR", 2500, 2300, 250, 220, 800, 720, 600, 550),
        ("BOS", 2450, 2200, 240, 200, 750, 680, 550, 500),
        ("TBL", 2600, 2400, 270, 235, 850, 780, 650, 600),
        ("FLA", 2400, 2500, 230, 245, 700, 750, 580, 620),
        ("NYR", 2550, 2350, 255, 225, 780, 700, 620, 570),
    ]
    teams = pd.DataFrame(team_rows, columns=stat_columns)

    results = calculate_team_metrics(teams)

    # Print the headline percentages, rounded for readability.
    summary_columns = ["Team", "Corsi%", "Fenwick%", "PDO", "Sh%", "Sv%"]
    print("Team Advanced Metrics:")
    print(results[summary_columns].round(2))

python Football

Elo Rating System

Implement Elo rating system for any sport with customizable K-factor and home advantage.

"""Elo Rating System for sports."""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional
from datetime import date

class EloRating:
    """
    Elo rating system for sports teams/players.

    Features:
    - Customizable K-factor
    - Home field advantage
    - Margin of victory adjustment
    - Season regression
    """

    def __init__(
        self,
        k_factor: float = 20,
        home_advantage: float = 100,
        initial_rating: float = 1500,
        regression_factor: float = 0.33
    ):
        """
        Initialize Elo system.

        Args:
            k_factor: Base rating change per game (scaled by the
                margin-of-victory multiplier when that is enabled)
            home_advantage: Elo points added to the home team's rating
                when computing expected results
            initial_rating: Starting rating for new teams
            regression_factor: How much ratings regress to mean between seasons
        """
        self.k = k_factor
        self.home_adv = home_advantage
        self.initial = initial_rating
        self.regression = regression_factor

        # Current rating per team; teams not present use ``initial``.
        self.ratings: Dict[str, float] = {}
        # One record per processed game, appended by update().
        self.history: List[Dict] = []

    def get_rating(self, team: str) -> float:
        """Get current rating for a team (initial rating if unseen)."""
        return self.ratings.get(team, self.initial)

    def expected_score(self, rating_a: float, rating_b: float) -> float:
        """
        Calculate expected score for team A vs team B.

        Returns probability of team A winning.
        """
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

    def _home_adjusted(
        self, rating_a: float, rating_b: float, is_home_a: bool
    ) -> Tuple[float, float]:
        """Return both ratings with the home bonus added to the home side only."""
        if is_home_a:
            return rating_a + self.home_adv, rating_b
        return rating_a, rating_b + self.home_adv

    def margin_of_victory_mult(
        self,
        score_diff: float,
        winner_elo: float,
        loser_elo: float
    ) -> float:
        """
        Calculate margin of victory multiplier.

        Larger victories = larger rating changes, but with diminishing returns.
        Also adjusts for elo difference (upsets get bigger boost).
        """
        elo_diff = winner_elo - loser_elo

        # FiveThirtyEight NFL formula
        return np.log(abs(score_diff) + 1) * (2.2 / ((elo_diff * 0.001) + 2.2))

    def update(
        self,
        team_a: str,
        team_b: str,
        score_a: float,
        score_b: float,
        is_home_a: bool = True,
        use_mov: bool = True
    ) -> Tuple[float, float]:
        """
        Update ratings after a game.

        Args:
            team_a: First team
            team_b: Second team
            score_a: Team A score
            score_b: Team B score
            is_home_a: Is team A the home team? (otherwise team B is)
            use_mov: Use margin of victory adjustment?

        Returns:
            Tuple of (new_rating_a, new_rating_b)
        """
        # Get current ratings
        rating_a = self.get_rating(team_a)
        rating_b = self.get_rating(team_b)

        # Apply home advantage once, to the home team only. Adding it to
        # one side and subtracting it from the other would make the
        # effective edge twice ``home_advantage``.
        adjusted_a, adjusted_b = self._home_adjusted(rating_a, rating_b, is_home_a)

        # Calculate expected scores
        exp_a = self.expected_score(adjusted_a, adjusted_b)
        exp_b = 1 - exp_a

        # Actual scores (1 for win, 0.5 for tie, 0 for loss)
        if score_a > score_b:
            actual_a, actual_b = 1, 0
        elif score_a < score_b:
            actual_a, actual_b = 0, 1
        else:
            actual_a, actual_b = 0.5, 0.5

        # K-factor adjustment for margin of victory (ties keep k_mult = 1)
        k_mult = 1
        if use_mov and score_a != score_b:
            a_won = score_a > score_b
            winner_elo = rating_a if a_won else rating_b
            loser_elo = rating_b if a_won else rating_a
            k_mult = self.margin_of_victory_mult(
                abs(score_a - score_b), winner_elo, loser_elo
            )

        # Update ratings
        k_adjusted = self.k * k_mult
        new_rating_a = rating_a + k_adjusted * (actual_a - exp_a)
        new_rating_b = rating_b + k_adjusted * (actual_b - exp_b)

        # Store updates
        self.ratings[team_a] = new_rating_a
        self.ratings[team_b] = new_rating_b

        # Record history
        self.history.append({
            "team_a": team_a,
            "team_b": team_b,
            "score_a": score_a,
            "score_b": score_b,
            "rating_a_before": rating_a,
            "rating_b_before": rating_b,
            "rating_a_after": new_rating_a,
            "rating_b_after": new_rating_b,
            "expected_a": exp_a,
            "k_mult": k_mult
        })

        return new_rating_a, new_rating_b

    def new_season(self):
        """
        Apply regression to mean for new season.

        Each rating moves ``regression_factor`` of the way toward the mean
        of all current ratings. Does nothing when no team is rated yet.
        """
        mean_rating = np.mean(list(self.ratings.values())) if self.ratings else self.initial

        for team in self.ratings:
            self.ratings[team] = (
                self.ratings[team] * (1 - self.regression) +
                mean_rating * self.regression
            )

    def predict(self, team_a: str, team_b: str, is_home_a: bool = True) -> Dict:
        """
        Predict game outcome.

        Returns dict with win probabilities (prob_a, prob_b), the predicted
        spread in points from team A's perspective (positive = A favoured),
        and both teams' current unadjusted ratings.
        """
        rating_a, rating_b = self._home_adjusted(
            self.get_rating(team_a), self.get_rating(team_b), is_home_a
        )

        prob_a = self.expected_score(rating_a, rating_b)

        # Elo to spread conversion (rough: 25 Elo = 1 point)
        spread = (rating_a - rating_b) / 25

        return {
            "prob_a": prob_a,
            "prob_b": 1 - prob_a,
            "spread": round(spread, 1),
            "rating_a": self.get_rating(team_a),
            "rating_b": self.get_rating(team_b)
        }

    def get_rankings(self) -> pd.DataFrame:
        """Get current rankings.

        Returns a frame with columns Rank, Team, Rating sorted by rating
        (highest first); empty, with the same columns, when no team has
        been rated yet.
        """
        # Naming the columns keeps the frame well-formed when there are no
        # ratings; otherwise sorting by "Rating" raises KeyError.
        df = pd.DataFrame(list(self.ratings.items()), columns=["Team", "Rating"])
        df = df.sort_values("Rating", ascending=False).reset_index(drop=True)
        df["Rank"] = df.index + 1
        return df[["Rank", "Team", "Rating"]]

    def process_games(self, games: pd.DataFrame) -> "EloRating":
        """
        Process multiple games in row order.

        Expected columns: team_a, team_b, score_a, score_b, is_home_a (optional)

        Team A is treated as the home team when ``is_home_a`` is absent.
        Returns ``self`` so calls can be chained.
        """
        for _, row in games.iterrows():
            is_home_a = row.get("is_home_a", True)
            self.update(
                row["team_a"],
                row["team_b"],
                row["score_a"],
                row["score_b"],
                is_home_a
            )
        return self


# Example usage
# Simulates 17 weeks of randomly paired games between 16 teams, feeds them
# through the Elo system, then prints the rankings and one prediction.
if __name__ == "__main__":
    # Initialize Elo system
    elo = EloRating(k_factor=20, home_advantage=65)

    # Sample NFL season games
    # Fixed seed so the shuffles and scores (and the output) are repeatable.
    np.random.seed(42)
    teams = ["KC", "BUF", "PHI", "SF", "DAL", "MIA", "BAL", "DET",
             "CIN", "JAX", "NYJ", "LAC", "SEA", "MIN", "GB", "TB"]

    # Generate sample games
    games = []
    for week in range(17):
        # Shuffle in place, then pair neighbours: even index hosts odd index.
        np.random.shuffle(teams)
        for i in range(0, len(teams), 2):
            home = teams[i]
            away = teams[i+1]
            # Generate scores
            # Poisson draws; the home side uses the higher mean (24 vs 21).
            home_score = np.random.poisson(24)
            away_score = np.random.poisson(21)
            games.append({
                "week": week + 1,
                "team_a": home,
                "team_b": away,
                "score_a": home_score,
                "score_b": away_score,
                "is_home_a": True
            })

    games_df = pd.DataFrame(games)

    # Process all games
    elo.process_games(games_df)

    # Show rankings
    print("Final Elo Rankings:")
    print(elo.get_rankings())

    # Make a prediction
    prediction = elo.predict("KC", "BUF", is_home_a=True)
    print(f"\nKC vs BUF prediction:")
    print(f"  KC win prob: {prediction['prob_a']:.1%}")
    # The spread is from KC's perspective (KC is team A in the call above).
    print(f"  Spread: KC {prediction['spread']}")

python Basketball

Win Probability Model

Calculate real-time win probability for sports games based on game state.

"""Win Probability model for sports games."""
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier
from typing import Dict, List, Tuple

class WinProbability:
    """
    Calculate real-time win probability based on game state.

    Features:
    - Score differential
    - Time remaining
    - Possession (for applicable sports)
    - Field position (football)
    - Historical modeling
    """

    # Default scoring rate per time unit for each supported sport.
    _DEFAULT_TEMPO = {
        "basketball": 100 / 48,  # ~100 points per 48 min
        "football": 24 / 60,     # ~24 points per 60 min
        "soccer": 2.7 / 90,      # ~2.7 goals per 90 min
        "hockey": 6 / 60,        # ~6 goals per 60 min
        "baseball": 9 / 9        # ~9 runs per 9 innings
    }

    def __init__(self, sport: str = "basketball"):
        """Create a model for ``sport``; unknown sports use a tempo of 1."""
        self.sport = sport
        # Set by train_model(); while None, predictions are analytical.
        self.model = None
        self.feature_cols: List[str] = []

    @staticmethod
    def _decided_wp(score_diff: float) -> float:
        """Win probability when nothing more can change: 1, 0.5 (tie) or 0."""
        if score_diff > 0:
            return 1.0
        if score_diff == 0:
            return 0.5
        return 0.0

    def analytical_wp(
        self,
        score_diff: float,
        time_remaining: float,
        total_time: float,
        tempo: float = None
    ) -> float:
        """
        Calculate win probability analytically.

        Uses normal distribution assumption for score differential.

        Args:
            score_diff: Current lead (positive = winning)
            time_remaining: Time remaining in game
            total_time: Total game time (accepted for interface
                compatibility; the current formula does not use it)
            tempo: Scoring rate (points per time unit); defaults to a
                per-sport value

        Returns:
            Probability in [0, 1] that the team with ``score_diff`` wins.
        """
        if time_remaining <= 0:
            return self._decided_wp(score_diff)

        # Estimate scoring volatility
        if tempo is None:
            tempo = self._DEFAULT_TEMPO.get(self.sport, 1)

        # Standard deviation of scoring in remaining time
        # Approximate: std grows with sqrt of time
        expected_std = tempo * np.sqrt(time_remaining * 2)  # Both teams score

        # With no volatility (e.g. tempo == 0) the current lead decides the
        # game, so a trailing team gets 0.0 rather than 0.5.
        if not expected_std > 0:
            return self._decided_wp(score_diff)

        # Win probability using normal CDF
        return stats.norm.cdf(score_diff / expected_std)

    def train_model(
        self,
        game_data: pd.DataFrame,
        feature_cols: List[str],
        target_col: str = "home_win"
    ) -> "WinProbability":
        """
        Train ML model for win probability.

        Args:
            game_data: DataFrame with game state features and outcomes
            feature_cols: Columns to use as features
            target_col: Binary win indicator column

        Returns:
            ``self``, with the fitted model stored for predict_wp().
        """
        X = game_data[feature_cols]
        y = game_data[target_col]

        self.feature_cols = feature_cols
        self.model = GradientBoostingClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42
        )
        self.model.fit(X, y)

        return self

    def predict_wp(self, game_state: Dict) -> float:
        """
        Predict win probability for a game state.

        Uses the trained model when one exists (missing features default
        to 0). Otherwise falls back to analytical_wp(), reading score_diff,
        time_remaining and total_time (default 48) from ``game_state``.

        Args:
            game_state: Dict with feature values
        """
        if self.model is None:
            # Use analytical method
            return self.analytical_wp(
                game_state.get("score_diff", 0),
                game_state.get("time_remaining", 0),
                game_state.get("total_time", 48)
            )

        # Use trained model
        X = pd.DataFrame([{col: game_state.get(col, 0) for col in self.feature_cols}])
        return self.model.predict_proba(X)[0, 1]

    def calculate_wpa(
        self,
        game_log: pd.DataFrame,
        time_col: str = "time_remaining",
        score_col: str = "score_diff"
    ) -> pd.DataFrame:
        """
        Calculate Win Probability Added for each play.

        WPA = WP(after) - WP(before)

        WP before each play comes from analytical_wp(). WP after a play is
        the WP before the next one; for the last play it is the decided
        result of the final score (1.0 win, 0.5 tie, 0.0 loss).

        Args:
            game_log: Plays in chronological order
            time_col: Column holding time remaining
            score_col: Column holding the score differential

        Returns:
            A copy of ``game_log`` with wp_before, wp_after and wpa columns
            (all empty for an empty log).
        """
        game_log = game_log.copy()

        if len(game_log) > 0:
            # Loop-invariant: the largest remaining time stands in for the
            # total game time.
            total_time = game_log[time_col].max()

            # Calculate WP before each play
            wp_before = [
                self.analytical_wp(score, time_left, total_time)
                for score, time_left in zip(game_log[score_col], game_log[time_col])
            ]

            # WP after is WP before of next play; the last play resolves to
            # the final result, with a tie worth 0.5.
            final_wp = self._decided_wp(game_log[score_col].iloc[-1])
            wp_after = wp_before[1:] + [final_wp]
        else:
            wp_before, wp_after = [], []

        game_log["wp_before"] = np.asarray(wp_before, dtype=float)
        game_log["wp_after"] = np.asarray(wp_after, dtype=float)

        # Calculate WPA
        game_log["wpa"] = game_log["wp_after"] - game_log["wp_before"]

        return game_log

    def leverage_index(
        self,
        score_diff: float,
        time_remaining: float,
        total_time: float
    ) -> float:
        """
        Calculate Leverage Index (importance of situation).

        LI = sensitivity of WP to scoring events.
        Higher LI = more important moment.
        """
        # Calculate WP if +1 and -1 scoring event
        wp_plus = self.analytical_wp(score_diff + 1, time_remaining, total_time)
        wp_minus = self.analytical_wp(score_diff - 1, time_remaining, total_time)

        # LI is the change in WP from a scoring event
        # Normalized so average is ~1.0
        li = (wp_plus - wp_minus) / 0.04  # 0.04 is approximately average WP swing

        return max(li, 0)


def plot_win_probability(wp_series: pd.Series):
    """
    Draw a win probability chart for a sequence of plays.

    Plots the series against play number, marks the 0.5 line, and shades
    the area between the curve and 0.5 green where the probability is at
    least 0.5 and red where it is below. Returns the ``matplotlib.pyplot``
    module so the caller can show or save the figure.
    """
    import matplotlib.pyplot as plt

    probabilities = wp_series.values
    play_numbers = range(len(wp_series))

    plt.figure(figsize=(12, 6))
    plt.plot(probabilities, linewidth=2)
    plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

    # Shade favourable stretches first, then unfavourable ones.
    shaded_regions = (
        (probabilities >= 0.5, 'green'),
        (probabilities < 0.5, 'red'),
    )
    for region_mask, shade in shaded_regions:
        plt.fill_between(play_numbers,
                         probabilities,
                         0.5,
                         alpha=0.3,
                         where=region_mask,
                         color=shade)

    plt.xlabel("Play Number")
    plt.ylabel("Home Team Win Probability")
    plt.title("Win Probability Chart")
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    return plt


# Example usage
# Prints analytical win probability and leverage for a few fixed basketball
# situations, then simulates one game and lists its highest-WPA plays.
if __name__ == "__main__":
    # Basketball example
    wp_model = WinProbability(sport="basketball")

    # Sample game situations
    # time_remaining is in minutes of a 48-minute game (0.1 min = 6 seconds).
    situations = [
        {"score_diff": 0, "time_remaining": 48, "description": "Start of game"},
        {"score_diff": 10, "time_remaining": 24, "description": "Up 10 at half"},
        {"score_diff": 5, "time_remaining": 12, "description": "Up 5, 4th quarter"},
        {"score_diff": -3, "time_remaining": 2, "description": "Down 3, 2 min left"},
        {"score_diff": 2, "time_remaining": 0.1, "description": "Up 2, 6 seconds left"}
    ]

    print("Basketball Win Probabilities:")
    for sit in situations:
        wp = wp_model.analytical_wp(sit["score_diff"], sit["time_remaining"], 48)
        li = wp_model.leverage_index(sit["score_diff"], sit["time_remaining"], 48)
        print(f"  {sit['description']}: WP={wp:.1%}, LI={li:.2f}")

    # Simulate a game for WP chart
    # Fixed seed so the simulated game (and the output) is repeatable.
    np.random.seed(42)
    plays = []
    score_diff = 0
    time_left = 48

    # Each iteration is one scoring event; the loop ends once the clock hits 0.
    while time_left > 0:
        time_elapsed = np.random.exponential(0.5)  # Minutes between scores
        # Clamp at 0 so the last event lands exactly at the end of the game.
        time_left = max(0, time_left - time_elapsed)
        # Signed 2- or 3-point change; the probabilities are symmetric around 0.
        score_change = np.random.choice([-3, -2, 2, 3], p=[0.2, 0.3, 0.3, 0.2])
        score_diff += score_change

        plays.append({
            "time_remaining": time_left,
            "score_diff": score_diff
        })

    game_df = pd.DataFrame(plays)
    # Adds the wp_before, wp_after and wpa columns.
    game_df = wp_model.calculate_wpa(game_df)

    print(f"\nSimulated game: Final score diff = {score_diff}")
    print("Highest WPA plays:")
    print(game_df.nlargest(5, "wpa")[["time_remaining", "score_diff", "wp_before", "wp_after", "wpa"]])

python Baseball

Bayesian Player True Talent Estimation

Estimate player true talent levels using Bayesian methods with population priors.

"""Bayesian True Talent Estimation for player statistics."""
import numpy as np
import pandas as pd
from scipy import stats
from typing import Tuple, Dict, Optional

class BayesianTrueTalent:
    """
    Estimate true talent using Bayesian methods.

    Combines observed performance with a league-wide prior to get better
    estimates, especially for small samples. The posterior is computed with a
    normal approximation: a precision-weighted average of the prior mean and
    the observed value.
    """

    # Sample size assumed by ``fit_prior`` when no per-player sizes are given.
    _ASSUMED_SAMPLE_SIZE = 500
    # Floors that keep the estimated prior variance strictly positive.
    _MIN_CONTINUOUS_VAR = 0.001
    _MIN_RATE_VAR = 0.0001

    def __init__(self, prior_mean: Optional[float] = None, prior_var: Optional[float] = None):
        """
        Initialize with prior parameters.

        Args:
            prior_mean: Mean of the population prior (None = fit from data).
            prior_var: Variance of the population prior (None = fit from data).
        """
        self.prior_mean = prior_mean
        self.prior_var = prior_var
        # Not used by this class; kept so existing readers of the attribute still work.
        self.binomial_n = None

    def fit_prior(self, observed: pd.Series, n: pd.Series = None) -> "BayesianTrueTalent":
        """
        Estimate prior parameters from population data.

        For rates (``n`` given): method-of-moments fit on successes/trials.
        For continuous stats (``n`` omitted): sample mean and variance.

        Args:
            observed: Observed values (rates or means)
            n: Sample sizes (for rates)

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: If ``n`` is given but no player has a positive sample size.
        """
        if n is not None:
            # For rates - convert back to success counts and fit on those
            successes = observed * n
            self._fit_beta_binomial(successes.values, n.values)
        else:
            # For continuous stats - use normal model
            self.prior_mean = observed.mean()
            # Binomial-style sampling variance at an assumed sample size. It is
            # only meaningful for means in [0, 1]; outside that range the
            # product is negative, so clamp it to zero instead of inflating
            # the prior variance.
            sampling_var = max(
                self.prior_mean * (1 - self.prior_mean) / self._ASSUMED_SAMPLE_SIZE,
                0.0,
            )
            true_var = observed.var() - sampling_var
            # A single observation has NaN variance; fall back to the floor.
            if np.isfinite(true_var):
                self.prior_var = max(true_var, self._MIN_CONTINUOUS_VAR)
            else:
                self.prior_var = self._MIN_CONTINUOUS_VAR

        return self

    def _fit_beta_binomial(self, successes: np.ndarray, trials: np.ndarray):
        """
        Fit prior mean/variance for a rate stat using method of moments.

        Players with zero (or negative) trials carry no information and would
        produce 0/0 rates, so they are dropped before fitting.

        Raises:
            ValueError: If no player has a positive number of trials.
        """
        successes = np.asarray(successes, dtype=float)
        trials = np.asarray(trials, dtype=float)

        usable = trials > 0
        if not usable.any():
            raise ValueError("Need at least one player with a positive sample size")
        successes = successes[usable]
        trials = trials[usable]

        rates = successes / trials

        # Trial-weighted mean and variance of the observed rates
        weights = trials / trials.sum()
        mean_rate = np.average(rates, weights=weights)
        var_rate = np.average((rates - mean_rate)**2, weights=weights)

        # Average within-player (binomial sampling) variance
        expected_binomial_var = np.mean(rates * (1 - rates) / trials)

        # Between-player variance (true talent variance), floored above zero
        between_var = max(var_rate - expected_binomial_var, self._MIN_RATE_VAR)

        self.prior_mean = mean_rate
        self.prior_var = between_var

    def _observation_variance(self, observed: float, n: float) -> float:
        """Return the sampling variance of ``observed`` for a sample of size ``n`` (n > 0)."""
        if 0 <= observed <= 1:  # Treated as a rate
            obs_var = observed * (1 - observed) / n
            if obs_var > 0:
                return obs_var
            # observed is exactly 0 or 1: the plug-in binomial variance is zero,
            # which would give the observation infinite weight (and divide by
            # zero). Use the prior mean as the rate for the variance instead.
            prior_rate_var = self.prior_mean * (1 - self.prior_mean)
            if prior_rate_var > 0:
                return prior_rate_var / n
        # Continuous stat (or no usable rate variance): approximate
        return self.prior_var / n

    def estimate(
        self,
        observed: float,
        n: int,
        return_interval: bool = False
    ) -> Dict:
        """
        Estimate true talent for a single observation.

        Args:
            observed: Observed rate or mean
            n: Sample size
            return_interval: Return credible interval?

        Returns:
            Dict with keys ``observed``, ``estimated``, ``regression_pct`` and
            ``n``; plus ``ci_lower`` / ``ci_upper`` when ``return_interval``.

        Raises:
            ValueError: If the prior is not set, ``prior_var`` is not positive,
                or ``n`` is negative.
        """
        if self.prior_mean is None or self.prior_var is None:
            raise ValueError("Must fit prior first or provide prior parameters")
        if self.prior_var <= 0:
            raise ValueError("prior_var must be positive")
        if n < 0:
            raise ValueError("n must be non-negative")

        if n == 0:
            # No data at all: the posterior is just the prior.
            posterior_mean = self.prior_mean
            posterior_var = self.prior_var
            regression_pct = 1.0
        else:
            # Using normal approximation for simplicity
            obs_var = self._observation_variance(observed, n)

            # Posterior mean (precision-weighted average)
            total_precision = 1/self.prior_var + 1/obs_var
            posterior_mean = (
                (self.prior_mean / self.prior_var + observed / obs_var) /
                total_precision
            )

            # Posterior variance
            posterior_var = 1 / total_precision

            # Share of the estimate that comes from the prior
            regression_pct = obs_var / (self.prior_var + obs_var)

        result = {
            "observed": observed,
            "estimated": posterior_mean,
            "regression_pct": regression_pct,
            "n": n
        }

        if return_interval:
            # 95% credible interval under the normal approximation
            posterior_std = np.sqrt(posterior_var)
            result["ci_lower"] = posterior_mean - 1.96 * posterior_std
            result["ci_upper"] = posterior_mean + 1.96 * posterior_std

        return result

    def estimate_population(
        self,
        df: pd.DataFrame,
        obs_col: str,
        n_col: str,
        name_col: str = None
    ) -> pd.DataFrame:
        """
        Estimate true talent for all players.

        Args:
            df: DataFrame with player data
            obs_col: Column with observed rates/means
            n_col: Column with sample sizes
            name_col: Column with player names

        Returns:
            One row per player (with credible intervals), sorted by
            ``estimated`` descending. An empty input yields an empty frame
            with the same columns.
        """
        columns = ["observed", "estimated", "regression_pct", "n", "ci_lower", "ci_upper"]
        if name_col:
            columns.append("player")

        # Iterate column-wise so integer sample sizes are not upcast to float.
        names = df[name_col] if name_col else [None] * len(df)
        results = []
        for observed, n, name in zip(df[obs_col], df[n_col], names):
            est = self.estimate(observed, n, return_interval=True)
            if name_col:
                est["player"] = name
            results.append(est)

        if not results:
            return pd.DataFrame(columns=columns)

        result_df = pd.DataFrame(results)

        # Sort by estimate
        result_df = result_df.sort_values("estimated", ascending=False)

        return result_df


def regress_to_mean(
    observed: float,
    n: int,
    league_mean: float,
    regression_n: int = 1200
) -> float:
    """
    Shrink an observed rate toward the league average.

    The observed value gets weight ``n / (n + regression_n)`` and the league
    mean gets the remainder, so at ``n == regression_n`` the two are blended
    equally.

    Args:
        observed: Observed rate
        n: Sample size (e.g., PA for batting average)
        league_mean: League average
        regression_n: Sample size where regression = 50%
                     (~1200 PA for batting average)

    Returns:
        The regressed estimate.
    """
    total_n = n + regression_n
    observed_share = n / total_n
    league_share = 1 - observed_share
    return observed_share * observed + league_share * league_mean


# Example usage
if __name__ == "__main__":
    # Demo script: simulates batting averages for players whose true talent is
    # known, fits a prior from the observed data, and compares the error of the
    # raw averages against the Bayesian estimates.

    # Create sample batting data
    # Fixed seed so the printed numbers are reproducible.
    np.random.seed(42)
    n_players = 100

    # True talent (unknown in real life)
    true_talent = np.random.beta(80, 240, n_players)  # ~.250 average

    # Observed performance
    pa = np.random.randint(100, 600, n_players)
    # One binomial draw per player: hits out of that player's PA at their true rate.
    hits = np.array([np.random.binomial(p, t) for p, t in zip(pa, true_talent)])
    observed_avg = hits / pa

    players = pd.DataFrame({
        "Player": [f"Player_{i}" for i in range(n_players)],
        "PA": pa,
        "H": hits,
        "AVG": observed_avg,
        "True_Talent": true_talent  # Usually unknown
    })

    # Fit Bayesian model
    bayes = BayesianTrueTalent()
    # Passing the sample sizes selects the rate (successes/trials) prior fit.
    bayes.fit_prior(players["AVG"], players["PA"])

    print(f"Prior Mean: {bayes.prior_mean:.3f}")
    print(f"Prior Std: {np.sqrt(bayes.prior_var):.3f}")

    # Estimate true talent
    # The result is sorted by "estimated" descending and names players in a
    # lowercase "player" column.
    estimates = bayes.estimate_population(players, "AVG", "PA", "Player")

    # Compare with actual (we know true talent in simulation)
    estimates = estimates.merge(
        players[["Player", "True_Talent"]],
        left_on="player",
        right_on="Player"
    )

    print("\nTop 10 Estimated Players:")
    print(estimates[["player", "observed", "estimated", "True_Talent", "n"]].head(10).round(3))

    # Error comparison
    # Both errors are means over all players, so the differing row order of
    # `players` and `estimates` does not matter.
    obs_error = np.abs(players["AVG"] - players["True_Talent"]).mean()
    est_error = np.abs(estimates["estimated"] - estimates["True_Talent"]).mean()
    print(f"\nMean Absolute Error:")
    print(f"  Observed: {obs_error:.4f}")
    print(f"  Estimated: {est_error:.4f}")
    print(f"  Improvement: {(obs_error - est_error) / obs_error:.1%}")

python Baseball

Player Aging Curves

Model how player performance changes with age across different sports.

"""Player Aging Curve Analysis."""
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from sklearn.linear_model import Ridge
import warnings

class AgingCurve:
    """
    Model player performance aging curves.

    Methods:
    - Delta method (year-over-year changes)
    - Parametric curve fitting (quadratic or asymmetric)
    - Age adjustment based on a fitted parametric curve
    """

    def __init__(self, peak_age: int = None, sport: str = "baseball"):
        """
        Initialize aging curve model.

        Args:
            peak_age: Expected peak age (None = use the sport default)
            sport: Sport for default parameters (unknown sports default to 27)
        """
        # Default peak ages by sport
        default_peaks = {
            "baseball": 27,
            "basketball": 27,
            "football": 27,
            "soccer": 26,
            "hockey": 25
        }
        # Explicit None check so only a missing peak_age triggers the default.
        self.peak_age = peak_age if peak_age is not None else default_peaks.get(sport, 27)
        self.sport = sport
        # Both are set together by fit_parametric(); defined here so the
        # attributes always exist.
        self.curve_params = None
        self.curve_func = None

    @staticmethod
    def quadratic_aging(age: np.ndarray, a: float, b: float, c: float) -> np.ndarray:
        """Quadratic aging curve: performance = a*age^2 + b*age + c"""
        return a * age**2 + b * age + c

    @staticmethod
    def asymmetric_aging(
        age: np.ndarray,
        peak: float,
        peak_age: float,
        growth_rate: float,
        decline_rate: float
    ) -> np.ndarray:
        """
        Asymmetric aging curve with different growth/decline rates.

        Ages up to and including ``peak_age`` follow a saturating growth curve
        anchored at age 18; later ages decay exponentially from ``peak``.

        Args:
            age: Ages to evaluate (array-like)
            peak: Scale of the curve (value the decline phase starts from)
            peak_age: Age separating growth from decline
            growth_rate: Exponential rate of the growth phase
            decline_rate: Exponential rate of the decline phase

        Returns:
            Float array of the same shape as ``age``.
        """
        # Accept lists / integer arrays: boolean-mask indexing below needs an ndarray.
        age = np.asarray(age, dtype=float)
        result = np.zeros_like(age, dtype=float)
        young_mask = age <= peak_age
        old_mask = age > peak_age

        # Growth phase
        result[young_mask] = peak * (1 - np.exp(-growth_rate * (age[young_mask] - 18)))

        # Decline phase
        result[old_mask] = peak * np.exp(-decline_rate * (age[old_mask] - peak_age))

        return result

    def delta_method(
        self,
        df: pd.DataFrame,
        player_col: str = "Player",
        age_col: str = "Age",
        stat_col: str = "WAR",
        min_pa: int = 200,
        pa_col: str = "PA"
    ) -> pd.DataFrame:
        """
        Calculate aging curve using delta method.

        Compares same players across consecutive seasons.

        Args:
            df: DataFrame with player seasons
            player_col: Player identifier column
            age_col: Age column
            stat_col: Statistic to model
            min_pa: Minimum playing time threshold
            pa_col: Playing time column

        Returns:
            DataFrame with columns ``Age`` (the earlier age of each season
            pair), ``Delta`` (playing-time-weighted mean change to the next
            age) and ``Cumulative`` (running sum of ``Delta``, shifted so the
            row whose ``Age`` equals ``self.peak_age`` is 0 when such a row
            exists). Empty, with the same columns, when no player has two
            qualified seasons at consecutive ages.
        """
        # Filter to qualified seasons
        qualified = df[df[pa_col] >= min_pa].sort_values([player_col, age_col])

        # Calculate year-over-year changes
        deltas = []
        for player, group in qualified.groupby(player_col):
            ages = group[age_col].to_numpy()
            stats = group[stat_col].to_numpy()
            playing_time = group[pa_col].to_numpy()

            # Walk adjacent season pairs; a single-season group yields no pairs.
            pairs = zip(ages[:-1], ages[1:], stats[:-1], stats[1:],
                        playing_time[:-1], playing_time[1:])
            for age_from, age_to, stat_from, stat_to, pt_from, pt_to in pairs:
                # Only consecutive ages
                if age_to - age_from != 1:
                    continue

                deltas.append({
                    "player": player,
                    "age_from": age_from,
                    "age_to": age_to,
                    "mid_age": (age_from + age_to) / 2,
                    "delta": stat_to - stat_from,
                    # Weight by the smaller of the two seasons' playing time
                    "weight": min(pt_from, pt_to)
                })

        # Without any pair there is nothing to group (and no "age_from" column).
        if not deltas:
            return pd.DataFrame(columns=["Age", "Delta", "Cumulative"])

        delta_df = pd.DataFrame(deltas)

        # Weight-adjusted average delta by age. Selecting the value columns
        # keeps the grouping column out of the frame passed to the lambda.
        aging = (
            delta_df.groupby("age_from")[["delta", "weight"]]
            .apply(lambda x: np.average(x["delta"], weights=x["weight"]))
            .reset_index()
        )
        aging.columns = ["Age", "Delta"]

        # Cumulative aging curve
        aging = aging.sort_values("Age")
        aging["Cumulative"] = aging["Delta"].cumsum()

        # Normalize so the peak-age row is 0. If no row matches (e.g. after
        # fit_parametric replaced peak_age with a non-integer grid value) the
        # curve is left un-normalized.
        peak_value = aging.loc[aging["Age"] == self.peak_age, "Cumulative"]
        if len(peak_value) > 0:
            aging["Cumulative"] = aging["Cumulative"] - peak_value.values[0]

        return aging

    def fit_parametric(
        self,
        ages: np.ndarray,
        values: np.ndarray,
        weights: np.ndarray = None,
        model: str = "quadratic"
    ) -> dict:
        """
        Fit parametric aging curve.

        On success the fitted parameters and curve function are stored on the
        instance and ``self.peak_age`` is replaced by the age (on a 100-point
        grid from 20 to 40) where the fitted curve is highest.

        Args:
            ages: Array of ages
            values: Array of performance values
            weights: Optional weights (converted to sigma = 1/sqrt(weight))
            model: "quadratic"; any other value selects the asymmetric model

        Returns:
            Dict with ``params``, ``peak_age`` and ``peak_value``, or None if
            fitting failed (the error is printed).
        """
        if model == "quadratic":
            func = self.quadratic_aging
            p0 = [-0.1, 5, -50]  # Initial guess
            bounds = ([-1, 0, -200], [0, 20, 0])
        else:
            func = self.asymmetric_aging
            p0 = [10, 27, 0.5, 0.1]  # peak, peak_age, growth, decline
            bounds = ([0, 22, 0, 0], [50, 32, 2, 1])

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                fit_kwargs = {"p0": p0, "bounds": bounds, "maxfev": 5000}
                if weights is not None:
                    fit_kwargs["sigma"] = 1 / np.sqrt(weights)
                popt, _ = curve_fit(func, ages, values, **fit_kwargs)

            self.curve_params = popt
            self.curve_func = func

            # Find peak age on a fixed grid
            test_ages = np.linspace(20, 40, 100)
            pred_values = func(test_ages, *popt)
            self.peak_age = test_ages[np.argmax(pred_values)]

            return {
                "params": popt,
                "peak_age": self.peak_age,
                "peak_value": np.max(pred_values)
            }

        except Exception as e:
            # Deliberate best-effort: report the failure and signal it with None.
            print(f"Fitting failed: {e}")
            return None

    def predict(self, ages: np.ndarray) -> np.ndarray:
        """
        Predict performance at given ages using the fitted parametric curve.

        Raises:
            ValueError: If fit_parametric() has not succeeded yet.
        """
        if self.curve_params is None:
            raise ValueError("Must fit curve first")
        return self.curve_func(ages, *self.curve_params)

    def age_adjust(
        self,
        stat: float,
        current_age: int,
        target_age: int = None
    ) -> float:
        """
        Age-adjust a statistic.

        Adds the difference between the fitted curve at ``target_age`` and at
        ``current_age`` to ``stat``.

        Args:
            stat: Current statistic value
            current_age: Player's current age
            target_age: Age to adjust to (default: peak age)

        Raises:
            ValueError: If no parametric curve has been fitted.
        """
        if target_age is None:
            target_age = self.peak_age

        current_adj = self.predict(np.array([current_age]))[0]
        target_adj = self.predict(np.array([target_age]))[0]

        # Adjust stat to target age level
        adjustment = target_adj - current_adj
        return stat + adjustment


# Example usage
if __name__ == "__main__":
    # Demo script: generates synthetic careers, derives an aging curve with the
    # delta method, fits a quadratic curve to mean WAR by age, and age-adjusts
    # one sample season.

    # Create synthetic career data
    # Fixed seed so the generated careers are reproducible.
    np.random.seed(42)
    players = []

    for player_id in range(100):
        peak = np.random.normal(3, 1.5)  # WAR peak
        peak_age = np.random.normal(27, 2)
        # randint excludes the upper bound: careers start at 22-25 and end at 33-39.
        career_start = np.random.randint(22, 26)
        career_end = np.random.randint(33, 40)

        for age in range(career_start, career_end + 1):
            # Quadratic aging
            war = peak - 0.05 * (age - peak_age)**2 + np.random.normal(0, 0.5)
            pa = np.random.randint(300, 650)

            players.append({
                "Player": f"Player_{player_id}",
                "Age": age,
                "WAR": max(war, -1),  # floor each season at -1 WAR
                "PA": pa
            })

    df = pd.DataFrame(players)
    print(f"Generated {len(df)} player-seasons")

    # Calculate aging curve using delta method
    aging = AgingCurve(sport="baseball")
    curve = aging.delta_method(df, stat_col="WAR")

    print("\nAging Curve (Delta Method):")
    print(curve)

    # Fit parametric curve
    # Fit on the unweighted mean WAR at each age (one point per age).
    ages = df.groupby("Age")["WAR"].mean()
    result = aging.fit_parametric(
        ages.index.values,
        ages.values,
        model="quadratic"
    )

    # NOTE(review): fit_parametric returns None when fitting fails, in which
    # case the subscript below raises TypeError.
    print(f"\nParametric fit peak age: {result['peak_age']:.1f}")

    # Age-adjust a player
    sample_war = 5.0
    sample_age = 32

    # Uses the curve fitted above; the target age is given explicitly as 27.
    adjusted = aging.age_adjust(sample_war, sample_age, 27)
    print(f"\nAge {sample_age} WAR {sample_war:.1f} -> Age 27 equivalent: {adjusted:.1f}")

python Football

EPA (Expected Points Added) Calculator

Calculate Expected Points Added for football plays using play-by-play data.

"""EPA (Expected Points Added) calculator for football."""
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from typing import Dict, List, Tuple

class EPACalculator:
    """
    Expected Points Added (EPA) for football plays.

    EPA = EP(end state) - EP(start state), where EP is predicted by a
    gradient-boosted regressor from field position, down, distance and time
    remaining in the half.
    """

    def __init__(self):
        self.ep_model = None
        self.is_fitted = False

    def calculate_ep_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the EP model's feature frame.

        Expected columns: yardline_100, down, ydstogo, half_seconds_remaining

        Returns:
            DataFrame (indexed like ``df``) with the raw state columns, a
            goal-to-go flag, log time, two down interactions and three
            field-zone flags.
        """
        yardline = df["yardline_100"]
        down = df["down"]
        raw_distance = df["ydstogo"]
        distance = raw_distance.clip(1, 30)
        seconds = df["half_seconds_remaining"].clip(1, 1800)

        return pd.DataFrame({
            "yardline": yardline,
            "down": down,
            "ydstogo": distance,
            # Goal-to-go compares the unclipped distance with the yardline
            "goal_to_go": (raw_distance >= yardline).astype(int),
            # Log transform of (clipped) time
            "log_time": np.log(seconds + 1),
            # Interaction features
            "down_yardline": down * yardline,
            "down_ydstogo": down * distance,
            # Field position zones
            "red_zone": (yardline <= 20).astype(int),
            "fg_range": ((yardline <= 35) & (yardline > 20)).astype(int),
            "own_territory": (yardline > 50).astype(int),
        })

    def train_ep_model(
        self,
        plays: pd.DataFrame,
        ep_col: str = "next_score_ep"
    ) -> "EPACalculator":
        """
        Fit the EP regressor on historical plays.

        Only rows with down in 1-4, yardline_100 in 1-99 and a non-null
        target are used.

        Args:
            plays: DataFrame with play-by-play data
            ep_col: Column with actual next score value

        Returns:
            self, so calls can be chained.
        """
        down_ok = plays["down"].between(1, 4)
        field_ok = plays["yardline_100"].between(1, 99)
        has_target = plays[ep_col].notna()
        valid = plays[down_ok & field_ok & has_target].copy()

        features = self.calculate_ep_features(valid)
        target = valid[ep_col]

        regressor = GradientBoostingRegressor(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42
        )
        self.ep_model = regressor
        self.ep_model.fit(features, target)
        self.is_fitted = True

        return self

    def predict_ep(self, plays: pd.DataFrame) -> np.ndarray:
        """
        Predict expected points for game states.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.is_fitted:
            return self.ep_model.predict(self.calculate_ep_features(plays))
        raise ValueError("Model must be trained first")

    def calculate_epa(self, plays: pd.DataFrame) -> pd.Series:
        """
        Calculate EPA for each play.

        The start-state EP is predicted from each row. The end-state EP is
        predicted from the same row's state columns, with these adjustments
        when the optional columns are present:

        - ``posteam_change == 1``: yardline is flipped (100 - yardline_100)
          and the predicted EP is negated.
        - ``touchdown == 1``: end EP is 7.
        - ``field_goal_result == "made"``: end EP is 3.
        - ``safety == 1``: end EP is -2.

        NOTE(review): no separate post-play state is read, so rows matching
        none of the adjustments get identical start and end predictions.

        Returns:
            Series of EPA values indexed like ``plays``.
        """
        # Start-state EP, kept as a Series so the result carries the play index
        ep_before = pd.Series(self.predict_ep(plays), index=plays.index, name="ep_before")

        if "posteam_change" in plays.columns:
            turnover = plays["posteam_change"] == 1
            # Work on a copy so the caller's frame is left untouched
            end_states = plays.copy()
            flipped = 100 - end_states.loc[turnover, "yardline_100"]
            end_states.loc[turnover, "yardline_100"] = flipped
            ep_after = self.predict_ep(end_states)
            # The other team now owns the expected points
            ep_after[turnover] = -ep_after[turnover]
        else:
            ep_after = self.predict_ep(plays)

        # Scoring plays override the prediction; later rules win on overlap
        scoring_rules = (
            ("touchdown", 1, 7),
            ("field_goal_result", "made", 3),
            ("safety", 1, -2),
        )
        for column, marker, points in scoring_rules:
            if column in plays.columns:
                ep_after = np.where(plays[column] == marker, points, ep_after)

        # EPA = EP(after) - EP(before)
        return ep_after - ep_before

    @staticmethod
    def get_base_ep_values() -> Dict[Tuple, float]:
        """
        Return a small lookup of approximate EP values.

        Keys are (down, ydstogo_bucket, yardline_bucket). The table is a
        simplified illustration, not a complete EP chart.
        """
        base_values = {}
        base_values[(1, 10, 80)] = -0.5  # 1st & 10, own 20
        base_values[(1, 10, 50)] = 1.0   # 1st & 10, midfield
        base_values[(1, 10, 20)] = 4.0   # 1st & 10, opponent 20
        base_values[(1, 10, 5)] = 5.5    # 1st & goal from 5
        # ... would have many more entries
        return base_values


def aggregate_epa(plays: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """
    Aggregate EPA by a grouping column (player, team, etc.)

    Args:
        plays: Play-level data with an ``epa`` column and, optionally, a
            ``success`` column.
        group_col: Column to group by.

    Returns:
        DataFrame indexed by ``group_col`` with ``total_epa``,
        ``epa_per_play`` and ``plays`` (rounded to 3 decimals), plus
        ``success_rate`` when ``plays`` has a ``success`` column. Sorted by
        ``total_epa`` descending.
    """
    spec = {"epa": ["sum", "mean", "count"]}
    names = ["total_epa", "epa_per_play", "plays"]

    # The success rate is optional: only aggregate it when the column exists,
    # instead of failing with a KeyError on frames that lack it.
    if "success" in plays.columns:
        spec["success"] = "mean"
        names.append("success_rate")

    agg = plays.groupby(group_col).agg(spec).round(3)

    # Flatten the (column, function) MultiIndex into plain names
    agg.columns = names
    return agg.sort_values("total_epa", ascending=False)


# Example usage
if __name__ == "__main__":
    # Demo script: builds random play data with a synthetic EP target, trains
    # the EP model, computes EPA and prints per-passer aggregates.

    # Create sample play data
    # Fixed seed so the generated plays are reproducible.
    np.random.seed(42)
    n_plays = 1000

    plays = pd.DataFrame({
        "yardline_100": np.random.randint(1, 100, n_plays),
        "down": np.random.choice([1, 2, 3, 4], n_plays, p=[0.4, 0.3, 0.2, 0.1]),
        "ydstogo": np.random.randint(1, 20, n_plays),
        "half_seconds_remaining": np.random.randint(1, 1800, n_plays),
        "yards_gained": np.random.normal(5, 8, n_plays),
        "passer_player_name": np.random.choice(["QB1", "QB2", "QB3"], n_plays),
        "rusher_player_name": np.random.choice(["RB1", "RB2", None], n_plays)
    })

    # Create target (simplified)
    # Linear in field position (7 at yardline 0 down to -2 at yardline 100)
    # plus unit-variance noise.
    plays["next_score_ep"] = (
        7 * (1 - plays["yardline_100"]/100) -
        2 * (plays["yardline_100"]/100) +
        np.random.normal(0, 1, n_plays)
    )

    # Train EP model
    epa_calc = EPACalculator()
    epa_calc.train_ep_model(plays, "next_score_ep")

    # Calculate EPA
    # NOTE(review): this frame has no posteam_change / touchdown /
    # field_goal_result / safety columns, so calculate_epa predicts the end
    # state from the same rows as the start state.
    plays["epa"] = epa_calc.calculate_epa(plays)
    # A play counts as a success when its EPA is strictly positive.
    plays["success"] = (plays["epa"] > 0).astype(int)

    print("EPA Statistics:")
    print(f"Mean EPA: {plays['epa'].mean():.3f}")
    print(f"Success Rate: {plays['success'].mean():.1%}")

    # Aggregate by passer
    print("\nQB EPA Rankings:")
    qb_epa = aggregate_epa(plays.dropna(subset=["passer_player_name"]), "passer_player_name")
    print(qb_epa)

python Basketball

RAPM (Regularized Adjusted Plus-Minus)

Calculate basketball player impact using regularized adjusted plus-minus regression.

"""RAPM (Regularized Adjusted Plus-Minus) for basketball."""
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from scipy import sparse
from typing import Dict, List, Tuple

class RAPM:
    """
    Calculate Regularized Adjusted Plus-Minus.

    RAPM estimates player impact by regressing point differential
    on player participation while controlling for teammates and opponents.
    """

    def __init__(self, lambda_values: List[float] = None):
        """
        Initialize RAPM model.

        Args:
            lambda_values: Ridge regularization values to try
                (default: 0.01 through 1000 in powers of ten)
        """
        if lambda_values is None:
            lambda_values = [0.01, 0.1, 1, 10, 100, 1000]

        # Regularization strength is chosen by 5-fold cross-validation.
        self.model = RidgeCV(alphas=lambda_values, cv=5)
        self.player_ids = None
        self.player_names = None
        self.rapm_values = None

    def prepare_data(
        self,
        stints: pd.DataFrame,
        home_players: List[str],
        away_players: List[str],
        margin_col: str = "margin",
        possessions_col: str = "possessions"
    ) -> Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]:
        """
        Prepare stint data for RAPM calculation.

        Also stores the sorted list of player IDs in ``self.player_ids``; the
        columns of ``X`` follow that order.

        Args:
            stints: DataFrame where each row is a stint (period with same 10 players)
            home_players: Column names for 5 home player IDs
            away_players: Column names for 5 away player IDs
            margin_col: Column with point margin (home - away)
            possessions_col: Column with possession count

        Returns:
            X: Sparse matrix of player participation (+1 home, -1 away)
            y: Point margin per 100 possessions
            weights: Possession-based weights
        """
        # Get unique players
        all_players = set()
        for col in home_players + away_players:
            all_players.update(stints[col].dropna().unique())

        self.player_ids = sorted(all_players)
        player_to_idx = {p: i for i, p in enumerate(self.player_ids)}
        n_players = len(self.player_ids)

        # Build sparse matrix
        n_stints = len(stints)
        rows, cols, data = [], [], []

        # Matrix rows are positions 0..n_stints-1, NOT DataFrame index labels:
        # a filtered or re-indexed frame has labels that are not valid (or not
        # the right) row numbers.
        for sign, lineup_cols in ((1, home_players), (-1, away_players)):
            for col in lineup_cols:
                values = stints[col]
                present = values.notna().to_numpy()
                positions = np.flatnonzero(present)

                rows.extend(positions)
                cols.extend(player_to_idx[p] for p in values.to_numpy()[present])
                data.extend([sign] * len(positions))

        X = sparse.csr_matrix((data, (rows, cols)), shape=(n_stints, n_players))

        # Target: margin per 100 possessions
        y = (stints[margin_col] / stints[possessions_col]) * 100

        # Weights: sqrt of possessions
        weights = np.sqrt(stints[possessions_col].values)

        return X, y.values, weights

    def fit(
        self,
        stints: pd.DataFrame,
        home_players: List[str] = None,
        away_players: List[str] = None,
        player_names: Dict[str, str] = None
    ) -> "RAPM":
        """
        Fit RAPM model.

        Reads the ``margin`` and ``possessions`` columns of ``stints``.

        Args:
            stints: Stint data
            home_players: Column names for home players (default: H1-H5)
            away_players: Column names for away players (default: A1-A5)
            player_names: Dict mapping player IDs to names

        Returns:
            self, so calls can be chained.
        """
        if home_players is None:
            home_players = ["H1", "H2", "H3", "H4", "H5"]
        if away_players is None:
            away_players = ["A1", "A2", "A3", "A4", "A5"]

        X, y, weights = self.prepare_data(stints, home_players, away_players)

        # Fit weighted ridge regression
        self.model.fit(X, y, sample_weight=weights)

        # Extract RAPM values, best player first
        self.rapm_values = pd.Series(
            self.model.coef_,
            index=self.player_ids,
            name="RAPM"
        ).sort_values(ascending=False)

        # Add names if provided
        if player_names:
            self.player_names = player_names

        return self

    def get_rankings(self, top_n: int = None) -> pd.DataFrame:
        """
        Get RAPM rankings.

        Args:
            top_n: Number of rows to return (None = all players)

        Returns:
            DataFrame with ``Rank``, ``Player`` and ``RAPM`` columns, best first.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if self.rapm_values is None:
            raise ValueError("Model not fitted")

        df = self.rapm_values.reset_index()
        df.columns = ["player_id", "RAPM"]

        # Show display names when a mapping was supplied, raw IDs otherwise
        if self.player_names:
            df["Player"] = df["player_id"].map(self.player_names)
        else:
            df["Player"] = df["player_id"]

        df["Rank"] = range(1, len(df) + 1)

        if top_n:
            df = df.head(top_n)

        return df[["Rank", "Player", "RAPM"]]

    def get_player_rapm(self, player_id: str) -> float:
        """
        Get RAPM for a specific player (NaN if the player is unknown).

        Raises:
            ValueError: If the model has not been fitted.
        """
        if self.rapm_values is None:
            raise ValueError("Model not fitted")
        return self.rapm_values.get(player_id, np.nan)


def create_sample_stints(n_stints: int = 5000, n_players: int = 100) -> pd.DataFrame:
    """
    Build a reproducible synthetic stint table for demos.

    Reseeds NumPy's global RNG with 42, draws one latent effect per player,
    then simulates ``n_stints`` stints of five home and five away players.

    Returns:
        DataFrame with lineup columns H1-H5 and A1-A5 plus ``margin`` and
        ``possessions``.
    """
    np.random.seed(42)

    # Latent per-player effects (drawn in player order)
    player_effects = {}
    for idx in range(n_players):
        player_effects[f"P{idx}"] = np.random.normal(0, 3)
    roster = list(player_effects.keys())

    home_cols = ["H1", "H2", "H3", "H4", "H5"]
    away_cols = ["A1", "A2", "A3", "A4", "A5"]

    records = []
    for _ in range(n_stints):
        # Ten distinct players: first five play at home, the rest away
        lineup = np.random.choice(roster, 10, replace=False)
        home, away = lineup[:5], lineup[5:]

        # Expected margin from the lineups, plus the home-advantage term
        home_effect = sum(player_effects[p] for p in home)
        away_effect = sum(player_effects[p] for p in away)
        expected_margin = home_effect - away_effect + np.random.normal(2, 0)  # Home advantage

        possessions = np.random.randint(5, 30)
        # Scale the per-100 margin to this stint's length and add noise
        actual_margin = expected_margin * possessions / 100 + np.random.normal(0, 3)

        record = dict(zip(home_cols, home))
        record.update(zip(away_cols, away))
        record["margin"] = actual_margin
        record["possessions"] = possessions
        records.append(record)

    return pd.DataFrame(records)


# Example usage
if __name__ == "__main__":
    # Demo script: fits RAPM on synthetic stints and prints the top players
    # together with the regularization strength chosen by cross-validation.

    # Generate sample data
    # 10000 stints drawn from a pool of 50 players (seeded inside the helper).
    stints = create_sample_stints(10000, 50)
    print(f"Generated {len(stints)} stints with {50} players")

    # Fit RAPM
    # Uses the default lineup columns H1-H5 / A1-A5 produced by the helper.
    rapm = RAPM()
    rapm.fit(stints)

    # Get rankings
    print("\nTop 15 Players by RAPM:")
    print(rapm.get_rankings(15))

    # alpha_ is the regularization value RidgeCV selected from lambda_values.
    print(f"\nBest lambda: {rapm.model.alpha_:.2f}")

python Baseball

Player Similarity Analysis

Find similar players using statistical profiles and dimensionality reduction.

"""Player similarity analysis using statistical methods."""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.cluster import KMeans
from typing import List, Tuple, Dict

class PlayerSimilarity:
    """
    Find similar players based on statistical profiles.

    Stat columns are scaled with StandardScaler in fit(); similarity is then
    computed between rows of the scaled matrix.

    Methods:
    - Cosine similarity
    - Euclidean distance
    - PCA-based similarity
    """

    def __init__(self, stat_columns: List[str] = None):
        """
        Args:
            stat_columns: Columns to compare players on. None means the
                numeric columns are auto-detected when fit() is called.
        """
        self.stat_columns = stat_columns
        self.scaler = StandardScaler()
        self.pca = None
        self.player_data = None
        self.scaled_data = None

    def _require_fitted(self) -> None:
        """Raise RuntimeError when fit() has not been called yet."""
        if getattr(self, "player_data", None) is None:
            raise RuntimeError("PlayerSimilarity is not fitted; call fit() first")

    def _find_player_position(self, player_name: str) -> int:
        """
        Return the row position of the first player whose name contains
        player_name (case-insensitive, literal substring match).

        Raises:
            ValueError: If no player matches.
        """
        self._require_fitted()
        # regex=False: names may contain regex metacharacters such as "(" or ".".
        # na=False: rows with a missing name never match instead of poisoning
        # the boolean mask.
        mask = self.player_data[self.player_col].str.contains(
            player_name, case=False, regex=False, na=False
        )
        positions = np.flatnonzero(mask.to_numpy(dtype=bool))
        if len(positions) == 0:
            raise ValueError(f"Player not found: {player_name}")
        return int(positions[0])

    def fit(self, df: pd.DataFrame, player_col: str = "Player") -> "PlayerSimilarity":
        """
        Fit the similarity model.

        Args:
            df: DataFrame with player statistics
            player_col: Column containing player names

        Returns:
            self, to allow chaining.
        """
        self.player_col = player_col
        self.player_data = df.copy()

        # Auto-detect numeric columns if not specified
        if self.stat_columns is None:
            self.stat_columns = df.select_dtypes(include=[np.number]).columns.tolist()
            # Remove common non-stat columns
            exclude = ["Age", "G", "GS", "Year", "Season"]
            self.stat_columns = [c for c in self.stat_columns if c not in exclude]

        # Scale the data (missing stats are treated as 0 before scaling)
        stats = df[self.stat_columns].fillna(0)
        self.scaled_data = self.scaler.fit_transform(stats)

        # A projection cached by an earlier fit no longer matches this data.
        self.pca = None

        return self

    def find_similar(
        self,
        player_name: str,
        n: int = 10,
        method: str = "cosine",
        exclude_self: bool = True
    ) -> pd.DataFrame:
        """
        Find most similar players.

        Args:
            player_name: Name (or part of the name) of the target player;
                the first matching row is used
            n: Number of similar players to return
            method: "cosine", "euclidean", or "pca"
            exclude_self: Whether to exclude the player from results

        Returns:
            Up to n rows sorted by descending "Similarity", with the player
            column, the score and the first five stat columns. For
            "euclidean" the score is the negated distance.

        Raises:
            RuntimeError: If called before fit().
            ValueError: If the player is not found or method is unknown.
        """
        # Row position, not index label: scaled_data is a plain array, so it
        # must be addressed positionally even when df has a custom index.
        position = self._find_player_position(player_name)
        target_vector = self.scaled_data[position].reshape(1, -1)

        # Calculate similarities (higher is always better)
        if method == "cosine":
            similarities = cosine_similarity(target_vector, self.scaled_data)[0]
        elif method == "euclidean":
            similarities = -euclidean_distances(target_vector, self.scaled_data)[0]
        elif method == "pca":
            if self.pca is None:
                # PCA cannot extract more components than samples or features
                n_samples, n_features = self.scaled_data.shape
                self.pca = PCA(n_components=min(10, n_features, n_samples))
                pca_data = self.pca.fit_transform(self.scaled_data)
            else:
                pca_data = self.pca.transform(self.scaled_data)
            target_pca = pca_data[position].reshape(1, -1)
            similarities = cosine_similarity(target_pca, pca_data)[0]
        else:
            raise ValueError(f"Unknown method: {method}")

        # Create results DataFrame
        results = self.player_data.copy()
        results["Similarity"] = similarities

        if exclude_self:
            # Drop exactly the target row, by position
            keep = np.ones(len(results), dtype=bool)
            keep[position] = False
            results = results[keep]

        results = results.sort_values("Similarity", ascending=False)

        return results.head(n)[[self.player_col, "Similarity"] + self.stat_columns[:5]]

    def cluster_players(
        self,
        n_clusters: int = 8,
        method: str = "kmeans"
    ) -> pd.DataFrame:
        """
        Cluster players into groups.

        Args:
            n_clusters: Number of clusters
            method: Clustering algorithm; only "kmeans" is supported

        Returns:
            Copy of the fitted data with an added "Cluster" column.

        Raises:
            ValueError: If method is not "kmeans".
            RuntimeError: If called before fit().
        """
        if method != "kmeans":
            raise ValueError(f"Unknown method: {method}")
        self._require_fitted()

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(self.scaled_data)

        results = self.player_data.copy()
        results["Cluster"] = clusters
        return results

    def get_player_percentiles(self, player_name: str) -> pd.Series:
        """
        Get percentile rankings for a player.

        For each stat column the percentile is the share of players whose
        value is strictly below the target player's, in percent, rounded to
        one decimal.

        Raises:
            RuntimeError: If called before fit().
            ValueError: If the player is not found.
        """
        position = self._find_player_position(player_name)
        player_stats = self.player_data[self.stat_columns].iloc[position]

        percentiles = {}
        for col in self.stat_columns:
            pct = (self.player_data[col] < player_stats[col]).mean() * 100
            percentiles[col] = round(pct, 1)

        return pd.Series(percentiles)

    def compare_players(self, players: List[str]) -> pd.DataFrame:
        """
        Compare multiple players side by side with percentiles.

        Players that cannot be found are skipped with a printed warning.
        The result has one row per found player, indexed by the name given.
        """
        comparison = []

        for player in players:
            try:
                percentiles = self.get_player_percentiles(player)
                percentiles.name = player
                comparison.append(percentiles)
            except ValueError:
                print(f"Warning: Player not found: {player}")

        return pd.DataFrame(comparison)


# Example with baseball data
if __name__ == "__main__":
    # Synthetic batting table. Columns are generated in a fixed order because
    # each one consumes draws from the seeded global RNG.
    np.random.seed(42)
    n_players = 200

    def bounded_normal(mean, spread, low, high):
        return np.random.normal(mean, spread, n_players).clip(low, high)

    batting_columns = {}
    batting_columns["Player"] = [f"Player_{i}" for i in range(n_players)]
    batting_columns["Team"] = np.random.choice(["NYY", "BOS", "LAD", "CHC", "HOU"], n_players)
    batting_columns["PA"] = np.random.randint(300, 700, n_players)
    batting_columns["AVG"] = bounded_normal(0.260, 0.030, 0.180, 0.350)
    batting_columns["OBP"] = bounded_normal(0.330, 0.040, 0.250, 0.450)
    batting_columns["SLG"] = bounded_normal(0.420, 0.060, 0.300, 0.650)
    batting_columns["HR"] = np.random.randint(5, 50, n_players)
    batting_columns["SB"] = np.random.randint(0, 40, n_players)
    batting_columns["BB%"] = bounded_normal(0.09, 0.03, 0.03, 0.20)
    batting_columns["K%"] = bounded_normal(0.22, 0.05, 0.08, 0.35)
    batting_columns["wRC+"] = bounded_normal(100, 25, 50, 180)
    batting = pd.DataFrame(batting_columns)

    # OPS is on-base plus slugging
    batting["OPS"] = batting["OBP"] + batting["SLG"]

    # Fit the similarity model on an explicit set of stats
    profile_stats = ["AVG", "OBP", "SLG", "HR", "SB", "BB%", "K%", "wRC+"]
    finder = PlayerSimilarity(stat_columns=profile_stats)
    finder.fit(batting)

    # Nearest neighbours of one player
    print("Players similar to Player_0:")
    neighbours = finder.find_similar("Player_0", n=5)
    print(neighbours)

    # Percentile profile of the same player
    print("\nPlayer_0 Percentiles:")
    print(finder.get_player_percentiles("Player_0"))

    # Group all players into five clusters and show the cluster sizes
    with_clusters = finder.cluster_players(n_clusters=5)
    print("\nCluster distribution:")
    print(with_clusters["Cluster"].value_counts().sort_index())

python Soccer

Expected Goals (xG) Model

Build a simple expected goals model for soccer using shot location and other features.

"""Expected Goals (xG) model for soccer."""
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss
from typing import Tuple

class ExpectedGoalsModel:
    """
    Simple xG model based on shot characteristics.

    A logistic regression over features derived from shot location
    (distance and angle to goal) plus optional body-part and shot-type
    indicator columns.

    Features typically used:
    - Distance to goal
    - Angle to goal
    - Body part (head/foot)
    - Shot type (open play, set piece, penalty)
    - Defender presence
    - Goalkeeper position
    """

    def __init__(self):
        self.model = LogisticRegression(max_iter=1000)
        self.feature_names = None
        self.is_fitted = False
        # Feature columns the model was trained on; set by fit().
        self._fit_features = None

    def calculate_distance(self, x: float, y: float) -> float:
        """
        Calculate distance from shot location to goal center.

        Assumes pitch coordinates: (0-100) x (0-100)
        Goal center at (100, 50)

        Built from NumPy operations, so array inputs are handled
        elementwise as well as scalars.
        """
        goal_x, goal_y = 100, 50
        return np.sqrt((x - goal_x)**2 + (y - goal_y)**2)

    def calculate_angle(self, x: float, y: float) -> float:
        """
        Calculate angle to goal from shot location.

        The angle is the one subtended at the shot location by the two goal
        posts, on the same 0-100 pitch as calculate_distance. Array inputs
        are handled elementwise as well as scalars.

        Returns angle in degrees.
        """
        goal_width = 7.32  # meters, scaled to pitch units
        goal_y_min = 50 - (goal_width / 2) * (100 / 68)  # Scale to pitch
        goal_y_max = 50 + (goal_width / 2) * (100 / 68)

        # Calculate angles to both posts
        angle_1 = np.arctan2(goal_y_min - y, 100 - x)
        angle_2 = np.arctan2(goal_y_max - y, 100 - x)

        angle = np.abs(angle_2 - angle_1)
        return np.degrees(angle)

    def _build_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute the feature table for df without touching model state."""
        shot_x = df["x"].to_numpy(dtype=float)
        shot_y = df["y"].to_numpy(dtype=float)

        # Start from the input index so rows stay aligned with df, including
        # when df is empty.
        features = pd.DataFrame(index=df.index)

        # Distance and angle, computed for the whole column at once
        features["distance"] = self.calculate_distance(shot_x, shot_y)
        features["angle"] = self.calculate_angle(shot_x, shot_y)

        # Derived features
        features["distance_sq"] = features["distance"] ** 2
        features["log_distance"] = np.log(features["distance"] + 1)

        # Body part (one-hot)
        if "body_part" in df.columns:
            features["is_header"] = (df["body_part"] == "head").astype(int)
            features["is_foot"] = (df["body_part"].isin(["left_foot", "right_foot"])).astype(int)

        # Shot type
        if "shot_type" in df.columns:
            features["is_penalty"] = (df["shot_type"] == "penalty").astype(int)
            features["is_free_kick"] = (df["shot_type"] == "free_kick").astype(int)

        return features

    def _inference_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build features for a fitted model, using the training columns in the
        training order.

        Raises:
            ValueError: If the model is not fitted, or df cannot provide a
                feature the model was trained on.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted first")

        features = self._build_features(df)
        trained = getattr(self, "_fit_features", None)
        if trained is not None:
            missing = [name for name in trained if name not in features.columns]
            if missing:
                raise ValueError(
                    f"Shot data cannot provide features the model was trained on: {missing}"
                )
            features = features[trained]
        return features

    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare features from shot data.

        Expected columns: x, y, body_part, shot_type
        (body_part and shot_type are optional; their indicator columns are
        only produced when the column is present).

        Also records the produced column names in self.feature_names.
        """
        features = self._build_features(df)
        self.feature_names = features.columns.tolist()
        return features

    def fit(self, shots_df: pd.DataFrame, target_col: str = "is_goal") -> "ExpectedGoalsModel":
        """
        Train the xG model.

        Args:
            shots_df: DataFrame with shot data
            target_col: Column name for goal (1) or no goal (0)

        Returns:
            self, to allow chaining.
        """
        X = self.prepare_features(shots_df)
        y = shots_df[target_col]

        self.model.fit(X, y)
        self._fit_features = list(X.columns)
        self.is_fitted = True

        return self

    def predict_xg(self, shots_df: pd.DataFrame) -> pd.Series:
        """
        Predict xG for shots.

        Returns:
            Goal probability per shot, indexed like shots_df.

        Raises:
            ValueError: If the model is not fitted or a trained feature
                cannot be built from shots_df.
        """
        X = self._inference_features(shots_df)
        return pd.Series(self.model.predict_proba(X)[:, 1], index=shots_df.index)

    def evaluate(self, shots_df: pd.DataFrame, target_col: str = "is_goal") -> dict:
        """
        Evaluate model performance.

        Returns:
            Dict with "auc", "log_loss", "avg_xg" (mean predicted
            probability) and "actual_rate" (mean of the target column).

        Raises:
            ValueError: If the model is not fitted or a trained feature
                cannot be built from shots_df.
        """
        X = self._inference_features(shots_df)
        y = shots_df[target_col]
        y_pred = self.model.predict_proba(X)[:, 1]

        return {
            "auc": roc_auc_score(y, y_pred),
            "log_loss": log_loss(y, y_pred),
            "avg_xg": y_pred.mean(),
            "actual_rate": y.mean()
        }


def create_xg_features_statsbomb(events_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create features from StatsBomb event data.

    Reads the columns "type.name", "location", "shot.outcome.name" and
    "shot.body_part.name" from events_df.

    Args:
        events_df: Event table; only rows whose "type.name" is "Shot" are kept.

    Returns:
        Copy of the shot rows with added columns:
        - "x", "y": location rescaled from the 120x80 pitch to 0-100
          (NaN when the location is missing or malformed)
        - "is_goal": 1 when "shot.outcome.name" is "Goal", else 0
        - "body_part": lower-cased body part, with "right foot"/"left foot"
          mapped to "right_foot"/"left_foot"
    """
    shots = events_df[events_df["type.name"] == "Shot"].copy()

    def _coordinate(loc, axis):
        # Locations may arrive as lists, tuples or arrays depending on how
        # the events were loaded; anything else counts as missing.
        if isinstance(loc, (list, tuple, np.ndarray)) and len(loc) > axis:
            return float(loc[axis])
        return np.nan

    # Extract coordinates; the float cast keeps the columns numeric even
    # when every location is missing.
    shots["x"] = shots["location"].map(lambda loc: _coordinate(loc, 0)).astype(float)
    shots["y"] = shots["location"].map(lambda loc: _coordinate(loc, 1)).astype(float)

    # Scale to 0-100
    shots["x"] = shots["x"] * (100 / 120)  # StatsBomb uses 120x80
    shots["y"] = shots["y"] * (100 / 80)

    # Outcome
    shots["is_goal"] = (shots["shot.outcome.name"] == "Goal").astype(int)

    # Body part
    shots["body_part"] = shots["shot.body_part.name"].str.lower().replace({
        "right foot": "right_foot",
        "left foot": "left_foot"
    })

    return shots


# Example usage
if __name__ == "__main__":
    # Synthetic shots; draws happen in a fixed order from the seeded RNG
    np.random.seed(42)
    n_shots = 1000

    # Shot locations clustered around the penalty area
    shot_x = np.clip(np.random.normal(85, 10, n_shots), 60, 99)
    shot_y = np.clip(np.random.normal(50, 15, n_shots), 0, 100)

    # Scoring chance falls off with distance to the goal center
    goal_distance = np.sqrt((shot_x - 100)**2 + (shot_y - 50)**2)
    scoring_prob = 1 / (1 + np.exp(0.15 * goal_distance - 2))

    # Categorical attributes, then the simulated outcome
    body_parts = np.random.choice(["right_foot", "left_foot", "head"], n_shots, p=[0.5, 0.3, 0.2])
    shot_types = np.random.choice(["open_play", "free_kick", "corner", "penalty"], n_shots, p=[0.8, 0.1, 0.08, 0.02])
    scored = (np.random.random(n_shots) < scoring_prob).astype(int)

    shots_df = pd.DataFrame({
        "x": shot_x,
        "y": shot_y,
        "body_part": body_parts,
        "shot_type": shot_types,
        "is_goal": scored
    })

    # Hold out 20% of the shots for evaluation
    train, test = train_test_split(shots_df, test_size=0.2, random_state=42)

    # Fit on the training shots
    xg_model = ExpectedGoalsModel()
    xg_model.fit(train)

    # Score the held-out shots
    metrics = xg_model.evaluate(test)
    print(f"Model Performance:")
    print(f"  AUC: {metrics['auc']:.3f}")
    print(f"  Log Loss: {metrics['log_loss']:.3f}")

    # Attach per-shot xG and show a sample
    test["xG"] = xg_model.predict_xg(test)
    print(f"\nSample predictions:")
    preview_columns = ["x", "y", "is_goal", "xG"]
    print(test[preview_columns].head(10))

python Baseball

Calculate Pythagorean Win Expectation

Estimate team win percentage using Pythagorean expectation formula with various exponents.

"""Calculate Pythagorean Win Expectation for sports teams."""
import pandas as pd
import numpy as np
from typing import Union

def pythagorean_expectation(
    runs_scored: Union[int, pd.Series],
    runs_allowed: Union[int, pd.Series],
    exponent: float = None,
    sport: str = "baseball"
) -> Union[float, pd.Series]:
    """
    Calculate Pythagorean win expectation.

    The formula: Win% = RS^exp / (RS^exp + RA^exp)

    Args:
        runs_scored: Runs/points scored
        runs_allowed: Runs/points allowed
        exponent: Custom exponent (None uses sport default)
        sport: Sport for default exponent (case-insensitive); an
            unrecognised sport falls back to an exponent of 2.0

    Default exponents by sport:
    - Baseball: 1.83 (original), 2.0 (simplified)
    - Basketball: 13.91 (NBA), 10.25 (College)
    - Football: 2.37 (NFL)
    - Hockey: 2.05 (NHL)
    - Soccer: 1.3-1.5

    Returns:
        Expected win percentage (a Series when given Series inputs)
    """
    # Default exponents by sport
    default_exponents = {
        "baseball": 1.83,
        "basketball": 13.91,
        "football": 2.37,
        "hockey": 2.05,
        "soccer": 1.35
    }

    if exponent is None:
        exponent = default_exponents.get(sport.lower(), 2.0)

    def _as_float(values):
        # Integer totals raised to an integer exponent silently wrap around
        # in int64 (e.g. 9000 ** 14), so always do the power in floating point.
        if isinstance(values, (pd.Series, pd.DataFrame)):
            return values.astype(float)
        return np.asarray(values, dtype=float)

    rs_exp = np.power(_as_float(runs_scored), exponent)
    ra_exp = np.power(_as_float(runs_allowed), exponent)

    return rs_exp / (rs_exp + ra_exp)


def pythagenpat(
    runs_scored: Union[int, pd.Series],
    runs_allowed: Union[int, pd.Series],
    games: int = 162
) -> Union[float, pd.Series]:
    """
    Pythagorean expectation with a run-environment-dependent exponent.

    The exponent is ((RS + RA) / G) ^ 0.287, i.e. it grows with the
    combined runs per game, and is then passed to pythagorean_expectation.

    More accurate than fixed exponent for extreme teams.
    """
    total_runs = runs_scored + runs_allowed
    dynamic_exponent = np.power(total_runs / games, 0.287)
    return pythagorean_expectation(runs_scored, runs_allowed, dynamic_exponent)


def expected_wins(
    runs_scored: Union[int, pd.Series],
    runs_allowed: Union[int, pd.Series],
    games: int,
    method: str = "pythagenpat"
) -> Union[float, pd.Series]:
    """
    Convert an expected win percentage into an expected win total.

    Args:
        runs_scored: Total runs/points scored
        runs_allowed: Total runs/points allowed
        games: Number of games
        method: "pythagenpat" for the variable exponent; any other value
            uses the fixed-exponent pythagorean_expectation defaults
    """
    if method != "pythagenpat":
        return pythagorean_expectation(runs_scored, runs_allowed) * games
    return pythagenpat(runs_scored, runs_allowed, games) * games


def luck_factor(actual_wins: int, expected_wins: float) -> float:
    """
    Return how many wins a team finished above or below expectation.

    Computed as actual minus expected wins:
    above zero = lucky (won more than expected),
    below zero = unlucky (won fewer than expected).
    """
    surplus_wins = actual_wins - expected_wins
    return surplus_wins


# Example with MLB data
if __name__ == "__main__":
    # Five-team sample: wins, losses, runs scored and runs allowed
    standings = pd.DataFrame({
        "Team": ["NYY", "BOS", "TOR", "BAL", "TBR"],
        "W": [95, 89, 84, 78, 73],
        "L": [67, 73, 78, 84, 89],
        "RS": [850, 820, 780, 750, 700],
        "RA": [700, 720, 760, 800, 820]
    })

    # Games played and actual winning percentage
    games_played = standings["W"] + standings["L"]
    standings["G"] = games_played
    standings["Win%"] = standings["W"] / standings["G"]

    # Expected results from run totals, and the gap to actual wins
    scored, allowed = standings["RS"], standings["RA"]
    standings["Pyth_Win%"] = pythagorean_expectation(scored, allowed)
    standings["ExpWins"] = expected_wins(scored, allowed, standings["G"])
    standings["Luck"] = luck_factor(standings["W"], standings["ExpWins"])

    report_columns = ["Team", "W", "ExpWins", "Luck"]
    print(standings[report_columns].round(1))

r Baseball

R API Client with httr2

Build a reusable sports API client in R using the httr2 package, with authentication and error handling.

# Sports API client in R using httr2
library(httr2)
library(jsonlite)
library(dplyr)
library(purrr)

#' Create a base API client
#'
#' The returned list also carries a `state` environment. Environments have
#' reference semantics, so the timestamp that api_request() records there is
#' seen by every later call made with the same client, whatever variable
#' name the caller stores it under.
#'
#' @param base_url Base URL for the API
#' @param api_key Optional API key
#' @param rate_limit Requests per minute
#' @return API client object (a list of class "sports_api_client")
create_api_client <- function(base_url, api_key = NULL, rate_limit = 60) {
  # Backdate the first timestamp by one interval so the first request is
  # never delayed.
  first_stamp <- Sys.time() - 60 / rate_limit

  state <- new.env(parent = emptyenv())
  state$last_request <- first_stamp

  structure(
    list(
      base_url = base_url,
      api_key = api_key,
      rate_limit = rate_limit,
      last_request = first_stamp,
      state = state
    ),
    class = "sports_api_client"
  )
}

#' Make API request with rate limiting
#'
#' Waits until at least 60 / rate_limit seconds have passed since the
#' previous request made with this client, performs the request (up to three
#' tries), and records the completion time in the client's `state`.
#'
#' @param client API client object
#' @param endpoint API endpoint
#' @param params Query parameters
#' @param headers Additional headers
#' @return Parsed JSON response
api_request <- function(client, endpoint, params = list(), headers = list()) {
  # Clients built by hand without a `state` environment fall back to the
  # static field; their timestamp cannot be updated from inside this function.
  has_state <- is.environment(client$state)
  last_request <- if (has_state) client$state$last_request else client$last_request

  # Rate limiting. Ask difftime() for seconds explicitly: plain subtraction
  # picks its own unit (secs, mins, hours, ...), which made the comparison
  # against a value in seconds wrong for gaps over a minute.
  min_interval <- 60 / client$rate_limit
  if (!is.null(last_request)) {
    elapsed <- as.numeric(difftime(Sys.time(), last_request, units = "secs"))
    if (elapsed < min_interval) {
      Sys.sleep(min_interval - elapsed)
    }
  }

  # Build request
  req <- request(paste0(client$base_url, "/", endpoint))

  # Add API key if present
  if (!is.null(client$api_key)) {
    req <- req %>% req_headers("X-API-Key" = client$api_key)
  }

  # Add custom headers
  if (length(headers) > 0) {
    req <- req %>% req_headers(!!!headers)
  }

  # Add query parameters
  if (length(params) > 0) {
    req <- req %>% req_url_query(!!!params)
  }

  # Make request with retry
  resp <- req %>%
    req_retry(max_tries = 3, backoff = ~ 2) %>%
    req_perform()

  # Update last request time in the shared environment. (`<<-` on `client`
  # would target a global variable named `client`, not the caller's object.)
  if (has_state) {
    assign("last_request", Sys.time(), envir = client$state)
  }

  # Parse response
  resp %>%
    resp_body_json()
}

# =====================
# MLB Stats API Example
# =====================

#' Build a client for the MLB Stats API
#'
#' @return API client pointed at the MLB Stats API v1, limited to 60
#'   requests per minute
mlb_client <- function() {
  mlb_base_url <- "https://statsapi.mlb.com/api/v1"
  create_api_client(base_url = mlb_base_url, rate_limit = 60)
}

#' List MLB teams
#'
#' Requests the "teams" endpoint with sportId = 1 and flattens each team
#' into one row. Optional fields that are absent become NA.
#'
#' @param client MLB API client
#' @param season Season year (omitted from the query when NULL)
#' @return Data frame of teams
mlb_get_teams <- function(client, season = NULL) {
  query <- list(sportId = 1)
  if (!is.null(season)) {
    query[["season"]] <- season
  }

  response <- api_request(client, "teams", query)

  team_row <- function(team) {
    tibble(
      id = team$id,
      name = team$name,
      abbreviation = team$abbreviation %||% NA,
      team_name = team$teamName,
      location = team$locationName %||% NA,
      league = team$league$name %||% NA,
      division = team$division$name %||% NA
    )
  }

  response$teams %>%
    map(team_row) %>%
    bind_rows()
}

#' Fetch a player's stat lines
#'
#' Requests "people/<player_id>/stats" and returns one row per split of the
#' first stats entry. Fields missing from a split become NA. An empty tibble
#' is returned when the response has no stats or no splits.
#'
#' @param client MLB API client
#' @param player_id Player ID
#' @param stat_type Type of stats (season, career, yearByYear)
#' @param stat_group Stat group (hitting, pitching, fielding)
#' @param season Season year (omitted from the query when NULL)
mlb_get_player_stats <- function(client, player_id, stat_type = "season",
                                   stat_group = "hitting", season = NULL) {
  query <- list(stats = stat_type, group = stat_group)
  if (!is.null(season)) {
    query[["season"]] <- season
  }

  endpoint <- paste0("people/", player_id, "/stats")
  response <- api_request(client, endpoint, query)

  if (length(response$stats) == 0) {
    return(tibble())
  }

  # The splits of the first stats entry hold the individual stat lines
  splits <- response$stats[[1]]$splits
  if (length(splits) == 0) {
    return(tibble())
  }

  split_row <- function(split) {
    line <- split$stat
    tibble(
      season = split$season %||% NA,
      team = split$team$name %||% NA,
      games = line$gamesPlayed %||% NA,
      at_bats = line$atBats %||% NA,
      hits = line$hits %||% NA,
      home_runs = line$homeRuns %||% NA,
      rbi = line$rbi %||% NA,
      avg = line$avg %||% NA,
      obp = line$obp %||% NA,
      slg = line$slg %||% NA,
      ops = line$ops %||% NA
    )
  }

  splits %>%
    map(split_row) %>%
    bind_rows()
}

#' Fetch the schedule for one day
#'
#' Requests the "schedule" endpoint with sportId = 1 and returns one row per
#' game of the first date in the response. An empty tibble is returned when
#' the response has no dates.
#'
#' @param client MLB API client
#' @param date Date string (YYYY-MM-DD) or Date; defaults to today
mlb_get_schedule <- function(client, date = Sys.Date()) {
  day <- format(as.Date(date), "%Y-%m-%d")
  query <- list(sportId = 1, date = day)

  response <- api_request(client, "schedule", query)

  if (length(response$dates) == 0) {
    return(tibble())
  }

  game_row <- function(game) {
    tibble(
      game_pk = game$gamePk,
      game_date = game$gameDate,
      status = game$status$detailedState,
      home_team = game$teams$home$team$name,
      away_team = game$teams$away$team$name,
      home_score = game$teams$home$score %||% NA,
      away_score = game$teams$away$score %||% NA,
      venue = game$venue$name %||% NA
    )
  }

  response$dates[[1]]$games %>%
    map(game_row) %>%
    bind_rows()
}

# =====================
# Generic API Functions
# =====================

#' Null-coalescing operator: the left value unless it is NULL, else the right
`%||%` <- function(x, y) {
  if (!is.null(x)) {
    return(x)
  }
  y
}

#' Batch API requests with progress
#'
#' Calls the API once per ID, substituting the ID for the literal "{id}" in
#' the endpoint template. A failing request or parser produces a warning and
#' is left out of the result; the remaining IDs are still processed.
#'
#' @param client API client
#' @param endpoint_template Template with {id} placeholder
#' @param ids Vector of IDs
#' @param parse_fn Function to parse each response
#' @return Parsed results row-bound together, with a "request_id" column
#'   holding the ID each row came from
batch_requests <- function(client, endpoint_template, ids, parse_fn) {
  results <- list()

  # A progress bar needs max > min, so there is nothing to draw for no IDs.
  if (length(ids) == 0) {
    return(bind_rows(results, .id = "request_id"))
  }

  pb <- txtProgressBar(min = 0, max = length(ids), style = 3)
  # Close the bar even if something unexpected aborts the loop
  on.exit(close(pb), add = TRUE)

  for (i in seq_along(ids)) {
    id <- ids[[i]]
    # fixed = TRUE matches the placeholder literally; the braces would
    # otherwise need regex escaping ("\{" is not a valid R string escape).
    endpoint <- gsub("{id}", id, endpoint_template, fixed = TRUE)

    parsed <- tryCatch(
      parse_fn(api_request(client, endpoint)),
      error = function(e) {
        warning(paste("Error for ID", id, ":", e$message))
        NULL
      }
    )

    # Assigning NULL into a list would be a no-op anyway; be explicit.
    if (!is.null(parsed)) {
      results[[as.character(id)]] <- parsed
    }

    setTxtProgressBar(pb, i)
  }

  bind_rows(results, .id = "request_id")
}

# Example usage
# client <- mlb_client()
# teams <- mlb_get_teams(client, 2024)
# schedule <- mlb_get_schedule(client, "2024-06-15")
# player_stats <- mlb_get_player_stats(client, 545361, "yearByYear", "hitting")  # Mike Trout

python Football

Odds API for Sports Betting Data

Access live betting odds from multiple sportsbooks via The Odds API.

"""The Odds API client for sports betting data."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import datetime

class OddsAPI:
    """
    Client for The Odds API.

    Get API key at: https://the-odds-api.com/

    Free tier: 500 requests/month

    After every request the usage counters reported by the API are stored in
    requests_remaining and requests_used.
    """

    BASE_URL = "https://api.the-odds-api.com/v4"

    # Short aliases accepted by get_odds() in place of the raw sport key
    SPORTS = {
        # US Sports
        "nfl": "americanfootball_nfl",
        "nba": "basketball_nba",
        "mlb": "baseball_mlb",
        "nhl": "icehockey_nhl",
        "ncaaf": "americanfootball_ncaaf",
        "ncaab": "basketball_ncaab",
        "mls": "soccer_usa_mls",
        # Soccer
        "epl": "soccer_epl",
        "la_liga": "soccer_spain_la_liga",
        "bundesliga": "soccer_germany_bundesliga",
        "serie_a": "soccer_italy_serie_a",
        "ligue_1": "soccer_france_ligue_one",
        "champions_league": "soccer_uefa_champs_league",
        # Other
        "ufc": "mma_mixed_martial_arts",
        # NOTE(review): the next two keys name single tournaments rather than
        # whole tours -- confirm they are the intended targets.
        "pga": "golf_pga_championship",
        "atp": "tennis_atp_aus_open"
    }

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        Args:
            api_key: Key sent as the apiKey query parameter on every request
            timeout: Seconds to wait for the server before giving up
        """
        self.api_key = api_key
        self.timeout = timeout
        self.session = requests.Session()
        self.requests_remaining = None
        self.requests_used = None

    def _get(self, endpoint: str, params: Dict = None) -> Dict:
        """
        Make API request and track usage.

        Raises:
            requests.HTTPError: On a non-success status code.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        # Work on a copy so the caller's dict never picks up the API key
        query = dict(params or {})
        query["apiKey"] = self.api_key

        # Without a timeout a stalled connection would block indefinitely
        response = self.session.get(url, params=query, timeout=self.timeout)
        response.raise_for_status()

        # Track API usage
        self.requests_remaining = response.headers.get("x-requests-remaining")
        self.requests_used = response.headers.get("x-requests-used")

        return response.json()

    def get_sports(self, all_sports: bool = False) -> pd.DataFrame:
        """Get list of available sports."""
        params = {"all": "true"} if all_sports else {}
        data = self._get("sports", params)
        return pd.DataFrame(data)

    def get_odds(
        self,
        sport: str,
        regions: str = "us",
        markets: str = "h2h",
        odds_format: str = "american",
        bookmakers: List[str] = None
    ) -> pd.DataFrame:
        """
        Get current odds for a sport.

        Args:
            sport: Sport key (use SPORTS dict or raw key)
            regions: us, uk, eu, au (comma-separated for multiple)
            markets: h2h (moneyline), spreads, totals
            odds_format: american, decimal, fractional
            bookmakers: List of specific bookmakers

        Returns:
            DataFrame with odds data: one row per game, bookmaker, market
            and outcome (empty, without columns, when nothing is returned)
        """
        sport_key = self.SPORTS.get(sport, sport)

        params = {
            "regions": regions,
            "markets": markets,
            "oddsFormat": odds_format
        }

        if bookmakers:
            params["bookmakers"] = ",".join(bookmakers)

        data = self._get(f"sports/{sport_key}/odds", params)

        # Flatten the nested structure
        games = []
        for game in data:
            game_info = {
                "id": game.get("id"),
                "sport": game.get("sport_key"),
                "commence_time": game.get("commence_time"),
                "home_team": game.get("home_team"),
                "away_team": game.get("away_team")
            }

            for bookmaker in game.get("bookmakers", []):
                book_name = bookmaker.get("key")
                for market in bookmaker.get("markets", []):
                    market_key = market.get("key")
                    for outcome in market.get("outcomes", []):
                        games.append({
                            **game_info,
                            "bookmaker": book_name,
                            "market": market_key,
                            "team": outcome.get("name"),
                            "price": outcome.get("price"),
                            "point": outcome.get("point")  # For spreads/totals
                        })

        return pd.DataFrame(games)

    def get_best_odds(self, sport: str, market: str = "h2h") -> pd.DataFrame:
        """
        Get best available odds across bookmakers.

        "Best" is the numerically highest price per game and outcome, in the
        default (american) odds format.
        """
        odds_df = self.get_odds(sport, markets=market)

        if odds_df.empty:
            return odds_df

        # Rows without a price cannot be the best offer
        priced = odds_df.dropna(subset=['price'])

        # Find best price for each team/game
        best_odds = priced.loc[
            priced.groupby(['id', 'team'])['price'].idxmax()
        ]

        return best_odds[['commence_time', 'home_team', 'away_team',
                         'team', 'price', 'bookmaker']]

    def find_arbitrage(self, sport: str) -> pd.DataFrame:
        """
        Find potential arbitrage opportunities.

        For every game the best decimal price of each outcome is taken
        across bookmakers. All outcomes of the market are included, so a
        three-way soccer market counts the draw as well.

        Returns games where combined implied probabilities < 100%. Two-way
        games produce the columns game, commence_time, home_odds, away_odds,
        implied_prob and profit_margin; any further outcome adds a
        "<name>_odds" column (for example draw_odds).
        """
        odds_df = self.get_odds(sport, markets="h2h", odds_format="decimal")

        if odds_df.empty:
            return odds_df

        arb_opportunities = []

        for _, game_odds in odds_df.groupby('id', sort=False):
            game_info = game_odds.iloc[0]
            home_team = game_info['home_team']
            away_team = game_info['away_team']

            # Get best odds for each outcome
            best_by_outcome = game_odds.groupby('team')['price'].max().dropna()

            if home_team not in best_by_outcome.index or away_team not in best_by_outcome.index:
                continue

            home_best = best_by_outcome[home_team]
            away_best = best_by_outcome[away_team]
            other_outcomes = best_by_outcome.drop([home_team, away_team])

            # Calculate implied probability over every possible outcome;
            # leaving one out (e.g. the draw) would fake an arbitrage.
            total_inverse = 1/home_best + 1/away_best
            for price in other_outcomes:
                total_inverse += 1/price
            implied_prob = total_inverse * 100

            if implied_prob < 100:
                profit_margin = 100 - implied_prob
                record = {
                    "game": f"{away_team} @ {home_team}",
                    "commence_time": game_info['commence_time'],
                    "home_odds": home_best,
                    "away_odds": away_best,
                }
                for outcome_name, price in other_outcomes.items():
                    record[f"{str(outcome_name).lower()}_odds"] = price
                record["implied_prob"] = round(implied_prob, 2)
                record["profit_margin"] = round(profit_margin, 2)
                arb_opportunities.append(record)

        return pd.DataFrame(arb_opportunities)

    def compare_bookmakers(self, sport: str, market: str = "h2h") -> pd.DataFrame:
        """
        Compare odds across all bookmakers for a sport.

        Returns one row per game and outcome with one price column per
        bookmaker.
        """
        odds_df = self.get_odds(sport, markets=market)

        if odds_df.empty:
            return odds_df

        # Pivot to show bookmakers as columns
        pivot = odds_df.pivot_table(
            index=['commence_time', 'home_team', 'away_team', 'team'],
            columns='bookmaker',
            values='price',
            aggfunc='first'
        ).reset_index()

        return pivot


# Example usage
if __name__ == "__main__":
    client = OddsAPI("YOUR_API_KEY")

    # List the sports the API currently offers
    available = client.get_sports()
    print("Available Sports:")
    summary_columns = ['key', 'title', 'active']
    print(available[summary_columns].head(20))

    # Get NFL odds
    # nfl_odds = client.get_odds("nfl")
    # print("\nNFL Odds:")
    # print(nfl_odds.head())

    # Find best odds
    # best = client.get_best_odds("nfl")
    # print("\nBest NFL Odds:")
    # print(best)

    # Quota counters as reported by the last response
    print(f"\nAPI Usage: {client.requests_used} used, {client.requests_remaining} remaining")

python Football

SportsDataIO API Wrapper

Universal wrapper for SportsDataIO APIs covering NFL, NBA, MLB, NHL, and more.

"""SportsDataIO API wrapper for multiple sports."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date
from enum import Enum

class Sport(Enum):
    """Sports addressable through the SportsDataIO wrapper.

    Members are used as the keys of SportsDataIO.BASE_URLS and of the
    api_keys mapping passed to the client. Each value is a lowercase
    short name for the sport.
    """
    NFL = "nfl"
    NBA = "nba"
    MLB = "mlb"
    NHL = "nhl"
    CFB = "cfb"  # College Football
    CBB = "cbb"  # College Basketball
    # NOTE(review): NASCAR has no entry in SportsDataIO.BASE_URLS, so requests
    # for it are rejected as an unsupported sport -- confirm this is intended.
    NASCAR = "nascar"
    GOLF = "golf"
    MMA = "mma"
    SOCCER = "soccer"

class SportsDataIO:
    """
    Unified client for SportsDataIO APIs.

    Requires API key: https://sportsdata.io/

    Each public method performs a single GET through ``_get`` and wraps the
    decoded JSON payload in a ``pandas.DataFrame``.
    """

    BASE_URLS = {
        Sport.NFL: "https://api.sportsdata.io/v3/nfl",
        Sport.NBA: "https://api.sportsdata.io/v3/nba",
        Sport.MLB: "https://api.sportsdata.io/v3/mlb",
        Sport.NHL: "https://api.sportsdata.io/v3/nhl",
        Sport.CFB: "https://api.sportsdata.io/v3/cfb",
        Sport.CBB: "https://api.sportsdata.io/v3/cbb",
        Sport.GOLF: "https://api.sportsdata.io/golf/v2",
        Sport.MMA: "https://api.sportsdata.io/v3/mma",
        Sport.SOCCER: "https://api.sportsdata.io/v4/soccer"
    }

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_keys: Dict[Sport, str]):
        """
        Initialize with API keys for each sport.

        Args:
            api_keys: Dictionary mapping Sport enum to API key
        """
        self.api_keys = api_keys
        self.session = requests.Session()

    def _get(self, sport: Sport, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """
        Make API request.

        Args:
            sport: Sport whose base URL and API key are used
            endpoint: Path appended to the sport's base URL
            params: Optional query-string parameters

        Returns:
            Decoded JSON body of the response

        Raises:
            ValueError: If the sport has no base URL or no API key
            requests.HTTPError: If the response status is 4xx/5xx
        """
        base_url = self.BASE_URLS.get(sport)
        if not base_url:
            raise ValueError(f"Unsupported sport: {sport}")

        api_key = self.api_keys.get(sport)
        if not api_key:
            raise ValueError(f"No API key for {sport}")

        url = f"{base_url}/{endpoint}"
        headers = {"Ocp-Apim-Subscription-Key": api_key}

        response = self.session.get(
            url,
            headers=headers,
            params=params or {},
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    # Universal endpoints
    def get_teams(self, sport: Sport) -> pd.DataFrame:
        """Get all teams for a sport."""
        # Every sport uses the same path, so no per-sport branching is needed
        return pd.DataFrame(self._get(sport, "scores/json/Teams"))

    def get_players(self, sport: Sport) -> pd.DataFrame:
        """Get all active players."""
        # Golf is the only sport whose player list is not under "scores/"
        endpoint = "json/Players" if sport == Sport.GOLF else "scores/json/Players"
        return pd.DataFrame(self._get(sport, endpoint))

    def get_schedule(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get season schedule."""
        # NFL is requested via "Schedules"; every other sport via "Games"
        resource = "Schedules" if sport == Sport.NFL else "Games"
        return pd.DataFrame(self._get(sport, f"scores/json/{resource}/{season}"))

    def get_standings(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get standings for a season."""
        data = self._get(sport, f"scores/json/Standings/{season}")
        return pd.DataFrame(data)

    def get_scores_by_date(self, sport: Sport, date_str: str) -> pd.DataFrame:
        """Get scores for a specific date (format: YYYY-MM-DD or YYYY-MON-DD)."""
        # NFL date lookups go to "ScoresByDate"; other sports to "GamesByDate"
        resource = "ScoresByDate" if sport == Sport.NFL else "GamesByDate"
        return pd.DataFrame(self._get(sport, f"scores/json/{resource}/{date_str}"))

    # Sport-specific stats
    def get_player_season_stats(
        self,
        sport: Sport,
        season: int,
        player_id: int = None
    ) -> pd.DataFrame:
        """
        Get player season statistics.

        Args:
            sport: Sport to query
            season: Season identifier placed in the request path
            player_id: When given, only rows whose ``PlayerID`` equals this
                value are returned; the filtering happens client-side

        Returns:
            All player rows for the season, or the matching subset. An empty
            frame is returned when a filter is requested but the payload has
            no ``PlayerID`` column (e.g. an empty response).
        """
        df = pd.DataFrame(self._get(sport, f"stats/json/PlayerSeasonStats/{season}"))

        # ``is None`` rather than truthiness so that an ID of 0 still filters
        if player_id is None:
            return df
        if 'PlayerID' not in df.columns:
            return df.iloc[0:0]
        return df[df['PlayerID'] == player_id]

    def get_team_season_stats(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get team season statistics."""
        data = self._get(sport, f"scores/json/TeamSeasonStats/{season}")
        return pd.DataFrame(data)

    # Projections (for fantasy/betting)
    def get_player_projections(self, sport: Sport, season: int) -> pd.DataFrame:
        """Get player stat projections."""
        data = self._get(sport, f"projections/json/PlayerSeasonProjectionStats/{season}")
        return pd.DataFrame(data)

    # News
    def get_news(self, sport: Sport) -> pd.DataFrame:
        """Get latest news."""
        data = self._get(sport, "scores/json/News")
        return pd.DataFrame(data)

    def get_player_news(self, sport: Sport, player_id: int) -> pd.DataFrame:
        """Get news for a specific player."""
        data = self._get(sport, f"scores/json/NewsByPlayerID/{player_id}")
        return pd.DataFrame(data)


# Example usage
if __name__ == "__main__":
    # Register one subscription key per sport you plan to query
    api_keys = {}
    api_keys[Sport.NFL] = "YOUR_NFL_KEY"
    api_keys[Sport.NBA] = "YOUR_NBA_KEY"
    api_keys[Sport.MLB] = "YOUR_MLB_KEY"

    sportsdata = SportsDataIO(api_keys)

    # Uncomment to list NFL teams
    # nfl_teams = sportsdata.get_teams(Sport.NFL)
    # print(nfl_teams)

    # Uncomment to fetch NBA standings
    # nba_standings = sportsdata.get_standings(Sport.NBA, 2024)
    # print(nba_standings)

python Hockey

NHL Stats API Client

Access NHL statistics through the official NHL Stats API.

"""NHL Stats API client."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date

class NHLStatsAPI:
    """
    Client for NHL Stats API.

    No authentication required.
    """

    BASE_URL = "https://api-web.nhle.com/v1"
    STATS_URL = "https://api.nhle.com/stats/rest/en"

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()

    def _get(self, url: str, params: Optional[Dict] = None) -> Dict:
        """
        Make API request.

        Args:
            url: Absolute URL to fetch
            params: Optional query-string parameters

        Returns:
            Decoded JSON body of the response

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx
        """
        response = self.session.get(url, params=params or {}, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _nested(record: Dict, *keys: str):
        """Follow ``keys`` through nested dicts; None if any level is missing or null."""
        current = record
        for key in keys:
            if not isinstance(current, dict):
                return None
            current = current.get(key)
        return current

    # Players
    def get_player(self, player_id: int) -> Dict:
        """Get player details."""
        return self._get(f"{self.BASE_URL}/player/{player_id}/landing")

    def get_player_stats(self, player_id: int, season: str = None, game_type: int = 2) -> Dict:
        """
        Get a player's game log.

        Args:
            player_id: NHL player ID
            season: Season in YYYYYYYY form (e.g. "20232024"); when omitted
                the "now" game log is requested
            game_type: Game type used together with ``season``
                (2 = regular season, 3 = playoffs); ignored without a season

        Returns:
            Decoded JSON game log
        """
        # The season-specific endpoint requires a game-type path segment;
        # only the "now" variant takes no further segments.
        suffix = f"{season}/{game_type}" if season else "now"
        return self._get(f"{self.BASE_URL}/player/{player_id}/game-log/{suffix}")

    # Teams
    def get_teams(self) -> pd.DataFrame:
        """Get all NHL teams."""
        data = self._get(f"{self.STATS_URL}/team")
        return pd.DataFrame(data.get("data", []))

    def get_team_roster(self, team_abbrev: str, season: str = "20242025") -> Dict:
        """Get team roster."""
        return self._get(f"{self.BASE_URL}/roster/{team_abbrev}/{season}")

    def get_team_schedule(self, team_abbrev: str, season: str = "20242025") -> pd.DataFrame:
        """Get team schedule as one row per game."""
        data = self._get(f"{self.BASE_URL}/club-schedule-season/{team_abbrev}/{season}")

        games = [
            {
                "game_id": game.get("id"),
                "date": game.get("gameDate"),
                "game_type": game.get("gameType"),
                "home_team": self._nested(game, "homeTeam", "abbrev"),
                "away_team": self._nested(game, "awayTeam", "abbrev"),
                "home_score": self._nested(game, "homeTeam", "score"),
                "away_score": self._nested(game, "awayTeam", "score"),
                "venue": self._nested(game, "venue", "default"),
            }
            for game in data.get("games", [])
        ]
        return pd.DataFrame(games)

    # Standings
    def get_standings(self, date_str: str = None) -> pd.DataFrame:
        """Get league standings for ``date_str`` (or "now" when omitted)."""
        endpoint = f"{self.BASE_URL}/standings/{date_str or 'now'}"
        data = self._get(endpoint)

        standings = [
            {
                "team": self._nested(team, "teamName", "default"),
                "abbrev": self._nested(team, "teamAbbrev", "default"),
                "conference": team.get("conferenceName"),
                "division": team.get("divisionName"),
                "games_played": team.get("gamesPlayed"),
                "wins": team.get("wins"),
                "losses": team.get("losses"),
                "ot_losses": team.get("otLosses"),
                "points": team.get("points"),
                "points_pct": team.get("pointPctg"),
                "goals_for": team.get("goalFor"),
                "goals_against": team.get("goalAgainst"),
                "goal_diff": team.get("goalDifferential"),
            }
            for team in data.get("standings", [])
        ]
        return pd.DataFrame(standings)

    # Schedule/Scores
    def get_schedule(self, date_str: str = None) -> pd.DataFrame:
        """
        Get scheduled games for ``date_str`` (or "now" when omitted).

        Every day listed in the response's ``gameWeek`` block is flattened
        into the result, not only the requested date.
        """
        endpoint = f"{self.BASE_URL}/schedule/{date_str or 'now'}"
        data = self._get(endpoint)

        games = [
            {
                "game_id": game.get("id"),
                "date": game.get("startTimeUTC"),
                "game_state": game.get("gameState"),
                "home_team": self._nested(game, "homeTeam", "abbrev"),
                "away_team": self._nested(game, "awayTeam", "abbrev"),
                "home_score": self._nested(game, "homeTeam", "score"),
                "away_score": self._nested(game, "awayTeam", "score"),
                "venue": self._nested(game, "venue", "default"),
            }
            for day in data.get("gameWeek", [])
            for game in day.get("games", [])
        ]
        return pd.DataFrame(games)

    # Leaders
    def _get_leaders(self, resource: str, category: str, limit: int) -> pd.DataFrame:
        """Fetch ``<resource>/current`` and flatten the entries listed under ``category``."""
        data = self._get(f"{self.BASE_URL}/{resource}/current", {
            "categories": category,
            "limit": limit
        })

        leaders = []
        for player in data.get(category, []):
            first = self._nested(player, "firstName", "default")
            last = self._nested(player, "lastName", "default")
            leaders.append({
                "rank": player.get("rank"),
                "player": f"{first} {last}",
                "team": player.get("teamAbbrev"),
                "value": player.get("value")
            })

        return pd.DataFrame(leaders)

    def get_skater_leaders(self, category: str = "points", limit: int = 10) -> pd.DataFrame:
        """
        Get league leaders for skaters.

        Categories: points, goals, assists, plusMinus, penaltyMins
        """
        return self._get_leaders("skater-stats-leaders", category, limit)

    def get_goalie_leaders(self, category: str = "wins", limit: int = 10) -> pd.DataFrame:
        """
        Get league leaders for goalies.

        Categories: wins, savePctg, goalsAgainstAverage, shutouts
        """
        return self._get_leaders("goalie-stats-leaders", category, limit)

    # Game details
    def get_game_boxscore(self, game_id: int) -> Dict:
        """Get game box score."""
        return self._get(f"{self.BASE_URL}/gamecenter/{game_id}/boxscore")

    def get_game_play_by_play(self, game_id: int) -> Dict:
        """Get play-by-play data."""
        return self._get(f"{self.BASE_URL}/gamecenter/{game_id}/play-by-play")


# Example usage
if __name__ == "__main__":
    nhl = NHLStatsAPI()

    # Current standings, trimmed to the headline columns
    standings = nhl.get_standings()
    print("NHL Standings:")
    headline = standings[['team', 'points', 'wins', 'losses']]
    print(headline.head(10))

    # Top 20 skaters by points
    leaders = nhl.get_skater_leaders("points", 20)
    print("\nPoints Leaders:")
    print(leaders)

    # Games returned by the schedule endpoint for "now"
    schedule = nhl.get_schedule()
    print("\nToday's Games:")
    print(schedule)

python Soccer

StatsBomb Open Data API

Access free event-level soccer data from StatsBomb for detailed match analysis.

"""StatsBomb Open Data API client."""
import requests
import pandas as pd
from typing import Optional, List, Dict

class StatsBombAPI:
    """
    Client for StatsBomb Open Data.

    Free data includes:
    - FIFA World Cups
    - FA Women's Super League
    - NWSL
    - UEFA Euro 2020
    - Select club competitions
    """

    BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    # Columns kept (when present) by get_shots / get_passes
    _SHOT_COLUMNS = (
        'id', 'minute', 'second', 'team.name', 'player.name',
        'location', 'shot.statsbomb_xg', 'shot.outcome.name',
        'shot.body_part.name', 'shot.technique.name',
        'shot.type.name', 'shot.end_location',
    )
    _PASS_COLUMNS = (
        'id', 'minute', 'second', 'team.name', 'player.name',
        'location', 'pass.end_location', 'pass.length',
        'pass.angle', 'pass.height.name', 'pass.outcome.name',
        'pass.recipient.name', 'pass.body_part.name',
    )

    def __init__(self):
        self.session = requests.Session()

    def _get_json(self, path: str) -> List[Dict]:
        """
        Fetch and decode the JSON file at ``path`` below ``BASE_URL``.

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx
        """
        url = f"{self.BASE_URL}/{path}"
        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _field(record: Optional[Dict], key: str):
        """Return ``record[key]``, or None when the record is missing or null."""
        return record.get(key) if isinstance(record, dict) else None

    # Competitions
    def get_competitions(self) -> pd.DataFrame:
        """Get available competitions."""
        data = self._get_json("competitions.json")
        return pd.DataFrame(data)

    # Matches
    def get_matches(self, competition_id: int, season_id: int) -> pd.DataFrame:
        """Get matches for a competition season, one row per match."""
        data = self._get_json(f"matches/{competition_id}/{season_id}.json")

        # Nested objects may be absent or null, so they are read through
        # _field instead of chained .get() calls.
        matches = [
            {
                "match_id": match.get("match_id"),
                "match_date": match.get("match_date"),
                "kick_off": match.get("kick_off"),
                "competition": self._field(match.get("competition"), "competition_name"),
                "season": self._field(match.get("season"), "season_name"),
                "home_team": self._field(match.get("home_team"), "home_team_name"),
                "away_team": self._field(match.get("away_team"), "away_team_name"),
                "home_score": match.get("home_score"),
                "away_score": match.get("away_score"),
                "stadium": self._field(match.get("stadium"), "name"),
                "referee": self._field(match.get("referee"), "name"),
            }
            for match in data
        ]
        return pd.DataFrame(matches)

    # Events (Play-by-play)
    def get_events(self, match_id: int) -> pd.DataFrame:
        """Get all events for a match, flattened with dotted column names."""
        data = self._get_json(f"events/{match_id}.json")
        return pd.json_normalize(data)

    def _events_of_type(self, match_id: int, type_name: str, columns) -> pd.DataFrame:
        """Return the events whose ``type.name`` equals ``type_name``, limited to ``columns``."""
        events = self.get_events(match_id)

        # An empty event list normalizes to a frame without any columns
        if 'type.name' not in events.columns:
            return pd.DataFrame()

        subset = events[events['type.name'] == type_name]
        available_cols = [c for c in columns if c in subset.columns]
        return subset[available_cols].copy()

    def get_shots(self, match_id: int) -> pd.DataFrame:
        """Get shot events with xG data (empty frame if the match has no events)."""
        return self._events_of_type(match_id, 'Shot', self._SHOT_COLUMNS)

    def get_passes(self, match_id: int) -> pd.DataFrame:
        """Get pass events (empty frame if the match has no events)."""
        return self._events_of_type(match_id, 'Pass', self._PASS_COLUMNS)

    # Lineups
    def get_lineups(self, match_id: int) -> Dict[str, pd.DataFrame]:
        """Get lineups for both teams, keyed by team name."""
        data = self._get_json(f"lineups/{match_id}.json")

        lineups = {}
        for team in data:
            players = [
                {
                    "player_id": player.get("player_id"),
                    "player_name": player.get("player_name"),
                    "player_nickname": player.get("player_nickname"),
                    "jersey_number": player.get("jersey_number"),
                    "country": self._field(player.get("country"), "name"),
                }
                for player in team.get("lineup", [])
            ]
            lineups[team.get("team_name")] = pd.DataFrame(players)

        return lineups

    # 360 Data (freeze frames)
    def get_360_data(self, match_id: int) -> pd.DataFrame:
        """
        Get 360 freeze frame data (if available).

        Returns an empty frame when the file cannot be fetched or decoded.
        """
        try:
            data = self._get_json(f"three-sixty/{match_id}.json")
        except (requests.RequestException, ValueError):
            # Best effort: an HTTP/network failure or an undecodable body is
            # treated as "no data". Other exceptions (and KeyboardInterrupt)
            # propagate instead of being hidden.
            return pd.DataFrame()
        return pd.json_normalize(data)

    # Aggregated stats
    def calculate_xg(self, match_id: int) -> Dict:
        """
        Calculate total xG and goals for each team.

        Returns:
            ``{team: {'xG': ..., 'Goals': ...}}``; empty when the match has
            no shots carrying xG values. Goals are counted from shot events
            whose outcome is 'Goal'.
        """
        shots = self.get_shots(match_id)

        if 'shot.statsbomb_xg' not in shots.columns:
            return {}

        xg_by_team = shots.groupby('team.name')['shot.statsbomb_xg'].sum().to_dict()

        if 'shot.outcome.name' in shots.columns:
            scored = shots[shots['shot.outcome.name'] == 'Goal']
            goals_by_team = scored.groupby('team.name').size().to_dict()
        else:
            goals_by_team = {}

        return {
            team: {'xG': xg_by_team.get(team, 0), 'Goals': goals_by_team.get(team, 0)}
            for team in xg_by_team.keys() | goals_by_team.keys()
        }


# Example usage
if __name__ == "__main__":
    sb = StatsBombAPI()

    # List the first 20 competition/season pairs in the open data set
    competitions = sb.get_competitions()
    print("Available Competitions:")
    competition_preview = competitions[['competition_name', 'season_name']]
    print(competition_preview.head(20))

    # World Cup 2022 is competition_id=43, season_id=106
    matches = sb.get_matches(43, 106)
    print("\nWorld Cup 2022 Matches:")
    score_columns = ['home_team', 'away_team', 'home_score', 'away_score']
    print(matches[score_columns].head(10))

    # Summarise xG for the first listed match, if there is one
    if not matches.empty:
        match_id = matches.iloc[0]['match_id']
        xg = sb.calculate_xg(match_id)
        print(f"\nxG for match {match_id}:")
        print(xg)

python Baseball

MLB Stats API Client

Complete Python client for the official MLB Stats API with built-in rate limiting (no authentication required).

"""MLB Stats API client with caching and rate limiting."""
import requests
from datetime import datetime, date
from typing import Optional, Dict, List, Any
import time
from functools import lru_cache

class MLBStatsAPI:
    """
    Client for MLB Stats API (statsapi.mlb.com).

    Features:
    - No authentication required
    - Automatic rate limiting
    - Cached player search
    """

    BASE_URL = "https://statsapi.mlb.com/api/v1"

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    # Maximum number of distinct names remembered by search_player
    SEARCH_CACHE_SIZE = 1000

    def __init__(self, requests_per_minute: int = 60):
        """
        Args:
            requests_per_minute: Upper bound on request rate; must be positive

        Raises:
            ValueError: If ``requests_per_minute`` is zero or negative
        """
        if requests_per_minute <= 0:
            raise ValueError("requests_per_minute must be positive")
        self.session = requests.Session()
        self.min_interval = 60.0 / requests_per_minute
        self.last_request = 0
        self._search_cache = {}

    def _rate_limit(self):
        """Sleep just long enough to keep ``min_interval`` seconds between requests."""
        elapsed = time.time() - self.last_request
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self.last_request = time.time()

    def _get(self, endpoint: str, params: Dict = None) -> Dict:
        """
        Make a rate-limited GET request to the API.

        Args:
            endpoint: Path appended to ``BASE_URL``
            params: Optional query-string parameters

        Returns:
            Decoded JSON body of the response

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx
        """
        self._rate_limit()
        url = f"{self.BASE_URL}/{endpoint}"
        response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    # Teams
    def get_teams(self, season: int = None, sport_id: int = 1) -> List[Dict]:
        """Get all MLB teams."""
        params = {"sportId": sport_id}
        if season:
            params["season"] = season
        data = self._get("teams", params)
        return data.get("teams", [])

    def get_team(self, team_id: int) -> Dict:
        """Get team details (empty dict when the API returns no team)."""
        # ``or [{}]`` also covers a present-but-empty list, which would
        # otherwise raise IndexError on [0]
        teams = self._get(f"teams/{team_id}").get("teams") or [{}]
        return teams[0]

    def get_team_roster(self, team_id: int, roster_type: str = "active") -> List[Dict]:
        """Get team roster."""
        data = self._get(f"teams/{team_id}/roster", {"rosterType": roster_type})
        return data.get("roster", [])

    # Players
    def get_player(self, player_id: int) -> Dict:
        """Get player details (empty dict when the API returns no player)."""
        people = self._get(f"people/{player_id}").get("people") or [{}]
        return people[0]

    def get_player_stats(
        self,
        player_id: int,
        stats_type: str = "season",
        group: str = "hitting",
        season: int = None
    ) -> List[Dict]:
        """
        Get player statistics.

        Args:
            player_id: MLB player ID
            stats_type: season, career, yearByYear, etc.
            group: hitting, pitching, fielding
            season: Season year (required for season stats)

        Returns:
            The ``stats`` list from the response (empty when absent)
        """
        params = {
            "stats": stats_type,
            "group": group
        }
        if season:
            params["season"] = season
        data = self._get(f"people/{player_id}/stats", params)
        return data.get("stats", [])

    # Games/Schedule
    def get_schedule(
        self,
        start_date: date,
        end_date: date = None,
        team_id: int = None
    ) -> List[Dict]:
        """
        Get game schedule.

        Args:
            start_date: First day of the range
            end_date: Last day of the range; defaults to ``start_date``
            team_id: Optional team filter

        Returns:
            Games from every date in the response, flattened into one list
        """
        params = {
            "sportId": 1,
            "startDate": start_date.strftime("%Y-%m-%d"),
            "endDate": (end_date or start_date).strftime("%Y-%m-%d")
        }
        if team_id:
            params["teamId"] = team_id
        data = self._get("schedule", params)
        return [
            game
            for date_data in data.get("dates", [])
            for game in date_data.get("games", [])
        ]

    def get_game(self, game_pk: int) -> Dict:
        """Get game details (live feed)."""
        # NOTE(review): the live feed is commonly served under api/v1.1;
        # confirm that this v1 path returns the expected payload.
        return self._get(f"game/{game_pk}/feed/live")

    def get_game_boxscore(self, game_pk: int) -> Dict:
        """Get game box score."""
        return self._get(f"game/{game_pk}/boxscore")

    def get_game_play_by_play(self, game_pk: int) -> Dict:
        """Get play-by-play data."""
        return self._get(f"game/{game_pk}/playByPlay")

    # Standings
    def get_standings(self, league_id: int = None, season: int = None) -> List[Dict]:
        """Get league standings."""
        params = {}
        if league_id:
            params["leagueId"] = league_id
        if season:
            params["season"] = season
        data = self._get("standings", params)
        return data.get("records", [])

    # Search
    def search_player(self, name: str) -> List[Dict]:
        """
        Search for players by name.

        Results are cached per instance (up to ``SEARCH_CACHE_SIZE`` names,
        oldest entry dropped first). A fresh list is returned on every call
        so callers cannot corrupt the cached result.
        """
        # NOTE(review): confirm that sports/1/players honours a ``search``
        # query parameter.
        # The cache lives on the instance rather than in functools.lru_cache:
        # a cached method would key on ``self`` and keep every client alive.
        # setdefault tolerates instances whose __init__ did not run.
        cache = self.__dict__.setdefault("_search_cache", {})
        if name not in cache:
            data = self._get("sports/1/players", {"search": name})
            if len(cache) >= self.SEARCH_CACHE_SIZE:
                cache.pop(next(iter(cache)))
            cache[name] = data.get("people", [])
        return list(cache[name])


# Example usage
if __name__ == "__main__":
    mlb = MLBStatsAPI()

    # Active roster for team 147 (Yankees)
    yankees_roster = mlb.get_team_roster(147)
    roster_size = len(yankees_roster)
    print(f"Yankees roster: {roster_size} players")

    # 2024 season hitting stats for player 592450 (Aaron Judge)
    judge_stats = mlb.get_player_stats(592450, "season", "hitting", 2024)
    print("Aaron Judge 2024:", judge_stats)

    # Games scheduled for the current date
    today = date.today()
    games = mlb.get_schedule(today)
    game_count = len(games)
    print(f"Games today: {game_count}")

python Baseball

Statcast Data API

Access MLB Statcast data through Baseball Savant API for pitch-level analytics.

"""Baseball Savant Statcast API client."""
import requests
import pandas as pd
from datetime import date, datetime
from typing import Optional, List
import io

class StatcastAPI:
    """
    Client for Baseball Savant Statcast data.

    Access pitch-level data including:
    - Exit velocity
    - Launch angle
    - Spin rate
    - Pitch movement
    """

    BASE_URL = "https://baseballsavant.mlb.com"

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        self.session = requests.Session()
        # Requests are sent with a browser-style User-Agent
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def _fetch_csv(self, url: str, params: dict) -> pd.DataFrame:
        """
        GET ``url`` and parse the body as CSV.

        Returns:
            Parsed frame, or an empty frame when the body is blank

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx, so that an
                error page is never parsed as if it were data
        """
        response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        # read_csv raises on a blank document; treat that as "no rows"
        if not response.text.strip():
            return pd.DataFrame()
        return pd.read_csv(io.StringIO(response.text))

    def get_statcast_data(
        self,
        start_date: date,
        end_date: date,
        player_type: str = "batter",
        team: str = None
    ) -> pd.DataFrame:
        """
        Get Statcast pitch-level data.

        Args:
            start_date: Start date
            end_date: End date
            player_type: 'batter' or 'pitcher'
            team: Team abbreviation filter

        Returns:
            DataFrame with Statcast data

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx
        """
        url = f"{self.BASE_URL}/statcast_search/csv"
        # The search form's filters are all sent; blank strings mean "no filter"
        params = {
            "all": "true",
            "hfPT": "",
            "hfAB": "",
            "hfBBT": "",
            "hfPR": "",
            "hfZ": "",
            "stadium": "",
            "hfBBL": "",
            "hfNewZones": "",
            "hfGT": "R|",
            "hfC": "",
            "hfSea": "",
            "hfSit": "",
            "player_type": player_type,
            "hfOuts": "",
            "opponent": "",
            "pitcher_throws": "",
            "batter_stands": "",
            "hfSA": "",
            "game_date_gt": start_date.strftime("%Y-%m-%d"),
            "game_date_lt": end_date.strftime("%Y-%m-%d"),
            "team": team or "",
            "position": "",
            "hfRO": "",
            "home_road": "",
            "hfFlag": "",
            "metric_1": "",
            "hfInn": "",
            "min_pitches": "0",
            "min_results": "0",
            "group_by": "name",
            "sort_col": "pitches",
            "player_event_sort": "h_launch_speed",
            "sort_order": "desc",
            "min_abs": "0",
            "type": "details"
        }

        return self._fetch_csv(url, params)

    def get_player_statcast(
        self,
        player_id: int,
        season: int,
        player_type: str = "batter"
    ) -> pd.DataFrame:
        """
        Get Statcast data for a specific player.

        Args:
            player_id: Player ID sent as the batter or pitcher lookup
            season: Season year
            player_type: 'batter' selects the batter lookup; any other value
                selects the pitcher lookup

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx
        """
        url = f"{self.BASE_URL}/statcast_search/csv"
        lookup_key = "batters_lookup[]" if player_type == "batter" else "pitchers_lookup[]"
        params = {
            "all": "true",
            "player_type": player_type,
            lookup_key: player_id,
            "hfSea": f"{season}|",
            "type": "details"
        }

        return self._fetch_csv(url, params)

    def get_leaderboard(
        self,
        stat: str = "exit_velocity",
        year: int = 2024,
        min_pa: int = 100
    ) -> pd.DataFrame:
        """
        Get Statcast leaderboard.

        Stats: exit_velocity, launch_angle, barrel, hard_hit, xba, xslg, xwoba

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx
        """
        # NOTE(review): ``stat`` is sent as the ``type`` query parameter of
        # the expected-statistics leaderboard; confirm that endpoint accepts
        # the stat names listed above.
        url = f"{self.BASE_URL}/leaderboard/expected_statistics"
        params = {
            "type": stat,
            "year": year,
            "position": "",
            "team": "",
            "min": min_pa,
            "csv": "true"
        }

        return self._fetch_csv(url, params)

    def get_pitch_arsenal(self, pitcher_id: int, season: int) -> pd.DataFrame:
        """
        Get pitcher's pitch arsenal breakdown.

        Returns:
            Rows from the response's ``pitchBreakdown`` list (empty if absent)

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx
        """
        api_url = f"{self.BASE_URL}/player-services/statcast-pitching-breakdown"
        params = {"playerId": pitcher_id, "season": season}

        response = self.session.get(api_url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        return pd.DataFrame(data.get("pitchBreakdown", []))


# Example usage
if __name__ == "__main__":
    statcast = StatcastAPI()

    # Pull the trailing seven days of Statcast data
    from datetime import timedelta
    end = date.today()
    start = end - timedelta(weeks=1)

    data = statcast.get_statcast_data(start, end, "batter")
    pitch_count = len(data)
    print(f"Retrieved {pitch_count} pitches")

    # 2024 leaderboard request for "exit_velocity", minimum 200
    leaders = statcast.get_leaderboard("exit_velocity", 2024, 200)
    print("Exit Velocity Leaders:")
    leader_columns = ['player_name', 'avg_hit_speed']
    print(leaders[leader_columns].head(10))

python Basketball

NBA Stats API Client

Python client for the official NBA Stats API with all major endpoints.

"""NBA Stats API client."""
import requests
import pandas as pd
from typing import Optional, Dict, List
import time

class NBAStatsAPI:
    """
    Client for NBA Stats API (stats.nba.com).

    Note: NBA API requires specific headers to work properly.

    All endpoint methods go through ``_request`` (throttled to one request
    per second) and convert the JSON payload with ``_parse_response``.
    """

    BASE_URL = "https://stats.nba.com/stats"

    HEADERS = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "x-nba-stats-origin": "stats",
        "x-nba-stats-token": "true",
        "Connection": "keep-alive",
        "Referer": "https://stats.nba.com/",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)
        # Wall-clock time of the most recent request attempt (0 = none yet).
        self.last_request = 0

    def _request(self, endpoint: str, params: Dict) -> Dict:
        """Make request with rate limiting.

        Sleeps as needed so that consecutive attempts are at least one second
        apart, then GETs ``BASE_URL/endpoint`` and returns the decoded JSON.

        Raises:
            requests.HTTPError: On a 4xx/5xx response.
        """
        # Rate limit: 1 request per second
        elapsed = time.time() - self.last_request
        if elapsed < 1:
            time.sleep(1 - elapsed)

        url = f"{self.BASE_URL}/{endpoint}"
        try:
            response = self.session.get(url, params=params, timeout=30)
        finally:
            # Record failed/timed-out attempts too, so retries stay throttled.
            self.last_request = time.time()
        response.raise_for_status()
        return response.json()

    def _parse_response(self, data: Dict, result_index: int = 0) -> pd.DataFrame:
        """Parse NBA API response into DataFrame.

        Handles both payload shapes: ``resultSets`` (a list of result sets)
        and ``resultSet`` (either a list or a single result-set object, as
        returned by the league-leaders endpoint).

        Args:
            data: Decoded JSON payload.
            result_index: Which result set to convert.

        Returns:
            DataFrame of ``rowSet`` with ``headers`` as columns, or an empty
            DataFrame when the requested result set does not exist.
        """
        result_sets = data.get("resultSets", data.get("resultSet", []))
        if isinstance(result_sets, dict):
            # Single result-set object: normalise to a one-element list.
            result_sets = [result_sets]
        if isinstance(result_sets, list) and len(result_sets) > result_index:
            result = result_sets[result_index]
            return pd.DataFrame(
                result.get("rowSet", []),
                columns=result.get("headers", [])
            )
        return pd.DataFrame()

    # Player endpoints
    def get_player_info(self, player_id: int) -> pd.DataFrame:
        """Get player biographical info."""
        data = self._request("commonplayerinfo", {"PlayerID": player_id})
        return self._parse_response(data)

    def get_player_career_stats(self, player_id: int, per_mode: str = "PerGame") -> pd.DataFrame:
        """Get player career statistics."""
        data = self._request("playercareerstats", {
            "PlayerID": player_id,
            "PerMode": per_mode
        })
        return self._parse_response(data)

    def get_player_game_log(
        self,
        player_id: int,
        season: str = "2024-25",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """Get player game log."""
        data = self._request("playergamelog", {
            "PlayerID": player_id,
            "Season": season,
            "SeasonType": season_type
        })
        return self._parse_response(data)

    # League endpoints
    def get_league_leaders(
        self,
        stat_category: str = "PTS",
        season: str = "2024-25",
        per_mode: str = "PerGame",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """Get league leaders for a stat category."""
        data = self._request("leagueleaders", {
            "LeagueID": "00",
            "PerMode": per_mode,
            "Scope": "S",
            "Season": season,
            "SeasonType": season_type,
            "StatCategory": stat_category
        })
        return self._parse_response(data)

    def get_all_players(self, season: str = "2024-25") -> pd.DataFrame:
        """Get all players for a season."""
        data = self._request("commonallplayers", {
            "LeagueID": "00",
            "Season": season,
            "IsOnlyCurrentSeason": 1
        })
        return self._parse_response(data)

    # Team endpoints
    def get_team_info(self, team_id: int) -> pd.DataFrame:
        """Get team information."""
        data = self._request("teamdetails", {"TeamID": team_id})
        return self._parse_response(data)

    def get_team_roster(self, team_id: int, season: str = "2024-25") -> pd.DataFrame:
        """Get team roster."""
        data = self._request("commonteamroster", {
            "TeamID": team_id,
            "Season": season
        })
        return self._parse_response(data)

    def get_team_stats(
        self,
        season: str = "2024-25",
        per_mode: str = "PerGame",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """Get all team statistics."""
        # Every filter is sent explicitly; blank/zero values mean "no filter"
        # here, only season, per-mode and season type are caller-controlled.
        data = self._request("leaguedashteamstats", {
            "Conference": "",
            "DateFrom": "",
            "DateTo": "",
            "Division": "",
            "GameScope": "",
            "GameSegment": "",
            "LastNGames": 0,
            "LeagueID": "00",
            "Location": "",
            "MeasureType": "Base",
            "Month": 0,
            "OpponentTeamID": 0,
            "Outcome": "",
            "PORound": 0,
            "PaceAdjust": "N",
            "PerMode": per_mode,
            "Period": 0,
            "PlayerExperience": "",
            "PlayerPosition": "",
            "PlusMinus": "N",
            "Rank": "N",
            "Season": season,
            "SeasonSegment": "",
            "SeasonType": season_type,
            "ShotClockRange": "",
            "StarterBench": "",
            "TeamID": 0,
            "TwoWay": 0,
            "VsConference": "",
            "VsDivision": ""
        })
        return self._parse_response(data)

    # Shot chart
    def get_shot_chart(
        self,
        player_id: int,
        season: str = "2024-25",
        season_type: str = "Regular Season"
    ) -> pd.DataFrame:
        """Get player shot chart data."""
        data = self._request("shotchartdetail", {
            "ContextMeasure": "FGA",
            "DateFrom": "",
            "DateTo": "",
            "GameID": "",
            "GameSegment": "",
            "LastNGames": 0,
            "LeagueID": "00",
            "Location": "",
            "Month": 0,
            "OpponentTeamID": 0,
            "Outcome": "",
            "Period": 0,
            "PlayerID": player_id,
            "PlayerPosition": "",
            "RookieYear": "",
            "Season": season,
            "SeasonSegment": "",
            "SeasonType": season_type,
            "TeamID": 0,
            "VsConference": "",
            "VsDivision": ""
        })
        return self._parse_response(data)


# Example usage
if __name__ == "__main__":
    client = NBAStatsAPI()

    # Top ten rows of the 2024-25 points leaderboard.
    scoring = client.get_league_leaders("PTS", "2024-25")
    print("Scoring Leaders:")
    top_scorers = scoring[['PLAYER', 'TEAM', 'PTS']].head(10)
    print(top_scorers)

    # Career table for LeBron James.
    lebron_id = 2544
    career = client.get_player_career_stats(lebron_id)
    print("\nLeBron Career Stats:")
    print(career.head())

python Football

ESPN API Client

Access ESPN's public API for scores, standings, and team data across multiple sports.

"""ESPN API client for multiple sports."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date

class ESPNAPI:
    """
    Client for ESPN's public API endpoints.

    Supports: NFL, NBA, MLB, NHL, College Football, College Basketball,
    WNBA, MLS (see ``SPORTS``). Any other ``sport`` value is used verbatim
    as the URL path segment.
    """

    BASE_URL = "https://site.api.espn.com/apis/site/v2/sports"

    # Seconds to wait for a response before giving up.
    TIMEOUT = 30

    SPORTS = {
        "nfl": "football/nfl",
        "nba": "basketball/nba",
        "mlb": "baseball/mlb",
        "nhl": "hockey/nhl",
        "cfb": "football/college-football",
        "cbb": "basketball/mens-college-basketball",
        "wnba": "basketball/wnba",
        "mls": "soccer/usa.1"
    }

    def __init__(self):
        self.session = requests.Session()

    def _get(self, sport: str, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make API request.

        Raises:
            requests.HTTPError: On a 4xx/5xx response.
        """
        sport_path = self.SPORTS.get(sport, sport)
        url = f"{self.BASE_URL}/{sport_path}/{endpoint}"
        response = self.session.get(url, params=params or {}, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _to_int(value, default: int = 0) -> int:
        """Convert *value* to int, returning *default* for None/blank/non-numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            pass
        try:
            # Covers numeric strings such as "3.0".
            return int(float(value))
        except (TypeError, ValueError):
            return default

    # Scoreboard
    def get_scoreboard(self, sport: str, date_str: str = None) -> Dict:
        """
        Get scoreboard/schedule.

        Args:
            sport: Sport key (nfl, nba, mlb, etc.)
            date_str: Date in YYYYMMDD format

        Returns:
            Scoreboard data with games
        """
        params = {}
        if date_str:
            params["dates"] = date_str
        return self._get(sport, "scoreboard", params)

    def get_scores_df(self, sport: str, date_str: str = None) -> pd.DataFrame:
        """Get scoreboard as DataFrame.

        One row per event that has at least two competitors. Scores that are
        missing, null or blank are reported as 0. Events without a
        competition entry are skipped.
        """
        data = self.get_scoreboard(sport, date_str)
        games = []

        for event in data.get("events", []):
            # Guard against a present-but-empty "competitions" list.
            competitions = event.get("competitions") or [{}]
            competition = competitions[0]
            competitors = competition.get("competitors", [])
            if len(competitors) < 2:
                continue

            # Fall back to positional order when homeAway is not labelled.
            home = next((c for c in competitors if c.get("homeAway") == "home"), competitors[0])
            away = next((c for c in competitors if c.get("homeAway") == "away"), competitors[1])

            games.append({
                "game_id": event.get("id"),
                "date": event.get("date"),
                "status": event.get("status", {}).get("type", {}).get("description"),
                "home_team": home.get("team", {}).get("displayName"),
                "home_score": self._to_int(home.get("score", 0)),
                "away_team": away.get("team", {}).get("displayName"),
                "away_score": self._to_int(away.get("score", 0)),
                "venue": competition.get("venue", {}).get("fullName")
            })

        return pd.DataFrame(games)

    # Standings
    def get_standings(self, sport: str, season: int = None) -> Dict:
        """Get league standings."""
        params = {}
        if season:
            params["season"] = season
        return self._get(sport, "standings", params)

    def get_standings_df(self, sport: str) -> pd.DataFrame:
        """Get standings as DataFrame.

        Reads ``children[*].standings.entries``; stats entries that have no
        numeric ``value`` (display-only records) are ignored.
        """
        data = self.get_standings(sport)
        teams = []

        for group in data.get("children", []):
            for team_entry in group.get("standings", {}).get("entries", []):
                team = team_entry.get("team", {})
                # Some stats carry only display strings; keep the numeric ones.
                stats = {
                    s["name"]: s["value"]
                    for s in team_entry.get("stats", [])
                    if "name" in s and s.get("value") is not None
                }

                teams.append({
                    "team": team.get("displayName"),
                    "wins": self._to_int(stats.get("wins", 0)),
                    "losses": self._to_int(stats.get("losses", 0)),
                    "pct": float(stats.get("winPercent", 0)),
                    "gb": stats.get("gamesBehind", "-"),
                    "division": group.get("name", "")
                })

        return pd.DataFrame(teams)

    # Teams
    def get_teams(self, sport: str) -> List[Dict]:
        """Get all teams for a sport."""
        data = self._get(sport, "teams")
        # Each level may be missing or an empty list; treat both as "no teams".
        sports = data.get("sports") or [{}]
        leagues = sports[0].get("leagues") or [{}]

        teams = []
        for group in leagues[0].get("teams", []):
            team = group.get("team", {})
            logos = team.get("logos") or [{}]
            teams.append({
                "id": team.get("id"),
                "name": team.get("displayName"),
                "abbreviation": team.get("abbreviation"),
                "location": team.get("location"),
                "color": team.get("color"),
                "logo": logos[0].get("href")
            })
        return teams

    def get_team_info(self, sport: str, team_id: str) -> Dict:
        """Get detailed team information."""
        return self._get(sport, f"teams/{team_id}")

    def get_team_roster(self, sport: str, team_id: str) -> pd.DataFrame:
        """Get team roster."""
        data = self._get(sport, f"teams/{team_id}/roster")
        athletes = data.get("athletes", [])
        roster = []

        # The payload is read as groups, each holding its athletes in "items".
        for group in athletes:
            for athlete in group.get("items", []):
                roster.append({
                    "id": athlete.get("id"),
                    "name": athlete.get("displayName"),
                    "position": athlete.get("position", {}).get("abbreviation"),
                    "jersey": athlete.get("jersey"),
                    "height": athlete.get("displayHeight"),
                    "weight": athlete.get("displayWeight"),
                    "age": athlete.get("age")
                })

        return pd.DataFrame(roster)

    # News
    def get_news(self, sport: str, limit: int = 10) -> List[Dict]:
        """Get latest news for a sport."""
        data = self._get(sport, "news", {"limit": limit})
        return [
            {
                "headline": article.get("headline"),
                "description": article.get("description"),
                "published": article.get("published"),
                "link": article.get("links", {}).get("web", {}).get("href")
            }
            for article in data.get("articles", [])
        ]


# Example usage
if __name__ == "__main__":
    client = ESPNAPI()

    # Current NFL scoreboard as a table.
    scoreboard = client.get_scores_df("nfl")
    print("NFL Scores:")
    print(scoreboard)

    # First ten rows of the NBA standings table.
    standings = client.get_standings_df("nba")
    print("\nNBA Standings:")
    print(standings.head(10))

    # Count of MLB teams returned by the teams endpoint.
    team_count = len(client.get_teams("mlb"))
    print(f"\nMLB Teams: {team_count}")

python Soccer

Football-Data.org API

Access European soccer data including Premier League, La Liga, Bundesliga, and more.

"""Football-Data.org API client for European soccer."""
import requests
import pandas as pd
from typing import Optional, Dict, List
from datetime import date, datetime

class FootballDataAPI:
    """
    Client for Football-Data.org API.

    Free tier includes:
    - Premier League
    - La Liga
    - Bundesliga
    - Serie A
    - Ligue 1
    - Champions League

    Methods taking a ``competition`` accept either a key of ``COMPETITIONS``
    or a raw competition code, which is passed through unchanged.
    """

    BASE_URL = "https://api.football-data.org/v4"

    # Seconds to wait for a response before giving up.
    TIMEOUT = 30

    COMPETITIONS = {
        "premier_league": "PL",
        "la_liga": "PD",
        "bundesliga": "BL1",
        "serie_a": "SA",
        "ligue_1": "FL1",
        "champions_league": "CL",
        "world_cup": "WC"
    }

    def __init__(self, api_key: str):
        """
        Initialize with API key.

        Get free key at: https://www.football-data.org/client/register
        """
        self.session = requests.Session()
        self.session.headers.update({"X-Auth-Token": api_key})

    def _get(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make API request.

        Raises:
            requests.HTTPError: On a 4xx/5xx response.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        response = self.session.get(url, params=params or {}, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _full_time(match: Dict) -> Dict:
        """Return the match's full-time score dict, or {} when missing/null."""
        return (match.get("score") or {}).get("fullTime") or {}

    # Competitions
    def get_competitions(self) -> List[Dict]:
        """Get all available competitions."""
        data = self._get("competitions")
        return data.get("competitions", [])

    def get_competition(self, code: str) -> Dict:
        """Get competition details."""
        return self._get(f"competitions/{code}")

    # Standings
    def get_standings(self, competition: str) -> pd.DataFrame:
        """Get league standings.

        Only the table whose ``type`` is ``TOTAL`` is used; other tables in
        the response are ignored.
        """
        code = self.COMPETITIONS.get(competition, competition)
        data = self._get(f"competitions/{code}/standings")

        standings = []
        for table in data.get("standings", []):
            if table.get("type") != "TOTAL":
                continue
            for entry in table.get("table", []):
                standings.append({
                    "position": entry.get("position"),
                    "team": entry.get("team", {}).get("name"),
                    "played": entry.get("playedGames"),
                    "won": entry.get("won"),
                    "draw": entry.get("draw"),
                    "lost": entry.get("lost"),
                    "goals_for": entry.get("goalsFor"),
                    "goals_against": entry.get("goalsAgainst"),
                    "goal_diff": entry.get("goalDifference"),
                    "points": entry.get("points")
                })

        return pd.DataFrame(standings)

    # Matches
    def get_matches(
        self,
        competition: str = None,
        date_from: date = None,
        date_to: date = None,
        status: str = None
    ) -> pd.DataFrame:
        """
        Get matches.

        Args:
            competition: Competition code or name
            date_from: Start date
            date_to: End date
            status: SCHEDULED, LIVE, IN_PLAY, PAUSED, FINISHED

        Returns:
            One row per match; ``home_score``/``away_score`` are None when the
            response carries no full-time score.
        """
        params = {}
        if date_from:
            params["dateFrom"] = date_from.strftime("%Y-%m-%d")
        if date_to:
            params["dateTo"] = date_to.strftime("%Y-%m-%d")
        if status:
            params["status"] = status

        if competition:
            code = self.COMPETITIONS.get(competition, competition)
            data = self._get(f"competitions/{code}/matches", params)
        else:
            data = self._get("matches", params)

        matches = []
        for match in data.get("matches", []):
            full_time = self._full_time(match)
            matches.append({
                "id": match.get("id"),
                "competition": match.get("competition", {}).get("name"),
                "date": match.get("utcDate"),
                "status": match.get("status"),
                "home_team": match.get("homeTeam", {}).get("name"),
                "away_team": match.get("awayTeam", {}).get("name"),
                "home_score": full_time.get("home"),
                "away_score": full_time.get("away"),
                "matchday": match.get("matchday")
            })

        return pd.DataFrame(matches)

    # Teams
    def get_teams(self, competition: str) -> pd.DataFrame:
        """Get teams in a competition."""
        code = self.COMPETITIONS.get(competition, competition)
        data = self._get(f"competitions/{code}/teams")

        teams = []
        for team in data.get("teams", []):
            teams.append({
                "id": team.get("id"),
                "name": team.get("name"),
                "short_name": team.get("shortName"),
                "tla": team.get("tla"),
                "venue": team.get("venue"),
                "founded": team.get("founded"),
                # "coach" may be present but null; treat that like missing.
                "coach": (team.get("coach") or {}).get("name")
            })

        return pd.DataFrame(teams)

    def get_team_matches(self, team_id: int, status: str = None) -> pd.DataFrame:
        """Get matches for a specific team.

        The ``score`` column is ``"<home>-<away>"``; a side whose full-time
        score is missing or null is rendered as ``-`` (so an unplayed match
        reads ``---``).
        """
        params = {}
        if status:
            params["status"] = status
        data = self._get(f"teams/{team_id}/matches", params)

        matches = []
        for match in data.get("matches", []):
            full_time = self._full_time(match)
            home = full_time.get("home")
            away = full_time.get("away")
            # The key can exist with a null value, so test for None explicitly
            # rather than relying on a .get() default.
            home_text = "-" if home is None else home
            away_text = "-" if away is None else away
            matches.append({
                "date": match.get("utcDate"),
                "competition": match.get("competition", {}).get("name"),
                "home_team": match.get("homeTeam", {}).get("name"),
                "away_team": match.get("awayTeam", {}).get("name"),
                "score": f"{home_text}-{away_text}"
            })

        return pd.DataFrame(matches)

    # Scorers
    def get_scorers(self, competition: str, limit: int = 10) -> pd.DataFrame:
        """Get top scorers."""
        code = self.COMPETITIONS.get(competition, competition)
        data = self._get(f"competitions/{code}/scorers", {"limit": limit})

        scorers = []
        for scorer in data.get("scorers", []):
            player = scorer.get("player", {})
            team = scorer.get("team", {})
            scorers.append({
                "player": player.get("name"),
                "team": team.get("name"),
                "goals": scorer.get("goals"),
                "assists": scorer.get("assists"),
                "penalties": scorer.get("penalties"),
                "nationality": player.get("nationality")
            })

        return pd.DataFrame(scorers)


# Example usage
if __name__ == "__main__":
    # Replace the placeholder with a key from https://www.football-data.org/
    client = FootballDataAPI("YOUR_API_KEY")

    # Current Premier League table.
    league_table = client.get_standings("premier_league")
    print("Premier League Standings:")
    print(league_table)

    # Top 20 Premier League scorers.
    top_scorers = client.get_scorers("premier_league", 20)
    print("\nTop Scorers:")
    print(top_scorers)

sql Baseball

Create Sports Database Schema (PostgreSQL)

Complete PostgreSQL schema for storing sports statistics with players, teams, games, and performance data.

-- Sports Analytics Database Schema (PostgreSQL)

-- Enable extensions
-- uuid-ossp supplies uuid_generate_v4(), the DEFAULT for every primary key
-- in this schema. IF NOT EXISTS makes the statement safe to re-run.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- Teams table
-- One row per team across all sports. An abbreviation may repeat between
-- sports, but each (abbreviation, sport) pair must be unique.
CREATE TABLE teams (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    name VARCHAR(100) NOT NULL,
    abbreviation VARCHAR(10),
    city VARCHAR(100),
    conference VARCHAR(50),
    division VARCHAR(50),
    sport VARCHAR(50) NOT NULL,
    active BOOLEAN DEFAULT true,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,  -- kept current by the update_teams_timestamp trigger
    UNIQUE(abbreviation, sport)
);

-- Players table
-- One row per player. team_id is nullable, so a player without a current
-- team can still be stored.
CREATE TABLE players (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    first_name VARCHAR(100) NOT NULL,
    last_name VARCHAR(100) NOT NULL,
    team_id UUID REFERENCES teams(id),
    position VARCHAR(50),
    jersey_number INTEGER,
    birth_date DATE,
    height_inches INTEGER,
    weight_lbs INTEGER,
    bats VARCHAR(10),  -- Baseball specific
    throws VARCHAR(10),  -- Baseball specific
    active BOOLEAN DEFAULT true,
    external_id VARCHAR(50),  -- ID from external APIs (indexed below, but not declared unique)
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP  -- kept current by the update_players_timestamp trigger
);

-- Lookup paths: roster by team, search by name (last name first), and
-- matching rows against IDs from external APIs.
CREATE INDEX idx_players_team ON players(team_id);
CREATE INDEX idx_players_name ON players(last_name, first_name);
CREATE INDEX idx_players_external ON players(external_id);

-- Seasons table
-- One row per (sport, year, type), so a regular season and a postseason of
-- the same year are stored as separate rows.
CREATE TABLE seasons (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    sport VARCHAR(50) NOT NULL,
    year INTEGER NOT NULL,
    start_date DATE,
    end_date DATE,
    type VARCHAR(20) DEFAULT 'regular',  -- regular, postseason
    UNIQUE(sport, year, type)
);

-- Games table
-- One row per game. Both team columns reference teams(id) and are required;
-- season_id is optional. Scores are nullable (no value until recorded).
-- NOTE(review): nothing here prevents home_team_id = away_team_id.
CREATE TABLE games (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    season_id UUID REFERENCES seasons(id),
    home_team_id UUID REFERENCES teams(id) NOT NULL,
    away_team_id UUID REFERENCES teams(id) NOT NULL,
    game_date DATE NOT NULL,
    game_time TIME,
    venue VARCHAR(200),
    home_score INTEGER,
    away_score INTEGER,
    status VARCHAR(20) DEFAULT 'scheduled',
    external_id VARCHAR(50),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP  -- kept current by the update_games_timestamp trigger
);

-- Lookup paths: games by date, by (home, away) matchup, and by season.
CREATE INDEX idx_games_date ON games(game_date);
CREATE INDEX idx_games_teams ON games(home_team_id, away_team_id);
CREATE INDEX idx_games_season ON games(season_id);

-- Player game stats
-- One row per (player, game, stat_type). The stat values themselves are not
-- split into columns: they are stored as a JSONB document in `stats`, so
-- different sports/stat types can share this table.
CREATE TABLE player_game_stats (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    player_id UUID REFERENCES players(id) NOT NULL,
    game_id UUID REFERENCES games(id) NOT NULL,
    stat_type VARCHAR(50) NOT NULL,  -- e.g., 'batting', 'pitching', 'passing'
    stats JSONB NOT NULL,  -- Flexible JSON for different stat types
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(player_id, game_id, stat_type)
);

-- B-tree indexes for the foreign keys and stat_type, plus a GIN index over
-- the JSONB document.
CREATE INDEX idx_player_stats_player ON player_game_stats(player_id);
CREATE INDEX idx_player_stats_game ON player_game_stats(game_id);
CREATE INDEX idx_player_stats_type ON player_game_stats(stat_type);
CREATE INDEX idx_player_stats_json ON player_game_stats USING GIN(stats);

-- Season aggregates for faster queries
-- One row per (player, season, stat_type); `stats` holds the aggregated
-- values as JSONB. Nothing in this script populates the table.
CREATE TABLE player_season_stats (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    player_id UUID REFERENCES players(id) NOT NULL,
    season_id UUID REFERENCES seasons(id) NOT NULL,
    stat_type VARCHAR(50) NOT NULL,
    games_played INTEGER DEFAULT 0,
    stats JSONB NOT NULL,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(player_id, season_id, stat_type)
);

-- Trigger to update timestamps
-- update_timestamp() overwrites NEW.updated_at with the current time. It is
-- attached as a BEFORE UPDATE row trigger to every table that has an
-- updated_at column, so the column tracks the last modification without any
-- help from the application.
CREATE OR REPLACE FUNCTION update_timestamp()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER update_teams_timestamp BEFORE UPDATE ON teams
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();

CREATE TRIGGER update_players_timestamp BEFORE UPDATE ON players
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();

CREATE TRIGGER update_games_timestamp BEFORE UPDATE ON games
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();

-- player_season_stats also has updated_at; without this trigger the column
-- would keep its insert-time value after the aggregates are refreshed.
CREATE TRIGGER update_player_season_stats_timestamp BEFORE UPDATE ON player_season_stats
    FOR EACH ROW EXECUTE FUNCTION update_timestamp();

python Baseball

SQLAlchemy ORM Models for Sports Data

Python SQLAlchemy ORM models for sports analytics database with relationships and query methods.

"""SQLAlchemy ORM models for sports analytics database."""
import uuid
from datetime import date, datetime
from typing import List, Optional, Dict, Any

from sqlalchemy import (
    create_engine, Column, Integer, String, Float, Date, DateTime, Time,
    ForeignKey, Boolean, JSON, Index, UniqueConstraint
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker, Session
from sqlalchemy.dialects.postgresql import UUID

# Shared declarative base: every model class in this module subclasses it, and
# Base.metadata is what the example at the bottom uses to create the tables.
Base = declarative_base()


class Team(Base):
    """Team model.

    Maps the ``teams`` table. A team owns its players and is referenced twice
    by ``Game`` (once as home side, once as away side), hence the two
    separate game relationships.
    """
    __tablename__ = 'teams'

    # Primary key is generated client-side with uuid.uuid4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(100), nullable=False)
    abbreviation = Column(String(10))
    city = Column(String(100))
    conference = Column(String(50))
    division = Column(String(50))
    sport = Column(String(50), nullable=False)
    active = Column(Boolean, default=True)
    # Timestamps are filled in by SQLAlchemy on insert (and on update for updated_at).
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    players = relationship("Player", back_populates="team")
    # Game has two foreign keys to teams, so each relationship names the one it follows.
    home_games = relationship("Game", foreign_keys="Game.home_team_id", back_populates="home_team")
    away_games = relationship("Game", foreign_keys="Game.away_team_id", back_populates="away_team")

    # The same abbreviation may exist in different sports, but not twice in one sport.
    __table_args__ = (
        UniqueConstraint('abbreviation', 'sport', name='uq_team_abbr_sport'),
    )


class Player(Base):
    """Player model.

    Maps the ``players`` table, including the baseball-specific ``bats`` and
    ``throws`` columns. ``team_id`` is nullable, so a player may exist
    without a team.
    """
    __tablename__ = 'players'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    first_name = Column(String(100), nullable=False)
    last_name = Column(String(100), nullable=False)
    team_id = Column(UUID(as_uuid=True), ForeignKey('teams.id'))
    position = Column(String(50))
    jersey_number = Column(Integer)
    birth_date = Column(Date)
    height_inches = Column(Integer)
    weight_lbs = Column(Integer)
    bats = Column(String(10))  # Baseball specific
    throws = Column(String(10))  # Baseball specific
    active = Column(Boolean, default=True)
    external_id = Column(String(50))  # ID from external APIs
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    team = relationship("Team", back_populates="players")
    game_stats = relationship("PlayerGameStats", back_populates="player")
    season_stats = relationship("PlayerSeasonStats", back_populates="player")

    @property
    def full_name(self) -> str:
        """Return ``"<first_name> <last_name>"``."""
        return f"{self.first_name} {self.last_name}"

    @property
    def height_formatted(self) -> str:
        """Return the height as feet'inches" (e.g. 6'2"), or "" when unset or 0."""
        if self.height_inches:
            feet, inches = divmod(self.height_inches, 12)
            return f"{feet}'{inches}\""
        return ""

    __table_args__ = (
        Index('idx_players_team', 'team_id'),
        Index('idx_players_name', 'last_name', 'first_name'),
        Index('idx_players_external', 'external_id'),
    )


class Season(Base):
    """Season model.

    Maps the ``seasons`` table: one row per (sport, year, type).
    """
    __tablename__ = 'seasons'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    sport = Column(String(50), nullable=False)
    year = Column(Integer, nullable=False)
    start_date = Column(Date)
    end_date = Column(Date)
    # Defaults to 'regular'; the column itself accepts any string up to 20 chars.
    type = Column(String(20), default='regular')

    games = relationship("Game", back_populates="season")
    player_stats = relationship("PlayerSeasonStats", back_populates="season")

    # A sport can have several rows for one year as long as the type differs.
    __table_args__ = (
        UniqueConstraint('sport', 'year', 'type', name='uq_season'),
    )


class Game(Base):
    """Game model.

    Maps the ``games`` table. Both team foreign keys are required; the
    season is optional. Scores stay ``None`` until they are recorded.
    """
    __tablename__ = 'games'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    season_id = Column(UUID(as_uuid=True), ForeignKey('seasons.id'))
    home_team_id = Column(UUID(as_uuid=True), ForeignKey('teams.id'), nullable=False)
    away_team_id = Column(UUID(as_uuid=True), ForeignKey('teams.id'), nullable=False)
    game_date = Column(Date, nullable=False)
    game_time = Column(Time)
    venue = Column(String(200))
    home_score = Column(Integer)
    away_score = Column(Integer)
    status = Column(String(20), default='scheduled')
    external_id = Column(String(50))
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    season = relationship("Season", back_populates="games")
    # Two foreign keys point at teams, so each relationship names its own.
    home_team = relationship("Team", foreign_keys=[home_team_id], back_populates="home_games")
    away_team = relationship("Team", foreign_keys=[away_team_id], back_populates="away_games")
    player_stats = relationship("PlayerGameStats", back_populates="game")

    @property
    def winner(self) -> Optional["Team"]:
        """Return the team with the higher score.

        Returns ``None`` when either score is missing or the scores are tied.
        """
        if self.home_score is not None and self.away_score is not None:
            if self.home_score > self.away_score:
                return self.home_team
            elif self.away_score > self.home_score:
                return self.away_team
        return None

    # Same lookup indexes as the SQL schema: by date, by matchup, by season.
    __table_args__ = (
        Index('idx_games_date', 'game_date'),
        Index('idx_games_teams', 'home_team_id', 'away_team_id'),
        Index('idx_games_season', 'season_id'),
    )


class PlayerGameStats(Base):
    """Player game statistics with JSON stats.

    One row per (player, game, stat_type); the stat values are stored as a
    JSON document in ``stats`` rather than as individual columns.
    """
    __tablename__ = 'player_game_stats'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    player_id = Column(UUID(as_uuid=True), ForeignKey('players.id'), nullable=False)
    game_id = Column(UUID(as_uuid=True), ForeignKey('games.id'), nullable=False)
    stat_type = Column(String(50), nullable=False)  # e.g. 'batting', 'pitching', 'passing'
    stats = Column(JSON, nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow)

    player = relationship("Player", back_populates="game_stats")
    game = relationship("Game", back_populates="player_stats")

    __table_args__ = (
        UniqueConstraint('player_id', 'game_id', 'stat_type', name='uq_player_game_stat'),
        # B-tree lookup indexes matching the SQL schema. The schema's GIN index
        # on `stats` is not mirrored here because this column uses the generic
        # JSON type rather than JSONB.
        Index('idx_player_stats_player', 'player_id'),
        Index('idx_player_stats_game', 'game_id'),
        Index('idx_player_stats_type', 'stat_type'),
    )


class PlayerSeasonStats(Base):
    """Aggregated season statistics.

    One row per (player, season, stat_type); ``stats`` holds the aggregated
    values as a JSON document.
    """
    __tablename__ = 'player_season_stats'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    player_id = Column(UUID(as_uuid=True), ForeignKey('players.id'), nullable=False)
    season_id = Column(UUID(as_uuid=True), ForeignKey('seasons.id'), nullable=False)
    stat_type = Column(String(50), nullable=False)
    games_played = Column(Integer, default=0)
    stats = Column(JSON, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    player = relationship("Player", back_populates="season_stats")
    season = relationship("Season", back_populates="player_stats")

    # Same uniqueness rule as the SQL schema: at most one aggregate row per
    # player, season and stat type.
    __table_args__ = (
        UniqueConstraint('player_id', 'season_id', 'stat_type', name='uq_player_season_stat'),
    )


# Database connection and session management
def get_engine(database_url: str):
    """Create and return a SQLAlchemy engine for *database_url*."""
    engine = create_engine(database_url)
    return engine

def get_session(engine) -> Session:
    """Return a new session from a session factory bound to *engine*."""
    return sessionmaker(bind=engine)()

# Example usage
if __name__ == "__main__":
    DATABASE_URL = "postgresql://user:password@localhost/sports_db"
    engine = get_engine(DATABASE_URL)

    # Emit CREATE TABLE for every model registered on Base.
    Base.metadata.create_all(engine)

    # Insert one team inside a session that is closed when the block exits.
    with get_session(engine) as session:
        yankees = Team(name="New York Yankees", abbreviation="NYY", sport="baseball")
        session.add(yankees)
        session.commit()

python Baseball

Bulk Insert Sports Data with pandas

Efficiently bulk insert sports statistics from pandas DataFrames into a database using SQLAlchemy.

"""Bulk insert sports data from pandas to database."""
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session
from typing import Optional
import uuid

def bulk_insert_players(
    df: pd.DataFrame,
    engine,
    team_mapping: dict,
    chunk_size: int = 1000
) -> int:
    """
    Bulk insert player data from DataFrame.

    The input frame is never modified; all work happens on a copy. A full-name
    column (``player_name`` or, failing that, ``name``) is split on the first
    space into ``first_name`` / ``last_name``; single-word names get an empty
    ``last_name``. Short source column names (``pos``, ``number``, ``ht``,
    ``wt``) are renamed to the schema names, and only columns that exist in
    the ``players`` schema are written.

    Args:
        df: DataFrame with player data
        engine: SQLAlchemy engine
        team_mapping: Dict mapping team names to IDs
        chunk_size: Records per chunk

    Returns:
        Number of rows inserted

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Prepare data
    df = df.copy()
    if df.empty:
        return 0

    # Map team names to IDs (unmapped teams become NaN / NULL)
    if 'team' in df.columns:
        df['team_id'] = df['team'].map(team_mapping)

    # Generate UUIDs
    df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]

    # Split the full name BEFORE any renaming. Renaming 'player_name' away
    # first would hide the column from this check and leave the whole name
    # in 'last_name' with no 'first_name' at all.
    name_col = next((c for c in ('player_name', 'name') if c in df.columns), None)
    if name_col is not None:
        parts = df[name_col].str.split(' ', n=1, expand=True)
        df['first_name'] = parts[0]
        # A one-word name leaves the second part missing; store '' not NULL.
        df['last_name'] = parts[1].fillna('') if parts.shape[1] > 1 else ''

    # Rename source columns to match schema
    column_mapping = {
        'pos': 'position',
        'number': 'jersey_number',
        'ht': 'height_inches',
        'wt': 'weight_lbs'
    }
    df = df.rename(columns=column_mapping)

    # Select only valid columns
    valid_columns = ['id', 'first_name', 'last_name', 'team_id', 'position',
                    'jersey_number', 'height_inches', 'weight_lbs', 'external_id']
    df = df[[c for c in valid_columns if c in df.columns]]

    # Bulk insert
    total_inserted = 0
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size]
        chunk.to_sql('players', engine, if_exists='append', index=False, method='multi')
        total_inserted += len(chunk)
        print(f"Inserted {total_inserted}/{len(df)} players")

    return total_inserted


def bulk_insert_game_stats(
    df: pd.DataFrame,
    engine,
    player_mapping: dict,
    game_mapping: dict,
    stat_type: str = 'batting',
    chunk_size: int = 500
) -> int:
    """
    Bulk insert game statistics with JSON stats column.

    ``player_id`` and ``game_id`` are translated through the supplied
    mappings; rows whose IDs cannot be mapped are skipped. Every remaining
    column is packed into a per-row JSON object (missing values omitted) and
    written to ``player_game_stats``. The input frame is not modified.

    Args:
        df: DataFrame with game statistics
        engine: SQLAlchemy engine
        player_mapping: Dict mapping player names/IDs to database IDs
        game_mapping: Dict mapping game identifiers to database IDs
        stat_type: Type of statistics
        chunk_size: Records per chunk

    Returns:
        Number of rows inserted

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    import json

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Map to database IDs on a new frame, then drop unmapped rows up front so
    # no JSON is built for records that will never be inserted.
    mapped = df.assign(
        player_id=df['player_id'].map(player_mapping),
        game_id=df['game_id'].map(game_mapping),
    ).dropna(subset=['player_id', 'game_id']).copy()

    # Generate IDs
    mapped['id'] = [str(uuid.uuid4()) for _ in range(len(mapped))]
    mapped['stat_type'] = stat_type

    # Identify stat columns (everything not id/mapping related)
    id_columns = ['id', 'player_id', 'game_id', 'stat_type']
    stat_columns = [c for c in mapped.columns if c not in id_columns]

    # Build the JSON column from column-wise records rather than a row-wise
    # apply: a row Series has a single dtype, so integer stats sitting next
    # to float stats would be serialized as floats ("hits": 2.0).
    if stat_columns:
        records = mapped[stat_columns].to_dict(orient='records')
    else:
        records = [{} for _ in range(len(mapped))]

    is_scalar = pd.api.types.is_scalar
    mapped['stats'] = [
        json.dumps({
            key: value
            for key, value in record.items()
            if not (is_scalar(value) and pd.isna(value))
        })
        for record in records
    ]

    # Select final columns
    insert_df = mapped[['id', 'player_id', 'game_id', 'stat_type', 'stats']]

    # Bulk insert
    total_inserted = 0
    for i in range(0, len(insert_df), chunk_size):
        chunk = insert_df.iloc[i:i+chunk_size]
        chunk.to_sql('player_game_stats', engine, if_exists='append', index=False)
        total_inserted += len(chunk)
        print(f"Inserted {total_inserted}/{len(insert_df)} stat records")

    return total_inserted


def upsert_season_aggregates(
    engine,
    season_id: str,
    stat_type: str = 'batting'
):
    """
    Roll per-game stats up into per-season rows with a single SQL upsert.

    For every player with game stats of ``stat_type`` in the given season,
    one ``player_season_stats`` row is inserted, or updated when a row for
    the same (player, season, stat type) already exists.

    Args:
        engine: SQLAlchemy engine
        season_id: Database ID of the season to aggregate
        stat_type: Type of statistics to aggregate

    Returns:
        The ``rowcount`` reported by the executed statement.
    """
    bind_values = {"season_id": season_id, "stat_type": stat_type}

    statement = text("""
        INSERT INTO player_season_stats (id, player_id, season_id, stat_type, games_played, stats)
        SELECT
            gen_random_uuid() as id,
            pgs.player_id,
            :season_id as season_id,
            :stat_type as stat_type,
            COUNT(*) as games_played,
            jsonb_build_object(
                'games', COUNT(*),
                'total_stats', jsonb_agg(pgs.stats)
            ) as stats
        FROM player_game_stats pgs
        JOIN games g ON pgs.game_id = g.id
        WHERE g.season_id = :season_id AND pgs.stat_type = :stat_type
        GROUP BY pgs.player_id
        ON CONFLICT (player_id, season_id, stat_type)
        DO UPDATE SET
            games_played = EXCLUDED.games_played,
            stats = EXCLUDED.stats,
            updated_at = CURRENT_TIMESTAMP
    """)

    # engine.begin() commits on success and rolls back on error.
    with engine.begin() as connection:
        outcome = connection.execute(statement, bind_values)
        return outcome.rowcount


# Example usage
if __name__ == "__main__":
    # pybaseball is only needed for this demo, hence the local import.
    import pybaseball as pyb

    # Demo only: placeholder credentials.
    DATABASE_URL = "postgresql://user:password@localhost/sports_db"
    engine = create_engine(DATABASE_URL)

    # Get batting data
    batting = pyb.batting_stats(2024)

    # Create team mapping (simplified)
    # The values are placeholders; real runs need the teams' database IDs.
    team_mapping = {'NYY': 'uuid-here', 'BOS': 'uuid-here'}

    # Insert (would need proper mappings)
    # inserted = bulk_insert_game_stats(batting, engine, player_map, game_map)

python Baseball

Query Sports Stats with pandas read_sql

Efficient SQL queries for sports analytics using pandas and SQLAlchemy with parameterized queries.

"""Query sports statistics database with pandas."""
import pandas as pd
from sqlalchemy import create_engine, text
from typing import Optional, List

class SportsStatsQuery:
    """Query builder for sports statistics.

    Every query is parameterized (bound through ``sqlalchemy.text``) and the
    result is returned as a pandas DataFrame. Where a result has a JSON
    ``stats`` column it is flattened into one DataFrame column per key.
    """

    def __init__(self, database_url: str):
        """Create the SQLAlchemy engine used by all queries."""
        self.engine = create_engine(database_url)

    @staticmethod
    def _expand_stats(df: pd.DataFrame) -> pd.DataFrame:
        """Replace a JSON ``stats`` column with one column per JSON key.

        Frames without a ``stats`` column, or with no rows, are returned
        unchanged.
        """
        if 'stats' not in df.columns or len(df) == 0:
            return df
        stats_df = pd.json_normalize(df['stats'])
        return pd.concat([df.drop('stats', axis=1), stats_df], axis=1)

    def get_player_season_stats(
        self,
        season_year: int,
        sport: str,
        stat_type: str = 'batting',
        min_games: int = 0
    ) -> pd.DataFrame:
        """
        Get player statistics for a season.

        Args:
            season_year: Season year to match against ``seasons.year``
            sport: Sport to match against ``seasons.sport``
            stat_type: Type of statistics
            min_games: Minimum ``games_played`` for a row to be returned

        Returns:
            One row per player, ordered by games played (descending), with
            the JSON stats flattened into columns.
        """
        query = """
            SELECT
                p.first_name,
                p.last_name,
                t.name as team,
                p.position,
                pss.games_played,
                pss.stats
            FROM player_season_stats pss
            JOIN players p ON pss.player_id = p.id
            JOIN teams t ON p.team_id = t.id
            JOIN seasons s ON pss.season_id = s.id
            WHERE s.year = :year
              AND s.sport = :sport
              AND pss.stat_type = :stat_type
              AND pss.games_played >= :min_games
            ORDER BY pss.games_played DESC
        """

        df = pd.read_sql(
            text(query),
            self.engine,
            params={
                'year': season_year,
                'sport': sport,
                'stat_type': stat_type,
                'min_games': min_games
            }
        )

        return self._expand_stats(df)

    def get_team_standings(
        self,
        season_year: int,
        sport: str,
        division: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Calculate team standings from game results.

        Only games whose status is ``'final'`` are counted. Each game
        contributes one row per participating team; ties count as neither a
        win nor a loss.

        Args:
            season_year: Season year to match against ``seasons.year``
            sport: Sport to match against ``seasons.sport``
            division: If given, keep only teams in this division

        Returns:
            One row per team with wins, losses, win percentage and run
            totals, ordered by win percentage (descending).
        """
        query = """
            WITH game_results AS (
                SELECT
                    t.id as team_id,
                    t.name as team,
                    t.division,
                    t.conference,
                    CASE
                        WHEN g.home_team_id = t.id AND g.home_score > g.away_score THEN 1
                        WHEN g.away_team_id = t.id AND g.away_score > g.home_score THEN 1
                        ELSE 0
                    END as win,
                    CASE
                        WHEN g.home_team_id = t.id AND g.home_score < g.away_score THEN 1
                        WHEN g.away_team_id = t.id AND g.away_score < g.home_score THEN 1
                        ELSE 0
                    END as loss,
                    CASE WHEN g.home_team_id = t.id THEN g.home_score ELSE g.away_score END as runs_scored,
                    CASE WHEN g.home_team_id = t.id THEN g.away_score ELSE g.home_score END as runs_allowed
                FROM teams t
                JOIN games g ON t.id = g.home_team_id OR t.id = g.away_team_id
                JOIN seasons s ON g.season_id = s.id
                WHERE s.year = :year
                  AND s.sport = :sport
                  AND g.status = 'final'
            )
            SELECT
                team,
                division,
                conference,
                SUM(win) as wins,
                SUM(loss) as losses,
                ROUND(SUM(win)::numeric / NULLIF(SUM(win) + SUM(loss), 0), 3) as win_pct,
                SUM(runs_scored) as runs_scored,
                SUM(runs_allowed) as runs_allowed,
                SUM(runs_scored) - SUM(runs_allowed) as run_diff
            FROM game_results
            GROUP BY team_id, team, division, conference
            ORDER BY win_pct DESC
        """

        df = pd.read_sql(
            text(query),
            self.engine,
            params={'year': season_year, 'sport': sport}
        )

        # The division filter is applied client-side on the returned frame.
        if division:
            df = df[df['division'] == division]

        return df

    def get_player_game_log(
        self,
        player_name: str,
        season_year: int,
        stat_type: str = 'batting'
    ) -> pd.DataFrame:
        """
        Get player's game-by-game statistics.

        Args:
            player_name: Case-insensitive substring matched against
                "first_name last_name"
            season_year: Season year to match against ``seasons.year``
            stat_type: Type of statistics

        Returns:
            One row per game in date order, with a home/away marker, the
            opponent abbreviation and the JSON stats flattened into columns.
        """
        query = """
            SELECT
                g.game_date,
                CASE
                    WHEN g.home_team_id = t.id THEN 'vs'
                    ELSE '@'
                END as home_away,
                opp.abbreviation as opponent,
                pgs.stats
            FROM player_game_stats pgs
            JOIN players p ON pgs.player_id = p.id
            JOIN teams t ON p.team_id = t.id
            JOIN games g ON pgs.game_id = g.id
            JOIN teams opp ON (
                CASE
                    WHEN g.home_team_id = t.id THEN g.away_team_id
                    ELSE g.home_team_id
                END = opp.id
            )
            JOIN seasons s ON g.season_id = s.id
            WHERE CONCAT(p.first_name, ' ', p.last_name) ILIKE :player_name
              AND s.year = :year
              AND pgs.stat_type = :stat_type
            ORDER BY g.game_date
        """

        df = pd.read_sql(
            text(query),
            self.engine,
            params={
                'player_name': f'%{player_name}%',
                'year': season_year,
                'stat_type': stat_type
            }
        )

        return self._expand_stats(df)

    def compare_players(
        self,
        player_names: List[str],
        season_year: int,
        stats: List[str],
        stat_type: str = 'batting'
    ) -> pd.DataFrame:
        """
        Compare multiple players' statistics.

        Args:
            player_names: Exact "first_name last_name" strings to look up
            season_year: Season year to match against ``seasons.year``
            stats: Stat keys to keep; keys absent from the data are ignored
            stat_type: Type of statistics

        Returns:
            One row per matched player with ``player``, ``team``,
            ``games_played`` and the requested stats. When nothing matches
            (including an empty ``player_names``) the frame has no rows and
            keeps the raw ``stats`` column.
        """
        # An empty list would render as "IN ()", which is invalid SQL, so
        # answer with the same empty shape a no-match query produces.
        if not player_names:
            return pd.DataFrame(columns=['player', 'team', 'games_played', 'stats'])

        # Only the placeholder names are interpolated; values stay bound.
        placeholders = ', '.join([f':player_{i}' for i in range(len(player_names))])

        query = f"""
            SELECT
                CONCAT(p.first_name, ' ', p.last_name) as player,
                t.name as team,
                pss.games_played,
                pss.stats
            FROM player_season_stats pss
            JOIN players p ON pss.player_id = p.id
            JOIN teams t ON p.team_id = t.id
            JOIN seasons s ON pss.season_id = s.id
            WHERE CONCAT(p.first_name, ' ', p.last_name) IN ({placeholders})
              AND s.year = :year
              AND pss.stat_type = :stat_type
        """

        params = {f'player_{i}': name for i, name in enumerate(player_names)}
        params['year'] = season_year
        params['stat_type'] = stat_type

        df = pd.read_sql(text(query), self.engine, params=params)

        # Expand and select specific stats
        if 'stats' in df.columns and len(df) > 0:
            df = self._expand_stats(df)

            # Filter to requested stats
            base_cols = ['player', 'team', 'games_played']
            available_stats = [s for s in stats if s in df.columns]
            df = df[base_cols + available_stats]

        return df


# Example usage
if __name__ == "__main__":
    # Demo only: placeholder credentials.
    db = SportsStatsQuery("postgresql://user:password@localhost/sports_db")

    # Get season batting stats
    batting = db.get_player_season_stats(2024, 'baseball', 'batting', min_games=100)
    print(batting.head())

    # Get standings
    standings = db.get_team_standings(2024, 'baseball')
    print(standings)

    # Compare players
    # Names must match "first_name last_name" exactly; the last argument
    # lists the stat keys to keep in the result.
    comparison = db.compare_players(
        ['Mike Trout', 'Aaron Judge', 'Mookie Betts'],
        2024,
        ['avg', 'hr', 'rbi', 'ops']
    )
    print(comparison)

python Baseball

Redis Caching for Live Sports Data

Implement Redis caching for live sports scores and frequently accessed statistics.

"""Redis caching for live sports data."""
import redis
import json
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List
import hashlib

class SportsDataCache:
    """
    Redis cache for sports data with TTL management.

    Supports:
    - Live scores (short TTL)
    - Player stats (medium TTL)
    - Historical data (long TTL)

    Values are stored as JSON strings. Keys are colon-joined, e.g.
    ``live:<sport>:<date>`` or ``player:<id>:<season>:<stat_type>``.
    """

    def __init__(
        self,
        host: str = 'localhost',
        port: int = 6379,
        db: int = 0,
        password: Optional[str] = None
    ):
        """Connect to Redis and set the per-category TTL table."""
        self.redis = redis.Redis(
            host=host,
            port=port,
            db=db,
            password=password,
            decode_responses=True
        )

        # TTL settings (seconds)
        self.ttl = {
            'live_scores': 30,       # 30 seconds
            'game_stats': 300,       # 5 minutes
            'player_stats': 3600,    # 1 hour
            'standings': 1800,       # 30 minutes
            'historical': 86400,     # 24 hours
        }

    def _make_key(self, prefix: str, *args) -> str:
        """Create a cache key from prefix and arguments."""
        key_parts = [prefix] + [str(a) for a in args]
        return ':'.join(key_parts)

    def _hash_params(self, params: Dict) -> str:
        """Create hash of parameters for cache key.

        NOTE(review): only the first 8 hex characters (32 bits) of the MD5
        digest are kept, so distinct parameter sets can collide and share a
        cache entry. Widening the hash would change existing key names.
        """
        param_str = json.dumps(params, sort_keys=True)
        return hashlib.md5(param_str.encode()).hexdigest()[:8]

    def _store_json(self, key: str, ttl_seconds: int, payload: Any) -> None:
        """Serialize ``payload`` to JSON and store it under ``key`` with a TTL."""
        self.redis.setex(key, ttl_seconds, json.dumps(payload))

    def _load_json(self, key: str) -> Optional[Any]:
        """Return the decoded JSON value at ``key``, or None when absent/empty."""
        data = self.redis.get(key)
        return json.loads(data) if data else None

    # Live Scores
    def set_live_scores(self, sport: str, date: str, scores: List[Dict]):
        """Cache live scores for a sport and date."""
        key = self._make_key('live', sport, date)
        self._store_json(key, self.ttl['live_scores'], scores)

    def get_live_scores(self, sport: str, date: str) -> Optional[List[Dict]]:
        """Get cached live scores."""
        return self._load_json(self._make_key('live', sport, date))

    def update_single_score(self, sport: str, date: str, game_id: str, score_data: Dict):
        """Update a single game score in the cache.

        The read-modify-write runs as an optimistic transaction: the key is
        WATCHed, read, and rewritten inside MULTI/EXEC, and the whole step is
        retried by ``Redis.transaction`` if another client changed the key in
        between. If the game is not yet in the cached list (or nothing is
        cached), a new entry carrying ``game_id`` is added rather than the
        update being dropped.
        """
        key = self._make_key('live', sport, date)
        ttl_seconds = self.ttl['live_scores']

        def _apply(pipe):
            # After WATCH the pipeline executes reads immediately.
            existing = pipe.get(key)
            scores = json.loads(existing) if existing else []
            for game in scores:
                if game.get('game_id') == game_id:
                    game.update(score_data)
                    break
            else:
                scores.append({**score_data, 'game_id': game_id})
            # Everything queued after multi() is applied atomically on EXEC.
            pipe.multi()
            pipe.setex(key, ttl_seconds, json.dumps(scores))

        self.redis.transaction(_apply, key)

    # Player Stats
    def set_player_stats(
        self,
        player_id: str,
        season: int,
        stat_type: str,
        stats: Dict
    ):
        """Cache player statistics."""
        key = self._make_key('player', player_id, season, stat_type)
        self._store_json(key, self.ttl['player_stats'], stats)

    def get_player_stats(
        self,
        player_id: str,
        season: int,
        stat_type: str
    ) -> Optional[Dict]:
        """Get cached player statistics."""
        return self._load_json(self._make_key('player', player_id, season, stat_type))

    # Standings
    def set_standings(self, sport: str, season: int, standings: List[Dict]):
        """Cache league standings."""
        key = self._make_key('standings', sport, season)
        self._store_json(key, self.ttl['standings'], standings)

    def get_standings(self, sport: str, season: int) -> Optional[List[Dict]]:
        """Get cached standings."""
        return self._load_json(self._make_key('standings', sport, season))

    # Query Cache
    def cache_query_result(
        self,
        query_type: str,
        params: Dict,
        result: Any,
        ttl_type: str = 'player_stats'
    ):
        """Cache arbitrary query result.

        ``ttl_type`` selects an entry of the TTL table; unknown names fall
        back to 3600 seconds.
        """
        param_hash = self._hash_params(params)
        key = self._make_key('query', query_type, param_hash)
        self._store_json(key, self.ttl.get(ttl_type, 3600), result)

    def get_cached_query(self, query_type: str, params: Dict) -> Optional[Any]:
        """Get cached query result."""
        param_hash = self._hash_params(params)
        return self._load_json(self._make_key('query', query_type, param_hash))

    # Leaderboards using sorted sets
    def update_leaderboard(
        self,
        stat_name: str,
        season: int,
        player_id: str,
        value: float
    ):
        """Update a statistical leaderboard (sorted set scored by ``value``)."""
        key = self._make_key('leaderboard', stat_name, season)
        self.redis.zadd(key, {player_id: value})

    def get_leaderboard(
        self,
        stat_name: str,
        season: int,
        top_n: int = 10,
        descending: bool = True
    ) -> List[tuple]:
        """Get top N from leaderboard as (member, score) pairs."""
        key = self._make_key('leaderboard', stat_name, season)
        if descending:
            return self.redis.zrevrange(key, 0, top_n - 1, withscores=True)
        return self.redis.zrange(key, 0, top_n - 1, withscores=True)

    # Cache invalidation
    def invalidate_player(self, player_id: str):
        """Invalidate all cache entries for a player.

        Keys are discovered with incremental SCAN rather than KEYS, which
        blocks the server while it walks the whole keyspace. Deletes are
        issued in batches to bound the size of each command.
        """
        pattern = f'player:{player_id}:*'
        batch_size = 500
        batch = []
        for key in self.redis.scan_iter(match=pattern, count=batch_size):
            batch.append(key)
            if len(batch) >= batch_size:
                self.redis.delete(*batch)
                batch = []
        if batch:
            self.redis.delete(*batch)

    def invalidate_sport_date(self, sport: str, date: str):
        """Invalidate live scores for a sport/date."""
        key = self._make_key('live', sport, date)
        self.redis.delete(key)


# Example usage
if __name__ == "__main__":
    # Uses the constructor defaults (localhost:6379, db 0, no password).
    cache = SportsDataCache()

    # Cache live scores
    scores = [
        {'game_id': '123', 'home': 'NYY', 'away': 'BOS', 'home_score': 5, 'away_score': 3},
        {'game_id': '124', 'home': 'LAD', 'away': 'SFG', 'home_score': 2, 'away_score': 2}
    ]
    cache.set_live_scores('mlb', '2024-06-15', scores)

    # Retrieve
    # Returns None once the live-score TTL has elapsed.
    cached = cache.get_live_scores('mlb', '2024-06-15')
    print(cached)

    # Update leaderboard
    cache.update_leaderboard('hr', 2024, 'player_1', 45)
    cache.update_leaderboard('hr', 2024, 'player_2', 42)

    # Get HR leaders
    leaders = cache.get_leaderboard('hr', 2024)
    print(leaders)

python Baseball

MongoDB Schema for Sports Events

NoSQL MongoDB schema design for storing sports events, play-by-play data, and flexible statistics.

"""MongoDB schema and operations for sports analytics."""
from pymongo import MongoClient, ASCENDING, DESCENDING
from pymongo.collection import Collection
from datetime import datetime
from typing import Dict, List, Optional, Any
from bson import ObjectId

class SportsMongoDB:
    """
    MongoDB operations for sports analytics.

    Collections:
    - games: Game metadata and scores
    - plays: Play-by-play data
    - players: Player information
    - stats: Aggregated statistics
    """

    def __init__(self, connection_string: str, db_name: str = 'sports_analytics'):
        """Connect to MongoDB, select ``db_name`` and ensure indexes exist."""
        self.client = MongoClient(connection_string)
        self.db = self.client[db_name]
        self._setup_indexes()

    def _setup_indexes(self):
        """Create necessary indexes."""
        # Games collection
        self.db.games.create_index([
            ('sport', ASCENDING),
            ('date', DESCENDING)
        ])
        self.db.games.create_index('external_id', unique=True)

        # Plays collection (for play-by-play)
        self.db.plays.create_index([
            ('game_id', ASCENDING),
            ('sequence', ASCENDING)
        ])
        self.db.plays.create_index('player_ids')

        # Players collection
        self.db.players.create_index('external_id', unique=True)
        self.db.players.create_index('name')

        # Stats collection
        self.db.stats.create_index([
            ('player_id', ASCENDING),
            ('season', ASCENDING),
            ('stat_type', ASCENDING)
        ], unique=True)

    # Game Operations
    def insert_game(self, game_data: Dict) -> str:
        """
        Insert a game document.

        ``game_data`` is modified in place: ``created_at`` and ``updated_at``
        are set to the same UTC timestamp before the insert.

        Schema:
        {
            sport: "baseball",
            date: ISODate,
            home_team: { id, name, abbreviation },
            away_team: { id, name, abbreviation },
            venue: { name, city, state },
            weather: { temp, wind, conditions },
            score: { home: 5, away: 3 },
            innings: [...],  # Sport-specific
            status: "final",
            external_id: "mlb_123456"
        }

        Returns:
            The inserted document's ``_id`` as a string.
        """
        # One timestamp for both fields so a new document never shows
        # updated_at != created_at.
        now = datetime.utcnow()
        game_data['created_at'] = now
        game_data['updated_at'] = now
        result = self.db.games.insert_one(game_data)
        return str(result.inserted_id)

    def update_game_score(self, game_id: str, score: Dict, status: Optional[str] = None):
        """Update game score, and the status too when one is given."""
        update = {
            '$set': {
                'score': score,
                'updated_at': datetime.utcnow()
            }
        }
        if status:
            update['$set']['status'] = status
        self.db.games.update_one({'_id': ObjectId(game_id)}, update)

    def get_games_by_date(self, sport: str, date: datetime) -> List[Dict]:
        """Get all games for a sport on a date, ordered by start time.

        The day is matched as the half-open interval [00:00, next day 00:00)
        so timestamps in the final second of the day (e.g. 23:59:59.500) are
        not missed.
        """
        from datetime import timedelta

        start = datetime(date.year, date.month, date.day)
        end = start + timedelta(days=1)

        return list(self.db.games.find({
            'sport': sport,
            'date': {'$gte': start, '$lt': end}
        }).sort('date', ASCENDING))

    # Play-by-Play Operations
    def insert_plays(self, game_id: str, plays: List[Dict]):
        """
        Insert play-by-play data.

        Each play dict is modified in place: ``game_id`` (as ObjectId) and
        ``created_at`` are set before the insert. An empty ``plays`` list is
        a no-op.

        NOTE(review): ``get_player_plays`` and ``aggregate_batting_stats``
        filter on a ``season`` field that this method does not set; callers
        presumably need to include it in each play.

        Play schema (baseball example):
        {
            game_id: ObjectId,
            sequence: 1,
            inning: 1,
            half: "top",
            outs_before: 0,
            outs_after: 1,
            runners_before: [],
            runners_after: [],
            batter_id: "player_123",
            pitcher_id: "player_456",
            player_ids: ["player_123", "player_456"],
            event_type: "strikeout",
            description: "Batter struck out swinging",
            pitch_data: {
                velocity: 95.2,
                spin_rate: 2400,
                pitch_type: "FF"
            },
            result: { runs: 0, hits: 0 }
        }
        """
        # insert_many rejects an empty document list, so stop early.
        if not plays:
            return

        game_oid = ObjectId(game_id)
        for play in plays:
            play['game_id'] = game_oid
            play['created_at'] = datetime.utcnow()

        self.db.plays.insert_many(plays)

    def get_plays_for_game(self, game_id: str) -> List[Dict]:
        """Get all plays for a game in order."""
        return list(self.db.plays.find({
            'game_id': ObjectId(game_id)
        }).sort('sequence', ASCENDING))

    def get_player_plays(
        self,
        player_id: str,
        season: int,
        event_types: Optional[List[str]] = None
    ) -> List[Dict]:
        """Get all plays involving a player in a season.

        When ``event_types`` is given, only plays whose ``event_type`` is in
        that list are returned.
        """
        query = {
            'player_ids': player_id,
            'season': season
        }
        if event_types:
            query['event_type'] = {'$in': event_types}

        return list(self.db.plays.find(query))

    # Aggregation Pipelines
    def aggregate_batting_stats(self, season: int) -> List[Dict]:
        """
        Aggregate batting statistics from play-by-play data.

        Groups the season's plays by ``batter_id`` and counts at-bats, hits,
        home runs, walks and strikeouts from ``event_type``; then adds
        ``games_played`` and ``batting_avg`` (0 when there are no at-bats)
        and sorts by batting average, highest first.
        """
        pipeline = [
            {'$match': {'season': season, 'event_type': {'$ne': None}}},
            {'$group': {
                '_id': '$batter_id',
                # Distinct game ids; their count becomes games_played below.
                'games': {'$addToSet': '$game_id'},
                'at_bats': {
                    '$sum': {
                        '$cond': [
                            {'$in': ['$event_type', ['single', 'double', 'triple', 'home_run', 'strikeout', 'groundout', 'flyout']]},
                            1, 0
                        ]
                    }
                },
                'hits': {
                    '$sum': {
                        '$cond': [
                            {'$in': ['$event_type', ['single', 'double', 'triple', 'home_run']]},
                            1, 0
                        ]
                    }
                },
                'home_runs': {
                    '$sum': {'$cond': [{'$eq': ['$event_type', 'home_run']}, 1, 0]}
                },
                'walks': {
                    '$sum': {'$cond': [{'$eq': ['$event_type', 'walk']}, 1, 0]}
                },
                'strikeouts': {
                    '$sum': {'$cond': [{'$eq': ['$event_type', 'strikeout']}, 1, 0]}
                }
            }},
            {'$addFields': {
                'games_played': {'$size': '$games'},
                'batting_avg': {
                    '$cond': [
                        {'$gt': ['$at_bats', 0]},
                        {'$divide': ['$hits', '$at_bats']},
                        0
                    ]
                }
            }},
            {'$sort': {'batting_avg': -1}}
        ]

        return list(self.db.plays.aggregate(pipeline))

    def get_hot_zones(self, player_id: str, pitch_type: Optional[str] = None) -> Dict:
        """
        Aggregate batting performance by pitch location zones.

        Returns:
            Dict keyed by zone; each value holds ``pitches``, ``swing_rate``,
            ``ba`` (hits divided by swings, 0 when there were no swings) and
            ``avg_exit_velo``.

        NOTE(review): plays are selected on ``pitch_data.location`` existing
        but grouped on ``pitch_data.zone`` -- confirm both fields are
        populated together.
        """
        match = {'batter_id': player_id, 'pitch_data.location': {'$exists': True}}
        if pitch_type:
            match['pitch_data.pitch_type'] = pitch_type

        pipeline = [
            {'$match': match},
            {'$group': {
                '_id': '$pitch_data.zone',
                'pitches': {'$sum': 1},
                'swings': {'$sum': {'$cond': ['$swing', 1, 0]}},
                'hits': {
                    '$sum': {
                        '$cond': [
                            {'$in': ['$event_type', ['single', 'double', 'triple', 'home_run']]},
                            1, 0
                        ]
                    }
                },
                'avg_exit_velo': {'$avg': '$batted_ball.exit_velocity'}
            }},
            {'$project': {
                'zone': '$_id',
                'pitches': 1,
                'swing_rate': {'$divide': ['$swings', '$pitches']},
                'ba': {
                    '$cond': [
                        {'$gt': ['$swings', 0]},
                        {'$divide': ['$hits', '$swings']},
                        0
                    ]
                },
                'avg_exit_velo': 1
            }}
        ]

        results = list(self.db.plays.aggregate(pipeline))
        return {r['zone']: r for r in results}


# Example usage
if __name__ == "__main__":
    # Constructing the wrapper connects and creates the indexes.
    mongo = SportsMongoDB("mongodb://localhost:27017")

    # Insert a game
    game = {
        'sport': 'baseball',
        'date': datetime(2024, 6, 15, 19, 5),
        'home_team': {'id': 'nyy', 'name': 'New York Yankees', 'abbreviation': 'NYY'},
        'away_team': {'id': 'bos', 'name': 'Boston Red Sox', 'abbreviation': 'BOS'},
        # external_id has a unique index, so running this twice fails on the
        # second insert.
        'external_id': 'mlb_2024_06_15_nyy_bos'
    }
    game_id = mongo.insert_game(game)

python Baseball

DuckDB for Sports Analytics

Use DuckDB for fast analytics queries on sports data files and DataFrames without a database server.

"""DuckDB for fast sports analytics queries."""
import duckdb
import pandas as pd
from pathlib import Path
from typing import Optional

class SportsAnalyticsDuckDB:
    """
    Fast analytics on sports data using DuckDB.

    Benefits:
    - No server required
    - Direct query on Parquet/CSV files
    - SQL interface
    - Fast aggregations

    The pre-built queries read from tables (or registered DataFrames) named
    ``batting``, ``pitching`` and ``game_log``; register or load them first.
    """

    def __init__(self, db_path: str = ':memory:'):
        """
        Initialize DuckDB connection.

        Args:
            db_path: Path to database file or ':memory:' for in-memory
        """
        self.conn = duckdb.connect(db_path)
        self._setup_extensions()

    def _setup_extensions(self):
        """Install and load the httpfs extension on this connection."""
        self.conn.execute("INSTALL httpfs")
        self.conn.execute("LOAD httpfs")

    @staticmethod
    def _sql_literal(value) -> str:
        """Return *value* as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the text cannot terminate the
        literal early (e.g. a path or name containing an apostrophe).
        """
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _quote_ident(name) -> str:
        """Return *name* as a double-quoted SQL identifier.

        Needed for column names/aliases that are not plain identifiers,
        such as ``wRC+`` or ``2B``.
        """
        return '"' + str(name).replace('"', '""') + '"'

    def query(self, sql: str, params: Optional[list] = None) -> pd.DataFrame:
        """Execute query and return DataFrame.

        Args:
            sql: SQL text; may contain ``?`` placeholders.
            params: Optional values bound to the placeholders, in order.
        """
        if params:
            return self.conn.execute(sql, params).fetchdf()
        return self.conn.execute(sql).fetchdf()

    def register_dataframe(self, name: str, df: pd.DataFrame):
        """Register a DataFrame as a queryable table."""
        self.conn.register(name, df)

    def load_parquet(self, path: str, table_name: Optional[str] = None) -> pd.DataFrame:
        """Load Parquet file(s) into DuckDB.

        Args:
            path: File path or pattern handed to ``parquet_scan``.
            table_name: If given, also materialize the data as this table
                (skipped when a table of that name already exists). The name
                is inserted into the SQL verbatim, so pass trusted names only.

        Returns:
            DataFrame with the file contents.
        """
        source = f"parquet_scan({self._sql_literal(path)})"
        if table_name:
            self.conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {table_name} AS
                SELECT * FROM {source}
            """)
        return self.conn.execute(f"SELECT * FROM {source}").fetchdf()

    def load_csv(self, path: str, table_name: Optional[str] = None) -> pd.DataFrame:
        """Load CSV file(s) into DuckDB.

        Args:
            path: File path or pattern handed to ``read_csv_auto``.
            table_name: If given, also materialize the data as this table
                (skipped when a table of that name already exists). The name
                is inserted into the SQL verbatim, so pass trusted names only.

        Returns:
            DataFrame with the file contents.
        """
        source = f"read_csv_auto({self._sql_literal(path)})"
        if table_name:
            self.conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {table_name} AS
                SELECT * FROM {source}
            """)
        return self.conn.execute(f"SELECT * FROM {source}").fetchdf()

    # Pre-built analytics queries
    def batting_leaders(
        self,
        stat: str,
        min_pa: int = 200,
        limit: int = 10
    ) -> pd.DataFrame:
        """Get batting leaders for a statistic.

        Args:
            stat: One of the built-in keys (avg, obp, slg, ops, hr, rbi, sb;
                case-insensitive) or the name of a column in ``batting``.
            min_pa: Minimum plate appearances to qualify.
            limit: Maximum number of rows returned.

        Returns:
            DataFrame sorted descending by the requested statistic, which is
            returned in a column named exactly as ``stat`` was passed.
        """
        stat_calc = {
            'avg': 'H / NULLIF(AB, 0)',
            'obp': '(H + BB + HBP) / NULLIF(AB + BB + HBP + SF, 0)',
            'slg': '(H + "2B" + 2*"3B" + 3*HR) / NULLIF(AB, 0)',
            'ops': '((H + BB + HBP) / NULLIF(AB + BB + HBP + SF, 0)) + ((H + "2B" + 2*"3B" + 3*HR) / NULLIF(AB, 0))',
            'hr': 'HR',
            'rbi': 'RBI',
            'sb': 'SB'
        }

        # Unlisted stats are treated as a column name and quoted, so names
        # like "wRC+" work and arbitrary SQL cannot be smuggled in.
        calc = stat_calc.get(stat.lower()) or self._quote_ident(stat)
        alias = self._quote_ident(stat)

        return self.query(f"""
            SELECT
                Name,
                Team,
                G,
                PA,
                AB,
                H,
                HR,
                RBI,
                ROUND({calc}, 3) as {alias}
            FROM batting
            WHERE PA >= {float(min_pa)}
            ORDER BY {alias} DESC
            LIMIT {int(limit)}
        """)

    def pitching_leaders(
        self,
        stat: str,
        min_ip: float = 50.0,
        limit: int = 10
    ) -> pd.DataFrame:
        """Get pitching leaders for a statistic.

        Args:
            stat: One of the built-in keys (era, whip, k9, bb9, wins, so;
                case-insensitive) or the name of a column in ``pitching``.
            min_ip: Minimum innings pitched to qualify.
            limit: Maximum number of rows returned.

        Returns:
            DataFrame sorted ascending for era/whip/bb9 (lower is better)
            and descending for everything else.
        """
        stat_calc = {
            'era': 'ER * 9.0 / NULLIF(IP, 0)',
            'whip': '(BB + H) / NULLIF(IP, 0)',
            'k9': 'SO * 9.0 / NULLIF(IP, 0)',
            'bb9': 'BB * 9.0 / NULLIF(IP, 0)',
            'wins': 'W',
            'so': 'SO'
        }

        key = stat.lower()
        calc = stat_calc.get(key) or self._quote_ident(stat)
        alias = self._quote_ident(stat)
        direction = "ASC" if key in ('era', 'whip', 'bb9') else "DESC"

        return self.query(f"""
            SELECT
                Name,
                Team,
                G,
                GS,
                IP,
                W,
                L,
                SO,
                BB,
                ROUND({calc}, 2) as {alias}
            FROM pitching
            WHERE IP >= {float(min_ip)}
            ORDER BY {alias} {direction}
            LIMIT {int(limit)}
        """)

    def team_stats_summary(self) -> pd.DataFrame:
        """Aggregate team statistics.

        TeamAVG is total hits over total at-bats for the team's rows, not
        the mean of individual batting averages (which would weight a
        10-AB player the same as a 600-AB player).
        """
        return self.query("""
            SELECT
                Team,
                COUNT(*) as Players,
                SUM(G) as TotalGames,
                ROUND(SUM(H) / NULLIF(SUM(AB), 0), 3) as TeamAVG,
                SUM(HR) as TotalHR,
                SUM(RBI) as TotalRBI,
                SUM(SB) as TotalSB
            FROM batting
            GROUP BY Team
            ORDER BY TeamAVG DESC
        """)

    def player_comparison(self, players: list, stats: list) -> pd.DataFrame:
        """Compare multiple players across stats.

        Args:
            players: Player names matched exactly against ``Name``.
            stats: Column names/expressions to select; inserted into the SQL
                verbatim, so quote unusual names yourself (e.g. '"2B"').

        Returns:
            One row per matching player; empty when ``players`` is empty.
        """
        names = list(players)
        select_list = ", ".join(["Name", "Team", *stats])

        # Names are bound as parameters so apostrophes (O'Neill) are safe.
        # An empty IN () list is invalid SQL, hence the explicit FALSE.
        if names:
            placeholders = ", ".join("?" for _ in names)
            name_filter = f"Name IN ({placeholders})"
        else:
            name_filter = "FALSE"

        return self.query(f"""
            SELECT {select_list}
            FROM batting
            WHERE {name_filter}
        """, names)

    def rolling_avg(
        self,
        player: str,
        stat: str,
        window: int = 10
    ) -> pd.DataFrame:
        """Calculate rolling average for a player's game log.

        Args:
            player: Player name matched exactly against ``Name``.
            stat: Column (or expression) in ``game_log`` to average;
                inserted into the SQL verbatim.
            window: Number of rows (games) in the trailing window, >= 1.

        Returns:
            DataFrame with Date, the stat, and ``Rolling{window}Avg``.

        Raises:
            ValueError: If ``window`` is smaller than 1.
        """
        window = int(window)
        if window < 1:
            raise ValueError("window must be >= 1")

        return self.query(f"""
            SELECT
                Date,
                {stat},
                AVG({stat}) OVER (
                    ORDER BY Date
                    ROWS BETWEEN {window - 1} PRECEDING AND CURRENT ROW
                ) as Rolling{window}Avg
            FROM game_log
            WHERE Name = ?
            ORDER BY Date
        """, [player])

    def correlation_matrix(self, stats: list) -> pd.DataFrame:
        """Calculate each statistic's correlation with HR.

        Despite the name this is a simplified form: it returns a single row
        with one ``{stat}_HR`` column per entry in ``stats`` (rows with
        PA >= 200 only), not a full stats-by-stats matrix.
        """
        corr_cols = ", ".join(f"CORR({s}, HR) as {s}_HR" for s in stats)

        return self.query(f"""
            SELECT {corr_cols}
            FROM batting
            WHERE PA >= 200
        """)


# Example usage (only runs when the file is executed as a script)
if __name__ == "__main__":
    # No path given, so the default ':memory:' database is used.
    db = SportsAnalyticsDuckDB()

    # Load data from pybaseball
    # Imported here rather than at file level, so pybaseball is only
    # needed when this example is run.
    import pybaseball as pyb

    batting = pyb.batting_stats(2024)
    pitching = pyb.pitching_stats(2024)

    # Register DataFrames
    # The names must be 'batting' and 'pitching': the pre-built queries
    # below select FROM those table names.
    db.register_dataframe('batting', batting)
    db.register_dataframe('pitching', pitching)

    # Get batting leaders
    print("HR Leaders:")
    print(db.batting_leaders('HR', min_pa=300))

    print("\nERA Leaders:")
    print(db.pitching_leaders('ERA', min_ip=100))

    print("\nTeam Stats:")
    print(db.team_stats_summary())

    # Direct SQL on registered DataFrames
    # PowerSpeed here is simply HR + SB, restricted to players with
    # at least 400 PA, 20 HR and 15 SB.
    custom = db.query("""
        SELECT Name, Team, HR, RBI, SB,
               HR + SB as PowerSpeed
        FROM batting
        WHERE PA >= 400 AND HR >= 20 AND SB >= 15
        ORDER BY PowerSpeed DESC
    """)
    print("\nPower-Speed:")
    print(custom)

r Baseball

R Database Operations with DBI

Connect to databases and query sports data using R's DBI package with parameterized queries.

# Database operations for sports analytics in R
library(DBI)
library(dplyr)
library(dbplyr)

#' Open a DBI connection to a sports database
#'
#' Picks the backend driver object by name and forwards every remaining
#' argument to `dbConnect()`.
#'
#' @param driver Backend name: "postgres", "mysql" or "sqlite"
#' @param ... Connection parameters passed through to `dbConnect()`
#' @return DBI connection object
connect_sports_db <- function(driver = "postgres", ...) {
  # The unnamed last alternative is the fall-through for unknown names.
  backend <- switch(
    driver,
    postgres = RPostgres::Postgres(),
    mysql = RMariaDB::MariaDB(),
    sqlite = RSQLite::SQLite(),
    stop("Unsupported driver")
  )

  dbConnect(backend, ...)
}

#' Fetch per-player season statistics
#'
#' Runs a single parameterized SELECT joining player_season_stats with
#' players, teams and seasons, ordered by games played (descending).
#'
#' @param con DBI connection
#' @param season Season year, compared against seasons.year
#' @param stat_type Value matched against player_season_stats.stat_type
#' @param min_games Minimum games played (inclusive)
#' @return Data frame with first_name, last_name, team, games_played, stats
get_player_stats <- function(con, season, stat_type = "batting", min_games = 50) {
  # Bound, in order, to the $1..$3 placeholders in the statement below.
  bindings <- list(season, stat_type, min_games)

  sql <- "
    SELECT
      p.first_name,
      p.last_name,
      t.name as team,
      pss.games_played,
      pss.stats
    FROM player_season_stats pss
    JOIN players p ON pss.player_id = p.id
    JOIN teams t ON p.team_id = t.id
    JOIN seasons s ON pss.season_id = s.id
    WHERE s.year = $1
      AND pss.stat_type = $2
      AND pss.games_played >= $3
    ORDER BY pss.games_played DESC
  "

  dbGetQuery(con, sql, params = bindings)
}

#' Insert game results
#'
#' Appends the rows of `games` to the "games" table with `dbWriteTable()`.
#'
#' @param con DBI connection
#' @param games Data frame with game data
#' @return Whatever `dbWriteTable()` returns. NOTE(review): DBI documents
#'   that as an invisible TRUE, not a count of inserted rows -- confirm
#'   before relying on a row count here.
insert_games <- function(con, games) {
  # The data frame is handed to DBI as a whole; no SQL text is assembled
  # here. append = TRUE asks for rows to be added to the existing table.
  dbWriteTable(con, "games", games, append = TRUE)
}

#' Insert-or-update player season statistics in one transaction
#'
#' For every row of `stats`, looks up an existing player_season_stats row
#' by (player_id, season_id, stat_type); updates it when found, inserts a
#' new row otherwise. Any error rolls the whole batch back and is re-raised.
#'
#' @param con DBI connection
#' @param stats Data frame with player_id, season_id, stat_type,
#'   games_played and stats columns
#' @return Number of rows in `stats` (after a successful commit)
upsert_stats <- function(con, stats) {
  dbBegin(con)

  tryCatch({
    for (idx in seq_len(nrow(stats))) {
      rec <- stats[idx, ]

      # Look for a row with the same natural key
      found <- dbGetQuery(con,
        "SELECT id FROM player_season_stats
         WHERE player_id = $1 AND season_id = $2 AND stat_type = $3",
        params = list(rec$player_id, rec$season_id, rec$stat_type)
      )
      is_new <- nrow(found) == 0

      if (is_new) {
        # No match: add a fresh row
        dbExecute(con,
          "INSERT INTO player_season_stats
           (player_id, season_id, stat_type, games_played, stats)
           VALUES ($1, $2, $3, $4, $5)",
          params = list(rec$player_id, rec$season_id, rec$stat_type,
                        rec$games_played, rec$stats)
        )
      } else {
        # Match: overwrite the first matching row by its id
        dbExecute(con,
          "UPDATE player_season_stats
           SET games_played = $1, stats = $2, updated_at = NOW()
           WHERE id = $3",
          params = list(rec$games_played, rec$stats, found$id[1])
        )
      }
    }

    dbCommit(con)
    return(nrow(stats))

  }, error = function(err) {
    # Undo everything written so far, then surface the original condition
    dbRollback(con)
    stop(err)
  })
}

#' Use dplyr/dbplyr for database queries
#'
#' Builds the player-season-stats query with dplyr verbs on lazy table
#' references (translated to SQL by dbplyr) and collects the result.
#' The season and stat type used to be hard-coded; they are now parameters
#' whose defaults reproduce the previous behavior.
#'
#' @param con DBI connection
#' @param season Season year to keep (default 2024)
#' @param stat_type Stat type to keep (default "batting")
#' @return Collected tibble with first_name, last_name, team, games_played
#'   and stats, ordered by games_played descending
query_with_dbplyr <- function(con, season = 2024, stat_type = "batting") {
  # Create lazy table references
  players <- tbl(con, "players")
  teams <- tbl(con, "teams")
  stats <- tbl(con, "player_season_stats")
  seasons <- tbl(con, "seasons")

  # Build query with dplyr verbs
  result <- stats %>%
    inner_join(players, by = c("player_id" = "id")) %>%
    inner_join(teams, by = c("team_id" = "id")) %>%
    inner_join(seasons, by = c("season_id" = "id")) %>%
    # `!!` injects the argument values, so they can never be mistaken for
    # a column of the same name (stat_type is both a column and an argument).
    filter(year == !!season, stat_type == !!stat_type) %>%
    select(
      first_name, last_name,
      # NOTE(review): `name.y` only exists if `name` collided in one of the
      # joins above -- verify against the actual table schemas.
      team = name.y,
      games_played, stats
    ) %>%
    arrange(desc(games_played))

  # Execute and collect
  collect(result)
}

#' Get standings using window functions
#'
#' Tallies wins and losses per team from final games of the season, then
#' derives winning percentage and games behind the division leader.
#'
#' Games behind uses the standard formula
#' ((leader wins - wins) + (losses - leader losses)) / 2. The previous
#' `wins - leader wins` was never positive and ignored losses. The leader
#' is the team with the largest wins-minus-losses margin, which keeps gb
#' non-negative for every team.
#'
#' @param con DBI connection
#' @param season Season year
#' @return Data frame with team, division, wins, losses, pct and gb,
#'   ordered by division and then by gb
get_standings <- function(con, season) {
  query <- "
    WITH game_results AS (
      SELECT
        t.id,
        t.name as team,
        t.division,
        SUM(CASE
          WHEN (g.home_team_id = t.id AND g.home_score > g.away_score) OR
               (g.away_team_id = t.id AND g.away_score > g.home_score)
          THEN 1 ELSE 0
        END) as wins,
        SUM(CASE
          WHEN (g.home_team_id = t.id AND g.home_score < g.away_score) OR
               (g.away_team_id = t.id AND g.away_score < g.home_score)
          THEN 1 ELSE 0
        END) as losses
      FROM teams t
      JOIN games g ON t.id IN (g.home_team_id, g.away_team_id)
      JOIN seasons s ON g.season_id = s.id
      WHERE s.year = $1 AND g.status = 'final'
      GROUP BY t.id, t.name, t.division
    )
    SELECT
      team,
      division,
      wins,
      losses,
      ROUND(wins::numeric / NULLIF(wins + losses, 0), 3) as pct,
      ((FIRST_VALUE(wins) OVER div_rank - wins) +
       (losses - FIRST_VALUE(losses) OVER div_rank)) / 2.0 as gb
    FROM game_results
    WINDOW div_rank AS (
      PARTITION BY division
      ORDER BY (wins - losses) DESC, wins DESC
    )
    ORDER BY division, gb, wins DESC
  "

  dbGetQuery(con, query, params = list(season))
}

# Example usage
# con <- connect_sports_db("postgres",
#   dbname = "sports_db",
#   host = "localhost",
#   user = "user",
#   password = "password"
# )
#
# stats <- get_player_stats(con, 2024, "batting", 100)
# standings <- get_standings(con, 2024)
#
# dbDisconnect(con)

python Baseball

Scrape Baseball Reference Player Stats

Web scraping Baseball Reference for historical player statistics using BeautifulSoup and requests.

"""Scrape Baseball Reference for player batting statistics."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_player_batting(player_id: str, year: int = None) -> pd.DataFrame:
    """
    Fetch a player's standard batting table from Baseball Reference.

    Args:
        player_id: Baseball Reference player ID (e.g., 'troutmi01'); its
            first character selects the URL directory
        year: Keep only rows whose 'Year' cell equals this year; None (or
            any falsy value) keeps every row

    Returns:
        DataFrame with one row per table row whose cell count matches the
        header; the counting columns are converted to numbers

    Raises:
        requests.HTTPError: If the page request returns an error status
        ValueError: If the page has no table with id 'batting_standard'
    """
    page_url = f"https://www.baseball-reference.com/players/{player_id[0]}/{player_id}.shtml"
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    page = requests.get(page_url, headers=request_headers)
    page.raise_for_status()
    document = BeautifulSoup(page.content, 'html.parser')

    table = document.find('table', {'id': 'batting_standard'})
    if not table:
        raise ValueError(f"Could not find batting table for {player_id}")

    # Column labels come from every <th> in the table head
    column_names = [th.text for th in table.find('thead').find_all('th')]
    width = len(column_names)

    records = []
    for tr in table.find('tbody').find_all('tr'):
        # Rows carrying the 'thead' class are skipped
        is_header_repeat = 'class' in tr.attrs and 'thead' in tr.attrs['class']
        if is_header_repeat:
            continue
        values = [cell.text for cell in tr.find_all(['td', 'th'])]
        # Rows that don't line up with the header are dropped
        if len(values) == width:
            records.append(values)

    df = pd.DataFrame(records, columns=column_names)

    # Year cells are text, so compare against the string form
    if year and 'Year' in df.columns:
        df = df[df['Year'] == str(year)]

    # Unparseable cells become NaN
    for col in ('G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO'):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df

# Example usage
if __name__ == "__main__":
    # Scrape Mike Trout's stats
    trout_stats = scrape_player_batting('troutmi01', 2023)
    print(trout_stats)

    # Be respectful - wait between requests
    time.sleep(3)

python Baseball

Scrape MLB Standings

Scrape current MLB standings from ESPN using Python requests and BeautifulSoup.

"""Scrape MLB standings from ESPN."""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_mlb_standings(year: int = 2024) -> dict:
    """
    Scrape MLB standings from ESPN.

    The first two tables with class 'Table' are taken as AL and NL in that
    order (NOTE(review): verify against the live page layout).

    Args:
        year: Season year

    Returns:
        Dictionary with 'AL' and 'NL' DataFrames. Both always have the
        columns Team, W, L, PCT, GB, Diff, even when no rows were found.

    Raises:
        requests.HTTPError: If ESPN answers with an error status. Previously
            an error page was parsed and produced silently empty standings.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    url = f"https://www.espn.com/mlb/standings/_/season/{year}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def to_int(text):
        # Whitespace around the digits must not turn a real count into 0.
        text = text.strip()
        return int(text) if text.isdigit() else 0

    def to_float(text):
        # Placeholder cells such as '' or '-' fall back to 0.0 instead of
        # raising out of the whole scrape.
        try:
            return float(text.strip())
        except ValueError:
            return 0.0

    columns = ['Team', 'W', 'L', 'PCT', 'GB', 'Diff']
    standings = {'AL': [], 'NL': []}

    # Find all standings tables
    tables = soup.find_all('table', class_='Table')

    for league, table in zip(('AL', 'NL'), tables):
        for row in table.find_all('tr')[1:]:  # Skip header
            cells = [cell.text for cell in row.find_all('td')]
            if len(cells) < 6:
                continue
            standings[league].append({
                'Team': cells[0].strip(),
                'W': to_int(cells[1]),
                'L': to_int(cells[2]),
                'PCT': to_float(cells[3]),
                'GB': cells[4].strip(),
                'Diff': cells[5].strip()
            })

    return {
        league: pd.DataFrame(rows, columns=columns)
        for league, rows in standings.items()
    }

# Example usage
# NOTE: these statements sit at top level with no __main__ guard, so they
# run (and hit the network) whenever this code is executed or imported.
standings = scrape_mlb_standings(2024)
print("American League:")
print(standings['AL'].head(10))  # first 10 rows only
print("\nNational League:")
print(standings['NL'].head(10))

python Basketball

Scrape Basketball Reference Box Scores

Scrape NBA game box scores from Basketball Reference for detailed game statistics.

"""Scrape NBA box scores from Basketball Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def scrape_game_box_score(game_id: str) -> dict:
    """
    Scrape box score from Basketball Reference.

    Only tables whose id ends in '-game-basic' are used. The earlier
    substring test ('basic' anywhere in the id) let any other '...basic'
    table through, and because the '-game-basic' suffix could not be
    stripped from those ids they produced garbage team keys.

    Args:
        game_id: Basketball Reference game ID (e.g., '202401150LAL')

    Returns:
        Dictionary mapping team abbreviation to that team's basic box score
        DataFrame. Counting stats are numeric; 'MP' is decimal minutes.

    Raises:
        requests.HTTPError: If the site answers with an error status
            (previously this silently produced an empty dict).
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    url = f"https://www.basketball-reference.com/boxscores/{game_id}.html"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def to_minutes(value):
        # Minutes may be written 'MM:SS', which pd.to_numeric would turn
        # into NaN. Convert to decimal minutes; plain numbers pass through
        # and anything else (e.g. 'Did Not Play', missing) becomes NaN.
        minutes, sep, seconds = str(value).partition(':')
        try:
            return float(minutes) + (float(seconds) / 60 if sep else 0.0)
        except ValueError:
            return float('nan')

    count_cols = ['FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
                  'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

    result = {}

    # Find all box score tables (basic and advanced for each team)
    for table in soup.find_all('table', {'class': 'sortable'}):
        table_id = table.get('id', '')
        if not table_id.endswith('-game-basic'):
            continue
        team = table_id.replace('box-', '').replace('-game-basic', '')

        # The last header row holds the per-column labels
        headers_row = table.find('thead').find_all('tr')[-1]
        columns = [th.text for th in headers_row.find_all('th')]

        rows = []
        for row in table.find('tbody').find_all('tr'):
            # Skip rows carrying the 'thead' class
            if 'class' in row.attrs and 'thead' in row.attrs['class']:
                continue
            row_data = [cell.text for cell in row.find_all(['td', 'th'])]
            if row_data:
                rows.append(row_data)

        # Trim the header to the width of the first data row
        df = pd.DataFrame(rows, columns=columns[:len(rows[0])] if rows else columns)

        if 'MP' in df.columns:
            df['MP'] = df['MP'].map(to_minutes)
        for col in count_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        result[team] = df

    return result

def scrape_games_on_date(date: str) -> list:
    """
    Get list of game IDs for a specific date.

    Args:
        date: Date string in format 'YYYY-MM-DD'

    Returns:
        List of unique game IDs in the order they first appear on the page.
        (The earlier list(set(...)) returned them in arbitrary order.)

    Raises:
        ValueError: If ``date`` is not in 'YYYY-MM-DD' format.
        requests.HTTPError: If the site answers with an error status
            (previously this silently produced an empty list).
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    dt = datetime.strptime(date, '%Y-%m-%d')
    url = f"https://www.basketball-reference.com/boxscores/?month={dt.month}&day={dt.day}&year={dt.year}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    game_ids = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/boxscores/' in href and href.endswith('.html'):
            game_id = href.split('/')[-1].replace('.html', '')
            # Only file names longer than 10 characters are treated as
            # game IDs (e.g. '202401150LAL' has 12).
            if len(game_id) > 10:
                game_ids.append(game_id)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(game_ids))

# Example usage
# NOTE: these statements sit at top level with no __main__ guard, so they
# run (and hit the network) whenever this code is executed or imported.
box_scores = scrape_game_box_score('202401150LAL')
for team, df in box_scores.items():
    print(f"\n{team} Box Score:")
    # Show the headline columns when the 'Starters' label column is present;
    # otherwise fall back to the first rows of whatever was parsed.
    print(df[['Starters', 'PTS', 'TRB', 'AST']].head() if 'Starters' in df.columns else df.head())

python Basketball

Scrape NBA Draft Data

Scrape historical NBA draft data from Basketball Reference for draft analysis.

"""Scrape NBA draft data from Basketball Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_nba_draft(year: int) -> pd.DataFrame:
    """
    Fetch one year's NBA draft table from Basketball Reference.

    Args:
        year: Draft year (used in the URL and stamped on every row)

    Returns:
        DataFrame of draft rows with an added 'Draft_Year' column. Column
        labels are applied only when their count matches the parsed width;
        'Pk' is converted to numeric when present.

    Raises:
        ValueError: If the page has no table with id 'stats'
    """
    page = requests.get(
        f"https://www.basketball-reference.com/draft/NBA_{year}.html",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        },
    )
    document = BeautifulSoup(page.content, 'html.parser')

    table = document.find('table', {'id': 'stats'})
    if not table:
        raise ValueError(f"Could not find draft table for {year}")

    # The last header row carries the per-column labels
    label_row = table.find('thead').find_all('tr')[-1]
    labels = [th.text.strip() for th in label_row.find_all('th')]

    # Cell text for every body row that is not marked with the 'thead' class
    parsed = (
        [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
        for tr in table.find('tbody').find_all('tr')
        if 'thead' not in (tr.get('class') or [])
    )
    # Keep only rows with more than five cells
    records = [values for values in parsed if len(values) > 5]

    frame = pd.DataFrame(records)
    if len(frame.columns) == len(labels):
        frame.columns = labels

    if 'Pk' in frame.columns:
        frame['Pk'] = pd.to_numeric(frame['Pk'], errors='coerce')

    frame['Draft_Year'] = year
    return frame

def get_draft_history(start_year: int, end_year: int) -> pd.DataFrame:
    """
    Get draft history for multiple years.

    Years that fail to scrape are reported on stdout and skipped.

    Args:
        start_year: Starting year
        end_year: Ending year (inclusive)

    Returns:
        Combined DataFrame of all drafts that were scraped successfully;
        an empty DataFrame when none were (including start_year > end_year).
        Previously that case raised ValueError from pd.concat.
    """
    import time

    all_drafts = []
    for year in range(start_year, end_year + 1):
        try:
            draft_df = scrape_nba_draft(year)
        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f"Error scraping {year}: {e}")
        else:
            all_drafts.append(draft_df)
            print(f"Scraped {year} draft: {len(draft_df)} picks")
        # Be respectful: pause after every request, including failed ones,
        # so a run of errors does not hit the site without any delay.
        time.sleep(3)

    if not all_drafts:
        return pd.DataFrame()
    return pd.concat(all_drafts, ignore_index=True)

# Example usage
# NOTE: these statements sit at top level with no __main__ guard, so they
# run (and hit the network) whenever this code is executed or imported.
draft_2023 = scrape_nba_draft(2023)
print(f"2023 NBA Draft - {len(draft_2023)} picks")
# Assumes the header labels were applied so 'Pk', 'Player' and 'Tm' exist.
print(draft_2023[['Pk', 'Player', 'Tm']].head(10))

python Football

Scrape Pro Football Reference Stats

Scrape NFL player statistics from Pro Football Reference using BeautifulSoup.

"""Scrape NFL stats from Pro Football Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_passing_stats(year: int) -> pd.DataFrame:
    """
    Fetch the season passing table from Pro Football Reference.

    Args:
        year: Season year (used in the URL)

    Returns:
        DataFrame of passing rows. Column labels are applied only when
        their count matches the parsed width; the counting columns are
        converted to numbers when present.

    Raises:
        ValueError: If the page has no table with id 'passing'
    """
    def cell_texts(tr):
        """Stripped text of every <td>/<th> in a row."""
        return [cell.text.strip() for cell in tr.find_all(['td', 'th'])]

    page_url = f"https://www.pro-football-reference.com/years/{year}/passing.htm"
    page = requests.get(page_url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })
    document = BeautifulSoup(page.content, 'html.parser')

    passing = document.find('table', {'id': 'passing'})
    if not passing:
        raise ValueError(f"Could not find passing table for {year}")

    # Labels come from the last row of the table head
    labels = [
        th.text.strip()
        for th in passing.find('thead').find_all('tr')[-1].find_all('th')
    ]

    records = []
    for tr in passing.find('tbody').find_all('tr'):
        css = tr.get('class')
        if css and 'thead' in css:
            continue  # row marked with the 'thead' class
        values = cell_texts(tr)
        if values:
            records.append(values)

    frame = pd.DataFrame(records)
    if len(frame.columns) == len(labels):
        frame.columns = labels

    # Unparseable cells become NaN
    for name in ('G', 'GS', 'Cmp', 'Att', 'Yds', 'TD', 'Int', 'Sk'):
        if name in frame.columns:
            frame[name] = pd.to_numeric(frame[name], errors='coerce')

    return frame

def scrape_rushing_stats(year: int) -> pd.DataFrame:
    """
    Fetch the season rushing table from Pro Football Reference.

    Args:
        year: Season year (used in the URL)

    Returns:
        DataFrame of rushing rows, all values left as text. Column labels
        are applied only when their count matches the parsed width.

    Raises:
        ValueError: If the page has no table with id 'rushing'
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    page = requests.get(
        f"https://www.pro-football-reference.com/years/{year}/rushing.htm",
        headers=browser_headers,
    )
    document = BeautifulSoup(page.content, 'html.parser')

    rushing = document.find('table', {'id': 'rushing'})
    if not rushing:
        raise ValueError(f"Could not find rushing table for {year}")

    # Only the last header row is used for the column labels
    last_header = rushing.find('thead').find_all('tr')[-1]
    labels = [th.text.strip() for th in last_header.find_all('th')]

    # Rows flagged with the 'thead' class are left out
    data_rows = [
        tr for tr in rushing.find('tbody').find_all('tr')
        if not (tr.get('class') and 'thead' in tr.get('class'))
    ]
    records = []
    for tr in data_rows:
        values = [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
        if values:
            records.append(values)

    frame = pd.DataFrame(records)
    if len(frame.columns) == len(labels):
        frame.columns = labels

    return frame

# Example usage
# NOTE: these statements sit at top level with no __main__ guard, so they
# run (and hit the network) whenever this code is executed or imported.
passing_2023 = scrape_passing_stats(2023)
print(f"2023 NFL Passing Leaders:")
# Assumes the header labels were applied so 'Player', 'Tm', 'Yds', 'TD' exist.
print(passing_2023[['Player', 'Tm', 'Yds', 'TD']].head(10))

python Soccer

Scrape FBref Soccer Stats

Scrape soccer statistics from FBref (Football Reference) for player and team data.

"""Scrape soccer stats from FBref."""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_league_stats(league_id: str, season: str) -> pd.DataFrame:
    """
    Fetch a league's standard player stats table from FBref.

    Args:
        league_id: FBref league ID (e.g., '9' for Premier League)
        season: Season string (e.g., '2023-2024')

    Returns:
        DataFrame of player rows. Column labels are applied only when their
        count matches the parsed width; the listed counting columns are
        converted to numbers when present.

    Raises:
        ValueError: If neither the 'stats_standard' table nor any table
            with class 'stats_table' is found
    """
    # League IDs: 9=Premier League, 12=La Liga, 20=Bundesliga, 11=Serie A, 13=Ligue 1
    page_url = f"https://fbref.com/en/comps/{league_id}/{season}/stats/{season}-Stats"
    page = requests.get(page_url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })
    document = BeautifulSoup(page.content, 'html.parser')

    # Preferred table by id, then the first table with the fallback class
    stats_table = (
        document.find('table', {'id': 'stats_standard'})
        or document.find('table', class_='stats_table')
    )
    if not stats_table:
        raise ValueError("Could not find stats table")

    # Labels come from the last header row
    head_rows = stats_table.find('thead').find_all('tr')
    labels = [th.text.strip() for th in head_rows[-1].find_all('th')]

    def is_filler(tr):
        """True for rows whose class text mentions 'thead' or 'spacer'."""
        css = tr.get('class')
        if not css:
            return False
        css_text = str(css)
        return 'thead' in css_text or 'spacer' in css_text

    records = []
    for tr in stats_table.find('tbody').find_all('tr'):
        if is_filler(tr):
            continue
        values = [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
        # Rows with five cells or fewer are discarded
        if len(values) > 5:
            records.append(values)

    frame = pd.DataFrame(records)
    if len(frame.columns) == len(labels):
        frame.columns = labels

    # Unparseable cells become NaN
    for name in ('MP', 'Starts', 'Min', 'Gls', 'Ast', 'G+A', 'PK', 'PKatt'):
        if name in frame.columns:
            frame[name] = pd.to_numeric(frame[name], errors='coerce')

    return frame

def scrape_team_stats(team_id: str, season: str) -> dict:
    """
    Fetch a squad's combined stat tables from FBref.

    Args:
        team_id: FBref team ID
        season: Season string

    Returns:
        Dictionary keyed by table id ('stats_standard_combined',
        'stats_shooting_combined', 'stats_passing_combined') holding one
        DataFrame per table found on the page; missing tables are omitted.
        All values are left as text.
    """
    page = requests.get(
        f"https://fbref.com/en/squads/{team_id}/{season}/all_comps",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        },
    )
    document = BeautifulSoup(page.content, 'html.parser')

    wanted = ('stats_standard_combined', 'stats_shooting_combined', 'stats_passing_combined')
    found = {}

    for wanted_id in wanted:
        node = document.find('table', {'id': wanted_id})
        if not node:
            continue

        # Labels come from the last header row
        last_header = node.find('thead').find_all('tr')[-1]
        labels = [th.text.strip() for th in last_header.find_all('th')]

        # Every body row that has at least one cell is kept
        all_rows = (
            [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
            for tr in node.find('tbody').find_all('tr')
        )
        records = [values for values in all_rows if values]

        frame = pd.DataFrame(records)
        if len(frame.columns) == len(labels):
            frame.columns = labels
        found[wanted_id] = frame

    return found

# Example usage
# NOTE: these statements sit at top level with no __main__ guard, so they
# run (and hit the network) whenever this code is executed or imported.
epl_stats = scrape_league_stats('9', '2023-2024')
print(f"Premier League 2023-24 Stats: {len(epl_stats)} players")
# Falls back to the raw head when the header labels could not be applied.
print(epl_stats[['Player', 'Squad', 'Gls', 'Ast']].head(10) if 'Player' in epl_stats.columns else epl_stats.head())

python Soccer

Scrape Transfermarkt Player Values

Scrape player market values and transfer data from Transfermarkt.

"""Scrape player market values from Transfermarkt."""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_team_values(team_id: str, team_name: str) -> pd.DataFrame:
    """
    Scrape squad market values from Transfermarkt.

    Args:
        team_id: Transfermarkt team ID
        team_name: Team name slug (e.g., 'manchester-city')

    Returns:
        DataFrame with columns Name, Position, Market_Value (raw text) and
        Value_EUR (float, or None when the text could not be parsed). The
        columns are present even when no players were found.

    Raises:
        requests.HTTPError: If the site answers with an error status
            (previously this silently produced an empty DataFrame).
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    url = f"https://www.transfermarkt.com/{team_name}/kader/verein/{team_id}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    players = []

    # Find player rows in squad table
    table = soup.find('table', class_='items')

    if table:
        for row in table.find_all('tr', class_=['odd', 'even']):
            try:
                # Player name
                name_cell = row.find('td', class_='hauptlink')
                name = name_cell.find('a').text.strip() if name_cell else None

                # Position: second row of the nested table inside the
                # 'posrela' cell (the last such cell wins)
                position = None
                for td in row.find_all('td'):
                    if td.get('class') and 'posrela' in str(td.get('class')):
                        pos_text = td.find_all('tr')
                        if len(pos_text) > 1:
                            position = pos_text[1].text.strip()

                # Market value
                value_cell = row.find('td', class_='rechts hauptlink')
                value = value_cell.text.strip() if value_cell else None
            except (AttributeError, IndexError):
                # Malformed row (e.g. a name cell without a link): skip it,
                # but let unexpected error types surface instead of hiding.
                continue

            if name:
                players.append({
                    'Name': name,
                    'Position': position,
                    'Market_Value': value
                })

    df = pd.DataFrame(players, columns=['Name', 'Position', 'Market_Value'])

    # Suffix -> multiplier. NOTE(review): 'bn' and 'th.' are handled in
    # addition to the original 'm'/'k'; confirm against the site's format.
    suffixes = (
        ('bn', 1_000_000_000),
        ('m', 1_000_000),
        ('th.', 1_000),
        ('k', 1_000),
    )

    # Parse market values to numeric
    def parse_value(val):
        if val is None or pd.isna(val):
            return None
        text = str(val).replace('€', '').strip().lower()
        if not text or text == '-':
            return None
        multiplier = 1
        # Match the unit as a suffix rather than anywhere in the text.
        for suffix, factor in suffixes:
            if text.endswith(suffix):
                multiplier = factor
                text = text[:-len(suffix)].strip()
                break
        try:
            return float(text) * multiplier
        except ValueError:
            return None

    df['Value_EUR'] = df['Market_Value'].apply(parse_value)

    return df

# Example: Manchester City (team_id=281)
# values = scrape_team_values('281', 'manchester-city')
# print(values.head(10))

python Hockey

Scrape Hockey Reference Stats

Scrape NHL statistics from Hockey Reference for player and team analysis.

"""Scrape NHL stats from Hockey Reference."""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_skater_stats(year: int, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape NHL skater statistics for a season.

    Args:
        year: Season ending year (e.g., 2024 for 2023-24 season)
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        DataFrame with skater statistics. Column names are taken from the
        last header row when its width matches the data; otherwise the
        default integer column labels are kept.

    Raises:
        ValueError: If the page has no table with id "stats"
    """
    url = f"https://www.hockey-reference.com/leagues/NHL_{year}_skaters.html"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'stats'})

    if table is None:
        raise ValueError(f"Could not find skater stats table for {year}")

    # The last header row carries the per-column names.
    header_row = table.find('thead').find_all('tr')[-1]
    columns = [th.text.strip() for th in header_row.find_all('th')]

    rows = []
    for row in table.find('tbody').find_all('tr'):
        # Skip header rows that are repeated inside the table body.
        if 'thead' in (row.get('class') or []):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if row_data:
            rows.append(row_data)

    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns

    # Convert numeric columns; unparseable values become NaN.
    numeric_cols = ['GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'EVG', 'PPG', 'SHG', 'S']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df

def scrape_goalie_stats(year: int, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape NHL goalie statistics for a season.

    Args:
        year: Season year used in the Hockey Reference URL
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        DataFrame with one row per table row. Column names are taken from
        the last header row when its width matches the data; otherwise the
        default integer column labels are kept. Values are left as strings.

    Raises:
        ValueError: If the page has no table with id "stats"
    """
    url = f"https://www.hockey-reference.com/leagues/NHL_{year}_goalies.html"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'stats'})

    if table is None:
        raise ValueError(f"Could not find goalie stats table for {year}")

    # The last header row carries the per-column names.
    header_row = table.find('thead').find_all('tr')[-1]
    columns = [th.text.strip() for th in header_row.find_all('th')]

    rows = []
    for row in table.find('tbody').find_all('tr'):
        # Skip header rows that are repeated inside the table body.
        if 'thead' in (row.get('class') or []):
            continue
        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if row_data:
            rows.append(row_data)

    df = pd.DataFrame(rows)
    if len(df.columns) == len(columns):
        df.columns = columns

    return df

# Example usage
skaters_2024 = scrape_skater_stats(2024)
print(f"2023-24 NHL Skater Stats: {len(skaters_2024)} players")
# Show only the summary columns that were actually scraped, so a renamed or
# missing column on the site does not raise KeyError here.
summary_cols = [c for c in ['Player', 'Tm', 'G', 'A', 'PTS'] if c in skaters_2024.columns]
if 'Player' in summary_cols:
    print(skaters_2024[summary_cols].head(10))

python Golf

Scrape PGA Tour Statistics

Scrape PGA Tour player statistics from the official website.

"""Scrape PGA Tour statistics."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_pga_stats(stat_id: str = '02675', year: int = 2024, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape PGA Tour statistics.

    Common stat IDs:
    - 02675: Scoring Average
    - 02568: Driving Distance
    - 02567: Driving Accuracy
    - 02564: Greens in Regulation
    - 02428: Strokes Gained Total

    Args:
        stat_id: PGA Tour statistic ID
        year: Season year
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        DataFrame with player statistics. When the HTML table is absent but
        an embedded JSON object is found, the DataFrame is built from its
        'rows' entry (empty if that key is missing).

    Raises:
        ValueError: If neither a stats table nor a usable JSON payload is found
    """
    url = f"https://www.pgatour.com/stats/stat.{stat_id}.y{year}.html"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml'
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find stats table
    table = soup.find('table', class_='table-styled')

    if table is None:
        # Try alternate approach with JavaScript data
        script_data = soup.find('script', {'type': 'application/json'})
        if script_data is not None and script_data.string:
            try:
                data = json.loads(script_data.string)
            except json.JSONDecodeError:
                data = None
            # Only a JSON object can carry a 'rows' entry; anything else is
            # treated the same as "no data found".
            if isinstance(data, dict):
                return pd.DataFrame(data.get('rows', []))
        raise ValueError("Could not find stats table")

    # Parse table
    headers_row = table.find('thead').find('tr')
    columns = [th.text.strip() for th in headers_row.find_all('th')]

    rows = []
    for row in table.find('tbody').find_all('tr'):
        row_data = [cell.text.strip() for cell in row.find_all('td')]
        if row_data:
            rows.append(row_data)

    df = pd.DataFrame(rows)
    # Apply header names only when they line up with the data width.
    if len(df.columns) == len(columns):
        df.columns = columns

    return df

def scrape_tournament_leaderboard(tournament_id: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape tournament leaderboard from PGA Tour.

    Args:
        tournament_id: PGA Tour tournament ID
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        DataFrame built from the 'players' entry of the first script tag
        that mentions 'leaderboardData' and contains parseable JSON; an
        empty DataFrame when no such script is found.
    """
    url = f"https://www.pgatour.com/tournaments/{tournament_id}/leaderboard.html"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # PGA Tour uses React, so data is often in JSON
    for script in soup.find_all('script'):
        text = script.string
        if not text or 'leaderboardData' not in text:
            continue

        # Take the outermost {...} span of the script body as the JSON payload.
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            continue

        try:
            data = json.loads(text[start:end])
            return pd.DataFrame(data.get('players', []))
        except (ValueError, TypeError, AttributeError):
            # Not a usable leaderboard payload; keep looking. Catching only
            # data errors lets KeyboardInterrupt and friends propagate.
            continue

    return pd.DataFrame()

# Example usage
# stats = scrape_pga_stats('02675', 2024)  # Scoring average
# print(stats.head(10))

python Tennis

Scrape ATP Tennis Rankings

Scrape ATP tennis rankings and player statistics from the ATP Tour website.

"""Scrape ATP tennis rankings and stats."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_atp_rankings(timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape current ATP singles rankings.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        DataFrame with columns Rank, Player, Country and Points. Rank and
        Points are ints, or None when the cell text is not a plain number.
        The DataFrame is empty when the rankings table is not found.
    """
    url = "https://www.atptour.com/en/rankings/singles"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml'
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    players = []

    # Find ranking table
    table = soup.find('table', class_='mega-table')
    if table is None:
        return pd.DataFrame(players)

    for row in table.find_all('tr')[1:]:  # Skip header
        cells = row.find_all('td')
        if len(cells) < 5:
            continue

        rank = cells[0].text.strip()

        # Player name might be in a link
        name_cell = cells[2]
        name_link = name_cell.find('a')
        name = name_link.text.strip() if name_link else name_cell.text.strip()

        country = cells[3].text.strip()
        # The points column is the sixth cell and may be absent on short rows.
        points = cells[5].text.strip() if len(cells) > 5 else ''
        points_digits = points.replace(',', '')

        players.append({
            'Rank': int(rank) if rank.isdigit() else None,
            'Player': name,
            'Country': country,
            'Points': int(points_digits) if points_digits.isdigit() else None
        })

    return pd.DataFrame(players)

def scrape_player_stats(player_id: str, timeout: float = 30.0) -> dict:
    """
    Scrape individual player statistics from ATP.

    Args:
        player_id: ATP player ID slug (e.g., 'n409' for Novak Djokovic)
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        Dictionary mapping each stat label to its value, both as stripped
        text. Empty when the page has no 'stat-item' blocks.
    """
    url = f"https://www.atptour.com/en/players/-/{player_id}/overview"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    stats = {}

    # Each stat block is expected to hold a label div and a value div;
    # blocks missing either one are skipped.
    for item in soup.find_all('div', class_='stat-item'):
        label = item.find('div', class_='stat-label')
        value = item.find('div', class_='stat-value')
        if label and value:
            stats[label.text.strip()] = value.text.strip()

    return stats

# Example usage
# NOTE: this runs at import time and performs a live HTTP request.
rankings = scrape_atp_rankings()
# An empty DataFrame (0 players) means the rankings table was not found.
print(f"ATP Rankings: {len(rankings)} players")
print(rankings.head(20))

python Baseball

Generic Sports Table Scraper

A reusable class for scraping sports statistics tables from various websites.

"""Generic sports table scraper for various websites."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Optional, Dict, Any
import time
from urllib.parse import urljoin

class SportsTableScraper:
    """
    A generic scraper for sports statistics tables.

    Features:
    - Table lookup by id, by class, or by position on the page
    - Header parsing from the last (most specific) header row
    - Rate limiting between requests
    - HTTP error handling and request timeouts
    """

    def __init__(self, base_url: str = None, delay: float = 2.0, timeout: float = 30.0):
        """
        Initialize the scraper.

        Args:
            base_url: Base URL for relative links
            delay: Delay between requests in seconds
            timeout: Seconds to wait for each HTTP response before giving up
        """
        self.base_url = base_url
        self.delay = delay
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        })
        # Timestamp of the previous request; 0 means "no request made yet".
        self.last_request = 0

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least `delay` seconds apart."""
        elapsed = time.time() - self.last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request = time.time()

    def fetch_page(self, url: str) -> BeautifulSoup:
        """
        Fetch and parse a page.

        Relative URLs are resolved against `base_url` when one was given.
        Raises the requests exception for HTTP error statuses and timeouts.
        """
        self._rate_limit()

        if self.base_url and not url.startswith('http'):
            url = urljoin(self.base_url, url)

        # requests has no default timeout; without one a stalled connection
        # would block the scraper indefinitely.
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    def parse_table(
        self,
        soup: BeautifulSoup,
        table_id: str = None,
        table_class: str = None,
        table_index: int = 0
    ) -> pd.DataFrame:
        """
        Parse an HTML table into a DataFrame.

        Args:
            soup: BeautifulSoup object
            table_id: Table ID attribute (takes precedence over table_class)
            table_class: Table class attribute
            table_index: Index if multiple tables match

        Returns:
            DataFrame with table data; empty when no matching table exists.
            Header names are applied only when their count matches the
            number of data columns.
        """
        # Find table
        if table_id:
            table = soup.find('table', {'id': table_id})
        else:
            if table_class:
                tables = soup.find_all('table', class_=table_class)
            else:
                tables = soup.find_all('table')
            table = tables[table_index] if tables and len(tables) > table_index else None

        if not table:
            return pd.DataFrame()

        # Parse headers
        columns = self._parse_headers(table)

        # Parse body
        rows = self._parse_body(table, len(columns))

        df = pd.DataFrame(rows)
        if len(df.columns) == len(columns):
            df.columns = columns

        return df

    def _parse_headers(self, table) -> List[str]:
        """
        Return the table's column names.

        Uses the last row of <thead> when present; otherwise falls back to
        the cells of the table's first row.
        """
        thead = table.find('thead')
        if not thead:
            # Try first row of table
            first_row = table.find('tr')
            headers = first_row.find_all(['th', 'td']) if first_row else []
            return [h.text.strip() for h in headers]

        header_rows = thead.find_all('tr')
        if not header_rows:
            return []

        # Use last header row (most specific)
        last_row = header_rows[-1]
        return [th.text.strip() for th in last_row.find_all('th')]

    def _parse_body(self, table, expected_cols: int) -> List[List[str]]:
        """
        Return the data rows of the table as lists of stripped cell text.

        Rows that served as the header are excluded so they are not
        duplicated as data. `expected_cols` is accepted for interface
        compatibility and is not used to filter rows.
        """
        tbody = table.find('tbody')
        rows_container = tbody if tbody is not None else table

        # Without a <thead>, _parse_headers takes the table's first <tr> as
        # the header, so that exact row must not be emitted as data as well.
        header_row = table.find('tr') if table.find('thead') is None else None

        rows = []
        for row in rows_container.find_all('tr'):
            if row is header_row:
                continue

            # When there is no <tbody>, rows inside <thead> are reachable
            # from the table itself; they are headers, not data.
            if row.find_parent('thead') is not None:
                continue

            # Skip header rows in body
            if row.get('class') and any(c in str(row.get('class')) for c in ['thead', 'header']):
                continue

            cells = row.find_all(['td', 'th'])
            row_data = [cell.text.strip() for cell in cells]

            if row_data:
                rows.append(row_data)

        return rows

    def scrape_table(
        self,
        url: str,
        table_id: str = None,
        table_class: str = None,
        table_index: int = 0
    ) -> pd.DataFrame:
        """
        Fetch URL and parse table in one call.

        Arguments are forwarded to fetch_page and parse_table.
        """
        soup = self.fetch_page(url)
        return self.parse_table(soup, table_id, table_class, table_index)

# Example usage
# delay=3.0 makes the scraper wait at least 3 seconds between requests.
scraper = SportsTableScraper(delay=3.0)

# Scrape from any sports reference site
# (left commented out so that loading this snippet makes no network request)
# df = scraper.scrape_table(
#     "https://www.baseball-reference.com/leagues/majors/2024-standard-batting.shtml",
#     table_id="players_standard_batting"
# )
# print(df.head())

python Baseball

Scrape ESPN Scoreboard

Scrape live scores and game information from ESPN for multiple sports.

"""Scrape ESPN scoreboard for live scores."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from typing import List, Dict

def scrape_espn_scores(sport: str, date: str = None, timeout: float = 30.0) -> List[Dict]:
    """
    Scrape scores from ESPN scoreboard.

    Args:
        sport: Sport code ('mlb', 'nba', 'nfl', 'nhl', 'soccer')
        date: Date string YYYYMMDD, None for today
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        List of game dictionaries. Each may contain 'away_team',
        'home_team', 'away_score', 'home_score' and 'status' (all text);
        games without an away team name are omitted.
    """
    if date is None:
        date = datetime.now().strftime('%Y%m%d')

    url = f"https://www.espn.com/{sport}/scoreboard/_/date/{date}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    games = []

    # ESPN uses React, but some data is in the initial HTML
    # Look for scoreboard containers
    for game_section in soup.find_all('section', class_='Scoreboard'):
        game = {}
        try:
            # Teams: the first name is treated as away, the second as home.
            teams = game_section.find_all('div', class_='ScoreCell__TeamName')
            if len(teams) >= 2:
                game['away_team'] = teams[0].text.strip()
                game['home_team'] = teams[1].text.strip()

            # Scores, in the same order as the teams
            scores = game_section.find_all('div', class_='ScoreCell__Score')
            if len(scores) >= 2:
                game['away_score'] = scores[0].text.strip()
                game['home_score'] = scores[1].text.strip()

            # Status
            status = game_section.find('div', class_='ScoreCell__Time')
            if status:
                game['status'] = status.text.strip()
        except (AttributeError, TypeError):
            # Malformed section: skip it and keep the rest of the scoreboard.
            continue

        if game.get('away_team'):
            games.append(game)

    return games

def get_mlb_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN MLB scoreboard for `date` (YYYYMMDD, None for today) as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('mlb', date))

def get_nba_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN NBA scoreboard for `date` (YYYYMMDD, None for today) as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('nba', date))

def get_nfl_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN NFL scoreboard for `date` (YYYYMMDD, None for today) as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('nfl', date))

def get_nhl_scores(date: str = None) -> pd.DataFrame:
    """Return the ESPN NHL scoreboard for `date` (YYYYMMDD, None for today) as a DataFrame."""
    return pd.DataFrame(scrape_espn_scores('nhl', date))

# Example usage
# NOTE: each get_*_scores() call below runs at import time and performs a
# live HTTP request for today's scoreboard.
print("Today's MLB Scores:")
mlb_games = get_mlb_scores()
# An empty DataFrame is printed when no games were parsed from the page.
print(mlb_games)

print("\nToday's NBA Scores:")
nba_games = get_nba_scores()
print(nba_games)

r Baseball

Web Scraping in R with rvest

Scrape sports statistics tables using R and the rvest package.

# Web scraping sports stats with rvest
library(rvest)
library(dplyr)
library(stringr)

#' Read the first table matching a CSS selector from a web page
#'
#' @param url URL of the page to scrape
#' @param table_selector CSS selector identifying the table
#'   (defaults to the first <table> element on the page)
#' @return Data frame with the table contents; column names are made
#'   syntactically valid and unique
scrape_sports_table <- function(url, table_selector = "table") {
  page <- read_html(url)

  # Locate the first matching node, then convert it to a data frame
  node <- html_element(page, table_selector)
  result <- html_table(node, fill = TRUE)

  # Normalise column names so they can be used with `$` and in formulas
  names(result) <- make.names(names(result), unique = TRUE)

  result
}

#' Scrape Baseball Reference standard batting stats for one season
#'
#' @param year Season year
#' @return Data frame with batting statistics, or NULL (with a warning)
#'   when the page cannot be read or processed
scrape_bbref_batting <- function(year) {
  url <- paste0(
    "https://www.baseball-reference.com/leagues/majors/",
    year, "-standard-batting.shtml"
  )

  # Any failure is downgraded to a warning and a NULL result
  on_error <- function(e) {
    warning(paste("Error scraping:", e$message))
    NULL
  }

  tryCatch({
    # The main stats table
    raw <- html_table(html_element(read_html(url), "#players_standard_batting"))

    # Drop the header rows that are repeated inside the table body
    rows <- filter(raw, Rk != "Rk")

    # Counting stats arrive as text; convert them to numbers
    mutate(rows, across(c(G, AB, R, H, HR, RBI, BB, SO), as.numeric))
  }, error = on_error)
}

#' Scrape Pro Football Reference passing stats for one season
#'
#' @param year Season year
#' @return Data frame with passing statistics, or NULL (with a warning)
#'   when the page cannot be read or processed
scrape_pfr_passing <- function(year) {
  url <- paste0(
    "https://www.pro-football-reference.com/years/",
    year, "/passing.htm"
  )

  # Any failure is downgraded to a warning and a NULL result
  on_error <- function(e) {
    warning(paste("Error scraping:", e$message))
    NULL
  }

  tryCatch({
    raw <- html_table(html_element(read_html(url), "#passing"))

    # Drop the header rows that are repeated inside the table body
    rows <- filter(raw, Rk != "Rk")

    # Counting stats arrive as text; convert them to numbers
    mutate(rows, across(c(G, GS, Cmp, Att, Yds, TD, Int), as.numeric))
  }, error = on_error)
}

#' Scrape Basketball Reference per-game player stats for one season
#'
#' @param year Season year
#' @return Data frame with player statistics, or NULL (with a warning)
#'   when the page cannot be read or processed
scrape_bkref_players <- function(year) {
  url <- paste0(
    "https://www.basketball-reference.com/leagues/NBA_",
    year, "_per_game.html"
  )

  # Any failure is downgraded to a warning and a NULL result
  on_error <- function(e) {
    warning(paste("Error scraping:", e$message))
    NULL
  }

  tryCatch({
    raw <- html_table(html_element(read_html(url), "#per_game_stats"))

    # Drop the header rows that are repeated inside the table body
    rows <- filter(raw, Rk != "Rk")

    # Per-game stats arrive as text; convert them to numbers
    mutate(rows, across(c(G, GS, MP, PTS, TRB, AST), as.numeric))
  }, error = on_error)
}

# Example usage
# Be respectful with rate limiting: pause 3 seconds before any request.
# When scraping several pages, repeat the pause between calls.
Sys.sleep(3)

# Season-level scrape (commented out so sourcing this file makes no request)
# batting <- scrape_bbref_batting(2024)
# print(head(batting))

# Generic scraping example
# `url` is only defined here; the request itself is in the commented call below.
url <- "https://www.espn.com/mlb/standings"
# standings <- scrape_sports_table(url, "table.standings")
# print(standings)

python

Hypothesis Testing for Player Performance

Use t-tests to determine if performance differences between players or time periods are statistically significant.

import numpy as np
import pandas as pd
from scipy import stats

def compare_player_performance(player1_stats, player2_stats, metric, alpha=0.05):
    """
    Run an independent two-sample t-test on two players' samples.

    Prints both sample means, the t-statistic, the p-value and a verdict
    based on `alpha`; `metric` is used only as a label in the output.

    Returns:
        Tuple of (t_statistic, p_value)
    """
    t_stat, p_value = stats.ttest_ind(player1_stats, player2_stats)

    if p_value < alpha:
        verdict = f"  Result: Significant difference (p < {alpha})"
    else:
        verdict = f"  Result: No significant difference"

    report = [
        f"Comparing {metric}:",
        f"  Player 1 mean: {np.mean(player1_stats):.2f}",
        f"  Player 2 mean: {np.mean(player2_stats):.2f}",
        f"  t-statistic: {t_stat:.3f}",
        f"  p-value: {p_value:.4f}",
        verdict,
    ]
    for line in report:
        print(line)

    return t_stat, p_value

def before_after_analysis(before_stats, after_stats, metric, alpha=0.05):
    """
    Run a paired t-test on before/after samples (e.g., injury, trade).

    The two samples must be the same length, paired element by element.
    Prints both means, the change in mean and the p-value; `metric` is
    used only as a label. `alpha` is accepted but not used by this function.

    Returns:
        Tuple of (t_statistic, p_value)
    """
    t_stat, p_value = stats.ttest_rel(before_stats, after_stats)

    report = [
        f"Before/After Analysis - {metric}:",
        f"  Before mean: {np.mean(before_stats):.2f}",
        f"  After mean: {np.mean(after_stats):.2f}",
        f"  Change: {np.mean(after_stats) - np.mean(before_stats):+.2f}",
        f"  p-value: {p_value:.4f}",
    ]
    for line in report:
        print(line)

    return t_stat, p_value

# Example: Compare two players' scoring
# Fixed seed so the simulated samples are reproducible. All draws below
# come from the same random stream, so their order determines the values.
np.random.seed(42)
player1_pts = np.random.normal(25, 5, 50)  # 50 games
player2_pts = np.random.normal(22, 6, 50)

compare_player_performance(player1_pts, player2_pts, "Points Per Game")

# Visual separator between the two reports
print("\n" + "="*50 + "\n")

# Before/after trade
# Both samples have 30 entries; the paired test needs equal lengths.
before = np.random.normal(18, 4, 30)
after = np.random.normal(23, 5, 30)
before_after_analysis(before, after, "PPG")

Output Example:

Comparing Points Per Game:
  Player 1 mean: 24.52
  Player 2 mean: 22.15
  t-statistic: 2.124
  p-value: 0.0362
  Result: Significant difference (p < 0.05)

python

Regression Analysis for Performance Prediction

Perform multiple linear regression to identify which factors predict performance outcomes.

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_absolute_error

def regression_analysis(df, target, features):
    """
    Fit an ordinary least squares model of `target` on `features`.

    An intercept column is added to the predictors, the statsmodels
    summary is printed, and the fitted results object is returned.
    """
    # Predictors plus a constant column for the intercept
    design = sm.add_constant(df[features])

    fitted = sm.OLS(df[target], design).fit()
    print(fitted.summary())

    return fitted

def feature_importance_regression(model, feature_names):
    """
    Tabulate regression coefficients ordered by absolute size.

    The first entry of model.params / model.bse / model.pvalues is dropped
    (assumed to be the constant term). The reported coefficients are the
    model's raw coefficients, not standardized ones, so their sizes depend
    on each feature's scale.

    Returns:
        DataFrame with Feature, Coefficient, Std Error, p-value and a
        Significant flag (p-value below 0.05), largest |Coefficient| first.
    """
    coefficient, std_error, p_value = (
        series[1:] for series in (model.params, model.bse, model.pvalues)
    )

    table = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': coefficient,
        'Std Error': std_error,
        'p-value': p_value,
        'Significant': p_value < 0.05,
    })

    return table.sort_values('Coefficient', key=abs, ascending=False)

# Example: What predicts wins?
# Fixed seed so the simulated data set is reproducible.
np.random.seed(42)
n = 100
data = pd.DataFrame({
    'offensive_rating': np.random.uniform(105, 120, n),
    'defensive_rating': np.random.uniform(100, 115, n),
    'pace': np.random.uniform(95, 105, n),
    'turnovers': np.random.uniform(10, 18, n)
})
# Simulated target: a known linear combination of the predictors plus
# Gaussian noise, so the fitted coefficients can be compared with the
# true ones (0.5, -0.4, 0.1, -0.2).
data['wins'] = (0.5 * data['offensive_rating'] - 0.4 * data['defensive_rating'] +
                0.1 * data['pace'] - 0.2 * data['turnovers'] +
                np.random.normal(0, 3, n))

features = ['offensive_rating', 'defensive_rating', 'pace', 'turnovers']
model = regression_analysis(data, 'wins', features)

print("\nFeature Importance:")
print(feature_importance_regression(model, features))

Output Example:

OLS Regression Results
==============================================================================
R-squared:                       0.842
Adj. R-squared:                  0.835
                   coef    std err          t
offensive_rating   0.498      0.032     15.562
defensive_rating  -0.395      0.031    -12.742

python

Calculate Correlation Matrix

Calculate correlation matrices and identify strongly related variables in sports data.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def correlation_analysis(df, columns=None, method='pearson'):
    """
    Compute a correlation matrix and draw it as an annotated heatmap.

    Args:
        df: Source DataFrame
        columns: Columns to include; when empty or None, all numeric
            columns are used
        method: 'pearson', 'spearman' or 'kendall'

    Returns:
        Tuple of (correlation matrix DataFrame, matplotlib Figure)
    """
    df_subset = df[columns] if columns else df.select_dtypes(include=[np.number])
    corr_matrix = df_subset.corr(method=method)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Diverging palette centred on zero, spanning the full [-1, 1] range
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='RdBu_r',
        center=0,
        vmin=-1,
        vmax=1,
        ax=ax,
    )
    sns.heatmap(corr_matrix, **heatmap_options)

    ax.set_title(f'{method.capitalize()} Correlation Matrix')
    plt.tight_layout()

    return corr_matrix, fig

def find_strong_correlations(corr_matrix, threshold=0.7):
    """
    Find variable pairs whose absolute correlation meets a threshold.

    Args:
        corr_matrix: Square correlation matrix as a DataFrame
        threshold: Minimum |correlation| for a pair to be reported

    Returns:
        DataFrame with columns var1, var2 and correlation (rounded to 3
        decimals), strongest pairs first. Each unordered pair appears once.
        When no pair qualifies, an empty DataFrame with the same columns
        is returned.
    """
    names = corr_matrix.columns
    strong = []
    # Walk the upper triangle only, so each pair is considered once and
    # the diagonal (self-correlation) is skipped.
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            corr = corr_matrix.iloc[i, j]
            if abs(corr) >= threshold:
                strong.append({
                    'var1': names[i],
                    'var2': names[j],
                    'correlation': round(corr, 3)
                })

    # Naming the columns explicitly keeps the sort below valid when
    # `strong` is empty; a column-less frame would raise KeyError.
    result = pd.DataFrame(strong, columns=['var1', 'var2', 'correlation'])
    return result.sort_values('correlation', key=abs, ascending=False)

# Example
# Fixed seed so the simulated stats are reproducible.
np.random.seed(42)
# NOTE(review): the name `stats` here is a DataFrame; if this snippet is
# pasted into a module that also does `from scipy import stats`, this
# assignment would shadow that import.
stats = pd.DataFrame({
    'points': np.random.uniform(10, 30, 100),
    'minutes': np.random.uniform(20, 40, 100),
    'fg_attempts': np.random.uniform(8, 22, 100),
    'assists': np.random.uniform(2, 10, 100),
    'turnovers': np.random.uniform(1, 5, 100)
})
# Add correlations
# (points becomes partly determined by minutes and field-goal attempts)
stats['points'] = stats['points'] + 0.5 * stats['minutes'] + 0.3 * stats['fg_attempts']

corr, fig = correlation_analysis(stats)
strong_corrs = find_strong_correlations(corr, threshold=0.5)
print("Strong correlations (|r| >= 0.5):")
print(strong_corrs)

Output Example:

Strong correlations (|r| >= 0.5):
         var1         var2  correlation
0      points      minutes        0.723
1      points  fg_attempts        0.581

python

Bayesian Estimation for Player True Talent

Use Bayesian estimation to regress small sample statistics toward population mean.

import numpy as np
from scipy import stats

def bayesian_batting_average(hits, at_bats, prior_mean=0.260, prior_std=0.030):
    """
    Estimate player true batting average using Bayesian shrinkage.

    Uses Beta-Binomial conjugate prior: the prior mean/std are converted
    to Beta(alpha, beta) by moment matching, and the observed hits and
    outs are added to alpha and beta respectively.

    Args:
        hits: Number of hits observed
        at_bats: Number of at-bats observed
        prior_mean: Mean of the prior batting-average distribution
        prior_std: Standard deviation of the prior distribution

    Returns:
        Dict with 'raw_avg', 'estimated_true_avg' (posterior mean),
        'posterior_std', '95_ci' (equal-tailed 95% credible interval) and
        'shrinkage' (raw average minus posterior mean), rounded to 3 decimals.

    Raises:
        ValueError: If the counts are negative or hits exceeds at_bats
    """
    # Impossible counts would otherwise yield a meaningless (possibly NaN)
    # posterior without any error.
    if hits < 0 or at_bats < 0 or hits > at_bats:
        raise ValueError("hits and at_bats must satisfy 0 <= hits <= at_bats")

    # Convert prior mean/std to Beta parameters (method of moments):
    # alpha + beta = mean * (1 - mean) / var - 1
    prior_var = prior_std ** 2
    prior_strength = prior_mean * (1 - prior_mean) / prior_var - 1
    prior_alpha = prior_mean * prior_strength
    prior_beta = (1 - prior_mean) * prior_strength

    # Posterior parameters
    post_alpha = prior_alpha + hits
    post_beta = prior_beta + (at_bats - hits)
    post_total = post_alpha + post_beta

    # Posterior estimates
    post_mean = post_alpha / post_total
    post_std = np.sqrt((post_alpha * post_beta) /
                       (post_total**2 * (post_total + 1)))

    # 95% credible interval
    ci_low = stats.beta.ppf(0.025, post_alpha, post_beta)
    ci_high = stats.beta.ppf(0.975, post_alpha, post_beta)

    # Raw average (defined as 0 when there are no at-bats)
    raw_avg = hits / at_bats if at_bats > 0 else 0

    return {
        'raw_avg': round(raw_avg, 3),
        'estimated_true_avg': round(post_mean, 3),
        'posterior_std': round(post_std, 3),
        '95_ci': (round(ci_low, 3), round(ci_high, 3)),
        'shrinkage': round(raw_avg - post_mean, 3)
    }

# Example: Early season stats
# Two 40-at-bat samples and one 550-at-bat sample, to contrast how the
# estimate for each is pulled toward the prior mean.
players = [
    {'name': 'Hot Start', 'hits': 15, 'ab': 40},   # .375 in 40 AB
    {'name': 'Slow Start', 'hits': 6, 'ab': 40},   # .150 in 40 AB
    {'name': 'Full Season', 'hits': 170, 'ab': 550} # .309 in 550 AB
]

print("Bayesian Batting Average Estimates\n")
for p in players:
    # Default prior is used (mean 0.260, std 0.030)
    result = bayesian_batting_average(p['hits'], p['ab'])
    print(f"{p['name']}:")
    print(f"  Raw: {result['raw_avg']:.3f}, Estimated: {result['estimated_true_avg']:.3f}")
    print(f"  95% CI: {result['95_ci']}")
    # Positive shrinkage means the raw average was above the estimate
    print(f"  Shrinkage: {result['shrinkage']}\n")

Output Example:

Bayesian Batting Average Estimates

Hot Start:
  Raw: 0.375, Estimated: 0.312
  95% CI: (0.238, 0.392)
  Shrinkage: 0.063

Slow Start:
  Raw: 0.150, Estimated: 0.214
  95% CI: (0.148, 0.289)
  Shrinkage: -0.064

ANOVA for Group Comparisons

Use ANOVA to test if there are significant differences between groups (e.g., positions, teams).

library(dplyr)
library(broom)

# One-way ANOVA: Compare performance across groups (e.g., positions)
#
# data:      data frame holding the measurements
# value_col: name of the numeric response column (string)
# group_col: name of the grouping column (string)
#
# Prints the ANOVA table and the Tukey HSD pairwise comparisons, then
# returns the fitted aov model.
perform_anova <- function(data, value_col, group_col) {
  # TukeyHSD() only considers factor terms. A character grouping column
  # (what data.frame() produces for strings since R 4.0) would make it fail
  # with "no factors in the fitted model", so convert it here. This only
  # changes the local copy of `data`.
  data[[group_col]] <- as.factor(data[[group_col]])

  formula <- as.formula(paste(value_col, "~", group_col))
  model <- aov(formula, data = data)

  # Summary
  cat("ANOVA Results:\n")
  print(summary(model))

  # Post-hoc Tukey test
  cat("\nTukey HSD Post-hoc Test:\n")
  tukey <- TukeyHSD(model)
  print(tukey)

  return(model)
}

# Effect size (eta-squared): the share of the total sum of squares that is
# attributed to the first term of the model. Prints the value rounded to
# 3 decimals and returns the unrounded value.
calculate_eta_squared <- function(aov_model) {
  anova_table <- summary(aov_model)[[1]]
  sum_squares <- anova_table[["Sum Sq"]]

  # First entry is the model term; the sum also includes the residuals
  eta_sq <- sum_squares[1] / sum(sum_squares)

  cat("\nEffect Size (eta-squared):", round(eta_sq, 3))
  eta_sq
}

# Example: Compare scoring by position
# Fixed seed so the simulated data set is reproducible.
set.seed(42)
# 30 simulated players per position, with a different mean ppg per group.
# The order of the rnorm() calls must match the order of the position labels.
players <- data.frame(
  position = rep(c("Guard", "Forward", "Center"), each = 30),
  ppg = c(
    rnorm(30, 18, 5),   # Guards
    rnorm(30, 15, 4),   # Forwards
    rnorm(30, 12, 4)    # Centers
  )
)

# Fit the model (prints the ANOVA table and Tukey comparisons), then
# report the effect size.
model <- perform_anova(players, "ppg", "position")
calculate_eta_squared(model)

Output Example:

ANOVA Results:
            Df Sum Sq Mean Sq F value Pr(>F)
position     2  582.3   291.2   14.53  <0.001 ***
Residuals   87 1743.5    20.0

Effect Size (eta-squared): 0.250

python

Calculate Percentiles and Rankings

Calculate percentile rankings and create composite player rankings from multiple metrics.

import pandas as pd
import numpy as np
from scipy import stats

def calculate_percentiles(df, columns, method='rank'):
    """
    Calculate percentile rankings for multiple columns.

    Methods:
    - 'rank': Rank-based percentile
    - 'distribution': Assuming normal distribution (z-score through the
      normal CDF)

    Args:
        df: Source DataFrame (not modified)
        columns: Columns to compute percentiles for
        method: 'rank' or 'distribution'

    Returns:
        Copy of `df` with one added '<column>_pctl' column (0-100 scale)
        per requested column.

    Raises:
        ValueError: If `method` is not one of the supported names
    """
    # Reject unknown methods up front; otherwise the loop below would add
    # no columns and the caller would get an unchanged copy with no error.
    if method not in ('rank', 'distribution'):
        raise ValueError(
            f"Unknown method {method!r}; expected 'rank' or 'distribution'"
        )

    result = df.copy()

    for col in columns:
        if method == 'rank':
            result[f'{col}_pctl'] = df[col].rank(pct=True) * 100
        else:
            z_scores = stats.zscore(df[col])
            result[f'{col}_pctl'] = stats.norm.cdf(z_scores) * 100

    return result

def create_player_rankings(df, metrics, weights=None, ascending=None):
    """
    Create composite player rankings.

    Args:
        df: DataFrame with player stats (not modified)
        metrics: List of metric columns
        weights: Optional dict of weights (default equal); must contain
            every metric when given
        ascending: Dict of {metric: bool} for direction; True means lower
            values are better. Metrics not listed are treated as
            higher-is-better.

    Returns:
        Copy of `df` with '<metric>_pctl' columns, a weighted
        'composite_score' and an integer 'overall_rank' (1 = best),
        sorted by overall_rank. Tied scores share the best rank of the
        tie; rows whose score is NaN are ranked last.
    """
    if weights is None:
        weights = {m: 1/len(metrics) for m in metrics}
    if ascending is None:
        ascending = {}  # Higher is better default

    # Calculate percentiles
    df_pctl = df.copy()
    for metric in metrics:
        lower_is_better = ascending.get(metric, False)
        # Rank in the direction that gives the best value the highest
        # percentile. Ranking descending for lower-is-better metrics keeps
        # every metric on the same 100/n..100 scale, where computing
        # "100 - percentile" would shift that metric to 0..100-100/n.
        df_pctl[f'{metric}_pctl'] = (
            df[metric].rank(pct=True, ascending=not lower_is_better) * 100
        )

    # Calculate weighted composite score
    df_pctl['composite_score'] = sum(
        df_pctl[f'{m}_pctl'] * weights[m] for m in metrics
    )

    # Overall rank. method='min' yields whole-number ranks for ties, and
    # na_option='bottom' ranks NaN scores last so the int cast cannot fail.
    df_pctl['overall_rank'] = (
        df_pctl['composite_score']
        .rank(ascending=False, method='min', na_option='bottom')
        .astype(int)
    )

    return df_pctl.sort_values('overall_rank')

# Example
# Fixed seed so the simulated player pool is reproducible.
np.random.seed(42)
players = pd.DataFrame({
    'player': [f'Player_{i}' for i in range(50)],
    'points': np.random.uniform(10, 28, 50),
    'rebounds': np.random.uniform(3, 12, 50),
    'assists': np.random.uniform(2, 10, 50),
    'turnovers': np.random.uniform(1, 5, 50)
})

# Rank players (turnovers: lower is better)
# The weights cover every metric and sum to 1.0.
ranked = create_player_rankings(
    players,
    metrics=['points', 'rebounds', 'assists', 'turnovers'],
    weights={'points': 0.4, 'rebounds': 0.2, 'assists': 0.25, 'turnovers': 0.15},
    ascending={'turnovers': True}  # Lower turnovers is better
)

print("Top 10 Players:")
print(ranked[['player', 'points', 'rebounds', 'assists', 'composite_score', 'overall_rank']].head(10))

Output Example:

Top 10 Players:
       player  points  rebounds  assists  composite_score  overall_rank
23  Player_23   26.85      9.23     8.95            89.42             1
8   Player_8    24.12     10.82     7.34            85.18             2

python

Time Series Decomposition

Decompose time series data to separate trend, seasonal patterns, and noise in performance.

import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

def decompose_performance(data, period=10, model='additive'):
    """
    Split a performance series into trend, seasonal, and residual parts.

    Helps separate genuine improvement from noise and surfaces repeating
    patterns (e.g., home/away or schedule effects).

    Args:
        data: Series of observations; if a DataFrame is given, only its
            first column is used.
        period: Number of observations in one seasonal cycle.
        model: 'additive' or 'multiplicative', passed to seasonal_decompose.

    Returns:
        Tuple of (decomposition result, matplotlib Figure with four stacked
        panels: observed, trend, seasonal, residual).
    """
    series = data.iloc[:, 0] if isinstance(data, pd.DataFrame) else data
    decomposition = seasonal_decompose(series, model=model, period=period)

    fig, axes = plt.subplots(4, 1, figsize=(12, 10))

    # One panel per component, top to bottom.
    panels = (
        ('Observed', decomposition.observed),
        ('Trend', decomposition.trend),
        ('Seasonal', decomposition.seasonal),
        ('Residual', decomposition.resid),
    )
    for axis, (title, component) in zip(axes, panels):
        component.plot(ax=axis, title=title)

    plt.tight_layout()

    return decomposition, fig

def detect_trend(data, window=20):
    """
    Detect if performance is trending up or down.

    Smooths ``data`` with a rolling mean over ``window`` observations and
    fits a straight line to the smoothed values.

    Args:
        data: pandas Series of per-game values in chronological order.
        window: Number of observations in the rolling-mean window.

    Returns:
        Dict with:
        - 'direction': 'increasing', 'decreasing', or 'flat' when the
          smoothed series has no slope.
        - 'slope': fitted change per game, rounded to 4 places.
        - 'strength': |slope| divided by the standard deviation of the
          smoothed series, rounded to 3 places (0.0 when flat).
        - 'interpretation': human-readable summary.

    Raises:
        ValueError: If fewer than two smoothed points are available, which
            happens when ``data`` is not longer than ``window``.
    """
    smoothed = data.rolling(window=window).mean().dropna()
    if len(smoothed) < 2:
        raise ValueError(
            f"Need more than {window} observations to fit a trend "
            f"with window={window}; got {len(data)}"
        )

    # Linear regression on rolling mean
    y = smoothed.values
    x = np.arange(len(y))
    slope, _intercept = np.polyfit(x, y, 1)

    spread = np.std(y)
    if spread == 0:
        # A constant series can still yield a tiny non-zero slope from
        # floating-point noise in the fit; treat it as exactly flat.
        slope = 0.0

    if slope == 0:
        trend = 'flat'
        strength = 0.0  # avoids 0/0 when the smoothed series is constant
    else:
        trend = 'increasing' if slope > 0 else 'decreasing'
        strength = abs(slope) / spread

    return {
        'direction': trend,
        'slope': round(slope, 4),
        'strength': round(strength, 3),
        'interpretation': f"Performance is {trend} at {abs(slope):.3f} per game"
    }

# Example: Season-long performance
# Synthetic 82-game series = constant base + linear trend + sine wave + Gaussian noise.
np.random.seed(42)
games = 82
base = 20
trend = np.linspace(0, 3, games)  # Slight improvement
# The sine term spans 8*pi radians, i.e. four full cycles over the 82 games.
seasonal = 2 * np.sin(np.linspace(0, 8*np.pi, games))  # Home/away pattern
noise = np.random.normal(0, 3, games)

performance = pd.Series(base + trend + seasonal + noise)

# NOTE(review): the simulated cycle is roughly 20 games long while period=10 is
# passed here — confirm the intended seasonal period.
result, fig = decompose_performance(performance, period=10)
trend_info = detect_trend(performance)

print("Trend Analysis:")
print(f"  Direction: {trend_info['direction']}")
print(f"  {trend_info['interpretation']}")

Output Example:

Trend Analysis:
  Direction: increasing
  Performance is increasing at 0.041 per game

python

Bootstrap Confidence Intervals

Use bootstrap resampling to calculate confidence intervals for any statistic.

import numpy as np
import pandas as pd

def bootstrap_mean_ci(data, n_bootstrap=10000, ci=0.95):
    """
    Percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D array-like of observations.
        n_bootstrap: Number of resamples drawn with replacement.
        ci: Confidence level as a fraction (0.95 -> 95% interval).

    Returns:
        Dict with the sample 'mean', the interval bounds 'ci_low' and
        'ci_high', and their difference 'ci_width'.
    """
    sample_size = len(data)

    # Each resample has the same size as the input and is drawn with
    # replacement from NumPy's global RNG.
    resampled_means = [
        np.mean(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(n_bootstrap)
    ]

    tail = (1 - ci) / 2
    lower_bound = np.percentile(resampled_means, tail * 100)
    upper_bound = np.percentile(resampled_means, (1 - tail) * 100)

    summary = {'mean': np.mean(data)}
    summary['ci_low'] = lower_bound
    summary['ci_high'] = upper_bound
    summary['ci_width'] = upper_bound - lower_bound
    return summary

def bootstrap_statistic(data, statistic_func, n_bootstrap=10000, ci=0.95):
    """
    Bootstrap CI for any statistic (median, percentile, correlation, etc.)

    Args:
        data: Either a 1-D array-like, or a tuple of equal-length array-likes
            whose rows are resampled together (e.g. for a correlation).
        statistic_func: Callable applied to each resample. It receives a 1-D
            array, or a tuple of arrays when ``data`` is a tuple.
        n_bootstrap: Number of resamples drawn with replacement.
        ci: Confidence level as a fraction (0.95 -> 95% interval).

    Returns:
        Dict with 'estimate' (statistic on the original data) and the
        percentile interval bounds 'ci_low' and 'ci_high'.

    Raises:
        ValueError: If the arrays in a tuple ``data`` differ in length.
    """
    paired = isinstance(data, tuple)
    if paired:
        # Convert once so positional indexing works for lists and for
        # pandas Series whose index is not 0..n-1.
        columns = tuple(np.asarray(d) for d in data)
        n = len(columns[0])
        if any(len(column) != n for column in columns):
            raise ValueError("All arrays in `data` must have the same length")
    else:
        n = len(data)

    boot_stats = []
    for _ in range(n_bootstrap):
        if paired:
            # Draw row positions once so observations stay paired.
            idx = np.random.choice(n, size=n, replace=True)
            sample = tuple(column[idx] for column in columns)
        else:
            sample = np.random.choice(data, size=n, replace=True)

        boot_stats.append(statistic_func(sample))

    alpha = (1 - ci) / 2
    return {
        'estimate': statistic_func(data),
        'ci_low': np.percentile(boot_stats, alpha * 100),
        'ci_high': np.percentile(boot_stats, (1 - alpha) * 100)
    }

# Example: Confidence intervals for player stats
# Seeding fixes the simulated data and, because the bootstrap helpers draw from
# NumPy's global RNG, the resamples as well.
np.random.seed(42)
player_pts = np.random.normal(22, 6, 50)  # 50 games

# Mean CI
mean_ci = bootstrap_mean_ci(player_pts)
print(f"Points Per Game:")
print(f"  Mean: {mean_ci['mean']:.2f}")
print(f"  95% CI: [{mean_ci['ci_low']:.2f}, {mean_ci['ci_high']:.2f}]")

# Median CI
median_ci = bootstrap_statistic(player_pts, np.median)
print(f"\nMedian PPG:")
print(f"  Estimate: {median_ci['estimate']:.2f}")
print(f"  95% CI: [{median_ci['ci_low']:.2f}, {median_ci['ci_high']:.2f}]")

# Correlation CI
# Assists are simulated as a linear function of points plus noise, so the two
# series are correlated by construction.
assists = player_pts * 0.3 + np.random.normal(5, 2, 50)
def corr_func(data):
    """Return the correlation coefficient between data[0] and data[1]."""
    return np.corrcoef(data[0], data[1])[0, 1]

# Passing a tuple makes bootstrap_statistic resample both arrays with the same indices.
corr_ci = bootstrap_statistic((player_pts, assists), corr_func)
print(f"\nPoints-Assists Correlation:")
print(f"  r = {corr_ci['estimate']:.3f}")
print(f"  95% CI: [{corr_ci['ci_low']:.3f}, {corr_ci['ci_high']:.3f}]")

Output Example:

Points Per Game:
  Mean: 22.14
  95% CI: [20.42, 23.89]

Median PPG:
  Estimate: 21.87
  95% CI: [20.08, 24.12]

Points-Assists Correlation:
  r = 0.724
  95% CI: [0.562, 0.841]

Mixed Effects Model for Repeated Measures

Fit mixed effects models to analyze repeated measurements (multiple games per player) while accounting for player-level variation.

library(lme4)
library(lmerTest)
library(dplyr)

# Mixed effects model for player performance
# Accounts for repeated measurements (games) within players

fit_mixed_model <- function(data) {
  # Fits points on minutes, home_game and rest_days with a random intercept
  # per player_id, prints the model summary, the variance components and the
  # ICC, and returns the fitted lmer model.
  fit <- lmer(
    points ~ minutes + home_game + rest_days + (1 | player_id),
    data = data
  )

  cat("Mixed Effects Model Summary:\n")
  print(summary(fit))

  # First entry is reported as the between-player variance, second as the
  # residual variance.
  variances <- as.data.frame(VarCorr(fit))$vcov
  between_var <- variances[1]
  within_var <- variances[2]

  cat("\nVariance Components:\n")
  cat("  Between-player variance:", round(between_var, 2), "\n")
  cat("  Within-player (residual) variance:", round(within_var, 2), "\n")

  # ICC: share of total variance attributable to between-player differences
  icc <- between_var / sum(variances)
  cat("  ICC:", round(icc, 3), "\n")

  return(fit)
}

# Example data
# Simulates 40 games for each of 20 players with a player-specific baseline,
# then fits the mixed model.
set.seed(42)
n_players <- 20
games_per_player <- 40

data <- expand.grid(
  player_id = 1:n_players,
  game = 1:games_per_player
) %>%
  mutate(
    # Player-specific baseline. expand.grid varies player_id fastest, so the
    # ability vector is indexed by player_id to keep every row of a player on
    # the same baseline (rep(..., each = ) would assign baselines to blocks of
    # consecutive rows that mix all players).
    player_ability = rnorm(n_players, 20, 5)[player_id],
    minutes = round(runif(n(), 20, 38)),
    home_game = sample(0:1, n(), replace = TRUE),
    rest_days = sample(1:4, n(), replace = TRUE),
    # Performance with player random effect
    points = player_ability + 0.5 * minutes + 2 * home_game +
             0.5 * rest_days + rnorm(n(), 0, 4)
  )

model <- fit_mixed_model(data)

Output Example:

Mixed Effects Model Summary:
Fixed effects:
             Estimate Std. Error t value
(Intercept)    8.234      1.152   7.147
minutes        0.498      0.028  17.786
home_game      1.982      0.318   6.233

Variance Components:
  Between-player variance: 24.12
  Within-player (residual) variance: 15.89
  ICC: 0.603

python

Handle Missing Values in Sports Data

Implement smart missing value handling that chooses appropriate strategies based on column type.

import pandas as pd
import numpy as np

def handle_missing_values(df, strategy='smart'):
    """
    Handle missing values in sports statistics.

    Strategies:
    - 'smart': Use appropriate method based on column type
    - 'drop': Drop rows with missing values
    - 'fill_zero': Fill with zeros (for counting stats)

    Under 'smart', columns with more than 50% missing values are reported
    with a printed warning and left untouched. Numeric columns whose name
    ends in '_pct', '_rate' or '_avg' are filled with the column median,
    other numeric columns with 0, and non-numeric columns with the most
    frequent value.

    Args:
        df: Input DataFrame; it is not modified.
        strategy: One of the strategies above. Any other value returns an
            unmodified copy.

    Returns:
        A new DataFrame with missing values handled.
    """
    df_clean = df.copy()

    if strategy == 'smart':
        n_rows = len(df_clean)
        for col in df_clean.columns:
            n_missing = df_clean[col].isna().sum()
            if n_missing == 0:
                # Nothing to fill; also avoids 0/0 on an empty frame.
                continue

            missing_pct = n_missing / n_rows * 100
            if missing_pct > 50:
                print(f"Warning: {col} has {missing_pct:.1f}% missing")
                continue

            if pd.api.types.is_numeric_dtype(df_clean[col]):
                # Numeric: use median for stats, 0 for counting
                if str(col).endswith(('_pct', '_rate', '_avg')):
                    fill_value = df_clean[col].median()
                else:
                    fill_value = 0
            else:
                # Categorical: use mode
                fill_value = df_clean[col].mode()[0]

            # Assign the result back: calling fillna(inplace=True) on a
            # column selection does not reliably update the parent frame.
            df_clean[col] = df_clean[col].fillna(fill_value)

    elif strategy == 'drop':
        df_clean = df_clean.dropna()

    elif strategy == 'fill_zero':
        df_clean = df_clean.fillna(0)

    return df_clean

# Example
# Small frame with gaps in a counting stat (points), a rate stat (fg_pct)
# and a text column (team) to exercise each 'smart' branch.
df = pd.DataFrame({
    'player': ['A', 'B', 'C', 'D', 'E'],
    'points': [20, np.nan, 15, 25, np.nan],
    'fg_pct': [0.45, 0.52, np.nan, 0.48, 0.50],
    'team': ['LAL', np.nan, 'BOS', 'MIA', 'GSW']
})

print("Before:")
print(df)
print("\nAfter smart cleaning:")
print(handle_missing_values(df, 'smart'))

Output Example:

Before:
  player  points  fg_pct team
0      A    20.0    0.45  LAL
1      B     NaN    0.52  NaN

After smart cleaning:
  player  points  fg_pct team
0      A    20.0    0.45  LAL
1      B     0.0    0.52  BOS

python

Standardize Player Names

Standardize player names across different formats and find potential duplicates.

import re
import pandas as pd
from difflib import SequenceMatcher

def standardize_name(name):
    """
    Standardize player name format.

    Handles: "Last, First" -> "First Last" (a third comma-separated part such
    as a suffix is kept at the end), stray punctuation and digits, repeated
    whitespace, capitalization, and the suffixes Jr/Sr/II/III/IV.

    Letters outside ASCII (e.g. "Dončić") and apostrophes ("O'Neal") are
    preserved.

    Args:
        name: Raw name string, or a missing value.

    Returns:
        The standardized name; missing values are returned unchanged.
    """
    if pd.isna(name):
        return name

    # Handle "Last, First" format (and "Last, First, Suffix")
    if ',' in name:
        last, first, *rest = (part.strip() for part in name.split(','))
        name = ' '.join([first, last, *rest])

    # Remove special characters: keep letters (any script), whitespace,
    # periods, apostrophes and hyphens; drop digits and underscores.
    name = re.sub(r"[^\w\s.'\-]|[\d_]", '', name)

    # Standardize spacing
    name = ' '.join(name.split())

    # Capitalize properly
    name = name.title()

    # Fix common suffixes. Word boundaries keep the replacements from
    # touching letters inside an ordinary name.
    name = re.sub(r'\b(Jr|Sr)\.', r'\1', name)
    name = re.sub(r'\b(Ii|Iii|Iv)\b', lambda match: match.group(0).upper(), name)

    return name

def find_similar_names(name, name_list, threshold=0.85):
    """Return (candidate, similarity) pairs scoring at least ``threshold``.

    Similarity is the case-insensitive SequenceMatcher ratio between ``name``
    and each entry of ``name_list``; results are ordered best match first.
    """
    scored = (
        (candidate, SequenceMatcher(None, name.lower(), candidate.lower()).ratio())
        for candidate in name_list
    )
    close_matches = [pair for pair in scored if pair[1] >= threshold]
    close_matches.sort(key=lambda pair: pair[1], reverse=True)
    return close_matches

def create_name_mapping(df, name_col, reference_names):
    """Create mapping from messy names to standardized names.

    Each distinct value in ``df[name_col]`` is standardized and then matched
    against ``reference_names``. The best reference match is used when one
    clears the similarity threshold; otherwise the standardized name itself.
    Missing names map to themselves.

    Args:
        df: DataFrame containing the raw names.
        name_col: Column in ``df`` holding the names.
        reference_names: Iterable of canonical name strings.

    Returns:
        Dict mapping each raw name to its resolved name.
    """
    mapping = {}
    for name in df[name_col].unique():
        std_name = standardize_name(name)
        if pd.isna(std_name):
            # A missing name has nothing to compare; fuzzy matching would
            # fail on it.
            mapping[name] = std_name
            continue

        matches = find_similar_names(std_name, reference_names)
        mapping[name] = matches[0][0] if matches else std_name
    return mapping

# Example
# The same players written in several common formats, including
# "Last, First" order, mixed case, and a trailing suffix.
names = pd.Series([
    'James, LeBron',
    'LeBron James',
    'lebron james',
    'JAMES LEBRON',
    'Stephen Curry Jr.',
    'curry, stephen'
])

print("Standardized names:")
for name in names:
    print(f"  {name} -> {standardize_name(name)}")

Output Example:

Standardized names:
  James, LeBron -> Lebron James
  LeBron James -> Lebron James
  lebron james -> Lebron James
  JAMES LEBRON -> James Lebron
  Stephen Curry Jr. -> Stephen Curry Jr
  curry, stephen -> Stephen Curry

python

Detect and Handle Outliers

Detect and handle statistical outliers in sports data using IQR and Z-score methods.

import pandas as pd
import numpy as np
from scipy import stats

def detect_outliers(df, cols, method='iqr', threshold=1.5):
    """
    Detect outliers using IQR or Z-score method.

    Args:
        df: DataFrame holding the columns to check.
        cols: Column names to evaluate.
        method: 'iqr' flags values more than ``threshold`` * IQR outside the
            quartiles; 'zscore' flags values whose absolute z-score exceeds
            ``threshold``.
        threshold: IQR multiplier or z-score cutoff, depending on ``method``.

    Returns:
        Boolean DataFrame indexed like ``df`` with one column per entry in
        ``cols``; True marks an outlier. Missing values are never flagged.

    Raises:
        ValueError: If ``method`` is not 'iqr' or 'zscore'.
    """
    if method not in ('iqr', 'zscore'):
        raise ValueError(f"Unknown method {method!r}; expected 'iqr' or 'zscore'")

    outliers = pd.DataFrame(index=df.index)

    for col in cols:
        if method == 'iqr':
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - threshold * IQR
            upper = Q3 + threshold * IQR
            outliers[col] = (df[col] < lower) | (df[col] > upper)

        else:
            values = df[col].to_numpy(dtype=float)
            present = ~np.isnan(values)
            # Start from all-False so rows with missing values come out as
            # False instead of NaN.
            flags = np.zeros(len(values), dtype=bool)
            if present.any():
                flags[present] = np.abs(stats.zscore(values[present])) > threshold
            outliers[col] = flags

    return outliers

def handle_outliers(df, cols, method='cap', lower_pct=0.01, upper_pct=0.99):
    """
    Handle outliers by capping or removal.

    Methods:
    - 'cap': Cap at percentiles
    - 'remove': Remove outlier rows
    - 'log': Log transform (for right-skewed)

    Args:
        df: Input DataFrame; it is not modified.
        cols: Columns to process.
        method: One of the methods above. Any other value returns an
            unmodified copy.
        lower_pct: Lower quantile used by 'cap'.
        upper_pct: Upper quantile used by 'cap'.

    Returns:
        A new DataFrame with the chosen treatment applied.
    """
    df_clean = df.copy()
    # Positional keep-mask over the original rows, used by 'remove'.
    keep = np.ones(len(df), dtype=bool)

    for col in cols:
        if method == 'cap':
            lower = df[col].quantile(lower_pct)
            upper = df[col].quantile(upper_pct)
            df_clean[col] = df_clean[col].clip(lower, upper)

        elif method == 'remove':
            # Outliers are detected on the original data for every column.
            # Accumulating one positional mask avoids filtering an
            # already-filtered frame with a mask built on the full index.
            outliers = detect_outliers(df[[col]], [col])
            keep &= ~outliers[col].to_numpy(dtype=bool)

        elif method == 'log':
            # log1p computes log(1 + x), so zeros map to zero
            df_clean[col] = np.log1p(df_clean[col])

    if method == 'remove':
        df_clean = df_clean[keep]

    return df_clean

# Example with sports data
# 95 simulated scoring averages plus five implausibly high values appended
# at the end to act as outliers.
np.random.seed(42)
df = pd.DataFrame({
    'player': [f'Player_{i}' for i in range(100)],
    'points': np.concatenate([np.random.normal(15, 5, 95), [80, 85, 90, 95, 100]]),  # Outliers
    'rebounds': np.random.normal(7, 2, 100)
})

print("Original stats:")
print(df['points'].describe())

# Detect outliers
outliers = detect_outliers(df, ['points'], method='iqr')
print(f"\nOutliers detected: {outliers['points'].sum()}")

# Handle by capping
# Uses the default 1st/99th percentile bounds of handle_outliers.
df_capped = handle_outliers(df, ['points'], method='cap')
print("\nAfter capping:")
print(df_capped['points'].describe())

Output Example:

Original stats:
count    100.000
mean      17.425
max      100.000

Outliers detected: 5

After capping:
count    100.000
mean      15.892
max       26.123

python

Merge Data from Multiple Sources

Merge player data from multiple sources and resolve conflicting values.

import pandas as pd

def merge_player_data(primary_df, secondary_df, join_cols, how='left',
                       suffix=('_primary', '_secondary')):
    """
    Join two player tables on ``join_cols``.

    Columns present in both inputs (other than the join keys) are kept side
    by side and tagged with the two entries of ``suffix``.

    Args:
        primary_df: Left-hand DataFrame.
        secondary_df: Right-hand DataFrame.
        join_cols: Column name or list of names to join on.
        how: Join type passed to pandas ('left' by default).
        suffix: Pair of suffixes for overlapping column names.

    Returns:
        The merged DataFrame.
    """
    return primary_df.merge(
        secondary_df,
        on=join_cols,
        how=how,
        suffixes=suffix,
    )

def resolve_conflicts(df, primary_suffix='_primary', secondary_suffix='_secondary',
                      strategy='primary'):
    """
    Resolve conflicting values from merged data.

    For every column ending in ``primary_suffix`` that has a counterpart
    ending in ``secondary_suffix``, a single column without the suffix is
    written and the two source columns are dropped.

    Strategies:
    - 'primary': Keep primary source
    - 'secondary': Keep secondary source
    - 'average': Average numeric values (row-wise, ignoring missing values);
      non-numeric pairs use the primary value, falling back to the secondary
    - 'non_null': Use non-null value (primary first, then secondary)

    Args:
        df: Merged DataFrame; it is not modified.
        primary_suffix: Suffix marking primary-source columns.
        secondary_suffix: Suffix marking secondary-source columns.
        strategy: One of the strategies above.

    Returns:
        A new DataFrame with conflicts resolved.

    Raises:
        ValueError: If ``strategy`` is not one of the supported values.
    """
    valid_strategies = ('primary', 'secondary', 'average', 'non_null')
    if strategy not in valid_strategies:
        # Without this check an unknown strategy would drop both source
        # columns and write nothing in their place.
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {valid_strategies}"
        )

    df_resolved = df.copy()

    # Find columns with conflicts
    primary_cols = [
        c for c in df.columns
        if isinstance(c, str) and c.endswith(primary_suffix)
    ]

    for pcol in primary_cols:
        # Strip only the trailing suffix, not other occurrences in the name.
        base_name = pcol[:len(pcol) - len(primary_suffix)]
        scol = base_name + secondary_suffix

        if scol not in df.columns:
            continue

        if strategy == 'primary':
            resolved = df[pcol]
        elif strategy == 'secondary':
            resolved = df[scol]
        elif strategy == 'average':
            both_numeric = (pd.api.types.is_numeric_dtype(df[pcol])
                            and pd.api.types.is_numeric_dtype(df[scol]))
            if both_numeric:
                resolved = df[[pcol, scol]].mean(axis=1)
            else:
                # Text cannot be averaged; coalesce instead.
                resolved = df[pcol].fillna(df[scol])
        else:  # 'non_null'
            resolved = df[pcol].fillna(df[scol])

        df_resolved[base_name] = resolved

        # Drop original columns
        df_resolved = df_resolved.drop([pcol, scol], axis=1)

    return df_resolved

# Example
# Two sources that overlap on players 1-3. Only 'points' appears in both
# (besides the join keys), so it is the only column that gets suffixed.
source1 = pd.DataFrame({
    'player_id': [1, 2, 3, 4],
    'name': ['Player A', 'Player B', 'Player C', 'Player D'],
    'points': [25.0, 20.0, 15.0, None],
    'team': ['LAL', 'BOS', 'MIA', 'GSW']
})

source2 = pd.DataFrame({
    'player_id': [1, 2, 3, 5],
    'name': ['Player A', 'Player B', 'Player C', 'Player E'],
    'points': [24.5, 21.0, 15.5, 18.0],
    'assists': [8, 5, 3, 6]
})

# The default left join keeps Player D (no match in source2) and drops Player E.
merged = merge_player_data(source1, source2, ['player_id', 'name'])
resolved = resolve_conflicts(merged, strategy='average')
print(resolved)

Output Example:

   player_id     name   team  points  assists
0          1  Player A    LAL    24.75      8.0
1          2  Player B    BOS    20.50      5.0

Clean and Transform Data with dplyr

Use dplyr to clean and transform sports data with a comprehensive pipeline.

library(dplyr)
library(tidyr)
library(stringr)

# Clean sports data pipeline
clean_player_stats <- function(df) {
  # Cleans a player-season table: normalizes column names, removes duplicate
  # players, fills missing values, derives per-game metrics, drops impossible
  # rows, and sorts by points per game (descending).
  #
  # Expects columns player_id, games_played, total_pts, reb, ast, tov and
  # fg_pct (after lower-casing and replacing spaces with underscores).
  df %>%
    # Standardize column names
    rename_with(tolower) %>%
    rename_with(~ str_replace_all(., " ", "_")) %>%

    # Remove duplicates (keeps the first row seen for each player_id)
    distinct(player_id, .keep_all = TRUE) %>%

    # Handle missing values
    mutate(across(where(is.numeric), ~ replace_na(., 0))) %>%
    mutate(across(where(is.character), ~ replace_na(., "Unknown"))) %>%

    # Calculate derived columns
    mutate(
      pts_per_game = total_pts / games_played,
      # Uses total_pts: the table has no separate `pts` column, so
      # referencing `pts` here fails with "object 'pts' not found".
      efficiency = (total_pts + reb + ast - tov) / games_played
    ) %>%

    # Remove impossible values (also discards the Inf/NaN rates produced by
    # games_played == 0 above)
    filter(
      games_played > 0,
      fg_pct >= 0 & fg_pct <= 1,
      pts_per_game >= 0
    ) %>%

    # Arrange by performance
    arrange(desc(pts_per_game))
}

# Pivot data from wide to long format
stats_to_long <- function(df, id_cols, stat_cols) {
  # Reshapes the columns named in stat_cols into two columns: "stat_type"
  # (the former column name) and "value". All other columns are carried
  # along unchanged.
  # NOTE(review): id_cols is accepted but never used — confirm whether the
  # result was meant to be restricted to id_cols plus the stat columns.
  pivot_longer(
    df,
    cols = all_of(stat_cols),
    names_to = "stat_type",
    values_to = "value"
  )
}

# Example usage
# Simulated season totals for 10 players; the seed makes the draws reproducible.
set.seed(42)
raw_data <- data.frame(
  player_id = 1:10,
  player_name = paste("Player", LETTERS[1:10]),
  games_played = sample(50:82, 10),
  total_pts = round(runif(10, 500, 2000)),
  reb = round(runif(10, 200, 600)),
  ast = round(runif(10, 100, 500)),
  tov = round(runif(10, 50, 200)),
  fg_pct = round(runif(10, 0.4, 0.55), 3)
)

# Clean the data
clean_data <- clean_player_stats(raw_data)
# Show the first rows of the cleaned table
print(head(clean_data))

Output Example:

  player_id player_name games_played pts_per_game efficiency
1         3    Player C           75        25.33       8.42
2         7    Player G           68        23.88       7.91

python

Validate Data Quality

Implement a data quality validator to check for common issues in sports datasets.

import pandas as pd
import numpy as np

class DataValidator:
    """Validate sports data quality.

    Each ``check_*`` method appends a dict with keys 'type', 'column' and
    'detail' to ``self.issues`` for every problem found and returns the
    validator itself, so checks can be chained. ``report()`` prints and
    returns the collected issues.
    """

    # Maps each supported relation to the element-wise test that flags a
    # row violating it. Comparisons involving NaN are False, so rows with
    # missing values are never counted as violations.
    _VIOLATION_TESTS = {
        '<=': lambda left, right: left > right,
        '<': lambda left, right: left >= right,
        '>=': lambda left, right: left < right,
        '>': lambda left, right: left <= right,
    }

    def __init__(self, df):
        self.df = df
        self.issues = []

    def check_missing(self, threshold=0.05):
        """Check for columns with too many missing values.

        Records an issue for every column whose fraction of missing values
        exceeds ``threshold`` (a fraction between 0 and 1).
        """
        for col in self.df.columns:
            # mean() of the boolean mask is the missing fraction and yields
            # NaN (never flagged) on an empty frame.
            missing_pct = self.df[col].isna().mean()
            if missing_pct > threshold:
                self.issues.append({
                    'type': 'missing_values',
                    'column': col,
                    'detail': f'{missing_pct:.1%} missing'
                })
        return self

    def check_range(self, col, min_val, max_val):
        """Check if values are within expected range.

        Records one issue if any value in ``col`` lies outside the closed
        interval [min_val, max_val].
        """
        out_of_range = ((self.df[col] < min_val) | (self.df[col] > max_val)).sum()
        if out_of_range > 0:
            self.issues.append({
                'type': 'out_of_range',
                'column': col,
                'detail': f'{out_of_range} values outside [{min_val}, {max_val}]'
            })
        return self

    def check_duplicates(self, cols):
        """Check for duplicate entries.

        Counts rows that repeat an earlier row on the columns in ``cols``.
        """
        dups = self.df.duplicated(subset=cols).sum()
        if dups > 0:
            self.issues.append({
                'type': 'duplicates',
                'column': str(cols),
                'detail': f'{dups} duplicate rows'
            })
        return self

    def check_consistency(self, col1, col2, relation):
        """Check logical consistency between columns.

        Args:
            col1: Left-hand column name.
            col2: Right-hand column name.
            relation: Expected relation ``col1 <relation> col2``; one of
                '<=', '<', '>=' or '>'.

        Raises:
            ValueError: If ``relation`` is not supported.
        """
        try:
            is_violation = self._VIOLATION_TESTS[relation]
        except KeyError:
            raise ValueError(
                f"Unsupported relation {relation!r}; "
                f"expected one of {sorted(self._VIOLATION_TESTS)}"
            ) from None

        violations = is_violation(self.df[col1], self.df[col2]).sum()

        if violations > 0:
            self.issues.append({
                'type': 'inconsistency',
                'column': f'{col1} {relation} {col2}',
                'detail': f'{violations} violations'
            })
        return self

    def report(self):
        """Generate validation report.

        Prints a summary of the recorded issues and returns the list.
        """
        if not self.issues:
            print("✓ All validation checks passed!")
        else:
            print(f"Found {len(self.issues)} issues:")
            for issue in self.issues:
                print(f"  - [{issue['type']}] {issue['column']}: {issue['detail']}")
        return self.issues

# Example
# Sample data seeded with one problem of each kind noted inline.
df = pd.DataFrame({
    'player_id': [1, 2, 3, 4, 4],  # Duplicate
    'fg_made': [200, 150, 180, 250, 250],
    'fg_attempted': [400, 300, 350, 450, 450],
    'fg_pct': [0.50, 0.50, 0.51, 1.20, 0.56],  # Invalid percentage
    'minutes': [2500, None, 2200, 2800, 2800]  # Missing
})

# Checks can be chained because each check_* method returns the validator.
validator = DataValidator(df)
validator.check_missing(threshold=0.1) \
         .check_range('fg_pct', 0, 1) \
         .check_duplicates(['player_id']) \
         .check_consistency('fg_made', 'fg_attempted', '<=') \
         .report()

Output Example:

Found 3 issues:
  - [missing_values] minutes: 20.0% missing
  - [out_of_range] fg_pct: 1 values outside [0, 1]
  - [duplicates] ['player_id']: 1 duplicate rows

python

Convert Between Rate and Counting Stats

Convert between rate stats and counting stats with per-game, per-minute, and pace adjustments.

import pandas as pd
import numpy as np

def per_game_to_totals(df, per_game_cols, games_col='games_played'):
    """Convert per-game averages to season totals.

    Each result is stored in a new column: '_per_game' in the source name is
    replaced by '_total'; if the source name has no '_per_game' part,
    '_total' is appended so the source column is not overwritten.

    Args:
        df: Input DataFrame; it is not modified.
        per_game_cols: Names of the per-game columns to convert.
        games_col: Column holding games played.

    Returns:
        A copy of ``df`` with the total columns added.
    """
    df_totals = df.copy()
    for col in per_game_cols:
        if '_per_game' in col:
            total_col = col.replace('_per_game', '_total')
        else:
            total_col = f'{col}_total'
        df_totals[total_col] = df[col] * df[games_col]
    return df_totals

def totals_to_per_game(df, total_cols, games_col='games_played'):
    """Convert season totals to per-game averages.

    Each result is stored in a new column: '_total' in the source name is
    replaced by '_per_game'; if the source name has no '_total' part,
    '_per_game' is appended so the source column is not overwritten.
    Rows with zero games get NaN instead of an infinite rate.

    Args:
        df: Input DataFrame; it is not modified.
        total_cols: Names of the season-total columns to convert.
        games_col: Column holding games played.

    Returns:
        A copy of ``df`` with the per-game columns added.
    """
    df_per_game = df.copy()
    # where() blanks out zero-game rows so the division yields NaN there.
    games = df[games_col].where(df[games_col] != 0)
    for col in total_cols:
        if '_total' in col:
            rate_col = col.replace('_total', '_per_game')
        else:
            rate_col = f'{col}_per_game'
        df_per_game[rate_col] = df[col] / games
    return df_per_game

def per_minute_rates(df, stat_cols, minutes_col='minutes_played', per=36):
    """Convert to per-minute rates (per 36 or per 48).

    For each column in ``stat_cols`` a new column '<col>_per_<per>' is added
    holding the stat scaled to ``per`` minutes. Rows with zero minutes get
    NaN instead of an infinite rate.

    Args:
        df: Input DataFrame; it is not modified.
        stat_cols: Names of the stat columns to convert.
        minutes_col: Column holding minutes played.
        per: Number of minutes to scale to.

    Returns:
        A copy of ``df`` with the rate columns added.
    """
    df_rates = df.copy()
    # where() blanks out zero-minute rows so the division yields NaN there.
    minutes = df[minutes_col].where(df[minutes_col] != 0)
    for col in stat_cols:
        df_rates[f'{col}_per_{per}'] = (df[col] / minutes) * per
    return df_rates

def pace_adjust(df, stat_cols, pace_col='team_pace', league_avg_pace=100):
    """Scale stats to a common pace.

    Adds a '<col>_pace_adj' column for each entry in ``stat_cols``, equal to
    the stat multiplied by ``league_avg_pace`` divided by the row's pace.
    The input frame is left unchanged.
    """
    df_adjusted = df.copy()
    for stat in stat_cols:
        pace_factor = league_avg_pace / df[pace_col]
        df_adjusted[f'{stat}_pace_adj'] = df[stat] * pace_factor
    return df_adjusted

# Example
# Season totals for three players plus the team pace used for adjustment.
df = pd.DataFrame({
    'player': ['Player A', 'Player B', 'Player C'],
    'games_played': [75, 70, 82],
    'minutes_played': [2500, 2100, 2800],
    'points_total': [1875, 1400, 2050],
    'assists_total': [600, 350, 425],
    'team_pace': [102, 98, 105]
})

# Convert totals to per-game
# totals_to_per_game swaps the '_total' suffix for '_per_game', so the new
# columns are 'points_per_game' and 'assists_per_game'.
per_game = totals_to_per_game(df, ['points_total', 'assists_total'])
print("Per-game stats:")
print(per_game[['player', 'points_per_game', 'assists_per_game']])

# Per 36 minutes
# per_minute_rates appends '_per_36' to the full source column name.
per_36 = per_minute_rates(df, ['points_total', 'assists_total'])
print("\nPer 36 minutes:")
print(per_36[['player', 'points_total_per_36', 'assists_total_per_36']].round(1))

# Pace adjusted
pace_adj = pace_adjust(df, ['points_total'])
print("\nPace adjusted:")
print(pace_adj[['player', 'points_total', 'points_total_pace_adj']].round(0))

Output Example:

Per-game stats:
     player  points_per_game  assists_per_game
0  Player A             25.0               8.0
1  Player B             20.0               5.0

Per 36 minutes:
      player  points_total_per_36  assists_total_per_36
0  Player A                 27.0                   8.6
1  Player B                 24.0                   6.0

sql

SQL Queries for Data Cleaning

SQL queries for common data cleaning tasks: deduplication, null handling, standardization, and merging.

-- Remove duplicates keeping most recent
-- Numbers the rows of each (player_id, season) group from newest to oldest
-- by updated_at and keeps only the newest one. The result also contains the
-- helper column rn. Rows that tie on updated_at are ordered arbitrarily.
WITH ranked AS (
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY player_id, season
               ORDER BY updated_at DESC
           ) as rn
    FROM player_stats
)
SELECT * FROM ranked WHERE rn = 1;

-- Handle missing values with COALESCE
-- Counting stats default to 0; a missing fg_pct falls back to the average
-- fg_pct over the whole table.
SELECT
    player_id,
    player_name,
    COALESCE(points, 0) as points,
    COALESCE(assists, 0) as assists,
    COALESCE(fg_pct, (SELECT AVG(fg_pct) FROM player_stats)) as fg_pct
FROM player_stats;

-- Standardize team names
-- The WHERE clause limits the update to rows that carry a non-canonical
-- spelling, so rows that are already correct are not rewritten.
UPDATE player_stats
SET team_abbr = CASE
    WHEN team_abbr IN ('LAL', 'Los Angeles Lakers', 'LA Lakers') THEN 'LAL'
    WHEN team_abbr IN ('BOS', 'Boston Celtics', 'Boston') THEN 'BOS'
    WHEN team_abbr IN ('GSW', 'Golden State Warriors', 'GS Warriors') THEN 'GSW'
    ELSE team_abbr
END
WHERE team_abbr IN (
    'Los Angeles Lakers', 'LA Lakers',
    'Boston Celtics', 'Boston',
    'Golden State Warriors', 'GS Warriors'
);

-- Find and flag outliers using percentiles
-- Computes the 1st and 99th percentile of points once, then marks every row
-- outside that range with is_outlier = 1.
WITH percentiles AS (
    SELECT
        PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY points) as p01,
        PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY points) as p99
    FROM player_stats
)
SELECT
    ps.*,
    CASE
        WHEN ps.points < p.p01 OR ps.points > p.p99 THEN 1
        ELSE 0
    END as is_outlier
FROM player_stats ps
CROSS JOIN percentiles p;

-- Merge data from two tables with conflict resolution
-- The full outer join keeps players that appear in only one source.
SELECT
    COALESCE(a.player_id, b.player_id) as player_id,
    COALESCE(a.player_name, b.player_name) as player_name,
    -- Use source A points if available, else source B
    COALESCE(a.points, b.points) as points,
    -- Average when both sources have data; the sum is NULL if either side
    -- is NULL, so COALESCE then falls through to whichever value exists
    COALESCE((a.fg_pct + b.fg_pct) / 2, a.fg_pct, b.fg_pct) as fg_pct
FROM source_a a
FULL OUTER JOIN source_b b
    ON a.player_id = b.player_id;

Output Example:

[Query results for each cleaning operation]

python

Train XGBoost Model for Prediction

Train an XGBoost classifier for sports outcome prediction with feature importance analysis.

import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

def train_xgboost_model(X, y, test_size=0.2):
    """
    Fit an XGBoost classifier and score it on a held-out split.

    Args:
        X: Feature DataFrame (its column names label the importances).
        y: Target labels.
        test_size: Fraction of rows held out for evaluation.

    Returns:
        Tuple of (fitted model, accuracy on the held-out rows, DataFrame of
        feature importances sorted from most to least important).
    """
    # Fixed random_state keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    hyperparams = {
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
    }
    model = XGBClassifier(**hyperparams)
    model.fit(X_train, y_train)

    # Held-out accuracy
    accuracy = accuracy_score(y_test, model.predict(X_test))

    # Feature importances, largest first
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    })
    importance = importance.sort_values('importance', ascending=False)

    return model, accuracy, importance

# Example usage
# Synthetic games: the label is 1 exactly when home_elo exceeds away_elo, so
# the rest-day columns carry no signal by construction.
np.random.seed(42)
X = pd.DataFrame({
    'home_elo': np.random.uniform(1400, 1700, 500),
    'away_elo': np.random.uniform(1400, 1700, 500),
    'home_rest_days': np.random.randint(1, 7, 500),
    'away_rest_days': np.random.randint(1, 7, 500)
})
y = (X['home_elo'] > X['away_elo']).astype(int)

model, acc, importance = train_xgboost_model(X, y)
print(f"Accuracy: {acc:.3f}")
print("\nTop Features:")
print(importance.head())

Output Example:

Accuracy: 0.840

Top Features:
      feature  importance
0    home_elo       0.412
1    away_elo       0.398

python

Build Player Performance Projection Model

Build a Random Forest model to project player performance for the next season.

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def build_projection_model(historical_data, features, target):
    """
    Fit a scaled Random Forest regressor for player projections.

    Args:
        historical_data: DataFrame with player seasons
        features: List of feature column names
        target: Target column to predict

    Returns:
        Fitted Pipeline with a 'scaler' step followed by a 'model' step.
    """
    forest = RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        min_samples_leaf=5,
        random_state=42
    )

    # Standardize the features, then fit the forest on the scaled values.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', forest)
    ])
    pipeline.fit(historical_data[features], historical_data[target])

    return pipeline

def project_next_season(model, current_season_data, features):
    """Return each player's name alongside the model's prediction.

    The result has two columns, 'player_name' and 'projected', and keeps the
    index of ``current_season_data``.
    """
    predicted = model.predict(current_season_data[features])

    projection_table = current_season_data[['player_name']].copy()
    projection_table['projected'] = predicted
    return projection_table

# Example: Project next season points
# NOTE: next_season_pts is drawn independently of the features, so there is
# no real relationship for the model to learn; this only demonstrates the API.
np.random.seed(42)
data = pd.DataFrame({
    'player_name': [f'Player_{i}' for i in range(100)],
    'age': np.random.randint(22, 35, 100),
    'last_season_pts': np.random.uniform(10, 30, 100),
    'usage_rate': np.random.uniform(15, 35, 100),
    'next_season_pts': np.random.uniform(10, 30, 100)  # Target
})

features = ['age', 'last_season_pts', 'usage_rate']
model = build_projection_model(data, features, 'next_season_pts')

# Make projections
# The first five training rows are reused as the "current season" input.
projections = project_next_season(model, data.head(5), features)
print(projections)

Output Example:

  player_name  projected
0    Player_0      22.45
1    Player_1      18.32

python

Implement Logistic Regression for Win Probability

Build a logistic regression model for in-game win probability calculation.

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

def build_win_probability_model(games_data):
    """
    Fit a logistic regression that predicts 'home_win'.

    Args:
        games_data: DataFrame with the columns 'score_diff',
            'time_remaining', 'home_team' and the label 'home_win'.

    Returns:
        The fitted LogisticRegression model.
    """
    predictors = ['score_diff', 'time_remaining', 'home_team']

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(games_data[predictors], games_data['home_win'])

    return classifier

def calculate_win_probability(model, score_diff, time_remaining, is_home=True):
    """Return the model's positive-class probability for one game state.

    Builds a single-row frame with the columns 'score_diff',
    'time_remaining' and 'home_team' (1 when ``is_home`` is truthy, else 0)
    and returns the second column of ``model.predict_proba``.
    """
    home_flag = 1 if is_home else 0
    game_state = pd.DataFrame({
        'score_diff': [score_diff],
        'time_remaining': [time_remaining],
        'home_team': [home_flag]
    })
    return model.predict_proba(game_state)[0, 1]

def plot_calibration(model, X_test, y_test):
    """Draw predicted probability against observed frequency in 10 bins.

    Returns the matplotlib Figure containing the model's calibration curve
    and the diagonal reference line.
    """
    positive_prob = model.predict_proba(X_test)[:, 1]
    observed_rate, predicted_mean = calibration_curve(
        y_test, positive_prob, n_bins=10
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    # Diagonal reference: predicted probability equals observed frequency.
    ax.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated')
    ax.plot(predicted_mean, observed_rate, 's-', label='Model')
    ax.set(
        xlabel='Mean Predicted Probability',
        ylabel='Fraction of Positives',
        title='Win Probability Model Calibration',
    )
    ax.legend()
    return fig

# Example
# Simulated game states with independent random score margin, minutes
# remaining and home indicator.
np.random.seed(42)
games = pd.DataFrame({
    'score_diff': np.random.randint(-20, 20, 1000),
    'time_remaining': np.random.uniform(0, 48, 1000),
    'home_team': np.random.choice([0, 1], 1000),
})
# Label: 1 when the margin plus a 3-point bonus for home_team == 1 plus
# Gaussian noise (sd 10) is positive.
games['home_win'] = ((games['score_diff'] + games['home_team'] * 3 +
                       np.random.randn(1000) * 10) > 0).astype(int)

model = build_win_probability_model(games)
# is_home defaults to True
wp = calculate_win_probability(model, score_diff=5, time_remaining=2.0)
print(f"Win Probability (up 5, 2 min left): {wp:.1%}")

Output Example:

Win Probability (up 5, 2 min left): 78.2%

python Baseball

Build xBA Model with Neural Network

Build an Expected Batting Average (xBA) neural network model using exit velocity and launch angle.

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

def build_xba_model(batted_ball_data):
    """
    Train the Expected Batting Average (xBA) classifier.

    Fits a small neural network that maps standardized exit velocity and
    launch angle to the 'is_hit' label.

    Args:
        batted_ball_data: DataFrame with the columns 'exit_velocity',
            'launch_angle' and 'is_hit'.

    Returns:
        Tuple of (fitted MLPClassifier, fitted StandardScaler). The scaler
        must be applied to new inputs before predicting.
    """
    inputs = batted_ball_data[['exit_velocity', 'launch_angle']]
    labels = batted_ball_data['is_hit']

    # Standardize the two inputs before training
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(inputs)

    network_params = {
        'hidden_layer_sizes': (32, 16),
        'activation': 'relu',
        'max_iter': 500,
        'random_state': 42,
    }
    model = MLPClassifier(**network_params)
    model.fit(inputs_scaled, labels)

    return model, scaler

def predict_xba(model, scaler, exit_velocity, launch_angle):
    """Predict xBA (hit probability) for a single batted ball.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``; it must be the one
            that was fitted together with ``model``.
        exit_velocity: Exit velocity of the batted ball.
        launch_angle: Launch angle of the batted ball.

    Returns:
        The probability in column 1 of ``predict_proba`` as a plain float,
        rounded to 3 decimals.
    """
    sample = [[exit_velocity, launch_angle]]

    # If the scaler remembers the column names it was fitted with, hand it a
    # frame with the same names; a bare list would trigger scikit-learn's
    # "X does not have valid feature names" warning.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None and len(fitted_names) == 2:
        sample = pd.DataFrame(sample, columns=list(fitted_names))

    scaled_sample = scaler.transform(sample)
    hit_probability = model.predict_proba(scaled_sample)[0, 1]
    # Cast to a builtin float so callers do not receive a NumPy scalar
    return round(float(hit_probability), 3)

# Generate sample data
np.random.seed(42)
n = 5000
ev = np.random.uniform(70, 115, n)
la = np.random.uniform(-30, 60, n)

# Simulated hit probability based on EV/LA (logistic in exit velocity).
# NOTE(review): as written, the launch-angle term raises the hit probability
# as the angle moves away from 15 degrees; confirm that sign is intended.
hit_prob = 1 / (1 + np.exp(-0.15 * (ev - 95) - 0.02 * (la - 15)**2 / 100 + 0.5))
is_hit = np.random.binomial(1, hit_prob)

data = pd.DataFrame({'exit_velocity': ev, 'launch_angle': la, 'is_hit': is_hit})
model, scaler = build_xba_model(data)

# Test predictions
# Loop variables get their own names so the sample arrays `ev` / `la` above
# are not overwritten by scalars.
test_cases = [(95, 15), (105, 25), (80, 5), (110, 30)]
for test_ev, test_la in test_cases:
    xba = predict_xba(model, scaler, test_ev, test_la)
    print(f"EV: {test_ev} mph, LA: {test_la}°, xBA: {xba}")

Output Example:

EV: 95 mph, LA: 15°, xBA: 0.512
EV: 105 mph, LA: 25°, xBA: 0.821
EV: 80 mph, LA: 5°, xBA: 0.234
EV: 110 mph, LA: 30°, xBA: 0.892

python Soccer

Build xG Model with Gradient Boosting

Build an Expected Goals (xG) model using Gradient Boosting with multiple shot features.

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

def build_xg_model(shots_data):
    """
    Train a Gradient Boosting classifier that estimates Expected Goals (xG).

    Uses six shot columns as inputs and 'is_goal' as the label. Prints the
    5-fold cross-validated ROC AUC, then fits the model on all rows and
    returns it.
    """
    feature_names = [
        'distance', 'angle', 'is_header',
        'is_free_kick', 'is_counter', 'num_defenders',
    ]
    inputs = shots_data[feature_names]
    target = shots_data['is_goal']

    booster_settings = dict(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        min_samples_leaf=20,
        random_state=42,
    )
    model = GradientBoostingClassifier(**booster_settings)

    # Report out-of-fold discrimination before the final fit
    auc_per_fold = cross_val_score(model, inputs, target, cv=5, scoring='roc_auc')
    print(f"CV AUC: {auc_per_fold.mean():.3f} (+/- {auc_per_fold.std():.3f})")

    model.fit(inputs, target)
    return model

def calculate_shot_xg(model, distance, angle, is_header=False,
                      is_free_kick=False, is_counter=False, num_defenders=2):
    """Return the model's goal probability for one shot, rounded to 3 decimals.

    The shot is packed into a one-row DataFrame whose columns follow the
    order used in training; the boolean flags are converted to 0/1 integers.
    """
    shot = {'distance': distance, 'angle': angle}
    flag_values = {
        'is_header': is_header,
        'is_free_kick': is_free_kick,
        'is_counter': is_counter,
    }
    for flag_name, flag in flag_values.items():
        shot[flag_name] = int(flag)
    shot['num_defenders'] = num_defenders

    shot_frame = pd.DataFrame([shot])
    goal_probability = model.predict_proba(shot_frame)[0, 1]
    return round(goal_probability, 3)

# Generate training data
# Fixed seed; 10,000 synthetic shots with independent, randomly drawn features.
np.random.seed(42)
n = 10000
shots = pd.DataFrame({
    'distance': np.random.uniform(5, 35, n),
    'angle': np.random.uniform(5, 80, n),
    'is_header': np.random.choice([0, 1], n, p=[0.85, 0.15]),
    'is_free_kick': np.random.choice([0, 1], n, p=[0.95, 0.05]),
    'is_counter': np.random.choice([0, 1], n, p=[0.80, 0.20]),
    'num_defenders': np.random.randint(0, 5, n)
})

# Simulated goal probability
# Decays exponentially with distance, scales with sin(angle), and is reduced
# for headers and for each defender. is_free_kick and is_counter do not enter
# the simulated probability.
goal_prob = np.exp(-0.08 * shots['distance']) * np.sin(np.radians(shots['angle'])) * \
            (1 - 0.3 * shots['is_header']) * (1 - 0.05 * shots['num_defenders'])
# Probabilities are clipped to [0.01, 0.95] before sampling the 0/1 outcome.
shots['is_goal'] = np.random.binomial(1, goal_prob.clip(0.01, 0.95))

model = build_xg_model(shots)
# Remaining shot attributes use calculate_shot_xg's defaults.
xg = calculate_shot_xg(model, distance=10, angle=45)
print(f"\nExample shot xG (10m, 45°): {xg}")

Output Example:

CV AUC: 0.812 (+/- 0.015)

Example shot xG (10m, 45°): 0.324

python Basketball

Clustering Players by Playing Style

Use K-Means clustering to identify player archetypes based on playing style.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def cluster_players(player_stats, n_clusters=5):
    """
    Group players into style clusters with K-Means.

    Rows with a missing value in any clustering column are dropped. The
    remaining rows are standardized, clustered, and projected onto two
    principal components for plotting.

    Returns:
        (result, kmeans, scaler): a copy of the retained rows with 'cluster',
        'pca1' and 'pca2' columns added, the fitted KMeans object, and the
        fitted StandardScaler.
    """
    style_columns = [
        'pts_per_36', 'ast_per_36', 'reb_per_36', 'stl_per_36',
        'blk_per_36', 'usg_rate', 'ts_pct', 'ast_ratio',
    ]
    complete_rows = player_stats[style_columns].dropna()

    # Standardize so no single stat dominates the distance metric
    scaler = StandardScaler()
    standardized = scaler.fit_transform(complete_rows)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(standardized)

    # Attach labels only to the rows that survived the dropna
    result = player_stats.loc[complete_rows.index].copy()
    result['cluster'] = labels

    # 2-D projection for visualization
    projection = PCA(n_components=2).fit_transform(standardized)
    for position, column in enumerate(('pca1', 'pca2')):
        result[column] = projection[:, position]

    return result, kmeans, scaler

def describe_clusters(clustered_data, features):
    """Summarize clusters by mean stats and attach an archetype label.

    Each cluster's averages are compared with the mean of the cluster
    averages. The label logic reads 'ast_per_36', 'pts_per_36' and
    'blk_per_36', so those columns are expected among ``features``.
    """
    summary = clustered_data.groupby('cluster')[features].mean()

    def _above_average(cluster_row, stat):
        # True when this cluster's average beats the mean across clusters
        return cluster_row[stat] > summary[stat].mean()

    def _archetype(cluster_row):
        if _above_average(cluster_row, 'ast_per_36'):
            if _above_average(cluster_row, 'pts_per_36'):
                return "Playmaking Scorer"
            return "Pure Playmaker"
        if _above_average(cluster_row, 'blk_per_36'):
            return "Rim Protector"
        if _above_average(cluster_row, 'pts_per_36'):
            return "Volume Scorer"
        return "Role Player"

    # Labels are computed before the column is added, so the averages above
    # are taken over numeric columns only.
    archetypes = [_archetype(cluster_row) for _, cluster_row in summary.iterrows()]
    summary['archetype'] = archetypes
    return summary

# Example
# Fixed seed; 200 synthetic players with each stat drawn independently from a
# uniform range.
np.random.seed(42)
players = pd.DataFrame({
    'player': [f'Player_{i}' for i in range(200)],
    'pts_per_36': np.random.uniform(8, 28, 200),
    'ast_per_36': np.random.uniform(1, 10, 200),
    'reb_per_36': np.random.uniform(2, 12, 200),
    'stl_per_36': np.random.uniform(0.5, 2.5, 200),
    'blk_per_36': np.random.uniform(0.2, 3, 200),
    'usg_rate': np.random.uniform(12, 35, 200),
    'ts_pct': np.random.uniform(0.48, 0.65, 200),
    'ast_ratio': np.random.uniform(5, 35, 200)
})

# n_clusters is left at its default of 5.
clustered, model, scaler = cluster_players(players)
# The summary only needs a subset of the clustering columns; it must include
# the three stats that describe_clusters uses for labelling.
features = ['pts_per_36', 'ast_per_36', 'reb_per_36', 'blk_per_36']
summary = describe_clusters(clustered, features)
print(summary[['pts_per_36', 'ast_per_36', 'archetype']])

Output Example:

         pts_per_36  ast_per_36        archetype
cluster
0             18.2         3.1     Volume Scorer
1             15.5         7.2  Playmaking Scorer
2             12.1         2.8      Role Player

Build Random Forest Model in R

Build and evaluate a Random Forest model in R with feature importance analysis.

library(randomForest)
library(caret)
library(dplyr)

# Train and evaluate a random forest on an 80/20 split.
#
# Arguments:
#   data        data frame holding the target and feature columns
#   target_col  name of the target column; a factor target is treated as
#               classification, anything else as regression
#   features    character vector of predictor column names
#
# Prints accuracy (classification) or RMSE (regression) on the held-out rows
# and returns list(model = <randomForest>, importance = <data frame sorted by
# importance, with a Feature column>).
build_rf_model <- function(data, target_col, features) {
  # Build "target ~ f1 + f2 + ..." from the column names
  model_formula <- as.formula(paste(target_col, "~", paste(features, collapse = " + ")))

  # Split data
  set.seed(42)
  train_idx <- createDataPartition(data[[target_col]], p = 0.8, list = FALSE)
  train_data <- data[train_idx, ]
  test_data <- data[-train_idx, ]

  # Train model
  rf_model <- randomForest(
    model_formula,
    data = train_data,
    ntree = 200,
    mtry = floor(sqrt(length(features))),
    importance = TRUE
  )

  # Evaluate on the held-out rows
  predictions <- predict(rf_model, test_data)
  is_classification <- is.factor(data[[target_col]])

  if (is_classification) {
    accuracy <- mean(predictions == test_data[[target_col]])
    cat("Accuracy:", round(accuracy, 3), "\n")
  } else {
    rmse <- sqrt(mean((predictions - test_data[[target_col]])^2))
    cat("RMSE:", round(rmse, 3), "\n")
  }

  # Feature importance. The impurity-based column is named MeanDecreaseGini
  # for classification forests and IncNodePurity for regression forests, so
  # pick the one that exists instead of assuming classification.
  sort_col <- if (is_classification) "MeanDecreaseGini" else "IncNodePurity"

  importance_df <- importance(rf_model) %>%
    as.data.frame() %>%
    mutate(Feature = rownames(.)) %>%
    arrange(desc(.data[[sort_col]]))

  return(list(model = rf_model, importance = importance_df))
}

# Example: Predict game outcome
# Simulated data: 500 games with uniform ratings (1400-1700) and 1-7 rest days.
# home_win is "Win" when the home rating exceeds the away rating plus
# N(0, 100) noise; the rest-day columns do not enter the label.
set.seed(42)
games <- data.frame(
  home_rating = runif(500, 1400, 1700),
  away_rating = runif(500, 1400, 1700),
  home_rest = sample(1:7, 500, replace = TRUE),
  away_rest = sample(1:7, 500, replace = TRUE)
) %>%
  mutate(home_win = factor(ifelse(home_rating > away_rating + rnorm(n(), 0, 100), "Win", "Loss")))

# The factor target selects the classification branch of build_rf_model.
result <- build_rf_model(games, "home_win", c("home_rating", "away_rating", "home_rest", "away_rest"))
print(head(result$importance))

Output Example:

Accuracy: 0.78

  MeanDecreaseGini    Feature
1           45.23 home_rating
2           42.18 away_rating

python

LSTM for Time Series Prediction

Build an LSTM neural network for predicting player performance time series.

import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

def prepare_sequences(data, seq_length):
    """Slice a series into sliding windows and their next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``. Returns ``(X, y)`` as NumPy arrays; both are
    empty when ``data`` is not longer than ``seq_length``.
    """
    window_count = len(data) - seq_length
    starts = range(window_count)
    windows = [data[start:start + seq_length] for start in starts]
    targets = [data[start + seq_length] for start in starts]
    return np.array(windows), np.array(targets)

def build_lstm_model(input_shape):
    """Assemble and compile a two-layer LSTM regressor.

    Args:
        input_shape: Shape passed to the first LSTM layer's ``input_shape``
            argument; the example in this snippet uses (seq_length, 1).

    Returns:
        A compiled Sequential model (Adam optimizer, mean-squared-error loss)
        ending in a single-unit Dense output.
    """
    # The first LSTM returns the full sequence so the second LSTM can consume
    # it; the second returns only its final state.
    stacked_layers = [
        LSTM(50, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(50, return_sequences=False),
        Dropout(0.2),
        Dense(25),
        Dense(1),
    ]
    network = Sequential(stacked_layers)
    network.compile(optimizer='adam', loss='mse')
    return network

def predict_future(model, last_sequence, scaler, n_steps=5):
    """Roll a univariate forecast forward ``n_steps`` steps.

    Each prediction is appended to the input window (dropping the oldest
    value) and fed back into the model for the next step.

    Args:
        model: Object exposing ``predict(x, verbose=0)`` that accepts an
            array of shape (1, seq_length, 1) and returns an array whose
            first element is the next value.
        last_sequence: The most recent window of scaled values, either 1-D
            ``(seq_length,)`` or a column ``(seq_length, 1)``. It is not
            modified.
        scaler: Object exposing ``inverse_transform`` for an (n, 1) array.
        n_steps: Number of future values to predict.

    Returns:
        Whatever ``scaler.inverse_transform`` returns for the (n_steps, 1)
        array of scaled predictions.
    """
    # Work on a flat float copy: this keeps the caller's array untouched and
    # lets 1-D and column-shaped windows be handled the same way.
    window = np.array(last_sequence, dtype=float).reshape(-1)
    predictions = []

    for _ in range(n_steps):
        step_output = model.predict(window.reshape(1, -1, 1), verbose=0)
        # Extract a plain scalar; writing a (1, 1) array into a single slot
        # relies on array-to-scalar conversion that NumPy has deprecated.
        next_value = float(np.asarray(step_output).reshape(-1)[0])
        predictions.append(next_value)

        # Slide the window: drop the oldest value, append the prediction
        window = np.roll(window, -1)
        window[-1] = next_value

    return scaler.inverse_transform(np.array(predictions).reshape(-1, 1))

# Example: Predict player performance trend
# Fixed NumPy seed; the series is a random walk starting near 20 with
# per-game steps of standard deviation 0.5.
np.random.seed(42)
n_games = 100
performance = 20 + np.cumsum(np.random.randn(n_games) * 0.5)  # Random walk

# Scale data
# MinMaxScaler expects a 2-D array, hence the reshape to a single column.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(performance.reshape(-1, 1))

# Prepare sequences
# Each sample is 10 consecutive games; the target is the following game.
seq_length = 10
X, y = prepare_sequences(scaled, seq_length)
# Make the (samples, timesteps, features) layout explicit for the LSTM.
X = X.reshape((X.shape[0], X.shape[1], 1))

# Build and train
model = build_lstm_model((seq_length, 1))
model.fit(X, y, epochs=50, batch_size=16, verbose=0)

# Predict next 5 games
# Seed the forecast with the final seq_length scaled values.
last_seq = scaled[-seq_length:]
future = predict_future(model, last_seq, scaler, 5)
print("Predicted next 5 games:")
print(future.flatten())

Output Example:

Predicted next 5 games:
[21.34 21.52 21.67 21.78 21.89]

python Football

Build EPA Prediction Model

Build a Gradient Boosting model to predict Expected Points Added for play situations.

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

def build_epa_model(pbp_data):
    """
    Build model to predict Expected Points Added (EPA).

    Trains a GradientBoostingRegressor on six situational columns with 'epa'
    as the target. Rows with a missing value in any feature or in the target
    are excluded. Prints the 5-fold cross-validated R², then fits on all
    retained rows.

    Args:
        pbp_data: DataFrame with the feature columns listed below and 'epa'.

    Returns:
        The fitted GradientBoostingRegressor.
    """
    # Features for EPA prediction
    features = ['down', 'distance', 'yardline', 'seconds_remaining',
                'score_differential', 'is_pass']
    target = 'epa'

    # Drop incomplete rows across features AND target: the regressor cannot
    # be fitted on NaN targets, so filtering the features alone is not enough.
    complete_rows = pbp_data[features + [target]].dropna()
    X = complete_rows[features]
    y = complete_rows[target]

    model = GradientBoostingRegressor(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    )

    # Cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"CV R²: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")

    model.fit(X, y)
    return model

def predict_play_epa(model, down, distance, yardline, seconds, score_diff, is_pass):
    """Return the model's EPA prediction for one play situation (3 decimals).

    The arguments are packed into a one-row DataFrame whose column names and
    order match the training features.
    """
    column_names = ('down', 'distance', 'yardline', 'seconds_remaining',
                    'score_differential', 'is_pass')
    column_values = (down, distance, yardline, seconds, score_diff, is_pass)
    situation = pd.DataFrame([dict(zip(column_names, column_values))])

    predicted = model.predict(situation)
    return round(predicted[0], 3)

# Generate sample play-by-play data
# Fixed seed; 5,000 synthetic plays with independently drawn situation columns.
np.random.seed(42)
pbp = pd.DataFrame({
    'down': np.random.choice([1, 2, 3, 4], 5000, p=[0.4, 0.3, 0.25, 0.05]),
    'distance': np.random.randint(1, 20, 5000),
    'yardline': np.random.randint(1, 100, 5000),
    'seconds_remaining': np.random.randint(0, 3600, 5000),
    'score_differential': np.random.randint(-21, 21, 5000),
    'is_pass': np.random.choice([0, 1], 5000, p=[0.4, 0.6])
})

# Simulated EPA based on situation
# Only yardline and down drive the simulated EPA (plus Gaussian noise, sd 0.5);
# a smaller yardline gives a higher value. The other columns carry no signal.
pbp['epa'] = (100 - pbp['yardline']) / 100 * 0.5 - pbp['down'] * 0.1 + \
              np.random.randn(5000) * 0.5

model = build_epa_model(pbp)

# Test predictions
# Positional arguments: down, distance, yardline, seconds, score_diff, is_pass.
# "own 20" is passed as yardline=80.
print("\nSample Predictions:")
print(f"1st & 10 at 50, pass: {predict_play_epa(model, 1, 10, 50, 1800, 0, 1)}")
print(f"3rd & 15 at own 20, pass: {predict_play_epa(model, 3, 15, 80, 300, -7, 1)}")

Output Example:

CV R²: 0.423 (+/- 0.025)

Sample Predictions:
1st & 10 at 50, pass: 0.152
3rd & 15 at own 20, pass: -0.284

python Hockey

Build xG Model for Hockey Shots

Build a hockey Expected Goals (xG) model using shot location and situational features.

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def build_hockey_xg_model(shots_data):
    """
    Build Expected Goals model for hockey.

    One-hot encodes 'shot_type', trains a RandomForestClassifier on an 80/20
    train/test split, and prints the AUC-ROC on the held-out rows.

    Args:
        shots_data: DataFrame with the feature columns listed below and a
            binary 'is_goal' column.

    Returns:
        (model, feature_cols): the fitted classifier and the list of encoded
        column names it was trained on, in order.
    """
    features = ['distance', 'angle', 'shot_type', 'is_rebound',
                'time_since_event', 'is_rush']
    target = 'is_goal'

    # Encode categorical. Encoding only the declared feature columns keeps
    # the target out of the design matrix; selecting columns by an 'is_'
    # prefix would also pick up 'is_goal' and leak the label into the model.
    X = pd.get_dummies(shots_data[features], columns=['shot_type'])
    feature_cols = list(X.columns)
    y = shots_data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    from sklearn.metrics import roc_auc_score
    y_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    print(f"AUC-ROC: {auc:.3f}")

    return model, feature_cols

# Generate sample shot data
# Fixed seed; 5,000 synthetic shots with independently drawn features.
np.random.seed(42)
shots = pd.DataFrame({
    'distance': np.random.uniform(5, 60, 5000),
    'angle': np.random.uniform(0, 90, 5000),
    'shot_type': np.random.choice(['wrist', 'slap', 'snap', 'backhand'], 5000),
    'is_rebound': np.random.choice([0, 1], 5000, p=[0.85, 0.15]),
    'time_since_event': np.random.uniform(0, 20, 5000),
    'is_rush': np.random.choice([0, 1], 5000, p=[0.75, 0.25])
})

# Simulated goal probability
# Decays with distance, scales with sin(angle), and is boosted for rebounds
# and rush shots. shot_type and time_since_event do not enter the simulation.
goal_prob = np.exp(-0.05 * shots['distance']) * np.sin(np.radians(shots['angle'])) * \
            (1 + 0.2 * shots['is_rebound']) * (1 + 0.1 * shots['is_rush'])
# Probabilities are clipped to [0.01, 0.5] before sampling the 0/1 outcome.
shots['is_goal'] = np.random.binomial(1, goal_prob.clip(0.01, 0.5))

# `features` is the list of encoded column names returned by the builder.
model, features = build_hockey_xg_model(shots)
print(f"\nModel features: {len(features)}")

Output Example:

AUC-ROC: 0.782

Model features: 9

python Baseball

Create Baseball Spray Chart

Create a baseball spray chart showing batted ball locations colored by hit outcome.

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Arc, Circle, Rectangle, Polygon

def draw_baseball_field(ax):
    """Sketch a simple field on ``ax``: infield ring, basepaths, fence arc.

    Home plate sits at the origin. The axes are given fixed limits, an equal
    aspect ratio, and have their decorations switched off.
    """
    # Infield dirt: an unfilled brown circle of radius 95 around the origin
    dirt_ring = plt.Circle((0, 0), 95, fill=False, color='brown', linewidth=2)
    ax.add_patch(dirt_ring)

    # Basepaths: closed diamond home -> first -> second -> third -> home
    diamond_x = [0, 63.6, 0, -63.6, 0]
    diamond_y = [0, 63.6, 127.3, 63.6, 0]
    ax.plot(np.array(diamond_x), np.array(diamond_y), 'k-', linewidth=2)

    # Outfield fence: quarter-circle arc of radius 330 between 45 and 135 degrees
    fence_radius = 330
    fence_angles = np.linspace(np.pi/4, 3*np.pi/4, 100)
    fence_x = fence_radius * np.cos(fence_angles)
    fence_y = fence_radius * np.sin(fence_angles)
    ax.plot(fence_x, fence_y, 'g-', linewidth=3)

    ax.set_xlim(-350, 350)
    ax.set_ylim(-50, 400)
    ax.set_aspect('equal')
    ax.axis('off')

def plot_spray_chart(hit_data, player_name):
    """Scatter batted-ball locations on a field, colored by outcome.

    Reads the 'hc_x', 'hc_y' and 'events' columns of ``hit_data``. Every
    outcome in the color table gets a legend entry, including outcomes with
    no rows. Returns the Figure.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    draw_baseball_field(ax)

    outcome_colors = {
        'single': 'blue',
        'double': 'green',
        'triple': 'orange',
        'home_run': 'red',
        'out': 'gray',
    }

    # One scatter call per outcome so each gets its own legend entry
    for outcome in outcome_colors:
        matching = hit_data[hit_data['events'] == outcome]
        legend_label = outcome.replace('_', ' ').title()
        ax.scatter(matching['hc_x'], matching['hc_y'],
                   c=outcome_colors[outcome], alpha=0.6, s=50,
                   label=legend_label)

    ax.legend(loc='upper right')
    ax.set_title(f'{player_name} Spray Chart', fontsize=16, fontweight='bold')
    plt.tight_layout()
    return fig

# Example with sample data
# No random seed is set here, so the plotted points differ from run to run.
import pandas as pd
sample_data = pd.DataFrame({
    'hc_x': np.random.uniform(-200, 200, 100),
    'hc_y': np.random.uniform(50, 350, 100),
    # 'triple' is never sampled, so that outcome plots no points.
    'events': np.random.choice(['single', 'double', 'home_run', 'out'], 100, p=[0.25, 0.1, 0.05, 0.6])
})

fig = plot_spray_chart(sample_data, "Mike Trout")
plt.show()

Output Example:

[Spray chart visualization with hits plotted on baseball field]

python Baseball

Plot Exit Velocity vs Launch Angle

Create a heatmap showing the relationship between exit velocity and launch angle with Statcast outcome zones.

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

def plot_ev_la_heatmap(statcast_data, player_name=None):
    """
    Draw a filled density plot of launch angle (x) against exit velocity (y).

    Reads the 'launch_angle' and 'launch_speed' columns of ``statcast_data``,
    overlays shaded launch-angle bands and a barrel polygon, and returns the
    Figure. When ``player_name`` is given it is prefixed to the title.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Filled 2-D kernel density of the batted balls
    density_options = dict(cmap='YlOrRd', fill=True, levels=20)
    sns.kdeplot(data=statcast_data, x='launch_angle', y='launch_speed',
                ax=ax, **density_options)

    # Launch-angle bands: (lower bound, upper bound, color, legend label)
    angle_bands = [
        (-90, 10, 'brown', 'Ground Ball Zone'),
        (10, 25, 'green', 'Line Drive Zone'),
        (25, 50, 'blue', 'Fly Ball Zone'),
    ]
    for lower, upper, shade, band_label in angle_bands:
        ax.axvspan(lower, upper, alpha=0.1, color=shade, label=band_label)

    # Barrel zone drawn as a polygon in (launch angle, exit velocity) space
    barrel_angles = np.array([26, 30, 32, 30, 26])
    barrel_speeds = np.array([98, 98, 100, 102, 102])
    ax.fill(barrel_angles, barrel_speeds, alpha=0.3, color='red', label='Barrel Zone')

    ax.set_xlabel('Launch Angle (degrees)', fontsize=12)
    ax.set_ylabel('Exit Velocity (mph)', fontsize=12)
    ax.set_xlim(-30, 60)
    ax.set_ylim(60, 120)
    ax.legend(loc='upper right')

    if player_name:
        title = f'{player_name} Exit Velocity vs Launch Angle'
    else:
        title = 'Exit Velocity vs Launch Angle'
    ax.set_title(title, fontsize=14, fontweight='bold')

    plt.tight_layout()
    return fig

# Example with sample data
# No random seed is set here, so the density plot differs from run to run.
import pandas as pd
# 200 synthetic batted balls: launch angle ~ N(15, 15), exit velocity ~ N(90, 8).
sample = pd.DataFrame({
    'launch_angle': np.random.normal(15, 15, 200),
    'launch_speed': np.random.normal(90, 8, 200)
})

fig = plot_ev_la_heatmap(sample, "Shohei Ohtani")
plt.show()

Output Example:

[Heatmap showing batted ball quality distribution]

python Basketball

Create NBA Shot Chart

Create an NBA shot chart showing made and missed shots plotted on a basketball court.

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Circle, Rectangle, Arc

def draw_court(ax=None, color='black', lw=2):
    """Draw NBA basketball court.

    The hoop is centered at the origin and the baseline lies at y = -47.5.

    Args:
        ax: Axes to draw on; defaults to the current axes.
        color: Line color. Any matplotlib color spec is accepted (name, hex
            string, RGB/RGBA tuple).
        lw: Line width.

    Returns:
        The axes that were drawn on, with limits fixed, equal aspect ratio
        and axis decorations switched off.
    """
    if ax is None:
        ax = plt.gca()

    # Pass the color by keyword everywhere: as a positional argument to
    # ax.plot it is parsed as a format string, which rejects tuple colors.
    line_kw = dict(color=color, linewidth=lw)

    # Hoop
    ax.add_patch(Circle((0, 0), radius=7.5, fill=False, **line_kw))

    # Backboard
    ax.plot([-30, 30], [-7.5, -7.5], **line_kw)

    # Paint (outer and inner box)
    for left_edge, box_width in ((-80, 160), (-60, 120)):
        ax.add_patch(Rectangle((left_edge, -47.5), box_width, 190,
                               fill=False, **line_kw))

    # Free throw circle (upper half only)
    ax.add_patch(Arc((0, 142.5), 120, 120, theta1=0, theta2=180, **line_kw))

    # Three point line: two straight corner segments plus the arc
    for corner_x in (-220, 220):
        ax.plot([corner_x, corner_x], [-47.5, 92.5], **line_kw)
    ax.add_patch(Arc((0, 0), 475, 475, theta1=22, theta2=158, **line_kw))

    # Court boundary
    boundary_segments = (
        ([-250, 250], [-47.5, -47.5]),
        ([-250, 250], [422.5, 422.5]),
        ([-250, -250], [-47.5, 422.5]),
        ([250, 250], [-47.5, 422.5]),
    )
    for xs, ys in boundary_segments:
        ax.plot(xs, ys, **line_kw)

    ax.set_xlim(-250, 250)
    ax.set_ylim(-50, 450)
    ax.set_aspect('equal')
    ax.axis('off')
    return ax

def plot_shot_chart(shots_df, player_name):
    """Plot made and missed shots on a court and return the Figure.

    Reads the 'x', 'y' and 'shot_made' columns; rows with shot_made == 1 are
    drawn as green circles and rows with shot_made == 0 as red crosses.
    """
    fig, ax = plt.subplots(figsize=(12, 11))
    draw_court(ax)

    # (shot_made value, color, marker, legend label); makes are drawn first
    outcome_styles = (
        (1, 'green', 'o', 'Made'),
        (0, 'red', 'x', 'Missed'),
    )
    for made_flag, shade, symbol, legend_label in outcome_styles:
        subset = shots_df[shots_df['shot_made'] == made_flag]
        ax.scatter(subset['x'], subset['y'], c=shade, marker=symbol,
                   s=50, alpha=0.7, label=legend_label)

    ax.legend(loc='upper right')
    ax.set_title(f'{player_name} Shot Chart', fontsize=16, fontweight='bold')
    plt.tight_layout()
    return fig

# Example
# No random seed is set here, so the plotted shots differ from run to run.
import pandas as pd
# 100 synthetic shots placed uniformly; 45% are marked as made.
sample_shots = pd.DataFrame({
    'x': np.random.uniform(-200, 200, 100),
    'y': np.random.uniform(0, 300, 100),
    'shot_made': np.random.choice([0, 1], 100, p=[0.55, 0.45])
})

fig = plot_shot_chart(sample_shots, "Stephen Curry")
plt.show()

Output Example:

[Shot chart visualization with court and shot locations]

r Basketball

NBA Shot Heatmap with ggplot2

Create an NBA shot heatmap using ggplot2 with court overlay and density visualization.

library(ggplot2)
library(dplyr)

# Function to draw court
# Returns a list of ggplot2 components (court markings, fixed aspect ratio,
# axis limits) that can be added to a plot with `+`. The hoop is at (0, 0).
#
# The markings are added with annotate() rather than geom_*(aes(<constants>)):
# a geom with constant aesthetics inherits the plot's data and is redrawn
# once per data row, whereas annotate() draws each marking exactly once.
draw_court <- function() {
  arc_angles <- seq(0.38, pi - 0.38, length.out = 100)

  list(
    # Paint
    annotate("rect", xmin = -80, xmax = 80, ymin = -47.5, ymax = 142.5,
             fill = NA, color = "black"),
    # Three point arc
    annotate("path",
             x = 237.5 * cos(arc_angles),
             y = 237.5 * sin(arc_angles),
             color = "black"),
    # Corner threes
    annotate("segment", x = -220, xend = -220, y = -47.5, yend = 92.5, color = "black"),
    annotate("segment", x = 220, xend = 220, y = -47.5, yend = 92.5, color = "black"),
    # Hoop
    annotate("point", x = 0, y = 0, size = 3),
    # Limits
    coord_fixed(),
    xlim(-250, 250),
    ylim(-50, 300)
  )
}

# Create shot heatmap
# Builds a ggplot of shot density (filled contour polygons, yellow to red)
# over the court markings. `shots_df` needs numeric columns `x` and `y`;
# `player_name` is used in the title. Returns the ggplot object.
create_shot_heatmap <- function(shots_df, player_name) {
  density_layer <- stat_density_2d(
    aes(fill = after_stat(level)),
    geom = "polygon",
    alpha = 0.6
  )

  # Hide axis text, titles and grid so only the court and heatmap remain
  blank_axes <- theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank()
  )

  plot_title <- paste(player_name, "Shot Heatmap")

  ggplot(shots_df, aes(x = x, y = y)) +
    draw_court() +
    density_layer +
    scale_fill_gradient(low = "yellow", high = "red") +
    labs(title = plot_title, fill = "Density") +
    theme_minimal() +
    blank_axes
}

# Example usage
# Fixed seed; 110 synthetic shots in three groups: 50 centered near x = 0 and
# 30 on each wing (x = -150 and x = 150), all centered at y = 50.
set.seed(42)
shots <- data.frame(
  x = c(rnorm(50, 0, 50), rnorm(30, -150, 30), rnorm(30, 150, 30)),
  y = c(rnorm(50, 50, 50), rnorm(30, 50, 20), rnorm(30, 50, 20))
)

# Called at top level, so the returned plot is printed automatically.
create_shot_heatmap(shots, "LeBron James")

Output Example:

[Heatmap showing shot frequency by court location]

python Soccer

Create Soccer Pitch with Passes

Draw a soccer pitch and visualize player passes as arrows with success/failure coloring.

import matplotlib.pyplot as plt
from matplotlib.patches import Arc, Circle, Rectangle

def draw_pitch(ax, pitch_length=120, pitch_width=80):
    """Draw a soccer pitch on ``ax`` and return the axes.

    White markings (outline, halfway line, center circle, penalty boxes,
    goal boxes, penalty spots) are drawn on a green background.

    Args:
        ax: Matplotlib axes to draw on.
        pitch_length: Pitch extent along x.
        pitch_width: Pitch extent along y.

    NOTE(review): the box, spot and circle sizes (16.5, 5.5, 11, 9.15) are the
    standard dimensions in metres, while the default 120 x 80 pitch looks like
    a yard-based grid; confirm which unit the data uses.
    """
    marking = dict(color='white', linewidth=2)
    mid_x = pitch_length / 2
    mid_y = pitch_width / 2

    # Pitch outline
    ax.plot([0, pitch_length, pitch_length, 0, 0],
            [0, 0, pitch_width, pitch_width, 0], **marking)

    # Center circle and halfway line
    ax.add_patch(Circle((mid_x, mid_y), 9.15, fill=False, **marking))
    ax.plot([mid_x, mid_x], [0, pitch_width], **marking)

    # Both ends: `direction` points from the goal line into the pitch
    for goal_x, direction in ((0, 1), (pitch_length, -1)):
        # (depth from goal line, half-width): penalty box, then goal box
        for depth, half_width in ((16.5, 20.15), (5.5, 9.15)):
            box_x = goal_x + direction * depth
            ax.plot([goal_x, box_x, box_x, goal_x],
                    [mid_y - half_width, mid_y - half_width,
                     mid_y + half_width, mid_y + half_width], **marking)

        # Penalty spot
        ax.plot(goal_x + direction * 11, mid_y, 'wo', markersize=3)

    ax.set_facecolor('#228B22')
    ax.set_xlim(-5, pitch_length + 5)
    ax.set_ylim(-5, pitch_width + 5)
    ax.set_aspect('equal')

    # Hide ticks and spines individually. ax.axis('off') would also stop the
    # axes background patch from being drawn, so the green face color set
    # above would never appear.
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)
    return ax

def plot_passes(ax, passes_df, player_name):
    """Draw each pass as an arrow from its start to its end point.

    Reads 'start_x', 'start_y', 'end_x', 'end_y' and, when present,
    'successful' from every row. Arrows are yellow for truthy 'successful'
    (or when the column is absent) and red otherwise. Sets the title and
    returns ``ax``.
    """
    outcome_colors = {True: 'yellow', False: 'red'}

    for _, row in passes_df.iterrows():
        completed = bool(row.get('successful', True))
        origin = (row['start_x'], row['start_y'])
        destination = (row['end_x'], row['end_y'])
        arrow_style = dict(arrowstyle='->', color=outcome_colors[completed],
                           lw=1.5, alpha=0.7)
        # An empty annotation text leaves just the arrow
        ax.annotate('', xy=destination, xytext=origin, arrowprops=arrow_style)

    ax.set_title(f'{player_name} Passes', color='white', fontsize=14, fontweight='bold')
    return ax

# Example
import pandas as pd
import numpy as np

fig, ax = plt.subplots(figsize=(12, 8))
# Default pitch size (120 x 80).
draw_pitch(ax)

# 20 synthetic passes; no random seed is set, so they differ from run to run.
passes = pd.DataFrame({
    'start_x': np.random.uniform(30, 90, 20),
    'start_y': np.random.uniform(20, 60, 20),
    'end_x': np.random.uniform(40, 110, 20),
    'end_y': np.random.uniform(20, 60, 20),
    'successful': np.random.choice([True, False], 20, p=[0.85, 0.15])
})

plot_passes(ax, passes, "Kevin De Bruyne")
# Dark figure background so the white title set by plot_passes is readable.
fig.patch.set_facecolor('#1a1a1a')
plt.show()

Output Example:

[Soccer pitch with pass arrows showing direction and outcome]

python Soccer

Create xG Shot Map

Create a shot map showing xG values as circle sizes and goals/non-goals as colors.

import matplotlib.pyplot as plt
import numpy as np

def plot_xg_shot_map(shots_df, team_name):
    """
    Create shot map with xG values.
    Circle size = xG value
    Color = goal (green) or no goal (red)

    Args:
        shots_df: DataFrame with 'x', 'y', 'xg' and 'is_goal' columns. The
            goal line is drawn at x = 0 with the goal spanning y = 30..50.
        team_name: Text shown as the heading inside the plot.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Draw half pitch (attacking half only)
    ax.set_facecolor('#228B22')

    # Goal
    ax.plot([0, 0], [30, 50], 'white', linewidth=5)

    # Penalty area
    ax.plot([0, 16.5, 16.5, 0], [20, 20, 60, 60], 'white', linewidth=2)

    # 6-yard box
    ax.plot([0, 5.5, 5.5, 0], [32, 32, 48, 48], 'white', linewidth=2)

    # Plot shots in one call: marker area scales with xG, color marks goals
    marker_colors = ['#00ff00' if scored else '#ff4444'
                     for scored in shots_df['is_goal']]
    marker_sizes = shots_df['xg'] * 500  # Scale xG to marker size
    ax.scatter(shots_df['x'], shots_df['y'], s=marker_sizes, c=marker_colors,
               alpha=0.7, edgecolors='white', linewidths=1)

    # Calculate totals
    total_xg = shots_df['xg'].sum()
    total_goals = shots_df['is_goal'].sum()

    ax.set_xlim(-2, 60)
    ax.set_ylim(15, 65)
    ax.set_aspect('equal')

    # Hide ticks and spines individually. ax.axis('off') would also stop the
    # axes background patch from being drawn, so the green pitch color set
    # above would never appear.
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

    ax.text(30, 63, f'{team_name}', fontsize=16, ha='center',
            color='white', fontweight='bold')
    ax.text(30, 17, f'xG: {total_xg:.2f} | Goals: {total_goals}',
            fontsize=12, ha='center', color='white')

    fig.patch.set_facecolor('#1a1a1a')
    plt.tight_layout()
    return fig

# Example
import pandas as pd
# Eight hand-written shots; x is the distance from the goal line drawn at
# x = 0, and the goal is centered at y = 40.
shots = pd.DataFrame({
    'x': [8, 12, 20, 6, 25, 10, 15, 35],
    'y': [40, 45, 38, 42, 50, 35, 40, 40],
    'xg': [0.45, 0.22, 0.08, 0.55, 0.04, 0.35, 0.18, 0.02],
    'is_goal': [True, False, False, True, False, False, True, False]
})

fig = plot_xg_shot_map(shots, "Manchester City")
plt.show()

Output Example:

[Shot map with xG circles showing shot quality and outcomes]

python Football

Plot EPA by Play Type

Create EPA visualizations showing distribution by play type and weekly trends.

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def plot_epa_by_play_type(pbp_df, team_name):
    """Plot side-by-side EPA density curves for pass and run plays.

    Reads the 'play_type' and 'epa' columns; missing EPA values are dropped.
    Each panel marks the mean EPA with a dashed red line. Returns the Figure.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Split EPA by play type before any drawing happens
    epa_by_type = {
        play_type: pbp_df[pbp_df['play_type'] == play_type]['epa'].dropna()
        for play_type in ('pass', 'run')
    }

    # (play_type value, word used in the title, fill color), left to right
    panels = (
        ('pass', 'Pass', '#3498db'),
        ('run', 'Rush', '#2ecc71'),
    )
    for panel, (play_type, title_word, shade) in zip(axes, panels):
        epa_values = epa_by_type[play_type]
        sns.kdeplot(epa_values, ax=panel, fill=True, color=shade, alpha=0.7)
        panel.axvline(epa_values.mean(), color='red', linestyle='--',
                      label=f'Mean: {epa_values.mean():.3f}')
        panel.set_title(f'{team_name} {title_word} EPA Distribution', fontsize=12)
        panel.set_xlabel('EPA')
        panel.legend()

    plt.tight_layout()
    return fig

def plot_weekly_epa(pbp_df, team_name):
    """Bar chart of total EPA per week; negative weeks are recolored red.

    Reads the 'week' and 'epa' columns of ``pbp_df`` and returns the Figure.
    """
    weekly = (
        pbp_df.groupby('week')['epa']
        .agg(['sum', 'mean', 'count'])
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.bar(weekly['week'], weekly['sum'], color='steelblue', alpha=0.7)
    ax.axhline(0, color='black', linestyle='-', linewidth=0.5)

    ax.set_xlabel('Week', fontsize=12)
    ax.set_ylabel('Total EPA', fontsize=12)
    ax.set_title(f'{team_name} Weekly EPA', fontsize=14, fontweight='bold')

    # The bar patches are in the same order as the rows of `weekly`
    for bar, week_total in zip(ax.patches, weekly['sum']):
        if week_total < 0:
            bar.set_color('#e74c3c')

    plt.tight_layout()
    return fig

# Example
# Fixed seed; 500 synthetic plays (60% pass) with EPA ~ N(0.05, 0.5) and
# weeks drawn from 1-17.
np.random.seed(42)
sample_pbp = pd.DataFrame({
    'play_type': np.random.choice(['pass', 'run'], 500, p=[0.6, 0.4]),
    'epa': np.random.normal(0.05, 0.5, 500),
    'week': np.random.randint(1, 18, 500)
})

# Only the distribution plot is produced here; plot_weekly_epa is not called.
fig = plot_epa_by_play_type(sample_pbp, "Kansas City Chiefs")
plt.show()

Output Example:

[EPA distribution plots for pass and run plays; call plot_weekly_epa(sample_pbp, "Kansas City Chiefs") to also produce the weekly bar chart]

python Hockey

Create Hockey Rink Shot Plot

Draw an NHL rink and plot shots with xG values as colors and sizes, highlighting goals.

import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle, Arc
import numpy as np

def draw_rink(ax):
    """Draw a half rink (offensive zone) on ``ax`` and return the axes.

    The drawing spans x = 0..100 and y = 0..85, with the goal line at x = 89
    and the blue line at x = 25. The outline is left open along x = 0.
    """
    ax.set_facecolor('#FFFFFF')

    # Rink outline (half): bottom, right and top boards
    boards_x = [0, 100, 100, 0]
    boards_y = [0, 0, 85, 85]
    ax.plot(boards_x, boards_y, 'black', linewidth=2)

    # Blue line, then goal line
    ax.axvline(x=25, color='blue', linewidth=3)
    ax.axvline(x=89, color='red', linewidth=2)

    # Goal crease: half circle opening away from the net
    ax.add_patch(Arc((89, 42.5), 12, 12, theta1=90, theta2=270,
                     color='blue', linewidth=2))

    # Goal frame behind the goal line
    ax.plot([89, 93, 93, 89], [40, 40, 45, 45], 'red', linewidth=3)

    ring_style = dict(fill=False, color='red', linewidth=2)

    # Face-off circles with their center dots
    for dot_y in (20, 65):
        ax.add_patch(Circle((69, dot_y), 15, **ring_style))
        ax.plot(69, dot_y, 'ro', markersize=3)

    # Circle centered on the blue line
    ax.add_patch(Circle((25, 42.5), 15, **ring_style))

    ax.set_xlim(-5, 105)
    ax.set_ylim(-5, 90)
    ax.set_aspect('equal')
    ax.axis('off')
    return ax

def plot_shots(ax, shots_df, title):
    """Scatter shots on ``ax`` with color and size driven by xG.

    Reads the 'x', 'y', 'xg' and 'is_goal' columns. Goals get an extra gold
    star drawn on top. Adds a colorbar for xG, sets the title, and returns
    ``ax``.
    """
    xg_values = shots_df['xg']
    shot_markers = ax.scatter(
        shots_df['x'], shots_df['y'],
        c=xg_values,
        cmap='RdYlGn',
        s=xg_values * 200 + 20,  # small floor so low-xG shots stay visible
        alpha=0.7,
        edgecolors='black',
        linewidths=0.5,
    )

    # Mark goals (explicit comparison keeps the mask boolean for 0/1 columns)
    scored_mask = shots_df['is_goal'] == True
    scored = shots_df[scored_mask]
    ax.scatter(scored['x'], scored['y'], marker='*', s=200,
               c='gold', edgecolors='black', linewidths=1, zorder=5)

    ax.set_title(title, fontsize=14, fontweight='bold')
    plt.colorbar(shot_markers, ax=ax, label='xG', shrink=0.6)
    return ax

# Example
import pandas as pd
fig, ax = plt.subplots(figsize=(12, 8))
draw_rink(ax)

# Thirty synthetic shots; the random columns are drawn in the order
# x, y, xg, is_goal.
n_shots = 30
shot_x = np.random.uniform(50, 88, n_shots)
shot_y = np.random.uniform(20, 65, n_shots)
shot_xg = np.random.uniform(0.02, 0.25, n_shots)
shot_scored = np.random.choice([True, False], n_shots, p=[0.1, 0.9])
shots = pd.DataFrame({
    'x': shot_x,
    'y': shot_y,
    'xg': shot_xg,
    'is_goal': shot_scored,
})

plot_shots(ax, shots, "Colorado Avalanche Shots")
plt.tight_layout()
plt.show()

Output Example:

[Hockey rink with shot locations colored by xG and starred goals]

python

Create Player Comparison Radar Chart

Create a radar/spider chart for comparing two players across multiple statistical categories.

import matplotlib.pyplot as plt
import numpy as np
from math import pi

def create_radar_chart(categories, player1_values, player2_values,
                       player1_name, player2_name, title="Player Comparison"):
    """
    Build a two-player radar (spider) chart and return its figure.

    Args:
        categories: List of stat categories, one per spoke
        player1_values: Normalized values (0-100) for player 1
        player2_values: Normalized values (0-100) for player 2
        player1_name: Legend label for player 1
        player2_name: Legend label for player 2
        title: Chart title

    Returns:
        The matplotlib figure containing the polar plot.
    """
    spoke_count = len(categories)

    # One evenly spaced angle per category; repeat the first to close the loop.
    spoke_angles = [idx / float(spoke_count) * 2 * pi
                    for idx in range(spoke_count)]
    closed_angles = spoke_angles + spoke_angles[:1]

    # Each series repeats its first value so the polygon closes.
    series = (
        (player1_name, list(player1_values) + [player1_values[0]], '#3498db'),
        (player2_name, list(player2_values) + [player2_values[0]], '#e74c3c'),
    )

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

    # Outline plus translucent fill for each player.
    for label, values, colour in series:
        ax.plot(closed_angles, values, 'o-', linewidth=2,
                label=label, color=colour)
        ax.fill(closed_angles, values, alpha=0.25, color=colour)

    # Category names go on the spokes (without the duplicated closing angle).
    ax.set_xticks(spoke_angles)
    ax.set_xticklabels(categories, fontsize=11)

    # Radial axis fixed to the 0-100 scale with a ring every 20 units.
    ring_levels = [20, 40, 60, 80, 100]
    ax.set_ylim(0, 100)
    ax.set_yticks(ring_levels)
    ax.set_yticklabels([str(level) for level in ring_levels], fontsize=9)

    ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

    plt.tight_layout()
    return fig

# Example: NFL QB comparison (values are already on a 0-100 scale)
categories = [
    'Completion %', 'TD Rate', 'INT Avoid', 'Yards/Att',
    'Pressure Rate', 'Deep Ball', 'Red Zone', 'Clutch',
]
qb1 = [85, 90, 92, 78, 70, 82, 88, 95]  # first quarterback
qb2 = [80, 75, 85, 82, 65, 78, 75, 72]  # second quarterback

fig = create_radar_chart(
    categories, qb1, qb2,
    player1_name="Patrick Mahomes",
    player2_name="Josh Allen",
    title="2023 QB Comparison",
)
plt.show()

Output Example:

[Radar chart showing overlapping player performance areas]

python

Create Rolling Average Performance Chart

Create a rolling average chart to visualize performance trends over a season.

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def plot_rolling_performance(df, metric_col, date_col, player_name,
                            window=10, title=None):
    """
    Create rolling average chart to show performance trends.

    Draws the raw per-game values, a rolling mean, the season mean, and
    green/red shading where the rolling mean is above/below the season mean.

    Args:
        df: DataFrame with player game-by-game data
        metric_col: Column name of the metric to plot
        date_col: Column name with dates or game numbers (x-axis values)
        player_name: Name used in the default title
        window: Rolling window size
        title: Optional title; defaults to "<player_name> <metric_col> Trend"

    Returns:
        The matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(14, 6))

    # min_periods=1 so the first games still get an average
    rolling_avg = df[metric_col].rolling(window=window, min_periods=1).mean()

    # Plot raw values
    ax.scatter(df[date_col], df[metric_col], alpha=0.4, s=50,
              c='#3498db', label='Game Value')

    # Plot rolling average
    ax.plot(df[date_col], rolling_avg, color='#e74c3c', linewidth=3,
           label=f'{window}-Game Rolling Avg')

    # Season average line
    season_avg = df[metric_col].mean()
    ax.axhline(season_avg, color='gray', linestyle='--', alpha=0.7,
               label=f'Season Avg: {season_avg:.2f}')

    # Highlight improvement/decline zones
    ax.fill_between(df[date_col], rolling_avg, season_avg,
                    where=(rolling_avg > season_avg),
                    alpha=0.2, color='green', label='Above Average')
    ax.fill_between(df[date_col], rolling_avg, season_avg,
                    where=(rolling_avg <= season_avg),
                    alpha=0.2, color='red', label='Below Average')

    # Styling
    ax.set_xlabel('Game', fontsize=12)
    ax.set_ylabel(metric_col, fontsize=12)
    ax.set_title(title or f'{player_name} {metric_col} Trend',
                fontsize=14, fontweight='bold')
    # The legend only picks up artists that already exist, so it is built
    # after the shaded zones to include their labels.
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

# Example: Player scoring trend over an 82-game season (seeded for repeatability)
np.random.seed(42)
n_games = 82
raw_points = np.random.normal(25, 8, n_games).clip(5, 50)
games = pd.DataFrame({
    'game': range(1, n_games + 1),
    'points': raw_points,
})

# Add some trend: rows from index 40 onward get a 5-point bump
later_rows = games.index >= 40
games.loc[later_rows, 'points'] = games.loc[later_rows, 'points'] + 5

fig = plot_rolling_performance(games, 'points', 'game', "Jayson Tatum", window=10)
plt.show()

Output Example:

[Line chart showing raw performance and rolling average with trend zones]

Create Team Rankings Bump Chart

Create a bump chart showing how team rankings change over time using ggbump.

library(ggplot2)
library(ggbump)
library(dplyr)

# Build a bump chart from a rankings data frame.
# rankings_df must provide the columns `week`, `rank` and `team`, which are
# mapped to x, y and colour respectively. Returns the ggplot object.
create_bump_chart <- function(rankings_df, title = "Team Rankings Over Time") {
  ggplot(rankings_df, aes(x = week, y = rank, color = team)) +
    # Smoothed rank trajectories with a point at every week
    geom_bump(size = 2, smooth = 8) +
    geom_point(size = 4) +
    # Team names to the left of the first week ...
    geom_text(data = rankings_df %>% filter(week == min(week)),
              aes(label = team), x = 0.7, hjust = 1, fontface = "bold") +
    # ... and to the right of the last week (these replace the legend)
    geom_text(data = rankings_df %>% filter(week == max(week)),
              aes(label = team), x = max(rankings_df$week) + 0.3, hjust = 0, fontface = "bold") +
    # Reversed y axis so rank 1 is drawn at the top
    scale_y_reverse(breaks = 1:10) +
    scale_x_continuous(breaks = 1:max(rankings_df$week)) +
    labs(title = title,
         x = "Week",
         y = "Ranking") +
    theme_minimal() +
    theme(
      legend.position = "none",
      panel.grid.major.y = element_blank(),
      panel.grid.minor = element_blank(),
      axis.text = element_text(size = 11),
      plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
    )
}

# Example data: five teams over ten weeks. Each team starts from a fixed base
# rank, gets a random offset between -2 and 2 every week, and the result is
# clamped to the 1-5 range. Different teams can therefore share a rank in the
# same week.
set.seed(42)  # fixed seed so the random offsets are reproducible
teams <- c("Chiefs", "49ers", "Eagles", "Bills", "Cowboys")
rankings <- expand.grid(team = teams, week = 1:10) %>%
  group_by(team) %>%
  mutate(
    base_rank = case_when(
      team == "Chiefs" ~ 1,
      team == "49ers" ~ 2,
      team == "Eagles" ~ 3,
      team == "Bills" ~ 4,
      team == "Cowboys" ~ 5
    ),
    # n() is the number of rows in the current team's group
    rank = base_rank + sample(-2:2, n(), replace = TRUE)
  ) %>%
  # Clamp ranks into 1..5
  mutate(rank = pmin(pmax(rank, 1), 5)) %>%
  ungroup()

create_bump_chart(rankings, "NFL Power Rankings by Week")

Output Example:

[Bump chart showing team ranking trajectories across weeks]

python Baseball

Calculate WAR Components

Calculate Wins Above Replacement (WAR) from its component parts: batting, baserunning, fielding, and adjustments.

import pandas as pd
import numpy as np

def calculate_batting_runs(row, league_woba, woba_scale):
    """Return batting runs above average (wRAA) for one player row.

    ``row`` must provide ``'wOBA'`` and ``'PA'``. The wOBA gap to the league
    is divided by ``woba_scale`` and multiplied by plate appearances.
    """
    woba_gap = row['wOBA'] - league_woba
    plate_appearances = row['PA']
    runs_per_pa = woba_gap / woba_scale
    return runs_per_pa * plate_appearances

def calculate_war(batting_runs, baserunning_runs, fielding_runs,
                  positional_adj, league_adj, replacement_runs, rpw=10):
    """
    Combine run components into Wins Above Replacement.

    Args:
        batting_runs: Offensive runs above average
        baserunning_runs: Baserunning runs
        fielding_runs: Defensive runs above average
        positional_adj: Position adjustment
        league_adj: League adjustment
        replacement_runs: Runs vs replacement level
        rpw: Runs per win (default 10)

    Returns:
        WAR value rounded to one decimal place
    """
    run_components = (
        batting_runs, baserunning_runs, fielding_runs,
        positional_adj, league_adj, replacement_runs,
    )
    # Accumulate left to right, starting from the first component.
    run_total = run_components[0]
    for component in run_components[1:]:
        run_total = run_total + component

    # Convert runs to wins.
    return round(run_total / rpw, 1)

# Example: run components for one player-season
run_components = {
    'batting_runs': 35.2,
    'baserunning_runs': 2.5,
    'fielding_runs': -5.0,
    'positional_adj': -7.5,
    'league_adj': 2.0,
    'replacement_runs': 20.0,
}
player_war = calculate_war(**run_components)
print(f"Player WAR: {player_war}")

Output Example:

Player WAR: 4.7

python Baseball

Calculate wOBA and wRC+

Calculate wOBA (weighted On-Base Average) and wRC+ from raw batting statistics.

import pandas as pd

def calculate_woba(bb, hbp, singles, doubles, triples, hr, ab, sf, ibb=0):
    """
    Return weighted On-Base Average from counting stats.

    Intentional walks are excluded from both the walk credit and the
    denominator. Weights are approximate (2023 values). Returns 0 when the
    denominator is not positive.
    """
    unintentional_bb = bb - ibb
    weighted_events = (0.69 * unintentional_bb + 0.72 * hbp + 0.88 * singles +
                       1.24 * doubles + 1.56 * triples + 2.00 * hr)
    opportunities = ab + bb - ibb + sf + hbp

    if opportunities > 0:
        return weighted_events / opportunities
    return 0

def calculate_wrc_plus(woba, pa, league_woba=0.320, woba_scale=1.15,
                       park_factor=100, league_rppa=0.12):
    """
    Return wRC+ (park and league adjusted runs created), rounded to an integer.

    100 = league average. ``park_factor`` is on a 100-based scale and
    ``league_rppa`` is league runs per plate appearance.
    """
    runs_above_avg = ((woba - league_woba) / woba_scale) * pa
    # Player runs per PA, expressed on top of the league baseline.
    runs_per_pa = runs_above_avg / pa + league_rppa
    # Park correction: zero in a neutral (100) park.
    park_correction = league_rppa - (park_factor / 100) * league_rppa
    index = ((runs_per_pa + park_correction) / league_rppa) * 100
    return round(index)

# Example player stats (season totals)
player = {
    'BB': 60, 'HBP': 5, '1B': 100, '2B': 30,
    '3B': 5, 'HR': 25, 'AB': 500, 'SF': 4, 'PA': 600
}

# Pull the counting stats in the positional order calculate_woba expects.
woba_inputs = [player[key] for key in
               ('BB', 'HBP', '1B', '2B', '3B', 'HR', 'AB', 'SF')]
woba = calculate_woba(*woba_inputs)
wrc_plus = calculate_wrc_plus(woba, player['PA'])

print(f"wOBA: {woba:.3f}")
print(f"wRC+: {wrc_plus}")

Output Example:

wOBA: 0.401
wRC+: 158

r Baseball

Calculate FIP and xFIP

Calculate FIP (Fielding Independent Pitching) and xFIP which regresses home runs to league average fly ball rate.

calculate_fip <- function(hr, bb, hbp, k, ip, fip_constant = 3.10) {
  # Fielding Independent Pitching: weighted HR, BB+HBP and K per inning,
  # shifted by the league constant and rounded to two decimals.
  weighted_events <- 13 * hr + 3 * (bb + hbp) - 2 * k
  per_inning <- weighted_events / ip
  round(per_inning + fip_constant, 2)
}

calculate_xfip <- function(fb, lg_hr_fb_rate, bb, hbp, k, ip, fip_constant = 3.10) {
  # Expected FIP: same shape as FIP, but home runs are replaced by
  # fly balls times the league HR/FB rate.
  expected_hr <- fb * lg_hr_fb_rate
  weighted_events <- 13 * expected_hr + 3 * (bb + hbp) - 2 * k
  round(weighted_events / ip + fip_constant, 2)
}

# Example pitcher stats (FB = fly balls allowed)
pitcher <- list(
  HR = 20, BB = 50, HBP = 5, K = 180, IP = 180,
  FB = 200
)

# League-average HR per fly ball used for xFIP
league_hr_per_fb <- 0.10

fip <- with(pitcher, calculate_fip(HR, BB, HBP, K, IP))
xfip <- with(pitcher, calculate_xfip(FB, league_hr_per_fb, BB, HBP, K, IP))

cat("FIP:", fip, "\n")
cat("xFIP:", xfip, "\n")

Output Example:

FIP: 3.46
xFIP: 3.46

python Basketball

Calculate Advanced Shooting Metrics

Calculate True Shooting %, Effective FG%, and Points Per Shot for accurate shooting efficiency analysis.

def true_shooting_pct(points, fga, fta):
    """Return True Shooting % (0-100 scale): points per two shooting possessions."""
    # 0.44 * FTA approximates possessions that ended in free throws.
    shooting_possessions = fga + 0.44 * fta
    return points / (2 * shooting_possessions) * 100

def effective_fg_pct(fgm, three_pm, fga):
    """Return eFG% (0-100 scale); each made three counts as 1.5 field goals."""
    weighted_makes = fgm + 0.5 * three_pm
    return weighted_makes / fga * 100

def points_per_shot(points, fga, fta):
    """Return points scored per shooting possession (FGA + 0.44 * FTA)."""
    shooting_possessions = fga + 0.44 * fta
    return points / shooting_possessions

# Example player game (single-game box score line)
game = {
    'points': 35, 'fga': 22, 'fgm': 12,
    'three_pm': 5, 'fta': 8, 'ftm': 6
}

pts, attempts, free_throws = game['points'], game['fga'], game['fta']

ts = true_shooting_pct(pts, attempts, free_throws)
efg = effective_fg_pct(game['fgm'], game['three_pm'], attempts)
pps = points_per_shot(pts, attempts, free_throws)

print(f"True Shooting %: {ts:.1f}%")
print(f"Effective FG %: {efg:.1f}%")
print(f"Points Per Shot: {pps:.2f}")

Output Example:

True Shooting %: 68.6%
Effective FG %: 65.9%
Points Per Shot: 1.37

python Basketball

Calculate PER (Player Efficiency Rating)

Player Efficiency Rating (PER) calculation framework. Full implementation requires team and league adjustment factors.

def calculate_per(stats, team_stats, league_stats):
    """
    Calculate Player Efficiency Rating (simplified version).

    Note: Full PER calculation involves many adjustments.
    This is a simplified approximation: the value returned is the
    unadjusted per-minute rating (uPER); no pace or league normalization
    is applied here.

    Args:
        stats: Player totals; keys read are 'min', '3pm', 'ast', 'fg', 'ft',
            'tov', 'fga', 'fta', 'trb', 'orb', 'stl', 'blk', 'pf'.
        team_stats: Team totals; keys read are 'ast' and 'fg'.
        league_stats: League totals; keys read are 'ast', 'fg', 'ft', 'pts',
            'fga', 'orb', 'tov', 'fta', 'trb', 'pf'.

    Returns:
        uPER rounded to one decimal place.

    Raises:
        ZeroDivisionError: If stats['min'] or any other divisor below is 0.
    """
    # Factor calculations
    # League-wide adjustment applied to the value of a made field goal
    factor = (2/3) - (0.5 * (league_stats['ast'] / league_stats['fg'])) / \
             (2 * (league_stats['fg'] / league_stats['ft']))

    # League points per possession (possessions estimated from FGA, ORB, TOV, FTA)
    vop = league_stats['pts'] / (league_stats['fga'] - league_stats['orb'] +
                                   league_stats['tov'] + 0.44 * league_stats['fta'])

    # Share of league rebounds that are defensive
    drbp = (league_stats['trb'] - league_stats['orb']) / league_stats['trb']

    # uPER calculation (unadjusted)
    # Positive terms credit scoring, assists, rebounds, steals and blocks;
    # negative terms charge turnovers, missed shots, missed free throws and fouls.
    uper = (1 / stats['min']) * (
        stats['3pm'] +
        (2/3) * stats['ast'] +
        (2 - factor * (team_stats['ast'] / team_stats['fg'])) * stats['fg'] +
        stats['ft'] * 0.5 * (1 + (1 - (team_stats['ast'] / team_stats['fg'])) +
                              (2/3) * (team_stats['ast'] / team_stats['fg'])) -
        vop * stats['tov'] -
        vop * drbp * (stats['fga'] - stats['fg']) -
        vop * 0.44 * (0.44 + (0.56 * drbp)) * (stats['fta'] - stats['ft']) +
        vop * (1 - drbp) * (stats['trb'] - stats['orb']) +
        vop * drbp * stats['orb'] +
        vop * stats['stl'] +
        vop * drbp * stats['blk'] -
        stats['pf'] * (league_stats['ft'] / league_stats['pf'] - 0.44 *
                        (league_stats['fta'] / league_stats['pf']) * vop)
    )

    return round(uper, 1)

# No worked example here: the function needs player, team and league totals.
print("PER calculation requires full season/league context.")

Output Example:

PER calculation requires full season/league context.

r Basketball

Calculate Four Factors

Calculate Dean Oliver's Four Factors: eFG%, Turnover Rate, Offensive Rebounding Rate, and Free Throw Rate.

# Dean Oliver's Four Factors of Basketball Success

# Compute Dean Oliver's Four Factors for one team.
#
# Arguments:
#   fgm, fga, three_pm  field goals made/attempted and made threes
#   tov                 turnovers
#   orb, opp_drb        own offensive rebounds, opponent defensive rebounds
#   ftm                 free throws made
#   pts_allowed,
#   opp_pts             accepted for backward compatibility; not used
#   fta                 free throws attempted (optional). The standard
#                       turnover-rate denominator uses attempts; when fta is
#                       not supplied, ftm is used instead, as before.
#
# Returns a list with eFG_pct, TOV_pct, ORB_pct and FT_rate, each expressed
# as a percentage rounded to one decimal.
calculate_four_factors <- function(fgm, fga, three_pm, tov, orb, opp_drb,
                                   ftm, pts_allowed, opp_pts, fta = NULL) {
  # Fall back to made free throws when attempts are not provided
  if (is.null(fta)) {
    fta <- ftm
  }

  # 1. Effective FG% (~40% weight)
  efg <- (fgm + 0.5 * three_pm) / fga

  # 2. Turnover Rate (~25% weight): turnovers per estimated possession
  tov_rate <- tov / (fga + 0.44 * fta + tov)

  # 3. Offensive Rebounding Rate (~20% weight)
  orb_rate <- orb / (orb + opp_drb)

  # 4. Free Throw Rate (~15% weight): made free throws per field goal attempt
  ft_rate <- ftm / fga

  return(list(
    eFG_pct = round(efg * 100, 1),
    TOV_pct = round(tov_rate * 100, 1),
    ORB_pct = round(orb_rate * 100, 1),
    FT_rate = round(ft_rate * 100, 1)
  ))
}

# Example team game stats
factors <- calculate_four_factors(
  fgm = 42, fga = 88, three_pm = 12,
  tov = 12, orb = 10, opp_drb = 35,
  ftm = 18, pts_allowed = 105, opp_pts = 110
)

# Print each factor with its label, in the order they were computed
factor_labels <- c(
  eFG_pct = "eFG%:",
  TOV_pct = "TOV%:",
  ORB_pct = "ORB%:",
  FT_rate = "FT Rate:"
)

cat("Four Factors Analysis:\n")
for (key in names(factor_labels)) {
  cat(factor_labels[[key]], factors[[key]], "%\n")
}

Output Example:

Four Factors Analysis:
eFG%: 54.5%
TOV%: 11.1%
ORB%: 22.2%
FT Rate: 20.5%

python Football

Calculate NFL Passer Rating

Calculate NFL Passer Rating from completions, attempts, yards, touchdowns, and interceptions.

def calculate_passer_rating(comp, att, yards, td, int_):
    """
    Return the NFL Passer Rating (0-158.3 scale), rounded to one decimal.

    Four components (completion rate, yards per attempt, touchdown rate and
    interception rate) are each clamped to the range 0 to 2.375 before they
    are combined.
    """
    raw_components = (
        ((comp / att) - 0.3) * 5,       # a: completion percentage
        ((yards / att) - 3) * 0.25,     # b: yards per attempt
        (td / att) * 20,                # c: touchdown percentage
        2.375 - ((int_ / att) * 25),    # d: interception percentage (inverted)
    )

    # Clamp every component into [0, 2.375].
    a, b, c, d = [max(0, min(value, 2.375)) for value in raw_components]

    # Final rating
    return round(((a + b + c + d) / 6) * 100, 1)

# Example game
# The keys mirror the function's parameter names so the dict can be unpacked
# with ** (the interceptions parameter is named int_, not int).
game = {'comp': 28, 'att': 35, 'yards': 350, 'td': 4, 'int_': 0}
rating = calculate_passer_rating(**game)
print(f"Passer Rating: {rating}")

# Perfect passer rating requirements
print("\nFor 158.3 rating, need:")
print("- 77.5%+ completion")
print("- 12.5+ yards/attempt")
print("- 11.875%+ TD rate")
print("- 0% INT rate")

Output Example:

Passer Rating: 146.4

For 158.3 rating, need:
- 77.5%+ completion
- 12.5+ yards/attempt
- 11.875%+ TD rate
- 0% INT rate

python Football

Calculate Expected Points Added (EPA)

Calculate Expected Points Added (EPA) using a simplified expected points model based on down, distance, and field position.

import numpy as np

def get_expected_points(down, distance, yard_line):
    """
    Return a toy expected-points estimate, rounded to two decimals.

    Field position scales linearly from 0 (own goal line) to 7 (opponent's),
    later downs subtract a fixed penalty, and on downs before 4th each yard
    to go beyond 10 subtracts 0.05 (each yard under 10 adds 0.05).
    Real models are fitted to historical play-by-play data.
    """
    down_penalty = {1: 0, 2: -0.5, 3: -1.0, 4: -2.0}.get(down, 0)
    expected = (yard_line / 100) * 7 + down_penalty

    # Distance only matters before 4th down in this simplified model.
    if down < 4:
        expected -= (distance - 10) * 0.05

    return round(expected, 2)

def calculate_epa(ep_before, ep_after, points_scored=0):
    """Return EPA for one play: change in expected points plus points scored."""
    ep_change = ep_after - ep_before
    return round(ep_change + points_scored, 2)

# Example: 15-yard gain on 1st and 10 from own 25
start_state = {'down': 1, 'distance': 10, 'yard_line': 25}
end_state = {'down': 1, 'distance': 10, 'yard_line': 40}

ep_before = get_expected_points(**start_state)
ep_after = get_expected_points(**end_state)
epa = calculate_epa(ep_before, ep_after)

print(f"EP Before: {ep_before}")
print(f"EP After: {ep_after}")
print(f"EPA: {epa}")

Output Example:

EP Before: 1.75
EP After: 2.8
EPA: 1.05

python Soccer

Calculate xG from Shot Location

Simplified xG model based on shot distance and angle. Real models use machine learning with additional features.

import numpy as np

def calculate_xg(distance, angle, is_header=False, is_penalty=False):
    """
    Return a simplified expected-goals value for one shot.

    Penalties return a fixed 0.76. Otherwise xG is an exponential decay in
    ``distance`` multiplied by sin^2 of ``angle`` (degrees), scaled by 0.7
    for headers, capped at 0.95 and rounded to three decimals.

    Real models also use features such as assist type, game state and
    defender positions.
    """
    # Penalties bypass the location model entirely.
    if is_penalty:
        return 0.76

    distance_term = np.exp(-0.1 * distance)
    angle_term = np.sin(np.radians(angle)) ** 2
    body_part_scale = 0.7 if is_header else 1.0

    raw_xg = distance_term * angle_term * body_part_scale
    capped_xg = min(raw_xg, 0.95)
    return round(capped_xg, 3)

# Example shots: (distance, angle, is_header) with a short description
shot_specs = [
    (6, 45, False),    # Close range
    (18, 30, False),   # Edge of box
    (25, 20, False),   # Long range
    (8, 35, True),     # Header
]
shots = [
    {'distance': dist, 'angle': ang, 'is_header': header}
    for dist, ang, header in shot_specs
]

for shot in shots:
    xg = calculate_xg(**shot)
    print(f"Distance: {shot['distance']}m, Angle: {shot['angle']}°, "
          f"Header: {shot['is_header']}, xG: {xg}")

Output Example:

Distance: 6m, Angle: 45°, Header: False, xG: 0.274
Distance: 18m, Angle: 30°, Header: False, xG: 0.041
Distance: 25m, Angle: 20°, Header: False, xG: 0.01
Distance: 8m, Angle: 35°, Header: True, xG: 0.103

python Soccer

Calculate PPDA (Pressing Intensity)

Calculate PPDA (pressing intensity) and Field Tilt (territorial dominance) from match event data.

def calculate_ppda(opponent_passes_def_third, defensive_actions_opp_third):
    """
    Return Passes Per Defensive Action (PPDA), rounded to two decimals.

    Lower PPDA = more aggressive pressing.
    - < 8: Very high press (e.g., Liverpool, Dortmund)
    - 8-10: High press
    - 10-12: Medium press
    - > 12: Low press

    Args:
        opponent_passes_def_third: Opponent passes in their defensive third
        defensive_actions_opp_third: Your defensive actions in opponent's def third

    Returns:
        The ratio of the two counts, or infinity when there were no
        defensive actions.
    """
    no_defensive_actions = defensive_actions_opp_third == 0
    if no_defensive_actions:
        return float('inf')

    passes_per_action = opponent_passes_def_third / defensive_actions_opp_third
    return round(passes_per_action, 2)

def calculate_field_tilt(own_final_third_touches, opp_final_third_touches):
    """
    Return the team's share (%) of all final-third touches, to one decimal.

    > 60% = dominant possession in attacking areas. With no touches on
    either side the result is an even 50.0.
    """
    combined_touches = own_final_third_touches + opp_final_third_touches
    if combined_touches > 0:
        own_share = own_final_third_touches / combined_touches
        return round(own_share * 100, 1)
    return 50.0

# Example match data (event counts for one match)
match = {
    'opp_passes_def_third': 85,
    'def_actions_opp_third': 12,
    'own_final_third_touches': 180,
    'opp_final_third_touches': 95
}

ppda = calculate_ppda(
    opponent_passes_def_third=match['opp_passes_def_third'],
    defensive_actions_opp_third=match['def_actions_opp_third'],
)
tilt = calculate_field_tilt(
    own_final_third_touches=match['own_final_third_touches'],
    opp_final_third_touches=match['opp_final_third_touches'],
)

print(f"PPDA: {ppda} (lower = more pressing)")
print(f"Field Tilt: {tilt}%")

Output Example:

PPDA: 7.08 (lower = more pressing)
Field Tilt: 65.5%

python Hockey

Calculate Corsi and Fenwick

Calculate Corsi, Fenwick, and PDO - foundational possession and luck metrics in hockey analytics.

def calculate_corsi(shots_for, shots_against, missed_for, missed_against,
                        blocked_for, blocked_against):
    """
    Return Corsi counts and share (all shot attempts: on goal, missed, blocked).

    CF% > 50% = controlling play. With no attempts on either side CF% is 50.

    Returns:
        Dict with 'CF', 'CA' and 'CF%' (rounded to one decimal).
    """
    attempts_for = shots_for + missed_for + blocked_for
    attempts_against = shots_against + missed_against + blocked_against
    all_attempts = attempts_for + attempts_against

    if all_attempts > 0:
        share = (attempts_for / all_attempts) * 100
    else:
        share = 50

    return {'CF': attempts_for, 'CA': attempts_against, 'CF%': round(share, 1)}

def calculate_fenwick(shots_for, shots_against, missed_for, missed_against):
    """
    Return Fenwick counts and share (shot attempts excluding blocked shots).

    Some prefer Fenwick as blocked shots are partially random. With no
    attempts on either side FF% is 50.

    Returns:
        Dict with 'FF', 'FA' and 'FF%' (rounded to one decimal).
    """
    unblocked_for = shots_for + missed_for
    unblocked_against = shots_against + missed_against
    all_unblocked = unblocked_for + unblocked_against

    if all_unblocked > 0:
        share = (unblocked_for / all_unblocked) * 100
    else:
        share = 50

    return {'FF': unblocked_for, 'FA': unblocked_against, 'FF%': round(share, 1)}

def calculate_pdo(sh_pct, sv_pct):
    """
    Return PDO, the sum of shooting % and save %, rounded to one decimal.

    Both inputs must be on the same scale. PDO regresses strongly to
    100 (or 1.000), so a high value often indicates luck.
    """
    combined = sh_pct + sv_pct
    return round(combined, 1)

# Example player on-ice stats
on_ice = {
    'shots_for': 30, 'shots_against': 25,
    'missed_for': 12, 'missed_against': 10,
    'blocked_for': 8, 'blocked_against': 6,
    'sh_pct': 12.0, 'sv_pct': 92.5
}

# Unblocked attempts feed both metrics; Corsi additionally takes blocks.
unblocked_keys = ('shots_for', 'shots_against', 'missed_for', 'missed_against')
unblocked_counts = [on_ice[key] for key in unblocked_keys]

corsi = calculate_corsi(*unblocked_counts,
                        on_ice['blocked_for'], on_ice['blocked_against'])
fenwick = calculate_fenwick(*unblocked_counts)
pdo = calculate_pdo(on_ice['sh_pct'], on_ice['sv_pct'])

print(f"Corsi: {corsi}")
print(f"Fenwick: {fenwick}")
print(f"PDO: {pdo}")

Output Example:

Corsi: {'CF': 50, 'CA': 41, 'CF%': 54.9}
Fenwick: {'FF': 42, 'FA': 35, 'FF%': 54.5}
PDO: 104.5

python Golf

Calculate Strokes Gained

Calculate Strokes Gained for individual shots using baseline expected strokes from each position.

import numpy as np

# Baseline expected strokes to hole out, keyed by lie and then by a distance
# threshold. A lookup uses the first threshold that is >= the distance.
EXPECTED_STROKES = {
    'tee': {
        250: 4.1, 300: 3.9, 350: 3.8, 400: 4.0, 450: 4.2,
    },
    'fairway': {
        50: 2.8, 100: 2.9, 150: 3.0, 200: 3.2, 250: 3.5,
    },
    'rough': {
        50: 3.0, 100: 3.1, 150: 3.2, 200: 3.5, 250: 3.8,
    },
    'green': {
        5: 1.5, 10: 1.8, 20: 2.0, 30: 2.1, 50: 2.3,
    },
}

def get_expected_strokes(distance, lie):
    """Look up the baseline expected strokes for ``distance`` from ``lie``.

    Unknown lies return 3.0. Distances beyond the largest threshold use the
    value stored for that largest threshold.
    """
    baseline = EXPECTED_STROKES.get(lie)
    if baseline is None:
        return 3.0

    thresholds = sorted(baseline)
    # First threshold that covers the distance, else the largest one.
    bucket = next((limit for limit in thresholds if distance <= limit),
                  thresholds[-1])
    return baseline[bucket]

def calculate_strokes_gained(start_lie, start_dist, end_lie, end_dist, strokes_taken=1):
    """
    Return strokes gained for a single shot, rounded to two decimals.

    SG = Expected_before - Expected_after - Strokes_taken

    An ``end_lie`` of 'hole' means the ball was holed, so nothing is
    expected afterwards.
    """
    expected_before = get_expected_strokes(start_dist, start_lie)
    if end_lie != 'hole':
        expected_after = get_expected_strokes(end_dist, end_lie)
    else:
        expected_after = 0
    return round(expected_before - expected_after - strokes_taken, 2)

# Example: Drive from tee (450 out) that finishes in the fairway, 150 out
drive_sg = calculate_strokes_gained(
    start_lie='tee', start_dist=450, end_lie='fairway', end_dist=150)
print(f"Drive SG: {drive_sg}")

# Example: Approach shot from the fairway onto the green, 15 from the hole
approach_sg = calculate_strokes_gained(
    start_lie='fairway', start_dist=150, end_lie='green', end_dist=15)
print(f"Approach SG: {approach_sg}")

# Example: Holed putt from 15
putt_sg = calculate_strokes_gained(
    start_lie='green', start_dist=15, end_lie='hole', end_dist=0)
print(f"Putt SG: {putt_sg}")

Output Example:

Drive SG: 0.2
Approach SG: 0.0
Putt SG: 1.0

python Tennis

Calculate Tennis Performance Metrics

Calculate key tennis performance metrics: Service Points Won, Return Points Won, Dominance Ratio, and Performance Index.

def service_points_won(first_in, first_won, second_won, double_faults, total_serves):
    """Calculate overall service points won percentage.

    Args:
        first_in: First serves in (accepted for interface compatibility;
            not needed for this percentage)
        first_won: Points won behind the first serve
        second_won: Points won behind the second serve
        double_faults: Double faults (accepted for interface compatibility;
            not needed for this percentage)
        total_serves: Total service points played

    Returns:
        Percentage of service points won, rounded to one decimal; 0 when no
        service points were played.
    """
    total_won = first_won + second_won
    spw = total_won / total_serves * 100 if total_serves > 0 else 0
    return round(spw, 1)

def return_points_won(first_ret_won, first_faced, second_ret_won, second_faced):
    """Return the percentage of return points won, rounded to one decimal.

    Points won against first and second serves are pooled; 0 is returned
    when no return points were faced.
    """
    points_won = first_ret_won + second_ret_won
    points_faced = first_faced + second_faced
    if points_faced > 0:
        pct = points_won / points_faced * 100
    else:
        pct = 0
    return round(pct, 1)

def dominance_ratio(spw_pct, rpw_pct):
    """
    Dominance Ratio = SPW% / (100 - RPW%), rounded to two decimals.

    The denominator is the opponent's service points won %, so a value
    > 1.0 means winning more on serve than the opponent. Returns 1.0 when
    that denominator is not positive.

    NOTE(review): some sources define dominance ratio as
    RPW% / (100 - SPW%) instead; confirm which definition is wanted.
    """
    opponent_spw = 100 - rpw_pct
    if opponent_spw > 0:
        return round(spw_pct / opponent_spw, 2)
    return 1.0

def performance_index(spw_pct, rpw_pct):
    """
    PI = SPW% + RPW% - 100, rounded to one decimal.

    Positive = outperforming opponent.
    """
    index = spw_pct + rpw_pct - 100
    return round(index, 1)

# Example match stats (serve figures first, then return figures)
match = {
    'first_in': 50, 'first_won': 38, 'second_won': 18,
    'double_faults': 3, 'total_serves': 75,
    'first_ret_won': 18, 'first_faced': 55,
    'second_ret_won': 14, 'second_faced': 25
}

serve_keys = ('first_in', 'first_won', 'second_won',
              'double_faults', 'total_serves')
return_keys = ('first_ret_won', 'first_faced',
               'second_ret_won', 'second_faced')

spw = service_points_won(*(match[key] for key in serve_keys))
rpw = return_points_won(*(match[key] for key in return_keys))

print(f"Service Points Won: {spw}%")
print(f"Return Points Won: {rpw}%")
print(f"Dominance Ratio: {dominance_ratio(spw, rpw)}")
print(f"Performance Index: {performance_index(spw, rpw)}")

Output Example:

Service Points Won: 74.7%
Return Points Won: 40.0%
Dominance Ratio: 1.24
Performance Index: 14.7

python Baseball

Get Statcast Data with pybaseball

Fetch granular Statcast tracking data including exit velocity, launch angle, and expected stats based on batted ball quality.

from pybaseball import statcast
import pandas as pd

# Get Statcast data for a date range
date_range = {'start_dt': "2023-06-01", 'end_dt': "2023-06-07"}
statcast_data = statcast(**date_range)

# Filter for home runs
is_home_run = statcast_data['events'] == 'home_run'
home_runs = statcast_data[is_home_run]

# Key Statcast metrics: exit velocity, launch angle, distance, expected BA
key_columns = ['player_name', 'launch_speed', 'launch_angle',
               'hit_distance_sc', 'estimated_ba_using_speedangle']
print(statcast_data[key_columns].head())

Output Example:

          player_name  launch_speed  launch_angle  hit_distance_sc  estimated_ba
0       Shohei Ohtani         108.5          28.0            425.0         0.940
1         Mike Trout         104.2          32.0            398.0         0.850

r Baseball

Fetch MLB Data with baseballr

The baseballr package provides access to FanGraphs, Baseball Reference, and MLB data in R.

library(baseballr)
library(dplyr)

# Get team batting stats
# NOTE(review): the positional arguments of the FanGraphs helpers have changed
# between baseballr releases — confirm this call against the installed version.
team_batting <- fg_team_batter(2023)

# Top 5 teams by wRC+
# (result is printed, not stored; assumes these column names exist in the
# returned data — TODO confirm against the installed version)
team_batting %>%
  select(team_name, wRC_plus, BB_pct, K_pct, ISO) %>%
  arrange(desc(wRC_plus)) %>%
  head(5)

# Get individual player stats
# NOTE(review): `qual` is presumably the minimum plate appearances — confirm.
fg_batters <- fg_batter_leaders(2023, qual = 400)
print(paste("Retrieved", nrow(fg_batters), "qualified batters"))

Output Example:

  team_name  wRC_plus BB_pct  K_pct   ISO
1   Dodgers     118   9.8%   20.1%  .186
2    Braves     115   9.2%   22.3%  .198

python Basketball

Fetch NBA Stats with nba_api

The nba_api library provides comprehensive access to NBA.com statistics including player stats, team data, and play-by-play.

from nba_api.stats.endpoints import leagueleaders, playercareerstats
from nba_api.stats.static import players
import pandas as pd

# Get league leaders for the 2023-24 season (first result frame)
leaders = leagueleaders.LeagueLeaders(season='2023-24')
df = leaders.get_data_frames()[0]

# Top scorers: ten largest PTS values, trimmed to the display columns
leader_columns = ['PLAYER', 'TEAM', 'GP', 'PTS', 'AST', 'REB']
top_scorers = df.nlargest(10, 'PTS')[leader_columns]
print(top_scorers)

# Get specific player career stats (first name match is used)
name_matches = players.find_players_by_full_name("LeBron James")
player_dict = name_matches[0]
career = playercareerstats.PlayerCareerStats(player_id=player_dict['id'])
career_by_season = career.get_data_frames()[0]
print(career_by_season[['SEASON_ID', 'PTS', 'AST', 'REB']].tail())

Output Example:

              PLAYER    TEAM  GP   PTS  AST  REB
0     Luka Doncic     DAL  70  33.9  9.8  9.2
1     Giannis A.     MIL  73  30.4  6.5 11.5

python Basketball

Get NBA Play-by-Play Data

Fetch detailed play-by-play data for NBA games including shot locations, player actions, and game context.

from nba_api.stats.endpoints import playbyplayv2
import pandas as pd

# Get play-by-play for a specific game
game_id = "0022300001"  # Example game ID
pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
df = pbp.get_data_frames()[0]

# Filter for scoring plays: rows where a score margin is recorded
scoring_columns = [
    'PERIOD', 'PCTIMESTRING', 'HOMEDESCRIPTION',
    'VISITORDESCRIPTION', 'SCORE'
]
has_margin = df['SCOREMARGIN'].notna()
scoring = df.loc[has_margin, scoring_columns]

print(f"Total plays: {len(df)}")
print(f"Scoring plays: {len(scoring)}")

Output Example:

Total plays: 478
Scoring plays: 212

r Basketball

Fetch NBA Data with hoopR

hoopR provides easy access to NBA and WNBA data in R with pre-loaded datasets for efficient analysis.

library(hoopR)
library(dplyr)

# Load NBA player box scores
nba_box <- load_nba_player_box(seasons = 2024)

# Get season averages
# One output row per player/team pair; NA stat values are ignored in the means.
player_avgs <- nba_box %>%
  group_by(athlete_display_name, team_short_display_name) %>%
  summarize(
    games = n(),  # number of box-score rows in the group
    ppg = mean(points, na.rm = TRUE),
    rpg = mean(rebounds, na.rm = TRUE),
    apg = mean(assists, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Keep groups with at least 20 rows, highest scorers first
  filter(games >= 20) %>%
  arrange(desc(ppg))

head(player_avgs, 10)

Output Example:

# A tibble: 10 x 5
   athlete_display_name team_short_display_name games   ppg   rpg
1  Luka Dončić          DAL                        70  33.9   9.2

python Football

Load NFL Play-by-Play with nfl_data_py

nfl_data_py provides access to NFL play-by-play data with pre-calculated EPA, WPA, and other advanced metrics.

import nfl_data_py as nfl
import pandas as pd

# Load play-by-play data for the 2023 season
pbp = nfl.import_pbp_data([2023])
print(f"Total plays: {len(pbp)}")

# Filter for pass plays
is_pass = pbp['play_type'] == 'pass'
pass_plays = pbp[is_pass]

# Get EPA leaders: total, mean and count of EPA per passer
qb_epa = (
    pass_plays
    .groupby('passer_player_name')
    .agg(
        total_epa=('epa', 'sum'),
        epa_per_play=('epa', 'mean'),
        attempts=('epa', 'count'),
    )
    .round(3)
)

# Keep passers with at least 200 counted plays, best total EPA first
enough_volume = qb_epa['attempts'] >= 200
qb_epa = qb_epa.loc[enough_volume].sort_values('total_epa', ascending=False)
print(qb_epa.head(10))

Output Example:

                    total_epa  epa_per_play  attempts
passer_player_name
T.Tagovailoa          152.34         0.281       542
J.Goff                145.21         0.265       548

r Football

Load NFL Data with nflfastR

nflfastR is the premier NFL analytics package in R, providing play-by-play data with EPA, CPOE, and other advanced metrics.

library(nflfastR)
library(dplyr)

# Load play-by-play data
pbp <- load_pbp(2023)

# Calculate QB efficiency metrics
# Only pass plays with a non-missing EPA are used; one row per passer/team pair.
qb_stats <- pbp %>%
  filter(!is.na(epa), play_type == "pass") %>%
  group_by(passer_player_name, posteam) %>%
  summarize(
    plays = n(),
    total_epa = sum(epa),
    epa_play = mean(epa),
    cpoe = mean(cpoe, na.rm = TRUE),  # missing CPOE values are skipped
    .groups = "drop"
  ) %>%
  # Minimum of 200 plays, then rank by total EPA
  filter(plays >= 200) %>%
  arrange(desc(total_epa))

head(qb_stats, 10)

Output Example:

# A tibble: 10 x 6
   passer_player_name posteam plays total_epa epa_play  cpoe
1  T.Tagovailoa       MIA       542     152.3    0.281  4.21
2  J.Goff             DET       548     145.2    0.265  3.89

python Soccer

Access StatsBomb Open Data

StatsBomb provides free open data for selected competitions. Their data includes detailed event data with xG values.

from statsbombpy import sb
import pandas as pd

# List the competition / season pairs available in the open-data set
comps = sb.competitions()
comp_seasons = comps[['competition_name', 'season_name']].drop_duplicates()
print(comp_seasons.head(10))

# Fetch every match of one competition-season
matches = sb.matches(competition_id=43, season_id=106)  # World Cup 2022
print(f"Total matches: {len(matches)}")

# Pull the event stream for the first match in the table
first_match_id = matches.iloc[0]['match_id']
events = sb.events(match_id=first_match_id)
print(f"Events in match: {len(events)}")

# Keep only the events whose type is "Shot"
is_shot = events['type'] == 'Shot'
shots = events.loc[is_shot]
print(f"Total shots: {len(shots)}")

Output Example:

    competition_name    season_name
0   FIFA World Cup      2022
1   La Liga             2020/2021
Total matches: 64
Events in match: 3247
Total shots: 28

r Soccer

Scrape FBref with worldfootballR

worldfootballR scrapes FBref, Transfermarkt, and other sources for comprehensive soccer data in R.

library(worldfootballR)
library(dplyr)

# Get standard player stats for the Big 5 European leagues
# (the Premier League rows are picked out in the next step)
pl_stats <- fb_big5_advanced_season_stats(
  season_end_year = 2024,
  stat_type = "standard",
  team_or_player = "player"
)

# Filter for Premier League
# NOTE(review): some worldfootballR releases return suffixed column names
# (for example MP_Playing, xG_Expected, xAG_Expected) - check names(pl_stats)
# and adjust the select() below if these short names are not present.
epl <- pl_stats %>%
  filter(Comp == "Premier League") %>%
  select(Player, Squad, MP, Gls, Ast, xG, xAG) %>%
  arrange(desc(xG))

head(epl, 10)

# Get team-level shooting data
# gender and season_end_year are required arguments of fb_season_team_stats();
# the season matches the player query above.
team_shooting <- fb_season_team_stats(
  country = "ENG",
  gender = "M",
  season_end_year = 2024,
  tier = "1st",
  stat_type = "shooting"
)

Output Example:

# A tibble: 10 x 7
   Player           Squad       MP   Gls   Ast    xG   xAG
1  Erling Haaland   Man City    31    27     5  26.2   3.1
2  Cole Palmer      Chelsea     33    22    11  17.8   9.4

python Hockey

Fetch NHL Data from the NHL API

Access NHL data through the official NHL API. Data includes player stats, game logs, and play-by-play.

import requests
import pandas as pd

# NHL API endpoint
base_url = "https://api-web.nhle.com/v1"

# Get team roster
team_abbr = "TOR"  # Toronto Maple Leafs
# requests waits indefinitely unless a timeout is given
response = requests.get(f"{base_url}/roster/{team_abbr}/current", timeout=10)
response.raise_for_status()  # fail loudly instead of parsing an error body
roster = response.json()

# Get player stats
stats_url = "https://api.nhle.com/stats/rest/en/skater/summary"
params = {
    "cayenneExp": "seasonId=20232024",
    # Ask the server to rank by points so the 100 rows returned are the
    # league leaders rather than an arbitrary page of skaters.
    # NOTE(review): sort syntax follows the public stats API convention - confirm.
    "sort": '[{"property":"points","direction":"DESC"}]',
    "limit": 100,
}
response = requests.get(stats_url, params=params, timeout=10)
response.raise_for_status()
stats = pd.DataFrame(response.json()['data'])

# Top scorers
top_scorers = stats.nlargest(10, 'points')[[
    'skaterFullName', 'teamAbbrevs', 'goals', 'assists', 'points'
]]
print(top_scorers)

Output Example:

     skaterFullName teamAbbrevs  goals  assists  points
0   Nikita Kucherov         TBL     44      100     144
1  Nathan MacKinnon         COL     51       89     140

r Hockey

Load NHL Data with hockeyR

hockeyR provides easy access to NHL play-by-play data with pre-calculated advanced stats like Corsi and xG.

library(hockeyR)
library(dplyr)

# Load the 2024 play-by-play table
pbp <- load_pbp(2024)

# Get shot attempts (Corsi): shots, misses, blocks and goals all count
corsi_events <- c("SHOT", "MISSED_SHOT", "BLOCKED_SHOT", "GOAL")
shots <- filter(pbp, event_type %in% corsi_events)

# Calculate team Corsi: attempts (CF) and goals per team, highest CF first
by_team <- group_by(shots, event_team_abbr)
team_totals <- summarize(
  by_team,
  CF = n(),
  goals = sum(event_type == "GOAL")
)
team_corsi <- arrange(team_totals, desc(CF))

print(team_corsi)

# Get player-level stats
player_stats <- get_skater_stats_hr(2024)

Output Example:

# A tibble: 32 x 3
   event_team_abbr    CF goals
1  COL              5234   285
2  FLA              5102   264

python Golf

Scrape PGA Tour Stats

Access PGA Tour statistics through their public JSON API for strokes gained and other metrics.

import requests
import pandas as pd

# PGA Tour Stats API
base_url = "https://statdata.pgatour.com/r"

# Get Strokes Gained data
sg_url = f"{base_url}/stat/02675.json"  # Strokes Gained: Total
# requests waits indefinitely unless a timeout is given
response = requests.get(sg_url, timeout=10)

if response.status_code == 200:
    data = response.json()
    players = pd.DataFrame(data['tours'][0]['years'][0]['stats'])

    # Clean and sort; cells that are not numeric become NaN instead of
    # aborting the script (nlargest skips NaN values)
    players['statValue'] = pd.to_numeric(players['statValue'], errors='coerce')
    top_sg = players.nlargest(10, 'statValue')[['playerName', 'statValue']]
    print("Top 10 - Strokes Gained: Total")
    print(top_sg)
else:
    # Report the failure instead of ending silently with no output
    print(f"Request failed with HTTP status {response.status_code}")

Output Example:

Top 10 - Strokes Gained: Total
         playerName  statValue
0     Scottie Scheffler      2.82
1     Xander Schauffele      1.95

python Tennis

Load Tennis Match Data

Jeff Sackmann maintains comprehensive ATP and WTA match data on GitHub, free for analysis.

import pandas as pd

# Root of Jeff Sackmann's ATP data repository (raw GitHub content)
base_url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master"

# Read the 2023 ATP match file straight from GitHub
matches_csv = f"{base_url}/atp_matches_2023.csv"
atp_2023 = pd.read_csv(matches_csv)

# Grand Slam matches only (tourney_level == 'G')
is_slam = atp_2023['tourney_level'] == 'G'
grand_slams = atp_2023.loc[is_slam]

# Ten players with the most match wins
wins_per_player = atp_2023.groupby('winner_name').size()
top_winners = wins_per_player.nlargest(10)
print("Most wins in 2023:")
print(top_winners)

# Winner-side serve columns; rows missing any of them are dropped
serve_cols = ['winner_name', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_ace']
serve_stats = atp_2023[serve_cols].dropna()
print(f"\nMatches with serve data: {len(serve_stats)}")

Output Example:

Most wins in 2023:
Novak Djokovic      73
Jannik Sinner       64
Carlos Alcaraz      61

python MMA

Scrape UFC Fighter Stats

Scrape UFC fighter statistics from UFCStats.com for comprehensive MMA analytics.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# UFC Stats page
url = "http://ufcstats.com/statistics/fighters"
params = {"char": "a", "page": "all"}

# requests waits indefinitely unless a timeout is given
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()  # do not try to parse an error page
soup = BeautifulSoup(response.content, 'html.parser')

# Parse fighter table
table = soup.find('table', class_='b-statistics__table')
if table is None:
    # find() returns None when nothing matches; give a clear error instead
    # of an AttributeError on the next line
    raise RuntimeError("Fighter table not found - the page layout may have changed")
rows = table.find_all('tr')[2:]  # Skip headers

fighters = []
for row in rows:
    cells = [td.text.strip() for td in row.find_all('td')]
    # Columns 0-6 are read below; skip spacer or malformed rows
    if len(cells) < 7:
        continue
    fighters.append({
        'name': f"{cells[0]} {cells[1]}",
        'height': cells[3],
        'weight': cells[4],
        'reach': cells[5],
        'stance': cells[6]
    })

df = pd.DataFrame(fighters)
print(f"Total fighters: {len(df)}")

Output Example:

Total fighters: 234

python Volleyball

Load NCAA Volleyball Data

Calculate volleyball statistics like hitting efficiency and kill percentage from box score data.

import pandas as pd
import requests

# NCAA stats endpoint example
# Note: NCAA data often requires web scraping or specific APIs

# Example: Load volleyball box score data
# This would typically come from a CSV or database

# Sample box-score rows, one list entry per player in each column
data = {
    'player': ['Player A', 'Player B', 'Player C'],
    'kills': [15, 12, 8],
    'errors': [3, 2, 4],
    'attempts': [35, 28, 22],
    'digs': [8, 5, 12],
    'blocks': [2, 4, 1]
}

df = pd.DataFrame(data)

# Hitting efficiency = (kills - errors) / attempts
# Kill percentage    = kills / attempts, expressed on a 0-100 scale
attempts = df['attempts']
net_kills = df['kills'] - df['errors']
df['efficiency'] = net_kills / attempts
df['kill_pct'] = df['kills'] / attempts * 100

# Show the derived columns rounded to three decimals
report_cols = ['player', 'kills', 'efficiency', 'kill_pct']
print(df[report_cols].round(3))

Output Example:

     player  kills  efficiency  kill_pct
0  Player A     15       0.343    42.857
1  Player B     12       0.357    42.857
2  Player C      8       0.182    36.364

python

Generic Sports API Request

A reusable function for fetching data from various sports APIs with error handling.

import requests
import pandas as pd

def fetch_sports_api(url, params=None, headers=None, timeout=10):
    """
    Generic function to fetch data from sports APIs.

    Args:
        url: API endpoint URL
        params: Query parameters
        headers: Request headers (for authentication)
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block forever on an unresponsive host.

    Returns:
        DataFrame with the response data. A JSON list becomes one row per
        element; a JSON object is unwrapped through its 'data' or 'results'
        key when present; anything else becomes a single-row frame. An
        empty DataFrame is returned (after printing the error) when the
        request fails or the body is not valid JSON.
    """
    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures, which older requests
        # releases raise outside the RequestException hierarchy.
        print(f"API Error: {e}")
        return pd.DataFrame()

    # Handle common response structures
    if isinstance(data, list):
        return pd.DataFrame(data)
    if isinstance(data, dict):
        # Only dicts are probed for wrapper keys: `'data' in payload` on a
        # string would be a substring test, and on a number a TypeError.
        for wrapper_key in ('data', 'results'):
            if wrapper_key in data:
                return pd.DataFrame(data[wrapper_key])
    return pd.DataFrame([data])

# Example usage
df = fetch_sports_api("https://api.example.com/stats")
print(df.head())

Output Example:

   id    name  value
0   1  Stat A    100
1   2  Stat B    200

python Baseball

Fetch MLB Player Stats with pybaseball

Use pybaseball to fetch MLB batting and pitching statistics. The library pulls from FanGraphs, Baseball Reference, and Statcast.

from pybaseball import batting_stats, pitching_stats

# Season-level batting table for 2023
batting_2023 = batting_stats(2023)
n_batters = len(batting_2023)
print(f"Retrieved {n_batters} batters")

# Season-level pitching table for 2023
pitching_2023 = pitching_stats(2023)
n_pitchers = len(pitching_2023)
print(f"Retrieved {n_pitchers} pitchers")

# Qualified batters: at least 502 plate appearances
min_pa = 502
has_enough_pa = batting_2023['PA'] >= min_pa
qualified = batting_2023.loc[has_enough_pa]
print(f"Qualified batters: {len(qualified)}")

Output Example:

Retrieved 789 batters
Retrieved 634 pitchers
Qualified batters: 143

python Baseball

Calculate OBP

Python function to calculate On-Base Percentage with error handling

def calculate_obp(hits, walks, hbp, at_bats, sacrifice_flies):
    """Calculate On-Base Percentage.

    OBP = (H + BB + HBP) / (AB + BB + HBP + SF), rounded to three decimals.

    Args:
        hits: Hits (H)
        walks: Bases on balls (BB)
        hbp: Times hit by pitch (HBP)
        at_bats: At-bats (AB)
        sacrifice_flies: Sacrifice flies (SF)

    Returns:
        The OBP as a float, or 0.0 when the denominator is zero.
    """
    times_on_base = hits + walks + hbp
    opportunities = at_bats + walks + hbp + sacrifice_flies
    if opportunities == 0:
        # No plate appearances counted: avoid ZeroDivisionError and keep
        # the return type a float like the normal path
        return 0.0
    return round(times_on_base / opportunities, 3)

# Example
obp = calculate_obp(hits=145, walks=72, hbp=8, at_bats=502, sacrifice_flies=4)
print(f"OBP: {obp}")  # Output: OBP: 0.384  (225 / 586)

python Basketball

Calculate True Shooting %

Calculate True Shooting Percentage - measures overall shooting efficiency

def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, one decimal place.

    TS% = PTS / (2 * (FGA + 0.44 * FTA)). The result is 0 when the
    denominator works out to zero.
    """
    true_shot_attempts = 2 * (fga + 0.44 * fta)
    if true_shot_attempts != 0:
        return round((points / true_shot_attempts) * 100, 1)
    # Nothing attempted: report 0 rather than dividing by zero
    return 0

# Sample call: 25 points on 18 field-goal and 8 free-throw attempts
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%

python Basketball

Calculate True Shooting %

Calculate True Shooting Percentage - measures overall shooting efficiency

def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, one decimal place.

    TS% = PTS / (2 * (FGA + 0.44 * FTA)). The result is 0 when the
    denominator works out to zero.
    """
    true_shot_attempts = 2 * (fga + 0.44 * fta)
    if true_shot_attempts != 0:
        return round((points / true_shot_attempts) * 100, 1)
    # Nothing attempted: report 0 rather than dividing by zero
    return 0

# Sample call: 25 points on 18 field-goal and 8 free-throw attempts
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%

python Basketball

Calculate True Shooting %

Calculate True Shooting Percentage - measures overall shooting efficiency

def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, one decimal place.

    TS% = PTS / (2 * (FGA + 0.44 * FTA)). The result is 0 when the
    denominator works out to zero.
    """
    true_shot_attempts = 2 * (fga + 0.44 * fta)
    if true_shot_attempts != 0:
        return round((points / true_shot_attempts) * 100, 1)
    # Nothing attempted: report 0 rather than dividing by zero
    return 0

# Sample call: 25 points on 18 field-goal and 8 free-throw attempts
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%

python Basketball

Calculate True Shooting %

Calculate True Shooting Percentage - measures overall shooting efficiency

def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, one decimal place.

    TS% = PTS / (2 * (FGA + 0.44 * FTA)). The result is 0 when the
    denominator works out to zero.
    """
    true_shot_attempts = 2 * (fga + 0.44 * fta)
    if true_shot_attempts != 0:
        return round((points / true_shot_attempts) * 100, 1)
    # Nothing attempted: report 0 rather than dividing by zero
    return 0

# Sample call: 25 points on 18 field-goal and 8 free-throw attempts
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%

python Basketball

Calculate True Shooting %

Calculate True Shooting Percentage - measures overall shooting efficiency

def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, one decimal place.

    TS% = PTS / (2 * (FGA + 0.44 * FTA)). The result is 0 when the
    denominator works out to zero.
    """
    true_shot_attempts = 2 * (fga + 0.44 * fta)
    if true_shot_attempts != 0:
        return round((points / true_shot_attempts) * 100, 1)
    # Nothing attempted: report 0 rather than dividing by zero
    return 0

# Sample call: 25 points on 18 field-goal and 8 free-throw attempts
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%

python Football

EPA Calculation

Calculate Expected Points Added (EPA) for NFL plays

def calculate_epa(ep_before, ep_after, touchdown=False, turnover=False):
    """Calculate Expected Points Added
    EPA = EP_after - EP_before

    Args:
        ep_before: Expected points before the play.
        ep_after: Expected points after the play. Ignored when
            ``touchdown`` is set. When ``turnover`` is set this is negated
            before the subtraction.
        touchdown: The play ended in a touchdown (valued at 7.0 points).
        turnover: Possession changed on the play.

    Returns:
        EPA rounded to two decimals. With both flags set (a touchdown
        scored on the turnover) the play is worth -7.0 after the play.
    """
    if touchdown:
        ep_after = 7.0  # Assume PAT made
    if turnover:
        # Flip for opponent. Applied after the touchdown value so that a
        # score on the turnover counts against the original offense rather
        # than being credited to it.
        ep_after = -ep_after

    return round(ep_after - ep_before, 2)

# Example: 2nd & 8 from own 35, gain 12 yards for 1st down
epa = calculate_epa(ep_before=1.2, ep_after=2.8)
# The "+" format flag prints the real sign, so a negative EPA is not
# rendered as "+-0.5"
print(f"EPA: {epa:+}")  # Positive EPA = good play

python Basketball

Calculate True Shooting %

Calculate True Shooting Percentage - measures overall shooting efficiency

def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, one decimal place.

    TS% = PTS / (2 * (FGA + 0.44 * FTA)). The result is 0 when the
    denominator works out to zero.
    """
    true_shot_attempts = 2 * (fga + 0.44 * fta)
    if true_shot_attempts != 0:
        return round((points / true_shot_attempts) * 100, 1)
    # Nothing attempted: report 0 rather than dividing by zero
    return 0

# Sample call: 25 points on 18 field-goal and 8 free-throw attempts
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%

python Soccer

Simple xG Model

Basic xG model using logistic regression with distance and angle features

import numpy as np
from sklearn.linear_model import LogisticRegression

def build_xg_model(shots_df):
    """Fit a logistic-regression Expected Goals model on distance and angle.

    Reads the "x", "y" and "goal" columns of ``shots_df``, writes two new
    columns ("distance" and "angle") onto that same frame, and returns the
    fitted ``LogisticRegression``.
    """
    # Distance from the shot location to the point (100, 50)
    dx = shots_df["x"] - 100
    dy = shots_df["y"] - 50
    shots_df["distance"] = np.sqrt(dx**2 + dy**2)

    # Angle derived from the distance alone, using half of 7.32.
    # NOTE(review): 7.32 is presumably the goal width in metres while x/y
    # look like a 0-100 pitch scale - confirm the units agree.
    half_width = 7.32 / 2
    shots_df["angle"] = np.arctan2(half_width, shots_df["distance"]) * 2

    features = shots_df[["distance", "angle"]]
    target = shots_df["goal"]

    # fit() returns the estimator itself, so the fitted model is returned
    return LogisticRegression().fit(features, target)

python Basketball

Calculate True Shooting %

Calculate True Shooting Percentage - measures overall shooting efficiency

def calculate_true_shooting(points, fga, fta):
    """Return True Shooting Percentage on a 0-100 scale, one decimal place.

    TS% = PTS / (2 * (FGA + 0.44 * FTA)). The result is 0 when the
    denominator works out to zero.
    """
    true_shot_attempts = 2 * (fga + 0.44 * fta)
    if true_shot_attempts != 0:
        return round((points / true_shot_attempts) * 100, 1)
    # Nothing attempted: report 0 rather than dividing by zero
    return 0

# Sample call: 25 points on 18 field-goal and 8 free-throw attempts
ts = calculate_true_shooting(25, 18, 8)
print(f"TS%: {ts}%")  # Elite: >60%, Good: 55-60%

python Hockey

Calculate Corsi

Calculate Corsi For Percentage (CF%) - a hockey possession metric

def calculate_corsi(shots_for, shots_against, goals_for, goals_against,
                      missed_for, missed_against, blocked_for, blocked_against):
    """Return Corsi For Percentage (CF%) on a 0-100 scale, one decimal place.

    Each side's total is the sum of its shots, goals, missed and blocked
    counts. When both totals are zero the result is 50.0.
    """
    attempts_for = sum((shots_for, goals_for, missed_for, blocked_for))
    attempts_against = sum(
        (shots_against, goals_against, missed_against, blocked_against)
    )
    all_attempts = attempts_for + attempts_against

    # No attempts by either side: call it an even split
    if all_attempts == 0:
        return 50.0

    return round((attempts_for / all_attempts) * 100, 1)

# Sample call: the "for" inputs add up to 60 and the "against" inputs to 45
corsi = calculate_corsi(30, 25, 3, 2, 15, 10, 12, 8)
print(f"CF%: {corsi}%")  # Should be above 50% (good)

python Soccer

Simple xG Model

Basic xG model using logistic regression with distance and angle features

import numpy as np
from sklearn.linear_model import LogisticRegression

def build_xg_model(shots_df):
    """Fit a logistic-regression Expected Goals model on distance and angle.

    Reads the "x", "y" and "goal" columns of ``shots_df``, writes two new
    columns ("distance" and "angle") onto that same frame, and returns the
    fitted ``LogisticRegression``.
    """
    # Distance from the shot location to the point (100, 50)
    dx = shots_df["x"] - 100
    dy = shots_df["y"] - 50
    shots_df["distance"] = np.sqrt(dx**2 + dy**2)

    # Angle derived from the distance alone, using half of 7.32.
    # NOTE(review): 7.32 is presumably the goal width in metres while x/y
    # look like a 0-100 pitch scale - confirm the units agree.
    half_width = 7.32 / 2
    shots_df["angle"] = np.arctan2(half_width, shots_df["distance"]) * 2

    features = shots_df[["distance", "angle"]]
    target = shots_df["goal"]

    # fit() returns the estimator itself, so the fitted model is returned
    return LogisticRegression().fit(features, target)

python Baseball

Loading Baseball Data with pybaseball

This code uses the pybaseball library to fetch Statcast data from Baseball Savant.

from pybaseball import statcast
import pandas as pd

# Request every Statcast row between the two dates (inclusive strings)
start_dt, end_dt = '2023-04-01', '2023-04-30'
data = statcast(start_dt=start_dt, end_dt=end_dt)

# Report how many rows came back, then preview the first few
n_pitches = len(data)
print(f'Total pitches: {n_pitches}')
print(data.head())

python Basketball

Basic NBA API Query

Using nba_api to fetch player career statistics from NBA.com.

from nba_api.stats.endpoints import playercareerstats

# Request the career stat tables for one player id
career = playercareerstats.PlayerCareerStats(player_id='201566')
frames = career.get_data_frames()
df = frames[0]  # first table returned by the endpoint

# Preview the scoring, rebounding and assist columns per season
summary_cols = ['SEASON_ID', 'PTS', 'REB', 'AST']
print(df[summary_cols].head(10))

python Football

Loading NFL Data with nfl_data_py

Loading NFL play-by-play data and calculating EPA metrics.

import nfl_data_py as nfl

# Pull the 2023 play-by-play table
pbp = nfl.import_pbp_data([2023])

# Keep only the rows tagged as pass plays
is_pass = pbp['play_type'] == 'pass'
pass_plays = pbp.loc[is_pass]

# Mean EPA per possessing team, best passing offenses first
team_means = pass_plays.groupby('posteam')['epa'].mean()
epa_by_team = team_means.sort_values(ascending=False)
print(epa_by_team.head(10))

r Basketball

Calculate True Shooting Percentage in R

R code to calculate and visualize True Shooting Percentage for NBA players

# True Shooting Percentage on a 0-100 scale (works element-wise on vectors)
calculate_ts_percentage <- function(points, fga, fta) {
  # TS% Formula: Points / (2 * (FGA + 0.44 * FTA))
  true_shot_attempts <- 2 * (fga + 0.44 * fta)
  (points / true_shot_attempts) * 100
}

# Example NBA player stats
players <- data.frame(
  player = c("Player A", "Player B", "Player C"),
  points = c(1832, 2140, 1654),
  fga = c(1420, 1680, 1510),
  fta = c(425, 380, 290)
)

# Add a TS% column computed from each row's totals
players$ts_percent <- calculate_ts_percentage(
  players$points, players$fga, players$fta
)

# Show the table sorted by TS% (best first), rounded for display only;
# the stored column keeps full precision
library(dplyr)
ranked <- arrange(players, desc(ts_percent))
print(mutate(ranked, ts_percent = round(ts_percent, 1)))

# Flag players above the league-average reference line
league_avg_ts <- 56.5
players$above_average <- players$ts_percent > league_avg_ts

# Bar chart of TS% per player with the league average as a dashed line
library(ggplot2)
ts_plot <- ggplot(players, aes(x = player, y = ts_percent, fill = above_average))
ts_plot +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = league_avg_ts, linetype = "dashed", color = "red") +
  labs(title = "True Shooting Percentage Comparison",
       y = "TS%", x = "") +
  theme_minimal()

r Baseball

Loading Baseball Data with baseballr

R code using the baseballr package to access Statcast data.

library(baseballr)
library(dplyr)

# Query Statcast for the month of April 2023
# NOTE(review): large date ranges may be capped by the upstream search -
# confirm the row count covers the whole month.
data <- statcast_search(start_date = '2023-04-01', end_date = '2023-04-30')

# Summary: number of rows (pitches) returned
summarize(data, total_pitches = n())

Quick Reference

Common patterns and formulas for sports analytics

Common Imports

Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

Common Metrics

Formulas

# Basketball: True Shooting %
TS_pct = points / (2 * (FGA + 0.44 * FTA))

# Baseball: OPS
OPS = on_base_pct + slugging_pct

# Soccer: Expected Goals (simplified)
xG = shot_quality * shot_location_weight