Appendix A: Python Quick Reference for Sports Analytics
Installation and Setup
Creating a Virtual Environment
# Create virtual environment
python -m venv cfb_analytics
# Activate (Windows)
cfb_analytics\Scripts\activate
# Activate (macOS/Linux)
source cfb_analytics/bin/activate
Installing Core Packages
pip install pandas numpy scipy matplotlib seaborn scikit-learn
pip install requests beautifulsoup4 jupyter notebook
pip install plotly dash statsmodels
Recommended requirements.txt
pandas>=2.0.0
numpy>=1.24.0
scipy>=1.10.0
matplotlib>=3.7.0
seaborn>=0.12.0
scikit-learn>=1.2.0
statsmodels>=0.14.0
requests>=2.28.0
beautifulsoup4>=4.12.0
plotly>=5.14.0
dash>=2.10.0
jupyter>=1.0.0
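With this file saved in the project root, everything installs in one step:
pip install -r requirements.txt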
Data Manipulation with Pandas
Loading Data
import pandas as pd
# From CSV
df = pd.read_csv('plays.csv')
# From JSON
df = pd.read_json('plays.json')
# From API response
import requests
response = requests.get('https://api.example.com/plays')
df = pd.DataFrame(response.json())
# From SQL database
from sqlalchemy import create_engine
engine = create_engine('postgresql://user:pass@localhost/db')
df = pd.read_sql('SELECT * FROM plays', engine)
Basic Operations
# View data
df.head() # First 5 rows
df.tail(10) # Last 10 rows
df.shape # (rows, columns)
df.info() # Column types and memory
df.describe() # Statistical summary
# Select columns
df['column_name'] # Single column (Series)
df[['col1', 'col2']] # Multiple columns (DataFrame)
# Filter rows
df[df['yards'] > 10] # Condition filter
df[(df['down'] == 3) & (df['distance'] < 5)] # Multiple conditions
df[df['play_type'].isin(['Pass', 'Rush'])] # Value in list
df.query('yards > 10 and down == 1') # Query string
# Sort
df.sort_values('yards', ascending=False)
df.sort_values(['game_id', 'play_number'])
Data Cleaning
# Handle missing values
df.isna().sum() # Count missing by column
df.dropna() # Drop rows with any missing
df.dropna(subset=['yards']) # Drop rows where specific column is missing
df['yards'].fillna(0) # Fill missing with value
df['yards'].fillna(df['yards'].mean()) # Fill with mean
# Convert types
df['yards'] = df['yards'].astype(int)
df['date'] = pd.to_datetime(df['date'])
# Rename columns
df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Remove duplicates
df.drop_duplicates()
df.drop_duplicates(subset=['game_id', 'play_number'])
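Note that most of these methods return new objects rather than modifying the DataFrame in place, so assign the results back:
# Assign results back to keep the changes
df = df.dropna(subset=['yards'])
df['yards'] = df['yards'].fillna(0)
df = df.rename(columns={'old_name': 'new_name'})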
Aggregation and Grouping
# Basic aggregations
df['yards'].mean()
df['yards'].sum()
df['yards'].std()
df['yards'].median()
# Group by
df.groupby('team')['yards'].mean()
df.groupby(['team', 'play_type'])['yards'].agg(['mean', 'std', 'count'])
# Pivot tables
pd.pivot_table(df, values='yards', index='team',
               columns='play_type', aggfunc='mean')
# Custom aggregation
df.groupby('team').agg({
    'yards': ['mean', 'sum'],
    'epa': 'mean',
    'play_id': 'count'
})
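Multi-function aggregations produce a MultiIndex on the columns; a quick way to flatten it for downstream use (frame and column names follow the hypothetical examples above):
team_stats = df.groupby('team').agg({'yards': ['mean', 'sum'], 'epa': 'mean'})
team_stats.columns = ['_'.join(col) for col in team_stats.columns]
team_stats = team_stats.reset_index()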
Merging and Joining
# Merge (SQL-style join)
merged = pd.merge(plays_df, games_df, on='game_id')
merged = pd.merge(plays_df, teams_df,
                  left_on='offense', right_on='team_name',
                  how='left')
# Concatenate
combined = pd.concat([df1, df2]) # Stack vertically
combined = pd.concat([df1, df2], axis=1) # Stack horizontally
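When a join is suspect, pandas can check key cardinality and flag unmatched rows; a short sketch using the same hypothetical frames:
merged = pd.merge(plays_df, games_df, on='game_id', how='left',
                  validate='many_to_one', indicator=True)
merged['_merge'].value_counts()  # 'left_only' rows had no matching game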
NumPy Operations
Array Creation
import numpy as np
# Create arrays
arr = np.array([1, 2, 3, 4, 5])
zeros = np.zeros((3, 4))
ones = np.ones((3, 4))
range_arr = np.arange(0, 10, 0.5)
linspace = np.linspace(0, 1, 100)
Statistical Operations
# Basic stats
np.mean(arr)
np.median(arr)
np.std(arr)
np.var(arr)
np.percentile(arr, [25, 50, 75])
# Aggregations
np.sum(arr)
np.min(arr)
np.max(arr)
np.argmax(arr) # Index of max
# Correlation
np.corrcoef(x, y)
Linear Algebra
# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
np.dot(A, B) # Matrix multiplication
A @ B # Matrix multiplication (Python 3.5+)
np.transpose(A) # Transpose
np.linalg.inv(A) # Inverse
np.linalg.det(A) # Determinant
eigenvalues, eigenvectors = np.linalg.eig(A)
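To solve a linear system (e.g., the normal equations in a simple ratings model), np.linalg.solve is more numerically stable than forming the inverse explicitly; a minimal sketch using A from above:
b = np.array([1.0, 2.0])
x = np.linalg.solve(A, b)  # Solves Ax = b without computing inv(A)
np.allclose(A @ x, b)      # True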
Statistical Analysis with SciPy
Hypothesis Testing
from scipy import stats
# t-test (compare two groups)
t_stat, p_value = stats.ttest_ind(group1, group2)
# Paired t-test
t_stat, p_value = stats.ttest_rel(before, after)
# One-sample t-test
t_stat, p_value = stats.ttest_1samp(sample, population_mean)
# Chi-square test
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
# Mann-Whitney U test (non-parametric)
u_stat, p_value = stats.mannwhitneyu(group1, group2)
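A typical application: testing whether mean EPA differs between passes and rushes. The play_type and epa columns are illustrative, following the earlier examples:
pass_epa = df.loc[df['play_type'] == 'Pass', 'epa'].dropna()
rush_epa = df.loc[df['play_type'] == 'Rush', 'epa'].dropna()
t_stat, p_value = stats.ttest_ind(pass_epa, rush_epa, equal_var=False)  # Welch's t-test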
Distributions
# Normal distribution
from scipy.stats import norm
norm.pdf(x, loc=mean, scale=std) # Probability density
norm.cdf(x, loc=mean, scale=std) # Cumulative distribution
norm.ppf(q, loc=mean, scale=std) # Quantile function
norm.rvs(loc=mean, scale=std, size=100) # Random samples
# Other distributions
from scipy.stats import poisson, binom, expon
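For example, if a team's scoring drives are modeled as Poisson with a hypothetical rate of 4.2 per game:
poisson.pmf(6, mu=4.2)      # P(exactly 6 scoring drives)
1 - poisson.cdf(5, mu=4.2)  # P(6 or more)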
Regression with Statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
# OLS regression
X = df[['feature1', 'feature2']]
X = sm.add_constant(X)
y = df['target']
model = sm.OLS(y, X).fit()
print(model.summary())
# Formula interface
model = smf.ols('yards ~ down + distance + yard_line', data=df).fit()
# Logistic regression
model = smf.logit('success ~ down + distance + yard_line', data=df).fit()
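After fitting, coefficients, p-values, and predictions come straight off the results object; for formula-interface models, predict accepts a DataFrame containing the predictor columns:
model.params        # Coefficient estimates
model.pvalues       # p-value per coefficient
model.conf_int()    # 95% confidence intervals
preds = model.predict(df)  # Predicted values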
Machine Learning with Scikit-learn
Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Label encoding
le = LabelEncoder()
y_encoded = le.fit_transform(y)
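To keep scaling from leaking test-set statistics into training during cross-validation, a Pipeline bundles the steps so the scaler is refit on each training fold; a minimal sketch:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])
pipe.fit(X_train, y_train)  # Scaler statistics come from training data only
pipe.score(X_test, y_test)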
Classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
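For win-probability-style work, predicted class probabilities are usually more useful than hard labels, and probability-aware metrics judge them better:
y_prob = model.predict_proba(X_test)[:, 1]  # Probability of the positive class
from sklearn.metrics import roc_auc_score, brier_score_loss
print(f"AUC: {roc_auc_score(y_test, y_prob):.3f}")
print(f"Brier score: {brier_score_loss(y_test, y_prob):.3f}")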
Regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Gradient Boosting
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
model.fit(X_train, y_train)
# Evaluation
print(f"MSE: {mean_squared_error(y_test, y_pred):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.3f}")
print(f"R²: {r2_score(y_test, y_pred):.3f}")
Cross-Validation
from sklearn.model_selection import cross_val_score, KFold
# Simple cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")
# Custom K-Fold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in kfold.split(X):
    # Positional indexing works on NumPy arrays; use .iloc for DataFrames
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # Train and evaluate on this fold
Data Visualization
Matplotlib Basics
import matplotlib.pyplot as plt
# Basic plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y, label='Data')
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_title('Title')
ax.legend()
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
plt.show()
# Multiple subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0, 0].plot(x, y)
axes[0, 1].bar(categories, values)
axes[1, 0].scatter(x, y)
axes[1, 1].hist(data, bins=30)
plt.tight_layout()
Seaborn Statistical Plots
import seaborn as sns
# Distribution plots
sns.histplot(df['yards'], kde=True)
sns.kdeplot(df['yards'])
sns.boxplot(x='team', y='yards', data=df)
sns.violinplot(x='play_type', y='yards', data=df)
# Relationship plots
sns.scatterplot(x='distance', y='yards', hue='play_type', data=df)
sns.regplot(x='distance', y='yards', data=df)
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
# Categorical plots
sns.barplot(x='team', y='yards', data=df)
sns.countplot(x='play_type', data=df)
Plotly Interactive
import plotly.express as px
import plotly.graph_objects as go
# Scatter plot
fig = px.scatter(df, x='distance', y='yards', color='play_type',
                 hover_data=['team', 'game_id'])
fig.show()
# Line plot with multiple traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y1, mode='lines', name='Series 1'))
fig.add_trace(go.Scatter(x=x, y=y2, mode='lines', name='Series 2'))
fig.update_layout(title='Win Probability', xaxis_title='Play', yaxis_title='WP')
fig.show()
Common Sports Analytics Patterns
EPA Calculation
def calculate_epa(plays_df):
    """Calculate EPA for plays (simplified illustrative model)."""
    # Expected points lookup (simplified; real EPA models are fit to historical data)
    def get_ep(down, distance, yard_line):
        # distance is accepted for interface completeness but unused in this toy model
        base_ep = 7 / (1 + np.exp(-0.08 * (yard_line - 50))) - 0.5
        down_adj = {1: 0, 2: -0.3, 3: -0.7, 4: -1.5}.get(down, 0)
        return base_ep + down_adj
    plays_df['ep_before'] = plays_df.apply(
        lambda x: get_ep(x['down'], x['distance'], x['yard_line']), axis=1
    )
    # shift(-1) crosses drive and game boundaries; group by game in real use
    plays_df['ep_after'] = plays_df['ep_before'].shift(-1)
    plays_df['epa'] = plays_df['ep_after'] - plays_df['ep_before']
    return plays_df
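Usage, assuming a play-by-play frame with down, distance, and yard_line columns:
plays_df = calculate_epa(plays_df)
plays_df.groupby('play_type')['epa'].mean()  # play_type column is illustrative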
Win Probability
def calculate_win_prob(score_diff, time_remaining, possession=True):
    """Calculate win probability with a simple logistic model."""
    time_factor = time_remaining / 3600  # Fraction of a 60-minute game remaining
    possession_adj = 0.03 if possession else 0
    logit = 0.15 * score_diff + 0.3 * (score_diff * (1 - time_factor)) + possession_adj
    prob = 1 / (1 + np.exp(-logit))
    return prob
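Because the math uses np.exp, the function also works element-wise on whole columns; the column names below are illustrative:
calculate_win_prob(7, 300)  # Up 7 with 5:00 left, with possession
df['wp'] = calculate_win_prob(df['score_diff'], df['seconds_remaining'])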
Success Rate
def is_successful(row):
    """Determine if a play was successful (standard 40/60/100 percent thresholds)."""
    pct_gained = row['yards'] / max(row['distance'], 1)
    if row['down'] == 1:
        return pct_gained >= 0.4   # 40% of the distance on 1st down
    elif row['down'] == 2:
        return pct_gained >= 0.6   # 60% on 2nd down
    else:
        return row['yards'] >= row['distance']  # Convert on 3rd/4th down
df['success'] = df.apply(is_successful, axis=1)
success_rate = df['success'].mean()
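Grouping the boolean column yields success-rate splits directly (column names are illustrative):
df.groupby('offense')['success'].mean()                      # By team
df.groupby(['offense', 'down'])['success'].mean().unstack()  # Team by down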
API Interaction
Making API Requests
import requests
import time
def fetch_with_retry(url, headers=None, max_retries=3):
    """Fetch URL with retry logic."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # Exponential backoff
# Example: College Football Data API
API_KEY = 'your_api_key'
headers = {'Authorization': f'Bearer {API_KEY}'}
plays = fetch_with_retry(
    'https://api.collegefootballdata.com/plays?year=2024&week=1',
    headers=headers
)
df = pd.DataFrame(plays)
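To pull a full season, loop over weeks with a short pause between calls to respect rate limits; this assumes the endpoint returns a JSON list, as above:
all_plays = []
for week in range(1, 16):
    url = f'https://api.collegefootballdata.com/plays?year=2024&week={week}'
    all_plays.extend(fetch_with_retry(url, headers=headers))
    time.sleep(1)  # Be polite to the API
season_df = pd.DataFrame(all_plays)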
Database Operations
SQLite (Local)
import sqlite3
# Connect and query
conn = sqlite3.connect('cfb_analytics.db')
df = pd.read_sql('SELECT * FROM plays WHERE season = 2024', conn)
conn.close()
# Write data
conn = sqlite3.connect('cfb_analytics.db')
df.to_sql('plays', conn, if_exists='append', index=False)
conn.close()
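A context manager guarantees cleanup even if the query raises; note that sqlite3's own with-block manages transactions, not closing, so contextlib.closing is the safer pattern:
from contextlib import closing
with closing(sqlite3.connect('cfb_analytics.db')) as conn:
    df = pd.read_sql('SELECT * FROM plays WHERE season = 2024', conn)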
PostgreSQL
from sqlalchemy import create_engine
# Create connection
engine = create_engine('postgresql://user:password@localhost:5432/cfb_analytics')
# Read
df = pd.read_sql('SELECT * FROM plays', engine)
# Write
df.to_sql('plays', engine, if_exists='append', index=False)
Useful Snippets
Timing Code
import time
start = time.time()
# Your code here
duration = time.time() - start
print(f"Execution time: {duration:.2f} seconds")
Progress Bars
from tqdm import tqdm
for item in tqdm(items, desc="Processing"):
    process(item)
Logging
import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.info("Starting analysis")
logger.warning("Data quality issue detected")
logger.error("Failed to process game")
Caching Results
import functools
@functools.lru_cache(maxsize=128)
def expensive_calculation(game_id):
    result = ...  # Expensive computation keyed by a hashable argument
    return result
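lru_cache only accepts hashable arguments (numbers, strings, tuples — not DataFrames), and the cache can be inspected or reset:
expensive_calculation.cache_info()   # Hits, misses, current size
expensive_calculation.cache_clear()  # Reset after the underlying data changes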