Appendix C: Python Quick Reference
This appendix provides a comprehensive quick reference for the Python libraries essential to basketball analytics. Each section includes syntax, common operations, and basketball-specific examples.
C.1 NumPy Reference
NumPy is the fundamental package for numerical computing in Python, providing support for large, multi-dimensional arrays and matrices.
Importing NumPy
import numpy as np
Array Creation
# From Python lists
arr = np.array([1, 2, 3, 4, 5])
matrix = np.array([[1, 2, 3], [4, 5, 6]])
# Special arrays
zeros = np.zeros((3, 4)) # 3x4 array of zeros
ones = np.ones((2, 3)) # 2x3 array of ones
empty = np.empty((2, 2)) # Uninitialized 2x2 array
identity = np.eye(4) # 4x4 identity matrix
full = np.full((3, 3), 7) # 3x3 array filled with 7
# Sequences
range_arr = np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5) # [0, 0.25, 0.5, 0.75, 1.0]
logspace = np.logspace(0, 2, 3) # [1, 10, 100]
# Random arrays
random_uniform = np.random.rand(3, 3) # Uniform [0, 1)
random_normal = np.random.randn(3, 3) # Standard normal
random_int = np.random.randint(0, 100, 10) # Random integers
Array Attributes
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr.shape # (2, 3) - dimensions
arr.ndim # 2 - number of dimensions
arr.size # 6 - total elements
arr.dtype # dtype('int64') - data type
arr.itemsize # 8 - bytes per element
Array Operations
# Element-wise operations
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
a + b # [5, 7, 9]
a - b # [-3, -3, -3]
a * b # [4, 10, 18]
a / b # [0.25, 0.4, 0.5]
a ** 2 # [1, 4, 9]
np.sqrt(a) # [1.0, 1.414, 1.732]
np.exp(a) # [2.718, 7.389, 20.086]
np.log(a) # [0, 0.693, 1.099]
# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
np.dot(A, B) # Matrix multiplication
A @ B # Matrix multiplication (Python 3.5+)
A.T # Transpose
np.linalg.inv(A) # Matrix inverse
np.linalg.det(A) # Determinant
np.linalg.eig(A) # Eigenvalues and eigenvectors
Statistical Functions
data = np.array([23.5, 28.1, 21.4, 25.8, 30.2, 22.1])
np.mean(data) # 25.18 - mean
np.median(data) # 24.65 - median
np.std(data) # 3.18 - standard deviation (population, ddof=0)
np.var(data) # 10.12 - variance
np.min(data) # 21.4 - minimum
np.max(data) # 30.2 - maximum
np.sum(data) # 151.1 - sum
np.prod(data) # product of elements
np.percentile(data, 75) # 75th percentile
np.cumsum(data) # cumulative sum
np.corrcoef(a, b) # correlation coefficient matrix
np.cov(a, b) # covariance matrix
# Axis-specific operations (for 2D arrays)
matrix = np.array([[1, 2, 3], [4, 5, 6]])
np.mean(matrix, axis=0) # Column means: [2.5, 3.5, 4.5]
np.mean(matrix, axis=1) # Row means: [2.0, 5.0]
Indexing and Slicing
arr = np.array([10, 20, 30, 40, 50])
arr[0] # 10 - first element
arr[-1] # 50 - last element
arr[1:4] # [20, 30, 40] - slice
arr[::2] # [10, 30, 50] - every other element
arr[::-1] # [50, 40, 30, 20, 10] - reversed
# 2D indexing
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
matrix[0, 0] # 1 - element at row 0, col 0
matrix[1, :] # [4, 5, 6] - entire row 1
matrix[:, 2] # [3, 6, 9] - entire column 2
matrix[0:2, 1:3] # [[2, 3], [5, 6]] - submatrix
# Boolean indexing
arr = np.array([15, 22, 31, 18, 25])
arr[arr > 20] # [22, 31, 25] - elements > 20
arr[arr % 2 == 0] # [22, 18] - even elements
Basketball Example: Player Similarity
import numpy as np

# Per-game stat lines as vectors: (PPG, RPG, APG, SPG, BPG)
player_a = np.array([25.3, 7.2, 5.8, 1.4, 0.8])
player_b = np.array([23.1, 6.8, 6.2, 1.2, 0.5])

# Euclidean distance between the stat vectors (lower = more similar).
distance = np.linalg.norm(player_a - player_b)

# Cosine similarity: dot product over the product of norms
# (higher = more similar direction of play style).
norm_product = np.linalg.norm(player_a) * np.linalg.norm(player_b)
cosine_sim = np.dot(player_a, player_b) / norm_product

# Z-score each stat column so stats on different scales are comparable.
all_players = np.vstack([player_a, player_b])
means = all_players.mean(axis=0)
stds = all_players.std(axis=0)
standardized = (all_players - means) / stds
C.2 pandas Reference
pandas provides high-performance, easy-to-use data structures for data analysis.
Importing pandas
import pandas as pd
Creating DataFrames
# From dictionary
df = pd.DataFrame({
'player': ['Player A', 'Player B', 'Player C'],
'ppg': [25.3, 18.7, 22.1],
'rpg': [7.2, 4.5, 8.8],
'apg': [5.8, 8.2, 3.1]
})
# From list of dictionaries
data = [
{'player': 'Player A', 'ppg': 25.3},
{'player': 'Player B', 'ppg': 18.7}
]
df = pd.DataFrame(data)
# From NumPy array
arr = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
# Reading files
df = pd.read_csv('player_stats.csv')
df = pd.read_excel('player_stats.xlsx')
df = pd.read_json('player_stats.json')
DataFrame Inspection
df.head() # First 5 rows
df.head(10) # First 10 rows
df.tail() # Last 5 rows
df.shape # (rows, columns)
df.columns # Column names
df.dtypes # Data types
df.info() # Summary info
df.describe() # Statistical summary
df.isnull().sum() # Count missing values per column
df.nunique() # Count unique values per column
df.memory_usage() # Memory usage
Selecting Data
# Column selection
df['ppg'] # Single column (Series)
df[['ppg', 'rpg']] # Multiple columns (DataFrame)
# Row selection by index
df.iloc[0] # First row by position
df.iloc[0:5] # First 5 rows
df.iloc[[0, 2, 4]] # Specific rows by position
df.iloc[0:5, 0:3] # Rows 0-4, columns 0-2
# Row selection by label
df.loc[0] # Row with label 0
df.loc[df['ppg'] > 20] # Rows where ppg > 20
# Boolean filtering
df[df['ppg'] > 20] # Simple condition
df[(df['ppg'] > 20) & (df['rpg'] > 5)] # Multiple conditions (AND)
df[(df['ppg'] > 25) | (df['apg'] > 8)] # Multiple conditions (OR)
df[df['player'].str.contains('Player')] # String matching
df[df['team'].isin(['LAL', 'BOS', 'GSW'])] # Value in list
Data Manipulation
# Adding columns
df['efficiency'] = df['ppg'] / df['fga']
df['ts_pct'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))
# Renaming columns
df.rename(columns={'ppg': 'points_per_game'}, inplace=True)
df.columns = ['col1', 'col2', 'col3'] # Rename all
# Dropping columns/rows
df.drop('column_name', axis=1, inplace=True) # Drop column
df.drop(0, axis=0, inplace=True) # Drop row
# Sorting
df.sort_values('ppg', ascending=False) # Sort by column
df.sort_values(['team', 'ppg'], ascending=[True, False]) # Multiple columns
# Handling missing data
df.dropna() # Drop rows with any NaN
df.dropna(subset=['ppg']) # Drop rows where ppg is NaN
df.fillna(0) # Fill NaN with 0
df.fillna(df.mean(numeric_only=True)) # Fill with column means
df.ffill() # Forward fill (fillna(method='ffill') is deprecated in pandas 2.x)
df.interpolate() # Interpolate missing values
# Applying functions
df['ppg_squared'] = df['ppg'].apply(lambda x: x ** 2)
df['tier'] = df['ppg'].apply(lambda x: 'Star' if x > 20 else 'Role Player')
Grouping and Aggregation
# Basic groupby
df.groupby('team')['ppg'].mean() # Mean ppg by team
df.groupby('team')['ppg'].agg(['mean', 'std', 'count'])
# Multiple aggregations
df.groupby('team').agg({
'ppg': 'mean',
'rpg': 'sum',
'apg': ['mean', 'max']
})
# Custom aggregation
df.groupby('team').agg(
avg_ppg=('ppg', 'mean'),
total_rebounds=('rpg', 'sum'),
player_count=('player', 'count')
)
# Transform (returns same shape as original)
df['team_avg_ppg'] = df.groupby('team')['ppg'].transform('mean')
df['ppg_vs_team_avg'] = df['ppg'] - df.groupby('team')['ppg'].transform('mean')
Merging and Joining
# Merge (SQL-style joins)
merged = pd.merge(df1, df2, on='player_id') # Inner join
merged = pd.merge(df1, df2, on='player_id', how='left') # Left join
merged = pd.merge(df1, df2, on='player_id', how='outer') # Outer join
merged = pd.merge(df1, df2, left_on='id1', right_on='id2') # Different keys
# Concatenation
combined = pd.concat([df1, df2]) # Vertical stack
combined = pd.concat([df1, df2], axis=1) # Horizontal stack
combined = pd.concat([df1, df2], ignore_index=True) # Reset index
Pivot Tables
# Create pivot table
pivot = pd.pivot_table(
df,
values='ppg',
index='team',
columns='season',
aggfunc='mean'
)
# Melt (unpivot)
melted = pd.melt(
df,
id_vars=['player', 'team'],
value_vars=['ppg', 'rpg', 'apg'],
var_name='stat',
value_name='value'
)
Time Series Operations
# Convert to datetime
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
# Resampling
df.resample('W').mean() # Weekly average
df.resample('ME').sum() # Month-end sum ('M' alias is deprecated in pandas 2.2+)
# Rolling calculations
df['rolling_avg'] = df['ppg'].rolling(window=10).mean()
df['expanding_avg'] = df['ppg'].expanding().mean()
# Lag/shift
df['prev_game_ppg'] = df['ppg'].shift(1)
df['next_game_ppg'] = df['ppg'].shift(-1)
Basketball Example: Season Analysis
import pandas as pd
# Load player stats
df = pd.read_csv('player_stats.csv')
# Calculate advanced metrics
df['ts_pct'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))
df['efg_pct'] = (df['fgm'] + 0.5 * df['fg3m']) / df['fga']
df['usg_rate'] = 100 * ((df['fga'] + 0.44 * df['fta'] + df['tov']) *
(df['team_mp'] / 5)) / (df['mp'] *
(df['team_fga'] + 0.44 * df['team_fta'] + df['team_tov']))
# Team-level aggregations
team_stats = df.groupby('team').agg(
total_pts=('pts', 'sum'),
avg_ts=('ts_pct', 'mean'),
player_count=('player', 'count')
).sort_values('total_pts', ascending=False)
# Find top scorers per team
top_scorers = df.loc[df.groupby('team')['ppg'].idxmax()]
# Rank players within teams
df['team_ppg_rank'] = df.groupby('team')['ppg'].rank(ascending=False)
C.3 Matplotlib Reference
Matplotlib is the fundamental plotting library for Python.
Importing Matplotlib
import matplotlib.pyplot as plt
Basic Plots
# Line plot
plt.plot(x, y)
plt.plot(x, y, 'r-', linewidth=2, label='Points')
plt.plot(x, y, marker='o', linestyle='--', color='blue')
# Scatter plot
plt.scatter(x, y)
plt.scatter(x, y, c=colors, s=sizes, alpha=0.7, cmap='viridis')
# Bar plot
plt.bar(categories, values)
plt.barh(categories, values) # Horizontal
# Histogram
plt.hist(data, bins=20, edgecolor='black', alpha=0.7)
# Box plot
plt.boxplot([data1, data2, data3], labels=['A', 'B', 'C'])
# Pie chart
plt.pie(values, labels=labels, autopct='%1.1f%%')
Plot Customization
# Figure and axes
fig, ax = plt.subplots(figsize=(10, 6))
# Labels and title
plt.xlabel('X Label', fontsize=12)
plt.ylabel('Y Label', fontsize=12)
plt.title('Plot Title', fontsize=14, fontweight='bold')
# Axis limits and ticks
plt.xlim(0, 100)
plt.ylim(0, 50)
plt.xticks(rotation=45)
plt.yticks([0, 10, 20, 30, 40, 50])
# Grid
plt.grid(True, alpha=0.3)
plt.grid(axis='y', linestyle='--')
# Legend
plt.legend(loc='upper right')
plt.legend(loc='best', fontsize=10, framealpha=0.9)
# Annotations
plt.annotate('Peak', xy=(x_point, y_point),
xytext=(x_text, y_text),
arrowprops=dict(arrowstyle='->', color='red'))
# Text
plt.text(x, y, 'Text Here', fontsize=10, ha='center')
# Saving
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
plt.savefig('plot.pdf', format='pdf')
Subplots
# Multiple subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0, 0].plot(x, y1)
axes[0, 0].set_title('Plot 1')
axes[0, 1].scatter(x, y2)
axes[0, 1].set_title('Plot 2')
axes[1, 0].bar(categories, values)
axes[1, 0].set_title('Plot 3')
axes[1, 1].hist(data, bins=20)
axes[1, 1].set_title('Plot 4')
plt.tight_layout()
plt.show()
Style Customization
# Use built-in styles
plt.style.use('seaborn-v0_8')
plt.style.use('ggplot')
plt.style.use('fivethirtyeight')
# Color maps
plt.scatter(x, y, c=values, cmap='viridis')
plt.colorbar(label='Value')
# Custom colors
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
Basketball Example: Shot Chart
import matplotlib.pyplot as plt
import numpy as np
# Sample shot data
shots_x = np.random.uniform(-25, 25, 200)
shots_y = np.random.uniform(0, 47, 200)
made = np.random.choice([0, 1], 200, p=[0.45, 0.55])
# Create shot chart
fig, ax = plt.subplots(figsize=(10, 9.4))
# Plot shots (green for made, red for missed)
colors = ['red' if m == 0 else 'green' for m in made]
ax.scatter(shots_x, shots_y, c=colors, alpha=0.6, s=50)
# Draw court elements (simplified)
# Three-point line arc
theta = np.linspace(0, np.pi, 100)
ax.plot(23.75 * np.cos(theta), 23.75 * np.sin(theta) + 4.75, 'k-', linewidth=2)
# Paint area
ax.plot([-8, -8], [0, 19], 'k-', linewidth=2)
ax.plot([8, 8], [0, 19], 'k-', linewidth=2)
ax.plot([-8, 8], [19, 19], 'k-', linewidth=2)
# Free throw circle
circle = plt.Circle((0, 19), 6, fill=False, color='black', linewidth=2)
ax.add_patch(circle)
# Rim
rim = plt.Circle((0, 4.75), 0.75, fill=False, color='orange', linewidth=3)
ax.add_patch(rim)
ax.set_xlim(-30, 30)
ax.set_ylim(0, 50)
ax.set_aspect('equal')
ax.set_title('Player Shot Chart', fontsize=14, fontweight='bold')
plt.show()
C.4 Seaborn Reference
Seaborn is built on Matplotlib and provides a high-level interface for statistical graphics.
Importing Seaborn
import seaborn as sns
Distribution Plots
# Histogram with KDE
sns.histplot(data=df, x='ppg', kde=True)
# KDE plot
sns.kdeplot(data=df, x='ppg', fill=True)
# Box plot
sns.boxplot(data=df, x='position', y='ppg')
# Violin plot
sns.violinplot(data=df, x='position', y='ppg')
# Strip plot (scatter for categorical)
sns.stripplot(data=df, x='position', y='ppg', jitter=True)
# Swarm plot
sns.swarmplot(data=df, x='position', y='ppg')
Relational Plots
# Scatter plot
sns.scatterplot(data=df, x='ppg', y='rpg', hue='position', size='mpg')
# Line plot
sns.lineplot(data=df, x='game_num', y='ppg', hue='player')
# Regression plot
sns.regplot(data=df, x='fga', y='pts')
sns.lmplot(data=df, x='fga', y='pts', hue='position')
Categorical Plots
# Count plot
sns.countplot(data=df, x='position')
# Bar plot (with error bars)
sns.barplot(data=df, x='position', y='ppg', errorbar='sd')
# Point plot
sns.pointplot(data=df, x='position', y='ppg', hue='team')
Matrix Plots
# Correlation heatmap
corr_matrix = df[['ppg', 'rpg', 'apg', 'spg', 'bpg']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
# Clustermap
sns.clustermap(corr_matrix, annot=True, cmap='viridis')
Multi-Plot Grids
# Pair plot
sns.pairplot(df[['ppg', 'rpg', 'apg', 'position']], hue='position')
# FacetGrid
g = sns.FacetGrid(df, col='position', col_wrap=3, height=4)
g.map(sns.histplot, 'ppg')
# JointGrid
g = sns.JointGrid(data=df, x='ppg', y='rpg')
g.plot(sns.scatterplot, sns.histplot)
Styling
# Set theme
sns.set_theme(style='whitegrid')
sns.set_theme(style='darkgrid')
sns.set_theme(style='ticks')
# Color palettes
sns.set_palette('husl')
sns.set_palette('Set2')
sns.color_palette('coolwarm', n_colors=10)
Basketball Example: Player Comparison
import seaborn as sns
import matplotlib.pyplot as plt
# Load data
df = pd.read_csv('player_stats.csv')
# Set style
sns.set_theme(style='whitegrid')
# Create figure with multiple plots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# 1. PPG distribution by position
sns.boxplot(data=df, x='position', y='ppg', ax=axes[0, 0],
order=['PG', 'SG', 'SF', 'PF', 'C'])
axes[0, 0].set_title('Points Per Game by Position')
# 2. PPG vs RPG scatter
sns.scatterplot(data=df, x='ppg', y='rpg', hue='position',
size='mpg', sizes=(20, 200), ax=axes[0, 1])
axes[0, 1].set_title('Points vs Rebounds')
# 3. Correlation heatmap
stats = ['ppg', 'rpg', 'apg', 'spg', 'bpg', 'ts_pct']
corr = df[stats].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=axes[1, 0])
axes[1, 0].set_title('Stat Correlations')
# 4. True shooting by usage
sns.regplot(data=df, x='usg_pct', y='ts_pct', ax=axes[1, 1])
axes[1, 1].set_title('Efficiency vs Usage Trade-off')
plt.tight_layout()
plt.savefig('player_analysis.png', dpi=300)
plt.show()
C.5 scikit-learn Reference
scikit-learn is the primary machine learning library for Python.
Importing scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.cluster import KMeans
Data Preparation
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Min-Max scaling
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(X)
Regression Models
# Linear Regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Coefficients
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')
# Ridge Regression
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
# Lasso Regression
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
# Elastic Net
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train, y_train)
Classification Models
# Logistic Regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
# Feature importance
importance = pd.DataFrame({
'feature': feature_names,
'importance': rf_clf.feature_importances_
}).sort_values('importance', ascending=False)
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
gb_clf.fit(X_train, y_train)
Model Evaluation
# Regression metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Classification metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
# Cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f'CV R2: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})')
Clustering
# K-Means
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)
# Cluster centers
centers = kmeans.cluster_centers_
# Elbow method for optimal k
inertias = []
for k in range(1, 11):
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)
inertias.append(kmeans.inertia_)
# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
hierarchical = AgglomerativeClustering(n_clusters=5)
clusters = hierarchical.fit_predict(X)
Dimensionality Reduction
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Explained variance
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')
print(f'Total variance explained: {sum(pca.explained_variance_ratio_):.2%}')
# t-SNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
Hyperparameter Tuning
# Grid Search
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, None],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(
RandomForestRegressor(random_state=42),
param_grid,
cv=5,
scoring='r2',
n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(f'Best params: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
Basketball Example: Win Prediction Model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Prepare features
features = ['team_off_rtg', 'team_def_rtg', 'opp_off_rtg', 'opp_def_rtg',
'home_court', 'rest_days', 'back_to_back']
X = df[features]
y = df['win']
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train logistic regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)
# Train random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Evaluate
print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, log_reg.predict(X_test_scaled)):.3f}")
print("\nRandom Forest:")
print(f"Accuracy: {accuracy_score(y_test, rf.predict(X_test)):.3f}")
# Feature importance from Random Forest
importance = pd.DataFrame({
'feature': features,
'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(importance)
C.6 Additional Useful Libraries
SciPy for Statistical Tests
from scipy import stats
# T-test
t_stat, p_value = stats.ttest_ind(group1, group2)
# Chi-square test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
# Correlation with p-value
r, p = stats.pearsonr(x, y)
r, p = stats.spearmanr(x, y)
# Normal distribution
stats.norm.pdf(x, loc=mean, scale=std)
stats.norm.cdf(x, loc=mean, scale=std)
stats.norm.ppf(0.95) # 95th percentile
# ANOVA
f_stat, p_value = stats.f_oneway(group1, group2, group3)
Statsmodels for Regression
import statsmodels.api as sm
import statsmodels.formula.api as smf
# OLS Regression
X = sm.add_constant(X) # Add intercept
model = sm.OLS(y, X).fit()
print(model.summary())
# Formula-based regression
model = smf.ols('ppg ~ fga + fta + mpg', data=df).fit()
print(model.summary())
# Logistic regression
model = sm.Logit(y, X).fit()
print(model.summary())
Plotly for Interactive Visualizations
import plotly.express as px
import plotly.graph_objects as go
# Scatter plot
fig = px.scatter(df, x='ppg', y='rpg', color='position',
hover_data=['player', 'team'])
fig.show()
# Bar chart
fig = px.bar(df, x='team', y='wins', color='conference')
fig.show()
# Line chart
fig = px.line(df, x='game_date', y='ppg', color='player')
fig.show()
C.7 Common Code Patterns
Loading and Cleaning Data
import pandas as pd
import numpy as np
# Load data
df = pd.read_csv('data.csv')
# Basic cleaning
df.columns = df.columns.str.lower().str.replace(' ', '_')
df = df.drop_duplicates()
df = df.dropna(subset=['player_id', 'game_id'])
df['date'] = pd.to_datetime(df['date'])
# Type conversion
df['ppg'] = pd.to_numeric(df['ppg'], errors='coerce')
df['team'] = df['team'].astype('category')
Feature Engineering Pipeline
def engineer_features(df):
    """Add derived basketball metrics to *df* in place and return it.

    Expects the columns: pts, fga, fta, fgm, fg3m, tov, mp,
    team_fga, team_fta, team_tov, team_mp, reb, ast, stl, blk.
    """
    # True shooting: points per two "true" shooting attempts
    # (free throws weighted at the conventional 0.44 factor).
    true_attempts = df['fga'] + 0.44 * df['fta']
    df['ts_pct'] = df['pts'] / (2 * true_attempts)

    # Effective FG%: credit each made three with an extra half make.
    df['efg_pct'] = (df['fgm'] + 0.5 * df['fg3m']) / df['fga']

    # Usage rate: estimated share of team possessions the player
    # consumed while on the floor, scaled to 100.
    player_attempts = df['fga'] + 0.44 * df['fta'] + df['tov']
    team_attempts = df['team_fga'] + 0.44 * df['team_fta'] + df['team_tov']
    df['usg_rate'] = (100 * (player_attempts * (df['team_mp'] / 5))
                      / (df['mp'] * team_attempts))

    # Pace-neutral per-36-minute rates for the core box-score stats.
    for box_stat in ('pts', 'reb', 'ast', 'stl', 'blk'):
        df[f'{box_stat}_per36'] = df[box_stat] / df['mp'] * 36

    return df
Complete Analysis Template
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# 1. Load and inspect data
df = pd.read_csv('player_stats.csv')
print(df.info())
print(df.describe())
# 2. Clean data
df = df.dropna()
df = df[df['mpg'] >= 15] # Filter for rotation players
# 3. Feature engineering
df['ts_pct'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))
# 4. Exploratory visualization
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='usg_pct', y='ts_pct', hue='position')
plt.title('Usage vs Efficiency')
plt.savefig('usage_efficiency.png', dpi=300)
# 5. Model building
features = ['fga', 'fta', 'fg3a', 'mpg']
X = df[features]
y = df['pts']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = LinearRegression()
model.fit(X_train_scaled, y_train)
# 6. Evaluate
y_pred = model.predict(X_test_scaled)
print(f'R2 Score: {r2_score(y_test, y_pred):.3f}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}')
# 7. Feature importance
for name, coef in zip(features, model.coef_):
print(f'{name}: {coef:.3f}')
This appendix provides quick reference material for the most commonly used Python libraries in basketball analytics. For complete documentation, refer to the official library documentation.