Appendix C: Python Quick Reference

This appendix provides a quick reference for the Python libraries essential to basketball analytics. Each section covers syntax, common operations, and basketball-specific examples.

C.1 NumPy Reference

NumPy is the fundamental package for numerical computing in Python, providing support for large, multi-dimensional arrays and matrices.

Importing NumPy

import numpy as np

Array Creation

# From Python lists
arr = np.array([1, 2, 3, 4, 5])
matrix = np.array([[1, 2, 3], [4, 5, 6]])

# Special arrays
zeros = np.zeros((3, 4))          # 3x4 array of zeros
ones = np.ones((2, 3))            # 2x3 array of ones
empty = np.empty((2, 2))          # Uninitialized 2x2 array
identity = np.eye(4)              # 4x4 identity matrix
full = np.full((3, 3), 7)         # 3x3 array filled with 7

# Sequences
range_arr = np.arange(0, 10, 2)   # [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5)   # [0, 0.25, 0.5, 0.75, 1.0]
logspace = np.logspace(0, 2, 3)   # [1, 10, 100]

# Random arrays
random_uniform = np.random.rand(3, 3)        # Uniform [0, 1)
random_normal = np.random.randn(3, 3)        # Standard normal
random_int = np.random.randint(0, 100, 10)   # Random integers
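
NumPy's documentation now favors the Generator API over the legacy np.random functions above; a minimal sketch of the equivalents, seeded for reproducibility:

rng = np.random.default_rng(seed=42)         # Modern generator object
rng.uniform(size=(3, 3))                     # Uniform [0, 1)
rng.standard_normal((3, 3))                  # Standard normal
rng.integers(0, 100, size=10)                # Random integers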

Array Attributes

arr = np.array([[1, 2, 3], [4, 5, 6]])

arr.shape      # (2, 3) - dimensions
arr.ndim       # 2 - number of dimensions
arr.size       # 6 - total elements
arr.dtype      # dtype('int64') - data type
arr.itemsize   # 8 - bytes per element

Array Operations

# Element-wise operations
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])

a + b          # [5, 7, 9]
a - b          # [-3, -3, -3]
a * b          # [4, 10, 18]
a / b          # [0.25, 0.4, 0.5]
a ** 2         # [1, 4, 9]
np.sqrt(a)     # [1.0, 1.414, 1.732]
np.exp(a)      # [2.718, 7.389, 20.086]
np.log(a)      # [0, 0.693, 1.099]

# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

np.dot(A, B)           # Matrix multiplication
A @ B                  # Matrix multiplication (Python 3.5+)
A.T                    # Transpose
np.linalg.inv(A)       # Matrix inverse
np.linalg.det(A)       # Determinant
np.linalg.eig(A)       # Eigenvalues and eigenvectors

Statistical Functions

data = np.array([23.5, 28.1, 21.4, 25.8, 30.2, 22.1])

np.mean(data)          # 25.18 - mean
np.median(data)        # 24.65 - median
np.std(data)           # 3.18 - standard deviation (ddof=0, population)
np.var(data)           # 10.12 - variance (ddof=0, population)
np.min(data)           # 21.4 - minimum
np.max(data)           # 30.2 - maximum
np.sum(data)           # 151.1 - sum
np.prod(data)          # product of elements
np.percentile(data, 75) # 75th percentile
np.cumsum(data)        # cumulative sum
np.corrcoef(a, b)      # correlation coefficient matrix (a, b from above)
np.cov(a, b)           # covariance matrix

# Axis-specific operations (for 2D arrays)
matrix = np.array([[1, 2, 3], [4, 5, 6]])
np.mean(matrix, axis=0)  # Column means: [2.5, 3.5, 4.5]
np.mean(matrix, axis=1)  # Row means: [2.0, 5.0]

Indexing and Slicing

arr = np.array([10, 20, 30, 40, 50])

arr[0]         # 10 - first element
arr[-1]        # 50 - last element
arr[1:4]       # [20, 30, 40] - slice
arr[::2]       # [10, 30, 50] - every other element
arr[::-1]      # [50, 40, 30, 20, 10] - reversed

# 2D indexing
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
matrix[0, 0]       # 1 - element at row 0, col 0
matrix[1, :]       # [4, 5, 6] - entire row 1
matrix[:, 2]       # [3, 6, 9] - entire column 2
matrix[0:2, 1:3]   # [[2, 3], [5, 6]] - submatrix

# Boolean indexing
arr = np.array([15, 22, 31, 18, 25])
arr[arr > 20]      # [22, 31, 25] - elements > 20
arr[arr % 2 == 0]  # [22, 18] - even elements
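
Boolean masks work the same way on game logs; a small sketch with a hypothetical array of single-game point totals:

# Hypothetical game log: points scored in 10 games
points = np.array([31, 18, 25, 42, 12, 27, 35, 22, 29, 38])

points[points >= 30]           # Games with 30+ points
np.sum(points >= 30)           # Count of 30+ point games (True counts as 1)
np.mean(points[points >= 25])  # Scoring average in 25+ point games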

Basketball Example: Player Similarity

import numpy as np

# Player stat vectors (PPG, RPG, APG, SPG, BPG)
player_a = np.array([25.3, 7.2, 5.8, 1.4, 0.8])
player_b = np.array([23.1, 6.8, 6.2, 1.2, 0.5])

# Euclidean distance (lower = more similar)
distance = np.linalg.norm(player_a - player_b)

# Cosine similarity (higher = more similar)
cosine_sim = np.dot(player_a, player_b) / (
    np.linalg.norm(player_a) * np.linalg.norm(player_b)
)

# Standardize stats for comparison
all_players = np.array([player_a, player_b])
means = np.mean(all_players, axis=0)
stds = np.std(all_players, axis=0)
standardized = (all_players - means) / stds
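
One caution on the sketch above: raw Euclidean distance is dominated by high-variance stats like PPG, so distances are usually computed on the standardized array instead. With a full league of players stacked in all_players, the same pattern gives every stat equal weight:

# Pairwise distances from the first player, in standardized space
distances = np.linalg.norm(standardized - standardized[0], axis=1)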

C.2 pandas Reference

pandas provides high-performance, easy-to-use data structures for data analysis.

Importing pandas

import pandas as pd

Creating DataFrames

# From dictionary
df = pd.DataFrame({
    'player': ['Player A', 'Player B', 'Player C'],
    'ppg': [25.3, 18.7, 22.1],
    'rpg': [7.2, 4.5, 8.8],
    'apg': [5.8, 8.2, 3.1]
})

# From list of dictionaries
data = [
    {'player': 'Player A', 'ppg': 25.3},
    {'player': 'Player B', 'ppg': 18.7}
]
df = pd.DataFrame(data)

# From NumPy array
arr = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])

# Reading files
df = pd.read_csv('player_stats.csv')
df = pd.read_excel('player_stats.xlsx')
df = pd.read_json('player_stats.json')

DataFrame Inspection

df.head()          # First 5 rows
df.head(10)        # First 10 rows
df.tail()          # Last 5 rows
df.shape           # (rows, columns)
df.columns         # Column names
df.dtypes          # Data types
df.info()          # Summary info
df.describe()      # Statistical summary
df.isnull().sum()  # Count missing values per column
df.nunique()       # Count unique values per column
df.memory_usage()  # Memory usage

Selecting Data

# Column selection
df['ppg']              # Single column (Series)
df[['ppg', 'rpg']]     # Multiple columns (DataFrame)

# Row selection by index
df.iloc[0]             # First row by position
df.iloc[0:5]           # First 5 rows
df.iloc[[0, 2, 4]]     # Specific rows by position
df.iloc[0:5, 0:3]      # Rows 0-4, columns 0-2

# Row selection by label
df.loc[0]              # Row with label 0
df.loc[df['ppg'] > 20] # Rows where ppg > 20

# Boolean filtering
df[df['ppg'] > 20]                      # Simple condition
df[(df['ppg'] > 20) & (df['rpg'] > 5)]  # Multiple conditions (AND)
df[(df['ppg'] > 25) | (df['apg'] > 8)]  # Multiple conditions (OR)
df[df['player'].str.contains('Player')] # String matching
df[df['team'].isin(['LAL', 'BOS', 'GSW'])] # Value in list

Data Manipulation

# Adding columns
df['efficiency'] = df['ppg'] / df['fga']
df['ts_pct'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))

# Renaming columns
df.rename(columns={'ppg': 'points_per_game'}, inplace=True)
df.columns = ['col1', 'col2', 'col3']  # Rename all

# Dropping columns/rows
df.drop('column_name', axis=1, inplace=True)  # Drop column
df.drop(0, axis=0, inplace=True)              # Drop row

# Sorting
df.sort_values('ppg', ascending=False)        # Sort by column
df.sort_values(['team', 'ppg'], ascending=[True, False])  # Multiple columns

# Handling missing data
df.dropna()                    # Drop rows with any NaN
df.dropna(subset=['ppg'])      # Drop rows where ppg is NaN
df.fillna(0)                   # Fill NaN with 0
df.fillna(df.mean(numeric_only=True))  # Fill numeric columns with their means
df.ffill()                     # Forward fill (fillna(method=...) is deprecated)
df.interpolate()               # Interpolate missing values

# Applying functions
df['ppg_squared'] = df['ppg'].apply(lambda x: x ** 2)
df['tier'] = df['ppg'].apply(lambda x: 'Star' if x > 20 else 'Role Player')
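
.apply with a Python lambda runs row by row; for simple conditions like these, vectorized alternatives are typically much faster. A sketch of equivalents, assuming NumPy is imported as np:

df['ppg_squared'] = df['ppg'] ** 2
df['tier'] = np.where(df['ppg'] > 20, 'Star', 'Role Player')

# Multiple tiers via binning
df['tier'] = pd.cut(df['ppg'], bins=[0, 10, 20, 50],
                    labels=['Bench', 'Role Player', 'Star'])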

Grouping and Aggregation

# Basic groupby
df.groupby('team')['ppg'].mean()           # Mean ppg by team
df.groupby('team')['ppg'].agg(['mean', 'std', 'count'])

# Multiple aggregations
df.groupby('team').agg({
    'ppg': 'mean',
    'rpg': 'sum',
    'apg': ['mean', 'max']
})

# Custom aggregation
df.groupby('team').agg(
    avg_ppg=('ppg', 'mean'),
    total_rebounds=('rpg', 'sum'),
    player_count=('player', 'count')
)

# Transform (returns same shape as original)
df['team_avg_ppg'] = df.groupby('team')['ppg'].transform('mean')
df['ppg_vs_team_avg'] = df['ppg'] - df.groupby('team')['ppg'].transform('mean')
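
transform also makes within-group standardization a one-liner; a sketch assuming the same team and ppg columns:

# Z-score each player's scoring relative to teammates
grp = df.groupby('team')['ppg']
df['ppg_z_team'] = (df['ppg'] - grp.transform('mean')) / grp.transform('std')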

Merging and Joining

# Merge (SQL-style joins)
merged = pd.merge(df1, df2, on='player_id')                    # Inner join
merged = pd.merge(df1, df2, on='player_id', how='left')        # Left join
merged = pd.merge(df1, df2, on='player_id', how='outer')       # Outer join
merged = pd.merge(df1, df2, left_on='id1', right_on='id2')     # Different keys

# Concatenation
combined = pd.concat([df1, df2])                    # Vertical stack
combined = pd.concat([df1, df2], axis=1)            # Horizontal stack
combined = pd.concat([df1, df2], ignore_index=True) # Reset index

Pivot Tables

# Create pivot table
pivot = pd.pivot_table(
    df,
    values='ppg',
    index='team',
    columns='season',
    aggfunc='mean'
)

# Melt (unpivot)
melted = pd.melt(
    df,
    id_vars=['player', 'team'],
    value_vars=['ppg', 'rpg', 'apg'],
    var_name='stat',
    value_name='value'
)

Time Series Operations

# Convert to datetime
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Resampling
df.resample('W').mean()     # Weekly average
df.resample('ME').sum()     # Month-end sum ('M' is deprecated in pandas 2.2+)

# Rolling calculations
df['rolling_avg'] = df['ppg'].rolling(window=10).mean()
df['expanding_avg'] = df['ppg'].expanding().mean()

# Lag/shift
df['prev_game_ppg'] = df['ppg'].shift(1)
df['next_game_ppg'] = df['ppg'].shift(-1)
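
shift pairs naturally with diff and pct_change for game-to-game deltas:

df['ppg_change'] = df['ppg'].diff()            # Same as df['ppg'] - df['ppg'].shift(1)
df['ppg_pct_change'] = df['ppg'].pct_change()  # Fractional change from prior game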

Basketball Example: Season Analysis

import pandas as pd

# Load player stats
df = pd.read_csv('player_stats.csv')

# Calculate advanced metrics
df['ts_pct'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))
df['efg_pct'] = (df['fgm'] + 0.5 * df['fg3m']) / df['fga']
df['usg_rate'] = 100 * ((df['fga'] + 0.44 * df['fta'] + df['tov']) *
                        (df['team_mp'] / 5)) / (df['mp'] *
                        (df['team_fga'] + 0.44 * df['team_fta'] + df['team_tov']))

# Team-level aggregations
team_stats = df.groupby('team').agg(
    total_pts=('pts', 'sum'),
    avg_ts=('ts_pct', 'mean'),
    player_count=('player', 'count')
).sort_values('total_pts', ascending=False)

# Find top scorers per team
top_scorers = df.loc[df.groupby('team')['ppg'].idxmax()]

# Rank players within teams
df['team_ppg_rank'] = df.groupby('team')['ppg'].rank(ascending=False)

C.3 Matplotlib Reference

Matplotlib is the fundamental plotting library for Python.

Importing Matplotlib

import matplotlib.pyplot as plt

Basic Plots

# Line plot
plt.plot(x, y)
plt.plot(x, y, 'r-', linewidth=2, label='Points')
plt.plot(x, y, marker='o', linestyle='--', color='blue')

# Scatter plot
plt.scatter(x, y)
plt.scatter(x, y, c=colors, s=sizes, alpha=0.7, cmap='viridis')

# Bar plot
plt.bar(categories, values)
plt.barh(categories, values)  # Horizontal

# Histogram
plt.hist(data, bins=20, edgecolor='black', alpha=0.7)

# Box plot
plt.boxplot([data1, data2, data3], tick_labels=['A', 'B', 'C'])  # labels= before Matplotlib 3.9

# Pie chart
plt.pie(values, labels=labels, autopct='%1.1f%%')

Plot Customization

# Figure and axes
fig, ax = plt.subplots(figsize=(10, 6))

# Labels and title
plt.xlabel('X Label', fontsize=12)
plt.ylabel('Y Label', fontsize=12)
plt.title('Plot Title', fontsize=14, fontweight='bold')

# Axis limits and ticks
plt.xlim(0, 100)
plt.ylim(0, 50)
plt.xticks(rotation=45)
plt.yticks([0, 10, 20, 30, 40, 50])

# Grid
plt.grid(True, alpha=0.3)
plt.grid(axis='y', linestyle='--')

# Legend
plt.legend(loc='upper right')
plt.legend(loc='best', fontsize=10, framealpha=0.9)

# Annotations
plt.annotate('Peak', xy=(x_point, y_point),
             xytext=(x_text, y_text),
             arrowprops=dict(arrowstyle='->', color='red'))

# Text
plt.text(x, y, 'Text Here', fontsize=10, ha='center')

# Saving
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
plt.savefig('plot.pdf', format='pdf')

Subplots

# Multiple subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

axes[0, 0].plot(x, y1)
axes[0, 0].set_title('Plot 1')

axes[0, 1].scatter(x, y2)
axes[0, 1].set_title('Plot 2')

axes[1, 0].bar(categories, values)
axes[1, 0].set_title('Plot 3')

axes[1, 1].hist(data, bins=20)
axes[1, 1].set_title('Plot 4')

plt.tight_layout()
plt.show()

Style Customization

# Use built-in styles
plt.style.use('seaborn-v0_8')
plt.style.use('ggplot')
plt.style.use('fivethirtyeight')

# Color maps
plt.scatter(x, y, c=values, cmap='viridis')
plt.colorbar(label='Value')

# Custom colors
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

Basketball Example: Shot Chart

import matplotlib.pyplot as plt
import numpy as np

# Sample shot data
shots_x = np.random.uniform(-25, 25, 200)
shots_y = np.random.uniform(0, 47, 200)
made = np.random.choice([0, 1], 200, p=[0.45, 0.55])

# Create shot chart
fig, ax = plt.subplots(figsize=(10, 9.4))

# Plot shots (green for made, red for missed)
colors = ['red' if m == 0 else 'green' for m in made]
ax.scatter(shots_x, shots_y, c=colors, alpha=0.6, s=50)

# Draw court elements (simplified)
# Three-point line arc
theta = np.linspace(0, np.pi, 100)
ax.plot(23.75 * np.cos(theta), 23.75 * np.sin(theta) + 4.75, 'k-', linewidth=2)

# Paint area
ax.plot([-8, -8], [0, 19], 'k-', linewidth=2)
ax.plot([8, 8], [0, 19], 'k-', linewidth=2)
ax.plot([-8, 8], [19, 19], 'k-', linewidth=2)

# Free throw circle
circle = plt.Circle((0, 19), 6, fill=False, color='black', linewidth=2)
ax.add_patch(circle)

# Rim
rim = plt.Circle((0, 4.75), 0.75, fill=False, color='orange', linewidth=3)
ax.add_patch(rim)

ax.set_xlim(-30, 30)
ax.set_ylim(0, 50)
ax.set_aspect('equal')
ax.set_title('Player Shot Chart', fontsize=14, fontweight='bold')
plt.show()
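
The arcs above are traced point by point; matplotlib's patch classes draw them more directly. A sketch of the same simplified three-point arc and paint using patches (same ax and coordinates as above):

from matplotlib.patches import Arc, Rectangle

# Three-point arc: 23.75 ft radius centered on the rim at (0, 4.75)
ax.add_patch(Arc((0, 4.75), 2 * 23.75, 2 * 23.75,
                 theta1=0, theta2=180, color='black', linewidth=2))

# Paint: 16 ft wide, 19 ft deep from the baseline
ax.add_patch(Rectangle((-8, 0), 16, 19, fill=False,
                       color='black', linewidth=2))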

C.4 Seaborn Reference

Seaborn is built on Matplotlib and provides a high-level interface for statistical graphics.

Importing Seaborn

import seaborn as sns

Distribution Plots

# Histogram with KDE
sns.histplot(data=df, x='ppg', kde=True)

# KDE plot
sns.kdeplot(data=df, x='ppg', fill=True)

# Box plot
sns.boxplot(data=df, x='position', y='ppg')

# Violin plot
sns.violinplot(data=df, x='position', y='ppg')

# Strip plot (scatter for categorical)
sns.stripplot(data=df, x='position', y='ppg', jitter=True)

# Swarm plot
sns.swarmplot(data=df, x='position', y='ppg')

Relational Plots

# Scatter plot
sns.scatterplot(data=df, x='ppg', y='rpg', hue='position', size='mpg')

# Line plot
sns.lineplot(data=df, x='game_num', y='ppg', hue='player')

# Regression plot
sns.regplot(data=df, x='fga', y='pts')
sns.lmplot(data=df, x='fga', y='pts', hue='position')

Categorical Plots

# Count plot
sns.countplot(data=df, x='position')

# Bar plot (with error bars)
sns.barplot(data=df, x='position', y='ppg', errorbar='sd')

# Point plot
sns.pointplot(data=df, x='position', y='ppg', hue='team')

Matrix Plots

# Correlation heatmap
corr_matrix = df[['ppg', 'rpg', 'apg', 'spg', 'bpg']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)

# Clustermap
sns.clustermap(corr_matrix, annot=True, cmap='viridis')

Multi-Plot Grids

# Pair plot
sns.pairplot(df[['ppg', 'rpg', 'apg', 'position']], hue='position')

# FacetGrid
g = sns.FacetGrid(df, col='position', col_wrap=3, height=4)
g.map(sns.histplot, 'ppg')

# JointGrid
g = sns.JointGrid(data=df, x='ppg', y='rpg')
g.plot(sns.scatterplot, sns.histplot)

Styling

# Set theme
sns.set_theme(style='whitegrid')
sns.set_theme(style='darkgrid')
sns.set_theme(style='ticks')

# Color palettes
sns.set_palette('husl')
sns.set_palette('Set2')
sns.color_palette('coolwarm', n_colors=10)

Basketball Example: Player Comparison

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('player_stats.csv')

# Set style
sns.set_theme(style='whitegrid')

# Create figure with multiple plots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. PPG distribution by position
sns.boxplot(data=df, x='position', y='ppg', ax=axes[0, 0],
            order=['PG', 'SG', 'SF', 'PF', 'C'])
axes[0, 0].set_title('Points Per Game by Position')

# 2. PPG vs RPG scatter
sns.scatterplot(data=df, x='ppg', y='rpg', hue='position',
                size='mpg', sizes=(20, 200), ax=axes[0, 1])
axes[0, 1].set_title('Points vs Rebounds')

# 3. Correlation heatmap
stats = ['ppg', 'rpg', 'apg', 'spg', 'bpg', 'ts_pct']
corr = df[stats].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=axes[1, 0])
axes[1, 0].set_title('Stat Correlations')

# 4. True shooting by usage
sns.regplot(data=df, x='usg_pct', y='ts_pct', ax=axes[1, 1])
axes[1, 1].set_title('Efficiency vs Usage Trade-off')

plt.tight_layout()
plt.savefig('player_analysis.png', dpi=300)
plt.show()

C.5 scikit-learn Reference

scikit-learn is the primary machine learning library for Python.

Importing scikit-learn

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.cluster import KMeans

Data Preparation

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Min-Max scaling
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(X)
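
Fitting the scaler on the training split only (as above) avoids test-set leakage; a Pipeline bundles scaling and modeling so cross-validation repeats this correctly on every fold. A minimal sketch:

from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
pipe.fit(X_train, y_train)     # Scaler statistics come from training data only
y_pred = pipe.predict(X_test)  # Test data is scaled with those statistics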

Regression Models

# Linear Regression
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Coefficients
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

# Ridge Regression
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Lasso Regression
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

# Elastic Net
from sklearn.linear_model import ElasticNet

elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train, y_train)

Classification Models

# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Feature importance (feature_names: the column list used to fit the model;
# pd is pandas, imported earlier)
importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_clf.feature_importances_
}).sort_values('importance', ascending=False)

# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
gb_clf.fit(X_train, y_train)

Model Evaluation

# Regression metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Classification metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

# Cross-validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f'CV R2: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})')

Clustering

# K-Means
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)

# Cluster centers
centers = kmeans.cluster_centers_

# Elbow method for optimal k
inertias = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)
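
The silhouette score is a complementary check on k (higher is better, maximum 1); a sketch:

from sklearn.metrics import silhouette_score

sil_scores = {}
for k in range(2, 11):  # Silhouette requires at least 2 clusters
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X)
    sil_scores[k] = silhouette_score(X, labels)
best_k = max(sil_scores, key=sil_scores.get)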

# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering

hierarchical = AgglomerativeClustering(n_clusters=5)
clusters = hierarchical.fit_predict(X)

Dimensionality Reduction

# PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Explained variance
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')
print(f'Total variance explained: {sum(pca.explained_variance_ratio_):.2%}')

# t-SNE
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

Hyperparameter Tuning

# Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
print(f'Best params: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
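
When the grid gets large, RandomizedSearchCV samples a fixed number of combinations instead of trying them all; a sketch reusing the same param_grid:

from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_grid,  # Lists are sampled uniformly
    n_iter=20,                       # Number of combinations to try
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)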

Basketball Example: Win Prediction Model

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Prepare features (df: game-level DataFrame, one row per team-game)
features = ['team_off_rtg', 'team_def_rtg', 'opp_off_rtg', 'opp_def_rtg',
            'home_court', 'rest_days', 'back_to_back']
X = df[features]
y = df['win']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Train random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, log_reg.predict(X_test_scaled)):.3f}")

print("\nRandom Forest:")
print(f"Accuracy: {accuracy_score(y_test, rf.predict(X_test)):.3f}")

# Feature importance from Random Forest
importance = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(importance)

C.6 Additional Useful Libraries

SciPy for Statistical Tests

from scipy import stats

# T-test
t_stat, p_value = stats.ttest_ind(group1, group2)

# Chi-square test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Correlation with p-value
r, p = stats.pearsonr(x, y)
r, p = stats.spearmanr(x, y)

# Normal distribution
stats.norm.pdf(x, loc=mean, scale=std)
stats.norm.cdf(x, loc=mean, scale=std)
stats.norm.ppf(0.95)  # 95th percentile

# ANOVA
f_stat, p_value = stats.f_oneway(group1, group2, group3)
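
A quick basketball application of the t-test, as a sketch with hypothetical home/road scoring samples:

# Does the team score more at home than on the road?
home_pts = [112, 118, 105, 121, 109, 115]
away_pts = [104, 110, 99, 113, 107, 101]

t_stat, p_value = stats.ttest_ind(home_pts, away_pts)
print(f't = {t_stat:.2f}, p = {p_value:.3f}')  # p < 0.05 suggests a real gap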

Statsmodels for Regression

import statsmodels.api as sm
import statsmodels.formula.api as smf

# OLS Regression
X = sm.add_constant(X)  # Add intercept
model = sm.OLS(y, X).fit()
print(model.summary())

# Formula-based regression
model = smf.ols('ppg ~ fga + fta + mpg', data=df).fit()
print(model.summary())

# Logistic regression
model = sm.Logit(y, X).fit()
print(model.summary())

Plotly for Interactive Visualizations

import plotly.express as px
import plotly.graph_objects as go

# Scatter plot
fig = px.scatter(df, x='ppg', y='rpg', color='position',
                 hover_data=['player', 'team'])
fig.show()

# Bar chart
fig = px.bar(df, x='team', y='wins', color='conference')
fig.show()

# Line chart
fig = px.line(df, x='game_date', y='ppg', color='player')
fig.show()

C.7 Common Code Patterns

Loading and Cleaning Data

import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('data.csv')

# Basic cleaning
df.columns = df.columns.str.lower().str.replace(' ', '_')
df = df.drop_duplicates()
df = df.dropna(subset=['player_id', 'game_id'])
df['date'] = pd.to_datetime(df['date'])

# Type conversion
df['ppg'] = pd.to_numeric(df['ppg'], errors='coerce')
df['team'] = df['team'].astype('category')

Feature Engineering Pipeline

def engineer_features(df):
    """Create basketball analytics features."""
    # Shooting efficiency
    df['ts_pct'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))
    df['efg_pct'] = (df['fgm'] + 0.5 * df['fg3m']) / df['fga']

    # Usage
    df['usg_rate'] = 100 * ((df['fga'] + 0.44 * df['fta'] + df['tov']) *
                            (df['team_mp'] / 5)) / (df['mp'] *
                            (df['team_fga'] + 0.44 * df['team_fta'] + df['team_tov']))

    # Per 36 minute stats
    for stat in ['pts', 'reb', 'ast', 'stl', 'blk']:
        df[f'{stat}_per36'] = df[stat] / df['mp'] * 36

    return df
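
Typical usage, copying first because the function above modifies its argument in place:

raw = pd.read_csv('player_stats.csv')
df = engineer_features(raw.copy())  # copy() keeps raw unchanged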

Complete Analysis Template

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# 1. Load and inspect data
df = pd.read_csv('player_stats.csv')
print(df.info())
print(df.describe())

# 2. Clean data
df = df.dropna()
df = df[df['mpg'] >= 15]  # Filter for rotation players

# 3. Feature engineering
df['ts_pct'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))

# 4. Exploratory visualization
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='usg_pct', y='ts_pct', hue='position')
plt.title('Usage vs Efficiency')
plt.savefig('usage_efficiency.png', dpi=300)

# 5. Model building
features = ['fga', 'fta', 'fg3a', 'mpg']
X = df[features]
y = df['pts']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

# 6. Evaluate
y_pred = model.predict(X_test_scaled)
print(f'R2 Score: {r2_score(y_test, y_pred):.3f}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}')

# 7. Feature importance
for name, coef in zip(features, model.coef_):
    print(f'{name}: {coef:.3f}')

This appendix covers only the most common operations in the libraries used throughout the book; for complete details, refer to each library's official documentation.