Appendix A: Python ML Library Quick Reference
This appendix is a cheat sheet, not a tutorial. Skim it when you need a pattern fast. Every snippet assumes standard imports and is ready to paste into a working script.
Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
train_test_split, StratifiedKFold, cross_val_score, GridSearchCV,
RandomizedSearchCV
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, average_precision_score, confusion_matrix,
classification_report, mean_squared_error, mean_absolute_error,
log_loss
)
scikit-learn
Pipeline with ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
# Columns routed to each preprocessing branch
numeric_features = ['tenure_months', 'avg_hours', 'support_tickets']
categorical_features = ['plan_type', 'primary_genre']
# Numeric branch: fill missing values with the median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical branch: impute the mode, then one-hot encode; categories
# unseen at fit time are encoded as all-zeros rather than raising
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
# Apply each branch to its column list and concatenate the outputs
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])
# Preprocessing + classifier fitted and applied as a single object
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=500, random_state=42))
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Column 1 of predict_proba is P(positive class)
y_proba = model.predict_proba(X_test)[:, 1]
Custom Transformer
from sklearn.base import BaseEstimator, TransformerMixin
class UsageTrendTransformer(BaseEstimator, TransformerMixin):
    """Derive a month-over-month usage-change feature.

    Adds a ``usage_trend`` column: the relative change from last month's
    hours to this month's. Zero denominators are mapped to 1 so the
    division never blows up.
    """

    def __init__(self, current_col='hours_current', previous_col='hours_previous'):
        self.current_col = current_col
        self.previous_col = previous_col

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X):
        out = X.copy()  # never mutate the caller's frame
        delta = out[self.current_col] - out[self.previous_col]
        denom = out[self.previous_col].replace(0, 1)
        out['usage_trend'] = delta / denom
        return out
Cross-Validation Patterns
# Stratified k-fold (classification): preserves class balance per fold
from sklearn.model_selection import StratifiedKFold, cross_val_score
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
print(f"AUC: {scores.mean():.3f} +/- {scores.std():.3f}")
# Group k-fold (repeated observations per entity)
from sklearn.model_selection import GroupKFold
cv = GroupKFold(n_splits=5)
# groups= keeps every row of one subscriber in the same fold (no leakage)
scores = cross_val_score(model, X, y, cv=cv, groups=subscriber_ids,
                         scoring='average_precision')
# Time series split: training folds always precede the validation fold
from sklearn.model_selection import TimeSeriesSplit
cv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
Evaluation Metrics
from sklearn.metrics import (
classification_report, confusion_matrix,
roc_auc_score, average_precision_score,
precision_recall_curve
)
# Full classification report
print(classification_report(y_test, y_pred, digits=3))
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Retained', 'Churned'],
yticklabels=['Retained', 'Churned'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
# AUC-ROC and AUC-PR
auc_roc = roc_auc_score(y_test, y_proba)
auc_pr = average_precision_score(y_test, y_proba)
# Optimal threshold from precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
# precision/recall have len(thresholds) + 1 entries (the last point is the
# degenerate recall=0 endpoint with no threshold). Drop it so the argmax
# index is always a valid index into `thresholds`.
f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
Learning and Validation Curves
from sklearn.model_selection import learning_curve, validation_curve
# Learning curve
train_sizes, train_scores, val_scores = learning_curve(
model, X, y, cv=5, scoring='roc_auc',
train_sizes=np.linspace(0.1, 1.0, 10), random_state=42
)
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training')
plt.plot(train_sizes, val_scores.mean(axis=1), label='Validation')
plt.xlabel('Training Set Size')
plt.ylabel('AUC-ROC')
plt.legend()
# Validation curve (hyperparameter effect)
param_range = [10, 50, 100, 200, 500]
train_scores, val_scores = validation_curve(
RandomForestClassifier(random_state=42), X, y,
param_name='n_estimators', param_range=param_range,
cv=5, scoring='roc_auc'
)
Permutation Importance
from sklearn.inspection import permutation_importance
result = permutation_importance(model, X_test, y_test,
n_repeats=10, random_state=42,
scoring='roc_auc')
sorted_idx = result.importances_mean.argsort()[::-1]
for i in sorted_idx[:10]:
print(f"{feature_names[i]}: {result.importances_mean[i]:.4f} "
f"+/- {result.importances_std[i]:.4f}")
Partial Dependence Plots
from sklearn.inspection import PartialDependenceDisplay
fig, ax = plt.subplots(figsize=(12, 4))
PartialDependenceDisplay.from_estimator(
model, X_train,
features=['tenure_months', 'avg_hours_last_30d'],
kind='both', # PDP + ICE
ax=ax
)
XGBoost
import xgboost as xgb
# Early stopping is configured on the estimator itself: passing
# early_stopping_rounds to fit() was removed in xgboost 2.0.
model = xgb.XGBClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,            # L1 regularization
    reg_lambda=1.0,           # L2 regularization
    scale_pos_weight=11,      # For imbalanced data (neg/pos ratio)
    eval_metric='logloss',
    early_stopping_rounds=50, # stop when val logloss stalls for 50 rounds
    random_state=42,
    n_jobs=-1
)
# Training with early stopping on the held-out validation set
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)
# Feature importance
xgb.plot_importance(model, max_num_features=15, importance_type='gain')
# Native DMatrix for performance-critical code
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
'objective': 'binary:logistic',
'eval_metric': 'logloss',
'max_depth': 6,
'learning_rate': 0.05,
'subsample': 0.8,
'colsample_bytree': 0.8,
}
bst = xgb.train(
params, dtrain,
num_boost_round=2000,
evals=[(dtest, 'test')],
early_stopping_rounds=50,
verbose_eval=100
)
LightGBM
import lightgbm as lgb
model = lgb.LGBMClassifier(
n_estimators=2000,
learning_rate=0.05,
max_depth=-1, # No limit (leaf-wise growth)
num_leaves=63, # Control complexity via num_leaves
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=0.1,
reg_lambda=1.0,
min_child_samples=20,
class_weight='balanced',
random_state=42,
n_jobs=-1,
verbose=-1
)
# Training with early stopping and categorical features
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
eval_metric='logloss',
callbacks=[
lgb.early_stopping(50),
lgb.log_evaluation(100)
],
categorical_feature=['plan_type', 'primary_genre'] # Native handling
)
# Native Dataset for full control
train_data = lgb.Dataset(X_train, label=y_train,
categorical_feature=['plan_type', 'primary_genre'])
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'learning_rate': 0.05,
'num_leaves': 63,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': -1
}
bst = lgb.train(
params, train_data,
num_boost_round=2000,
valid_sets=[val_data],
callbacks=[lgb.early_stopping(50)]
)
CatBoost
from catboost import CatBoostClassifier, Pool
cat_features = ['plan_type', 'primary_genre', 'country']
model = CatBoostClassifier(
iterations=2000,
learning_rate=0.05,
depth=8,
l2_leaf_reg=3,
auto_class_weights='Balanced',
cat_features=cat_features,
eval_metric='Logloss',
random_seed=42,
verbose=100
)
# CatBoost handles categoricals natively -- no encoding needed
model.fit(
X_train, y_train,
eval_set=(X_val, y_val),
early_stopping_rounds=50
)
# Pool objects for efficient data handling
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)
# Feature importance
feature_importance = model.get_feature_importance(
train_pool, type='ShapValues'
)
SHAP
import shap
# TreeSHAP for tree-based models (fast)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# For binary classification, shap_values may be a list [class_0, class_1]
# Use class 1 (positive class) for interpretation
if isinstance(shap_values, list):
shap_values = shap_values[1]
# Summary plot (global feature importance + direction)
shap.summary_plot(shap_values, X_test, max_display=15)
# Bar plot (global importance only)
shap.summary_plot(shap_values, X_test, plot_type='bar', max_display=15)
# Waterfall plot (single prediction explanation)
shap.waterfall_plot(shap.Explanation(
values=shap_values[0],
base_values=explainer.expected_value,
data=X_test.iloc[0],
feature_names=X_test.columns.tolist()
))
# Dependence plot (feature effect + interaction)
shap.dependence_plot('tenure_months', shap_values, X_test,
interaction_index='avg_hours_last_30d')
# Force plot (single prediction, horizontal)
shap.force_plot(explainer.expected_value, shap_values[0], X_test.iloc[0])
# KernelSHAP for any model (slower, model-agnostic)
explainer = shap.KernelExplainer(model.predict_proba, shap.sample(X_train, 100))
shap_values = explainer.shap_values(X_test[:50])
MLflow
Experiment Tracking
import mlflow
import mlflow.sklearn
# Set experiment
mlflow.set_experiment("streamflow-churn")
# Manual logging
with mlflow.start_run(run_name="xgb_v3_tuned"):
# Log parameters
mlflow.log_param("model_type", "XGBClassifier")
mlflow.log_param("learning_rate", 0.05)
mlflow.log_param("max_depth", 6)
mlflow.log_param("n_estimators", model.best_iteration)
# Log metrics
mlflow.log_metric("auc_roc", auc_roc)
mlflow.log_metric("auc_pr", auc_pr)
mlflow.log_metric("f1", f1)
mlflow.log_metric("precision", precision)
mlflow.log_metric("recall", recall)
# Log artifacts
plt.savefig("confusion_matrix.png")
mlflow.log_artifact("confusion_matrix.png")
# Log model
mlflow.sklearn.log_model(pipeline, "model")
Autologging
# One line to log everything automatically
mlflow.sklearn.autolog() # scikit-learn
mlflow.xgboost.autolog() # XGBoost
mlflow.lightgbm.autolog() # LightGBM
# Then train as usual -- parameters, metrics, and model are logged
with mlflow.start_run():
model.fit(X_train, y_train)
Model Registry
# Register a model
mlflow.register_model(
f"runs:/{run_id}/model",
"streamflow-churn-model"
)
# Transition to production
from mlflow.tracking import MlflowClient
client = MlflowClient()
client.transition_model_version_stage(
name="streamflow-churn-model",
version=3,
stage="Production"
)
# Load a production model
model = mlflow.pyfunc.load_model(
"models:/streamflow-churn-model/Production"
)
Comparing Runs
# Search runs programmatically
runs = mlflow.search_runs(
experiment_names=["streamflow-churn"],
filter_string="metrics.auc_roc > 0.82",
order_by=["metrics.auc_roc DESC"],
max_results=10
)
print(runs[['run_id', 'params.model_type', 'metrics.auc_roc', 'metrics.f1']])
FastAPI Model Serving
Basic Endpoint
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import joblib
import pandas as pd
app = FastAPI(title="StreamFlow Churn Prediction API", version="1.0.0")
# Load model at startup
model = joblib.load("model/churn_pipeline.joblib")
class CustomerFeatures(BaseModel):
    """Request schema for /predict; pydantic validates every field on ingest."""
    tenure_months: int = Field(..., ge=0, description="Months since signup")
    avg_hours_last_30d: float = Field(..., ge=0)
    support_tickets_last_90d: int = Field(..., ge=0)
    # Only the four known plan names pass validation
    plan_type: str = Field(..., pattern="^(free|basic|standard|premium)$")
    primary_genre: str
    days_since_last_login: int = Field(..., ge=0)
class ChurnPrediction(BaseModel):
    """Response schema: probability, a coarse risk band, and driver names."""
    churn_probability: float
    risk_level: str  # "low" / "medium" / "high", assigned by the endpoint
    top_factors: list[str]
@app.get("/health")
def health():
    """Liveness probe: report service status and the deployed model version."""
    payload = {"status": "healthy", "model_version": "v3"}
    return payload
@app.post("/predict", response_model=ChurnPrediction)
def predict(features: CustomerFeatures):
    """Score one customer: churn probability, risk band, and top drivers."""
    try:
        frame = pd.DataFrame([features.model_dump()])
        churn_p = model.predict_proba(frame)[0, 1]
        if churn_p > 0.7:
            band = "high"
        elif churn_p > 0.3:
            band = "medium"
        else:
            band = "low"
        return ChurnPrediction(
            churn_probability=round(churn_p, 4),
            risk_level=band,
            top_factors=_get_top_factors(frame)
        )
    except Exception as e:
        # Surface any scoring failure as a 500 with the error text
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/predict/batch", response_model=list[ChurnPrediction])
def predict_batch(features_list: list[CustomerFeatures]):
    """Score many customers in one call; factor attribution is skipped here."""
    frame = pd.DataFrame([item.model_dump() for item in features_list])
    probabilities = model.predict_proba(frame)[:, 1]
    results = []
    for p in probabilities:
        band = "high" if p > 0.7 else "medium" if p > 0.3 else "low"
        results.append(ChurnPrediction(
            churn_probability=round(p, 4),
            risk_level=band,
            top_factors=[]
        ))
    return results
Running the Server
# Development
uvicorn app:app --reload --host 0.0.0.0 --port 8000
# Production (with multiple workers)
uvicorn app:app --host 0.0.0.0 --port 8000 --workers 4
Optuna (Hyperparameter Optimization)
import optuna
def objective(trial):
    """Optuna objective: mean stratified-CV AUC for one sampled XGBoost config."""
    search_space = {
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    candidate = xgb.XGBClassifier(**search_space, random_state=42, n_jobs=-1)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = cross_val_score(candidate, X_train, y_train, cv=splitter,
                                scoring='roc_auc')
    return fold_aucs.mean()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=3600, show_progress_bar=True)
# Results
print(f"Best AUC: {study.best_trial.value:.4f}")
print(f"Best params: {study.best_trial.params}")
# Visualization
optuna.visualization.plot_param_importances(study)
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_parallel_coordinate(study)
Common Patterns
Save and Load a Pipeline
import joblib
# Save
joblib.dump(pipeline, 'model/churn_pipeline.joblib')
# Load
pipeline = joblib.load('model/churn_pipeline.joblib')
Reproducibility Checklist
# Set all random seeds
import random
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Pass random_state to every estimator and splitter
model = XGBClassifier(random_state=SEED)
cv = StratifiedKFold(random_state=SEED, shuffle=True)
# train_test_split returns FOUR arrays when given both X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
Calibration
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
# Calibrate a model
calibrated = CalibratedClassifierCV(model, cv=5, method='isotonic')
calibrated.fit(X_train, y_train)
# Plot calibration curve
prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)
plt.plot(prob_pred, prob_true, marker='o', label='Model')
plt.plot([0, 1], [0, 1], '--', label='Perfectly calibrated')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend()
Class Imbalance with imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
# imblearn Pipeline (not sklearn Pipeline) supports samplers
model = ImbPipeline([
('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),
('classifier', XGBClassifier(random_state=42))
])
# SMOTE is applied only during fit(), not during predict()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)