Appendix A: Python ML Library Quick Reference
This appendix is a cheat sheet, not a tutorial. Skim it when you need a pattern fast. Every snippet assumes standard imports and is ready to paste into a working script.
Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
train_test_split, StratifiedKFold, cross_val_score, GridSearchCV,
RandomizedSearchCV
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, average_precision_score, confusion_matrix,
classification_report, mean_squared_error, mean_absolute_error,
log_loss
)
scikit-learn
Pipeline with ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
# Columns routed to each preprocessing branch
numeric_features = ['tenure_months', 'avg_hours', 'support_tickets']
categorical_features = ['plan_type', 'primary_genre']
# Numeric branch: fill missing values with the median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical branch: impute the mode, then one-hot encode; categories
# unseen at fit time are encoded as all-zeros rather than raising
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
# Apply each branch to its column list and concatenate the outputs
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])
# Preprocessing + classifier fitted and applied as a single object
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=500, random_state=42))
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Column 1 of predict_proba is P(positive class)
y_proba = model.predict_proba(X_test)[:, 1]
Custom Transformer
from sklearn.base import BaseEstimator, TransformerMixin
class UsageTrendTransformer(BaseEstimator, TransformerMixin):
    """Derive a month-over-month usage-change feature.

    Adds a ``usage_trend`` column: the relative change from last month's
    hours to this month's. Zero denominators are mapped to 1 so the
    division never blows up.
    """

    def __init__(self, current_col='hours_current', previous_col='hours_previous'):
        self.current_col = current_col
        self.previous_col = previous_col

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X):
        out = X.copy()  # never mutate the caller's frame
        delta = out[self.current_col] - out[self.previous_col]
        denom = out[self.previous_col].replace(0, 1)
        out['usage_trend'] = delta / denom
        return out
Cross-Validation Patterns
# Stratified k-fold (classification): preserves class balance per fold
from sklearn.model_selection import StratifiedKFold, cross_val_score
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
print(f"AUC: {scores.mean():.3f} +/- {scores.std():.3f}")
# Group k-fold (repeated observations per entity)
from sklearn.model_selection import GroupKFold
cv = GroupKFold(n_splits=5)
# groups= keeps every row of one subscriber in the same fold (no leakage)
scores = cross_val_score(model, X, y, cv=cv, groups=subscriber_ids,
                         scoring='average_precision')
# Time series split: training folds always precede the validation fold
from sklearn.model_selection import TimeSeriesSplit
cv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
Evaluation Metrics
from sklearn.metrics import (
classification_report, confusion_matrix,
roc_auc_score, average_precision_score,
precision_recall_curve
)
# Full classification report
print(classification_report(y_test, y_pred, digits=3))
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Retained', 'Churned'],
yticklabels=['Retained', 'Churned'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
# AUC-ROC and AUC-PR
auc_roc = roc_auc_score(y_test, y_proba)
auc_pr = average_precision_score(y_test, y_proba)
# Optimal threshold from precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
# precision/recall have len(thresholds) + 1 entries (the last point is the
# degenerate recall=0 endpoint with no threshold). Drop it so the argmax
# index is always a valid index into `thresholds`.
f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
Learning and Validation Curves
from sklearn.model_selection import learning_curve, validation_curve
# Learning curve
train_sizes, train_scores, val_scores = learning_curve(
model, X, y, cv=5, scoring='roc_auc',
train_sizes=np.linspace(0.1, 1.0, 10), random_state=42
)
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training')
plt.plot(train_sizes, val_scores.mean(axis=1), label='Validation')
plt.xlabel('Training Set Size')
plt.ylabel('AUC-ROC')
plt.legend()
# Validation curve (hyperparameter effect)
param_range = [10, 50, 100, 200, 500]
train_scores, val_scores = validation_curve(
RandomForestClassifier(random_state=42), X, y,
param_name='n_estimators', param_range=param_range,
cv=5, scoring='roc_auc'
)
Permutation Importance
from sklearn.inspection import permutation_importance
result = permutation_importance(model, X_test, y_test,
n_repeats=10, random_state=42,
scoring='roc_auc')
sorted_idx = result.importances_mean.argsort()[::-1]
for i in sorted_idx[:10]:
print(f"{feature_names[i]}: {result.importances_mean[i]:.4f} "
f"+/- {result.importances_std[i]:.4f}")
Partial Dependence Plots
from sklearn.inspection import PartialDependenceDisplay
fig, ax = plt.subplots(figsize=(12, 4))
PartialDependenceDisplay.from_estimator(
model, X_train,
features=['tenure_months', 'avg_hours_last_30d'],
kind='both', # PDP + ICE
ax=ax
)
XGBoost
import xgboost as xgb
# Early stopping is configured on the estimator itself: passing
# early_stopping_rounds to fit() was removed in xgboost 2.0.
model = xgb.XGBClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,            # L1 regularization
    reg_lambda=1.0,           # L2 regularization
    scale_pos_weight=11,      # For imbalanced data (neg/pos ratio)
    eval_metric='logloss',
    early_stopping_rounds=50, # stop when val logloss stalls for 50 rounds
    random_state=42,
    n_jobs=-1
)
# Training with early stopping on the held-out validation set
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)
# Feature importance
xgb.plot_importance(model, max_num_features=15, importance_type='gain')
# Native DMatrix for performance-critical code
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
'objective': 'binary:logistic',
'eval_metric': 'logloss',
'max_depth': 6,
'learning_rate': 0.05,
'subsample': 0.8,
'colsample_bytree': 0.8,
}
bst = xgb.train(
params, dtrain,
num_boost_round=2000,
evals=[(dtest, 'test')],
early_stopping_rounds=50,
verbose_eval=100
)
LightGBM
import lightgbm as lgb
model = lgb.LGBMClassifier(
n_estimators=2000,
learning_rate=0.05,
max_depth=-1, # No limit (leaf-wise growth)
num_leaves=63, # Control complexity via num_leaves
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=0.1,
reg_lambda=1.0,
min_child_samples=20,
class_weight='balanced',
random_state=42,
n_jobs=-1,
verbose=-1
)
# Training with early stopping and categorical features
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
eval_metric='logloss',
callbacks=[
lgb.early_stopping(50),
lgb.log_evaluation(100)
],
categorical_feature=['plan_type', 'primary_genre'] # Native handling
)
# Native Dataset for full control
train_data = lgb.Dataset(X_train, label=y_train,
categorical_feature=['plan_type', 'primary_genre'])
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'learning_rate': 0.05,
'num_leaves': 63,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': -1
}
bst = lgb.train(
params, train_data,
num_boost_round=2000,
valid_sets=[val_data],
callbacks=[lgb.early_stopping(50)]
)
CatBoost
from catboost import CatBoostClassifier, Pool
cat_features = ['plan_type', 'primary_genre', 'country']
model = CatBoostClassifier(
iterations=2000,
learning_rate=0.05,
depth=8,
l2_leaf_reg=3,
auto_class_weights='Balanced',
cat_features=cat_features,
eval_metric='Logloss',
random_seed=42,
verbose=100
)
# CatBoost handles categoricals natively -- no encoding needed
model.fit(
X_train, y_train,
eval_set=(X_val, y_val),
early_stopping_rounds=50
)
# Pool objects for efficient data handling
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)
# Feature importance
feature_importance = model.get_feature_importance(
train_pool, type='ShapValues'
)
SHAP
import shap
# TreeSHAP for tree-based models (fast)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# For binary classification, shap_values may be a list [class_0, class_1]
# Use class 1 (positive class) for interpretation
if isinstance(shap_values, list):
shap_values = shap_values[1]
# Summary plot (global feature importance + direction)
shap.summary_plot(shap_values, X_test, max_display=15)
# Bar plot (global importance only)
shap.summary_plot(shap_values, X_test, plot_type='bar', max_display=15)
# Waterfall plot (single prediction explanation)
shap.waterfall_plot(shap.Explanation(
values=shap_values[0],
base_values=explainer.expected_value,
data=X_test.iloc[0],
feature_names=X_test.columns.tolist()
))
# Dependence plot (feature effect + interaction)
shap.dependence_plot('tenure_months', shap_values, X_test,
interaction_index='avg_hours_last_30d')
# Force plot (single prediction, horizontal)
shap.force_plot(explainer.expected_value, shap_values[0], X_test.iloc[0])
# KernelSHAP for any model (slower, model-agnostic)
explainer = shap.KernelExplainer(model.predict_proba, shap.sample(X_train, 100))
shap_values = explainer.shap_values(X_test[:50])
MLflow
Experiment Tracking
import mlflow
import mlflow.sklearn
# Set experiment
mlflow.set_experiment("streamflow-churn")
# Manual logging
with mlflow.start_run(run_name="xgb_v3_tuned"):
# Log parameters
mlflow.log_param("model_type", "XGBClassifier")
mlflow.log_param("learning_rate", 0.05)
mlflow.log_param("max_depth", 6)
mlflow.log_param("n_estimators", model.best_iteration)
# Log metrics
mlflow.log_metric("auc_roc", auc_roc)
mlflow.log_metric("auc_pr", auc_pr)
mlflow.log_metric("f1", f1)
mlflow.log_metric("precision", precision)
mlflow.log_metric("recall", recall)
# Log artifacts
plt.savefig("confusion_matrix.png")
mlflow.log_artifact("confusion_matrix.png")
# Log model
mlflow.sklearn.log_model(pipeline, "model")
Autologging
# One line to log everything automatically
mlflow.sklearn.autolog() # scikit-learn
mlflow.xgboost.autolog() # XGBoost
mlflow.lightgbm.autolog() # LightGBM
# Then train as usual -- parameters, metrics, and model are logged
with mlflow.start_run():
model.fit(X_train, y_train)
Model Registry
# Register a model
mlflow.register_model(
f"runs:/{run_id}/model",
"streamflow-churn-model"
)
# Transition to production
from mlflow.tracking import MlflowClient
client = MlflowClient()
client.transition_model_version_stage(
name="streamflow-churn-model",
version=3,
stage="Production"
)
# Load a production model
model = mlflow.pyfunc.load_model(
"models:/streamflow-churn-model/Production"
)
Comparing Runs
# Search runs programmatically
runs = mlflow.search_runs(
experiment_names=["streamflow-churn"],
filter_string="metrics.auc_roc > 0.82",
order_by=["metrics.auc_roc DESC"],
max_results=10
)
print(runs[['run_id', 'params.model_type', 'metrics.auc_roc', 'metrics.f1']])
FastAPI Model Serving
Basic Endpoint
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import joblib
import pandas as pd
app = FastAPI(title="StreamFlow Churn Prediction API", version="1.0.0")
# Load model at startup
model = joblib.load("model/churn_pipeline.joblib")
class CustomerFeatures(BaseModel):
    """Request schema for /predict; pydantic validates every field on ingest."""
    tenure_months: int = Field(..., ge=0, description="Months since signup")
    avg_hours_last_30d: float = Field(..., ge=0)
    support_tickets_last_90d: int = Field(..., ge=0)
    # Only the four known plan names pass validation
    plan_type: str = Field(..., pattern="^(free|basic|standard|premium)$")
    primary_genre: str
    days_since_last_login: int = Field(..., ge=0)
class ChurnPrediction(BaseModel):
    """Response schema: probability, a coarse risk band, and driver names."""
    churn_probability: float
    risk_level: str  # "low" / "medium" / "high", assigned by the endpoint
    top_factors: list[str]
@app.get("/health")
def health():
    """Liveness probe: report service status and the deployed model version."""
    payload = {"status": "healthy", "model_version": "v3"}
    return payload
@app.post("/predict", response_model=ChurnPrediction)
def predict(features: CustomerFeatures):
    """Score one customer: churn probability, risk band, and top drivers."""
    try:
        frame = pd.DataFrame([features.model_dump()])
        churn_p = model.predict_proba(frame)[0, 1]
        if churn_p > 0.7:
            band = "high"
        elif churn_p > 0.3:
            band = "medium"
        else:
            band = "low"
        return ChurnPrediction(
            churn_probability=round(churn_p, 4),
            risk_level=band,
            top_factors=_get_top_factors(frame)
        )
    except Exception as e:
        # Surface any scoring failure as a 500 with the error text
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/predict/batch", response_model=list[ChurnPrediction])
def predict_batch(features_list: list[CustomerFeatures]):
    """Score many customers in one call; factor attribution is skipped here."""
    frame = pd.DataFrame([item.model_dump() for item in features_list])
    probabilities = model.predict_proba(frame)[:, 1]
    results = []
    for p in probabilities:
        band = "high" if p > 0.7 else "medium" if p > 0.3 else "low"
        results.append(ChurnPrediction(
            churn_probability=round(p, 4),
            risk_level=band,
            top_factors=[]
        ))
    return results
Running the Server
# Development
uvicorn app:app --reload --host 0.0.0.0 --port 8000
# Production (with multiple workers)
uvicorn app:app --host 0.0.0.0 --port 8000 --workers 4
Optuna (Hyperparameter Optimization)
import optuna
def objective(trial):
    """Optuna objective: mean stratified-CV AUC for one sampled XGBoost config."""
    search_space = {
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    candidate = xgb.XGBClassifier(**search_space, random_state=42, n_jobs=-1)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = cross_val_score(candidate, X_train, y_train, cv=splitter,
                                scoring='roc_auc')
    return fold_aucs.mean()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=3600, show_progress_bar=True)
# Results
print(f"Best AUC: {study.best_trial.value:.4f}")
print(f"Best params: {study.best_trial.params}")
# Visualization
optuna.visualization.plot_param_importances(study)
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_parallel_coordinate(study)
Common Patterns
Save and Load a Pipeline
import joblib
# Save
joblib.dump(pipeline, 'model/churn_pipeline.joblib')
# Load
pipeline = joblib.load('model/churn_pipeline.joblib')
Reproducibility Checklist
# Set all random seeds
import random
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Pass random_state to every estimator and splitter
model = XGBClassifier(random_state=SEED)
cv = StratifiedKFold(random_state=SEED, shuffle=True)
# train_test_split returns FOUR arrays when given both X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
Calibration
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
# Calibrate a model
calibrated = CalibratedClassifierCV(model, cv=5, method='isotonic')
calibrated.fit(X_train, y_train)
# Plot calibration curve
prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)
plt.plot(prob_pred, prob_true, marker='o', label='Model')
plt.plot([0, 1], [0, 1], '--', label='Perfectly calibrated')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend()
Class Imbalance with imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
# imblearn Pipeline (not sklearn Pipeline) supports samplers
model = ImbPipeline([
('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),
('classifier', XGBClassifier(random_state=42))
])
# SMOTE is applied only during fit(), not during predict()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)