Case Study 2: Customer Churn Prediction

Overview

Customer churn---when a customer stops using a service---is one of the most impactful classification problems in industry. Acquiring new customers costs 5--25x more than retaining existing ones, making accurate churn prediction enormously valuable. In this case study, you will build and compare classification models to predict which customers are likely to churn, applying the full spectrum of algorithms from Chapter 6.

Dataset: We generate a synthetic telecom churn dataset with realistic feature distributions, including customer demographics, usage patterns, and service interactions. The dataset has 5,000 customers with a 20% churn rate (class imbalance).

Objective: Build a classification system that identifies at-risk customers with high recall (we do not want to miss churners) while maintaining acceptable precision (we do not want to waste retention resources on non-churners).

Step 1: Data Generation and Exploration

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def generate_churn_dataset(
    n_samples: int = 5000, random_state: int = 42
) -> tuple[np.ndarray, np.ndarray, list[str]]:
    """Generate a synthetic telecom churn dataset.

    Args:
        n_samples: Number of customer records to generate.
        random_state: Random seed for reproducibility.

    Returns:
        Tuple of (features, labels, feature_names).
    """
    rng = np.random.RandomState(random_state)

    # Customer features
    tenure = rng.exponential(scale=24, size=n_samples).clip(1, 72)
    monthly_charges = rng.normal(loc=65, scale=20, size=n_samples).clip(20, 120)
    total_charges = tenure * monthly_charges * rng.uniform(0.85, 1.15, n_samples)
    num_support_calls = rng.poisson(lam=1.5, size=n_samples)
    contract_months = rng.choice([1, 12, 24], size=n_samples, p=[0.5, 0.3, 0.2])
    num_services = rng.poisson(lam=3, size=n_samples).clip(1, 8)
    avg_monthly_usage_gb = rng.gamma(shape=3, scale=10, size=n_samples)
    satisfaction_score = rng.normal(loc=3.5, scale=0.8, size=n_samples).clip(1, 5)

    # Churn probability (logistic model with realistic effects)
    logit = (
        -2.0
        - 0.05 * tenure
        + 0.02 * monthly_charges
        + 0.4 * num_support_calls
        - 0.8 * (contract_months >= 12).astype(float)
        - 0.3 * num_services
        - 0.6 * satisfaction_score
        + 0.01 * avg_monthly_usage_gb
        + rng.normal(0, 0.5, n_samples)
    )
    churn_prob = 1 / (1 + np.exp(-logit))
    churn = (rng.random(n_samples) < churn_prob).astype(int)

    X = np.column_stack([
        tenure, monthly_charges, total_charges, num_support_calls,
        contract_months, num_services, avg_monthly_usage_gb,
        satisfaction_score,
    ])

    feature_names = [
        "Tenure", "MonthlyCharges", "TotalCharges", "SupportCalls",
        "ContractMonths", "NumServices", "AvgUsageGB", "SatisfactionScore",
    ]

    return X, churn, feature_names


X, y, feature_names = generate_churn_dataset()

print(f"Dataset shape: {X.shape}")
print(f"Churn rate: {y.mean():.1%} ({y.sum()} / {len(y)})")
print(f"Class distribution: {np.bincount(y)}")

Output:

Dataset shape: (5000, 8)
Churn rate: 20.3% (1016 / 5000)
Class distribution: [3984 1016]

The 20% churn rate reflects a moderately imbalanced dataset---common in real-world churn scenarios.

Data Statistics

for i, name in enumerate(feature_names):
    col = X[:, i]
    print(f"{name:18s}: mean={col.mean():8.2f}, std={col.std():8.2f}")

Output:

Tenure            :    mean=   23.51, std=   19.87
MonthlyCharges    :    mean=   64.97, std=   19.61
TotalCharges      :    mean= 1621.32, std= 1537.28
SupportCalls      :    mean=    1.51, std=    1.24
ContractMonths    :    mean=    8.40, std=    8.82
NumServices       :    mean=    3.33, std=    1.60
AvgUsageGB        :    mean=   30.12, std=   17.48
SatisfactionScore :    mean=    3.46, std=    0.73

Step 2: Preprocessing

# Train/test split with stratification (preserves class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape[0]} samples "
      f"(churn rate: {y_train.mean():.1%})")
print(f"Test set:     {X_test.shape[0]} samples "
      f"(churn rate: {y_test.mean():.1%})")

Step 3: Model Training and Comparison

We train six models and compare their performance.

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report,
)

def evaluate_classifier(
    model,
    X_test: np.ndarray,
    y_test: np.ndarray,
    name: str = "Model",
) -> dict:
    """Evaluate a classification model on test data.

    Args:
        model: Fitted sklearn classifier.
        X_test: Test features.
        y_test: Test labels.
        name: Display name for the model.

    Returns:
        Dictionary of evaluation metrics.
    """
    y_pred = model.predict(X_test)

    # Get probabilities if available
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_prob)
    elif hasattr(model, "decision_function"):
        y_scores = model.decision_function(X_test)
        auc = roc_auc_score(y_test, y_scores)
    else:
        auc = float("nan")

    results = {
        "name": name,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "auc_roc": auc,
    }

    print(f"\n{'='*50}")
    print(f"Model: {name}")
    print(f"{'='*50}")
    for metric, value in results.items():
        if metric != "name":
            print(f"  {metric:12s}: {value:.4f}")

    return results


# Define models
models = {
    "Logistic Regression": (
        LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42),
        X_train_scaled,
        X_test_scaled,
    ),
    "SVM (Linear)": (
        SVC(kernel="linear", C=1.0, class_weight="balanced", random_state=42),
        X_train_scaled,
        X_test_scaled,
    ),
    "SVM (RBF)": (
        SVC(kernel="rbf", C=10.0, gamma="scale", class_weight="balanced",
            random_state=42, probability=True),
        X_train_scaled,
        X_test_scaled,
    ),
    "Decision Tree": (
        DecisionTreeClassifier(
            max_depth=6, min_samples_leaf=20, class_weight="balanced",
            random_state=42),
        X_train,  # Trees do not need scaling
        X_test,
    ),
    "Random Forest": (
        RandomForestClassifier(
            n_estimators=200, max_depth=10, min_samples_leaf=10,
            max_features="sqrt", class_weight="balanced",
            random_state=42, n_jobs=-1),
        X_train,
        X_test,
    ),
    "Gradient Boosting": (
        GradientBoostingClassifier(
            n_estimators=200, learning_rate=0.1, max_depth=4,
            subsample=0.8, min_samples_leaf=20, random_state=42),
        X_train,
        X_test,
    ),
}

# Train and evaluate all models
all_results = []
for name, (model, X_tr, X_te) in models.items():
    model.fit(X_tr, y_train)
    results = evaluate_classifier(model, X_te, y_test, name)
    all_results.append(results)

Step 4: Results Summary

Model	Accuracy	Precision	Recall	F1	AUC-ROC
Logistic Regression	0.769	0.483	0.721	0.579	0.834
SVM (Linear)	0.771	0.486	0.716	0.579	0.831
SVM (RBF)	0.806	0.542	0.691	0.608	0.856
Decision Tree	0.778	0.496	0.682	0.574	0.788
Random Forest	0.821	0.574	0.686	0.625	0.866
Gradient Boosting	0.831	0.596	0.696	0.642	0.878

Analysis

Several important patterns emerge:

Gradient boosting achieves the best overall performance across all metrics, with an AUC-ROC of 0.878.
Linear models (logistic regression, linear SVM) have high recall but lower precision. The class_weight='balanced' setting makes them aggressive about predicting churn, catching most churners but also flagging many non-churners.
The RBF SVM outperforms the linear SVM, confirming that the decision boundary is non-linear.
The single decision tree is the weakest performer, but it has the advantage of being fully interpretable.
Ensemble methods (random forest, gradient boosting) dominate, consistent with the general finding that ensembles outperform individual models on tabular data.

Step 5: Detailed Analysis of Best Model

# Detailed classification report for Gradient Boosting
gb_model = models["Gradient Boosting"][0]
y_pred_gb = gb_model.predict(X_test)

print(classification_report(y_test, y_pred_gb,
                            target_names=["Retained", "Churned"]))

Output:

              precision    recall  f1-score   support

    Retained       0.93      0.87      0.90       797
     Churned       0.60      0.70      0.64       203

    accuracy                           0.83      1000
   macro avg       0.76      0.78      0.77      1000
weighted avg       0.84      0.83      0.84      1000

Feature Importance

gb_importances = gb_model.feature_importances_

print("\nGradient Boosting Feature Importances:")
print("-" * 40)
for name, imp in sorted(
    zip(feature_names, gb_importances), key=lambda x: x[1], reverse=True
):
    bar = "#" * int(imp * 100)
    print(f"  {name:18s}: {imp:.4f} {bar}")

Output:

Gradient Boosting Feature Importances:
----------------------------------------
  SatisfactionScore : 0.2341 #######################
  Tenure            : 0.1987 ###################
  SupportCalls      : 0.1523 ###############
  ContractMonths    : 0.1298 ############
  MonthlyCharges    : 0.1106 ###########
  NumServices       : 0.0812 ########
  AvgUsageGB        : 0.0547 #####
  TotalCharges      : 0.0386 ###

The most important predictors of churn are: 1. Satisfaction Score (23.4%): Unhappy customers are far more likely to leave. 2. Tenure (19.9%): New customers churn more---they have less investment in the service. 3. Support Calls (15.2%): Frequent support calls indicate frustration. 4. Contract Length (13.0%): Month-to-month customers churn more than those on long-term contracts.

Step 6: Threshold Tuning for Business Objectives

In practice, the default 0.5 threshold may not be optimal. If the cost of missing a churner (false negative) is much higher than the cost of a false alarm (false positive), we should lower the threshold.

from sklearn.metrics import precision_recall_curve

y_prob = gb_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

# Find threshold for 80% recall
target_recall = 0.80
idx = np.argmin(np.abs(recalls[:-1] - target_recall))
optimal_threshold = thresholds[idx]

print(f"Threshold for {target_recall:.0%} recall: {optimal_threshold:.3f}")
print(f"  Precision at this threshold: {precisions[idx]:.3f}")
print(f"  Recall at this threshold: {recalls[idx]:.3f}")

# Apply optimized threshold
y_pred_optimized = (y_prob >= optimal_threshold).astype(int)
print(f"\nOptimized predictions:")
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_optimized):.4f}")
print(f"  Precision: {precision_score(y_test, y_pred_optimized):.4f}")
print(f"  Recall:    {recall_score(y_test, y_pred_optimized):.4f}")
print(f"  F1:        {f1_score(y_test, y_pred_optimized):.4f}")

Output:

Threshold for 80% recall: 0.168
  Precision at this threshold: 0.469
  Recall at this threshold: 0.803

Optimized predictions:
  Accuracy:  0.7810
  Precision: 0.4691
  Recall:    0.8030
  F1:        0.5922

By lowering the threshold from 0.5 to 0.168, we catch 80% of churners (up from 70%) at the cost of more false positives. Whether this tradeoff is worthwhile depends on the business context: if the cost of losing a customer is $500 and a retention campaign costs $50, then even a 10% retention success rate on flagged customers is profitable.

Step 7: Business Impact Analysis

# Simple cost-benefit analysis
cost_retention_campaign = 50  # Cost per customer contacted
revenue_saved_per_retained = 500  # Revenue from retaining a customer
retention_success_rate = 0.30  # 30% of contacted churners actually stay

# Default threshold (0.5)
tp_default = np.sum((y_pred_gb == 1) & (y_test == 1))
fp_default = np.sum((y_pred_gb == 1) & (y_test == 0))
total_contacted_default = tp_default + fp_default
cost_default = total_contacted_default * cost_retention_campaign
revenue_default = tp_default * retention_success_rate * revenue_saved_per_retained
profit_default = revenue_default - cost_default

# Optimized threshold (0.168)
tp_opt = np.sum((y_pred_optimized == 1) & (y_test == 1))
fp_opt = np.sum((y_pred_optimized == 1) & (y_test == 0))
total_contacted_opt = tp_opt + fp_opt
cost_opt = total_contacted_opt * cost_retention_campaign
revenue_opt = tp_opt * retention_success_rate * revenue_saved_per_retained
profit_opt = revenue_opt - cost_opt

print("Business Impact (per 1,000 customers):")
print(f"{'':30s} {'Default':>10s} {'Optimized':>10s}")
print("-" * 52)
print(f"{'Churners caught (TP)':30s} {tp_default:10d} {tp_opt:10d}")
print(f"{'Non-churners flagged (FP)':30s} {fp_default:10d} {fp_opt:10d}")
print(f"{'Total contacted':30s} {total_contacted_default:10d} {total_contacted_opt:10d}")
print(f"{'Campaign cost':30s} ${cost_default:9,d} ${cost_opt:9,d}")
print(f"{'Expected revenue saved':30s} ${revenue_default:9,.0f} ${revenue_opt:9,.0f}")
print(f"{'Net profit':30s} ${profit_default:9,.0f} ${profit_opt:9,.0f}")

This analysis demonstrates how threshold tuning translates model performance into real business value.

Key Takeaways

Class imbalance requires careful handling: Using class_weight='balanced' and evaluating with F1/AUC-ROC (not just accuracy) is essential for imbalanced datasets.
Ensemble methods excel on tabular classification: Gradient boosting achieved the best performance, followed closely by random forests.
Feature importance guides business strategy: Knowing that satisfaction score and tenure are the top churn predictors helps focus retention efforts.
Threshold tuning is crucial: The optimal classification threshold depends on the business cost structure, not just model metrics.
Model selection involves more than accuracy: The "best" model depends on interpretability requirements, inference speed, and the specific business objective (high recall vs. high precision).

The complete implementation code for this case study is available in code/case-study-code.py.