Appendix A: Python Reference

This appendix serves as a comprehensive quick-reference guide to every Python tool, library, and custom class used throughout AI & Machine Learning for Business. Whether you are debugging a lab exercise at midnight or refreshing your memory before a presentation, this is the page to bookmark. Code snippets are kept deliberately short; for full context and business applications, follow the chapter cross-references.


A.1 Python Basics Quick Reference

Variables and Data Types

# Numeric types
revenue = 1_250_000          # int (underscores improve readability)
growth_rate = 0.073          # float
complex_val = 3 + 4j         # complex (rarely used in business analytics)

# Boolean
is_churned = True

# Strings
company = "Acme Corp"
greeting = f"Welcome to {company}"   # f-string interpolation

# None
result = None

# Type checking
type(revenue)                # <class 'int'>
isinstance(growth_rate, float)  # True

Core Data Structures

# List — ordered, mutable
departments = ["Sales", "Marketing", "Engineering", "HR"]
departments.append("Finance")
departments[1:3]             # ["Marketing", "Engineering"]

# Tuple — ordered, immutable
coordinates = (40.7128, -74.0060)

# Dictionary — key-value pairs
employee = {
    "name": "Priya Sharma",
    "role": "Data Analyst",
    "tenure_years": 3
}
employee["role"]             # "Data Analyst"
employee.get("salary", 0)   # 0 (default when key is missing)

# Set — unique, unordered
unique_tags = {"ML", "NLP", "ML", "CV"}  # {"ML", "NLP", "CV"}

Operators

# Arithmetic
total = price * quantity
margin = revenue - cost
roi = (gain - cost) / cost   # returns float
floor_div = 17 // 5          # 3
remainder = 17 % 5           # 2
power = 2 ** 10              # 1024

# Comparison
x == y      # equality
x != y      # inequality
x >= y      # greater than or equal

# Logical
if revenue > 1_000_000 and growth_rate > 0.05:
    tier = "high"
elif revenue > 500_000 or is_strategic:
    tier = "medium"
else:
    tier = "low"

# Membership
"Sales" in departments       # True

# Walrus operator (Python 3.8+)
if (n := len(departments)) > 4:
    print(f"{n} departments found")

Control Flow

# if / elif / else
if score >= 90:
    grade = "A"
elif score >= 80:
    grade = "B"
else:
    grade = "C"

# for loop
for dept in departments:
    print(dept.upper())

# for with index
for i, dept in enumerate(departments, start=1):
    print(f"{i}. {dept}")

# while loop
attempts = 0
while attempts < 3:
    result = call_api()
    if result.ok:
        break
    attempts += 1

# Ternary expression
label = "Premium" if revenue > 1_000_000 else "Standard"

Functions

# Basic function
def calculate_cltv(avg_purchase: float,
                   frequency: float,
                   lifespan_years: float) -> float:
    """Return Customer Lifetime Value.

    CLTV = average purchase value x purchase frequency x expected
    customer lifespan in years.
    """
    annual_value = avg_purchase * frequency
    return annual_value * lifespan_years

# Default arguments
def greet(name: str, title: str = "Team Member") -> str:
    """Return a greeting that addresses *name* by *title*."""
    parts = ["Hello,", title, name]
    return " ".join(parts)

# *args and **kwargs
def log_metrics(*values, **labels):
    """Print each labelled metric as "key: value", one per line.

    *values* is accepted but not used; the signature illustrates the
    combined *args / **kwargs idiom.
    """
    for metric_name in labels:
        print(f"{metric_name}: {labels[metric_name]}")

# Lambda (anonymous function)
sort_key = lambda x: x["revenue"]
customers.sort(key=sort_key, reverse=True)

# Lambda in pandas context
df["margin_pct"] = df.apply(lambda row: row["profit"] / row["revenue"], axis=1)

List Comprehensions and Generators

# List comprehension
squares = [x ** 2 for x in range(10)]
high_value = [c for c in customers if c["revenue"] > 100_000]

# Dictionary comprehension
name_to_revenue = {c["name"]: c["revenue"] for c in customers}

# Set comprehension
unique_cities = {c["city"] for c in customers}

# Generator expression (memory-efficient for large datasets)
total = sum(c["revenue"] for c in customers)

String Formatting

name = "Widget Pro"
price = 49.99
units = 1_250

# f-strings (preferred)
print(f"Product: {name}, Price: ${price:.2f}, Units: {units:,}")
# Output: Product: Widget Pro, Price: $49.99, Units: 1,250

# Alignment and padding
print(f"{'Item':<20} {'Price':>10}")   # left-align, right-align
print(f"{name:<20} {price:>10.2f}")

# Multiline f-strings
summary = (
    f"Revenue: ${revenue:,.0f}\n"
    f"Growth:  {growth_rate:.1%}\n"
    f"Churned: {is_churned}"
)

# Percentage formatting
print(f"Accuracy: {0.9432:.1%}")       # "Accuracy: 94.3%"

Error Handling

# Basic try/except
try:
    result = revenue / num_customers
except ZeroDivisionError:
    result = 0.0

# Multiple exception types
try:
    data = pd.read_csv(filepath)
except FileNotFoundError:
    print(f"File not found: {filepath}")
    data = pd.DataFrame()
except pd.errors.ParserError as e:
    print(f"Parse error: {e}")
    data = pd.DataFrame()

# try/except/else/finally
try:
    model = joblib.load("model.pkl")
except Exception as e:
    logging.error(f"Model load failed: {e}")
    model = None
else:
    print("Model loaded successfully")
finally:
    print("Load attempt complete")

# Raising exceptions
def set_discount(rate: float) -> None:
    """Validate that *rate* is a proportion between 0 and 1 inclusive.

    Raises:
        ValueError: if *rate* falls outside the inclusive range 0-1.
    """
    if 0.0 <= rate <= 1.0:
        return
    raise ValueError(f"Discount rate must be 0-1, got {rate}")

Useful Built-in Functions

len(customers)               # number of items
max(revenues)                # largest value
min(revenues)                # smallest value
sum(revenues)                # total
sorted(revenues, reverse=True)  # sorted copy
abs(-42)                     # 42
round(3.14159, 2)            # 3.14
zip(names, scores)           # pair up two iterables
map(str.upper, names)        # apply function to each item
any([False, True, False])    # True (at least one)
all([True, True, True])      # True (every one)

A.2 pandas Quick Reference

Importing and Creating DataFrames

import pandas as pd
import numpy as np

# From CSV
df = pd.read_csv("sales_data.csv")
df = pd.read_csv("sales_data.csv", parse_dates=["order_date"], index_col="id")

# From Excel
df = pd.read_excel("report.xlsx", sheet_name="Q4")

# From dictionary
df = pd.DataFrame({
    "product": ["Widget", "Gadget", "Gizmo"],
    "revenue": [50000, 75000, 30000],
    "region": ["East", "West", "East"]
})

# From list of dictionaries
records = [
    {"name": "Alice", "score": 92},
    {"name": "Bob", "score": 85},
]
df = pd.DataFrame(records)

Inspection

df.head(10)                  # first 10 rows
df.tail(5)                   # last 5 rows
df.shape                     # (rows, columns)
df.columns                   # column names
df.dtypes                    # data types per column
df.info()                    # summary including non-null counts
df.describe()                # statistics for numeric columns
df.describe(include="object")  # statistics for categorical columns
df.nunique()                 # unique value counts per column
df.value_counts("region")    # frequency table for one column
df.sample(5)                 # random sample of 5 rows

Selection and Indexing

# Single column (returns Series)
df["revenue"]

# Multiple columns (returns DataFrame)
df[["product", "revenue"]]

# Row by label
df.loc[0]                    # row with index label 0
df.loc[0:5, "product":"revenue"]  # label-based slicing (inclusive)

# Row by position
df.iloc[0]                   # first row
df.iloc[0:5, 0:3]           # position-based slicing (exclusive end)

# Boolean indexing
high_rev = df[df["revenue"] > 50000]
east_high = df[(df["region"] == "East") & (df["revenue"] > 40000)]
selected = df[df["region"].isin(["East", "West"])]
missing = df[df["email"].isna()]

# Query syntax (alternative to boolean indexing)
df.query("revenue > 50000 and region == 'East'")

Adding and Modifying Columns

# New column from calculation
df["profit_margin"] = df["profit"] / df["revenue"]

# Conditional column
df["tier"] = np.where(df["revenue"] > 50000, "High", "Standard")

# Multiple conditions
conditions = [
    df["revenue"] > 100000,
    df["revenue"] > 50000,
]
choices = ["Premium", "Standard"]
df["tier"] = np.select(conditions, choices, default="Basic")

# Apply a function
df["name_upper"] = df["name"].apply(str.upper)

# Rename columns
df = df.rename(columns={"old_name": "new_name"})

# Drop columns
df = df.drop(columns=["temp_col", "debug_col"])

Groupby and Aggregation

# Single aggregation
df.groupby("region")["revenue"].sum()

# Multiple aggregations
summary = df.groupby("region").agg(
    total_revenue=("revenue", "sum"),
    avg_revenue=("revenue", "mean"),
    num_orders=("order_id", "count"),
    max_deal=("revenue", "max")
).reset_index()

# Multiple groupby columns
df.groupby(["region", "product"])["revenue"].mean()

# Transform (returns same-shaped Series)
df["pct_of_region"] = (
    df["revenue"] / df.groupby("region")["revenue"].transform("sum")
)

# Pivot table
pivot = df.pivot_table(
    values="revenue",
    index="region",
    columns="quarter",
    aggfunc="sum",
    fill_value=0,
    margins=True          # adds row/column totals
)

Merging and Joining

# Inner join (default)
merged = pd.merge(orders, customers, on="customer_id")

# Left join
merged = pd.merge(orders, customers, on="customer_id", how="left")

# Join on different column names
merged = pd.merge(
    orders, products,
    left_on="prod_code", right_on="product_id",
    how="left"
)

# Multiple join keys
merged = pd.merge(df1, df2, on=["year", "region"])

# Concatenation (stacking rows)
combined = pd.concat([df_q1, df_q2, df_q3, df_q4], ignore_index=True)

Missing Data

# Detection
df.isna().sum()              # count of NaN per column
df.isna().mean()             # fraction missing per column

# Dropping
df.dropna()                  # drop rows with any NaN
df.dropna(subset=["revenue", "region"])  # only check specific columns
df.dropna(thresh=3)          # keep rows with at least 3 non-NaN values

# Filling
df["revenue"].fillna(0)
df["region"].fillna("Unknown")
df["revenue"].fillna(df["revenue"].median())

# Forward/backward fill (time series)
df["stock_price"].ffill()
df["stock_price"].bfill()

# Interpolation
df["temperature"].interpolate(method="linear")

Sorting and Ranking

# Sort by column
df.sort_values("revenue", ascending=False)

# Sort by multiple columns
df.sort_values(["region", "revenue"], ascending=[True, False])

# Rank
df["revenue_rank"] = df["revenue"].rank(ascending=False, method="dense")

Date and Time Operations

# Parse dates
df["date"] = pd.to_datetime(df["date_str"])

# Extract components
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day_of_week"] = df["date"].dt.day_name()
df["quarter"] = df["date"].dt.quarter

# Date arithmetic
df["days_since"] = (pd.Timestamp.today() - df["date"]).dt.days

# Resample time series
monthly = df.set_index("date").resample("ME")["revenue"].sum()

Saving Data

df.to_csv("output.csv", index=False)
df.to_excel("output.xlsx", index=False, sheet_name="Results")
df.to_parquet("output.parquet")   # fast, compressed binary format

A.3 Visualization Quick Reference

matplotlib Basics

import matplotlib.pyplot as plt

# Figure and axes (recommended approach)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Quarterly Revenue by Region", fontsize=14)
ax.set_xlabel("Quarter")
ax.set_ylabel("Revenue ($)")
plt.tight_layout()
plt.savefig("chart.png", dpi=150, bbox_inches="tight")
plt.show()

matplotlib Plot Types

# Line plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(dates, revenue, marker="o", linewidth=2, label="Revenue")
ax.plot(dates, forecast, linestyle="--", label="Forecast")
ax.legend()

# Bar chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(regions, totals, color=["#2196F3", "#4CAF50", "#FF9800", "#F44336"])
ax.bar_label(ax.containers[0], fmt="${:,.0f}")  # value labels on bars (brace-style fmt, matplotlib 3.7+)

# Horizontal bar chart
ax.barh(categories, values)

# Grouped bar chart
x = np.arange(len(regions))
width = 0.35
ax.bar(x - width / 2, q1_values, width, label="Q1")
ax.bar(x + width / 2, q2_values, width, label="Q2")
ax.set_xticks(x)
ax.set_xticklabels(regions)
ax.legend()

# Scatter plot
ax.scatter(df["spend"], df["revenue"], alpha=0.6, c=df["cluster"],
           cmap="viridis", s=50)

# Histogram
ax.hist(df["revenue"], bins=30, edgecolor="white", alpha=0.7)

# Pie chart (use sparingly)
ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90)
ax.axis("equal")

# Subplots grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes[0, 0].bar(x, y1)
axes[0, 0].set_title("Chart A")
axes[0, 1].plot(x, y2)
axes[0, 1].set_title("Chart B")
# ... etc.
plt.tight_layout()

matplotlib Styling

# Color palettes
colors = plt.cm.Set2(np.linspace(0, 1, 8))

# Grid
ax.grid(True, alpha=0.3, linestyle="--")

# Axis formatting
from matplotlib.ticker import FuncFormatter
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"${x:,.0f}"))
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{x:.0%}"))

# Rotate tick labels
plt.xticks(rotation=45, ha="right")

# Annotation
ax.annotate("Peak", xy=(peak_x, peak_y),
            xytext=(peak_x + 1, peak_y + 500),
            arrowprops=dict(arrowstyle="->"), fontsize=10)

# Horizontal/vertical reference lines
ax.axhline(y=target, color="red", linestyle="--", label="Target")
ax.axvline(x=launch_date, color="gray", linestyle=":", alpha=0.7)

seaborn Plots

import seaborn as sns

# Set global theme
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)

# Distribution plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="revenue", hue="region", kde=True, ax=ax)

# Box plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="region", y="revenue", ax=ax)

# Violin plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, x="department", y="salary", ax=ax)

# Pair plot (scatterplot matrix)
sns.pairplot(df[["revenue", "spend", "satisfaction", "segment"]],
             hue="segment", diag_kind="kde")

# Heatmap (correlation matrix)
fig, ax = plt.subplots(figsize=(10, 8))
corr = df[numeric_cols].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="RdBu_r",
            center=0, vmin=-1, vmax=1, ax=ax)

# Count plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="product_category",
              order=df["product_category"].value_counts().index, ax=ax)

# Regression plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x="ad_spend", y="revenue", ax=ax,
            scatter_kws={"alpha": 0.5}, line_kws={"color": "red"})

# Bar plot with confidence intervals
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="region", y="revenue", hue="year",
            estimator="mean", errorbar="sd", ax=ax)

A.4 scikit-learn Quick Reference

Train/Test Split

from sklearn.model_selection import train_test_split

X = df.drop(columns=["target"])
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify for classification
)

The Universal Model Pattern

Every scikit-learn estimator follows the same three-step interface. This pattern appears in nearly every chapter of the book.

from sklearn.ensemble import RandomForestClassifier

# 1. Instantiate
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)

# 2. Fit
model.fit(X_train, y_train)

# 3. Predict and evaluate
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # probability of positive class
score = model.score(X_test, y_test)           # default metric (accuracy or R^2)

Classification Models

# Logistic Regression (Ch. 6, 7)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1.0, max_iter=1000, random_state=42)

# Random Forest Classifier (Ch. 7, 11)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    n_estimators=200, max_depth=10, min_samples_leaf=5, random_state=42
)

# Gradient Boosting / XGBoost (Ch. 7, 11, 12)
from xgboost import XGBClassifier
model = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric="logloss", random_state=42
)

# Support Vector Machine (Ch. 11)
from sklearn.svm import SVC
model = SVC(kernel="rbf", C=1.0, probability=True, random_state=42)

# Naive Bayes (Ch. 14 — text classification)
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=1.0)

# K-Nearest Neighbors (Ch. 11)
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5, metric="minkowski")

Regression Models

# Linear Regression (Ch. 6)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Ridge Regression (L2 regularization) (Ch. 6)
from sklearn.linear_model import Ridge
model = Ridge(alpha=1.0)

# Lasso Regression (L1 regularization, feature selection) (Ch. 6)
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.1)

# Random Forest Regressor (Ch. 8)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)

# XGBoost Regressor (Ch. 8)
from xgboost import XGBRegressor
model = XGBRegressor(
    n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42
)

Clustering

# K-Means (Ch. 9)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_scaled)
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_

# Elbow method
inertias = []
K_range = range(2, 11)
for k in K_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_scaled)
    inertias.append(km.inertia_)

# Silhouette score
from sklearn.metrics import silhouette_score
sil = silhouette_score(X_scaled, labels)

# DBSCAN (Ch. 9)
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.5, min_samples=5)
labels = db.fit_predict(X_scaled)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

Evaluation Metrics — Classification

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, roc_curve
)

# Individual metrics
accuracy  = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="binary")
recall    = recall_score(y_test, y_pred, average="binary")
f1        = f1_score(y_test, y_pred, average="binary")
auc       = roc_auc_score(y_test, y_proba)

# Full classification report
print(classification_report(y_test, y_pred, target_names=["Retained", "Churned"]))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Retained", "Churned"],
            yticklabels=["Retained", "Churned"])

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

Evaluation Metrics — Regression

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    mean_absolute_percentage_error, root_mean_squared_error
)

mae  = mean_absolute_error(y_test, y_pred)
mse  = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)  # squared=False was removed in scikit-learn 1.6
r2   = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

Cross-Validation

from sklearn.model_selection import cross_val_score, StratifiedKFold

# Quick cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring="f1")
print(f"Mean F1: {scores.mean():.3f} (+/- {scores.std():.3f})")

# Custom folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring="roc_auc")

Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Grid search
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, None],
    "min_samples_leaf": [1, 2, 5]
}
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid, cv=5, scoring="f1", n_jobs=-1, verbose=1
)
grid.fit(X_train, y_train)
print(grid.best_params_)
best_model = grid.best_estimator_

# Randomized search (faster for large grids)
from scipy.stats import randint, uniform
param_dist = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(3, 20),
    "learning_rate": uniform(0.01, 0.3)
}
search = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_dist, n_iter=50, cv=5, scoring="f1",
    n_jobs=-1, random_state=42
)
search.fit(X_train, y_train)

Preprocessing and Pipelines

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numeric features
numeric_features = ["revenue", "tenure", "usage_hours"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features
categorical_features = ["region", "plan_type"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

# Combined preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Full pipeline with model
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

Feature Importance

# Tree-based importance
importances = model.feature_importances_
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
feat_imp.head(15).plot(kind="barh")

# Permutation importance (model-agnostic)
from sklearn.inspection import permutation_importance
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
perm_imp = pd.Series(result.importances_mean, index=feature_names).sort_values(ascending=False)

Model Persistence

import joblib

# Save
joblib.dump(pipeline, "churn_model_v1.pkl")

# Load
loaded_pipeline = joblib.load("churn_model_v1.pkl")
predictions = loaded_pipeline.predict(new_data)

A.5 NLP Quick Reference

NLTK Basics

import nltk

# Download required data (run once)
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

# Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
tokens = word_tokenize("The product exceeded our expectations.")
sentences = sent_tokenize(paragraph)

# Stopword removal
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
filtered = [w for w in tokens if w.lower() not in stop_words]

# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in tokens]

# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in tokens]

Text Preprocessing Pipeline

import re

def preprocess_text(text: str) -> str:
    """Standard text cleaning pipeline used throughout Chapters 13-15.

    Lowercases the input, strips URLs and non-alphabetic characters,
    collapses whitespace, then drops stopwords and tokens of one or
    two characters.
    """
    cleaned = re.sub(r"http\S+|www\.\S+", "", text.lower())   # remove URLs
    cleaned = re.sub(r"[^a-z\s]", "", cleaned)                # remove non-alpha
    cleaned = re.sub(r"\s+", " ", cleaned).strip()            # collapse whitespace
    kept = [
        token for token in word_tokenize(cleaned)
        if token not in stop_words and len(token) > 2
    ]
    return " ".join(kept)

TF-IDF Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

# Basic TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_tfidf = vectorizer.fit_transform(df["review_text"])

# With n-grams
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),        # unigrams and bigrams
    min_df=5,                  # ignore terms in fewer than 5 docs
    max_df=0.95,               # ignore terms in more than 95% of docs
    stop_words="english"
)

# Feature names
feature_names = vectorizer.get_feature_names_out()

# In a pipeline
from sklearn.naive_bayes import MultinomialNB
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
    ("clf", MultinomialNB())
])
text_pipeline.fit(X_train_text, y_train)

Sentiment Analysis (VADER)

from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

sia = SentimentIntensityAnalyzer()
scores = sia.polarity_scores("This product is absolutely amazing!")
# {'neg': 0.0, 'neu': 0.296, 'pos': 0.704, 'compound': 0.7783}

# Apply to a DataFrame column
df["sentiment"] = df["review"].apply(lambda x: sia.polarity_scores(x)["compound"])
df["sentiment_label"] = df["sentiment"].apply(
    lambda x: "positive" if x > 0.05 else ("negative" if x < -0.05 else "neutral")
)

spaCy Patterns

import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple reported $394 billion in revenue for fiscal year 2022.")

# Named Entity Recognition
for ent in doc.ents:
    print(f"{ent.text:20s} {ent.label_:10s}")
# Apple                ORG
# $394 billion         MONEY
# fiscal year 2022     DATE

# Part-of-speech tagging
for token in doc:
    print(f"{token.text:15s} {token.pos_:6s} {token.dep_:10s}")

# Noun chunks
for chunk in doc.noun_chunks:
    print(chunk.text)

# Similarity (requires medium or large model)
nlp_md = spacy.load("en_core_web_md")
doc1 = nlp_md("customer churn prediction")
doc2 = nlp_md("predicting client attrition")
print(f"Similarity: {doc1.similarity(doc2):.3f}")

A.6 LLM API Quick Reference

OpenAI API — Chat Completion

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from environment

# Basic completion
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a business analyst assistant."},
        {"role": "user", "content": "Summarize the key risks in this quarterly report."}
    ],
    temperature=0.3,
    max_tokens=1000
)
answer = response.choices[0].message.content

OpenAI API — Structured Output

from pydantic import BaseModel

class MarketAnalysis(BaseModel):
    """Target schema for structured extraction from a market report.

    Passed as ``response_format`` so the model's output is parsed and
    validated into typed fields instead of free text.
    """
    market_size: float        # dollar amount (rendered with $ below)
    growth_rate: float        # fraction, e.g. 0.07 (rendered as 7.0% below)
    key_competitors: list[str]
    risk_level: str           # free-form label — presumably "low"/"medium"/"high"; depends on prompt
    summary: str              # short prose summary written by the model

response = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "Extract market analysis data from the text."},
        {"role": "user", "content": report_text}
    ],
    response_format=MarketAnalysis
)
analysis = response.choices[0].message.parsed
print(f"Market size: ${analysis.market_size:,.0f}")
print(f"Growth rate: {analysis.growth_rate:.1%}")

OpenAI API — Function Calling

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_customer_data",
            "description": "Retrieve customer data by customer ID.",
            "parameters": {
                "type": "object",
                "properties": {
                    "customer_id": {
                        "type": "string",
                        "description": "The unique customer identifier"
                    },
                    "include_history": {
                        "type": "boolean",
                        "description": "Whether to include transaction history"
                    }
                },
                "required": ["customer_id"]
            }
        }
    }
]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=messages,
    tools=tools,
    tool_choice="auto"
)

# Check if the model wants to call a function
if response.choices[0].message.tool_calls:
    tool_call = response.choices[0].message.tool_calls[0]
    function_name = tool_call.function.name
    arguments = json.loads(tool_call.function.arguments)
    # Execute the function and pass result back
    result = get_customer_data(**arguments)
    messages.append(response.choices[0].message)
    messages.append({
        "role": "tool",
        "tool_call_id": tool_call.id,
        "content": json.dumps(result)
    })

Anthropic API — Messages

from anthropic import Anthropic

client = Anthropic()  # reads ANTHROPIC_API_KEY from environment

response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    system="You are a business strategy advisor.",
    messages=[
        {"role": "user", "content": "What are the top 3 risks for a SaaS startup entering the healthcare market?"}
    ]
)
answer = response.content[0].text

Anthropic API — Structured Output with Tool Use

response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    tools=[
        {
            "name": "extract_financials",
            "description": "Extract financial metrics from a report.",
            "input_schema": {
                "type": "object",
                "properties": {
                    "revenue": {"type": "number", "description": "Total revenue in USD"},
                    "net_income": {"type": "number", "description": "Net income in USD"},
                    "yoy_growth": {"type": "number", "description": "Year-over-year growth rate"}
                },
                "required": ["revenue", "net_income"]
            }
        }
    ],
    messages=[
        {"role": "user", "content": f"Extract financials from this report:\n{report_text}"}
    ]
)

for block in response.content:
    if block.type == "tool_use":
        financials = block.input
        print(f"Revenue: ${financials['revenue']:,.0f}")

Common LLM Parameters

Parameter Typical Range Purpose
temperature 0.0 -- 1.0 Controls randomness. Use 0.0--0.3 for factual/analytical tasks, 0.5--0.8 for creative tasks.
max_tokens 100 -- 4096+ Maximum length of the response.
top_p 0.0 -- 1.0 Nucleus sampling. Alternative to temperature. Usually set one or the other, not both.
stop list of strings Sequences where the model stops generating.
presence_penalty -2.0 -- 2.0 Penalizes repeated topics. Higher values encourage new topics. (OpenAI)
frequency_penalty -2.0 -- 2.0 Penalizes repeated tokens. Higher values reduce repetition. (OpenAI)

Prompt Engineering Patterns

# Few-shot prompting
few_shot_prompt = """Classify the customer review as POSITIVE, NEGATIVE, or NEUTRAL.

Review: "The delivery was incredibly fast and the product quality is outstanding."
Classification: POSITIVE

Review: "It works okay but nothing special for the price."
Classification: NEUTRAL

Review: "{user_review}"
Classification:"""

# Chain-of-thought prompting
cot_prompt = """Analyze this business scenario step by step.

Scenario: {scenario}

Think through this step by step:
1. What are the key factors?
2. What are the potential outcomes?
3. What is your recommendation and why?"""

# Role-based prompting
system_msg = """You are a senior financial analyst at a Fortune 500 company.
You specialize in risk assessment and always support your conclusions with data.
When uncertain, you clearly state your confidence level."""

A.7 Fairness and Explainability Quick Reference

Fairlearn — Bias Assessment

from fairlearn.metrics import (
    MetricFrame,
    demographic_parity_difference,
    equalized_odds_difference,
    demographic_parity_ratio
)
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Compute metrics by group
metric_frame = MetricFrame(
    metrics={
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score
    },
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=df_test["gender"]
)

# View results
print(metric_frame.by_group)
print(f"Accuracy difference: {metric_frame.difference()['accuracy']:.3f}")
print(f"Accuracy ratio: {metric_frame.ratio()['accuracy']:.3f}")

# Demographic parity
dp_diff = demographic_parity_difference(
    y_test, y_pred, sensitive_features=df_test["gender"]
)
print(f"Demographic parity difference: {dp_diff:.3f}")

Fairlearn — Bias Mitigation

# Threshold optimization (post-processing)
from fairlearn.postprocessing import ThresholdOptimizer

mitigated = ThresholdOptimizer(
    estimator=model,
    constraints="demographic_parity",
    objective="accuracy_score",
    prefit=True
)
mitigated.fit(X_train, y_train, sensitive_features=train_sensitive)
y_pred_fair = mitigated.predict(X_test, sensitive_features=test_sensitive)

# Exponentiated Gradient (in-processing)
from fairlearn.reductions import ExponentiatedGradient, DemographicParity

mitigator = ExponentiatedGradient(
    estimator=LogisticRegression(max_iter=1000),
    constraints=DemographicParity()
)
mitigator.fit(X_train, y_train, sensitive_features=train_sensitive)

SHAP — SHapley Additive exPlanations

import shap

# TreeExplainer (fast for tree-based models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Summary plot (feature importance with direction)
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

# Bar plot (mean absolute SHAP values)
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Force plot (single prediction explanation)
shap.initjs()
shap.force_plot(
    explainer.expected_value, shap_values[0], X_test.iloc[0],
    feature_names=feature_names
)

# Waterfall plot (single prediction, newer API)
shap.plots.waterfall(shap.Explanation(
    values=shap_values[0],
    base_values=explainer.expected_value,
    data=X_test.iloc[0],
    feature_names=feature_names
))

# Dependence plot (one feature vs. SHAP value)
shap.dependence_plot("tenure_months", shap_values, X_test)

# KernelExplainer (model-agnostic, slower)
explainer = shap.KernelExplainer(model.predict_proba, shap.sample(X_train, 100))
shap_values = explainer.shap_values(X_test.iloc[:50])

LIME — Local Interpretable Model-agnostic Explanations

from lime.lime_tabular import LimeTabularExplainer

explainer = LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=feature_names,
    class_names=["Retained", "Churned"],
    mode="classification"
)

# Explain a single prediction
explanation = explainer.explain_instance(
    X_test.iloc[0].values,
    model.predict_proba,
    num_features=10,
    num_samples=5000
)

# Display
explanation.show_in_notebook()

# As a list
for feature, weight in explanation.as_list():
    print(f"{feature:40s} {weight:+.4f}")

# For text classification
from lime.lime_text import LimeTextExplainer
text_explainer = LimeTextExplainer(class_names=["Negative", "Positive"])
text_exp = text_explainer.explain_instance(
    review_text, pipeline.predict_proba, num_features=10
)

A.8 Book-Specific Tools Reference

This section documents every custom class built in the book's chapter exercises. Each entry includes the class signature, key methods, parameters, return types, and a usage example. Use this as an API reference when adapting the tools for your own projects.


EDAReport (Chapter 5)

Generates a comprehensive Exploratory Data Analysis report for any DataFrame, including summary statistics, missing-data analysis, distribution profiles, and a correlation matrix.

class EDAReport:
    """Automated Exploratory Data Analysis reporting tool.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset to analyze.
    target_col : str, optional
        The target/outcome column for supervised analysis context.
    """

    def __init__(self, df: pd.DataFrame, target_col: str | None = None) -> None:
        self.df = df
        self.target_col = target_col

    def summary_stats(self) -> pd.DataFrame:
        """Return descriptive statistics for all columns.

        Returns
        -------
        pd.DataFrame
            Extended describe() output including dtype, missing count,
            missing percentage, unique count, and skewness for numerics.
        """
        ...

    def missing_report(self) -> pd.DataFrame:
        """Return a DataFrame of columns with missing values.

        Returns
        -------
        pd.DataFrame
            Columns: column_name, missing_count, missing_pct, dtype.
            Sorted by missing_pct descending.
        """
        ...

    def correlation_matrix(self, method: str = "pearson",
                           threshold: float = 0.0) -> pd.DataFrame:
        """Compute correlation matrix for numeric columns.

        Parameters
        ----------
        method : str
            Correlation method: 'pearson', 'spearman', or 'kendall'.
        threshold : float
            Only return pairs with |correlation| >= threshold.

        Returns
        -------
        pd.DataFrame
            Correlation matrix or filtered pairs table.
        """
        ...

    def plot_distributions(self, cols: list[str] | None = None,
                           bins: int = 30) -> plt.Figure:
        """Plot histograms for numeric columns and bar charts for categoricals.

        Parameters
        ----------
        cols : list[str], optional
            Columns to plot. If None, plots all columns (max 20).
        bins : int
            Number of histogram bins for numeric columns.

        Returns
        -------
        matplotlib.figure.Figure
            The figure object containing the subplot grid.
        """
        ...

    def target_analysis(self) -> dict:
        """Analyze relationship between features and target column.

        Returns
        -------
        dict
            Keys: 'target_distribution' (value counts),
                  'numeric_correlations' (Series of correlations with target),
                  'categorical_associations' (dict of chi-squared p-values).
        """
        ...

    def full_report(self, save_path: str | None = None) -> dict:
        """Run all analyses and optionally save to HTML.

        Parameters
        ----------
        save_path : str, optional
            File path to save the HTML report.

        Returns
        -------
        dict
            Keys: 'summary', 'missing', 'correlations', 'target_analysis'.
        """
        ...

Usage:

from eda_report import EDAReport

report = EDAReport(df, target_col="churned")
stats = report.summary_stats()
report.plot_distributions(cols=["revenue", "tenure", "region"])
full = report.full_report(save_path="eda_output.html")

ChurnClassifier (Chapter 7)

End-to-end churn prediction pipeline that handles preprocessing, model training, evaluation, and feature importance analysis.

class ChurnClassifier:
    """End-to-end churn prediction pipeline.

    Parameters
    ----------
    numeric_features : list[str]
        Names of numeric columns.
    categorical_features : list[str]
        Names of categorical columns.
    model_type : str
        One of 'logistic', 'random_forest', 'xgboost'.
    random_state : int
        Seed for reproducibility.
    """

    def __init__(self, numeric_features: list[str],
                 categorical_features: list[str],
                 model_type: str = "random_forest",
                 random_state: int = 42) -> None:
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.model_type = model_type
        self.random_state = random_state
        self.pipeline = None
        self.results = None

    def build_pipeline(self, **model_params) -> "ChurnClassifier":
        """Construct the sklearn Pipeline with preprocessing and classifier.

        Parameters
        ----------
        **model_params
            Keyword arguments passed to the classifier constructor.

        Returns
        -------
        ChurnClassifier
            Self, for method chaining.
        """
        ...

    def train(self, X_train: pd.DataFrame,
              y_train: pd.Series) -> "ChurnClassifier":
        """Fit the pipeline on training data.

        Returns
        -------
        ChurnClassifier
            Self, for method chaining.
        """
        ...

    def evaluate(self, X_test: pd.DataFrame,
                 y_test: pd.Series) -> dict:
        """Evaluate model on test data.

        Returns
        -------
        dict
            Keys: 'accuracy', 'precision', 'recall', 'f1', 'roc_auc',
                  'confusion_matrix', 'classification_report'.
        """
        ...

    def feature_importance(self, top_n: int = 15) -> pd.DataFrame:
        """Extract and rank feature importances.

        Parameters
        ----------
        top_n : int
            Number of top features to return.

        Returns
        -------
        pd.DataFrame
            Columns: feature, importance. Sorted descending.
        """
        ...

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Return churn probabilities for new data.

        Returns
        -------
        np.ndarray
            Array of shape (n_samples,) containing the probability of churn
            (the positive class) for each row. Note this differs from the
            sklearn convention, where predict_proba returns (n_samples, 2).
        """
        ...

    def plot_roc_curve(self, X_test: pd.DataFrame,
                       y_test: pd.Series) -> plt.Figure:
        """Plot the ROC curve with AUC annotation.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

Usage:

from churn_classifier import ChurnClassifier

clf = ChurnClassifier(
    numeric_features=["tenure", "monthly_charges", "total_charges"],
    categorical_features=["contract", "payment_method"],
    model_type="xgboost"
)
clf.build_pipeline(n_estimators=300, max_depth=6, learning_rate=0.1)
clf.train(X_train, y_train)
results = clf.evaluate(X_test, y_test)
print(f"ROC AUC: {results['roc_auc']:.3f}")
top_features = clf.feature_importance(top_n=10)

DemandForecaster (Chapter 8)

Time-series demand forecasting tool supporting multiple model types, automatic feature engineering from date components, and forecast visualization.

class DemandForecaster:
    """Time-series demand forecasting tool.

    Parameters
    ----------
    date_col : str
        Name of the date/datetime column.
    target_col : str
        Name of the demand/quantity column.
    freq : str
        Frequency string: 'D' (daily), 'W' (weekly), 'ME' (monthly —
        the pandas month-end alias; older pandas versions use 'M').
    model_type : str
        One of 'linear', 'random_forest', 'xgboost'.
    """

    def __init__(self, date_col: str, target_col: str,
                 freq: str = "ME",
                 model_type: str = "xgboost") -> None:
        self.date_col = date_col
        self.target_col = target_col
        self.freq = freq
        self.model_type = model_type
        self.model = None
        self.feature_names = None

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create time-series features: lags, rolling means, date parts.

        Parameters
        ----------
        df : pd.DataFrame
            Must contain the date column and target column.

        Returns
        -------
        pd.DataFrame
            Original data augmented with engineered features.
        """
        ...

    def train(self, df: pd.DataFrame,
              test_size: float = 0.2) -> dict:
        """Train the forecasting model with time-aware split.

        Parameters
        ----------
        df : pd.DataFrame
            Historical demand data.
        test_size : float
            Fraction of data reserved for testing (taken from end).

        Returns
        -------
        dict
            Keys: 'train_rmse', 'test_rmse', 'train_mape', 'test_mape'.
        """
        ...

    def forecast(self, periods: int) -> pd.DataFrame:
        """Generate future demand forecasts.

        Parameters
        ----------
        periods : int
            Number of future periods to forecast.

        Returns
        -------
        pd.DataFrame
            Columns: date, predicted_demand, lower_bound, upper_bound.
        """
        ...

    def plot_forecast(self, historical: pd.DataFrame,
                      forecast: pd.DataFrame) -> plt.Figure:
        """Visualize historical data alongside forecast with confidence interval.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

Usage:

from demand_forecaster import DemandForecaster

forecaster = DemandForecaster(
    date_col="order_date", target_col="units_sold",
    freq="ME", model_type="xgboost"
)
metrics = forecaster.train(sales_df, test_size=0.2)
print(f"Test RMSE: {metrics['test_rmse']:,.0f}")

future = forecaster.forecast(periods=6)
forecaster.plot_forecast(sales_df, future)

CustomerSegmenter (Chapter 9)

RFM-based customer segmentation tool using K-Means clustering with automatic scaling, elbow analysis, and segment profiling.

class CustomerSegmenter:
    """RFM-based customer segmentation using K-Means.

    Parameters
    ----------
    n_segments : int
        Number of customer segments to create.
    features : list[str]
        Columns to use for clustering (e.g., RFM features).
    random_state : int
        Seed for reproducibility.
    """

    def __init__(self, n_segments: int = 4,
                 features: list[str] | None = None,
                 random_state: int = 42) -> None:
        self.n_segments = n_segments
        self.features = features
        self.random_state = random_state
        self.scaler = None
        self.kmeans = None
        self.segment_profiles = None

    def compute_rfm(self, df: pd.DataFrame,
                    customer_col: str, date_col: str,
                    amount_col: str,
                    reference_date: str | None = None) -> pd.DataFrame:
        """Compute Recency, Frequency, Monetary values per customer.

        Parameters
        ----------
        df : pd.DataFrame
            Transaction-level data.
        customer_col : str
            Column identifying the customer.
        date_col : str
            Transaction date column.
        amount_col : str
            Transaction amount column.
        reference_date : str, optional
            Date string used as the "as of" anchor when computing recency.
            Defaults to one day after the most recent transaction date.

        Returns
        -------
        pd.DataFrame
            One row per customer with columns: recency, frequency, monetary.
        """
        ...

    def find_optimal_k(self, X: pd.DataFrame,
                       k_range: range = range(2, 11)) -> plt.Figure:
        """Run elbow analysis and silhouette scoring.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix for clustering.
        k_range : range
            Range of k values to evaluate.

        Returns
        -------
        matplotlib.figure.Figure
            Dual-axis plot with inertia (elbow) and silhouette scores.
        """
        ...

    def fit(self, X: pd.DataFrame) -> "CustomerSegmenter":
        """Scale features and fit K-Means.

        Returns
        -------
        CustomerSegmenter
            Self, for method chaining.
        """
        ...

    def profile_segments(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate descriptive profiles for each segment.

        Returns
        -------
        pd.DataFrame
            Segment-level aggregation with mean feature values,
            segment size, and percentage of total.
        """
        ...

    def predict(self, X_new: pd.DataFrame) -> np.ndarray:
        """Assign new customers to existing segments.

        Returns
        -------
        np.ndarray
            Segment labels for each row.
        """
        ...

    def plot_segments(self, df: pd.DataFrame,
                      x_col: str, y_col: str) -> plt.Figure:
        """Scatter plot of customers colored by segment.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

Usage:

from customer_segmenter import CustomerSegmenter

segmenter = CustomerSegmenter(n_segments=4)
rfm = segmenter.compute_rfm(
    transactions, customer_col="customer_id",
    date_col="purchase_date", amount_col="amount"
)
segmenter.find_optimal_k(rfm)
segmenter.fit(rfm)
profiles = segmenter.profile_segments(rfm)
print(profiles)
segmenter.plot_segments(rfm, x_col="recency", y_col="monetary")

RecommendationEngine (Chapter 10)

Hybrid recommendation engine supporting collaborative filtering, content-based filtering, and a weighted hybrid approach.

class RecommendationEngine:
    """Hybrid recommendation engine for product/content recommendations.

    Parameters
    ----------
    method : str
        One of 'collaborative', 'content', 'hybrid'.
    n_recommendations : int
        Default number of recommendations to return.
    collaborative_weight : float
        Weight for collaborative filtering in hybrid mode (0.0 to 1.0).
        Content weight = 1.0 - collaborative_weight.
    """

    def __init__(self, method: str = "hybrid",
                 n_recommendations: int = 10,
                 collaborative_weight: float = 0.6) -> None:
        self.method = method
        self.n_recommendations = n_recommendations
        self.collaborative_weight = collaborative_weight
        self.user_item_matrix = None
        self.similarity_matrix = None
        self.content_features = None

    def fit_collaborative(self, interactions: pd.DataFrame,
                          user_col: str, item_col: str,
                          rating_col: str) -> "RecommendationEngine":
        """Build user-item interaction matrix and compute similarity.

        Parameters
        ----------
        interactions : pd.DataFrame
            User-item interaction data.
        user_col : str
            Column identifying the user.
        item_col : str
            Column identifying the item.
        rating_col : str
            Column with rating/interaction value.

        Returns
        -------
        RecommendationEngine
            Self, for method chaining.
        """
        ...

    def fit_content(self, items: pd.DataFrame,
                    item_col: str,
                    feature_cols: list[str]) -> "RecommendationEngine":
        """Build content-based feature vectors and similarity.

        Parameters
        ----------
        items : pd.DataFrame
            Item metadata.
        item_col : str
            Column identifying the item.
        feature_cols : list[str]
            Columns containing item features for similarity computation.

        Returns
        -------
        RecommendationEngine
            Self, for method chaining.
        """
        ...

    def recommend(self, user_id: str,
                  n: int | None = None,
                  exclude_seen: bool = True) -> pd.DataFrame:
        """Generate recommendations for a user.

        Parameters
        ----------
        user_id : str
            The user to generate recommendations for.
        n : int, optional
            Number of recommendations. Defaults to self.n_recommendations.
        exclude_seen : bool
            Whether to exclude items the user has already interacted with.

        Returns
        -------
        pd.DataFrame
            Columns: item_id, score, method.
            Sorted by score descending.
        """
        ...

    def similar_items(self, item_id: str,
                      n: int = 10) -> pd.DataFrame:
        """Find items similar to a given item.

        Returns
        -------
        pd.DataFrame
            Columns: item_id, similarity_score.
        """
        ...

    def evaluate(self, test_interactions: pd.DataFrame,
                 k: int = 10) -> dict:
        """Evaluate recommendation quality.

        Parameters
        ----------
        test_interactions : pd.DataFrame
            Held-out interaction data.
        k : int
            Number of recommendations to evaluate (precision@k, recall@k).

        Returns
        -------
        dict
            Keys: 'precision_at_k', 'recall_at_k', 'ndcg_at_k', 'coverage'.
        """
        ...

Usage:

from recommendation_engine import RecommendationEngine

engine = RecommendationEngine(method="hybrid", collaborative_weight=0.7)
engine.fit_collaborative(ratings, user_col="user_id",
                         item_col="product_id", rating_col="rating")
engine.fit_content(products, item_col="product_id",
                   feature_cols=["category", "brand", "price_tier"])

recs = engine.recommend(user_id="U1042", n=5)
print(recs)

similar = engine.similar_items(item_id="P2001", n=5)
metrics = engine.evaluate(test_ratings, k=10)
print(f"Precision@10: {metrics['precision_at_k']:.3f}")

ModelEvaluator (Chapter 11)

Comprehensive model evaluation toolkit that compares multiple models side-by-side with cross-validation, generates visual comparisons, and produces a summary report.

class ModelEvaluator:
    """Compare and evaluate multiple ML models side-by-side.

    Parameters
    ----------
    task : str
        One of 'classification' or 'regression'.
    cv_folds : int
        Number of cross-validation folds.
    scoring : str | list[str]
        Scoring metric(s) for evaluation.
    random_state : int
        Seed for reproducibility.
    """

    def __init__(self, task: str = "classification",
                 cv_folds: int = 5,
                 scoring: str | list[str] = "f1",
                 random_state: int = 42) -> None:
        self.task = task
        self.cv_folds = cv_folds
        self.scoring = scoring
        self.random_state = random_state
        self.models = {}
        self.results = {}

    def add_model(self, name: str, model: object) -> "ModelEvaluator":
        """Register a model for evaluation.

        Parameters
        ----------
        name : str
            Human-readable model name (e.g., "Random Forest").
        model : object
            A scikit-learn compatible estimator (or Pipeline).

        Returns
        -------
        ModelEvaluator
            Self, for method chaining.
        """
        ...

    def run_evaluation(self, X: pd.DataFrame,
                       y: pd.Series) -> pd.DataFrame:
        """Run cross-validated evaluation on all registered models.

        Returns
        -------
        pd.DataFrame
            One row per model with mean and std for each scoring metric.
        """
        ...

    def holdout_evaluation(self, X_train: pd.DataFrame,
                           y_train: pd.Series,
                           X_test: pd.DataFrame,
                           y_test: pd.Series) -> pd.DataFrame:
        """Train on training set and evaluate on held-out test set.

        Returns
        -------
        pd.DataFrame
            One row per model with test metrics.
        """
        ...

    def plot_comparison(self, metric: str | None = None) -> plt.Figure:
        """Box plot comparing model performance across CV folds.

        Parameters
        ----------
        metric : str, optional
            Metric to plot. Defaults to primary scoring metric.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def best_model(self, metric: str | None = None) -> tuple[str, object]:
        """Return the name and fitted instance of the best-performing model.

        Returns
        -------
        tuple[str, object]
            (model_name, model_instance)
        """
        ...

    def summary_report(self) -> str:
        """Generate a formatted text summary of all evaluations.

        Returns
        -------
        str
            Multi-line report with rankings, key metrics, and recommendation.
        """
        ...

Usage:

from model_evaluator import ModelEvaluator

evaluator = ModelEvaluator(task="classification", cv_folds=5, scoring=["f1", "roc_auc"])
evaluator.add_model("Logistic Regression", LogisticRegression(max_iter=1000))
evaluator.add_model("Random Forest", RandomForestClassifier(n_estimators=200))
evaluator.add_model("XGBoost", XGBClassifier(n_estimators=300, learning_rate=0.1))

cv_results = evaluator.run_evaluation(X, y)
print(cv_results)
evaluator.plot_comparison(metric="f1")

best_name, best_model = evaluator.best_model(metric="roc_auc")
print(f"Best model: {best_name}")

ReviewAnalyzer (Chapter 14)

NLP-powered customer review analysis tool that performs sentiment analysis, topic extraction, and trend detection on text data.

class ReviewAnalyzer:
    """NLP-powered customer review analysis tool.

    Parameters
    ----------
    text_col : str
        Name of the column containing review text.
    date_col : str, optional
        Name of the date column for trend analysis.
    rating_col : str, optional
        Name of the numeric rating column.
    """

    def __init__(self, text_col: str = "review_text",
                 date_col: str | None = None,
                 rating_col: str | None = None) -> None:
        self.text_col = text_col
        self.date_col = date_col
        self.rating_col = rating_col
        self.vectorizer = None
        self.sentiment_analyzer = None

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and tokenize review text.

        Returns
        -------
        pd.DataFrame
            Original DataFrame with added 'clean_text' and 'tokens' columns.
        """
        ...

    def analyze_sentiment(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute sentiment scores for each review.

        Returns
        -------
        pd.DataFrame
            Added columns: 'sentiment_score' (-1 to 1),
            'sentiment_label' (positive/negative/neutral).
        """
        ...

    def extract_topics(self, df: pd.DataFrame,
                       n_topics: int = 5,
                       n_words: int = 10,
                       method: str = "lda") -> dict:
        """Extract topics from the review corpus.

        Parameters
        ----------
        n_topics : int
            Number of topics to extract.
        n_words : int
            Number of top words per topic.
        method : str
            Topic modeling method: 'lda' or 'nmf'.

        Returns
        -------
        dict
            Keys: 'topics' (list of word lists), 'topic_labels' (list of str),
            'document_topics' (np.ndarray of shape (n_docs, n_topics)).
        """
        ...

    def extract_keywords(self, df: pd.DataFrame,
                         top_n: int = 20,
                         by_sentiment: bool = False) -> pd.DataFrame:
        """Extract most important keywords using TF-IDF.

        Parameters
        ----------
        top_n : int
            Number of keywords to return.
        by_sentiment : bool
            If True, return separate keyword lists for positive and
            negative reviews.

        Returns
        -------
        pd.DataFrame
            Columns: keyword, tfidf_score, (sentiment_label if by_sentiment).
        """
        ...

    def sentiment_trends(self, df: pd.DataFrame,
                         freq: str = "ME") -> pd.DataFrame:
        """Track sentiment over time.

        Parameters
        ----------
        freq : str
            Time frequency for aggregation.

        Returns
        -------
        pd.DataFrame
            Columns: date, avg_sentiment, review_count, pct_positive, pct_negative.
        """
        ...

    def plot_sentiment_distribution(self, df: pd.DataFrame) -> plt.Figure:
        """Visualize the distribution of sentiment scores.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def full_report(self, df: pd.DataFrame) -> dict:
        """Run all analyses and return combined results.

        Returns
        -------
        dict
            Keys: 'sentiment_summary', 'topics', 'keywords',
            'trends' (if date_col provided), 'rating_sentiment_correlation'
            (if rating_col provided).
        """
        ...

Usage:

from review_analyzer import ReviewAnalyzer

analyzer = ReviewAnalyzer(
    text_col="review_text", date_col="review_date", rating_col="stars"
)
df = analyzer.preprocess(reviews_df)
df = analyzer.analyze_sentiment(df)

topics = analyzer.extract_topics(df, n_topics=5, method="lda")
for i, topic_words in enumerate(topics["topics"]):
    print(f"Topic {i+1}: {', '.join(topic_words)}")

trends = analyzer.sentiment_trends(df, freq="ME")
report = analyzer.full_report(df)

PromptBuilder (Chapter 19)

Structured prompt construction tool for LLM interactions that supports templates, variable injection, few-shot examples, and system message configuration.

class PromptBuilder:
    """Structured prompt construction tool for LLM interactions.

    Parameters
    ----------
    model : str
        LLM model identifier (e.g., 'gpt-4o', 'claude-sonnet-4-20250514').
    default_temperature : float
        Default temperature for completions.
    """

    def __init__(self, model: str = "gpt-4o",
                 default_temperature: float = 0.3) -> None:
        self.model = model
        self.default_temperature = default_temperature
        self.templates = {}
        self.system_message = None

    def set_system_message(self, message: str) -> "PromptBuilder":
        """Set the system message for all prompts.

        Returns
        -------
        PromptBuilder
            Self, for method chaining.
        """
        ...

    def add_template(self, name: str, template: str,
                     required_vars: list[str] | None = None) -> "PromptBuilder":
        """Register a reusable prompt template.

        Parameters
        ----------
        name : str
            Template identifier.
        template : str
            Template string with {variable} placeholders.
        required_vars : list[str], optional
            List of required variable names. Validated at render time.

        Returns
        -------
        PromptBuilder
            Self, for method chaining.
        """
        ...

    def render(self, template_name: str,
               **variables) -> str:
        """Render a template with the provided variables.

        Parameters
        ----------
        template_name : str
            Name of a registered template.
        **variables
            Variable values to inject into the template.

        Returns
        -------
        str
            The rendered prompt string.

        Raises
        ------
        ValueError
            If required variables are missing.
        KeyError
            If the template name is not registered.
        """
        ...

    def build_messages(self, user_content: str,
                       few_shot_examples: list[dict] | None = None
                       ) -> list[dict]:
        """Build a complete messages list for the chat API.

        Parameters
        ----------
        user_content : str
            The user's prompt content.
        few_shot_examples : list[dict], optional
            List of {"user": ..., "assistant": ...} example pairs.

        Returns
        -------
        list[dict]
            Messages list ready for API call.
        """
        ...

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count for a given text.

        Returns
        -------
        int
            Approximate token count (using word-based heuristic).
        """
        ...

    def validate_prompt(self, prompt: str,
                        max_tokens: int = 4096) -> dict:
        """Check prompt for common issues.

        Returns
        -------
        dict
            Keys: 'is_valid' (bool), 'estimated_tokens' (int),
            'warnings' (list[str]).
        """
        ...

Usage:

from prompt_builder import PromptBuilder

builder = PromptBuilder(model="gpt-4o", default_temperature=0.3)
builder.set_system_message("You are a financial analyst specializing in SaaS metrics.")
builder.add_template(
    "analysis",
    "Analyze the following quarterly data for {company}:\n\n{data}\n\n"
    "Focus on: {focus_areas}\n\nProvide your analysis in a structured format.",
    required_vars=["company", "data", "focus_areas"]
)

prompt = builder.render(
    "analysis",
    company="Acme SaaS",
    data=quarterly_summary,
    focus_areas="churn rate trends, expansion revenue, CAC payback"
)

messages = builder.build_messages(prompt, few_shot_examples=[
    {"user": "Analyze Q1 data...", "assistant": "## Q1 Analysis\n..."}
])

PromptChain (Chapter 20)

Orchestration tool for multi-step LLM workflows that chains prompts together, passing outputs from one step as inputs to the next.

class PromptChain:
    """Orchestrate multi-step LLM workflows.

    Parameters
    ----------
    client : object
        An initialized LLM client (OpenAI or Anthropic).
    model : str
        Model identifier for all steps.
    verbose : bool
        If True, print intermediate results.
    """

    def __init__(self, client: object,
                 model: str = "gpt-4o",
                 verbose: bool = False) -> None:
        self.client = client
        self.model = model
        self.verbose = verbose
        self.steps = []
        self.results = {}

    def add_step(self, name: str, prompt_template: str,
                 input_map: dict[str, str] | None = None,
                 temperature: float = 0.3,
                 max_tokens: int = 1000,
                 parser: callable | None = None) -> "PromptChain":
        """Add a step to the chain.

        Parameters
        ----------
        name : str
            Unique step identifier.
        prompt_template : str
            Prompt with {variable} placeholders.
        input_map : dict[str, str], optional
            Mapping of template variables to previous step names.
            E.g., {"summary": "step_1"} fills {summary} with step_1's output.
        temperature : float
            Temperature for this step.
        max_tokens : int
            Max tokens for this step.
        parser : callable, optional
            Function to post-process the step's raw output.

        Returns
        -------
        PromptChain
            Self, for method chaining.
        """
        ...

    def run(self, initial_inputs: dict[str, str] | None = None) -> dict:
        """Execute all steps sequentially.

        Parameters
        ----------
        initial_inputs : dict[str, str], optional
            Variables available to the first step(s).

        Returns
        -------
        dict
            Keys are step names, values are step outputs (post-parsing).
        """
        ...

    def get_result(self, step_name: str) -> str:
        """Retrieve the output of a specific step.

        Returns
        -------
        str
            The output of the named step.

        Raises
        ------
        KeyError
            If the step has not been executed.
        """
        ...

    def retry_step(self, step_name: str,
                   temperature: float | None = None) -> str:
        """Re-execute a single step (e.g., if output was unsatisfactory).

        Parameters
        ----------
        step_name : str
            The step to re-run.
        temperature : float, optional
            Override temperature for the retry.

        Returns
        -------
        str
            New output for the step.
        """
        ...

    def total_tokens_used(self) -> dict:
        """Return token usage across all steps.

        Returns
        -------
        dict
            Keys: 'prompt_tokens', 'completion_tokens', 'total_tokens',
            'estimated_cost_usd'.
        """
        ...

Usage:

# Example: three-step extract -> analyze -> recommend pipeline, where each
# step's output feeds the next via input_map.
from prompt_chain import PromptChain
from openai import OpenAI

chain = PromptChain(client=OpenAI(), model="gpt-4o", verbose=True)

chain.add_step(
    name="extract",
    prompt_template="Extract key financial metrics from this report:\n\n{report}",
    temperature=0.1, max_tokens=500
)
chain.add_step(
    name="analyze",
    prompt_template="Given these metrics:\n{extract}\n\nIdentify the top 3 risks.",
    input_map={"extract": "extract"},
    temperature=0.3, max_tokens=800
)
chain.add_step(
    name="recommend",
    prompt_template="Based on this risk analysis:\n{analyze}\n\n"
                    "Recommend specific actions for the executive team.",
    input_map={"analyze": "analyze"},
    temperature=0.5, max_tokens=1000
)

# ``quarterly_report_text`` is assumed to be defined earlier in the session.
results = chain.run(initial_inputs={"report": quarterly_report_text})
print(results["recommend"])
print(chain.total_tokens_used())

BiasDetector (Chapter 25)

Automated tool for detecting and reporting bias in ML model predictions across protected attributes, integrating Fairlearn metrics with custom visualization and reporting.

class BiasDetector:
    """Detect and report bias in ML model predictions.

    Parameters
    ----------
    sensitive_features : list[str]
        Names of protected attribute columns (e.g., ['gender', 'race', 'age_group']).
    reference_groups : dict[str, str], optional
        Mapping of feature name to the reference group for ratio calculations.
        E.g., {'gender': 'Male', 'race': 'White'}.
    fairness_threshold : float
        Threshold for the four-fifths rule (default 0.8).
    """

    def __init__(self, sensitive_features: list[str],
                 reference_groups: dict[str, str] | None = None,
                 fairness_threshold: float = 0.8) -> None:
        self.sensitive_features = sensitive_features
        # ``or {}`` avoids sharing a mutable default across instances.
        self.reference_groups = reference_groups or {}
        self.fairness_threshold = fairness_threshold
        # Presumably populated by audit() and consumed by generate_report()
        # and suggest_mitigations() — implementation not shown in this listing.
        self.audit_results = {}

    def audit(self, y_true: pd.Series, y_pred: pd.Series,
              sensitive_data: pd.DataFrame) -> dict:
        """Run a comprehensive bias audit across all sensitive features.

        Parameters
        ----------
        y_true : pd.Series
            Ground truth labels.
        y_pred : pd.Series
            Model predictions.
        sensitive_data : pd.DataFrame
            DataFrame containing the sensitive feature columns.

        Returns
        -------
        dict
            Nested dict: {feature: {metric: value}} including:
            - demographic_parity_difference
            - demographic_parity_ratio
            - equalized_odds_difference
            - group_accuracy, group_precision, group_recall, group_f1
            - four_fifths_rule_pass (bool)
        """
        ...

    def group_metrics(self, y_true: pd.Series, y_pred: pd.Series,
                      sensitive_data: pd.DataFrame,
                      feature: str) -> pd.DataFrame:
        """Compute detailed metrics broken down by group.

        Returns
        -------
        pd.DataFrame
            One row per group with accuracy, precision, recall, F1,
            selection_rate, and sample_size.
        """
        ...

    def plot_disparities(self, feature: str | None = None) -> plt.Figure:
        """Visualize metric disparities across groups.

        Parameters
        ----------
        feature : str, optional
            Sensitive feature to plot. If None, plots all features.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def four_fifths_test(self, y_pred: pd.Series,
                         sensitive_data: pd.DataFrame,
                         feature: str) -> dict:
        """Apply the four-fifths (80%) rule test.

        Returns
        -------
        dict
            Keys: 'passes' (bool), 'selection_rates' (dict),
            'adverse_impact_ratio' (float), 'disadvantaged_groups' (list).
        """
        ...

    # NOTE(review): the ``format`` parameter shadows the builtin of the same
    # name; kept as-is because callers pass it by keyword (see Usage).
    def generate_report(self, format: str = "text") -> str:
        """Generate a formatted bias audit report.

        Parameters
        ----------
        format : str
            Output format: 'text', 'markdown', or 'html'.

        Returns
        -------
        str
            Complete bias audit report.
        """
        ...

    def suggest_mitigations(self) -> list[dict]:
        """Suggest bias mitigation strategies based on audit results.

        Returns
        -------
        list[dict]
            Each dict has keys: 'issue', 'severity' ('high'/'medium'/'low'),
            'strategy', 'implementation_notes'.
        """
        ...

Usage:

# Example: audit a churn model's predictions for gender and age-group bias.
# ``y_test``, ``y_pred``, and ``df_test`` are assumed to come from an earlier
# model-evaluation step.
from bias_detector import BiasDetector

detector = BiasDetector(
    sensitive_features=["gender", "age_group"],
    reference_groups={"gender": "Male", "age_group": "30-50"},
    fairness_threshold=0.8
)

audit = detector.audit(y_test, y_pred, df_test[["gender", "age_group"]])
print(f"Demographic parity (gender): {audit['gender']['demographic_parity_difference']:.3f}")

group_detail = detector.group_metrics(y_test, y_pred, df_test, feature="gender")
print(group_detail)

detector.plot_disparities(feature="gender")
report = detector.generate_report(format="markdown")
mitigations = detector.suggest_mitigations()

ExplainabilityDashboard (Chapter 26)

Unified explainability interface that wraps SHAP and LIME into a single API, producing global and local explanations along with matplotlib visualizations.

class ExplainabilityDashboard:
    """Unified model explainability interface combining SHAP and LIME.

    Parameters
    ----------
    model : object
        A fitted scikit-learn compatible model or pipeline.
    X_train : pd.DataFrame
        Training data (used as background for SHAP and LIME).
    feature_names : list[str]
        Feature names for display.
    task : str
        One of 'classification' or 'regression'.
    class_names : list[str], optional
        Class labels for classification tasks.
    """

    def __init__(self, model: object,
                 X_train: pd.DataFrame,
                 feature_names: list[str],
                 task: str = "classification",
                 class_names: list[str] | None = None) -> None:
        self.model = model
        self.X_train = X_train
        self.feature_names = feature_names
        self.task = task
        self.class_names = class_names
        # Presumably constructed lazily on first use — implementation not
        # shown in this listing; confirm against the full source.
        self.shap_explainer = None
        self.lime_explainer = None
        # Cache of SHAP values; feature_dependence() falls back to
        # "previously computed values" per its docstring.
        self.shap_values = None

    def compute_shap(self, X: pd.DataFrame,
                     method: str = "auto") -> np.ndarray:
        """Compute SHAP values for the given data.

        Parameters
        ----------
        X : pd.DataFrame
            Data to explain.
        method : str
            Explainer type: 'auto', 'tree', 'kernel', 'linear'.
            'auto' selects based on model type.

        Returns
        -------
        np.ndarray
            SHAP values array of shape (n_samples, n_features).
        """
        ...

    def global_importance(self, X: pd.DataFrame,
                          top_n: int = 15) -> pd.DataFrame:
        """Compute global feature importance from SHAP values.

        Returns
        -------
        pd.DataFrame
            Columns: feature, mean_abs_shap. Sorted descending.
        """
        ...

    def local_explanation(self, instance: pd.Series,
                          method: str = "shap",
                          num_features: int = 10) -> dict:
        """Explain a single prediction.

        Parameters
        ----------
        instance : pd.Series
            Single row of features.
        method : str
            Explanation method: 'shap', 'lime', or 'both'.
        num_features : int
            Number of top features to include.

        Returns
        -------
        dict
            Keys: 'prediction', 'probability' (classification),
            'feature_contributions' (list of (feature, value, contribution)),
            'base_value' (SHAP expected value).
        """
        ...

    def plot_global(self, X: pd.DataFrame,
                    plot_type: str = "summary") -> plt.Figure:
        """Create global explanation visualizations.

        Parameters
        ----------
        plot_type : str
            One of 'summary' (beeswarm), 'bar', 'heatmap'.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_local(self, instance: pd.Series,
                   plot_type: str = "waterfall") -> plt.Figure:
        """Create local explanation visualization for one prediction.

        Parameters
        ----------
        plot_type : str
            One of 'waterfall', 'force', 'bar'.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def feature_dependence(self, feature: str,
                           interaction_feature: str | None = None,
                           X: pd.DataFrame | None = None) -> plt.Figure:
        """SHAP dependence plot for a single feature.

        Parameters
        ----------
        feature : str
            Feature to plot on x-axis.
        interaction_feature : str, optional
            Feature for color-coding interaction effects.
        X : pd.DataFrame, optional
            Data to use. If None, uses previously computed values.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def compare_explanations(self, instance: pd.Series,
                             top_n: int = 10) -> pd.DataFrame:
        """Compare SHAP and LIME explanations side-by-side for one instance.

        Returns
        -------
        pd.DataFrame
            Columns: feature, shap_contribution, lime_contribution, agreement.
        """
        ...

Usage:

# Example: global + local explanations for a fitted churn pipeline.
# ``fitted_pipeline``, ``X_train``, ``X_test``, and ``feature_names`` are
# assumed to come from an earlier modeling step.
from explainability_dashboard import ExplainabilityDashboard

dashboard = ExplainabilityDashboard(
    model=fitted_pipeline,
    X_train=X_train,
    feature_names=feature_names,
    task="classification",
    class_names=["Retained", "Churned"]
)

# Global explanations
importance = dashboard.global_importance(X_test, top_n=15)
dashboard.plot_global(X_test, plot_type="summary")

# Local explanation for a single customer
explanation = dashboard.local_explanation(X_test.iloc[0], method="both")
dashboard.plot_local(X_test.iloc[0], plot_type="waterfall")

# Compare SHAP vs LIME
comparison = dashboard.compare_explanations(X_test.iloc[0])
print(comparison)

AIROICalculator (Chapter 34)

Financial modeling tool for calculating the return on investment of AI/ML initiatives, incorporating development costs, operational costs, productivity gains, and risk-adjusted projections.

class AIROICalculator:
    """Calculate ROI for AI/ML business initiatives.

    Parameters
    ----------
    project_name : str
        Name of the AI project.
    time_horizon_years : int
        Number of years for the projection.
    discount_rate : float
        Annual discount rate for NPV calculations.
    """

    def __init__(self, project_name: str,
                 time_horizon_years: int = 3,
                 discount_rate: float = 0.10) -> None:
        self.project_name = project_name
        self.time_horizon_years = time_horizon_years
        self.discount_rate = discount_rate
        # Populated via add_cost(), add_benefit(), and set_risk_factor();
        # internal layout is not shown in this listing.
        self.costs = {}
        self.benefits = {}
        self.risk_factors = {}

    def add_cost(self, category: str, year: int,
                 amount: float,
                 recurring: bool = False) -> "AIROICalculator":
        """Add a cost item.

        Parameters
        ----------
        category : str
            Cost category (e.g., 'development', 'infrastructure',
            'talent', 'data_acquisition', 'maintenance').
        year : int
            Year of the expenditure (0 = initial investment).
        amount : float
            Dollar amount.
        recurring : bool
            If True, repeats every year from 'year' onward.

        Returns
        -------
        AIROICalculator
            Self, for method chaining.
        """
        ...

    def add_benefit(self, category: str, year: int,
                    amount: float,
                    growth_rate: float = 0.0) -> "AIROICalculator":
        """Add a benefit/revenue item.

        Parameters
        ----------
        category : str
            Benefit category (e.g., 'revenue_increase', 'cost_savings',
            'productivity_gain', 'error_reduction').
        year : int
            First year the benefit is realized.
        amount : float
            Dollar amount in the first year.
        growth_rate : float
            Annual growth rate for the benefit (e.g., 0.1 for 10% growth).

        Returns
        -------
        AIROICalculator
            Self, for method chaining.
        """
        ...

    def set_risk_factor(self, scenario: str,
                        probability: float,
                        impact_multiplier: float) -> "AIROICalculator":
        """Define a risk scenario.

        Parameters
        ----------
        scenario : str
            Risk scenario name (e.g., 'adoption_delay', 'data_quality_issues').
        probability : float
            Estimated probability (0.0 to 1.0).
        impact_multiplier : float
            Multiplier on benefits (e.g., 0.5 means benefits cut in half).

        Returns
        -------
        AIROICalculator
            Self, for method chaining.
        """
        ...

    def calculate(self) -> dict:
        """Compute base-case ROI metrics.

        Returns
        -------
        dict
            Keys: 'total_costs', 'total_benefits', 'net_benefit',
            'roi_pct', 'npv', 'irr', 'payback_period_years',
            'yearly_cashflows' (list of dicts).
        """
        ...

    def risk_adjusted_calculate(self) -> dict:
        """Compute expected-value ROI incorporating risk factors.

        Returns
        -------
        dict
            Same keys as calculate() plus 'expected_npv',
            'risk_adjusted_roi_pct', 'scenario_analysis' (list of dicts).
        """
        ...

    def sensitivity_analysis(self, variable: str,
                             range_pct: float = 0.3,
                             steps: int = 10) -> pd.DataFrame:
        """Run sensitivity analysis on a single variable.

        Parameters
        ----------
        variable : str
            Variable to vary (e.g., 'discount_rate', a cost category, a benefit).
        range_pct : float
            Percentage range to vary the variable (+/- range_pct).
        steps : int
            Number of steps in the range.

        Returns
        -------
        pd.DataFrame
            Columns: variable_value, npv, roi_pct.
        """
        ...

    def plot_cashflows(self) -> plt.Figure:
        """Visualize projected cash flows over time.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_sensitivity(self, variable: str) -> plt.Figure:
        """Tornado/sensitivity chart for the specified variable.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def executive_summary(self) -> str:
        """Generate a plain-English executive summary.

        Returns
        -------
        str
            Formatted summary suitable for a slide deck or memo.
        """
        ...

Usage:

# Example: 3-year ROI model for a churn-prediction project, with two
# recurring cost lines, two growing benefit lines, and two risk scenarios.
from ai_roi_calculator import AIROICalculator

calc = AIROICalculator("Customer Churn Prediction", time_horizon_years=3, discount_rate=0.10)

# Costs
calc.add_cost("development", year=0, amount=150_000)
calc.add_cost("infrastructure", year=0, amount=50_000)
calc.add_cost("infrastructure", year=1, amount=30_000, recurring=True)
calc.add_cost("talent", year=1, amount=120_000, recurring=True)

# Benefits
calc.add_benefit("churn_reduction", year=1, amount=400_000, growth_rate=0.15)
calc.add_benefit("upsell_revenue", year=1, amount=100_000, growth_rate=0.10)

# Risks
calc.set_risk_factor("adoption_delay", probability=0.3, impact_multiplier=0.6)
calc.set_risk_factor("data_quality", probability=0.2, impact_multiplier=0.8)

# Calculate
results = calc.calculate()
print(f"NPV: ${results['npv']:,.0f}")
print(f"ROI: {results['roi_pct']:.1f}%")
print(f"Payback: {results['payback_period_years']:.1f} years")

risk_results = calc.risk_adjusted_calculate()
print(f"Risk-Adjusted NPV: ${risk_results['expected_npv']:,.0f}")

calc.plot_cashflows()
print(calc.executive_summary())

AIMaturityAssessment (Chapter 39)

Diagnostic tool that evaluates an organization's AI readiness across multiple dimensions and produces a maturity scorecard with actionable recommendations.

class AIMaturityAssessment:
    """Evaluate organizational AI maturity across key dimensions.

    Parameters
    ----------
    organization_name : str
        Name of the organization being assessed.
    industry : str, optional
        Industry vertical for benchmark comparisons.
    """

    # The eight assessment dimensions accepted by score_dimension().
    DIMENSIONS = [
        "strategy_and_vision",
        "data_infrastructure",
        "talent_and_skills",
        "technology_stack",
        "governance_and_ethics",
        "organizational_culture",
        "use_case_portfolio",
        "measurement_and_value"
    ]

    # Human-readable label for each 1-5 maturity score, lowest to highest.
    MATURITY_LEVELS = {
        1: "Initial",
        2: "Developing",
        3: "Defined",
        4: "Managed",
        5: "Optimizing"
    }

    def __init__(self, organization_name: str,
                 industry: str | None = None) -> None:
        self.organization_name = organization_name
        self.industry = industry
        # Populated via score_dimension(): dimension -> score / evidence.
        self.scores = {}
        self.evidence = {}
        # Presumably loaded per industry for comparisons — not shown here.
        self.benchmarks = {}

    def score_dimension(self, dimension: str, score: int,
                        evidence: str = "",
                        sub_scores: dict[str, int] | None = None
                        ) -> "AIMaturityAssessment":
        """Record a maturity score for one dimension.

        Parameters
        ----------
        dimension : str
            One of the DIMENSIONS listed above.
        score : int
            Maturity level (1-5).
        evidence : str
            Supporting evidence or justification for the score.
        sub_scores : dict[str, int], optional
            Granular sub-dimension scores.

        Returns
        -------
        AIMaturityAssessment
            Self, for method chaining.

        Raises
        ------
        ValueError
            If dimension is not recognized or score is not 1-5.
        """
        ...

    # NOTE(review): docstring says "weighted", but no weights are visible in
    # this listing — confirm the weighting scheme against the full source.
    def overall_score(self) -> float:
        """Compute the weighted average maturity score.

        Returns
        -------
        float
            Overall maturity score (1.0 to 5.0).
        """
        ...

    def maturity_level(self) -> str:
        """Return the overall maturity level label.

        Returns
        -------
        str
            One of: 'Initial', 'Developing', 'Defined', 'Managed', 'Optimizing'.
        """
        ...

    def gap_analysis(self, target_level: int = 4) -> pd.DataFrame:
        """Identify gaps between current scores and target level.

        Parameters
        ----------
        target_level : int
            The desired maturity level for each dimension.

        Returns
        -------
        pd.DataFrame
            Columns: dimension, current_score, target, gap, priority.
            Sorted by gap descending.
        """
        ...

    def recommendations(self, top_n: int = 5) -> list[dict]:
        """Generate prioritized recommendations based on gaps.

        Parameters
        ----------
        top_n : int
            Number of top recommendations to return.

        Returns
        -------
        list[dict]
            Each dict has keys: 'dimension', 'recommendation',
            'expected_impact' ('high'/'medium'/'low'),
            'effort' ('high'/'medium'/'low'),
            'timeline' (str), 'quick_win' (bool).
        """
        ...

    def plot_radar(self, target_level: int = 4) -> plt.Figure:
        """Radar/spider chart of current vs. target maturity scores.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_heatmap(self) -> plt.Figure:
        """Heatmap of all dimension and sub-dimension scores.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    # NOTE(review): ``format`` shadows the builtin; kept for keyword-call
    # compatibility (see Usage).
    def generate_report(self, format: str = "markdown") -> str:
        """Generate a comprehensive maturity assessment report.

        Parameters
        ----------
        format : str
            Output format: 'text', 'markdown', or 'html'.

        Returns
        -------
        str
            Complete assessment report with scores, gaps, and recommendations.
        """
        ...

Usage:

# Example: score all eight dimensions with supporting evidence, then derive
# the overall level, gap analysis, and top recommendations.
from ai_maturity import AIMaturityAssessment

assessment = AIMaturityAssessment("Acme Corp", industry="financial_services")

assessment.score_dimension("strategy_and_vision", score=3,
                           evidence="AI strategy exists but not integrated with business strategy")
assessment.score_dimension("data_infrastructure", score=2,
                           evidence="Data warehouse exists but siloed; no feature store")
assessment.score_dimension("talent_and_skills", score=2,
                           evidence="Small data science team; limited ML engineering capability")
assessment.score_dimension("technology_stack", score=3,
                           evidence="Cloud-based ML platform in pilot stage")
assessment.score_dimension("governance_and_ethics", score=1,
                           evidence="No formal AI governance framework")
assessment.score_dimension("organizational_culture", score=2,
                           evidence="Pockets of data-driven culture; leadership buy-in varies")
assessment.score_dimension("use_case_portfolio", score=3,
                           evidence="5 production models; pipeline of 10+ candidates")
assessment.score_dimension("measurement_and_value", score=2,
                           evidence="Ad hoc ROI tracking; no standardized value framework")

print(f"Overall Score: {assessment.overall_score():.1f}/5.0")
print(f"Maturity Level: {assessment.maturity_level()}")

gaps = assessment.gap_analysis(target_level=4)
print(gaps)

recs = assessment.recommendations(top_n=3)
for r in recs:
    print(f"- [{r['expected_impact'].upper()}] {r['recommendation']}")

assessment.plot_radar()
print(assessment.generate_report(format="markdown"))

TransformationRoadmapGenerator (Chapter 39)

Strategic planning tool that takes AI maturity assessment results and generates a phased implementation roadmap with milestones, resource estimates, and dependencies.

class TransformationRoadmapGenerator:
    """Generate a phased AI transformation roadmap.

    Parameters
    ----------
    assessment : AIMaturityAssessment
        A completed maturity assessment.
    target_timeline_months : int
        Total timeline for the transformation in months.
    budget_constraint : float, optional
        Total budget in dollars. If provided, phases are budget-aware.
    """

    def __init__(self, assessment: "AIMaturityAssessment",
                 target_timeline_months: int = 24,
                 budget_constraint: float | None = None) -> None:
        self.assessment = assessment
        self.target_timeline_months = target_timeline_months
        self.budget_constraint = budget_constraint
        # Built by generate_phases() and add_initiative()/
        # auto_generate_initiatives(); internal layout not shown here.
        self.phases = []
        self.initiatives = []

    def generate_phases(self, n_phases: int = 3) -> list[dict]:
        """Create transformation phases based on gap analysis.

        Parameters
        ----------
        n_phases : int
            Number of phases (typically 3: Foundation, Scale, Optimize).

        Returns
        -------
        list[dict]
            Each dict has keys: 'phase_name', 'phase_number',
            'start_month', 'end_month', 'focus_dimensions',
            'objectives' (list[str]), 'target_scores' (dict).
        """
        ...

    def add_initiative(self, name: str, phase: int,
                       dimension: str, description: str,
                       estimated_cost: float,
                       estimated_months: int,
                       dependencies: list[str] | None = None,
                       kpis: list[str] | None = None) -> "TransformationRoadmapGenerator":
        """Add a specific initiative to the roadmap.

        Parameters
        ----------
        name : str
            Initiative name.
        phase : int
            Phase number (1-indexed).
        dimension : str
            Primary maturity dimension this initiative addresses.
        description : str
            Brief description.
        estimated_cost : float
            Estimated cost in dollars.
        estimated_months : int
            Estimated duration in months.
        dependencies : list[str], optional
            Names of prerequisite initiatives.
        kpis : list[str], optional
            Key performance indicators for this initiative.

        Returns
        -------
        TransformationRoadmapGenerator
            Self, for method chaining.
        """
        ...

    def auto_generate_initiatives(self) -> "TransformationRoadmapGenerator":
        """Automatically generate initiatives based on gap analysis.

        Uses built-in best-practice templates mapped to each
        dimension-gap combination.

        Returns
        -------
        TransformationRoadmapGenerator
            Self, for method chaining.
        """
        ...

    def validate_dependencies(self) -> list[str]:
        """Check for circular dependencies and missing prerequisites.

        Returns
        -------
        list[str]
            List of warning messages. Empty list if valid.
        """
        ...

    def plot_gantt(self) -> plt.Figure:
        """Generate a Gantt chart of the roadmap.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_investment_timeline(self) -> plt.Figure:
        """Stacked bar chart of investment by dimension per phase.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    # NOTE(review): ``format`` shadows the builtin; kept for keyword-call
    # compatibility (see Usage).
    def generate_roadmap(self, format: str = "markdown") -> str:
        """Generate the complete transformation roadmap document.

        Parameters
        ----------
        format : str
            Output format: 'text', 'markdown', or 'html'.

        Returns
        -------
        str
            Complete roadmap with phases, initiatives, timelines,
            budgets, KPIs, and dependencies.
        """
        ...

    def executive_presentation(self) -> list[dict]:
        """Generate slide-ready content for executive presentation.

        Returns
        -------
        list[dict]
            Each dict represents a slide with keys: 'title',
            'bullet_points' (list[str]), 'chart_type' (str or None),
            'data' (dict or None).
        """
        ...

Usage:

# Example: build a 24-month, budget-aware roadmap from the maturity
# assessment created in the previous example (``assessment``).
from transformation_roadmap import TransformationRoadmapGenerator

roadmap_gen = TransformationRoadmapGenerator(
    assessment=assessment,
    target_timeline_months=24,
    budget_constraint=2_000_000
)

phases = roadmap_gen.generate_phases(n_phases=3)
for phase in phases:
    print(f"Phase {phase['phase_number']}: {phase['phase_name']} "
          f"(Months {phase['start_month']}-{phase['end_month']})")

roadmap_gen.auto_generate_initiatives()

# Add a custom initiative
roadmap_gen.add_initiative(
    name="Establish AI Ethics Board",
    phase=1,
    dimension="governance_and_ethics",
    description="Form cross-functional ethics review board for all AI projects",
    estimated_cost=50_000,
    estimated_months=3,
    kpis=["Board formed", "Review process documented", "First 3 projects reviewed"]
)

warnings = roadmap_gen.validate_dependencies()
if warnings:
    for w in warnings:
        print(f"WARNING: {w}")

roadmap_gen.plot_gantt()
roadmap_gen.plot_investment_timeline()

roadmap_doc = roadmap_gen.generate_roadmap(format="markdown")
print(roadmap_doc)

slides = roadmap_gen.executive_presentation()
print(f"Generated {len(slides)} slides for executive review")

Quick Index

Tool / Library Section Primary Chapters
Python basics A.1 1--4
pandas A.2 4--40
matplotlib A.3 5--40
seaborn A.3 5--40
scikit-learn A.4 6--12, 25--26
NLTK A.5 13--15
spaCy A.5 14--15
TF-IDF A.5 14
OpenAI API A.6 17--22, 30
Anthropic API A.6 18--22
Fairlearn A.7 25
SHAP A.7 26
LIME A.7 26
EDAReport A.8 5
ChurnClassifier A.8 7
DemandForecaster A.8 8
CustomerSegmenter A.8 9
RecommendationEngine A.8 10
ModelEvaluator A.8 11
ReviewAnalyzer A.8 14
PromptBuilder A.8 19
PromptChain A.8 20
BiasDetector A.8 25
ExplainabilityDashboard A.8 26
AIROICalculator A.8 34
AIMaturityAssessment A.8 39
TransformationRoadmapGenerator A.8 39