Appendix A: Python Reference
This appendix serves as a comprehensive quick-reference guide to every Python tool, library, and custom class used throughout AI & Machine Learning for Business. Whether you are debugging a lab exercise at midnight or refreshing your memory before a presentation, this is the page to bookmark. Code snippets are kept deliberately short; for full context and business applications, follow the chapter cross-references.
A.1 Python Basics Quick Reference
Variables and Data Types
# Numeric types
revenue = 1_250_000 # int (underscores improve readability)
growth_rate = 0.073 # float
complex_val = 3 + 4j # complex (rarely used in business analytics)
# Boolean
is_churned = True
# Strings
company = "Acme Corp"
greeting = f"Welcome to {company}" # f-string interpolation
# None
result = None
# Type checking
type(revenue) # <class 'int'>
isinstance(growth_rate, float) # True
Core Data Structures
# List — ordered, mutable
departments = ["Sales", "Marketing", "Engineering", "HR"]
departments.append("Finance")
departments[1:3] # ["Marketing", "Engineering"]
# Tuple — ordered, immutable
coordinates = (40.7128, -74.0060)
# Dictionary — key-value pairs
employee = {
"name": "Priya Sharma",
"role": "Data Analyst",
"tenure_years": 3
}
employee["role"] # "Data Analyst"
employee.get("salary", 0) # 0 (default when key is missing)
# Set — unique, unordered
unique_tags = {"ML", "NLP", "ML", "CV"} # {"ML", "NLP", "CV"}
Operators
# Arithmetic
total = price * quantity
margin = revenue - cost
roi = (gain - cost) / cost # returns float
floor_div = 17 // 5 # 3
remainder = 17 % 5 # 2
power = 2 ** 10 # 1024
# Comparison
x == y # equality
x != y # inequality
x >= y # greater than or equal
# Logical
if revenue > 1_000_000 and growth_rate > 0.05:
tier = "high"
elif revenue > 500_000 or is_strategic:
tier = "medium"
else:
tier = "low"
# Membership
"Sales" in departments # True
# Walrus operator (Python 3.8+)
if (n := len(departments)) > 4:
print(f"{n} departments found")
Control Flow
# if / elif / else
if score >= 90:
grade = "A"
elif score >= 80:
grade = "B"
else:
grade = "C"
# for loop
for dept in departments:
print(dept.upper())
# for with index
for i, dept in enumerate(departments, start=1):
print(f"{i}. {dept}")
# while loop
attempts = 0
while attempts < 3:
result = call_api()
if result.ok:
break
attempts += 1
# Ternary expression
label = "Premium" if revenue > 1_000_000 else "Standard"
Functions
# Basic function
def calculate_cltv(avg_purchase: float,
                   frequency: float,
                   lifespan_years: float) -> float:
    """Calculate Customer Lifetime Value.

    CLTV = average purchase value x purchase frequency x lifespan in years.
    """
    annual_value = avg_purchase * frequency
    return annual_value * lifespan_years
# Default arguments
def greet(name: str, title: str = "Team Member") -> str:
    """Return a greeting that prefixes *name* with *title*."""
    return " ".join(["Hello,", title, name])
# *args and **kwargs
def log_metrics(*values, **labels):
    """Print each keyword label on its own line as 'key: value'.

    Positional *values* are accepted but intentionally unused — the snippet
    demonstrates the *args/**kwargs signature.
    """
    lines = [f"{key}: {val}" for key, val in labels.items()]
    for line in lines:
        print(line)
# Lambda (anonymous function)
sort_key = lambda x: x["revenue"]
customers.sort(key=sort_key, reverse=True)
# Lambda in pandas context
df["margin_pct"] = df.apply(lambda row: row["profit"] / row["revenue"], axis=1)
List Comprehensions and Generators
# List comprehension
squares = [x ** 2 for x in range(10)]
high_value = [c for c in customers if c["revenue"] > 100_000]
# Dictionary comprehension
name_to_revenue = {c["name"]: c["revenue"] for c in customers}
# Set comprehension
unique_cities = {c["city"] for c in customers}
# Generator expression (memory-efficient for large datasets)
total = sum(c["revenue"] for c in customers)
String Formatting
name = "Widget Pro"
price = 49.99
units = 1_250
# f-strings (preferred)
print(f"Product: {name}, Price: ${price:.2f}, Units: {units:,}")
# Output: Product: Widget Pro, Price: $49.99, Units: 1,250
# Alignment and padding
print(f"{'Item':<20} {'Price':>10}") # left-align, right-align
print(f"{name:<20} {price:>10.2f}")
# Multiline f-strings
summary = (
f"Revenue: ${revenue:,.0f}\n"
f"Growth: {growth_rate:.1%}\n"
f"Churned: {is_churned}"
)
# Percentage formatting
print(f"Accuracy: {0.9432:.1%}") # "Accuracy: 94.3%"
Error Handling
# Basic try/except
try:
result = revenue / num_customers
except ZeroDivisionError:
result = 0.0
# Multiple exception types
try:
data = pd.read_csv(filepath)
except FileNotFoundError:
print(f"File not found: {filepath}")
data = pd.DataFrame()
except pd.errors.ParserError as e:
print(f"Parse error: {e}")
data = pd.DataFrame()
# try/except/else/finally
try:
model = joblib.load("model.pkl")
except Exception as e:
logging.error(f"Model load failed: {e}")
model = None
else:
print("Model loaded successfully")
finally:
print("Load attempt complete")
# Raising exceptions
def set_discount(rate: float) -> None:
    """Validate that *rate* is a fraction in [0, 1].

    Raises
    ------
    ValueError
        If *rate* falls outside the inclusive range 0.0-1.0.
    """
    # Chained comparison also rejects NaN (any comparison with NaN is False).
    in_range = 0.0 <= rate <= 1.0
    if not in_range:
        raise ValueError(f"Discount rate must be 0-1, got {rate}")
Useful Built-in Functions
len(customers) # number of items
max(revenues) # largest value
min(revenues) # smallest value
sum(revenues) # total
sorted(revenues, reverse=True) # sorted copy
abs(-42) # 42
round(3.14159, 2) # 3.14
zip(names, scores) # pair up two iterables
map(str.upper, names) # apply function to each item
any([False, True, False]) # True (at least one)
all([True, True, True]) # True (every one)
A.2 pandas Quick Reference
Importing and Creating DataFrames
import pandas as pd
import numpy as np
# From CSV
df = pd.read_csv("sales_data.csv")
df = pd.read_csv("sales_data.csv", parse_dates=["order_date"], index_col="id")
# From Excel
df = pd.read_excel("report.xlsx", sheet_name="Q4")
# From dictionary
df = pd.DataFrame({
"product": ["Widget", "Gadget", "Gizmo"],
"revenue": [50000, 75000, 30000],
"region": ["East", "West", "East"]
})
# From list of dictionaries
records = [
{"name": "Alice", "score": 92},
{"name": "Bob", "score": 85},
]
df = pd.DataFrame(records)
Inspection
df.head(10) # first 10 rows
df.tail(5) # last 5 rows
df.shape # (rows, columns)
df.columns # column names
df.dtypes # data types per column
df.info() # summary including non-null counts
df.describe() # statistics for numeric columns
df.describe(include="object") # statistics for categorical columns
df.nunique() # unique value counts per column
df.value_counts("region") # frequency table for one column
df.sample(5) # random sample of 5 rows
Selection and Indexing
# Single column (returns Series)
df["revenue"]
# Multiple columns (returns DataFrame)
df[["product", "revenue"]]
# Row by label
df.loc[0] # row with index label 0
df.loc[0:5, "product":"revenue"] # label-based slicing (inclusive)
# Row by position
df.iloc[0] # first row
df.iloc[0:5, 0:3] # position-based slicing (exclusive end)
# Boolean indexing
high_rev = df[df["revenue"] > 50000]
east_high = df[(df["region"] == "East") & (df["revenue"] > 40000)]
selected = df[df["region"].isin(["East", "West"])]
missing = df[df["email"].isna()]
# Query syntax (alternative to boolean indexing)
df.query("revenue > 50000 and region == 'East'")
Adding and Modifying Columns
# New column from calculation
df["profit_margin"] = df["profit"] / df["revenue"]
# Conditional column
df["tier"] = np.where(df["revenue"] > 50000, "High", "Standard")
# Multiple conditions
conditions = [
df["revenue"] > 100000,
df["revenue"] > 50000,
]
choices = ["Premium", "Standard"]
df["tier"] = np.select(conditions, choices, default="Basic")
# Apply a function
df["name_upper"] = df["name"].apply(str.upper)
# Rename columns
df = df.rename(columns={"old_name": "new_name"})
# Drop columns
df = df.drop(columns=["temp_col", "debug_col"])
Groupby and Aggregation
# Single aggregation
df.groupby("region")["revenue"].sum()
# Multiple aggregations
summary = df.groupby("region").agg(
total_revenue=("revenue", "sum"),
avg_revenue=("revenue", "mean"),
num_orders=("order_id", "count"),
max_deal=("revenue", "max")
).reset_index()
# Multiple groupby columns
df.groupby(["region", "product"])["revenue"].mean()
# Transform (returns same-shaped Series)
df["pct_of_region"] = (
df["revenue"] / df.groupby("region")["revenue"].transform("sum")
)
# Pivot table
pivot = df.pivot_table(
values="revenue",
index="region",
columns="quarter",
aggfunc="sum",
fill_value=0,
margins=True # adds row/column totals
)
Merging and Joining
# Inner join (default)
merged = pd.merge(orders, customers, on="customer_id")
# Left join
merged = pd.merge(orders, customers, on="customer_id", how="left")
# Join on different column names
merged = pd.merge(
orders, products,
left_on="prod_code", right_on="product_id",
how="left"
)
# Multiple join keys
merged = pd.merge(df1, df2, on=["year", "region"])
# Concatenation (stacking rows)
combined = pd.concat([df_q1, df_q2, df_q3, df_q4], ignore_index=True)
Missing Data
# Detection
df.isna().sum() # count of NaN per column
df.isna().mean() # fraction missing per column
# Dropping
df.dropna() # drop rows with any NaN
df.dropna(subset=["revenue", "region"]) # only check specific columns
df.dropna(thresh=3) # keep rows with at least 3 non-NaN values
# Filling
df["revenue"].fillna(0)
df["region"].fillna("Unknown")
df["revenue"].fillna(df["revenue"].median())
# Forward/backward fill (time series)
df["stock_price"].ffill()
df["stock_price"].bfill()
# Interpolation
df["temperature"].interpolate(method="linear")
Sorting and Ranking
# Sort by column
df.sort_values("revenue", ascending=False)
# Sort by multiple columns
df.sort_values(["region", "revenue"], ascending=[True, False])
# Rank
df["revenue_rank"] = df["revenue"].rank(ascending=False, method="dense")
Date and Time Operations
# Parse dates
df["date"] = pd.to_datetime(df["date_str"])
# Extract components
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day_of_week"] = df["date"].dt.day_name()
df["quarter"] = df["date"].dt.quarter
# Date arithmetic
df["days_since"] = (pd.Timestamp.today() - df["date"]).dt.days
# Resample time series
monthly = df.set_index("date").resample("ME")["revenue"].sum()
Saving Data
df.to_csv("output.csv", index=False)
df.to_excel("output.xlsx", index=False, sheet_name="Results")
df.to_parquet("output.parquet") # fast, compressed binary format
A.3 Visualization Quick Reference
matplotlib Basics
import matplotlib.pyplot as plt
# Figure and axes (recommended approach)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Quarterly Revenue by Region", fontsize=14)
ax.set_xlabel("Quarter")
ax.set_ylabel("Revenue ($)")
plt.tight_layout()
plt.savefig("chart.png", dpi=150, bbox_inches="tight")
plt.show()
matplotlib Plot Types
# Line plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(dates, revenue, marker="o", linewidth=2, label="Revenue")
ax.plot(dates, forecast, linestyle="--", label="Forecast")
ax.legend()
# Bar chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(regions, totals, color=["#2196F3", "#4CAF50", "#FF9800", "#F44336"])
ax.bar_label(ax.containers[0], fmt="${:,.0f}") # value labels on bars (matplotlib 3.7+ {}-style format)
# Horizontal bar chart
ax.barh(categories, values)
# Grouped bar chart
x = np.arange(len(regions))
width = 0.35
ax.bar(x - width / 2, q1_values, width, label="Q1")
ax.bar(x + width / 2, q2_values, width, label="Q2")
ax.set_xticks(x)
ax.set_xticklabels(regions)
ax.legend()
# Scatter plot
ax.scatter(df["spend"], df["revenue"], alpha=0.6, c=df["cluster"],
cmap="viridis", s=50)
# Histogram
ax.hist(df["revenue"], bins=30, edgecolor="white", alpha=0.7)
# Pie chart (use sparingly)
ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90)
ax.axis("equal")
# Subplots grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes[0, 0].bar(x, y1)
axes[0, 0].set_title("Chart A")
axes[0, 1].plot(x, y2)
axes[0, 1].set_title("Chart B")
# ... etc.
plt.tight_layout()
matplotlib Styling
# Color palettes
colors = plt.cm.Set2(np.linspace(0, 1, 8))
# Grid
ax.grid(True, alpha=0.3, linestyle="--")
# Axis formatting
from matplotlib.ticker import FuncFormatter
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"${x:,.0f}"))
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{x:.0%}"))
# Rotate tick labels
plt.xticks(rotation=45, ha="right")
# Annotation
ax.annotate("Peak", xy=(peak_x, peak_y),
xytext=(peak_x + 1, peak_y + 500),
arrowprops=dict(arrowstyle="->"), fontsize=10)
# Horizontal/vertical reference lines
ax.axhline(y=target, color="red", linestyle="--", label="Target")
ax.axvline(x=launch_date, color="gray", linestyle=":", alpha=0.7)
seaborn Plots
import seaborn as sns
# Set global theme
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
# Distribution plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="revenue", hue="region", kde=True, ax=ax)
# Box plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="region", y="revenue", ax=ax)
# Violin plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, x="department", y="salary", ax=ax)
# Pair plot (scatterplot matrix)
sns.pairplot(df[["revenue", "spend", "satisfaction", "segment"]],
hue="segment", diag_kind="kde")
# Heatmap (correlation matrix)
fig, ax = plt.subplots(figsize=(10, 8))
corr = df[numeric_cols].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="RdBu_r",
center=0, vmin=-1, vmax=1, ax=ax)
# Count plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="product_category",
order=df["product_category"].value_counts().index, ax=ax)
# Regression plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x="ad_spend", y="revenue", ax=ax,
scatter_kws={"alpha": 0.5}, line_kws={"color": "red"})
# Bar plot with confidence intervals
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="region", y="revenue", hue="year",
estimator="mean", errorbar="sd", ax=ax)
A.4 scikit-learn Quick Reference
Train/Test Split
from sklearn.model_selection import train_test_split
X = df.drop(columns=["target"])
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y # stratify for classification
)
The Universal Model Pattern
Every scikit-learn estimator follows the same three-step interface. This pattern appears in nearly every chapter of the book.
from sklearn.ensemble import RandomForestClassifier
# 1. Instantiate
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
# 2. Fit
model.fit(X_train, y_train)
# 3. Predict and evaluate
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] # probability of positive class
score = model.score(X_test, y_test) # default metric (accuracy or R^2)
Classification Models
# Logistic Regression (Ch. 6, 7)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
# Random Forest Classifier (Ch. 7, 11)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
n_estimators=200, max_depth=10, min_samples_leaf=5, random_state=42
)
# Gradient Boosting / XGBoost (Ch. 7, 11, 12)
from xgboost import XGBClassifier
model = XGBClassifier(
n_estimators=300, max_depth=6, learning_rate=0.1,
subsample=0.8, colsample_bytree=0.8,
eval_metric="logloss", random_state=42
)
# Support Vector Machine (Ch. 11)
from sklearn.svm import SVC
model = SVC(kernel="rbf", C=1.0, probability=True, random_state=42)
# Naive Bayes (Ch. 14 — text classification)
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=1.0)
# K-Nearest Neighbors (Ch. 11)
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5, metric="minkowski")
Regression Models
# Linear Regression (Ch. 6)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Ridge Regression (L2 regularization) (Ch. 6)
from sklearn.linear_model import Ridge
model = Ridge(alpha=1.0)
# Lasso Regression (L1 regularization, feature selection) (Ch. 6)
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.1)
# Random Forest Regressor (Ch. 8)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
# XGBoost Regressor (Ch. 8)
from xgboost import XGBRegressor
model = XGBRegressor(
n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42
)
Clustering
# K-Means (Ch. 9)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_scaled)
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
# Elbow method
inertias = []
K_range = range(2, 11)
for k in K_range:
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X_scaled)
inertias.append(km.inertia_)
# Silhouette score
from sklearn.metrics import silhouette_score
sil = silhouette_score(X_scaled, labels)
# DBSCAN (Ch. 9)
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.5, min_samples=5)
labels = db.fit_predict(X_scaled)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
Evaluation Metrics — Classification
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
classification_report, confusion_matrix, roc_auc_score,
precision_recall_curve, roc_curve
)
# Individual metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="binary")
recall = recall_score(y_test, y_pred, average="binary")
f1 = f1_score(y_test, y_pred, average="binary")
auc = roc_auc_score(y_test, y_proba)
# Full classification report
print(classification_report(y_test, y_pred, target_names=["Retained", "Churned"]))
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
xticklabels=["Retained", "Churned"],
yticklabels=["Retained", "Churned"])
# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
Evaluation Metrics — Regression
from sklearn.metrics import (
mean_absolute_error, mean_squared_error, r2_score,
mean_absolute_percentage_error
)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5 # (the squared=False argument was removed in scikit-learn 1.6)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
Cross-Validation
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Quick cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring="f1")
print(f"Mean F1: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Custom folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring="roc_auc")
Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Grid search
param_grid = {
"n_estimators": [100, 200, 300],
"max_depth": [5, 10, 15, None],
"min_samples_leaf": [1, 2, 5]
}
grid = GridSearchCV(
RandomForestClassifier(random_state=42),
param_grid, cv=5, scoring="f1", n_jobs=-1, verbose=1
)
grid.fit(X_train, y_train)
print(grid.best_params_)
best_model = grid.best_estimator_
# Randomized search (faster for large grids)
from scipy.stats import randint, uniform
param_dist = {
"n_estimators": randint(100, 500),
"max_depth": randint(3, 20),
"learning_rate": uniform(0.01, 0.3)
}
search = RandomizedSearchCV(
XGBClassifier(random_state=42),
param_dist, n_iter=50, cv=5, scoring="f1",
n_jobs=-1, random_state=42
)
search.fit(X_train, y_train)
Preprocessing and Pipelines
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Numeric features
numeric_features = ["revenue", "tenure", "usage_hours"]
numeric_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
])
# Categorical features
categorical_features = ["region", "plan_type"]
categorical_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
])
# Combined preprocessor
preprocessor = ColumnTransformer(transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features)
])
# Full pipeline with model
pipeline = Pipeline(steps=[
("preprocessor", preprocessor),
("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
Feature Importance
# Tree-based importance
importances = model.feature_importances_
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
feat_imp.head(15).plot(kind="barh")
# Permutation importance (model-agnostic)
from sklearn.inspection import permutation_importance
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
perm_imp = pd.Series(result.importances_mean, index=feature_names).sort_values(ascending=False)
Model Persistence
import joblib
# Save
joblib.dump(pipeline, "churn_model_v1.pkl")
# Load
loaded_pipeline = joblib.load("churn_model_v1.pkl")
predictions = loaded_pipeline.predict(new_data)
A.5 NLP Quick Reference
NLTK Basics
import nltk
# Download required data (run once)
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
# Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
tokens = word_tokenize("The product exceeded our expectations.")
sentences = sent_tokenize(paragraph)
# Stopword removal
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
filtered = [w for w in tokens if w.lower() not in stop_words]
# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in tokens]
# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in tokens]
Text Preprocessing Pipeline
import re
def preprocess_text(text: str) -> str:
    """Standard text cleaning pipeline used throughout Chapters 13-15.

    Lowercases the text, strips URLs and non-alphabetic characters,
    collapses whitespace, then tokenizes and drops stopwords and tokens
    of three characters or fewer.
    """
    cleaned = text.lower()
    # Remove URLs first so their fragments don't survive the alpha filter.
    cleaned = re.sub(r"http\S+|www\.\S+", "", cleaned)
    cleaned = re.sub(r"[^a-z\s]", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    kept = []
    for word in word_tokenize(cleaned):
        if word not in stop_words and len(word) > 2:
            kept.append(word)
    return " ".join(kept)
TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
# Basic TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_tfidf = vectorizer.fit_transform(df["review_text"])
# With n-grams
vectorizer = TfidfVectorizer(
max_features=10000,
ngram_range=(1, 2), # unigrams and bigrams
min_df=5, # ignore terms in fewer than 5 docs
max_df=0.95, # ignore terms in more than 95% of docs
stop_words="english"
)
# Feature names
feature_names = vectorizer.get_feature_names_out()
# In a pipeline
from sklearn.naive_bayes import MultinomialNB
text_pipeline = Pipeline([
("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
("clf", MultinomialNB())
])
text_pipeline.fit(X_train_text, y_train)
Sentiment Analysis (VADER)
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()
scores = sia.polarity_scores("This product is absolutely amazing!")
# {'neg': 0.0, 'neu': 0.296, 'pos': 0.704, 'compound': 0.7783}
# Apply to a DataFrame column
df["sentiment"] = df["review"].apply(lambda x: sia.polarity_scores(x)["compound"])
df["sentiment_label"] = df["sentiment"].apply(
lambda x: "positive" if x > 0.05 else ("negative" if x < -0.05 else "neutral")
)
spaCy Patterns
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple reported $394 billion in revenue for fiscal year 2022.")
# Named Entity Recognition
for ent in doc.ents:
print(f"{ent.text:20s} {ent.label_:10s}")
# Apple ORG
# $394 billion MONEY
# fiscal year 2022 DATE
# Part-of-speech tagging
for token in doc:
print(f"{token.text:15s} {token.pos_:6s} {token.dep_:10s}")
# Noun chunks
for chunk in doc.noun_chunks:
print(chunk.text)
# Similarity (requires medium or large model)
nlp_md = spacy.load("en_core_web_md")
doc1 = nlp_md("customer churn prediction")
doc2 = nlp_md("predicting client attrition")
print(f"Similarity: {doc1.similarity(doc2):.3f}")
A.6 LLM API Quick Reference
OpenAI API — Chat Completion
from openai import OpenAI
client = OpenAI() # reads OPENAI_API_KEY from environment
# Basic completion
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a business analyst assistant."},
{"role": "user", "content": "Summarize the key risks in this quarterly report."}
],
temperature=0.3,
max_tokens=1000
)
answer = response.choices[0].message.content
OpenAI API — Structured Output
from pydantic import BaseModel
class MarketAnalysis(BaseModel):
market_size: float
growth_rate: float
key_competitors: list[str]
risk_level: str
summary: str
response = client.beta.chat.completions.parse(
model="gpt-4o",
messages=[
{"role": "system", "content": "Extract market analysis data from the text."},
{"role": "user", "content": report_text}
],
response_format=MarketAnalysis
)
analysis = response.choices[0].message.parsed
print(f"Market size: ${analysis.market_size:,.0f}")
print(f"Growth rate: {analysis.growth_rate:.1%}")
OpenAI API — Function Calling
tools = [
{
"type": "function",
"function": {
"name": "get_customer_data",
"description": "Retrieve customer data by customer ID.",
"parameters": {
"type": "object",
"properties": {
"customer_id": {
"type": "string",
"description": "The unique customer identifier"
},
"include_history": {
"type": "boolean",
"description": "Whether to include transaction history"
}
},
"required": ["customer_id"]
}
}
}
]
response = client.chat.completions.create(
model="gpt-4o",
messages=messages,
tools=tools,
tool_choice="auto"
)
# Check if the model wants to call a function
if response.choices[0].message.tool_calls:
tool_call = response.choices[0].message.tool_calls[0]
function_name = tool_call.function.name
arguments = json.loads(tool_call.function.arguments)
# Execute the function and pass result back
result = get_customer_data(**arguments)
messages.append(response.choices[0].message)
messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"content": json.dumps(result)
})
Anthropic API — Messages
from anthropic import Anthropic
client = Anthropic() # reads ANTHROPIC_API_KEY from environment
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=1024,
system="You are a business strategy advisor.",
messages=[
{"role": "user", "content": "What are the top 3 risks for a SaaS startup entering the healthcare market?"}
]
)
answer = response.content[0].text
Anthropic API — Structured Output with Tool Use
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=1024,
tools=[
{
"name": "extract_financials",
"description": "Extract financial metrics from a report.",
"input_schema": {
"type": "object",
"properties": {
"revenue": {"type": "number", "description": "Total revenue in USD"},
"net_income": {"type": "number", "description": "Net income in USD"},
"yoy_growth": {"type": "number", "description": "Year-over-year growth rate"}
},
"required": ["revenue", "net_income"]
}
}
],
messages=[
{"role": "user", "content": f"Extract financials from this report:\n{report_text}"}
]
)
for block in response.content:
if block.type == "tool_use":
financials = block.input
print(f"Revenue: ${financials['revenue']:,.0f}")
Common LLM Parameters
| Parameter | Typical Range | Purpose |
|---|---|---|
temperature |
0.0 -- 1.0 | Controls randomness. Use 0.0--0.3 for factual/analytical tasks, 0.5--0.8 for creative tasks. |
max_tokens |
100 -- 4096+ | Maximum length of the response. |
top_p |
0.0 -- 1.0 | Nucleus sampling. Alternative to temperature. Usually set one or the other, not both. |
stop |
list of strings | Sequences where the model stops generating. |
presence_penalty |
-2.0 -- 2.0 | Penalizes repeated topics. Higher values encourage new topics. (OpenAI) |
frequency_penalty |
-2.0 -- 2.0 | Penalizes repeated tokens. Higher values reduce repetition. (OpenAI) |
Prompt Engineering Patterns
# Few-shot prompting
few_shot_prompt = """Classify the customer review as POSITIVE, NEGATIVE, or NEUTRAL.
Review: "The delivery was incredibly fast and the product quality is outstanding."
Classification: POSITIVE
Review: "It works okay but nothing special for the price."
Classification: NEUTRAL
Review: "{user_review}"
Classification:"""
# Chain-of-thought prompting
cot_prompt = """Analyze this business scenario step by step.
Scenario: {scenario}
Think through this step by step:
1. What are the key factors?
2. What are the potential outcomes?
3. What is your recommendation and why?"""
# Role-based prompting
system_msg = """You are a senior financial analyst at a Fortune 500 company.
You specialize in risk assessment and always support your conclusions with data.
When uncertain, you clearly state your confidence level."""
A.7 Fairness and Explainability Quick Reference
Fairlearn — Bias Assessment
from fairlearn.metrics import (
MetricFrame,
demographic_parity_difference,
equalized_odds_difference,
demographic_parity_ratio
)
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Compute metrics by group
metric_frame = MetricFrame(
metrics={
"accuracy": accuracy_score,
"precision": precision_score,
"recall": recall_score
},
y_true=y_test,
y_pred=y_pred,
sensitive_features=df_test["gender"]
)
# View results
print(metric_frame.by_group)
print(f"Accuracy difference: {metric_frame.difference()['accuracy']:.3f}")
print(f"Accuracy ratio: {metric_frame.ratio()['accuracy']:.3f}")
# Demographic parity
dp_diff = demographic_parity_difference(
y_test, y_pred, sensitive_features=df_test["gender"]
)
print(f"Demographic parity difference: {dp_diff:.3f}")
Fairlearn — Bias Mitigation
# Threshold optimization (post-processing)
from fairlearn.postprocessing import ThresholdOptimizer
mitigated = ThresholdOptimizer(
estimator=model,
constraints="demographic_parity",
objective="accuracy_score",
prefit=True
)
mitigated.fit(X_train, y_train, sensitive_features=train_sensitive)
y_pred_fair = mitigated.predict(X_test, sensitive_features=test_sensitive)
# Exponentiated Gradient (in-processing)
from fairlearn.reductions import ExponentiatedGradient, DemographicParity
mitigator = ExponentiatedGradient(
estimator=LogisticRegression(max_iter=1000),
constraints=DemographicParity()
)
mitigator.fit(X_train, y_train, sensitive_features=train_sensitive)
SHAP — SHapley Additive exPlanations
import shap
# TreeExplainer (fast for tree-based models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# Summary plot (feature importance with direction)
shap.summary_plot(shap_values, X_test, feature_names=feature_names)
# Bar plot (mean absolute SHAP values)
shap.summary_plot(shap_values, X_test, plot_type="bar")
# Force plot (single prediction explanation)
shap.initjs()
shap.force_plot(
explainer.expected_value, shap_values[0], X_test.iloc[0],
feature_names=feature_names
)
# Waterfall plot (single prediction, newer API)
shap.plots.waterfall(shap.Explanation(
values=shap_values[0],
base_values=explainer.expected_value,
data=X_test.iloc[0],
feature_names=feature_names
))
# Dependence plot (one feature vs. SHAP value)
shap.dependence_plot("tenure_months", shap_values, X_test)
# KernelExplainer (model-agnostic, slower)
explainer = shap.KernelExplainer(model.predict_proba, shap.sample(X_train, 100))
shap_values = explainer.shap_values(X_test.iloc[:50])
LIME — Local Interpretable Model-agnostic Explanations
from lime.lime_tabular import LimeTabularExplainer
explainer = LimeTabularExplainer(
training_data=X_train.values,
feature_names=feature_names,
class_names=["Retained", "Churned"],
mode="classification"
)
# Explain a single prediction
explanation = explainer.explain_instance(
X_test.iloc[0].values,
model.predict_proba,
num_features=10,
num_samples=5000
)
# Display
explanation.show_in_notebook()
# As a list
for feature, weight in explanation.as_list():
print(f"{feature:40s} {weight:+.4f}")
# For text classification
from lime.lime_text import LimeTextExplainer
text_explainer = LimeTextExplainer(class_names=["Negative", "Positive"])
text_exp = text_explainer.explain_instance(
review_text, pipeline.predict_proba, num_features=10
)
A.8 Book-Specific Tools Reference
This section documents every custom class built in the book's chapter exercises. Each entry includes the class signature, key methods, parameters, return types, and a usage example. Use this as an API reference when adapting the tools for your own projects.
EDAReport (Chapter 5)
Generates a comprehensive Exploratory Data Analysis report for any DataFrame, including summary statistics, missing-data analysis, distribution profiles, and a correlation matrix.
class EDAReport:
"""Automated Exploratory Data Analysis reporting tool.
Parameters
----------
df : pd.DataFrame
The dataset to analyze.
target_col : str, optional
The target/outcome column for supervised analysis context.
"""
def __init__(self, df: pd.DataFrame, target_col: str | None = None) -> None:
self.df = df
self.target_col = target_col
def summary_stats(self) -> pd.DataFrame:
"""Return descriptive statistics for all columns.
Returns
-------
pd.DataFrame
Extended describe() output including dtype, missing count,
missing percentage, unique count, and skewness for numerics.
"""
...
def missing_report(self) -> pd.DataFrame:
"""Return a DataFrame of columns with missing values.
Returns
-------
pd.DataFrame
Columns: column_name, missing_count, missing_pct, dtype.
Sorted by missing_pct descending.
"""
...
def correlation_matrix(self, method: str = "pearson",
threshold: float = 0.0) -> pd.DataFrame:
"""Compute correlation matrix for numeric columns.
Parameters
----------
method : str
Correlation method: 'pearson', 'spearman', or 'kendall'.
threshold : float
Only return pairs with |correlation| >= threshold.
Returns
-------
pd.DataFrame
Correlation matrix or filtered pairs table.
"""
...
def plot_distributions(self, cols: list[str] | None = None,
bins: int = 30) -> plt.Figure:
"""Plot histograms for numeric columns and bar charts for categoricals.
Parameters
----------
cols : list[str], optional
Columns to plot. If None, plots all columns (max 20).
bins : int
Number of histogram bins for numeric columns.
Returns
-------
matplotlib.figure.Figure
The figure object containing the subplot grid.
"""
...
def target_analysis(self) -> dict:
"""Analyze relationship between features and target column.
Returns
-------
dict
Keys: 'target_distribution' (value counts),
'numeric_correlations' (Series of correlations with target),
'categorical_associations' (dict of chi-squared p-values).
"""
...
def full_report(self, save_path: str | None = None) -> dict:
"""Run all analyses and optionally save to HTML.
Parameters
----------
save_path : str, optional
File path to save the HTML report.
Returns
-------
dict
Keys: 'summary', 'missing', 'correlations', 'target_analysis'.
"""
...
Usage:
from eda_report import EDAReport
report = EDAReport(df, target_col="churned")
stats = report.summary_stats()
report.plot_distributions(cols=["revenue", "tenure", "region"])
full = report.full_report(save_path="eda_output.html")
ChurnClassifier (Chapter 7)
End-to-end churn prediction pipeline that handles preprocessing, model training, evaluation, and feature importance analysis.
class ChurnClassifier:
"""End-to-end churn prediction pipeline.
Parameters
----------
numeric_features : list[str]
Names of numeric columns.
categorical_features : list[str]
Names of categorical columns.
model_type : str
One of 'logistic', 'random_forest', 'xgboost'.
random_state : int
Seed for reproducibility.
"""
def __init__(self, numeric_features: list[str],
categorical_features: list[str],
model_type: str = "random_forest",
random_state: int = 42) -> None:
self.numeric_features = numeric_features
self.categorical_features = categorical_features
self.model_type = model_type
self.random_state = random_state
self.pipeline = None
self.results = None
def build_pipeline(self, **model_params) -> "ChurnClassifier":
"""Construct the sklearn Pipeline with preprocessing and classifier.
Parameters
----------
**model_params
Keyword arguments passed to the classifier constructor.
Returns
-------
ChurnClassifier
Self, for method chaining.
"""
...
def train(self, X_train: pd.DataFrame,
y_train: pd.Series) -> "ChurnClassifier":
"""Fit the pipeline on training data.
Returns
-------
ChurnClassifier
Self, for method chaining.
"""
...
def evaluate(self, X_test: pd.DataFrame,
y_test: pd.Series) -> dict:
"""Evaluate model on test data.
Returns
-------
dict
Keys: 'accuracy', 'precision', 'recall', 'f1', 'roc_auc',
'confusion_matrix', 'classification_report'.
"""
...
def feature_importance(self, top_n: int = 15) -> pd.DataFrame:
"""Extract and rank feature importances.
Parameters
----------
top_n : int
Number of top features to return.
Returns
-------
pd.DataFrame
Columns: feature, importance. Sorted descending.
"""
...
def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
"""Return churn probabilities for new data.
Returns
-------
np.ndarray
One-dimensional array of shape (n_samples,) containing the positive-class
(churn) probability for each row. Note this differs from scikit-learn's
predict_proba convention, which returns a 2-D (n_samples, n_classes) array.
"""
...
def plot_roc_curve(self, X_test: pd.DataFrame,
y_test: pd.Series) -> plt.Figure:
"""Plot the ROC curve with AUC annotation.
Returns
-------
matplotlib.figure.Figure
"""
...
Usage:
from churn_classifier import ChurnClassifier
clf = ChurnClassifier(
numeric_features=["tenure", "monthly_charges", "total_charges"],
categorical_features=["contract", "payment_method"],
model_type="xgboost"
)
clf.build_pipeline(n_estimators=300, max_depth=6, learning_rate=0.1)
clf.train(X_train, y_train)
results = clf.evaluate(X_test, y_test)
print(f"ROC AUC: {results['roc_auc']:.3f}")
top_features = clf.feature_importance(top_n=10)
DemandForecaster (Chapter 8)
Time-series demand forecasting tool supporting multiple model types, automatic feature engineering from date components, and forecast visualization.
class DemandForecaster:
"""Time-series demand forecasting tool.
Parameters
----------
date_col : str
Name of the date/datetime column.
target_col : str
Name of the demand/quantity column.
freq : str
Frequency string (pandas offset alias): 'D' (daily), 'W' (weekly), 'ME' (month-end; the pandas 2.2+ alias for monthly).
model_type : str
One of 'linear', 'random_forest', 'xgboost'.
"""
def __init__(self, date_col: str, target_col: str,
freq: str = "ME",
model_type: str = "xgboost") -> None:
self.date_col = date_col
self.target_col = target_col
self.freq = freq
self.model_type = model_type
self.model = None
self.feature_names = None
def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Create time-series features: lags, rolling means, date parts.
Parameters
----------
df : pd.DataFrame
Must contain the date column and target column.
Returns
-------
pd.DataFrame
Original data augmented with engineered features.
"""
...
def train(self, df: pd.DataFrame,
test_size: float = 0.2) -> dict:
"""Train the forecasting model with time-aware split.
Parameters
----------
df : pd.DataFrame
Historical demand data.
test_size : float
Fraction of data reserved for testing (taken from end).
Returns
-------
dict
Keys: 'train_rmse', 'test_rmse', 'train_mape', 'test_mape'.
"""
...
def forecast(self, periods: int) -> pd.DataFrame:
"""Generate future demand forecasts.
Parameters
----------
periods : int
Number of future periods to forecast.
Returns
-------
pd.DataFrame
Columns: date, predicted_demand, lower_bound, upper_bound.
"""
...
def plot_forecast(self, historical: pd.DataFrame,
forecast: pd.DataFrame) -> plt.Figure:
"""Visualize historical data alongside forecast with confidence interval.
Returns
-------
matplotlib.figure.Figure
"""
...
Usage:
from demand_forecaster import DemandForecaster
forecaster = DemandForecaster(
date_col="order_date", target_col="units_sold",
freq="ME", model_type="xgboost"
)
metrics = forecaster.train(sales_df, test_size=0.2)
print(f"Test RMSE: {metrics['test_rmse']:,.0f}")
future = forecaster.forecast(periods=6)
forecaster.plot_forecast(sales_df, future)
CustomerSegmenter (Chapter 9)
RFM-based customer segmentation tool using K-Means clustering with automatic scaling, elbow analysis, and segment profiling.
class CustomerSegmenter:
"""RFM-based customer segmentation using K-Means.
Parameters
----------
n_segments : int
Number of customer segments to create.
features : list[str]
Columns to use for clustering (e.g., RFM features).
random_state : int
Seed for reproducibility.
"""
def __init__(self, n_segments: int = 4,
features: list[str] | None = None,
random_state: int = 42) -> None:
self.n_segments = n_segments
self.features = features
self.random_state = random_state
self.scaler = None
self.kmeans = None
self.segment_profiles = None
def compute_rfm(self, df: pd.DataFrame,
customer_col: str, date_col: str,
amount_col: str,
reference_date: str | None = None) -> pd.DataFrame:
"""Compute Recency, Frequency, Monetary values per customer.
Parameters
----------
df : pd.DataFrame
Transaction-level data.
customer_col : str
Column identifying the customer.
date_col : str
Transaction date column.
amount_col : str
Transaction amount column.
reference_date : str, optional
Date string for recency calculation. Defaults to max date + 1 day.
Returns
-------
pd.DataFrame
One row per customer with columns: recency, frequency, monetary.
"""
...
def find_optimal_k(self, X: pd.DataFrame,
k_range: range = range(2, 11)) -> plt.Figure:
"""Run elbow analysis and silhouette scoring.
Parameters
----------
X : pd.DataFrame
Feature matrix for clustering.
k_range : range
Range of k values to evaluate.
Returns
-------
matplotlib.figure.Figure
Dual-axis plot with inertia (elbow) and silhouette scores.
"""
...
def fit(self, X: pd.DataFrame) -> "CustomerSegmenter":
"""Scale features and fit K-Means.
Returns
-------
CustomerSegmenter
Self, for method chaining.
"""
...
def profile_segments(self, df: pd.DataFrame) -> pd.DataFrame:
"""Generate descriptive profiles for each segment.
Returns
-------
pd.DataFrame
Segment-level aggregation with mean feature values,
segment size, and percentage of total.
"""
...
def predict(self, X_new: pd.DataFrame) -> np.ndarray:
"""Assign new customers to existing segments.
Returns
-------
np.ndarray
Segment labels for each row.
"""
...
def plot_segments(self, df: pd.DataFrame,
x_col: str, y_col: str) -> plt.Figure:
"""Scatter plot of customers colored by segment.
Returns
-------
matplotlib.figure.Figure
"""
...
Usage:
from customer_segmenter import CustomerSegmenter
segmenter = CustomerSegmenter(n_segments=4)
rfm = segmenter.compute_rfm(
transactions, customer_col="customer_id",
date_col="purchase_date", amount_col="amount"
)
segmenter.find_optimal_k(rfm)
segmenter.fit(rfm)
profiles = segmenter.profile_segments(rfm)
print(profiles)
segmenter.plot_segments(rfm, x_col="recency", y_col="monetary")
RecommendationEngine (Chapter 10)
Hybrid recommendation engine supporting collaborative filtering, content-based filtering, and a weighted hybrid approach.
class RecommendationEngine:
"""Hybrid recommendation engine for product/content recommendations.
Parameters
----------
method : str
One of 'collaborative', 'content', 'hybrid'.
n_recommendations : int
Default number of recommendations to return.
collaborative_weight : float
Weight for collaborative filtering in hybrid mode (0.0 to 1.0).
Content weight = 1.0 - collaborative_weight.
"""
def __init__(self, method: str = "hybrid",
n_recommendations: int = 10,
collaborative_weight: float = 0.6) -> None:
self.method = method
self.n_recommendations = n_recommendations
self.collaborative_weight = collaborative_weight
self.user_item_matrix = None
self.similarity_matrix = None
self.content_features = None
def fit_collaborative(self, interactions: pd.DataFrame,
user_col: str, item_col: str,
rating_col: str) -> "RecommendationEngine":
"""Build user-item interaction matrix and compute similarity.
Parameters
----------
interactions : pd.DataFrame
User-item interaction data.
user_col : str
Column identifying the user.
item_col : str
Column identifying the item.
rating_col : str
Column with rating/interaction value.
Returns
-------
RecommendationEngine
Self, for method chaining.
"""
...
def fit_content(self, items: pd.DataFrame,
item_col: str,
feature_cols: list[str]) -> "RecommendationEngine":
"""Build content-based feature vectors and similarity.
Parameters
----------
items : pd.DataFrame
Item metadata.
item_col : str
Column identifying the item.
feature_cols : list[str]
Columns containing item features for similarity computation.
Returns
-------
RecommendationEngine
Self, for method chaining.
"""
...
def recommend(self, user_id: str,
n: int | None = None,
exclude_seen: bool = True) -> pd.DataFrame:
"""Generate recommendations for a user.
Parameters
----------
user_id : str
The user to generate recommendations for.
n : int, optional
Number of recommendations. Defaults to self.n_recommendations.
exclude_seen : bool
Whether to exclude items the user has already interacted with.
Returns
-------
pd.DataFrame
Columns: item_id, score, method.
Sorted by score descending.
"""
...
def similar_items(self, item_id: str,
n: int = 10) -> pd.DataFrame:
"""Find items similar to a given item.
Returns
-------
pd.DataFrame
Columns: item_id, similarity_score.
"""
...
def evaluate(self, test_interactions: pd.DataFrame,
k: int = 10) -> dict:
"""Evaluate recommendation quality.
Parameters
----------
test_interactions : pd.DataFrame
Held-out interaction data.
k : int
Number of recommendations to evaluate (precision@k, recall@k).
Returns
-------
dict
Keys: 'precision_at_k', 'recall_at_k', 'ndcg_at_k', 'coverage'.
"""
...
Usage:
from recommendation_engine import RecommendationEngine
engine = RecommendationEngine(method="hybrid", collaborative_weight=0.7)
engine.fit_collaborative(ratings, user_col="user_id",
item_col="product_id", rating_col="rating")
engine.fit_content(products, item_col="product_id",
feature_cols=["category", "brand", "price_tier"])
recs = engine.recommend(user_id="U1042", n=5)
print(recs)
similar = engine.similar_items(item_id="P2001", n=5)
metrics = engine.evaluate(test_ratings, k=10)
print(f"Precision@10: {metrics['precision_at_k']:.3f}")
ModelEvaluator (Chapter 11)
Comprehensive model evaluation toolkit that compares multiple models side-by-side with cross-validation, generates visual comparisons, and produces a summary report.
class ModelEvaluator:
"""Compare and evaluate multiple ML models side-by-side.
Parameters
----------
task : str
One of 'classification' or 'regression'.
cv_folds : int
Number of cross-validation folds.
scoring : str | list[str]
Scoring metric(s) for evaluation.
random_state : int
Seed for reproducibility.
"""
def __init__(self, task: str = "classification",
cv_folds: int = 5,
scoring: str | list[str] = "f1",
random_state: int = 42) -> None:
self.task = task
self.cv_folds = cv_folds
self.scoring = scoring
self.random_state = random_state
self.models = {}
self.results = {}
def add_model(self, name: str, model: object) -> "ModelEvaluator":
"""Register a model for evaluation.
Parameters
----------
name : str
Human-readable model name (e.g., "Random Forest").
model : object
A scikit-learn compatible estimator (or Pipeline).
Returns
-------
ModelEvaluator
Self, for method chaining.
"""
...
def run_evaluation(self, X: pd.DataFrame,
y: pd.Series) -> pd.DataFrame:
"""Run cross-validated evaluation on all registered models.
Returns
-------
pd.DataFrame
One row per model with mean and std for each scoring metric.
"""
...
def holdout_evaluation(self, X_train: pd.DataFrame,
y_train: pd.Series,
X_test: pd.DataFrame,
y_test: pd.Series) -> pd.DataFrame:
"""Train on training set and evaluate on held-out test set.
Returns
-------
pd.DataFrame
One row per model with test metrics.
"""
...
def plot_comparison(self, metric: str | None = None) -> plt.Figure:
"""Box plot comparing model performance across CV folds.
Parameters
----------
metric : str, optional
Metric to plot. Defaults to primary scoring metric.
Returns
-------
matplotlib.figure.Figure
"""
...
def best_model(self, metric: str | None = None) -> tuple[str, object]:
"""Return the name and fitted instance of the best-performing model.
Returns
-------
tuple[str, object]
(model_name, model_instance)
"""
...
def summary_report(self) -> str:
"""Generate a formatted text summary of all evaluations.
Returns
-------
str
Multi-line report with rankings, key metrics, and recommendation.
"""
...
Usage:
from model_evaluator import ModelEvaluator
evaluator = ModelEvaluator(task="classification", cv_folds=5, scoring=["f1", "roc_auc"])
evaluator.add_model("Logistic Regression", LogisticRegression(max_iter=1000))
evaluator.add_model("Random Forest", RandomForestClassifier(n_estimators=200))
evaluator.add_model("XGBoost", XGBClassifier(n_estimators=300, learning_rate=0.1))
cv_results = evaluator.run_evaluation(X, y)
print(cv_results)
evaluator.plot_comparison(metric="f1")
best_name, best_model = evaluator.best_model(metric="roc_auc")
print(f"Best model: {best_name}")
ReviewAnalyzer (Chapter 14)
NLP-powered customer review analysis tool that performs sentiment analysis, topic extraction, and trend detection on text data.
class ReviewAnalyzer:
"""NLP-powered customer review analysis tool.
Parameters
----------
text_col : str
Name of the column containing review text.
date_col : str, optional
Name of the date column for trend analysis.
rating_col : str, optional
Name of the numeric rating column.
"""
def __init__(self, text_col: str = "review_text",
date_col: str | None = None,
rating_col: str | None = None) -> None:
self.text_col = text_col
self.date_col = date_col
self.rating_col = rating_col
self.vectorizer = None
self.sentiment_analyzer = None
def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean and tokenize review text.
Returns
-------
pd.DataFrame
Original DataFrame with added 'clean_text' and 'tokens' columns.
"""
...
def analyze_sentiment(self, df: pd.DataFrame) -> pd.DataFrame:
"""Compute sentiment scores for each review.
Returns
-------
pd.DataFrame
Added columns: 'sentiment_score' (-1 to 1),
'sentiment_label' (positive/negative/neutral).
"""
...
def extract_topics(self, df: pd.DataFrame,
n_topics: int = 5,
n_words: int = 10,
method: str = "lda") -> dict:
"""Extract topics from the review corpus.
Parameters
----------
n_topics : int
Number of topics to extract.
n_words : int
Number of top words per topic.
method : str
Topic modeling method: 'lda' or 'nmf'.
Returns
-------
dict
Keys: 'topics' (list of word lists), 'topic_labels' (list of str),
'document_topics' (np.ndarray of shape (n_docs, n_topics)).
"""
...
def extract_keywords(self, df: pd.DataFrame,
top_n: int = 20,
by_sentiment: bool = False) -> pd.DataFrame:
"""Extract most important keywords using TF-IDF.
Parameters
----------
top_n : int
Number of keywords to return.
by_sentiment : bool
If True, return separate keyword lists for positive and negative.
Returns
-------
pd.DataFrame
Columns: keyword, tfidf_score, (sentiment_label if by_sentiment).
"""
...
def sentiment_trends(self, df: pd.DataFrame,
freq: str = "ME") -> pd.DataFrame:
"""Track sentiment over time.
Parameters
----------
freq : str
Pandas offset alias for time aggregation, e.g. 'D' (daily), 'W' (weekly), 'ME' (month-end).
Returns
-------
pd.DataFrame
Columns: date, avg_sentiment, review_count, pct_positive, pct_negative.
"""
...
def plot_sentiment_distribution(self, df: pd.DataFrame) -> plt.Figure:
"""Visualize the distribution of sentiment scores.
Returns
-------
matplotlib.figure.Figure
"""
...
def full_report(self, df: pd.DataFrame) -> dict:
"""Run all analyses and return combined results.
Returns
-------
dict
Keys: 'sentiment_summary', 'topics', 'keywords',
'trends' (if date_col provided), 'rating_sentiment_correlation'
(if rating_col provided).
"""
...
Usage:
from review_analyzer import ReviewAnalyzer
analyzer = ReviewAnalyzer(
text_col="review_text", date_col="review_date", rating_col="stars"
)
df = analyzer.preprocess(reviews_df)
df = analyzer.analyze_sentiment(df)
topics = analyzer.extract_topics(df, n_topics=5, method="lda")
for i, topic_words in enumerate(topics["topics"]):
print(f"Topic {i+1}: {', '.join(topic_words)}")
trends = analyzer.sentiment_trends(df, freq="ME")
report = analyzer.full_report(df)
PromptBuilder (Chapter 19)
Structured prompt construction tool for LLM interactions that supports templates, variable injection, few-shot examples, and system message configuration.
class PromptBuilder:
"""Structured prompt construction tool for LLM interactions.
Parameters
----------
model : str
LLM model identifier (e.g., 'gpt-4o', 'claude-sonnet-4-20250514').
default_temperature : float
Default temperature for completions.
"""
def __init__(self, model: str = "gpt-4o",
default_temperature: float = 0.3) -> None:
self.model = model
self.default_temperature = default_temperature
self.templates = {}
self.system_message = None
def set_system_message(self, message: str) -> "PromptBuilder":
"""Set the system message for all prompts.
Returns
-------
PromptBuilder
Self, for method chaining.
"""
...
def add_template(self, name: str, template: str,
required_vars: list[str] | None = None) -> "PromptBuilder":
"""Register a reusable prompt template.
Parameters
----------
name : str
Template identifier.
template : str
Template string with {variable} placeholders.
required_vars : list[str], optional
List of required variable names. Validated at render time.
Returns
-------
PromptBuilder
Self, for method chaining.
"""
...
def render(self, template_name: str,
**variables) -> str:
"""Render a template with the provided variables.
Parameters
----------
template_name : str
Name of a registered template.
**variables
Variable values to inject into the template.
Returns
-------
str
The rendered prompt string.
Raises
------
ValueError
If required variables are missing.
KeyError
If the template name is not registered.
"""
...
def build_messages(self, user_content: str,
few_shot_examples: list[dict] | None = None
) -> list[dict]:
"""Build a complete messages list for the chat API.
Parameters
----------
user_content : str
The user's prompt content.
few_shot_examples : list[dict], optional
List of {"user": ..., "assistant": ...} example pairs.
Returns
-------
list[dict]
Messages list ready for API call.
"""
...
def estimate_tokens(self, text: str) -> int:
"""Estimate token count for a given text.
Returns
-------
int
Approximate token count (using word-based heuristic).
"""
...
def validate_prompt(self, prompt: str,
max_tokens: int = 4096) -> dict:
"""Check prompt for common issues.
Returns
-------
dict
Keys: 'is_valid' (bool), 'estimated_tokens' (int),
'warnings' (list[str]).
"""
...
Usage:
from prompt_builder import PromptBuilder
builder = PromptBuilder(model="gpt-4o", default_temperature=0.3)
builder.set_system_message("You are a financial analyst specializing in SaaS metrics.")
builder.add_template(
"analysis",
"Analyze the following quarterly data for {company}:\n\n{data}\n\n"
"Focus on: {focus_areas}\n\nProvide your analysis in a structured format.",
required_vars=["company", "data", "focus_areas"]
)
prompt = builder.render(
"analysis",
company="Acme SaaS",
data=quarterly_summary,
focus_areas="churn rate trends, expansion revenue, CAC payback"
)
messages = builder.build_messages(prompt, few_shot_examples=[
{"user": "Analyze Q1 data...", "assistant": "## Q1 Analysis\n..."}
])
PromptChain (Chapter 20)
Orchestration tool for multi-step LLM workflows that chains prompts together, passing outputs from one step as inputs to the next.
class PromptChain:
"""Orchestrate multi-step LLM workflows.
Parameters
----------
client : object
An initialized LLM client (OpenAI or Anthropic).
model : str
Model identifier for all steps.
verbose : bool
If True, print intermediate results.
"""
def __init__(self, client: object,
model: str = "gpt-4o",
verbose: bool = False) -> None:
self.client = client
self.model = model
self.verbose = verbose
self.steps = []
self.results = {}
def add_step(self, name: str, prompt_template: str,
input_map: dict[str, str] | None = None,
temperature: float = 0.3,
max_tokens: int = 1000,
parser: Callable[[str], str] | None = None) -> "PromptChain":
"""Add a step to the chain.
Parameters
----------
name : str
Unique step identifier.
prompt_template : str
Prompt with {variable} placeholders.
input_map : dict[str, str], optional
Mapping of template variables to previous step names.
E.g., {"summary": "step_1"} fills {summary} with step_1's output.
temperature : float
Temperature for this step.
max_tokens : int
Max tokens for this step.
parser : Callable[[str], str], optional
Function to post-process the step's raw output (takes the raw
string, returns the parsed/cleaned string stored as the step result).
Returns
-------
PromptChain
Self, for method chaining.
"""
...
def run(self, initial_inputs: dict[str, str] | None = None) -> dict:
"""Execute all steps sequentially.
Parameters
----------
initial_inputs : dict[str, str], optional
Variables available to the first step(s).
Returns
-------
dict
Keys are step names, values are step outputs (post-parsing).
"""
...
def get_result(self, step_name: str) -> str:
"""Retrieve the output of a specific step.
Returns
-------
str
The output of the named step.
Raises
------
KeyError
If the step has not been executed.
"""
...
def retry_step(self, step_name: str,
temperature: float | None = None) -> str:
"""Re-execute a single step (e.g., if output was unsatisfactory).
Parameters
----------
step_name : str
The step to re-run.
temperature : float, optional
Override temperature for the retry.
Returns
-------
str
New output for the step.
"""
...
def total_tokens_used(self) -> dict:
"""Return token usage across all steps.
Returns
-------
dict
Keys: 'prompt_tokens', 'completion_tokens', 'total_tokens',
'estimated_cost_usd'.
"""
...
Usage:
from prompt_chain import PromptChain
from openai import OpenAI
chain = PromptChain(client=OpenAI(), model="gpt-4o", verbose=True)
chain.add_step(
name="extract",
prompt_template="Extract key financial metrics from this report:\n\n{report}",
temperature=0.1, max_tokens=500
)
chain.add_step(
name="analyze",
prompt_template="Given these metrics:\n{extract}\n\nIdentify the top 3 risks.",
input_map={"extract": "extract"},
temperature=0.3, max_tokens=800
)
chain.add_step(
name="recommend",
prompt_template="Based on this risk analysis:\n{analyze}\n\n"
"Recommend specific actions for the executive team.",
input_map={"analyze": "analyze"},
temperature=0.5, max_tokens=1000
)
results = chain.run(initial_inputs={"report": quarterly_report_text})
print(results["recommend"])
print(chain.total_tokens_used())
BiasDetector (Chapter 25)
Automated tool for detecting and reporting bias in ML model predictions across protected attributes, integrating Fairlearn metrics with custom visualization and reporting.
class BiasDetector:
"""Detect and report bias in ML model predictions.
Parameters
----------
sensitive_features : list[str]
Names of protected attribute columns (e.g., ['gender', 'race', 'age_group']).
reference_groups : dict[str, str], optional
Mapping of feature name to the reference group for ratio calculations.
E.g., {'gender': 'Male', 'race': 'White'}.
fairness_threshold : float
Threshold for the four-fifths rule (default 0.8).
"""
def __init__(self, sensitive_features: list[str],
reference_groups: dict[str, str] | None = None,
fairness_threshold: float = 0.8) -> None:
self.sensitive_features = sensitive_features
self.reference_groups = reference_groups or {}
self.fairness_threshold = fairness_threshold
self.audit_results = {}
def audit(self, y_true: pd.Series, y_pred: pd.Series,
sensitive_data: pd.DataFrame) -> dict:
"""Run a comprehensive bias audit across all sensitive features.
Parameters
----------
y_true : pd.Series
Ground truth labels.
y_pred : pd.Series
Model predictions.
sensitive_data : pd.DataFrame
DataFrame containing the sensitive feature columns.
Returns
-------
dict
Nested dict: {feature: {metric: value}} including:
- demographic_parity_difference
- demographic_parity_ratio
- equalized_odds_difference
- group_accuracy, group_precision, group_recall, group_f1
- four_fifths_rule_pass (bool)
"""
...
def group_metrics(self, y_true: pd.Series, y_pred: pd.Series,
sensitive_data: pd.DataFrame,
feature: str) -> pd.DataFrame:
"""Compute detailed metrics broken down by group.
Returns
-------
pd.DataFrame
One row per group with accuracy, precision, recall, F1,
selection_rate, and sample_size.
"""
...
def plot_disparities(self, feature: str | None = None) -> plt.Figure:
"""Visualize metric disparities across groups.
Parameters
----------
feature : str, optional
Sensitive feature to plot. If None, plots all features.
Returns
-------
matplotlib.figure.Figure
"""
...
def four_fifths_test(self, y_pred: pd.Series,
sensitive_data: pd.DataFrame,
feature: str) -> dict:
"""Apply the four-fifths (80%) rule test.
Returns
-------
dict
Keys: 'passes' (bool), 'selection_rates' (dict),
'adverse_impact_ratio' (float), 'disadvantaged_groups' (list).
"""
...
def generate_report(self, format: str = "text") -> str:
"""Generate a formatted bias audit report.
Parameters
----------
format : str
Output format: 'text', 'markdown', or 'html'.
Returns
-------
str
Complete bias audit report.
"""
...
def suggest_mitigations(self) -> list[dict]:
"""Suggest bias mitigation strategies based on audit results.
Returns
-------
list[dict]
Each dict has keys: 'issue', 'severity' ('high'/'medium'/'low'),
'strategy', 'implementation_notes'.
"""
...
Usage:
from bias_detector import BiasDetector
detector = BiasDetector(
sensitive_features=["gender", "age_group"],
reference_groups={"gender": "Male", "age_group": "30-50"},
fairness_threshold=0.8
)
audit = detector.audit(y_test, y_pred, df_test[["gender", "age_group"]])
print(f"Demographic parity (gender): {audit['gender']['demographic_parity_difference']:.3f}")
group_detail = detector.group_metrics(y_test, y_pred, df_test, feature="gender")
print(group_detail)
detector.plot_disparities(feature="gender")
report = detector.generate_report(format="markdown")
mitigations = detector.suggest_mitigations()
ExplainabilityDashboard (Chapter 26)
Unified explainability interface that wraps SHAP and LIME into a single API, producing global and local explanations along with interactive visualizations.
class ExplainabilityDashboard:
    """Unified model explainability interface combining SHAP and LIME.

    Wraps both libraries behind one API, producing global and local
    explanations along with interactive visualizations.

    Parameters
    ----------
    model : object
        A fitted scikit-learn compatible model or pipeline.
    X_train : pd.DataFrame
        Training data (used as background for SHAP and LIME).
    feature_names : list[str]
        Feature names for display.
    task : str
        One of 'classification' or 'regression'.
    class_names : list[str], optional
        Class labels for classification tasks.
    """

    def __init__(self, model: object,
                 X_train: pd.DataFrame,
                 feature_names: list[str],
                 task: str = "classification",
                 class_names: list[str] | None = None) -> None:
        self.model = model
        self.X_train = X_train
        self.feature_names = feature_names
        self.task = task
        self.class_names = class_names
        # Explainers and cached SHAP values are created lazily on first use.
        self.shap_explainer = None
        self.lime_explainer = None
        self.shap_values = None

    def compute_shap(self, X: pd.DataFrame,
                     method: str = "auto") -> np.ndarray:
        """Compute SHAP values for the given data.

        Parameters
        ----------
        X : pd.DataFrame
            Data to explain.
        method : str
            Explainer type: 'auto', 'tree', 'kernel', 'linear'.
            'auto' selects based on model type.

        Returns
        -------
        np.ndarray
            SHAP values array of shape (n_samples, n_features).
        """
        ...

    def global_importance(self, X: pd.DataFrame,
                          top_n: int = 15) -> pd.DataFrame:
        """Compute global feature importance from SHAP values.

        Returns
        -------
        pd.DataFrame
            Columns: feature, mean_abs_shap. Sorted descending.
        """
        ...

    def local_explanation(self, instance: pd.Series,
                          method: str = "shap",
                          num_features: int = 10) -> dict:
        """Explain a single prediction.

        Parameters
        ----------
        instance : pd.Series
            Single row of features.
        method : str
            Explanation method: 'shap', 'lime', or 'both'.
        num_features : int
            Number of top features to include.

        Returns
        -------
        dict
            Keys: 'prediction', 'probability' (classification),
            'feature_contributions' (list of (feature, value, contribution)),
            'base_value' (SHAP expected value).
        """
        ...

    def plot_global(self, X: pd.DataFrame,
                    plot_type: str = "summary") -> plt.Figure:
        """Create global explanation visualizations.

        Parameters
        ----------
        plot_type : str
            One of 'summary' (beeswarm), 'bar', 'heatmap'.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_local(self, instance: pd.Series,
                   plot_type: str = "waterfall") -> plt.Figure:
        """Create local explanation visualization for one prediction.

        Parameters
        ----------
        plot_type : str
            One of 'waterfall', 'force', 'bar'.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def feature_dependence(self, feature: str,
                           interaction_feature: str | None = None,
                           X: pd.DataFrame | None = None) -> plt.Figure:
        """SHAP dependence plot for a single feature.

        Parameters
        ----------
        feature : str
            Feature to plot on x-axis.
        interaction_feature : str, optional
            Feature for color-coding interaction effects.
        X : pd.DataFrame, optional
            Data to use. If None, uses previously computed values.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def compare_explanations(self, instance: pd.Series,
                             top_n: int = 10) -> pd.DataFrame:
        """Compare SHAP and LIME explanations side-by-side for one instance.

        Returns
        -------
        pd.DataFrame
            Columns: feature, shap_contribution, lime_contribution, agreement.
        """
        ...
Usage:
from explainability_dashboard import ExplainabilityDashboard
# Build the dashboard around an already-fitted pipeline and its training data.
dashboard = ExplainabilityDashboard(
model=fitted_pipeline,
X_train=X_train,
feature_names=feature_names,
task="classification",
class_names=["Retained", "Churned"]
)
# Global explanations
importance = dashboard.global_importance(X_test, top_n=15)
dashboard.plot_global(X_test, plot_type="summary")
# Local explanation for a single customer
explanation = dashboard.local_explanation(X_test.iloc[0], method="both")
dashboard.plot_local(X_test.iloc[0], plot_type="waterfall")
# Compare SHAP vs LIME
comparison = dashboard.compare_explanations(X_test.iloc[0])
print(comparison)
AIROICalculator (Chapter 34)
Financial modeling tool for calculating the return on investment of AI/ML initiatives, incorporating development costs, operational costs, productivity gains, and risk-adjusted projections.
class AIROICalculator:
    """Calculate ROI for AI/ML business initiatives.

    Models development and operational costs against projected benefits,
    with optional risk-adjusted and sensitivity analyses.

    Parameters
    ----------
    project_name : str
        Name of the AI project.
    time_horizon_years : int
        Number of years for the projection.
    discount_rate : float
        Annual discount rate for NPV calculations.
    """

    def __init__(self, project_name: str,
                 time_horizon_years: int = 3,
                 discount_rate: float = 0.10) -> None:
        self.project_name = project_name
        self.time_horizon_years = time_horizon_years
        self.discount_rate = discount_rate
        # Cost/benefit/risk line items are registered via the add_*/set_* methods.
        self.costs = {}
        self.benefits = {}
        self.risk_factors = {}

    def add_cost(self, category: str, year: int,
                 amount: float,
                 recurring: bool = False) -> "AIROICalculator":
        """Add a cost item.

        Parameters
        ----------
        category : str
            Cost category (e.g., 'development', 'infrastructure',
            'talent', 'data_acquisition', 'maintenance').
        year : int
            Year of the expenditure (0 = initial investment).
        amount : float
            Dollar amount.
        recurring : bool
            If True, repeats every year from 'year' onward.

        Returns
        -------
        AIROICalculator
            Self, for method chaining.
        """
        ...

    def add_benefit(self, category: str, year: int,
                    amount: float,
                    growth_rate: float = 0.0) -> "AIROICalculator":
        """Add a benefit/revenue item.

        Parameters
        ----------
        category : str
            Benefit category (e.g., 'revenue_increase', 'cost_savings',
            'productivity_gain', 'error_reduction').
        year : int
            First year the benefit is realized.
        amount : float
            Dollar amount in the first year.
        growth_rate : float
            Annual growth rate for the benefit (e.g., 0.1 for 10% growth).

        Returns
        -------
        AIROICalculator
            Self, for method chaining.
        """
        ...

    def set_risk_factor(self, scenario: str,
                        probability: float,
                        impact_multiplier: float) -> "AIROICalculator":
        """Define a risk scenario.

        Parameters
        ----------
        scenario : str
            Risk scenario name (e.g., 'adoption_delay', 'data_quality_issues').
        probability : float
            Estimated probability (0.0 to 1.0).
        impact_multiplier : float
            Multiplier on benefits (e.g., 0.5 means benefits cut in half).

        Returns
        -------
        AIROICalculator
            Self, for method chaining.
        """
        ...

    def calculate(self) -> dict:
        """Compute base-case ROI metrics.

        Returns
        -------
        dict
            Keys: 'total_costs', 'total_benefits', 'net_benefit',
            'roi_pct', 'npv', 'irr', 'payback_period_years',
            'yearly_cashflows' (list of dicts).
        """
        ...

    def risk_adjusted_calculate(self) -> dict:
        """Compute expected-value ROI incorporating risk factors.

        Returns
        -------
        dict
            Same keys as calculate() plus 'expected_npv',
            'risk_adjusted_roi_pct', 'scenario_analysis' (list of dicts).
        """
        ...

    def sensitivity_analysis(self, variable: str,
                             range_pct: float = 0.3,
                             steps: int = 10) -> pd.DataFrame:
        """Run sensitivity analysis on a single variable.

        Parameters
        ----------
        variable : str
            Variable to vary (e.g., 'discount_rate', a cost category, a benefit).
        range_pct : float
            Percentage range to vary the variable (+/- range_pct).
        steps : int
            Number of steps in the range.

        Returns
        -------
        pd.DataFrame
            Columns: variable_value, npv, roi_pct.
        """
        ...

    def plot_cashflows(self) -> plt.Figure:
        """Visualize projected cash flows over time.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_sensitivity(self, variable: str) -> plt.Figure:
        """Tornado/sensitivity chart for the specified variable.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def executive_summary(self) -> str:
        """Generate a plain-English executive summary.

        Returns
        -------
        str
            Formatted summary suitable for a slide deck or memo.
        """
        ...
Usage:
from ai_roi_calculator import AIROICalculator
# End-to-end ROI workflow: register costs/benefits/risks, then calculate.
calc = AIROICalculator("Customer Churn Prediction", time_horizon_years=3, discount_rate=0.10)
# Costs
calc.add_cost("development", year=0, amount=150_000)
calc.add_cost("infrastructure", year=0, amount=50_000)
calc.add_cost("infrastructure", year=1, amount=30_000, recurring=True)
calc.add_cost("talent", year=1, amount=120_000, recurring=True)
# Benefits
calc.add_benefit("churn_reduction", year=1, amount=400_000, growth_rate=0.15)
calc.add_benefit("upsell_revenue", year=1, amount=100_000, growth_rate=0.10)
# Risks
calc.set_risk_factor("adoption_delay", probability=0.3, impact_multiplier=0.6)
calc.set_risk_factor("data_quality", probability=0.2, impact_multiplier=0.8)
# Calculate
results = calc.calculate()
print(f"NPV: ${results['npv']:,.0f}")
print(f"ROI: {results['roi_pct']:.1f}%")
print(f"Payback: {results['payback_period_years']:.1f} years")
risk_results = calc.risk_adjusted_calculate()
print(f"Risk-Adjusted NPV: ${risk_results['expected_npv']:,.0f}")
calc.plot_cashflows()
print(calc.executive_summary())
AIMaturityAssessment (Chapter 39)
Diagnostic tool that evaluates an organization's AI readiness across multiple dimensions and produces a maturity scorecard with actionable recommendations.
class AIMaturityAssessment:
    """Evaluate organizational AI maturity across key dimensions.

    Produces a maturity scorecard, gap analysis, and prioritized
    recommendations.

    Parameters
    ----------
    organization_name : str
        Name of the organization being assessed.
    industry : str, optional
        Industry vertical for benchmark comparisons.
    """

    # The eight assessed dimensions; score_dimension() validates against this list.
    DIMENSIONS = [
        "strategy_and_vision",
        "data_infrastructure",
        "talent_and_skills",
        "technology_stack",
        "governance_and_ethics",
        "organizational_culture",
        "use_case_portfolio",
        "measurement_and_value"
    ]

    # Mapping from numeric score (1-5) to maturity level label.
    MATURITY_LEVELS = {
        1: "Initial",
        2: "Developing",
        3: "Defined",
        4: "Managed",
        5: "Optimizing"
    }

    def __init__(self, organization_name: str,
                 industry: str | None = None) -> None:
        self.organization_name = organization_name
        self.industry = industry
        # Populated dimension-by-dimension via score_dimension().
        self.scores = {}
        self.evidence = {}
        self.benchmarks = {}

    def score_dimension(self, dimension: str, score: int,
                        evidence: str = "",
                        sub_scores: dict[str, int] | None = None
                        ) -> "AIMaturityAssessment":
        """Record a maturity score for one dimension.

        Parameters
        ----------
        dimension : str
            One of the DIMENSIONS listed above.
        score : int
            Maturity level (1-5).
        evidence : str
            Supporting evidence or justification for the score.
        sub_scores : dict[str, int], optional
            Granular sub-dimension scores.

        Returns
        -------
        AIMaturityAssessment
            Self, for method chaining.

        Raises
        ------
        ValueError
            If dimension is not recognized or score is not 1-5.
        """
        ...

    def overall_score(self) -> float:
        """Compute the weighted average maturity score.

        Returns
        -------
        float
            Overall maturity score (1.0 to 5.0).
        """
        ...

    def maturity_level(self) -> str:
        """Return the overall maturity level label.

        Returns
        -------
        str
            One of: 'Initial', 'Developing', 'Defined', 'Managed', 'Optimizing'.
        """
        ...

    def gap_analysis(self, target_level: int = 4) -> pd.DataFrame:
        """Identify gaps between current scores and target level.

        Parameters
        ----------
        target_level : int
            The desired maturity level for each dimension.

        Returns
        -------
        pd.DataFrame
            Columns: dimension, current_score, target, gap, priority.
            Sorted by gap descending.
        """
        ...

    def recommendations(self, top_n: int = 5) -> list[dict]:
        """Generate prioritized recommendations based on gaps.

        Parameters
        ----------
        top_n : int
            Number of top recommendations to return.

        Returns
        -------
        list[dict]
            Each dict has keys: 'dimension', 'recommendation',
            'expected_impact' ('high'/'medium'/'low'),
            'effort' ('high'/'medium'/'low'),
            'timeline' (str), 'quick_win' (bool).
        """
        ...

    def plot_radar(self, target_level: int = 4) -> plt.Figure:
        """Radar/spider chart of current vs. target maturity scores.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_heatmap(self) -> plt.Figure:
        """Heatmap of all dimension and sub-dimension scores.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def generate_report(self, format: str = "markdown") -> str:
        """Generate a comprehensive maturity assessment report.

        Parameters
        ----------
        format : str
            Output format: 'text', 'markdown', or 'html'.

        Returns
        -------
        str
            Complete assessment report with scores, gaps, and recommendations.
        """
        ...
Usage:
from ai_maturity import AIMaturityAssessment
# Score every dimension with supporting evidence, then review gaps and
# recommendations. (Indentation of the continuation lines and the final
# for-loop restored; the original text was not valid Python.)
assessment = AIMaturityAssessment("Acme Corp", industry="financial_services")
assessment.score_dimension("strategy_and_vision", score=3,
                           evidence="AI strategy exists but not integrated with business strategy")
assessment.score_dimension("data_infrastructure", score=2,
                           evidence="Data warehouse exists but siloed; no feature store")
assessment.score_dimension("talent_and_skills", score=2,
                           evidence="Small data science team; limited ML engineering capability")
assessment.score_dimension("technology_stack", score=3,
                           evidence="Cloud-based ML platform in pilot stage")
assessment.score_dimension("governance_and_ethics", score=1,
                           evidence="No formal AI governance framework")
assessment.score_dimension("organizational_culture", score=2,
                           evidence="Pockets of data-driven culture; leadership buy-in varies")
assessment.score_dimension("use_case_portfolio", score=3,
                           evidence="5 production models; pipeline of 10+ candidates")
assessment.score_dimension("measurement_and_value", score=2,
                           evidence="Ad hoc ROI tracking; no standardized value framework")
print(f"Overall Score: {assessment.overall_score():.1f}/5.0")
print(f"Maturity Level: {assessment.maturity_level()}")
gaps = assessment.gap_analysis(target_level=4)
print(gaps)
recs = assessment.recommendations(top_n=3)
for r in recs:
    print(f"- [{r['expected_impact'].upper()}] {r['recommendation']}")
assessment.plot_radar()
print(assessment.generate_report(format="markdown"))
TransformationRoadmapGenerator (Chapter 39)
Strategic planning tool that takes AI maturity assessment results and generates a phased implementation roadmap with milestones, resource estimates, and dependencies.
class TransformationRoadmapGenerator:
    """Generate a phased AI transformation roadmap.

    Takes AI maturity assessment results and produces a phased
    implementation plan with milestones, resource estimates, and
    dependencies.

    Parameters
    ----------
    assessment : AIMaturityAssessment
        A completed maturity assessment.
    target_timeline_months : int
        Total timeline for the transformation in months.
    budget_constraint : float, optional
        Total budget in dollars. If provided, phases are budget-aware.
    """

    def __init__(self, assessment: "AIMaturityAssessment",
                 target_timeline_months: int = 24,
                 budget_constraint: float | None = None) -> None:
        self.assessment = assessment
        self.target_timeline_months = target_timeline_months
        self.budget_constraint = budget_constraint
        # Filled by generate_phases() / add_initiative() / auto_generate_initiatives().
        self.phases = []
        self.initiatives = []

    def generate_phases(self, n_phases: int = 3) -> list[dict]:
        """Create transformation phases based on gap analysis.

        Parameters
        ----------
        n_phases : int
            Number of phases (typically 3: Foundation, Scale, Optimize).

        Returns
        -------
        list[dict]
            Each dict has keys: 'phase_name', 'phase_number',
            'start_month', 'end_month', 'focus_dimensions',
            'objectives' (list[str]), 'target_scores' (dict).
        """
        ...

    def add_initiative(self, name: str, phase: int,
                       dimension: str, description: str,
                       estimated_cost: float,
                       estimated_months: int,
                       dependencies: list[str] | None = None,
                       kpis: list[str] | None = None) -> "TransformationRoadmapGenerator":
        """Add a specific initiative to the roadmap.

        Parameters
        ----------
        name : str
            Initiative name.
        phase : int
            Phase number (1-indexed).
        dimension : str
            Primary maturity dimension this initiative addresses.
        description : str
            Brief description.
        estimated_cost : float
            Estimated cost in dollars.
        estimated_months : int
            Estimated duration in months.
        dependencies : list[str], optional
            Names of prerequisite initiatives.
        kpis : list[str], optional
            Key performance indicators for this initiative.

        Returns
        -------
        TransformationRoadmapGenerator
            Self, for method chaining.
        """
        ...

    def auto_generate_initiatives(self) -> "TransformationRoadmapGenerator":
        """Automatically generate initiatives based on gap analysis.

        Uses built-in best-practice templates mapped to each
        dimension-gap combination.

        Returns
        -------
        TransformationRoadmapGenerator
            Self, for method chaining.
        """
        ...

    def validate_dependencies(self) -> list[str]:
        """Check for circular dependencies and missing prerequisites.

        Returns
        -------
        list[str]
            List of warning messages. Empty list if valid.
        """
        ...

    def plot_gantt(self) -> plt.Figure:
        """Generate a Gantt chart of the roadmap.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def plot_investment_timeline(self) -> plt.Figure:
        """Stacked bar chart of investment by dimension per phase.

        Returns
        -------
        matplotlib.figure.Figure
        """
        ...

    def generate_roadmap(self, format: str = "markdown") -> str:
        """Generate the complete transformation roadmap document.

        Parameters
        ----------
        format : str
            Output format: 'text', 'markdown', or 'html'.

        Returns
        -------
        str
            Complete roadmap with phases, initiatives, timelines,
            budgets, KPIs, and dependencies.
        """
        ...

    def executive_presentation(self) -> list[dict]:
        """Generate slide-ready content for executive presentation.

        Returns
        -------
        list[dict]
            Each dict represents a slide with keys: 'title',
            'bullet_points' (list[str]), 'chart_type' (str or None),
            'data' (dict or None).
        """
        ...
Usage:
from transformation_roadmap import TransformationRoadmapGenerator
# Generate phases, auto-populate initiatives, add a custom one, validate,
# and produce the deliverables. (Loop and if-block indentation restored;
# the original text was not valid Python.)
roadmap_gen = TransformationRoadmapGenerator(
    assessment=assessment,
    target_timeline_months=24,
    budget_constraint=2_000_000
)
phases = roadmap_gen.generate_phases(n_phases=3)
for phase in phases:
    print(f"Phase {phase['phase_number']}: {phase['phase_name']} "
          f"(Months {phase['start_month']}-{phase['end_month']})")
roadmap_gen.auto_generate_initiatives()
# Add a custom initiative
roadmap_gen.add_initiative(
    name="Establish AI Ethics Board",
    phase=1,
    dimension="governance_and_ethics",
    description="Form cross-functional ethics review board for all AI projects",
    estimated_cost=50_000,
    estimated_months=3,
    kpis=["Board formed", "Review process documented", "First 3 projects reviewed"]
)
warnings = roadmap_gen.validate_dependencies()
if warnings:
    for w in warnings:
        print(f"WARNING: {w}")
roadmap_gen.plot_gantt()
roadmap_gen.plot_investment_timeline()
roadmap_doc = roadmap_gen.generate_roadmap(format="markdown")
print(roadmap_doc)
slides = roadmap_gen.executive_presentation()
print(f"Generated {len(slides)} slides for executive review")
Quick Index
| Tool / Library | Section | Primary Chapters |
|---|---|---|
| Python basics | A.1 | 1--4 |
| pandas | A.2 | 4--40 |
| matplotlib | A.3 | 5--40 |
| seaborn | A.3 | 5--40 |
| scikit-learn | A.4 | 6--12, 25--26 |
| NLTK | A.5 | 13--15 |
| spaCy | A.5 | 14--15 |
| TF-IDF | A.5 | 14 |
| OpenAI API | A.6 | 17--22, 30 |
| Anthropic API | A.6 | 18--22 |
| Fairlearn | A.7 | 25 |
| SHAP | A.7 | 26 |
| LIME | A.7 | 26 |
| EDAReport | A.8 | 5 |
| ChurnClassifier | A.8 | 7 |
| DemandForecaster | A.8 | 8 |
| CustomerSegmenter | A.8 | 9 |
| RecommendationEngine | A.8 | 10 |
| ModelEvaluator | A.8 | 11 |
| ReviewAnalyzer | A.8 | 14 |
| PromptBuilder | A.8 | 19 |
| PromptChain | A.8 | 20 |
| BiasDetector | A.8 | 25 |
| ExplainabilityDashboard | A.8 | 26 |
| AIROICalculator | A.8 | 34 |
| AIMaturityAssessment | A.8 | 39 |
| TransformationRoadmapGenerator | A.8 | 39 |