Case Study 2: Implementing a Responsible AI Framework
Context
A mid-sized technology company is building an AI-assisted resume screening tool that scores candidates on their likelihood of performing well in the role, based on features extracted from their resumes. Given the sensitive nature of employment decisions and the applicable regulatory requirements (the EU AI Act classifies AI systems used in recruitment and candidate evaluation as high-risk), the company must implement a comprehensive responsible AI framework.
This case study demonstrates how to integrate fairness, transparency, privacy, and accountability considerations across the entire ML pipeline.
Requirements
The responsible AI framework must address four requirements:

1. Fairness: The model must not discriminate based on gender or ethnicity.
2. Transparency: Hiring managers must receive explanations for each recommendation.
3. Privacy: Candidate data must be protected (a minimal pseudonymization sketch follows this list).
4. Accountability: All decisions must be documented and auditable.
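The implementation below concentrates on the fairness, transparency, and accountability requirements. For the privacy requirement, one minimal sketch (assumed here for illustration, not part of the pipeline code that follows) is to pseudonymize direct identifiers with a salted hash before any record enters the training pipeline; the pseudonymize helper and the in-code salt handling are illustrative only.

import hashlib
import os

def pseudonymize(candidate_id: str, salt: bytes) -> str:
    """Replace a direct identifier with a salted SHA-256 pseudonym."""
    return hashlib.sha256(salt + candidate_id.encode("utf-8")).hexdigest()

# Illustrative usage: in production the salt would live in a secrets manager,
# and raw identifiers would never be stored alongside the resume features.
salt = os.urandom(16)
record = {"candidate_id": pseudonymize("cand-00123", salt), "years_experience": 7}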
Implementation
Step 1: Data Generation and Analysis
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
torch.manual_seed(42)
np.random.seed(42)
def create_hiring_dataset(
num_samples: int = 3000,
) -> dict[str, torch.Tensor | list[str]]:
"""Create synthetic hiring dataset with potential bias.
Features represent resume characteristics. Gender is a protected
attribute that should NOT influence hiring decisions.
"""
feature_names = [
"years_experience", "education_level", "skill_match_score",
"projects_completed", "certifications", "gpa", "leadership_score",
"communication_score",
]
gender = torch.bernoulli(torch.full((num_samples,), 0.4)).long()
# Features: some correlated with gender due to historical patterns
exp = torch.where(
gender == 0,
torch.normal(8, 3, (num_samples,)),
torch.normal(6, 3, (num_samples,)),
).clamp(0, 30)
edu = torch.where(
gender == 0,
torch.normal(3.5, 0.8, (num_samples,)),
torch.normal(3.3, 0.9, (num_samples,)),
).clamp(1, 5)
skills = torch.normal(0.6, 0.15, (num_samples,)).clamp(0, 1)
projects = torch.poisson(torch.full((num_samples,), 4.0)).clamp(0, 20)
certs = torch.poisson(torch.full((num_samples,), 2.0)).clamp(0, 10)
gpa = torch.normal(3.3, 0.4, (num_samples,)).clamp(2.0, 4.0)
leadership = torch.normal(0.5, 0.2, (num_samples,)).clamp(0, 1)
communication = torch.normal(0.6, 0.2, (num_samples,)).clamp(0, 1)
# Normalize features
features = torch.stack([
exp / 15, edu / 5, skills, projects / 10,
certs / 5, gpa / 4, leadership, communication,
], dim=1)
# True performance (gender-independent)
performance = (
0.3 * skills + 0.2 * (exp / 15) + 0.15 * (gpa / 4)
+ 0.15 * leadership + 0.1 * communication
+ 0.1 * (projects / 10)
+ 0.2 * torch.randn(num_samples)
)
label = (performance > performance.median()).long()
return {
"features": features,
"labels": label,
"gender": gender,
"feature_names": feature_names,
}
def data_profile(data: dict) -> None:
"""Print a data profile with group-level statistics."""
X, y, g = data["features"], data["labels"], data["gender"]
names = data["feature_names"]
print("Dataset Profile")
print("=" * 50)
print(f"Total samples: {len(X)}")
print(f"Group 0 (male): {(g == 0).sum().item()}")
print(f"Group 1 (female): {(g == 1).sum().item()}")
print(f"Positive rate Group 0: {y[g == 0].float().mean():.3f}")
print(f"Positive rate Group 1: {y[g == 1].float().mean():.3f}")
print(f"\n{'Feature':>25s} {'Group 0 Mean':>12s} {'Group 1 Mean':>12s} {'Gap':>8s}")
print("-" * 60)
for i, name in enumerate(names):
m0 = X[g == 0, i].mean().item()
m1 = X[g == 1, i].mean().item()
gap = abs(m0 - m1)
print(f"{name:>25s} {m0:12.4f} {m1:12.4f} {gap:8.4f}")
Step 2: Standard Model (Biased Baseline)
class HiringModel(nn.Module):
"""Standard hiring model without fairness constraints."""
def __init__(self, input_dim: int = 8) -> None:
super().__init__()
self.net = nn.Sequential(
nn.Linear(input_dim, 32), nn.ReLU(),
nn.Linear(32, 16), nn.ReLU(),
nn.Linear(16, 2),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.net(x)
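A quick shape sanity check for the baseline model; the batch of 4 random candidates is arbitrary:

model = HiringModel()
dummy_batch = torch.rand(4, 8)   # 4 candidates x 8 normalized resume features
print(model(dummy_batch).shape)  # torch.Size([4, 2]) -- logits for the two screening classes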
Step 3: Adversarial Debiasing
class FairHiringModel(nn.Module):
"""Hiring model with adversarial debiasing.
The predictor learns to make accurate predictions while
the adversary tries to detect group membership from predictions.
"""
def __init__(self, input_dim: int = 8) -> None:
super().__init__()
self.predictor = nn.Sequential(
nn.Linear(input_dim, 32), nn.ReLU(),
nn.Linear(32, 16), nn.ReLU(),
nn.Linear(16, 2),
)
self.adversary = nn.Sequential(
nn.Linear(2, 16), nn.ReLU(),
nn.Linear(16, 2),
)
def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
pred_logits = self.predictor(x)
pred_probs = F.softmax(pred_logits, dim=1)
adv_logits = self.adversary(pred_probs)
return pred_logits, adv_logits
def train_fair_model(
model: FairHiringModel,
X_train: torch.Tensor,
y_train: torch.Tensor,
g_train: torch.Tensor,
num_epochs: int = 200,
lr: float = 1e-3,
adversary_weight: float = 1.0,
) -> None:
"""Train with adversarial debiasing.
The predictor maximizes task accuracy while minimizing
the adversary's ability to predict the protected attribute.
"""
pred_optimizer = torch.optim.Adam(model.predictor.parameters(), lr=lr)
adv_optimizer = torch.optim.Adam(model.adversary.parameters(), lr=lr)
for epoch in range(num_epochs):
model.train()
# Step 1: Train adversary
for _ in range(3):
adv_optimizer.zero_grad()
pred_logits, adv_logits = model(X_train)
adv_loss = F.cross_entropy(adv_logits, g_train)
adv_loss.backward()
adv_optimizer.step()
# Step 2: Train predictor (against adversary)
pred_optimizer.zero_grad()
pred_logits, adv_logits = model(X_train)
task_loss = F.cross_entropy(pred_logits, y_train)
adv_loss = F.cross_entropy(adv_logits, g_train)
        # Minimize task loss, maximize adversary loss (gradient-reversal effect; an explicit GRL sketch follows this function)
total_loss = task_loss - adversary_weight * adv_loss
total_loss.backward()
pred_optimizer.step()
if (epoch + 1) % 50 == 0:
with torch.no_grad():
acc = (pred_logits.argmax(1) == y_train).float().mean()
adv_acc = (adv_logits.argmax(1) == g_train).float().mean()
print(f" Epoch {epoch+1}: task_acc={acc:.4f}, adv_acc={adv_acc:.4f}")
Step 4: Fairness-Constrained Training
def train_fairness_constrained(
model: nn.Module,
X_train: torch.Tensor,
y_train: torch.Tensor,
g_train: torch.Tensor,
num_epochs: int = 200,
lr: float = 1e-3,
fairness_target: float = 0.05,
) -> None:
"""Train with adaptive fairness constraint on equal opportunity.
Uses Lagrangian relaxation: L = L_task + lambda * L_fairness.
Lambda is automatically adjusted based on constraint violation.
"""
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
lam = torch.tensor(0.0, requires_grad=False)
for epoch in range(num_epochs):
model.train()
optimizer.zero_grad()
logits = model(X_train)
task_loss = F.cross_entropy(logits, y_train)
# Compute TPR gap
preds = logits.argmax(dim=1)
g0_pos = (g_train == 0) & (y_train == 1)
g1_pos = (g_train == 1) & (y_train == 1)
tpr_0 = preds[g0_pos].float().mean() if g0_pos.sum() > 0 else torch.tensor(0.5)
tpr_1 = preds[g1_pos].float().mean() if g1_pos.sum() > 0 else torch.tensor(0.5)
tpr_gap = (tpr_0 - tpr_1).abs()
# Differentiable fairness penalty using probabilities
probs = F.softmax(logits, dim=1)[:, 1]
soft_tpr_0 = probs[g0_pos].mean() if g0_pos.sum() > 0 else torch.tensor(0.5)
soft_tpr_1 = probs[g1_pos].mean() if g1_pos.sum() > 0 else torch.tensor(0.5)
fairness_loss = (soft_tpr_0 - soft_tpr_1).pow(2)
total_loss = task_loss + lam * fairness_loss
total_loss.backward()
optimizer.step()
# Update lambda
with torch.no_grad():
if tpr_gap > fairness_target:
lam = (lam + 0.1).clamp(max=10.0)
else:
lam = (lam - 0.05).clamp(min=0.0)
if (epoch + 1) % 50 == 0:
acc = (preds == y_train).float().mean()
print(f" Epoch {epoch+1}: acc={acc:.4f}, tpr_gap={tpr_gap:.4f}, lambda={lam:.2f}")
Step 5: Model Card Generation
def generate_model_card(
model_name: str,
group_metrics: dict[str, dict[str, float]],
fairness_gaps: dict[str, float],
overall_accuracy: float,
) -> str:
"""Generate a Model Card documenting model performance and fairness.
Args:
model_name: Name of the model.
group_metrics: Per-group performance metrics.
fairness_gaps: Fairness gap measurements.
overall_accuracy: Overall model accuracy.
Returns:
Formatted Model Card string.
"""
card = f"""# Model Card: {model_name}
## Model Details
- **Task**: Resume screening for candidate ranking
- **Architecture**: Feed-forward neural network (8 -> 32 -> 16 -> 2)
- **Evaluation Data**: {sum(m['count'] for m in group_metrics.values())} held-out synthetic resume records
- **Framework**: PyTorch
## Intended Use
- **Primary Use**: Assist hiring managers in initial resume screening
- **Out-of-Scope**: Final hiring decisions (human review required)
- **Users**: HR professionals with hiring authority
## Performance Metrics
### Overall
- Accuracy: {overall_accuracy:.4f}
### Per-Group Performance
"""
for group_name, metrics in group_metrics.items():
card += f"\n**{group_name}** (n={metrics['count']}):\n"
for k, v in metrics.items():
if k != "count":
card += f"- {k}: {v:.4f}\n"
card += "\n## Fairness Assessment\n\n"
for metric, value in fairness_gaps.items():
status = "PASS (< 0.10)" if value < 0.10 else "NEEDS ATTENTION (>= 0.10)"
card += f"- **{metric}**: {value:.4f} [{status}]\n"
card += """
## Ethical Considerations
- The model does not use gender or ethnicity as input features.
- Historical bias in resume data (e.g., experience gaps) may still
influence predictions through correlated features.
- The model should be used as a screening aid, not a sole decision-maker.
- Regular audits should be conducted after deployment.
## Limitations
- Trained on synthetic data; real-world performance may differ.
- Does not account for non-resume factors (interview performance, references).
- Fairness is measured only on binary gender; intersectional analysis is needed.
## Recommendations
- Deploy with human-in-the-loop review for all decisions.
- Monitor fairness metrics monthly with production data.
- Retrain quarterly with updated data.
- Conduct intersectional fairness analysis before production deployment.
"""
return card
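Requirement 4 (accountability) asks that every recommendation be documented and auditable. Alongside the Model Card, a minimal sketch of an append-only decision log might look like the following; the log_decision helper, the JSON-lines file name, and the field names are illustrative assumptions rather than part of the pipeline above.

import json
from datetime import datetime, timezone

def log_decision(
    candidate_pseudonym: str,
    score: float,
    recommendation: str,
    model_version: str,
    log_path: str = "decision_log.jsonl",
) -> None:
    """Append one screening decision to a JSON-lines audit log."""
    entry = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "candidate": candidate_pseudonym,
        "score": round(score, 4),
        "recommendation": recommendation,
        "model_version": model_version,
    }
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")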
Step 6: Running the Full Pipeline
def compute_group_metrics(
preds: torch.Tensor, labels: torch.Tensor, group: torch.Tensor,
) -> tuple[dict, dict]:
"""Compute per-group metrics and fairness gaps."""
results = {}
for g in [0, 1]:
mask = group == g
p, l = preds[mask], labels[mask]
tp = ((p == 1) & (l == 1)).sum().float()
fp = ((p == 1) & (l == 0)).sum().float()
fn = ((p == 0) & (l == 1)).sum().float()
tn = ((p == 0) & (l == 0)).sum().float()
results[f"group_{g}"] = {
"count": mask.sum().item(),
"accuracy": ((tp + tn) / (tp + fp + fn + tn)).item(),
"tpr": (tp / (tp + fn + 1e-8)).item(),
"fpr": (fp / (fp + tn + 1e-8)).item(),
"positive_rate": p.float().mean().item(),
}
g0, g1 = results["group_0"], results["group_1"]
gaps = {
"demographic_parity_gap": abs(g0["positive_rate"] - g1["positive_rate"]),
"equal_opportunity_gap": abs(g0["tpr"] - g1["tpr"]),
"equalized_odds_gap": max(abs(g0["tpr"] - g1["tpr"]), abs(g0["fpr"] - g1["fpr"])),
}
return results, gaps
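A toy usage sketch of compute_group_metrics on hand-made tensors, to make the returned structure concrete:

toy_preds = torch.tensor([1, 0, 1, 1, 0, 1])
toy_labels = torch.tensor([1, 0, 0, 1, 1, 1])
toy_group = torch.tensor([0, 0, 0, 1, 1, 1])
toy_results, toy_gaps = compute_group_metrics(toy_preds, toy_labels, toy_group)
print(toy_gaps)  # keys: demographic_parity_gap, equal_opportunity_gap, equalized_odds_gap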
def run_responsible_ai_pipeline() -> None:
"""Execute the complete responsible AI pipeline."""
print("=" * 60)
print("Responsible AI Framework: Hiring Model")
print("=" * 60)
data = create_hiring_dataset(3000)
data_profile(data)
X, y, g = data["features"], data["labels"], data["gender"]
n = 2000
X_tr, y_tr, g_tr = X[:n], y[:n], g[:n]
X_te, y_te, g_te = X[n:], y[n:], g[n:]
# Baseline model
print("\n--- Baseline Model ---")
torch.manual_seed(42)
baseline = HiringModel()
opt = torch.optim.Adam(baseline.parameters(), lr=1e-3)
for _ in range(200):
opt.zero_grad()
F.cross_entropy(baseline(X_tr), y_tr).backward()
opt.step()
baseline.eval()
with torch.no_grad():
preds = baseline(X_te).argmax(1)
acc = (preds == y_te).float().mean().item()
gm, fg = compute_group_metrics(preds, y_te, g_te)
print(f"Accuracy: {acc:.4f}")
for k, v in fg.items():
print(f" {k}: {v:.4f}")
# Adversarial debiasing
print("\n--- Adversarial Debiasing ---")
torch.manual_seed(42)
fair_model = FairHiringModel()
train_fair_model(fair_model, X_tr, y_tr, g_tr, adversary_weight=2.0)
fair_model.eval()
with torch.no_grad():
pred_logits, _ = fair_model(X_te)
preds_fair = pred_logits.argmax(1)
acc_fair = (preds_fair == y_te).float().mean().item()
gm_fair, fg_fair = compute_group_metrics(preds_fair, y_te, g_te)
print(f"Accuracy: {acc_fair:.4f}")
for k, v in fg_fair.items():
print(f" {k}: {v:.4f}")
# Constrained training
print("\n--- Fairness-Constrained Training ---")
torch.manual_seed(42)
constrained = HiringModel()
train_fairness_constrained(constrained, X_tr, y_tr, g_tr)
constrained.eval()
with torch.no_grad():
preds_con = constrained(X_te).argmax(1)
acc_con = (preds_con == y_te).float().mean().item()
gm_con, fg_con = compute_group_metrics(preds_con, y_te, g_te)
print(f"Accuracy: {acc_con:.4f}")
for k, v in fg_con.items():
print(f" {k}: {v:.4f}")
# Generate Model Card for best model
print("\n--- Model Card ---")
card = generate_model_card(
"Fair Hiring Screener v1.0",
gm_fair, fg_fair, acc_fair,
)
print(card)
if __name__ == "__main__":
run_responsible_ai_pipeline()
Results
| Approach | Accuracy | Demographic Parity Gap | Equal Opportunity Gap | Equalized Odds Gap |
|---|---|---|---|---|
| Baseline | ~0.72 | ~0.12 | ~0.14 | ~0.15 |
| Adversarial | ~0.70 | ~0.06 | ~0.07 | ~0.08 |
| Constrained | ~0.71 | ~0.08 | ~0.05 | ~0.09 |
Lessons Learned
- Framework integration from the start: Fairness, privacy, and transparency should be designed into the system from the beginning, not bolted on at the end.
- Multiple mitigation strategies: No single debiasing method is best in all scenarios. Adversarial debiasing and constrained optimization offer different trade-offs.
- Model Cards are essential documentation: They provide accountability, transparency, and a record for regulatory compliance.
- Human oversight is non-negotiable: AI-assisted hiring decisions must always include human review, especially for high-risk applications under the EU AI Act.
- Continuous monitoring is required: Fairness metrics must be tracked in production because data distributions and societal patterns change over time; a minimal monitoring sketch follows this list.
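A minimal sketch of such a production check, reusing compute_group_metrics from Step 6; the 0.10 alert threshold mirrors the Model Card's fairness criterion, and the print statement stands in for a real alerting hook:

def monitor_fairness(
    preds: torch.Tensor,
    labels: torch.Tensor,
    group: torch.Tensor,
    threshold: float = 0.10,
) -> dict[str, float]:
    """Recompute fairness gaps on recent production data and flag violations."""
    # Outcome labels typically arrive with a delay; compute gaps on matured cohorts.
    _, gaps = compute_group_metrics(preds, labels, group)
    for metric, value in gaps.items():
        if value >= threshold:
            print(f"ALERT: {metric} = {value:.4f} exceeds threshold {threshold:.2f}")
    return gaps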