Case Study 2: Implementing a Responsible AI Framework

Context

A mid-sized technology company is building an AI-assisted resume screening tool. The tool will score candidates on their likelihood of performing well in the role, based on resume features. Given the sensitive nature of employment decisions and regulatory requirements (the EU AI Act classifies AI systems used in employment and worker management as high-risk), the company must implement a comprehensive responsible AI framework.

This case study demonstrates how to integrate fairness, transparency, privacy, and safety considerations into the entire ML pipeline.

Requirements

The responsible AI framework must address:

  1. Fairness: The model must not discriminate based on gender or ethnicity.
  2. Transparency: Hiring managers must receive explanations for each recommendation.
  3. Privacy: Candidate data must be protected.
  4. Accountability: All decisions must be documented and auditable.

Implementation

Step 1: Data Generation and Analysis

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

torch.manual_seed(42)
np.random.seed(42)


def create_hiring_dataset(
    num_samples: int = 3000,
) -> dict[str, torch.Tensor | list[str]]:
    """Create synthetic hiring dataset with potential bias.

    Features represent resume characteristics. Gender is a protected
    attribute that should NOT influence hiring decisions.
    """
    feature_names = [
        "years_experience", "education_level", "skill_match_score",
        "projects_completed", "certifications", "gpa", "leadership_score",
        "communication_score",
    ]

    gender = torch.bernoulli(torch.full((num_samples,), 0.4)).long()

    # Features: some correlated with gender due to historical patterns
    exp = torch.where(
        gender == 0,
        torch.normal(8, 3, (num_samples,)),
        torch.normal(6, 3, (num_samples,)),
    ).clamp(0, 30)

    edu = torch.where(
        gender == 0,
        torch.normal(3.5, 0.8, (num_samples,)),
        torch.normal(3.3, 0.9, (num_samples,)),
    ).clamp(1, 5)

    skills = torch.normal(0.6, 0.15, (num_samples,)).clamp(0, 1)
    projects = torch.poisson(torch.full((num_samples,), 4.0)).clamp(0, 20)
    certs = torch.poisson(torch.full((num_samples,), 2.0)).clamp(0, 10)
    gpa = torch.normal(3.3, 0.4, (num_samples,)).clamp(2.0, 4.0)
    leadership = torch.normal(0.5, 0.2, (num_samples,)).clamp(0, 1)
    communication = torch.normal(0.6, 0.2, (num_samples,)).clamp(0, 1)

    # Normalize features
    features = torch.stack([
        exp / 15, edu / 5, skills, projects / 10,
        certs / 5, gpa / 4, leadership, communication,
    ], dim=1)

    # True performance (gender-independent)
    performance = (
        0.3 * skills + 0.2 * (exp / 15) + 0.15 * (gpa / 4)
        + 0.15 * leadership + 0.1 * communication
        + 0.1 * (projects / 10)
        + 0.2 * torch.randn(num_samples)
    )
    label = (performance > performance.median()).long()

    return {
        "features": features,
        "labels": label,
        "gender": gender,
        "feature_names": feature_names,
    }


def data_profile(data: dict) -> None:
    """Print a data profile with group-level statistics."""
    X, y, g = data["features"], data["labels"], data["gender"]
    names = data["feature_names"]

    print("Dataset Profile")
    print("=" * 50)
    print(f"Total samples: {len(X)}")
    print(f"Group 0 (male): {(g == 0).sum().item()}")
    print(f"Group 1 (female): {(g == 1).sum().item()}")
    print(f"Positive rate Group 0: {y[g == 0].float().mean():.3f}")
    print(f"Positive rate Group 1: {y[g == 1].float().mean():.3f}")

    print(f"\n{'Feature':>25s} {'Group 0 Mean':>12s} {'Group 1 Mean':>12s} {'Gap':>8s}")
    print("-" * 60)
    for i, name in enumerate(names):
        m0 = X[g == 0, i].mean().item()
        m1 = X[g == 1, i].mean().item()
        gap = abs(m0 - m1)
        print(f"{name:>25s} {m0:12.4f} {m1:12.4f} {gap:8.4f}")

Step 2: Standard Model (Biased Baseline)

class HiringModel(nn.Module):
    """Standard hiring model without fairness constraints."""

    def __init__(self, input_dim: int = 8) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32), nn.ReLU(),
            nn.Linear(32, 16), nn.ReLU(),
            nn.Linear(16, 2),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

Step 3: Adversarial Debiasing

class FairHiringModel(nn.Module):
    """Hiring model with adversarial debiasing.

    The predictor learns to make accurate predictions while
    the adversary tries to detect group membership from predictions.
    """

    def __init__(self, input_dim: int = 8) -> None:
        super().__init__()
        self.predictor = nn.Sequential(
            nn.Linear(input_dim, 32), nn.ReLU(),
            nn.Linear(32, 16), nn.ReLU(),
            nn.Linear(16, 2),
        )
        self.adversary = nn.Sequential(
            nn.Linear(2, 16), nn.ReLU(),
            nn.Linear(16, 2),
        )

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        pred_logits = self.predictor(x)
        pred_probs = F.softmax(pred_logits, dim=1)
        adv_logits = self.adversary(pred_probs)
        return pred_logits, adv_logits


def train_fair_model(
    model: FairHiringModel,
    X_train: torch.Tensor,
    y_train: torch.Tensor,
    g_train: torch.Tensor,
    num_epochs: int = 200,
    lr: float = 1e-3,
    adversary_weight: float = 1.0,
) -> None:
    """Train with adversarial debiasing.

    The predictor maximizes task accuracy while minimizing
    the adversary's ability to predict the protected attribute.
    """
    pred_optimizer = torch.optim.Adam(model.predictor.parameters(), lr=lr)
    adv_optimizer = torch.optim.Adam(model.adversary.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()

        # Step 1: Train adversary on detached predictor outputs, so no
        # gradients are propagated into (or accumulated on) the predictor
        for _ in range(3):
            adv_optimizer.zero_grad()
            with torch.no_grad():
                pred_probs = F.softmax(model.predictor(X_train), dim=1)
            adv_logits = model.adversary(pred_probs)
            adv_loss = F.cross_entropy(adv_logits, g_train)
            adv_loss.backward()
            adv_optimizer.step()

        # Step 2: Train predictor (against adversary)
        pred_optimizer.zero_grad()
        pred_logits, adv_logits = model(X_train)
        task_loss = F.cross_entropy(pred_logits, y_train)
        adv_loss = F.cross_entropy(adv_logits, g_train)
        # Minimize task loss, maximize adversary loss (GRL effect)
        total_loss = task_loss - adversary_weight * adv_loss
        total_loss.backward()
        pred_optimizer.step()

        if (epoch + 1) % 50 == 0:
            with torch.no_grad():
                acc = (pred_logits.argmax(1) == y_train).float().mean()
                adv_acc = (adv_logits.argmax(1) == g_train).float().mean()
            print(f"  Epoch {epoch+1}: task_acc={acc:.4f}, adv_acc={adv_acc:.4f}")

Step 4: Fairness-Constrained Training

def train_fairness_constrained(
    model: nn.Module,
    X_train: torch.Tensor,
    y_train: torch.Tensor,
    g_train: torch.Tensor,
    num_epochs: int = 200,
    lr: float = 1e-3,
    fairness_target: float = 0.05,
) -> None:
    """Train with adaptive fairness constraint on equal opportunity.

    Uses Lagrangian relaxation: L = L_task + lambda * L_fairness.
    Lambda is automatically adjusted based on constraint violation.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    lam = torch.tensor(0.0, requires_grad=False)

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        logits = model(X_train)
        task_loss = F.cross_entropy(logits, y_train)

        # Compute TPR gap
        preds = logits.argmax(dim=1)
        g0_pos = (g_train == 0) & (y_train == 1)
        g1_pos = (g_train == 1) & (y_train == 1)

        tpr_0 = preds[g0_pos].float().mean() if g0_pos.sum() > 0 else torch.tensor(0.5)
        tpr_1 = preds[g1_pos].float().mean() if g1_pos.sum() > 0 else torch.tensor(0.5)
        tpr_gap = (tpr_0 - tpr_1).abs()

        # Differentiable fairness penalty using probabilities
        probs = F.softmax(logits, dim=1)[:, 1]
        soft_tpr_0 = probs[g0_pos].mean() if g0_pos.sum() > 0 else torch.tensor(0.5)
        soft_tpr_1 = probs[g1_pos].mean() if g1_pos.sum() > 0 else torch.tensor(0.5)
        fairness_loss = (soft_tpr_0 - soft_tpr_1).pow(2)

        total_loss = task_loss + lam * fairness_loss
        total_loss.backward()
        optimizer.step()

        # Update lambda
        with torch.no_grad():
            if tpr_gap > fairness_target:
                lam = (lam + 0.1).clamp(max=10.0)
            else:
                lam = (lam - 0.05).clamp(min=0.0)

        if (epoch + 1) % 50 == 0:
            acc = (preds == y_train).float().mean()
            print(f"  Epoch {epoch+1}: acc={acc:.4f}, tpr_gap={tpr_gap:.4f}, lambda={lam:.2f}")

Step 5: Model Card Generation

def generate_model_card(
    model_name: str,
    group_metrics: dict[str, dict[str, float]],
    fairness_gaps: dict[str, float],
    overall_accuracy: float,
) -> str:
    """Generate a Model Card documenting model performance and fairness.

    Args:
        model_name: Name of the model.
        group_metrics: Per-group performance metrics.
        fairness_gaps: Fairness gap measurements.
        overall_accuracy: Overall model accuracy.

    Returns:
        Formatted Model Card string.
    """
    card = f"""# Model Card: {model_name}

## Model Details
- **Task**: Resume screening for candidate ranking
- **Architecture**: Feed-forward neural network (8 -> 32 -> 16 -> 2)
- **Evaluation Data**: {sum(m['count'] for m in group_metrics.values())} held-out synthetic resume records
- **Framework**: PyTorch

## Intended Use
- **Primary Use**: Assist hiring managers in initial resume screening
- **Out-of-Scope**: Final hiring decisions (human review required)
- **Users**: HR professionals with hiring authority

## Performance Metrics

### Overall
- Accuracy: {overall_accuracy:.4f}

### Per-Group Performance
"""
    for group_name, metrics in group_metrics.items():
        card += f"\n**{group_name}** (n={metrics['count']}):\n"
        for k, v in metrics.items():
            if k != "count":
                card += f"- {k}: {v:.4f}\n"

    card += "\n## Fairness Assessment\n\n"
    for metric, value in fairness_gaps.items():
        status = "PASS (< 0.10)" if value < 0.10 else "NEEDS ATTENTION (>= 0.10)"
        card += f"- **{metric}**: {value:.4f} [{status}]\n"

    card += """
## Ethical Considerations
- The model does not use gender or ethnicity as input features.
- Historical bias in resume data (e.g., experience gaps) may still
  influence predictions through correlated features.
- The model should be used as a screening aid, not a sole decision-maker.
- Regular audits should be conducted after deployment.

## Limitations
- Trained on synthetic data; real-world performance may differ.
- Does not account for non-resume factors (interview performance, references).
- Fairness is measured only on binary gender; intersectional analysis is needed.

## Recommendations
- Deploy with human-in-the-loop review for all decisions.
- Monitor fairness metrics monthly with production data.
- Retrain quarterly with updated data.
- Conduct intersectional fairness analysis before production deployment.
"""
    return card

Step 6: Running the Full Pipeline

def compute_group_metrics(
    preds: torch.Tensor, labels: torch.Tensor, group: torch.Tensor,
) -> tuple[dict, dict]:
    """Compute per-group metrics and fairness gaps."""
    results = {}
    for g in [0, 1]:
        mask = group == g
        p, l = preds[mask], labels[mask]
        tp = ((p == 1) & (l == 1)).sum().float()
        fp = ((p == 1) & (l == 0)).sum().float()
        fn = ((p == 0) & (l == 1)).sum().float()
        tn = ((p == 0) & (l == 0)).sum().float()
        results[f"group_{g}"] = {
            "count": mask.sum().item(),
            "accuracy": ((tp + tn) / (tp + fp + fn + tn)).item(),
            "tpr": (tp / (tp + fn + 1e-8)).item(),
            "fpr": (fp / (fp + tn + 1e-8)).item(),
            "positive_rate": p.float().mean().item(),
        }

    g0, g1 = results["group_0"], results["group_1"]
    gaps = {
        "demographic_parity_gap": abs(g0["positive_rate"] - g1["positive_rate"]),
        "equal_opportunity_gap": abs(g0["tpr"] - g1["tpr"]),
        "equalized_odds_gap": max(abs(g0["tpr"] - g1["tpr"]), abs(g0["fpr"] - g1["fpr"])),
    }
    return results, gaps


def run_responsible_ai_pipeline() -> None:
    """Execute the complete responsible AI pipeline."""
    print("=" * 60)
    print("Responsible AI Framework: Hiring Model")
    print("=" * 60)

    data = create_hiring_dataset(3000)
    data_profile(data)

    X, y, g = data["features"], data["labels"], data["gender"]
    n = 2000
    X_tr, y_tr, g_tr = X[:n], y[:n], g[:n]
    X_te, y_te, g_te = X[n:], y[n:], g[n:]

    # Baseline model
    print("\n--- Baseline Model ---")
    torch.manual_seed(42)
    baseline = HiringModel()
    opt = torch.optim.Adam(baseline.parameters(), lr=1e-3)
    for _ in range(200):
        opt.zero_grad()
        F.cross_entropy(baseline(X_tr), y_tr).backward()
        opt.step()

    baseline.eval()
    with torch.no_grad():
        preds = baseline(X_te).argmax(1)
        acc = (preds == y_te).float().mean().item()
    gm, fg = compute_group_metrics(preds, y_te, g_te)
    print(f"Accuracy: {acc:.4f}")
    for k, v in fg.items():
        print(f"  {k}: {v:.4f}")

    # Adversarial debiasing
    print("\n--- Adversarial Debiasing ---")
    torch.manual_seed(42)
    fair_model = FairHiringModel()
    train_fair_model(fair_model, X_tr, y_tr, g_tr, adversary_weight=2.0)

    fair_model.eval()
    with torch.no_grad():
        pred_logits, _ = fair_model(X_te)
        preds_fair = pred_logits.argmax(1)
        acc_fair = (preds_fair == y_te).float().mean().item()
    gm_fair, fg_fair = compute_group_metrics(preds_fair, y_te, g_te)
    print(f"Accuracy: {acc_fair:.4f}")
    for k, v in fg_fair.items():
        print(f"  {k}: {v:.4f}")

    # Constrained training
    print("\n--- Fairness-Constrained Training ---")
    torch.manual_seed(42)
    constrained = HiringModel()
    train_fairness_constrained(constrained, X_tr, y_tr, g_tr)

    constrained.eval()
    with torch.no_grad():
        preds_con = constrained(X_te).argmax(1)
        acc_con = (preds_con == y_te).float().mean().item()
    gm_con, fg_con = compute_group_metrics(preds_con, y_te, g_te)
    print(f"Accuracy: {acc_con:.4f}")
    for k, v in fg_con.items():
        print(f"  {k}: {v:.4f}")

    # Generate Model Card for best model
    print("\n--- Model Card ---")
    card = generate_model_card(
        "Fair Hiring Screener v1.0",
        gm_fair, fg_fair, acc_fair,
    )
    print(card)


if __name__ == "__main__":
    run_responsible_ai_pipeline()

Results

| Approach    | Accuracy | DP Gap | EO Gap | EOdds Gap |
|-------------|----------|--------|--------|-----------|
| Baseline    | ~0.72    | ~0.12  | ~0.14  | ~0.15     |
| Adversarial | ~0.70    | ~0.06  | ~0.07  | ~0.08     |
| Constrained | ~0.71    | ~0.08  | ~0.05  | ~0.09     |
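
Here DP, EO, and EOdds abbreviate the gaps computed in compute_group_metrics. With \(\hat{Y}\) the model's prediction, \(Y\) the true label, and \(A \in \{0, 1\}\) the protected group:

$$\text{DP gap} = \big|P(\hat{Y}=1 \mid A=0) - P(\hat{Y}=1 \mid A=1)\big|$$

$$\text{EO gap} = \big|\text{TPR}_0 - \text{TPR}_1\big|, \qquad \text{TPR}_g = P(\hat{Y}=1 \mid Y=1, A=g)$$

$$\text{EOdds gap} = \max\big(|\text{TPR}_0 - \text{TPR}_1|,\; |\text{FPR}_0 - \text{FPR}_1|\big), \qquad \text{FPR}_g = P(\hat{Y}=1 \mid Y=0, A=g)$$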

Lessons Learned

  1. Framework integration from the start: Fairness, privacy, and transparency should be designed into the system from the beginning, not bolted on at the end.
  2. Multiple mitigation strategies: No single debiasing method is best in all scenarios. Adversarial debiasing and constrained optimization offer different trade-offs.
  3. Model Cards are essential documentation: They provide accountability, transparency, and a record for regulatory compliance.
  4. Human oversight is non-negotiable: AI-assisted hiring decisions must always include human review, especially for high-risk applications under the EU AI Act.
  5. Continuous monitoring is required: Fairness metrics must be tracked in production because data distributions and societal patterns change over time (a minimal monitoring sketch follows below).
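
A minimal sketch of such a monitoring check, reusing compute_group_metrics and the 0.10 threshold from the Model Card (fairness_alert and its defaults are illustrative, not part of the pipeline above):

def fairness_alert(
    preds: torch.Tensor,
    labels: torch.Tensor,
    group: torch.Tensor,
    threshold: float = 0.10,
) -> list[str]:
    """Return the names of fairness gaps that meet or exceed the alert threshold."""
    _, gaps = compute_group_metrics(preds, labels, group)
    breaches = [name for name, value in gaps.items() if value >= threshold]
    for name in breaches:
        print(f"ALERT: {name} = {gaps[name]:.4f} >= {threshold:.2f}")
    return breaches

Note that the TPR- and FPR-based gaps require outcome labels, which typically arrive with a delay in production; the demographic parity gap can be tracked from predictions alone.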