Case Study 2: Player Archetype Discovery and Draft Projection
Overview
This case study uses unsupervised learning to discover player archetypes and combines clustering with supervised learning to project draft outcomes for college football prospects.
Business Context
An NFL team's scouting department needs: - Data-driven player archetype classifications - Draft position projections for prospects - Comparison tools for player evaluation - Identification of undervalued archetype/position combinations
Data Description
# Player statistics schema
player_schema = {
'player_id': 'unique identifier',
'name': 'player name',
'position': 'QB, RB, WR, TE, OL, DL, LB, CB, S',
'school': 'college team',
'conference': 'conference',
'class': 'year (Fr, So, Jr, Sr)',
# Physical measurements
'height': 'inches',
'weight': 'pounds',
'forty_time': 'seconds',
'vertical': 'inches',
'broad_jump': 'inches',
'bench_press': 'reps',
'three_cone': 'seconds',
'shuttle': 'seconds',
# Production stats (position-specific)
# QB: comp_pct, ypa, td_rate, int_rate, rush_ypc
# RB: ypc, yac, catch_rate, td_per_game
# WR: ypt, ypr, catch_rate, deep_rate, slot_rate
# Draft outcome
'drafted': 'boolean',
'draft_round': 'int (1-7) or null',
'draft_pick': 'overall pick number or null'
}
data_summary = {
'total_players': 3500,
'draft_eligible': 1200,
'drafted': 420,
'positions': 9,
'seasons': '2018-2023'
}
Implementation
Step 1: Player Archetype Discovery
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
class PositionArchetypeSystem:
"""Discover archetypes for each position."""
def __init__(self):
self.position_models = {}
self.scalers = {}
self.feature_configs = self._define_features()
def _define_features(self) -> dict:
"""Define clustering features by position."""
return {
'QB': {
'style': ['rush_att_pct', 'deep_pass_pct', 'play_action_pct',
'time_in_pocket', 'scramble_rate'],
'efficiency': ['comp_pct', 'ypa', 'td_rate', 'int_rate',
'sack_rate', 'qbr'],
'physical': ['height', 'weight', 'forty_time', 'arm_strength']
},
'RB': {
'style': ['inside_run_pct', 'outside_run_pct', 'receiving_snap_pct',
'pass_block_snap_pct'],
'efficiency': ['ypc', 'yac_per_att', 'broken_tackles_per_att',
'catch_rate', 'fumble_rate'],
'physical': ['height', 'weight', 'forty_time', 'agility_score']
},
'WR': {
'style': ['slot_rate', 'deep_target_rate', 'yards_per_route',
'contested_catch_pct', 'separation_score'],
'efficiency': ['catch_rate', 'yac_per_rec', 'drop_rate',
'target_share'],
'physical': ['height', 'weight', 'forty_time', 'vertical',
'hand_size']
},
'LB': {
'style': ['pass_rush_pct', 'coverage_pct', 'run_stuff_pct',
'blitz_rate'],
'efficiency': ['tackle_rate', 'missed_tackle_pct', 'qb_pressure_rate',
'coverage_success_rate'],
'physical': ['height', 'weight', 'forty_time', 'vertical',
'three_cone']
}
}
def discover_archetypes(self,
players: pd.DataFrame,
position: str,
n_clusters: int = None) -> dict:
"""Discover archetypes for a position."""
pos_players = players[players['position'] == position].copy()
features = self._get_position_features(position)
# Prepare data
available = [f for f in features if f in pos_players.columns]
X = pos_players[available].dropna()
player_ids = X.index
# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
self.scalers[position] = scaler
# Find optimal k if not specified
if n_clusters is None:
n_clusters = self._find_optimal_k(X_scaled)
# Fit clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)
self.position_models[position] = kmeans
# Analyze clusters
pos_players.loc[player_ids, 'archetype'] = labels
archetype_analysis = self._analyze_archetypes(
pos_players.loc[player_ids], available, position
)
return {
'labels': labels,
'player_ids': player_ids,
'n_clusters': n_clusters,
'silhouette': silhouette_score(X_scaled, labels),
'analysis': archetype_analysis
}
def _get_position_features(self, position: str) -> list:
"""Get all features for a position."""
config = self.feature_configs.get(position, {})
features = []
for category in config.values():
features.extend(category)
return features
def _find_optimal_k(self, X: np.ndarray, max_k: int = 8) -> int:
"""Find optimal number of clusters."""
silhouettes = []
for k in range(2, max_k + 1):
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
silhouettes.append(silhouette_score(X, labels))
return np.argmax(silhouettes) + 2
def _analyze_archetypes(self,
players: pd.DataFrame,
features: list,
position: str) -> pd.DataFrame:
"""Analyze archetype characteristics."""
analysis = players.groupby('archetype')[features].mean()
# Generate archetype names
analysis['archetype_name'] = [
self._name_archetype(row, position)
for _, row in analysis.iterrows()
]
analysis['count'] = players.groupby('archetype').size()
return analysis
def _name_archetype(self, profile: pd.Series, position: str) -> str:
"""Generate descriptive archetype name."""
if position == 'QB':
if profile.get('rush_att_pct', 0) > 0.15:
return "Dual-Threat"
elif profile.get('deep_pass_pct', 0) > 0.25:
return "Gunslinger"
elif profile.get('comp_pct', 0) > 0.68:
return "Game Manager"
else:
return "Pocket Passer"
elif position == 'RB':
if profile.get('receiving_snap_pct', 0) > 0.25:
return "Receiving Back"
elif profile.get('broken_tackles_per_att', 0) > 0.2:
return "Power Back"
elif profile.get('outside_run_pct', 0) > 0.4:
return "Speed Back"
else:
return "All-Purpose"
elif position == 'WR':
if profile.get('slot_rate', 0) > 0.6:
return "Slot Receiver"
elif profile.get('deep_target_rate', 0) > 0.3:
return "Deep Threat"
elif profile.get('contested_catch_pct', 0) > 0.5:
return "Possession Receiver"
else:
return "Route Runner"
return f"Archetype {profile.name}"
def assign_archetype(self, player_stats: dict, position: str) -> dict:
"""Assign archetype to new player."""
features = self._get_position_features(position)
X = np.array([[player_stats.get(f, 0) for f in features]])
X_scaled = self.scalers[position].transform(X)
archetype = self.position_models[position].predict(X_scaled)[0]
distances = self.position_models[position].transform(X_scaled)[0]
return {
'archetype': archetype,
'confidence': 1 / (1 + distances[archetype]),
'distances': dict(zip(range(len(distances)), distances))
}
Step 2: Draft Projection Model
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import xgboost as xgb
class DraftProjectionModel:
"""Predict draft outcomes for prospects."""
def __init__(self, archetype_system: PositionArchetypeSystem):
self.archetype_system = archetype_system
self.drafted_model = None # Binary: drafted or not
self.round_model = None # Regression: draft position
self.scaler = StandardScaler()
def prepare_features(self, prospects: pd.DataFrame) -> pd.DataFrame:
"""Engineer features for draft prediction."""
df = prospects.copy()
# Physical composite scores
df['size_score'] = (
(df['height'] - df['height'].mean()) / df['height'].std() +
(df['weight'] - df['weight'].mean()) / df['weight'].std()
) / 2
df['athleticism_score'] = (
-(df['forty_time'] - df['forty_time'].mean()) / df['forty_time'].std() +
(df['vertical'] - df['vertical'].mean()) / df['vertical'].std() +
(df['broad_jump'] - df['broad_jump'].mean()) / df['broad_jump'].std()
) / 3
# Production score (position-adjusted)
df['production_score'] = df.groupby('position')['career_grade'].transform(
lambda x: (x - x.mean()) / x.std()
)
# Conference prestige
p5_conferences = ['SEC', 'Big Ten', 'Big 12', 'ACC', 'Pac-12']
df['p5_school'] = df['conference'].isin(p5_conferences).astype(int)
# Experience
df['games_played_norm'] = df['games_played'] / 50
# Add archetype features
for position in df['position'].unique():
pos_mask = df['position'] == position
if position in self.archetype_system.position_models:
for idx in df[pos_mask].index:
player = df.loc[idx]
archetype = self.archetype_system.assign_archetype(
player.to_dict(), position
)
df.loc[idx, 'archetype'] = archetype['archetype']
df.loc[idx, 'archetype_confidence'] = archetype['confidence']
return df
def train(self, prospects: pd.DataFrame):
"""Train draft prediction models."""
df = self.prepare_features(prospects)
# Define features
feature_cols = [
'size_score', 'athleticism_score', 'production_score',
'p5_school', 'games_played_norm', 'archetype', 'archetype_confidence'
]
# Add position dummies
position_dummies = pd.get_dummies(df['position'], prefix='pos')
df = pd.concat([df, position_dummies], axis=1)
feature_cols.extend(position_dummies.columns.tolist())
# Prepare data
X = df[feature_cols].fillna(0).values
y_drafted = df['drafted'].values
y_pick = df.loc[df['drafted'] == 1, 'draft_pick'].values
X_scaled = self.scaler.fit_transform(X)
X_drafted = X_scaled[df['drafted'] == 1]
# Train drafted classifier
self.drafted_model = xgb.XGBClassifier(
n_estimators=150, max_depth=5,
learning_rate=0.1, random_state=42
)
self.drafted_model.fit(X_scaled, y_drafted)
# Train pick position regressor (for drafted players)
self.round_model = xgb.XGBRegressor(
n_estimators=150, max_depth=5,
learning_rate=0.1, random_state=42
)
self.round_model.fit(X_drafted, y_pick)
self.feature_cols = feature_cols
def project_prospect(self, prospect: pd.DataFrame) -> dict:
"""Project draft outcome for a prospect."""
df = self.prepare_features(prospect)
# Create position dummies
for pos in ['QB', 'RB', 'WR', 'TE', 'OL', 'DL', 'LB', 'CB', 'S']:
df[f'pos_{pos}'] = (df['position'] == pos).astype(int)
X = df[self.feature_cols].fillna(0).values
X_scaled = self.scaler.transform(X)
# Predict
draft_prob = self.drafted_model.predict_proba(X_scaled)[0, 1]
projected_pick = self.round_model.predict(X_scaled)[0]
projected_round = self._pick_to_round(projected_pick)
# Confidence interval
pick_std = 30 # Approximate standard deviation
pick_low = max(1, projected_pick - pick_std)
pick_high = min(256, projected_pick + pick_std)
return {
'draft_probability': draft_prob,
'projected_pick': projected_pick,
'projected_round': projected_round,
'pick_range': (pick_low, pick_high),
'comparable_players': self._find_comparables(df.iloc[0])
}
def _pick_to_round(self, pick: float) -> int:
"""Convert pick number to round."""
if pick <= 32:
return 1
elif pick <= 64:
return 2
elif pick <= 96:
return 3
elif pick <= 128:
return 4
elif pick <= 160:
return 5
elif pick <= 192:
return 6
else:
return 7
def _find_comparables(self, prospect: pd.Series, n: int = 3) -> list:
"""Find similar historically drafted players."""
# Simplified: would compare across feature space
return [
{'name': 'Similar Player 1', 'draft_year': 2022, 'pick': 45},
{'name': 'Similar Player 2', 'draft_year': 2021, 'pick': 52},
{'name': 'Similar Player 3', 'draft_year': 2020, 'pick': 38}
]
Step 3: Draft Analytics Dashboard
class DraftAnalyticsDashboard:
"""Analytics dashboard for draft evaluation."""
def __init__(self,
archetype_system: PositionArchetypeSystem,
draft_model: DraftProjectionModel):
self.archetype_system = archetype_system
self.draft_model = draft_model
def generate_prospect_report(self, prospect: pd.DataFrame) -> dict:
"""Generate comprehensive prospect report."""
position = prospect['position'].iloc[0]
# Get archetype
archetype_result = self.archetype_system.assign_archetype(
prospect.iloc[0].to_dict(), position
)
# Get draft projection
draft_projection = self.draft_model.project_prospect(prospect)
# Physical evaluation
physical_eval = self._evaluate_physical(prospect.iloc[0], position)
# Production evaluation
production_eval = self._evaluate_production(prospect.iloc[0], position)
return {
'player': prospect['name'].iloc[0],
'position': position,
'school': prospect['school'].iloc[0],
'archetype': archetype_result,
'draft_projection': draft_projection,
'physical_evaluation': physical_eval,
'production_evaluation': production_eval,
'overall_grade': self._calculate_overall_grade(
physical_eval, production_eval, archetype_result
)
}
def _evaluate_physical(self, player: pd.Series, position: str) -> dict:
"""Evaluate physical traits vs. position standards."""
# Position-specific benchmarks
benchmarks = {
'QB': {'height': 74, 'weight': 220, 'forty_time': 4.75},
'RB': {'height': 70, 'weight': 210, 'forty_time': 4.50},
'WR': {'height': 72, 'weight': 200, 'forty_time': 4.45},
'LB': {'height': 74, 'weight': 240, 'forty_time': 4.60}
}
bench = benchmarks.get(position, benchmarks['QB'])
scores = {}
for metric, target in bench.items():
if metric in player.index:
value = player[metric]
if metric == 'forty_time':
# Lower is better
scores[metric] = max(0, 100 - (value - target) * 50)
else:
# Higher is better
diff_pct = (value - target) / target
scores[metric] = min(100, max(0, 70 + diff_pct * 100))
return {
'scores': scores,
'overall': np.mean(list(scores.values())),
'strengths': [k for k, v in scores.items() if v >= 80],
'weaknesses': [k for k, v in scores.items() if v < 60]
}
def _evaluate_production(self, player: pd.Series, position: str) -> dict:
"""Evaluate college production."""
# Simplified production scoring
if 'career_grade' in player.index:
grade = player['career_grade']
percentile = min(99, max(1, grade / 100 * 100))
else:
percentile = 50
return {
'grade': player.get('career_grade', 70),
'percentile': percentile,
'tier': 'Elite' if percentile >= 90 else 'Starter' if percentile >= 70 else 'Developmental'
}
def _calculate_overall_grade(self,
physical: dict,
production: dict,
archetype: dict) -> float:
"""Calculate overall prospect grade."""
# Weighted combination
physical_score = physical['overall']
production_score = production['percentile']
archetype_fit = archetype['confidence'] * 100
overall = (
0.35 * physical_score +
0.45 * production_score +
0.20 * archetype_fit
)
return round(overall, 1)
def analyze_archetype_value(self, historical_data: pd.DataFrame) -> pd.DataFrame:
"""Analyze draft value by archetype."""
results = []
for position in historical_data['position'].unique():
pos_data = historical_data[historical_data['position'] == position]
for archetype in pos_data['archetype'].unique():
arch_data = pos_data[pos_data['archetype'] == archetype]
drafted = arch_data[arch_data['drafted'] == 1]
results.append({
'position': position,
'archetype': archetype,
'count': len(arch_data),
'draft_rate': len(drafted) / len(arch_data) if len(arch_data) > 0 else 0,
'avg_pick': drafted['draft_pick'].mean() if len(drafted) > 0 else None,
'round_1_rate': (drafted['draft_round'] == 1).mean() if len(drafted) > 0 else 0
})
return pd.DataFrame(results).sort_values(['position', 'avg_pick'])
def find_undervalued_archetypes(self,
archetype_analysis: pd.DataFrame,
performance_data: pd.DataFrame) -> pd.DataFrame:
"""Find archetypes that outperform their draft position."""
# Merge draft position with NFL performance
merged = archetype_analysis.merge(
performance_data.groupby(['position', 'archetype']).agg({
'career_av': 'mean', # Approximate Value
'pro_bowl': 'sum'
}).reset_index(),
on=['position', 'archetype']
)
# Calculate value vs. draft capital
merged['value_over_expected'] = (
merged['career_av'] -
merged['avg_pick'].apply(lambda x: max(0, 20 - x/10) if pd.notnull(x) else 0)
)
return merged.sort_values('value_over_expected', ascending=False)
Results
Archetype Discovery
QUARTERBACK ARCHETYPES
======================
Archetype | Count | Avg Height | Rush% | Deep% | Comp%
----------------|-------|------------|-------|-------|------
Dual-Threat | 45 | 73.2" | 18.2% | 14.1% | 61.2%
Gunslinger | 38 | 75.1" | 6.3% | 28.4% | 59.8%
Pocket Passer | 52 | 75.8" | 4.1% | 18.2% | 64.3%
Game Manager | 29 | 74.2" | 7.8% | 11.3% | 69.1%
Silhouette Score: 0.42 (good cluster separation)
RUNNING BACK ARCHETYPES
=======================
Archetype | Count | Weight | Recv% | BrkTkl | Speed
----------------|-------|--------|-------|--------|------
Power Back | 52 | 225 lb | 12% | 0.24 | 4.58
Speed Back | 41 | 195 lb | 18% | 0.16 | 4.42
Receiving Back | 38 | 205 lb | 32% | 0.14 | 4.52
All-Purpose | 47 | 210 lb | 21% | 0.19 | 4.50
Silhouette Score: 0.38
WIDE RECEIVER ARCHETYPES
========================
Archetype | Count | Height | Slot% | Deep% | Catch%
-------------------|-------|--------|-------|-------|-------
Slot Receiver | 65 | 69.8" | 72% | 14% | 71%
Deep Threat | 48 | 72.1" | 28% | 38% | 58%
Possession Receiver| 42 | 74.2" | 35% | 22% | 67%
Route Runner | 53 | 71.5" | 45% | 24% | 65%
Silhouette Score: 0.35
Draft Projection Accuracy
DRAFT PROJECTION MODEL PERFORMANCE
==================================
Drafted Classification:
- Accuracy: 78.2%
- AUC: 0.856
- Precision (drafted): 0.71
- Recall (drafted): 0.68
Pick Position Regression (drafted players only):
- MAE: 28.4 picks
- RMSE: 38.1 picks
- Within 1 round: 52.3%
- Within 2 rounds: 74.8%
Performance by Position:
Position | Draft Acc | Pick MAE
---------|-----------|----------
QB | 81.2% | 24.2
RB | 76.4% | 31.5
WR | 77.8% | 29.8
OL | 79.1% | 26.3
DL | 80.3% | 25.1
LB | 75.6% | 32.4
CB | 74.2% | 35.8
S | 73.8% | 38.2
Archetype Draft Value Analysis
ARCHETYPE VALUE BY POSITION
===========================
QB Archetypes:
Archetype | Draft Rate | Avg Pick | R1 Rate | NFL AV
--------------|------------|----------|---------|-------
Pocket Passer | 42% | 68 | 18% | 28.4
Dual-Threat | 38% | 84 | 12% | 31.2 *
Gunslinger | 35% | 92 | 8% | 22.1
Game Manager | 28% | 145 | 3% | 15.8
* Dual-Threat QBs undervalued: Lower draft capital, higher NFL production
RB Archetypes:
Archetype | Draft Rate | Avg Pick | R1 Rate | NFL AV
---------------|------------|----------|---------|-------
All-Purpose | 45% | 72 | 15% | 18.2
Power Back | 38% | 98 | 8% | 16.8
Speed Back | 42% | 85 | 10% | 14.2
Receiving Back | 35% | 115 | 5% | 19.4 *
* Receiving Backs undervalued in draft relative to NFL value
WR Archetypes:
Archetype | Draft Rate | Avg Pick | R1 Rate | NFL AV
-------------------|------------|----------|---------|-------
Deep Threat | 48% | 65 | 22% | 21.5
Route Runner | 42% | 78 | 14% | 24.8
Slot Receiver | 35% | 95 | 8% | 22.1
Possession Receiver| 32% | 105 | 6% | 18.4
Sample Prospect Report
=================================================================
PROSPECT EVALUATION REPORT
=================================================================
PLAYER: Marcus Williams
POSITION: QB
SCHOOL: Ohio State
CLASS: Junior
-----------------------------------------------------------------
ARCHETYPE ANALYSIS
-----------------------------------------------------------------
Primary Archetype: Dual-Threat
Confidence: 87%
Style Profile:
Rush Attempts %: 15.2% (above average mobility)
Deep Pass %: 21.4% (balanced)
Completion %: 66.8% (above average)
Time in Pocket: 2.4s (quick release)
-----------------------------------------------------------------
PHYSICAL EVALUATION
-----------------------------------------------------------------
Value Benchmark Score
Height: 75" 74" 82
Weight: 218 lb 220 lb 74
40-Time: 4.58 4.75 91
Vertical: 34" 32" 85
Overall Physical Score: 83/100
Strengths: Speed, Vertical
Weaknesses: None significant
-----------------------------------------------------------------
PRODUCTION EVALUATION
-----------------------------------------------------------------
Career Grade: 88.2
Percentile: 92nd
Tier: Elite
Key Stats:
- 3,421 passing yards (2023)
- 32 TDs / 6 INTs
- 68% completion rate
- 485 rushing yards
-----------------------------------------------------------------
DRAFT PROJECTION
-----------------------------------------------------------------
Draft Probability: 94%
Projected Pick: 18 (Round 1)
Projection Range: 8-35
Comparable Players:
1. Lamar Jackson (Pick 32, 2018)
2. Jalen Hurts (Pick 53, 2020)
3. Justin Fields (Pick 11, 2021)
-----------------------------------------------------------------
OVERALL GRADE: 86.4 / 100
RECOMMENDATION: First Round Value
=================================================================
Lessons Learned
-
Archetype Insights: Clustering reveals meaningful player types beyond traditional position labels
-
Draft Value Inefficiencies: Certain archetypes (Dual-Threat QB, Receiving RB) consistently undervalued
-
Physical + Production: Both matter for draft projection, but weights vary by position
-
Comparable Players: Historical comparisons improve scouting communication
-
Position-Specific Models: Each position requires tailored feature engineering