Multi-League Projection Models

Beginner · 10 min read · 1 view · Nov 27, 2025

Python Code

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load cross-league data
players_df = pd.read_csv("data/cross_league_players.csv")

# League translation factors: per-league mean NHL scoring rate divided by the
# per-league mean scoring rate in the source league. pandas' mean skips NaN,
# so rows without an NHL rate do not contribute to the NHL average.
# NOTE(review): the league average still includes those rows, so the two means
# may be taken over different player cohorts - confirm this is intended.
league_factors = players_df.groupby("league").agg({
    "nhl_points_per_game": "mean",
    "league_points_per_game": "mean"
})

# Divide the unrounded means; rounding them first would bake rounding error
# into the ratio. Rounding is applied only to the printed table below.
league_factors["translation_factor"] = (
    league_factors["nhl_points_per_game"] / league_factors["league_points_per_game"]
)

print("League Translation Factors:")
print(league_factors.round(3))

# Prepare features for modeling
features = ["age", "league_points_per_game", "games_played", "league_quality_score"]

# Train and evaluate only on players with a known NHL scoring rate. Rows whose
# target is NaN are the prospects projected at the end of the script, and
# scikit-learn estimators raise a ValueError when y contains NaN.
labeled_df = players_df.dropna(subset=["nhl_points_per_game"])
X = labeled_df[features].fillna(0)  # missing feature values are imputed as 0
y = labeled_df["nhl_points_per_game"]

# Train-test split: 20% held out, fixed seed for a reproducible partition
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression model, scored on the held-out rows
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_r2 = r2_score(y_test, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))

print(f"\nLinear Regression - R²: {lr_r2:.3f}, RMSE: {lr_rmse:.3f}")

# Random Forest model (100 trees, fixed seed), scored on the same held-out rows
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print(f"Random Forest - R²: {rf_r2:.3f}, RMSE: {rf_rmse:.3f}")

# Visualization: a single figure holding four panels on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_leagues, ax_factors), (ax_models, ax_importance) = axes

# Top-left: source-league scoring against NHL scoring, one series per league
for league_name in players_df["league"].unique():
    in_league = players_df["league"] == league_name
    ax_leagues.scatter(
        players_df.loc[in_league, "league_points_per_game"],
        players_df.loc[in_league, "nhl_points_per_game"],
        label=league_name,
        alpha=0.6,
    )
ax_leagues.plot([0, 2], [0, 2], "r--", alpha=0.5)  # dashed y = x reference
ax_leagues.set(
    title="League vs NHL Production",
    xlabel="League Points Per Game",
    ylabel="NHL Points Per Game",
)
ax_leagues.legend()
ax_leagues.grid(True, alpha=0.3)

# Top-right: translation factor per league, with a dashed line at 1.0
factor_by_league = league_factors["translation_factor"]
factor_by_league.plot(kind="bar", ax=ax_factors, color="steelblue")
ax_factors.set(
    title="Translation Factors by League",
    xlabel="League",
    ylabel="Translation Factor",
)
ax_factors.tick_params(axis="x", rotation=45)
ax_factors.axhline(y=1.0, color="r", linestyle="--", alpha=0.5)

# Bottom-left: held-out actual values against each model's predictions
for model_label, model_pred in (("Linear Reg", lr_pred), ("Random Forest", rf_pred)):
    ax_models.scatter(y_test, model_pred, alpha=0.6, label=model_label)
actual_max = y_test.max()
ax_models.plot([0, actual_max], [0, actual_max], "r--", alpha=0.5)
ax_models.set(
    title="Actual vs Predicted NHL Production",
    xlabel="Actual NHL PPG",
    ylabel="Predicted NHL PPG",
)
ax_models.legend()
ax_models.grid(True, alpha=0.3)

# Bottom-right: random-forest feature importances, smallest at the bottom
feature_importance = pd.Series(rf_model.feature_importances_, index=features)
ranked_importance = feature_importance.sort_values(ascending=True)
ranked_importance.plot(kind="barh", ax=ax_importance, color="coral")
ax_importance.set(
    title="Feature Importance (Random Forest)",
    xlabel="Importance",
)

# Tighten the layout, write the figure to disk, then display it.
plt.tight_layout()
# savefig fails if the target directory is missing, so create it first
# (a no-op when it already exists).
os.makedirs("outputs", exist_ok=True)
plt.savefig("outputs/cross_league_projections.png", dpi=300, bbox_inches="tight")
plt.show()

# Project top prospects: rows that have no recorded NHL scoring rate
lacks_nhl_rate = players_df["nhl_points_per_game"].isna()
current_prospects = players_df.loc[lacks_nhl_rate].copy()
if current_prospects.shape[0] > 0:
    # Same feature columns and zero-imputation as used for the training data
    X_prospects = current_prospects[features].fillna(0)
    current_prospects["projected_nhl_ppg"] = rf_model.predict(X_prospects)

    # Keep the 15 highest projections and only the columns worth reporting
    report_columns = [
        "player_name",
        "league",
        "age",
        "league_points_per_game",
        "projected_nhl_ppg",
    ]
    top_projections = current_prospects.nlargest(15, "projected_nhl_ppg")[report_columns]
    print("\nTop Projected NHL Performers:")
    print(top_projections)

R Code

library(tidyverse)
library(ggplot2)
library(gridExtra)
library(randomForest)
library(caret)

# Load cross-league data
players_df <- read.csv("data/cross_league_players.csv")

# League translation factors: for each league, the mean NHL scoring rate
# divided by the mean scoring rate in that league. NA rates are ignored
# when averaging.
league_factors <- summarise(
  group_by(players_df, league),
  nhl_ppg_avg = mean(nhl_points_per_game, na.rm = TRUE),
  league_ppg_avg = mean(league_points_per_game, na.rm = TRUE),
  translation_factor = nhl_ppg_avg / league_ppg_avg
)

print("League Translation Factors:")
print(league_factors)

# Prepare modeling data: keep players with an observed NHL scoring rate,
# restrict to the predictors plus the target, and drop incomplete rows.
model_data <- players_df %>%
  filter(!is.na(nhl_points_per_game)) %>%
  select(
    age, league_points_per_game, games_played, league_quality_score,
    nhl_points_per_game
  ) %>%
  na.omit()

# Train-test split (80/20); the seed makes the partition reproducible
set.seed(42)
train_index <- createDataPartition(model_data$nhl_points_per_game, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

# Hold-out metrics.
# r_squared() is the coefficient of determination, 1 - SS_res / SS_tot.
# Unlike the squared correlation it penalises biased or mis-scaled
# predictions, and it is the same quantity the Python script reports through
# sklearn's r2_score, so the two outputs are comparable.
r_squared <- function(actual, predicted) {
  1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
}

# rmse() is the root mean squared error of the predictions.
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}

# Linear regression model, scored on the held-out rows
lr_model <- lm(nhl_points_per_game ~ age + league_points_per_game +
               games_played + league_quality_score, data = train_data)
lr_pred <- predict(lr_model, test_data)
lr_r2 <- r_squared(test_data$nhl_points_per_game, lr_pred)
lr_rmse <- rmse(test_data$nhl_points_per_game, lr_pred)

cat(sprintf("\nLinear Regression - R²: %.3f, RMSE: %.3f\n", lr_r2, lr_rmse))

# Random Forest model (100 trees), scored on the same held-out rows
rf_model <- randomForest(nhl_points_per_game ~ age + league_points_per_game +
                         games_played + league_quality_score,
                         data = train_data, ntree = 100)
rf_pred <- predict(rf_model, test_data)
rf_r2 <- r_squared(test_data$nhl_points_per_game, rf_pred)
rf_rmse <- rmse(test_data$nhl_points_per_game, rf_pred)

cat(sprintf("Random Forest - R²: %.3f, RMSE: %.3f\n", rf_r2, rf_rmse))

# Visualization
# p1: source-league scoring against NHL scoring, coloured by league, with a
# dashed y = x reference line.
p1 <- ggplot(
  players_df,
  aes(x = league_points_per_game, y = nhl_points_per_game, color = league)
) +
  geom_point(alpha = 0.6) +
  geom_abline(
    slope = 1, intercept = 0,
    linetype = "dashed", color = "red", alpha = 0.5
  ) +
  theme_minimal() +
  labs(
    title = "League vs NHL Production",
    x = "League Points Per Game",
    y = "NHL Points Per Game"
  )

# p2: translation factor per league, bars ordered by factor, with a dashed
# reference line at 1.0.
p2 <- ggplot(
  league_factors,
  aes(x = reorder(league, translation_factor), y = translation_factor)
) +
  geom_col(fill = "steelblue") +
  geom_hline(yintercept = 1.0, linetype = "dashed", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Translation Factors by League",
    x = "League",
    y = "Translation Factor"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Model predictions comparison: held-out actuals next to both models' output
predictions_df <- data.frame(
  actual = test_data$nhl_points_per_game,
  lr_pred = lr_pred,
  rf_pred = rf_pred
)

# p3: one point layer per model; the colour legend distinguishes the models
p3 <- ggplot(predictions_df, aes(x = actual)) +
  geom_point(aes(y = lr_pred, color = "Linear Reg"), alpha = 0.6) +
  geom_point(aes(y = rf_pred, color = "Random Forest"), alpha = 0.6) +
  geom_abline(
    slope = 1, intercept = 0,
    linetype = "dashed", color = "red", alpha = 0.5
  ) +
  theme_minimal() +
  labs(
    title = "Actual vs Predicted NHL Production",
    x = "Actual NHL PPG",
    y = "Predicted NHL PPG",
    color = "Model"
  )

# Feature importance: first column of the random-forest importance matrix,
# sorted ascending
rf_importance <- importance(rf_model)
importance_df <- arrange(
  data.frame(
    feature = rownames(rf_importance),
    importance = rf_importance[, 1]
  ),
  importance
)

# p4: horizontal bars, one per feature, ordered by importance
p4 <- ggplot(importance_df, aes(x = reorder(feature, importance), y = importance)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Feature Importance (Random Forest)",
    x = "Feature",
    y = "Importance"
  )

# Combine plots into a two-column grid. grid.arrange() draws the grid on the
# current graphics device and returns the assembled object, which is then
# handed to ggsave().
combined_plot <- grid.arrange(p1, p2, p3, p4, ncol = 2)

# ggsave() does not create a missing output directory by default, so make
# sure it exists first (silently does nothing when it is already there).
dir.create("outputs", showWarnings = FALSE, recursive = TRUE)
ggsave("outputs/cross_league_projections_r.png", combined_plot, width = 14, height = 10, dpi = 300)

# Project top prospects: players with no NHL scoring rate yet.
# Only the model's predictor columns are checked for missing values. A blanket
# na.omit() would also test nhl_points_per_game, which is NA for every row the
# filter keeps, and would therefore discard every prospect.
current_prospects <- players_df %>%
  filter(is.na(nhl_points_per_game)) %>%
  drop_na(age, league_points_per_game, games_played, league_quality_score)

if (nrow(current_prospects) > 0) {
  # Score each prospect with the fitted random forest
  current_prospects$projected_nhl_ppg <- predict(rf_model, current_prospects)

  # Keep the 15 highest projections and only the columns worth reporting
  top_projections <- current_prospects %>%
    arrange(desc(projected_nhl_ppg)) %>%
    select(player_name, league, age, league_points_per_game, projected_nhl_ppg) %>%
    head(15)

  print("Top Projected NHL Performers:")
  print(top_projections)
}

Discussion

Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.