Multi-League Projection Models
Beginner
10 min read
Nov 27, 2025
Python Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Load cross-league data
players_df = pd.read_csv("data/cross_league_players.csv")
# League translation factors
league_factors = players_df.groupby("league").agg({
    "nhl_points_per_game": "mean",
    "league_points_per_game": "mean"
}).round(3)
league_factors["translation_factor"] = (
    league_factors["nhl_points_per_game"] / league_factors["league_points_per_game"]
)
print("League Translation Factors:")
print(league_factors)
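# Quick baseline (separate from the regression models below): a prospect's league
# scoring rate times that league's translation factor gives a rough NHL estimate.
# The scoring rate here is an illustrative placeholder, not a value from the dataset.
example_league = players_df["league"].iloc[0]
example_league_ppg = 0.85
baseline_nhl_ppg = example_league_ppg * league_factors.loc[example_league, "translation_factor"]
print(f"Translation-factor baseline for {example_league}: {baseline_nhl_ppg:.2f} NHL PPG")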
# Prepare features for modeling (train only on players with observed NHL production;
# prospects without NHL games have NaN targets and are projected later)
features = ["age", "league_points_per_game", "games_played", "league_quality_score"]
nhl_players = players_df.dropna(subset=["nhl_points_per_game"])
X = nhl_players[features].fillna(0)
y = nhl_players["nhl_points_per_game"]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_r2 = r2_score(y_test, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
print(f"\nLinear Regression - R²: {lr_r2:.3f}, RMSE: {lr_rmse:.3f}")
# Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print(f"Random Forest - R²: {rf_r2:.3f}, RMSE: {rf_rmse:.3f}")
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# League translation scatter
for league in players_df["league"].unique():
    league_data = players_df[players_df["league"] == league]
    axes[0,0].scatter(league_data["league_points_per_game"],
                      league_data["nhl_points_per_game"],
                      label=league, alpha=0.6)
axes[0,0].plot([0, 2], [0, 2], "r--", alpha=0.5)
axes[0,0].set_title("League vs NHL Production")
axes[0,0].set_xlabel("League Points Per Game")
axes[0,0].set_ylabel("NHL Points Per Game")
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)
# Translation factors bar chart
league_factors["translation_factor"].plot(kind="bar", ax=axes[0,1], color="steelblue")
axes[0,1].set_title("Translation Factors by League")
axes[0,1].set_xlabel("League")
axes[0,1].set_ylabel("Translation Factor")
axes[0,1].tick_params(axis="x", rotation=45)
axes[0,1].axhline(y=1.0, color="r", linestyle="--", alpha=0.5)
# Model predictions comparison
axes[1,0].scatter(y_test, lr_pred, alpha=0.6, label="Linear Reg")
axes[1,0].scatter(y_test, rf_pred, alpha=0.6, label="Random Forest")
axes[1,0].plot([0, y_test.max()], [0, y_test.max()], "r--", alpha=0.5)
axes[1,0].set_title("Actual vs Predicted NHL Production")
axes[1,0].set_xlabel("Actual NHL PPG")
axes[1,0].set_ylabel("Predicted NHL PPG")
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)
# Feature importance
feature_importance = pd.Series(rf_model.feature_importances_, index=features)
feature_importance.sort_values(ascending=True).plot(kind="barh", ax=axes[1,1], color="coral")
axes[1,1].set_title("Feature Importance (Random Forest)")
axes[1,1].set_xlabel("Importance")
plt.tight_layout()
plt.savefig("outputs/cross_league_projections.png", dpi=300, bbox_inches="tight")
plt.show()
# Project top prospects
current_prospects = players_df[players_df["nhl_points_per_game"].isna()].copy()
if len(current_prospects) > 0:
    X_prospects = current_prospects[features].fillna(0)
    current_prospects["projected_nhl_ppg"] = rf_model.predict(X_prospects)
    top_projections = current_prospects.nlargest(15, "projected_nhl_ppg")[
        ["player_name", "league", "age", "league_points_per_game", "projected_nhl_ppg"]
    ]
    print("\nTop Projected NHL Performers:")
    print(top_projections)
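To score an individual prospect outside this batch workflow, the trained random forest can be applied to a one-row frame. A minimal sketch reusing rf_model and features from the listing above; the input values are illustrative placeholders, not real player data:
# Project a single hypothetical prospect with the trained Random Forest.
new_prospect = pd.DataFrame([{
    "age": 19,                         # placeholder values for illustration only
    "league_points_per_game": 1.10,
    "games_played": 45,
    "league_quality_score": 0.72,
}])
single_projection = rf_model.predict(new_prospect[features])[0]
print(f"Projected NHL PPG for the example prospect: {single_projection:.2f}")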
R Code
library(tidyverse)
library(ggplot2)
library(gridExtra)
library(randomForest)
library(caret)
# Load cross-league data
players_df <- read.csv("data/cross_league_players.csv")
# League translation factors
league_factors <- players_df %>%
  group_by(league) %>%
  summarise(
    nhl_ppg_avg = mean(nhl_points_per_game, na.rm = TRUE),
    league_ppg_avg = mean(league_points_per_game, na.rm = TRUE)
  ) %>%
  mutate(translation_factor = nhl_ppg_avg / league_ppg_avg)
print("League Translation Factors:")
print(league_factors)
# Prepare modeling data
model_data <- players_df %>%
  filter(!is.na(nhl_points_per_game)) %>%
  select(age, league_points_per_game, games_played, league_quality_score, nhl_points_per_game) %>%
  na.omit()
# Train-test split
set.seed(42)
train_index <- createDataPartition(model_data$nhl_points_per_game, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# Linear regression model
lr_model <- lm(nhl_points_per_game ~ age + league_points_per_game +
                 games_played + league_quality_score, data = train_data)
lr_pred <- predict(lr_model, test_data)
lr_r2 <- cor(test_data$nhl_points_per_game, lr_pred)^2
lr_rmse <- sqrt(mean((test_data$nhl_points_per_game - lr_pred)^2))
cat(sprintf("\nLinear Regression - R²: %.3f, RMSE: %.3f\n", lr_r2, lr_rmse))
# Random Forest model
rf_model <- randomForest(nhl_points_per_game ~ age + league_points_per_game +
                           games_played + league_quality_score,
                         data = train_data, ntree = 100)
rf_pred <- predict(rf_model, test_data)
rf_r2 <- cor(test_data$nhl_points_per_game, rf_pred)^2
rf_rmse <- sqrt(mean((test_data$nhl_points_per_game - rf_pred)^2))
cat(sprintf("Random Forest - R²: %.3f, RMSE: %.3f\n", rf_r2, rf_rmse))
# Visualization
p1 <- ggplot(players_df, aes(x = league_points_per_game, y = nhl_points_per_game, color = league)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(title = "League vs NHL Production",
       x = "League Points Per Game", y = "NHL Points Per Game")
p2 <- ggplot(league_factors, aes(x = reorder(league, translation_factor), y = translation_factor)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_hline(yintercept = 1.0, linetype = "dashed", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(title = "Translation Factors by League", x = "League", y = "Translation Factor") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Model predictions comparison
predictions_df <- data.frame(
  actual = test_data$nhl_points_per_game,
  lr_pred = lr_pred,
  rf_pred = rf_pred
)
p3 <- ggplot(predictions_df) +
  geom_point(aes(x = actual, y = lr_pred, color = "Linear Reg"), alpha = 0.6) +
  geom_point(aes(x = actual, y = rf_pred, color = "Random Forest"), alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(title = "Actual vs Predicted NHL Production",
       x = "Actual NHL PPG", y = "Predicted NHL PPG", color = "Model")
# Feature importance
importance_df <- data.frame(
  feature = rownames(importance(rf_model)),
  importance = importance(rf_model)[, 1]
) %>%
  arrange(importance)
p4 <- ggplot(importance_df, aes(x = reorder(feature, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Feature Importance (Random Forest)", x = "Feature", y = "Importance")
# Combine plots
combined_plot <- grid.arrange(p1, p2, p3, p4, ncol = 2)
ggsave("outputs/cross_league_projections_r.png", combined_plot, width = 14, height = 10, dpi = 300)
# Project top prospects
# Keep prospects with complete feature data; na.omit() on the full frame would
# drop every row here, since nhl_points_per_game is NA for prospects by construction.
feature_cols <- c("age", "league_points_per_game", "games_played", "league_quality_score")
current_prospects <- players_df %>%
  filter(is.na(nhl_points_per_game)) %>%
  drop_na(all_of(feature_cols))
if (nrow(current_prospects) > 0) {
  current_prospects$projected_nhl_ppg <- predict(rf_model, current_prospects)
  top_projections <- current_prospects %>%
    arrange(desc(projected_nhl_ppg)) %>%
    select(player_name, league, age, league_points_per_game, projected_nhl_ppg) %>%
    head(15)
  print("Top Projected NHL Performers:")
  print(top_projections)
}
Discussion
Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.