Multi-League Projection Models
Beginner
10 min read
Nov 27, 2025
Python Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Load cross-league data
players_df = pd.read_csv("data/cross_league_players.csv")
# League translation factors
league_factors = players_df.groupby("league").agg({
    "nhl_points_per_game": "mean",
    "league_points_per_game": "mean"
}).round(3)
league_factors["translation_factor"] = (
    league_factors["nhl_points_per_game"] / league_factors["league_points_per_game"]
)
print("League Translation Factors:")
print(league_factors)
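# Quick baseline (separate from the regression models below): a prospect's league
# scoring rate times that league's translation factor gives a rough NHL estimate.
# The scoring rate here is an illustrative placeholder, not a value from the dataset.
example_league = players_df["league"].iloc[0]
example_league_ppg = 0.85
baseline_nhl_ppg = example_league_ppg * league_factors.loc[example_league, "translation_factor"]
print(f"Translation-factor baseline for {example_league}: {baseline_nhl_ppg:.2f} NHL PPG")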
# Prepare features for modeling (train only on players with observed NHL production;
# prospects without NHL games have NaN targets and are projected later)
features = ["age", "league_points_per_game", "games_played", "league_quality_score"]
nhl_players = players_df.dropna(subset=["nhl_points_per_game"])
X = nhl_players[features].fillna(0)
y = nhl_players["nhl_points_per_game"]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_r2 = r2_score(y_test, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
print(f"\nLinear Regression - R²: {lr_r2:.3f}, RMSE: {lr_rmse:.3f}")
# Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print(f"Random Forest - R²: {rf_r2:.3f}, RMSE: {rf_rmse:.3f}")
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# League translation scatter
for league in players_df["league"].unique():
    league_data = players_df[players_df["league"] == league]
    axes[0,0].scatter(league_data["league_points_per_game"],
                      league_data["nhl_points_per_game"],
                      label=league, alpha=0.6)
axes[0,0].plot([0, 2], [0, 2], "r--", alpha=0.5)
axes[0,0].set_title("League vs NHL Production")
axes[0,0].set_xlabel("League Points Per Game")
axes[0,0].set_ylabel("NHL Points Per Game")
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)
# Translation factors bar chart
league_factors["translation_factor"].plot(kind="bar", ax=axes[0,1], color="steelblue")
axes[0,1].set_title("Translation Factors by League")
axes[0,1].set_xlabel("League")
axes[0,1].set_ylabel("Translation Factor")
axes[0,1].tick_params(axis="x", rotation=45)
axes[0,1].axhline(y=1.0, color="r", linestyle="--", alpha=0.5)
# Model predictions comparison
axes[1,0].scatter(y_test, lr_pred, alpha=0.6, label="Linear Reg")
axes[1,0].scatter(y_test, rf_pred, alpha=0.6, label="Random Forest")
axes[1,0].plot([0, y_test.max()], [0, y_test.max()], "r--", alpha=0.5)
axes[1,0].set_title("Actual vs Predicted NHL Production")
axes[1,0].set_xlabel("Actual NHL PPG")
axes[1,0].set_ylabel("Predicted NHL PPG")
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)
# Feature importance
feature_importance = pd.Series(rf_model.feature_importances_, index=features)
feature_importance.sort_values(ascending=True).plot(kind="barh", ax=axes[1,1], color="coral")
axes[1,1].set_title("Feature Importance (Random Forest)")
axes[1,1].set_xlabel("Importance")
plt.tight_layout()
plt.savefig("outputs/cross_league_projections.png", dpi=300, bbox_inches="tight")
plt.show()
# Project top prospects
current_prospects = players_df[players_df["nhl_points_per_game"].isna()].copy()
if len(current_prospects) > 0:
    X_prospects = current_prospects[features].fillna(0)
    current_prospects["projected_nhl_ppg"] = rf_model.predict(X_prospects)
    top_projections = current_prospects.nlargest(15, "projected_nhl_ppg")[
        ["player_name", "league", "age", "league_points_per_game", "projected_nhl_ppg"]
    ]
    print("\nTop Projected NHL Performers:")
    print(top_projections)
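To score an individual prospect outside this batch workflow, the trained random forest can be applied to a one-row frame. A minimal sketch reusing rf_model and features from the listing above; the input values are illustrative placeholders, not real player data:
# Project a single hypothetical prospect with the trained Random Forest.
new_prospect = pd.DataFrame([{
    "age": 19,                         # placeholder values for illustration only
    "league_points_per_game": 1.10,
    "games_played": 45,
    "league_quality_score": 0.72,
}])
single_projection = rf_model.predict(new_prospect[features])[0]
print(f"Projected NHL PPG for the example prospect: {single_projection:.2f}")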
R Code
library(tidyverse)
library(ggplot2)
library(gridExtra)
library(randomForest)
library(caret)
# Load cross-league data
players_df <- read.csv("data/cross_league_players.csv")
# League translation factors
league_factors <- players_df %>%
  group_by(league) %>%
  summarise(
    nhl_ppg_avg = mean(nhl_points_per_game, na.rm = TRUE),
    league_ppg_avg = mean(league_points_per_game, na.rm = TRUE)
  ) %>%
  mutate(translation_factor = nhl_ppg_avg / league_ppg_avg)
print("League Translation Factors:")
print(league_factors)
# Prepare modeling data
model_data <- players_df %>%
  filter(!is.na(nhl_points_per_game)) %>%
  select(age, league_points_per_game, games_played, league_quality_score, nhl_points_per_game) %>%
  na.omit()
# Train-test split
set.seed(42)
train_index <- createDataPartition(model_data$nhl_points_per_game, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# Linear regression model
lr_model <- lm(nhl_points_per_game ~ age + league_points_per_game +
                 games_played + league_quality_score, data = train_data)
lr_pred <- predict(lr_model, test_data)
lr_r2 <- cor(test_data$nhl_points_per_game, lr_pred)^2
lr_rmse <- sqrt(mean((test_data$nhl_points_per_game - lr_pred)^2))
cat(sprintf("\nLinear Regression - R²: %.3f, RMSE: %.3f\n", lr_r2, lr_rmse))
# Random Forest model
rf_model <- randomForest(nhl_points_per_game ~ age + league_points_per_game +
                           games_played + league_quality_score,
                         data = train_data, ntree = 100)
rf_pred <- predict(rf_model, test_data)
rf_r2 <- cor(test_data$nhl_points_per_game, rf_pred)^2
rf_rmse <- sqrt(mean((test_data$nhl_points_per_game - rf_pred)^2))
cat(sprintf("Random Forest - R²: %.3f, RMSE: %.3f\n", rf_r2, rf_rmse))
# Visualization
p1 <- ggplot(players_df, aes(x = league_points_per_game, y = nhl_points_per_game, color = league)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(title = "League vs NHL Production",
       x = "League Points Per Game", y = "NHL Points Per Game")
p2 <- ggplot(league_factors, aes(x = reorder(league, translation_factor), y = translation_factor)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_hline(yintercept = 1.0, linetype = "dashed", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(title = "Translation Factors by League", x = "League", y = "Translation Factor") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Model predictions comparison
predictions_df <- data.frame(
  actual = test_data$nhl_points_per_game,
  lr_pred = lr_pred,
  rf_pred = rf_pred
)
p3 <- ggplot(predictions_df) +
  geom_point(aes(x = actual, y = lr_pred, color = "Linear Reg"), alpha = 0.6) +
  geom_point(aes(x = actual, y = rf_pred, color = "Random Forest"), alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(title = "Actual vs Predicted NHL Production",
       x = "Actual NHL PPG", y = "Predicted NHL PPG", color = "Model")
# Feature importance
importance_df <- data.frame(
  feature = rownames(importance(rf_model)),
  importance = importance(rf_model)[, 1]
) %>%
  arrange(importance)
p4 <- ggplot(importance_df, aes(x = reorder(feature, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Feature Importance (Random Forest)", x = "Feature", y = "Importance")
# Combine plots
combined_plot <- grid.arrange(p1, p2, p3, p4, ncol = 2)
ggsave("outputs/cross_league_projections_r.png", combined_plot, width = 14, height = 10, dpi = 300)
# Project top prospects
# Keep prospects with complete feature data; na.omit() on the full frame would
# drop every row here, since nhl_points_per_game is NA for prospects by construction.
feature_cols <- c("age", "league_points_per_game", "games_played", "league_quality_score")
current_prospects <- players_df %>%
  filter(is.na(nhl_points_per_game)) %>%
  drop_na(all_of(feature_cols))
if (nrow(current_prospects) > 0) {
  current_prospects$projected_nhl_ppg <- predict(rf_model, current_prospects)
  top_projections <- current_prospects %>%
    arrange(desc(projected_nhl_ppg)) %>%
    select(player_name, league, age, league_points_per_game, projected_nhl_ppg) %>%
    head(15)
  print("Top Projected NHL Performers:")
  print(top_projections)
}
Discussion
Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.