Market Value Modeling

Beginner · 10 min read · 1 view · Nov 27, 2025
# Market Value Modeling

## Overview

Market value modeling uses statistical and machine learning techniques to predict player transfer values. These models help clubs make informed decisions about transfers, contracts, and squad valuation. Accurate valuation requires combining performance metrics, age, contract status, and market dynamics.

## Key Value Drivers

### Performance Metrics

- Goals and assists
- Expected goals and assists (xG, xA)
- Progressive actions
- Defensive contributions

### Age and Development

- Peak age (26-28 for most positions)
- Potential for growth
- Injury history
- Career trajectory

### Market Factors

- Contract length remaining
- Release clauses
- Selling club financial situation
- Buying competition

### Contextual Factors

- League quality
- Team success
- International experience
- Media profile and marketability

## Python Implementation

```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Sample player market data
market_data = pd.DataFrame({
    'Player': ['Player A', 'Player B', 'Player C', 'Player D', 'Player E',
               'Player F', 'Player G', 'Player H', 'Player I', 'Player J',
               'Player K', 'Player L', 'Player M', 'Player N', 'Player O'],
    'Age': [24, 28, 22, 26, 30, 25, 27, 23, 29, 26, 24, 28, 25, 27, 23],
    'Goals_p90': [0.65, 0.42, 0.85, 0.55, 0.38, 0.72, 0.48, 0.91, 0.35, 0.68,
                  0.55, 0.45, 0.78, 0.52, 0.88],
    'Assists_p90': [0.35, 0.52, 0.25, 0.45, 0.55, 0.38, 0.48, 0.22, 0.62, 0.42,
                    0.38, 0.58, 0.32, 0.48, 0.28],
    'xG_p90': [0.58, 0.38, 0.78, 0.52, 0.35, 0.68, 0.45, 0.85, 0.32, 0.62,
               0.52, 0.42, 0.72, 0.48, 0.82],
    'Minutes_Played': [2800, 2400, 2900, 2600, 2100, 2700, 2500, 2850,
                       2200, 2650, 2750, 2450, 2800, 2550, 2900],
    'League_Quality': [8, 7, 9, 8, 7, 9, 8, 9, 7, 8, 8, 7, 9, 8, 9],
    'Contract_Years': [3, 2, 4, 3, 1, 4, 2, 5, 1, 3, 3, 2, 4, 2, 5],
    'International_Caps': [15, 45, 8, 25, 62, 18, 38, 5, 58, 28, 12, 42, 22, 35, 10],
    'Market_Value_M': [45, 35, 75, 52, 28, 68, 42, 85, 25, 58, 48, 38, 65, 45, 82]
})

print("Market Data Sample:")
print(market_data.head())


# Feature engineering
def engineer_features(df):
    """
    Create additional features for market value prediction.

    Parameters
    ----------
    df : pd.DataFrame
        Player data containing at least Age, Goals_p90, Assists_p90, xG_p90,
        Minutes_Played, League_Quality, Contract_Years, International_Caps.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the engineered feature columns added.
    """
    df = df.copy()

    # Age-related features
    df['Age_Peak'] = np.abs(df['Age'] - 27)  # Distance from peak age
    df['Age_Squared'] = df['Age'] ** 2

    # Performance composite
    df['Goal_Contribution'] = df['Goals_p90'] + df['Assists_p90']
    df['Performance_Score'] = (df['xG_p90'] * 0.6 + df['Assists_p90'] * 0.4)

    # Experience metrics.
    # Career years = Age - 18 (assumes a professional debut at 18); the original
    # expression 2024 - (2024 - Age + 18) simplifies to exactly Age - 18, which
    # divides by zero for an 18-year-old. Clamp the denominator to at least 1.
    # NOTE: Minutes_per_Year is computed but not included in feature_columns below.
    df['Minutes_per_Year'] = df['Minutes_Played'] / np.maximum(df['Age'] - 18, 1)
    df['Experience_Value'] = df['International_Caps'] * df['League_Quality']

    # Contract value
    df['Contract_Value'] = df['Contract_Years'] * 5  # More years = higher value

    return df


market_data_featured = engineer_features(market_data)

# Prepare features and target
feature_columns = ['Age', 'Age_Peak', 'Age_Squared', 'Goals_p90', 'Assists_p90',
                   'xG_p90', 'Minutes_Played', 'League_Quality', 'Contract_Years',
                   'International_Caps', 'Goal_Contribution', 'Performance_Score',
                   'Experience_Value', 'Contract_Value']

X = market_data_featured[feature_columns]
y = market_data_featured['Market_Value_M']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model 1: Random Forest
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=8,
    min_samples_split=3,
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)

# Model 2: Gradient Boosting
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)
gb_model.fit(X_train_scaled, y_train)
gb_pred = gb_model.predict(X_test_scaled)

# Model 3: Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)
ridge_pred = ridge_model.predict(X_test_scaled)

# Ensemble prediction (average of all models)
ensemble_pred = (rf_pred + gb_pred + ridge_pred) / 3


# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """
    Calculate, print, and return model evaluation metrics.

    Returns a dict with keys 'MAE', 'RMSE', 'R2', and 'MAPE'.
    MAPE assumes y_true contains no zeros (market values are positive).
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print(f"\n{model_name} Performance:")
    print(f"  MAE: €{mae:.2f}M")
    print(f"  RMSE: €{rmse:.2f}M")
    print(f"  R²: {r2:.3f}")
    print(f"  MAPE: {mape:.2f}%")

    return {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'MAPE': mape}


rf_metrics = evaluate_model(y_test, rf_pred, "Random Forest")
gb_metrics = evaluate_model(y_test, gb_pred, "Gradient Boosting")
ridge_metrics = evaluate_model(y_test, ridge_pred, "Ridge Regression")
ensemble_metrics = evaluate_model(y_test, ensemble_pred, "Ensemble Model")

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# Visualize predictions vs actual
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

models = [
    ('Random Forest', rf_pred, axes[0, 0]),
    ('Gradient Boosting', gb_pred, axes[0, 1]),
    ('Ridge Regression', ridge_pred, axes[1, 0]),
    ('Ensemble', ensemble_pred, axes[1, 1])
]

for model_name, predictions, ax in models:
    ax.scatter(y_test, predictions, alpha=0.6, s=100)

    # Perfect prediction line
    min_val = min(y_test.min(), predictions.min())
    max_val = max(y_test.max(), predictions.max())
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, alpha=0.5)

    ax.set_xlabel('Actual Market Value (€M)', fontsize=11)
    ax.set_ylabel('Predicted Market Value (€M)', fontsize=11)
    ax.set_title(f'{model_name}', fontsize=12, fontweight='bold')
    ax.grid(alpha=0.3)

    # Add R² score
    if model_name == 'Random Forest':
        r2 = rf_metrics['R2']
    elif model_name == 'Gradient Boosting':
        r2 = gb_metrics['R2']
    elif model_name == 'Ridge Regression':
        r2 = ridge_metrics['R2']
    else:
        r2 = ensemble_metrics['R2']
    ax.text(0.05, 0.95, f'R² = {r2:.3f}', transform=ax.transAxes,
            fontsize=10, verticalalignment='top')

plt.tight_layout()
plt.savefig('market_value_predictions.png', dpi=300, bbox_inches='tight')
plt.show()


# Predict market value for new player
def predict_player_value(player_data, models, scaler, feature_cols):
    """
    Predict market value for a new player.

    Parameters
    ----------
    player_data : pd.DataFrame
        One or more rows with the same raw columns as the training data.
    models : dict
        Fitted models under the keys 'rf', 'gb', and 'ridge'.
    scaler : StandardScaler
        The scaler fitted on the training features.
    feature_cols : list[str]
        Feature columns, in training order.

    Returns
    -------
    dict
        Per-model predictions (first row only) plus their 'Ensemble' mean.
    """
    # Engineer features
    player_featured = engineer_features(player_data)
    player_features = player_featured[feature_cols]

    # Scale features
    player_scaled = scaler.transform(player_features)

    # Predictions from all models
    predictions = {
        'Random Forest': models['rf'].predict(player_scaled)[0],
        'Gradient Boosting': models['gb'].predict(player_scaled)[0],
        'Ridge': models['ridge'].predict(player_scaled)[0]
    }
    predictions['Ensemble'] = np.mean(list(predictions.values()))

    return predictions


# Example: Predict value for a new player
new_player = pd.DataFrame({
    'Player': ['New Signing'],
    'Age': [25],
    'Goals_p90': [0.75],
    'Assists_p90': [0.42],
    'xG_p90': [0.68],
    'Minutes_Played': [2750],
    'League_Quality': [8],
    'Contract_Years': [4],
    'International_Caps': [20]
})

models_dict = {
    'rf': rf_model,
    'gb': gb_model,
    'ridge': ridge_model
}

predicted_values = predict_player_value(new_player, models_dict, scaler, feature_columns)

print("\n" + "="*50)
print("PREDICTED MARKET VALUE FOR NEW PLAYER")
print("="*50)
for model_name, value in predicted_values.items():
    print(f"{model_name}: €{value:.2f}M")


# Market value by age curve
def plot_age_value_curve(model, scaler, feature_cols):
    """
    Plot how predicted market value changes with age for a synthetic player
    held at average performance, and save the figure to disk.
    """
    ages = np.arange(18, 36, 1)

    # Create synthetic player at average performance
    synthetic_players = []
    for age in ages:
        player = {
            'Age': age,
            'Goals_p90': 0.60,
            'Assists_p90': 0.40,
            'xG_p90': 0.55,
            'Minutes_Played': 2700,
            'League_Quality': 8,
            'Contract_Years': 3,
            'International_Caps': 25
        }
        synthetic_players.append(player)

    synthetic_df = pd.DataFrame(synthetic_players)
    synthetic_featured = engineer_features(synthetic_df)
    synthetic_features = synthetic_featured[feature_cols]
    synthetic_scaled = scaler.transform(synthetic_features)

    predicted_values = model.predict(synthetic_scaled)

    plt.figure(figsize=(10, 6))
    plt.plot(ages, predicted_values, linewidth=3, color='#2E86AB')
    plt.fill_between(ages, predicted_values, alpha=0.3, color='#2E86AB')
    plt.xlabel('Age', fontsize=12)
    plt.ylabel('Predicted Market Value (€M)', fontsize=12)
    plt.title('Market Value by Age (Average Performance Player)',
              fontsize=14, fontweight='bold')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig('age_value_curve.png', dpi=300, bbox_inches='tight')
    plt.show()


plot_age_value_curve(gb_model, scaler, feature_columns)


# Value over/under analysis
def analyze_value_discrepancies(data, model, scaler, feature_cols):
    """
    Find players who are over/undervalued by the market.

    Compares each player's listed Market_Value_M against the model's
    prediction; differences beyond ±€10M are flagged as Overvalued /
    Undervalued. Returns a DataFrame sorted by Value_Difference.
    """
    # Prepare and predict
    data_featured = engineer_features(data)
    X_all = data_featured[feature_cols]
    X_scaled = scaler.transform(X_all)
    predictions = model.predict(X_scaled)

    # Calculate discrepancy
    results = data.copy()
    results['Predicted_Value'] = predictions
    results['Value_Difference'] = results['Market_Value_M'] - results['Predicted_Value']
    results['Value_Ratio'] = results['Market_Value_M'] / results['Predicted_Value']

    # Identify over/undervalued
    results['Status'] = results['Value_Difference'].apply(
        lambda x: 'Overvalued' if x > 10 else ('Undervalued' if x < -10 else 'Fair Value')
    )

    return results[['Player', 'Age', 'Market_Value_M', 'Predicted_Value',
                    'Value_Difference', 'Status']].sort_values('Value_Difference')
value_analysis = analyze_value_discrepancies(market_data, gb_model, scaler, feature_columns)

print("\nValue Analysis:")
print(value_analysis)

print("\nUndervalued Players (Potential Bargains):")
print(value_analysis[value_analysis['Status'] == 'Undervalued'])
```

## R Implementation

```r
library(tidyverse)
library(randomForest)
library(gbm)
library(glmnet)
library(caret)
library(Metrics)

# Sample player market data
market_data <- data.frame(
  Player = c("Player A", "Player B", "Player C", "Player D", "Player E",
             "Player F", "Player G", "Player H", "Player I", "Player J",
             "Player K", "Player L", "Player M", "Player N", "Player O"),
  Age = c(24, 28, 22, 26, 30, 25, 27, 23, 29, 26, 24, 28, 25, 27, 23),
  Goals_p90 = c(0.65, 0.42, 0.85, 0.55, 0.38, 0.72, 0.48, 0.91, 0.35, 0.68,
                0.55, 0.45, 0.78, 0.52, 0.88),
  Assists_p90 = c(0.35, 0.52, 0.25, 0.45, 0.55, 0.38, 0.48, 0.22, 0.62, 0.42,
                  0.38, 0.58, 0.32, 0.48, 0.28),
  xG_p90 = c(0.58, 0.38, 0.78, 0.52, 0.35, 0.68, 0.45, 0.85, 0.32, 0.62,
             0.52, 0.42, 0.72, 0.48, 0.82),
  Minutes_Played = c(2800, 2400, 2900, 2600, 2100, 2700, 2500, 2850,
                     2200, 2650, 2750, 2450, 2800, 2550, 2900),
  League_Quality = c(8, 7, 9, 8, 7, 9, 8, 9, 7, 8, 8, 7, 9, 8, 9),
  Contract_Years = c(3, 2, 4, 3, 1, 4, 2, 5, 1, 3, 3, 2, 4, 2, 5),
  International_Caps = c(15, 45, 8, 25, 62, 18, 38, 5, 58, 28, 12, 42, 22, 35, 10),
  Market_Value_M = c(45, 35, 75, 52, 28, 68, 42, 85, 25, 58, 48, 38, 65, 45, 82)
)

print("Market Data Sample:")
print(head(market_data))

# Feature engineering (mirrors the Python engineer_features)
engineer_features <- function(df) {
  df %>%
    mutate(
      Age_Peak = abs(Age - 27),
      Age_Squared = Age^2,
      Goal_Contribution = Goals_p90 + Assists_p90,
      Performance_Score = xG_p90 * 0.6 + Assists_p90 * 0.4,
      # Career years = Age - 18 (the original 2024 - (2024 - Age + 18)
      # simplifies to Age - 18); clamp to 1 so an 18-year-old does not
      # divide by zero.
      Minutes_per_Year = Minutes_Played / pmax(Age - 18, 1),
      Experience_Value = International_Caps * League_Quality,
      Contract_Value = Contract_Years * 5
    )
}

market_data_featured <- engineer_features(market_data)

# Prepare features and target
feature_columns <- c("Age", "Age_Peak", "Age_Squared", "Goals_p90", "Assists_p90",
                     "xG_p90", "Minutes_Played", "League_Quality", "Contract_Years",
                     "International_Caps", "Goal_Contribution", "Performance_Score",
                     "Experience_Value", "Contract_Value")

X <- market_data_featured %>% select(all_of(feature_columns))
y <- market_data_featured$Market_Value_M

# Split data
set.seed(42)
train_indices <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_indices, ]
X_test <- X[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]

# Standardize features (fit on the training split only)
preProcess_params <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preProcess_params, X_train)
X_test_scaled <- predict(preProcess_params, X_test)

# Model 1: Random Forest
set.seed(42)
rf_model <- randomForest(
  x = X_train_scaled,
  y = y_train,
  ntree = 100,
  mtry = 5,
  importance = TRUE
)
rf_pred <- predict(rf_model, X_test_scaled)

# Model 2: Gradient Boosting
set.seed(42)
gb_model <- gbm(
  Market_Value_M ~ .,
  data = cbind(X_train_scaled, Market_Value_M = y_train),
  distribution = "gaussian",
  n.trees = 100,
  interaction.depth = 5,
  shrinkage = 0.1,
  verbose = FALSE
)
gb_pred <- predict(gb_model, X_test_scaled, n.trees = 100)

# Model 3: Ridge Regression
ridge_model <- cv.glmnet(
  as.matrix(X_train_scaled), y_train,
  alpha = 0
)
ridge_pred <- predict(ridge_model, as.matrix(X_test_scaled), s = "lambda.min")

# Ensemble prediction. glmnet returns a one-column matrix; coerce it to a
# plain numeric vector so the average stays a vector like the other models.
ensemble_pred <- (rf_pred + gb_pred + as.numeric(ridge_pred)) / 3

# Evaluate models.
# Locals are named *_val so they do not shadow Metrics::mae / Metrics::rmse.
evaluate_model <- function(y_true, y_pred, model_name) {
  mae_val <- mae(y_true, y_pred)
  rmse_val <- rmse(y_true, y_pred)
  r2 <- cor(y_true, y_pred)^2
  mape <- mean(abs((y_true - y_pred) / y_true)) * 100

  cat(sprintf("\n%s Performance:\n", model_name))
  cat(sprintf("  MAE: €%.2fM\n", mae_val))
  cat(sprintf("  RMSE: €%.2fM\n", rmse_val))
  cat(sprintf("  R²: %.3f\n", r2))
  cat(sprintf("  MAPE: %.2f%%\n", mape))

  return(list(MAE = mae_val, RMSE = rmse_val, R2 = r2, MAPE = mape))
}

rf_metrics <- evaluate_model(y_test, rf_pred, "Random Forest")
gb_metrics <- evaluate_model(y_test, gb_pred, "Gradient Boosting")
ridge_metrics <- evaluate_model(y_test, as.numeric(ridge_pred), "Ridge Regression")
ensemble_metrics <- evaluate_model(y_test, ensemble_pred, "Ensemble Model")

# Feature importance
importance_df <- data.frame(
  Feature = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, "%IncMSE"]
) %>%
  arrange(desc(Importance))

# cat() interprets "\n"; print() would show it as a literal backslash-n
cat("\nTop 10 Most Important Features:\n")
print(head(importance_df, 10))

# Visualize predictions vs actual
prediction_df <- data.frame(
  Actual = rep(y_test, 4),
  Predicted = c(rf_pred, gb_pred, as.numeric(ridge_pred), ensemble_pred),
  Model = rep(c("Random Forest", "Gradient Boosting", "Ridge Regression", "Ensemble"),
              each = length(y_test))
)

pred_plot <- ggplot(prediction_df, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.6, size = 3, color = "#2E86AB") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed",
              color = "red", size = 1, alpha = 0.5) +
  facet_wrap(~Model, scales = "free") +
  labs(
    title = "Market Value Predictions vs Actual",
    x = "Actual Market Value (€M)",
    y = "Predicted Market Value (€M)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    strip.text = element_text(size = 11, face = "bold")
  )

print(pred_plot)
ggsave("market_value_predictions_r.png", pred_plot, width = 14, height = 12, dpi = 300)

# Market value by age curve
plot_age_value_curve <- function(model, preprocess_params, feature_cols) {
  ages <- 18:35

  # Create synthetic players at average performance
  synthetic_data <- data.frame(
    Age = ages,
    Goals_p90 = 0.60,
    Assists_p90 = 0.40,
    xG_p90 = 0.55,
    Minutes_Played = 2700,
    League_Quality = 8,
    Contract_Years = 3,
    International_Caps = 25
  )

  synthetic_featured <- engineer_features(synthetic_data)
  synthetic_features <- synthetic_featured %>% select(all_of(feature_cols))
  synthetic_scaled <- predict(preprocess_params, synthetic_features)

  # n.trees is consumed by gbm's predict method; `model` is expected to be a gbm fit
  predicted_values <- predict(model, synthetic_scaled, n.trees = 100)

  age_curve_plot <- ggplot(data.frame(Age = ages, Value = predicted_values),
                           aes(x = Age, y = Value)) +
    geom_line(size = 2, color = "#2E86AB") +
    geom_area(alpha = 0.3, fill = "#2E86AB") +
    labs(
      title = "Market Value by Age (Average Performance Player)",
      x = "Age",
      y = "Predicted Market Value (€M)"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

  print(age_curve_plot)
  ggsave("age_value_curve_r.png", age_curve_plot, width = 10, height = 6, dpi = 300)
}

plot_age_value_curve(gb_model, preProcess_params, feature_columns)

# Value analysis: flag players whose listed value differs from the model
# prediction by more than ±€10M
analyze_value_discrepancies <- function(data, model, preprocess_params, feature_cols) {
  data_featured <- engineer_features(data)
  X_all <- data_featured %>% select(all_of(feature_cols))
  X_scaled <- predict(preprocess_params, X_all)
  predictions <- predict(model, X_scaled, n.trees = 100)

  results <- data %>%
    mutate(
      Predicted_Value = predictions,
      Value_Difference = Market_Value_M - Predicted_Value,
      Value_Ratio = Market_Value_M / Predicted_Value,
      Status = case_when(
        Value_Difference > 10 ~ "Overvalued",
        Value_Difference < -10 ~ "Undervalued",
        TRUE ~ "Fair Value"
      )
    ) %>%
    select(Player, Age, Market_Value_M, Predicted_Value, Value_Difference, Status) %>%
    arrange(Value_Difference)

  return(results)
}

value_analysis <- analyze_value_discrepancies(market_data, gb_model,
                                              preProcess_params, feature_columns)

cat("\nValue Analysis:\n")
print(value_analysis)

cat("\nUndervalued Players (Potential Bargains):\n")
print(value_analysis %>% filter(Status == "Undervalued"))
```

## Key Considerations

### Model Limitations

1. **Data Quality**: Predictions only as good as input data
2. **Market Volatility**: Rapid changes in market conditions
3. **Intangibles**: Cannot capture leadership, mentality, marketing value
4. **Small Sample**: Limited transfer data for validation

### External Factors

- Economic conditions
- Club financial fair play constraints
- COVID-19 impact on market
- Currency fluctuations
- Agent influence

### Position-Specific Models

Consider building separate models for:

- Forwards/Strikers
- Midfielders
- Defenders
- Goalkeepers

## Best Practices

1. **Regular Updates**: Retrain models with latest transfer data
2. **Multiple Models**: Use ensemble approaches
3. **Domain Expertise**: Combine with scout knowledge
4. **Confidence Intervals**: Provide value ranges, not point estimates
5. **Market Context**: Consider current market trends
6. **Validation**: Back-test predictions against actual transfers

## Use Cases

1. **Transfer Strategy**: Identify undervalued players
2. **Contract Negotiations**: Data-driven salary discussions
3. **Squad Valuation**: Estimate total squad worth
4. **Financial Planning**: Budget forecasting
5. **Performance Evaluation**: Compare value to performance

Discussion

Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.