Player Similarity Analysis

Beginner 10 min read 1 views Nov 27, 2025
# Player Similarity Analysis

## Overview

Player similarity analysis identifies comparable players based on their statistical profiles. This is crucial for recruitment, finding tactical replacements, and understanding player roles. Multiple methodologies can be employed, from simple distance metrics to sophisticated machine learning algorithms.

## Core Concepts

### Similarity Metrics

1. **Euclidean Distance**: Straight-line distance in n-dimensional space
2. **Cosine Similarity**: Angle between player vectors
3. **Manhattan Distance**: Sum of absolute differences
4. **Mahalanobis Distance**: Accounts for feature correlations

### Dimensionality Reduction

- **PCA**: Principal Component Analysis
- **t-SNE**: t-Distributed Stochastic Neighbor Embedding
- **UMAP**: Uniform Manifold Approximation and Projection

## Python Implementation

```python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist

# Sample player dataset
player_stats = pd.DataFrame({
    'Player': ['Messi', 'Ronaldo', 'Neymar', 'Mbappe', 'Haaland', 'Kane',
               'Lewandowski', 'Benzema', 'Salah', 'De Bruyne', 'van Dijk',
               'Ramos', 'Kante', 'Modric', 'Busquets'],
    'Goals_p90': [0.85, 0.92, 0.55, 0.78, 1.05, 0.68, 0.95, 0.72, 0.75, 0.25,
                  0.05, 0.08, 0.12, 0.15, 0.08],
    'Assists_p90': [0.45, 0.32, 0.42, 0.38, 0.25, 0.35, 0.28, 0.40, 0.35, 0.55,
                    0.05, 0.08, 0.18, 0.35, 0.22],
    'Key_Passes_p90': [2.8, 1.5, 2.5, 2.2, 1.2, 2.0, 1.8, 2.1, 2.3, 3.5,
                       0.5, 0.6, 1.2, 2.5, 2.8],
    'Dribbles_p90': [4.2, 2.1, 5.5, 4.8, 1.2, 1.5, 1.3, 1.8, 2.5, 1.8,
                     0.3, 0.5, 1.8, 2.2, 0.8],
    'Tackles_p90': [0.8, 1.2, 1.5, 1.3, 0.5, 1.0, 0.7, 0.9, 1.2, 2.5,
                    1.8, 2.5, 4.5, 3.2, 3.8],
    'Aerial_Duels_Won': [45, 68, 35, 42, 75, 62, 58, 55, 48, 52,
                         78, 72, 45, 50, 48],
    'Pass_Completion': [84, 82, 81, 83, 75, 78, 80, 82, 83, 88,
                        90, 87, 89, 90, 92],
    'Position': ['FW', 'FW', 'FW', 'FW', 'FW', 'FW', 'FW', 'FW', 'FW', 'MF',
                 'DF', 'DF', 'MF', 'MF', 'MF'],
})

print("Player Statistics:")
print(player_stats.head())

# Prepare data for similarity analysis
stat_columns = ['Goals_p90', 'Assists_p90', 'Key_Passes_p90', 'Dribbles_p90',
                'Tackles_p90', 'Aerial_Duels_Won', 'Pass_Completion']

# Standardize features so that large-scale columns (e.g. Aerial_Duels_Won)
# do not dominate the distance calculations
scaler = StandardScaler()
stats_scaled = scaler.fit_transform(player_stats[stat_columns])
stats_scaled_df = pd.DataFrame(stats_scaled, columns=stat_columns,
                               index=player_stats['Player'])

# Colors used by both scatter plots; positions missing from the map fall
# back to a neutral grey instead of raising KeyError
POSITION_COLORS = {'FW': '#E63946', 'MF': '#457B9D', 'DF': '#2A9D8F'}
FALLBACK_COLOR = '#6C757D'


def _player_position(player_name, data):
    """
    Return the 0-based row position of ``player_name`` in ``data``.

    The position (not the index label) is returned because it is used to
    address rows of plain NumPy matrices, which know nothing about the
    DataFrame index.

    Raises:
        IndexError: if no row of ``data['Player']`` equals ``player_name``.
    """
    matches = np.flatnonzero((data['Player'] == player_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Player {player_name!r} not found in data['Player']")
    return int(matches[0])


def _rank_excluding_self(scores, player_idx, top_n, descending=False):
    """
    Return the positions of the ``top_n`` best scores, skipping ``player_idx``.

    The query player is removed explicitly rather than by assuming it sorts
    first, which would fail when another player has identical statistics.
    """
    order = np.argsort(-scores if descending else scores, kind='stable')
    return order[order != player_idx][:top_n]


def _distance_to_score(distances, max_distance):
    """
    Convert distances to a 0-100 score where ``max_distance`` maps to 0.

    When ``max_distance`` is not positive (all players identical) every
    player is reported as 100 instead of dividing by zero.
    """
    if max_distance <= 0:
        return np.full(len(distances), 100.0)
    return 100 * (1 - distances / max_distance)


# Method 1: Euclidean Distance Similarity
def find_similar_players_euclidean(player_name, data, stats_df, top_n=5):
    """
    Find most similar players using Euclidean distance

    Args:
        player_name: Value to look up in ``data['Player']``.
        data: DataFrame with a 'Player' column; its rows must be in the
            same order as the rows of ``stats_df``.
        stats_df: Scaled feature matrix (DataFrame or 2-D array), one row
            per player.
        top_n: Maximum number of similar players to return.

    Returns:
        DataFrame with columns 'Player', 'Distance' and 'Similarity_Score'
        (0-100, relative to this player's largest distance), closest first.

    Raises:
        IndexError: if ``player_name`` is not present in ``data``.
    """
    player_idx = _player_position(player_name, data)
    features = np.asarray(stats_df, dtype=float)

    # Only the query player's row of the distance matrix is needed
    player_distances = euclidean_distances(features[[player_idx]], features)[0]

    similar_indices = _rank_excluding_self(player_distances, player_idx, top_n)
    nearest = player_distances[similar_indices]

    results = pd.DataFrame({
        'Player': data['Player'].to_numpy()[similar_indices],
        'Distance': nearest,
        'Similarity_Score': _distance_to_score(nearest, player_distances.max()),
    })
    return results


# Find similar players to Messi
messi_similar = find_similar_players_euclidean('Messi', player_stats,
                                               stats_scaled_df, top_n=5)
print("\nPlayers Most Similar to Messi (Euclidean):")
print(messi_similar)


# Method 2: Cosine Similarity
def find_similar_players_cosine(player_name, data, stats_df, top_n=5):
    """
    Find most similar players using cosine similarity

    Args:
        player_name: Value to look up in ``data['Player']``.
        data: DataFrame with a 'Player' column; its rows must be in the
            same order as the rows of ``stats_df``.
        stats_df: Scaled feature matrix (DataFrame or 2-D array), one row
            per player.
        top_n: Maximum number of similar players to return.

    Returns:
        DataFrame with columns 'Player', 'Cosine_Similarity' and
        'Similarity_Score' (cosine similarity times 100), most similar first.

    Raises:
        IndexError: if ``player_name`` is not present in ``data``.
    """
    player_idx = _player_position(player_name, data)
    features = np.asarray(stats_df, dtype=float)

    # Only the query player's row of the similarity matrix is needed
    player_similarities = cosine_similarity(features[[player_idx]], features)[0]

    similar_indices = _rank_excluding_self(player_similarities, player_idx,
                                           top_n, descending=True)
    closest = player_similarities[similar_indices]

    results = pd.DataFrame({
        'Player': data['Player'].to_numpy()[similar_indices],
        'Cosine_Similarity': closest,
        'Similarity_Score': closest * 100,
    })
    return results


messi_similar_cosine = find_similar_players_cosine('Messi', player_stats,
                                                   stats_scaled_df, top_n=5)
print("\nPlayers Most Similar to Messi (Cosine):")
print(messi_similar_cosine)


# Method 3: Custom Weighted Similarity
def find_similar_players_weighted(player_name, data, stats_df, weights=None,
                                  top_n=5):
    """
    Find similar players with custom attribute weights

    Each feature column is multiplied by its weight before the Euclidean
    distance is taken, so a weight of 2.0 makes differences in that
    attribute count twice as much along that axis.

    Args:
        player_name: Value to look up in ``data['Player']``.
        data: DataFrame with a 'Player' column; its rows must be in the
            same order as the rows of ``stats_df``.
        stats_df: Scaled feature matrix (DataFrame or 2-D array), one row
            per player.
        weights: One weight per column of ``stats_df``. ``None`` means
            equal weights.
        top_n: Maximum number of similar players to return.

    Returns:
        DataFrame with columns 'Player', 'Weighted_Distance' and
        'Similarity_Score' (0-100), closest first.

    Raises:
        IndexError: if ``player_name`` is not present in ``data``.
        ValueError: if ``weights`` does not have one entry per feature.
    """
    features = np.asarray(stats_df, dtype=float)
    n_features = features.shape[1]

    if weights is None:
        # Default equal weights, sized from the matrix actually passed in
        weights = np.ones(n_features)
    weights = np.asarray(weights, dtype=float)
    if weights.shape != (n_features,):
        raise ValueError(
            f"weights must have {n_features} entries, got shape {weights.shape}"
        )

    # Apply weights
    weighted_stats = features * weights

    player_idx = _player_position(player_name, data)

    # Calculate weighted Euclidean distance from the query player only
    player_distances = cdist(weighted_stats[[player_idx]], weighted_stats,
                             metric='euclidean')[0]

    similar_indices = _rank_excluding_self(player_distances, player_idx, top_n)
    nearest = player_distances[similar_indices]

    results = pd.DataFrame({
        'Player': data['Player'].to_numpy()[similar_indices],
        'Weighted_Distance': nearest,
        'Similarity_Score': _distance_to_score(nearest, player_distances.max()),
    })
    return results


# Emphasize attacking attributes for forwards
attacking_weights = np.array([2.0, 2.0, 1.5, 1.5, 0.5, 1.0, 0.8])
messi_similar_weighted = find_similar_players_weighted(
    'Messi', player_stats, stats_scaled_df,
    weights=attacking_weights, top_n=5
)
print("\nPlayers Most Similar to Messi (Weighted):")
print(messi_similar_weighted)


# Method 4: K-Means Clustering
def cluster_players(stats_df, data, n_clusters=4):
    """
    Cluster players into similar groups

    Args:
        stats_df: Scaled feature matrix, one row per player, in the same
            row order as ``data``.
        data: DataFrame of player information; it is not modified.
        n_clusters: Number of K-Means clusters.

    Returns:
        Tuple ``(data_clustered, kmeans)``: a copy of ``data`` with an added
        'Cluster' column, and the fitted KMeans model.
    """
    # n_init is set explicitly: its default changed between scikit-learn
    # releases, so leaving it out gives version-dependent clusters and a
    # FutureWarning on some versions
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)

    data_clustered = data.copy()
    data_clustered['Cluster'] = kmeans.fit_predict(stats_df)
    return data_clustered, kmeans


player_clusters, kmeans_model = cluster_players(stats_scaled_df, player_stats,
                                                n_clusters=4)
print("\nPlayer Clusters:")
for cluster in sorted(player_clusters['Cluster'].unique()):
    print(f"\nCluster {cluster}:")
    print(player_clusters[player_clusters['Cluster'] == cluster]['Player'].tolist())


def _scatter_by_position(ax, coords_df, x_col, y_col):
    """Draw one scatter series per position so each gets a legend entry."""
    for position, group in coords_df.groupby('Position', sort=False):
        ax.scatter(group[x_col], group[y_col],
                   c=POSITION_COLORS.get(position, FALLBACK_COLOR),
                   label=position, s=100, alpha=0.6)


# Method 5: PCA Visualization
def visualize_player_similarity_pca(stats_df, data, highlight_player=None,
                                    save_path='player_similarity_pca.png'):
    """
    Visualize player similarity using PCA

    Args:
        stats_df: Scaled feature matrix, one row per player, in the same
            row order as ``data``.
        data: DataFrame with 'Player' and 'Position' columns.
        highlight_player: Optional player name to emphasize on the plot.
        save_path: File the figure is written to; ``None`` skips saving.

    Returns:
        Tuple ``(pca_df, pca)``: the 2-D coordinates with player/position
        labels, and the fitted PCA model.
    """
    # Apply PCA
    pca = PCA(n_components=2)
    pca_coords = pca.fit_transform(stats_df)

    pca_df = pd.DataFrame({
        'PC1': pca_coords[:, 0],
        'PC2': pca_coords[:, 1],
        'Player': data['Player'].values,
        'Position': data['Position'].values,
    })

    fig, ax = plt.subplots(figsize=(12, 8))

    # Color by position
    _scatter_by_position(ax, pca_df, 'PC1', 'PC2')

    # Annotate players; the highlighted one gets a larger bold label
    for row in pca_df.itertuples(index=False):
        is_highlight = row.Player == highlight_player
        ax.annotate(row.Player, (row.PC1, row.PC2),
                    fontsize=12 if is_highlight else 9,
                    fontweight='bold' if is_highlight else 'normal',
                    alpha=0.8)

    # Highlight specific player with a ring around the marker
    if highlight_player:
        highlight_data = pca_df[pca_df['Player'] == highlight_player]
        ax.scatter(highlight_data['PC1'], highlight_data['PC2'],
                   s=300, facecolors='none', edgecolors='black', linewidth=2)

    ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    ax.set_title('Player Similarity Map (PCA)', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    return pca_df, pca


pca_results, pca_model = visualize_player_similarity_pca(
    stats_scaled_df, player_stats, highlight_player='Messi'
)


# Method 6: t-SNE Visualization
def visualize_player_similarity_tsne(stats_df, data, perplexity=5,
                                     save_path='player_similarity_tsne.png'):
    """
    Visualize player similarity using t-SNE

    Args:
        stats_df: Scaled feature matrix, one row per player, in the same
            row order as ``data``.
        data: DataFrame with 'Player' and 'Position' columns.
        perplexity: Requested t-SNE perplexity; it is lowered automatically
            when the dataset has too few players for the requested value.
        save_path: File the figure is written to; ``None`` skips saving.

    Returns:
        DataFrame with columns 'Dim1', 'Dim2', 'Player' and 'Position'.
    """
    # scikit-learn requires perplexity < n_samples, so clamp for small squads
    effective_perplexity = min(perplexity, max(len(stats_df) - 1, 1))

    # Apply t-SNE
    tsne = TSNE(n_components=2, perplexity=effective_perplexity,
                random_state=42)
    tsne_coords = tsne.fit_transform(stats_df)

    tsne_df = pd.DataFrame({
        'Dim1': tsne_coords[:, 0],
        'Dim2': tsne_coords[:, 1],
        'Player': data['Player'].values,
        'Position': data['Position'].values,
    })

    fig, ax = plt.subplots(figsize=(12, 8))

    _scatter_by_position(ax, tsne_df, 'Dim1', 'Dim2')

    for row in tsne_df.itertuples(index=False):
        ax.annotate(row.Player, (row.Dim1, row.Dim2), fontsize=9, alpha=0.8)

    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_title('Player Similarity Map (t-SNE)', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    return tsne_df


tsne_results = visualize_player_similarity_tsne(stats_scaled_df, player_stats)


# Comprehensive similarity report
def generate_similarity_report(player_name, data, stats_df):
    """
    Generate comprehensive similarity report for a player

    Prints the player's raw statistics, the five most similar players by
    Euclidean distance and by cosine similarity, and the other members of
    the player's K-Means cluster.

    Args:
        player_name: Value to look up in ``data['Player']``.
        data: DataFrame with a 'Player' column and the raw stat columns.
        stats_df: Scaled feature matrix, one row per player, in the same
            row order as ``data``.

    Raises:
        IndexError: if ``player_name`` is not present in ``data``.
    """
    # Fail early with a clear message before anything is printed
    player_row = data.iloc[_player_position(player_name, data)]

    # Report the columns of the matrix that was passed in; fall back to the
    # module-level list when a bare array is given
    if hasattr(stats_df, 'columns'):
        report_columns = list(stats_df.columns)
    else:
        report_columns = stat_columns

    print(f"\n{'='*60}")
    print(f"SIMILARITY REPORT FOR {player_name.upper()}")
    print(f"{'='*60}\n")

    # Player stats
    print("Player Statistics:")
    for col in report_columns:
        print(f" {col}: {player_row[col]:.2f}")

    # Top 5 similar players (Euclidean)
    print("\n--- Most Similar Players (Euclidean Distance) ---")
    similar_euc = find_similar_players_euclidean(player_name, data, stats_df,
                                                 top_n=5)
    print(similar_euc.to_string(index=False))

    # Top 5 similar players (Cosine)
    print("\n--- Most Similar Players (Cosine Similarity) ---")
    similar_cos = find_similar_players_cosine(player_name, data, stats_df,
                                              top_n=5)
    print(similar_cos.to_string(index=False))

    # Cluster membership
    clusters_data, _ = cluster_players(stats_df, data, n_clusters=4)
    player_cluster = clusters_data[clusters_data['Player'] == player_name]['Cluster'].values[0]
    cluster_members = clusters_data[clusters_data['Cluster'] == player_cluster]['Player'].tolist()

    print(f"\n--- Cluster Membership (Cluster {player_cluster}) ---")
    print(f"Players in same cluster: {', '.join([p for p in cluster_members if p != player_name])}")


generate_similarity_report('Messi', player_stats, stats_scaled_df)
```

## R Implementation

```r
library(tidyverse)
library(cluster)
library(factoextra)
library(proxy)
library(Rtsne)
library(ggrepel)

# Sample player dataset
player_stats <- data.frame(
  Player = c("Messi", "Ronaldo", "Neymar", "Mbappe", "Haaland", "Kane",
             "Lewandowski", "Benzema", "Salah", "De Bruyne", "van Dijk",
             "Ramos", "Kante", "Modric", "Busquets"),
  Goals_p90 = c(0.85, 0.92, 0.55, 0.78, 1.05, 0.68, 0.95, 0.72, 0.75, 0.25,
                0.05, 0.08, 0.12, 0.15, 0.08),
  Assists_p90 = c(0.45, 0.32, 0.42, 0.38, 0.25, 0.35, 0.28, 0.40, 0.35, 0.55,
                  0.05, 0.08, 0.18, 0.35, 0.22),
  Key_Passes_p90 = c(2.8, 1.5, 2.5, 2.2, 1.2, 2.0, 1.8, 2.1, 2.3, 3.5,
                     0.5, 0.6, 1.2, 2.5, 2.8),
  Dribbles_p90 = c(4.2, 2.1, 5.5, 4.8, 1.2, 1.5, 1.3, 1.8, 2.5, 1.8,
                   0.3, 0.5, 1.8, 2.2, 0.8),
  Tackles_p90 = c(0.8, 1.2, 1.5, 1.3, 0.5, 1.0, 0.7, 0.9, 1.2, 2.5,
                  1.8, 2.5, 4.5, 3.2, 3.8),
  Aerial_Duels_Won = c(45, 68, 35, 42, 75, 62, 58, 55, 48, 52,
                       78, 72, 45, 50, 48),
  Pass_Completion = c(84, 82, 81, 83, 75, 78, 80, 82, 83, 88,
                      90, 87, 89, 90, 92),
  Position = c("FW", "FW", "FW", "FW", "FW", "FW", "FW", "FW", "FW", "MF",
               "DF", "DF", "MF", "MF", "MF")
)

print("Player Statistics:")
print(head(player_stats))

# Prepare data
stat_columns <- c("Goals_p90", "Assists_p90", "Key_Passes_p90", "Dribbles_p90",
                  "Tackles_p90", "Aerial_Duels_Won", "Pass_Completion")

# Standardize features
stats_scaled <- player_stats %>%
  select(all_of(stat_columns)) %>%
  scale()
rownames(stats_scaled) <- player_stats$Player

# Method 1: Find similar players using Euclidean distance
# Returns a data frame (Player, Distance, Similarity_Score) with at most
# top_n rows, closest first. stats_matrix must have player names as row names.
find_similar_players_euclidean <- function(player_name, data, stats_matrix, top_n = 5) {
  if (!player_name %in% rownames(stats_matrix)) {
    stop(sprintf("Player '%s' not found in stats_matrix row names", player_name))
  }

  # Calculate distance matrix. stats::dist is named explicitly because
  # library(proxy) masks dist().
  distances <- stats::dist(stats_matrix, method = "euclidean")
  dist_matrix <- as.matrix(distances)

  # Get player distances and drop the player by name rather than by
  # assuming they sort first
  player_distances <- dist_matrix[player_name, ]
  other_distances <- player_distances[names(player_distances) != player_name]

  # head() returns fewer rows instead of NA padding when top_n is too large
  similar_players <- head(sort(other_distances), top_n)

  results <- data.frame(
    Player = names(similar_players),
    Distance = as.numeric(similar_players),
    Similarity_Score = 100 * (1 - as.numeric(similar_players) / max(dist_matrix))
  )

  return(results)
}

# Find similar players to Messi
messi_similar <- find_similar_players_euclidean("Messi", player_stats, stats_scaled, top_n = 5)
print("Players Most Similar to Messi (Euclidean):")
print(messi_similar)

# Method 2: Cosine similarity
# Returns a data frame (Player, Cosine_Similarity, Similarity_Score) with at
# most top_n rows, most similar first.
find_similar_players_cosine <- function(player_name, data, stats_matrix, top_n = 5) {
  if (!player_name %in% rownames(stats_matrix)) {
    stop(sprintf("Player '%s' not found in stats_matrix row names", player_name))
  }

  # Calculate cosine similarity
  cos_sim <- proxy::simil(stats_matrix, method = "cosine")
  cos_matrix <- as.matrix(cos_sim)

  # Get player similarities and remove the player by name. Relying on
  # sort(...)[-1] is unsafe: when the diagonal of the converted matrix is
  # NA, sort() already drops it and [-1] would discard the best match.
  player_similarities <- cos_matrix[player_name, ]
  other_similarities <- player_similarities[names(player_similarities) != player_name]

  similar_players <- head(sort(other_similarities, decreasing = TRUE), top_n)

  results <- data.frame(
    Player = names(similar_players),
    Cosine_Similarity = as.numeric(similar_players),
    Similarity_Score = as.numeric(similar_players) * 100
  )

  return(results)
}

messi_similar_cosine <- find_similar_players_cosine("Messi", player_stats, stats_scaled, top_n = 5)
print("Players Most Similar to Messi (Cosine):")
print(messi_similar_cosine)

# Method 3: K-means clustering
set.seed(42)
kmeans_result <- kmeans(stats_scaled, centers = 4, nstart = 25)

player_clusters <- player_stats %>%
  mutate(Cluster = kmeans_result$cluster)

print("Player Clusters:")
for (cluster in sort(unique(player_clusters$Cluster))) {
  cat(sprintf("\nCluster %d:\n", cluster))
  players <- player_clusters %>%
    filter(Cluster == cluster) %>%
    pull(Player)
  cat(paste(players, collapse = ", "), "\n")
}

# Method 4: PCA visualization
pca_result <- prcomp(stats_scaled)

pca_df <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Player = player_stats$Player,
  Position = player_stats$Position
)

# Calculate variance explained
var_explained <- summary(pca_result)$importance[2, 1:2] * 100

pca_plot <- ggplot(pca_df, aes(x = PC1, y = PC2, color = Position, label = Player)) +
  geom_point(size = 4, alpha = 0.6) +
  geom_text_repel(size = 3, fontface = "bold") +
  scale_color_manual(values = c("FW" = "#E63946", "MF" = "#457B9D", "DF" = "#2A9D8F")) +
  labs(
    title = "Player Similarity Map (PCA)",
    x = sprintf("PC1 (%.1f%% variance)", var_explained[1]),
    y = sprintf("PC2 (%.1f%% variance)", var_explained[2])
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "bottom"
  )

print(pca_plot)
ggsave("player_similarity_pca_r.png", pca_plot, width = 12, height = 8, dpi = 300)

# Method 5: t-SNE visualization
set.seed(42)

# Rtsne stops with "perplexity is too large" unless
# 3 * perplexity <= nrow(X) - 1, so cap the requested value of 5 at the
# largest perplexity this dataset allows (4 for 15 players)
tsne_perplexity <- min(5, floor((nrow(stats_scaled) - 1) / 3))

tsne_result <- Rtsne(stats_scaled, dims = 2, perplexity = tsne_perplexity,
                     verbose = FALSE, max_iter = 500)

tsne_df <- data.frame(
  Dim1 = tsne_result$Y[, 1],
  Dim2 = tsne_result$Y[, 2],
  Player = player_stats$Player,
  Position = player_stats$Position
)

tsne_plot <- ggplot(tsne_df, aes(x = Dim1, y = Dim2, color = Position, label = Player)) +
  geom_point(size = 4, alpha = 0.6) +
  geom_text_repel(size = 3, fontface = "bold") +
  scale_color_manual(values = c("FW" = "#E63946", "MF" = "#457B9D", "DF" = "#2A9D8F")) +
  labs(
    title = "Player Similarity Map (t-SNE)",
    x = "Dimension 1",
    y = "Dimension 2"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "bottom"
  )

print(tsne_plot)
ggsave("player_similarity_tsne_r.png", tsne_plot, width = 12, height = 8, dpi = 300)

# Comprehensive similarity report
# Prints the player's raw statistics followed by the five most similar
# players by Euclidean distance and by cosine similarity.
generate_similarity_report <- function(player_name, data, stats_matrix) {
  cat(sprintf("\n%s\n", strrep("=", 60)))
  cat(sprintf("SIMILARITY REPORT FOR %s\n", toupper(player_name)))
  cat(sprintf("%s\n\n", strrep("=", 60)))

  # Player stats
  player_row <- data %>% filter(Player == player_name)
  cat("Player Statistics:\n")
  for (col in stat_columns) {
    cat(sprintf(" %s: %.2f\n", col, player_row[[col]]))
  }

  # Similar players
  cat("\n--- Most Similar Players (Euclidean Distance) ---\n")
  similar_euc <- find_similar_players_euclidean(player_name, data, stats_matrix, top_n = 5)
  print(similar_euc)

  cat("\n--- Most Similar Players (Cosine Similarity) ---\n")
  similar_cos <- find_similar_players_cosine(player_name, data, stats_matrix, top_n = 5)
  print(similar_cos)
}

generate_similarity_report("Messi", player_stats, stats_scaled)
```

## Best Practices

### Data Preparation

1. **Standardization**: Always scale features (z-score or min-max)
2. **Feature Selection**: Choose relevant, independent metrics
3. **Position Filtering**: Compare within same position group when appropriate
4. **Minimum Minutes**: Filter players with sufficient playing time

### Methodology Selection

- **Euclidean Distance**: Good for general similarity
- **Cosine Similarity**: Better for playing style similarity
- **Weighted Distance**: Use when certain attributes are more important
- **Clustering**: Good for finding player archetypes

### Validation

- Cross-check with qualitative analysis
- Consider tactical context and team style
- Account for league quality differences
- Validate with domain experts

## Use Cases

1. **Recruitment**: Find affordable alternatives to expensive targets
2. **Succession Planning**: Identify replacements for aging players
3. **Tactical Analysis**: Find players who fit specific roles
4. **Market Analysis**: Identify undervalued players
5. **Youth Development**: Compare young players to established stars

Discussion

Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.