Basketball Reference Data Extraction

Beginner 10 min read 1 view Nov 27, 2025
# Basketball Reference Data Extraction

## Introduction

Basketball-Reference.com is a comprehensive source for historical and current NBA statistics. This guide covers ethical web scraping techniques for data extraction.

## Python Implementation

```python
from io import StringIO
from time import monotonic, sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment


class BBRefScraper:
    """Download statistics tables from Basketball-Reference as DataFrames.

    Every request goes through one helper that spaces requests out, applies
    a timeout, and raises on HTTP error statuses, so the public methods only
    differ in the URL they build and the table id they look up.

    Args:
        request_delay: Minimum number of seconds between two consecutive
            requests made by this process.
        timeout: Seconds to wait for the server before giving up.
    """

    # Monotonic timestamp of the most recent request made by any instance.
    # Kept on the class so the delay also holds across scraper objects.
    _last_request_at = None

    def __init__(self, request_delay=3.0, timeout=30.0):
        self.base_url = "https://www.basketball-reference.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Research/Educational)'
        }
        self.request_delay = request_delay
        self.timeout = timeout

    def _fetch_soup(self, url):
        """Fetch ``url`` politely and return the parsed page.

        Sleeps only for whatever part of ``request_delay`` has not already
        elapsed since the previous request, so the first call is immediate.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: On connection problems or timeout.
        """
        last = BBRefScraper._last_request_at
        if last is not None:
            remaining = self.request_delay - (monotonic() - last)
            if remaining > 0:
                sleep(remaining)
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.timeout
            )
        finally:
            # Record the attempt even when it failed, so a retry is throttled.
            BBRefScraper._last_request_at = monotonic()
        # Fail here rather than parsing an error page as if it were data.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    @staticmethod
    def _read_table(soup, table_id):
        """Return the table with id ``table_id`` from ``soup`` as a DataFrame.

        Raises:
            ValueError: If no table with that id exists on the page.
        """
        table = soup.find('table', {'id': table_id})
        if table is None:
            # NOTE(review): Sports-Reference pages are reported to ship some
            # tables inside HTML comments; look there before giving up.
            comments = soup.find_all(
                string=lambda text: isinstance(text, Comment)
            )
            for comment in comments:
                if table_id not in comment:
                    continue
                table = BeautifulSoup(comment, 'html.parser').find(
                    'table', {'id': table_id}
                )
                if table is not None:
                    break
        if table is None:
            raise ValueError(f"Table '{table_id}' not found on page")
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        return pd.read_html(StringIO(str(table)))[0]

    def get_player_season_stats(self, year=2024):
        """Scrape per-game player stats for a season.

        Args:
            year: Season identifier used in the league page URL.

        Returns:
            DataFrame of the ``per_game_stats`` table with repeated header
            rows removed. The original row index is kept.
        """
        url = f"{self.base_url}/leagues/NBA_{year}_per_game.html"
        df = self._read_table(self._fetch_soup(url), 'per_game_stats')

        # Clean multi-level columns if present
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel()

        # Remove header rows that appear in data
        df = df[df['Player'] != 'Player']
        return df

    def get_team_stats(self, year=2024):
        """Scrape per-game team statistics for a season.

        Args:
            year: Season identifier used in the league page URL.

        Returns:
            DataFrame of the ``per_game-team`` table.
        """
        url = f"{self.base_url}/leagues/NBA_{year}.html"
        return self._read_table(self._fetch_soup(url), 'per_game-team')

    def get_player_career_stats(self, player_id):
        """Get career per-game statistics for a specific player.

        Args:
            player_id: Player page identifier; its first character selects
                the directory in the URL.

        Returns:
            DataFrame of the ``per_game`` table on the player page.
        """
        url = f"{self.base_url}/players/{player_id[0]}/{player_id}.html"
        return self._read_table(self._fetch_soup(url), 'per_game')


# Usage
if __name__ == "__main__":
    scraper = BBRefScraper()
    stats_2024 = scraper.get_player_season_stats(2024)
    print(stats_2024.head())
```

## R Implementation

```r
library(rvest)
library(dplyr)
library(purrr)

# Download `url` and return the element with id `table_id` as a data frame.
# Stops with an explicit message when the page has no such element, instead
# of failing later inside html_table().
read_bbref_table <- function(url, table_id) {
  page <- read_html(url)
  node <- html_node(page, paste0("#", table_id))
  if (inherits(node, "xml_missing")) {
    stop("Table '#", table_id, "' not found at ", url, call. = FALSE)
  }
  html_table(node)
}

# Per-game player stats for one season, with repeated header rows removed
# and empty strings in character columns converted to NA.
get_bbref_player_stats <- function(year = 2024) {
  url <- paste0("https://www.basketball-reference.com/leagues/NBA_",
                year, "_per_game.html")

  stats <- read_bbref_table(url, "per_game_stats")

  # Clean the data
  stats <- stats %>%
    filter(Player != "Player") %>%  # Remove header rows
    mutate(across(where(is.character), ~na_if(., "")))

  return(stats)
}

# Per-game team stats for one season.
get_bbref_team_stats <- function(year = 2024) {
  url <- paste0("https://www.basketball-reference.com/leagues/NBA_",
                year, ".html")

  Sys.sleep(3)  # Rate limiting

  team_stats <- read_bbref_table(url, "per_game-team")

  return(team_stats)
}

# Usage
player_stats <- get_bbref_player_stats(2024)
head(player_stats)
```

## Ethical Scraping Guidelines

1. Implement respectful rate limiting (3+ seconds between requests)
2. Use a descriptive User-Agent that identifies your purpose
3. Cache downloaded data to minimize requests
4. Check and respect robots.txt
5. Consider using official APIs when available

## Common Data Tables

- Per-game stats (`#per_game_stats`)
- Advanced stats (`#advanced_stats`)
- Team stats (`#per_game-team`)
- Shooting stats (`#shooting_stats`)
- Play-by-play (`#pbp`)

Discussion

Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.