Using baseballr for Data Collection

Beginner 10 min read 0 views Nov 26, 2025

Introduction to baseballr

The baseballr package is a comprehensive R package designed to facilitate the acquisition and analysis of baseball data from multiple sources. Created and maintained by Bill Petti, baseballr provides a unified interface to access data from FanGraphs, Baseball Savant (Statcast), the MLB Stats API, and other baseball data repositories.

Installation and Setup

# Install devtools only when it is not already available, so re-running this
# setup does not reinstall it every time
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}

# Install baseballr from GitHub (development version)
devtools::install_github("BillPetti/baseballr")

# Load required packages
library(baseballr)
library(tidyverse)
library(dplyr)
library(ggplot2)

Key Functions Reference

| Function | Data Source | Description |
|---|---|---|
| fg_batter_leaders() | FanGraphs | Batting leaderboard data |
| fg_pitch_leaders() | FanGraphs | Pitching leaderboard data |
| scrape_statcast_savant() | Baseball Savant | Pitch-level Statcast data |
| scrape_statcast_savant_batter() | Baseball Savant | Batter-specific Statcast data |
| scrape_statcast_savant_pitcher() | Baseball Savant | Pitcher-specific Statcast data |
| mlb_standings() | MLB Stats API | Division and league standings |
| playerid_lookup() | Chadwick Bureau | Player ID lookup across systems |

Working with FanGraphs Data

# 2024 FanGraphs batting leaderboard, qualified hitters only
batters_2024 <- fg_batter_leaders(startseason = 2024, endseason = 2024,
                                  qual = "y", ind = 1)

# Ten best hitters by wRC+: rank the full table first, then keep only the
# headline columns for display
batters_2024 %>%
  arrange(desc(wRC_plus)) %>%
  select(Name, Team, PA, AVG, OBP, SLG, wRC_plus, WAR) %>%
  head(n = 10)

# Analyze power hitters: everyone with at least 30 home runs, most HR first.
# baseballr rewrites the "%" in FanGraphs column headers to a lowercase
# "_pct" suffix, so the barrel-rate column is Barrel_pct (not Barrel_Pct).
# NOTE(review): recent baseballr releases may name the player/team columns
# PlayerName/team_name instead of Name/Team - confirm with names(batters_2024).
power_hitters <- batters_2024 %>%
  filter(HR >= 30) %>%
  select(Name, Team, HR, AVG, ISO, Barrel_pct) %>%
  arrange(desc(HR))

print(power_hitters)

Multi-Season Analysis

# FanGraphs batting leaderboard covering 2019 through 2024; ind = 0 asks for
# one aggregated row per player rather than one row per season
batters_multi <- fg_batter_leaders(startseason = 2019, endseason = 2024,
                                   qual = "y", ind = 0)

# Consistent performers: hitters with 2,000+ plate appearances over the span,
# ranked by WAR, top 20 only
consistent_performers <- batters_multi %>%
  filter(PA >= 2000) %>%
  arrange(desc(WAR)) %>%
  select(Name, PA, AVG, OBP, SLG, wRC_plus, WAR) %>%
  head(n = 20)

# Scatter of OBP against SLG for hitters with 2,000+ PA.
# Colour encodes wRC+ (diverging around 100) and point size encodes WAR.
batters_multi %>%
  filter(PA >= 2000) %>%
  ggplot(mapping = aes(x = OBP, y = SLG)) +
  geom_point(mapping = aes(color = wRC_plus, size = WAR), alpha = 0.6) +
  scale_color_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 100
  ) +
  labs(
    title = "Batter Performance: OBP vs SLG (2019-2024)",
    x = "On-Base Percentage",
    y = "Slugging Percentage"
  ) +
  theme_minimal()

Pitching Leaders Analysis

# 2024 FanGraphs pitching leaderboard, qualified pitchers only
pitchers_2024 <- fg_pitch_leaders(startseason = 2024, endseason = 2024,
                                  qual = "y", ind = 1)

# Elite pitchers: the 15 lowest FIP values (ascending sort, lower is better)
elite_pitchers <- pitchers_2024 %>%
  arrange(FIP) %>%
  select(Name, Team, IP, ERA, FIP, xFIP, K_9, BB_9, WAR) %>%
  head(n = 15)

# Strikeout leaders: pitchers with at least 140 innings, ranked by K/9.
# baseballr rewrites the "%" in FanGraphs column headers to a lowercase
# "_pct" suffix, so swinging-strike rate is SwStr_pct (not SwStr_Pct).
# NOTE(review): recent baseballr releases may name the player/team columns
# PlayerName/team_name instead of Name/Team - confirm with names(pitchers_2024).
strikeout_leaders <- pitchers_2024 %>%
  filter(IP >= 140) %>%
  select(Name, Team, IP, K_9, SwStr_pct) %>%
  arrange(desc(K_9)) %>%
  head(10)

Working with Statcast Data

# Get Statcast data for a date range.
# Baseball Savant caps how many rows a single search returns, and a full month
# of league-wide pitches is far larger than that cap, so one big request comes
# back silently truncated. Query one day at a time and stack the results.
statcast_days <- as.character(
  seq(as.Date("2024-06-01"), as.Date("2024-06-30"), by = "day")
)

statcast_data <- map(statcast_days, function(day) {
  Sys.sleep(1)  # brief pause between requests to avoid hammering the server
  scrape_statcast_savant(
    start_date = day,
    end_date = day,
    player_type = "batter"
  )
}) %>%
  bind_rows()

# Quick look at the columns and their types
glimpse(statcast_data)

# Per pitch type: number of pitches thrown, mean release speed and mean spin
# rate (missing readings ignored), most frequently thrown pitch first
pitch_summary <- statcast_data %>%
  group_by(pitch_type, pitch_name) %>%
  summarise(
    count        = n(),
    avg_velocity = mean(release_speed, na.rm = TRUE),
    avg_spin     = mean(release_spin_rate, na.rm = TRUE),
    .groups      = "drop"
  ) %>%
  arrange(desc(count))

print(pitch_summary)

Player-Specific Statcast Analysis

# Look up player ID
judge_id <- playerid_lookup(last_name = "Judge", first_name = "Aaron")
print(judge_id)

# Get Judge's Statcast data for the 2024 season.
# The 2024 regular season opened on March 28, so start there; starting on
# March 30 would drop the first two games of the year.
judge_statcast <- scrape_statcast_savant_batter(
  start_date = "2024-03-28",
  end_date = "2024-10-01",
  batterid = 592450  # Judge's MLBAM ID; should match the lookup printed above
)

# Batted balls only: keep the columns of interest, then drop every pitch that
# has no recorded exit velocity
judge_batted_balls <- judge_statcast %>%
  select(
    game_date, pitch_name, launch_speed, launch_angle,
    hit_distance_sc, events, estimated_ba_using_speedangle
  ) %>%
  filter(!is.na(launch_speed))

# One-row summary: sample size, average and peak exit velocity, share of
# batted balls at 95+ mph, and average hit distance
judge_metrics <- summarise(
  judge_batted_balls,
  batted_balls  = n(),
  avg_exit_velo = mean(launch_speed, na.rm = TRUE),
  max_exit_velo = max(launch_speed, na.rm = TRUE),
  hard_hit_rate = mean(launch_speed >= 95, na.rm = TRUE),
  avg_distance  = mean(hit_distance_sc, na.rm = TRUE)
)

print(judge_metrics)

Creating Visualizations

# Exit velocity against launch angle for Judge's batted balls, coloured by
# outcome. The dashed red line sits at 95 mph, the same cut-off used for
# hard_hit_rate.
# NOTE(review): Statcast's barrel definition widens the launch-angle window as
# exit velocity rises above 98 mph; the 26-30 degree rectangle below marks only
# the narrowest part of that zone - confirm the "Barrel Zone" label is
# acceptable as a simplification.
ggplot(judge_batted_balls, aes(x = launch_angle, y = launch_speed)) +
  geom_point(aes(color = events), alpha = 0.6, size = 3) +
  geom_hline(yintercept = 95, linetype = "dashed", color = "red") +
  annotate(
    "rect",
    xmin = 26, xmax = 30,
    ymin = 98, ymax = 120,
    alpha = 0.2, fill = "gold"
  ) +
  annotate(
    "text",
    x = 28, y = 115,
    label = "Barrel Zone",
    color = "gold4", fontface = "bold"
  ) +
  labs(
    title = "Aaron Judge: Exit Velocity vs Launch Angle",
    subtitle = "2024 Season - Barrel zone highlighted",
    x = "Launch Angle (degrees)",
    y = "Exit Velocity (mph)",
    color = "Outcome"
  ) +
  theme_minimal()

MLB Stats API: Standings

# Standings for the 2024 season (the season is fixed here, not "current")
# NOTE(review): confirm which leagues mlb_standings() returns when no
# league_id is supplied - it may cover only one league.
standings_2024 <- mlb_standings(season = 2024)

# Keep the division-table columns, add run differential, then order teams
# within each division from best to worst winning percentage.
# NOTE(review): verify these column names against names(standings_2024); the
# MLB Stats API payload may use differently prefixed names.
division_standings <- standings_2024 %>%
  select(
    team_name, division_name, w, l, winning_percentage,
    games_back, runs_scored, runs_allowed
  ) %>%
  mutate(run_diff = runs_scored - runs_allowed) %>%
  arrange(division_name, desc(winning_percentage))

print(division_standings)

Best Practices

  • Rate Limiting: Split large requests into smaller ones (for example, one day of Statcast data at a time) and add Sys.sleep() between them
  • Data Caching: Save data locally with saveRDS() for repeated analysis
  • Handle Missing Data: Use na.rm = TRUE in calculations
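  • Filter Complete Cases: Remove rows with missing values in the columns you analyze (for example, filter(!is.na(launch_speed))) before computing metrics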

Key Takeaways

  • Unified data access: baseballr provides access to FanGraphs, Statcast, and MLB API data.
  • Tidyverse integration: Works seamlessly with dplyr, ggplot2, and other tidyverse packages.
  • Player ID lookup: The playerid_lookup() function maps player names to various ID systems.
  • Granular Statcast data: Access pitch-level tracking data for detailed analysis.
  • Active development: Package is regularly updated with new functions and data sources.

Discussion

Have questions or feedback? Join our community discussion on Discord or GitHub Discussions.