Football generates vast amounts of unstructured text data—scouting reports, game notes, media coverage, and social media commentary. Natural Language Processing (NLP) provides tools to extract insights from this text, automating aspects of scouting...
In This Chapter
Chapter 25: Natural Language Processing for Scouting
Introduction
Football generates vast amounts of unstructured text data—scouting reports, game notes, media coverage, and social media commentary. Natural Language Processing (NLP) provides tools to extract insights from this text, automating aspects of scouting, identifying sentiment patterns, and discovering hidden information in the written record. This chapter explores how NLP techniques apply to football analytics.
Learning Objectives
By the end of this chapter, you will be able to:
- Process and clean football text data
- Extract entities like players, teams, and statistics from text
- Classify scouting report sentiment and content
- Build topic models to discover discussion themes
- Create text-based player comparison systems
- Apply NLP to media analysis and sentiment tracking
25.1 Text Data in Football
25.1.1 Data Sources
FOOTBALL TEXT DATA SOURCES
==========================
Scouting Reports:
- Pro day evaluations
- Combine assessments
- Draft grades
- Game film notes
Media Content:
- Press conferences
- Post-game interviews
- Beat writer coverage
- National columnists
Social Media:
- Twitter/X analysis
- Fan sentiment
- Recruiting updates
- Transfer portal news
Official Communications:
- Injury reports
- Depth charts
- Roster moves
- Coach statements
25.1.2 Text Processing Pipeline
import re
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
class FootballTextProcessor:
    """Process and clean football-related text.

    Rule-based utilities for cleaning scouting/media text, extracting
    statistical mentions, detecting position references, and tokenizing
    with football-specific term handling.
    """

    # Generic words plus football-generic words dropped during tokenization.
    FOOTBALL_STOPWORDS = {
        'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for',
        'team', 'player', 'game', 'season', 'year'
    }

    # Full position name -> standard abbreviation.
    POSITION_PATTERNS = {
        'quarterback': 'QB',
        'running back': 'RB',
        'wide receiver': 'WR',
        'tight end': 'TE',
        'offensive line': 'OL',
        'defensive line': 'DL',
        'linebacker': 'LB',
        'cornerback': 'CB',
        'safety': 'S'
    }

    def __init__(self):
        # Matches "<number> <stat unit>", e.g. "300 yards", "3 touchdowns".
        self.stat_pattern = re.compile(
            r'(\d+)\s*(yards?|yds?|td|touchdowns?|catches?|receptions?|carries?|attempts?)',
            re.IGNORECASE
        )
        # Whole-word patterns for position abbreviations. A bare substring
        # test would let short abbreviations like 'S' (safety) or 'CB'
        # match inside ordinary words, producing false positives.
        self._abbrev_patterns = {
            abbrev: re.compile(r'\b' + re.escape(abbrev) + r'\b', re.IGNORECASE)
            for abbrev in self.POSITION_PATTERNS.values()
        }

    def clean_text(self, text: str) -> str:
        """Lowercase, strip special characters, and normalize whitespace."""
        text = text.lower()
        # Keep word chars, whitespace, hyphens, and periods
        # (preserves e.g. "a. smith" and "40-yard").
        text = re.sub(r'[^\w\s\-\.]', ' ', text)
        # Collapse runs of whitespace to single spaces.
        text = ' '.join(text.split())
        return text

    def extract_stats(self, text: str) -> List[Dict]:
        """Extract statistical mentions as {'value': int, 'type': str} dicts."""
        stats = []
        for value, stat_type in self.stat_pattern.findall(text):
            stats.append({
                'value': int(value),
                'type': self._normalize_stat_type(stat_type)
            })
        return stats

    def _normalize_stat_type(self, stat_type: str) -> str:
        """Normalize a raw stat-unit string to its standard plural form."""
        stat_type = stat_type.lower()
        mappings = {
            'yard': 'yards', 'yds': 'yards', 'yd': 'yards',
            'td': 'touchdowns', 'touchdown': 'touchdowns',
            'catch': 'receptions', 'reception': 'receptions',
            'carry': 'carries', 'attempt': 'attempts'
        }
        # Substring match so singular/plural variants map to one form.
        for pattern, normalized in mappings.items():
            if pattern in stat_type:
                return normalized
        return stat_type  # already normalized or unknown unit

    def extract_positions(self, text: str) -> List[str]:
        """Extract position mentions (full names or whole-word abbreviations).

        Returns de-duplicated abbreviations in arbitrary order.
        """
        positions = set()
        text_lower = text.lower()
        for pattern, abbrev in self.POSITION_PATTERNS.items():
            # Full names are distinctive enough for a substring test;
            # abbreviations must match as whole words (see __init__).
            if pattern in text_lower or self._abbrev_patterns[abbrev].search(text):
                positions.add(abbrev)
        return list(positions)

    def tokenize_for_football(self, text: str) -> List[str]:
        """Tokenize text, dropping stopwords and collapsing position bigrams."""
        text = self.clean_text(text)
        tokens = [t for t in text.split() if t not in self.FOOTBALL_STOPWORDS]
        return self._handle_football_terms(tokens)

    def _handle_football_terms(self, tokens: List[str]) -> List[str]:
        """Replace two-word position names (e.g. 'running back') with abbreviations."""
        result = []
        i = 0
        while i < len(tokens):
            # Check for multi-word position terms before emitting a token.
            if i < len(tokens) - 1:
                bigram = f"{tokens[i]} {tokens[i+1]}"
                if bigram in self.POSITION_PATTERNS:
                    result.append(self.POSITION_PATTERNS[bigram])
                    i += 2
                    continue
            result.append(tokens[i])
            i += 1
        return result
25.2 Named Entity Recognition
25.2.1 Football Entity Extraction
class FootballNER:
    """Named Entity Recognition for football text.

    Rule-based extraction of teams (alias dictionary), player names
    (capitalized-name regex), positions, and stats; plus simple linking
    of player names to a database.
    """

    def __init__(self):
        self.team_names = self._load_team_names()
        self.player_patterns = self._build_player_patterns()

    def _load_team_names(self) -> Dict[str, str]:
        """Load lowercase alias -> canonical team name mappings."""
        return {
            'ohio state': 'Ohio State',
            'buckeyes': 'Ohio State',
            'osu': 'Ohio State',
            'alabama': 'Alabama',
            'crimson tide': 'Alabama',
            'bama': 'Alabama',
            'georgia': 'Georgia',
            'bulldogs': 'Georgia',
            'uga': 'Georgia',
            # ... more teams
        }

    def _build_player_patterns(self) -> re.Pattern:
        """Build regex for player name detection.

        Matches "FirstName LastName" and "F. LastName". The alternation
        `[a-z]+\\.?|\\.` allows either a full first name (optionally
        period-terminated) or a bare single-letter initial with a period;
        a pattern requiring lowercase letters would never match "C. Stroud".
        """
        return re.compile(
            r'\b([A-Z](?:[a-z]+\.?|\.)\s+[A-Z][a-z]+)\b'
        )

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract all football entities (teams, players, positions, stats)."""
        entities = {
            'teams': [],
            'players': [],
            'positions': [],
            'stats': []
        }
        # Teams: substring scan over known aliases.
        text_lower = text.lower()
        for pattern, team in self.team_names.items():
            if pattern in text_lower:
                entities['teams'].append(team)
        # Players: de-duplicated regex candidates (may include false
        # positives such as capitalized two-word phrases).
        entities['players'] = list(set(self.player_patterns.findall(text)))
        # Positions and stats: delegate to the shared text processor.
        processor = FootballTextProcessor()
        entities['positions'] = processor.extract_positions(text)
        entities['stats'] = processor.extract_stats(text)
        return entities

    def link_entities(self,
                      entities: Dict,
                      player_database: pd.DataFrame) -> Dict:
        """Link extracted player names to database records.

        A player id is appended only when exactly one database row's name
        contains the extracted last name, avoiding ambiguous links.
        """
        linked = entities.copy()
        linked['player_ids'] = []
        for player_name in entities['players']:
            last_name = player_name.split()[-1]
            # regex=False: treat the last name as a literal string so
            # names containing regex metacharacters cannot break (or be
            # misinterpreted by) the match.
            matches = player_database[
                player_database['name'].str.contains(
                    last_name, case=False, regex=False
                )
            ]
            if len(matches) == 1:
                linked['player_ids'].append(matches.iloc[0]['player_id'])
        return linked
25.3 Scouting Report Analysis
25.3.1 Sentiment and Attribute Classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
class ScoutingReportAnalyzer:
    """Analyze scouting reports using NLP.

    Scores player attributes from keyword mentions weighted by the
    sentiment of the text surrounding those mentions, and derives an
    overall grade from either an explicit "grade:" mention or the
    attribute scores.
    """

    # Attribute -> indicator keywords scanned for in report text.
    ATTRIBUTE_KEYWORDS = {
        'arm_strength': ['arm', 'throw', 'cannon', 'velocity', 'zip'],
        'athleticism': ['athletic', 'explosive', 'agile', 'quick', 'fast'],
        'football_iq': ['smart', 'intelligent', 'reads', 'anticipation', 'instincts'],
        'leadership': ['leader', 'captain', 'vocal', 'commanding', 'presence'],
        'technique': ['technique', 'footwork', 'mechanics', 'fundamentals', 'polish']
    }

    # Lexicons used to judge the tone of text near attribute keywords.
    SENTIMENT_WORDS = {
        'positive': ['excellent', 'outstanding', 'elite', 'impressive', 'dominant',
                     'strong', 'exceptional', 'tremendous', 'great', 'solid'],
        'negative': ['concerns', 'struggles', 'inconsistent', 'limited', 'poor',
                     'weak', 'lacks', 'questions', 'below', 'average']
    }

    # Compiled once: explicit grade mentions such as "grade: 85.5".
    _GRADE_PATTERN = re.compile(r'grade[:\s]+(\d+\.?\d*)', re.IGNORECASE)

    def __init__(self):
        self.sentiment_classifier = None  # placeholder for a trained model
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 2),
            stop_words='english'
        )

    def analyze_attributes(self, report_text: str) -> Dict[str, float]:
        """Score player attributes based on report text.

        Each attribute score is (keyword mention count) x (sentiment of
        the text around those mentions, in [0, 1]); attributes with no
        keyword mentions score 0.
        """
        text_lower = report_text.lower()
        scores = {}
        for attribute, keywords in self.ATTRIBUTE_KEYWORDS.items():
            # Count keyword mentions (substring match, lowercased text).
            mentions = sum(1 for kw in keywords if kw in text_lower)
            # Sentiment from the context windows around those mentions.
            sentiment = self._get_attribute_sentiment(text_lower, keywords)
            scores[attribute] = mentions * sentiment
        return scores

    def _get_attribute_sentiment(self,
                                 text: str,
                                 keywords: List[str],
                                 window: int = 100) -> float:
        """Get sentiment for an attribute from the context around its keywords.

        Only sentiment words within `window` characters of a keyword
        occurrence are counted, so praise of one attribute does not
        inflate every other attribute's score. (The previous version
        ignored `keywords` and scored the whole document.) Returns 0.5
        (neutral) when no sentiment words appear near the keywords.
        """
        contexts = []
        for kw in keywords:
            idx = text.find(kw)
            while idx != -1:
                contexts.append(text[max(0, idx - window):idx + len(kw) + window])
                idx = text.find(kw, idx + 1)
        context_text = ' '.join(contexts)
        positive_count = sum(1 for w in self.SENTIMENT_WORDS['positive'] if w in context_text)
        negative_count = sum(1 for w in self.SENTIMENT_WORDS['negative'] if w in context_text)
        if positive_count + negative_count == 0:
            return 0.5
        return positive_count / (positive_count + negative_count)

    def calculate_overall_grade(self, report_text: str) -> Dict:
        """Calculate overall grade from a scouting report.

        Returns the first explicit "grade: NN" value (or None), an
        implied grade mapped from the mean attribute score onto a
        50-based scale, the per-attribute scores, and a confidence flag
        that is True only when an explicit grade was found.
        """
        attributes = self.analyze_attributes(report_text)
        grades = self._GRADE_PATTERN.findall(report_text)
        explicit_grade = float(grades[0]) if grades else None
        # Mean attribute score * 10 + 50 places a neutral report near 50.
        implied_grade = np.mean(list(attributes.values())) * 10 + 50
        return {
            'explicit_grade': explicit_grade,
            'implied_grade': implied_grade,
            'attributes': attributes,
            'confidence': len(grades) > 0
        }

    def compare_reports(self,
                        report1: str,
                        report2: str) -> Dict:
        """Compare two scouting reports attribute-by-attribute.

        Returns, per attribute, both players' scores and their difference
        (player1 - player2).
        """
        attrs1 = self.analyze_attributes(report1)
        attrs2 = self.analyze_attributes(report2)
        comparison = {}
        for attr in attrs1.keys():
            comparison[attr] = {
                'player1': attrs1[attr],
                'player2': attrs2[attr],
                'difference': attrs1[attr] - attrs2[attr]
            }
        return comparison
25.4 Topic Modeling
25.4.1 Discovering Discussion Themes
from sklearn.decomposition import LatentDirichletAllocation, NMF
class FootballTopicModeler:
    """Discover latent discussion themes in collections of football text.

    Wraps a TF-IDF vectorizer plus either an LDA or NMF decomposition,
    and auto-labels discovered topics with football-specific names.
    """

    def __init__(self, n_topics: int = 10):
        self.n_topics = n_topics
        self.vectorizer = TfidfVectorizer(
            max_features=2000,
            max_df=0.95,
            min_df=2,
            stop_words='english'
        )
        self.model = None  # set by fit()

    def fit(self, documents: List[str], method: str = 'lda'):
        """Fit a topic model to the documents ('lda' by default, else NMF)."""
        doc_term = self.vectorizer.fit_transform(documents)
        model_cls = LatentDirichletAllocation if method == 'lda' else NMF
        self.model = model_cls(
            n_components=self.n_topics,
            random_state=42
        )
        self.model.fit(doc_term)

    def get_topics(self, n_words: int = 10) -> List[Dict]:
        """Return each topic's id, its top n_words terms, and an auto label."""
        vocab = self.vectorizer.get_feature_names_out()
        topics = []
        for idx, weights in enumerate(self.model.components_):
            # Indices of the highest-weighted terms, best first.
            ranked = weights.argsort()[::-1][:n_words]
            words = [vocab[i] for i in ranked]
            topics.append({
                'topic_id': idx,
                'top_words': words,
                'label': self._auto_label_topic(words),
            })
        return topics

    def _auto_label_topic(self, words: List[str]) -> str:
        """Map top words to a known football theme; fall back to the top 3 joined."""
        labeled_themes = [
            (('quarterback', 'pass', 'throw'), 'Passing Game'),
            (('run', 'yards', 'carry'), 'Rushing Attack'),
            (('defense', 'tackle', 'pressure'), 'Defensive Performance'),
            (('recruit', 'commit', 'star'), 'Recruiting'),
            (('draft', 'pick', 'prospect'), 'NFL Draft'),
            (('injury', 'out', 'questionable'), 'Injuries'),
        ]
        word_set = set(words)
        for keywords, label in labeled_themes:
            if word_set.intersection(keywords):
                return label
        return ' / '.join(words[:3])

    def classify_document(self, document: str) -> Dict:
        """Return the dominant topic, full distribution, and confidence for one document."""
        dist = self.model.transform(self.vectorizer.transform([document]))[0]
        return {
            'dominant_topic': int(np.argmax(dist)),
            'topic_distribution': dist.tolist(),
            'confidence': float(np.max(dist)),
        }
25.5 Sentiment Analysis
25.5.1 Media Sentiment Tracking
class MediaSentimentTracker:
    """Track sentiment in football media coverage via word lexicons."""

    def __init__(self):
        # Small hand-built lexicons; extend for better coverage.
        self.positive_words = {
            'win', 'victory', 'excellent', 'dominant', 'impressive',
            'breakthrough', 'success', 'strong', 'outstanding'
        }
        self.negative_words = {
            'loss', 'defeat', 'struggle', 'concern', 'disappointing',
            'weak', 'poor', 'fail', 'mistake', 'turnover'
        }

    def analyze_sentiment(self, text: str) -> Dict:
        """Analyze sentiment of text.

        Tokenizes on word characters so punctuation-attached tokens
        ("win," / "victory.") still match the lexicons; a plain
        whitespace split would miss them. Score is the positive share of
        lexicon hits, 0.5 (neutral) when there are none.
        """
        words = re.findall(r"[\w']+", text.lower())
        positive = sum(1 for w in words if w in self.positive_words)
        negative = sum(1 for w in words if w in self.negative_words)
        if positive + negative == 0:
            sentiment = 0.5
        else:
            sentiment = positive / (positive + negative)
        return {
            'sentiment_score': sentiment,
            'positive_count': positive,
            'negative_count': negative,
            'classification': self._classify_sentiment(sentiment)
        }

    def _classify_sentiment(self, score: float) -> str:
        """Map a [0, 1] sentiment score onto a five-way category label."""
        if score >= 0.7:
            return 'very_positive'
        elif score >= 0.55:
            return 'positive'
        elif score >= 0.45:
            return 'neutral'
        elif score >= 0.3:
            return 'negative'
        else:
            return 'very_negative'

    def track_over_time(self,
                        articles: pd.DataFrame,
                        entity: str) -> pd.DataFrame:
        """Track sentiment for an entity over time.

        Scores every article whose 'text' mentions `entity`
        (case-insensitive substring) and returns one row per mention
        with the article date, score, and classification. Articles not
        mentioning the entity are skipped.
        """
        results = []
        for _, row in articles.iterrows():
            if entity.lower() in row['text'].lower():
                sentiment = self.analyze_sentiment(row['text'])
                results.append({
                    'date': row['date'],
                    'sentiment_score': sentiment['sentiment_score'],
                    'classification': sentiment['classification']
                })
        return pd.DataFrame(results)
Summary
NLP provides powerful tools for extracting insights from football's rich text data:
- Entity Recognition identifies players, teams, and statistics in text
- Scouting Analysis quantifies player attributes from written evaluations
- Topic Modeling discovers themes in large text collections
- Sentiment Analysis tracks media and fan perception over time
The key challenge is adapting general NLP techniques to football's specialized vocabulary and context.
Key Takeaways
- Football text data includes scouting reports, media, and social content
- Custom entity recognition handles football-specific terms
- Attribute extraction can quantify scouting report assessments
- Topic models reveal discussion patterns across large text collections
- Sentiment tracking monitors perception changes over time
Related Reading
Explore this topic in other books
- AI Engineering: Pretraining, Transfer Learning & NLP
- Sports Betting: NLP for Betting Intelligence
- Prediction Markets: NLP & Sentiment Analysis