Appendix B: SQL Quick Reference for Data Scientists
This reference covers the SQL patterns you actually use as a data scientist: window functions, CTEs, complex joins, and query optimization. PostgreSQL is the primary dialect. BigQuery differences are noted where they matter.
Window Functions
Window functions compute a value across a set of rows related to the current row, without collapsing the result set like GROUP BY. The core syntax:
function_name() OVER (
PARTITION BY column(s)
ORDER BY column(s)
ROWS BETWEEN frame_start AND frame_end
)
Ranking Functions
-- ROW_NUMBER: unique sequential integer per partition
SELECT
subscriber_id,
event_date,
ROW_NUMBER() OVER (
PARTITION BY subscriber_id
ORDER BY event_date DESC
) AS row_num
FROM events;
-- Keep only the most recent event per subscriber
WITH ranked AS (
SELECT *,
ROW_NUMBER() OVER (
PARTITION BY subscriber_id ORDER BY event_date DESC
) AS rn
FROM events
)
SELECT * FROM ranked WHERE rn = 1;
-- RANK: same rank for ties, gaps after ties
-- DENSE_RANK: same rank for ties, no gaps
SELECT
subscriber_id,
total_hours,
RANK() OVER (ORDER BY total_hours DESC) AS rank,
DENSE_RANK() OVER (ORDER BY total_hours DESC) AS dense_rank
FROM monthly_usage;
-- NTILE: distribute rows into N equal buckets
SELECT
subscriber_id,
total_hours,
NTILE(10) OVER (ORDER BY total_hours) AS decile
FROM monthly_usage;
LAG and LEAD
-- Previous and next values without self-joins
SELECT
subscriber_id,
month,
hours_watched,
LAG(hours_watched, 1) OVER (
PARTITION BY subscriber_id ORDER BY month
) AS prev_month_hours,
LAG(hours_watched, 3) OVER (
PARTITION BY subscriber_id ORDER BY month
) AS three_months_ago_hours,
LEAD(hours_watched, 1) OVER (
PARTITION BY subscriber_id ORDER BY month
) AS next_month_hours
FROM monthly_usage;
-- Month-over-month change
SELECT
subscriber_id,
month,
hours_watched,
hours_watched - LAG(hours_watched, 1) OVER (
PARTITION BY subscriber_id ORDER BY month
) AS change_from_last_month,
ROUND(
(hours_watched - LAG(hours_watched, 1) OVER (
PARTITION BY subscriber_id ORDER BY month
))::numeric
/ NULLIF(LAG(hours_watched, 1) OVER (
PARTITION BY subscriber_id ORDER BY month
), 0) * 100, 1
) AS pct_change
FROM monthly_usage;
Running Aggregates
-- Running sum, average, count
SELECT
subscriber_id,
event_date,
amount,
SUM(amount) OVER (
PARTITION BY subscriber_id
ORDER BY event_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS cumulative_total,
AVG(amount) OVER (
PARTITION BY subscriber_id
ORDER BY event_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) AS rolling_3_avg,
COUNT(*) OVER (
PARTITION BY subscriber_id
ORDER BY event_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS running_count
FROM billing_events;
-- Rolling 90-day window by date range (PostgreSQL)
SELECT
subscriber_id,
event_date,
COUNT(*) OVER (
PARTITION BY subscriber_id
ORDER BY event_date
RANGE BETWEEN INTERVAL '90 days' PRECEDING AND CURRENT ROW
) AS tickets_last_90d
FROM support_tickets;
Frame Specifications
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW -- All rows from start to current
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW -- Sliding window: current + 2 prior
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING -- Current to end
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING -- Current + 1 before + 1 after
RANGE BETWEEN INTERVAL '30 days' PRECEDING AND CURRENT ROW -- Date-based window (PostgreSQL)
FIRST_VALUE and LAST_VALUE
-- First and last values within a partition
SELECT
subscriber_id,
month,
plan_type,
FIRST_VALUE(plan_type) OVER (
PARTITION BY subscriber_id ORDER BY month
) AS initial_plan,
LAST_VALUE(plan_type) OVER (
PARTITION BY subscriber_id
ORDER BY month
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS current_plan
FROM subscription_history;
Common Table Expressions (CTEs)
CTEs make complex queries readable and debuggable by breaking them into named steps.
Basic CTE
WITH monthly_stats AS (
SELECT
subscriber_id,
DATE_TRUNC('month', event_date) AS month,
SUM(hours_watched) AS total_hours,
COUNT(DISTINCT genre) AS genre_count,
COUNT(DISTINCT device_type) AS device_count
FROM viewing_events
GROUP BY subscriber_id, DATE_TRUNC('month', event_date)
),
usage_trends AS (
SELECT
subscriber_id,
month,
total_hours,
genre_count,
device_count,
LAG(total_hours, 1) OVER (
PARTITION BY subscriber_id ORDER BY month
) AS prev_month_hours
FROM monthly_stats
)
SELECT
subscriber_id,
month,
total_hours,
genre_count,
device_count,
total_hours - prev_month_hours AS usage_change,
CASE
WHEN total_hours < prev_month_hours * 0.5 THEN 'declining'
WHEN total_hours > prev_month_hours * 1.5 THEN 'growing'
ELSE 'stable'
END AS trend_category
FROM usage_trends
WHERE month = '2024-03-01';
Recursive CTE
-- Generate a date series (useful when your database lacks generate_series)
WITH RECURSIVE dates AS (
SELECT DATE '2024-01-01' AS dt
UNION ALL
SELECT dt + INTERVAL '1 day'
FROM dates
WHERE dt < '2024-12-31'
)
SELECT dt FROM dates;
-- Traverse a hierarchy (category tree)
WITH RECURSIVE category_tree AS (
-- Anchor: root categories
SELECT id, name, parent_id, 0 AS depth, name::text AS path
FROM categories
WHERE parent_id IS NULL
UNION ALL
-- Recursive: children
SELECT c.id, c.name, c.parent_id, ct.depth + 1,
ct.path || ' > ' || c.name
FROM categories c
JOIN category_tree ct ON c.parent_id = ct.id
)
SELECT * FROM category_tree ORDER BY path;
Complex Joins
Self-Join
-- Compare each subscriber's current month to their first month
SELECT
current.subscriber_id,
current.month,
current.total_hours AS current_hours,
first.total_hours AS first_month_hours,
current.total_hours - first.total_hours AS hours_change_from_start
FROM monthly_usage current
JOIN (
SELECT subscriber_id, total_hours
FROM monthly_usage
WHERE month_number = 1
) first ON current.subscriber_id = first.subscriber_id;
Anti-Join
-- Subscribers who have NEVER submitted a support ticket
SELECT s.subscriber_id, s.signup_date, s.plan_type
FROM subscribers s
LEFT JOIN support_tickets t
ON s.subscriber_id = t.subscriber_id
WHERE t.ticket_id IS NULL;
-- Alternative with NOT EXISTS (often better performance)
SELECT s.subscriber_id, s.signup_date, s.plan_type
FROM subscribers s
WHERE NOT EXISTS (
SELECT 1
FROM support_tickets t
WHERE t.subscriber_id = s.subscriber_id
);
Semi-Join
-- Subscribers who HAVE submitted at least one ticket (without duplicating rows)
SELECT s.subscriber_id, s.signup_date
FROM subscribers s
WHERE EXISTS (
SELECT 1
FROM support_tickets t
WHERE t.subscriber_id = s.subscriber_id
);
Lateral Join (PostgreSQL)
-- For each subscriber, get their 3 most recent events
SELECT s.subscriber_id, e.event_date, e.event_type
FROM subscribers s
CROSS JOIN LATERAL (
SELECT event_date, event_type
FROM events
WHERE events.subscriber_id = s.subscriber_id
ORDER BY event_date DESC
LIMIT 3
) e;
Conditional Aggregation (Pivot Pattern)
-- Count events by type per subscriber in one pass
SELECT
subscriber_id,
COUNT(*) FILTER (WHERE event_type = 'play') AS play_count,
COUNT(*) FILTER (WHERE event_type = 'pause') AS pause_count,
COUNT(*) FILTER (WHERE event_type = 'search') AS search_count,
SUM(duration) FILTER (WHERE event_type = 'play') AS total_play_seconds
FROM events
WHERE event_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY subscriber_id;
BigQuery equivalent (FILTER not supported):
SELECT
subscriber_id,
COUNTIF(event_type = 'play') AS play_count,
COUNTIF(event_type = 'pause') AS pause_count,
COUNTIF(event_type = 'search') AS search_count,
SUM(IF(event_type = 'play', duration, 0)) AS total_play_seconds
FROM events
WHERE event_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
GROUP BY subscriber_id;
Feature Engineering Patterns in SQL
Days Since Last Event
SELECT
subscriber_id,
CURRENT_DATE - MAX(event_date) AS days_since_last_event,
CURRENT_DATE - MAX(CASE WHEN event_type = 'play'
THEN event_date END) AS days_since_last_play,
CURRENT_DATE - MAX(CASE WHEN event_type = 'support_ticket'
THEN event_date END) AS days_since_last_ticket
FROM events
GROUP BY subscriber_id;
RFM Features
SELECT
subscriber_id,
-- Recency: days since last activity
CURRENT_DATE - MAX(event_date) AS recency_days,
-- Frequency: number of active days in last 90 days
COUNT(DISTINCT event_date) FILTER (
WHERE event_date >= CURRENT_DATE - INTERVAL '90 days'
) AS frequency_90d,
-- Monetary: total revenue in last 12 months
COALESCE(SUM(amount) FILTER (
WHERE event_date >= CURRENT_DATE - INTERVAL '12 months'
), 0) AS monetary_12m
FROM events
LEFT JOIN billing ON events.subscriber_id = billing.subscriber_id
GROUP BY subscriber_id;
Session-Based Features
-- Define sessions as gaps > 30 minutes between events
WITH event_gaps AS (
SELECT *,
EXTRACT(EPOCH FROM (
event_timestamp - LAG(event_timestamp) OVER (
PARTITION BY subscriber_id ORDER BY event_timestamp
)
)) / 60 AS minutes_since_last
FROM events
),
session_markers AS (
SELECT *,
SUM(CASE WHEN minutes_since_last > 30 OR minutes_since_last IS NULL
THEN 1 ELSE 0 END) OVER (
PARTITION BY subscriber_id ORDER BY event_timestamp
) AS session_id
FROM event_gaps
)
SELECT
subscriber_id,
COUNT(DISTINCT session_id) AS total_sessions,
AVG(session_duration) AS avg_session_minutes
FROM (
SELECT
subscriber_id,
session_id,
EXTRACT(EPOCH FROM (
MAX(event_timestamp) - MIN(event_timestamp)
)) / 60 AS session_duration
FROM session_markers
GROUP BY subscriber_id, session_id
) sessions
GROUP BY subscriber_id;
Query Optimization
Reading Execution Plans
-- Show plan without executing
EXPLAIN
SELECT * FROM events WHERE subscriber_id = 12345;
-- Show plan with actual execution times
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM events WHERE subscriber_id = 12345;
What to look for:
Seq Scan -- Full table scan. Slow on large tables. Add an index?
Index Scan -- Using an index. Good.
Bitmap Index Scan -- Index scan batched into bitmap. Good for medium selectivity.
Hash Join -- Join using hash table. Normal for equality joins.
Nested Loop -- Row-by-row join. Slow if outer table is large.
Sort -- Explicit sort. Check if an index could avoid this.
Indexing Strategies
-- Single-column index for equality and range filters
CREATE INDEX idx_events_subscriber ON events (subscriber_id);
CREATE INDEX idx_events_date ON events (event_date);
-- Composite index for common query patterns (order matters)
CREATE INDEX idx_events_sub_date ON events (subscriber_id, event_date);
-- Partial index for frequently filtered subsets
CREATE INDEX idx_active_subscribers ON subscribers (subscriber_id)
WHERE status = 'active';
-- Index for pattern matching
CREATE INDEX idx_name_trgm ON customers USING gin (name gin_trgm_ops);
Common Anti-Patterns
-- BAD: SELECT * from a wide table
SELECT * FROM events WHERE subscriber_id = 12345;
-- GOOD: Select only needed columns
SELECT event_date, event_type, duration
FROM events
WHERE subscriber_id = 12345;
-- BAD: Function on indexed column prevents index use
SELECT * FROM events WHERE DATE_TRUNC('month', event_date) = '2024-03-01';
-- GOOD: Use range comparison instead
SELECT * FROM events
WHERE event_date >= '2024-03-01' AND event_date < '2024-04-01';
-- BAD: OR on different columns prevents index use
SELECT * FROM events
WHERE subscriber_id = 123 OR event_type = 'churn';
-- GOOD: Use UNION ALL instead
SELECT * FROM events WHERE subscriber_id = 123
UNION ALL
SELECT * FROM events WHERE event_type = 'churn' AND subscriber_id != 123;
-- BAD: Correlated subquery executes once per row
SELECT s.*, (
SELECT COUNT(*) FROM events e WHERE e.subscriber_id = s.subscriber_id
) AS event_count
FROM subscribers s;
-- GOOD: Join with pre-aggregated subquery
SELECT s.*, COALESCE(e.event_count, 0) AS event_count
FROM subscribers s
LEFT JOIN (
SELECT subscriber_id, COUNT(*) AS event_count
FROM events
GROUP BY subscriber_id
) e ON s.subscriber_id = e.subscriber_id;
Materialized Views for Feature Tables
-- Create a materialized view for expensive feature extraction
CREATE MATERIALIZED VIEW subscriber_features AS
WITH usage AS ( ... ),
tickets AS ( ... ),
billing AS ( ... )
SELECT
s.subscriber_id,
u.avg_hours_30d,
u.usage_trend,
t.tickets_90d,
b.lifetime_revenue,
...
FROM subscribers s
LEFT JOIN usage u ON s.subscriber_id = u.subscriber_id
LEFT JOIN tickets t ON s.subscriber_id = t.subscriber_id
LEFT JOIN billing b ON s.subscriber_id = b.subscriber_id;
-- Create index on the materialized view
CREATE INDEX idx_sub_features ON subscriber_features (subscriber_id);
-- Refresh on schedule (or trigger)
REFRESH MATERIALIZED VIEW CONCURRENTLY subscriber_features;
BigQuery Dialect Notes
| PostgreSQL | BigQuery |
|---|---|
DATE_TRUNC('month', col) |
DATE_TRUNC(col, MONTH) |
col::date |
CAST(col AS DATE) |
INTERVAL '30 days' |
INTERVAL 30 DAY |
CURRENT_DATE - INTERVAL '30 days' |
DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY) |
COUNT(*) FILTER (WHERE ...) |
COUNTIF(...) |
EXTRACT(EPOCH FROM interval) |
TIMESTAMP_DIFF(ts1, ts2, SECOND) |
GENERATE_SERIES(...) |
UNNEST(GENERATE_DATE_ARRAY(...)) |
CREATE INDEX ... |
Not applicable (BigQuery is columnstore) |
EXPLAIN ANALYZE |
Query execution details in the UI console |
Lateral join CROSS JOIN LATERAL |
CROSS JOIN UNNEST(ARRAY(...)) |
string || string concatenation |
CONCAT(string, string) |
SERIAL / IDENTITY |
Auto-increment not supported; use GENERATE_UUID() |
BigQuery optimizes differently from PostgreSQL. Key differences: no indexes (columnar storage handles it), partition tables by date for cost and performance, cluster tables for common filter columns, and avoid SELECT * aggressively since BigQuery charges by bytes scanned.
-- BigQuery: partition and cluster for performance
CREATE TABLE events
PARTITION BY DATE(event_date)
CLUSTER BY subscriber_id
AS SELECT * FROM raw_events;