Appendix B: SQL Quick Reference for Data Scientists

This reference covers the SQL patterns you actually use as a data scientist: window functions, CTEs, complex joins, and query optimization. PostgreSQL is the primary dialect. BigQuery differences are noted where they matter.

Window Functions

Window functions compute a value across a set of rows related to the current row, without collapsing the result set like GROUP BY. The core syntax:

function_name() OVER (
    PARTITION BY column(s)
    ORDER BY column(s)
    ROWS BETWEEN frame_start AND frame_end
)

Ranking Functions

-- ROW_NUMBER: unique sequential integer per partition
SELECT
    subscriber_id,
    event_date,
    ROW_NUMBER() OVER (
        PARTITION BY subscriber_id
        ORDER BY event_date DESC
    ) AS row_num
FROM events;

-- Keep only the most recent event per subscriber
WITH ranked AS (
    SELECT *,
        ROW_NUMBER() OVER (
            PARTITION BY subscriber_id ORDER BY event_date DESC
        ) AS rn
    FROM events
)
SELECT * FROM ranked WHERE rn = 1;

-- RANK: same rank for ties, gaps after ties
-- DENSE_RANK: same rank for ties, no gaps
SELECT
    subscriber_id,
    total_hours,
    RANK() OVER (ORDER BY total_hours DESC) AS rank,
    DENSE_RANK() OVER (ORDER BY total_hours DESC) AS dense_rank
FROM monthly_usage;

-- NTILE: distribute rows into N equal buckets
SELECT
    subscriber_id,
    total_hours,
    NTILE(10) OVER (ORDER BY total_hours) AS decile
FROM monthly_usage;

LAG and LEAD

-- Previous and next values without self-joins
SELECT
    subscriber_id,
    month,
    hours_watched,
    LAG(hours_watched, 1) OVER (
        PARTITION BY subscriber_id ORDER BY month
    ) AS prev_month_hours,
    LAG(hours_watched, 3) OVER (
        PARTITION BY subscriber_id ORDER BY month
    ) AS three_months_ago_hours,
    LEAD(hours_watched, 1) OVER (
        PARTITION BY subscriber_id ORDER BY month
    ) AS next_month_hours
FROM monthly_usage;

-- Month-over-month change
SELECT
    subscriber_id,
    month,
    hours_watched,
    hours_watched - LAG(hours_watched, 1) OVER (
        PARTITION BY subscriber_id ORDER BY month
    ) AS change_from_last_month,
    ROUND(
        (hours_watched - LAG(hours_watched, 1) OVER (
            PARTITION BY subscriber_id ORDER BY month
        ))::numeric
        / NULLIF(LAG(hours_watched, 1) OVER (
            PARTITION BY subscriber_id ORDER BY month
        ), 0) * 100, 1
    ) AS pct_change
FROM monthly_usage;

Running Aggregates

-- Running sum, average, count
SELECT
    subscriber_id,
    event_date,
    amount,
    SUM(amount) OVER (
        PARTITION BY subscriber_id
        ORDER BY event_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS cumulative_total,
    AVG(amount) OVER (
        PARTITION BY subscriber_id
        ORDER BY event_date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS rolling_3_avg,
    COUNT(*) OVER (
        PARTITION BY subscriber_id
        ORDER BY event_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_count
FROM billing_events;

-- Rolling 90-day window by date range (PostgreSQL)
SELECT
    subscriber_id,
    event_date,
    COUNT(*) OVER (
        PARTITION BY subscriber_id
        ORDER BY event_date
        RANGE BETWEEN INTERVAL '90 days' PRECEDING AND CURRENT ROW
    ) AS tickets_last_90d
FROM support_tickets;

Frame Specifications

ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW    -- All rows from start to current
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW            -- Sliding window: current + 2 prior
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING     -- Current to end
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING             -- Current + 1 before + 1 after
RANGE BETWEEN INTERVAL '30 days' PRECEDING AND CURRENT ROW  -- Date-based window (PostgreSQL)

FIRST_VALUE and LAST_VALUE

-- First and last values within a partition
SELECT
    subscriber_id,
    month,
    plan_type,
    FIRST_VALUE(plan_type) OVER (
        PARTITION BY subscriber_id ORDER BY month
    ) AS initial_plan,
    LAST_VALUE(plan_type) OVER (
        PARTITION BY subscriber_id
        ORDER BY month
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS current_plan
FROM subscription_history;

Common Table Expressions (CTEs)

CTEs make complex queries readable and debuggable by breaking them into named steps.

Basic CTE

WITH monthly_stats AS (
    SELECT
        subscriber_id,
        DATE_TRUNC('month', event_date) AS month,
        SUM(hours_watched) AS total_hours,
        COUNT(DISTINCT genre) AS genre_count,
        COUNT(DISTINCT device_type) AS device_count
    FROM viewing_events
    GROUP BY subscriber_id, DATE_TRUNC('month', event_date)
),
usage_trends AS (
    SELECT
        subscriber_id,
        month,
        total_hours,
        genre_count,
        device_count,
        LAG(total_hours, 1) OVER (
            PARTITION BY subscriber_id ORDER BY month
        ) AS prev_month_hours
    FROM monthly_stats
)
SELECT
    subscriber_id,
    month,
    total_hours,
    genre_count,
    device_count,
    total_hours - prev_month_hours AS usage_change,
    CASE
        WHEN total_hours < prev_month_hours * 0.5 THEN 'declining'
        WHEN total_hours > prev_month_hours * 1.5 THEN 'growing'
        ELSE 'stable'
    END AS trend_category
FROM usage_trends
WHERE month = '2024-03-01';

Recursive CTE

-- Generate a date series (useful when your database lacks generate_series)
WITH RECURSIVE dates AS (
    SELECT DATE '2024-01-01' AS dt
    UNION ALL
    SELECT dt + INTERVAL '1 day'
    FROM dates
    WHERE dt < '2024-12-31'
)
SELECT dt FROM dates;

-- Traverse a hierarchy (category tree)
WITH RECURSIVE category_tree AS (
    -- Anchor: root categories
    SELECT id, name, parent_id, 0 AS depth, name::text AS path
    FROM categories
    WHERE parent_id IS NULL

    UNION ALL

    -- Recursive: children
    SELECT c.id, c.name, c.parent_id, ct.depth + 1,
           ct.path || ' > ' || c.name
    FROM categories c
    JOIN category_tree ct ON c.parent_id = ct.id
)
SELECT * FROM category_tree ORDER BY path;

Complex Joins

Self-Join

-- Compare each subscriber's current month to their first month
SELECT
    current.subscriber_id,
    current.month,
    current.total_hours AS current_hours,
    first.total_hours AS first_month_hours,
    current.total_hours - first.total_hours AS hours_change_from_start
FROM monthly_usage current
JOIN (
    SELECT subscriber_id, total_hours
    FROM monthly_usage
    WHERE month_number = 1
) first ON current.subscriber_id = first.subscriber_id;

Anti-Join

-- Subscribers who have NEVER submitted a support ticket
SELECT s.subscriber_id, s.signup_date, s.plan_type
FROM subscribers s
LEFT JOIN support_tickets t
    ON s.subscriber_id = t.subscriber_id
WHERE t.ticket_id IS NULL;

-- Alternative with NOT EXISTS (often better performance)
SELECT s.subscriber_id, s.signup_date, s.plan_type
FROM subscribers s
WHERE NOT EXISTS (
    SELECT 1
    FROM support_tickets t
    WHERE t.subscriber_id = s.subscriber_id
);

Semi-Join

-- Subscribers who HAVE submitted at least one ticket (without duplicating rows)
SELECT s.subscriber_id, s.signup_date
FROM subscribers s
WHERE EXISTS (
    SELECT 1
    FROM support_tickets t
    WHERE t.subscriber_id = s.subscriber_id
);

Lateral Join (PostgreSQL)

-- For each subscriber, get their 3 most recent events
SELECT s.subscriber_id, e.event_date, e.event_type
FROM subscribers s
CROSS JOIN LATERAL (
    SELECT event_date, event_type
    FROM events
    WHERE events.subscriber_id = s.subscriber_id
    ORDER BY event_date DESC
    LIMIT 3
) e;

Conditional Aggregation (Pivot Pattern)

-- Count events by type per subscriber in one pass
SELECT
    subscriber_id,
    COUNT(*) FILTER (WHERE event_type = 'play') AS play_count,
    COUNT(*) FILTER (WHERE event_type = 'pause') AS pause_count,
    COUNT(*) FILTER (WHERE event_type = 'search') AS search_count,
    SUM(duration) FILTER (WHERE event_type = 'play') AS total_play_seconds
FROM events
WHERE event_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY subscriber_id;

BigQuery equivalent (FILTER not supported):

SELECT
    subscriber_id,
    COUNTIF(event_type = 'play') AS play_count,
    COUNTIF(event_type = 'pause') AS pause_count,
    COUNTIF(event_type = 'search') AS search_count,
    SUM(IF(event_type = 'play', duration, 0)) AS total_play_seconds
FROM events
WHERE event_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
GROUP BY subscriber_id;

Feature Engineering Patterns in SQL

Days Since Last Event

SELECT
    subscriber_id,
    CURRENT_DATE - MAX(event_date) AS days_since_last_event,
    CURRENT_DATE - MAX(CASE WHEN event_type = 'play'
                            THEN event_date END) AS days_since_last_play,
    CURRENT_DATE - MAX(CASE WHEN event_type = 'support_ticket'
                            THEN event_date END) AS days_since_last_ticket
FROM events
GROUP BY subscriber_id;

RFM Features

SELECT
    subscriber_id,
    -- Recency: days since last activity
    CURRENT_DATE - MAX(event_date) AS recency_days,
    -- Frequency: number of active days in last 90 days
    COUNT(DISTINCT event_date) FILTER (
        WHERE event_date >= CURRENT_DATE - INTERVAL '90 days'
    ) AS frequency_90d,
    -- Monetary: total revenue in last 12 months
    COALESCE(SUM(amount) FILTER (
        WHERE event_date >= CURRENT_DATE - INTERVAL '12 months'
    ), 0) AS monetary_12m
FROM events
LEFT JOIN billing ON events.subscriber_id = billing.subscriber_id
GROUP BY subscriber_id;

Session-Based Features

-- Define sessions as gaps > 30 minutes between events
WITH event_gaps AS (
    SELECT *,
        EXTRACT(EPOCH FROM (
            event_timestamp - LAG(event_timestamp) OVER (
                PARTITION BY subscriber_id ORDER BY event_timestamp
            )
        )) / 60 AS minutes_since_last
    FROM events
),
session_markers AS (
    SELECT *,
        SUM(CASE WHEN minutes_since_last > 30 OR minutes_since_last IS NULL
                 THEN 1 ELSE 0 END) OVER (
            PARTITION BY subscriber_id ORDER BY event_timestamp
        ) AS session_id
    FROM event_gaps
)
SELECT
    subscriber_id,
    COUNT(DISTINCT session_id) AS total_sessions,
    AVG(session_duration) AS avg_session_minutes
FROM (
    SELECT
        subscriber_id,
        session_id,
        EXTRACT(EPOCH FROM (
            MAX(event_timestamp) - MIN(event_timestamp)
        )) / 60 AS session_duration
    FROM session_markers
    GROUP BY subscriber_id, session_id
) sessions
GROUP BY subscriber_id;

Query Optimization

Reading Execution Plans

-- Show plan without executing
EXPLAIN
SELECT * FROM events WHERE subscriber_id = 12345;

-- Show plan with actual execution times
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM events WHERE subscriber_id = 12345;

What to look for:

Seq Scan         -- Full table scan. Slow on large tables. Add an index?
Index Scan       -- Using an index. Good.
Bitmap Index Scan -- Index scan batched into bitmap. Good for medium selectivity.
Hash Join        -- Join using hash table. Normal for equality joins.
Nested Loop      -- Row-by-row join. Slow if outer table is large.
Sort             -- Explicit sort. Check if an index could avoid this.

Indexing Strategies

-- Single-column index for equality and range filters
CREATE INDEX idx_events_subscriber ON events (subscriber_id);
CREATE INDEX idx_events_date ON events (event_date);

-- Composite index for common query patterns (order matters)
CREATE INDEX idx_events_sub_date ON events (subscriber_id, event_date);

-- Partial index for frequently filtered subsets
CREATE INDEX idx_active_subscribers ON subscribers (subscriber_id)
WHERE status = 'active';

-- Index for pattern matching
CREATE INDEX idx_name_trgm ON customers USING gin (name gin_trgm_ops);

Common Anti-Patterns

-- BAD: SELECT * from a wide table
SELECT * FROM events WHERE subscriber_id = 12345;

-- GOOD: Select only needed columns
SELECT event_date, event_type, duration
FROM events
WHERE subscriber_id = 12345;

-- BAD: Function on indexed column prevents index use
SELECT * FROM events WHERE DATE_TRUNC('month', event_date) = '2024-03-01';

-- GOOD: Use range comparison instead
SELECT * FROM events
WHERE event_date >= '2024-03-01' AND event_date < '2024-04-01';

-- BAD: OR on different columns prevents index use
SELECT * FROM events
WHERE subscriber_id = 123 OR event_type = 'churn';

-- GOOD: Use UNION ALL instead
SELECT * FROM events WHERE subscriber_id = 123
UNION ALL
SELECT * FROM events WHERE event_type = 'churn' AND subscriber_id != 123;

-- BAD: Correlated subquery executes once per row
SELECT s.*, (
    SELECT COUNT(*) FROM events e WHERE e.subscriber_id = s.subscriber_id
) AS event_count
FROM subscribers s;

-- GOOD: Join with pre-aggregated subquery
SELECT s.*, COALESCE(e.event_count, 0) AS event_count
FROM subscribers s
LEFT JOIN (
    SELECT subscriber_id, COUNT(*) AS event_count
    FROM events
    GROUP BY subscriber_id
) e ON s.subscriber_id = e.subscriber_id;

Materialized Views for Feature Tables

-- Create a materialized view for expensive feature extraction
CREATE MATERIALIZED VIEW subscriber_features AS
WITH usage AS ( ... ),
     tickets AS ( ... ),
     billing AS ( ... )
SELECT
    s.subscriber_id,
    u.avg_hours_30d,
    u.usage_trend,
    t.tickets_90d,
    b.lifetime_revenue,
    ...
FROM subscribers s
LEFT JOIN usage u ON s.subscriber_id = u.subscriber_id
LEFT JOIN tickets t ON s.subscriber_id = t.subscriber_id
LEFT JOIN billing b ON s.subscriber_id = b.subscriber_id;

-- Create index on the materialized view
CREATE INDEX idx_sub_features ON subscriber_features (subscriber_id);

-- Refresh on schedule (or trigger)
REFRESH MATERIALIZED VIEW CONCURRENTLY subscriber_features;

BigQuery Dialect Notes

PostgreSQL	BigQuery
`DATE_TRUNC('month', col)`	`DATE_TRUNC(col, MONTH)`
`col::date`	`CAST(col AS DATE)`
`INTERVAL '30 days'`	`INTERVAL 30 DAY`
`CURRENT_DATE - INTERVAL '30 days'`	`DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)`
`COUNT(*) FILTER (WHERE ...)`	`COUNTIF(...)`
`EXTRACT(EPOCH FROM interval)`	`TIMESTAMP_DIFF(ts1, ts2, SECOND)`
`GENERATE_SERIES(...)`	`UNNEST(GENERATE_DATE_ARRAY(...))`
`CREATE INDEX ...`	Not applicable (BigQuery is columnstore)
`EXPLAIN ANALYZE`	Query execution details in the UI console
Lateral join `CROSS JOIN LATERAL`	`CROSS JOIN UNNEST(ARRAY(...))`
`string \|\| string` concatenation	`CONCAT(string, string)`
`SERIAL` / `IDENTITY`	Auto-increment not supported; use `GENERATE_UUID()`

BigQuery optimizes differently from PostgreSQL. Key differences: no indexes (columnar storage handles it), partition tables by date for cost and performance, cluster tables for common filter columns, and avoid SELECT * aggressively since BigQuery charges by bytes scanned.

-- BigQuery: partition and cluster for performance
CREATE TABLE events
PARTITION BY DATE(event_date)
CLUSTER BY subscriber_id
AS SELECT * FROM raw_events;