Case Study 22-1: Measuring Cache Effects with RDTSC
Objective
Build a complete cache-effect measurement tool using inline assembly RDTSC/RDTSCP for timing, CLFLUSH for cache eviction, and MFENCE for memory ordering. The tool will measure and display L1 cache hit latency, L2 latency, L3 latency, and main memory (DRAM) latency — all using hardware timing instructions unavailable from C without inline assembly.
Background: The Memory Hierarchy
Modern x86-64 systems have a multi-level cache hierarchy:
CPU Core
├── L1 Data Cache ~4 cycles latency 32-64 KB per core
├── L2 Cache ~12 cycles latency 256-512 KB per core
├── L3 Cache (LLC) ~40 cycles latency 4-32 MB shared
└── Main Memory (DRAM) ~200-300 cycles GBs off-chip
Measuring these levels requires:
1. RDTSCP + LFENCE for accurate cycle counting
2. CLFLUSH to force a specific cache line out of all levels
3. MFENCE to ensure CLFLUSH completes before timing begins
4. Careful load ordering to ensure the measured access is actually to the target memory level
The Complete Tool: cache_measure.c
// cache_measure.c
// Measure memory hierarchy latency using RDTSC/RDTSCP/CLFLUSH inline assembly
// Compile: gcc -O2 -o cache_measure cache_measure.c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// ============================================================
// Inline Assembly Primitives
// ============================================================
// Read TSC after a serializing CPUID (measures start of region).
// Returns the full 64-bit time-stamp counter value.
static inline uint64_t rdtsc_start(void) {
    uint32_t lo, hi;
    // CPUID serializes: all prior instructions complete before RDTSC reads.
    // EAX is passed explicitly because CPUID's leaf selector must be a
    // defined value — the original executed CPUID with whatever happened
    // to be in EAX. Using "=a"/"=d" outputs also drops the two redundant
    // MOVs: RDTSC already leaves the result in EAX:EDX.
    // The "memory" clobber keeps the compiler from moving the measured
    // memory access across this asm (register clobbers alone do not
    // order memory operations).
    asm volatile (
        "cpuid\n\t"
        "rdtsc"
        : "=a"(lo), "=d"(hi)        // RDTSC: low 32 bits in EAX, high in EDX
        : "a"(0)                    // defined CPUID leaf
        : "%rbx", "%rcx", "memory"  // CPUID also writes EBX/ECX
    );
    return ((uint64_t)hi << 32) | lo;
}
// Read TSC using RDTSCP (serialized on the read side), then LFENCE.
// Returns the full 64-bit time-stamp counter value.
static inline uint64_t rdtsc_stop(void) {
    uint32_t lo, hi;
    // RDTSCP waits for all prior instructions to complete before reading
    // the TSC; the trailing LFENCE prevents later instructions from
    // starting before RDTSCP retires. "=a"/"=d" outputs replace the
    // redundant MOVs of the original. The "memory" clobber keeps the
    // compiler from hoisting the measured memory access below this asm
    // (register clobbers alone do not order memory operations).
    asm volatile (
        "rdtscp\n\t"
        "lfence"
        : "=a"(lo), "=d"(hi)
        :
        : "%rcx", "memory"  // RDTSCP writes IA32_TSC_AUX into ECX
    );
    return ((uint64_t)hi << 32) | lo;
}
// Flush a cache line containing 'addr' from all cache levels.
// CLFLUSH operates on the whole line holding 'addr' (64 bytes on x86,
// per the experiment section below), writing it back first if dirty.
static inline void clflush(const void *addr) {
asm volatile (
"clflush %0"
// No outputs.
:
// "m" makes the byte at 'addr' the memory operand, so GCC emits a valid
// addressing mode for CLFLUSH; the dereference is never an actual load.
: "m"(*(const char*)addr)
// "memory" clobber: the compiler may not keep values cached in registers
// across the flush, nor reorder memory accesses around it.
: "memory"
);
}
// Full memory fence.
// MFENCE makes all earlier loads and stores globally visible before any
// later ones execute; callers use it after clflush() so the eviction has
// completed before a timed access begins. The "memory" clobber is the
// matching compiler-side barrier.
static inline void mfence(void) {
asm volatile ("mfence" ::: "memory");
}
// Compiler barrier only (no hardware fence instruction emitted).
// The empty asm with a "memory" clobber forces GCC to discard values
// cached in registers and forbids moving memory accesses across this
// point. The CPU itself may still reorder at runtime.
static inline void compiler_barrier(void) {
asm volatile ("" ::: "memory");
}
// ============================================================
// Measurement Functions
// ============================================================
// Time a single load from 'ptr', which the caller has kept cache-hot.
// Returns the raw cycle delta, which still includes the cost of the
// timing sequence itself (see the overhead test in main).
static uint64_t measure_hot(volatile uint64_t *ptr) {
    uint64_t t0 = rdtsc_start();
    uint64_t loaded = *ptr;   // the access being measured (volatile: cannot be elided)
    uint64_t t1 = rdtsc_stop();
    compiler_barrier();       // fence the compiler around the measured region
    (void)loaded;             // the value is irrelevant; the load itself is the point
    return t1 - t0;
}
// Evict ptr's line from every cache level, then time the reload.
// MFENCE orders the CLFLUSH ahead of the timed region, so the measured
// access is genuinely cold. Returns the raw cycle delta.
static uint64_t measure_cold(volatile uint64_t *ptr) {
    clflush((const void *)ptr);
    mfence();                 // CLFLUSH must complete before timing starts
    uint64_t t0 = rdtsc_start();
    uint64_t loaded = *ptr;   // cold access: serviced from DRAM
    uint64_t t1 = rdtsc_stop();
    compiler_barrier();
    (void)loaded;
    return t1 - t0;
}
// ============================================================
// Statistical Helpers
// ============================================================
#define ITERATIONS 200
#define WARMUP 20
// qsort comparator for uint64_t.
// Uses explicit comparisons rather than subtraction, which would
// overflow/truncate when narrowing a 64-bit difference to int.
static int cmp_uint64(const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;
    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}
// Sort 'arr' in place and return its upper-middle element.
// (For even n this is the higher of the two central values — adequate
// for latency summaries, where exact interpolation adds nothing.)
static uint64_t median(uint64_t *arr, int n) {
    qsort(arr, (size_t)n, sizeof arr[0], cmp_uint64);
    return arr[n / 2];
}
static uint64_t percentile(uint64_t *arr, int n, int p) {
// arr must already be sorted
return arr[(n * p) / 100];
}
// ============================================================
// Main Measurement Routine
// ============================================================
// Entry point: measures and prints approximate L1/L2/L3/DRAM access
// latencies plus the overhead of the timing sequence itself. All printed
// numbers are raw RDTSC deltas and therefore include that overhead.
// Fixes vs. original: removed the unused 'valid_count' variable, dropped
// the unnecessary cast on malloc, and silenced the otherwise-unused
// L3_SIZE constant.
int main(void) {
    // Working-set sizing. 64 MB comfortably exceeds any L3, so the
    // CLFLUSH test's reload is guaranteed to come from DRAM.
    const size_t L1_SIZE = 32 * 1024UL;          // 32 KB (typical per-core L1d)
    const size_t L2_SIZE = 512 * 1024UL;         // 512 KB (typical per-core L2)
    const size_t L3_SIZE = 8 * 1024 * 1024UL;    // 8 MB (conservative)
    const size_t MEM_SIZE = 64 * 1024 * 1024UL;  // 64 MB
    (void)L3_SIZE; // kept for documentation; the eviction loops are sized from L2_SIZE

    uint8_t *mem = malloc(MEM_SIZE); // no cast needed in C
    if (!mem) {
        fprintf(stderr, "malloc failed\n");
        return 1;
    }
    memset(mem, 0xAB, MEM_SIZE); // fault in every page so no timed access pays a page fault

    uint64_t times[ITERATIONS];

    printf("Cache Latency Measurement Tool\n");
    printf("===============================\n\n");

    // -------------------------------------------------------
    // Test 1: L1 Cache Hit (access same address repeatedly)
    // -------------------------------------------------------
    {
        volatile uint64_t *ptr = (volatile uint64_t *)(mem + 0);
        // Warm-up: pulls the line into L1 and lets clock frequency settle.
        for (int i = 0; i < WARMUP; i++) {
            (void)measure_hot(ptr);
        }
        for (int i = 0; i < ITERATIONS; i++) {
            times[i] = measure_hot(ptr);
        }
        qsort(times, ITERATIONS, sizeof(uint64_t), cmp_uint64);
        printf("L1 Cache (hot):\n");
        printf(" Median: %4llu cycles\n", (unsigned long long)median(times, ITERATIONS));
        printf(" P95: %4llu cycles\n", (unsigned long long)percentile(times, ITERATIONS, 95));
        printf(" Min: %4llu cycles\n", (unsigned long long)times[0]);
        printf(" Max: %4llu cycles\n", (unsigned long long)times[ITERATIONS-1]);
        printf("\n");
    }

    // -------------------------------------------------------
    // Test 2: L2 Cache (access addresses that fit in L2 but not L1)
    // Striding 64-byte lines over 2x L1_SIZE evicts element 0 from L1
    // while its line remains resident in the much larger L2.
    // -------------------------------------------------------
    {
        volatile uint64_t *ptr = (volatile uint64_t *)(mem + 0);
        for (int i = 0; i < ITERATIONS; i++) {
            // One strided pass fills L1 without exceeding L2.
            volatile uint64_t sum = 0;
            for (size_t j = 0; j < L1_SIZE * 2; j += 64) {
                sum += *(volatile uint64_t *)(mem + j);
            }
            (void)sum;
            times[i] = measure_hot(ptr); // ptr should now be serviced from L2
        }
        qsort(times, ITERATIONS, sizeof(uint64_t), cmp_uint64);
        printf("L2 Cache (approximate — evicted from L1):\n");
        printf(" Median: %4llu cycles\n", (unsigned long long)median(times, ITERATIONS));
        printf(" P95: %4llu cycles\n", (unsigned long long)percentile(times, ITERATIONS, 95));
        printf("\n");
    }

    // -------------------------------------------------------
    // Test 3: L3 Cache (access addresses within L3, outside L2)
    // Striding over 2x L2_SIZE evicts element 0 from L2; the line
    // should still be present in the shared L3.
    // -------------------------------------------------------
    {
        volatile uint64_t *ptr = (volatile uint64_t *)(mem + 0);
        for (int i = 0; i < ITERATIONS; i++) {
            volatile uint64_t sum = 0;
            for (size_t j = 0; j < L2_SIZE * 2; j += 64) {
                sum += *(volatile uint64_t *)(mem + j);
            }
            (void)sum;
            times[i] = measure_hot(ptr); // ptr should now be serviced from L3
        }
        qsort(times, ITERATIONS, sizeof(uint64_t), cmp_uint64);
        printf("L3 Cache (approximate — evicted from L2):\n");
        printf(" Median: %4llu cycles\n", (unsigned long long)median(times, ITERATIONS));
        printf(" P95: %4llu cycles\n", (unsigned long long)percentile(times, ITERATIONS, 95));
        printf("\n");
    }

    // -------------------------------------------------------
    // Test 4: DRAM (CLFLUSH to evict from all caches)
    // -------------------------------------------------------
    {
        volatile uint64_t *ptr = (volatile uint64_t *)(mem + 0);
        for (int i = 0; i < WARMUP; i++) {
            (void)measure_cold(ptr);
        }
        for (int i = 0; i < ITERATIONS; i++) {
            times[i] = measure_cold(ptr);
        }
        qsort(times, ITERATIONS, sizeof(uint64_t), cmp_uint64);
        printf("DRAM (CLFLUSH eviction):\n");
        printf(" Median: %4llu cycles\n", (unsigned long long)median(times, ITERATIONS));
        printf(" P95: %4llu cycles\n", (unsigned long long)percentile(times, ITERATIONS, 95));
        printf(" Min: %4llu cycles\n", (unsigned long long)times[0]);
        printf(" Max: %4llu cycles\n", (unsigned long long)times[ITERATIONS-1]);
        printf("\n");
    }

    // -------------------------------------------------------
    // Test 5: RDTSC Measurement Overhead
    // Cost of rdtsc_start() + rdtsc_stop() with no work between them;
    // subtract this from the other results to estimate true latencies.
    // -------------------------------------------------------
    {
        for (int i = 0; i < WARMUP; i++) {
            uint64_t start = rdtsc_start();
            uint64_t end = rdtsc_stop();
            (void)(end - start);
        }
        for (int i = 0; i < ITERATIONS; i++) {
            uint64_t start = rdtsc_start();
            uint64_t end = rdtsc_stop();
            times[i] = end - start;
        }
        qsort(times, ITERATIONS, sizeof(uint64_t), cmp_uint64);
        printf("RDTSC overhead (subtract from other measurements):\n");
        printf(" Median: %4llu cycles\n", (unsigned long long)median(times, ITERATIONS));
        printf(" Min: %4llu cycles\n", (unsigned long long)times[0]);
        printf("\n");
    }

    free(mem);
    return 0;
}
Compiling and Running
gcc -O2 -o cache_measure cache_measure.c
./cache_measure
Sample output on a modern Intel Core i7 (Skylake, 3.6 GHz):
Cache Latency Measurement Tool
===============================
L1 Cache (hot):
Median: 4 cycles
P95: 6 cycles
Min: 4 cycles
Max: 18 cycles
L2 Cache (approximate — evicted from L1):
Median: 12 cycles
P95: 15 cycles
L3 Cache (approximate — evicted from L2):
Median: 38 cycles
P95: 45 cycles
DRAM (CLFLUSH eviction):
Median: 245 cycles
P95: 312 cycles
Min: 198 cycles
Max: 489 cycles
RDTSC overhead (subtract from other measurements):
Median: 22 cycles
Min: 20 cycles
Important: The RDTSC overhead (22 cycles in this example) is the cost of the CPUID + RDTSC + RDTSCP + LFENCE measurement sequence itself, and every raw timing the tool collects includes it. To estimate a true access latency, subtract the median overhead from the raw reading: a raw measurement of roughly 26 cycles for a hot access corresponds to a true L1 latency of about 4 cycles. Because the overhead is nearly constant from run to run, this subtraction yields stable figures that can be compared across memory-hierarchy levels.
Analysis: Dissecting the Inline Assembly
Why CPUID Before RDTSC (Start)?
asm volatile (
"cpuid\n\t"
"rdtsc\n\t"
...
: : : "%rax", "%rbx", "%rcx", "%rdx"
);
CPUID is a serializing instruction: it forces the processor to complete all prior instructions before executing CPUID. Without it, out-of-order execution could leave instructions from before the timed region still in flight when RDTSC samples the counter — or allow the measured access to begin before the counter is sampled — corrupting the measurement in either direction.
Alternative: Intel recommends LFENCE; RDTSC for the start. LFENCE prevents later loads from executing before LFENCE completes. The CPUID approach is more conservative (fully serializing), but CPUID takes 100+ cycles vs. LFENCE's ~1 cycle.
Why RDTSCP + LFENCE (Stop)?
asm volatile (
"rdtscp\n\t"
...
"lfence"
: : : "%rax", "%rcx", "%rdx"
);
RDTSCP waits for all prior instructions to complete before reading the TSC — it serializes on the load side. The subsequent LFENCE prevents instructions after the stop measurement from being reordered before RDTSCP. This ensures the measured access is fully committed before we read the stop time.
Why "m"(*(const char*)addr) for CLFLUSH?
asm volatile (
"clflush %0"
:
: "m"(*(const char*)addr)
: "memory"
);
The "m" constraint tells the compiler: "the operand %0 is a memory location." The compiler generates the correct memory addressing syntax for CLFLUSH. The *(const char*)addr dereference is a C trick: we're not actually loading the value, but telling GCC that the memory at addr is the operand. The "memory" clobber ensures the compiler does not move cached values across the CLFLUSH boundary.
Experiment: Seeing Cache Lines
The x86 cache line size is 64 bytes. Modify the tool to measure accesses at various strides and observe the effect:
// Measure latency at different offsets within the same cache line
printf("Stride within cache line (64B):\n");
for (int offset = 0; offset < 64; offset += 8) {
volatile uint64_t *ptr = (volatile uint64_t *)(mem + offset);
clflush((void *)(mem + 0)); // Flush the whole cache line (covers offset 0-63)
mfence();
uint64_t start = rdtsc_start();
volatile uint64_t v = *ptr;
uint64_t end = rdtsc_stop();
(void)v;
printf(" Offset %2d: %llu cycles\n", offset, (unsigned long long)(end - start));
}
Expected result: All offsets within the same 64-byte cache line show identical latency — CLFLUSH evicts the entire 64-byte line, and loading any byte in that line causes the entire line to be fetched from DRAM. This is the cache line granularity in action.
Experiment: Prefetching
x86 has software prefetch instructions. Measure the effect of PREFETCHT0:
// Prefetch into L1 cache
asm volatile ("prefetcht0 (%0)" : : "r"(ptr) : "memory");
// Wait some cycles for the prefetch to complete
for (volatile int i = 0; i < 100; i++);
// Now measure — should be L1 latency despite not having accessed ptr before
uint64_t latency = measure_hot(ptr);
Compare to the CLFLUSH result. If the prefetch worked, you should see ~4 cycle (L1) latency even though CLFLUSH evicted the line.
Security Connection: Spectre and Cache Timing
The CLFLUSH + RDTSCP measurement technique is the foundation of Flush+Reload, a cache side-channel attack and a key building block of Spectre and Meltdown exploits:
- Flush target cache line with CLFLUSH
- Trigger speculative execution (or wait for victim to access)
- Measure access time with RDTSC
- If access time is ~4 cycles (L1 hit): the speculative execution loaded that cache line
- If access time is ~250 cycles (DRAM miss): it was not loaded
Spectre uses exactly the inline assembly techniques in this chapter — RDTSCP for timing, CLFLUSH for eviction, and careful memory ordering with MFENCE/LFENCE — to extract secret data through microarchitectural side channels. Understanding inline assembly is not just for performance optimization; it is essential for security research.
Summary
This case study built a complete cache latency measurement tool demonstrating:
- RDTSC/RDTSCP for cycle-accurate measurement with proper serialization
- CLFLUSH for deterministic cache eviction
- MFENCE/LFENCE for memory ordering between measurement primitives
- "m" constraint for memory operand addressing in CLFLUSH
- "memory" clobber to prevent the compiler from caching values across measurement boundaries
- Statistical methodology (median + percentiles) rather than naive averaging to get reliable hardware measurements
The measured values — 4/12/38/245 cycles for L1/L2/L3/DRAM — match published hardware specifications, confirming that the inline assembly correctly measures what we intended.