Case Study 12.2: Linked List Traversal in Assembly
Overview
This case study implements a complete singly-linked list in assembly, interoperating with C's malloc and free. We build the list, traverse it, search it, and delete elements — all in x86-64 NASM assembly, following the System V AMD64 ABI so the functions can be called directly from C.
The Node Structure
struct Node {
int64_t value; // offset 0
struct Node *next; // offset 8
};
// sizeof(Node) = 16 bytes
; NASM struct definition using struc:
struc Node
.value: resq 1 ; int64_t (8 bytes)
.next: resq 1 ; pointer (8 bytes)
endstruc
; Node.value = 0, Node.next = 8, Node_size = 16
Building a List: Push to Front
; Node *list_push(Node *head, int64_t value)
; Allocates a new node, inserts it at the head, returns new head
; RDI = old head (or NULL), RSI = value
; Returns new head in RAX
section .text
global list_push
extern malloc
list_push:
push rbp
mov rbp, rsp
push rbx ; callee-saved: old head
push r12 ; callee-saved: value
sub rsp, 8 ; alignment pad (we pushed rbp + 2 registers = 24 bytes after rbp,
; RSP was ≡ 8 (mod 16) at entry, after push rbp → 0,
; after push rbx → 8, after push r12 → 0, sub 8 → 8; needs 8 more)
; Actually: entry RSP ≡ 8; push rbp → 0; push rbx → 8; push r12 → 0; sub 8 → 8 (MISALIGNED before call)
; Fix: sub rsp, 16 instead
add rsp, 8 ; undo our bad sub
sub rsp, 16 ; correct alignment (leaves RSP ≡ 0 mod 16)
mov rbx, rdi ; save old head
mov r12, rsi ; save value
; Allocate new node: malloc(16)
mov edi, 16 ; size = sizeof(Node)
call malloc ; rax = new node or NULL
test rax, rax
jz .alloc_failed
; Initialize node
mov qword [rax + Node.value], r12 ; node->value = value
mov qword [rax + Node.next], rbx ; node->next = old head
.alloc_failed:
; rax = new node (head of list) or NULL if malloc failed
add rsp, 16
pop r12
pop rbx
pop rbp
ret
Complete List from C
// Driver code in C to build and test our assembly functions:
extern Node *list_push(Node *head, int64_t value);
extern int64_t list_sum(Node *head);
extern Node *list_find(Node *head, int64_t target);
int main() {
Node *head = NULL;
head = list_push(head, 10);
head = list_push(head, 20);
head = list_push(head, 30);
// List is now: 30 -> 20 -> 10 -> NULL (LIFO order)
printf("Sum: %ld\n", list_sum(head)); // Should print 60
}
Traversal and Sum
; int64_t list_sum(Node *head)
; Sums all values in the list
; RDI = head, Returns sum in RAX
section .text
global list_sum
list_sum:
xor eax, eax ; sum = 0
.loop:
test rdi, rdi ; head == NULL?
jz .done ; exit if end of list
add rax, [rdi + Node.value] ; sum += current->value
mov rdi, [rdi + Node.next] ; advance to next node
jmp .loop
.done:
ret
Search: Find by Value
; Node *list_find(Node *head, int64_t target)
; Returns pointer to first node with value == target, or NULL
; RDI = head, RSI = target, Returns RAX
section .text
global list_find
list_find:
.loop:
test rdi, rdi ; head == NULL?
jz .not_found ; return NULL if end of list
cmp [rdi + Node.value], rsi ; current->value == target?
je .found
mov rdi, [rdi + Node.next] ; advance to next
jmp .loop
.found:
mov rax, rdi ; return pointer to matching node
ret
.not_found:
xor eax, eax ; return NULL
ret
Deletion: Remove First Matching Node
; Node *list_delete(Node *head, int64_t target)
; Removes the first node with value == target, frees it
; Returns (possibly updated) head
; RDI = head, RSI = target
; Returns new head in RAX
section .text
global list_delete
extern free
list_delete:
push rbp
mov rbp, rsp
push rbx ; current node
push r12 ; previous node (for relinking)
push r13 ; target value
push r14 ; head (for return)
sub rsp, 8 ; alignment (4 pushes + rbp = 5*8 = 40 bytes after entry,
; plus 8-byte retaddr = 48. RSP ≡ 8 at entry,
; 8 + 8 (rbp) + 8 (rbx) + 8 (r12) + 8 (r13) + 8 (r14) = 48 offset
; 0x...8 - 48 = 0x...8 - 48 ≡ 0 - 16 ≡ 0 (mod 16)? Let's count:
; entry: rsp ≡ 8; push rbp: rsp ≡ 0; push rbx: ≡ 8; push r12: ≡ 0;
; push r13: ≡ 8; push r14: ≡ 0. sub 8: ≡ 8 (WRONG for call)
; Need: sub 0 (already aligned at this point? no: after 4 callee-saves = 0, aligned)
; Actually: after push rbp + 4 reg saves = 5 pushes total = 40 bytes
; entry rsp ≡ 8; minus 40 → rsp ≡ 8-40 = -32 ≡ 8 (mod 16). Still misaligned.
; sub rsp, 8 → ≡ 0. Good.
mov r14, rdi ; save head for potential return
mov r13, rsi ; save target
xor r12d, r12d ; prev = NULL
mov rbx, rdi ; current = head
.loop:
test rbx, rbx ; current == NULL?
jz .not_found
cmp [rbx + Node.value], r13 ; current->value == target?
je .found_match
mov r12, rbx ; prev = current
mov rbx, [rbx + Node.next] ; current = current->next
jmp .loop
.found_match:
; Determine how to relink around this node
test r12, r12 ; prev == NULL? (removing head)
jnz .has_prev
; Removing the head node
mov r14, [rbx + Node.next] ; new head = old_head->next
jmp .do_free
.has_prev:
; Removing a middle or tail node
mov rax, [rbx + Node.next] ; rax = current->next
mov [r12 + Node.next], rax ; prev->next = current->next
.do_free:
mov rdi, rbx ; arg to free: the node to remove
call free ; deallocate the node
.not_found:
mov rax, r14 ; return (possibly new) head
add rsp, 8
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret
Free the Entire List
; void list_free(Node *head)
; Frees all nodes in the list
; RDI = head
section .text
global list_free
list_free:
push rbp
mov rbp, rsp
push rbx ; current node
sub rsp, 8 ; alignment
mov rbx, rdi ; current = head
.loop:
test rbx, rbx ; current == NULL?
jz .done
mov rdi, [rbx + Node.next] ; save next pointer BEFORE freeing
push rdi ; save next (below callee-saved regs, need extra care)
; Better: use r12 to save next
; Let's redo:
mov rbx, rdi ; NOT right structure - let's fix
; Correct version:
; We need: tmp = current->next; free(current); current = tmp
push rbx ; save current pointer on stack temporarily
mov rdi, rbx ; arg to free
call free ; free current node
pop rbx ; restore... but rbx now has the next pointer we need
; PROBLEM: rbx was the pointer to the freed node, not to the next
.done:
add rsp, 8
pop rbx
pop rbp
ret
This has a bug — we need to fetch next before freeing. Let's write the correct version:
list_free:
push rbp
mov rbp, rsp
push rbx ; current node
push r12 ; next node
sub rsp, 8 ; alignment (entry ≡ 8; push rbp → 0; push rbx → 8; push r12 → 0; sub 8 → 8? no)
; entry ≡ 8; -8 (rbp) → 0; -8 (rbx) → 8; -8 (r12) → 0. Aligned. sub 0? No sub needed.
; But we need the sub for calling free. Let's just always use sub 8 after even pushes.
; Simpler: just track the alignment manually.
mov rbx, rdi ; current = head
.loop:
test rbx, rbx
jz .done
mov r12, [rbx + Node.next] ; save next BEFORE freeing
mov rdi, rbx ; arg to free
call free ; free(current)
mov rbx, r12 ; current = next
jmp .loop
.done:
add rsp, 8
pop r12
pop rbx
pop rbp
ret
Key Lessons from This Implementation
Pointer discipline: Every time you dereference a next pointer (mov rdi, [rbx + 8]), you must have verified the current pointer is not NULL first. Dereferencing NULL causes a SIGSEGV and a difficult debugging session.
Register save strategy: The delete and free functions need 4-5 values across call free. Since RAX, RCX, RDX, RSI, RDI are all caller-saved and wiped by free, every value you need after the call must be in a callee-saved register (RBX, R12-R15) or on the stack.
Memory access patterns: Linked list traversal has terrible cache performance for large lists. Each [rbx + Node.next] dereference follows a pointer to a potentially random location in the heap. A cache miss per node × 1000 nodes × 200-cycle miss latency = catastrophically slow. For large linked lists, consider converting to arrays for bulk operations. The assembly code has the same cache performance as C — which is poor.
The ownership question: Who owns the free? In our API, list_delete and list_free both free nodes. The caller must not retain pointers to freed nodes. This is the same ownership rule as in C, enforced by convention rather than by the language.