Case Study 12.2: Linked List Traversal in Assembly

Overview

This case study implements a complete singly-linked list in assembly, interoperating with C's malloc and free. We build the list, traverse it, search it, and delete elements — all in x86-64 NASM assembly, following the System V AMD64 ABI so the functions can be called directly from C.

The Node Structure

struct Node {
    int64_t  value;         // offset 0
    struct Node *next;      // offset 8
};
// sizeof(Node) = 16 bytes
; NASM struct definition using struc:
struc Node
    .value: resq 1          ; int64_t (8 bytes)
    .next:  resq 1          ; pointer (8 bytes)
endstruc
; Node.value = 0, Node.next = 8, Node_size = 16

Building a List: Push to Front

; Node *list_push(Node *head, int64_t value)
; Allocates a new node, inserts it at the head, returns new head
; RDI = old head (or NULL), RSI = value
; Returns new head in RAX

section .text
global list_push
extern malloc

list_push:
    push rbp
    mov  rbp, rsp
    push rbx                ; callee-saved: old head
    push r12                ; callee-saved: value
    sub  rsp, 8             ; alignment pad (we pushed rbp + 2 registers = 24 bytes after rbp,
                            ; RSP was ≡ 8 (mod 16) at entry, after push rbp → 0,
                            ; after push rbx → 8, after push r12 → 0, sub 8 → 8; needs 8 more)
    ; Actually: entry RSP ≡ 8; push rbp → 0; push rbx → 8; push r12 → 0; sub 8 → 8 (MISALIGNED before call)
    ; Fix: sub rsp, 16 instead
    add  rsp, 8             ; undo our bad sub
    sub  rsp, 16            ; correct alignment (leaves RSP ≡ 0 mod 16)

    mov  rbx, rdi           ; save old head
    mov  r12, rsi           ; save value

    ; Allocate new node: malloc(16)
    mov  edi, 16            ; size = sizeof(Node)
    call malloc             ; rax = new node or NULL

    test rax, rax
    jz   .alloc_failed

    ; Initialize node
    mov  qword [rax + Node.value], r12   ; node->value = value
    mov  qword [rax + Node.next],  rbx   ; node->next = old head

.alloc_failed:
    ; rax = new node (head of list) or NULL if malloc failed

    add  rsp, 16
    pop  r12
    pop  rbx
    pop  rbp
    ret

Complete List from C

// Driver code in C to build and test our assembly functions:
extern Node *list_push(Node *head, int64_t value);
extern int64_t list_sum(Node *head);
extern Node *list_find(Node *head, int64_t target);

int main() {
    Node *head = NULL;
    head = list_push(head, 10);
    head = list_push(head, 20);
    head = list_push(head, 30);
    // List is now: 30 -> 20 -> 10 -> NULL (LIFO order)
    printf("Sum: %ld\n", list_sum(head));  // Should print 60
}

Traversal and Sum

; int64_t list_sum(Node *head)
; Sums all values in the list
; RDI = head, Returns sum in RAX

section .text
global list_sum

list_sum:
    xor  eax, eax           ; sum = 0

.loop:
    test rdi, rdi           ; head == NULL?
    jz   .done              ; exit if end of list

    add  rax, [rdi + Node.value]   ; sum += current->value
    mov  rdi, [rdi + Node.next]    ; advance to next node
    jmp  .loop

.done:
    ret

Search: Find by Value

; Node *list_find(Node *head, int64_t target)
; Returns pointer to first node with value == target, or NULL
; RDI = head, RSI = target, Returns RAX

section .text
global list_find

list_find:
.loop:
    test rdi, rdi           ; head == NULL?
    jz   .not_found         ; return NULL if end of list

    cmp  [rdi + Node.value], rsi   ; current->value == target?
    je   .found

    mov  rdi, [rdi + Node.next]    ; advance to next
    jmp  .loop

.found:
    mov  rax, rdi           ; return pointer to matching node
    ret

.not_found:
    xor  eax, eax           ; return NULL
    ret

Deletion: Remove First Matching Node

; Node *list_delete(Node *head, int64_t target)
; Removes the first node with value == target, frees it
; Returns (possibly updated) head
; RDI = head, RSI = target
; Returns new head in RAX

section .text
global list_delete
extern free

list_delete:
    push rbp
    mov  rbp, rsp
    push rbx                ; current node
    push r12                ; previous node (for relinking)
    push r13                ; target value
    push r14                ; head (for return)
    sub  rsp, 8             ; alignment (4 pushes + rbp = 5*8 = 40 bytes after entry,
                            ; plus 8-byte retaddr = 48. RSP ≡ 8 at entry,
                            ; 8 + 8 (rbp) + 8 (rbx) + 8 (r12) + 8 (r13) + 8 (r14) = 48 offset
                            ; 0x...8 - 48 = 0x...8 - 48 ≡ 0 - 16 ≡ 0 (mod 16)? Let's count:
                            ; entry: rsp ≡ 8; push rbp: rsp ≡ 0; push rbx: ≡ 8; push r12: ≡ 0;
                            ; push r13: ≡ 8; push r14: ≡ 0. sub 8: ≡ 8 (WRONG for call)
                            ; Need: sub 0 (already aligned at this point? no: after 4 callee-saves = 0, aligned)
                            ; Actually: after push rbp + 4 reg saves = 5 pushes total = 40 bytes
                            ; entry rsp ≡ 8; minus 40 → rsp ≡ 8-40 = -32 ≡ 8 (mod 16). Still misaligned.
                            ; sub rsp, 8 → ≡ 0. Good.

    mov  r14, rdi           ; save head for potential return
    mov  r13, rsi           ; save target
    xor  r12d, r12d         ; prev = NULL
    mov  rbx, rdi           ; current = head

.loop:
    test rbx, rbx           ; current == NULL?
    jz   .not_found

    cmp  [rbx + Node.value], r13   ; current->value == target?
    je   .found_match

    mov  r12, rbx                          ; prev = current
    mov  rbx, [rbx + Node.next]            ; current = current->next
    jmp  .loop

.found_match:
    ; Determine how to relink around this node
    test r12, r12           ; prev == NULL? (removing head)
    jnz  .has_prev

    ; Removing the head node
    mov  r14, [rbx + Node.next]   ; new head = old_head->next
    jmp  .do_free

.has_prev:
    ; Removing a middle or tail node
    mov  rax, [rbx + Node.next]            ; rax = current->next
    mov  [r12 + Node.next], rax            ; prev->next = current->next

.do_free:
    mov  rdi, rbx           ; arg to free: the node to remove
    call free               ; deallocate the node

.not_found:
    mov  rax, r14           ; return (possibly new) head

    add  rsp, 8
    pop  r14
    pop  r13
    pop  r12
    pop  rbx
    pop  rbp
    ret

Free the Entire List

; void list_free(Node *head)
; Frees all nodes in the list
; RDI = head

section .text
global list_free

list_free:
    push rbp
    mov  rbp, rsp
    push rbx               ; current node
    sub  rsp, 8            ; alignment

    mov  rbx, rdi          ; current = head

.loop:
    test rbx, rbx          ; current == NULL?
    jz   .done

    mov  rdi, [rbx + Node.next]  ; save next pointer BEFORE freeing
    push rdi               ; save next (below callee-saved regs, need extra care)
    ; Better: use r12 to save next
    ; Let's redo:
    mov  rbx, rdi          ; NOT right structure - let's fix

    ; Correct version:
    ; We need: tmp = current->next; free(current); current = tmp
    push rbx               ; save current pointer on stack temporarily
    mov  rdi, rbx          ; arg to free
    call free              ; free current node

    pop  rbx               ; restore... but rbx now has the next pointer we need
    ; PROBLEM: rbx was the pointer to the freed node, not to the next

.done:
    add  rsp, 8
    pop  rbx
    pop  rbp
    ret

This has a bug — we need to fetch next before freeing. Let's write the correct version:

list_free:
    push rbp
    mov  rbp, rsp
    push rbx               ; current node
    push r12               ; next node
    sub  rsp, 8            ; alignment (entry ≡ 8; push rbp → 0; push rbx → 8; push r12 → 0; sub 8 → 8? no)
                           ; entry ≡ 8; -8 (rbp) → 0; -8 (rbx) → 8; -8 (r12) → 0. Aligned. sub 0? No sub needed.
                           ; But we need the sub for calling free. Let's just always use sub 8 after even pushes.
    ; Simpler: just track the alignment manually.

    mov  rbx, rdi          ; current = head

.loop:
    test rbx, rbx
    jz   .done

    mov  r12, [rbx + Node.next]   ; save next BEFORE freeing
    mov  rdi, rbx                  ; arg to free
    call free                      ; free(current)
    mov  rbx, r12                  ; current = next
    jmp  .loop

.done:
    add  rsp, 8
    pop  r12
    pop  rbx
    pop  rbp
    ret

Key Lessons from This Implementation

Pointer discipline: Every time you dereference a next pointer (mov rdi, [rbx + 8]), you must have verified the current pointer is not NULL first. Dereferencing NULL causes a SIGSEGV and a difficult debugging session.

Register save strategy: The delete and free functions need 4-5 values across call free. Since RAX, RCX, RDX, RSI, RDI are all caller-saved and wiped by free, every value you need after the call must be in a callee-saved register (RBX, R12-R15) or on the stack.

Memory access patterns: Linked list traversal has terrible cache performance for large lists. Each [rbx + Node.next] dereference follows a pointer to a potentially random location in the heap. A cache miss per node × 1000 nodes × 200-cycle miss latency = catastrophically slow. For large linked lists, consider converting to arrays for bulk operations. The assembly code has the same cache performance as C — which is poor.

The ownership question: Who owns the free? In our API, list_delete and list_free both free nodes. The caller must not retain pointers to freed nodes. This is the same ownership rule as in C, enforced by convention rather than by the language.