“If you can optimize the work the forklifts are doing, maybe they would have time to spare to break through the wall.”
Github: GCaggianese/AoC-2025/D4
Puzzle Day 4: Printing Department
Part One:
- Matrix input where
.= paper roll,@= empty space - Valid = empty space with 5+ adjacent paper rolls
- Return count of valid positions
Part Two:
- Propagation: valid positions become paper rolls
- Repeat until no new valid positions appear
- Return total count of all activated positions

I included a Python implementation of the solution that is pretty straightforward to understand.
Implementation in FASM
License
;;; SPDX-FileCopyrightText: 2026 Germán Caggianese <german.caggianese@pm.me>
;;;
;;; SPDX-License-Identifier: Apache-2.0
Header
format ELF64 executable
entry _start
Matrix macro
This macro parses the input file at compile time, converting:
.(paper roll) → 1@(empty space) → 0- Newlines and other characters are skipped
The virtual block loads the file into an addressing space, then we iterate through each byte and emit only the meaningful ones.
macro matrix_from_file name, filename {
virtual at 0
name#.data:: file filename
name#.size = $
end virtual
label name byte
name#.len = 0
repeat name#.size
load name#.ch byte from name#.data:(%-1)
if name#.ch = '.'
db 1
name#.len = name#.len + 1
else if name#.ch = '@'
db 0
name#.len = name#.len + 1
end if
end repeat
}
Data segment
Load the input and define constants
segment readable writeable
matrix_from_file my_matrix, './input.txt'
MATRIX_SIZE = my_matrix.len
ROWS = 138
COLS = 138
Working buffers
result: stores neighbor sums after each calculationbackup: preserves original matrix state for Part 2
result rb ROWS * COLS
backup rb ROWS * COLS
Direction offsets for 8-connectivity
Each pair is (row~offset~, col~offset~) as signed bytes.
directions db -1,-1, -1,0, -1,1, 0,-1, 0,1, 1,-1, 1,0, 1,1
NUM_DIRS = 8
Counters and output strings
counter_p1 dq 0
counter_p2 dq 0
msg_p1 db 'Counter P1: '
msg_p1_len = $ - msg_p1
msg_p2 db 'Counter P2: '
msg_p2_len = $ - msg_p2
buffer rb 16
newline db 0x0A
segment readable executable
Main logic
_start:
Backup original matrix
We need to preserve the original state because Part 2 will modify it.
- Copy my~matrix~ → backup
xor rcx, rcx
.backup_loop:
cmp rcx, MATRIX_SIZE
jge .part1_start
mov al, [my_matrix + rcx]
mov [backup + rcx], al
inc rcx
jmp .backup_loop
Part 1: Single pass calculation
Add neighbours
For each cell, sum its value plus all valid neighbors. Note:
r12= row indexr13= col index
.part1_start:
xor r12, r12
.p1_row_loop:
cmp r12, ROWS
jge .p1_correction
xor r13, r13
.p1_col_loop:
cmp r13, COLS
jge .p1_next_row
Calculate linear index: rax = row * COLS + col Note:
r14= sum (starts with cell value)r15= direction index
mov rax, r12
imul rax, COLS
add rax, r13
movzx r14d, byte [my_matrix + rax]
xor r15, r15
.p1_dir_loop:
cmp r15, NUM_DIRS
jge .p1_store_result
Load direction offsets
movsx r8, byte [directions + r15*2] ; r8 = di
movsx r9, byte [directions + r15*2 + 1] ; r9 = dj
Calculate neighbor position Note:
r10= ni = row + dir11= nj = col + dj
mov r10, r12
add r10, r8
mov r11, r13
add r11, r9
Bounds check: 0 <= ni < ROWS && 0 <= nj < COLS
cmp r10, 0
jl .p1_next_dir
cmp r10, ROWS
jge .p1_next_dir
cmp r11, 0
jl .p1_next_dir
cmp r11, COLS
jge .p1_next_dir
Add neighbor value to sum
mov rax, r10
imul rax, COLS
add rax, r11
movzx eax, byte [my_matrix + rax]
add r14, rax
.p1_next_dir:
inc r15
jmp .p1_dir_loop
.p1_store_result:
mov rax, r12
imul rax, COLS
add rax, r13
mov byte [result + rax], r14b
inc r13
jmp .p1_col_loop
.p1_next_row:
inc r12
jmp .p1_row_loop
Correction
Edge cells have fewer neighbors, so we compensate:
- Edge cells (non-corner): +3
- Corner cells: +3 +2 = +5
This normalizes all cells as if surrounded by empty space.
Top row: +3
.p1_correction:
xor rcx, rcx
.p1_top_row:
cmp rcx, COLS
jge .p1_bottom_row
add byte [result + rcx], 3
inc rcx
jmp .p1_top_row
Bottom row: +3
.p1_bottom_row:
xor rcx, rcx
.p1_bottom_loop:
cmp rcx, COLS
jge .p1_left_col
mov rax, (ROWS-1) * COLS
add rax, rcx
add byte [result + rax], 3
inc rcx
jmp .p1_bottom_loop
Left column (excluding corners): +3
.p1_left_col:
mov rcx, 1
.p1_left_loop:
cmp rcx, ROWS-1
jge .p1_right_col
mov rax, rcx
imul rax, COLS
add byte [result + rax], 3
inc rcx
jmp .p1_left_loop
Right column (excluding corners): +3
.p1_right_col:
mov rcx, 1
.p1_right_loop:
cmp rcx, ROWS-1
jge .p1_corners
mov rax, rcx
imul rax, COLS
add rax, COLS-1
add byte [result + rax], 3
inc rcx
jmp .p1_right_loop
Corners: additional +2
.p1_corners:
add byte [result], 2 ; [0][0]
add byte [result + COLS - 1], 2 ; [0][COLS-1]
add byte [result + (ROWS-1) * COLS], 2 ; [ROWS-1][0]
add byte [result + (ROWS-1) * COLS + COLS - 1], 2 ; [ROWS-1][COLS-1]
Filter (elementwise multiply with inverted original)
Zero out positions where original had a 1 (paper roll). We only care about empty spaces (@) that have many neighbors.
xor rcx, rcx
.p1_filter_loop:
cmp rcx, MATRIX_SIZE
jge .p1_count
cmp byte [my_matrix + rcx], 1
jne .p1_filter_next
mov byte [result + rcx], 0
.p1_filter_next:
inc rcx
jmp .p1_filter_loop
Count cells with value ≥ 5
Note:
r12= counter
.p1_count:
xor r12, r12
xor rcx, rcx
.p1_count_loop:
cmp rcx, MATRIX_SIZE
jge .p1_done
cmp byte [result + rcx], 5
jl .p1_count_next
inc r12
.p1_count_next:
inc rcx
jmp .p1_count_loop
.p1_done:
mov [counter_p1], r12
Part 2: Propagation loop
Restore original matrix and iterate until saturation. Each iteration, cells with ≥5 neighbors become paper rolls.
Restore my~matrix~ from backup
xor rcx, rcx
.restore_loop:
cmp rcx, MATRIX_SIZE
jge .p2_main_loop
mov al, [backup + rcx]
mov [my_matrix + rcx], al
inc rcx
jmp .restore_loop
mov qword [counter_p2], 0
.p2_main_loop:
Add neighbours (same algorithm as Part 1)
xor r12, r12
.p2_row_loop:
cmp r12, ROWS
jge .p2_correction
xor r13, r13
.p2_col_loop:
cmp r13, COLS
jge .p2_next_row
mov rax, r12
imul rax, COLS
add rax, r13
movzx r14d, byte [my_matrix + rax]
xor r15, r15
.p2_dir_loop:
cmp r15, NUM_DIRS
jge .p2_store_result
movsx r8, byte [directions + r15*2]
movsx r9, byte [directions + r15*2 + 1]
mov r10, r12
add r10, r8
mov r11, r13
add r11, r9
cmp r10, 0
jl .p2_next_dir
cmp r10, ROWS
jge .p2_next_dir
cmp r11, 0
jl .p2_next_dir
cmp r11, COLS
jge .p2_next_dir
mov rax, r10
imul rax, COLS
add rax, r11
movzx eax, byte [my_matrix + rax]
add r14, rax
.p2_next_dir:
inc r15
jmp .p2_dir_loop
.p2_store_result:
mov rax, r12
imul rax, COLS
add rax, r13
mov byte [result + rax], r14b
inc r13
jmp .p2_col_loop
.p2_next_row:
inc r12
jmp .p2_row_loop
Correction (same as Part 1)
.p2_correction:
xor rcx, rcx
.p2_top_row:
cmp rcx, COLS
jge .p2_bottom_row
add byte [result + rcx], 3
inc rcx
jmp .p2_top_row
.p2_bottom_row:
xor rcx, rcx
.p2_bottom_loop:
cmp rcx, COLS
jge .p2_left_col
mov rax, (ROWS-1) * COLS
add rax, rcx
add byte [result + rax], 3
inc rcx
jmp .p2_bottom_loop
.p2_left_col:
mov rcx, 1
.p2_left_loop:
cmp rcx, ROWS-1
jge .p2_right_col
mov rax, rcx
imul rax, COLS
add byte [result + rax], 3
inc rcx
jmp .p2_left_loop
.p2_right_col:
mov rcx, 1
.p2_right_loop:
cmp rcx, ROWS-1
jge .p2_corners
mov rax, rcx
imul rax, COLS
add rax, COLS-1
add byte [result + rax], 3
inc rcx
jmp .p2_right_loop
.p2_corners:
add byte [result], 2
add byte [result + COLS - 1], 2
add byte [result + (ROWS-1) * COLS], 2
add byte [result + (ROWS-1) * COLS + COLS - 1], 2
Filter
xor rcx, rcx
.p2_filter_loop:
cmp rcx, MATRIX_SIZE
jge .p2_count_and_activate
cmp byte [my_matrix + rcx], 1
jne .p2_filter_next
mov byte [result + rcx], 0
.p2_filter_next:
inc rcx
jmp .p2_filter_loop
Count and activate
Count cells ≥5 and activate them in my_matrix for next iteration.
.p2_count_and_activate:
xor rbx, rbx ; rbx = new_activations this iteration
xor rcx, rcx
.p2_count_loop:
cmp rcx, MATRIX_SIZE
jge .p2_check_continue
cmp byte [result + rcx], 5
jl .p2_count_next
inc rbx ; new_activations++
mov byte [my_matrix + rcx], 1 ; activate: becomes paper roll
.p2_count_next:
inc rcx
jmp .p2_count_loop
.p2_check_continue:
; counter_p2 += new_activations
add [counter_p2], rbx
; if new_activations > 0, continue propagating
test rbx, rbx
jnz .p2_main_loop
Output
Print Part 1 result
.print_results:
; Print "Counter P1: "
mov rax, 1
mov rdi, 1
mov rsi, msg_p1
mov rdx, msg_p1_len
syscall
; Convert counter_p1 to string
mov rax, [counter_p1]
call .print_number_in_rax
Print Part 2 result
; Print "Counter P2: "
mov rax, 1
mov rdi, 1
mov rsi, msg_p2
mov rdx, msg_p2_len
syscall
; Convert counter_p2 to string
mov rax, [counter_p2]
call .print_number_in_rax
jmp .exit
Number printing subroutine
Converts value in rax to decimal string and prints it with newline.
.print_number_in_rax:
lea rdi, [buffer + 15]
mov rcx, 10
test rax, rax
jnz .convert_loop
; Handle zero case
dec rdi
mov byte [rdi], '0'
jmp .do_print
.convert_loop:
test rax, rax
jz .do_print
xor rdx, rdx
div rcx ; rax = quotient, rdx = remainder
add dl, '0'
dec rdi
mov [rdi], dl
jmp .convert_loop
.do_print:
; Calculate length and print
lea rdx, [buffer + 15]
sub rdx, rdi ; rdx = length
mov rsi, rdi
mov rax, 1
mov rdi, 1
syscall
; Print newline
mov rax, 1
mov rdi, 1
mov rsi, newline
mov rdx, 1
syscall
ret
Exit
.exit:
;; 下次見~
mov rax, 60
xor rdi, rdi
syscall
Compile and Run
Compile main.asm with fasm main.asm and run it with ./main. It should work on every x86 and x86-64 system. The input is hardcoded to be at ./input.txt
Expected output:
Counter P1: <number>
Counter P2: <number>
Some benchmarks
Python3
❯ time python3 draft.py Counter P1: 1549 Counter P2: 8887
Executed in 1.16 secs fish external usr time 1.15 secs 0.00 micros 1.15 secs sys time 0.01 secs 871.00 micros 0.00 secs
PyPy3
❯ time pypy3 draft.py Counter P1: 1549 Counter P2: 8887
Executed in 215.26 millis fish external usr time 193.14 millis 0.00 millis 193.14 millis sys time 21.98 millis 1.14 millis 20.84 millis
FASM
❯ time ./main #FASM Counter P1: 1549 Counter P2: 8887
Executed in 29.71 millis fish external usr time 28.89 millis 0.00 micros 28.89 millis sys time 0.77 millis 773.00 micros 0.00 millis
As expected FASM is in the order of 10.7x faster than PyPy’s JIT, and 58x faster than Python. I know that my FASM code is not the most efficient, not optimized, neither is the Python one, this benchmark is just a funny fact.