Image Blur Program
For verification and profiling, you will use a custom image-blur program written in RISC-V assembly. To answer the questions in the BONUS, you need to understand the assembly code. Both the original code and the modified code (which uses the custom instruction) are provided below.
The program blurs a 32x32-pixel black-and-white image. The image is padded with zeros so that the 3x3 blur window can also be applied to the pixels on the image border.
Expected Output
Since reading back the image from the FPGA is convoluted, we did it for you, and below are the expected results.
Original 32x32 pixel image | Blurred Image |
---|---|
![]() | ![]() |
We are aware that this image-blurring algorithm is naïve and not implemented optimally. The main goal of this program is to highlight the impact of the `LWI` instruction.
Baseline Image Blur code
.global __start
.text
__start:
# Actual Code starts here
#
# bootloader: one-time setup for the blur loop.
# Reads three words starting at 0x42000000 (width, height, pixel start
# address) and precomputes every value the blur loop needs, so the loop
# itself only loads, adds, divides and stores.
#
# Register map on exit:
#   s2  = image width (pixels)         s3  = image height (pixels)
#   s4  = address of the current input pixel
#   s5  = byte stride of one padded row = (width + 2) * 4
#   s6  = -stride      (pixel above)   s7  = -stride + 4 (above-right)
#   s8  = -stride - 4  (above-left)    s9  =  stride + 4 (below-right)
#   s10 =  stride - 4  (below-left)    s11 = 9 (divisor for the average)
#   s0  = pixels left in the current row, s1 = rows left
#   a0  = address where the next blurred pixel is stored
bootloader:
lui t0, 0x42000 # t0 = 0x42000000, where the width/height/address words live
lw s2, (t0) # Load the image width
lw s3, 4(t0) # Load the image height
lw s4, 8(t0) # Load the pixel start address; s4 then walks the image as the input pointer
# NOTE(review): presumably s4 points at the first non-padding pixel, since
# blur reads the word before it and the row above it - confirm against the
# image loader.
li t2, 4 # bytes per pixel (each pixel is accessed as one 32-bit word)
addi s5, s2, 2 # Padded row length in pixels: one padding pixel on each side
mul s5, s5, t2 # Convert pixels to bytes: s5 = byte stride of one padded row
neg s6, s5 # -stride: offset to the pixel directly above
addi s7, s6, 4 # -stride + 4: offset to the pixel above-right
addi s8, s6, -4 # -stride - 4: offset to the pixel above-left
addi s9, s5, 4 # stride + 4: offset to the pixel below-right
addi s10, s5, -4 # stride - 4: offset to the pixel below-left
li s11, 9 # Number of pixels in the 3x3 window (divisor for the average)
addi s0, s2, 0 # s0 = width: pixels remaining in the current row
addi s1, s3, 0 # s1 = height: rows remaining
li a0, 0x42002000 # Store address for the blur image
# blur: 3x3 box-blur loop, one output pixel per iteration.
# Sums the nine words of the 3x3 neighbourhood centred on the pixel at s4,
# divides the sum by s11 (9), stores the result at a0, then advances both
# pointers. The output is written contiguously (a0 never skips padding).
#
# Inputs (set up in bootloader): s4 = current input pixel address,
# a0 = current output address, s5 = byte stride of one padded row,
# s6/s7/s8 = offsets to above / above-right / above-left,
# s9/s10 = offsets to below-right / below-left,
# s0 = pixels left in this row, s1 = rows left, s2 = image width.
# Scratch: t0 = running sum, t1 = loaded neighbour, t2 = neighbour address.
#
# Both counters are decremented before they are tested, so a width or height
# of zero is not handled by this loop.
blur:
lw t0, (s4) # t0 = centre pixel (i); starts the running sum
lw t1, 4(s4) # Load pixel value (Right): next word
add t0, t0, t1
lw t1, -4(s4) # Load pixel value (Left): previous word
add t0, t0, t1
add t2, s4, s6 # t2 = address of the pixel above (Up)
lw t1, (t2)
add t0, t0, t1
add t2, s4, s7 # t2 = address of the pixel above-right (Up-Right)
lw t1, (t2)
add t0, t0, t1
add t2, s4, s8 # t2 = address of the pixel above-left (Up-Left)
lw t1, (t2)
add t0, t0, t1
add t2, s4, s5 # t2 = address of the pixel below (Down)
lw t1, (t2)
add t0, t0, t1
add t2, s4, s9 # t2 = address of the pixel below-right (Down-Right)
lw t1, (t2)
add t0, t0, t1
add t2, s4, s10 # t2 = address of the pixel below-left (Down-Left)
lw t1, (t2)
add t0, t0, t1
div t0, t0, s11 # Average the output: sum / 9 (signed integer division)
# Store the output
sw t0, (a0)
addi a0, a0, 4 # advance the output pointer by one word
# Advance to the next pixel and check if the row is complete
addi s4, s4, 4
addi s0, s0, -1
bnez s0, blur # more pixels left in this row
# Row finished: skip the two padding words (end of this row, start of the next)
addi s4, s4, 8
addi s0, s2, 0 # reload the per-row pixel counter with the image width
addi s1, s1, -1
bnez s1, blur # more rows left
# Write the cycle count
#
# Samples the 64-bit cycle counter and stores it as two words:
#   0x42003500 = low 32 bits, 0x42003504 = high 32 bits.
#
# On RV32 the counter can only be read one half at a time. If the low half
# wraps between the two reads, the halves belong to different counter values
# and the stored result is off by 2^32. To avoid that, the high half is read
# before and after the low half, and the read is retried if the two
# high-half samples differ.
# Clobbers: t3, t4, t5, t6.
read_cycle:
rdcycleh t6 # high 32 bits, first sample
rdcycle t5 # low 32 bits
rdcycleh t4 # high 32 bits, second sample
bne t6, t4, read_cycle # low half wrapped between the reads: sample again
lui t3, 0x42003 # Load upper immediate 0x42003 into t3
addi t3, t3, 0x500 # t3 = 0x42003500 (target memory address)
sw t5, 0(t3) # Store the lower cycle count at 0x42003500
sw t6, 4(t3) # Store the higher cycle count at 0x42003504
# End of program: nothing to return to, so park the core in an infinite loop
nop_loop:
nop # No operation
j nop_loop # Infinite loop
Modified Image Blur code
.global __start
.text
__start:
# Actual Code starts here
#
# bootloader: one-time setup for the blur loop.
# Reads three words starting at 0x42000000 (width, height, pixel start
# address) and precomputes every value the blur loop needs. The byte offsets
# kept in s5..s10 are the ones the blur loop passes to the custom lwi
# instruction.
#
# Register map on exit:
#   s2  = image width (pixels)         s3  = image height (pixels)
#   s4  = address of the current input pixel
#   s5  = byte stride of one padded row = (width + 2) * 4
#   s6  = -stride      (pixel above)   s7  = -stride + 4 (above-right)
#   s8  = -stride - 4  (above-left)    s9  =  stride + 4 (below-right)
#   s10 =  stride - 4  (below-left)    s11 = 9 (divisor for the average)
#   s0  = pixels left in the current row, s1 = rows left
#   a0  = address where the next blurred pixel is stored
bootloader:
lui t0, 0x42000 # t0 = 0x42000000, where the width/height/address words live
lw s2, (t0) # Load the image width
lw s3, 4(t0) # Load the image height
lw s4, 8(t0) # Load the pixel start address; s4 then walks the image as the input pointer
# NOTE(review): presumably s4 points at the first non-padding pixel, since
# blur reads the word before it and the row above it - confirm against the
# image loader.
li t2, 4 # bytes per pixel (each pixel is accessed as one 32-bit word)
addi s5, s2, 2 # Padded row length in pixels: one padding pixel on each side
mul s5, s5, t2 # Convert pixels to bytes: s5 = byte stride of one padded row
neg s6, s5 # -stride: offset to the pixel directly above
addi s7, s6, 4 # -stride + 4: offset to the pixel above-right
addi s8, s6, -4 # -stride - 4: offset to the pixel above-left
addi s9, s5, 4 # stride + 4: offset to the pixel below-right
addi s10, s5, -4 # stride - 4: offset to the pixel below-left
li s11, 9 # Number of pixels in the 3x3 window (divisor for the average)
addi s0, s2, 0 # s0 = width: pixels remaining in the current row
addi s1, s3, 0 # s1 = height: rows remaining
li a0, 0x42002000 # Store address for the blur image
# blur: 3x3 box-blur loop, one output pixel per iteration (custom-lwi version).
# Sums the nine words of the 3x3 neighbourhood centred on the pixel at s4,
# divides the sum by s11 (9), stores the result at a0, then advances both
# pointers. The output is written contiguously (a0 never skips padding).
#
# The left/right neighbours use plain lw with an immediate offset. The six
# neighbours in the rows above and below use the custom lwi instruction with
# a register offset.
# NOTE(review): lwi is not defined in this file; from its use here it
# presumably loads the word at (s4 + offset register), replacing the
# add + lw pair of the baseline program - confirm against the instruction's
# specification.
#
# Inputs (set up in bootloader): s4 = current input pixel address,
# a0 = current output address, s5 = byte stride of one padded row,
# s6/s7/s8 = offsets to above / above-right / above-left,
# s9/s10 = offsets to below-right / below-left,
# s0 = pixels left in this row, s1 = rows left, s2 = image width.
# Scratch: t0 = running sum, t1 = loaded neighbour.
#
# Both counters are decremented before they are tested, so a width or height
# of zero is not handled by this loop.
blur:
lw t0, (s4) # t0 = centre pixel (i); starts the running sum
lw t1, 4(s4) # Load pixel value (Right): next word
add t0, t0, t1
lw t1, -4(s4) # Load pixel value (Left): previous word
add t0, t0, t1
# Load pixel value (Up): offset s6 = -stride
lwi t1, s6(s4)
add t0, t0, t1
# Load pixel value (Up-Right): offset s7 = -stride + 4
lwi t1, s7(s4)
add t0, t0, t1
# Load pixel value (Up-Left): offset s8 = -stride - 4
lwi t1, s8(s4)
add t0, t0, t1
# Load pixel value (Down): offset s5 = +stride
lwi t1, s5(s4)
add t0, t0, t1
# Load pixel value (Down-Right): offset s9 = stride + 4
lwi t1, s9(s4)
add t0, t0, t1
# Load pixel value (Down-Left): offset s10 = stride - 4
lwi t1, s10(s4)
add t0, t0, t1
# Average the output: sum / 9 (signed integer division)
div t0, t0, s11
# Store the output
sw t0, (a0)
addi a0, a0, 4 # advance the output pointer by one word
# Advance to the next pixel and check if the row is complete
addi s4, s4, 4
addi s0, s0, -1
bnez s0, blur # more pixels left in this row
# Row finished: skip the two padding words (end of this row, start of the next)
addi s4, s4, 8
addi s0, s2, 0 # reload the per-row pixel counter with the image width
addi s1, s1, -1
bnez s1, blur # more rows left
# Write the cycle count
#
# Samples the 64-bit cycle counter and stores it as two words:
#   0x42003500 = low 32 bits, 0x42003504 = high 32 bits.
#
# On RV32 the counter can only be read one half at a time. If the low half
# wraps between the two reads, the halves belong to different counter values
# and the stored result is off by 2^32. To avoid that, the high half is read
# before and after the low half, and the read is retried if the two
# high-half samples differ.
# Clobbers: t3, t4, t5, t6.
read_cycle:
rdcycleh t6 # high 32 bits, first sample
rdcycle t5 # low 32 bits
rdcycleh t4 # high 32 bits, second sample
bne t6, t4, read_cycle # low half wrapped between the reads: sample again
lui t3, 0x42003 # Load upper immediate 0x42003 into t3
addi t3, t3, 0x500 # t3 = 0x42003500 (target memory address)
sw t5, 0(t3) # Store the lower cycle count at 0x42003500
sw t6, 4(t3) # Store the higher cycle count at 0x42003504
# End of program: nothing to return to, so park the core in an infinite loop
nop_loop:
nop # No operation
j nop_loop # Infinite loop