From 248c9ae919f9fe64adc294daa3baead6b35695c7 Mon Sep 17 00:00:00 2001 From: slederer Date: Mon, 26 Jan 2026 02:03:28 +0100 Subject: [PATCH 01/12] vgafb: first attempt at shifter/masker acceleration functionality --- tridoracpu/tridoracpu.srcs/top.v | 4 +- tridoracpu/tridoracpu.srcs/vgafb.v | 94 ++++++++++++++++++++++++++++++ tridoracpu/tridoracpu.xpr | 3 +- 3 files changed, 97 insertions(+), 4 deletions(-) diff --git a/tridoracpu/tridoracpu.srcs/top.v b/tridoracpu/tridoracpu.srcs/top.v index 0dc3346..bf3bea8 100644 --- a/tridoracpu/tridoracpu.srcs/top.v +++ b/tridoracpu/tridoracpu.srcs/top.v @@ -137,7 +137,7 @@ module top( assign fb_wr_data = mem_write_data; vgafb vgafb0(`clock, pixclk, rst, - mem_addr[3:0], fb_rd_data, fb_wr_data, + mem_addr[5:2], fb_rd_data, fb_wr_data, fb_rd_en, fb_wr_en, VGA_HS_O, VGA_VS_O, VGA_R, VGA_G, VGA_B); `endif @@ -247,7 +247,7 @@ module top( assign tdraudio_wr_data = mem_write_data; tdraudio tdraudio0(`clock, ~rst, - mem_addr[6:0], + mem_addr[8:2], tdraudio_rd_data, tdraudio_wr_data, tdraudio_rd_en, diff --git a/tridoracpu/tridoracpu.srcs/vgafb.v b/tridoracpu/tridoracpu.srcs/vgafb.v index f87e514..408079a 100644 --- a/tridoracpu/tridoracpu.srcs/vgafb.v +++ b/tridoracpu/tridoracpu.srcs/vgafb.v @@ -1,6 +1,9 @@ `timescale 1ns / 1ps `default_nettype none +// enable shifter/masker registers +`define ENABLE_FB_ACCEL + // Project F: Display Timings // (C)2019 Will Green, Open Source Hardware released under the MIT License // Learn more at https://projectf.io @@ -126,6 +129,14 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( localparam REG_PAL_SLOT = 3; localparam REG_PAL_DATA = 4; localparam REG_CTL = 5; +`ifdef ENABLE_FB_ACCEL + localparam REG_SHIFTER = 6; + localparam REG_SHIFTCOUNT = 7; + localparam REG_SHIFTERM = 9; + localparam REG_SHIFTERSP = 10; + localparam REG_MASKGEN = 11; +`endif + localparam COLOR_WIDTH = 12; localparam PALETTE_WIDTH = 4; @@ -145,12 +156,30 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 
32) ( wire pix_rd; wire [VMEM_DATA_WIDTH-1:0] status; +`ifdef ENABLE_FB_ACCEL + reg [VMEM_DATA_WIDTH-1:0] acc_shifter_in; + reg [(VMEM_DATA_WIDTH*2)-1:0] acc_shifter_out; + reg [2:0] acc_shift_count; + reg acc_start_shift; + reg [VMEM_DATA_WIDTH-1:0] acc_mask_in; + wire [VMEM_DATA_WIDTH-1:0] acc_mask_out; + wire [VMEM_DATA_WIDTH-1:0] acc_shifter_mask; + wire [VMEM_DATA_WIDTH-1:0] acc_shifter_out_h = acc_shifter_out[(VMEM_DATA_WIDTH*2)-1:VMEM_DATA_WIDTH]; + wire [VMEM_DATA_WIDTH-1:0] acc_shifter_out_l = acc_shifter_out[VMEM_DATA_WIDTH-1:0]; + `endif + assign vmem_rd_en = rd_en; assign vmem_wr_en = (reg_sel == REG_VMEM) && wr_en; assign rd_data = (reg_sel == REG_VMEM) ? vmem_rd_data : (reg_sel == REG_RD_ADDR) ? cpu_rd_addr : (reg_sel == REG_WR_ADDR) ? cpu_wr_addr : (reg_sel == REG_CTL) ? status : +`ifdef ENABLE_FB_ACCEL + (reg_sel == REG_SHIFTER) ? acc_shifter_out_h: + (reg_sel == REG_SHIFTERM) ? acc_shifter_mask : + (reg_sel == REG_SHIFTERSP) ? acc_shifter_out_l : + (reg_sel == REG_MASKGEN) ? acc_mask_out : + `endif 32'hFFFFFFFF; wire [VMEM_ADDR_WIDTH-1:0] cpu_addr = vmem_wr_en ? 
cpu_wr_addr : cpu_rd_addr; @@ -271,6 +300,71 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( if(rd_en && reg_sel == REG_VMEM) cpu_rd_addr <= cpu_rd_addr + 1; // auto-increment read addr on read end +`ifdef ENABLE_FB_ACCEL + // + // shifter/masker registers + // + always @(posedge cpu_clk) + begin + if(wr_en && reg_sel == REG_SHIFTER) + acc_shifter_in <= { wr_data, {32{1'b0}}}; + end + + always @(posedge cpu_clk) + begin + if(wr_en && reg_sel == REG_SHIFTCOUNT) + begin + acc_shift_count <= wr_data[2:0]; + acc_start_shift <= 1; + end + + if(acc_start_shift) + acc_start_shift <= 0; + end + + always @(posedge cpu_clk) + begin + if (acc_start_shift) + begin + acc_shifter_out <= {acc_shifter_in, {VMEM_DATA_WIDTH{1'b0}}} >> acc_shift_count; + end + end + + // mask register + always @(posedge cpu_clk) + begin + if (wr_en && reg_sel == REG_MASKGEN) + begin + acc_mask_in <= wr_data; + end + end + + assign acc_mask_out = { + {4{|{acc_mask_in[31:28]}}}, + {4{|{acc_mask_in[27:24]}}}, + {4{|{acc_mask_in[23:20]}}}, + {4{|{acc_mask_in[19:16]}}}, + {4{|{acc_mask_in[15:12]}}}, + {4{|{acc_mask_in[11:8]}}}, + {4{|{acc_mask_in[7:4]}}}, + {4{|{acc_mask_in[3:0]}}} + }; + + assign acc_shifter_mask = { + {4{|{acc_shifter_out_h[31:28]}}}, + {4{|{acc_shifter_out_h[27:24]}}}, + {4{|{acc_shifter_out_h[23:20]}}}, + {4{|{acc_shifter_out_h[19:16]}}}, + {4{|{acc_shifter_out_h[15:12]}}}, + {4{|{acc_shifter_out_h[11:8]}}}, + {4{|{acc_shifter_out_h[7:4]}}}, + {4{|{acc_shifter_out_h[3:0]}}} + }; +`endif + + // + // shifting pixels at pixel clock + // always @(posedge pix_clk) begin if(scanline || shift_count == MAX_SHIFT_COUNT) // before start of a line diff --git a/tridoracpu/tridoracpu.xpr b/tridoracpu/tridoracpu.xpr index a3dd3f6..2926f59 100644 --- a/tridoracpu/tridoracpu.xpr +++ b/tridoracpu/tridoracpu.xpr @@ -376,7 +376,7 @@ - + @@ -389,7 +389,6 @@ - From 937369f60b5349fe7d8f71b6f6f4b3d4190e3e9f Mon Sep 17 00:00:00 2001 From: slederer Date: Wed, 28 Jan 2026 01:15:16 +0100 
Subject: [PATCH 02/12] lib,examples: changes for new register address mapping --- examples/fastfire.s | 10 +++++----- examples/sprites.s | 10 +++++----- lib/corelib.s | 10 +++++----- lib/pcmaudio.s | 10 +++++----- tridoracpu/tridoracpu.srcs/vgafb.v | 10 +++------- tridoracpu/tridoracpu.xpr | 3 ++- 6 files changed, 25 insertions(+), 28 deletions(-) diff --git a/examples/fastfire.s b/examples/fastfire.s index f0e10e4..63ace51 100644 --- a/examples/fastfire.s +++ b/examples/fastfire.s @@ -123,11 +123,11 @@ FF_EXIT: ; framebuffer controller registers .EQU FB_RA $900 - .EQU FB_WA $901 - .EQU FB_IO $902 - .EQU FB_PS $903 - .EQU FB_PD $904 - .EQU FB_CTL $905 + .EQU FB_WA $904 + .EQU FB_IO $908 + .EQU FB_PS $90C + .EQU FB_PD $910 + .EQU FB_CTL $914 .EQU WORDS_PER_LINE 80 ; fire width in vmem words (strict left-to-right evaluation) diff --git a/examples/sprites.s b/examples/sprites.s index 3391339..6962eda 100644 --- a/examples/sprites.s +++ b/examples/sprites.s @@ -3,9 +3,9 @@ .EQU WORDS_PER_LINE 80 .EQU FB_RA $900 - .EQU FB_WA $901 - .EQU FB_IO $902 - .EQU FB_PS $903 + .EQU FB_WA $904 + .EQU FB_IO $908 + .EQU FB_PS $90C ; calculate mask for a word of pixels ; args: word of pixels with four bits per pixel @@ -95,7 +95,7 @@ PS_LOOP1: ; in the vga controller LOADC FB_RA ; read address register LOAD PS_VMEM_ADDR - STOREI 1 ; use autoincrement to get to the next register + STOREI 4 ; use autoincrement to get to the next register LOAD PS_VMEM_ADDR STOREI DROP @@ -322,7 +322,7 @@ UD_S_L1: ; store vmem offset into write addr reg LOADCP FB_WA LOAD UD_S_OFFSET - STOREI 1 ; ugly but fast: reuse addr + STOREI 4 ; ugly but fast: reuse addr ; with postincrement to ; get to FB_IO for STOREI below diff --git a/lib/corelib.s b/lib/corelib.s index d147934..a21b95c 100644 --- a/lib/corelib.s +++ b/lib/corelib.s @@ -701,11 +701,11 @@ CMPWORDS_XT2: ; --------- Graphics Library --------------- ; vga controller registers .EQU FB_RA $900 - .EQU FB_WA $901 - .EQU FB_IO $902 - .EQU FB_PS $903 - 
.EQU FB_PD $904 - .EQU FB_CTL $905 + .EQU FB_WA $904 + .EQU FB_IO $908 + .EQU FB_PS $90C + .EQU FB_PD $910 + .EQU FB_CTL $914 ; set a pixel in fb memory ; parameters: x,y - coordinates PUTPIXEL_1BPP: diff --git a/lib/pcmaudio.s b/lib/pcmaudio.s index 530f52f..ebe812a 100644 --- a/lib/pcmaudio.s +++ b/lib/pcmaudio.s @@ -11,9 +11,9 @@ START_PCMAUDIO: LOADCP _DIV CALL - LOADC AUDIO_BASE + 1 + LOADC AUDIO_BASE + 4 SWAP ; put clock divider on ToS - STOREI 1 + STOREI 4 LOADCP 32768 ; set amplitude to biased 0 STOREI DROP @@ -95,7 +95,7 @@ PLAY1_L0: AND CBRANCH.NZ PLAY1_L0 ; loop if fifo is full - LOADC AUDIO_BASE+2 ; store amplitude value + LOADC AUDIO_BASE+8 ; store amplitude value SWAP STOREI DROP @@ -207,7 +207,7 @@ SMPLQ_I_B: LOADCP $FFFF AND - LOADC AUDIO_BASE+2 + LOADC AUDIO_BASE+8 SWAP STOREI ; write sample, keep addr @@ -281,7 +281,7 @@ SMPLQ_I_END1: DROP ; set amplitude out to zero (biased) - LOADC AUDIO_BASE+2 + LOADC AUDIO_BASE+8 LOADCP 32768 STOREI DROP diff --git a/tridoracpu/tridoracpu.srcs/vgafb.v b/tridoracpu/tridoracpu.srcs/vgafb.v index 408079a..2d6bc55 100644 --- a/tridoracpu/tridoracpu.srcs/vgafb.v +++ b/tridoracpu/tridoracpu.srcs/vgafb.v @@ -132,9 +132,9 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( `ifdef ENABLE_FB_ACCEL localparam REG_SHIFTER = 6; localparam REG_SHIFTCOUNT = 7; - localparam REG_SHIFTERM = 9; - localparam REG_SHIFTERSP = 10; - localparam REG_MASKGEN = 11; + localparam REG_SHIFTERM = 8; + localparam REG_SHIFTERSP = 09; + localparam REG_MASKGEN = 10; `endif localparam COLOR_WIDTH = 12; @@ -325,18 +325,14 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( always @(posedge cpu_clk) begin if (acc_start_shift) - begin acc_shifter_out <= {acc_shifter_in, {VMEM_DATA_WIDTH{1'b0}}} >> acc_shift_count; - end end // mask register always @(posedge cpu_clk) begin if (wr_en && reg_sel == REG_MASKGEN) - begin acc_mask_in <= wr_data; - end end assign acc_mask_out = { diff --git a/tridoracpu/tridoracpu.xpr 
b/tridoracpu/tridoracpu.xpr index 2926f59..a3dd3f6 100644 --- a/tridoracpu/tridoracpu.xpr +++ b/tridoracpu/tridoracpu.xpr @@ -376,7 +376,7 @@ - + @@ -389,6 +389,7 @@ + From 042a18fc9b7ecd5b93b88a1f1a3ea4b633302b4b Mon Sep 17 00:00:00 2001 From: slederer Date: Thu, 29 Jan 2026 01:53:35 +0100 Subject: [PATCH 03/12] vgafb: bugfixes, change synthesis optimization settings --- tridoracpu/tridoracpu.srcs/vgafb.v | 6 +++--- tridoracpu/tridoracpu.xpr | 34 ++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/tridoracpu/tridoracpu.srcs/vgafb.v b/tridoracpu/tridoracpu.srcs/vgafb.v index 2d6bc55..411e956 100644 --- a/tridoracpu/tridoracpu.srcs/vgafb.v +++ b/tridoracpu/tridoracpu.srcs/vgafb.v @@ -159,7 +159,7 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( `ifdef ENABLE_FB_ACCEL reg [VMEM_DATA_WIDTH-1:0] acc_shifter_in; reg [(VMEM_DATA_WIDTH*2)-1:0] acc_shifter_out; - reg [2:0] acc_shift_count; + reg [4:0] acc_shift_count; reg acc_start_shift; reg [VMEM_DATA_WIDTH-1:0] acc_mask_in; wire [VMEM_DATA_WIDTH-1:0] acc_mask_out; @@ -307,14 +307,14 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( always @(posedge cpu_clk) begin if(wr_en && reg_sel == REG_SHIFTER) - acc_shifter_in <= { wr_data, {32{1'b0}}}; + acc_shifter_in <= wr_data; end always @(posedge cpu_clk) begin if(wr_en && reg_sel == REG_SHIFTCOUNT) begin - acc_shift_count <= wr_data[2:0]; + acc_shift_count <= { wr_data[2:0], 2'b0}; acc_start_shift <= 1; end diff --git a/tridoracpu/tridoracpu.xpr b/tridoracpu/tridoracpu.xpr index a3dd3f6..4d21f83 100644 --- a/tridoracpu/tridoracpu.xpr +++ b/tridoracpu/tridoracpu.xpr @@ -356,10 +356,16 @@ - + - - + + Performs optimizations which creates alternative logic technology mapping, including disabling LUT combining, forcing F7/F8/F9 to logic, increasing the threshold of shift register inference. + + + + + + @@ -376,16 +382,26 @@ - + - + + Best predicted directive for place_design. 
+ - + + + - + + + - - + + + + + + From 8900eb90be47d2d3eceb5b2e2b5417ce58b24993 Mon Sep 17 00:00:00 2001 From: slederer Date: Sat, 31 Jan 2026 02:31:00 +0100 Subject: [PATCH 04/12] corelib: new putpixel routine using shifter/maskgen --- lib/corelib.s | 190 +++++++++----------------------------------------- 1 file changed, 34 insertions(+), 156 deletions(-) diff --git a/lib/corelib.s b/lib/corelib.s index a21b95c..b228d20 100644 --- a/lib/corelib.s +++ b/lib/corelib.s @@ -706,108 +706,32 @@ CMPWORDS_XT2: .EQU FB_PS $90C .EQU FB_PD $910 .EQU FB_CTL $914 -; set a pixel in fb memory -; parameters: x,y - coordinates -PUTPIXEL_1BPP: - ; calculate vmem address: - OVER ; duplicate x - ; divide x by 32 - SHR - SHR - SHR - SHR - SHR - SWAP - ; multiply y by words per line - SHL 2 - SHL 2 - SHL + .EQU FB_SHIFTER $918 + .EQU FB_SHIFTCOUNT $91C + .EQU FB_SHIFTERM $920 + .EQU FB_SHIFTERSP $924 + .EQU FB_MASKGEN $928 - ADD ; add results together for vmem addr +; draw a single pixel +; args: x, y, color - DUP - LOADCP FB_WA - SWAP - STOREI ; store to framebuffer write addr register - DROP - LOADCP FB_RA ; and to framebuffer read addr register - SWAP - STOREI - DROP - - ; x is now at top of stack - ; get bit value from x modulo 32 - LOADC 31 - AND - SHL 2 ; (x & 31) * 4 = offset into table - LOADCP INT_TO_PIX_TABLE - ADD - LOADI - - LOADCP FB_IO - ; read old vmem value - LOADCP FB_IO - LOADI - ; or in new bit - OR - ; write new value - STOREI - DROP - - RET - -INT_TO_PIX_TABLE: - .WORD %10000000_00000000_00000000_00000000 - .WORD %01000000_00000000_00000000_00000000 - .WORD %00100000_00000000_00000000_00000000 - .WORD %00010000_00000000_00000000_00000000 - .WORD %00001000_00000000_00000000_00000000 - .WORD %00000100_00000000_00000000_00000000 - .WORD %00000010_00000000_00000000_00000000 - .WORD %00000001_00000000_00000000_00000000 - .WORD %00000000_10000000_00000000_00000000 - .WORD %00000000_01000000_00000000_00000000 - .WORD %00000000_00100000_00000000_00000000 - .WORD 
%00000000_00010000_00000000_00000000 - .WORD %00000000_00001000_00000000_00000000 - .WORD %00000000_00000100_00000000_00000000 - .WORD %00000000_00000010_00000000_00000000 - .WORD %00000000_00000001_00000000_00000000 - .WORD %00000000_00000000_10000000_00000000 - .WORD %00000000_00000000_01000000_00000000 - .WORD %00000000_00000000_00100000_00000000 - .WORD %00000000_00000000_00010000_00000000 - .WORD %00000000_00000000_00001000_00000000 - .WORD %00000000_00000000_00000100_00000000 - .WORD %00000000_00000000_00000010_00000000 - .WORD %00000000_00000000_00000001_00000000 - .WORD %00000000_00000000_00000000_10000000 - .WORD %00000000_00000000_00000000_01000000 - .WORD %00000000_00000000_00000000_00100000 - .WORD %00000000_00000000_00000000_00010000 - .WORD %00000000_00000000_00000000_00001000 - .WORD %00000000_00000000_00000000_00000100 - .WORD %00000000_00000000_00000000_00000010 - .WORD %00000000_00000000_00000000_00000001 - -PUTMPIXEL: - LOADC 1 -; set a pixel in fb memory -; parameters: x,y,color - coordinates, color value (0-15) PUTPIXEL: PUTPIXEL_4BPP: .EQU PUTPIXEL_X 0 .EQU PUTPIXEL_Y 4 .EQU PUTPIXEL_COLOR 8 - .EQU PUTPIXEL_PIXPOS 12 + .EQU PUTPIXEL_BPSAV 12 .EQU PUTPIXEL_FS 16 FPADJ -PUTPIXEL_FS - STORE PUTPIXEL_COLOR STORE PUTPIXEL_Y STORE PUTPIXEL_X + LOADREG BP + STORE PUTPIXEL_BPSAV + LOADC 0 + STOREREG BP ; calculate vmem address: (x / 8) + (y * 80) LOAD PUTPIXEL_X @@ -826,83 +750,37 @@ PUTPIXEL_4BPP: ADD ; add results together for vmem addr - LOADCP FB_WA - OVER - STOREI ; store to framebuffer write addr register - DROP - LOADCP FB_RA ; and to framebuffer read addr register - SWAP ; swap addr and value for STOREI - STOREI - DROP - - LOAD PUTPIXEL_X - ; |0000.0000|0000.0000|0000.0000|0000.1111| - LOADC 7 - AND ; calculate pixel position in word - LOADC 7 - SWAP - SUB ; pixpos = 7 - (x & 7) - STORE PUTPIXEL_PIXPOS + DUP + STORE.B FB_WA ; set as write and read addresses + STORE.B FB_RA + ; create pixel data from color value in + ; leftmost pixel data bits 
(31-28) LOAD PUTPIXEL_COLOR - LOAD PUTPIXEL_PIXPOS - SHR ; rcount = pixpos / 2 -ROTLOOP_: - DUP ; exit loop if rcount is 0 - CBRANCH.Z ROTLOOP_END - SWAP ; pixel value is now on top of stack - BROT ; value = value << 8 - SWAP ; rcount is now on top of stack - DEC 1 ; rcount = rcount - 1 - BRANCH ROTLOOP_ -ROTLOOP_END: - DROP ; drop rcount - ; shifted pixel value is now at top of stack - LOAD PUTPIXEL_PIXPOS - LOADC 1 - AND - CBRANCH.Z EVEN_PIXPOS - SHL 2 ; if pixpos is odd, shift by 4 bits + BROT + BROT + BROT SHL 2 -EVEN_PIXPOS: - LOAD PUTPIXEL_X - ; get bit value from x modulo 8 - LOADC 7 - AND - SHL 2 ; (x & 7) * 4 = offset into table - LOADCP INT_TO_MASK_TABLE - ADD - LOADI + SHL 2 + STORE.B FB_SHIFTER ; store pixel into shifter - ; read old vmem value - LOADCP FB_IO - LOADI - ; mask bits - AND - ; or in shifted pixel value - OR + LOAD PUTPIXEL_X ; use x coord as shift count + STORE.B FB_SHIFTCOUNT ; writing triggers shifting - ; write new value - LOADCP FB_IO - SWAP - STOREI - DROP + LOAD.B FB_SHIFTERM ; get shift result as mask + LOAD.B FB_IO ; get background pixel data + AND ; remove bits for new pixel from bg + + LOAD.B FB_SHIFTER ; load shifted pixel + OR ; OR in new pixel bits + STORE.B FB_IO ; write new pixel data word to vmem + + LOAD PUTPIXEL_BPSAV + STOREREG BP FPADJ PUTPIXEL_FS RET - .CPOOL - -INT_TO_MASK_TABLE: - .WORD %00001111_11111111_11111111_11111111 - .WORD %11110000_11111111_11111111_11111111 - .WORD %11111111_00001111_11111111_11111111 - .WORD %11111111_11110000_11111111_11111111 - .WORD %11111111_11111111_00001111_11111111 - .WORD %11111111_11111111_11110000_11111111 - .WORD %11111111_11111111_11111111_00001111 - .WORD %11111111_11111111_11111111_11110000 - ; draw a line between two points ; parameters: x0, y0, x1, y1, color .EQU DL_X0 0 From 1e56251fc1417ff53f2444c578e4a3323400c0c9 Mon Sep 17 00:00:00 2001 From: slederer Date: Sat, 31 Jan 2026 17:24:36 +0100 Subject: [PATCH 05/12] vgafb: buffer maskgen outputs to avoid timing problems --- 
lib/corelib.s | 5 ++- tridoracpu/tridoracpu.srcs/vgafb.v | 57 +++++++++++++++++------------- tridoracpu/tridoracpu.xpr | 8 ++--- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/lib/corelib.s b/lib/corelib.s index b228d20..93dc81f 100644 --- a/lib/corelib.s +++ b/lib/corelib.s @@ -756,10 +756,9 @@ PUTPIXEL_4BPP: ; create pixel data from color value in ; leftmost pixel data bits (31-28) + LOADC 0 LOAD PUTPIXEL_COLOR - BROT - BROT - BROT + BPLC SHL 2 SHL 2 STORE.B FB_SHIFTER ; store pixel into shifter diff --git a/tridoracpu/tridoracpu.srcs/vgafb.v b/tridoracpu/tridoracpu.srcs/vgafb.v index 411e956..fd42627 100644 --- a/tridoracpu/tridoracpu.srcs/vgafb.v +++ b/tridoracpu/tridoracpu.srcs/vgafb.v @@ -162,10 +162,12 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( reg [4:0] acc_shift_count; reg acc_start_shift; reg [VMEM_DATA_WIDTH-1:0] acc_mask_in; - wire [VMEM_DATA_WIDTH-1:0] acc_mask_out; - wire [VMEM_DATA_WIDTH-1:0] acc_shifter_mask; + reg [VMEM_DATA_WIDTH-1:0] acc_mask_buf; + reg [VMEM_DATA_WIDTH-1:0] acc_shiftmask_buf; + wire [VMEM_DATA_WIDTH-1:0] acc_shifter_mask = acc_shiftmask_buf; wire [VMEM_DATA_WIDTH-1:0] acc_shifter_out_h = acc_shifter_out[(VMEM_DATA_WIDTH*2)-1:VMEM_DATA_WIDTH]; wire [VMEM_DATA_WIDTH-1:0] acc_shifter_out_l = acc_shifter_out[VMEM_DATA_WIDTH-1:0]; + `endif assign vmem_rd_en = rd_en; @@ -176,9 +178,9 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( (reg_sel == REG_CTL) ? status : `ifdef ENABLE_FB_ACCEL (reg_sel == REG_SHIFTER) ? acc_shifter_out_h: - (reg_sel == REG_SHIFTERM) ? acc_shifter_mask : + (reg_sel == REG_SHIFTERM) ? acc_shiftmask_buf : (reg_sel == REG_SHIFTERSP) ? acc_shifter_out_l : - (reg_sel == REG_MASKGEN) ? acc_mask_out : + (reg_sel == REG_MASKGEN) ? 
acc_mask_buf : `endif 32'hFFFFFFFF; @@ -335,27 +337,34 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( acc_mask_in <= wr_data; end - assign acc_mask_out = { - {4{|{acc_mask_in[31:28]}}}, - {4{|{acc_mask_in[27:24]}}}, - {4{|{acc_mask_in[23:20]}}}, - {4{|{acc_mask_in[19:16]}}}, - {4{|{acc_mask_in[15:12]}}}, - {4{|{acc_mask_in[11:8]}}}, - {4{|{acc_mask_in[7:4]}}}, - {4{|{acc_mask_in[3:0]}}} - }; + // mask output is buffered to avoid timing problems + always @(posedge cpu_clk) + begin + acc_mask_buf <= { + {4{~|{acc_mask_in[31:28]}}}, + {4{~|{acc_mask_in[27:24]}}}, + {4{~|{acc_mask_in[23:20]}}}, + {4{~|{acc_mask_in[19:16]}}}, + {4{~|{acc_mask_in[15:12]}}}, + {4{~|{acc_mask_in[11:8]}}}, + {4{~|{acc_mask_in[7:4]}}}, + {4{~|{acc_mask_in[3:0]}}} + }; + end - assign acc_shifter_mask = { - {4{|{acc_shifter_out_h[31:28]}}}, - {4{|{acc_shifter_out_h[27:24]}}}, - {4{|{acc_shifter_out_h[23:20]}}}, - {4{|{acc_shifter_out_h[19:16]}}}, - {4{|{acc_shifter_out_h[15:12]}}}, - {4{|{acc_shifter_out_h[11:8]}}}, - {4{|{acc_shifter_out_h[7:4]}}}, - {4{|{acc_shifter_out_h[3:0]}}} - }; + always @(posedge cpu_clk) + begin + acc_shiftmask_buf <= { + {4{~|{acc_shifter_out_h[31:28]}}}, + {4{~|{acc_shifter_out_h[27:24]}}}, + {4{~|{acc_shifter_out_h[23:20]}}}, + {4{~|{acc_shifter_out_h[19:16]}}}, + {4{~|{acc_shifter_out_h[15:12]}}}, + {4{~|{acc_shifter_out_h[11:8]}}}, + {4{~|{acc_shifter_out_h[7:4]}}}, + {4{~|{acc_shifter_out_h[3:0]}}} + }; + end `endif // diff --git a/tridoracpu/tridoracpu.xpr b/tridoracpu/tridoracpu.xpr index 4d21f83..a088319 100644 --- a/tridoracpu/tridoracpu.xpr +++ b/tridoracpu/tridoracpu.xpr @@ -358,9 +358,7 @@ - - Performs optimizations which creates alternative logic technology mapping, including disabling LUT combining, forcing F7/F8/F9 to logic, increasing the threshold of shift register inference. - + @@ -384,9 +382,7 @@ - - Best predicted directive for place_design. 
- + From c119a2a5bb25a12f6fc13ca1b9f0f43c9ba8703e Mon Sep 17 00:00:00 2001 From: slederer Date: Sat, 31 Jan 2026 17:26:13 +0100 Subject: [PATCH 06/12] add line/points drawing benchmark --- examples/graphbench.pas | 92 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 examples/graphbench.pas diff --git a/examples/graphbench.pas b/examples/graphbench.pas new file mode 100644 index 0000000..327e72e --- /dev/null +++ b/examples/graphbench.pas @@ -0,0 +1,92 @@ +program graphbench; +var starttime,endtime:DateTime; + +procedure startBench(name:string); +begin + write(name:20, ' '); + starttime := GetTime; +end; + +procedure endBench; +var secDelta, minDelta, hourDelta:integer; + procedure write2Digits(i:integer); + begin + if i < 10 then + write('0'); + write(i); + end; +begin + endTime := GetTime; + + hourDelta := endtime.hours - starttime.hours; + minDelta := endtime.minutes - starttime.minutes; + secDelta := endtime.seconds - starttime.seconds; + + if secDelta < 0 then + begin + secDelta := 60 + secDelta; + minDelta := minDelta - 1; + end; + + if minDelta < 0 then + begin + minDelta := 60 + minDelta; + hourDelta := hourDelta - 1; + end; + + write2Digits(hourDelta); + write(':'); write2Digits(minDelta); + write(':'); write2Digits(secDelta); + writeln; +end; + +function randint(lessthan:integer):integer; +var r:integer; +begin + r := random and 511; + if r >= lessthan then + r := r - lessthan; + randint := r; +end; + +procedure drawlines(count:integer); +var i,col,x1,y1,x2,y2:integer; +begin + col := 1; + for i := 1 to count do + begin + x1 := randint(500); + y1 := randint(400); + x2 := randint(500); + y2 := randint(400); + DrawLine(x1,y1,x2,y2,col); + col := col + 1; + if col > 15 then col := 1; + end; +end; + +procedure drawpoints(count:integer); +var i,col,x,y:integer; +begin + col := 1; + for i := 1 to count do + begin + x := randint(500); + y := randint(400); + PutPixel(x,y,col); + col := col + 1; + if col > 15 then col := 
1; + end; +end; + +begin + InitGraphics; + startBench('200K points'); + drawpoints(200000); + endBench; + + InitGraphics; + startBench('10K lines'); + drawlines(10000); + endBench; +end. From 66a50d5ea86bb28891476fd41b485480cfefac31 Mon Sep 17 00:00:00 2001 From: slederer Date: Sun, 1 Feb 2026 00:44:34 +0100 Subject: [PATCH 07/12] update sprites unit to use shifter/maskgen --- examples/graphbench.pas | 39 +++++++- examples/sprites.s | 204 ++++++++++------------------------------ 2 files changed, 86 insertions(+), 157 deletions(-) diff --git a/examples/graphbench.pas b/examples/graphbench.pas index 327e72e..9abbfba 100644 --- a/examples/graphbench.pas +++ b/examples/graphbench.pas @@ -1,5 +1,17 @@ program graphbench; +uses sprites; + var starttime,endtime:DateTime; + spriteData:SpritePixels; + +procedure readSpriteData(filename:string); +var f:file; +begin + open(f,filename,ModeReadOnly); + seek(f,8); (* skip file header *) + read(f,spriteData); + close(f); +end; procedure startBench(name:string); begin @@ -13,7 +25,7 @@ var secDelta, minDelta, hourDelta:integer; begin if i < 10 then write('0'); - write(i); + write(i); end; begin endTime := GetTime; @@ -49,6 +61,20 @@ begin randint := r; end; +procedure drawsprites(count:integer); +var i,col,x,y:integer; +begin + col := 1; + for i := 1 to count do + begin + x := randint(350); + y := randint(350); + PutSprite(x,y,spriteData); + col := col + 1; + if col > 15 then col := 1; + end; +end; + procedure drawlines(count:integer); var i,col,x1,y1,x2,y2:integer; begin @@ -80,13 +106,20 @@ begin end; begin + readSpriteData('rocket.sprt'); + InitGraphics; - startBench('200K points'); + startBench('points 200K'); drawpoints(200000); endBench; InitGraphics; - startBench('10K lines'); + startBench('lines 10K'); drawlines(10000); endBench; + + InitGraphics; + startBench('sprites 50K'); + drawsprites(50000); + endBench; end. 
diff --git a/examples/sprites.s b/examples/sprites.s index 6962eda..ab2e580 100644 --- a/examples/sprites.s +++ b/examples/sprites.s @@ -6,28 +6,13 @@ .EQU FB_WA $904 .EQU FB_IO $908 .EQU FB_PS $90C - -; calculate mask for a word of pixels -; args: word of pixels with four bits per pixel -; returns: value that masks out all pixels that are set -CALC_MASK: - LOADC $F ; pixel mask -C_M_L0: - SWAP ; swap mask and pixels value - AND.S1.X2Y ; isolate one pixel, keep args - CBRANCH.Z C_M_L1 ; if pixel is zero, dont set mask bits - OVER ; copy current mask - OR ; or into pixels value -C_M_L1: - SWAP ; swap back, ToS is now mask bits - SHL 2 ; shift mask for next pixel to the left - SHL 2 - - DUP - CBRANCH.NZ C_M_L0 ; if mask is zero, we are done - DROP ; remove mask bits - NOT ; invert result - RET + .EQU FB_PD $910 + .EQU FB_CTL $914 + .EQU FB_SHIFTER $918 + .EQU FB_SHIFTCOUNT $91C + .EQU FB_SHIFTERM $920 + .EQU FB_SHIFTERSP $924 + .EQU FB_MASKGEN $928 ; calculate vmem address from coordinates ; args: x,y @@ -67,13 +52,19 @@ CALC_VMEM_ADDR: .EQU PS_SHIFT_C 20 .EQU PS_SPILL 24 .EQU PS_STRIPE_C 28 - .EQU PS_FS 32 + .EQU PS_BPSAVE 32 + .EQU PS_FS 36 PUTSPRITE: FPADJ -PS_FS STORE PS_SPRITE_DATA STORE PS_Y STORE PS_X + LOADREG BP + STORE PS_BPSAVE + LOADC 0 + STOREREG BP + ; calculate vmem address LOAD PS_X LOAD PS_Y @@ -81,11 +72,6 @@ PUTSPRITE: CALL STORE PS_VMEM_ADDR - LOAD PS_X ; shift count = x mod 8 - LOADC 7 - AND - STORE PS_SHIFT_C - LOADC SPRITE_HEIGHT STORE PS_SPRITE_LINES @@ -93,12 +79,10 @@ PUTSPRITE: PS_LOOP1: ; set read and write address ; in the vga controller - LOADC FB_RA ; read address register LOAD PS_VMEM_ADDR - STOREI 4 ; use autoincrement to get to the next register - LOAD PS_VMEM_ADDR - STOREI - DROP + DUP + STORE.B FB_RA + STORE.B FB_WA LOAD PS_SPRITE_DATA ; address of sprite data DUP @@ -106,61 +90,19 @@ PS_LOOP1: STORE PS_SPRITE_DATA ; and store it again LOADI ; load word from orig. 
address + ; ------- one word of sprite pixels on stack - LOADC 0 - STORE PS_SPILL + STORE.B FB_SHIFTER + LOAD PS_X + STORE.B FB_SHIFTCOUNT - ; loop to shift pixel data to right - LOAD PS_SHIFT_C ; load shift count -PS_LOOP2: - DUP ; test it for zero - CBRANCH.Z PS_LOOP2_X + LOAD.B FB_SHIFTERM ; get shifted mask + LOAD.B FB_IO ; and background pixel data + AND ; remove foreground pixels - SWAP ; swap count with pixels - - ; save the pixel that is shifted out - LOADC $F ; mask the four bits - AND.S0 ; keep original value on stack - BROT ; and move them to MSB - BROT - BROT - SHL 2 - SHL 2 ; shift by 28 in total - - LOAD PS_SPILL ; load spill bits - SHR ; shift by four to make space - SHR - SHR - SHR - OR ; or with orig value - STORE PS_SPILL ; store new value - - SHR ; shift pixels right - SHR ; four bits per pixel - SHR - SHR - - SWAP ; swap back, count now ToS - DEC 1 - BRANCH PS_LOOP2 -PS_LOOP2_X: - DROP ; remove shift count, shifted pixels now in ToS - - DUP - LOADCP CALC_MASK ; calculate sprite mask for this word - CALL - - LOADCP FB_IO ; address of the i/o register - LOADI ; read word from video mem - - AND ; and word with mask - - OR ; OR sprite data with original pixels - - LOADCP FB_IO - SWAP - STOREI ; store result into i/o reg - DROP + LOAD.B FB_SHIFTER ; get shifted pixels + OR ; combine with background + STORE.B FB_IO ; store into vmem ; set counter for remaining stripes LOADC SPRITE_STRIPES - 1 @@ -170,8 +112,8 @@ PS_LOOP2_X: ; process spilled bits and next vertical stripe of sprite data ; PS_NEXT_STRIPE: - ; put spill bits on stack for later - LOAD PS_SPILL + ;use spill bits from first column + LOAD.B FB_SHIFTERSP LOAD PS_SPRITE_DATA ; address of sprite data DUP @@ -179,65 +121,20 @@ PS_NEXT_STRIPE: STORE PS_SPRITE_DATA ; and store it again LOADI ; load word from orig. 
address - ; reset spill bits - LOADC 0 - STORE PS_SPILL - - ; last spill bits are on ToS now - - ; shift pixel data to right - LOAD PS_SHIFT_C ; load shift count -PS_LOOP3: ; test it for zero + STORE.B FB_SHIFTER ; store into shifter + LOAD PS_X + STORE.B FB_SHIFTCOUNT ; shift stuff + LOAD.B FB_SHIFTER ; get shifted pixels + OR ; combine with spill bits (see above) DUP - CBRANCH.Z PS_LOOP3_X + STORE.B FB_MASKGEN ; store to mask reg to get new mask - SWAP ; swap count with pixels + LOAD.B FB_MASKGEN ; get mask for spill bits + shifted pixels + LOAD.B FB_IO ; get vmem data + AND ; remove foreground pixels from bg - ; save the pixel that is shifted out - LOADC $F ; mask the four bits - AND.S0 ; keep original value on stack - BROT ; and move them to MSB - BROT - BROT - SHL 2 - SHL 2 ; shift by 28 in total - - LOAD PS_SPILL ; load spill bits - SHR ; shift by four to make space - SHR - SHR - SHR - OR ; or with orig value - STORE PS_SPILL ; store new value - - SHR ; shift pixels right - SHR ; four bits per pixel - SHR - SHR - - SWAP ; swap back, count now ToS - DEC 1 - BRANCH PS_LOOP3 -PS_LOOP3_X: - DROP ; remove shift count, shifted pixels now in ToS - - OR ; or together with spill bits - - DUP - LOADCP CALC_MASK ; calculate sprite mask - CALL - - LOADCP FB_IO ; load original pixels - LOADI - - AND ; and with mask - - OR ; or together with original pixels - - LOADCP FB_IO - SWAP - STOREI - DROP + OR ; combine with shifted pixels + STORE.B FB_IO ; write to vmem LOAD PS_STRIPE_C ; decrement stripe count DEC 1 @@ -246,22 +143,18 @@ PS_LOOP3_X: CBRANCH.NZ PS_NEXT_STRIPE ; if non-zero, next stripe ; write spilled bits of the last stripe into next vmem word - LOAD PS_SPILL ; get spill bits + LOAD.B FB_SHIFTERSP ; get spill bits DUP - LOADCP CALC_MASK ; calculate sprite mask for spill bits - CALL + STORE.B FB_MASKGEN + LOAD.B FB_MASKGEN ; get sprite mask for spill bits - LOADCP FB_IO - LOADI ; load next vmem word + LOAD.B FB_IO ; load next vmem word AND ; apply sprite mask OR ; 
OR in spill bits - LOADCP FB_IO - SWAP ; swap pixels and addr - STOREI ; write back - DROP - + STORE.B FB_IO ; write to vmem + LOAD PS_SPRITE_LINES ; decrement lines count DEC 1 DUP @@ -275,7 +168,10 @@ PS_LOOP3_X: BRANCH PS_LOOP1 PS_L_XT: DROP - + + LOAD PS_BPSAVE + STOREREG BP + FPADJ PS_FS RET From bf813fac1d43250d26eac49c1a2847507fee2919 Mon Sep 17 00:00:00 2001 From: slederer Date: Sun, 1 Feb 2026 11:52:16 +0100 Subject: [PATCH 08/12] corelib: revert PUTPIXEL changes - changes to corelib made sdcard i/o unstable for unknown reasons and the performance improvement for PUTPIXEL was only about 10% --- lib/corelib.s | 189 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 156 insertions(+), 33 deletions(-) diff --git a/lib/corelib.s b/lib/corelib.s index 93dc81f..a21b95c 100644 --- a/lib/corelib.s +++ b/lib/corelib.s @@ -706,32 +706,108 @@ CMPWORDS_XT2: .EQU FB_PS $90C .EQU FB_PD $910 .EQU FB_CTL $914 - .EQU FB_SHIFTER $918 - .EQU FB_SHIFTCOUNT $91C - .EQU FB_SHIFTERM $920 - .EQU FB_SHIFTERSP $924 - .EQU FB_MASKGEN $928 +; set a pixel in fb memory +; parameters: x,y - coordinates +PUTPIXEL_1BPP: + ; calculate vmem address: + OVER ; duplicate x + ; divide x by 32 + SHR + SHR + SHR + SHR + SHR + SWAP + ; multiply y by words per line + SHL 2 + SHL 2 + SHL -; draw a single pixel -; args: x, y, color + ADD ; add results together for vmem addr + DUP + LOADCP FB_WA + SWAP + STOREI ; store to framebuffer write addr register + DROP + LOADCP FB_RA ; and to framebuffer read addr register + SWAP + STOREI + DROP + + ; x is now at top of stack + ; get bit value from x modulo 32 + LOADC 31 + AND + SHL 2 ; (x & 31) * 4 = offset into table + LOADCP INT_TO_PIX_TABLE + ADD + LOADI + + LOADCP FB_IO + ; read old vmem value + LOADCP FB_IO + LOADI + ; or in new bit + OR + ; write new value + STOREI + DROP + + RET + +INT_TO_PIX_TABLE: + .WORD %10000000_00000000_00000000_00000000 + .WORD %01000000_00000000_00000000_00000000 + .WORD %00100000_00000000_00000000_00000000 + 
.WORD %00010000_00000000_00000000_00000000 + .WORD %00001000_00000000_00000000_00000000 + .WORD %00000100_00000000_00000000_00000000 + .WORD %00000010_00000000_00000000_00000000 + .WORD %00000001_00000000_00000000_00000000 + .WORD %00000000_10000000_00000000_00000000 + .WORD %00000000_01000000_00000000_00000000 + .WORD %00000000_00100000_00000000_00000000 + .WORD %00000000_00010000_00000000_00000000 + .WORD %00000000_00001000_00000000_00000000 + .WORD %00000000_00000100_00000000_00000000 + .WORD %00000000_00000010_00000000_00000000 + .WORD %00000000_00000001_00000000_00000000 + .WORD %00000000_00000000_10000000_00000000 + .WORD %00000000_00000000_01000000_00000000 + .WORD %00000000_00000000_00100000_00000000 + .WORD %00000000_00000000_00010000_00000000 + .WORD %00000000_00000000_00001000_00000000 + .WORD %00000000_00000000_00000100_00000000 + .WORD %00000000_00000000_00000010_00000000 + .WORD %00000000_00000000_00000001_00000000 + .WORD %00000000_00000000_00000000_10000000 + .WORD %00000000_00000000_00000000_01000000 + .WORD %00000000_00000000_00000000_00100000 + .WORD %00000000_00000000_00000000_00010000 + .WORD %00000000_00000000_00000000_00001000 + .WORD %00000000_00000000_00000000_00000100 + .WORD %00000000_00000000_00000000_00000010 + .WORD %00000000_00000000_00000000_00000001 + +PUTMPIXEL: + LOADC 1 +; set a pixel in fb memory +; parameters: x,y,color - coordinates, color value (0-15) PUTPIXEL: PUTPIXEL_4BPP: .EQU PUTPIXEL_X 0 .EQU PUTPIXEL_Y 4 .EQU PUTPIXEL_COLOR 8 - .EQU PUTPIXEL_BPSAV 12 + .EQU PUTPIXEL_PIXPOS 12 .EQU PUTPIXEL_FS 16 FPADJ -PUTPIXEL_FS + STORE PUTPIXEL_COLOR STORE PUTPIXEL_Y STORE PUTPIXEL_X - LOADREG BP - STORE PUTPIXEL_BPSAV - LOADC 0 - STOREREG BP ; calculate vmem address: (x / 8) + (y * 80) LOAD PUTPIXEL_X @@ -750,36 +826,83 @@ PUTPIXEL_4BPP: ADD ; add results together for vmem addr - DUP - STORE.B FB_WA ; set as write and read addresses - STORE.B FB_RA + LOADCP FB_WA + OVER + STOREI ; store to framebuffer write addr register + DROP + 
LOADCP FB_RA ; and to framebuffer read addr register + SWAP ; swap addr and value for STOREI + STOREI + DROP + + LOAD PUTPIXEL_X + ; |0000.0000|0000.0000|0000.0000|0000.1111| + LOADC 7 + AND ; calculate pixel position in word + LOADC 7 + SWAP + SUB ; pixpos = 7 - (x & 7) + STORE PUTPIXEL_PIXPOS - ; create pixel data from color value in - ; leftmost pixel data bits (31-28) - LOADC 0 LOAD PUTPIXEL_COLOR - BPLC + LOAD PUTPIXEL_PIXPOS + SHR ; rcount = pixpos / 2 +ROTLOOP_: + DUP ; exit loop if rcount is 0 + CBRANCH.Z ROTLOOP_END + SWAP ; pixel value is now on top of stack + BROT ; value = value << 8 + SWAP ; rcount is now on top of stack + DEC 1 ; rcount = rcount - 1 + BRANCH ROTLOOP_ +ROTLOOP_END: + DROP ; drop rcount + ; shifted pixel value is now at top of stack + LOAD PUTPIXEL_PIXPOS + LOADC 1 + AND + CBRANCH.Z EVEN_PIXPOS + SHL 2 ; if pixpos is odd, shift by 4 bits SHL 2 - SHL 2 - STORE.B FB_SHIFTER ; store pixel into shifter +EVEN_PIXPOS: + LOAD PUTPIXEL_X + ; get bit value from x modulo 8 + LOADC 7 + AND + SHL 2 ; (x & 7) * 4 = offset into table + LOADCP INT_TO_MASK_TABLE + ADD + LOADI - LOAD PUTPIXEL_X ; use x coord as shift count - STORE.B FB_SHIFTCOUNT ; writing triggers shifting + ; read old vmem value + LOADCP FB_IO + LOADI + ; mask bits + AND + ; or in shifted pixel value + OR - LOAD.B FB_SHIFTERM ; get shift result as mask - LOAD.B FB_IO ; get background pixel data - AND ; remove bits for new pixel from bg - - LOAD.B FB_SHIFTER ; load shifted pixel - OR ; OR in new pixel bits - STORE.B FB_IO ; write new pixel data word to vmem - - LOAD PUTPIXEL_BPSAV - STOREREG BP + ; write new value + LOADCP FB_IO + SWAP + STOREI + DROP FPADJ PUTPIXEL_FS RET + .CPOOL + +INT_TO_MASK_TABLE: + .WORD %00001111_11111111_11111111_11111111 + .WORD %11110000_11111111_11111111_11111111 + .WORD %11111111_00001111_11111111_11111111 + .WORD %11111111_11110000_11111111_11111111 + .WORD %11111111_11111111_00001111_11111111 + .WORD %11111111_11111111_11110000_11111111 + .WORD 
%11111111_11111111_11111111_00001111 + .WORD %11111111_11111111_11111111_11110000 + ; draw a line between two points ; parameters: x0, y0, x1, y1, color .EQU DL_X0 0 From f90d52926f7a90f52a6a47e858b570ec99a063fe Mon Sep 17 00:00:00 2001 From: slederer Date: Sun, 1 Feb 2026 22:08:06 +0100 Subject: [PATCH 09/12] vgafb: simplify maskgen a bit to avoid timing problems --- examples/sprites.s | 3 +++ tridoracpu/tridoracpu.srcs/vgafb.v | 32 ++++++++++++++-------------- tridoracpu/tridoracpu.xpr | 34 ++++++++++++------------------ utils/tdrimg.py | 1 + 4 files changed, 33 insertions(+), 37 deletions(-) diff --git a/examples/sprites.s b/examples/sprites.s index ab2e580..5f50081 100644 --- a/examples/sprites.s +++ b/examples/sprites.s @@ -97,6 +97,7 @@ PS_LOOP1: STORE.B FB_SHIFTCOUNT LOAD.B FB_SHIFTERM ; get shifted mask + NOT LOAD.B FB_IO ; and background pixel data AND ; remove foreground pixels @@ -130,6 +131,7 @@ PS_NEXT_STRIPE: STORE.B FB_MASKGEN ; store to mask reg to get new mask LOAD.B FB_MASKGEN ; get mask for spill bits + shifted pixels + NOT LOAD.B FB_IO ; get vmem data AND ; remove foreground pixels from bg @@ -147,6 +149,7 @@ PS_NEXT_STRIPE: DUP STORE.B FB_MASKGEN LOAD.B FB_MASKGEN ; get sprite mask for spill bits + NOT LOAD.B FB_IO ; load next vmem word AND ; apply sprite mask diff --git a/tridoracpu/tridoracpu.srcs/vgafb.v b/tridoracpu/tridoracpu.srcs/vgafb.v index fd42627..49dad2d 100644 --- a/tridoracpu/tridoracpu.srcs/vgafb.v +++ b/tridoracpu/tridoracpu.srcs/vgafb.v @@ -341,28 +341,28 @@ module vgafb #(VMEM_ADDR_WIDTH = 15, VMEM_DATA_WIDTH = 32) ( always @(posedge cpu_clk) begin acc_mask_buf <= { - {4{~|{acc_mask_in[31:28]}}}, - {4{~|{acc_mask_in[27:24]}}}, - {4{~|{acc_mask_in[23:20]}}}, - {4{~|{acc_mask_in[19:16]}}}, - {4{~|{acc_mask_in[15:12]}}}, - {4{~|{acc_mask_in[11:8]}}}, - {4{~|{acc_mask_in[7:4]}}}, - {4{~|{acc_mask_in[3:0]}}} + {4{|{acc_mask_in[31:28]}}}, + {4{|{acc_mask_in[27:24]}}}, + {4{|{acc_mask_in[23:20]}}}, + {4{|{acc_mask_in[19:16]}}}, + 
{4{|{acc_mask_in[15:12]}}}, + {4{|{acc_mask_in[11:8]}}}, + {4{|{acc_mask_in[7:4]}}}, + {4{|{acc_mask_in[3:0]}}} }; end always @(posedge cpu_clk) begin acc_shiftmask_buf = { - {4{~|{acc_shifter_out_h[31:28]}}}, - {4{~|{acc_shifter_out_h[27:24]}}}, - {4{~|{acc_shifter_out_h[23:20]}}}, - {4{~|{acc_shifter_out_h[19:16]}}}, - {4{~|{acc_shifter_out_h[15:12]}}}, - {4{~|{acc_shifter_out_h[11:8]}}}, - {4{~|{acc_shifter_out_h[7:4]}}}, - {4{~|{acc_shifter_out_h[3:0]}}} + {4{|{acc_shifter_out_h[31:28]}}}, + {4{|{acc_shifter_out_h[27:24]}}}, + {4{|{acc_shifter_out_h[23:20]}}}, + {4{|{acc_shifter_out_h[19:16]}}}, + {4{|{acc_shifter_out_h[15:12]}}}, + {4{|{acc_shifter_out_h[11:8]}}}, + {4{|{acc_shifter_out_h[7:4]}}}, + {4{|{acc_shifter_out_h[3:0]}}} }; end `endif diff --git a/tridoracpu/tridoracpu.xpr b/tridoracpu/tridoracpu.xpr index a088319..5d8ff88 100644 --- a/tridoracpu/tridoracpu.xpr +++ b/tridoracpu/tridoracpu.xpr @@ -356,14 +356,12 @@ - + - - - - - - + + Vivado Synthesis Defaults + + @@ -380,24 +378,18 @@ - + - + + Default settings for Implementation. 
+ - - - + - - - + - - - - - - + + diff --git a/utils/tdrimg.py b/utils/tdrimg.py index b7ce4cb..4eeaead 100644 --- a/utils/tdrimg.py +++ b/utils/tdrimg.py @@ -614,6 +614,7 @@ def create_image_with_stuff(imgfile): slotnr = putfile("../examples/benchmarks.pas", None , f, part, partstart, slotnr) slotnr = putfile("../examples/animate.pas", None , f, part, partstart, slotnr) + slotnr = putfile("../examples/graphbench.pas", None , f, part, partstart, slotnr) slotnr = putfile("../examples/sprites.inc", None , f, part, partstart, slotnr) slotnr = putfile("../examples/sprites.s", None , f, part, partstart, slotnr) slotnr = putfile("../examples/background.pict", None , f, part, partstart, slotnr) From 885e50c1c09838ca19f8560774cf225011f169f4 Mon Sep 17 00:00:00 2001 From: slederer Date: Sun, 1 Feb 2026 22:46:18 +0100 Subject: [PATCH 10/12] corelib: restore new PUTPIXEL implementation --- lib/corelib.s | 190 +++++++++----------------------------------------- 1 file changed, 34 insertions(+), 156 deletions(-) diff --git a/lib/corelib.s b/lib/corelib.s index a21b95c..c57a94e 100644 --- a/lib/corelib.s +++ b/lib/corelib.s @@ -706,108 +706,32 @@ CMPWORDS_XT2: .EQU FB_PS $90C .EQU FB_PD $910 .EQU FB_CTL $914 -; set a pixel in fb memory -; parameters: x,y - coordinates -PUTPIXEL_1BPP: - ; calculate vmem address: - OVER ; duplicate x - ; divide x by 32 - SHR - SHR - SHR - SHR - SHR - SWAP - ; multiply y by words per line - SHL 2 - SHL 2 - SHL + .EQU FB_SHIFTER $918 + .EQU FB_SHIFTCOUNT $91C + .EQU FB_SHIFTERM $920 + .EQU FB_SHIFTERSP $924 + .EQU FB_MASKGEN $928 - ADD ; add results together for vmem addr +; draw a single pixel +; args: x, y, color - DUP - LOADCP FB_WA - SWAP - STOREI ; store to framebuffer write addr register - DROP - LOADCP FB_RA ; and to framebuffer read addr register - SWAP - STOREI - DROP - - ; x is now at top of stack - ; get bit value from x modulo 32 - LOADC 31 - AND - SHL 2 ; (x & 31) * 4 = offset into table - LOADCP INT_TO_PIX_TABLE - ADD - LOADI - - LOADCP 
FB_IO - ; read old vmem value - LOADCP FB_IO - LOADI - ; or in new bit - OR - ; write new value - STOREI - DROP - - RET - -INT_TO_PIX_TABLE: - .WORD %10000000_00000000_00000000_00000000 - .WORD %01000000_00000000_00000000_00000000 - .WORD %00100000_00000000_00000000_00000000 - .WORD %00010000_00000000_00000000_00000000 - .WORD %00001000_00000000_00000000_00000000 - .WORD %00000100_00000000_00000000_00000000 - .WORD %00000010_00000000_00000000_00000000 - .WORD %00000001_00000000_00000000_00000000 - .WORD %00000000_10000000_00000000_00000000 - .WORD %00000000_01000000_00000000_00000000 - .WORD %00000000_00100000_00000000_00000000 - .WORD %00000000_00010000_00000000_00000000 - .WORD %00000000_00001000_00000000_00000000 - .WORD %00000000_00000100_00000000_00000000 - .WORD %00000000_00000010_00000000_00000000 - .WORD %00000000_00000001_00000000_00000000 - .WORD %00000000_00000000_10000000_00000000 - .WORD %00000000_00000000_01000000_00000000 - .WORD %00000000_00000000_00100000_00000000 - .WORD %00000000_00000000_00010000_00000000 - .WORD %00000000_00000000_00001000_00000000 - .WORD %00000000_00000000_00000100_00000000 - .WORD %00000000_00000000_00000010_00000000 - .WORD %00000000_00000000_00000001_00000000 - .WORD %00000000_00000000_00000000_10000000 - .WORD %00000000_00000000_00000000_01000000 - .WORD %00000000_00000000_00000000_00100000 - .WORD %00000000_00000000_00000000_00010000 - .WORD %00000000_00000000_00000000_00001000 - .WORD %00000000_00000000_00000000_00000100 - .WORD %00000000_00000000_00000000_00000010 - .WORD %00000000_00000000_00000000_00000001 - -PUTMPIXEL: - LOADC 1 -; set a pixel in fb memory -; parameters: x,y,color - coordinates, color value (0-15) PUTPIXEL: PUTPIXEL_4BPP: .EQU PUTPIXEL_X 0 .EQU PUTPIXEL_Y 4 .EQU PUTPIXEL_COLOR 8 - .EQU PUTPIXEL_PIXPOS 12 + .EQU PUTPIXEL_BPSAV 12 .EQU PUTPIXEL_FS 16 FPADJ -PUTPIXEL_FS - STORE PUTPIXEL_COLOR STORE PUTPIXEL_Y STORE PUTPIXEL_X + LOADREG BP + STORE PUTPIXEL_BPSAV + LOADC 0 + STOREREG BP ; calculate vmem 
address: (x / 8) + (y * 80) LOAD PUTPIXEL_X @@ -826,83 +750,37 @@ PUTPIXEL_4BPP: ADD ; add results together for vmem addr - LOADCP FB_WA - OVER - STOREI ; store to framebuffer write addr register - DROP - LOADCP FB_RA ; and to framebuffer read addr register - SWAP ; swap addr and value for STOREI - STOREI - DROP - - LOAD PUTPIXEL_X - ; |0000.0000|0000.0000|0000.0000|0000.1111| - LOADC 7 - AND ; calculate pixel position in word - LOADC 7 - SWAP - SUB ; pixpos = 7 - (x & 7) - STORE PUTPIXEL_PIXPOS + DUP + STORE.B FB_WA ; set as write and read addresses + STORE.B FB_RA + ; create pixel data from color value in + ; leftmost pixel data bits (31-28) + LOADC 0 LOAD PUTPIXEL_COLOR - LOAD PUTPIXEL_PIXPOS - SHR ; rcount = pixpos / 2 -ROTLOOP_: - DUP ; exit loop if rcount is 0 - CBRANCH.Z ROTLOOP_END - SWAP ; pixel value is now on top of stack - BROT ; value = value << 8 - SWAP ; rcount is now on top of stack - DEC 1 ; rcount = rcount - 1 - BRANCH ROTLOOP_ -ROTLOOP_END: - DROP ; drop rcount - ; shifted pixel value is now at top of stack - LOAD PUTPIXEL_PIXPOS - LOADC 1 - AND - CBRANCH.Z EVEN_PIXPOS - SHL 2 ; if pixpos is odd, shift by 4 bits + BPLC SHL 2 -EVEN_PIXPOS: - LOAD PUTPIXEL_X - ; get bit value from x modulo 8 - LOADC 7 - AND - SHL 2 ; (x & 7) * 4 = offset into table - LOADCP INT_TO_MASK_TABLE - ADD - LOADI + SHL 2 + STORE.B FB_SHIFTER ; store pixel into shifter - ; read old vmem value - LOADCP FB_IO - LOADI - ; mask bits - AND - ; or in shifted pixel value - OR + LOAD PUTPIXEL_X ; use x coord as shift count + STORE.B FB_SHIFTCOUNT ; writing triggers shifting - ; write new value - LOADCP FB_IO - SWAP - STOREI - DROP + LOAD.B FB_SHIFTERM ; get shift result as mask + NOT ; invert to get background mask + LOAD.B FB_IO ; get background pixel data + AND ; remove bits for new pixel from bg + + LOAD.B FB_SHIFTER ; load shifted pixel + OR ; OR in new pixel bits + STORE.B FB_IO ; write new pixel data word to vmem + + LOAD PUTPIXEL_BPSAV + STOREREG BP FPADJ PUTPIXEL_FS RET - 
.CPOOL - -INT_TO_MASK_TABLE: - .WORD %00001111_11111111_11111111_11111111 - .WORD %11110000_11111111_11111111_11111111 - .WORD %11111111_00001111_11111111_11111111 - .WORD %11111111_11110000_11111111_11111111 - .WORD %11111111_11111111_00001111_11111111 - .WORD %11111111_11111111_11110000_11111111 - .WORD %11111111_11111111_11111111_00001111 - .WORD %11111111_11111111_11111111_11110000 - ; draw a line between two points ; parameters: x0, y0, x1, y1, color .EQU DL_X0 0 From 4ad879ba68b4153d83df94f08c9d372c34679a27 Mon Sep 17 00:00:00 2001 From: slederer Date: Sun, 1 Feb 2026 23:27:25 +0100 Subject: [PATCH 11/12] Update documentation --- LICENSE.md | 2 +- doc/mem.md | 5 ++-- doc/tdraudio.md | 10 ++++---- doc/vga.md | 68 ++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 70 insertions(+), 15 deletions(-) diff --git a/LICENSE.md b/LICENSE.md index 3755dbb..6392510 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -4,7 +4,7 @@ All files, except where explicitly stated otherwise, are licensed according to t ------------------------------------------------------------------------------ -Copyright 2024 Sebastian Lederer +Copyright 2024-2026 Sebastian Lederer Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/doc/mem.md b/doc/mem.md index f7dbc2b..29177b2 100644 --- a/doc/mem.md +++ b/doc/mem.md @@ -22,11 +22,12 @@ The _BSEL_ and _BPLC_ instructions are designed to assist with accessing bytes w The byte ordering is big-endian. ## Accessing the I/O Area -The I/O area organizes memory slightly different. Here, pointing out individual bytes is not very useful, so the I/O controllers use register addresses with increments of one. In practice, there is only the VGA framebuffer controller which uses multiple registers. +The I/O area uses the same word addressing in increments of four to access the registers of the I/O controllers. 
In practice, only the VGA framebuffer controller and the audio controller use multiple registers. +For the other controllers, there is a single 32 bit register that is repeated all over the address space of the corresponding I/O slot. The individual I/O controllers each have a memory area of 128 bytes, so there is a maximum number of 16 I/O controllers. -Currently, only I/O slots 0-3 are being used. +Currently, only I/O slots 0-4 are being used. |I/O slot| Address | Controller | |--------|---------|------------| diff --git a/doc/tdraudio.md b/doc/tdraudio.md index 999ebfc..5d8b22f 100644 --- a/doc/tdraudio.md +++ b/doc/tdraudio.md @@ -10,12 +10,12 @@ For the first channel the register addresses are: |Address|Description| |-------|-----------| | $A00 | Control Register | -| $A01 | Clock Divider Register | -| $A02 | Amplitude Register | +| $A04 | Clock Divider Register | +| $A08 | Amplitude Register | -The register addresses for the second channel start at $A04, -the third channel at $A08 -and the fourth channel at $A0C. +The register addresses for the second channel start at $A10, +the third channel at $A20 +and the fourth channel at $A30. 
## Reading the control register diff --git a/doc/vga.md b/doc/vga.md index b53f56d..76520f2 100644 --- a/doc/vga.md +++ b/doc/vga.md @@ -4,13 +4,16 @@ Registers |Name|Address|Description| |----|-------|-----------| |_FB_RA_ | $900 | Read Address | -|_FB_WA_ | $901 | Write Address | -| _FB_IO_ | $902 | I/O Register | -| _FB_PS_ | $903 | Palette Select | -| _FB_PD_ | $904 | Palette Data | -| _FB_CTL_ | $905 | Control Register | - - +|_FB_WA_ | $904 | Write Address | +| _FB_IO_ | $908 | I/O Register | +| _FB_PS_ | $90C | Palette Select | +| _FB_PD_ | $910 | Palette Data | +| _FB_CTL_ | $914 | Control Register | +| _FB_SHIFTER_ | $918 | Shift Assist Register | +| _FB_SHIFTCOUNT_ | $91C | Shift Count Register | +| _FB_SHIFTERM_ | $920 | Shifter Mask Register | +| _FB_SHIFTERSP_ | $924 | Shifter Spill Register | +| _FB_MASKGEN_ | $928 | Mask Generator Register | ## Pixel Data Pixel data is organized in 32-bit-words. With four bits per pixel, one word @@ -81,3 +84,54 @@ The control register contains status information. It can only be read. The _m_ field indicates the current graphics mode. At the time of writing, it is always 1 which denotes a 640x400x4 mode. The _vb_ bit is 1 when the video signal generator is in its vertical blank phase. + +## Shift Assist Register +The *shift assist register* can be used to accelerate shifting pixel/bitmap data. +Writing a word of pixel data to this register initialises the shifting process. + +After writing to the shift count register (see below), reading the shift assist +register retrieves the shifted pixel data. + +Writing to the shift assist register will reset the shift count. + +## Shift Count Register +Writing a number from 0-7 to the *shift count register* triggers shifting the +contents of the shift assist register. Pixel data is shifted by four bits +to the right times the shift count. Bits 31-3 of the shift count are ignored, so you can +directly write a horizontal screen coordinate to the register.
+ +This register cannot be read. + +## Shifter Mask Register +The *shifter mask register* contains the shifted pixel data converted into +a mask. See the *mask generator register* for an +explanation of the mask. + +## Shifter Spill Register +The *shifter spill register* contains the pixel data that has +been shifted out to the right. For example, if the shift count is two, +the spill register contains the two rightmost pixels (bits 7-0) of +the original pixel data, placed into the two topmost pixels (bits 31-24). + +The rest of the register is set to zero. + +## Mask Generator Register +The *mask generator register* creates a mask from pixel data. +For each four bits of a pixel, the corresponding four mask bits +are all set to one if the pixel value is not zero. + +This can be used to combine foreground and background pixel data +with a pixel value of zero for a transparent background color. + +Usually, the mask will be inverted with a *NOT* instruction +to clear all pixels in the background that are set in the foreground +with an *AND* instruction +before *ORing* foreground and background together. 
+ +Example in hexadecimal, each digit is a pixel: +| Pixel Data | Mask | +|------------|------| +| $00000000 | $00000000 | +| $00000001 | $0000000F | +| $0407000F | $0F0F000F | +| $1234ABC0 | $FFFFFFF0 | From 4d103f99ec041a5e50ec5fbf64b8dcdac144d7ec Mon Sep 17 00:00:00 2001 From: slederer Date: Mon, 2 Feb 2026 00:33:50 +0100 Subject: [PATCH 12/12] corelib: PUTPIXEL can draw color 0 again --- lib/corelib.s | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/corelib.s b/lib/corelib.s index c57a94e..1ac12e9 100644 --- a/lib/corelib.s +++ b/lib/corelib.s @@ -754,6 +754,9 @@ PUTPIXEL_4BPP: STORE.B FB_WA ; set as write and read addresses STORE.B FB_RA + LOAD PUTPIXEL_COLOR + CBRANCH.Z PUTPX_CLR ; color 0 is special case + ; create pixel data from color value in ; leftmost pixel data bits (31-28) LOADC 0 @@ -775,12 +778,29 @@ PUTPIXEL_4BPP: OR ; OR in new pixel bits STORE.B FB_IO ; write new pixel data word to vmem +PUTPX_XT: LOAD PUTPIXEL_BPSAV STOREREG BP FPADJ PUTPIXEL_FS RET +PUTPX_CLR: + LOADCP $F0000000 ; mask for leftmost pixel + STORE.B FB_SHIFTER ; shift accordingly + LOAD PUTPIXEL_X + STORE.B FB_SHIFTCOUNT + + LOAD.B FB_SHIFTER ; get shifted value + NOT ; invert for real mask + LOAD.B FB_IO ; get background pixels + AND ; clear pixel with mask + STORE.B FB_IO ; no need to OR in new pixel, just store to vmem + + BRANCH PUTPX_XT + + + ; draw a line between two points ; parameters: x0, y0, x1, y1, color .EQU DL_X0 0