From 278f90a464992dc89b1301d2f5f4a5807c7260c5 Mon Sep 17 00:00:00 2001 From: slederer Date: Mon, 15 Sep 2025 23:02:22 +0200 Subject: [PATCH] tridoracpu: implement data cache --- examples/benchmarks.results.text | 57 ++++++++++++++++- tridoracpu/tridoracpu.srcs/dram_bridge.v | 80 +++++++++++++++++++----- tridoracpu/tridoracpu.xpr | 39 ++++++------ 3 files changed, 138 insertions(+), 38 deletions(-) diff --git a/examples/benchmarks.results.text b/examples/benchmarks.results.text index 5ae8a13..7b4023e 100644 --- a/examples/benchmarks.results.text +++ b/examples/benchmarks.results.text @@ -45,6 +45,7 @@ Running benchmarks.prog exp() 10K 00:00:29 cos() 10K 00:00:06 +-------------------------------------- Arty-A7-35T 76.92MHz, 64KB SRAM, 256MB DRAM running in DRAM (except corelib, stdlib, runtime) @@ -68,7 +69,7 @@ Running benchmarks.prog exp() 10K 00:00:32 cos() 10K 00:00:06 - +-------------------------------------- Arty-A7-35T 76.92MHz, 64KB SRAM, 256MB DRAM, 16B instruction cache running in DRAM (except corelib, stdlib, runtime) @@ -91,3 +92,57 @@ Running benchmarks.prog array copy 128k 1K 00:00:48 exp() 10K 00:00:32 cos() 10K 00:00:06 + +-------------------------------------- +Arty-A7-35T +76.92MHz, 64KB SRAM, 256MB DRAM, + 16B instruction cache, 16B wt data cache +running in DRAM (except corelib, stdlib, runtime) + +Running benchmarks.prog + empty loop 10M 00:00:07 + write variable 10M 00:00:17 + read variable 10M 00:00:20 + integer addition 10M 00:00:20 + real addition 1M 00:00:28 + integer multiplication 1M 00:01:11 + real multiplication 1M 00:00:59 + integer division 1M 00:01:36 + real division 1M 00:01:05 + string indexing 1M 00:00:39 + string iteration 1M 00:00:19 + new/dispose 1k 1M 00:00:19 + new/dispose 128k 1M 00:00:19 + array copy 1k 10K 00:00:03 + array copy 128k 1K 00:00:39 + exp() 10K 00:00:26 + cos() 10K 00:00:05 + + + +-------------------------------------- +Arty-A7-35T +76.92MHz, 64KB SRAM, 256MB DRAM, + 16B instruction cache, 16B wb data cache +running in DRAM (except corelib, stdlib, runtime) + +Running benchmarks.prog + empty loop 10M 00:00:04 + write variable 10M 00:00:11 + read variable 10M 00:00:18 + integer addition 10M 00:00:18 + real addition 1M 00:00:27 + integer multiplication 1M 00:00:49 + real multiplication 1M 00:00:58 + integer division 1M 00:01:06 + real division 1M 00:01:04 + string indexing 1M 00:00:36 + string iteration 1M 00:00:19 + new/dispose 1k 1M 00:00:18 + new/dispose 128k 1M 00:00:18 + array copy 1k 10K 00:00:03 + array copy 128k 1K 00:00:39 + exp() 10K 00:00:25 + cos() 10K 00:00:05 + + diff --git a/tridoracpu/tridoracpu.srcs/dram_bridge.v b/tridoracpu/tridoracpu.srcs/dram_bridge.v index c3f948a..d4f798b 100644 --- a/tridoracpu/tridoracpu.srcs/dram_bridge.v +++ b/tridoracpu/tridoracpu.srcs/dram_bridge.v @@ -107,8 +107,14 @@ module dram_bridge #(ADDR_WIDTH = 32, WIDTH = 32) ); (*KEEP*) reg [DRAM_DATA_WIDTH-1:0] ins_cache; - (*KEEP*) reg [DRAM_ADDR_WIDTH-1:4] cached_addr; - (*KEEP*) wire cache_hit = mem_read_enable && mem_read_ins && (cached_addr == mem_addr[DRAM_ADDR_WIDTH-1:4]); + (*KEEP*) reg [DRAM_ADDR_WIDTH-1:4] icached_addr; + (*KEEP*) wire icache_hit = mem_read_enable && mem_read_ins && (icached_addr == mem_addr[DRAM_ADDR_WIDTH-1:4]); + + (*KEEP*) reg [DRAM_DATA_WIDTH-1:0] d_cache; + (*KEEP*) reg [DRAM_ADDR_WIDTH-1:4] dcached_addr; + (*KEEP*) wire dcache_hit = mem_read_enable && !mem_read_ins && (dcached_addr == mem_addr[DRAM_ADDR_WIDTH-1:4]); + + wire cache_hit = icache_hit | dcache_hit; reg [WIDTH-1:0] read_buf; reg read_inprogress = 0; @@ -125,25 +131,32 @@ module dram_bridge #(ADDR_WIDTH = 32, WIDTH = 32) wire [1:0] word_sel = mem_addr[3:2]; wire [WIDTH-1:0] read_word = - word_sel == 3'b11 ? app_rd_data[31:0] : - word_sel == 3'b10 ? app_rd_data[63:32] : - word_sel == 3'b01 ? app_rd_data[95:64] : + word_sel == 2'b11 ? app_rd_data[31:0] : + word_sel == 2'b10 ? app_rd_data[63:32] : + word_sel == 2'b01 ? app_rd_data[95:64] : app_rd_data[127:96]; - wire [WIDTH-1:0] read_cached_word = - word_sel == 3'b11 ? ins_cache[31:0] : - word_sel == 3'b10 ? ins_cache[63:32] : - word_sel == 3'b01 ? ins_cache[95:64] : + wire [WIDTH-1:0] read_icached_word = + word_sel == 2'b11 ? ins_cache[31:0] : + word_sel == 2'b10 ? ins_cache[63:32] : + word_sel == 2'b01 ? ins_cache[95:64] : ins_cache[127:96]; - (*KEEP*) assign mem_read_data = cache_hit ? read_cached_word : + wire [WIDTH-1:0] read_dcached_word = + word_sel == 2'b11 ? d_cache[31:0] : + word_sel == 2'b10 ? d_cache[63:32] : + word_sel == 2'b01 ? d_cache[95:64] : + d_cache[127:96]; + + (*KEEP*) assign mem_read_data = icache_hit ? read_icached_word : + dcache_hit ? read_dcached_word : app_rd_data_valid ? read_word : read_buf; // set the write mask according to the lower bits of the address // (ignoring bit 0) - assign app_wdf_mask = word_sel == 3'b11 ? 16'b1111111111110000 : - word_sel == 3'b10 ? 16'b1111111100001111 : - word_sel == 3'b01 ? 16'b1111000011111111 : + assign app_wdf_mask = word_sel == 2'b11 ? 16'b1111111111110000 : + word_sel == 2'b10 ? 16'b1111111100001111 : + word_sel == 2'b01 ? 16'b1111000011111111 : 16'b0000111111111111 ; wire write_ready = mem_write_enable & app_wdf_rdy & app_rdy; @@ -152,7 +165,7 @@ module dram_bridge #(ADDR_WIDTH = 32, WIDTH = 32) assign app_wdf_data = { {4{mem_write_data}} }; assign mem_wait = (dram_read_enable & ~read_inprogress) | - (mem_write_enable & (~app_wdf_rdy | ~app_rdy)) | + (mem_write_enable & ~dcache_hit & (~app_wdf_rdy | ~app_rdy)) | (read_inprogress & ~app_rd_data_valid); assign app_en = (dram_read_enable & ~read_inprogress) | @@ -160,25 +173,58 @@ module dram_bridge #(ADDR_WIDTH = 32, WIDTH = 32) assign app_cmd = dram_read_enable ? CMD_READ : CMD_WRITE; + /* set instruction cache */ always @(posedge dram_front_clk) begin if(dram_read_enable && mem_read_ins && app_rd_data_valid) begin ins_cache <= app_rd_data; - cached_addr <= mem_addr[DRAM_ADDR_WIDTH-1:4]; + icached_addr <= mem_addr[DRAM_ADDR_WIDTH-1:4]; end end + /* set data cache */ + always @(posedge dram_front_clk) + begin + if(dram_read_enable && !mem_read_ins && app_rd_data_valid) + begin + d_cache <= app_rd_data; + dcached_addr <= mem_addr[DRAM_ADDR_WIDTH-1:4]; + end + + /* write-through cache - invalidate on write */ + /* invalidate data cache on write */ +// if(mem_write_enable && dcached_addr == mem_addr[DRAM_ADDR_WIDTH-1:4]) +// dcached_addr <= {DRAM_ADDR_WIDTH-4{1'b1}}; + + /* write-back cache - update cache on write */ + // write back to data cache on mem_write + if(mem_write_enable && dcached_addr == mem_addr[DRAM_ADDR_WIDTH-1:4]) + begin + case(word_sel) + 2'b11: d_cache[31:0] <= mem_write_data; + 2'b10: d_cache[63:32] <= mem_write_data; + 2'b01: d_cache[95:64] <= mem_write_data; + 2'b00: d_cache[127:96] <= mem_write_data; + endcase + end + end + + /* transfer read data, either from cache or from DRAM */ always @(posedge dram_front_clk) begin if(dram_read_enable & ~read_inprogress & app_rdy) read_inprogress <= 1; if(read_inprogress & app_rd_data_valid) read_inprogress <= 0; + if(dram_read_enable & app_rd_data_valid) read_buf <= mem_read_data; else - if (mem_read_enable & cache_hit) - read_buf <= read_cached_word; + if (mem_read_enable & icache_hit) + read_buf <= read_icached_word; + else + if (mem_read_enable & dcache_hit) + read_buf <= read_dcached_word; end endmodule diff --git a/tridoracpu/tridoracpu.xpr b/tridoracpu/tridoracpu.xpr index b0ba2bb..cdeb09a 100644 --- a/tridoracpu/tridoracpu.xpr +++ b/tridoracpu/tridoracpu.xpr @@ -349,15 +349,22 @@ - + - - Vivado Synthesis Defaults + + Higher performance designs, resource sharing is turned off, the global fanout guide is set to a lower number, FSM extraction forced to one-hot, LUT combining is disabled, equivalent registers are preserved, SRL are inferred with a larger threshold - + + + + + + + + - + @@ -371,26 +378,18 @@ - + - - Best predicted directive for place_design. + + Default settings for Implementation. - - - + - - - + - - - - - - + +