From f1dad61fd79023370374ff3c1ade526ce8490bfb Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Wed, 22 Apr 2026 23:35:33 -0400 Subject: [PATCH 1/5] Create assignment5_solution.md --- Solutions/assignment5_solution.md | 284 ++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 Solutions/assignment5_solution.md diff --git a/Solutions/assignment5_solution.md b/Solutions/assignment5_solution.md new file mode 100644 index 0000000..dddc249 --- /dev/null +++ b/Solutions/assignment5_solution.md @@ -0,0 +1,284 @@ +# Assignment 5: Dot Product Acceleration (SimX) — Solution + +This document walks through the complete solution for extending the Vortex GPU microarchitecture with a custom `VX_DOT8` RISC-V instruction and implementing it in the SimX cycle-level simulator. + +--- + +## Overview + +`VX_DOT8` computes the integer dot product of two packed vectors of four `int8` elements: + +``` +rd = A1*B1 + A2*B2 + A3*B3 + A4*B4 +``` + +The instruction uses the **R-Type** RISC-V format with: +- `opcode = 0x0B` (RISC-V custom-0) +- `funct7 = 3` +- `funct3 = 0` + +--- + +## Step 1: ISA Extension — `vx_intrinsics.h` + +Add the `vx_dot8` inline intrinsic using GAS `.insn r` pseudo-instruction syntax. 
+ +```cpp +// DOT8: computes dot product of two packed int8x4 vectors +inline int vx_dot8(int a, int b) { + int ret; + asm volatile (".insn r %1, 0, 3, %0, %2, %3" + : "=r"(ret) + : "i"(RISCV_CUSTOM0), "r"(a), "r"(b)); + return ret; +} +``` + +**Breakdown of `.insn r` arguments:** +| Field | Value | Notes | +|----------|-----------------|--------------------------------| +| opcode | `RISCV_CUSTOM0` | Expands to `0x0B` | +| funct3 | `0` | Selects DOT8 within custom ops | +| funct7 | `3` | Selects DOT8 within custom ops | +| rd | `%0` | Output register | +| rs1 | `%2` | Packed A input | +| rs2 | `%3` | Packed B input | + +> Reference: [RISC-V `.insn` format — GNU Binutils docs](https://sourceware.org/binutils/docs/as/RISC_002dV_002dFormats.html) + +--- + +## Step 2: Matrix Multiplication Kernel + +### Directory Setup + +Clone `tests/regression/sgemm` into `tests/regression/dot8` and update the Makefile: + +```makefile +# tests/regression/dot8/Makefile +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := dot8 + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) +SRCS := $(SRC_DIR)/main.cpp +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n32 + +include ../common.mk +``` + +### `common.h` + +Define the source and destination types used across the kernel and host: + +```cpp +#ifndef _COMMON_H_ +#define _COMMON_H_ + +typedef int8_t SrcType; +typedef int32_t DstType; + +typedef struct { + uint32_t grid_dim[2]; + uint32_t size; + uint64_t A_addr; + uint64_t B_addr; + uint64_t C_addr; +} kernel_arg_t; + +#endif +``` + +### `kernel.cpp` + +Each GPU thread computes one output cell `C[row][col]`. Rows from A and columns from B are packed into `uint32_t` registers before calling `vx_dot8`. 
+
+```cpp
+void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
+  auto A = reinterpret_cast<SrcType*>(arg->A_addr);
+  auto B = reinterpret_cast<SrcType*>(arg->B_addr);
+  auto C = reinterpret_cast<DstType*>(arg->C_addr);
+  auto size = arg->size;
+
+  int row = blockIdx.x;
+  int col = blockIdx.y;
+
+  DstType sum(0);
+  for (int k = 0; k < size; k += 4) {
+    // Pack 4 consecutive elements from row of A (row-major)
+    uint32_t packedA = *((uint32_t*)(A + (row * size + k)));
+
+    // Pack 4 elements from column of B (row-major, non-contiguous)
+    uint32_t packedB = ((uint8_t)B[(k + 0) * size + col] << 0)
+                     | ((uint8_t)B[(k + 1) * size + col] << 8)
+                     | ((uint8_t)B[(k + 2) * size + col] << 16)
+                     | ((uint8_t)B[(k + 3) * size + col] << 24);
+
+    sum += vx_dot8(packedA, packedB);
+  }
+
+  C[row * size + col] = sum;
+}
+
+int main() {
+  kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
+  return vx_spawn_threads(2, arg->grid_dim, nullptr,
+                          (vx_kernel_func_cb)kernel_body, arg);
+}
+```
+
+### `main.cpp` — Key Changes from `sgemm`
+
+**Types:** Use `SrcType` (`int8_t`) for input matrices and `DstType` (`int32_t`) for the output.
+
+**Buffer sizes** must reflect the correct element size in bytes:
+```cpp
+uint32_t src_buf_size = size * size * sizeof(SrcType); // int8_t buffers
+uint32_t dst_buf_size = size * size * sizeof(DstType); // int32_t buffer
+```
+
+**CPU reference implementation** for verification:
+```cpp
+static void matmul_cpu(DstType* out, const SrcType* A, const SrcType* B,
+                       uint32_t width, uint32_t height) {
+  for (uint32_t row = 0; row < height; ++row) {
+    for (uint32_t col = 0; col < width; ++col) {
+      DstType sum(0);
+      for (uint32_t e = 0; e < width; ++e) {
+        sum += (DstType)(A[row * width + e]) * (DstType)(B[e * width + col]);
+      }
+      out[row * width + col] = sum;
+    }
+  }
+}
+```
+
+---
+
+## Step 3: SimX Simulator Implementation
+
+### `types.h` — Add `DOT8` to `AluType`
+
+```cpp
+enum class AluType {
+  // ... existing types ...
+  CZERO,
+  DOT8   // <-- add this
+};
+
+// Also update the ostream operator:
+inline std::ostream& operator<<(std::ostream& os, const AluType& type) {
+  switch (type) {
+  // ... existing cases ...
+  case AluType::DOT8: os << "DOT8"; break;
+  default: assert(false);
+  }
+  return os;
+}
+```
+
+### `decode.cpp` — Decode the New Instruction
+
+Add the `DOT8` string in `op_string()`:
+
+```cpp
+case AluType::DOT8: return {"DOT8", ""};
+```
+
+Add a new `case 3` block inside the custom-instruction switch in `Emulator::decode()`:
+
+```cpp
+case 3: {
+  switch (funct3) {
+  case 0: { // DOT8
+    auto instr = std::allocate_shared<Instr>(instr_pool_, uuid, FUType::ALU);
+    instr->setOpType(AluType::DOT8);
+    instr->setArgs(IntrAluArgs{0, 0, 0});
+    instr->setDestReg(rd, RegType::Integer);
+    instr->setSrcReg(0, rs1, RegType::Integer);
+    instr->setSrcReg(1, rs2, RegType::Integer);
+    ibuffer.push_back(instr);
+  } break;
+  default:
+    std::abort();
+  }
+} break;
+```
+
+### `execute.cpp` — Emulate the Dot Product
+
+Add a `DOT8` case inside the ALU execution switch.
Each byte is sign-extended to `int8_t` before multiplication, and products are accumulated as `int32_t`: + +```cpp +case AluType::DOT8: { + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + + uint32_t packedA = rs1_data[t].u; + uint32_t packedB = rs2_data[t].u; + + // Extract and sign-extend each byte + int8_t a0 = (int8_t)(packedA ), b0 = (int8_t)(packedB ); + int8_t a1 = (int8_t)(packedA >> 8), b1 = (int8_t)(packedB >> 8); + int8_t a2 = (int8_t)(packedA >> 16), b2 = (int8_t)(packedB >> 16); + int8_t a3 = (int8_t)(packedA >> 24), b3 = (int8_t)(packedB >> 24); + + int32_t sum = (int32_t)(a0 * b0) + + (int32_t)(a1 * b1) + + (int32_t)(a2 * b2) + + (int32_t)(a3 * b3); + + DP(3, "*** DOT8[" << t << "]: a=0x" << std::hex << packedA + << ", b=0x" << packedB << ", c=0x" << sum << std::dec); + + rd_data[t].i = sum; + } +} break; +``` + +### `func_unit.cpp` — Set 2-Cycle Latency + +Add `DOT8` to the 2-cycle latency group in `AluUnit::tick()`: + +```cpp +case AluType::AND: +case AluType::OR: +case AluType::CZERO: +case AluType::DOT8: // <-- add here + delay = 2; + break; +``` + +--- + +## Step 4: Testing + +### Build and Run + +```bash +# Build the dot8 regression test +make -C tests/regression/dot8 + +# Build the simulator +make -s + +# Run with SimX (4 cores, 4 warps, 4 threads) +./ci/blackbox.sh --driver=simx --cores=4 --warps=4 --threads=4 --app=dot8 +``` + +### Performance Sweep + +Run the following configurations with `N=256` on a 4-core GPU and record **total instruction count** and **execution cycles**: + +| Warps | Threads | +|-------|---------| +| 4 | 4 | +| 4 | 8 | +| 8 | 4 | +| 8 | 8 | + +Compare against the scalar `int8_t` baseline kernel. The `VX_DOT8` version should show a meaningful reduction in instruction count by replacing 4 multiplies + 3 adds with a single fused instruction. 
From 31e6396d593b4e94455e543cebe1849e733ddcae Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:04:42 -0400 Subject: [PATCH 2/5] Create assignment6_solution.md --- Solutions/assignment6_solution.md | 284 ++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 Solutions/assignment6_solution.md diff --git a/Solutions/assignment6_solution.md b/Solutions/assignment6_solution.md new file mode 100644 index 0000000..c9bb664 --- /dev/null +++ b/Solutions/assignment6_solution.md @@ -0,0 +1,284 @@ +# Assignment 6: Dot Product Acceleration (RTL) — Solution + +This document walks through the complete solution for implementing `VX_DOT8` in the Vortex RTL hardware. Steps 1 and 2 (ISA extension and kernel) are identical to Assignment 5 — refer to that solution for details. This document focuses on the RTL-specific changes in Step 3. + +--- + +## Overview + +Rather than emulating `VX_DOT8` in the SimX software simulator, this assignment implements it as a real hardware module (`VX_alu_dot8`) wired into the ALU pipeline as a third sub-unit alongside the integer ALU and MulDiv unit. + +--- + +## Step 1 & 2: ISA Extension and Kernel + +Unchanged from Assignment 5. See that solution for: +- `vx_dot8` intrinsic in `vx_intrinsics.h` +- `kernel.cpp` packing and accumulation logic +- `main.cpp` buffer sizing and CPU reference implementation + +--- + +## Step 3: Hardware RTL Implementation + +### `VX_config.vh` — Define `LATENCY_DOT8` + +Add the DOT8 latency macro alongside the other functional unit latencies: + +```verilog +// DOT8 Latency +`ifndef LATENCY_DOT8 +`define LATENCY_DOT8 2 +`endif +``` + +### `VX_gpu_pkg.sv` — Add `INST_ALU_DOT8` Opcode + +Replace the unused ALU opcode slot `4'b0001` with the DOT8 type: + +```verilog +localparam INST_ALU_ADD = 4'b0000; +localparam INST_ALU_DOT8 = 4'b0001; // was INST_ALU_UNUSED +localparam INST_ALU_LUI = 4'b0010; +// ... 
rest unchanged +``` + +### `VX_trace_pkg.sv` — Add Trace String + +Add a `DOT8` case to the ALU trace decoder so pipeline traces print the instruction name correctly: + +```verilog +INST_ALU_AND: `TRACE(level, ("AND")) +INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ")) +INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ")) +INST_ALU_DOT8: `TRACE(level, ("DOT8")) // <-- add this +default: `TRACE(level, ("?")) +``` + +### `VX_decode.sv` — Decode the Instruction + +Add a `7'd3` case inside the custom-opcode block to decode `VX_DOT8`. The instruction targets the ALU functional unit and uses `ALU_TYPE_OTHER` to route it to the new dot8 sub-unit (rather than the integer ALU or MulDiv): + +```verilog +7'd3: begin + case (funct3) + 3'h0: begin // DOT8 + ex_type = EX_ALU; + op_type = INST_ALU_DOT8; + op_args.alu = '0; + op_args.alu.xtype = ALU_TYPE_OTHER; + `USED_IREG (rd); + `USED_IREG (rs1); + `USED_IREG (rs2); + end + default:; + endcase +end +``` + +**Key decisions:** +- `EX_ALU` routes this instruction through the ALU unit's dispatch logic +- `INST_ALU_DOT8` is the op_type tag used for tracing +- `ALU_TYPE_OTHER` is the xtype field checked in `VX_alu_unit.sv` to select the dot8 PE +- `USED_IREG` macros register rd, rs1, and rs2 as active, enabling writeback and scoreboard tracking + +### `VX_alu_dot8.sv` — New Hardware Module + +Create `hw/rtl/core/VX_alu_dot8.sv`. The module uses `VX_pe_serializer` to time-multiplex lanes across PEs, then instantiates one PE per `NUM_PES` that computes the dot product combinatorially before latching through `BUFFER_EX`. + +The key computation — sign-extending each byte and computing the four partial products — is: + +```verilog +assign c = (signed'(a[31:24]) * signed'(b[31:24])) + + (signed'(a[23:16]) * signed'(b[23:16])) + + (signed'(a[15:8]) * signed'(b[15:8])) + + (signed'(a[7:0]) * signed'(b[7:0])); +``` + +`signed'(...)` performs an in-place cast to signed before multiplication, so each 8-bit slice is treated as a two's-complement `int8`. 
The four products are summed into a 32-bit result. + +The result is pipelined through the `BUFFER_EX` macro at the configured `LATENCY_DOT8` (2 cycles): + +```verilog +`BUFFER_EX(result, c, pe_enable, 1, LATENCY_DOT8); +assign pe_data_out[i] = `XLEN'(result); +``` + +Full module: + +```verilog +`include "VX_define.vh" + +module VX_alu_dot8 import VX_gpu_pkg::*; #( + parameter `STRING INSTANCE_ID = "", + parameter NUM_LANES = 1 +) ( + input wire clk, + input wire reset, + VX_execute_if.slave execute_if, + VX_result_if.master result_if +); + `UNUSED_SPARAM (INSTANCE_ID) + localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); + localparam PID_WIDTH = `UP(PID_BITS); + localparam TAG_WIDTH = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + 1 + + NUM_REGS_BITS + PID_WIDTH + 1 + 1; + localparam LATENCY_DOT8 = `LATENCY_DOT8; + localparam PE_RATIO = 1; + localparam NUM_PES = `UP(NUM_LANES / PE_RATIO); + + `UNUSED_VAR (execute_if.data.op_type) + `UNUSED_VAR (execute_if.data.op_args) + `UNUSED_VAR (execute_if.data.rs3_data) + + wire pe_enable; + wire [NUM_LANES-1:0][2*`XLEN-1:0] data_in; + wire [NUM_PES-1:0][2*`XLEN-1:0] pe_data_in; + wire [NUM_PES-1:0][`XLEN-1:0] pe_data_out; + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in + assign data_in[i][0 +: `XLEN] = execute_if.data.rs1_data[i]; + assign data_in[i][`XLEN +: `XLEN] = execute_if.data.rs2_data[i]; + end + + VX_pe_serializer #( + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), + .LATENCY (LATENCY_DOT8), + .DATA_IN_WIDTH (2 * `XLEN), + .DATA_OUT_WIDTH (`XLEN), + .TAG_WIDTH (TAG_WIDTH), + .PE_REG (1) + ) pe_serializer ( + .clk (clk), + .reset (reset), + .valid_in (execute_if.valid), + .data_in (data_in), + .tag_in ({ + execute_if.data.uuid, + execute_if.data.wid, + execute_if.data.tmask, + execute_if.data.PC, + execute_if.data.wb, + execute_if.data.rd, + execute_if.data.pid, + execute_if.data.sop, + execute_if.data.eop + }), + .ready_in (execute_if.ready), + .pe_enable (pe_enable), + .pe_data_in (pe_data_out), + 
.pe_data_out(pe_data_in), + .valid_out (result_if.valid), + .data_out (result_if.data.data), + .tag_out ({ + result_if.data.uuid, + result_if.data.wid, + result_if.data.tmask, + result_if.data.PC, + result_if.data.wb, + result_if.data.rd, + result_if.data.pid, + result_if.data.sop, + result_if.data.eop + }), + .ready_out (result_if.ready) + ); + + for (genvar i = 0; i < NUM_PES; ++i) begin : g_PEs + /* verilator lint_off UNUSEDSIGNAL */ + wire [`XLEN-1:0] a = pe_data_in[i][0 +: `XLEN]; + wire [`XLEN-1:0] b = pe_data_in[i][`XLEN +: `XLEN]; + /* verilator lint_on UNUSEDSIGNAL */ + wire [31:0] c, result; + + assign c = (signed'(a[31:24]) * signed'(b[31:24])) + + (signed'(a[23:16]) * signed'(b[23:16])) + + (signed'(a[15:8]) * signed'(b[15:8])) + + (signed'(a[7:0]) * signed'(b[7:0])); + + `BUFFER_EX(result, c, pe_enable, 1, LATENCY_DOT8); + assign pe_data_out[i] = `XLEN'(result); + + `ifdef DBG_TRACE_PIPELINE + always @(posedge clk) begin + if (pe_enable) begin + `TRACE(2, ("%t: %s dot8[%0d]: a=0x%0h, b=0x%0h, c=0x%0h\n", + $time, INSTANCE_ID, i, a, b, c)) + end + end + `endif + end + +endmodule +``` + +### `VX_alu_unit.sv` — Wire in the New Sub-Unit + +Three changes are needed: updating the PE count, adding the routing condition, and instantiating the module. + +**1. Update PE count and index constants:** + +```verilog +localparam PE_COUNT = 1 + `EXT_M_ENABLED + 1; // +1 for DOT8 +localparam PE_SEL_BITS = `CLOG2(PE_COUNT); +localparam PE_IDX_INT = 0; +localparam PE_IDX_MDV = PE_IDX_INT + `EXT_M_ENABLED; +localparam PE_IDX_DOT8 = PE_IDX_MDV + 1; +``` + +**2. Add routing condition in the PE select logic:** + +```verilog +pe_select = PE_IDX_INT; // default: integer ALU +if (`EXT_M_ENABLED && (... xtype == ALU_TYPE_MULDIV)) + pe_select = PE_IDX_MDV; +else if (... xtype == ALU_TYPE_OTHER) + pe_select = PE_IDX_DOT8; +``` + +`ALU_TYPE_OTHER` is the xtype value set during decode, distinguishing DOT8 from both the integer ALU and MulDiv. + +**3. 
Instantiate `VX_alu_dot8` after `VX_alu_muldiv`:** + +```verilog +VX_alu_dot8 #( + .INSTANCE_ID (`SFORMATF(("%s-dot8%0d", INSTANCE_ID, block_idx))), + .NUM_LANES (NUM_LANES) +) dot8_unit ( + .clk (clk), + .reset (reset), + .execute_if (pe_execute_if[PE_IDX_DOT8]), + .result_if (pe_result_if[PE_IDX_DOT8]) +); +``` + +--- + +## Step 4: Testing + +### Build and Run + +```bash +# Build the dot8 regression test +make -C tests/regression/dot8 + +# Build the RTL simulator +make -s + +# Run with RTL simulation (4 cores, 4 warps, 4 threads) +./ci/blackbox.sh --driver=rtlsim --cores=4 --warps=4 --threads=4 --app=dot8 +``` + +### Performance Sweep + +Run the following configurations with `N=256` on a 4-core GPU and record **total instruction count** and **execution cycles**: + +| Warps | Threads | +|-------|---------| +| 4 | 4 | +| 4 | 8 | +| 8 | 4 | +| 8 | 8 | + +Compare against the scalar `int8_t` baseline. The RTL implementation should show the same instruction count reduction as the SimX version (fusing 4 multiplies + 3 adds into one instruction), with the added benefit of real pipeline timing at the configured 2-cycle latency. From 34c36a2c629001ce735c14f33d14b3356dc0e5e2 Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:07:17 -0400 Subject: [PATCH 3/5] Create assignment8_solution.md --- Solutions/assignment8_solution.md | 149 ++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 Solutions/assignment8_solution.md diff --git a/Solutions/assignment8_solution.md b/Solutions/assignment8_solution.md new file mode 100644 index 0000000..9cc82dc --- /dev/null +++ b/Solutions/assignment8_solution.md @@ -0,0 +1,149 @@ +# Assignment 8: Tensor Core Extension (SimX) — Solution + +This document walks through the complete solution for extending the Vortex Tensor Core to support the **TF32** input format in the SimX cycle-level simulator. 
+
+---
+
+## Overview
+
+TF32 (TensorFloat-32) is a 19-bit floating-point format padded to 32 bits:
+
+| Field    | Width |
+|----------|-------|
+| Sign     | 1 bit |
+| Exponent | 8 bits |
+| Mantissa | 10 bits |
+| Padding  | 13 bits (to fill 32-bit register) |
+
+It shares the same exponent range as IEEE fp32 but has the same mantissa precision as fp16, making it a natural drop-in for MMA operations that want fp32 dynamic range without full fp32 compute cost.
+
+The changes required are minimal and localized to two files: `sim/common/tensor_cfg.h` and `sim/simx/tensor_unit.cpp`.
+
+---
+
+## Step 1: Add TF32 Format Definition — `tensor_cfg.h`
+
+Add the `tf32` struct alongside the existing format definitions (`fp16`, `bf16`, etc.):
+
+```cpp
+struct tf32 {
+  using dtype = uint32_t;              // stored in a 32-bit register
+  static constexpr uint32_t id = 3;    // unique format ID used in select_FEDP
+  static constexpr uint32_t bits = 32;
+  static constexpr const char* name = "tf32";
+};
+```
+
+Then register it in `fmt_string()` so the simulator can print the format name in logs and error messages:
+
+```cpp
+inline const char* fmt_string(uint32_t fmt) {
+  switch (fmt) {
+  case fp16::id:  return fp16::name;
+  case bf16::id:  return bf16::name;
+  case fp32::id:  return fp32::name;
+  case int8::id:  return int8::name;
+  case uint8::id: return uint8::name;
+  case int4::id:  return int4::name;
+  case uint4::id: return uint4::name;
+  case tf32::id:  return tf32::name;  // <-- add this
+  default: return "";
+  }
+}
+```
+
+---
+
+## Step 2: FMA Specialization — `tensor_unit.cpp`
+
+Add a template specialization of `FMA` for `<vt::tf32, vt::fp32>`. This handles the case where both input matrices hold TF32 values and the accumulator is fp32.
+
+The two-step conversion process is:
+1. Use `rv_xtof_s(raw_bits, exp_bits=8, mant_bits=10, rm, &fflags)` to convert a raw TF32 bit-pattern to an IEEE fp32 bit-pattern. This correctly handles the exponent bias and mantissa alignment.
+2. 
Use `bit_cast<float>()` to reinterpret those bits as a C++ `float` without any numeric conversion.
+
+```cpp
+template <>
+struct FMA<vt::tf32, vt::fp32> {
+  static float eval(uint32_t a, uint32_t b, float c) {
+    uint32_t fflags = 0;
+    uint32_t rm = 0; // RNE: Round to Nearest, ties to Even
+
+    // Convert TF32 bit-patterns to IEEE fp32 bit-patterns
+    uint32_t a_bits = rv_xtof_s(a, 8, 10, rm, &fflags);
+    uint32_t b_bits = rv_xtof_s(b, 8, 10, rm, &fflags);
+
+    // Reinterpret bits as floats (no numeric conversion)
+    float a_f = bit_cast<float>(a_bits);
+    float b_f = bit_cast<float>(b_bits);
+
+    // Multiply-accumulate into the fp32 accumulator
+    return (a_f * b_f) + c;
+  }
+};
+```
+
+> **Note:** The Vortex Tensor Unit uses an unfused multiply-add here (`(a * b) + c`) rather than `std::fma`, consistent with how the hardware MMA pipeline accumulates partial products in separate stages.
+
+---
+
+## Step 3: Register TF32 in `select_FEDP` — `tensor_unit.cpp`
+
+`select_FEDP` maps `(input_type_id, output_type_id)` pairs to their FEDP evaluation function pointer. Add the TF32→FP32 case inside the `fp32` output branch:
+
+```cpp
+static PFN_FEDP select_FEDP(uint32_t IT, uint32_t OT) {
+  switch (OT) {
+  case vt::fp32::id:
+    switch (IT) {
+    case vt::fp16::id:
+      return FEDP<vt::fp16, vt::fp32>::eval;
+    case vt::bf16::id:
+      return FEDP<vt::bf16, vt::fp32>::eval;
+    case vt::tf32::id: // <-- add this case
+      return FEDP<vt::tf32, vt::fp32>::eval;
+    default:
+      std::cout << "Error: unsupported mma format: "
+                << IT << " -> " << OT << "!" << std::endl;
+      std::abort();
+    }
+  // ... other output type cases unchanged
+  }
+}
+```
+
+`FEDP` (Fused Element-wise Dot Product) wraps the `FMA` specialization in a loop over the vector elements, so no changes to `FEDP` itself are needed — the template instantiation is generated automatically from the specialization added in Step 2.
+ +--- + +## Step 4: Testing + +Clean any previous build and rebuild with the TF32 configuration before running, since input/output types are compiled into the kernel binary: + +```bash +# Clean prior build +make -C tests/regression/sgemm_tcu clean + +# Rebuild for 8 threads, TF32 input, FP32 output +CONFIGS="-DNUM_THREADS=8 -DITYPE=tf32 -DOTYPE=fp32" \ + make -C tests/regression/sgemm_tcu + +# Run on SimX with Tensor Core extension enabled +CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE" \ + ./ci/blackbox.sh --driver=simx --app=sgemm_tcu +``` + +--- + +## Step 5: Benchmark TF32 vs. FP16/BF16 + +Using `N=256` on a 4-core GPU, sweep the following thread configurations for `fp16`, `bf16`, and `tf32` (all accumulating to FP32): + +| Warps | Threads | +|-------|---------| +| 4 | 4 | +| 4 | 8 | +| 8 | 4 | +| 8 | 8 | + +Record **total instruction count** and **execution cycles** for each format and configuration. Because all three formats execute the same MMA instruction count (the loop trip count is unchanged), differences in cycles will reflect the per-operation latency of the FMA specialization and any pipeline effects from the wider 32-bit TF32 operands versus the 16-bit fp16/bf16 operands. 
From 89a05b8d3bce58948082c71ff66abc332645692a Mon Sep 17 00:00:00 2001
From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com>
Date: Thu, 23 Apr 2026 09:10:33 -0400
Subject: [PATCH 4/5] Create assigmnent9_solution.md

---
 Solutions/assigmnent9_solution.md | 174 ++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 Solutions/assigmnent9_solution.md

diff --git a/Solutions/assigmnent9_solution.md b/Solutions/assigmnent9_solution.md
new file mode 100644
index 0000000..b44e276
--- /dev/null
+++ b/Solutions/assigmnent9_solution.md
@@ -0,0 +1,174 @@
+# Assignment 9: Tensor Core (TF32) Support in RTL — Solution
+
+This document walks through the complete solution for extending the Vortex Tensor Core RTL to support **TF32** inputs via the **BHF (Berkeley HardFloat) backend**.
+
+---
+
+## Overview
+
+Assignment 8 added TF32 to the SimX software emulator. This assignment wires TF32 into the actual RTL pipeline, specifically the BHF FEDP (Fused Element-wise Dot Product) module that drives the hardware Tensor Core.
+
+The BHF backend uses `VX_tcu_bhf_fmul` — a parameterized floating-point multiplier — to compute products before accumulation. Adding TF32 means instantiating a new multiplier configured for TF32's 8-exponent / 10-mantissa bit format, unpacking TF32 operands correctly, and routing the result through the existing mux.
+
+Changes touch three files: `tensor_cfg.h`, `VX_tcu_pkg.sv`, and `VX_tcu_fedp_bhf.sv`. The `tensor_cfg.h` changes are identical to Assignment 8 and are not repeated here.
+
+---
+
+## Step 1: Format Definition — `tensor_cfg.h`
+
+Identical to Assignment 8. Add the `tf32` struct and register it in `fmt_string()`. See that solution for details.
+
+---
+
+## Step 2: Update Tensor Core Package — `VX_tcu_pkg.sv`
+
+Add the TF32 format ID constant alongside the other format IDs:
+
+```sv
+localparam TCU_FP16_ID = 1;
+localparam TCU_BF16_ID = 2;
+localparam TCU_TF32_ID = 3; // <-- add this
+localparam TCU_FP32_ID = 4;
+// ... 
integer format IDs unchanged +``` + +Add TF32 to the `trace_fmt` task so pipeline traces print `"tf32"` instead of `"?"`: + +```sv +task trace_fmt(input int level, input logic [3:0] fmt); + case (fmt) + TCU_FP16_ID: `TRACE(level, ("fp16")) + TCU_BF16_ID: `TRACE(level, ("bf16")) + TCU_TF32_ID: `TRACE(level, ("tf32")) // <-- add this + TCU_FP32_ID: `TRACE(level, ("fp32")) + TCU_U8_ID: `TRACE(level, ("u8")) + TCU_I4_ID: `TRACE(level, ("i4")) + TCU_U4_ID: `TRACE(level, ("u4")) + default: `TRACE(level, ("?")) + endcase +endtask +``` + +--- + +## Step 3: Extend BHF FEDP — `VX_tcu_fedp_bhf.sv` + +This is the main change. Three sub-steps are needed: unpacking TF32 operands, instantiating the TF32 multiplier, and routing its output through the result mux. + +### 3a. Unpack TF32 Operands + +The existing unpack block splits each 32-bit element into two 16-bit halves for fp16/bf16, giving `TCK = 2N` packed 16-bit lanes. TF32 elements are 19 bits wide and one element fills an entire 32-bit register, so there is no second element to unpack. 
The second slot is filled with zero to preserve the same `TCK`-wide pipeline width used by the accumulator: + +```sv +wire [TCK-1:0][15:0] a_row16; +wire [TCK-1:0][15:0] b_col16; +wire [TCK-1:0][18:0] a_row19; // <-- add +wire [TCK-1:0][18:0] b_col19; // <-- add + +for (genvar i = 0; i < N; i++) begin : g_unpack + // Existing fp16/bf16 unpacking (unchanged) + assign a_row16[2*i] = a_row[i][15:0]; + assign a_row16[2*i+1] = a_row[i][31:16]; + assign b_col16[2*i] = b_col[i][15:0]; + assign b_col16[2*i+1] = b_col[i][31:16]; + + // TF32: one 19-bit element per register; interleave zeros to match TCK width + assign a_row19[2*i] = a_row[i][18:0]; + assign a_row19[2*i+1] = 19'd0; + assign b_col19[2*i] = b_col[i][18:0]; + assign b_col19[2*i+1] = 19'd0; +end +``` + +The zero-interleaving means the accumulator still sees `TCK` inputs, but every other product is zero — effectively halving throughput for TF32, which is correct since TF32 elements are twice as wide as fp16/bf16. + +### 3b. Instantiate the TF32 Multiplier + +Inside the per-lane generate loop `g_prod`, add a `VX_tcu_bhf_fmul` instance configured for TF32's format: 8 exponent bits and 11 significand bits (10 explicit mantissa bits + 1 implicit leading bit). 
Input is in IEEE format (`IN_REC=0`); output is in recoded format (`OUT_REC=1`) to feed directly into the BHF accumulator: + +```sv +for (genvar i = 0; i < TCK; i++) begin : g_prod + wire [32:0] mult_result_fp16; + wire [32:0] mult_result_bf16; + wire [32:0] mult_result_tf32; // <-- add + + // FP16 multiplier (unchanged) + VX_tcu_bhf_fmul #( + .IN_EXPW (5), .IN_SIGW (11), + .OUT_EXPW(8), .OUT_SIGW(24), + .IN_REC(0), .OUT_REC(1), + .MUL_LATENCY(FMUL_LATENCY), .RND_LATENCY(FRND_LATENCY) + ) fp16_mul ( .clk(clk), .reset(reset), .enable(enable), .frm(frm), + .a(a_row16[i]), .b(b_col16[i]), .y(mult_result_fp16), + `UNUSED_PIN(fflags) ); + + // BF16 multiplier (unchanged) + VX_tcu_bhf_fmul #( + .IN_EXPW (8), .IN_SIGW (8), + .OUT_EXPW(8), .OUT_SIGW(24), + .IN_REC(0), .OUT_REC(1), + .MUL_LATENCY(FMUL_LATENCY), .RND_LATENCY(FRND_LATENCY) + ) bf16_mul ( .clk(clk), .reset(reset), .enable(enable), .frm(frm), + .a(a_row16[i]), .b(b_col16[i]), .y(mult_result_bf16), + `UNUSED_PIN(fflags) ); + + // TF32 multiplier: 8 exponent bits, 11 significand bits (10 mantissa + implicit 1) + VX_tcu_bhf_fmul #( + .IN_EXPW (8), .IN_SIGW (11), + .OUT_EXPW(8), .OUT_SIGW(24), + .IN_REC (0), + .OUT_REC(1), + .MUL_LATENCY(FMUL_LATENCY), + .RND_LATENCY(FRND_LATENCY) + ) tf32_mul ( + .clk (clk), + .reset (reset), + .enable (enable), + .frm (frm), + .a (a_row19[i]), + .b (b_col19[i]), + .y (mult_result_tf32), + `UNUSED_PIN(fflags) + ); +``` + +**Why `IN_SIGW=11`?** TF32 has 10 explicit mantissa bits; the significand width passed to BHF includes the implicit leading 1, so the correct value is 11 — matching fp16 despite TF32's wider exponent field. + +### 3c. 
Route Through the Result Mux + +Add `3'd3` (matching `TCU_TF32_ID`) to the format select mux: + +```sv +logic [32:0] mult_result_mux; +always_comb begin + case (fmt_s_delayed) + 3'd1: mult_result_mux = mult_result_fp16; + 3'd2: mult_result_mux = mult_result_bf16; + 3'd3: mult_result_mux = mult_result_tf32; // <-- add + default: mult_result_mux = 'x; + endcase +end +``` + +The selected result flows into the existing fp32 accumulator chain unchanged — no accumulator modifications are needed since all three formats produce a recoded fp32 output from the multiplier. + +--- + +## Step 4: Testing + +Clean and rebuild with the TF32 configuration before running, since input/output types are compiled into the kernel binary: + +```bash +# Clean prior build +make -C tests/regression/sgemm_tcu clean + +# Rebuild for 8 threads, TF32 input, FP32 output +CONFIGS="-DNUM_THREADS=8 -DITYPE=tf32 -DOTYPE=fp32" \ + make -C tests/regression/sgemm_tcu + +# Run on RTL simulator with Tensor Core (BHF backend) enabled +CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE -DTCU_BHF" \ + ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu +``` + +--- From 44e027c0f5b2da7b885759cfac99531af2970868 Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:50:40 -0400 Subject: [PATCH 5/5] Rename assigmnent9_solution.md to assignment9_solution.md --- Solutions/{assigmnent9_solution.md => assignment9_solution.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Solutions/{assigmnent9_solution.md => assignment9_solution.md} (100%) diff --git a/Solutions/assigmnent9_solution.md b/Solutions/assignment9_solution.md similarity index 100% rename from Solutions/assigmnent9_solution.md rename to Solutions/assignment9_solution.md