From f1dad61fd79023370374ff3c1ade526ce8490bfb Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Wed, 22 Apr 2026 23:35:33 -0400 Subject: [PATCH 1/5] Create assignment5_solution.md --- Solutions/assignment5_solution.md | 284 ++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 Solutions/assignment5_solution.md diff --git a/Solutions/assignment5_solution.md b/Solutions/assignment5_solution.md new file mode 100644 index 0000000..dddc249 --- /dev/null +++ b/Solutions/assignment5_solution.md @@ -0,0 +1,284 @@ +# Assignment 5: Dot Product Acceleration (SimX) — Solution + +This document walks through the complete solution for extending the Vortex GPU microarchitecture with a custom `VX_DOT8` RISC-V instruction and implementing it in the SimX cycle-level simulator. + +--- + +## Overview + +`VX_DOT8` computes the integer dot product of two packed vectors of four `int8` elements: + +``` +rd = A1*B1 + A2*B2 + A3*B3 + A4*B4 +``` + +The instruction uses the **R-Type** RISC-V format with: +- `opcode = 0x0B` (RISC-V custom-0) +- `funct7 = 3` +- `funct3 = 0` + +--- + +## Step 1: ISA Extension — `vx_intrinsics.h` + +Add the `vx_dot8` inline intrinsic using GAS `.insn r` pseudo-instruction syntax. 
+ +```cpp +// DOT8: computes dot product of two packed int8x4 vectors +inline int vx_dot8(int a, int b) { + int ret; + asm volatile (".insn r %1, 0, 3, %0, %2, %3" + : "=r"(ret) + : "i"(RISCV_CUSTOM0), "r"(a), "r"(b)); + return ret; +} +``` + +**Breakdown of `.insn r` arguments:** +| Field | Value | Notes | +|----------|-----------------|--------------------------------| +| opcode | `RISCV_CUSTOM0` | Expands to `0x0B` | +| funct3 | `0` | Selects DOT8 within custom ops | +| funct7 | `3` | Selects DOT8 within custom ops | +| rd | `%0` | Output register | +| rs1 | `%2` | Packed A input | +| rs2 | `%3` | Packed B input | + +> Reference: [RISC-V `.insn` format — GNU Binutils docs](https://sourceware.org/binutils/docs/as/RISC_002dV_002dFormats.html) + +--- + +## Step 2: Matrix Multiplication Kernel + +### Directory Setup + +Clone `tests/regression/sgemm` into `tests/regression/dot8` and update the Makefile: + +```makefile +# tests/regression/dot8/Makefile +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := dot8 + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) +SRCS := $(SRC_DIR)/main.cpp +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n32 + +include ../common.mk +``` + +### `common.h` + +Define the source and destination types used across the kernel and host: + +```cpp +#ifndef _COMMON_H_ +#define _COMMON_H_ + +typedef int8_t SrcType; +typedef int32_t DstType; + +typedef struct { + uint32_t grid_dim[2]; + uint32_t size; + uint64_t A_addr; + uint64_t B_addr; + uint64_t C_addr; +} kernel_arg_t; + +#endif +``` + +### `kernel.cpp` + +Each GPU thread computes one output cell `C[row][col]`. Rows from A and columns from B are packed into `uint32_t` registers before calling `vx_dot8`. 
+
+```cpp
+void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
+  auto A = reinterpret_cast<SrcType*>(arg->A_addr);
+  auto B = reinterpret_cast<SrcType*>(arg->B_addr);
+  auto C = reinterpret_cast<DstType*>(arg->C_addr);
+  auto size = arg->size;
+
+  int row = blockIdx.x;
+  int col = blockIdx.y;
+
+  DstType sum(0);
+  for (int k = 0; k < size; k += 4) {
+    // Pack 4 consecutive elements from row of A (row-major)
+    uint32_t packedA = *((uint32_t*)(A + (row * size + k)));
+
+    // Pack 4 elements from column of B (row-major, non-contiguous)
+    uint32_t packedB = ((uint8_t)B[(k + 0) * size + col] << 0)
+                     | ((uint8_t)B[(k + 1) * size + col] << 8)
+                     | ((uint8_t)B[(k + 2) * size + col] << 16)
+                     | ((uint8_t)B[(k + 3) * size + col] << 24);
+
+    sum += vx_dot8(packedA, packedB);
+  }
+
+  C[row * size + col] = sum;
+}
+
+int main() {
+  kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
+  return vx_spawn_threads(2, arg->grid_dim, nullptr,
+                          (vx_kernel_func_cb)kernel_body, arg);
+}
+```
+
+### `main.cpp` — Key Changes from `sgemm`
+
+**Types:** Use `SrcType` (`int8_t`) for input matrices and `DstType` (`int32_t`) for the output.
+
+**Buffer sizes** must reflect the correct element size in bytes:
+```cpp
+uint32_t src_buf_size = size * size * sizeof(SrcType); // int8_t buffers
+uint32_t dst_buf_size = size * size * sizeof(DstType); // int32_t buffer
+```
+
+**CPU reference implementation** for verification:
+```cpp
+static void matmul_cpu(DstType* out, const SrcType* A, const SrcType* B,
+                       uint32_t width, uint32_t height) {
+  for (uint32_t row = 0; row < height; ++row) {
+    for (uint32_t col = 0; col < width; ++col) {
+      DstType sum(0);
+      for (uint32_t e = 0; e < width; ++e) {
+        sum += (DstType)(A[row * width + e]) * (DstType)(B[e * width + col]);
+      }
+      out[row * width + col] = sum;
+    }
+  }
+}
+```
+
+---
+
+## Step 3: SimX Simulator Implementation
+
+### `types.h` — Add `DOT8` to `AluType`
+
+```cpp
+enum class AluType {
+  // ... existing types ...
+  CZERO,
+  DOT8   // <-- add this
+};
+
+// Also update the ostream operator:
+inline std::ostream& operator<<(std::ostream& os, const AluType& type) {
+  switch (type) {
+  // ... existing cases ...
+  case AluType::DOT8: os << "DOT8"; break;
+  default: assert(false);
+  }
+  return os;
+}
+```
+
+### `decode.cpp` — Decode the New Instruction
+
+Add the `DOT8` string in `op_string()`:
+
+```cpp
+case AluType::DOT8: return {"DOT8", ""};
+```
+
+Add a new `case 3` block inside the custom-instruction switch in `Emulator::decode()`:
+
+```cpp
+case 3: {
+  switch (funct3) {
+  case 0: { // DOT8
+    auto instr = std::allocate_shared<Instr>(instr_pool_, uuid, FUType::ALU);
+    instr->setOpType(AluType::DOT8);
+    instr->setArgs(IntrAluArgs{0, 0, 0});
+    instr->setDestReg(rd, RegType::Integer);
+    instr->setSrcReg(0, rs1, RegType::Integer);
+    instr->setSrcReg(1, rs2, RegType::Integer);
+    ibuffer.push_back(instr);
+  } break;
+  default:
+    std::abort();
+  }
+} break;
+```
+
+### `execute.cpp` — Emulate the Dot Product
+
+Add a `DOT8` case inside the ALU execution switch.
Each byte is sign-extended to `int8_t` before multiplication, and products are accumulated as `int32_t`: + +```cpp +case AluType::DOT8: { + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + + uint32_t packedA = rs1_data[t].u; + uint32_t packedB = rs2_data[t].u; + + // Extract and sign-extend each byte + int8_t a0 = (int8_t)(packedA ), b0 = (int8_t)(packedB ); + int8_t a1 = (int8_t)(packedA >> 8), b1 = (int8_t)(packedB >> 8); + int8_t a2 = (int8_t)(packedA >> 16), b2 = (int8_t)(packedB >> 16); + int8_t a3 = (int8_t)(packedA >> 24), b3 = (int8_t)(packedB >> 24); + + int32_t sum = (int32_t)(a0 * b0) + + (int32_t)(a1 * b1) + + (int32_t)(a2 * b2) + + (int32_t)(a3 * b3); + + DP(3, "*** DOT8[" << t << "]: a=0x" << std::hex << packedA + << ", b=0x" << packedB << ", c=0x" << sum << std::dec); + + rd_data[t].i = sum; + } +} break; +``` + +### `func_unit.cpp` — Set 2-Cycle Latency + +Add `DOT8` to the 2-cycle latency group in `AluUnit::tick()`: + +```cpp +case AluType::AND: +case AluType::OR: +case AluType::CZERO: +case AluType::DOT8: // <-- add here + delay = 2; + break; +``` + +--- + +## Step 4: Testing + +### Build and Run + +```bash +# Build the dot8 regression test +make -C tests/regression/dot8 + +# Build the simulator +make -s + +# Run with SimX (4 cores, 4 warps, 4 threads) +./ci/blackbox.sh --driver=simx --cores=4 --warps=4 --threads=4 --app=dot8 +``` + +### Performance Sweep + +Run the following configurations with `N=256` on a 4-core GPU and record **total instruction count** and **execution cycles**: + +| Warps | Threads | +|-------|---------| +| 4 | 4 | +| 4 | 8 | +| 8 | 4 | +| 8 | 8 | + +Compare against the scalar `int8_t` baseline kernel. The `VX_DOT8` version should show a meaningful reduction in instruction count by replacing 4 multiplies + 3 adds with a single fused instruction. 
From 31e6396d593b4e94455e543cebe1849e733ddcae Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:04:42 -0400 Subject: [PATCH 2/5] Create assignment6_solution.md --- Solutions/assignment6_solution.md | 284 ++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 Solutions/assignment6_solution.md diff --git a/Solutions/assignment6_solution.md b/Solutions/assignment6_solution.md new file mode 100644 index 0000000..c9bb664 --- /dev/null +++ b/Solutions/assignment6_solution.md @@ -0,0 +1,284 @@ +# Assignment 6: Dot Product Acceleration (RTL) — Solution + +This document walks through the complete solution for implementing `VX_DOT8` in the Vortex RTL hardware. Steps 1 and 2 (ISA extension and kernel) are identical to Assignment 5 — refer to that solution for details. This document focuses on the RTL-specific changes in Step 3. + +--- + +## Overview + +Rather than emulating `VX_DOT8` in the SimX software simulator, this assignment implements it as a real hardware module (`VX_alu_dot8`) wired into the ALU pipeline as a third sub-unit alongside the integer ALU and MulDiv unit. + +--- + +## Step 1 & 2: ISA Extension and Kernel + +Unchanged from Assignment 5. See that solution for: +- `vx_dot8` intrinsic in `vx_intrinsics.h` +- `kernel.cpp` packing and accumulation logic +- `main.cpp` buffer sizing and CPU reference implementation + +--- + +## Step 3: Hardware RTL Implementation + +### `VX_config.vh` — Define `LATENCY_DOT8` + +Add the DOT8 latency macro alongside the other functional unit latencies: + +```verilog +// DOT8 Latency +`ifndef LATENCY_DOT8 +`define LATENCY_DOT8 2 +`endif +``` + +### `VX_gpu_pkg.sv` — Add `INST_ALU_DOT8` Opcode + +Replace the unused ALU opcode slot `4'b0001` with the DOT8 type: + +```verilog +localparam INST_ALU_ADD = 4'b0000; +localparam INST_ALU_DOT8 = 4'b0001; // was INST_ALU_UNUSED +localparam INST_ALU_LUI = 4'b0010; +// ... 
rest unchanged +``` + +### `VX_trace_pkg.sv` — Add Trace String + +Add a `DOT8` case to the ALU trace decoder so pipeline traces print the instruction name correctly: + +```verilog +INST_ALU_AND: `TRACE(level, ("AND")) +INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ")) +INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ")) +INST_ALU_DOT8: `TRACE(level, ("DOT8")) // <-- add this +default: `TRACE(level, ("?")) +``` + +### `VX_decode.sv` — Decode the Instruction + +Add a `7'd3` case inside the custom-opcode block to decode `VX_DOT8`. The instruction targets the ALU functional unit and uses `ALU_TYPE_OTHER` to route it to the new dot8 sub-unit (rather than the integer ALU or MulDiv): + +```verilog +7'd3: begin + case (funct3) + 3'h0: begin // DOT8 + ex_type = EX_ALU; + op_type = INST_ALU_DOT8; + op_args.alu = '0; + op_args.alu.xtype = ALU_TYPE_OTHER; + `USED_IREG (rd); + `USED_IREG (rs1); + `USED_IREG (rs2); + end + default:; + endcase +end +``` + +**Key decisions:** +- `EX_ALU` routes this instruction through the ALU unit's dispatch logic +- `INST_ALU_DOT8` is the op_type tag used for tracing +- `ALU_TYPE_OTHER` is the xtype field checked in `VX_alu_unit.sv` to select the dot8 PE +- `USED_IREG` macros register rd, rs1, and rs2 as active, enabling writeback and scoreboard tracking + +### `VX_alu_dot8.sv` — New Hardware Module + +Create `hw/rtl/core/VX_alu_dot8.sv`. The module uses `VX_pe_serializer` to time-multiplex lanes across PEs, then instantiates one PE per `NUM_PES` that computes the dot product combinatorially before latching through `BUFFER_EX`. + +The key computation — sign-extending each byte and computing the four partial products — is: + +```verilog +assign c = (signed'(a[31:24]) * signed'(b[31:24])) + + (signed'(a[23:16]) * signed'(b[23:16])) + + (signed'(a[15:8]) * signed'(b[15:8])) + + (signed'(a[7:0]) * signed'(b[7:0])); +``` + +`signed'(...)` performs an in-place cast to signed before multiplication, so each 8-bit slice is treated as a two's-complement `int8`. 
The four products are summed into a 32-bit result. + +The result is pipelined through the `BUFFER_EX` macro at the configured `LATENCY_DOT8` (2 cycles): + +```verilog +`BUFFER_EX(result, c, pe_enable, 1, LATENCY_DOT8); +assign pe_data_out[i] = `XLEN'(result); +``` + +Full module: + +```verilog +`include "VX_define.vh" + +module VX_alu_dot8 import VX_gpu_pkg::*; #( + parameter `STRING INSTANCE_ID = "", + parameter NUM_LANES = 1 +) ( + input wire clk, + input wire reset, + VX_execute_if.slave execute_if, + VX_result_if.master result_if +); + `UNUSED_SPARAM (INSTANCE_ID) + localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); + localparam PID_WIDTH = `UP(PID_BITS); + localparam TAG_WIDTH = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + 1 + + NUM_REGS_BITS + PID_WIDTH + 1 + 1; + localparam LATENCY_DOT8 = `LATENCY_DOT8; + localparam PE_RATIO = 1; + localparam NUM_PES = `UP(NUM_LANES / PE_RATIO); + + `UNUSED_VAR (execute_if.data.op_type) + `UNUSED_VAR (execute_if.data.op_args) + `UNUSED_VAR (execute_if.data.rs3_data) + + wire pe_enable; + wire [NUM_LANES-1:0][2*`XLEN-1:0] data_in; + wire [NUM_PES-1:0][2*`XLEN-1:0] pe_data_in; + wire [NUM_PES-1:0][`XLEN-1:0] pe_data_out; + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in + assign data_in[i][0 +: `XLEN] = execute_if.data.rs1_data[i]; + assign data_in[i][`XLEN +: `XLEN] = execute_if.data.rs2_data[i]; + end + + VX_pe_serializer #( + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), + .LATENCY (LATENCY_DOT8), + .DATA_IN_WIDTH (2 * `XLEN), + .DATA_OUT_WIDTH (`XLEN), + .TAG_WIDTH (TAG_WIDTH), + .PE_REG (1) + ) pe_serializer ( + .clk (clk), + .reset (reset), + .valid_in (execute_if.valid), + .data_in (data_in), + .tag_in ({ + execute_if.data.uuid, + execute_if.data.wid, + execute_if.data.tmask, + execute_if.data.PC, + execute_if.data.wb, + execute_if.data.rd, + execute_if.data.pid, + execute_if.data.sop, + execute_if.data.eop + }), + .ready_in (execute_if.ready), + .pe_enable (pe_enable), + .pe_data_in (pe_data_out), + 
.pe_data_out(pe_data_in), + .valid_out (result_if.valid), + .data_out (result_if.data.data), + .tag_out ({ + result_if.data.uuid, + result_if.data.wid, + result_if.data.tmask, + result_if.data.PC, + result_if.data.wb, + result_if.data.rd, + result_if.data.pid, + result_if.data.sop, + result_if.data.eop + }), + .ready_out (result_if.ready) + ); + + for (genvar i = 0; i < NUM_PES; ++i) begin : g_PEs + /* verilator lint_off UNUSEDSIGNAL */ + wire [`XLEN-1:0] a = pe_data_in[i][0 +: `XLEN]; + wire [`XLEN-1:0] b = pe_data_in[i][`XLEN +: `XLEN]; + /* verilator lint_on UNUSEDSIGNAL */ + wire [31:0] c, result; + + assign c = (signed'(a[31:24]) * signed'(b[31:24])) + + (signed'(a[23:16]) * signed'(b[23:16])) + + (signed'(a[15:8]) * signed'(b[15:8])) + + (signed'(a[7:0]) * signed'(b[7:0])); + + `BUFFER_EX(result, c, pe_enable, 1, LATENCY_DOT8); + assign pe_data_out[i] = `XLEN'(result); + + `ifdef DBG_TRACE_PIPELINE + always @(posedge clk) begin + if (pe_enable) begin + `TRACE(2, ("%t: %s dot8[%0d]: a=0x%0h, b=0x%0h, c=0x%0h\n", + $time, INSTANCE_ID, i, a, b, c)) + end + end + `endif + end + +endmodule +``` + +### `VX_alu_unit.sv` — Wire in the New Sub-Unit + +Three changes are needed: updating the PE count, adding the routing condition, and instantiating the module. + +**1. Update PE count and index constants:** + +```verilog +localparam PE_COUNT = 1 + `EXT_M_ENABLED + 1; // +1 for DOT8 +localparam PE_SEL_BITS = `CLOG2(PE_COUNT); +localparam PE_IDX_INT = 0; +localparam PE_IDX_MDV = PE_IDX_INT + `EXT_M_ENABLED; +localparam PE_IDX_DOT8 = PE_IDX_MDV + 1; +``` + +**2. Add routing condition in the PE select logic:** + +```verilog +pe_select = PE_IDX_INT; // default: integer ALU +if (`EXT_M_ENABLED && (... xtype == ALU_TYPE_MULDIV)) + pe_select = PE_IDX_MDV; +else if (... xtype == ALU_TYPE_OTHER) + pe_select = PE_IDX_DOT8; +``` + +`ALU_TYPE_OTHER` is the xtype value set during decode, distinguishing DOT8 from both the integer ALU and MulDiv. + +**3. 
Instantiate `VX_alu_dot8` after `VX_alu_muldiv`:** + +```verilog +VX_alu_dot8 #( + .INSTANCE_ID (`SFORMATF(("%s-dot8%0d", INSTANCE_ID, block_idx))), + .NUM_LANES (NUM_LANES) +) dot8_unit ( + .clk (clk), + .reset (reset), + .execute_if (pe_execute_if[PE_IDX_DOT8]), + .result_if (pe_result_if[PE_IDX_DOT8]) +); +``` + +--- + +## Step 4: Testing + +### Build and Run + +```bash +# Build the dot8 regression test +make -C tests/regression/dot8 + +# Build the RTL simulator +make -s + +# Run with RTL simulation (4 cores, 4 warps, 4 threads) +./ci/blackbox.sh --driver=rtlsim --cores=4 --warps=4 --threads=4 --app=dot8 +``` + +### Performance Sweep + +Run the following configurations with `N=256` on a 4-core GPU and record **total instruction count** and **execution cycles**: + +| Warps | Threads | +|-------|---------| +| 4 | 4 | +| 4 | 8 | +| 8 | 4 | +| 8 | 8 | + +Compare against the scalar `int8_t` baseline. The RTL implementation should show the same instruction count reduction as the SimX version (fusing 4 multiplies + 3 adds into one instruction), with the added benefit of real pipeline timing at the configured 2-cycle latency. From 34c36a2c629001ce735c14f33d14b3356dc0e5e2 Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:07:17 -0400 Subject: [PATCH 3/5] Create assignment8_solution.md --- Solutions/assignment8_solution.md | 149 ++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 Solutions/assignment8_solution.md diff --git a/Solutions/assignment8_solution.md b/Solutions/assignment8_solution.md new file mode 100644 index 0000000..9cc82dc --- /dev/null +++ b/Solutions/assignment8_solution.md @@ -0,0 +1,149 @@ +# Assignment 8: Tensor Core Extension (SimX) — Solution + +This document walks through the complete solution for extending the Vortex Tensor Core to support the **TF32** input format in the SimX cycle-level simulator. 
+
+---
+
+## Overview
+
+TF32 (TensorFloat-32) is a 19-bit floating-point format padded to 32 bits:
+
+| Field    | Width |
+|----------|-------|
+| Sign     | 1 bit |
+| Exponent | 8 bits |
+| Mantissa | 10 bits |
+| Padding  | 13 bits (to fill 32-bit register) |
+
+It shares the same exponent range as IEEE fp32 but has the same mantissa precision as fp16, making it a natural drop-in for MMA operations that want fp32 dynamic range without full fp32 compute cost.
+
+The changes required are minimal and localized to two files: `sim/common/tensor_cfg.h` and `sim/simx/tensor_unit.cpp`.
+
+---
+
+## Step 1: Add TF32 Format Definition — `tensor_cfg.h`
+
+Add the `tf32` struct alongside the existing format definitions (`fp16`, `bf16`, etc.):
+
+```cpp
+struct tf32 {
+  using dtype = uint32_t;              // stored in a 32-bit register
+  static constexpr uint32_t id = 3;    // unique format ID used in select_FEDP
+  static constexpr uint32_t bits = 32;
+  static constexpr const char* name = "tf32";
+};
+```
+
+Then register it in `fmt_string()` so the simulator can print the format name in logs and error messages:
+
+```cpp
+inline const char* fmt_string(uint32_t fmt) {
+  switch (fmt) {
+  case fp16::id:  return fp16::name;
+  case bf16::id:  return bf16::name;
+  case fp32::id:  return fp32::name;
+  case int8::id:  return int8::name;
+  case uint8::id: return uint8::name;
+  case int4::id:  return int4::name;
+  case uint4::id: return uint4::name;
+  case tf32::id:  return tf32::name;  // <-- add this
+  default: return "";
+  }
+}
+```
+
+---
+
+## Step 2: FMA Specialization — `tensor_unit.cpp`
+
+Add a template specialization of `FMA` for `<vt::tf32, vt::fp32>`. This handles the case where both input matrices hold TF32 values and the accumulator is fp32.
+
+The two-step conversion process is:
+1. Use `rv_xtof_s(raw_bits, exp_bits=8, mant_bits=10, rm, &fflags)` to convert a raw TF32 bit-pattern to an IEEE fp32 bit-pattern. This correctly handles the exponent bias and mantissa alignment.
+2. 
Use `bit_cast<float>()` to reinterpret those bits as a C++ `float` without any numeric conversion.
+
+```cpp
+template <>
+struct FMA<vt::tf32, vt::fp32> {
+  static float eval(uint32_t a, uint32_t b, float c) {
+    uint32_t fflags = 0;
+    uint32_t rm = 0; // RNE: Round to Nearest, ties to Even
+
+    // Convert TF32 bit-patterns to IEEE fp32 bit-patterns
+    uint32_t a_bits = rv_xtof_s(a, 8, 10, rm, &fflags);
+    uint32_t b_bits = rv_xtof_s(b, 8, 10, rm, &fflags);
+
+    // Reinterpret bits as floats (no numeric conversion)
+    float a_f = bit_cast<float>(a_bits);
+    float b_f = bit_cast<float>(b_bits);
+
+    // Multiply-accumulate into the fp32 accumulator
+    return (a_f * b_f) + c;
+  }
+};
+```
+
+> **Note:** The Vortex Tensor Unit uses an unfused multiply-add here (`(a * b) + c`) rather than `std::fma`, consistent with how the hardware MMA pipeline accumulates partial products in separate stages.
+
+---
+
+## Step 3: Register TF32 in `select_FEDP` — `tensor_unit.cpp`
+
+`select_FEDP` maps `(input_type_id, output_type_id)` pairs to their FEDP evaluation function pointer. Add the TF32→FP32 case inside the `fp32` output branch:
+
+```cpp
+static PFN_FEDP select_FEDP(uint32_t IT, uint32_t OT) {
+  switch (OT) {
+  case vt::fp32::id:
+    switch (IT) {
+    case vt::fp16::id:
+      return FEDP<vt::fp16, vt::fp32>::eval;
+    case vt::bf16::id:
+      return FEDP<vt::bf16, vt::fp32>::eval;
+    case vt::tf32::id: // <-- add this case
+      return FEDP<vt::tf32, vt::fp32>::eval;
+    default:
+      std::cout << "Error: unsupported mma format: "
+                << IT << " -> " << OT << "!" << std::endl;
+      std::abort();
+    }
+  // ... other output type cases unchanged
+  }
+}
+```
+
+`FEDP` (Fused Element-wise Dot Product) wraps the `FMA` specialization in a loop over the vector elements, so no changes to `FEDP` itself are needed — the template instantiation is generated automatically from the specialization added in Step 2.
+ +--- + +## Step 4: Testing + +Clean any previous build and rebuild with the TF32 configuration before running, since input/output types are compiled into the kernel binary: + +```bash +# Clean prior build +make -C tests/regression/sgemm_tcu clean + +# Rebuild for 8 threads, TF32 input, FP32 output +CONFIGS="-DNUM_THREADS=8 -DITYPE=tf32 -DOTYPE=fp32" \ + make -C tests/regression/sgemm_tcu + +# Run on SimX with Tensor Core extension enabled +CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE" \ + ./ci/blackbox.sh --driver=simx --app=sgemm_tcu +``` + +--- + +## Step 5: Benchmark TF32 vs. FP16/BF16 + +Using `N=256` on a 4-core GPU, sweep the following thread configurations for `fp16`, `bf16`, and `tf32` (all accumulating to FP32): + +| Warps | Threads | +|-------|---------| +| 4 | 4 | +| 4 | 8 | +| 8 | 4 | +| 8 | 8 | + +Record **total instruction count** and **execution cycles** for each format and configuration. Because all three formats execute the same MMA instruction count (the loop trip count is unchanged), differences in cycles will reflect the per-operation latency of the FMA specialization and any pipeline effects from the wider 32-bit TF32 operands versus the 16-bit fp16/bf16 operands. 
From 89a05b8d3bce58948082c71ff66abc332645692a Mon Sep 17 00:00:00 2001
From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com>
Date: Thu, 23 Apr 2026 09:10:33 -0400
Subject: [PATCH 4/5] Create assigmnent9_solution.md

---
 Solutions/assigmnent9_solution.md | 174 ++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 Solutions/assigmnent9_solution.md

diff --git a/Solutions/assigmnent9_solution.md b/Solutions/assigmnent9_solution.md
new file mode 100644
index 0000000..b44e276
--- /dev/null
+++ b/Solutions/assigmnent9_solution.md
@@ -0,0 +1,174 @@
+# Assignment 9: Tensor Core (TF32) Support in RTL — Solution
+
+This document walks through the complete solution for extending the Vortex Tensor Core RTL to support **TF32** inputs via the **BHF (Berkeley HardFloat) backend**.
+
+---
+
+## Overview
+
+Assignment 8 added TF32 to the SimX software emulator. This assignment wires TF32 into the actual RTL pipeline, specifically the BHF FEDP (Fused Element-wise Dot Product) module that drives the hardware Tensor Core.
+
+The BHF backend uses `VX_tcu_bhf_fmul` — a parameterized floating-point multiplier — to compute products before accumulation. Adding TF32 means instantiating a new multiplier configured for TF32's 8-exponent / 10-mantissa bit format, unpacking TF32 operands correctly, and routing the result through the existing mux.
+
+Changes touch three files: `tensor_cfg.h`, `VX_tcu_pkg.sv`, and `VX_tcu_fedp_bhf.sv`. The `tensor_cfg.h` changes are identical to Assignment 8 and are not repeated here.
+
+---
+
+## Step 1: Format Definition — `tensor_cfg.h`
+
+Identical to Assignment 8. Add the `tf32` struct and register it in `fmt_string()`. See that solution for details.
+
+---
+
+## Step 2: Update Tensor Core Package — `VX_tcu_pkg.sv`
+
+Add the TF32 format ID constant alongside the other format IDs:
+
+```sv
+localparam TCU_FP16_ID = 1;
+localparam TCU_BF16_ID = 2;
+localparam TCU_TF32_ID = 3; // <-- add this
+localparam TCU_FP32_ID = 4;
+// ... 
integer format IDs unchanged +``` + +Add TF32 to the `trace_fmt` task so pipeline traces print `"tf32"` instead of `"?"`: + +```sv +task trace_fmt(input int level, input logic [3:0] fmt); + case (fmt) + TCU_FP16_ID: `TRACE(level, ("fp16")) + TCU_BF16_ID: `TRACE(level, ("bf16")) + TCU_TF32_ID: `TRACE(level, ("tf32")) // <-- add this + TCU_FP32_ID: `TRACE(level, ("fp32")) + TCU_U8_ID: `TRACE(level, ("u8")) + TCU_I4_ID: `TRACE(level, ("i4")) + TCU_U4_ID: `TRACE(level, ("u4")) + default: `TRACE(level, ("?")) + endcase +endtask +``` + +--- + +## Step 3: Extend BHF FEDP — `VX_tcu_fedp_bhf.sv` + +This is the main change. Three sub-steps are needed: unpacking TF32 operands, instantiating the TF32 multiplier, and routing its output through the result mux. + +### 3a. Unpack TF32 Operands + +The existing unpack block splits each 32-bit element into two 16-bit halves for fp16/bf16, giving `TCK = 2N` packed 16-bit lanes. TF32 elements are 19 bits wide and one element fills an entire 32-bit register, so there is no second element to unpack. 
The second slot is filled with zero to preserve the same `TCK`-wide pipeline width used by the accumulator: + +```sv +wire [TCK-1:0][15:0] a_row16; +wire [TCK-1:0][15:0] b_col16; +wire [TCK-1:0][18:0] a_row19; // <-- add +wire [TCK-1:0][18:0] b_col19; // <-- add + +for (genvar i = 0; i < N; i++) begin : g_unpack + // Existing fp16/bf16 unpacking (unchanged) + assign a_row16[2*i] = a_row[i][15:0]; + assign a_row16[2*i+1] = a_row[i][31:16]; + assign b_col16[2*i] = b_col[i][15:0]; + assign b_col16[2*i+1] = b_col[i][31:16]; + + // TF32: one 19-bit element per register; interleave zeros to match TCK width + assign a_row19[2*i] = a_row[i][18:0]; + assign a_row19[2*i+1] = 19'd0; + assign b_col19[2*i] = b_col[i][18:0]; + assign b_col19[2*i+1] = 19'd0; +end +``` + +The zero-interleaving means the accumulator still sees `TCK` inputs, but every other product is zero — effectively halving throughput for TF32, which is correct since TF32 elements are twice as wide as fp16/bf16. + +### 3b. Instantiate the TF32 Multiplier + +Inside the per-lane generate loop `g_prod`, add a `VX_tcu_bhf_fmul` instance configured for TF32's format: 8 exponent bits and 11 significand bits (10 explicit mantissa bits + 1 implicit leading bit). 
Input is in IEEE format (`IN_REC=0`); output is in recoded format (`OUT_REC=1`) to feed directly into the BHF accumulator: + +```sv +for (genvar i = 0; i < TCK; i++) begin : g_prod + wire [32:0] mult_result_fp16; + wire [32:0] mult_result_bf16; + wire [32:0] mult_result_tf32; // <-- add + + // FP16 multiplier (unchanged) + VX_tcu_bhf_fmul #( + .IN_EXPW (5), .IN_SIGW (11), + .OUT_EXPW(8), .OUT_SIGW(24), + .IN_REC(0), .OUT_REC(1), + .MUL_LATENCY(FMUL_LATENCY), .RND_LATENCY(FRND_LATENCY) + ) fp16_mul ( .clk(clk), .reset(reset), .enable(enable), .frm(frm), + .a(a_row16[i]), .b(b_col16[i]), .y(mult_result_fp16), + `UNUSED_PIN(fflags) ); + + // BF16 multiplier (unchanged) + VX_tcu_bhf_fmul #( + .IN_EXPW (8), .IN_SIGW (8), + .OUT_EXPW(8), .OUT_SIGW(24), + .IN_REC(0), .OUT_REC(1), + .MUL_LATENCY(FMUL_LATENCY), .RND_LATENCY(FRND_LATENCY) + ) bf16_mul ( .clk(clk), .reset(reset), .enable(enable), .frm(frm), + .a(a_row16[i]), .b(b_col16[i]), .y(mult_result_bf16), + `UNUSED_PIN(fflags) ); + + // TF32 multiplier: 8 exponent bits, 11 significand bits (10 mantissa + implicit 1) + VX_tcu_bhf_fmul #( + .IN_EXPW (8), .IN_SIGW (11), + .OUT_EXPW(8), .OUT_SIGW(24), + .IN_REC (0), + .OUT_REC(1), + .MUL_LATENCY(FMUL_LATENCY), + .RND_LATENCY(FRND_LATENCY) + ) tf32_mul ( + .clk (clk), + .reset (reset), + .enable (enable), + .frm (frm), + .a (a_row19[i]), + .b (b_col19[i]), + .y (mult_result_tf32), + `UNUSED_PIN(fflags) + ); +``` + +**Why `IN_SIGW=11`?** TF32 has 10 explicit mantissa bits; the significand width passed to BHF includes the implicit leading 1, so the correct value is 11 — matching fp16 despite TF32's wider exponent field. + +### 3c. 
Route Through the Result Mux + +Add `3'd3` (matching `TCU_TF32_ID`) to the format select mux: + +```sv +logic [32:0] mult_result_mux; +always_comb begin + case (fmt_s_delayed) + 3'd1: mult_result_mux = mult_result_fp16; + 3'd2: mult_result_mux = mult_result_bf16; + 3'd3: mult_result_mux = mult_result_tf32; // <-- add + default: mult_result_mux = 'x; + endcase +end +``` + +The selected result flows into the existing fp32 accumulator chain unchanged — no accumulator modifications are needed since all three formats produce a recoded fp32 output from the multiplier. + +--- + +## Step 4: Testing + +Clean and rebuild with the TF32 configuration before running, since input/output types are compiled into the kernel binary: + +```bash +# Clean prior build +make -C tests/regression/sgemm_tcu clean + +# Rebuild for 8 threads, TF32 input, FP32 output +CONFIGS="-DNUM_THREADS=8 -DITYPE=tf32 -DOTYPE=fp32" \ + make -C tests/regression/sgemm_tcu + +# Run on RTL simulator with Tensor Core (BHF backend) enabled +CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE -DTCU_BHF" \ + ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu +``` + +--- From 44e027c0f5b2da7b885759cfac99531af2970868 Mon Sep 17 00:00:00 2001 From: Aaron Kantsevoy <72467919+aakan511@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:50:40 -0400 Subject: [PATCH 5/5] Rename assigmnent9_solution.md to assignment9_solution.md --- Solutions/{assigmnent9_solution.md => assignment9_solution.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Solutions/{assigmnent9_solution.md => assignment9_solution.md} (100%) diff --git a/Solutions/assigmnent9_solution.md b/Solutions/assignment9_solution.md similarity index 100% rename from Solutions/assigmnent9_solution.md rename to Solutions/assignment9_solution.md