From d0cd9a4cbec4f69f4e0d93db689ea2de4195615f Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 4 Feb 2026 10:58:17 +0100
Subject: [PATCH 01/57] adding basic ssp sptrsv kernel (non-optimized)

---
 apps/CMakeLists.txt                           |   2 +
 apps/maxbsp_ssp_sptrsv.cpp                    | 108 ++++++++++++++++++
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp |  73 ++++++++++++
 3 files changed, 183 insertions(+)
 create mode 100644 apps/maxbsp_ssp_sptrsv.cpp
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 97935a92..2c7cbb5e 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -55,5 +55,7 @@ endif()
 
 endif()
 
+_add_executable( maxbsp_ssp_sptrsv )
+
 # Custom target to compile all the executables
 add_custom_target( build_executables DEPENDS ${executable_list} )
diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
new file mode 100644
index 00000000..f89aac94
--- /dev/null
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -0,0 +1,108 @@
+/*
+ * maxbsp_ssp_sptrsv.cpp
+ * Demonstrates maxbsp scheduling with staleness=2, then runs SpTRSV with SSP kernel.
+ */
+
+#include <iostream>
+#include <vector>
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/SparseExtra>
+#include "osp/auxiliary/sptrsv_simulator/sptrsv.hpp"
+#include "osp/bsp/model/BspInstance.hpp"
+#include "osp/bsp/model/BspSchedule.hpp"
+#include "osp/bsp/model/MaxBspSchedule.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp"
+#include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
+#include <chrono>
+
+using namespace osp;
+
+int main(int argc, char* argv[]) {
+    // Accept matrix filename and iteration count as arguments
+    std::string filename = "../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx";
+    int num_iterations = 1;
+    if (argc > 1) {
+        filename = argv[1];
+    }
+    if (argc > 2) {
+        num_iterations = std::stoi(argv[2]);
+    }
+
+    // Load matrix
+    Eigen::SparseMatrix<double, Eigen::RowMajor, int32_t> lCsr;
+    bool matrixLoadSuccess = Eigen::loadMarket(lCsr, filename);
+    if (!matrixLoadSuccess) {
+        std::cerr << "Failed to read matrix from " << filename << std::endl;
+        return 1;
+    }
+    std::cout << "Loaded matrix of size " << lCsr.rows() << " x " << lCsr.cols() << " with " << lCsr.nonZeros() << " non-zeros.\n";
+
+    // Setup graph and architecture
+    SparseMatrixImp<int32_t> graph;
+    graph.SetCsr(&lCsr);
+    Eigen::SparseMatrix<double, Eigen::ColMajor, int32_t> lCsc = lCsr;
+    graph.SetCsc(&lCsc);
+    BspArchitecture<SparseMatrixImp<int32_t>> architecture(16, 1, 500); // 16 processors
+    BspInstance<SparseMatrixImp<int32_t>> instance(graph, architecture);
+
+    // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2)
+    GreedyVarianceSspScheduler<SparseMatrixImp<int32_t>> ssp_scheduler;
+    MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_schedule(instance);
+    ssp_scheduler.ComputeSchedule(ssp_schedule);
+
+    // Setup SpTRSV kernel
+    Sptrsv<int32_t> sptrsv_kernel(instance);
+    sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule);
+
+    size_t n = static_cast<size_t>(lCsc.cols());
+
+    // Benchmark SSP L-solve
+    double ssp_total_time = 0.0;
+    std::vector<double> ssp_result(n, 0.0);
+    for (int iter = 0; iter < num_iterations; ++iter) {
+        std::vector<double> x(n, 0.0);
+        std::vector<double> b(n, 1.0);
+        sptrsv_kernel.x_ = x.data();
+        sptrsv_kernel.b_ = b.data();
+        auto start = std::chrono::high_resolution_clock::now();
+        sptrsv_kernel.SspLsolveStaleness2();
+        auto end = std::chrono::high_resolution_clock::now();
+        ssp_total_time += std::chrono::duration<double>(end - start).count();
+        if (iter == 0) ssp_result = std::vector<double>(x.begin(), x.end());
+    }
+    double ssp_avg_time = ssp_total_time / num_iterations;
+
+    // Benchmark serial L-solve
+    double serial_total_time = 0.0;
+    std::vector<double> serial_result(n, 0.0);
+    for (int iter = 0; iter < num_iterations; ++iter) {
+        std::vector<double> x_serial(n, 0.0);
+        std::vector<double> b_serial(n, 1.0);
+        sptrsv_kernel.x_ = x_serial.data();
+        sptrsv_kernel.b_ = b_serial.data();
+        auto start = std::chrono::high_resolution_clock::now();
+        sptrsv_kernel.LsolveSerial();
+        auto end = std::chrono::high_resolution_clock::now();
+        serial_total_time += std::chrono::duration<double>(end - start).count();
+        if (iter == 0) serial_result = std::vector<double>(x_serial.begin(), x_serial.end());
+    }
+    double serial_avg_time = serial_total_time / num_iterations;
+
+    // Compare results
+    double max_diff = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        double diff = std::abs(ssp_result[i] - serial_result[i]);
+        if (diff > max_diff) max_diff = diff;
+    }
+    std::cout << "Max difference between SSP and serial L-solve: " << max_diff << std::endl;
+    if (max_diff < 1e-10) {
+        std::cout << "SSP L-solve matches serial L-solve!" << std::endl;
+    } else {
+        std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl;
+    }
+    std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl;
+    std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl;
+    std::cout << "MaxBSP with staleness=2 and SSP SpTRSV executed." << std::endl;
+    return 0;
+}
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 436e3dd4..2371e351 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -24,6 +24,7 @@ limitations under the License.
 
 #    include <Eigen/Core>
 #    include <algorithm>
+#    include <atomic>
 #    include <iostream>
 #    include <list>
 #    include <map>
@@ -36,6 +37,28 @@ limitations under the License.
 #    include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
 
 namespace osp {
+// Portable cpu_relax definition
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+inline void cpu_relax() { _mm_pause(); }
+#elif defined(__aarch64__)
+inline void cpu_relax() { asm volatile("yield" ::: "memory"); }
+#else
+inline void cpu_relax() { std::this_thread::yield(); }
+#endif
+// SSPBarrierRaph for staleness-aware synchronization
+class SSPBarrierRaph {
+private:
+    alignas(64) std::atomic<std::size_t> threadCounter{0U};
+    void barrier_sleep() const {}
+public:
+    void arrive() { threadCounter.fetch_add(1U, std::memory_order_release); }
+    void wait(std::size_t arr_token) {
+        while ((threadCounter.load(std::memory_order_relaxed) < arr_token) || (threadCounter.load(std::memory_order_acquire) < arr_token)) {
+            cpu_relax();
+        }
+    }
+};
 
 template <typename EigenIdxType>
 class Sptrsv {
@@ -126,6 +149,8 @@ class Sptrsv {
                     do {
                         node--;
                         vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back(
+                // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp ---
+
                             static_cast<EigenIdxType>(node));
                     } while (node > 0);
 
@@ -479,6 +504,54 @@ class Sptrsv {
 
     std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); }
 
+    // SSP Lsolve with staleness=2 (allowing at most one superstep of lag)
+    void SspLsolveStaleness2() {
+        constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference
+        const unsigned nthreads = instance_->NumberOfProcessors();
+        std::vector<std::atomic<unsigned>> stepDone(numSupersteps_);
+        for (auto &counter : stepDone) {
+            counter.store(0U, std::memory_order_relaxed);
+        }
+
+        auto *csr = instance_->GetComputationalDag().GetCSR();
+        const auto *outer = csr->outerIndexPtr();
+        const auto *inner = csr->innerIndexPtr();
+        const auto *vals = csr->valuePtr();
+
+        #pragma omp parallel num_threads(nthreads)
+        {
+            const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
+            for (unsigned step = 0; step < numSupersteps_; ++step) {
+                if (step >= staleness) {
+                    const unsigned waitStep = step - static_cast<unsigned>(staleness);
+                    while (stepDone[waitStep].load(std::memory_order_acquire) < nthreads) {
+                        cpu_relax();
+                    }
+                }
+                // Each thread processes its assigned node ranges for this superstep
+                const size_t boundsStrSize = boundsArrayL_[step][proc].size();
+                for (size_t index = 0; index < boundsStrSize; index += 2) {
+                    EigenIdxType lowerB = boundsArrayL_[step][proc][index];
+                    const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
+                    for (EigenIdxType node = lowerB; node <= upperB; ++node) {
+                        // Initialize solution for this node
+                        x_[node] = b_[node];
+                        // Perform lower-triangular solve for this node
+                        for (EigenIdxType i = outer[node];
+                             i < outer[node + 1] - 1;
+                             ++i) {
+                            // Subtract contributions from previously solved nodes
+                            x_[node] -= vals[i] * x_[inner[i]];
+                        }
+                        // Divide by diagonal element to complete solve for this node
+                        x_[node] /= vals[outer[node + 1] - 1];
+                    }
+                }
+                stepDone[step].fetch_add(1U, std::memory_order_release);
+            }
+        }
+    }
+
     virtual ~Sptrsv() = default;
 };
 

From 1c8622e500dacf2cd652ef83ae337110eb167a37 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 4 Feb 2026 13:32:04 +0100
Subject: [PATCH 02/57] Improvements: reusable step counters, wait backoff, speedup output

---
 apps/maxbsp_ssp_sptrsv.cpp                    |  3 ++
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 38 +++++++++++++++----
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index f89aac94..0377b7ce 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -103,6 +103,9 @@ int main(int argc, char* argv[]) {
     }
     std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl;
     std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl;
+    if (ssp_avg_time > 0.0) {
+        std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_avg_time) << "x" << std::endl;
+    }
     std::cout << "MaxBSP with staleness=2 and SSP SpTRSV executed." << std::endl;
     return 0;
 }
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 2371e351..cc58fe76 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -25,11 +25,14 @@ limitations under the License.
 #    include <Eigen/Core>
 #    include <algorithm>
 #    include <atomic>
+#    include <chrono>
 #    include <iostream>
 #    include <list>
 #    include <map>
+#    include <memory>
 #    include <random>
 #    include <stdexcept>
+#    include <thread>
 #    include <vector>
 
 #    include "osp/bsp/model/BspInstance.hpp"
@@ -91,6 +94,8 @@ class Sptrsv {
 
     std::vector<std::vector<std::vector<EigenIdxType>>> boundsArrayL_;
     std::vector<std::vector<std::vector<EigenIdxType>>> boundsArrayU_;
+    std::unique_ptr<std::atomic<unsigned>[]> stepDone_;
+    std::size_t stepDoneSize_ = 0U;
 
     Sptrsv() = default;
 
@@ -109,6 +114,13 @@ class Sptrsv {
             schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
 
         numSupersteps_ = schedule.NumberOfSupersteps();
+        if (stepDoneSize_ != static_cast<std::size_t>(numSupersteps_)) {
+            stepDone_ = std::make_unique<std::atomic<unsigned>[]>(numSupersteps_);
+            stepDoneSize_ = static_cast<std::size_t>(numSupersteps_);
+        }
+        for (std::size_t i = 0; i < stepDoneSize_; ++i) {
+            stepDone_[i].store(0U, std::memory_order_relaxed);
+        }
         size_t numberOfVertices = instance_->GetComputationalDag().NumVertices();
 
 #    pragma omp parallel num_threads(2)
@@ -508,9 +520,8 @@ class Sptrsv {
     void SspLsolveStaleness2() {
         constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference
         const unsigned nthreads = instance_->NumberOfProcessors();
-        std::vector<std::atomic<unsigned>> stepDone(numSupersteps_);
-        for (auto &counter : stepDone) {
-            counter.store(0U, std::memory_order_relaxed);
+        for (std::size_t i = 0; i < stepDoneSize_; ++i) {
+            stepDone_[i].store(0U, std::memory_order_relaxed);
         }
 
         auto *csr = instance_->GetComputationalDag().GetCSR();
@@ -524,11 +535,24 @@ class Sptrsv {
             for (unsigned step = 0; step < numSupersteps_; ++step) {
                 if (step >= staleness) {
                     const unsigned waitStep = step - static_cast<unsigned>(staleness);
-                    while (stepDone[waitStep].load(std::memory_order_acquire) < nthreads) {
-                        cpu_relax();
+                    unsigned spinCount = 0U;
+                    auto backoff = std::chrono::nanoseconds(50);
+                    while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) {
+                        if (spinCount < 2000U) {
+                            cpu_relax();
+                            ++spinCount;
+                        } else if (spinCount < 4000U) {
+                            std::this_thread::yield();
+                            ++spinCount;
+                        } else {
+                            std::this_thread::sleep_for(backoff);
+                            if (backoff < std::chrono::nanoseconds(500)) {
+                                backoff *= 2;
+                            }
+                        }
                     }
+                    std::atomic_thread_fence(std::memory_order_acquire);
                 }
-                // Each thread processes its assigned node ranges for this superstep
                 const size_t boundsStrSize = boundsArrayL_[step][proc].size();
                 for (size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType lowerB = boundsArrayL_[step][proc][index];
@@ -547,7 +571,7 @@ class Sptrsv {
                         x_[node] /= vals[outer[node + 1] - 1];
                     }
                 }
-                stepDone[step].fetch_add(1U, std::memory_order_release);
+                stepDone_[step].fetch_add(1U, std::memory_order_release);
             }
         }
     }

From d13bd4c570358fdaff83c34b26b7f5dcdc3465e3 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 4 Feb 2026 13:42:12 +0100
Subject: [PATCH 03/57] Make class for barrier functionality

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 81 +++++++++++--------
 1 file changed, 48 insertions(+), 33 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index cc58fe76..61123022 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -63,6 +63,49 @@ class SSPBarrierRaph {
     }
 };
 
+class SspStalenessBarrier {
+  private:
+    std::unique_ptr<std::atomic<unsigned>[]> stepDone_;
+    std::size_t stepDoneSize_ = 0U;
+
+  public:
+    void Reset(std::size_t numSupersteps) {
+        if (stepDoneSize_ != numSupersteps) {
+            stepDone_ = std::make_unique<std::atomic<unsigned>[]>(numSupersteps);
+            stepDoneSize_ = numSupersteps;
+        }
+        for (std::size_t i = 0; i < stepDoneSize_; ++i) {
+            stepDone_[i].store(0U, std::memory_order_relaxed);
+        }
+    }
+
+    void WaitIfNeeded(unsigned step, unsigned staleness, unsigned nthreads) {
+        if (step < staleness) {
+            return;
+        }
+        const unsigned waitStep = step - staleness;
+        unsigned spinCount = 0U;
+        auto backoff = std::chrono::nanoseconds(50);
+        while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) {
+            if (spinCount < 2000U) {
+                cpu_relax();
+                ++spinCount;
+            } else if (spinCount < 4000U) {
+                std::this_thread::yield();
+                ++spinCount;
+            } else {
+                std::this_thread::sleep_for(backoff);
+                if (backoff < std::chrono::nanoseconds(500)) {
+                    backoff *= 2;
+                }
+            }
+        }
+        std::atomic_thread_fence(std::memory_order_acquire);
+    }
+
+    void Arrive(unsigned step) { stepDone_[step].fetch_add(1U, std::memory_order_release); }
+};
+
 template <typename EigenIdxType>
 class Sptrsv {
     using UVertType = typename SparseMatrixImp<EigenIdxType>::VertexIdx;
@@ -94,8 +137,7 @@ class Sptrsv {
 
     std::vector<std::vector<std::vector<EigenIdxType>>> boundsArrayL_;
     std::vector<std::vector<std::vector<EigenIdxType>>> boundsArrayU_;
-    std::unique_ptr<std::atomic<unsigned>[]> stepDone_;
-    std::size_t stepDoneSize_ = 0U;
+    SspStalenessBarrier sspBarrier_;
 
     Sptrsv() = default;
 
@@ -114,13 +156,7 @@ class Sptrsv {
             schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
 
         numSupersteps_ = schedule.NumberOfSupersteps();
-        if (stepDoneSize_ != static_cast<std::size_t>(numSupersteps_)) {
-            stepDone_ = std::make_unique<std::atomic<unsigned>[]>(numSupersteps_);
-            stepDoneSize_ = static_cast<std::size_t>(numSupersteps_);
-        }
-        for (std::size_t i = 0; i < stepDoneSize_; ++i) {
-            stepDone_[i].store(0U, std::memory_order_relaxed);
-        }
+        sspBarrier_.Reset(static_cast<std::size_t>(numSupersteps_));
         size_t numberOfVertices = instance_->GetComputationalDag().NumVertices();
 
 #    pragma omp parallel num_threads(2)
@@ -520,9 +556,7 @@ class Sptrsv {
     void SspLsolveStaleness2() {
         constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference
         const unsigned nthreads = instance_->NumberOfProcessors();
-        for (std::size_t i = 0; i < stepDoneSize_; ++i) {
-            stepDone_[i].store(0U, std::memory_order_relaxed);
-        }
+        sspBarrier_.Reset(static_cast<std::size_t>(numSupersteps_));
 
         auto *csr = instance_->GetComputationalDag().GetCSR();
         const auto *outer = csr->outerIndexPtr();
@@ -533,26 +567,7 @@ class Sptrsv {
         {
             const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
-                if (step >= staleness) {
-                    const unsigned waitStep = step - static_cast<unsigned>(staleness);
-                    unsigned spinCount = 0U;
-                    auto backoff = std::chrono::nanoseconds(50);
-                    while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) {
-                        if (spinCount < 2000U) {
-                            cpu_relax();
-                            ++spinCount;
-                        } else if (spinCount < 4000U) {
-                            std::this_thread::yield();
-                            ++spinCount;
-                        } else {
-                            std::this_thread::sleep_for(backoff);
-                            if (backoff < std::chrono::nanoseconds(500)) {
-                                backoff *= 2;
-                            }
-                        }
-                    }
-                    std::atomic_thread_fence(std::memory_order_acquire);
-                }
+                sspBarrier_.WaitIfNeeded(step, static_cast<unsigned>(staleness), nthreads);
                 const size_t boundsStrSize = boundsArrayL_[step][proc].size();
                 for (size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType lowerB = boundsArrayL_[step][proc][index];
@@ -571,7 +586,7 @@ class Sptrsv {
                         x_[node] /= vals[outer[node + 1] - 1];
                     }
                 }
-                stepDone_[step].fetch_add(1U, std::memory_order_release);
+                sspBarrier_.Arrive(step);
             }
         }
     }

From fba4e7cb06602138626d63d553e30fc186c79b31 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 4 Feb 2026 13:47:51 +0100
Subject: [PATCH 04/57] Adding comments and removing SSPBarrierRaph

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 61123022..de32d6c8 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -49,20 +49,9 @@ inline void cpu_relax() { asm volatile("yield" ::: "memory"); }
 #else
 inline void cpu_relax() { std::this_thread::yield(); }
 #endif
-// SSPBarrierRaph for staleness-aware synchronization
-class SSPBarrierRaph {
-private:
-    alignas(64) std::atomic<std::size_t> threadCounter{0U};
-    void barrier_sleep() const {}
-public:
-    void arrive() { threadCounter.fetch_add(1U, std::memory_order_release); }
-    void wait(std::size_t arr_token) {
-        while ((threadCounter.load(std::memory_order_relaxed) < arr_token) || (threadCounter.load(std::memory_order_acquire) < arr_token)) {
-            cpu_relax();
-        }
-    }
-};
 
+// Staleness-aware barrier for SSP: threads may run up to (staleness-1) steps ahead.
+// Internally tracks per-step completion counts and uses adaptive backoff to limit spinning.
 class SspStalenessBarrier {
   private:
     std::unique_ptr<std::atomic<unsigned>[]> stepDone_;
@@ -70,6 +59,7 @@ class SspStalenessBarrier {
 
   public:
     void Reset(std::size_t numSupersteps) {
+        // Reinitialize counters for a new schedule/run.
         if (stepDoneSize_ != numSupersteps) {
             stepDone_ = std::make_unique<std::atomic<unsigned>[]>(numSupersteps);
             stepDoneSize_ = numSupersteps;
@@ -80,6 +70,7 @@ class SspStalenessBarrier {
     }
 
     void WaitIfNeeded(unsigned step, unsigned staleness, unsigned nthreads) {
+        // Enforce: step may start only when all threads completed (step - staleness).
         if (step < staleness) {
             return;
         }
@@ -87,6 +78,7 @@ class SspStalenessBarrier {
         unsigned spinCount = 0U;
         auto backoff = std::chrono::nanoseconds(50);
         while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) {
+            // Adaptive backoff: spin -> yield -> short sleep to reduce contention.
             if (spinCount < 2000U) {
                 cpu_relax();
                 ++spinCount;
@@ -103,6 +95,7 @@ class SspStalenessBarrier {
         std::atomic_thread_fence(std::memory_order_acquire);
     }
 
+    // Mark completion of a superstep by this thread.
     void Arrive(unsigned step) { stepDone_[step].fetch_add(1U, std::memory_order_release); }
 };
 
@@ -552,10 +545,12 @@ class Sptrsv {
 
     std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); }
 
-    // SSP Lsolve with staleness=2 (allowing at most one superstep of lag)
+    // SSP Lsolve with staleness=2 (allowing at most one superstep of lag).
+    // Uses the staleness barrier to respect dependencies between supersteps.
     void SspLsolveStaleness2() {
         constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference
         const unsigned nthreads = instance_->NumberOfProcessors();
+        // Reset per-step completion counters for this run.
         sspBarrier_.Reset(static_cast<std::size_t>(numSupersteps_));
 
         auto *csr = instance_->GetComputationalDag().GetCSR();
@@ -567,7 +562,9 @@ class Sptrsv {
         {
             const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
+                // Ensure we are not more than (staleness-1) supersteps ahead.
                 sspBarrier_.WaitIfNeeded(step, static_cast<unsigned>(staleness), nthreads);
+                // Process nodes assigned to this (step, proc) pair.
                 const size_t boundsStrSize = boundsArrayL_[step][proc].size();
                 for (size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType lowerB = boundsArrayL_[step][proc][index];
@@ -586,6 +583,7 @@ class Sptrsv {
                         x_[node] /= vals[outer[node + 1] - 1];
                     }
                 }
+                // Signal completion of this superstep for staleness tracking.
                 sspBarrier_.Arrive(step);
             }
         }

From 37f0d0aa65b9892b948924ff4be028c87e395799 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 4 Feb 2026 14:55:12 +0100
Subject: [PATCH 05/57] comparing against growlocal

---
 apps/maxbsp_ssp_sptrsv.cpp | 53 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 0377b7ce..dd1112d0 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -51,9 +51,13 @@ int main(int argc, char* argv[]) {
     MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_schedule(instance);
     ssp_scheduler.ComputeSchedule(ssp_schedule);
 
+    // Create a non-SSP schedule using GrowLocalAutoCores
+    GrowLocalAutoCores<SparseMatrixImp<int32_t>> growlocal_scheduler;
+    BspSchedule<SparseMatrixImp<int32_t>> growlocal_schedule(instance);
+    growlocal_scheduler.ComputeSchedule(growlocal_schedule);
+
     // Setup SpTRSV kernel
     Sptrsv<int32_t> sptrsv_kernel(instance);
-    sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule);
 
     size_t n = static_cast<size_t>(lCsc.cols());
 
@@ -63,6 +67,7 @@ int main(int argc, char* argv[]) {
     for (int iter = 0; iter < num_iterations; ++iter) {
         std::vector<double> x(n, 0.0);
         std::vector<double> b(n, 1.0);
+        sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule);
         sptrsv_kernel.x_ = x.data();
         sptrsv_kernel.b_ = b.data();
         auto start = std::chrono::high_resolution_clock::now();
@@ -73,6 +78,23 @@ int main(int argc, char* argv[]) {
     }
     double ssp_avg_time = ssp_total_time / num_iterations;
 
+    // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation)
+    double growlocal_total_time = 0.0;
+    std::vector<double> growlocal_result(n, 0.0);
+    for (int iter = 0; iter < num_iterations; ++iter) {
+        std::vector<double> x(n, 0.0);
+        std::vector<double> b(n, 1.0);
+        sptrsv_kernel.SetupCsrNoPermutation(growlocal_schedule);
+        sptrsv_kernel.x_ = x.data();
+        sptrsv_kernel.b_ = b.data();
+        auto start = std::chrono::high_resolution_clock::now();
+        sptrsv_kernel.LsolveNoPermutation();
+        auto end = std::chrono::high_resolution_clock::now();
+        growlocal_total_time += std::chrono::duration<double>(end - start).count();
+        if (iter == 0) growlocal_result = std::vector<double>(x.begin(), x.end());
+    }
+    double growlocal_avg_time = growlocal_total_time / num_iterations;
+
     // Benchmark serial L-solve
     double serial_total_time = 0.0;
     std::vector<double> serial_result(n, 0.0);
@@ -101,11 +123,38 @@ int main(int argc, char* argv[]) {
     } else {
         std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl;
     }
+    double max_diff_growlocal = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        double diff = std::abs(growlocal_result[i] - serial_result[i]);
+        if (diff > max_diff_growlocal) max_diff_growlocal = diff;
+    }
+    std::cout << "Max difference between GrowLocalAutoCores and serial L-solve: " << max_diff_growlocal << std::endl;
+    if (max_diff_growlocal < 1e-10) {
+        std::cout << "GrowLocalAutoCores L-solve matches serial L-solve!" << std::endl;
+    } else {
+        std::cout << "GrowLocalAutoCores L-solve does NOT match serial L-solve!" << std::endl;
+    }
+
+    double max_diff_ssp_growlocal = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        double diff = std::abs(ssp_result[i] - growlocal_result[i]);
+        if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff;
+    }
+    std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal << std::endl;
+
     std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl;
+    std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time
+              << " seconds" << std::endl;
     std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl;
     if (ssp_avg_time > 0.0) {
         std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_avg_time) << "x" << std::endl;
     }
-    std::cout << "MaxBSP with staleness=2 and SSP SpTRSV executed." << std::endl;
+    if (growlocal_avg_time > 0.0) {
+        std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl;
+    }
+    if (ssp_avg_time > 0.0) {
+        std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_avg_time) << "x" << std::endl;
+    }
+    std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl;
     return 0;
 }

From 057ffb0f01ddf487858faa4adc5500599bb2563b Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 5 Feb 2026 14:07:32 +0100
Subject: [PATCH 06/57] move executable in cmake

---
 apps/CMakeLists.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 2c7cbb5e..ef5823ee 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -36,16 +36,18 @@ if(Boost_FOUND)
 
 _add_executable( osp_turnus )
 
-_add_executable ( osp )
+_add_executable( osp )
 
 configure_file(config/osp_config.json osp_config.json COPYONLY)
 
-_add_executable ( bsp_test_suite )
+_add_executable( bsp_test_suite )
 
 _add_executable( coarser_plotter )
 
 if(Eigen3_FOUND)
-_add_executable ( sptrsv_test_suite )
+_add_executable( sptrsv_test_suite )
+
+_add_executable( maxbsp_ssp_sptrsv )
 endif()
 
 if (COPT_FOUND)
@@ -55,7 +57,5 @@ endif()
 
 endif()
 
-_add_executable( maxbsp_ssp_sptrsv )
-
 # Custom target to compile all the executables
 add_custom_target( build_executables DEPENDS ${executable_list} )

From 866e6269358f62c6791820fca75c896de2789a16 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 5 Feb 2026 15:35:43 +0100
Subject: [PATCH 07/57] FlatBarrier

---
 .../WeakBarriers/flat_barrier.hpp             | 93 +++++++++++++++++++
 .../GrowLocalAutoCoresParallel.hpp            |  3 +-
 include/osp/config/config.hpp                 | 25 +++++
 tests/CMakeLists.txt                          |  2 +
 tests/weak_barrier.cpp                        | 32 +++++++
 5 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
 create mode 100644 include/osp/config/config.hpp
 create mode 100644 tests/weak_barrier.cpp

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
new file mode 100644
index 00000000..9b35ac8d
--- /dev/null
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
@@ -0,0 +1,93 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+#include "osp/config/config.hpp"
+
+namespace osp {
+
+// Portable cpu_relax definition
+#if defined(__x86_64__) || defined(_M_X64)
+#    include <immintrin.h>
+
+inline void cpu_relax() { _mm_pause(); }
+#elif defined(__aarch64__)
+inline void cpu_relax() { asm volatile("yield" ::: "memory"); }
+#else
+inline void cpu_relax() { std::this_thread::yield(); }
+#endif
+
+struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag {
+    std::atomic<bool> flag_;
+    int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic<bool>)];
+
+    static_assert(std::atomic<bool>::is_always_lock_free);
+    static_assert(sizeof(int8_t) == 1U);
+};
+
+/**
+ * @brief A weak synchronisation barrier which can be reused.
+ * Instatiate with number of threads. Each thread should call "Arrive" with its thread id to indicate that its work has been
+ * completed. Each thread can then call "Wait" to wait till all other threads have completed their work.
+ *
+ * The barrier can be reset and reused after calling "Reset" for each thread.
+ *
+ * WARNING: The reset is NOT synchronised, thus a second FlatBarrier is required to synchronise the reset of the barrier. That is
+ * do NOT call "Reset" immediately after "Wait" as this could cause other threads not to see that the work has been completed.
+ *
+ */
+class FlatBarrier {
+  private:
+    std::vector<AlignedAtomicFlag> flags_;
+
+  public:
+    FlatBarrier(std::size_t numThreads) : flags_(std::vector<AlignedAtomicFlag>(numThreads)) {};
+
+    inline void Arrive(std::size_t threadId);
+    inline void Wait() const;
+    inline void Reset(std::size_t threadId);
+
+    FlatBarrier() = delete;
+    FlatBarrier(const FlatBarrier &) = delete;
+    FlatBarrier(FlatBarrier &&) = delete;
+    FlatBarrier &operator=(const FlatBarrier &) = delete;
+    FlatBarrier &operator=(FlatBarrier &&) = delete;
+    ~FlatBarrier() = default;
+};
+
+inline void FlatBarrier::Arrive(std::size_t threadId) { flags_[threadId].flag_.store(true, std::memory_order_relaxed); }
+
+inline void FlatBarrier::Wait() const {
+    for (const AlignedAtomicFlag &flag : flags_) {
+        std::size_t cntr = 0U;
+        while (not flag.flag_.load(std::memory_order_relaxed)) {
+            ++cntr;
+            if (cntr % 256U == 0U) {
+                cpu_relax();
+            }
+        }
+    }
+}
+
+inline void FlatBarrier::Reset(std::size_t threadId) { flags_[threadId].flag_.store(false, std::memory_order_relaxed); }
+
+}    // end namespace osp
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
index 7f9ac6cb..5f7bcaaf 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
@@ -36,11 +36,10 @@ limitations under the License.
 #include "osp/auxiliary/misc.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
 #include "osp/bsp/scheduler/Scheduler.hpp"
+#include "osp/config/config.hpp"
 
 namespace osp {
 
-static constexpr std::size_t CACHE_LINE_SIZE = 64;
-
 template <typename VertT, typename WeightT>
 struct GrowLocalAutoCoresParallelParams {
     VertT minSuperstepSize_ = 20;
diff --git a/include/osp/config/config.hpp b/include/osp/config/config.hpp
new file mode 100644
index 00000000..9cc6c5a8
--- /dev/null
+++ b/include/osp/config/config.hpp
@@ -0,0 +1,25 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+namespace osp {
+
+static constexpr std::size_t CACHE_LINE_SIZE = 64U;
+
+}    // end namespace osp
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ebc5c6cf..d6a8f8c2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -129,6 +129,8 @@ _add_test( bit_mask )
 
 _add_test( hash_pair )
 
+_add_test( weak_barrier )
+
 ## io
 _add_test( filereader DATA )
 
diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
new file mode 100644
index 00000000..833f2111
--- /dev/null
+++ b/tests/weak_barrier.cpp
@@ -0,0 +1,32 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#define BOOST_TEST_MODULE WeakBarrierTests
+
+#include <boost/test/unit_test.hpp>
+#include <cstdint>
+#include <memory>
+
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp"
+
+using namespace osp;
+
+BOOST_AUTO_TEST_CASE(TestAlignedAtomicFlag) {
+    BOOST_CHECK_EQUAL(sizeof(AlignedAtomicFlag), 64U);
+    BOOST_CHECK_EQUAL(alignof(AlignedAtomicFlag), 64U);
+}

From 361d516c700275e93651f8fd21ebeb1c24a5e819 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 5 Feb 2026 16:37:10 +0100
Subject: [PATCH 08/57] initial barrier version

---
 .../WeakBarriers/flat_barrier.hpp             |  5 +-
 tests/weak_barrier.cpp                        | 94 +++++++++++++++++++
 2 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
index 9b35ac8d..13beedba 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
@@ -37,7 +37,7 @@ inline void cpu_relax() { std::this_thread::yield(); }
 #endif
 
 struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag {
-    std::atomic<bool> flag_;
+    std::atomic<bool> flag_{false};
     int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic<bool>)];
 
     static_assert(std::atomic<bool>::is_always_lock_free);
@@ -54,6 +54,7 @@ struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag {
  * WARNING: The reset is NOT synchronised, thus a second FlatBarrier is required to synchronise the reset of the barrier. That is
  * do NOT call "Reset" immediately after "Wait" as this could cause other threads not to see that the work has been completed.
  *
+ * WARNING: A thread calling "Wait" before calling "Arrive" with its thread id results in a deadlock.
  */
 class FlatBarrier {
   private:
@@ -81,7 +82,7 @@ inline void FlatBarrier::Wait() const {
         std::size_t cntr = 0U;
         while (not flag.flag_.load(std::memory_order_relaxed)) {
             ++cntr;
-            if (cntr % 256U == 0U) {
+            if (cntr % 128U == 0U) {
                 cpu_relax();
             }
         }
diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
index 833f2111..08661f39 100644
--- a/tests/weak_barrier.cpp
+++ b/tests/weak_barrier.cpp
@@ -18,9 +18,13 @@ limitations under the License.
 
 #define BOOST_TEST_MODULE WeakBarrierTests
 
+#include <array>
 #include <boost/test/unit_test.hpp>
 #include <cstdint>
 #include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
 
 #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp"
 
@@ -30,3 +34,93 @@ BOOST_AUTO_TEST_CASE(TestAlignedAtomicFlag) {
     BOOST_CHECK_EQUAL(sizeof(AlignedAtomicFlag), 64U);
     BOOST_CHECK_EQUAL(alignof(AlignedAtomicFlag), 64U);
 }
+
+BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) {
+    constexpr std::size_t numThreads = 2U;
+    constexpr std::size_t numBarriers = 1024U;
+
+    std::vector<std::size_t> ans;
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;
+
+    std::array<FlatBarrier, 3U> barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(cntr);
+            }
+            barrier[0].Arrive(threadId);
+            barrier[0].Wait();
+            barrier[2].Reset(threadId);
+            barrier[1].Arrive(threadId);
+            barrier[1].Wait();
+            barrier[0].Reset(threadId);
+            barrier[2].Arrive(threadId);
+            barrier[2].Wait();
+            barrier[1].Reset(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+    for (std::size_t ind = 0U; ind < ans.size(); ++ind) {
+        BOOST_CHECK_EQUAL(ans[ind], ind / numThreads);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) {
+    constexpr std::size_t numThreads = 128U;
+    constexpr std::size_t numBarriers = 8U;
+
+    std::vector<std::size_t> ans;
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;
+
+    std::array<FlatBarrier, 3U> barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(cntr);
+            }
+            barrier[0].Arrive(threadId);
+            barrier[0].Wait();
+            barrier[2].Reset(threadId);
+            barrier[1].Arrive(threadId);
+            barrier[1].Wait();
+            barrier[0].Reset(threadId);
+            barrier[2].Arrive(threadId);
+            barrier[2].Wait();
+            barrier[1].Reset(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+    for (std::size_t ind = 0U; ind < ans.size(); ++ind) {
+        BOOST_CHECK_EQUAL(ans[ind], ind / numThreads);
+    }
+}

From 51a1ad136dbfb0d8cb2fd93a9e03cdc8c2251dea Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 5 Feb 2026 16:58:09 +0100
Subject: [PATCH 09/57] improved FlatBarrier

---
 .../WeakBarriers/flat_barrier.hpp             | 23 +++++++++----------
 tests/weak_barrier.cpp                        | 18 ++++-----------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
index 13beedba..ca7f4f21 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
@@ -49,12 +49,10 @@ struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag {
  * Instatiate with number of threads. Each thread should call "Arrive" with its thread id to indicate that its work has been
  * completed. Each thread can then call "Wait" to wait till all other threads have completed their work.
  *
- * The barrier can be reset and reused after calling "Reset" for each thread.
+ * WARNING: The barrier can be reused IF AND ONLY IF another synchronisation, e.g. through a second FlatBarrier, takes place in between
+ * the "Wait" and the next "Arrive" on the same barrier.
  *
- * WARNING: The reset is NOT synchronised, thus a second FlatBarrier is required to synchronise the reset of the barrier. That is
- * do NOT call "Reset" immediately after "Wait" as this could cause other threads not to see that the work has been completed.
- *
- * WARNING: A thread calling "Wait" before calling "Arrive" with its thread id results in a deadlock.
+ * WARNING: A thread calling "Wait" before calling "Arrive" with its thread id is undefined behaviour and can result in a deadlock.
  */
 class FlatBarrier {
   private:
@@ -64,8 +62,7 @@ class FlatBarrier {
     FlatBarrier(std::size_t numThreads) : flags_(std::vector<AlignedAtomicFlag>(numThreads)) {};
 
     inline void Arrive(std::size_t threadId);
-    inline void Wait() const;
-    inline void Reset(std::size_t threadId);
+    inline void Wait(std::size_t threadId) const;
 
     FlatBarrier() = delete;
     FlatBarrier(const FlatBarrier &) = delete;
@@ -75,12 +72,16 @@ class FlatBarrier {
     ~FlatBarrier() = default;
 };
 
-inline void FlatBarrier::Arrive(std::size_t threadId) { flags_[threadId].flag_.store(true, std::memory_order_relaxed); }
+inline void FlatBarrier::Arrive(std::size_t threadId) {
+    const bool oldVal = flags_[threadId].flag_.load(std::memory_order_relaxed);
+    flags_[threadId].flag_.store(!oldVal, std::memory_order_relaxed);
+}
 
-inline void FlatBarrier::Wait() const {
+inline void FlatBarrier::Wait(std::size_t threadId) const {
+    const bool val = flags_[threadId].flag_.load(std::memory_order_relaxed);
     for (const AlignedAtomicFlag &flag : flags_) {
         std::size_t cntr = 0U;
-        while (not flag.flag_.load(std::memory_order_relaxed)) {
+        while (flag.flag_.load(std::memory_order_relaxed) != val) {
             ++cntr;
             if (cntr % 128U == 0U) {
                 cpu_relax();
@@ -89,6 +90,4 @@ inline void FlatBarrier::Wait() const {
     }
 }
 
-inline void FlatBarrier::Reset(std::size_t threadId) { flags_[threadId].flag_.store(false, std::memory_order_relaxed); }
-
 }    // end namespace osp
diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
index 08661f39..9aafa6d5 100644
--- a/tests/weak_barrier.cpp
+++ b/tests/weak_barrier.cpp
@@ -55,14 +55,9 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) {
                 ans.emplace_back(cntr);
             }
             barrier[0].Arrive(threadId);
-            barrier[0].Wait();
-            barrier[2].Reset(threadId);
+            barrier[0].Wait(threadId);
             barrier[1].Arrive(threadId);
-            barrier[1].Wait();
-            barrier[0].Reset(threadId);
-            barrier[2].Arrive(threadId);
-            barrier[2].Wait();
-            barrier[1].Reset(threadId);
+            barrier[1].Wait(threadId);
         }
     };
 
@@ -100,14 +95,9 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) {
                 ans.emplace_back(cntr);
             }
             barrier[0].Arrive(threadId);
-            barrier[0].Wait();
-            barrier[2].Reset(threadId);
+            barrier[0].Wait(threadId);
             barrier[1].Arrive(threadId);
-            barrier[1].Wait();
-            barrier[0].Reset(threadId);
-            barrier[2].Arrive(threadId);
-            barrier[2].Wait();
-            barrier[1].Reset(threadId);
+            barrier[1].Wait(threadId);
         }
     };
 

From 60a9ca92ccee877250816ae03b0dddd86bb294b5 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 5 Feb 2026 17:09:31 +0100
Subject: [PATCH 10/57] small test fix

---
 tests/weak_barrier.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
index 9aafa6d5..c7cc0d45 100644
--- a/tests/weak_barrier.cpp
+++ b/tests/weak_barrier.cpp
@@ -44,7 +44,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) {
 
     std::mutex ans_mutex;
 
-    std::array<FlatBarrier, 3U> barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}};
+    std::array<FlatBarrier, 2U> barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}};
 
     std::vector<std::thread> threads(numThreads);
 

From 85844c2af29694fc5e407841180aa5548b23f3eb Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Fri, 6 Feb 2026 08:40:30 +0100
Subject: [PATCH 11/57] fixed barrier and ssp test

---
 .../WeakBarriers/flat_barrier.hpp             |  12 +-
 tests/weak_barrier.cpp                        | 104 +++++++++++++++++-
 2 files changed, 108 insertions(+), 8 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
index ca7f4f21..7c4e586c 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
@@ -61,8 +61,8 @@ class FlatBarrier {
   public:
     FlatBarrier(std::size_t numThreads) : flags_(std::vector<AlignedAtomicFlag>(numThreads)) {};
 
-    inline void Arrive(std::size_t threadId);
-    inline void Wait(std::size_t threadId) const;
+    inline void Arrive(const std::size_t threadId);
+    inline void Wait(const std::size_t threadId) const;
 
     FlatBarrier() = delete;
     FlatBarrier(const FlatBarrier &) = delete;
@@ -72,16 +72,16 @@ class FlatBarrier {
     ~FlatBarrier() = default;
 };
 
-inline void FlatBarrier::Arrive(std::size_t threadId) {
+inline void FlatBarrier::Arrive(const std::size_t threadId) {
     const bool oldVal = flags_[threadId].flag_.load(std::memory_order_relaxed);
-    flags_[threadId].flag_.store(!oldVal, std::memory_order_relaxed);
+    flags_[threadId].flag_.store(!oldVal, std::memory_order_release);
 }
 
-inline void FlatBarrier::Wait(std::size_t threadId) const {
+inline void FlatBarrier::Wait(const std::size_t threadId) const {
     const bool val = flags_[threadId].flag_.load(std::memory_order_relaxed);
     for (const AlignedAtomicFlag &flag : flags_) {
         std::size_t cntr = 0U;
-        while (flag.flag_.load(std::memory_order_relaxed) != val) {
+        while (flag.flag_.load(std::memory_order_acquire) != val) {
             ++cntr;
             if (cntr % 128U == 0U) {
                 cpu_relax();
diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
index c7cc0d45..8870b18c 100644
--- a/tests/weak_barrier.cpp
+++ b/tests/weak_barrier.cpp
@@ -48,7 +48,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) {
 
     std::vector<std::thread> threads(numThreads);
 
-    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) {
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
         for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
             {
                 std::lock_guard<std::mutex> lock(ans_mutex);
@@ -88,7 +88,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) {
 
     std::vector<std::thread> threads(numThreads);
 
-    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) {
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
         for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
             {
                 std::lock_guard<std::mutex> lock(ans_mutex);
@@ -114,3 +114,103 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) {
         BOOST_CHECK_EQUAL(ans[ind], ind / numThreads);
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_2Threads) {
+    constexpr std::size_t numThreads = 2U;
+    constexpr std::size_t numBarriers = 1024U;
+
+    std::vector<std::size_t> ans;
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;
+
+    constexpr std::size_t numSync = 4U;
+    std::array<FlatBarrier, numSync> barrier{
+        FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}};
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        barrier[1U].Arrive(threadId);
+        barrier[2U].Arrive(threadId);
+    }
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            barrier[(cntr - 2U + numSync) % numSync].Wait(threadId);
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(threadId);
+            }
+            barrier[cntr % numSync].Arrive(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+
+    std::vector<std::size_t> cntrs(numThreads, 0);
+    for (const std::size_t work : ans) {
+        const std::size_t current = ++cntrs[work];
+        for (const std::size_t cntr : cntrs) {
+            BOOST_CHECK_GE(cntr, std::max(current, static_cast<std::size_t>(2U)) - 2U);
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_128Threads) {
+    constexpr std::size_t numThreads = 128U;
+    constexpr std::size_t numBarriers = 32U;
+
+    std::vector<std::size_t> ans;
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;
+
+    constexpr std::size_t numSync = 4U;
+    std::array<FlatBarrier, numSync> barrier{
+        FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}};
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        barrier[1U].Arrive(threadId);
+        barrier[2U].Arrive(threadId);
+    }
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            barrier[(cntr - 2U + numSync) % numSync].Wait(threadId);
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(threadId);
+            }
+            barrier[cntr % numSync].Arrive(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+
+    std::vector<std::size_t> cntrs(numThreads, 0);
+    for (const std::size_t work : ans) {
+        const std::size_t current = ++cntrs[work];
+        for (const std::size_t cntr : cntrs) {
+            BOOST_CHECK_GE(cntr, std::max(current, static_cast<std::size_t>(2U)) - 2U);
+        }
+    }
+}

From ca2e4ecd7aa3fd95a0cc37a1e99b45a3385b2602 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Fri, 6 Feb 2026 09:33:31 +0100
Subject: [PATCH 12/57] barrier with counter

---
 .../WeakBarriers/cpu_relax.hpp                |  38 ++++
 .../WeakBarriers/flat_barrier.hpp             |  12 +-
 .../flat_checkpoint_counter_barrier.hpp       |  82 +++++++++
 tests/weak_barrier.cpp                        | 173 +++++++++++++++++-
 4 files changed, 292 insertions(+), 13 deletions(-)
 create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp
 create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp
new file mode 100644
index 00000000..d9e5e268
--- /dev/null
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp
@@ -0,0 +1,38 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <thread>
+
+#if defined(__x86_64__) || defined(_M_X64)
+#    include <immintrin.h>
+#endif
+
+namespace osp {
+
+// Portable cpu_relax definition
+#if defined(__x86_64__) || defined(_M_X64)
+inline void cpu_relax() { _mm_pause(); }
+#elif defined(__aarch64__)
+inline void cpu_relax() { asm volatile("yield" ::: "memory"); }
+#else
+inline void cpu_relax() { std::this_thread::yield(); }
+#endif
+
+}    // end namespace osp
diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
index 7c4e586c..2de8adcc 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp
@@ -21,21 +21,11 @@ limitations under the License.
 #include <atomic>
 #include <cstdint>
 
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp"
 #include "osp/config/config.hpp"
 
 namespace osp {
 
-// Portable cpu_relax definition
-#if defined(__x86_64__) || defined(_M_X64)
-#    include <immintrin.h>
-
-inline void cpu_relax() { _mm_pause(); }
-#elif defined(__aarch64__)
-inline void cpu_relax() { asm volatile("yield" ::: "memory"); }
-#else
-inline void cpu_relax() { std::this_thread::yield(); }
-#endif
-
 struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag {
     std::atomic<bool> flag_{false};
     int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic<bool>)];
diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
new file mode 100644
index 00000000..87607def
--- /dev/null
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
@@ -0,0 +1,82 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp"
+#include "osp/config/config.hpp"
+
+namespace osp {
+
+struct alignas(CACHE_LINE_SIZE) AlignedAtomicCounter {
+    std::atomic<std::size_t> cntr_{0U};
+    int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic<std::size_t>)];
+
+    static_assert(std::atomic<std::size_t>::is_always_lock_free);
+    static_assert(sizeof(int8_t) == 1U);
+};
+
+class FlatCheckpointCounterBarrier {
+  private:
+    std::vector<AlignedAtomicCounter> cntrs_;
+    mutable std::vector<std::vector<std::size_t>> cachedCntrs_;
+
+  public:
+    FlatCheckpointCounterBarrier(std::size_t numThreads)
+        : cntrs_(std::vector<AlignedAtomicCounter>(numThreads)),
+          cachedCntrs_(std::vector<std::vector<std::size_t>>(numThreads, std::vector<std::size_t>(numThreads, 0U))) {};
+
+    inline void Arrive(const std::size_t threadId);
+    inline void Wait(const std::size_t threadId, const std::size_t diff) const;
+
+    FlatCheckpointCounterBarrier() = delete;
+    FlatCheckpointCounterBarrier(const FlatCheckpointCounterBarrier &) = delete;
+    FlatCheckpointCounterBarrier(FlatCheckpointCounterBarrier &&) = delete;
+    FlatCheckpointCounterBarrier &operator=(const FlatCheckpointCounterBarrier &) = delete;
+    FlatCheckpointCounterBarrier &operator=(FlatCheckpointCounterBarrier &&) = delete;
+    ~FlatCheckpointCounterBarrier() = default;
+};
+
+inline void FlatCheckpointCounterBarrier::Arrive(const std::size_t threadId) {
+    const std::size_t curr = cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release) + 1U;
+    cachedCntrs_[threadId][threadId] = curr;
+}
+
+inline void FlatCheckpointCounterBarrier::Wait(const std::size_t threadId, const std::size_t diff) const {
+    std::vector<std::size_t> &localCachedCntrs = cachedCntrs_[threadId];
+
+    const std::size_t minVal = std::max(localCachedCntrs[threadId], diff) - diff;
+
+    for (std::size_t ind = 0U; ind < cntrs_.size(); ++ind) {
+        std::size_t loopCntr = 0U;
+        while ((localCachedCntrs[ind] < minVal)
+               && ((localCachedCntrs[ind] = cntrs_[ind].cntr_.load(std::memory_order_acquire)) < minVal)) {
+            ++loopCntr;
+            if (loopCntr % 128U == 0U) {
+                cpu_relax();
+            }
+        }
+    }
+}
+
+}    // end namespace osp
diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
index 8870b18c..c3bc2f34 100644
--- a/tests/weak_barrier.cpp
+++ b/tests/weak_barrier.cpp
@@ -27,6 +27,7 @@ limitations under the License.
 #include <vector>
 
 #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp"
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp"
 
 using namespace osp;
 
@@ -77,7 +78,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) {
 
 BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) {
     constexpr std::size_t numThreads = 128U;
-    constexpr std::size_t numBarriers = 8U;
+    constexpr std::size_t numBarriers = 16U;
 
     std::vector<std::size_t> ans;
     ans.reserve(numThreads * numBarriers);
@@ -167,7 +168,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_2Threads) {
 
 BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_128Threads) {
     constexpr std::size_t numThreads = 128U;
-    constexpr std::size_t numBarriers = 32U;
+    constexpr std::size_t numBarriers = 16U;
 
     std::vector<std::size_t> ans;
     ans.reserve(numThreads * numBarriers);
@@ -214,3 +215,171 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_128Threads) {
         }
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestAlignedAtomicCounter) {    // Pins size and alignment of AlignedAtomicCounter to 64 bytes (presumably one cache line - TODO confirm).
+    BOOST_CHECK_EQUAL(sizeof(AlignedAtomicCounter), 64U);
+    BOOST_CHECK_EQUAL(alignof(AlignedAtomicCounter), 64U);
+}
+
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_2Threads) {    // Arrive() followed by Wait(diff = 0) must keep 2 threads in lockstep over 1024 rounds.
+    constexpr std::size_t numThreads = 2U;
+    constexpr std::size_t numBarriers = 1024U;
+
+    std::vector<std::size_t> ans;    // round indices in the order the threads logged them
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;    // guards ans
+
+    FlatCheckpointCounterBarrier barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(cntr);    // log the round this thread is in
+            }
+            barrier.Arrive(threadId);
+            barrier.Wait(threadId, 0U);    // diff 0: continue only after every thread has arrived as often as this one
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+    for (std::size_t ind = 0U; ind < ans.size(); ++ind) {
+        BOOST_CHECK_EQUAL(ans[ind], ind / numThreads);    // lockstep: each round index appears numThreads times, in increasing order
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_128Threads) {    // Same lockstep check as the 2-thread case, with 128 threads and 16 rounds.
+    constexpr std::size_t numThreads = 128U;
+    constexpr std::size_t numBarriers = 16U;
+
+    std::vector<std::size_t> ans;    // round indices in the order the threads logged them
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;    // guards ans
+
+    FlatCheckpointCounterBarrier barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(cntr);    // log the round this thread is in
+            }
+            barrier.Arrive(threadId);
+            barrier.Wait(threadId, 0U);    // diff 0: continue only after every thread has arrived as often as this one
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+    for (std::size_t ind = 0U; ind < ans.size(); ++ind) {
+        BOOST_CHECK_EQUAL(ans[ind], ind / numThreads);    // lockstep: each round index appears numThreads times, in increasing order
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_SSP_2Threads) {    // Wait(diff = 1) before each work item must bound how far 2 threads drift apart.
+    constexpr std::size_t numThreads = 2U;
+    constexpr std::size_t numBarriers = 1024U;
+
+    std::vector<std::size_t> ans;    // thread ids in the order work items were logged
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;    // guards ans
+
+    FlatCheckpointCounterBarrier barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            barrier.Wait(threadId, 1U);    // proceed once every thread is at most one arrival behind this one
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(threadId);    // log which thread performed a work item
+            }
+            barrier.Arrive(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+
+    std::vector<std::size_t> cntrs(numThreads, 0);    // work items per thread while replaying the log
+    for (const std::size_t work : ans) {
+        const std::size_t current = ++cntrs[work];
+        for (const std::size_t cntr : cntrs) {
+            BOOST_CHECK_GE(cntr, std::max(current, static_cast<std::size_t>(2U)) - 2U);    // no thread may trail the one that just logged by more than 2 items
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_SSP_128Threads) {    // Same drift bound as the SSP 2-thread case, with 128 threads and 16 work items each.
+    constexpr std::size_t numThreads = 128U;
+    constexpr std::size_t numBarriers = 16U;
+
+    std::vector<std::size_t> ans;    // thread ids in the order work items were logged
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;    // guards ans
+
+    FlatCheckpointCounterBarrier barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            barrier.Wait(threadId, 1U);    // proceed once every thread is at most one arrival behind this one
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(threadId);    // log which thread performed a work item
+            }
+            barrier.Arrive(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+
+    std::vector<std::size_t> cntrs(numThreads, 0);
+    for (const std::size_t work : ans) {
+        const std::size_t current = ++cntrs[work];
+        for (const std::size_t cntr : cntrs) {
+            BOOST_CHECK_GE(cntr, std::max(current, static_cast<std::size_t>(2U)) - 2U);    // no thread may trail the one that just logged by more than 2 items
+        }
+    }
+}
\ No newline at end of file

From 82de6ded9060b9990e89daf32063f33bfc3223bf Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Fri, 6 Feb 2026 10:00:43 +0100
Subject: [PATCH 13/57] moved cpu_relax

---
 include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index de32d6c8..d607a80a 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -35,20 +35,12 @@ limitations under the License.
 #    include <thread>
 #    include <vector>
 
+#    include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp"
 #    include "osp/bsp/model/BspInstance.hpp"
 #    include "osp/bsp/model/BspSchedule.hpp"
 #    include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
 
 namespace osp {
-// Portable cpu_relax definition
-#if defined(__x86_64__) || defined(_M_X64)
-#include <immintrin.h>
-inline void cpu_relax() { _mm_pause(); }
-#elif defined(__aarch64__)
-inline void cpu_relax() { asm volatile("yield" ::: "memory"); }
-#else
-inline void cpu_relax() { std::this_thread::yield(); }
-#endif
 
 // Staleness-aware barrier for SSP: threads may run up to (staleness-1) steps ahead.
 // Internally tracks per-step completion counts and uses adaptive backoff to limit spinning.

From 9c6bc82bbd58bcdf48509729694afbafb48009e7 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Fri, 6 Feb 2026 12:56:36 +0100
Subject: [PATCH 14/57] Adding barrier to ssp sptrsv / adding to benchmark app

---
 apps/maxbsp_ssp_sptrsv.cpp                    | 100 ++++++++---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp |  99 ++++-------
 tests/weak_barrier.cpp                        | 163 ++++++++++++++++++
 3 files changed, 277 insertions(+), 85 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index dd1112d0..4c8ed159 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -19,15 +19,21 @@
 using namespace osp;
 
 int main(int argc, char* argv[]) {
-    // Accept matrix filename and iteration count as arguments
+    // Accept matrix filename and iteration count as arguments; thread count comes from OMP_NUM_THREADS if set, else from the optional third argument
     std::string filename = "../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx";
     int num_iterations = 1;
+    unsigned num_threads = 16U;
     if (argc > 1) {
         filename = argv[1];
     }
     if (argc > 2) {
         num_iterations = std::stoi(argv[2]);
     }
+    if (const char *omp_env = std::getenv("OMP_NUM_THREADS")) {
+        num_threads = static_cast<unsigned>(std::stoul(omp_env));
+    } else if (argc > 3) {
+        num_threads = static_cast<unsigned>(std::stoul(argv[3]));
+    }
 
     // Load matrix
     Eigen::SparseMatrix<double, Eigen::RowMajor, int32_t> lCsr;
@@ -43,7 +49,7 @@ int main(int argc, char* argv[]) {
     graph.SetCsr(&lCsr);
     Eigen::SparseMatrix<double, Eigen::ColMajor, int32_t> lCsc = lCsr;
     graph.SetCsc(&lCsc);
-    BspArchitecture<SparseMatrixImp<int32_t>> architecture(16, 1, 500); // 16 processors
+    BspArchitecture<SparseMatrixImp<int32_t>> architecture(num_threads, 1, 500); // configurable processors
     BspInstance<SparseMatrixImp<int32_t>> instance(graph, architecture);
 
     // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2)
@@ -61,22 +67,43 @@ int main(int argc, char* argv[]) {
 
     size_t n = static_cast<size_t>(lCsc.cols());
 
-    // Benchmark SSP L-solve
-    double ssp_total_time = 0.0;
-    std::vector<double> ssp_result(n, 0.0);
+    // Benchmark SSP L-solve with cached barrier
+    double ssp_cached_total_time = 0.0;
+    std::vector<double> ssp_cached_result(n, 0.0);
+    for (int iter = 0; iter < num_iterations; ++iter) {
+        std::vector<double> x(n, 0.0);
+        std::vector<double> b(n, 1.0);
+        sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule);
+        sptrsv_kernel.x_ = x.data();
+        sptrsv_kernel.b_ = b.data();
+        FlatCheckpointCounterBarrierCached barrier(num_threads);
+        auto ops = Sptrsv<int32_t>::MakeBarrierOps(barrier);
+        auto start = std::chrono::high_resolution_clock::now();
+        sptrsv_kernel.SspLsolveStaleness2(ops);
+        auto end = std::chrono::high_resolution_clock::now();
+        ssp_cached_total_time += std::chrono::duration<double>(end - start).count();
+        if (iter == 0) ssp_cached_result = std::vector<double>(x.begin(), x.end());
+    }
+    double ssp_cached_avg_time = ssp_cached_total_time / num_iterations;
+
+    // Benchmark SSP L-solve with flat barrier
+    double ssp_flat_total_time = 0.0;
+    std::vector<double> ssp_flat_result(n, 0.0);
     for (int iter = 0; iter < num_iterations; ++iter) {
         std::vector<double> x(n, 0.0);
         std::vector<double> b(n, 1.0);
         sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule);
         sptrsv_kernel.x_ = x.data();
         sptrsv_kernel.b_ = b.data();
+        FlatCheckpointCounterBarrier barrier(num_threads);
+        auto ops = Sptrsv<int32_t>::MakeBarrierOps(barrier);
         auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness2();
+        sptrsv_kernel.SspLsolveStaleness2(ops);
         auto end = std::chrono::high_resolution_clock::now();
-        ssp_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) ssp_result = std::vector<double>(x.begin(), x.end());
+        ssp_flat_total_time += std::chrono::duration<double>(end - start).count();
+        if (iter == 0) ssp_flat_result = std::vector<double>(x.begin(), x.end());
     }
-    double ssp_avg_time = ssp_total_time / num_iterations;
+    double ssp_flat_avg_time = ssp_flat_total_time / num_iterations;
 
     // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation)
     double growlocal_total_time = 0.0;
@@ -114,14 +141,26 @@ int main(int argc, char* argv[]) {
     // Compare results
     double max_diff = 0.0;
     for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(ssp_result[i] - serial_result[i]);
+        double diff = std::abs(ssp_cached_result[i] - serial_result[i]);
         if (diff > max_diff) max_diff = diff;
     }
-    std::cout << "Max difference between SSP and serial L-solve: " << max_diff << std::endl;
+    std::cout << "Max difference between SSP (cached barrier) and serial L-solve: " << max_diff << std::endl;
     if (max_diff < 1e-10) {
-        std::cout << "SSP L-solve matches serial L-solve!" << std::endl;
+        std::cout << "SSP (cached barrier) L-solve matches serial L-solve!" << std::endl;
     } else {
-        std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl;
+        std::cout << "SSP (cached barrier) L-solve does NOT match serial L-solve!" << std::endl;
+    }
+
+    double max_diff_flat = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        double diff = std::abs(ssp_flat_result[i] - serial_result[i]);
+        if (diff > max_diff_flat) max_diff_flat = diff;
+    }
+    std::cout << "Max difference between SSP (flat barrier) and serial L-solve: " << max_diff_flat << std::endl;
+    if (max_diff_flat < 1e-10) {
+        std::cout << "SSP (flat barrier) L-solve matches serial L-solve!" << std::endl;
+    } else {
+        std::cout << "SSP (flat barrier) L-solve does NOT match serial L-solve!" << std::endl;
     }
     double max_diff_growlocal = 0.0;
     for (size_t i = 0; i < n; ++i) {
@@ -137,24 +176,43 @@ int main(int argc, char* argv[]) {
 
     double max_diff_ssp_growlocal = 0.0;
     for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(ssp_result[i] - growlocal_result[i]);
+        double diff = std::abs(ssp_cached_result[i] - growlocal_result[i]);
         if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff;
     }
-    std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal << std::endl;
+    std::cout << "Max difference between SSP (cached barrier) and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal
+              << std::endl;
 
-    std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl;
+    double max_diff_ssp_flat_cached = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        double diff = std::abs(ssp_flat_result[i] - ssp_cached_result[i]);
+        if (diff > max_diff_ssp_flat_cached) max_diff_ssp_flat_cached = diff;
+    }
+    std::cout << "Max difference between SSP (flat barrier) and SSP (cached barrier): " << max_diff_ssp_flat_cached
+              << std::endl;
+
+    std::cout << "Average SSP (cached barrier) L-solve time (" << num_iterations << " runs): " << ssp_cached_avg_time
+              << " seconds" << std::endl;
+    std::cout << "Average SSP (flat barrier) L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time
+              << " seconds" << std::endl;
     std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time
               << " seconds" << std::endl;
     std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl;
-    if (ssp_avg_time > 0.0) {
-        std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_avg_time) << "x" << std::endl;
+    if (ssp_cached_avg_time > 0.0) {
+        std::cout << "Speedup (serial/SSP cached): " << (serial_avg_time / ssp_cached_avg_time) << "x" << std::endl;
+    }
+    if (ssp_flat_avg_time > 0.0) {
+        std::cout << "Speedup (serial/SSP flat): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl;
     }
     if (growlocal_avg_time > 0.0) {
         std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl;
     }
-    if (ssp_avg_time > 0.0) {
-        std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_avg_time) << "x" << std::endl;
+    if (ssp_cached_avg_time > 0.0) {
+        std::cout << "Speedup (GrowLocalAutoCores/SSP cached): " << (growlocal_avg_time / ssp_cached_avg_time) << "x"
+                  << std::endl;
+    }
+    if (ssp_flat_avg_time > 0.0) {
+        std::cout << "Speedup (GrowLocalAutoCores/SSP flat): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl;
     }
-    std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl;
+    std::cout << "MaxBSP staleness=2 SSP (cached+flat) and GrowLocalAutoCores SpTRSV executed." << std::endl;
     return 0;
 }
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index d607a80a..7d08e32c 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -35,62 +35,14 @@ limitations under the License.
 #    include <thread>
 #    include <vector>
 
-#    include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp"
+#    include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp"
+#    include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp"
 #    include "osp/bsp/model/BspInstance.hpp"
 #    include "osp/bsp/model/BspSchedule.hpp"
 #    include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
 
 namespace osp {
 
-// Staleness-aware barrier for SSP: threads may run up to (staleness-1) steps ahead.
-// Internally tracks per-step completion counts and uses adaptive backoff to limit spinning.
-class SspStalenessBarrier {
-  private:
-    std::unique_ptr<std::atomic<unsigned>[]> stepDone_;
-    std::size_t stepDoneSize_ = 0U;
-
-  public:
-    void Reset(std::size_t numSupersteps) {
-        // Reinitialize counters for a new schedule/run.
-        if (stepDoneSize_ != numSupersteps) {
-            stepDone_ = std::make_unique<std::atomic<unsigned>[]>(numSupersteps);
-            stepDoneSize_ = numSupersteps;
-        }
-        for (std::size_t i = 0; i < stepDoneSize_; ++i) {
-            stepDone_[i].store(0U, std::memory_order_relaxed);
-        }
-    }
-
-    void WaitIfNeeded(unsigned step, unsigned staleness, unsigned nthreads) {
-        // Enforce: step may start only when all threads completed (step - staleness).
-        if (step < staleness) {
-            return;
-        }
-        const unsigned waitStep = step - staleness;
-        unsigned spinCount = 0U;
-        auto backoff = std::chrono::nanoseconds(50);
-        while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) {
-            // Adaptive backoff: spin -> yield -> short sleep to reduce contention.
-            if (spinCount < 2000U) {
-                cpu_relax();
-                ++spinCount;
-            } else if (spinCount < 4000U) {
-                std::this_thread::yield();
-                ++spinCount;
-            } else {
-                std::this_thread::sleep_for(backoff);
-                if (backoff < std::chrono::nanoseconds(500)) {
-                    backoff *= 2;
-                }
-            }
-        }
-        std::atomic_thread_fence(std::memory_order_acquire);
-    }
-
-    // Mark completion of a superstep by this thread.
-    void Arrive(unsigned step) { stepDone_[step].fetch_add(1U, std::memory_order_release); }
-};
-
 template <typename EigenIdxType>
 class Sptrsv {
     using UVertType = typename SparseMatrixImp<EigenIdxType>::VertexIdx;
@@ -99,6 +51,23 @@ class Sptrsv {
     const BspInstance<SparseMatrixImp<EigenIdxType>> *instance_;
 
   public:
+    struct BarrierOps {    // Barrier handle made of an untyped context pointer and two plain function pointers.
+        void *ctx;    // handed back as the first argument of arrive/wait
+        void (*arrive)(void *ctx, std::size_t threadId);    // called by a thread after it finished a superstep
+        void (*wait)(void *ctx, std::size_t threadId, std::size_t diff);    // called by a thread before it starts a superstep
+    };
+
+    template <typename BarrierT>
+    static BarrierOps MakeBarrierOps(BarrierT &barrier) {    // Wraps any type offering Arrive(threadId) and Wait(threadId, diff); only the address is stored, so `barrier` must outlive the result.
+        return BarrierOps{
+            static_cast<void *>(&barrier),
+            [](void *ctx, std::size_t threadId) {    // capture-less lambda, converts to the `arrive` function pointer
+                static_cast<BarrierT *>(ctx)->Arrive(threadId);
+            },
+            [](void *ctx, std::size_t threadId, std::size_t diff) {    // capture-less lambda, converts to the `wait` function pointer
+                static_cast<BarrierT *>(ctx)->Wait(threadId, diff);
+            }};
+    }
     std::vector<double> val_;
     std::vector<double> cscVal_;
 
@@ -122,7 +91,6 @@ class Sptrsv {
 
     std::vector<std::vector<std::vector<EigenIdxType>>> boundsArrayL_;
     std::vector<std::vector<std::vector<EigenIdxType>>> boundsArrayU_;
-    SspStalenessBarrier sspBarrier_;
 
     Sptrsv() = default;
 
@@ -141,7 +109,6 @@ class Sptrsv {
             schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
 
         numSupersteps_ = schedule.NumberOfSupersteps();
-        sspBarrier_.Reset(static_cast<std::size_t>(numSupersteps_));
         size_t numberOfVertices = instance_->GetComputationalDag().NumVertices();
 
 #    pragma omp parallel num_threads(2)
@@ -538,12 +505,10 @@ class Sptrsv {
     std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); }
 
     // SSP Lsolve with staleness=2 (allowing at most one superstep of lag).
-    // Uses the staleness barrier to respect dependencies between supersteps.
-    void SspLsolveStaleness2() {
+    // Barrier operations are injected via function pointers.
+    void SspLsolveStaleness2(const BarrierOps &barrierOps) {
         constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference
         const unsigned nthreads = instance_->NumberOfProcessors();
-        // Reset per-step completion counters for this run.
-        sspBarrier_.Reset(static_cast<std::size_t>(numSupersteps_));
 
         auto *csr = instance_->GetComputationalDag().GetCSR();
         const auto *outer = csr->outerIndexPtr();
@@ -552,10 +517,10 @@ class Sptrsv {
 
         #pragma omp parallel num_threads(nthreads)
         {
-            const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
-                // Ensure we are not more than (staleness-1) supersteps ahead.
-                sspBarrier_.WaitIfNeeded(step, static_cast<unsigned>(staleness), nthreads);
+                // Enforce staleness window before starting this superstep.
+                barrierOps.wait(barrierOps.ctx, proc, staleness - 1U);
                 // Process nodes assigned to this (step, proc) pair.
                 const size_t boundsStrSize = boundsArrayL_[step][proc].size();
                 for (size_t index = 0; index < boundsStrSize; index += 2) {
@@ -565,9 +530,7 @@ class Sptrsv {
                         // Initialize solution for this node
                         x_[node] = b_[node];
                         // Perform lower-triangular solve for this node
-                        for (EigenIdxType i = outer[node];
-                             i < outer[node + 1] - 1;
-                             ++i) {
+                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
                             // Subtract contributions from previously solved nodes
                             x_[node] -= vals[i] * x_[inner[i]];
                         }
@@ -575,12 +538,20 @@ class Sptrsv {
                         x_[node] /= vals[outer[node + 1] - 1];
                     }
                 }
-                // Signal completion of this superstep for staleness tracking.
-                sspBarrier_.Arrive(step);
+                // Signal completion of this superstep.
+                barrierOps.arrive(barrierOps.ctx, proc);
             }
         }
     }
 
+    // Convenience overload: runs the SSP L-solve with a freshly built FlatCheckpointCounterBarrierCached, one slot per processor.
+    void SspLsolveStaleness2() {
+        const unsigned nthreads = instance_->NumberOfProcessors();
+        FlatCheckpointCounterBarrierCached barrier(nthreads);    // local object; stays alive until the solve below returns
+        const BarrierOps ops = MakeBarrierOps(barrier);    // ops only holds a pointer to `barrier`
+        SspLsolveStaleness2(ops);
+    }
+
     virtual ~Sptrsv() = default;
 };
 
diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
index c3bc2f34..22ad875a 100644
--- a/tests/weak_barrier.cpp
+++ b/tests/weak_barrier.cpp
@@ -28,6 +28,7 @@ limitations under the License.
 
 #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp"
 #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp"
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp"
 
 using namespace osp;
 
@@ -375,6 +376,168 @@ BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_SSP_128Threads) {
 
     BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
 
+    std::vector<std::size_t> cntrs(numThreads, 0);
+    for (const std::size_t work : ans) {
+        const std::size_t current = ++cntrs[work];
+        for (const std::size_t cntr : cntrs) {
+            BOOST_CHECK_GE(cntr, std::max(current, static_cast<std::size_t>(2U)) - 2U);
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_2Threads) {    // Arrive() followed by Wait(diff = 0) must keep 2 threads in lockstep over 1024 rounds.
+    constexpr std::size_t numThreads = 2U;
+    constexpr std::size_t numBarriers = 1024U;
+
+    std::vector<std::size_t> ans;    // round indices in the order the threads logged them
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;    // guards ans
+
+    FlatCheckpointCounterBarrierCached barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(cntr);    // log the round this thread is in
+            }
+            barrier.Arrive(threadId);
+            barrier.Wait(threadId, 0U);    // diff 0 is expected to act as a full barrier, as the check below requires
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+    for (std::size_t ind = 0U; ind < ans.size(); ++ind) {
+        BOOST_CHECK_EQUAL(ans[ind], ind / numThreads);    // lockstep: each round index appears numThreads times, in increasing order
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_128Threads) {    // Same lockstep check as the cached 2-thread case, with 128 threads and 16 rounds.
+    constexpr std::size_t numThreads = 128U;
+    constexpr std::size_t numBarriers = 16U;
+
+    std::vector<std::size_t> ans;    // round indices in the order the threads logged them
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;    // guards ans
+
+    FlatCheckpointCounterBarrierCached barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(cntr);    // log the round this thread is in
+            }
+            barrier.Arrive(threadId);
+            barrier.Wait(threadId, 0U);    // diff 0 is expected to act as a full barrier, as the check below requires
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+    for (std::size_t ind = 0U; ind < ans.size(); ++ind) {
+        BOOST_CHECK_EQUAL(ans[ind], ind / numThreads);    // lockstep: each round index appears numThreads times, in increasing order
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_SSP_2Threads) {    // Wait(diff = 1) before each work item must bound how far 2 threads drift apart.
+    constexpr std::size_t numThreads = 2U;
+    constexpr std::size_t numBarriers = 1024U;
+
+    std::vector<std::size_t> ans;    // thread ids in the order work items were logged
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;    // guards ans
+
+    FlatCheckpointCounterBarrierCached barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            barrier.Wait(threadId, 1U);    // diff 1 is expected to let a thread start while others are one arrival behind
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(threadId);    // log which thread performed a work item
+            }
+            barrier.Arrive(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+
+    std::vector<std::size_t> cntrs(numThreads, 0);    // work items per thread while replaying the log
+    for (const std::size_t work : ans) {
+        const std::size_t current = ++cntrs[work];
+        for (const std::size_t cntr : cntrs) {
+            BOOST_CHECK_GE(cntr, std::max(current, static_cast<std::size_t>(2U)) - 2U);    // no thread may trail the one that just logged by more than 2 items
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_SSP_128Threads) {
+    constexpr std::size_t numThreads = 128U;
+    constexpr std::size_t numBarriers = 16U;
+
+    std::vector<std::size_t> ans;
+    ans.reserve(numThreads * numBarriers);
+
+    std::mutex ans_mutex;
+
+    FlatCheckpointCounterBarrierCached barrier{numThreads};
+
+    std::vector<std::thread> threads(numThreads);
+
+    auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) {
+        for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) {
+            barrier.Wait(threadId, 1U);
+            {
+                std::lock_guard<std::mutex> lock(ans_mutex);
+                ans.emplace_back(threadId);
+            }
+            barrier.Arrive(threadId);
+        }
+    };
+
+    for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) {
+        threads[threadId] = std::thread(threadWork, threadId);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers);
+
     std::vector<std::size_t> cntrs(numThreads, 0);
     for (const std::size_t work : ans) {
         const std::size_t current = ++cntrs[work];

From c2bbfc9fa1f091b1c173d5dee85a17d35cb3d539 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Fri, 6 Feb 2026 12:57:10 +0100
Subject: [PATCH 15/57] add different barrier implementation

---
 ...flat_checkpoint_counter_barrier_cached.hpp | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp
new file mode 100644
index 00000000..bd5d7fab
--- /dev/null
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp
@@ -0,0 +1,95 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp"
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp"
+
+namespace osp {
+
+// Spin-wait barrier with per-call slack: one atomic arrival counter per thread plus, for every
+// thread, a private cached snapshot of all counters (stored as one flat row-major array).
+class FlatCheckpointCounterBarrierCached {
+  private:
+    std::vector<AlignedAtomicCounter> cntrs_;    // one counter per thread, incremented in Arrive()
+        // numThreads_ x numThreads_ cache flattened into a single array: row = observing thread,
+        // col = observed thread. NOTE(review): rows are not padded, so neighbouring rows may share a cache line.
+        std::vector<std::size_t> cachedCntrs_;
+        // Row stride of cachedCntrs_ and loop bound in Wait(); set to the same value as cntrs_.size().
+        std::size_t numThreads_ = 0U;
+    // Accessors for cache entry (row, col), i.e. cachedCntrs_[row * numThreads_ + col]; no bounds checking.
+    inline std::size_t &Cached(std::size_t row, std::size_t col) {
+        return cachedCntrs_[row * numThreads_ + col];
+    }
+
+    inline const std::size_t &Cached(std::size_t row, std::size_t col) const {
+        return cachedCntrs_[row * numThreads_ + col];
+    }
+
+  public:
+    FlatCheckpointCounterBarrierCached(std::size_t numThreads)
+        : cntrs_(std::vector<AlignedAtomicCounter>(numThreads)),
+                    // One zero-initialised allocation of numThreads * numThreads cache entries.
+                    cachedCntrs_(numThreads * numThreads, 0U),
+          numThreads_(numThreads) {}
+    // NOTE(review): the constructor above is not `explicit`, so a std::size_t converts implicitly to a barrier.
+    inline void Arrive(const std::size_t threadId);
+    inline void Wait(const std::size_t threadId, const std::size_t diff) const;
+    // Default construction, copying and moving are all disabled.
+    FlatCheckpointCounterBarrierCached() = delete;
+    FlatCheckpointCounterBarrierCached(const FlatCheckpointCounterBarrierCached &) = delete;
+    FlatCheckpointCounterBarrierCached(FlatCheckpointCounterBarrierCached &&) = delete;
+    FlatCheckpointCounterBarrierCached &operator=(const FlatCheckpointCounterBarrierCached &) = delete;
+    FlatCheckpointCounterBarrierCached &operator=(FlatCheckpointCounterBarrierCached &&) = delete;
+    ~FlatCheckpointCounterBarrierCached() = default;
+};
+
+// Increments threadId's counter (release order) and mirrors the new value into cache entry (threadId, threadId).
+inline void FlatCheckpointCounterBarrierCached::Arrive(const std::size_t threadId) {
+    const std::size_t curr = cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release) + 1U;    // fetch_add returns the old value
+    Cached(threadId, threadId) = curr;
+}
+
+// Spins until every thread's counter is >= max(c, diff) - diff, where c is threadId's own cached count; refreshes threadId's cache row.
+inline void FlatCheckpointCounterBarrierCached::Wait(const std::size_t threadId, const std::size_t diff) const {
+    const std::size_t base = threadId * numThreads_;    // start of threadId's row in the flat cache
+    // Wait() is const but writes the cache, so constness is cast away (cachedCntrs_ is not declared mutable).
+    std::size_t *localCached = const_cast<std::size_t *>(cachedCntrs_.data() + base);
+    const std::size_t localThreadVal = localCached[threadId];
+    const std::size_t minVal = std::max(localThreadVal, diff) - diff;    // max() keeps the unsigned subtraction from wrapping
+    // Raw pointer to the counters; the loop below is bounded by numThreads_ rather than cntrs_.size().
+    const AlignedAtomicCounter *cntrs = cntrs_.data();
+    // Short-circuit &&: the shared atomic is only loaded while the cached value is still below minVal.
+    for (std::size_t ind = 0U; ind < numThreads_; ++ind) {
+        std::size_t loopCntr = 0U;
+        while ((localCached[ind] < minVal)
+               && ((localCached[ind] = cntrs[ind].cntr_.load(std::memory_order_acquire)) < minVal)) {
+            ++loopCntr;
+            if (loopCntr % 128U == 0U) {    // call cpu_relax() on every 128th unsuccessful poll
+                cpu_relax();
+            }
+        }
+    }
+}
+
+}    // end namespace osp

From d2b4a07a3b68ffe5c2ad84922dfa9435e2ac944e Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Fri, 6 Feb 2026 15:04:15 +0100
Subject: [PATCH 16/57] Corrections and cleaning

---
 apps/maxbsp_ssp_sptrsv.cpp                    | 88 ++++++-------------
 ...flat_checkpoint_counter_barrier_cached.hpp |  2 +-
 2 files changed, 30 insertions(+), 60 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 4c8ed159..40c62ce3 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -67,26 +67,7 @@ int main(int argc, char* argv[]) {
 
     size_t n = static_cast<size_t>(lCsc.cols());
 
-    // Benchmark SSP L-solve with cached barrier
-    double ssp_cached_total_time = 0.0;
-    std::vector<double> ssp_cached_result(n, 0.0);
-    for (int iter = 0; iter < num_iterations; ++iter) {
-        std::vector<double> x(n, 0.0);
-        std::vector<double> b(n, 1.0);
-        sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule);
-        sptrsv_kernel.x_ = x.data();
-        sptrsv_kernel.b_ = b.data();
-        FlatCheckpointCounterBarrierCached barrier(num_threads);
-        auto ops = Sptrsv<int32_t>::MakeBarrierOps(barrier);
-        auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness2(ops);
-        auto end = std::chrono::high_resolution_clock::now();
-        ssp_cached_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) ssp_cached_result = std::vector<double>(x.begin(), x.end());
-    }
-    double ssp_cached_avg_time = ssp_cached_total_time / num_iterations;
-
-    // Benchmark SSP L-solve with flat barrier
+    // Benchmark SSP L-solve
     double ssp_flat_total_time = 0.0;
     std::vector<double> ssp_flat_result(n, 0.0);
     for (int iter = 0; iter < num_iterations; ++iter) {
@@ -139,80 +120,69 @@ int main(int argc, char* argv[]) {
     double serial_avg_time = serial_total_time / num_iterations;
 
     // Compare results
-    double max_diff = 0.0;
-    for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(ssp_cached_result[i] - serial_result[i]);
-        if (diff > max_diff) max_diff = diff;
-    }
-    std::cout << "Max difference between SSP (cached barrier) and serial L-solve: " << max_diff << std::endl;
-    if (max_diff < 1e-10) {
-        std::cout << "SSP (cached barrier) L-solve matches serial L-solve!" << std::endl;
-    } else {
-        std::cout << "SSP (cached barrier) L-solve does NOT match serial L-solve!" << std::endl;
-    }
-
     double max_diff_flat = 0.0;
+    double frobNorm = 0.0;
     for (size_t i = 0; i < n; ++i) {
         double diff = std::abs(ssp_flat_result[i] - serial_result[i]);
         if (diff > max_diff_flat) max_diff_flat = diff;
+        frobNorm += diff * diff;
     }
-    std::cout << "Max difference between SSP (flat barrier) and serial L-solve: " << max_diff_flat << std::endl;
-    if (max_diff_flat < 1e-10) {
-        std::cout << "SSP (flat barrier) L-solve matches serial L-solve!" << std::endl;
+    frobNorm = std::sqrt(frobNorm);
+    std::cout << "Frobenius norm of difference: " << frobNorm << std::endl;
+    std::cout << "Max difference between SSP and serial L-solve: " << max_diff_flat << std::endl;
+    if (frobNorm <= 1e-30 || max_diff_flat < 1e-10 * frobNorm) {
+        std::cout << "SSP L-solve matches serial L-solve!" << std::endl;
     } else {
-        std::cout << "SSP (flat barrier) L-solve does NOT match serial L-solve!" << std::endl;
+        std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl;
+        std::cout << "Relative error: " << (max_diff_flat / frobNorm) << std::endl;
     }
     double max_diff_growlocal = 0.0;
+    double frobNormGrowlocal = 0.0;
     for (size_t i = 0; i < n; ++i) {
         double diff = std::abs(growlocal_result[i] - serial_result[i]);
         if (diff > max_diff_growlocal) max_diff_growlocal = diff;
+        frobNormGrowlocal += diff * diff;
     }
+    frobNormGrowlocal = std::sqrt(frobNormGrowlocal);
     std::cout << "Max difference between GrowLocalAutoCores and serial L-solve: " << max_diff_growlocal << std::endl;
-    if (max_diff_growlocal < 1e-10) {
+    if (frobNormGrowlocal <= 1e-30 || max_diff_growlocal < 1e-10 * frobNormGrowlocal) {
         std::cout << "GrowLocalAutoCores L-solve matches serial L-solve!" << std::endl;
     } else {
         std::cout << "GrowLocalAutoCores L-solve does NOT match serial L-solve!" << std::endl;
+        std::cout << "Relative error: " << (max_diff_growlocal / frobNormGrowlocal) << std::endl;
     }
 
     double max_diff_ssp_growlocal = 0.0;
+    double frobNormSspGrowlocal = 0.0;
     for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(ssp_cached_result[i] - growlocal_result[i]);
+        double diff = std::abs(ssp_flat_result[i] - growlocal_result[i]);
         if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff;
+        frobNormSspGrowlocal += diff * diff;
     }
-    std::cout << "Max difference between SSP (cached barrier) and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal
+    frobNormSspGrowlocal = std::sqrt(frobNormSspGrowlocal);
+    std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal
               << std::endl;
-
-    double max_diff_ssp_flat_cached = 0.0;
-    for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(ssp_flat_result[i] - ssp_cached_result[i]);
-        if (diff > max_diff_ssp_flat_cached) max_diff_ssp_flat_cached = diff;
+    if (frobNormSspGrowlocal <= 1e-30 || max_diff_ssp_growlocal < 1e-10 * frobNormSspGrowlocal) {
+        std::cout << "SSP L-solve matches GrowLocalAutoCores L-solve!" << std::endl;
+    } else {
+        std::cout << "SSP L-solve does NOT match GrowLocalAutoCores L-solve!" << std::endl;
+        std::cout << "Relative error: " << (max_diff_ssp_growlocal / frobNormSspGrowlocal) << std::endl;
     }
-    std::cout << "Max difference between SSP (flat barrier) and SSP (cached barrier): " << max_diff_ssp_flat_cached
-              << std::endl;
 
-    std::cout << "Average SSP (cached barrier) L-solve time (" << num_iterations << " runs): " << ssp_cached_avg_time
-              << " seconds" << std::endl;
-    std::cout << "Average SSP (flat barrier) L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time
+    std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time
               << " seconds" << std::endl;
     std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time
               << " seconds" << std::endl;
     std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl;
-    if (ssp_cached_avg_time > 0.0) {
-        std::cout << "Speedup (serial/SSP cached): " << (serial_avg_time / ssp_cached_avg_time) << "x" << std::endl;
-    }
     if (ssp_flat_avg_time > 0.0) {
-        std::cout << "Speedup (serial/SSP flat): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl;
+        std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl;
     }
     if (growlocal_avg_time > 0.0) {
         std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl;
     }
-    if (ssp_cached_avg_time > 0.0) {
-        std::cout << "Speedup (GrowLocalAutoCores/SSP cached): " << (growlocal_avg_time / ssp_cached_avg_time) << "x"
-                  << std::endl;
-    }
     if (ssp_flat_avg_time > 0.0) {
-        std::cout << "Speedup (GrowLocalAutoCores/SSP flat): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl;
+        std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl;
     }
-    std::cout << "MaxBSP staleness=2 SSP (cached+flat) and GrowLocalAutoCores SpTRSV executed." << std::endl;
+    std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl;
     return 0;
 }
diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp
index bd5d7fab..76cd5e4a 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp
@@ -92,4 +92,4 @@ inline void FlatCheckpointCounterBarrierCached::Wait(const std::size_t threadId,
     }
 }
 
-}    // end namespace osp
+}
\ No newline at end of file

From bc18d26b8957512cc59f2eed47e0bf1925d17a61 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Fri, 6 Feb 2026 15:48:21 +0100
Subject: [PATCH 17/57] removed false sharing

---
 .../WeakBarriers/flat_checkpoint_counter_barrier.hpp | 10 +++++++++-
 tests/weak_barrier.cpp                               | 12 ++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
index 87607def..df3b53f1 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
@@ -28,6 +28,13 @@ limitations under the License.
 
 namespace osp {
 
+// Returns how many std::size_t elements cover `num` elements once their byte size is rounded up to whole cache lines.
+constexpr std::size_t RoundUpToCacheLine(std::size_t num) {
+    std::size_t size = ((num * sizeof(std::size_t) + CACHE_LINE_SIZE - 1U) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;    // bytes, multiple of CACHE_LINE_SIZE
+    std::size_t ans = (size + sizeof(std::size_t) - 1U) / sizeof(std::size_t);    // element count, rounded up
+    return ans;
+}
+
 struct alignas(CACHE_LINE_SIZE) AlignedAtomicCounter {
     std::atomic<std::size_t> cntr_{0U};
     int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic<std::size_t>)];
@@ -44,7 +51,8 @@ class FlatCheckpointCounterBarrier {
   public:
     FlatCheckpointCounterBarrier(std::size_t numThreads)
         : cntrs_(std::vector<AlignedAtomicCounter>(numThreads)),
-          cachedCntrs_(std::vector<std::vector<std::size_t>>(numThreads, std::vector<std::size_t>(numThreads, 0U))) {};
+          cachedCntrs_(
+              std::vector<std::vector<std::size_t>>(numThreads, std::vector<std::size_t>(RoundUpToCacheLine(numThreads), 0U))) {};
 
     inline void Arrive(const std::size_t threadId);
     inline void Wait(const std::size_t threadId, const std::size_t diff) const;
diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp
index 22ad875a..a5d8ad01 100644
--- a/tests/weak_barrier.cpp
+++ b/tests/weak_barrier.cpp
@@ -545,4 +545,16 @@ BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_SSP_128Threads) {
             BOOST_CHECK_GE(cntr, std::max(current, static_cast<std::size_t>(2U)) - 2U);
         }
     }
+}
+
+// For i in [0, 256]: RoundUpToCacheLine(i) elements must cover the cache-line-rounded byte size, and one element fewer must not.
+BOOST_AUTO_TEST_CASE(TestVectorPadding) {
+    for (std::size_t i = 0U; i < 257; ++i) {
+        const std::size_t numCacheLines = (i * sizeof(std::size_t) + CACHE_LINE_SIZE - 1U) / CACHE_LINE_SIZE;    // cache lines needed for i elements
+        const std::size_t ans = RoundUpToCacheLine(i);
+        BOOST_CHECK_LE(numCacheLines * CACHE_LINE_SIZE, ans * sizeof(std::size_t));
+        if (ans > 0U) {
+            BOOST_CHECK_GT(numCacheLines * CACHE_LINE_SIZE, (ans - 1U) * sizeof(std::size_t));
+        }
+    }
+}
\ No newline at end of file

From c454e8a2f49ebfbcb3a757e39a19a93f4f0d52da Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 12 Feb 2026 16:51:10 +0100
Subject: [PATCH 18/57] added SSP grow local

---
 apps/maxbsp_ssp_sptrsv.cpp | 194 ++++++++++++++++++++++++-------------
 1 file changed, 124 insertions(+), 70 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 40c62ce3..c18f4f7d 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -3,22 +3,57 @@
  * Demonstrates maxbsp scheduling with staleness=2, then runs SpTRSV with SSP kernel.
  */
 
-#include <iostream>
-#include <vector>
 #include <Eigen/Sparse>
+#include <chrono>
+#include <iostream>
 #include <unsupported/Eigen/SparseExtra>
+#include <vector>
+
 #include "osp/auxiliary/sptrsv_simulator/sptrsv.hpp"
 #include "osp/bsp/model/BspInstance.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
 #include "osp/bsp/model/MaxBspSchedule.hpp"
-#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp"
 #include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
-#include <chrono>
 
 using namespace osp;
 
-int main(int argc, char* argv[]) {
+#define EPSILON 1e-20
+
+// Returns ||v - w||_2 / (sqrt((||v||_2^2 + ||w||_2^2) / 2) + EPSILON); EPSILON keeps the division defined when both vectors are zero.
+double L2NormalisedDiff(const std::vector<double> &v, const std::vector<double> &w) {
+    assert(v.size() == w.size());    // NOTE(review): <cassert> and <cmath> do not appear among this file's visible includes
+    double l2diff = 0.0;
+    double frobNorm = 0.0;
+    for (std::size_t i = 0U; i < v.size(); ++i) {
+        const double absdiff = std::abs(v[i] - w[i]);
+        l2diff += absdiff * absdiff;
+        const double vAbs = std::abs(v[i]);
+        const double wAbs = std::abs(w[i]);
+        // Accumulate the mean of the two squared entries, so the normaliser is symmetric in v and w.
+        frobNorm += ((vAbs * vAbs) + (wAbs * wAbs)) / 2.0;
+    }
+    l2diff = std::sqrt(l2diff);
+    frobNorm = std::sqrt(frobNorm);
+    const double ratio = l2diff / (frobNorm + EPSILON);
+    return ratio;
+}
+
+// Returns max over i of 2|v[i] - w[i]| / (|v[i]| + |w[i]| + EPSILON). NOTE(review): indexes w up to v.size() without a size check.
+double LInftyNormalisedDiff(const std::vector<double> &v, const std::vector<double> &w) {
+    double diff = 0.0;
+    for (std::size_t i = 0U; i < v.size(); ++i) {
+        const double absdiff = std::abs(v[i] - w[i]);
+        const double vAbs = std::abs(v[i]);
+        const double wAbs = std::abs(w[i]);
+        diff = std::max(diff, 2 * absdiff / (vAbs + wAbs + EPSILON));    // symmetric relative difference of entry i
+    }
+    return diff;
+}
+
+int main(int argc, char *argv[]) {
     // Accept matrix filename and iteration count as arguments (threads via OMP_NUM_THREADS or optional arg)
     std::string filename = "../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx";
     int num_iterations = 1;
@@ -49,13 +84,18 @@ int main(int argc, char* argv[]) {
     graph.SetCsr(&lCsr);
     Eigen::SparseMatrix<double, Eigen::ColMajor, int32_t> lCsc = lCsr;
     graph.SetCsc(&lCsc);
-    BspArchitecture<SparseMatrixImp<int32_t>> architecture(num_threads, 1, 500); // configurable processors
+    BspArchitecture<SparseMatrixImp<int32_t>> architecture(num_threads, 1, 500);    // configurable processors
     BspInstance<SparseMatrixImp<int32_t>> instance(graph, architecture);
 
     // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2)
-    GreedyVarianceSspScheduler<SparseMatrixImp<int32_t>> ssp_scheduler;
-    MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_schedule(instance);
-    ssp_scheduler.ComputeSchedule(ssp_schedule);
+    GreedyVarianceSspScheduler<SparseMatrixImp<int32_t>> ssp_var_scheduler;
+    MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_var_schedule(instance);
+    ssp_var_scheduler.ComputeSchedule(ssp_var_schedule);
+
+    // Create SSP-aware schedule using GrowLocalMaxBsp (staleness=2)
+    GrowLocalSSP<SparseMatrixImp<int32_t>> ssp_gl_scheduler;
+    MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_gl_schedule(instance);
+    ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule);
 
     // Create a non-SSP schedule using GrowLocalAutoCores
     GrowLocalAutoCores<SparseMatrixImp<int32_t>> growlocal_scheduler;
@@ -67,13 +107,13 @@ int main(int argc, char* argv[]) {
 
     size_t n = static_cast<size_t>(lCsc.cols());
 
-    // Benchmark SSP L-solve
-    double ssp_flat_total_time = 0.0;
-    std::vector<double> ssp_flat_result(n, 0.0);
+    // Benchmark SSP Variance L-solve
+    double ssp_var_flat_total_time = 0.0;
+    std::vector<double> ssp_var_flat_result(n, 0.0);
     for (int iter = 0; iter < num_iterations; ++iter) {
         std::vector<double> x(n, 0.0);
         std::vector<double> b(n, 1.0);
-        sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule);
+        sptrsv_kernel.SetupCsrNoPermutation(ssp_var_schedule);
         sptrsv_kernel.x_ = x.data();
         sptrsv_kernel.b_ = b.data();
         FlatCheckpointCounterBarrier barrier(num_threads);
@@ -81,10 +121,33 @@ int main(int argc, char* argv[]) {
         auto start = std::chrono::high_resolution_clock::now();
         sptrsv_kernel.SspLsolveStaleness2(ops);
         auto end = std::chrono::high_resolution_clock::now();
-        ssp_flat_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) ssp_flat_result = std::vector<double>(x.begin(), x.end());
+        ssp_var_flat_total_time += std::chrono::duration<double>(end - start).count();
+        if (iter == 0) {
+            ssp_var_flat_result = std::vector<double>(x.begin(), x.end());
+        }
     }
-    double ssp_flat_avg_time = ssp_flat_total_time / num_iterations;
+    double ssp_var_flat_avg_time = ssp_var_flat_total_time / num_iterations;
+
+    // Benchmark SSP GrowLocal L-solve
+    double ssp_gl_flat_total_time = 0.0;
+    std::vector<double> ssp_gl_flat_result(n, 0.0);
+    for (int iter = 0; iter < num_iterations; ++iter) {
+        std::vector<double> x(n, 0.0);
+        std::vector<double> b(n, 1.0);
+        sptrsv_kernel.SetupCsrNoPermutation(ssp_gl_schedule);
+        sptrsv_kernel.x_ = x.data();
+        sptrsv_kernel.b_ = b.data();
+        FlatCheckpointCounterBarrier barrier(num_threads);
+        auto ops = Sptrsv<int32_t>::MakeBarrierOps(barrier);
+        auto start = std::chrono::high_resolution_clock::now();
+        sptrsv_kernel.SspLsolveStaleness2(ops);
+        auto end = std::chrono::high_resolution_clock::now();
+        ssp_gl_flat_total_time += std::chrono::duration<double>(end - start).count();
+        if (iter == 0) {
+            ssp_gl_flat_result = std::vector<double>(x.begin(), x.end());
+        }
+    }
+    double ssp_gl_flat_avg_time = ssp_gl_flat_total_time / num_iterations;
 
     // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation)
     double growlocal_total_time = 0.0;
@@ -99,7 +162,9 @@ int main(int argc, char* argv[]) {
         sptrsv_kernel.LsolveNoPermutation();
         auto end = std::chrono::high_resolution_clock::now();
         growlocal_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) growlocal_result = std::vector<double>(x.begin(), x.end());
+        if (iter == 0) {
+            growlocal_result = std::vector<double>(x.begin(), x.end());
+        }
     }
     double growlocal_avg_time = growlocal_total_time / num_iterations;
 
@@ -115,74 +180,63 @@ int main(int argc, char* argv[]) {
         sptrsv_kernel.LsolveSerial();
         auto end = std::chrono::high_resolution_clock::now();
         serial_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) serial_result = std::vector<double>(x_serial.begin(), x_serial.end());
+        if (iter == 0) {
+            serial_result = std::vector<double>(x_serial.begin(), x_serial.end());
+        }
     }
     double serial_avg_time = serial_total_time / num_iterations;
 
     // Compare results
-    double max_diff_flat = 0.0;
-    double frobNorm = 0.0;
-    for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(ssp_flat_result[i] - serial_result[i]);
-        if (diff > max_diff_flat) max_diff_flat = diff;
-        frobNorm += diff * diff;
-    }
-    frobNorm = std::sqrt(frobNorm);
-    std::cout << "Frobenius norm of difference: " << frobNorm << std::endl;
-    std::cout << "Max difference between SSP and serial L-solve: " << max_diff_flat << std::endl;
-    if (frobNorm <= 1e-30 || max_diff_flat < 1e-10 * frobNorm) {
-        std::cout << "SSP L-solve matches serial L-solve!" << std::endl;
-    } else {
-        std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl;
-        std::cout << "Relative error: " << (max_diff_flat / frobNorm) << std::endl;
-    }
-    double max_diff_growlocal = 0.0;
-    double frobNormGrowlocal = 0.0;
-    for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(growlocal_result[i] - serial_result[i]);
-        if (diff > max_diff_growlocal) max_diff_growlocal = diff;
-        frobNormGrowlocal += diff * diff;
-    }
-    frobNormGrowlocal = std::sqrt(frobNormGrowlocal);
-    std::cout << "Max difference between GrowLocalAutoCores and serial L-solve: " << max_diff_growlocal << std::endl;
-    if (frobNormGrowlocal <= 1e-30 || max_diff_growlocal < 1e-10 * frobNormGrowlocal) {
-        std::cout << "GrowLocalAutoCores L-solve matches serial L-solve!" << std::endl;
+    const double varDiff = LInftyNormalisedDiff(ssp_var_flat_result, serial_result);
+
+    std::cout << "Max relative difference between SSP Variance and serial L-solve: " << varDiff << std::endl;
+    if (varDiff < EPSILON) {
+        std::cout << "SSP Variance L-solve matches serial L-solve!" << std::endl;
     } else {
-        std::cout << "GrowLocalAutoCores L-solve does NOT match serial L-solve!" << std::endl;
-        std::cout << "Relative error: " << (max_diff_growlocal / frobNormGrowlocal) << std::endl;
+        std::cout << "SSP Variance L-solve does NOT match serial L-solve!" << std::endl;
     }
 
-    double max_diff_ssp_growlocal = 0.0;
-    double frobNormSspGrowlocal = 0.0;
-    for (size_t i = 0; i < n; ++i) {
-        double diff = std::abs(ssp_flat_result[i] - growlocal_result[i]);
-        if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff;
-        frobNormSspGrowlocal += diff * diff;
+    const double GLSSPDiff = LInftyNormalisedDiff(ssp_gl_flat_result, serial_result);
+
+    std::cout << "Max relative difference between SSP GrowLocal and serial L-solve: " << GLSSPDiff << std::endl;
+    if (GLSSPDiff < EPSILON) {
+        std::cout << "SSP GrowLocal L-solve matches serial L-solve!" << std::endl;
+    } else {
+        std::cout << "SSP GrowLocal L-solve does NOT match serial L-solve!" << std::endl;
     }
-    frobNormSspGrowlocal = std::sqrt(frobNormSspGrowlocal);
-    std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal
-              << std::endl;
-    if (frobNormSspGrowlocal <= 1e-30 || max_diff_ssp_growlocal < 1e-10 * frobNormSspGrowlocal) {
-        std::cout << "SSP L-solve matches GrowLocalAutoCores L-solve!" << std::endl;
+
+    const double GLPDiff = LInftyNormalisedDiff(growlocal_result, serial_result);
+
+    std::cout << "Max relative difference between GrowLocal and serial L-solve: " << GLPDiff << std::endl;
+    if (GLPDiff < EPSILON) {
+        std::cout << "GrowLocal L-solve matches serial L-solve!" << std::endl;
     } else {
-        std::cout << "SSP L-solve does NOT match GrowLocalAutoCores L-solve!" << std::endl;
-        std::cout << "Relative error: " << (max_diff_ssp_growlocal / frobNormSspGrowlocal) << std::endl;
+        std::cout << "GrowLocal L-solve does NOT match serial L-solve!" << std::endl;
     }
 
-    std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time
-              << " seconds" << std::endl;
-    std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time
-              << " seconds" << std::endl;
-    std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl;
-    if (ssp_flat_avg_time > 0.0) {
-        std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl;
+    std::cout << "Average SSP Variance L-solve time (" << num_iterations << " runs): " << ssp_var_flat_avg_time << " seconds"
+              << std::endl;
+    std::cout << "Average SSP GrowLocal L-solve time (" << num_iterations << " runs): " << ssp_gl_flat_avg_time << " seconds"
+              << std::endl;
+    std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time << " seconds"
+              << std::endl;
+    std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl << std::endl;
+
+    if (ssp_var_flat_avg_time > 0.0) {
+        std::cout << "Speedup (serial/SSP Var): " << (serial_avg_time / ssp_var_flat_avg_time) << "x" << std::endl;
+    }
+    if (ssp_gl_flat_avg_time > 0.0) {
+        std::cout << "Speedup (serial/SSP GL): " << (serial_avg_time / ssp_gl_flat_avg_time) << "x" << std::endl;
     }
     if (growlocal_avg_time > 0.0) {
         std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl;
     }
-    if (ssp_flat_avg_time > 0.0) {
-        std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl;
+    if (ssp_var_flat_avg_time > 0.0) {
+        std::cout << "Speedup (GrowLocalAutoCores/SSP Var): " << (growlocal_avg_time / ssp_var_flat_avg_time) << "x" << std::endl;
+    }
+    if (ssp_gl_flat_avg_time > 0.0) {
+        std::cout << "Speedup (GrowLocalAutoCores/SSP GL): " << (growlocal_avg_time / ssp_gl_flat_avg_time) << "x" << std::endl;
     }
-    std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl;
+
     return 0;
 }

From b32f4dc40bc5302d5a9bc2872a79fd61c1d798b4 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 12 Feb 2026 17:11:22 +0100
Subject: [PATCH 19/57] changed splitting of work between supersteps

---
 include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
index 8ce849c6..23462bd8 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
@@ -71,7 +71,7 @@ typename std::deque<VertexIdxT<GraphT>>::difference_type GrowLocalSSP<GraphT>::m
         typename std::deque<VertexType>::difference_type lengthNext
             = std::distance(nextSuperstepReady.cbegin(), nextSuperstepReady.cend());
 
-        typename std::deque<VertexType>::difference_type ans = ((lengthCurrently + lengthNext + 2) / 3) * 2;
+        typename std::deque<VertexType>::difference_type ans = ((lengthCurrently + lengthNext + 1) / 2);
 
         return ans;
     }

From 27f0edd551e45eb6868ef4735e5e1494b64f664f Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Fri, 13 Feb 2026 13:59:20 +0100
Subject: [PATCH 20/57] aligned allocators

---
 .../WeakBarriers/aligned_allocator.hpp        | 70 +++++++++++++++++
 .../flat_checkpoint_counter_barrier.hpp       | 14 ++--
 tests/CMakeLists.txt                          |  2 +
 tests/aligned_allocator.cpp                   | 77 +++++++++++++++++++
 4 files changed, 157 insertions(+), 6 deletions(-)
 create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp
 create mode 100644 tests/aligned_allocator.cpp

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp
new file mode 100644
index 00000000..b5103a91
--- /dev/null
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp
@@ -0,0 +1,71 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <cstddef>
+#include <cstdlib>
+#include <memory>
+#include <new>
+
+namespace osp {
+
+template <class T, std::size_t alignment = alignof(T)>
+struct AlignedAllocator {
+    static_assert(alignment > 0U, "Alignment must be a positive integer.");
+    static_assert((alignment & (alignment - 1U)) == 0U, "Alignment must be a power of two.");
+    static_assert(alignment % alignof(T) == 0U, "Alignment must be a multiple of the alignment of the type.");
+
+    using value_type = T;
+
+    template <typename U>
+    struct rebind {
+        using other = AlignedAllocator<U, alignment>;
+    };
+
+    AlignedAllocator() noexcept = default;
+
+    template <class U>
+    AlignedAllocator(const AlignedAllocator<U, alignment> &) noexcept {}
+
+    inline T *allocate(std::size_t size) { return reinterpret_cast<T *>(std::aligned_alloc(alignment, size * sizeof(T))); }
+
+    inline void deallocate(T *p, [[maybe_unused]] std::size_t size) { std::free(p); }
+
+    template <typename U, typename... Args>
+    inline void construct(U *p, Args &&...args) {
+        new (static_cast<void *>(p)) U(std::forward<Args>(args)...);
+    }
+
+    template <typename U>
+    inline void destroy(U *p) noexcept {
+        p->~U();
+    }
+};
+
+template <class T, std::size_t T_alignment, class U, std::size_t U_alignment>
+constexpr bool operator==(const AlignedAllocator<T, T_alignment> &, const AlignedAllocator<U, U_alignment> &) noexcept {
+    return (T_alignment == U_alignment);
+}
+
+template <class T, std::size_t T_alignment, class U, std::size_t U_alignment>
+constexpr bool operator!=(const AlignedAllocator<T, T_alignment> &, const AlignedAllocator<U, U_alignment> &) noexcept {
+    return (T_alignment != U_alignment);
+}
+
+}    // end namespace osp
diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
index df3b53f1..5b25acd3 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
@@ -23,6 +23,7 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp"
 #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp"
 #include "osp/config/config.hpp"
 
@@ -46,13 +47,14 @@ struct alignas(CACHE_LINE_SIZE) AlignedAtomicCounter {
 class FlatCheckpointCounterBarrier {
   private:
     std::vector<AlignedAtomicCounter> cntrs_;
-    mutable std::vector<std::vector<std::size_t>> cachedCntrs_;
+    mutable std::vector<std::vector<std::size_t, AlignedAllocator<std::size_t, CACHE_LINE_SIZE>>> cachedCntrs_;
 
   public:
     FlatCheckpointCounterBarrier(std::size_t numThreads)
         : cntrs_(std::vector<AlignedAtomicCounter>(numThreads)),
-          cachedCntrs_(
-              std::vector<std::vector<std::size_t>>(numThreads, std::vector<std::size_t>(RoundUpToCacheLine(numThreads), 0U))) {};
+          cachedCntrs_(std::vector<std::vector<std::size_t, AlignedAllocator<std::size_t, CACHE_LINE_SIZE>>>(
+              numThreads,
+              std::vector<std::size_t, AlignedAllocator<std::size_t, CACHE_LINE_SIZE>>(RoundUpToCacheLine(numThreads), 0U))) {};
 
     inline void Arrive(const std::size_t threadId);
     inline void Wait(const std::size_t threadId, const std::size_t diff) const;
@@ -66,12 +68,12 @@ class FlatCheckpointCounterBarrier {
 };
 
 inline void FlatCheckpointCounterBarrier::Arrive(const std::size_t threadId) {
-    const std::size_t curr = cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release) + 1U;
-    cachedCntrs_[threadId][threadId] = curr;
+    cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release);
+    ++cachedCntrs_[threadId][threadId];
 }
 
 inline void FlatCheckpointCounterBarrier::Wait(const std::size_t threadId, const std::size_t diff) const {
-    std::vector<std::size_t> &localCachedCntrs = cachedCntrs_[threadId];
+    std::vector<std::size_t, AlignedAllocator<std::size_t, CACHE_LINE_SIZE>> &localCachedCntrs = cachedCntrs_[threadId];
 
     const std::size_t minVal = std::max(localCachedCntrs[threadId], diff) - diff;
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d6a8f8c2..7bf3aed5 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -131,6 +131,8 @@ _add_test( hash_pair )
 
 _add_test( weak_barrier )
 
+_add_test( aligned_allocator )
+
 ## io
 _add_test( filereader DATA )
 
diff --git a/tests/aligned_allocator.cpp b/tests/aligned_allocator.cpp
new file mode 100644
index 00000000..6f03257d
--- /dev/null
+++ b/tests/aligned_allocator.cpp
@@ -0,0 +1,77 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#define BOOST_TEST_MODULE AlignedAllocatorTests
+
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp"
+
+#include <boost/test/unit_test.hpp>
+#include <vector>
+
+using namespace osp;
+
+BOOST_AUTO_TEST_CASE(TestAlignedAllocation32) {
+    constexpr std::size_t alignment = 32U;
+
+    std::vector<unsigned, AlignedAllocator<unsigned, alignment>> vec(7, 7U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+
+    for (unsigned i = 0U; i < 2048U; ++i) {
+        vec.emplace_back(i);
+        BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+    }
+
+    vec.resize(8000U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+    vec.resize(5U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+}
+
+BOOST_AUTO_TEST_CASE(TestAlignedAllocation16) {
+    constexpr std::size_t alignment = 16U;
+
+    std::vector<unsigned, AlignedAllocator<unsigned, alignment>> vec(7, 7U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+
+    for (unsigned i = 0U; i < 2048U; ++i) {
+        vec.emplace_back(i);
+        BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+    }
+
+    vec.resize(8000U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+    vec.resize(5U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+}
+
+BOOST_AUTO_TEST_CASE(TestAlignedAllocation64) {
+    constexpr std::size_t alignment = 64U;
+
+    std::vector<char, AlignedAllocator<char, alignment>> vec(7, 7U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+
+    for (unsigned i = 0U; i < 2048U; ++i) {
+        vec.emplace_back('a');
+        BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+    }
+
+    vec.resize(8000U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+    vec.resize(5U);
+    BOOST_CHECK_EQUAL(reinterpret_cast<std::size_t>(static_cast<void *>(vec.data())) % alignment, 0U);
+}

From ef2abf8578db0239b125985f6d59a9a0a4fdde61 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 19 Feb 2026 08:42:23 +0100
Subject: [PATCH 21/57] compact sparse graph mtx file reader

---
 ...tx_to_compact_sparse_graph_file_reader.hpp | 138 ++++++++++++++++++
 .../adj_list_impl/compact_sparse_graph.hpp    |   4 +-
 tests/filereader.cpp                          |  71 +++++++++
 3 files changed, 211 insertions(+), 2 deletions(-)
 create mode 100644 include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp

diff --git a/include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp b/include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp
new file mode 100644
index 00000000..1730f594
--- /dev/null
+++ b/include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp
@@ -0,0 +1,163 @@
+/*
+Copyright 2026 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "osp/auxiliary/io/mtx_graph_file_reader.hpp"
+#include "osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp"
+
+namespace osp {
+namespace file_reader {
+
+/**
+ * @brief Matrix Market reader specialised for the CompactSparseGraph instantiation named in the signature.
+ *
+ * Expects a square, lower-triangular matrix in coordinate format: a "rows cols entries" header followed by one
+ * "row col value" line per entry with 1-based indices. Each off-diagonal entry (row, col) is recorded as the
+ * pair (col, row); diagonal entries count towards the announced number of entries but add no edge. The graph is
+ * assembled from the collected edge list only after the whole file has been validated.
+ *
+ * Declared inline: an explicit specialisation of a function template is an ordinary function, so without it
+ * including this header from more than one translation unit would violate the one-definition rule.
+ *
+ * @param infile Input stream to read the Matrix Market data from.
+ * @param graph  Assigned the parsed graph on success; not modified when parsing fails.
+ * @return true on success; false, after printing a diagnostic to std::cerr, when the input is rejected.
+ */
+template <>
+inline bool ReadComputationalDagMartixMarketFormat<
+    CompactSparseGraph<true, false, false, false, false, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t,
+    unsigned>>( std::ifstream &infile, CompactSparseGraph<true, false, false, false, false, std::size_t, std::size_t,
+    std::size_t, std::size_t, std::size_t, unsigned>
+        &graph) {
+    using GraphT
+        = CompactSparseGraph<true, false, false, false, false, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t,
+        unsigned>;
+    using VertexT = VertexIdxT<GraphT>;
+
+    std::vector<std::pair<VertexT, VertexT>> edges;
+    std::string line;
+
+    // Skip comments or empty lines (robustly)
+    while (std::getline(infile, line)) {
+        if (line.empty() || line[0] == '%') {
+            continue;
+        }
+
+        // Null byte check
+        if (line.find('\0') != std::string::npos) {
+            std::cerr << "Error: Null byte detected in header line.\n";
+            return false;
+        }
+
+        if (line.size() > MAX_LINE_LENGTH) {
+            std::cerr << "Error: Line too long, possible malformed or malicious file.\n";
+            return false;
+        }
+        break;    // We found the actual header line
+    }
+
+    if (infile.eof()) {
+        std::cerr << "Error: Unexpected end of file while reading header.\n";
+        return false;
+    }
+
+    VertexT mRow = 0;
+    VertexT mCol = 0;
+    std::size_t nEntries = 0;
+
+    std::istringstream headerStream(line);
+    if (!(headerStream >> mRow >> mCol >> nEntries) || mRow <= 0 || mCol <= 0 || mRow != mCol) {
+        std::cerr << "Error: Invalid header or non-square matrix.\n";
+        return false;
+    }
+
+    const VertexT numNodes = mRow;
+
+    // Read exactly nEntries data lines; comment and blank lines in between do not count.
+    std::size_t entriesRead = 0;
+    while (entriesRead < nEntries && std::getline(infile, line)) {
+        if (line.empty() || line[0] == '%') {
+            continue;
+        }
+        if (line.size() > MAX_LINE_LENGTH) {
+            std::cerr << "Error: Line too long.\n";
+            return false;
+        }
+
+        std::istringstream entryStream(line);
+        VertexT row = std::numeric_limits<VertexT>::max();
+        VertexT col = std::numeric_limits<VertexT>::max();
+        double val = 0.0;
+
+        if (!(entryStream >> row >> col >> val)) {
+            std::cerr << "Error: Malformed matrix entry.\n";
+            return false;
+        }
+
+        row -= 1;
+        col -= 1;    // Convert to 0-based
+
+        if (row >= mRow || col >= mCol) {
+            std::cerr << "Error: Matrix entry out of bounds.\n";
+            return false;
+        }
+
+        if (row < col) {
+            std::cerr << "Error: Expected lower-triangular matrix.\n";
+            return false;
+        }
+
+        // Diagonal entries add no edge; an off-diagonal entry is recorded as the pair (col, row).
+        if (row != col) {
+            edges.emplace_back(col, row);
+        }
+
+        ++entriesRead;
+    }
+
+    if (entriesRead != nEntries) {
+        std::cerr << "Error: Incomplete matrix entries.\n";
+        return false;
+    }
+
+    // Anything other than comments or blank lines after the announced entries is an error.
+    while (std::getline(infile, line)) {
+        if (!line.empty() && line[0] != '%') {
+            std::cerr << "Error: Extra data after matrix content.\n";
+            return false;
+        }
+    }
+
+    graph = GraphT(numNodes, edges);
+
+    return true;
+}
+
+}    // namespace file_reader
+}    // namespace osp
diff --git a/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp b/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp
index 9d4614fb..e7488bc4 100644
--- a/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp
+++ b/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp
@@ -831,7 +831,7 @@ class CompactSparseGraph {
 
     template <typename RetT = VertexCommWeightType>
     inline std::enable_if_t<not useCommWeights, RetT> VertexCommWeight(const VertexIdx) const {
-        return static_cast<RetT>(0);
+        return static_cast<RetT>(1);
     }
 
     template <typename RetT = VertexMemWeightType>
@@ -841,7 +841,7 @@ class CompactSparseGraph {
 
     template <typename RetT = VertexMemWeightType>
     inline std::enable_if_t<not useMemWeights, RetT> VertexMemWeight(const VertexIdx) const {
-        return static_cast<RetT>(0);
+        return static_cast<RetT>(1);
     }
 
     template <typename RetT = VertexTypeType>
diff --git a/tests/filereader.cpp b/tests/filereader.cpp
index 0f6c0917..85347b3b 100644
--- a/tests/filereader.cpp
+++ b/tests/filereader.cpp
@@ -25,6 +25,7 @@ limitations under the License.
 #include "osp/auxiliary/io/dot_graph_file_reader.hpp"
 #include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
 #include "osp/auxiliary/io/mtx_graph_file_reader.hpp"
+#include "osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
 #include "osp/graph_implementations/boost_graphs/boost_graph.hpp"
@@ -169,6 +170,76 @@ BOOST_AUTO_TEST_CASE(TestMtxBoostGraph) {
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(7).begin(), graph.Children(7).end(), c7.begin(), c7.end());
 }
 
+BOOST_AUTO_TEST_CASE(TestMtxCompactSparseGraph) {
+    // Getting root git directory
+    std::filesystem::path cwd = std::filesystem::current_path();
+    std::cout << cwd << std::endl;
+    while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) {
+        cwd = cwd.parent_path();
+        std::cout << cwd << std::endl;
+    }
+
+    CompactSparseGraph<true, false, false, false, false, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, unsigned>
+        graph;
+
+    bool status
+        = file_reader::ReadComputationalDagMartixMarketFormat((cwd / "data/mtx_tests/ErdosRenyi_8_19_A.mtx").string(), graph);
+
+    std::cout << "STATUS:" << status << std::endl;
+    BOOST_CHECK(status);
+    BOOST_CHECK_EQUAL(graph.NumVertices(), 8);
+    BOOST_CHECK_EQUAL(graph.NumEdges(), 19);
+
+    // ---- Node 0
+    std::vector<int> p0{};
+    std::vector<int> c0{2, 3, 4, 5, 6};
+
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(0).begin(), graph.Parents(0).end(), p0.begin(), p0.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(0).begin(), graph.Children(0).end(), c0.begin(), c0.end());
+
+    // ---- Node 1
+    std::vector<int> p1{};
+    std::vector<int> c1{2, 3, 5, 6};
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(1).begin(), graph.Parents(1).end(), p1.begin(), p1.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(1).begin(), graph.Children(1).end(), c1.begin(), c1.end());
+
+    // ---- Node 2
+    std::vector<int> p2{0, 1};
+    std::vector<int> c2{3, 5};
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(2).begin(), graph.Parents(2).end(), p2.begin(), p2.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(2).begin(), graph.Children(2).end(), c2.begin(), c2.end());
+
+    // ---- Node 3
+    std::vector<int> p3{0, 1, 2};
+    std::vector<int> c3{4, 5, 6, 7};
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(3).begin(), graph.Parents(3).end(), p3.begin(), p3.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(3).begin(), graph.Children(3).end(), c3.begin(), c3.end());
+
+    // ---- Node 4
+    std::vector<int> p4{0, 3};
+    std::vector<int> c4{5, 6, 7};
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(4).begin(), graph.Parents(4).end(), p4.begin(), p4.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(4).begin(), graph.Children(4).end(), c4.begin(), c4.end());
+
+    // ---- Node 5
+    std::vector<int> p5{0, 1, 2, 3, 4};
+    std::vector<int> c5{};
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(5).begin(), graph.Parents(5).end(), p5.begin(), p5.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(5).begin(), graph.Children(5).end(), c5.begin(), c5.end());
+
+    // ---- Node 6
+    std::vector<int> p6{0, 1, 3, 4};
+    std::vector<int> c6{7};
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(6).begin(), graph.Parents(6).end(), p6.begin(), p6.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(6).begin(), graph.Children(6).end(), c6.begin(), c6.end());
+
+    // ---- Node 7
+    std::vector<int> p7{3, 4, 6};
+    std::vector<int> c7{};
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(7).begin(), graph.Parents(7).end(), p7.begin(), p7.end());
+    BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(7).begin(), graph.Children(7).end(), c7.begin(), c7.end());
+}
+
 BOOST_AUTO_TEST_CASE(TestBicgstab) {
     // Getting root git directory
     std::filesystem::path cwd = std::filesystem::current_path();

From 63827f5194709e4ecdff2529be8f87e30ee5af92 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 19 Feb 2026 08:46:27 +0100
Subject: [PATCH 22/57] vertex weight test

---
 tests/filereader.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/filereader.cpp b/tests/filereader.cpp
index 85347b3b..2d809458 100644
--- a/tests/filereader.cpp
+++ b/tests/filereader.cpp
@@ -196,48 +196,56 @@ BOOST_AUTO_TEST_CASE(TestMtxCompactSparseGraph) {
 
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(0).begin(), graph.Parents(0).end(), p0.begin(), p0.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(0).begin(), graph.Children(0).end(), c0.begin(), c0.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(0), p0.size() + 1);
 
     // ---- Node 1
     std::vector<int> p1{};
     std::vector<int> c1{2, 3, 5, 6};
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(1).begin(), graph.Parents(1).end(), p1.begin(), p1.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(1).begin(), graph.Children(1).end(), c1.begin(), c1.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(1), p1.size() + 1);
 
     // ---- Node 2
     std::vector<int> p2{0, 1};
     std::vector<int> c2{3, 5};
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(2).begin(), graph.Parents(2).end(), p2.begin(), p2.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(2).begin(), graph.Children(2).end(), c2.begin(), c2.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(2), p2.size() + 1);
 
     // ---- Node 3
     std::vector<int> p3{0, 1, 2};
     std::vector<int> c3{4, 5, 6, 7};
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(3).begin(), graph.Parents(3).end(), p3.begin(), p3.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(3).begin(), graph.Children(3).end(), c3.begin(), c3.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(3), p3.size() + 1);
 
     // ---- Node 4
     std::vector<int> p4{0, 3};
     std::vector<int> c4{5, 6, 7};
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(4).begin(), graph.Parents(4).end(), p4.begin(), p4.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(4).begin(), graph.Children(4).end(), c4.begin(), c4.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(4), p4.size() + 1);
 
     // ---- Node 5
     std::vector<int> p5{0, 1, 2, 3, 4};
     std::vector<int> c5{};
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(5).begin(), graph.Parents(5).end(), p5.begin(), p5.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(5).begin(), graph.Children(5).end(), c5.begin(), c5.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(5), p5.size() + 1);
 
     // ---- Node 6
     std::vector<int> p6{0, 1, 3, 4};
     std::vector<int> c6{7};
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(6).begin(), graph.Parents(6).end(), p6.begin(), p6.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(6).begin(), graph.Children(6).end(), c6.begin(), c6.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(6), p6.size() + 1);
 
     // ---- Node 7
     std::vector<int> p7{3, 4, 6};
     std::vector<int> c7{};
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(7).begin(), graph.Parents(7).end(), p7.begin(), p7.end());
     BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(7).begin(), graph.Children(7).end(), c7.begin(), c7.end());
+    BOOST_CHECK_EQUAL(graph.VertexWorkWeight(7), p7.size() + 1);
 }
 
 BOOST_AUTO_TEST_CASE(TestBicgstab) {

From fc2a31504527bbcdb02bf128b4dd87be0c1dcd26 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 19 Feb 2026 13:39:18 +0100
Subject: [PATCH 23/57] fixed allocation length

---
 .../WeakBarriers/aligned_allocator.hpp           | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp
index b5103a91..906f87a2 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp
@@ -42,9 +42,8 @@ struct AlignedAllocator {
     template <class U>
     AlignedAllocator(const AlignedAllocator<U, alignment> &) noexcept {}
 
-    inline T *allocate(std::size_t size) { return reinterpret_cast<T *>(std::aligned_alloc(alignment, size * sizeof(T))); }
-
-    inline void deallocate(T *p, [[maybe_unused]] std::size_t size) { std::free(p); }
+    inline T *allocate(std::size_t size);
+    inline void deallocate(T *p, [[maybe_unused]] std::size_t size);
 
     template <typename U, typename... Args>
     inline void construct(U *p, Args &&...args) {
@@ -57,6 +56,44 @@ struct AlignedAllocator {
     }
 };
 
+/**
+ * @brief Allocates uninitialised storage for `size` objects of type T, aligned to `alignment` bytes.
+ *
+ * std::aligned_alloc expects the byte count to be a multiple of the alignment, so the request is rounded up.
+ *
+ * @param size Number of objects of type T to make room for.
+ * @return Pointer to the aligned storage; release it with deallocate().
+ * @throws std::bad_alloc if the rounded byte count would overflow std::size_t or the allocation fails.
+ */
+template <class T, std::size_t alignment>
+inline T *AlignedAllocator<T, alignment>::allocate(std::size_t size) {
+    // Largest element count whose rounded-up byte size still fits into std::size_t.
+    constexpr std::size_t maxSize = (static_cast<std::size_t>(-1) - (alignment - 1U)) / sizeof(T);
+    if (size > maxSize) {
+        throw std::bad_alloc();
+    }
+
+    const std::size_t allocationSize = ((size * sizeof(T) + alignment - 1U) / alignment) * alignment;
+    void *const storage = std::aligned_alloc(alignment, allocationSize);
+    // A null result for a zero-byte request is permitted; for any other request it signals failure, which an
+    // allocator must report by throwing rather than by handing nullptr to the container.
+    if (storage == nullptr && allocationSize != 0U) {
+        throw std::bad_alloc();
+    }
+    return static_cast<T *>(storage);
+}
+
+/**
+ * @brief Releases storage previously obtained from allocate().
+ *
+ * @param p Pointer returned by allocate().
+ * @param size Unused; std::free does not need the allocation size.
+ */
+template <class T, std::size_t alignment>
+inline void AlignedAllocator<T, alignment>::deallocate(T *p, [[maybe_unused]] std::size_t size) {
+    std::free(p);
+}
+
 template <class T, std::size_t T_alignment, class U, std::size_t U_alignment>
 constexpr bool operator==(const AlignedAllocator<T, T_alignment> &, const AlignedAllocator<U, U_alignment> &) noexcept {
     return (T_alignment == U_alignment);

From 33ed5d553427c0e908c218cc50cb9f1a525d8a51 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Thu, 19 Feb 2026 13:54:46 +0100
Subject: [PATCH 24/57] clean ssp sptrsv

---
 apps/maxbsp_ssp_sptrsv.cpp                    |  8 ++---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 35 +++----------------
 2 files changed, 7 insertions(+), 36 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index c18f4f7d..7d0153ac 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -116,10 +116,8 @@ int main(int argc, char *argv[]) {
         sptrsv_kernel.SetupCsrNoPermutation(ssp_var_schedule);
         sptrsv_kernel.x_ = x.data();
         sptrsv_kernel.b_ = b.data();
-        FlatCheckpointCounterBarrier barrier(num_threads);
-        auto ops = Sptrsv<int32_t>::MakeBarrierOps(barrier);
         auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness2(ops);
+        sptrsv_kernel.SspLsolveStaleness2();
         auto end = std::chrono::high_resolution_clock::now();
         ssp_var_flat_total_time += std::chrono::duration<double>(end - start).count();
         if (iter == 0) {
@@ -137,10 +135,8 @@ int main(int argc, char *argv[]) {
         sptrsv_kernel.SetupCsrNoPermutation(ssp_gl_schedule);
         sptrsv_kernel.x_ = x.data();
         sptrsv_kernel.b_ = b.data();
-        FlatCheckpointCounterBarrier barrier(num_threads);
-        auto ops = Sptrsv<int32_t>::MakeBarrierOps(barrier);
         auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness2(ops);
+        sptrsv_kernel.SspLsolveStaleness2();
         auto end = std::chrono::high_resolution_clock::now();
         ssp_gl_flat_total_time += std::chrono::duration<double>(end - start).count();
         if (iter == 0) {
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 7d08e32c..22dcc294 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -36,7 +36,6 @@ limitations under the License.
 #    include <vector>
 
 #    include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp"
-#    include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp"
 #    include "osp/bsp/model/BspInstance.hpp"
 #    include "osp/bsp/model/BspSchedule.hpp"
 #    include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
@@ -51,23 +50,6 @@ class Sptrsv {
     const BspInstance<SparseMatrixImp<EigenIdxType>> *instance_;
 
   public:
-    struct BarrierOps {
-        void *ctx;
-        void (*arrive)(void *ctx, std::size_t threadId);
-        void (*wait)(void *ctx, std::size_t threadId, std::size_t diff);
-    };
-
-    template <typename BarrierT>
-    static BarrierOps MakeBarrierOps(BarrierT &barrier) {
-        return BarrierOps{
-            static_cast<void *>(&barrier),
-            [](void *ctx, std::size_t threadId) {
-                static_cast<BarrierT *>(ctx)->Arrive(threadId);
-            },
-            [](void *ctx, std::size_t threadId, std::size_t diff) {
-                static_cast<BarrierT *>(ctx)->Wait(threadId, diff);
-            }};
-    }
     std::vector<double> val_;
     std::vector<double> cscVal_;
 
@@ -505,10 +487,11 @@ class Sptrsv {
     std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); }
 
     // SSP Lsolve with staleness=2 (allowing at most one superstep of lag).
-    // Barrier operations are injected via function pointers.
-    void SspLsolveStaleness2(const BarrierOps &barrierOps) {
+    // Uses FlatCheckpointCounterBarrier created internally.
+    void SspLsolveStaleness2() {
         constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference
         const unsigned nthreads = instance_->NumberOfProcessors();
+        FlatCheckpointCounterBarrier barrier(nthreads);
 
         auto *csr = instance_->GetComputationalDag().GetCSR();
         const auto *outer = csr->outerIndexPtr();
@@ -520,7 +503,7 @@ class Sptrsv {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
                 // Enforce staleness window before starting this superstep.
-                barrierOps.wait(barrierOps.ctx, proc, staleness - 1U);
+                barrier.Wait(proc, staleness - 1U);
                 // Process nodes assigned to this (step, proc) pair.
                 const size_t boundsStrSize = boundsArrayL_[step][proc].size();
                 for (size_t index = 0; index < boundsStrSize; index += 2) {
@@ -539,19 +522,11 @@ class Sptrsv {
                     }
                 }
                 // Signal completion of this superstep.
-                barrierOps.arrive(barrierOps.ctx, proc);
+                barrier.Arrive(proc);
             }
         }
     }
 
-    // Default SSP Lsolve uses the cached flat checkpoint counter barrier.
-    void SspLsolveStaleness2() {
-        const unsigned nthreads = instance_->NumberOfProcessors();
-        FlatCheckpointCounterBarrierCached barrier(nthreads);
-        const BarrierOps ops = MakeBarrierOps(barrier);
-        SspLsolveStaleness2(ops);
-    }
-
     virtual ~Sptrsv() = default;
 };
 

From 9fc69e94307f55cb351443fe717f083db04903d3 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 19 Feb 2026 14:20:30 +0100
Subject: [PATCH 25/57] made staleness a parameter

---
 apps/maxbsp_ssp_sptrsv.cpp                    | 12 ++++---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp |  4 +--
 .../GreedySchedulers/GrowLocalMaxBsp.hpp      | 36 +++++++++----------
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 7d0153ac..065cf572 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -20,7 +20,7 @@
 
 using namespace osp;
 
-#define EPSILON 1e-20
+#define EPSILON 1e-50
 
 double L2NormalisedDiff(const std::vector<double> &v, const std::vector<double> &w) {
     assert(v.size() == w.size());
@@ -87,13 +87,15 @@ int main(int argc, char *argv[]) {
     BspArchitecture<SparseMatrixImp<int32_t>> architecture(num_threads, 1, 500);    // configurable processors
     BspInstance<SparseMatrixImp<int32_t>> instance(graph, architecture);
 
+    constexpr unsigned staleness = 2U;
+
     // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2)
     GreedyVarianceSspScheduler<SparseMatrixImp<int32_t>> ssp_var_scheduler;
     MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_var_schedule(instance);
-    ssp_var_scheduler.ComputeSchedule(ssp_var_schedule);
+    ssp_var_scheduler.ComputeSspSchedule(ssp_var_schedule, staleness);
 
     // Create SSP-aware schedule using GrowLocalMaxBsp (staleness=2)
-    GrowLocalSSP<SparseMatrixImp<int32_t>> ssp_gl_scheduler;
+    GrowLocalSSP<SparseMatrixImp<int32_t>, staleness> ssp_gl_scheduler;
     MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_gl_schedule(instance);
     ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule);
 
@@ -117,7 +119,7 @@ int main(int argc, char *argv[]) {
         sptrsv_kernel.x_ = x.data();
         sptrsv_kernel.b_ = b.data();
         auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness2();
+        sptrsv_kernel.SspLsolveStaleness<staleness>();
         auto end = std::chrono::high_resolution_clock::now();
         ssp_var_flat_total_time += std::chrono::duration<double>(end - start).count();
         if (iter == 0) {
@@ -136,7 +138,7 @@ int main(int argc, char *argv[]) {
         sptrsv_kernel.x_ = x.data();
         sptrsv_kernel.b_ = b.data();
         auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness2();
+        sptrsv_kernel.SspLsolveStaleness<staleness>();
         auto end = std::chrono::high_resolution_clock::now();
         ssp_gl_flat_total_time += std::chrono::duration<double>(end - start).count();
         if (iter == 0) {
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 22dcc294..0fbc80c5 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -488,8 +488,8 @@ class Sptrsv {
 
     // SSP Lsolve with staleness=2 (allowing at most one superstep of lag).
     // Uses FlatCheckpointCounterBarrier created internally.
-    void SspLsolveStaleness2() {
-        constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference
+    template <unsigned staleness = 2U>
+    void SspLsolveStaleness() {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
index cfd8f85f..ad088a4c 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
@@ -39,7 +39,7 @@ struct GrowLocalSSPParams {
     WeightT syncCostMultiplierParallelCheck_ = 4;
 };
 
-template <typename GraphT>
+template <typename GraphT, unsigned staleness_t = 2U>
 class GrowLocalSSP : public MaxBspScheduler<GraphT> {
     static_assert(isDirectedGraphV<GraphT>);
     static_assert(hasVertexWeightsV<GraphT>);
@@ -47,7 +47,7 @@ class GrowLocalSSP : public MaxBspScheduler<GraphT> {
   private:
     using VertexType = VertexIdxT<GraphT>;
 
-    static constexpr unsigned staleness{2U};
+    static constexpr unsigned staleness{staleness_t};
     GrowLocalSSPParams<VertexIdxT<GraphT>, VWorkwT<GraphT>> params_;
 
     /*! Vertices ready in current superstep */
@@ -89,18 +89,18 @@ class GrowLocalSSP : public MaxBspScheduler<GraphT> {
     std::string GetScheduleName() const override { return "GrowLocalSSP"; }
 };
 
-template <typename GraphT>
-inline GrowLocalSSPParams<VertexIdxT<GraphT>, VWorkwT<GraphT>> &GrowLocalSSP<GraphT>::GetParameters() {
+template <typename GraphT, unsigned staleness_t>
+inline GrowLocalSSPParams<VertexIdxT<GraphT>, VWorkwT<GraphT>> &GrowLocalSSP<GraphT, staleness_t>::GetParameters() {
     return params_;
 }
 
-template <typename GraphT>
-inline const GrowLocalSSPParams<VertexIdxT<GraphT>, VWorkwT<GraphT>> &GrowLocalSSP<GraphT>::GetParameters() const {
+template <typename GraphT, unsigned staleness_t>
+inline const GrowLocalSSPParams<VertexIdxT<GraphT>, VWorkwT<GraphT>> &GrowLocalSSP<GraphT, staleness_t>::GetParameters() const {
     return params_;
 }
 
-template <typename GraphT>
-void GrowLocalSSP<GraphT>::Init(const unsigned numProcs) {
+template <typename GraphT, unsigned staleness_t>
+void GrowLocalSSP<GraphT, staleness_t>::Init(const unsigned numProcs) {
     currentlyReady_.clear();
 
     for (auto &stepFutureReady : futureReady_) {
@@ -125,8 +125,8 @@ void GrowLocalSSP<GraphT>::Init(const unsigned numProcs) {
     }
 }
 
-template <typename GraphT>
-void GrowLocalSSP<GraphT>::ReleaseMemory() {
+template <typename GraphT, unsigned staleness_t>
+void GrowLocalSSP<GraphT, staleness_t>::ReleaseMemory() {
     currentlyReady_.clear();
     currentlyReady_.shrink_to_fit();
 
@@ -159,8 +159,8 @@ void GrowLocalSSP<GraphT>::ReleaseMemory() {
     }
 }
 
-template <typename GraphT>
-inline typename std::deque<VertexIdxT<GraphT>>::difference_type GrowLocalSSP<GraphT>::MaxAllReadyUsage(
+template <typename GraphT, unsigned staleness_t>
+inline typename std::deque<VertexIdxT<GraphT>>::difference_type GrowLocalSSP<GraphT, staleness_t>::MaxAllReadyUsage(
     const std::deque<VertexIdxT<GraphT>> &currentlyReady, const std::deque<VertexIdxT<GraphT>> &nextSuperstepReady) const {
     if constexpr (staleness == 1U) {
         return std::distance(currentlyReady.cbegin(), currentlyReady.cend());
@@ -176,8 +176,8 @@ inline typename std::deque<VertexIdxT<GraphT>>::difference_type GrowLocalSSP<Gra
     }
 }
 
-template <typename GraphT>
-bool GrowLocalSSP<GraphT>::ChanceToFinish(const unsigned superStep) const {
+template <typename GraphT, unsigned staleness_t>
+bool GrowLocalSSP<GraphT, staleness_t>::ChanceToFinish(const unsigned superStep) const {
     bool ans = std::all_of(futureReady_.cbegin(), futureReady_.cend(), [](const auto &deq) { return deq.empty(); });
 
     if (ans) {
@@ -204,13 +204,13 @@ bool GrowLocalSSP<GraphT>::ChanceToFinish(const unsigned superStep) const {
     return ans;
 }
 
-template <typename GraphT>
-ReturnStatus GrowLocalSSP<GraphT>::ComputeSchedule(BspSchedule<GraphT> &schedule) {
+template <typename GraphT, unsigned staleness_t>
+ReturnStatus GrowLocalSSP<GraphT, staleness_t>::ComputeSchedule(BspSchedule<GraphT> &schedule) {
     return MaxBspScheduler<GraphT>::ComputeSchedule(schedule);
 }
 
-template <typename GraphT>
-ReturnStatus GrowLocalSSP<GraphT>::ComputeSchedule(MaxBspSchedule<GraphT> &schedule) {
+template <typename GraphT, unsigned staleness_t>
+ReturnStatus GrowLocalSSP<GraphT, staleness_t>::ComputeSchedule(MaxBspSchedule<GraphT> &schedule) {
     const BspInstance<GraphT> &instance = schedule.GetInstance();
     const GraphT &graph = instance.GetComputationalDag();
     const VertexType numVertices = graph.NumVertices();

From ae1c62b5a601c7b980e254dad8927bcee746780c Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 19 Feb 2026 14:23:21 +0100
Subject: [PATCH 26/57] Drop hard-coded "staleness=2" from scheduler comments

---
 apps/maxbsp_ssp_sptrsv.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 065cf572..a9b3416c 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -89,12 +89,12 @@ int main(int argc, char *argv[]) {
 
     constexpr unsigned staleness = 2U;
 
-    // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2)
+    // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness)
     GreedyVarianceSspScheduler<SparseMatrixImp<int32_t>> ssp_var_scheduler;
     MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_var_schedule(instance);
     ssp_var_scheduler.ComputeSspSchedule(ssp_var_schedule, staleness);
 
-    // Create SSP-aware schedule using GrowLocalMaxBsp (staleness=2)
+    // Create SSP-aware schedule using GrowLocalMaxBsp (staleness)
     GrowLocalSSP<SparseMatrixImp<int32_t>, staleness> ssp_gl_scheduler;
     MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_gl_schedule(instance);
     ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule);

From e4460ee30fb06dae208299a2a68594c86d31dcec Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Thu, 19 Feb 2026 14:42:20 +0100
Subject: [PATCH 27/57] Raise default GrowLocalSSPParams::minSuperstepSize_ from 10 to 20

---
 include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
index ad088a4c..64a5b97f 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
@@ -34,7 +34,7 @@ namespace osp {
 
 template <typename VertT, typename WeightT>
 struct GrowLocalSSPParams {
-    VertT minSuperstepSize_ = 10;
+    VertT minSuperstepSize_ = 20;
     WeightT syncCostMultiplierMinSuperstepWeight_ = 1;
     WeightT syncCostMultiplierParallelCheck_ = 4;
 };

From 8f4c8031722b9b07b45955cc49ff33ba5eac8fb7 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Fri, 20 Feb 2026 12:09:27 +0100
Subject: [PATCH 28/57] Rewrite maxbsp_ssp_sptrsv as a CSV-emitting SpTRSV benchmark

---
 apps/maxbsp_ssp_sptrsv.cpp | 630 ++++++++++++++++++++++++++-----------
 1 file changed, 454 insertions(+), 176 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index a9b3416c..57ab6ae8 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -1,11 +1,30 @@
 /*
  * maxbsp_ssp_sptrsv.cpp
- * Demonstrates maxbsp scheduling with staleness=2, then runs SpTRSV with SSP kernel.
+ * Benchmark for SpTRSV using:
+ *   - variance_ssp
+ *   - growlocal_ssp
+ *   - growlocal
+ *   - eigen_serial
+ *
+ * Outputs per-iteration runtime rows to CSV:
+ * graph,Algorithm,processors,time to compute schedule,schedule supersteps,
+ * schedule synchronization costs,staleness,runtime
  */
 
 #include <Eigen/Sparse>
+#include <algorithm>
 #include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <ctime>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
 #include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
 #include <unsupported/Eigen/SparseExtra>
 #include <vector>
 
@@ -20,25 +39,80 @@
 
 using namespace osp;
 
-#define EPSILON 1e-50
+namespace {
 
-double L2NormalisedDiff(const std::vector<double> &v, const std::vector<double> &w) {
-    assert(v.size() == w.size());
-    double l2diff = 0.0;
-    double frobNorm = 0.0;
-    for (std::size_t i = 0U; i < v.size(); ++i) {
-        const double absdiff = std::abs(v[i] - w[i]);
-        l2diff += absdiff * absdiff;
+constexpr double EPSILON = 1e-12;
+constexpr unsigned kDefaultStaleness = 2U;
 
-        const double vAbs = std::abs(v[i]);
-        const double wAbs = std::abs(w[i]);
+enum class Algorithm {
+    VarianceSsp,
+    GrowLocalSsp,
+    GrowLocal,
+    EigenSerial
+};
+
+struct Args {
+    std::string inputPath;
+    std::string outputCsv = "sptrsv_benchmark.csv";
+    int iterations = 100;
+    unsigned processors = 16U;
+    std::set<Algorithm> algorithms;
+};
+
+struct CsvRow {
+    std::string graph;
+    std::string algorithm;
+    unsigned processors;
+    double scheduleTimeSeconds;
+    unsigned supersteps;
+    double scheduleSyncCosts;
+    unsigned staleness;
+    double runtimeSeconds;
+};
+
+struct SummaryKey {
+    std::string graph;
+    std::string algorithm;
+    unsigned processors;
+    unsigned staleness;
+
+    bool operator<(const SummaryKey &other) const {
+        if (graph != other.graph) {
+            return graph < other.graph;
+        }
+        if (algorithm != other.algorithm) {
+            return algorithm < other.algorithm;
+        }
+        if (processors != other.processors) {
+            return processors < other.processors;
+        }
+        return staleness < other.staleness;
+    }
+};
+
+struct SummaryAgg {
+    double scheduleTimeSeconds = 0.0;
+    unsigned supersteps = 0U;
+    double scheduleSyncCosts = 0.0;
+    double sumLogRuntime = 0.0;
+    std::size_t samples = 0U;
+};
 
-        frobNorm += ((vAbs * vAbs) + (wAbs * wAbs)) / 2.0;
+std::string CsvEscape(const std::string &s) {
+    if (s.find(',') == std::string::npos && s.find('"') == std::string::npos && s.find('\n') == std::string::npos
+        && s.find('\r') == std::string::npos) {
+        return s;
     }
-    l2diff = std::sqrt(l2diff);
-    frobNorm = std::sqrt(frobNorm);
-    const double ratio = l2diff / (frobNorm + EPSILON);
-    return ratio;
+    std::string out = "\"";
+    for (const char c : s) {
+        if (c == '"') {
+            out += "\"\"";
+        } else {
+            out.push_back(c);
+        }
+    }
+    out += "\"";
+    return out;
 }
 
 double LInftyNormalisedDiff(const std::vector<double> &v, const std::vector<double> &w) {
@@ -47,194 +121,398 @@ double LInftyNormalisedDiff(const std::vector<double> &v, const std::vector<doub
         const double absdiff = std::abs(v[i] - w[i]);
         const double vAbs = std::abs(v[i]);
         const double wAbs = std::abs(w[i]);
-
-        diff = std::max(diff, 2 * absdiff / (vAbs + wAbs + EPSILON));
+        diff = std::max(diff, 2.0 * absdiff / (vAbs + wAbs + EPSILON));
     }
     return diff;
 }
 
-int main(int argc, char *argv[]) {
-    // Accept matrix filename and iteration count as arguments (threads via OMP_NUM_THREADS or optional arg)
-    std::string filename = "../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx";
-    int num_iterations = 1;
-    unsigned num_threads = 16U;
-    if (argc > 1) {
-        filename = argv[1];
-    }
-    if (argc > 2) {
-        num_iterations = std::stoi(argv[2]);
-    }
-    if (const char *omp_env = std::getenv("OMP_NUM_THREADS")) {
-        num_threads = static_cast<unsigned>(std::stoul(omp_env));
-    } else if (argc > 3) {
-        num_threads = static_cast<unsigned>(std::stoul(argv[3]));
-    }
-
-    // Load matrix
-    Eigen::SparseMatrix<double, Eigen::RowMajor, int32_t> lCsr;
-    bool matrixLoadSuccess = Eigen::loadMarket(lCsr, filename);
-    if (!matrixLoadSuccess) {
-        std::cerr << "Failed to read matrix from " << filename << std::endl;
-        return 1;
+void PrintUsage(const char *prog) {
+    std::cout << "Usage:\n"
+              << "  " << prog
+              << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>]\n"
+              << "      [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n"
+              << "Examples:\n"
+              << "  " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n"
+              << "  " << prog
+              << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n";
+}
+
+bool ParseArgs(int argc, char *argv[], Args &args) {
+    if (const char *ompEnv = std::getenv("OMP_NUM_THREADS")) {
+        args.processors = static_cast<unsigned>(std::stoul(ompEnv));
     }
-    std::cout << "Loaded matrix of size " << lCsr.rows() << " x " << lCsr.cols() << " with " << lCsr.nonZeros() << " non-zeros.\n";
-
-    // Setup graph and architecture
-    SparseMatrixImp<int32_t> graph;
-    graph.SetCsr(&lCsr);
-    Eigen::SparseMatrix<double, Eigen::ColMajor, int32_t> lCsc = lCsr;
-    graph.SetCsc(&lCsc);
-    BspArchitecture<SparseMatrixImp<int32_t>> architecture(num_threads, 1, 500);    // configurable processors
-    BspInstance<SparseMatrixImp<int32_t>> instance(graph, architecture);
-
-    constexpr unsigned staleness = 2U;
-
-    // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness)
-    GreedyVarianceSspScheduler<SparseMatrixImp<int32_t>> ssp_var_scheduler;
-    MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_var_schedule(instance);
-    ssp_var_scheduler.ComputeSspSchedule(ssp_var_schedule, staleness);
-
-    // Create SSP-aware schedule using GrowLocalMaxBsp (staleness)
-    GrowLocalSSP<SparseMatrixImp<int32_t>, staleness> ssp_gl_scheduler;
-    MaxBspSchedule<SparseMatrixImp<int32_t>> ssp_gl_schedule(instance);
-    ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule);
-
-    // Create a non-SSP schedule using GrowLocalAutoCores
-    GrowLocalAutoCores<SparseMatrixImp<int32_t>> growlocal_scheduler;
-    BspSchedule<SparseMatrixImp<int32_t>> growlocal_schedule(instance);
-    growlocal_scheduler.ComputeSchedule(growlocal_schedule);
-
-    // Setup SpTRSV kernel
-    Sptrsv<int32_t> sptrsv_kernel(instance);
-
-    size_t n = static_cast<size_t>(lCsc.cols());
-
-    // Benchmark SSP Variance L-solve
-    double ssp_var_flat_total_time = 0.0;
-    std::vector<double> ssp_var_flat_result(n, 0.0);
-    for (int iter = 0; iter < num_iterations; ++iter) {
-        std::vector<double> x(n, 0.0);
-        std::vector<double> b(n, 1.0);
-        sptrsv_kernel.SetupCsrNoPermutation(ssp_var_schedule);
-        sptrsv_kernel.x_ = x.data();
-        sptrsv_kernel.b_ = b.data();
-        auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness<staleness>();
-        auto end = std::chrono::high_resolution_clock::now();
-        ssp_var_flat_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) {
-            ssp_var_flat_result = std::vector<double>(x.begin(), x.end());
+
+    for (int i = 1; i < argc; ++i) {
+        const std::string flag = argv[i];
+
+        const bool needsValue
+            = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors");
+        if (needsValue && i + 1 >= argc) {
+            std::cerr << "Missing value for " << flag << "\n";
+            return false;
         }
-    }
-    double ssp_var_flat_avg_time = ssp_var_flat_total_time / num_iterations;
-
-    // Benchmark SSP GrowLocal L-solve
-    double ssp_gl_flat_total_time = 0.0;
-    std::vector<double> ssp_gl_flat_result(n, 0.0);
-    for (int iter = 0; iter < num_iterations; ++iter) {
-        std::vector<double> x(n, 0.0);
-        std::vector<double> b(n, 1.0);
-        sptrsv_kernel.SetupCsrNoPermutation(ssp_gl_schedule);
-        sptrsv_kernel.x_ = x.data();
-        sptrsv_kernel.b_ = b.data();
-        auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.SspLsolveStaleness<staleness>();
-        auto end = std::chrono::high_resolution_clock::now();
-        ssp_gl_flat_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) {
-            ssp_gl_flat_result = std::vector<double>(x.begin(), x.end());
+
+        if (flag == "--input") {
+            args.inputPath = argv[++i];
+        } else if (flag == "--output") {
+            args.outputCsv = argv[++i];
+        } else if (flag == "--iterations") {
+            args.iterations = std::stoi(argv[++i]);
+        } else if (flag == "--processors") {
+            args.processors = static_cast<unsigned>(std::stoul(argv[++i]));
+        } else if (flag == "--variance-ssp") {
+            args.algorithms.insert(Algorithm::VarianceSsp);
+        } else if (flag == "--growlocal-ssp") {
+            args.algorithms.insert(Algorithm::GrowLocalSsp);
+        } else if (flag == "--growlocal") {
+            args.algorithms.insert(Algorithm::GrowLocal);
+        } else if (flag == "--eigen-serial") {
+            args.algorithms.insert(Algorithm::EigenSerial);
+        } else if (flag == "--all") {
+            args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, Algorithm::GrowLocal, Algorithm::EigenSerial};
+        } else if (flag == "--help" || flag == "-h") {
+            PrintUsage(argv[0]);
+            return false;
+        } else {
+            std::cerr << "Unknown option: " << flag << "\n";
+            return false;
         }
     }
-    double ssp_gl_flat_avg_time = ssp_gl_flat_total_time / num_iterations;
-
-    // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation)
-    double growlocal_total_time = 0.0;
-    std::vector<double> growlocal_result(n, 0.0);
-    for (int iter = 0; iter < num_iterations; ++iter) {
-        std::vector<double> x(n, 0.0);
-        std::vector<double> b(n, 1.0);
-        sptrsv_kernel.SetupCsrNoPermutation(growlocal_schedule);
-        sptrsv_kernel.x_ = x.data();
-        sptrsv_kernel.b_ = b.data();
-        auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.LsolveNoPermutation();
-        auto end = std::chrono::high_resolution_clock::now();
-        growlocal_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) {
-            growlocal_result = std::vector<double>(x.begin(), x.end());
+
+    if (args.inputPath.empty()) {
+        std::cerr << "--input is required\n";
+        return false;
+    }
+    if (args.iterations <= 0) {
+        std::cerr << "--iterations must be > 0\n";
+        return false;
+    }
+    if (args.processors == 0U) {
+        std::cerr << "--processors must be > 0\n";
+        return false;
+    }
+    if (args.algorithms.empty()) {
+        std::cerr << "No algorithm selected. Use --all or explicit flags.\n";
+        return false;
+    }
+
+    return true;
+}
+
+std::vector<std::filesystem::path> CollectInputGraphs(const std::string &inputPath) {
+    std::vector<std::filesystem::path> inputs;
+    const std::filesystem::path p(inputPath);
+
+    if (!std::filesystem::exists(p)) {
+        throw std::runtime_error("Input path does not exist: " + inputPath);
+    }
+
+    if (std::filesystem::is_regular_file(p)) {
+        if (p.extension() == ".mtx") {
+            inputs.push_back(p);
         }
+        return inputs;
     }
-    double growlocal_avg_time = growlocal_total_time / num_iterations;
-
-    // Benchmark serial L-solve
-    double serial_total_time = 0.0;
-    std::vector<double> serial_result(n, 0.0);
-    for (int iter = 0; iter < num_iterations; ++iter) {
-        std::vector<double> x_serial(n, 0.0);
-        std::vector<double> b_serial(n, 1.0);
-        sptrsv_kernel.x_ = x_serial.data();
-        sptrsv_kernel.b_ = b_serial.data();
-        auto start = std::chrono::high_resolution_clock::now();
-        sptrsv_kernel.LsolveSerial();
-        auto end = std::chrono::high_resolution_clock::now();
-        serial_total_time += std::chrono::duration<double>(end - start).count();
-        if (iter == 0) {
-            serial_result = std::vector<double>(x_serial.begin(), x_serial.end());
+
+    if (std::filesystem::is_directory(p)) {
+        for (const auto &entry : std::filesystem::recursive_directory_iterator(p)) {
+            if (!entry.is_regular_file()) {
+                continue;
+            }
+            if (entry.path().extension() == ".mtx") {
+                inputs.push_back(entry.path());
+            }
         }
     }
-    double serial_avg_time = serial_total_time / num_iterations;
 
-    // Compare results
-    const double varDiff = LInftyNormalisedDiff(ssp_var_flat_result, serial_result);
+    std::sort(inputs.begin(), inputs.end());
+    return inputs;
+}
+
+void EnsureCsvHeader(std::ofstream &csv) {
+    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness,RuntimeSeconds\n";
+}
 
-    std::cout << "Max relative difference between SSP Variance and serial L-solve: " << varDiff << std::endl;
-    if (varDiff < EPSILON) {
-        std::cout << "SSP Variance L-solve matches serial L-solve!" << std::endl;
-    } else {
-        std::cout << "SSP Variance L-solve does NOT match serial L-solve!" << std::endl;
+void EnsureSummaryCsvHeader(std::ofstream &csv) {
+    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness,"
+           "RuntimeSamples,RuntimeGeometricMeanSeconds\n";
+}
+
+void WriteCsvRow(std::ofstream &csv, const CsvRow &row) {
+    csv << CsvEscape(row.graph) << "," << row.algorithm << "," << row.processors << "," << row.scheduleTimeSeconds << ","
+    << row.supersteps << "," << row.scheduleSyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "\n";
+}
+
+std::string BuildSummaryCsvPath(const std::string &detailPath) {
+    const std::filesystem::path p(detailPath);
+    const std::string stem = p.stem().string();
+    const std::string ext = p.has_extension() ? p.extension().string() : std::string(".csv");
+    const std::filesystem::path summary = p.parent_path() / (stem + "_summary" + ext);
+    return summary.string();
+}
+
+std::string FormatExperimentStartTimestampForFilename() {
+    const std::time_t now = std::time(nullptr);
+    std::tm localTm{};
+#ifdef _WIN32
+    localtime_s(&localTm, &now);
+#else
+    localtime_r(&now, &localTm);
+#endif
+    std::ostringstream oss;
+    oss << std::put_time(&localTm, "%d-%m-%Y_%H%M");
+    return oss.str();
+}
+
+std::string BuildTimestampedCsvPath(const std::string &basePath, const std::string &timestamp) {
+    const std::filesystem::path p(basePath);
+    const std::string stem = p.stem().string();
+    const std::string ext = p.has_extension() ? p.extension().string() : std::string(".csv");
+    const std::filesystem::path out = p.parent_path() / (stem + "_" + timestamp + ext);
+    return out.string();
+}
+
+template <typename ScheduleT>
+double ComputeScheduleSyncCosts(const BspInstance<SparseMatrixImp<int32_t>> &instance, const ScheduleT &schedule) {
+    if (schedule.NumberOfSupersteps() == 0U) {
+        return 0.0;
     }
+    return static_cast<double>(schedule.NumberOfSupersteps() - 1U) * static_cast<double>(instance.SynchronisationCosts());
+}
+
+}    // namespace
 
-    const double GLSSPDiff = LInftyNormalisedDiff(ssp_gl_flat_result, serial_result);
+int main(int argc, char *argv[]) {
+    const std::string experimentStart = FormatExperimentStartTimestampForFilename();
 
-    std::cout << "Max relative difference between SSP GrowLocal and serial L-solve: " << GLSSPDiff << std::endl;
-    if (GLSSPDiff < EPSILON) {
-        std::cout << "SSP GrowLocal L-solve matches serial L-solve!" << std::endl;
-    } else {
-        std::cout << "SSP GrowLocal L-solve does NOT match serial L-solve!" << std::endl;
+    Args args;
+    if (!ParseArgs(argc, argv, args)) {
+        PrintUsage(argv[0]);
+        return 1;
     }
 
-    const double GLPDiff = LInftyNormalisedDiff(growlocal_result, serial_result);
+    std::vector<std::filesystem::path> graphFiles;
+    try {
+        graphFiles = CollectInputGraphs(args.inputPath);
+    } catch (const std::exception &e) {
+        std::cerr << e.what() << std::endl;
+        return 1;
+    }
 
-    std::cout << "Max relative difference between GrowLocal and serial L-solve: " << GLPDiff << std::endl;
-    if (GLPDiff < EPSILON) {
-        std::cout << "GrowLocal L-solve matches serial L-solve!" << std::endl;
-    } else {
-        std::cout << "GrowLocal L-solve does NOT match serial L-solve!" << std::endl;
+    if (graphFiles.empty()) {
+        std::cerr << "No .mtx files found at input path: " << args.inputPath << std::endl;
+        return 1;
     }
 
-    std::cout << "Average SSP Variance L-solve time (" << num_iterations << " runs): " << ssp_var_flat_avg_time << " seconds"
-              << std::endl;
-    std::cout << "Average SSP GrowLocal L-solve time (" << num_iterations << " runs): " << ssp_gl_flat_avg_time << " seconds"
-              << std::endl;
-    std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time << " seconds"
-              << std::endl;
-    std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl << std::endl;
+    const std::string detailCsvPath = BuildTimestampedCsvPath(args.outputCsv, experimentStart);
+    std::ofstream csv(detailCsvPath, std::ios::out | std::ios::trunc);
+    if (!csv.is_open()) {
+        std::cerr << "Failed to open CSV output: " << detailCsvPath << std::endl;
+        return 1;
+    }
+    EnsureCsvHeader(csv);
 
-    if (ssp_var_flat_avg_time > 0.0) {
-        std::cout << "Speedup (serial/SSP Var): " << (serial_avg_time / ssp_var_flat_avg_time) << "x" << std::endl;
+    const std::string summaryCsvPath = BuildSummaryCsvPath(detailCsvPath);
+    std::ofstream summaryCsv(summaryCsvPath, std::ios::out | std::ios::trunc);
+    if (!summaryCsv.is_open()) {
+        std::cerr << "Failed to open summary CSV output: " << summaryCsvPath << std::endl;
+        return 1;
     }
-    if (ssp_gl_flat_avg_time > 0.0) {
-        std::cout << "Speedup (serial/SSP GL): " << (serial_avg_time / ssp_gl_flat_avg_time) << "x" << std::endl;
+    EnsureSummaryCsvHeader(summaryCsv);
+
+    std::cout << "Running benchmark on " << graphFiles.size() << " graph(s), iterations=" << args.iterations
+              << ", processors=" << args.processors << std::endl;
+    std::cout << "Experiment id timestamp: " << experimentStart << std::endl;
+
+    std::vector<CsvRow> bufferedRows;
+    bufferedRows.reserve(graphFiles.size() * args.algorithms.size() * static_cast<std::size_t>(args.iterations));
+
+    for (const auto &graphPath : graphFiles) {
+        const std::string graphName = graphPath.filename().string();
+
+        Eigen::SparseMatrix<double, Eigen::RowMajor, int32_t> lCsr;
+        if (!Eigen::loadMarket(lCsr, graphPath.string())) {
+            std::cerr << "Failed to load matrix: " << graphPath << std::endl;
+            continue;
+        }
+
+        Eigen::SparseMatrix<double, Eigen::ColMajor, int32_t> lCsc = lCsr;
+
+        SparseMatrixImp<int32_t> graph;
+        graph.SetCsr(&lCsr);
+        graph.SetCsc(&lCsc);
+
+        BspArchitecture<SparseMatrixImp<int32_t>> architecture(args.processors, 1, 500);
+        BspInstance<SparseMatrixImp<int32_t>> instance(graph, architecture);
+
+        Sptrsv<int32_t> sptrsv(instance);
+        const std::size_t n = static_cast<std::size_t>(lCsr.cols());
+
+        std::vector<double> serialRefX(n, 0.0);
+        std::vector<double> serialB(n, 1.0);
+        sptrsv.x_ = serialRefX.data();
+        sptrsv.b_ = serialB.data();
+        sptrsv.LsolveSerial();
+
+        std::cout << "Graph: " << graphName << " (" << lCsr.rows() << "x" << lCsr.cols() << ", nnz=" << lCsr.nonZeros() << ")\n";
+
+        if (args.algorithms.count(Algorithm::VarianceSsp) > 0U) {
+            GreedyVarianceSspScheduler<SparseMatrixImp<int32_t>> scheduler;
+            MaxBspSchedule<SparseMatrixImp<int32_t>> schedule(instance);
+
+            const auto t0 = std::chrono::high_resolution_clock::now();
+            scheduler.ComputeSspSchedule(schedule, kDefaultStaleness);
+            const auto t1 = std::chrono::high_resolution_clock::now();
+            const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
+
+            sptrsv.SetupCsrNoPermutation(schedule);
+            const unsigned supersteps = schedule.NumberOfSupersteps();
+            const double syncCosts = ComputeScheduleSyncCosts(instance, schedule);
+
+            for (int iter = 0; iter < args.iterations; ++iter) {
+                std::vector<double> x(n, 0.0);
+                std::vector<double> b(n, 1.0);
+                sptrsv.x_ = x.data();
+                sptrsv.b_ = b.data();
+
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.SspLsolveStaleness<kDefaultStaleness>();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
+
+                if (iter == 0) {
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    std::cout << "  variance_ssp first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                bufferedRows.push_back(CsvRow{graphName,
+                                              "variance_ssp",
+                                              args.processors,
+                                              scheduleTime,
+                                              supersteps,
+                                              syncCosts,
+                                              kDefaultStaleness,
+                                              runtime});
+            }
+        }
+
+        if (args.algorithms.count(Algorithm::GrowLocalSsp) > 0U) {
+            GrowLocalSSP<SparseMatrixImp<int32_t>, kDefaultStaleness> scheduler;
+            MaxBspSchedule<SparseMatrixImp<int32_t>> schedule(instance);
+
+            const auto t0 = std::chrono::high_resolution_clock::now();
+            scheduler.ComputeSchedule(schedule);
+            const auto t1 = std::chrono::high_resolution_clock::now();
+            const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
+
+            sptrsv.SetupCsrNoPermutation(schedule);
+            const unsigned supersteps = schedule.NumberOfSupersteps();
+            const double syncCosts = ComputeScheduleSyncCosts(instance, schedule);
+
+            for (int iter = 0; iter < args.iterations; ++iter) {
+                std::vector<double> x(n, 0.0);
+                std::vector<double> b(n, 1.0);
+                sptrsv.x_ = x.data();
+                sptrsv.b_ = b.data();
+
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.SspLsolveStaleness<kDefaultStaleness>();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
+
+                if (iter == 0) {
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    std::cout << "  growlocal_ssp first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                bufferedRows.push_back(CsvRow{graphName,
+                                              "growlocal_ssp",
+                                              args.processors,
+                                              scheduleTime,
+                                              supersteps,
+                                              syncCosts,
+                                              kDefaultStaleness,
+                                              runtime});
+            }
+        }
+
+        if (args.algorithms.count(Algorithm::GrowLocal) > 0U) {
+            GrowLocalAutoCores<SparseMatrixImp<int32_t>> scheduler;
+            BspSchedule<SparseMatrixImp<int32_t>> schedule(instance);
+
+            const auto t0 = std::chrono::high_resolution_clock::now();
+            scheduler.ComputeSchedule(schedule);
+            const auto t1 = std::chrono::high_resolution_clock::now();
+            const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
+
+            sptrsv.SetupCsrNoPermutation(schedule);
+            const unsigned supersteps = schedule.NumberOfSupersteps();
+            const double syncCosts = ComputeScheduleSyncCosts(instance, schedule);
+
+            for (int iter = 0; iter < args.iterations; ++iter) {
+                std::vector<double> x(n, 0.0);
+                std::vector<double> b(n, 1.0);
+                sptrsv.x_ = x.data();
+                sptrsv.b_ = b.data();
+
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.LsolveNoPermutation();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
+
+                if (iter == 0) {
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    std::cout << "  growlocal first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                bufferedRows.push_back(CsvRow{
+                    graphName, "growlocal", args.processors, scheduleTime, supersteps, syncCosts, 1U, runtime});
+            }
+        }
+
+        if (args.algorithms.count(Algorithm::EigenSerial) > 0U) {
+            for (int iter = 0; iter < args.iterations; ++iter) {
+                std::vector<double> x(n, 0.0);
+                std::vector<double> b(n, 1.0);
+                sptrsv.x_ = x.data();
+                sptrsv.b_ = b.data();
+
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.LsolveSerial();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
+
+                bufferedRows.push_back(CsvRow{graphName, "eigen_serial", 1U, 0.0, 1U, 0.0, 0U, runtime});
+            }
+        }
     }
-    if (growlocal_avg_time > 0.0) {
-        std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl;
+
+    for (const CsvRow &row : bufferedRows) {
+        WriteCsvRow(csv, row);
     }
-    if (ssp_var_flat_avg_time > 0.0) {
-        std::cout << "Speedup (GrowLocalAutoCores/SSP Var): " << (growlocal_avg_time / ssp_var_flat_avg_time) << "x" << std::endl;
+
+    std::map<SummaryKey, SummaryAgg> summary;
+    constexpr double kMinRuntime = 1e-15;
+    for (const CsvRow &row : bufferedRows) {
+        SummaryKey key{row.graph, row.algorithm, row.processors, row.staleness};
+        SummaryAgg &agg = summary[key];
+        if (agg.samples == 0U) {
+            agg.scheduleTimeSeconds = row.scheduleTimeSeconds;
+            agg.supersteps = row.supersteps;
+            agg.scheduleSyncCosts = row.scheduleSyncCosts;
+        }
+        agg.sumLogRuntime += std::log(std::max(row.runtimeSeconds, kMinRuntime));
+        ++agg.samples;
     }
-    if (ssp_gl_flat_avg_time > 0.0) {
-        std::cout << "Speedup (GrowLocalAutoCores/SSP GL): " << (growlocal_avg_time / ssp_gl_flat_avg_time) << "x" << std::endl;
+
+    for (const auto &[key, agg] : summary) {
+        const double geomean = std::exp(agg.sumLogRuntime / static_cast<double>(agg.samples));
+        summaryCsv << CsvEscape(key.graph) << "," << key.algorithm << "," << key.processors << "," << agg.scheduleTimeSeconds
+               << "," << agg.supersteps << "," << agg.scheduleSyncCosts << "," << key.staleness
+                   << "," << agg.samples << "," << geomean << "\n";
     }
 
+    std::cout << "Benchmark complete. CSV written to: " << detailCsvPath << std::endl;
+    std::cout << "Summary CSV written to: " << summaryCsvPath << std::endl;
     return 0;
 }

From 65d169b0514e20c984fef99c9556767f80f0beb2 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Fri, 20 Feb 2026 14:34:04 +0100
Subject: [PATCH 29/57] benchmark additions and checkpoints

---
 apps/maxbsp_ssp_sptrsv.cpp | 179 ++++++++++++++++++++++++-------------
 1 file changed, 118 insertions(+), 61 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 57ab6ae8..988653be 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -21,6 +21,7 @@
 #include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <iterator>
 #include <map>
 #include <set>
 #include <sstream>
@@ -43,12 +44,15 @@ namespace {
 
 constexpr double EPSILON = 1e-12;
 constexpr unsigned kDefaultStaleness = 2U;
+constexpr int defaultSynchronisationCosts = 500;
+
+constexpr int preMeasureIterations = 2;
 
 enum class Algorithm {
     VarianceSsp,
     GrowLocalSsp,
     GrowLocal,
-    EigenSerial
+    Serial
 };
 
 struct Args {
@@ -65,9 +69,10 @@ struct CsvRow {
     unsigned processors;
     double scheduleTimeSeconds;
     unsigned supersteps;
-    double scheduleSyncCosts;
+    int SyncCosts;
     unsigned staleness;
     double runtimeSeconds;
+    bool correctness;
 };
 
 struct SummaryKey {
@@ -93,9 +98,10 @@ struct SummaryKey {
 struct SummaryAgg {
     double scheduleTimeSeconds = 0.0;
     unsigned supersteps = 0U;
-    double scheduleSyncCosts = 0.0;
+    int SyncCosts = 0;
     double sumLogRuntime = 0.0;
     std::size_t samples = 0U;
+    bool correctness = false;
 };
 
 std::string CsvEscape(const std::string &s) {
@@ -167,9 +173,9 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
         } else if (flag == "--growlocal") {
             args.algorithms.insert(Algorithm::GrowLocal);
         } else if (flag == "--eigen-serial") {
-            args.algorithms.insert(Algorithm::EigenSerial);
+            args.algorithms.insert(Algorithm::Serial);
         } else if (flag == "--all") {
-            args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, Algorithm::GrowLocal, Algorithm::EigenSerial};
+            args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, Algorithm::GrowLocal, Algorithm::Serial};
         } else if (flag == "--help" || flag == "-h") {
             PrintUsage(argv[0]);
             return false;
@@ -201,7 +207,11 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
 
 std::vector<std::filesystem::path> CollectInputGraphs(const std::string &inputPath) {
     std::vector<std::filesystem::path> inputs;
-    const std::filesystem::path p(inputPath);
+    std::filesystem::path p(inputPath);
+
+    while (std::filesystem::exists(p) && std::filesystem::is_symlink(p)) {
+        p = std::filesystem::read_symlink(p);
+    }
 
     if (!std::filesystem::exists(p)) {
         throw std::runtime_error("Input path does not exist: " + inputPath);
@@ -211,16 +221,18 @@ std::vector<std::filesystem::path> CollectInputGraphs(const std::string &inputPa
         if (p.extension() == ".mtx") {
             inputs.push_back(p);
         }
-        return inputs;
-    }
-
-    if (std::filesystem::is_directory(p)) {
+    } else if (std::filesystem::is_directory(p)) {
         for (const auto &entry : std::filesystem::recursive_directory_iterator(p)) {
-            if (!entry.is_regular_file()) {
+            auto entryPath = entry.path();
+            while (std::filesystem::exists(entryPath) && std::filesystem::is_symlink(entryPath)) {
+                entryPath = std::filesystem::read_symlink(entryPath);
+            }
+
+            if (!std::filesystem::is_regular_file(entryPath)) {
                 continue;
             }
-            if (entry.path().extension() == ".mtx") {
-                inputs.push_back(entry.path());
+            if (entryPath.extension() == ".mtx") {
+                inputs.push_back(entryPath);
             }
         }
     }
@@ -230,17 +242,17 @@ std::vector<std::filesystem::path> CollectInputGraphs(const std::string &inputPa
 }
 
 void EnsureCsvHeader(std::ofstream &csv) {
-    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness,RuntimeSeconds\n";
+    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness,RuntimeSeconds,Correctness\n";
 }
 
 void EnsureSummaryCsvHeader(std::ofstream &csv) {
-    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness,"
-           "RuntimeSamples,RuntimeGeometricMeanSeconds\n";
+    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness,"
+           "RuntimeSamples,RuntimeGeometricMeanSeconds,Correctness\n";
 }
 
 void WriteCsvRow(std::ofstream &csv, const CsvRow &row) {
     csv << CsvEscape(row.graph) << "," << row.algorithm << "," << row.processors << "," << row.scheduleTimeSeconds << ","
-    << row.supersteps << "," << row.scheduleSyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "\n";
+    << row.supersteps << "," << row.SyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "," << row.correctness << "\n";
 }
 
 std::string BuildSummaryCsvPath(const std::string &detailPath) {
@@ -272,12 +284,8 @@ std::string BuildTimestampedCsvPath(const std::string &basePath, const std::stri
     return out.string();
 }
 
-template <typename ScheduleT>
-double ComputeScheduleSyncCosts(const BspInstance<SparseMatrixImp<int32_t>> &instance, const ScheduleT &schedule) {
-    if (schedule.NumberOfSupersteps() == 0U) {
-        return 0.0;
-    }
-    return static_cast<double>(schedule.NumberOfSupersteps() - 1U) * static_cast<double>(instance.SynchronisationCosts());
+int ComputeSyncCosts(const BspInstance<SparseMatrixImp<int32_t>> &instance) {
+    return instance.GetArchitecture().SynchronisationCosts();
 }
 
 }    // namespace
@@ -326,6 +334,7 @@ int main(int argc, char *argv[]) {
 
     std::vector<CsvRow> bufferedRows;
     bufferedRows.reserve(graphFiles.size() * args.algorithms.size() * static_cast<std::size_t>(args.iterations));
+    typename std::vector<CsvRow>::difference_type writtenEntries = 0U;
 
     for (const auto &graphPath : graphFiles) {
         const std::string graphName = graphPath.filename().string();
@@ -342,7 +351,7 @@ int main(int argc, char *argv[]) {
         graph.SetCsr(&lCsr);
         graph.SetCsc(&lCsc);
 
-        BspArchitecture<SparseMatrixImp<int32_t>> architecture(args.processors, 1, 500);
+        BspArchitecture<SparseMatrixImp<int32_t>> architecture(args.processors, 1, defaultSynchronisationCosts);
         BspInstance<SparseMatrixImp<int32_t>> instance(graph, architecture);
 
         Sptrsv<int32_t> sptrsv(instance);
@@ -367,9 +376,10 @@ int main(int argc, char *argv[]) {
 
             sptrsv.SetupCsrNoPermutation(schedule);
             const unsigned supersteps = schedule.NumberOfSupersteps();
-            const double syncCosts = ComputeScheduleSyncCosts(instance, schedule);
+            const int syncCosts = ComputeSyncCosts(instance);
 
-            for (int iter = 0; iter < args.iterations; ++iter) {
+            bool correct = false;
+            for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
                 std::vector<double> x(n, 0.0);
                 std::vector<double> b(n, 1.0);
                 sptrsv.x_ = x.data();
@@ -382,17 +392,26 @@ int main(int argc, char *argv[]) {
 
                 if (iter == 0) {
                     const double diff = LInftyNormalisedDiff(x, serialRefX);
-                    std::cout << "  variance_ssp first-run max relative diff vs serial: " << diff << std::endl;
+                    correct = (diff < EPSILON);
+                    std::cout << "  Variance_SSP first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                if (iter >= preMeasureIterations) {
+                    bufferedRows.emplace_back(CsvRow{graphName,
+                                                     "Variance_SSP",
+                                                     args.processors,
+                                                     scheduleTime,
+                                                     supersteps,
+                                                     syncCosts,
+                                                     kDefaultStaleness,
+                                                     runtime,
+                                                     correct});
                 }
+            }
 
-                bufferedRows.push_back(CsvRow{graphName,
-                                              "variance_ssp",
-                                              args.processors,
-                                              scheduleTime,
-                                              supersteps,
-                                              syncCosts,
-                                              kDefaultStaleness,
-                                              runtime});
+            for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
+                WriteCsvRow(csv, *it);
+                ++writtenEntries;
             }
         }
 
@@ -407,9 +426,10 @@ int main(int argc, char *argv[]) {
 
             sptrsv.SetupCsrNoPermutation(schedule);
             const unsigned supersteps = schedule.NumberOfSupersteps();
-            const double syncCosts = ComputeScheduleSyncCosts(instance, schedule);
+            const int syncCosts = ComputeSyncCosts(instance);
 
-            for (int iter = 0; iter < args.iterations; ++iter) {
+            bool correct = false;
+            for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
                 std::vector<double> x(n, 0.0);
                 std::vector<double> b(n, 1.0);
                 sptrsv.x_ = x.data();
@@ -422,17 +442,26 @@ int main(int argc, char *argv[]) {
 
                 if (iter == 0) {
                     const double diff = LInftyNormalisedDiff(x, serialRefX);
-                    std::cout << "  growlocal_ssp first-run max relative diff vs serial: " << diff << std::endl;
+                    correct = (diff < EPSILON);
+                    std::cout << "  Growlocal_SSP first-run max relative diff vs serial: " << diff << std::endl;
                 }
 
-                bufferedRows.push_back(CsvRow{graphName,
-                                              "growlocal_ssp",
-                                              args.processors,
-                                              scheduleTime,
-                                              supersteps,
-                                              syncCosts,
-                                              kDefaultStaleness,
-                                              runtime});
+                if (iter >= preMeasureIterations) {
+                    bufferedRows.emplace_back(CsvRow{graphName,
+                                                     "Growlocal_SSP",
+                                                     args.processors,
+                                                     scheduleTime,
+                                                     supersteps,
+                                                     syncCosts,
+                                                     kDefaultStaleness,
+                                                     runtime,
+                                                     correct});
+                }
+            }
+
+            for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
+                WriteCsvRow(csv, *it);
+                ++writtenEntries;
             }
         }
 
@@ -447,9 +476,10 @@ int main(int argc, char *argv[]) {
 
             sptrsv.SetupCsrNoPermutation(schedule);
             const unsigned supersteps = schedule.NumberOfSupersteps();
-            const double syncCosts = ComputeScheduleSyncCosts(instance, schedule);
+            const int syncCosts = ComputeSyncCosts(instance);
 
-            for (int iter = 0; iter < args.iterations; ++iter) {
+            bool correct;
+            for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
                 std::vector<double> x(n, 0.0);
                 std::vector<double> b(n, 1.0);
                 sptrsv.x_ = x.data();
@@ -462,16 +492,31 @@ int main(int argc, char *argv[]) {
 
                 if (iter == 0) {
                     const double diff = LInftyNormalisedDiff(x, serialRefX);
-                    std::cout << "  growlocal first-run max relative diff vs serial: " << diff << std::endl;
+                    correct = (diff < EPSILON);
+                    std::cout << "  Growlocal first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                if (iter >= preMeasureIterations) {
+                    bufferedRows.emplace_back(CsvRow{graphName,
+                                                     "Growlocal",
+                                                     args.processors,
+                                                     scheduleTime,
+                                                     supersteps,
+                                                     syncCosts,
+                                                     1U,
+                                                     runtime,
+                                                     correct});
                 }
+            }
 
-                bufferedRows.push_back(CsvRow{
-                    graphName, "growlocal", args.processors, scheduleTime, supersteps, syncCosts, 1U, runtime});
+            for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
+                WriteCsvRow(csv, *it);
+                ++writtenEntries;
             }
         }
 
-        if (args.algorithms.count(Algorithm::EigenSerial) > 0U) {
-            for (int iter = 0; iter < args.iterations; ++iter) {
+        if (args.algorithms.count(Algorithm::Serial) > 0U) {
+            for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
                 std::vector<double> x(n, 0.0);
                 std::vector<double> b(n, 1.0);
                 sptrsv.x_ = x.data();
@@ -482,13 +527,24 @@ int main(int argc, char *argv[]) {
                 const auto e = std::chrono::high_resolution_clock::now();
                 const double runtime = std::chrono::duration<double>(e - s).count();
 
-                bufferedRows.push_back(CsvRow{graphName, "eigen_serial", 1U, 0.0, 1U, 0.0, 0U, runtime});
+                if (iter >= preMeasureIterations) {
+                    bufferedRows.emplace_back(CsvRow{graphName,
+                                                     "Serial",
+                                                     1U,
+                                                     0.0,
+                                                     1U,
+                                                     0,
+                                                     1U,
+                                                     runtime,
+                                                     true});
+                }
             }
-        }
-    }
 
-    for (const CsvRow &row : bufferedRows) {
-        WriteCsvRow(csv, row);
+            for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
+                WriteCsvRow(csv, *it);
+                ++writtenEntries;
+            }
+        }
     }
 
     std::map<SummaryKey, SummaryAgg> summary;
@@ -499,7 +555,8 @@ int main(int argc, char *argv[]) {
         if (agg.samples == 0U) {
             agg.scheduleTimeSeconds = row.scheduleTimeSeconds;
             agg.supersteps = row.supersteps;
-            agg.scheduleSyncCosts = row.scheduleSyncCosts;
+            agg.SyncCosts = row.SyncCosts;
+            agg.correctness = row.correctness;
         }
         agg.sumLogRuntime += std::log(std::max(row.runtimeSeconds, kMinRuntime));
         ++agg.samples;
@@ -508,8 +565,8 @@ int main(int argc, char *argv[]) {
     for (const auto &[key, agg] : summary) {
         const double geomean = std::exp(agg.sumLogRuntime / static_cast<double>(agg.samples));
         summaryCsv << CsvEscape(key.graph) << "," << key.algorithm << "," << key.processors << "," << agg.scheduleTimeSeconds
-               << "," << agg.supersteps << "," << agg.scheduleSyncCosts << "," << key.staleness
-                   << "," << agg.samples << "," << geomean << "\n";
+               << "," << agg.supersteps << "," << agg.SyncCosts << "," << key.staleness
+                   << "," << agg.samples << "," << geomean << "," << agg.correctness << "\n";
     }
 
     std::cout << "Benchmark complete. CSV written to: " << detailCsvPath << std::endl;

From 7ad26e7191a3e8c7202382914bdcc46cfa2aabb7 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Mon, 23 Feb 2026 17:06:52 +0100
Subject: [PATCH 30/57] wait at barrier only if there is something to compute

---
 include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 0fbc80c5..d2461e7f 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -502,10 +502,12 @@ class Sptrsv {
         {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
-                // Enforce staleness window before starting this superstep.
-                barrier.Wait(proc, staleness - 1U);
                 // Process nodes assigned to this (step, proc) pair.
                 const size_t boundsStrSize = boundsArrayL_[step][proc].size();
+                // Enforce staleness window before starting this superstep.
+                if (boundsStrSize > 0U) {
+                    barrier.Wait(proc, staleness - 1U);
+                }
                 for (size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType lowerB = boundsArrayL_[step][proc][index];
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];

From 54c492887bc749c913bc43d7aa928fcdb01908c9 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 24 Feb 2026 09:43:05 +0100
Subject: [PATCH 31/57] improved busy waiting

---
 .../sptrsv_simulator/WeakBarriers/cpu_relax.hpp      | 12 +++++-------
 .../WeakBarriers/flat_checkpoint_counter_barrier.hpp |  6 +-----
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp
index d9e5e268..7b1c79ca 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp
@@ -18,21 +18,19 @@ limitations under the License.
 
 #pragma once
 
-#include <thread>
-
-#if defined(__x86_64__) || defined(_M_X64)
-#    include <immintrin.h>
-#endif
-
 namespace osp {
 
 // Portable cpu_relax definition
 #if defined(__x86_64__) || defined(_M_X64)
+#    include <immintrin.h>
 inline void cpu_relax() { _mm_pause(); }
 #elif defined(__aarch64__)
+inline void cpu_relax() { asm volatile("isb" ::: "memory"); }
+#elif defined(__arm__)
 inline void cpu_relax() { asm volatile("yield" ::: "memory"); }
 #else
-inline void cpu_relax() { std::this_thread::yield(); }
+#include <atomic>
+inline void cpu_relax() { std::atomic_signal_fence(std::memory_order_acquire); }
 #endif
 
 }    // end namespace osp
diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
index 5b25acd3..533f6845 100644
--- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp
@@ -78,13 +78,9 @@ inline void FlatCheckpointCounterBarrier::Wait(const std::size_t threadId, const
     const std::size_t minVal = std::max(localCachedCntrs[threadId], diff) - diff;
 
     for (std::size_t ind = 0U; ind < cntrs_.size(); ++ind) {
-        std::size_t loopCntr = 0U;
         while ((localCachedCntrs[ind] < minVal)
                && ((localCachedCntrs[ind] = cntrs_[ind].cntr_.load(std::memory_order_acquire)) < minVal)) {
-            ++loopCntr;
-            if (loopCntr % 128U == 0U) {
-                cpu_relax();
-            }
+            cpu_relax();
         }
     }
 }

From 707ea5ea2bc6a44c9b2fe889420af224cdb7fb13 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 11 Mar 2026 11:13:48 +0100
Subject: [PATCH 32/57] Adding usolve ssp sptrsv

---
 apps/maxbsp_ssp_sptrsv.cpp                    | 272 ++++++++++++++----
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp |  42 +++
 2 files changed, 256 insertions(+), 58 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 988653be..272411ba 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -1,6 +1,6 @@
 /*
  * maxbsp_ssp_sptrsv.cpp
- * Benchmark for SpTRSV using:
+ * Benchmark for SpTRSV (Lsolve + Usolve) using:
  *   - variance_ssp
  *   - growlocal_ssp
  *   - growlocal
@@ -13,6 +13,7 @@
 
 #include <Eigen/Sparse>
 #include <algorithm>
+#include <cctype>
 #include <chrono>
 #include <cstdlib>
 #include <cmath>
@@ -60,6 +61,7 @@ struct Args {
     std::string outputCsv = "sptrsv_benchmark.csv";
     int iterations = 100;
     unsigned processors = 16U;
+    bool runUsolve = true;
     std::set<Algorithm> algorithms;
 };
 
@@ -135,12 +137,29 @@ double LInftyNormalisedDiff(const std::vector<double> &v, const std::vector<doub
 void PrintUsage(const char *prog) {
     std::cout << "Usage:\n"
               << "  " << prog
-              << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>]\n"
+              << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>] [--run-usolve <0|1>]\n"
               << "      [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n"
               << "Examples:\n"
               << "  " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n"
               << "  " << prog
-              << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n";
+              << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --run-usolve 0 --variance-ssp --growlocal-ssp --growlocal\n";
+}
+
+bool ParseBoolValue(const std::string &value, bool &parsed) {
+    std::string normalised = value;
+    std::transform(normalised.begin(), normalised.end(), normalised.begin(), [](unsigned char c) {
+        return static_cast<char>(std::tolower(c));
+    });
+
+    if (normalised == "1" || normalised == "true" || normalised == "yes" || normalised == "on") {
+        parsed = true;
+        return true;
+    }
+    if (normalised == "0" || normalised == "false" || normalised == "no" || normalised == "off") {
+        parsed = false;
+        return true;
+    }
+    return false;
 }
 
 bool ParseArgs(int argc, char *argv[], Args &args) {
@@ -151,8 +170,8 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
     for (int i = 1; i < argc; ++i) {
         const std::string flag = argv[i];
 
-        const bool needsValue
-            = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors");
+        const bool needsValue = (flag == "--input" || flag == "--output" || flag == "--iterations"
+                                 || flag == "--processors" || flag == "--run-usolve");
         if (needsValue && i + 1 >= argc) {
             std::cerr << "Missing value for " << flag << "\n";
             return false;
@@ -166,6 +185,13 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
             args.iterations = std::stoi(argv[++i]);
         } else if (flag == "--processors") {
             args.processors = static_cast<unsigned>(std::stoul(argv[++i]));
+        } else if (flag == "--run-usolve") {
+            bool parsed = false;
+            if (!ParseBoolValue(argv[++i], parsed)) {
+                std::cerr << "Invalid value for --run-usolve. Use 0/1, false/true, no/yes, or off/on.\n";
+                return false;
+            }
+            args.runUsolve = parsed;
         } else if (flag == "--variance-ssp") {
             args.algorithms.insert(Algorithm::VarianceSsp);
         } else if (flag == "--growlocal-ssp") {
@@ -329,11 +355,12 @@ int main(int argc, char *argv[]) {
     EnsureSummaryCsvHeader(summaryCsv);
 
     std::cout << "Running benchmark on " << graphFiles.size() << " graph(s), iterations=" << args.iterations
-              << ", processors=" << args.processors << std::endl;
+              << ", processors=" << args.processors << ", run-usolve=" << (args.runUsolve ? "1" : "0") << std::endl;
     std::cout << "Experiment id timestamp: " << experimentStart << std::endl;
 
     std::vector<CsvRow> bufferedRows;
-    bufferedRows.reserve(graphFiles.size() * args.algorithms.size() * static_cast<std::size_t>(args.iterations));
+    bufferedRows.reserve((args.runUsolve ? 2U : 1U) * graphFiles.size() * args.algorithms.size()
+                        * static_cast<std::size_t>(args.iterations));
     typename std::vector<CsvRow>::difference_type writtenEntries = 0U;
 
     for (const auto &graphPath : graphFiles) {
@@ -357,12 +384,21 @@ int main(int argc, char *argv[]) {
         Sptrsv<int32_t> sptrsv(instance);
         const std::size_t n = static_cast<std::size_t>(lCsr.cols());
 
-        std::vector<double> serialRefX(n, 0.0);
-        std::vector<double> serialB(n, 1.0);
-        sptrsv.x_ = serialRefX.data();
-        sptrsv.b_ = serialB.data();
+        std::vector<double> serialRefXL(n, 0.0);
+        std::vector<double> serialBL(n, 1.0);
+        sptrsv.x_ = serialRefXL.data();
+        sptrsv.b_ = serialBL.data();
         sptrsv.LsolveSerial();
 
+        std::vector<double> serialRefXU;
+        if (args.runUsolve) {
+            std::vector<double> serialBU(n, 1.0);
+            serialRefXU.assign(n, 0.0);
+            sptrsv.x_ = serialRefXU.data();
+            sptrsv.b_ = serialBU.data();
+            sptrsv.UsolveSerial();
+        }
+
         std::cout << "Graph: " << graphName << " (" << lCsr.rows() << "x" << lCsr.cols() << ", nnz=" << lCsr.nonZeros() << ")\n";
 
         if (args.algorithms.count(Algorithm::VarianceSsp) > 0U) {
@@ -378,22 +414,23 @@ int main(int argc, char *argv[]) {
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
 
-            bool correct = false;
+            bool correctL = false;
+            bool correctU = false;
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 0.0);
-                std::vector<double> b(n, 1.0);
-                sptrsv.x_ = x.data();
-                sptrsv.b_ = b.data();
+                std::vector<double> xL(n, 0.0);
+                std::vector<double> bL(n, 1.0);
+                sptrsv.x_ = xL.data();
+                sptrsv.b_ = bL.data();
 
-                const auto s = std::chrono::high_resolution_clock::now();
+                const auto sL = std::chrono::high_resolution_clock::now();
                 sptrsv.SspLsolveStaleness<kDefaultStaleness>();
-                const auto e = std::chrono::high_resolution_clock::now();
-                const double runtime = std::chrono::duration<double>(e - s).count();
+                const auto eL = std::chrono::high_resolution_clock::now();
+                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
 
                 if (iter == 0) {
-                    const double diff = LInftyNormalisedDiff(x, serialRefX);
-                    correct = (diff < EPSILON);
-                    std::cout << "  Variance_SSP first-run max relative diff vs serial: " << diff << std::endl;
+                    const double diffL = LInftyNormalisedDiff(xL, serialRefXL);
+                    correctL = (diffL < EPSILON);
+                    std::cout << "  Variance_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl;
                 }
 
                 if (iter >= preMeasureIterations) {
@@ -404,8 +441,39 @@ int main(int argc, char *argv[]) {
                                                      supersteps,
                                                      syncCosts,
                                                      kDefaultStaleness,
-                                                     runtime,
-                                                     correct});
+                                                     runtimeL,
+                                                     correctL});
+                }
+
+                if (args.runUsolve) {
+                    std::vector<double> xU(n, 0.0);
+                    std::vector<double> bU(n, 1.0);
+                    sptrsv.x_ = xU.data();
+                    sptrsv.b_ = bU.data();
+
+                    const auto sU = std::chrono::high_resolution_clock::now();
+                    sptrsv.SspUsolveStaleness<kDefaultStaleness>();
+                    const auto eU = std::chrono::high_resolution_clock::now();
+                    const double runtimeU = std::chrono::duration<double>(eU - sU).count();
+
+                    if (iter == 0) {
+                        const double diffU = LInftyNormalisedDiff(xU, serialRefXU);
+                        correctU = (diffU < EPSILON);
+                        std::cout << "  Variance_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU
+                                  << std::endl;
+                    }
+
+                    if (iter >= preMeasureIterations) {
+                        bufferedRows.emplace_back(CsvRow{graphName,
+                                                         "Variance_SSP_Usolve",
+                                                         args.processors,
+                                                         scheduleTime,
+                                                         supersteps,
+                                                         syncCosts,
+                                                         kDefaultStaleness,
+                                                         runtimeU,
+                                                         correctU});
+                    }
                 }
             }
 
@@ -428,22 +496,23 @@ int main(int argc, char *argv[]) {
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
 
-            bool correct = false;
+            bool correctL = false;
+            bool correctU = false;
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 0.0);
-                std::vector<double> b(n, 1.0);
-                sptrsv.x_ = x.data();
-                sptrsv.b_ = b.data();
+                std::vector<double> xL(n, 0.0);
+                std::vector<double> bL(n, 1.0);
+                sptrsv.x_ = xL.data();
+                sptrsv.b_ = bL.data();
 
-                const auto s = std::chrono::high_resolution_clock::now();
+                const auto sL = std::chrono::high_resolution_clock::now();
                 sptrsv.SspLsolveStaleness<kDefaultStaleness>();
-                const auto e = std::chrono::high_resolution_clock::now();
-                const double runtime = std::chrono::duration<double>(e - s).count();
+                const auto eL = std::chrono::high_resolution_clock::now();
+                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
 
                 if (iter == 0) {
-                    const double diff = LInftyNormalisedDiff(x, serialRefX);
-                    correct = (diff < EPSILON);
-                    std::cout << "  Growlocal_SSP first-run max relative diff vs serial: " << diff << std::endl;
+                    const double diffL = LInftyNormalisedDiff(xL, serialRefXL);
+                    correctL = (diffL < EPSILON);
+                    std::cout << "  Growlocal_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl;
                 }
 
                 if (iter >= preMeasureIterations) {
@@ -454,8 +523,39 @@ int main(int argc, char *argv[]) {
                                                      supersteps,
                                                      syncCosts,
                                                      kDefaultStaleness,
-                                                     runtime,
-                                                     correct});
+                                                     runtimeL,
+                                                     correctL});
+                }
+
+                if (args.runUsolve) {
+                    std::vector<double> xU(n, 0.0);
+                    std::vector<double> bU(n, 1.0);
+                    sptrsv.x_ = xU.data();
+                    sptrsv.b_ = bU.data();
+
+                    const auto sU = std::chrono::high_resolution_clock::now();
+                    sptrsv.SspUsolveStaleness<kDefaultStaleness>();
+                    const auto eU = std::chrono::high_resolution_clock::now();
+                    const double runtimeU = std::chrono::duration<double>(eU - sU).count();
+
+                    if (iter == 0) {
+                        const double diffU = LInftyNormalisedDiff(xU, serialRefXU);
+                        correctU = (diffU < EPSILON);
+                        std::cout << "  Growlocal_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU
+                                  << std::endl;
+                    }
+
+                    if (iter >= preMeasureIterations) {
+                        bufferedRows.emplace_back(CsvRow{graphName,
+                                                         "Growlocal_SSP_Usolve",
+                                                         args.processors,
+                                                         scheduleTime,
+                                                         supersteps,
+                                                         syncCosts,
+                                                         kDefaultStaleness,
+                                                         runtimeU,
+                                                         correctU});
+                    }
                 }
             }
 
@@ -478,22 +578,23 @@ int main(int argc, char *argv[]) {
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
 
-            bool correct;
+            bool correctL = false;
+            bool correctU = false;
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 0.0);
-                std::vector<double> b(n, 1.0);
-                sptrsv.x_ = x.data();
-                sptrsv.b_ = b.data();
+                std::vector<double> xL(n, 0.0);
+                std::vector<double> bL(n, 1.0);
+                sptrsv.x_ = xL.data();
+                sptrsv.b_ = bL.data();
 
-                const auto s = std::chrono::high_resolution_clock::now();
+                const auto sL = std::chrono::high_resolution_clock::now();
                 sptrsv.LsolveNoPermutation();
-                const auto e = std::chrono::high_resolution_clock::now();
-                const double runtime = std::chrono::duration<double>(e - s).count();
+                const auto eL = std::chrono::high_resolution_clock::now();
+                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
 
                 if (iter == 0) {
-                    const double diff = LInftyNormalisedDiff(x, serialRefX);
-                    correct = (diff < EPSILON);
-                    std::cout << "  Growlocal first-run max relative diff vs serial: " << diff << std::endl;
+                    const double diffL = LInftyNormalisedDiff(xL, serialRefXL);
+                    correctL = (diffL < EPSILON);
+                    std::cout << "  Growlocal first-run max relative diff vs serial lsolve: " << diffL << std::endl;
                 }
 
                 if (iter >= preMeasureIterations) {
@@ -504,8 +605,39 @@ int main(int argc, char *argv[]) {
                                                      supersteps,
                                                      syncCosts,
                                                      1U,
-                                                     runtime,
-                                                     correct});
+                                                     runtimeL,
+                                                     correctL});
+                }
+
+                if (args.runUsolve) {
+                    std::vector<double> xU(n, 0.0);
+                    std::vector<double> bU(n, 1.0);
+                    sptrsv.x_ = xU.data();
+                    sptrsv.b_ = bU.data();
+
+                    const auto s = std::chrono::high_resolution_clock::now();
+                    sptrsv.UsolveNoPermutation();
+                    const auto e = std::chrono::high_resolution_clock::now();
+                    const double runtime = std::chrono::duration<double>(e - s).count();
+
+                    if (iter == 0) {
+                        const double diff = LInftyNormalisedDiff(xU, serialRefXU);
+                        correctU = (diff < EPSILON);
+                        std::cout << "  Growlocal_Usolve first-run max relative diff vs serial usolve: " << diff
+                                  << std::endl;
+                    }
+
+                    if (iter >= preMeasureIterations) {
+                        bufferedRows.emplace_back(CsvRow{graphName,
+                                                         "Growlocal_Usolve",
+                                                         args.processors,
+                                                         scheduleTime,
+                                                         supersteps,
+                                                         syncCosts,
+                                                         1U,
+                                                         runtime,
+                                                         correctU});
+                    }
                 }
             }
 
@@ -517,15 +649,15 @@ int main(int argc, char *argv[]) {
 
         if (args.algorithms.count(Algorithm::Serial) > 0U) {
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 0.0);
-                std::vector<double> b(n, 1.0);
-                sptrsv.x_ = x.data();
-                sptrsv.b_ = b.data();
+                std::vector<double> xL(n, 0.0);
+                std::vector<double> bL(n, 1.0);
+                sptrsv.x_ = xL.data();
+                sptrsv.b_ = bL.data();
 
-                const auto s = std::chrono::high_resolution_clock::now();
+                const auto sL = std::chrono::high_resolution_clock::now();
                 sptrsv.LsolveSerial();
-                const auto e = std::chrono::high_resolution_clock::now();
-                const double runtime = std::chrono::duration<double>(e - s).count();
+                const auto eL = std::chrono::high_resolution_clock::now();
+                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
 
                 if (iter >= preMeasureIterations) {
                     bufferedRows.emplace_back(CsvRow{graphName,
@@ -535,9 +667,33 @@ int main(int argc, char *argv[]) {
                                                      1U,
                                                      0,
                                                      1U,
-                                                     runtime,
+                                                     runtimeL,
                                                      true});
                 }
+
+                if (args.runUsolve) {
+                    std::vector<double> xU(n, 0.0);
+                    std::vector<double> bU(n, 1.0);
+                    sptrsv.x_ = xU.data();
+                    sptrsv.b_ = bU.data();
+
+                    const auto s = std::chrono::high_resolution_clock::now();
+                    sptrsv.UsolveSerial();
+                    const auto e = std::chrono::high_resolution_clock::now();
+                    const double runtime = std::chrono::duration<double>(e - s).count();
+
+                    if (iter >= preMeasureIterations) {
+                        bufferedRows.emplace_back(CsvRow{graphName,
+                                                         "Serial_Usolve",
+                                                         1U,
+                                                         0.0,
+                                                         1U,
+                                                         0,
+                                                         1U,
+                                                         runtime,
+                                                         true});
+                    }
+                }
             }
 
             for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index d2461e7f..5c34bac6 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -529,6 +529,48 @@ class Sptrsv {
         }
     }
 
+    // SSP Usolve: sweeps supersteps numSupersteps_-1 down to 0 with one OpenMP thread per processor, filling x_ from b_.
+    // Uses a FlatCheckpointCounterBarrier created internally; `staleness` reaches barrier.Wait as (staleness - 1U).
+    template <unsigned staleness = 2U>
+    void SspUsolveStaleness() {
+        const unsigned nthreads = instance_->NumberOfProcessors();
+        FlatCheckpointCounterBarrier barrier(nthreads);  // local to this call, sized to the thread count
+
+        auto *csc = instance_->GetComputationalDag().GetCSC();
+        const auto *outer = csc->outerIndexPtr();  // slice boundaries: entries of `node` are [outer[node], outer[node + 1])
+        const auto *inner = csc->innerIndexPtr();  // per-entry index used to look up x_
+        const auto *vals = csc->valuePtr();        // per-entry coefficient
+
+        #pragma omp parallel num_threads(nthreads)
+        {
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            unsigned step = numSupersteps_;  // NOTE(review): numSupersteps_ == 0 would wrap on the decrement below - confirm it is >= 1
+            do {
+                step--;
+                const size_t boundsStrSize = boundsArrayU_[step][proc].size();
+                if (boundsStrSize > 0U) {  // a thread with no work in this step never waits
+                    barrier.Wait(proc, staleness - 1U);  // NOTE(review): staleness == 0 would wrap to UINT_MAX - confirm callers use >= 1
+                }
+
+                for (size_t index = 0; index < boundsStrSize; index += 2) {  // bounds are stored as (start, lowerB) pairs
+                    EigenIdxType node = boundsArrayU_[step][proc][index] + 1;  // +1 offsets the pre-decrement in the do-while
+                    const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1];  // last node processed (inclusive)
+
+                    do {
+                        node--;  // nodes of a range are visited in descending order
+                        x_[node] = b_[node];
+                        for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {  // skips the first entry of the slice
+                            x_[node] -= vals[i] * x_[inner[i]];
+                        }
+                        x_[node] /= vals[outer[node]];  // divisor is the first stored entry - presumably the diagonal; verify
+                    } while (node != lowerB);
+                }
+
+                barrier.Arrive(proc);  // executed every step, including steps where this thread had no ranges
+            } while (step != 0);
+        }
+    }
+
     virtual ~Sptrsv() = default;
 };
 

From 649fe9c4a57a5c17538c289dd08b7ab9a5dcc097 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Mon, 16 Mar 2026 09:35:50 +0100
Subject: [PATCH 33/57] Reverted benchmark and in-place kernels added

---
 apps/maxbsp_ssp_sptrsv.cpp                    | 272 ++++--------------
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp |  41 +++
 2 files changed, 94 insertions(+), 219 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 272411ba..079c7d3a 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -1,6 +1,6 @@
 /*
  * maxbsp_ssp_sptrsv.cpp
- * Benchmark for SpTRSV (Lsolve + Usolve) using:
+ * Benchmark for SpTRSV using:
  *   - variance_ssp
  *   - growlocal_ssp
  *   - growlocal
@@ -13,7 +13,6 @@
 
 #include <Eigen/Sparse>
 #include <algorithm>
-#include <cctype>
 #include <chrono>
 #include <cstdlib>
 #include <cmath>
@@ -61,7 +60,6 @@ struct Args {
     std::string outputCsv = "sptrsv_benchmark.csv";
     int iterations = 100;
     unsigned processors = 16U;
-    bool runUsolve = true;
     std::set<Algorithm> algorithms;
 };
 
@@ -137,29 +135,12 @@ double LInftyNormalisedDiff(const std::vector<double> &v, const std::vector<doub
 void PrintUsage(const char *prog) {
     std::cout << "Usage:\n"
               << "  " << prog
-              << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>] [--run-usolve <0|1>]\n"
+              << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>]\n"
               << "      [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n"
               << "Examples:\n"
               << "  " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n"
               << "  " << prog
-              << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --run-usolve 0 --variance-ssp --growlocal-ssp --growlocal\n";
-}
-
-bool ParseBoolValue(const std::string &value, bool &parsed) {
-    std::string normalised = value;
-    std::transform(normalised.begin(), normalised.end(), normalised.begin(), [](unsigned char c) {
-        return static_cast<char>(std::tolower(c));
-    });
-
-    if (normalised == "1" || normalised == "true" || normalised == "yes" || normalised == "on") {
-        parsed = true;
-        return true;
-    }
-    if (normalised == "0" || normalised == "false" || normalised == "no" || normalised == "off") {
-        parsed = false;
-        return true;
-    }
-    return false;
+              << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n";
 }
 
 bool ParseArgs(int argc, char *argv[], Args &args) {
@@ -170,8 +151,8 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
     for (int i = 1; i < argc; ++i) {
         const std::string flag = argv[i];
 
-        const bool needsValue = (flag == "--input" || flag == "--output" || flag == "--iterations"
-                                 || flag == "--processors" || flag == "--run-usolve");
+        const bool needsValue
+            = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors");
         if (needsValue && i + 1 >= argc) {
             std::cerr << "Missing value for " << flag << "\n";
             return false;
@@ -185,13 +166,6 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
             args.iterations = std::stoi(argv[++i]);
         } else if (flag == "--processors") {
             args.processors = static_cast<unsigned>(std::stoul(argv[++i]));
-        } else if (flag == "--run-usolve") {
-            bool parsed = false;
-            if (!ParseBoolValue(argv[++i], parsed)) {
-                std::cerr << "Invalid value for --run-usolve. Use 0/1, false/true, no/yes, or off/on.\n";
-                return false;
-            }
-            args.runUsolve = parsed;
         } else if (flag == "--variance-ssp") {
             args.algorithms.insert(Algorithm::VarianceSsp);
         } else if (flag == "--growlocal-ssp") {
@@ -355,12 +329,11 @@ int main(int argc, char *argv[]) {
     EnsureSummaryCsvHeader(summaryCsv);
 
     std::cout << "Running benchmark on " << graphFiles.size() << " graph(s), iterations=" << args.iterations
-              << ", processors=" << args.processors << ", run-usolve=" << (args.runUsolve ? "1" : "0") << std::endl;
+              << ", processors=" << args.processors << std::endl;
     std::cout << "Experiment id timestamp: " << experimentStart << std::endl;
 
     std::vector<CsvRow> bufferedRows;
-    bufferedRows.reserve((args.runUsolve ? 2U : 1U) * graphFiles.size() * args.algorithms.size()
-                        * static_cast<std::size_t>(args.iterations));
+    bufferedRows.reserve(graphFiles.size() * args.algorithms.size() * static_cast<std::size_t>(args.iterations));
     typename std::vector<CsvRow>::difference_type writtenEntries = 0U;
 
     for (const auto &graphPath : graphFiles) {
@@ -384,20 +357,9 @@ int main(int argc, char *argv[]) {
         Sptrsv<int32_t> sptrsv(instance);
         const std::size_t n = static_cast<std::size_t>(lCsr.cols());
 
-        std::vector<double> serialRefXL(n, 0.0);
-        std::vector<double> serialBL(n, 1.0);
-        sptrsv.x_ = serialRefXL.data();
-        sptrsv.b_ = serialBL.data();
-        sptrsv.LsolveSerial();
-
-        std::vector<double> serialRefXU;
-        if (args.runUsolve) {
-            std::vector<double> serialBU(n, 1.0);
-            serialRefXU.assign(n, 0.0);
-            sptrsv.x_ = serialRefXU.data();
-            sptrsv.b_ = serialBU.data();
-            sptrsv.UsolveSerial();
-        }
+        std::vector<double> serialRefX(n, 1.0);
+        sptrsv.x_ = serialRefX.data();
+        sptrsv.LsolveSerialInPlace();
 
         std::cout << "Graph: " << graphName << " (" << lCsr.rows() << "x" << lCsr.cols() << ", nnz=" << lCsr.nonZeros() << ")\n";
 
@@ -414,23 +376,20 @@ int main(int argc, char *argv[]) {
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
 
-            bool correctL = false;
-            bool correctU = false;
+            bool correct = false;
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> xL(n, 0.0);
-                std::vector<double> bL(n, 1.0);
-                sptrsv.x_ = xL.data();
-                sptrsv.b_ = bL.data();
+                std::vector<double> x(n, 1.0);
+                sptrsv.x_ = x.data();
 
-                const auto sL = std::chrono::high_resolution_clock::now();
-                sptrsv.SspLsolveStaleness<kDefaultStaleness>();
-                const auto eL = std::chrono::high_resolution_clock::now();
-                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.SspLsolveStalenessInPlace<kDefaultStaleness>();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
 
                 if (iter == 0) {
-                    const double diffL = LInftyNormalisedDiff(xL, serialRefXL);
-                    correctL = (diffL < EPSILON);
-                    std::cout << "  Variance_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl;
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    correct = (diff < EPSILON);
+                    std::cout << "  Variance_SSP first-run max relative diff vs serial: " << diff << std::endl;
                 }
 
                 if (iter >= preMeasureIterations) {
@@ -441,39 +400,8 @@ int main(int argc, char *argv[]) {
                                                      supersteps,
                                                      syncCosts,
                                                      kDefaultStaleness,
-                                                     runtimeL,
-                                                     correctL});
-                }
-
-                if (args.runUsolve) {
-                    std::vector<double> xU(n, 0.0);
-                    std::vector<double> bU(n, 1.0);
-                    sptrsv.x_ = xU.data();
-                    sptrsv.b_ = bU.data();
-
-                    const auto sU = std::chrono::high_resolution_clock::now();
-                    sptrsv.SspUsolveStaleness<kDefaultStaleness>();
-                    const auto eU = std::chrono::high_resolution_clock::now();
-                    const double runtimeU = std::chrono::duration<double>(eU - sU).count();
-
-                    if (iter == 0) {
-                        const double diffU = LInftyNormalisedDiff(xU, serialRefXU);
-                        correctU = (diffU < EPSILON);
-                        std::cout << "  Variance_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU
-                                  << std::endl;
-                    }
-
-                    if (iter >= preMeasureIterations) {
-                        bufferedRows.emplace_back(CsvRow{graphName,
-                                                         "Variance_SSP_Usolve",
-                                                         args.processors,
-                                                         scheduleTime,
-                                                         supersteps,
-                                                         syncCosts,
-                                                         kDefaultStaleness,
-                                                         runtimeU,
-                                                         correctU});
-                    }
+                                                     runtime,
+                                                     correct});
                 }
             }
 
@@ -496,23 +424,20 @@ int main(int argc, char *argv[]) {
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
 
-            bool correctL = false;
-            bool correctU = false;
+            bool correct = false;
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> xL(n, 0.0);
-                std::vector<double> bL(n, 1.0);
-                sptrsv.x_ = xL.data();
-                sptrsv.b_ = bL.data();
+                std::vector<double> x(n, 1.0);
+                sptrsv.x_ = x.data();
 
-                const auto sL = std::chrono::high_resolution_clock::now();
-                sptrsv.SspLsolveStaleness<kDefaultStaleness>();
-                const auto eL = std::chrono::high_resolution_clock::now();
-                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.SspLsolveStalenessInPlace<kDefaultStaleness>();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
 
                 if (iter == 0) {
-                    const double diffL = LInftyNormalisedDiff(xL, serialRefXL);
-                    correctL = (diffL < EPSILON);
-                    std::cout << "  Growlocal_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl;
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    correct = (diff < EPSILON);
+                    std::cout << "  Growlocal_SSP first-run max relative diff vs serial: " << diff << std::endl;
                 }
 
                 if (iter >= preMeasureIterations) {
@@ -523,39 +448,8 @@ int main(int argc, char *argv[]) {
                                                      supersteps,
                                                      syncCosts,
                                                      kDefaultStaleness,
-                                                     runtimeL,
-                                                     correctL});
-                }
-
-                if (args.runUsolve) {
-                    std::vector<double> xU(n, 0.0);
-                    std::vector<double> bU(n, 1.0);
-                    sptrsv.x_ = xU.data();
-                    sptrsv.b_ = bU.data();
-
-                    const auto sU = std::chrono::high_resolution_clock::now();
-                    sptrsv.SspUsolveStaleness<kDefaultStaleness>();
-                    const auto eU = std::chrono::high_resolution_clock::now();
-                    const double runtimeU = std::chrono::duration<double>(eU - sU).count();
-
-                    if (iter == 0) {
-                        const double diffU = LInftyNormalisedDiff(xU, serialRefXU);
-                        correctU = (diffU < EPSILON);
-                        std::cout << "  Growlocal_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU
-                                  << std::endl;
-                    }
-
-                    if (iter >= preMeasureIterations) {
-                        bufferedRows.emplace_back(CsvRow{graphName,
-                                                         "Growlocal_SSP_Usolve",
-                                                         args.processors,
-                                                         scheduleTime,
-                                                         supersteps,
-                                                         syncCosts,
-                                                         kDefaultStaleness,
-                                                         runtimeU,
-                                                         correctU});
-                    }
+                                                     runtime,
+                                                     correct});
                 }
             }
 
@@ -578,23 +472,20 @@ int main(int argc, char *argv[]) {
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
 
-            bool correctL = false;
-            bool correctU = false;
+            bool correct;
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> xL(n, 0.0);
-                std::vector<double> bL(n, 1.0);
-                sptrsv.x_ = xL.data();
-                sptrsv.b_ = bL.data();
+                std::vector<double> x(n, 1.0);
+                sptrsv.x_ = x.data();
 
-                const auto sL = std::chrono::high_resolution_clock::now();
-                sptrsv.LsolveNoPermutation();
-                const auto eL = std::chrono::high_resolution_clock::now();
-                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.LsolveNoPermutationInPlace();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
 
                 if (iter == 0) {
-                    const double diffL = LInftyNormalisedDiff(xL, serialRefXL);
-                    correctL = (diffL < EPSILON);
-                    std::cout << "  Growlocal first-run max relative diff vs serial lsolve: " << diffL << std::endl;
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    correct = (diff < EPSILON);
+                    std::cout << "  Growlocal first-run max relative diff vs serial: " << diff << std::endl;
                 }
 
                 if (iter >= preMeasureIterations) {
@@ -605,39 +496,8 @@ int main(int argc, char *argv[]) {
                                                      supersteps,
                                                      syncCosts,
                                                      1U,
-                                                     runtimeL,
-                                                     correctL});
-                }
-
-                if (args.runUsolve) {
-                    std::vector<double> xU(n, 0.0);
-                    std::vector<double> bU(n, 1.0);
-                    sptrsv.x_ = xU.data();
-                    sptrsv.b_ = bU.data();
-
-                    const auto s = std::chrono::high_resolution_clock::now();
-                    sptrsv.UsolveNoPermutation();
-                    const auto e = std::chrono::high_resolution_clock::now();
-                    const double runtime = std::chrono::duration<double>(e - s).count();
-
-                    if (iter == 0) {
-                        const double diff = LInftyNormalisedDiff(xU, serialRefXU);
-                        correctU = (diff < EPSILON);
-                        std::cout << "  Growlocal_Usolve first-run max relative diff vs serial usolve: " << diff
-                                  << std::endl;
-                    }
-
-                    if (iter >= preMeasureIterations) {
-                        bufferedRows.emplace_back(CsvRow{graphName,
-                                                         "Growlocal_Usolve",
-                                                         args.processors,
-                                                         scheduleTime,
-                                                         supersteps,
-                                                         syncCosts,
-                                                         1U,
-                                                         runtime,
-                                                         correctU});
-                    }
+                                                     runtime,
+                                                     correct});
                 }
             }
 
@@ -649,15 +509,13 @@ int main(int argc, char *argv[]) {
 
         if (args.algorithms.count(Algorithm::Serial) > 0U) {
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> xL(n, 0.0);
-                std::vector<double> bL(n, 1.0);
-                sptrsv.x_ = xL.data();
-                sptrsv.b_ = bL.data();
+                std::vector<double> x(n, 1.0);
+                sptrsv.x_ = x.data();
 
-                const auto sL = std::chrono::high_resolution_clock::now();
-                sptrsv.LsolveSerial();
-                const auto eL = std::chrono::high_resolution_clock::now();
-                const double runtimeL = std::chrono::duration<double>(eL - sL).count();
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.LsolveSerialInPlace();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
 
                 if (iter >= preMeasureIterations) {
                     bufferedRows.emplace_back(CsvRow{graphName,
@@ -667,33 +525,9 @@ int main(int argc, char *argv[]) {
                                                      1U,
                                                      0,
                                                      1U,
-                                                     runtimeL,
+                                                     runtime,
                                                      true});
                 }
-
-                if (args.runUsolve) {
-                    std::vector<double> xU(n, 0.0);
-                    std::vector<double> bU(n, 1.0);
-                    sptrsv.x_ = xU.data();
-                    sptrsv.b_ = bU.data();
-
-                    const auto s = std::chrono::high_resolution_clock::now();
-                    sptrsv.UsolveSerial();
-                    const auto e = std::chrono::high_resolution_clock::now();
-                    const double runtime = std::chrono::duration<double>(e - s).count();
-
-                    if (iter >= preMeasureIterations) {
-                        bufferedRows.emplace_back(CsvRow{graphName,
-                                                         "Serial_Usolve",
-                                                         1U,
-                                                         0.0,
-                                                         1U,
-                                                         0,
-                                                         1U,
-                                                         runtime,
-                                                         true});
-                    }
-                }
             }
 
             for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 5c34bac6..3323c07a 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -529,6 +529,47 @@ class Sptrsv {
         }
     }
 
+    // SSP Lsolve in-place with staleness=2 (allowing at most one superstep of lag).
+    // Uses FlatCheckpointCounterBarrier created internally.
+    template <unsigned staleness = 2U>
+    void SspLsolveStalenessInPlace() {
+        const unsigned nthreads = instance_->NumberOfProcessors();
+        FlatCheckpointCounterBarrier barrier(nthreads);
+
+        auto *csr = instance_->GetComputationalDag().GetCSR();
+        const auto *outer = csr->outerIndexPtr();
+        const auto *inner = csr->innerIndexPtr();
+        const auto *vals = csr->valuePtr();
+
+        #pragma omp parallel num_threads(nthreads)
+        {
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            for (unsigned step = 0; step < numSupersteps_; ++step) {
+                // Process nodes assigned to this (step, proc) pair.
+                const size_t boundsStrSize = boundsArrayL_[step][proc].size();
+                // Enforce staleness window before starting this superstep.
+                if (boundsStrSize > 0U) {
+                    barrier.Wait(proc, staleness - 1U);
+                }
+                for (size_t index = 0; index < boundsStrSize; index += 2) {
+                    EigenIdxType lowerB = boundsArrayL_[step][proc][index];
+                    const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
+                    for (EigenIdxType node = lowerB; node <= upperB; ++node) {
+                        // Perform lower-triangular solve for this node
+                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
+                            // Subtract contributions from previously solved nodes
+                            x_[node] -= vals[i] * x_[inner[i]];
+                        }
+                        // Divide by diagonal element to complete solve for this node
+                        x_[node] /= vals[outer[node + 1] - 1];
+                    }
+                }
+                // Signal completion of this superstep.
+                barrier.Arrive(proc);
+            }
+        }
+    }
+
     // SSP Usolve with configurable staleness.
     // Uses FlatCheckpointCounterBarrier created internally.
     template <unsigned staleness = 2U>

From ff5e15f209ef6446e0b8cbbd065fc5038ce0f129 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 17 Mar 2026 11:56:52 +0100
Subject: [PATCH 34/57] sptrsv: hoist CSR/CSC array pointers into const locals

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 150 +++++++++---------
 1 file changed, 75 insertions(+), 75 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 3323c07a..906e8850 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -131,7 +131,7 @@ class Sptrsv {
                     do {
                         node--;
                         vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back(
-                // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp ---
+                            // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp ---
 
                             static_cast<EigenIdxType>(node));
                     } while (node > 0);
@@ -240,39 +240,43 @@ class Sptrsv {
     }
 
     void LsolveSerial() {
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+
         EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
             x_[i] = b_[i];
-            for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i];
-                 j < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1;
-                 ++j) {
-                x_[i] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[j]
-                         * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[j]];
+            for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
+                x_[i] -= valPtr[j] * x_[inner[j]];
             }
-            x_[i] /= (*(instance_->GetComputationalDag().GetCSR()))
-                         .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1];
+            x_[i] /= valPtr[outer[i + 1] - 1];
         }
     }
 
     void UsolveSerial() {
-        EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+
+        const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
 
         EigenIdxType i = numberOfVertices;
         do {
             i--;
             x_[i] = b_[i];
-            for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i] + 1;
-                 j < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i + 1];
-                 ++j) {
-                x_[i] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[j]
-                         * x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[j]];
+            for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
+                x_[i] -= valPtr[j] * x_[inner[j]];
             }
-            x_[i] /= (*(instance_->GetComputationalDag().GetCSC()))
-                         .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i]];
+            x_[i] /= valPtr[outer[i]];
         } while (i != 0);
     }
 
     void LsolveNoPermutationInPlace() {
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             const size_t proc = static_cast<size_t>(omp_get_thread_num());
@@ -284,14 +288,10 @@ class Sptrsv {
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
 
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
-                        for (EigenIdxType i = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node];
-                             i < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1;
-                             ++i) {
-                            x_[node] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[i]
-                                        * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[i]];
+                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
+                            x_[node] -= valPtr[i] * x_[inner[i]];
                         }
-                        x_[node] /= (*(instance_->GetComputationalDag().GetCSR()))
-                                        .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1];
+                        x_[node] /= valPtr[outer[node + 1] - 1];
                     }
                 }
 #    pragma omp barrier
@@ -300,6 +300,10 @@ class Sptrsv {
     }
 
     void UsolveNoPermutationInPlace() {
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             // Process each superstep starting from the last one (opposite of lsolve)
@@ -314,14 +318,10 @@ class Sptrsv {
 
                     do {
                         node--;
-                        for (EigenIdxType i = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node] + 1;
-                             i < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node + 1];
-                             ++i) {
-                            x_[node] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[i]
-                                        * x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[i]];
+                        for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
+                            x_[node] -= valPtr[i] * x_[inner[i]];
                         }
-                        x_[node] /= (*(instance_->GetComputationalDag().GetCSC()))
-                                        .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node]];
+                        x_[node] /= valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -330,6 +330,10 @@ class Sptrsv {
     }
 
     void LsolveNoPermutation() {
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             const size_t proc = static_cast<size_t>(omp_get_thread_num());
@@ -342,14 +346,10 @@ class Sptrsv {
 
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
                         x_[node] = b_[node];
-                        for (EigenIdxType i = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node];
-                             i < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1;
-                             ++i) {
-                            x_[node] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[i]
-                                        * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[i]];
+                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
+                            x_[node] -= valPtr[i] * x_[inner[i]];
                         }
-                        x_[node] /= (*(instance_->GetComputationalDag().GetCSR()))
-                                        .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1];
+                        x_[node] /= valPtr[outer[node + 1] - 1];
                     }
                 }
 #    pragma omp barrier
@@ -358,6 +358,10 @@ class Sptrsv {
     }
 
     void UsolveNoPermutation() {
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             // Process each superstep starting from the last one (opposite of lsolve)
@@ -373,14 +377,10 @@ class Sptrsv {
                     do {
                         node--;
                         x_[node] = b_[node];
-                        for (EigenIdxType i = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node] + 1;
-                             i < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node + 1];
-                             ++i) {
-                            x_[node] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[i]
-                                        * x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[i]];
+                        for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
+                            x_[node] -= valPtr[i] * x_[inner[i]];
                         }
-                        x_[node] /= (*(instance_->GetComputationalDag().GetCSC()))
-                                        .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node]];
+                        x_[node] /= valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -389,32 +389,32 @@ class Sptrsv {
     }
 
     void LsolveSerialInPlace() {
-        EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+
+        const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
-            for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i];
-                 j < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1;
-                 ++j) {
-                x_[i] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[j]
-                         * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[j]];
+            for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
+                x_[i] -= valPtr[j] * x_[inner[j]];
             }
-            x_[i] /= (*(instance_->GetComputationalDag().GetCSR()))
-                         .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1];
+            x_[i] /= valPtr[outer[i + 1] - 1];
         }
     }
 
     void UsolveSerialInPlace() {
-        EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
+        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+
+        const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         EigenIdxType i = numberOfVertices;
         do {
             i--;
-            for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i] + 1;
-                 j < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i + 1];
-                 ++j) {
-                x_[i] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[j]
-                         * x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[j]];
+            for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
+                x_[i] -= valPtr[j] * x_[inner[j]];
             }
-            x_[i] /= (*(instance_->GetComputationalDag().GetCSC()))
-                         .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i]];
+            x_[i] /= valPtr[outer[i]];
         } while (i != 0);
     }
 
@@ -493,12 +493,12 @@ class Sptrsv {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
-        auto *csr = instance_->GetComputationalDag().GetCSR();
-        const auto *outer = csr->outerIndexPtr();
-        const auto *inner = csr->innerIndexPtr();
-        const auto *vals = csr->valuePtr();
+        const auto *csr = instance_->GetComputationalDag().GetCSR();
+        const EigenIdxType *outer = csr->outerIndexPtr();
+        const EigenIdxType *inner = csr->innerIndexPtr();
+        const double *vals = csr->valuePtr();
 
-        #pragma omp parallel num_threads(nthreads)
+#    pragma omp parallel num_threads(nthreads)
         {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
@@ -536,12 +536,12 @@ class Sptrsv {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
-        auto *csr = instance_->GetComputationalDag().GetCSR();
-        const auto *outer = csr->outerIndexPtr();
-        const auto *inner = csr->innerIndexPtr();
-        const auto *vals = csr->valuePtr();
+        const auto *csr = instance_->GetComputationalDag().GetCSR();
+        const EigenIdxType *outer = csr->outerIndexPtr();
+        const EigenIdxType *inner = csr->innerIndexPtr();
+        const double *vals = csr->valuePtr();
 
-        #pragma omp parallel num_threads(nthreads)
+#    pragma omp parallel num_threads(nthreads)
         {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
@@ -577,12 +577,12 @@ class Sptrsv {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
-        auto *csc = instance_->GetComputationalDag().GetCSC();
-        const auto *outer = csc->outerIndexPtr();
-        const auto *inner = csc->innerIndexPtr();
-        const auto *vals = csc->valuePtr();
+        const auto *csc = instance_->GetComputationalDag().GetCSC();
+        const EigenIdxType *outer = csc->outerIndexPtr();
+        const EigenIdxType *inner = csc->innerIndexPtr();
+        const double *vals = csc->valuePtr();
 
-        #pragma omp parallel num_threads(nthreads)
+#    pragma omp parallel num_threads(nthreads)
         {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             unsigned step = numSupersteps_;

From 8f584922aed1e1c45ca41f33b30a9d112e030d69 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 17 Mar 2026 12:04:13 +0100
Subject: [PATCH 35/57] maxbsp_ssp_sptrsv: reset x in place instead of reallocating

---
 apps/maxbsp_ssp_sptrsv.cpp | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 079c7d3a..69ed2c57 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -14,8 +14,8 @@
 #include <Eigen/Sparse>
 #include <algorithm>
 #include <chrono>
-#include <cstdlib>
 #include <cmath>
+#include <cstdlib>
 #include <ctime>
 #include <filesystem>
 #include <fstream>
@@ -134,8 +134,7 @@ double LInftyNormalisedDiff(const std::vector<double> &v, const std::vector<doub
 
 void PrintUsage(const char *prog) {
     std::cout << "Usage:\n"
-              << "  " << prog
-              << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>]\n"
+              << "  " << prog << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>]\n"
               << "      [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n"
               << "Examples:\n"
               << "  " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n"
@@ -290,6 +289,12 @@ int ComputeSyncCosts(const BspInstance<SparseMatrixImp<int32_t>> &instance) {
 
 }    // namespace
 
+void resetOnes(std::vector<double> &x) {
+    for (double &val : x) {
+        val = 1.0;
+    }
+}
+
 int main(int argc, char *argv[]) {
     const std::string experimentStart = FormatExperimentStartTimestampForFilename();
 
@@ -377,9 +382,10 @@ int main(int argc, char *argv[]) {
             const int syncCosts = ComputeSyncCosts(instance);
 
             bool correct = false;
+            std::vector<double> x(n, 1.0);
+            sptrsv.x_ = x.data();
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 1.0);
-                sptrsv.x_ = x.data();
+                resetOnes(x);
 
                 const auto s = std::chrono::high_resolution_clock::now();
                 sptrsv.SspLsolveStalenessInPlace<kDefaultStaleness>();
@@ -425,9 +431,10 @@ int main(int argc, char *argv[]) {
             const int syncCosts = ComputeSyncCosts(instance);
 
             bool correct = false;
+            std::vector<double> x(n, 1.0);
+            sptrsv.x_ = x.data();
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 1.0);
-                sptrsv.x_ = x.data();
+                resetOnes(x);
 
                 const auto s = std::chrono::high_resolution_clock::now();
                 sptrsv.SspLsolveStalenessInPlace<kDefaultStaleness>();
@@ -473,9 +480,10 @@ int main(int argc, char *argv[]) {
             const int syncCosts = ComputeSyncCosts(instance);
 
             bool correct;
+            std::vector<double> x(n, 1.0);
+            sptrsv.x_ = x.data();
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 1.0);
-                sptrsv.x_ = x.data();
+                resetOnes(x);
 
                 const auto s = std::chrono::high_resolution_clock::now();
                 sptrsv.LsolveNoPermutationInPlace();
@@ -508,9 +516,10 @@ int main(int argc, char *argv[]) {
         }
 
         if (args.algorithms.count(Algorithm::Serial) > 0U) {
+            std::vector<double> x(n, 1.0);
+            sptrsv.x_ = x.data();
             for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
-                std::vector<double> x(n, 1.0);
-                sptrsv.x_ = x.data();
+                resetOnes(x);
 
                 const auto s = std::chrono::high_resolution_clock::now();
                 sptrsv.LsolveSerialInPlace();
@@ -555,8 +564,8 @@ int main(int argc, char *argv[]) {
     for (const auto &[key, agg] : summary) {
         const double geomean = std::exp(agg.sumLogRuntime / static_cast<double>(agg.samples));
         summaryCsv << CsvEscape(key.graph) << "," << key.algorithm << "," << key.processors << "," << agg.scheduleTimeSeconds
-               << "," << agg.supersteps << "," << agg.SyncCosts << "," << key.staleness
-                   << "," << agg.samples << "," << geomean << "," << agg.correctness << "\n";
+                   << "," << agg.supersteps << "," << agg.SyncCosts << "," << key.staleness << "," << agg.samples << ","
+                   << geomean << "," << agg.correctness << "\n";
     }
 
     std::cout << "Benchmark complete. CSV written to: " << detailCsvPath << std::endl;

From 4b97007414abbe4ff4e90ca43fba0064272dd843 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 17 Mar 2026 14:22:04 +0100
Subject: [PATCH 36/57] maxbsp_ssp_sptrsv: make Eigen CSR/CSC matrices compressed

---
 apps/maxbsp_ssp_sptrsv.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 69ed2c57..4ad3565e 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -349,8 +349,10 @@ int main(int argc, char *argv[]) {
             std::cerr << "Failed to load matrix: " << graphPath << std::endl;
             continue;
         }
+        lCsr.makeCompressed();
 
         Eigen::SparseMatrix<double, Eigen::ColMajor, int32_t> lCsc = lCsr;
+        lCsc.makeCompressed();
 
         SparseMatrixImp<int32_t> graph;
         graph.SetCsr(&lCsr);

From 1f506b3454ac6b611e45aa9dd1b9a9fe4d50c58a Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 18 Mar 2026 08:59:50 +0100
Subject: [PATCH 37/57] const everywhere

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 192 ++++++++++--------
 1 file changed, 107 insertions(+), 85 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 906e8850..e02e5968 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -239,43 +239,48 @@ class Sptrsv {
         rowPtr_.push_back(colIdx_.size());
     }
 
-    void LsolveSerial() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+    void LsolveSerial() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+        double *const x = x_;
+        const double *const b = b_;
 
-        EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
+        const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
-            x_[i] = b_[i];
+            x[i] = b[i];
             for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
-                x_[i] -= valPtr[j] * x_[inner[j]];
+                x[i] -= valPtr[j] * x[inner[j]];
             }
-            x_[i] /= valPtr[outer[i + 1] - 1];
+            x[i] /= valPtr[outer[i + 1] - 1];
         }
     }
 
-    void UsolveSerial() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+    void UsolveSerial() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+        double *const x = x_;
+        const double *const b = b_;
 
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
 
         EigenIdxType i = numberOfVertices;
         do {
             i--;
-            x_[i] = b_[i];
+            x[i] = b[i];
             for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
-                x_[i] -= valPtr[j] * x_[inner[j]];
+                x[i] -= valPtr[j] * x[inner[j]];
             }
-            x_[i] /= valPtr[outer[i]];
+            x[i] /= valPtr[outer[i]];
         } while (i != 0);
     }
 
-    void LsolveNoPermutationInPlace() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+    void LsolveNoPermutationInPlace() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+        double *const x = x_;
 
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
@@ -289,9 +294,9 @@ class Sptrsv {
 
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            x_[node] -= valPtr[i] * x_[inner[i]];
+                            x[node] -= valPtr[i] * x[inner[i]];
                         }
-                        x_[node] /= valPtr[outer[node + 1] - 1];
+                        x[node] /= valPtr[outer[node + 1] - 1];
                     }
                 }
 #    pragma omp barrier
@@ -299,10 +304,11 @@ class Sptrsv {
         }
     }
 
-    void UsolveNoPermutationInPlace() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+    void UsolveNoPermutationInPlace() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+        double *const x = x_;
 
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
@@ -319,9 +325,9 @@ class Sptrsv {
                     do {
                         node--;
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            x_[node] -= valPtr[i] * x_[inner[i]];
+                            x[node] -= valPtr[i] * x[inner[i]];
                         }
-                        x_[node] /= valPtr[outer[node]];
+                        x[node] /= valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -329,10 +335,12 @@ class Sptrsv {
         }
     }
 
-    void LsolveNoPermutation() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+    void LsolveNoPermutation() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+        double *const x = x_;
+        const double *const b = b_;
 
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
@@ -345,11 +353,11 @@ class Sptrsv {
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
 
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
-                        x_[node] = b_[node];
+                        x[node] = b[node];
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            x_[node] -= valPtr[i] * x_[inner[i]];
+                            x[node] -= valPtr[i] * x[inner[i]];
                         }
-                        x_[node] /= valPtr[outer[node + 1] - 1];
+                        x[node] /= valPtr[outer[node + 1] - 1];
                     }
                 }
 #    pragma omp barrier
@@ -357,10 +365,12 @@ class Sptrsv {
         }
     }
 
-    void UsolveNoPermutation() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+    void UsolveNoPermutation() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+        double *const x = x_;
+        const double *const b = b_;
 
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
@@ -376,11 +386,11 @@ class Sptrsv {
 
                     do {
                         node--;
-                        x_[node] = b_[node];
+                        x[node] = b[node];
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            x_[node] -= valPtr[i] * x_[inner[i]];
+                            x[node] -= valPtr[i] * x[inner[i]];
                         }
-                        x_[node] /= valPtr[outer[node]];
+                        x[node] /= valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -388,37 +398,41 @@ class Sptrsv {
         }
     }
 
-    void LsolveSerialInPlace() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+    void LsolveSerialInPlace() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
+        double *const x = x_;
 
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
             for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
-                x_[i] -= valPtr[j] * x_[inner[j]];
+                x[i] -= valPtr[j] * x[inner[j]];
             }
-            x_[i] /= valPtr[outer[i + 1] - 1];
+            x[i] /= valPtr[outer[i + 1] - 1];
         }
     }
 
-    void UsolveSerialInPlace() {
-        const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
-        const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
-        const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+    void UsolveSerialInPlace() const {
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr();
+        double *const x = x_;
 
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         EigenIdxType i = numberOfVertices;
         do {
             i--;
             for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
-                x_[i] -= valPtr[j] * x_[inner[j]];
+                x[i] -= valPtr[j] * x[inner[j]];
             }
-            x_[i] /= valPtr[outer[i]];
+            x[i] /= valPtr[outer[i]];
         } while (i != 0);
     }
 
-    void LsolveWithPermutationInPlace() {
+    void LsolveWithPermutationInPlace() const {
+        double *const x = x_;
+
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             for (unsigned step = 0; step < numSupersteps_; step++) {
@@ -426,10 +440,10 @@ class Sptrsv {
                 const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc];
                 for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) {
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        x_[rowIdx] -= val_[i] * x_[colIdx_[i]];
+                        x[rowIdx] -= val_[i] * x[colIdx_[i]];
                     }
 
-                    x_[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1];
+                    x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1];
                 }
 
 #    pragma omp barrier
@@ -437,19 +451,22 @@ class Sptrsv {
         }
     }
 
-    void LsolveWithPermutation() {
+    void LsolveWithPermutation() const {
+        double *const x = x_;
+        const double *const b = b_;
+
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             for (unsigned step = 0; step < numSupersteps_; step++) {
                 const size_t proc = static_cast<size_t>(omp_get_thread_num());
                 const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc];
                 for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) {
-                    x_[rowIdx] = b_[rowIdx];
+                    x[rowIdx] = b[rowIdx];
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        x_[rowIdx] -= val_[i] * x_[colIdx_[i]];
+                        x[rowIdx] -= val_[i] * x[colIdx_[i]];
                     }
 
-                    x_[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1];
+                    x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1];
                 }
 
 #    pragma omp barrier
@@ -458,7 +475,7 @@ class Sptrsv {
     }
 
     void ResetX() {
-        EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
+        const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; i++) {
             x_[i] = 1.0;
         }
@@ -484,19 +501,21 @@ class Sptrsv {
         }
     }
 
-    std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); }
+    std::size_t GetNumberOfVertices() const { return instance_->NumberOfVertices(); }
 
     // SSP Lsolve with staleness=2 (allowing at most one superstep of lag).
     // Uses FlatCheckpointCounterBarrier created internally.
     template <unsigned staleness = 2U>
-    void SspLsolveStaleness() {
+    void SspLsolveStaleness() const {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
-        const auto *csr = instance_->GetComputationalDag().GetCSR();
-        const EigenIdxType *outer = csr->outerIndexPtr();
-        const EigenIdxType *inner = csr->innerIndexPtr();
-        const double *vals = csr->valuePtr();
+        const auto *const csr = instance_->GetComputationalDag().GetCSR();
+        const EigenIdxType *const outer = csr->outerIndexPtr();
+        const EigenIdxType *const inner = csr->innerIndexPtr();
+        const double *const vals = csr->valuePtr();
+        double *const x = x_;
+        const double *const b = b_;
 
 #    pragma omp parallel num_threads(nthreads)
         {
@@ -513,14 +532,14 @@ class Sptrsv {
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
                         // Initialize solution for this node
-                        x_[node] = b_[node];
+                        x[node] = b[node];
                         // Perform lower-triangular solve for this node
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
                             // Subtract contributions from previously solved nodes
-                            x_[node] -= vals[i] * x_[inner[i]];
+                            x[node] -= vals[i] * x[inner[i]];
                         }
                         // Divide by diagonal element to complete solve for this node
-                        x_[node] /= vals[outer[node + 1] - 1];
+                        x[node] /= vals[outer[node + 1] - 1];
                     }
                 }
                 // Signal completion of this superstep.
@@ -532,14 +551,15 @@ class Sptrsv {
     // SSP Lsolve in-place with staleness=2 (allowing at most one superstep of lag).
     // Uses FlatCheckpointCounterBarrier created internally.
     template <unsigned staleness = 2U>
-    void SspLsolveStalenessInPlace() {
+    void SspLsolveStalenessInPlace() const {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
-        const auto *csr = instance_->GetComputationalDag().GetCSR();
-        const EigenIdxType *outer = csr->outerIndexPtr();
-        const EigenIdxType *inner = csr->innerIndexPtr();
-        const double *vals = csr->valuePtr();
+        const auto *const csr = instance_->GetComputationalDag().GetCSR();
+        const EigenIdxType *const outer = csr->outerIndexPtr();
+        const EigenIdxType *const inner = csr->innerIndexPtr();
+        const double *const vals = csr->valuePtr();
+        double *const x = x_;
 
 #    pragma omp parallel num_threads(nthreads)
         {
@@ -558,10 +578,10 @@ class Sptrsv {
                         // Perform lower-triangular solve for this node
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
                             // Subtract contributions from previously solved nodes
-                            x_[node] -= vals[i] * x_[inner[i]];
+                            x[node] -= vals[i] * x[inner[i]];
                         }
                         // Divide by diagonal element to complete solve for this node
-                        x_[node] /= vals[outer[node + 1] - 1];
+                        x[node] /= vals[outer[node + 1] - 1];
                     }
                 }
                 // Signal completion of this superstep.
@@ -573,14 +593,16 @@ class Sptrsv {
     // SSP Usolve with configurable staleness.
     // Uses FlatCheckpointCounterBarrier created internally.
     template <unsigned staleness = 2U>
-    void SspUsolveStaleness() {
+    void SspUsolveStaleness() const {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
-        const auto *csc = instance_->GetComputationalDag().GetCSC();
-        const EigenIdxType *outer = csc->outerIndexPtr();
-        const EigenIdxType *inner = csc->innerIndexPtr();
-        const double *vals = csc->valuePtr();
+        const auto *const csc = instance_->GetComputationalDag().GetCSC();
+        const EigenIdxType *const outer = csc->outerIndexPtr();
+        const EigenIdxType *const inner = csc->innerIndexPtr();
+        const double *const vals = csc->valuePtr();
+        double *const x = x_;
+        const double *const b = b_;
 
 #    pragma omp parallel num_threads(nthreads)
         {
@@ -599,11 +621,11 @@ class Sptrsv {
 
                     do {
                         node--;
-                        x_[node] = b_[node];
+                        x[node] = b[node];
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            x_[node] -= vals[i] * x_[inner[i]];
+                            x[node] -= vals[i] * x[inner[i]];
                         }
-                        x_[node] /= vals[outer[node]];
+                        x[node] /= vals[outer[node]];
                     } while (node != lowerB);
                 }
 

From ebfa82adbdd2f417a12a0cebf6eff8761cae621a Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 18 Mar 2026 16:05:22 +0100
Subject: [PATCH 38/57] Add local accumulator for inner loop of sptrsv kernels

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 69 +++++++++++--------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index e02e5968..b992614b 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -249,10 +249,11 @@ class Sptrsv {
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
             x[i] = b[i];
+            double acc = 0.0;
             for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
-                x[i] -= valPtr[j] * x[inner[j]];
+                acc += valPtr[j] * x[inner[j]];
             }
-            x[i] /= valPtr[outer[i + 1] - 1];
+            x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1];
         }
     }
 
@@ -269,10 +270,11 @@ class Sptrsv {
         do {
             i--;
             x[i] = b[i];
+            double acc = 0.0;
             for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
-                x[i] -= valPtr[j] * x[inner[j]];
+                acc += valPtr[j] * x[inner[j]];
             }
-            x[i] /= valPtr[outer[i]];
+            x[i] = (x[i] - acc) / valPtr[outer[i]];
         } while (i != 0);
     }
 
@@ -293,10 +295,11 @@ class Sptrsv {
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
 
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
+                        double acc = 0.0;
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            x[node] -= valPtr[i] * x[inner[i]];
+                            acc += valPtr[i] * x[inner[i]];
                         }
-                        x[node] /= valPtr[outer[node + 1] - 1];
+                        x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1];
                     }
                 }
 #    pragma omp barrier
@@ -324,10 +327,11 @@ class Sptrsv {
 
                     do {
                         node--;
+                        double acc = 0.0;
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            x[node] -= valPtr[i] * x[inner[i]];
+                            acc += valPtr[i] * x[inner[i]];
                         }
-                        x[node] /= valPtr[outer[node]];
+                        x[node] = (x[node] - acc) / valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -354,10 +358,11 @@ class Sptrsv {
 
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
                         x[node] = b[node];
+                        double acc = 0.0;
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            x[node] -= valPtr[i] * x[inner[i]];
+                            acc += valPtr[i] * x[inner[i]];
                         }
-                        x[node] /= valPtr[outer[node + 1] - 1];
+                        x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1];
                     }
                 }
 #    pragma omp barrier
@@ -387,10 +392,11 @@ class Sptrsv {
                     do {
                         node--;
                         x[node] = b[node];
+                        double acc = 0.0;
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            x[node] -= valPtr[i] * x[inner[i]];
+                            acc += valPtr[i] * x[inner[i]];
                         }
-                        x[node] /= valPtr[outer[node]];
+                        x[node] = (x[node] - acc) / valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -406,10 +412,11 @@ class Sptrsv {
 
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
+            double acc = 0.0;
             for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
-                x[i] -= valPtr[j] * x[inner[j]];
+                acc += valPtr[j] * x[inner[j]];
             }
-            x[i] /= valPtr[outer[i + 1] - 1];
+            x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1];
         }
     }
 
@@ -423,10 +430,11 @@ class Sptrsv {
         EigenIdxType i = numberOfVertices;
         do {
             i--;
+            double acc = 0.0;
             for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
-                x[i] -= valPtr[j] * x[inner[j]];
+                acc += valPtr[j] * x[inner[j]];
             }
-            x[i] /= valPtr[outer[i]];
+            x[i] = (x[i] - acc) / valPtr[outer[i]];
         } while (i != 0);
     }
 
@@ -439,11 +447,12 @@ class Sptrsv {
                 const size_t proc = static_cast<size_t>(omp_get_thread_num());
                 const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc];
                 for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) {
+                    double acc = 0.0;
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        x[rowIdx] -= val_[i] * x[colIdx_[i]];
+                        acc += val_[i] * x[colIdx_[i]];
                     }
 
-                    x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1];
+                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
                 }
 
 #    pragma omp barrier
@@ -462,11 +471,12 @@ class Sptrsv {
                 const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc];
                 for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) {
                     x[rowIdx] = b[rowIdx];
+                    double acc = 0.0;
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        x[rowIdx] -= val_[i] * x[colIdx_[i]];
+                        acc += val_[i] * x[colIdx_[i]];
                     }
 
-                    x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1];
+                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
                 }
 
 #    pragma omp barrier
@@ -533,13 +543,14 @@ class Sptrsv {
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
                         // Initialize solution for this node
                         x[node] = b[node];
+                        double acc = 0.0;
                         // Perform lower-triangular solve for this node
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            // Subtract contributions from previously solved nodes
-                            x[node] -= vals[i] * x[inner[i]];
+                            // Accumulate contributions from previously solved nodes
+                            acc += vals[i] * x[inner[i]];
                         }
                         // Divide by diagonal element to complete solve for this node
-                        x[node] /= vals[outer[node + 1] - 1];
+                        x[node] = (x[node] - acc) / vals[outer[node + 1] - 1];
                     }
                 }
                 // Signal completion of this superstep.
@@ -575,13 +586,14 @@ class Sptrsv {
                     EigenIdxType lowerB = boundsArrayL_[step][proc][index];
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
+                        double acc = 0.0;
                         // Perform lower-triangular solve for this node
                         for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            // Subtract contributions from previously solved nodes
-                            x[node] -= vals[i] * x[inner[i]];
+                            // Accumulate contributions from previously solved nodes
+                            acc += vals[i] * x[inner[i]];
                         }
                         // Divide by diagonal element to complete solve for this node
-                        x[node] /= vals[outer[node + 1] - 1];
+                        x[node] = (x[node] - acc) / vals[outer[node + 1] - 1];
                     }
                 }
                 // Signal completion of this superstep.
@@ -622,10 +634,11 @@ class Sptrsv {
                     do {
                         node--;
                         x[node] = b[node];
+                        double acc = 0.0;
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            x[node] -= vals[i] * x[inner[i]];
+                            acc += vals[i] * x[inner[i]];
                         }
-                        x[node] /= vals[outer[node]];
+                        x[node] = (x[node] - acc) / vals[outer[node]];
                     } while (node != lowerB);
                 }
 

From 2a371ca008eb53ffa1987a0dc384ca4759ce60e5 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Mon, 23 Mar 2026 09:41:39 +0100
Subject: [PATCH 39/57] Loop Processor Permutation

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 104 +++++++++++++++++-
 tests/sptrsv.cpp                              |  14 ++-
 2 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index b992614b..4e8e2f98 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -27,12 +27,14 @@ limitations under the License.
 #    include <atomic>
 #    include <chrono>
 #    include <iostream>
+#    include <limits>
 #    include <list>
 #    include <map>
 #    include <memory>
 #    include <random>
 #    include <stdexcept>
 #    include <thread>
+#    include <type_traits>
 #    include <vector>
 
 #    include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp"
@@ -59,6 +61,9 @@ class Sptrsv {
     std::vector<UVertType> rowIdx_;
     std::vector<UVertType> colPtr_;
 
+    std::vector<std::vector<unsigned>> procStepPtr_;
+    std::vector<std::vector<unsigned>> procStepNum_;
+
     std::vector<std::vector<unsigned>> stepProcPtr_;
     std::vector<std::vector<unsigned>> stepProcNum_;
 
@@ -166,6 +171,91 @@ class Sptrsv {
         }
     }
 
+    void SetupCsrWithPermutationLoopProcessors(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<unsigned> &perm) {  // fills rowPtr_/colIdx_/val_ and procStepPtr_/procStepNum_ for the permuted matrix; perm is overwritten with the old -> new vertex index map
+        const auto *const csr = instance_->GetComputationalDag().GetCSR();
+        const EigenIdxType *const outer = csr->outerIndexPtr();
+        const EigenIdxType *const inner = csr->innerIndexPtr();
+        const double *const values = csr->valuePtr();
+
+        const SparseMatrixImp<EigenIdxType> &graph = instance_->GetComputationalDag();
+        assert(static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) <= static_cast<std::size_t>(std::numeric_limits<unsigned>::max()));  // the counters below are unsigned and run up to vertices + edges
+        const unsigned numVert = static_cast<unsigned>(graph.NumVertices());
+        numSupersteps_ = schedule.NumberOfSupersteps();
+        const unsigned numProcs = instance_->NumberOfProcessors();
+
+        perm = std::vector<unsigned>(numVert, 0U);
+
+        val_ = std::vector<double>(static_cast<size_t>(csr->nonZeros()));
+        colIdx_ = std::vector<UVertType>(static_cast<size_t>(csr->nonZeros()));
+        rowPtr_ = std::vector<UVertType>(numVert + 1U, 0U);
+
+        procStepPtr_ = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
+        procStepNum_ = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
+
+        for (const auto vert : graph.Vertices()) {
+            const unsigned whichStep = schedule.AssignedSuperstep(vert);
+            const unsigned whichProc = schedule.AssignedProcessor(vert);
+
+            perm[vert] = procStepNum_[whichProc][whichStep]++; // offset of vert inside its (proc, step) bucket; the same counter ends up as the bucket size
+        }
+
+        unsigned accNode = 0U;
+        for (unsigned step = 0U; step < numSupersteps_; ++step) {  // exclusive prefix sum over bucket sizes, superstep-major then processor
+            for (unsigned proc = 0U; proc < numProcs; ++proc) {
+                procStepPtr_[proc][step] = accNode;  // first permuted row of this bucket
+                accNode += procStepNum_[proc][step];
+            }
+        }
+
+        for (const auto vert : graph.Vertices()) {
+            perm[vert] += procStepPtr_[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)];  // bucket start + offset inside the bucket = final permuted index
+        }
+
+        std::vector<std::vector<unsigned>> entryAccumulation = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));  // running number of stored entries per bucket
+
+        for (const auto vert : graph.Vertices()) {
+            const unsigned whichStep = schedule.AssignedSuperstep(vert);
+            const unsigned whichProc = schedule.AssignedProcessor(vert);
+
+            rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep];  // row start relative to the first entry of its bucket
+            entryAccumulation[whichProc][whichStep] += static_cast<unsigned>(graph.InDegree(vert)) + 1;  // one entry per parent plus the row's own entry written last below
+        }
+
+        unsigned accEntry = 0U;
+        for (unsigned step = 0U; step < numSupersteps_; ++step) {  // turn the per-bucket entry counts into exclusive prefix sums, same bucket order as above
+            for (unsigned proc = 0U; proc < numProcs; ++proc) {
+                unsigned temp = entryAccumulation[proc][step];
+                entryAccumulation[proc][step] = accEntry;
+                accEntry += temp;
+            }
+        }
+        rowPtr_[numVert] = accEntry;  // one-past-the-end entry of the row pointer array
+        assert(static_cast<std::size_t>(accEntry) == static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) );
+
+        for (const auto vert : graph.Vertices()) {
+            rowPtr_[perm[vert]] += entryAccumulation[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)];  // make the row start absolute
+        }
+
+        for (const auto vert : graph.Vertices()) {
+            std::vector<std::pair<unsigned, unsigned>> parents;  // (permuted column, index into the original value array)
+            parents.reserve(graph.InDegree(vert));
+            for (EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) {  // every entry of the original row except its last one
+                parents.emplace_back(perm[static_cast<std::size_t>(inner[edge])], static_cast<unsigned>(edge));
+            }
+            std::sort(parents.begin(), parents.end());  // columns of the permuted row in ascending order
+
+            const unsigned permVert = perm[vert];
+            UVertType location = rowPtr_[permVert];
+            for (const auto [permPar, edgeIdx] : parents) {
+                colIdx_[location] = permPar;
+                val_[location] = values[edgeIdx];
+                ++location;
+            }
+            colIdx_[location] = permVert;  // the row's own column is stored as the last entry of the permuted row
+            val_[location] = values[outer[vert + 1] - 1];  // taken from the last entry of the original row
+        }
+    }
+
     void SetupCsrWithPermutation(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<size_t> &perm) {
         std::vector<size_t> permInv(perm.size());
         for (size_t i = 0; i < perm.size(); i++) {
@@ -443,10 +533,10 @@ class Sptrsv {
 
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
+            const size_t proc = static_cast<size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; step++) {
-                const size_t proc = static_cast<size_t>(omp_get_thread_num());
-                const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc];
-                for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) {
+                const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
+                for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
                     double acc = 0.0;
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
                         acc += val_[i] * x[colIdx_[i]];
@@ -491,12 +581,14 @@ class Sptrsv {
         }
     }
 
-    void PermuteXVector(const std::vector<size_t> &perm) {
+    template<typename IntegralType>  // element type of perm; must be an integral type
+    void PermuteXVector(const std::vector<IntegralType> &perm) {  // replaces x_[i] by the previous x_[perm[i]] for every i < perm.size()
+        static_assert(std::is_integral_v<IntegralType>);
         std::vector<double> vecPerm(perm.size());
-        for (size_t i = 0; i < perm.size(); i++) {
+        for (size_t i = 0; i < perm.size(); i++) {  // size_t index: the bound is perm.size(), whatever the element type of perm is
             vecPerm[i] = x_[perm[i]];
         }
-        for (size_t i = 0; i < perm.size(); i++) {
+        for (size_t i = 0; i < perm.size(); i++) {  // copy back only after every source value has been read
             x_[i] = vecPerm[i];
         }
     }
diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp
index 59605ae8..d11ffc77 100644
--- a/tests/sptrsv.cpp
+++ b/tests/sptrsv.cpp
@@ -227,8 +227,18 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) {
     BOOST_CHECK(CompareVectors(uXRef, uXOsp));
 
     // Lsolve in-place With PERMUTATION
-    std::vector<size_t> perm = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS);
-    sim.SetupCsrWithPermutation(scheduleCs, perm);
+    std::vector<unsigned> perm;// = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS);
+    sim.SetupCsrWithPermutationLoopProcessors(scheduleCs, perm);
+    std::vector<bool> permCheck(graph.NumVertices(), false);
+    BOOST_CHECK_EQUAL(permCheck.size(), perm.size());
+    for (const auto vert : graph.Vertices()) {
+        BOOST_CHECK(not permCheck[perm[vert]]);
+        permCheck[perm[vert]] = true;
+    }
+    for (const bool val : permCheck) {
+        BOOST_CHECK(val);
+    }
+
 
     // Comparisson with osp serial in place L solve
     // Eigen

From 57c8365f2606795bc38064ca7d25d282f6ebabfc Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Mon, 23 Mar 2026 10:35:28 +0100
Subject: [PATCH 40/57] Processor First Permutation

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 115 ++++++++++++++++++
 tests/sptrsv.cpp                              |  31 +++++
 2 files changed, 146 insertions(+)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 4e8e2f98..21d2ed29 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -64,6 +64,8 @@ class Sptrsv {
     std::vector<std::vector<unsigned>> procStepPtr_;
     std::vector<std::vector<unsigned>> procStepNum_;
 
+    std::vector<unsigned> procFirstStepPtr_;
+
     std::vector<std::vector<unsigned>> stepProcPtr_;
     std::vector<std::vector<unsigned>> stepProcNum_;
 
@@ -256,6 +258,95 @@ class Sptrsv {
         }
     }
 
+    void SetupCsrWithPermutationProcessorsFirst(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<unsigned> &perm) {  // fills rowPtr_/colIdx_/val_, procStepNum_ and the flat procFirstStepPtr_ for the permuted matrix; perm is overwritten with the old -> new vertex index map
+        const auto *const csr = instance_->GetComputationalDag().GetCSR();
+        const EigenIdxType *const outer = csr->outerIndexPtr();
+        const EigenIdxType *const inner = csr->innerIndexPtr();
+        const double *const values = csr->valuePtr();
+
+        const SparseMatrixImp<EigenIdxType> &graph = instance_->GetComputationalDag();
+        assert(static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) <= static_cast<std::size_t>(std::numeric_limits<unsigned>::max()));  // the counters below are unsigned and run up to vertices + edges
+        const unsigned numVert = static_cast<unsigned>(graph.NumVertices());
+        numSupersteps_ = schedule.NumberOfSupersteps();
+        const unsigned numProcs = instance_->NumberOfProcessors();
+
+        perm = std::vector<unsigned>(numVert, 0U);
+
+        val_ = std::vector<double>(static_cast<size_t>(csr->nonZeros()));
+        colIdx_ = std::vector<UVertType>(static_cast<size_t>(csr->nonZeros()));
+        rowPtr_ = std::vector<UVertType>(numVert + 1U, 0U);
+
+        procFirstStepPtr_ = std::vector<unsigned>(0U);
+        procFirstStepPtr_.reserve(static_cast<std::size_t>(numProcs) * numSupersteps_ + 1U);  // one start per (proc, step) bucket plus the final end marker appended below
+
+        procStepNum_ = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
+
+        for (const auto vert : graph.Vertices()) {
+            const unsigned whichStep = schedule.AssignedSuperstep(vert);
+            const unsigned whichProc = schedule.AssignedProcessor(vert);
+
+            perm[vert] = procStepNum_[whichProc][whichStep]++; // offset of vert inside its (proc, step) bucket; the same counter ends up as the bucket size
+        }
+
+        unsigned accNode = 0U;
+        for (unsigned proc = 0U; proc < numProcs; ++proc) {  // exclusive prefix sum over bucket sizes, processor-major then superstep
+            for (unsigned step = 0U; step < numSupersteps_; ++step) {
+                procFirstStepPtr_.emplace_back(accNode);  // lands at index proc * numSupersteps_ + step
+                accNode += procStepNum_[proc][step];
+            }
+        }
+        procFirstStepPtr_.emplace_back(accNode);  // end marker so that the last bucket also has an upper bound
+
+
+        for (const auto vert : graph.Vertices()) {
+            perm[vert] += procFirstStepPtr_[schedule.AssignedProcessor(vert) * numSupersteps_ + schedule.AssignedSuperstep(vert)];  // bucket start + offset inside the bucket = final permuted index
+        }
+
+        std::vector<std::vector<unsigned>> entryAccumulation = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));  // running number of stored entries per bucket
+
+        for (const auto vert : graph.Vertices()) {
+            const unsigned whichStep = schedule.AssignedSuperstep(vert);
+            const unsigned whichProc = schedule.AssignedProcessor(vert);
+
+            rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep];  // row start relative to the first entry of its bucket
+            entryAccumulation[whichProc][whichStep] += static_cast<unsigned>(graph.InDegree(vert)) + 1;  // one entry per parent plus the row's own entry written last below
+        }
+
+        unsigned accEntry = 0U;
+        for (unsigned proc = 0U; proc < numProcs; ++proc) {  // turn the per-bucket entry counts into exclusive prefix sums, same bucket order as above
+            for (unsigned step = 0U; step < numSupersteps_; ++step) {
+                unsigned temp = entryAccumulation[proc][step];
+                entryAccumulation[proc][step] = accEntry;
+                accEntry += temp;
+            }
+        }
+        rowPtr_[numVert] = accEntry;  // one-past-the-end entry of the row pointer array
+        assert(static_cast<std::size_t>(accEntry) == static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) );
+
+        for (const auto vert : graph.Vertices()) {
+            rowPtr_[perm[vert]] += entryAccumulation[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)];  // make the row start absolute
+        }
+
+        for (const auto vert : graph.Vertices()) {
+            std::vector<std::pair<unsigned, unsigned>> parents;  // (permuted column, index into the original value array)
+            parents.reserve(graph.InDegree(vert));
+            for (EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) {  // every entry of the original row except its last one
+                parents.emplace_back(perm[static_cast<std::size_t>(inner[edge])], static_cast<unsigned>(edge));
+            }
+            std::sort(parents.begin(), parents.end());  // columns of the permuted row in ascending order
+
+            const unsigned permVert = perm[vert];
+            UVertType location = rowPtr_[permVert];
+            for (const auto [permPar, edgeIdx] : parents) {
+                colIdx_[location] = permPar;
+                val_[location] = values[edgeIdx];
+                ++location;
+            }
+            colIdx_[location] = permVert;  // the row's own column is stored as the last entry of the permuted row
+            val_[location] = values[outer[vert + 1] - 1];  // taken from the last entry of the original row
+        }
+    }
+
     void SetupCsrWithPermutation(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<size_t> &perm) {
         std::vector<size_t> permInv(perm.size());
         for (size_t i = 0; i < perm.size(); i++) {
@@ -545,6 +636,30 @@ class Sptrsv {
                     x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
                 }
 
+#    pragma omp barrier
+            }
+        }
+    }
+
+    void LsolveWithProcFirstPermutationInPlace() const {  // in-place solve on the permuted CSR arrays; each OpenMP thread walks the row ranges stored for it in procFirstStepPtr_
+        double *const x = x_;
+
+#    pragma omp parallel num_threads(instance_->NumberOfProcessors())
+        {
+            const unsigned proc = static_cast<unsigned>(omp_get_thread_num());  // the thread number is used as the processor index
+            const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_);  // start of the next processor's entries, i.e. the end of this one's
+            for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); stepPtr != endStepPtr;) {  // stepPtr is advanced inside the body
+                UVertType rowIdx = *stepPtr;
+                const UVertType endRowIdx = *(++stepPtr);  // the following entry is the exclusive upper bound of this step's rows
+                for (; rowIdx != endRowIdx; ++rowIdx) {
+                    double acc = 0.0;
+                    for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {  // all entries of the row except the last
+                        acc += val_[i] * x[colIdx_[i]];
+                    }
+
+                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];  // x[rowIdx] is read as the right-hand side and overwritten; the divisor is the row's last stored entry
+                }
+
 #    pragma omp barrier
             }
         }
diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp
index d11ffc77..034fc271 100644
--- a/tests/sptrsv.cpp
+++ b/tests/sptrsv.cpp
@@ -255,6 +255,37 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) {
 
     sim.PermuteXVector(perm);
     BOOST_CHECK(CompareVectors(lXRef, lXOsp));
+
+
+
+
+    sim.SetupCsrWithPermutationProcessorsFirst(scheduleCs, perm);
+    permCheck = std::vector<bool>(graph.NumVertices(), false);
+    BOOST_CHECK_EQUAL(permCheck.size(), perm.size());
+    for (const auto vert : graph.Vertices()) {
+        BOOST_CHECK(not permCheck[perm[vert]]);
+        permCheck[perm[vert]] = true;
+    }
+    for (const bool val : permCheck) {
+        BOOST_CHECK(val);
+    }
+
+
+    // Comparison with osp serial in place L solve
+    // Eigen
+    lBRef.setConstant(0.1);
+    lXRef.setConstant(0.1);
+    lXRef = lView.solve(lBRef);
+    // OSP
+    lXOsp.setConstant(0.1);
+    lBOsp.setZero();    // this will not be used as x will take the values that already has instead of the b values
+    sim.x_ = &lXOsp[0];
+    sim.b_ = &lBOsp[0];
+    // sim.permute_x_vector(perm);
+    sim.LsolveWithProcFirstPermutationInPlace();
+
+    sim.PermuteXVector(perm);
+    BOOST_CHECK(CompareVectors(lXRef, lXOsp));
 }
 
 #endif

From 8bd04d2a33363cb452dea6c73d11eb8d81b3cc6a Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Mon, 23 Mar 2026 11:28:35 +0100
Subject: [PATCH 41/57] Loop Processor SSP kernel

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 38 +++++++++++++++-
 tests/sptrsv.cpp                              | 44 +++++++++++++++++++
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 21d2ed29..656c8fe7 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -673,8 +673,8 @@ class Sptrsv {
         {
             for (unsigned step = 0; step < numSupersteps_; step++) {
                 const size_t proc = static_cast<size_t>(omp_get_thread_num());
-                const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc];
-                for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) {
+                const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
+                for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
                     x[rowIdx] = b[rowIdx];
                     double acc = 0.0;
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
@@ -689,6 +689,40 @@ class Sptrsv {
         }
     }
 
+    template <unsigned staleness = 2U>
+    void SspLsolveStalenessInPlaceWithPermutation() const {
+        const unsigned nthreads = instance_->NumberOfProcessors();
+        FlatCheckpointCounterBarrier barrier(nthreads);
+
+        // In-place solve on the permuted CSR arrays (rowPtr_, colIdx_, val_): the value already in
+        // x_[row] is used as the right-hand side and is overwritten with the solution entry.
+        // Each thread walks its (proc, step) row ranges given by procStepPtr_/procStepNum_.
+        // Only the permuted arrays are read; the CSR of the DAG is not needed in this kernel.
+        double *const x = x_;
+
+#    pragma omp parallel num_threads(nthreads)
+        {
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            for (unsigned step = 0; step < numSupersteps_; ++step) {
+                if (procStepNum_[proc][step] > 0U) {
+                    barrier.Wait(proc, staleness - 1U);  // only steps that own rows wait; presumably bounds the lag to the other threads - confirm against FlatCheckpointCounterBarrier
+                }
+
+                const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
+                for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
+                    double acc = 0.0;
+                    for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
+                        acc += val_[i] * x[colIdx_[i]];
+                    }
+
+                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
+                }
+                // Signal completion of this superstep.
+                barrier.Arrive(proc);
+            }
+        }
+    }
+
     void ResetX() {
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; i++) {
diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp
index 034fc271..e85c7a9c 100644
--- a/tests/sptrsv.cpp
+++ b/tests/sptrsv.cpp
@@ -37,6 +37,8 @@ limitations under the License.
 #    include "osp/graph_algorithms/directed_graph_util.hpp"
 #    include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
 
+#    include "osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp"
+
 using namespace osp;
 
 bool CompareVectors(Eigen::VectorXd &v1, Eigen::VectorXd &v2) {
@@ -286,6 +288,48 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) {
 
     sim.PermuteXVector(perm);
     BOOST_CHECK(CompareVectors(lXRef, lXOsp));
+
+
+
+
+
+    GrowLocalSSP<SparseMatrixImp<int32_t>, 2U> schedulerSSP;
+    MaxBspSchedule<SparseMatrixImp<int32_t>> scheduleSSP(instance);
+
+    schedulerSSP.ComputeSchedule(scheduleSSP);
+
+    sim.SetupCsrWithPermutationLoopProcessors(scheduleSSP, perm);
+    permCheck = std::vector<bool>(graph.NumVertices(), false);
+    BOOST_CHECK_EQUAL(permCheck.size(), perm.size());
+    for (const auto vert : graph.Vertices()) {
+        BOOST_CHECK(not permCheck[perm[vert]]);
+        permCheck[perm[vert]] = true;
+    }
+    for (const bool val : permCheck) {
+        BOOST_CHECK(val);
+    }
+
+
+    // Comparison with osp serial in place L solve
+    // Eigen
+    lBRef.setConstant(0.1);
+    lXRef.setConstant(0.1);
+    lXRef = lView.solve(lBRef);
+    // OSP
+    lXOsp.setConstant(0.1);
+    lBOsp.setZero();    // this will not be used as x will take the values that already has instead of the b values
+    sim.x_ = &lXOsp[0];
+    sim.b_ = &lBOsp[0];
+    // sim.permute_x_vector(perm);
+    sim.SspLsolveStalenessInPlaceWithPermutation<2U>();
+
+    sim.PermuteXVector(perm);
+    BOOST_CHECK(CompareVectors(lXRef, lXOsp));
+
+
+
+
+
 }
 
 #endif

From c7eef33c9db2103b524cfa8e5ef9289df3222e1e Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Mon, 23 Mar 2026 11:35:39 +0100
Subject: [PATCH 42/57] SSP Proc first SpTrSV kernels and tests

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 39 ++++++++++++++++++-
 tests/sptrsv.cpp                              | 28 ++++++++++++-
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 656c8fe7..d7deb63b 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -690,7 +690,7 @@ class Sptrsv {
     }
 
     template <unsigned staleness = 2U>
-    void SspLsolveStalenessInPlaceWithPermutation() const {
+    void SspLsolveStalenessWithPermutationInPlace() const {
         const unsigned nthreads = instance_->NumberOfProcessors();
         FlatCheckpointCounterBarrier barrier(nthreads);
 
@@ -723,6 +723,43 @@ class Sptrsv {
         }
     }
 
+    template <unsigned staleness = 2U>
+    void SspLsolveStalenessWithProcFirstPermutationInPlace() const {
+        const unsigned nthreads = instance_->NumberOfProcessors();
+        FlatCheckpointCounterBarrier barrier(nthreads);
+
+        // In-place solve on the permuted CSR arrays (rowPtr_, colIdx_, val_): the value already in
+        // x_[row] is used as the right-hand side and is overwritten with the solution entry.
+        // Each thread walks the consecutive row ranges stored for it in the flat procFirstStepPtr_.
+        // Only the permuted arrays are read; the CSR of the DAG is not needed in this kernel.
+        double *const x = x_;
+
+#    pragma omp parallel num_threads(nthreads)
+        {
+            const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
+            const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_);
+            for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); stepPtr != endStepPtr;) {
+                UVertType rowIdx = *stepPtr;
+                const UVertType endRowIdx = *(++stepPtr);  // the following entry is the exclusive upper bound of this step's rows
+
+                if (rowIdx != endRowIdx) {
+                    barrier.Wait(proc, staleness - 1U);  // only steps that own rows wait; presumably bounds the lag to the other threads - confirm against FlatCheckpointCounterBarrier
+                }
+
+                for (; rowIdx != endRowIdx; ++rowIdx) {
+                    double acc = 0.0;
+                    for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
+                        acc += val_[i] * x[colIdx_[i]];
+                    }
+
+                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
+                }
+                // Signal completion of this superstep.
+                barrier.Arrive(proc);
+            }
+        }
+    }
+
     void ResetX() {
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
         for (EigenIdxType i = 0; i < numberOfVertices; i++) {
diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp
index e85c7a9c..355a36d7 100644
--- a/tests/sptrsv.cpp
+++ b/tests/sptrsv.cpp
@@ -321,7 +321,7 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) {
     sim.x_ = &lXOsp[0];
     sim.b_ = &lBOsp[0];
     // sim.permute_x_vector(perm);
-    sim.SspLsolveStalenessInPlaceWithPermutation<2U>();
+    sim.SspLsolveStalenessWithPermutationInPlace<2U>();
 
     sim.PermuteXVector(perm);
     BOOST_CHECK(CompareVectors(lXRef, lXOsp));
@@ -329,7 +329,33 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) {
 
 
 
+    sim.SetupCsrWithPermutationProcessorsFirst(scheduleSSP, perm);
+    permCheck = std::vector<bool>(graph.NumVertices(), false);
+    BOOST_CHECK_EQUAL(permCheck.size(), perm.size());
+    for (const auto vert : graph.Vertices()) {
+        BOOST_CHECK(not permCheck[perm[vert]]);
+        permCheck[perm[vert]] = true;
+    }
+    for (const bool val : permCheck) {
+        BOOST_CHECK(val);
+    }
+
+
+    // Comparison with osp serial in place L solve
+    // Eigen
+    lBRef.setConstant(0.1);
+    lXRef.setConstant(0.1);
+    lXRef = lView.solve(lBRef);
+    // OSP
+    lXOsp.setConstant(0.1);
+    lBOsp.setZero();    // this will not be used as x will take the values that already has instead of the b values
+    sim.x_ = &lXOsp[0];
+    sim.b_ = &lBOsp[0];
+    // sim.permute_x_vector(perm);
+    sim.SspLsolveStalenessWithProcFirstPermutationInPlace<2U>();
 
+    sim.PermuteXVector(perm);
+    BOOST_CHECK(CompareVectors(lXRef, lXOsp));
 }
 
 #endif

From d0a974cb4f45e23de9388b2774f8266ee5b90381 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Mon, 23 Mar 2026 14:33:23 +0100
Subject: [PATCH 43/57] update ssp bench with permutation

---
 apps/maxbsp_ssp_sptrsv.cpp | 185 +++++++++++++++++++++++++++++++++++--
 1 file changed, 178 insertions(+), 7 deletions(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 4ad3565e..70b99822 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -51,7 +51,10 @@ constexpr int preMeasureIterations = 2;
 enum class Algorithm {
     VarianceSsp,
     GrowLocalSsp,
+    GrowLocalSspPermSteps,
+    GrowLocalSspPermProcs,
     GrowLocal,
+    GrowLocalPermSteps,
     Serial
 };
 
@@ -135,11 +138,13 @@ double LInftyNormalisedDiff(const std::vector<double> &v, const std::vector<doub
 void PrintUsage(const char *prog) {
     std::cout << "Usage:\n"
               << "  " << prog << " --input <file_or_directory> [--output <csv>] [--iterations <n>] [--processors <p>]\n"
-              << "      [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n"
+              << "      [--variance-ssp] [--growlocal-ssp] [--growlocal-ssp-perm-step] [--growlocal-ssp-perm-proc] [--growlocal] "
+                 "[--growlocal-perm-step] [--eigen-serial] [--all]\n\n"
               << "Examples:\n"
               << "  " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n"
               << "  " << prog
-              << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n";
+              << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp "
+                 "--growlocal\n";
 }
 
 bool ParseArgs(int argc, char *argv[], Args &args) {
@@ -150,8 +155,7 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
     for (int i = 1; i < argc; ++i) {
         const std::string flag = argv[i];
 
-        const bool needsValue
-            = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors");
+        const bool needsValue = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors");
         if (needsValue && i + 1 >= argc) {
             std::cerr << "Missing value for " << flag << "\n";
             return false;
@@ -169,12 +173,24 @@ bool ParseArgs(int argc, char *argv[], Args &args) {
             args.algorithms.insert(Algorithm::VarianceSsp);
         } else if (flag == "--growlocal-ssp") {
             args.algorithms.insert(Algorithm::GrowLocalSsp);
+        } else if (flag == "--growlocal-ssp-perm-step") {
+            args.algorithms.insert(Algorithm::GrowLocalSspPermSteps);
+        } else if (flag == "--growlocal-ssp-perm-proc") {
+            args.algorithms.insert(Algorithm::GrowLocalSspPermProcs);
         } else if (flag == "--growlocal") {
             args.algorithms.insert(Algorithm::GrowLocal);
+        } else if (flag == "--growlocal-perm-step") {
+            args.algorithms.insert(Algorithm::GrowLocalPermSteps);
         } else if (flag == "--eigen-serial") {
             args.algorithms.insert(Algorithm::Serial);
         } else if (flag == "--all") {
-            args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, Algorithm::GrowLocal, Algorithm::Serial};
+            args.algorithms = {Algorithm::VarianceSsp,
+                               Algorithm::GrowLocalSsp,
+                               Algorithm::GrowLocalSspPermProcs,
+                               Algorithm::GrowLocalSspPermSteps,
+                               Algorithm::GrowLocal,
+                               Algorithm::GrowLocalPermSteps,
+                               Algorithm::Serial};
         } else if (flag == "--help" || flag == "-h") {
             PrintUsage(argv[0]);
             return false;
@@ -241,7 +257,8 @@ std::vector<std::filesystem::path> CollectInputGraphs(const std::string &inputPa
 }
 
 void EnsureCsvHeader(std::ofstream &csv) {
-    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness,RuntimeSeconds,Correctness\n";
+    csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness,RuntimeSeconds,"
+           "Correctness\n";
 }
 
 void EnsureSummaryCsvHeader(std::ofstream &csv) {
@@ -251,7 +268,8 @@ void EnsureSummaryCsvHeader(std::ofstream &csv) {
 
 void WriteCsvRow(std::ofstream &csv, const CsvRow &row) {
     csv << CsvEscape(row.graph) << "," << row.algorithm << "," << row.processors << "," << row.scheduleTimeSeconds << ","
-    << row.supersteps << "," << row.SyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "," << row.correctness << "\n";
+        << row.supersteps << "," << row.SyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "," << row.correctness
+        << "\n";
 }
 
 std::string BuildSummaryCsvPath(const std::string &detailPath) {
@@ -468,6 +486,108 @@ int main(int argc, char *argv[]) {
             }
         }
 
+        if (args.algorithms.count(Algorithm::GrowLocalSspPermSteps) > 0U) {
+            GrowLocalSSP<SparseMatrixImp<int32_t>, kDefaultStaleness> scheduler;
+            MaxBspSchedule<SparseMatrixImp<int32_t>> schedule(instance);
+
+            const auto t0 = std::chrono::high_resolution_clock::now();
+            scheduler.ComputeSchedule(schedule);
+            const auto t1 = std::chrono::high_resolution_clock::now();
+            const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
+
+            std::vector<unsigned> perm;
+            sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm);
+            const unsigned supersteps = schedule.NumberOfSupersteps();
+            const int syncCosts = ComputeSyncCosts(instance);
+
+            bool correct = false;
+            std::vector<double> x(n, 1.0);
+            sptrsv.x_ = x.data();
+            for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
+                resetOnes(x);
+
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.SspLsolveStalenessWithPermutationInPlace<kDefaultStaleness>();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
+
+                if (iter == 0) {
+                    sptrsv.PermuteXVector(perm);
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    correct = (diff < EPSILON);
+                    std::cout << "  Growlocal_SSP_Perm_Step first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                if (iter >= preMeasureIterations) {
+                    bufferedRows.emplace_back(CsvRow{graphName,
+                                                     "Growlocal_SSP_Perm_Step",
+                                                     args.processors,
+                                                     scheduleTime,
+                                                     supersteps,
+                                                     syncCosts,
+                                                     kDefaultStaleness,
+                                                     runtime,
+                                                     correct});
+                }
+            }
+
+            for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
+                WriteCsvRow(csv, *it);
+                ++writtenEntries;
+            }
+        }
+
+        if (args.algorithms.count(Algorithm::GrowLocalSspPermProcs) > 0U) {
+            GrowLocalSSP<SparseMatrixImp<int32_t>, kDefaultStaleness> scheduler;
+            MaxBspSchedule<SparseMatrixImp<int32_t>> schedule(instance);
+
+            const auto t0 = std::chrono::high_resolution_clock::now();
+            scheduler.ComputeSchedule(schedule);
+            const auto t1 = std::chrono::high_resolution_clock::now();
+            const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
+
+            std::vector<unsigned> perm;
+            sptrsv.SetupCsrWithPermutationProcessorsFirst(schedule, perm);
+            const unsigned supersteps = schedule.NumberOfSupersteps();
+            const int syncCosts = ComputeSyncCosts(instance);
+
+            bool correct = false;
+            std::vector<double> x(n, 1.0);
+            sptrsv.x_ = x.data();
+            for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
+                resetOnes(x);
+
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.SspLsolveStalenessWithProcFirstPermutationInPlace<kDefaultStaleness>();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
+
+                if (iter == 0) {
+                    sptrsv.PermuteXVector(perm);
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    correct = (diff < EPSILON);
+                    std::cout << "  Growlocal_SSP_Perm_Proc first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                if (iter >= preMeasureIterations) {
+                    bufferedRows.emplace_back(CsvRow{graphName,
+                                                     "Growlocal_SSP_Perm_Proc",
+                                                     args.processors,
+                                                     scheduleTime,
+                                                     supersteps,
+                                                     syncCosts,
+                                                     kDefaultStaleness,
+                                                     runtime,
+                                                     correct});
+                }
+            }
+
+            for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
+                WriteCsvRow(csv, *it);
+                ++writtenEntries;
+            }
+        }
+
         if (args.algorithms.count(Algorithm::GrowLocal) > 0U) {
             GrowLocalAutoCores<SparseMatrixImp<int32_t>> scheduler;
             BspSchedule<SparseMatrixImp<int32_t>> schedule(instance);
@@ -517,6 +637,57 @@ int main(int argc, char *argv[]) {
             }
         }
 
+        if (args.algorithms.count(Algorithm::GrowLocalPermSteps) > 0U) {
+            GrowLocalAutoCores<SparseMatrixImp<int32_t>> scheduler;
+            BspSchedule<SparseMatrixImp<int32_t>> schedule(instance);
+
+            const auto t0 = std::chrono::high_resolution_clock::now();
+            scheduler.ComputeSchedule(schedule);
+            const auto t1 = std::chrono::high_resolution_clock::now();
+            const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
+
+            std::vector<unsigned> perm;
+            sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm);
+            const unsigned supersteps = schedule.NumberOfSupersteps();
+            const int syncCosts = ComputeSyncCosts(instance);
+
+            bool correct = false;  // overwritten by the first-run check against the serial reference
+            std::vector<double> x(n, 1.0);
+            sptrsv.x_ = x.data();
+            for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) {
+                resetOnes(x);
+
+                const auto s = std::chrono::high_resolution_clock::now();
+                sptrsv.LsolveWithPermutationInPlace();
+                const auto e = std::chrono::high_resolution_clock::now();
+                const double runtime = std::chrono::duration<double>(e - s).count();
+
+                if (iter == 0) {
+                    sptrsv.PermuteXVector(perm);
+                    const double diff = LInftyNormalisedDiff(x, serialRefX);
+                    correct = (diff < EPSILON);
+                    std::cout << "  Growlocal_Perm_Step first-run max relative diff vs serial: " << diff << std::endl;
+                }
+
+                if (iter >= preMeasureIterations) {
+                    bufferedRows.emplace_back(CsvRow{graphName,
+                                                     "Growlocal_Perm_Step",
+                                                     args.processors,
+                                                     scheduleTime,
+                                                     supersteps,
+                                                     syncCosts,
+                                                     1U,
+                                                     runtime,
+                                                     correct});
+                }
+            }
+
+            for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) {
+                WriteCsvRow(csv, *it);
+                ++writtenEntries;
+            }
+        }
+
         if (args.algorithms.count(Algorithm::Serial) > 0U) {
             std::vector<double> x(n, 1.0);
             sptrsv.x_ = x.data();

From f167feb7bc568eb765cd523711f0df407b6e2eb7 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Mon, 23 Mar 2026 15:15:46 +0100
Subject: [PATCH 44/57] Add missing references to structured bindings in Sptrsv

---
 include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index d7deb63b..b221918f 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -248,7 +248,7 @@ class Sptrsv {
 
             const unsigned permVert = perm[vert];
             UVertType location = rowPtr_[permVert];
-            for (const auto [permPar, edgeIdx] : parents) {
+            for (const auto &[permPar, edgeIdx] : parents) {
                 colIdx_[location] = permPar;
                 val_[location] = values[edgeIdx];
                 ++location;
@@ -337,7 +337,7 @@ class Sptrsv {
 
             const unsigned permVert = perm[vert];
             UVertType location = rowPtr_[permVert];
-            for (const auto [permPar, edgeIdx] : parents) {
+            for (const auto &[permPar, edgeIdx] : parents) {
                 colIdx_[location] = permPar;
                 val_[location] = values[edgeIdx];
                 ++location;

From 2298bb789b5f8eee55962087b2a5a9386b20c2df Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 24 Mar 2026 14:50:27 +0100
Subject: [PATCH 45/57] Sparse kernels progress

---
 apps/maxbsp_ssp_sptrsv.cpp                    |   6 +-
 .../StatsModules/BspSptrsvStatsModule.hpp     |   4 +-
 .../StringToScheduler/get_coarser.hpp         |   2 +-
 .../sptrsv_simulator/ScheduleNodePermuter.hpp |  16 +-
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 227 ++++++------------
 .../sptrsv_simulator/sptrsv_kernels.hpp       | 160 ++++++++++++
 include/osp/bsp/model/BspSchedule.hpp         |   4 +-
 .../GreedySchedulers/GrowLocalMaxBsp.hpp      |   2 +-
 .../GreedySchedulers/RandomGreedy.hpp         |   2 +-
 include/osp/coarser/Sarkar/SarkarMul.hpp      |   4 +-
 include/osp/coarser/SquashA/SquashAMul.hpp    |   2 +-
 include/osp/coarser/coarser_util.hpp          |   4 +-
 .../coarser/top_order/top_order_coarser.hpp   |  10 +-
 .../osp/concepts/directed_graph_concept.hpp   |  12 +-
 .../osp/graph_algorithms/cuthill_mckee.hpp    |   2 +-
 .../directed_graph_path_util.hpp              |   2 +-
 .../eigen_sparse_iterator.hpp                 |  17 +-
 .../eigen_matrix_adapter/sparse_matrix.hpp    |  13 +-
 tests/sparse_matrix_impl.cpp                  |  12 +-
 tests/sptrsv.cpp                              |   2 +-
 20 files changed, 302 insertions(+), 201 deletions(-)
 create mode 100644 include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index 70b99822..a15e24f5 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -495,7 +495,7 @@ int main(int argc, char *argv[]) {
             const auto t1 = std::chrono::high_resolution_clock::now();
             const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
 
-            std::vector<unsigned> perm;
+            std::vector<SparseMatrixImp<int32_t>::VertexIdx> perm;
             sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm);
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
@@ -546,7 +546,7 @@ int main(int argc, char *argv[]) {
             const auto t1 = std::chrono::high_resolution_clock::now();
             const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
 
-            std::vector<unsigned> perm;
+            std::vector<SparseMatrixImp<int32_t>::VertexIdx> perm;
             sptrsv.SetupCsrWithPermutationProcessorsFirst(schedule, perm);
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
@@ -646,7 +646,7 @@ int main(int argc, char *argv[]) {
             const auto t1 = std::chrono::high_resolution_clock::now();
             const double scheduleTime = std::chrono::duration<double>(t1 - t0).count();
 
-            std::vector<unsigned> perm;
+            std::vector<SparseMatrixImp<int32_t>::VertexIdx> perm;
             sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm);
             const unsigned supersteps = schedule.NumberOfSupersteps();
             const int syncCosts = ComputeSyncCosts(instance);
diff --git a/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp b/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp
index 969bc114..79bf53ae 100644
--- a/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp
+++ b/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp
@@ -93,11 +93,13 @@ class BspSptrsvStatsModule : public IStatisticModule<TargetObjectType> {
                       || std::is_same_v<TargetObjectType, osp::BspSchedule<osp::SparseMatrixImp<int64_t>>>) {
             using IndexT
                 = std::conditional_t<std::is_same_v<TargetObjectType, osp::BspSchedule<osp::SparseMatrixImp<int32_t>>>, int32_t, int64_t>;
+            using UndexT
+                = std::conditional_t<std::is_same_v<TargetObjectType, osp::BspSchedule<osp::SparseMatrixImp<int32_t>>>, uint32_t, uint64_t>;
 
             auto instance = schedule.GetInstance();
             Sptrsv<IndexT> sim{instance};
 
-            std::vector<size_t> perm;
+            std::vector<UndexT> perm;
 
             if (mode_ == NO_PERMUTE) {
                 sim.SetupCsrNoPermutation(schedule);
diff --git a/apps/test_suite_runner/StringToScheduler/get_coarser.hpp b/apps/test_suite_runner/StringToScheduler/get_coarser.hpp
index e0455870..9d60b8d6 100644
--- a/apps/test_suite_runner/StringToScheduler/get_coarser.hpp
+++ b/apps/test_suite_runner/StringToScheduler/get_coarser.hpp
@@ -88,7 +88,7 @@ std::unique_ptr<Coarser<GraphTIn, GraphTOut>> GetCoarserByName(const ConfigParse
                                                    .value_or(std::numeric_limits<VMemwT<GraphTIn>>::max()));
                 coarserPtr->SetCommunicationThreshold(paramsPt.get_optional<VCommwT<GraphTIn>>("communication_threshold")
                                                           .value_or(std::numeric_limits<VCommwT<GraphTIn>>::max()));
-                coarserPtr->SetSuperNodeSizeThreshold(paramsPt.get_optional<std::size_t>("super_node_size_threshold").value_or(10));
+                coarserPtr->SetSuperNodeSizeThreshold(paramsPt.get_optional<VertexIdxT<GraphTIn>>("super_node_size_threshold").value_or(10));
                 coarserPtr->SetNodeDistThreshold(paramsPt.get_optional<unsigned>("node_dist_threshold").value_or(10));
             }
         };
diff --git a/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp b/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp
index 3378c5b9..b5f8e577 100644
--- a/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp
@@ -34,25 +34,25 @@ enum ScheduleNodePermutationModes { LOOP_PROCESSORS, SNAKE_PROCESSORS, PROCESSOR
  *
  * @param sched BSP Schedule
  * @param mode ordering of processors
- * @return std::vector<size_t> vec[prev_node_name] = new_node_name(location)
+ * @return std::vector<VertexIdxT<GraphT>> vec[prev_node_name] = new_node_name(location)
  */
 template <typename GraphT>
-std::vector<size_t> ScheduleNodePermuterBasic(const BspSchedule<GraphT> &sched,
-                                              const ScheduleNodePermutationModes mode = LOOP_PROCESSORS) {
+std::vector<VertexIdxT<GraphT>> ScheduleNodePermuterBasic(const BspSchedule<GraphT> &sched,
+                                                          const ScheduleNodePermutationModes mode = LOOP_PROCESSORS) {
     // superstep, processor, nodes
-    std::vector<std::vector<std::vector<size_t>>> allocation(
+    std::vector<std::vector<std::vector<VertexIdxT<GraphT>>>> allocation(
         sched.NumberOfSupersteps(),
-        std::vector<std::vector<size_t>>(sched.GetInstance().NumberOfProcessors(), std::vector<size_t>({})));
-    for (size_t node = 0; node < sched.GetInstance().NumberOfVertices(); node++) {
+        std::vector<std::vector<VertexIdxT<GraphT>>>(sched.GetInstance().NumberOfProcessors(), std::vector<VertexIdxT<GraphT>>({})));
+    for (VertexIdxT<GraphT> node = 0; node < sched.GetInstance().NumberOfVertices(); node++) {
         allocation[sched.AssignedSuperstep(node)][sched.AssignedProcessor(node)].emplace_back(node);
     }
 
     // reordering and allocating into permutation
-    std::vector<size_t> permutation(sched.GetInstance().NumberOfVertices());
+    std::vector<VertexIdxT<GraphT>> permutation(sched.GetInstance().NumberOfVertices());
 
     if (mode == LOOP_PROCESSORS || mode == SNAKE_PROCESSORS) {
         bool forward = true;
-        size_t counter = 0;
+        VertexIdxT<GraphT> counter = 0;
         for (auto stepIt = allocation.begin(); stepIt != allocation.cend(); stepIt++) {
             if (forward) {
                 for (auto procIt = stepIt->begin(); procIt != stepIt->cend(); procIt++) {
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index b221918f..6de9a89e 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -41,6 +41,7 @@ limitations under the License.
 #    include "osp/bsp/model/BspInstance.hpp"
 #    include "osp/bsp/model/BspSchedule.hpp"
 #    include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
+#    include "osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp"
 
 namespace osp {
 
@@ -61,13 +62,13 @@ class Sptrsv {
     std::vector<UVertType> rowIdx_;
     std::vector<UVertType> colPtr_;
 
-    std::vector<std::vector<unsigned>> procStepPtr_;
-    std::vector<std::vector<unsigned>> procStepNum_;
+    std::vector<std::vector<UVertType>> procStepPtr_;
+    std::vector<std::vector<UVertType>> procStepNum_;
 
-    std::vector<unsigned> procFirstStepPtr_;
+    std::vector<UVertType> procFirstStepPtr_;
 
-    std::vector<std::vector<unsigned>> stepProcPtr_;
-    std::vector<std::vector<unsigned>> stepProcNum_;
+    std::vector<std::vector<UVertType>> stepProcPtr_;
+    std::vector<std::vector<UVertType>> stepProcNum_;
 
     double *x_;
     const double *b_;
@@ -98,14 +99,14 @@ class Sptrsv {
             schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
 
         numSupersteps_ = schedule.NumberOfSupersteps();
-        size_t numberOfVertices = instance_->GetComputationalDag().NumVertices();
+        UVertType numberOfVertices = instance_->GetComputationalDag().NumVertices();
 
 #    pragma omp parallel num_threads(2)
         {
             int id = omp_get_thread_num();
             switch (id) {
                 case 0: {
-                    for (size_t node = 0; node < numberOfVertices; ++node) {
+                    for (UVertType node = 0; node < numberOfVertices; ++node) {
                         vectorStepProcessorVertices_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back(
                             static_cast<EigenIdxType>(node));
                     }
@@ -116,7 +117,7 @@ class Sptrsv {
                                 EigenIdxType start = vectorStepProcessorVertices_[step][proc][0];
                                 EigenIdxType prev = vectorStepProcessorVertices_[step][proc][0];
 
-                                for (size_t i = 1; i < vectorStepProcessorVertices_[step][proc].size(); ++i) {
+                                for (UVertType i = 1; i < vectorStepProcessorVertices_[step][proc].size(); ++i) {
                                     if (vectorStepProcessorVertices_[step][proc][i] != prev + 1) {
                                         boundsArrayL_[step][proc].push_back(start);
                                         boundsArrayL_[step][proc].push_back(prev);
@@ -134,7 +135,7 @@ class Sptrsv {
                     break;
                 }
                 case 1: {
-                    size_t node = numberOfVertices;
+                    UVertType node = numberOfVertices;
                     do {
                         node--;
                         vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back(
@@ -149,7 +150,7 @@ class Sptrsv {
                                 EigenIdxType startU = static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][0]);
                                 EigenIdxType prevU = static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][0]);
 
-                                for (size_t i = 1; i < vectorStepProcessorVerticesU_[step][proc].size(); ++i) {
+                                for (UVertType i = 1; i < vectorStepProcessorVerticesU_[step][proc].size(); ++i) {
                                     if (static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][i]) != prevU - 1) {
                                         boundsArrayU_[step][proc].push_back(startU);
                                         boundsArrayU_[step][proc].push_back(prevU);
@@ -173,26 +174,26 @@ class Sptrsv {
         }
     }
 
-    void SetupCsrWithPermutationLoopProcessors(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<unsigned> &perm) {
+    void SetupCsrWithPermutationLoopProcessors(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<UVertType> &perm) {
         const auto *const csr = instance_->GetComputationalDag().GetCSR();
         const EigenIdxType *const outer = csr->outerIndexPtr();
         const EigenIdxType *const inner = csr->innerIndexPtr();
         const double *const values = csr->valuePtr();
 
         const SparseMatrixImp<EigenIdxType> &graph = instance_->GetComputationalDag();
-        assert(static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) <= static_cast<std::size_t>(std::numeric_limits<unsigned>::max()));
-        const unsigned numVert = static_cast<unsigned>(graph.NumVertices());
+        assert(static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) <= static_cast<std::size_t>(std::numeric_limits<UVertType>::max()));
+        const UVertType numVert = static_cast<UVertType>(graph.NumVertices());
         numSupersteps_ = schedule.NumberOfSupersteps();
         const unsigned numProcs = instance_->NumberOfProcessors();
 
-        perm = std::vector<unsigned>(numVert, 0U);
+        perm = std::vector<UVertType>(numVert, 0U);
 
-        val_ = std::vector<double>(static_cast<size_t>(csr->nonZeros()));
-        colIdx_ = std::vector<UVertType>(static_cast<size_t>(csr->nonZeros()));
+        val_ = std::vector<double>(static_cast<std::size_t>(csr->nonZeros()));
+        colIdx_ = std::vector<UVertType>(static_cast<std::size_t>(csr->nonZeros()));
         rowPtr_ = std::vector<UVertType>(numVert + 1U, 0U);
 
-        procStepPtr_ = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
-        procStepNum_ = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
+        procStepPtr_ = std::vector<std::vector<UVertType>>(numProcs, std::vector<UVertType>(numSupersteps_, 0U));
+        procStepNum_ = std::vector<std::vector<UVertType>>(numProcs, std::vector<UVertType>(numSupersteps_, 0U));
 
         for (const auto vert : graph.Vertices()) {
             const unsigned whichStep = schedule.AssignedSuperstep(vert);
@@ -201,7 +202,7 @@ class Sptrsv {
             perm[vert] = procStepNum_[whichProc][whichStep]++; // offsets
         }
 
-        unsigned accNode = 0U;
+        UVertType accNode = 0U;
         for (unsigned step = 0U; step < numSupersteps_; ++step) {
             for (unsigned proc = 0U; proc < numProcs; ++proc) {
                 procStepPtr_[proc][step] = accNode;
@@ -213,20 +214,20 @@ class Sptrsv {
             perm[vert] += procStepPtr_[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)];
         }
 
-        std::vector<std::vector<unsigned>> entryAccumulation = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
+        std::vector<std::vector<UVertType>> entryAccumulation = std::vector<std::vector<UVertType>>(numProcs, std::vector<UVertType>(numSupersteps_, 0U));
 
         for (const auto vert : graph.Vertices()) {
             const unsigned whichStep = schedule.AssignedSuperstep(vert);
             const unsigned whichProc = schedule.AssignedProcessor(vert);
 
             rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep];
-            entryAccumulation[whichProc][whichStep] += static_cast<unsigned>(graph.InDegree(vert)) + 1;
+            entryAccumulation[whichProc][whichStep] += static_cast<UVertType>(graph.InDegree(vert)) + 1;
         }
 
-        unsigned accEntry = 0U;
+        UVertType accEntry = 0U;
         for (unsigned step = 0U; step < numSupersteps_; ++step) {
             for (unsigned proc = 0U; proc < numProcs; ++proc) {
-                unsigned temp = entryAccumulation[proc][step];
+                UVertType temp = entryAccumulation[proc][step];
                 entryAccumulation[proc][step] = accEntry;
                 accEntry += temp;
             }
@@ -239,14 +240,14 @@ class Sptrsv {
         }
 
         for (const auto vert : graph.Vertices()) {
-            std::vector<std::pair<unsigned, unsigned>> parents;
+            std::vector<std::pair<UVertType, UVertType>> parents;
             parents.reserve(graph.InDegree(vert));
             for (EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) {
-                parents.emplace_back(perm[static_cast<std::size_t>(inner[edge])], static_cast<unsigned>(edge));
+                parents.emplace_back(perm[static_cast<UVertType>(inner[edge])], static_cast<UVertType>(edge));
             }
             std::sort(parents.begin(), parents.end());
 
-            const unsigned permVert = perm[vert];
+            const UVertType permVert = perm[vert];
             UVertType location = rowPtr_[permVert];
             for (const auto &[permPar, edgeIdx] : parents) {
                 colIdx_[location] = permPar;
@@ -258,28 +259,28 @@ class Sptrsv {
         }
     }
 
-    void SetupCsrWithPermutationProcessorsFirst(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<unsigned> &perm) {
+    void SetupCsrWithPermutationProcessorsFirst(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<UVertType> &perm) {
         const auto *const csr = instance_->GetComputationalDag().GetCSR();
         const EigenIdxType *const outer = csr->outerIndexPtr();
         const EigenIdxType *const inner = csr->innerIndexPtr();
         const double *const values = csr->valuePtr();
 
         const SparseMatrixImp<EigenIdxType> &graph = instance_->GetComputationalDag();
-        assert(static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) <= static_cast<std::size_t>(std::numeric_limits<unsigned>::max()));
-        const unsigned numVert = static_cast<unsigned>(graph.NumVertices());
+        assert(static_cast<std::size_t>(graph.NumVertices()) + static_cast<std::size_t>(graph.NumEdges()) <= static_cast<std::size_t>(std::numeric_limits<UVertType>::max()));
+        const UVertType numVert = static_cast<UVertType>(graph.NumVertices());  // same width as the bound asserted above
         numSupersteps_ = schedule.NumberOfSupersteps();
         const unsigned numProcs = instance_->NumberOfProcessors();
 
-        perm = std::vector<unsigned>(numVert, 0U);
+        perm = std::vector<UVertType>(numVert, 0U);
 
-        val_ = std::vector<double>(static_cast<size_t>(csr->nonZeros()));
-        colIdx_ = std::vector<UVertType>(static_cast<size_t>(csr->nonZeros()));
+        val_ = std::vector<double>(static_cast<std::size_t>(csr->nonZeros()));
+        colIdx_ = std::vector<UVertType>(static_cast<std::size_t>(csr->nonZeros()));
         rowPtr_ = std::vector<UVertType>(numVert + 1U, 0U);
 
-        procFirstStepPtr_ = std::vector<unsigned>(0U);
+        procFirstStepPtr_ = std::vector<UVertType>(0U);
         procFirstStepPtr_.reserve(numProcs + numSupersteps_ + 1U);
 
-        procStepNum_ = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
+        procStepNum_ = std::vector<std::vector<UVertType>>(numProcs, std::vector<UVertType>(numSupersteps_, 0U));
 
         for (const auto vert : graph.Vertices()) {
             const unsigned whichStep = schedule.AssignedSuperstep(vert);
@@ -288,7 +289,7 @@ class Sptrsv {
             perm[vert] = procStepNum_[whichProc][whichStep]++; // offsets
         }
 
-        unsigned accNode = 0U;
+        UVertType accNode = 0U;
         for (unsigned proc = 0U; proc < numProcs; ++proc) {
             for (unsigned step = 0U; step < numSupersteps_; ++step) {
                 procFirstStepPtr_.emplace_back(accNode);
@@ -302,20 +303,20 @@ class Sptrsv {
             perm[vert] += procFirstStepPtr_[schedule.AssignedProcessor(vert) * numSupersteps_ + schedule.AssignedSuperstep(vert)];
         }
 
-        std::vector<std::vector<unsigned>> entryAccumulation = std::vector<std::vector<unsigned>>(numProcs, std::vector<unsigned>(numSupersteps_, 0U));
+        std::vector<std::vector<UVertType>> entryAccumulation = std::vector<std::vector<UVertType>>(numProcs, std::vector<UVertType>(numSupersteps_, 0U));
 
         for (const auto vert : graph.Vertices()) {
             const unsigned whichStep = schedule.AssignedSuperstep(vert);
             const unsigned whichProc = schedule.AssignedProcessor(vert);
 
             rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep];
-            entryAccumulation[whichProc][whichStep] += static_cast<unsigned>(graph.InDegree(vert)) + 1;
+            entryAccumulation[whichProc][whichStep] += static_cast<UVertType>(graph.InDegree(vert)) + 1;
         }
 
-        unsigned accEntry = 0U;
+        UVertType accEntry = 0U;
         for (unsigned proc = 0U; proc < numProcs; ++proc) {
             for (unsigned step = 0U; step < numSupersteps_; ++step) {
-                unsigned temp = entryAccumulation[proc][step];
+                UVertType temp = entryAccumulation[proc][step];
                 entryAccumulation[proc][step] = accEntry;
                 accEntry += temp;
             }
@@ -328,14 +329,14 @@ class Sptrsv {
         }
 
         for (const auto vert : graph.Vertices()) {
-            std::vector<std::pair<unsigned, unsigned>> parents;
+            std::vector<std::pair<UVertType, UVertType>> parents;
             parents.reserve(graph.InDegree(vert));
             for (EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) {
-                parents.emplace_back(perm[static_cast<std::size_t>(inner[edge])], static_cast<unsigned>(edge));
+                parents.emplace_back(perm[static_cast<UVertType>(inner[edge])], static_cast<UVertType>(edge));
             }
             std::sort(parents.begin(), parents.end());
 
-            const unsigned permVert = perm[vert];
+            const UVertType permVert = perm[vert];
             UVertType location = rowPtr_[permVert];
             for (const auto &[permPar, edgeIdx] : parents) {
                 colIdx_[location] = permPar;
@@ -347,25 +348,25 @@ class Sptrsv {
         }
     }
 
-    void SetupCsrWithPermutation(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<size_t> &perm) {
-        std::vector<size_t> permInv(perm.size());
-        for (size_t i = 0; i < perm.size(); i++) {
+    void SetupCsrWithPermutation(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule, std::vector<UVertType> &perm) {
+        std::vector<UVertType> permInv(perm.size());
+        for (UVertType i = 0; i < perm.size(); i++) {
             permInv[perm[i]] = i;
         }
 
         numSupersteps_ = schedule.NumberOfSupersteps();
 
         val_.clear();
-        val_.reserve(static_cast<size_t>(instance_->GetComputationalDag().GetCSR()->nonZeros()));
+        val_.reserve(static_cast<std::size_t>(instance_->GetComputationalDag().GetCSR()->nonZeros()));
 
         colIdx_.clear();
-        colIdx_.reserve(static_cast<size_t>(instance_->GetComputationalDag().GetCSR()->nonZeros()));
+        colIdx_.reserve(static_cast<std::size_t>(instance_->GetComputationalDag().GetCSR()->nonZeros()));
 
         rowPtr_.clear();
         rowPtr_.reserve(instance_->NumberOfVertices() + 1);
 
         stepProcPtr_
-            = std::vector<std::vector<unsigned>>(numSupersteps_, std::vector<unsigned>(instance_->NumberOfProcessors(), 0));
+            = std::vector<std::vector<UVertType>>(numSupersteps_, std::vector<UVertType>(instance_->NumberOfProcessors(), 0));
 
         stepProcNum_ = schedule.NumAssignedNodesPerSuperstepProcessor();
 
@@ -385,10 +386,10 @@ class Sptrsv {
                     }
                 }
 
-                stepProcPtr_[currentStep][currentProcessor] = static_cast<unsigned>(rowPtr_.size());
+                stepProcPtr_[currentStep][currentProcessor] = static_cast<UVertType>(rowPtr_.size());
             }
 
-            rowPtr_.push_back(colIdx_.size());
+            rowPtr_.push_back(static_cast<UVertType>(colIdx_.size()));
 
             std::set<UVertType> parents;
 
@@ -403,7 +404,7 @@ class Sptrsv {
                 const auto *outer = instance_->GetComputationalDag().GetCSR()->outerIndexPtr();
                 for (UVertType parInd = static_cast<UVertType>(outer[node]); parInd < static_cast<UVertType>(outer[node + 1] - 1);
                      ++parInd) {
-                    if (static_cast<size_t>(instance_->GetComputationalDag().GetCSR()->innerIndexPtr()[parInd]) == permInv[par]) {
+                    if (static_cast<UVertType>(instance_->GetComputationalDag().GetCSR()->innerIndexPtr()[parInd]) == permInv[par]) {
                         val_.push_back(instance_->GetComputationalDag().GetCSR()->valuePtr()[parInd]);
                         found++;
                     }
@@ -417,7 +418,7 @@ class Sptrsv {
                                ->valuePtr()[instance_->GetComputationalDag().GetCSR()->outerIndexPtr()[node + 1] - 1]);
         }
 
-        rowPtr_.push_back(colIdx_.size());
+        rowPtr_.push_back(static_cast<UVertType>(colIdx_.size()));
     }
 
     void LsolveSerial() const {
@@ -426,16 +427,9 @@ class Sptrsv {
         const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
         double *const x = x_;
         const double *const b = b_;
-
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
-        for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
-            x[i] = b[i];
-            double acc = 0.0;
-            for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
-                acc += valPtr[j] * x[inner[j]];
-            }
-            x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1];
-        }
+
+        SpLTrSvSerial(numberOfVertices, x, b, outer, inner, valPtr);
     }
 
     void UsolveSerial() const {
@@ -465,27 +459,7 @@ class Sptrsv {
         const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
         double *const x = x_;
 
-#    pragma omp parallel num_threads(instance_->NumberOfProcessors())
-        {
-            const size_t proc = static_cast<size_t>(omp_get_thread_num());
-            for (unsigned step = 0; step < numSupersteps_; ++step) {
-                const size_t boundsStrSize = boundsArrayL_[step][proc].size();
-
-                for (size_t index = 0; index < boundsStrSize; index += 2) {
-                    EigenIdxType lowerB = boundsArrayL_[step][proc][index];
-                    const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
-
-                    for (EigenIdxType node = lowerB; node <= upperB; ++node) {
-                        double acc = 0.0;
-                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            acc += valPtr[i] * x[inner[i]];
-                        }
-                        x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1];
-                    }
-                }
-#    pragma omp barrier
-            }
-        }
+        SpLTrSvBSPParallelInPlace(x, outer, inner, valPtr, boundsArrayL_);
     }
 
     void UsolveNoPermutationInPlace() const {
@@ -497,12 +471,12 @@ class Sptrsv {
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             // Process each superstep starting from the last one (opposite of lsolve)
-            const size_t proc = static_cast<size_t>(omp_get_thread_num());
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             unsigned step = numSupersteps_;
             do {
                 step--;
-                const size_t boundsStrSize = boundsArrayU_[step][proc].size();
-                for (size_t index = 0; index < boundsStrSize; index += 2) {
+                const std::size_t boundsStrSize = boundsArrayU_[step][proc].size();
+                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType node = boundsArrayU_[step][proc][index] + 1;
                     const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1];
 
@@ -527,28 +501,7 @@ class Sptrsv {
         double *const x = x_;
         const double *const b = b_;
 
-#    pragma omp parallel num_threads(instance_->NumberOfProcessors())
-        {
-            const size_t proc = static_cast<size_t>(omp_get_thread_num());
-            for (unsigned step = 0; step < numSupersteps_; ++step) {
-                const size_t boundsStrSize = boundsArrayL_[step][proc].size();
-
-                for (size_t index = 0; index < boundsStrSize; index += 2) {
-                    EigenIdxType lowerB = boundsArrayL_[step][proc][index];
-                    const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
-
-                    for (EigenIdxType node = lowerB; node <= upperB; ++node) {
-                        x[node] = b[node];
-                        double acc = 0.0;
-                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            acc += valPtr[i] * x[inner[i]];
-                        }
-                        x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1];
-                    }
-                }
-#    pragma omp barrier
-            }
-        }
+        SpLTrSvBSPParallel(x, b, outer, inner, valPtr, boundsArrayL_);
     }
 
     void UsolveNoPermutation() const {
@@ -561,12 +514,12 @@ class Sptrsv {
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             // Process each superstep starting from the last one (opposite of lsolve)
-            const size_t proc = static_cast<size_t>(omp_get_thread_num());
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             unsigned step = numSupersteps_;
             do {
                 step--;
-                const size_t boundsStrSize = boundsArrayU_[step][proc].size();
-                for (size_t index = 0; index < boundsStrSize; index += 2) {
+                const std::size_t boundsStrSize = boundsArrayU_[step][proc].size();
+                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType node = boundsArrayU_[step][proc][index] + 1;
                     const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1];
 
@@ -590,15 +543,9 @@ class Sptrsv {
         const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
         const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
         double *const x = x_;
-
         const EigenIdxType numberOfVertices = static_cast<EigenIdxType>(instance_->NumberOfVertices());
-        for (EigenIdxType i = 0; i < numberOfVertices; ++i) {
-            double acc = 0.0;
-            for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) {
-                acc += valPtr[j] * x[inner[j]];
-            }
-            x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1];
-        }
+
+        SpLTrSvSerialInPlace(numberOfVertices, x, outer, inner, valPtr);
     }
 
     void UsolveSerialInPlace() const {
@@ -624,7 +571,7 @@ class Sptrsv {
 
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
-            const size_t proc = static_cast<size_t>(omp_get_thread_num());
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; step++) {
                 const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
                 for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
@@ -644,25 +591,7 @@ class Sptrsv {
     void LsolveWithProcFirstPermutationInPlace() const {
         double *const x = x_;
 
-#    pragma omp parallel num_threads(instance_->NumberOfProcessors())
-        {
-            const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
-            const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_);
-            for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); stepPtr != endStepPtr;) {
-                UVertType rowIdx = *stepPtr;
-                const UVertType endRowIdx = *(++stepPtr);
-                for (; rowIdx != endRowIdx; ++rowIdx) {
-                    double acc = 0.0;
-                    for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        acc += val_[i] * x[colIdx_[i]];
-                    }
-
-                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
-                }
-
-#    pragma omp barrier
-            }
-        }
+        SpLTrSvProcPermBSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_);
     }
 
     void LsolveWithPermutation() const {
@@ -672,7 +601,7 @@ class Sptrsv {
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             for (unsigned step = 0; step < numSupersteps_; step++) {
-                const size_t proc = static_cast<size_t>(omp_get_thread_num());
+                const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
                 const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
                 for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
                     x[rowIdx] = b[rowIdx];
@@ -779,17 +708,17 @@ class Sptrsv {
         }
     }
 
-    void PermuteXVectorInverse(const std::vector<size_t> &perm) {
+    void PermuteXVectorInverse(const std::vector<UVertType> &perm) {
         std::vector<double> vecUnperm(perm.size());
-        for (size_t i = 0; i < perm.size(); i++) {
+        for (UVertType i = 0; i < perm.size(); i++) {
             vecUnperm[perm[i]] = x_[i];
         }
-        for (size_t i = 0; i < perm.size(); i++) {
+        for (UVertType i = 0; i < perm.size(); i++) {
             x_[i] = vecUnperm[i];
         }
     }
 
-    std::size_t GetNumberOfVertices() const { return instance_->NumberOfVertices(); }
+    UVertType GetNumberOfVertices() const { return instance_->NumberOfVertices(); }
 
     // SSP Lsolve with staleness=2 (allowing at most one superstep of lag).
     // Uses FlatCheckpointCounterBarrier created internally.
@@ -810,12 +739,12 @@ class Sptrsv {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
                 // Process nodes assigned to this (step, proc) pair.
-                const size_t boundsStrSize = boundsArrayL_[step][proc].size();
+                const std::size_t boundsStrSize = boundsArrayL_[step][proc].size();
                 // Enforce staleness window before starting this superstep.
                 if (boundsStrSize > 0U) {
                     barrier.Wait(proc, staleness - 1U);
                 }
-                for (size_t index = 0; index < boundsStrSize; index += 2) {
+                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType lowerB = boundsArrayL_[step][proc][index];
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
@@ -855,12 +784,12 @@ class Sptrsv {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
             for (unsigned step = 0; step < numSupersteps_; ++step) {
                 // Process nodes assigned to this (step, proc) pair.
-                const size_t boundsStrSize = boundsArrayL_[step][proc].size();
+                const std::size_t boundsStrSize = boundsArrayL_[step][proc].size();
                 // Enforce staleness window before starting this superstep.
                 if (boundsStrSize > 0U) {
                     barrier.Wait(proc, staleness - 1U);
                 }
-                for (size_t index = 0; index < boundsStrSize; index += 2) {
+                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType lowerB = boundsArrayL_[step][proc][index];
                     const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
                     for (EigenIdxType node = lowerB; node <= upperB; ++node) {
@@ -900,12 +829,12 @@ class Sptrsv {
             unsigned step = numSupersteps_;
             do {
                 step--;
-                const size_t boundsStrSize = boundsArrayU_[step][proc].size();
+                const std::size_t boundsStrSize = boundsArrayU_[step][proc].size();
                 if (boundsStrSize > 0U) {
                     barrier.Wait(proc, staleness - 1U);
                 }
 
-                for (size_t index = 0; index < boundsStrSize; index += 2) {
+                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
                     EigenIdxType node = boundsArrayU_[step][proc][index] + 1;
                     const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1];
 
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
new file mode 100644
index 00000000..26739e6e
--- /dev/null
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
@@ -0,0 +1,263 @@
+/*
+Copyright 2026 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <omp.h>
+
+#include <cstddef>
+#include <type_traits>
+#include <vector>
+
+#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp"
+
+namespace osp {
+
+/**
+ * @brief Serial sparse triangular solve by forward substitution on a CSR matrix: reads b, writes x.
+ *
+ * Rows are solved in ascending order. The last stored entry of each row is used as the divisor,
+ * and every earlier entry of the row is multiplied by the component of x at its column index.
+ *
+ * NOTE(review): this presumes a lower-triangular matrix whose diagonal is stored last in each
+ * row - confirm with callers.
+ *
+ * @tparam IdxType Integral index type of the CSR arrays.
+ * @param N Number of rows to solve.
+ * @param x Output vector.
+ * @param b Right-hand side; declared __restrict__, so it must not overlap x.
+ * @param outer CSR row pointers; row r spans entries outer[r] up to (excluding) outer[r + 1].
+ * @param inner CSR column index of each entry.
+ * @param val CSR value of each entry.
+ */
+template <typename IdxType>
+void SpLTrSvSerial(const IdxType N,
+                   double *__restrict__ const x,
+                   const double *__restrict__ const b,
+                   const IdxType *__restrict__ const outer,
+                   const IdxType *__restrict__ const inner,
+                   const double *__restrict__ const val) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    for (IdxType row = 0; row < N; ++row) {
+        double acc = b[row];
+        for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+            acc -= val[entryIdx] * x[inner[entryIdx]];
+        }
+        x[row] = acc / val[outer[row + 1] - 1];
+    }
+}
+
+/**
+ * @brief In-place variant of SpLTrSvSerial: x holds the right-hand side on entry and the solution on return.
+ *
+ * @tparam IdxType Integral index type of the CSR arrays.
+ * @param N Number of rows to solve.
+ * @param x Right-hand side on entry, overwritten row by row in ascending order.
+ * @param outer CSR row pointers; row r spans entries outer[r] up to (excluding) outer[r + 1].
+ * @param inner CSR column index of each entry.
+ * @param val CSR value of each entry; the last entry of each row is the divisor.
+ */
+template <typename IdxType>
+void SpLTrSvSerialInPlace(const IdxType N,
+                          double *__restrict__ const x,
+                          const IdxType *__restrict__ const outer,
+                          const IdxType *__restrict__ const inner,
+                          const double *__restrict__ const val) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    for (IdxType row = 0; row < N; ++row) {
+        double acc = x[row];
+        for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+            acc -= val[entryIdx] * x[inner[entryIdx]];
+        }
+        x[row] = acc / val[outer[row + 1] - 1];
+    }
+}
+
+/**
+ * @brief Superstep-parallel forward substitution on a CSR matrix: reads b, writes x.
+ *
+ * One OpenMP thread is requested per processor, and the thread number is used as the processor index.
+ * BoundsStepProcIdx[step][proc] is a flat list of inclusive row ranges (first, last, first, last, ...)
+ * solved by that processor in that superstep; all threads meet at an OpenMP barrier after every superstep.
+ * The processor count is taken from the first superstep. The call is a no-op when there are no
+ * supersteps or no processors.
+ *
+ * NOTE(review): the number of threads actually delivered by the runtime is not checked; if it is
+ * smaller than requested, the rows of the missing processors are never solved.
+ *
+ * @tparam IdxType Integral index type of the CSR arrays and of the row bounds.
+ * @param x Output vector.
+ * @param b Right-hand side; declared __restrict__, so it must not overlap x.
+ * @param outer CSR row pointers; row r spans entries outer[r] up to (excluding) outer[r + 1].
+ * @param inner CSR column index of each entry.
+ * @param val CSR value of each entry; the last entry of each row is the divisor.
+ * @param BoundsStepProcIdx Row ranges per superstep and processor, as described above.
+ */
+template <typename IdxType>
+void SpLTrSvBSPParallel(double *__restrict__ const x,
+                        const double *__restrict__ const b,
+                        const IdxType *__restrict__ const outer,
+                        const IdxType *__restrict__ const inner,
+                        const double *__restrict__ const val,
+                        const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    // Guard the lookup of the first superstep (undefined on an empty schedule) and avoid
+    // requesting zero threads.
+    if (BoundsStepProcIdx.empty() || BoundsStepProcIdx.front().empty()) {
+        return;
+    }
+    const std::size_t numProcs = BoundsStepProcIdx.front().size();
+
+#pragma omp parallel num_threads(numProcs)
+    {
+        const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+        const std::size_t numSuperSteps = BoundsStepProcIdx.size();
+
+        for (std::size_t step = 0U; step < numSuperSteps; ++step) {
+            const std::vector<IdxType> &bounds = BoundsStepProcIdx[step][proc];
+            const std::size_t numBounds = bounds.size();
+            // Walk the (first, last) pairs; an unpaired trailing entry is skipped rather than read past the end.
+            for (std::size_t idx = 0U; idx + 1U < numBounds; idx += 2U) {
+                const IdxType ubRow = bounds[idx + 1U];
+                for (IdxType row = bounds[idx]; row <= ubRow; ++row) {
+                    double acc = b[row];
+                    for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+                        acc -= val[entryIdx] * x[inner[entryIdx]];
+                    }
+                    x[row] = acc / val[outer[row + 1] - 1];
+                }
+            }
+#pragma omp barrier
+        }
+    }
+}
+
+/**
+ * @brief In-place variant of SpLTrSvBSPParallel: x holds the right-hand side on entry and the solution on return.
+ *
+ * Threading, the layout of BoundsStepProcIdx and the no-op cases are the same as for SpLTrSvBSPParallel.
+ *
+ * @tparam IdxType Integral index type of the CSR arrays and of the row bounds.
+ * @param x Right-hand side on entry, overwritten with the solution.
+ * @param outer CSR row pointers; row r spans entries outer[r] up to (excluding) outer[r + 1].
+ * @param inner CSR column index of each entry.
+ * @param val CSR value of each entry; the last entry of each row is the divisor.
+ * @param BoundsStepProcIdx Flat inclusive (first, last) row ranges, indexed as [step][proc].
+ */
+template <typename IdxType>
+void SpLTrSvBSPParallelInPlace(double *__restrict__ const x,
+                               const IdxType *__restrict__ const outer,
+                               const IdxType *__restrict__ const inner,
+                               const double *__restrict__ const val,
+                               const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    // Guard the lookup of the first superstep (undefined on an empty schedule) and avoid
+    // requesting zero threads.
+    if (BoundsStepProcIdx.empty() || BoundsStepProcIdx.front().empty()) {
+        return;
+    }
+    const std::size_t numProcs = BoundsStepProcIdx.front().size();
+
+#pragma omp parallel num_threads(numProcs)
+    {
+        const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+        const std::size_t numSuperSteps = BoundsStepProcIdx.size();
+
+        for (std::size_t step = 0U; step < numSuperSteps; ++step) {
+            const std::vector<IdxType> &bounds = BoundsStepProcIdx[step][proc];
+            const std::size_t numBounds = bounds.size();
+            // Walk the (first, last) pairs; an unpaired trailing entry is skipped rather than read past the end.
+            for (std::size_t idx = 0U; idx + 1U < numBounds; idx += 2U) {
+                const IdxType ubRow = bounds[idx + 1U];
+                for (IdxType row = bounds[idx]; row <= ubRow; ++row) {
+                    double acc = x[row];
+                    for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+                        acc -= val[entryIdx] * x[inner[entryIdx]];
+                    }
+                    x[row] = acc / val[outer[row + 1] - 1];
+                }
+            }
+#pragma omp barrier
+        }
+    }
+}
+
+/**
+ * @brief In-place superstep-parallel forward substitution driven by a flat per-processor row table.
+ *
+ * procStepPtr is read at index proc * numSuperSteps + step: that entry is the first row processor
+ * proc solves in superstep step and the following entry is one past its last row, so the table
+ * must hold at least numProcs * numSuperSteps + 1 entries. One OpenMP thread is requested per
+ * processor and all threads meet at an OpenMP barrier after every superstep. The call is a no-op
+ * when numProcs or numSuperSteps is zero.
+ *
+ * NOTE(review): as the name suggests, the matrix and x are presumably already permuted so that each
+ * (processor, superstep) pair owns a contiguous row range - confirm with callers.
+ *
+ * NOTE(review): the number of threads actually delivered by the runtime is not checked; if it is
+ * smaller than requested, the rows of the missing processors are never solved.
+ *
+ * @tparam IdxType Integral index type of the CSR arrays and of procStepPtr.
+ * @param x Right-hand side on entry, overwritten with the solution.
+ * @param outer CSR row pointers; row r spans entries outer[r] up to (excluding) outer[r + 1].
+ * @param inner CSR column index of each entry.
+ * @param val CSR value of each entry; the last entry of each row is the divisor.
+ * @param numProcs Number of processors, used as the number of OpenMP threads requested.
+ * @param numSuperSteps Number of supersteps per processor.
+ * @param procStepPtr Row boundaries per processor and superstep, as described above.
+ */
+template <typename IdxType>
+void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x,
+                                       const IdxType *__restrict__ const outer,
+                                       const IdxType *__restrict__ const inner,
+                                       const double *__restrict__ const val,
+                                       const unsigned numProcs,
+                                       const unsigned numSuperSteps,
+                                       const std::vector<IdxType> &procStepPtr) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    // Nothing to solve in either case, and zero threads must not be requested.
+    if (numProcs == 0U || numSuperSteps == 0U) {
+        return;
+    }
+
+#pragma omp parallel num_threads(numProcs)
+    {
+        const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+        // Offsets are computed in std::size_t so proc * numSuperSteps cannot wrap in 32-bit unsigned.
+        const std::size_t firstEntry = proc * numSuperSteps;
+
+        for (std::size_t step = 0U; step < numSuperSteps; ++step) {
+            const IdxType endRow = procStepPtr[firstEntry + step + 1U];
+            for (IdxType row = procStepPtr[firstEntry + step]; row != endRow; ++row) {
+                double acc = x[row];
+                for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+                    acc -= val[entryIdx] * x[inner[entryIdx]];
+                }
+                x[row] = acc / val[outer[row + 1] - 1];
+            }
+#pragma omp barrier
+        }
+    }
+}
+
+}    // end namespace osp
diff --git a/include/osp/bsp/model/BspSchedule.hpp b/include/osp/bsp/model/BspSchedule.hpp
index d132c267..d5abeb0b 100644
--- a/include/osp/bsp/model/BspSchedule.hpp
+++ b/include/osp/bsp/model/BspSchedule.hpp
@@ -553,8 +553,8 @@ class BspSchedule : public IBspSchedule<GraphT>, public IBspScheduleEval<GraphT>
      *
      * @return A 2D vector containing the number of nodes assigned to each processor in each superstep.
      */
-    [[nodiscard]] std::vector<std::vector<unsigned>> NumAssignedNodesPerSuperstepProcessor() const {
-        std::vector<std::vector<unsigned>> num(numberOfSupersteps_, std::vector<unsigned>(instance_->NumberOfProcessors(), 0));
+    [[nodiscard]] std::vector<std::vector<VertexIdx>> NumAssignedNodesPerSuperstepProcessor() const {
+        std::vector<std::vector<VertexIdx>> num(numberOfSupersteps_, std::vector<VertexIdx>(instance_->NumberOfProcessors(), 0));
 
         for (const auto &v : instance_->Vertices()) {
             num[nodeToSuperstepAssignment_[v]][nodeToProcessorAssignment_[v]] += 1;
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
index 64a5b97f..b9bd1ced 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp
@@ -469,7 +469,7 @@ ReturnStatus GrowLocalSSP<GraphT, staleness_t>::ComputeSchedule(MaxBspSchedule<G
         }
 
         for (unsigned proc = 0U; proc < numProcs; ++proc) {
-            totalAssigned += bestNewAssignments[proc].size();
+            totalAssigned += static_cast<VertexType>(bestNewAssignments[proc].size());
             for (const VertexType &node : bestNewAssignments[proc]) {
                 schedule.SetAssignedProcessor(node, proc);
 
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp b/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp
index 6297a7ba..6f9f5164 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp
@@ -69,7 +69,7 @@ class RandomGreedy : public Scheduler<GraphT> {
             bool fewSources = next.size() < instance.NumberOfProcessors() ? true : false;
             unsigned failCounter = 0;
             while (!next.empty() && failCounter < 20) {
-                std::uniform_int_distribution<VertexType> randNodeIdx(0, next.size() - 1);
+                std::uniform_int_distribution<VertexType> randNodeIdx(0, static_cast<VertexType>(next.size() - 1));
                 VertexType nodeInd = randNodeIdx(g);
                 const auto &node = next[nodeInd];
                 bool processorSet = false;
diff --git a/include/osp/coarser/Sarkar/SarkarMul.hpp b/include/osp/coarser/Sarkar/SarkarMul.hpp
index 86793b99..195ee6a2 100644
--- a/include/osp/coarser/Sarkar/SarkarMul.hpp
+++ b/include/osp/coarser/Sarkar/SarkarMul.hpp
@@ -122,7 +122,7 @@ ReturnStatus SarkarMul<GraphT, GraphTCoarse>::RunSingleContractionMode(VertexIdx
     if (firstCoarsen_) {
         currentNumVertices = MultilevelCoarser<GraphT, GraphTCoarse>::GetOriginalGraph()->NumVertices();
     } else {
-        currentNumVertices = MultilevelCoarser<GraphT, GraphTCoarse>::dagHistory_.back()->NumVertices();
+        currentNumVertices = static_cast<VertexIdxT<GraphT>>(MultilevelCoarser<GraphT, GraphTCoarse>::dagHistory_.back()->NumVertices());
     }
 
     GraphTCoarse coarsenedDag;
@@ -145,7 +145,7 @@ ReturnStatus SarkarMul<GraphT, GraphTCoarse>::RunSingleContractionMode(VertexIdx
     status = std::max(
         status, MultilevelCoarser<GraphT, GraphTCoarse>::AddContraction(std::move(contractionMap), std::move(coarsenedDag)));
 
-    VertexIdxT<GraphT> newNumVertices = MultilevelCoarser<GraphT, GraphTCoarse>::dagHistory_.back()->NumVertices();
+    VertexIdxT<GraphT> newNumVertices = static_cast<VertexIdxT<GraphT>>(MultilevelCoarser<GraphT, GraphTCoarse>::dagHistory_.back()->NumVertices());
     diffVertices = currentNumVertices - newNumVertices;
 
     return status;
diff --git a/include/osp/coarser/SquashA/SquashAMul.hpp b/include/osp/coarser/SquashA/SquashAMul.hpp
index 2d0c85fb..8a3fbd32 100644
--- a/include/osp/coarser/SquashA/SquashAMul.hpp
+++ b/include/osp/coarser/SquashA/SquashAMul.hpp
@@ -97,7 +97,7 @@ ReturnStatus SquashAMul<GraphT, GraphTCoarse>::RunContractions() {
         status = std::max(
             status, MultilevelCoarser<GraphT, GraphTCoarse>::AddContraction(std::move(contractionMap), std::move(coarsenedDag)));
 
-        VertexIdxT<GraphT> newNumVertices = MultilevelCoarser<GraphT, GraphTCoarse>::dagHistory_.back()->NumVertices();
+        VertexIdxT<GraphT> newNumVertices = static_cast<VertexIdxT<GraphT>>(MultilevelCoarser<GraphT, GraphTCoarse>::dagHistory_.back()->NumVertices());
 
         if (newNumVertices == currentNumVertices) {
             noChangeInARow++;
diff --git a/include/osp/coarser/coarser_util.hpp b/include/osp/coarser/coarser_util.hpp
index aec1a48e..1f5a0203 100644
--- a/include/osp/coarser/coarser_util.hpp
+++ b/include/osp/coarser/coarser_util.hpp
@@ -490,9 +490,9 @@ bool PullBackSchedule(const BspSchedule<GraphTIn> &scheduleIn,
 
 template <typename GraphTIn, typename GraphTOut>
 bool PullBackSchedule(const BspSchedule<GraphTIn> &scheduleIn,
-                      const std::vector<VertexIdxT<GraphTOut>> &reverseVertexMap,
+                      const std::vector<VertexIdxT<GraphTIn>> &reverseVertexMap,
                       BspSchedule<GraphTOut> &scheduleOut) {
-    for (unsigned idx = 0; idx < reverseVertexMap.size(); ++idx) {
+    for (const auto &idx : scheduleOut.GetInstance().GetComputationalDag().Vertices()) {
         const auto &v = reverseVertexMap[idx];
 
         scheduleOut.SetAssignedSuperstep(idx, scheduleIn.AssignedSuperstep(v));
diff --git a/include/osp/coarser/top_order/top_order_coarser.hpp b/include/osp/coarser/top_order/top_order_coarser.hpp
index d3b2d9cc..59567653 100644
--- a/include/osp/coarser/top_order/top_order_coarser.hpp
+++ b/include/osp/coarser/top_order/top_order_coarser.hpp
@@ -43,7 +43,7 @@ class TopOrderCoarser : public Coarser<GraphTIn, GraphTOut> {
     VMemwT<GraphTIn> currentMemory_ = 0;
     VWorkwT<GraphTIn> currentWork_ = 0;
     VCommwT<GraphTIn> currentCommunication_ = 0;
-    VertexType currentSuperNodeIdx_ = 0;
+    VertexIdxT<GraphTOut> currentSuperNodeIdx_ = 0;
 
     void FinishSuperNodeAddEdges(const GraphTIn &dagIn,
                                  GraphTOut &dagOut,
@@ -56,8 +56,8 @@ class TopOrderCoarser : public Coarser<GraphTIn, GraphTOut> {
         for (const auto &node : nodes) {
             if constexpr (hasEdgeWeightsV<GraphTIn> && hasEdgeWeightsV<GraphTOut>) {
                 for (const auto &inEdge : InEdges(node, dagIn)) {
-                    const VertexType parentRev = reverseVertexMap[Source(inEdge, dagIn)];
-                    if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits<VertexType>::max()) {
+                    const VertexIdxT<GraphTOut> parentRev = reverseVertexMap[Source(inEdge, dagIn)];
+                    if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits<VertexIdxT<GraphTOut>>::max()) {
                         auto pair = EdgeDesc(parentRev, currentSuperNodeIdx_, dagOut);
                         if (pair.second) {
                             dagOut.SetEdgeCommWeight(pair.first, dagOut.EdgeCommWeight(pair.first) + dagIn.EdgeCommWeight(inEdge));
@@ -68,8 +68,8 @@ class TopOrderCoarser : public Coarser<GraphTIn, GraphTOut> {
                 }
             } else {
                 for (const auto &parent : dagIn.Parents(node)) {
-                    const VertexType parentRev = reverseVertexMap[parent];
-                    if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits<VertexType>::max()) {
+                    const VertexIdxT<GraphTOut> parentRev = reverseVertexMap[parent];
+                    if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits<VertexIdxT<GraphTOut>>::max()) {
                         if (not Edge(parentRev, currentSuperNodeIdx_, dagOut)) {
                             dagOut.AddEdge(parentRev, currentSuperNodeIdx_);
                         }
diff --git a/include/osp/concepts/directed_graph_concept.hpp b/include/osp/concepts/directed_graph_concept.hpp
index aaa537ad..09bc9900 100644
--- a/include/osp/concepts/directed_graph_concept.hpp
+++ b/include/osp/concepts/directed_graph_concept.hpp
@@ -64,13 +64,15 @@ struct IsDirectedGraph<T,
                                    decltype(std::declval<T>().Children(std::declval<VertexIdxT<T>>())),
                                    decltype(std::declval<T>().InDegree(std::declval<VertexIdxT<T>>())),
                                    decltype(std::declval<T>().OutDegree(std::declval<VertexIdxT<T>>()))>>
-    : std::conjunction<IsForwardRangeOf<decltype(std::declval<T>().Vertices()), VertexIdxT<T>>,
+    : std::conjunction<
+                       IsForwardRangeOf<decltype(std::declval<T>().Vertices()), VertexIdxT<T>>,
                        std::is_integral<decltype(std::declval<T>().NumVertices())>,
                        std::is_integral<decltype(std::declval<T>().NumEdges())>,
-                       IsInputRangeOf<decltype(std::declval<T>().Parents(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>,
-                       IsInputRangeOf<decltype(std::declval<T>().Children(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>,
-                       std::is_integral<decltype(std::declval<T>().InDegree(std::declval<VertexIdxT<T>>()))>,
-                       std::is_integral<decltype(std::declval<T>().OutDegree(std::declval<VertexIdxT<T>>()))>> {};
+                       IsInputRangeOf<decltype(std::declval<T>().Parents(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>
+                    //    IsInputRangeOf<decltype(std::declval<T>().Children(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>,
+                    //    std::is_integral<decltype(std::declval<T>().InDegree(std::declval<VertexIdxT<T>>()))>,
+                    //    std::is_integral<decltype(std::declval<T>().OutDegree(std::declval<VertexIdxT<T>>()))>
+                       > {};
 
 template <typename T>
 inline constexpr bool isDirectedGraphV = IsDirectedGraph<T>::value;
diff --git a/include/osp/graph_algorithms/cuthill_mckee.hpp b/include/osp/graph_algorithms/cuthill_mckee.hpp
index 6470d17d..c1dcc2bb 100644
--- a/include/osp/graph_algorithms/cuthill_mckee.hpp
+++ b/include/osp/graph_algorithms/cuthill_mckee.hpp
@@ -205,7 +205,7 @@ std::vector<VertexIdxT<GraphT>> CuthillMckeeUndirected(const GraphT &dag, bool s
             }
         }
 
-        nodeCounter += currentLevel.size();
+        nodeCounter += static_cast<VertexType>(currentLevel.size());
 
         if (nodePriority.empty()) {    // the dag has more than one connected components
 
diff --git a/include/osp/graph_algorithms/directed_graph_path_util.hpp b/include/osp/graph_algorithms/directed_graph_path_util.hpp
index 37733275..16423bc6 100644
--- a/include/osp/graph_algorithms/directed_graph_path_util.hpp
+++ b/include/osp/graph_algorithms/directed_graph_path_util.hpp
@@ -132,7 +132,7 @@ std::size_t LongestPath(const GraphT &graph) {
             if (visitCounter[child] == graph.InDegree(child)) {
                 bfsQueue.push(child);
                 distances[child] = distances[current] + 1;
-                maxEdgecount = std::max(maxEdgecount, distances[child]);
+                maxEdgecount = std::max(maxEdgecount, static_cast<std::size_t>(distances[child]));
             }
         }
     }
diff --git a/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp b/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp
index b621ee3b..227a6724 100644
--- a/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp
+++ b/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp
@@ -30,6 +30,7 @@ class EigenCSRRange {
   public:
     using CSRMatrix = Eigen::SparseMatrix<double, Eigen::RowMajor, EigenIdxType>;
     using Inner = typename CSRMatrix::InnerIterator;
+    using UsignedType = std::make_unsigned_t<EigenIdxType>;
 
     class Iterator {
         Inner it_;
@@ -41,10 +42,10 @@ class EigenCSRRange {
         }
 
       public:
-        using value_type = std::size_t;
+        using value_type = UsignedType;
         using reference = value_type;
         using pointer = void;
-        using difference_type = std::ptrdiff_t;
+        using difference_type = UsignedType;
         using iterator_category = std::input_iterator_tag;
 
         Iterator() = default;
@@ -58,7 +59,7 @@ class EigenCSRRange {
 
         Iterator(const CSRMatrix &mat, EigenIdxType idx) : it_(mat, idx) { SkipDiagonal(); }
 
-        reference operator*() const { return static_cast<std::size_t>(it_.col()); }
+        reference operator*() const { return static_cast<UsignedType>(it_.col()); }
 
         Iterator &operator++() {
             ++it_;
@@ -93,9 +94,11 @@ class EigenCSCRange {
     const Graph &graph_;
     EigenIdxType index_;
 
-  public:
+
+    public:
     using CSCMatrix = Eigen::SparseMatrix<double, Eigen::ColMajor, EigenIdxType>;
     using Inner = typename CSCMatrix::InnerIterator;
+    using UsignedType = std::make_unsigned_t<EigenIdxType>;
 
     class Iterator {
         Inner it_;
@@ -107,10 +110,10 @@ class EigenCSCRange {
         }
 
       public:
-        using value_type = std::size_t;
+        using value_type = UsignedType;
         using reference = value_type;
         using pointer = void;
-        using difference_type = std::ptrdiff_t;
+        using difference_type = UsignedType;
         using iterator_category = std::input_iterator_tag;
 
         Iterator() = default;
@@ -124,7 +127,7 @@ class EigenCSCRange {
 
         Iterator(const CSCMatrix &mat, EigenIdxType idx) : it_(mat, idx) { SkipDiagonal(); }
 
-        reference operator*() const { return static_cast<std::size_t>(it_.row()); }
+        reference operator*() const { return static_cast<UsignedType>(it_.row()); }
 
         Iterator &operator++() {
             ++it_;
diff --git a/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp b/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp
index 96bdad19..d3233467 100644
--- a/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp
+++ b/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp
@@ -20,6 +20,9 @@ limitations under the License.
 
 #ifdef EIGEN_FOUND
 
+#    include <cstdint>
+#    include <type_traits>
+
 #    include <Eigen/SparseCore>
 
 #    include "eigen_sparse_iterator.hpp"
@@ -47,7 +50,7 @@ class SparseMatrixImp {
 
   public:
     // Vertex index type must match Eigen's StorageIndex (signed 32-bit)
-    using VertexIdx = size_t;
+    using VertexIdx = std::make_unsigned_t<EigenIdxType>;
 
     // Required graph trait aliases (used in concept checks)
     using VertexWorkWeightType = EigenIdxType;
@@ -70,10 +73,10 @@ class SparseMatrixImp {
     const MatrixCSC *GetCSC() const { return lCscP_; }
 
     /// @brief Number of vertices = number of rows in the matrix
-    size_t NumVertices() const noexcept { return static_cast<size_t>(lCsrP_->rows()); }
+    VertexIdx NumVertices() const noexcept { return static_cast<VertexIdx>(lCsrP_->rows()); }
 
     /// @brief Return a range over all vertices [0, NumVertices)
-    auto Vertices() const { return osp::IntegralRange<size_t>(NumVertices()); }
+    auto Vertices() const { return osp::IntegralRange<VertexIdx>(NumVertices()); }
 
     /// @brief Number of edges = total non-zeros minus diagonal elements
     VertexIdx NumEdges() const noexcept { return static_cast<VertexIdx>(lCsrP_->nonZeros() - lCsrP_->rows()); }
@@ -116,8 +119,6 @@ class SparseMatrixImp {
 using SparseMatrixGraphInt32T = SparseMatrixImp<int32_t>;
 using SparseMatrixGraphInt64T = SparseMatrixImp<int64_t>;
 
-static_assert(isDirectedGraphEdgeDescV<SparseMatrixImp<int32_t>>, "SparseMatrix must satisfy the directed_graph_edge_desc concept");
-
 // Verify that SparseMatrixImp satisfies the directed graph concept
 static_assert(isDirectedGraphV<SparseMatrixImp<int32_t>>, "SparseMatrix must satisfy directed_graph_concept");
 
@@ -130,6 +131,8 @@ static_assert(hasVertexWeightsV<SparseMatrixImp<int64_t>>, "CompactSparseGraph m
 static_assert(isComputationalDagTypedVerticesV<SparseMatrixImp<int32_t>>,
               "CompactSparseGraph must satisfy the is_computation_dag concept");
 
+static_assert(isDirectedGraphEdgeDescV<SparseMatrixImp<int32_t>>, "SparseMatrix must satisfy the directed_graph_edge_desc concept");
+
 }    // namespace osp
 
 #endif
diff --git a/tests/sparse_matrix_impl.cpp b/tests/sparse_matrix_impl.cpp
index 291f4aeb..8e928df9 100644
--- a/tests/sparse_matrix_impl.cpp
+++ b/tests/sparse_matrix_impl.cpp
@@ -136,7 +136,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter1) {
 
     size_t idx = 0;
 
-    for (const long unsigned int &v : graph.Vertices()) {
+    for (const auto &v : graph.Vertices()) {
         BOOST_CHECK_EQUAL(v, vertices[idx++]);
 
         size_t i = 0;
@@ -234,6 +234,8 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) {
         lCsc = lCsr;
 
         SparseMatrixImp<int32_t> graph;
+        using UVertType = VertexIdxT<SparseMatrixImp<int32_t>>;
+
         graph.SetCsr(&lCsr);
         graph.SetCsc(&lCsc);
 
@@ -244,7 +246,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) {
         BOOST_CHECK_EQUAL(static_cast<std::size_t>(graph.NumEdges()), graph2.NumEdges());
 
         for (const auto &vert : graph2.Vertices()) {
-            auto chldren = graph.Children(vert);
+            auto chldren = graph.Children(static_cast<UVertType>(vert));
             auto chldren2 = graph2.Children(vert);
             auto it = chldren.begin();
             auto it_other = chldren.begin();
@@ -269,7 +271,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) {
                 ++it_other;
                 ++it2;
             }
-            BOOST_CHECK_EQUAL(cntr, graph.OutDegree(vert));
+            BOOST_CHECK_EQUAL(static_cast<UVertType>(cntr), graph.OutDegree(static_cast<UVertType>(vert)));
             BOOST_CHECK_EQUAL(cntr, graph1.OutDegree(vert));
             BOOST_CHECK_EQUAL(cntr, graph2.OutDegree(vert));
             BOOST_CHECK(it == end);
@@ -278,7 +280,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) {
         }
 
         for (const auto &vert : graph2.Vertices()) {
-            auto parents = graph.Parents(vert);
+            auto parents = graph.Parents(static_cast<UVertType>(vert));
             auto parents2 = graph2.Parents(vert);
             auto it = parents.begin();
             auto it_other = parents.begin();
@@ -301,7 +303,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) {
                 ++it;
                 ++it2;
             }
-            BOOST_CHECK_EQUAL(cntr, graph.InDegree(vert));
+            BOOST_CHECK_EQUAL(static_cast<UVertType>(cntr), graph.InDegree(static_cast<UVertType>(vert)));
             BOOST_CHECK_EQUAL(cntr, graph1.InDegree(vert));
             BOOST_CHECK_EQUAL(cntr, graph2.InDegree(vert));
             BOOST_CHECK(it == end);
diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp
index 355a36d7..d235aea0 100644
--- a/tests/sptrsv.cpp
+++ b/tests/sptrsv.cpp
@@ -229,7 +229,7 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) {
     BOOST_CHECK(CompareVectors(uXRef, uXOsp));
 
     // Lsolve in-place With PERMUTATION
-    std::vector<unsigned> perm;// = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS);
+    std::vector<SparseMatrixImp<int32_t>::VertexIdx> perm;// = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS);
     sim.SetupCsrWithPermutationLoopProcessors(scheduleCs, perm);
     std::vector<bool> permCheck(graph.NumVertices(), false);
     BOOST_CHECK_EQUAL(permCheck.size(), perm.size());

From 019bf4445755e519851f6673b958863a0ea24520 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 24 Mar 2026 15:03:21 +0100
Subject: [PATCH 46/57] add ProcPerm BSP/SSP SpTrSV kernels to sptrsv_kernels.hpp

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp |  32 +-----
 .../sptrsv_simulator/sptrsv_kernels.hpp       | 107 ++++++++++++++++++
 2 files changed, 108 insertions(+), 31 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 6de9a89e..76b59a28 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -654,39 +654,9 @@ class Sptrsv {
 
     template <unsigned staleness = 2U>
     void SspLsolveStalenessWithProcFirstPermutationInPlace() const {
-        const unsigned nthreads = instance_->NumberOfProcessors();
-        FlatCheckpointCounterBarrier barrier(nthreads);
-
-        const auto *const csr = instance_->GetComputationalDag().GetCSR();
-        const EigenIdxType *const outer = csr->outerIndexPtr();
-        const EigenIdxType *const inner = csr->innerIndexPtr();
-        const double *const vals = csr->valuePtr();
         double *const x = x_;
 
-#    pragma omp parallel num_threads(nthreads)
-        {
-            const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
-            const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_);
-            for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); stepPtr != endStepPtr;) {
-                UVertType rowIdx = *stepPtr;
-                const UVertType endRowIdx = *(++stepPtr);
-
-                if (rowIdx != endRowIdx) {
-                    barrier.Wait(proc, staleness - 1U);
-                }
-
-                for (; rowIdx != endRowIdx; ++rowIdx) {
-                    double acc = 0.0;
-                    for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        acc += val_[i] * x[colIdx_[i]];
-                    }
-
-                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
-                }
-                // Signal completion of this superstep.
-                barrier.Arrive(proc);
-            }
-        }
+        SpLTrSvProcPermSSPParallelInPlace<UVertType, staleness>(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_);
     }
 
     void ResetX() {
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
index 26739e6e..a6cec829 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
@@ -126,6 +126,38 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x,
     }
 }
 
+template <typename IdxType>
+void SpLTrSvProcPermBSPParallel(double *__restrict__ const x,
+                                const double *__restrict__ const b,
+                                const IdxType *__restrict__ const outer,
+                                const IdxType *__restrict__ const inner,
+                                const double *__restrict__ const val,
+                                const unsigned numProcs,
+                                const unsigned numSuperSteps,
+                                const std::vector<IdxType> &procStepPtr) {
+    static_assert(std::is_integral_v<IdxType>);
+
+#pragma omp parallel num_threads(numProcs)
+    {
+        const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
+        const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps);
+        for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) {
+            IdxType row = *stepPtr;
+            const IdxType endRow = *(++stepPtr);
+            for (; row != endRow; ++row) {
+                double acc = b[row];
+                for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+                    acc -= val[entryIdx] * x[inner[entryIdx]];
+                }
+
+                x[row] = acc / val[outer[row + 1] - 1];
+            }
+
+#pragma omp barrier
+        }
+    }
+}
+
 template <typename IdxType>
 void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x,
                                        const IdxType *__restrict__ const outer,
@@ -157,4 +189,79 @@ void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x,
     }
 }
 
+template <typename IdxType, unsigned staleness = 2U>
+void SpLTrSvProcPermSSPParallel(double *__restrict__ const x,
+                                const double *__restrict__ const b,
+                                const IdxType *__restrict__ const outer,
+                                const IdxType *__restrict__ const inner,
+                                const double *__restrict__ const val,
+                                const unsigned numProcs,
+                                const unsigned numSuperSteps,
+                                const std::vector<IdxType> &procStepPtr) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    FlatCheckpointCounterBarrier barrier(numProcs);
+#pragma omp parallel num_threads(numProcs)
+    {
+        const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
+        const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps);
+        for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) {
+            IdxType row = *stepPtr;
+            const IdxType endRow = *(++stepPtr);
+
+            if (row != endRow) {
+                barrier.Wait(proc, staleness - 1U);
+            }
+
+            for (; row != endRow; ++row) {
+                double acc = b[row];
+                for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; entryIdx++) {
+                    acc -= val[entryIdx] * x[inner[entryIdx]];
+                }
+
+                x[row] = acc / val[outer[row + 1] - 1];
+            }
+            // Signal completion of this superstep.
+            barrier.Arrive(proc);
+        }
+    }
+}
+
+template <typename IdxType, unsigned staleness = 2U>
+void SpLTrSvProcPermSSPParallelInPlace(double *__restrict__ const x,
+                                       const IdxType *__restrict__ const outer,
+                                       const IdxType *__restrict__ const inner,
+                                       const double *__restrict__ const val,
+                                       const unsigned numProcs,
+                                       const unsigned numSuperSteps,
+                                       const std::vector<IdxType> &procStepPtr) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    FlatCheckpointCounterBarrier barrier(numProcs);
+#pragma omp parallel num_threads(numProcs)
+    {
+        const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
+        const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps);
+        for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) {
+            IdxType row = *stepPtr;
+            const IdxType endRow = *(++stepPtr);
+
+            if (row != endRow) {
+                barrier.Wait(proc, staleness - 1U);
+            }
+
+            for (; row != endRow; ++row) {
+                double acc = x[row];
+                for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; entryIdx++) {
+                    acc -= val[entryIdx] * x[inner[entryIdx]];
+                }
+
+                x[row] = acc / val[outer[row + 1] - 1];
+            }
+            // Signal completion of this superstep.
+            barrier.Arrive(proc);
+        }
+    }
+}
+
 }    // end namespace osp

From a310d833f2b74814cf5b782712c3c6edf994c40b Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 24 Mar 2026 15:24:30 +0100
Subject: [PATCH 47/57] add bounds-array SSP SpTrSV kernels to sptrsv_kernels.hpp

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 80 ++-----------------
 .../sptrsv_simulator/sptrsv_kernels.hpp       | 73 ++++++++++++++++-
 2 files changed, 79 insertions(+), 74 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 76b59a28..62e1696f 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -694,89 +694,25 @@ class Sptrsv {
     // Uses FlatCheckpointCounterBarrier created internally.
     template <unsigned staleness = 2U>
     void SspLsolveStaleness() const {
-        const unsigned nthreads = instance_->NumberOfProcessors();
-        FlatCheckpointCounterBarrier barrier(nthreads);
-
-        const auto *const csr = instance_->GetComputationalDag().GetCSR();
-        const EigenIdxType *const outer = csr->outerIndexPtr();
-        const EigenIdxType *const inner = csr->innerIndexPtr();
-        const double *const vals = csr->valuePtr();
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
         double *const x = x_;
         const double *const b = b_;
 
-#    pragma omp parallel num_threads(nthreads)
-        {
-            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
-            for (unsigned step = 0; step < numSupersteps_; ++step) {
-                // Process nodes assigned to this (step, proc) pair.
-                const std::size_t boundsStrSize = boundsArrayL_[step][proc].size();
-                // Enforce staleness window before starting this superstep.
-                if (boundsStrSize > 0U) {
-                    barrier.Wait(proc, staleness - 1U);
-                }
-                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
-                    EigenIdxType lowerB = boundsArrayL_[step][proc][index];
-                    const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
-                    for (EigenIdxType node = lowerB; node <= upperB; ++node) {
-                        // Initialize solution for this node
-                        x[node] = b[node];
-                        double acc = 0.0;
-                        // Perform lower-triangular solve for this node
-                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            // Accumulate contributions from previously solved nodes
-                            acc += vals[i] * x[inner[i]];
-                        }
-                        // Divide by diagonal element to complete solve for this node
-                        x[node] = (x[node] - acc) / vals[outer[node + 1] - 1];
-                    }
-                }
-                // Signal completion of this superstep.
-                barrier.Arrive(proc);
-            }
-        }
+        SpLTrSvSSPParallel<EigenIdxType, staleness>(x, b, outer, inner, valPtr, boundsArrayL_);
     }
 
     // SSP Lsolve in-place with staleness=2 (allowing at most one superstep of lag).
     // Uses FlatCheckpointCounterBarrier created internally.
     template <unsigned staleness = 2U>
     void SspLsolveStalenessInPlace() const {
-        const unsigned nthreads = instance_->NumberOfProcessors();
-        FlatCheckpointCounterBarrier barrier(nthreads);
-
-        const auto *const csr = instance_->GetComputationalDag().GetCSR();
-        const EigenIdxType *const outer = csr->outerIndexPtr();
-        const EigenIdxType *const inner = csr->innerIndexPtr();
-        const double *const vals = csr->valuePtr();
+        const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr();
+        const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr();
+        const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr();
         double *const x = x_;
 
-#    pragma omp parallel num_threads(nthreads)
-        {
-            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
-            for (unsigned step = 0; step < numSupersteps_; ++step) {
-                // Process nodes assigned to this (step, proc) pair.
-                const std::size_t boundsStrSize = boundsArrayL_[step][proc].size();
-                // Enforce staleness window before starting this superstep.
-                if (boundsStrSize > 0U) {
-                    barrier.Wait(proc, staleness - 1U);
-                }
-                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
-                    EigenIdxType lowerB = boundsArrayL_[step][proc][index];
-                    const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1];
-                    for (EigenIdxType node = lowerB; node <= upperB; ++node) {
-                        double acc = 0.0;
-                        // Perform lower-triangular solve for this node
-                        for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) {
-                            // Accumulate contributions from previously solved nodes
-                            acc += vals[i] * x[inner[i]];
-                        }
-                        // Divide by diagonal element to complete solve for this node
-                        x[node] = (x[node] - acc) / vals[outer[node + 1] - 1];
-                    }
-                }
-                // Signal completion of this superstep.
-                barrier.Arrive(proc);
-            }
-        }
+        SpLTrSvSSPParallelInPlace<EigenIdxType, staleness>(x, outer, inner, valPtr, boundsArrayL_);
     }
 
     // SSP Usolve with configurable staleness.
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
index a6cec829..d5a18fdc 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
@@ -126,6 +126,77 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x,
     }
 }
 
+template <typename IdxType, unsigned staleness = 2U>
+void SpLTrSvSSPParallel(double *__restrict__ const x,
+                        const double *__restrict__ const b,
+                        const IdxType *__restrict__ const outer,
+                        const IdxType *__restrict__ const inner,
+                        const double *__restrict__ const val,
+                        const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    const std::size_t nthreads = BoundsStepProcIdx[0U].size();
+    FlatCheckpointCounterBarrier barrier(nthreads);
+
+#pragma omp parallel num_threads(nthreads)
+    {
+        const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+        for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) {
+            const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size();
+            if (ubIdx > 0U) {
+                barrier.Wait(proc, staleness - 1U);
+            }
+            for (std::size_t idx = 0; idx < ubIdx; ++idx) {
+                IdxType row = BoundsStepProcIdx[step][proc][idx];
+                const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx];
+                for (; row <= ubRow; ++row) {
+                    double acc = b[row];
+                    for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+                        acc -= val[entryIdx] * x[inner[entryIdx]];
+                    }
+                    x[row] = acc / val[outer[row + 1] - 1];
+                }
+            }
+            barrier.Arrive(proc);
+        }
+    }
+}
+
+template <typename IdxType, unsigned staleness = 2U>
+void SpLTrSvSSPParallelInPlace(double *__restrict__ const x,
+                               const IdxType *__restrict__ const outer,
+                               const IdxType *__restrict__ const inner,
+                               const double *__restrict__ const val,
+                               const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+    static_assert(std::is_integral_v<IdxType>);
+
+    const std::size_t nthreads = BoundsStepProcIdx[0U].size();
+    FlatCheckpointCounterBarrier barrier(nthreads);
+
+#pragma omp parallel num_threads(nthreads)
+    {
+        const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+        for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) {
+            const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size();
+            if (ubIdx > 0U) {
+                barrier.Wait(proc, staleness - 1U);
+            }
+            for (std::size_t idx = 0; idx < ubIdx; ++idx) {
+                IdxType row = BoundsStepProcIdx[step][proc][idx];
+                const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx];
+                for (; row <= ubRow; ++row) {
+                    double acc = x[row];
+                    for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
+                        acc -= val[entryIdx] * x[inner[entryIdx]];
+                    }
+                    x[row] = acc / val[outer[row + 1] - 1];
+                }
+            }
+            barrier.Arrive(proc);
+        }
+    }
+}
+
 template <typename IdxType>
 void SpLTrSvProcPermBSPParallel(double *__restrict__ const x,
                                 const double *__restrict__ const b,
@@ -221,7 +292,6 @@ void SpLTrSvProcPermSSPParallel(double *__restrict__ const x,
 
                 x[row] = acc / val[outer[row + 1] - 1];
             }
-            // Signal completion of this superstep.
             barrier.Arrive(proc);
         }
     }
@@ -258,7 +328,6 @@ void SpLTrSvProcPermSSPParallelInPlace(double *__restrict__ const x,
 
                 x[row] = acc / val[outer[row + 1] - 1];
             }
-            // Signal completion of this superstep.
             barrier.Arrive(proc);
         }
     }

From 64cd7d7dc6a3503483fc59e01d72775cc090daf7 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 24 Mar 2026 15:37:53 +0100
Subject: [PATCH 48/57] pass procStepPtr to ProcPerm SpTrSV kernels as a raw pointer

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp |  4 ++--
 .../sptrsv_simulator/sptrsv_kernels.hpp       | 24 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 62e1696f..84166644 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -591,7 +591,7 @@ class Sptrsv {
     void LsolveWithProcFirstPermutationInPlace() const {
         double *const x = x_;
 
-        SpLTrSvProcPermBSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_);
+        SpLTrSvProcPermBSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_.data());
     }
 
     void LsolveWithPermutation() const {
@@ -656,7 +656,7 @@ class Sptrsv {
     void SspLsolveStalenessWithProcFirstPermutationInPlace() const {
         double *const x = x_;
 
-        SpLTrSvProcPermSSPParallelInPlace<UVertType, staleness>(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_);
+        SpLTrSvProcPermSSPParallelInPlace<UVertType, staleness>(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_.data());
     }
 
     void ResetX() {
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
index d5a18fdc..ccf323be 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
@@ -205,14 +205,14 @@ void SpLTrSvProcPermBSPParallel(double *__restrict__ const x,
                                 const double *__restrict__ const val,
                                 const unsigned numProcs,
                                 const unsigned numSuperSteps,
-                                const std::vector<IdxType> &procStepPtr) {
+                                const IdxType *__restrict__ const procStepPtr) {
     static_assert(std::is_integral_v<IdxType>);
 
 #pragma omp parallel num_threads(numProcs)
     {
         const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
-        const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps);
-        for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) {
+        const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps);
+        for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) {
             IdxType row = *stepPtr;
             const IdxType endRow = *(++stepPtr);
             for (; row != endRow; ++row) {
@@ -236,14 +236,14 @@ void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x,
                                        const double *__restrict__ const val,
                                        const unsigned numProcs,
                                        const unsigned numSuperSteps,
-                                       const std::vector<IdxType> &procStepPtr) {
+                                       const IdxType *__restrict__ const procStepPtr) {
     static_assert(std::is_integral_v<IdxType>);
 
 #pragma omp parallel num_threads(numProcs)
     {
         const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
-        const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps);
-        for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) {
+        const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps);
+        for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) {
             IdxType row = *stepPtr;
             const IdxType endRow = *(++stepPtr);
             for (; row != endRow; ++row) {
@@ -268,15 +268,15 @@ void SpLTrSvProcPermSSPParallel(double *__restrict__ const x,
                                 const double *__restrict__ const val,
                                 const unsigned numProcs,
                                 const unsigned numSuperSteps,
-                                const std::vector<IdxType> &procStepPtr) {
+                                const IdxType *__restrict__ const procStepPtr) {
     static_assert(std::is_integral_v<IdxType>);
 
     FlatCheckpointCounterBarrier barrier(numProcs);
 #pragma omp parallel num_threads(numProcs)
     {
         const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
-        const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps);
-        for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) {
+        const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps);
+        for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) {
             IdxType row = *stepPtr;
             const IdxType endRow = *(++stepPtr);
 
@@ -304,15 +304,15 @@ void SpLTrSvProcPermSSPParallelInPlace(double *__restrict__ const x,
                                        const double *__restrict__ const val,
                                        const unsigned numProcs,
                                        const unsigned numSuperSteps,
-                                       const std::vector<IdxType> &procStepPtr) {
+                                       const IdxType *__restrict__ const procStepPtr) {
     static_assert(std::is_integral_v<IdxType>);
 
     FlatCheckpointCounterBarrier barrier(numProcs);
 #pragma omp parallel num_threads(numProcs)
     {
         const unsigned proc = static_cast<unsigned>(omp_get_thread_num());
-        const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps);
-        for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) {
+        const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps);
+        for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) {
             IdxType row = *stepPtr;
             const IdxType endRow = *(++stepPtr);
 

From 7cc544b6fedc988a40cba82595e5a62ed37b2cad Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 24 Mar 2026 15:44:08 +0100
Subject: [PATCH 49/57] initialised accumulator

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 52 +++++++++----------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 84166644..4dcc9047 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -444,12 +444,11 @@ class Sptrsv {
         EigenIdxType i = numberOfVertices;
         do {
             i--;
-            x[i] = b[i];
-            double acc = 0.0;
+            double acc = b[i];
             for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
-                acc += valPtr[j] * x[inner[j]];
+                acc -= valPtr[j] * x[inner[j]];
             }
-            x[i] = (x[i] - acc) / valPtr[outer[i]];
+            x[i] = acc / valPtr[outer[i]];
         } while (i != 0);
     }
 
@@ -482,11 +481,11 @@ class Sptrsv {
 
                     do {
                         node--;
-                        double acc = 0.0;
+                        double acc = x[node];
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            acc += valPtr[i] * x[inner[i]];
+                            acc -= valPtr[i] * x[inner[i]];
                         }
-                        x[node] = (x[node] - acc) / valPtr[outer[node]];
+                        x[node] = acc / valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -525,12 +524,11 @@ class Sptrsv {
 
                     do {
                         node--;
-                        x[node] = b[node];
-                        double acc = 0.0;
+                        double acc = b[node];
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            acc += valPtr[i] * x[inner[i]];
+                            acc -= valPtr[i] * x[inner[i]];
                         }
-                        x[node] = (x[node] - acc) / valPtr[outer[node]];
+                        x[node] = acc / valPtr[outer[node]];
                     } while (node != lowerB);
                 }
 #    pragma omp barrier
@@ -558,11 +556,11 @@ class Sptrsv {
         EigenIdxType i = numberOfVertices;
         do {
             i--;
-            double acc = 0.0;
+            double acc = x[i];
             for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) {
-                acc += valPtr[j] * x[inner[j]];
+                acc -= valPtr[j] * x[inner[j]];
             }
-            x[i] = (x[i] - acc) / valPtr[outer[i]];
+            x[i] = acc / valPtr[outer[i]];
         } while (i != 0);
     }
 
@@ -575,12 +573,12 @@ class Sptrsv {
             for (unsigned step = 0; step < numSupersteps_; step++) {
                 const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
                 for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
-                    double acc = 0.0;
+                    double acc = x[rowIdx];
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        acc += val_[i] * x[colIdx_[i]];
+                        acc -= val_[i] * x[colIdx_[i]];
                     }
 
-                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
+                    x[rowIdx] = acc / val_[rowPtr_[rowIdx + 1] - 1];
                 }
 
 #    pragma omp barrier
@@ -604,13 +602,12 @@ class Sptrsv {
                 const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
                 const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
                 for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
-                    x[rowIdx] = b[rowIdx];
-                    double acc = 0.0;
+                    double acc = b[rowIdx];
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        acc += val_[i] * x[colIdx_[i]];
+                        acc -= val_[i] * x[colIdx_[i]];
                     }
 
-                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
+                    x[rowIdx] = acc / val_[rowPtr_[rowIdx + 1] - 1];
                 }
 
 #    pragma omp barrier
@@ -639,12 +636,12 @@ class Sptrsv {
 
                 const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
                 for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
-                    double acc = 0.0;
+                    double acc = x[rowIdx];
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
-                        acc += val_[i] * x[colIdx_[i]];
+                        acc -= val_[i] * x[colIdx_[i]];
                     }
 
-                    x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1];
+                    x[rowIdx] = acc / val_[rowPtr_[rowIdx + 1] - 1];
                 }
                 // Signal completion of this superstep.
                 barrier.Arrive(proc);
@@ -746,12 +743,11 @@ class Sptrsv {
 
                     do {
                         node--;
-                        x[node] = b[node];
-                        double acc = 0.0;
+                        double acc = b[node];
                         for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) {
-                            acc += vals[i] * x[inner[i]];
+                            acc -= vals[i] * x[inner[i]];
                         }
-                        x[node] = (x[node] - acc) / vals[outer[node]];
+                        x[node] = acc / vals[outer[node]];
                     } while (node != lowerB);
                 }
 

From f9eb927125b89786769b4d5179624f70d5d97dd6 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Tue, 24 Mar 2026 15:52:33 +0100
Subject: [PATCH 50/57] change of epsilon

---
 apps/maxbsp_ssp_sptrsv.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp
index a15e24f5..1102f506 100644
--- a/apps/maxbsp_ssp_sptrsv.cpp
+++ b/apps/maxbsp_ssp_sptrsv.cpp
@@ -42,7 +42,7 @@ using namespace osp;
 
 namespace {
 
-constexpr double EPSILON = 1e-12;
+constexpr double EPSILON = 1e-8;
 constexpr unsigned kDefaultStaleness = 2U;
 constexpr int defaultSynchronisationCosts = 500;
 

From 4d7fb9473f7b1c4d27c09b116b625f0a51afb534 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 25 Mar 2026 10:08:40 +0100
Subject: [PATCH 51/57] change lower triangular bounds to proc first

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 41 +++++++-------
 .../sptrsv_simulator/sptrsv_kernels.hpp       | 56 +++++++++++--------
 2 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 4dcc9047..b3638534 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -75,7 +75,7 @@ class Sptrsv {
 
     unsigned numSupersteps_;
 
-    std::vector<std::vector<std::vector<EigenIdxType>>> vectorStepProcessorVertices_;
+    std::vector<std::vector<std::vector<EigenIdxType>>> vectorProcessorStepVerticesL_;
     std::vector<std::vector<std::vector<EigenIdxType>>> vectorStepProcessorVerticesU_;
     std::vector<int> ready_;
 
@@ -87,14 +87,14 @@ class Sptrsv {
     Sptrsv(BspInstance<SparseMatrixImp<EigenIdxType>> &inst) : instance_(&inst) {};
 
     void SetupCsrNoPermutation(const BspSchedule<SparseMatrixImp<EigenIdxType>> &schedule) {
-        vectorStepProcessorVertices_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
-            schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
+        vectorProcessorStepVerticesL_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
+            schedule.GetInstance().NumberOfProcessors(), std::vector<std::vector<EigenIdxType>>(schedule.NumberOfSupersteps()));
 
         vectorStepProcessorVerticesU_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
             schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
 
         boundsArrayL_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
-            schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
+            schedule.GetInstance().NumberOfProcessors(), std::vector<std::vector<EigenIdxType>>(schedule.NumberOfSupersteps()));
         boundsArrayU_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
             schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
 
@@ -107,27 +107,30 @@ class Sptrsv {
             switch (id) {
                 case 0: {
                     for (UVertType node = 0; node < numberOfVertices; ++node) {
-                        vectorStepProcessorVertices_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back(
+                        vectorProcessorStepVerticesL_[schedule.AssignedProcessor(node)][schedule.AssignedSuperstep(node)].push_back(
                             static_cast<EigenIdxType>(node));
                     }
 
-                    for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) {
-                        for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) {
-                            if (!vectorStepProcessorVertices_[step][proc].empty()) {
-                                EigenIdxType start = vectorStepProcessorVertices_[step][proc][0];
-                                EigenIdxType prev = vectorStepProcessorVertices_[step][proc][0];
-
-                                for (UVertType i = 1; i < vectorStepProcessorVertices_[step][proc].size(); ++i) {
-                                    if (vectorStepProcessorVertices_[step][proc][i] != prev + 1) {
-                                        boundsArrayL_[step][proc].push_back(start);
-                                        boundsArrayL_[step][proc].push_back(prev);
-                                        start = vectorStepProcessorVertices_[step][proc][i];
+                    for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) {
+                        for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) {
+                            const auto &vectorVerticesL = vectorProcessorStepVerticesL_[proc][step];
+                            auto &localBoundsArrayL_ = boundsArrayL_[proc][step];
+
+                            if (!vectorVerticesL.empty()) {
+                                EigenIdxType start = vectorVerticesL[0];
+                                EigenIdxType prev = vectorVerticesL[0];
+
+                                for (UVertType i = 1; i < vectorVerticesL.size(); ++i) {
+                                    if (vectorVerticesL[i] != prev + 1) {
+                                        localBoundsArrayL_.push_back(start);
+                                        localBoundsArrayL_.push_back(prev);
+                                        start = vectorVerticesL[i];
                                     }
-                                    prev = vectorStepProcessorVertices_[step][proc][i];
+                                    prev = vectorVerticesL[i];
                                 }
 
-                                boundsArrayL_[step][proc].push_back(start);
-                                boundsArrayL_[step][proc].push_back(prev);
+                                localBoundsArrayL_.push_back(start);
+                                localBoundsArrayL_.push_back(prev);
                             }
                         }
                     }
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
index ccf323be..f2988213 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
@@ -69,19 +69,21 @@ void SpLTrSvBSPParallel(double *__restrict__ const x,
                         const IdxType *__restrict__ const outer,
                         const IdxType *__restrict__ const inner,
                         const double *__restrict__ const val,
-                        const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+                        const std::vector<std::vector<std::vector<IdxType>>> &BoundsProcStepIdx) {
     static_assert(std::is_integral_v<IdxType>);
 
-#pragma omp parallel num_threads(BoundsStepProcIdx[0U].size())
+#pragma omp parallel num_threads(BoundsProcStepIdx.size())
     {
         const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
-        const std::size_t numSuperSteps = BoundsStepProcIdx.size();
+        const std::vector<std::vector<IdxType>> &BoundsStepIdx = BoundsProcStepIdx[proc];
+        const std::size_t numSuperSteps = BoundsStepIdx.size();
 
         for (std::size_t step = 0U; step < numSuperSteps; ++step) {
-            const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size();
+            const std::vector<IdxType> &BoundIdx = BoundsStepIdx[step];
+            const std::size_t ubIdx = BoundIdx.size();
             for (std::size_t idx = 0U; idx < ubIdx; ++idx) {
-                IdxType row = BoundsStepProcIdx[step][proc][idx];
-                const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx];
+                IdxType row = BoundIdx[idx];
+                const IdxType ubRow = BoundIdx[++idx];
                 for (; row <= ubRow; ++row) {
                     double acc = b[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
@@ -100,19 +102,21 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x,
                                const IdxType *__restrict__ const outer,
                                const IdxType *__restrict__ const inner,
                                const double *__restrict__ const val,
-                               const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+                               const std::vector<std::vector<std::vector<IdxType>>> &BoundsProcStepIdx) {
     static_assert(std::is_integral_v<IdxType>);
 
-#pragma omp parallel num_threads(BoundsStepProcIdx[0U].size())
+#pragma omp parallel num_threads(BoundsProcStepIdx.size())
     {
         const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
-        const std::size_t numSuperSteps = BoundsStepProcIdx.size();
+        const std::vector<std::vector<IdxType>> &BoundsStepIdx = BoundsProcStepIdx[proc];
+        const std::size_t numSuperSteps = BoundsStepIdx.size();
 
         for (std::size_t step = 0U; step < numSuperSteps; ++step) {
-            const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size();
+            const std::vector<IdxType> &BoundIdx = BoundsStepIdx[step];
+            const std::size_t ubIdx = BoundIdx.size();
             for (std::size_t idx = 0U; idx < ubIdx; ++idx) {
-                IdxType row = BoundsStepProcIdx[step][proc][idx];
-                const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx];
+                IdxType row = BoundIdx[idx];
+                const IdxType ubRow = BoundIdx[++idx];
                 for (; row <= ubRow; ++row) {
                     double acc = x[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
@@ -132,23 +136,25 @@ void SpLTrSvSSPParallel(double *__restrict__ const x,
                         const IdxType *__restrict__ const outer,
                         const IdxType *__restrict__ const inner,
                         const double *__restrict__ const val,
-                        const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+                        const std::vector<std::vector<std::vector<IdxType>>> &BoundsProcStepIdx) {
     static_assert(std::is_integral_v<IdxType>);
 
-    const std::size_t nthreads = BoundsStepProcIdx[0U].size();
+    const std::size_t nthreads = BoundsProcStepIdx.size();
     FlatCheckpointCounterBarrier barrier(nthreads);
 
 #pragma omp parallel num_threads(nthreads)
     {
         const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
-        for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) {
-            const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size();
+        const std::vector<std::vector<IdxType>> &BoundsStepIdx = BoundsProcStepIdx[proc];
+        for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) {
+            const std::vector<IdxType> &BoundsIdx = BoundsStepIdx[step];
+            const std::size_t ubIdx = BoundsIdx.size();
             if (ubIdx > 0U) {
                 barrier.Wait(proc, staleness - 1U);
             }
             for (std::size_t idx = 0; idx < ubIdx; ++idx) {
-                IdxType row = BoundsStepProcIdx[step][proc][idx];
-                const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx];
+                IdxType row = BoundsIdx[idx];
+                const IdxType ubRow = BoundsIdx[++idx];
                 for (; row <= ubRow; ++row) {
                     double acc = b[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
@@ -167,23 +173,25 @@ void SpLTrSvSSPParallelInPlace(double *__restrict__ const x,
                                const IdxType *__restrict__ const outer,
                                const IdxType *__restrict__ const inner,
                                const double *__restrict__ const val,
-                               const std::vector<std::vector<std::vector<IdxType>>> &BoundsStepProcIdx) {
+                               const std::vector<std::vector<std::vector<IdxType>>> &BoundsProcStepIdx) {
     static_assert(std::is_integral_v<IdxType>);
 
-    const std::size_t nthreads = BoundsStepProcIdx[0U].size();
+    const std::size_t nthreads = BoundsProcStepIdx.size();
     FlatCheckpointCounterBarrier barrier(nthreads);
 
 #pragma omp parallel num_threads(nthreads)
     {
         const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
-        for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) {
-            const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size();
+        const std::vector<std::vector<IdxType>> &BoundsStepIdx = BoundsProcStepIdx[proc];
+        for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) {
+            const std::vector<IdxType> &BoundsIdx = BoundsStepIdx[step];
+            const std::size_t ubIdx = BoundsIdx.size();
             if (ubIdx > 0U) {
                 barrier.Wait(proc, staleness - 1U);
             }
             for (std::size_t idx = 0; idx < ubIdx; ++idx) {
-                IdxType row = BoundsStepProcIdx[step][proc][idx];
-                const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx];
+                IdxType row = BoundsIdx[idx];
+                const IdxType ubRow = BoundsIdx[++idx];
                 for (; row <= ubRow; ++row) {
                     double acc = x[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {

From 6ee7cbad8238ec8142d1dff1f1ef567aab3cf75a Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 25 Mar 2026 10:29:35 +0100
Subject: [PATCH 52/57] u solve swap processor and superstep order (for
 locality)

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 71 +++++++++++--------
 1 file changed, 40 insertions(+), 31 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index b3638534..2871be86 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -76,7 +76,7 @@ class Sptrsv {
     unsigned numSupersteps_;
 
     std::vector<std::vector<std::vector<EigenIdxType>>> vectorProcessorStepVerticesL_;
-    std::vector<std::vector<std::vector<EigenIdxType>>> vectorStepProcessorVerticesU_;
+    std::vector<std::vector<std::vector<EigenIdxType>>> vectorProcessorStepVerticesU_;
     std::vector<int> ready_;
 
     std::vector<std::vector<std::vector<EigenIdxType>>> boundsArrayL_;
@@ -90,13 +90,13 @@ class Sptrsv {
         vectorProcessorStepVerticesL_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
             schedule.GetInstance().NumberOfProcessors(), std::vector<std::vector<EigenIdxType>>(schedule.NumberOfSupersteps()));
 
-        vectorStepProcessorVerticesU_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
-            schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
+        vectorProcessorStepVerticesU_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
+            schedule.GetInstance().NumberOfProcessors(), std::vector<std::vector<EigenIdxType>>(schedule.NumberOfSupersteps()));
 
         boundsArrayL_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
             schedule.GetInstance().NumberOfProcessors(), std::vector<std::vector<EigenIdxType>>(schedule.NumberOfSupersteps()));
         boundsArrayU_ = std::vector<std::vector<std::vector<EigenIdxType>>>(
-            schedule.NumberOfSupersteps(), std::vector<std::vector<EigenIdxType>>(schedule.GetInstance().NumberOfProcessors()));
+            schedule.GetInstance().NumberOfProcessors(), std::vector<std::vector<EigenIdxType>>(schedule.NumberOfSupersteps()));
 
         numSupersteps_ = schedule.NumberOfSupersteps();
         UVertType numberOfVertices = instance_->GetComputationalDag().NumVertices();
@@ -141,29 +141,32 @@ class Sptrsv {
                     UVertType node = numberOfVertices;
                     do {
                         node--;
-                        vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back(
+                        vectorProcessorStepVerticesU_[schedule.AssignedProcessor(node)][schedule.AssignedSuperstep(node)].push_back(
                             // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp ---
 
                             static_cast<EigenIdxType>(node));
                     } while (node > 0);
 
-                    for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) {
-                        for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) {
-                            if (!vectorStepProcessorVerticesU_[step][proc].empty()) {
-                                EigenIdxType startU = static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][0]);
-                                EigenIdxType prevU = static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][0]);
-
-                                for (UVertType i = 1; i < vectorStepProcessorVerticesU_[step][proc].size(); ++i) {
-                                    if (static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][i]) != prevU - 1) {
-                                        boundsArrayU_[step][proc].push_back(startU);
-                                        boundsArrayU_[step][proc].push_back(prevU);
-                                        startU = static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][i]);
+                    for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) {
+                        for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) {
+                            const auto &vectorVerticesU = vectorProcessorStepVerticesU_[proc][step];
+                            auto &localBoundsArrayU = boundsArrayU_[proc][step];
+
+                            if (!vectorVerticesU.empty()) {
+                                EigenIdxType startU = static_cast<EigenIdxType>(vectorVerticesU[0]);
+                                EigenIdxType prevU = static_cast<EigenIdxType>(vectorVerticesU[0]);
+
+                                for (UVertType i = 1; i < vectorVerticesU.size(); ++i) {
+                                    if (static_cast<EigenIdxType>(vectorVerticesU[i]) != prevU - 1) {
+                                        localBoundsArrayU.push_back(startU);
+                                        localBoundsArrayU.push_back(prevU);
+                                        startU = static_cast<EigenIdxType>(vectorVerticesU[i]);
                                     }
-                                    prevU = static_cast<EigenIdxType>(vectorStepProcessorVerticesU_[step][proc][i]);
+                                    prevU = static_cast<EigenIdxType>(vectorVerticesU[i]);
                                 }
 
-                                boundsArrayU_[step][proc].push_back(startU);
-                                boundsArrayU_[step][proc].push_back(prevU);
+                                localBoundsArrayU.push_back(startU);
+                                localBoundsArrayU.push_back(prevU);
                             }
                         }
                     }
@@ -474,13 +477,15 @@ class Sptrsv {
         {
             // Process each superstep starting from the last one (opposite of lsolve)
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            const auto& procLocalBoundsArrayU = boundsArrayU_[proc];
             unsigned step = numSupersteps_;
             do {
                 step--;
-                const std::size_t boundsStrSize = boundsArrayU_[step][proc].size();
-                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
-                    EigenIdxType node = boundsArrayU_[step][proc][index] + 1;
-                    const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1];
+                const auto &localBoundsArrayU = procLocalBoundsArrayU[step];
+                const std::size_t boundsStrSize = localBoundsArrayU.size();
+                for (std::size_t index = 0; index < boundsStrSize; ++index) {
+                    EigenIdxType node = localBoundsArrayU[index] + 1;
+                    const EigenIdxType lowerB = localBoundsArrayU[++index];
 
                     do {
                         node--;
@@ -517,13 +522,15 @@ class Sptrsv {
         {
             // Process each superstep starting from the last one (opposite of lsolve)
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            const auto &procLocalBoundsArrayU = boundsArrayU_[proc];
             unsigned step = numSupersteps_;
             do {
                 step--;
-                const std::size_t boundsStrSize = boundsArrayU_[step][proc].size();
-                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
-                    EigenIdxType node = boundsArrayU_[step][proc][index] + 1;
-                    const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1];
+                const auto &localBoundsArrayU = procLocalBoundsArrayU[step];
+                const std::size_t boundsStrSize = localBoundsArrayU.size();
+                for (std::size_t index = 0; index < boundsStrSize; ++index) {
+                    EigenIdxType node = localBoundsArrayU[index] + 1;
+                    const EigenIdxType lowerB = localBoundsArrayU[++index];
 
                     do {
                         node--;
@@ -732,17 +739,19 @@ class Sptrsv {
 #    pragma omp parallel num_threads(nthreads)
         {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            const auto &procLocalBoundsArrayU = boundsArrayU_[proc];
             unsigned step = numSupersteps_;
             do {
                 step--;
-                const std::size_t boundsStrSize = boundsArrayU_[step][proc].size();
+                const auto &localBoundsArrayU = procLocalBoundsArrayU[step];
+                const std::size_t boundsStrSize = localBoundsArrayU.size();
                 if (boundsStrSize > 0U) {
                     barrier.Wait(proc, staleness - 1U);
                 }
 
-                for (std::size_t index = 0; index < boundsStrSize; index += 2) {
-                    EigenIdxType node = boundsArrayU_[step][proc][index] + 1;
-                    const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1];
+                for (std::size_t index = 0; index < boundsStrSize; ++index) {
+                    EigenIdxType node = localBoundsArrayU[index] + 1;
+                    const EigenIdxType lowerB = localBoundsArrayU[++index];
 
                     do {
                         node--;

From 58060450607f231306be3a3613569ef1f1353ba7 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 25 Mar 2026 11:40:00 +0100
Subject: [PATCH 53/57] small sptrsv kernel optimisations

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 2871be86..90cc2f28 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -580,9 +580,12 @@ class Sptrsv {
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            const auto &stepPtr = procStepPtr_[proc];
+            const auto &stepNum = procStepNum_[proc];
+
             for (unsigned step = 0; step < numSupersteps_; step++) {
-                const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
-                for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
+                const UVertType upperLimit = stepPtr[step] + stepNum[step];
+                for (UVertType rowIdx = stepPtr[step]; rowIdx < upperLimit; rowIdx++) {
                     double acc = x[rowIdx];
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
                         acc -= val_[i] * x[colIdx_[i]];
@@ -608,10 +611,13 @@ class Sptrsv {
 
 #    pragma omp parallel num_threads(instance_->NumberOfProcessors())
         {
+            const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            const auto &stepPtr = procStepPtr_[proc];
+            const auto &stepNum = procStepNum_[proc];
+
             for (unsigned step = 0; step < numSupersteps_; step++) {
-                const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
-                const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
-                for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
+                const UVertType upperLimit = stepPtr[step] + stepNum[step];
+                for (UVertType rowIdx = stepPtr[step]; rowIdx < upperLimit; rowIdx++) {
                     double acc = b[rowIdx];
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
                         acc -= val_[i] * x[colIdx_[i]];
@@ -639,13 +645,18 @@ class Sptrsv {
 #    pragma omp parallel num_threads(nthreads)
         {
             const std::size_t proc = static_cast<std::size_t>(omp_get_thread_num());
+            const auto &stepPtr = procStepPtr_[proc];
+            const auto &stepNum = procStepNum_[proc];
+
             for (unsigned step = 0; step < numSupersteps_; ++step) {
-                if (procStepNum_[proc][step] > 0U) {
+                UVertType rowIdx = stepPtr[step];
+                const UVertType upperLimit = stepPtr[step] + stepNum[step];
+
+                if (rowIdx != upperLimit) {
                     barrier.Wait(proc, staleness - 1U);
                 }
 
-                const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step];
-                for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) {
+                for (; rowIdx < upperLimit; rowIdx++) {
                     double acc = x[rowIdx];
                     for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) {
                         acc -= val_[i] * x[colIdx_[i]];

From 72817ba73553e92f6b163421fac1836a4ce232e8 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 25 Mar 2026 13:03:13 +0100
Subject: [PATCH 54/57] sptrsv iterator optimisations

---
 .../sptrsv_simulator/sptrsv_kernels.hpp       | 46 +++++++++++--------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
index f2988213..fcbcf424 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp
@@ -79,11 +79,11 @@ void SpLTrSvBSPParallel(double *__restrict__ const x,
         const std::size_t numSuperSteps = BoundsStepIdx.size();
 
         for (std::size_t step = 0U; step < numSuperSteps; ++step) {
-            const std::vector<IdxType> &BoundIdx = BoundsStepIdx[step];
-            const std::size_t ubIdx = BoundIdx.size();
-            for (std::size_t idx = 0U; idx < ubIdx; ++idx) {
-                IdxType row = BoundIdx[idx];
-                const IdxType ubRow = BoundIdx[++idx];
+            const std::vector<IdxType> &BoundsIdx = BoundsStepIdx[step];
+            const auto idxItEnd = BoundsIdx.cend();
+            for (auto idxIt = BoundsIdx.cbegin(); idxIt != idxItEnd; ++idxIt) {
+                IdxType row = *idxIt;
+                const IdxType ubRow = *(++idxIt);
                 for (; row <= ubRow; ++row) {
                     double acc = b[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
@@ -112,11 +112,11 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x,
         const std::size_t numSuperSteps = BoundsStepIdx.size();
 
         for (std::size_t step = 0U; step < numSuperSteps; ++step) {
-            const std::vector<IdxType> &BoundIdx = BoundsStepIdx[step];
-            const std::size_t ubIdx = BoundIdx.size();
-            for (std::size_t idx = 0U; idx < ubIdx; ++idx) {
-                IdxType row = BoundIdx[idx];
-                const IdxType ubRow = BoundIdx[++idx];
+            const std::vector<IdxType> &BoundsIdx = BoundsStepIdx[step];
+            const auto idxItEnd = BoundsIdx.cend();
+            for (auto idxIt = BoundsIdx.cbegin(); idxIt != idxItEnd; ++idxIt) {
+                IdxType row = *idxIt;
+                const IdxType ubRow = *(++idxIt);
                 for (; row <= ubRow; ++row) {
                     double acc = x[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
@@ -148,13 +148,16 @@ void SpLTrSvSSPParallel(double *__restrict__ const x,
         const std::vector<std::vector<IdxType>> &BoundsStepIdx = BoundsProcStepIdx[proc];
         for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) {
             const std::vector<IdxType> &BoundsIdx = BoundsStepIdx[step];
-            const std::size_t ubIdx = BoundsIdx.size();
-            if (ubIdx > 0U) {
+            auto idxIt = BoundsIdx.cbegin();
+            const auto idxItEnd = BoundsIdx.cend();
+
+            if (idxIt != idxItEnd) {
                 barrier.Wait(proc, staleness - 1U);
             }
-            for (std::size_t idx = 0; idx < ubIdx; ++idx) {
-                IdxType row = BoundsIdx[idx];
-                const IdxType ubRow = BoundsIdx[++idx];
+
+            for (; idxIt != idxItEnd; ++idxIt) {
+                IdxType row = *idxIt;
+                const IdxType ubRow = *(++idxIt);
                 for (; row <= ubRow; ++row) {
                     double acc = b[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {
@@ -185,13 +188,16 @@ void SpLTrSvSSPParallelInPlace(double *__restrict__ const x,
         const std::vector<std::vector<IdxType>> &BoundsStepIdx = BoundsProcStepIdx[proc];
         for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) {
             const std::vector<IdxType> &BoundsIdx = BoundsStepIdx[step];
-            const std::size_t ubIdx = BoundsIdx.size();
-            if (ubIdx > 0U) {
+            auto idxIt = BoundsIdx.cbegin();
+            const auto idxItEnd = BoundsIdx.cend();
+
+            if (idxIt != idxItEnd) {
                 barrier.Wait(proc, staleness - 1U);
             }
-            for (std::size_t idx = 0; idx < ubIdx; ++idx) {
-                IdxType row = BoundsIdx[idx];
-                const IdxType ubRow = BoundsIdx[++idx];
+
+            for (; idxIt != idxItEnd; ++idxIt) {
+                IdxType row = *idxIt;
+                const IdxType ubRow = *(++idxIt);
                 for (; row <= ubRow; ++row) {
                     double acc = x[row];
                     for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) {

From 5c71d4a23450ba2294bb20d5586fea2711e501bf Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 25 Mar 2026 13:18:06 +0100
Subject: [PATCH 55/57] more sptrsv iterator optimisations

---
 .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 90cc2f28..c246243a 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -482,10 +482,10 @@ class Sptrsv {
             do {
                 step--;
                 const auto &localBoundsArrayU = procLocalBoundsArrayU[step];
-                const std::size_t boundsStrSize = localBoundsArrayU.size();
-                for (std::size_t index = 0; index < boundsStrSize; ++index) {
-                    EigenIdxType node = localBoundsArrayU[index] + 1;
-                    const EigenIdxType lowerB = localBoundsArrayU[++index];
+                const auto idxItEnd = localBoundsArrayU.cend();
+                for (auto idxIt = localBoundsArrayU.cbegin(); idxIt != idxItEnd; ++idxIt) {
+                    EigenIdxType node = (*idxIt) + 1;
+                    const EigenIdxType lowerB = *(++idxIt);
 
                     do {
                         node--;
@@ -528,9 +528,9 @@ class Sptrsv {
                 step--;
                 const auto &localBoundsArrayU = procLocalBoundsArrayU[step];
-                const std::size_t boundsStrSize = localBoundsArrayU.size();
-                for (std::size_t index = 0; index < boundsStrSize; ++index) {
-                    EigenIdxType node = localBoundsArrayU[index] + 1;
-                    const EigenIdxType lowerB = localBoundsArrayU[++index];
+                const auto idxItEnd = localBoundsArrayU.cend();
+                for (auto idxIt = localBoundsArrayU.cbegin(); idxIt != idxItEnd; ++idxIt) {
+                    EigenIdxType node = (*idxIt) + 1;
+                    const EigenIdxType lowerB = *(++idxIt);
 
                     do {
                         node--;
@@ -755,14 +755,16 @@ class Sptrsv {
             do {
                 step--;
                 const auto &localBoundsArrayU = procLocalBoundsArrayU[step];
-                const std::size_t boundsStrSize = localBoundsArrayU.size();
-                if (boundsStrSize > 0U) {
+                auto idxIt = localBoundsArrayU.cbegin();
+                const auto idxItEnd = localBoundsArrayU.cend();
+
+                if (idxIt != idxItEnd) {
                     barrier.Wait(proc, staleness - 1U);
                 }
 
-                for (std::size_t index = 0; index < boundsStrSize; ++index) {
-                    EigenIdxType node = localBoundsArrayU[index] + 1;
-                    const EigenIdxType lowerB = localBoundsArrayU[++index];
+                for (; idxIt != idxItEnd; ++idxIt) {
+                    EigenIdxType node = (*idxIt) + 1;
+                    const EigenIdxType lowerB = *(++idxIt);
 
                     do {
                         node--;

From d3e6adef7d802cdc058ea05088d81926794903b9 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 1 Apr 2026 14:58:18 +0200
Subject: [PATCH 56/57] removed pedantic warnings

---
 tests/kl_lambda.cpp | 4 ++--
 tests/kl_total.cpp  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/kl_lambda.cpp b/tests/kl_lambda.cpp
index 8acaf427..6d985d25 100644
--- a/tests/kl_lambda.cpp
+++ b/tests/kl_lambda.cpp
@@ -356,7 +356,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_incremental_update_test) {
 
     CheckEqualLambdaMap(lambda_map, kl_6.GetCommCostF().nodeLambdaMap_);
     CheckEqualAffinityTable(affinity, kl_6.GetAffinityTable(), nodes_to_check);
-};
+}
 
 BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_test) {
     using graph = ComputationalDagEdgeIdxVectorImplDefIntT;
@@ -486,7 +486,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_test) {
 
     CheckEqualLambdaMap(lambda_map, kl_6.GetCommCostF().nodeLambdaMap_);
     CheckEqualAffinityTable(affinity, kl_6.GetAffinityTable(), nodes_to_check);
-};
+}
 
 BOOST_AUTO_TEST_CASE(KlLambdaImproverInnerLoopPenaltyTest) {
     using Graph = ComputationalDagEdgeIdxVectorImplDefIntT;
diff --git a/tests/kl_total.cpp b/tests/kl_total.cpp
index 5b48d12d..f0d1e25f 100644
--- a/tests/kl_total.cpp
+++ b/tests/kl_total.cpp
@@ -887,7 +887,7 @@ BOOST_AUTO_TEST_CASE(KlImprover_incremental_update_test) {
     nodes_to_check.erase(v3);
 
     CheckEqualAffinityTable(affinity, kl_6.GetAffinityTable(), nodes_to_check);
-};
+}
 
 // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs) {
 //     std::vector<std::string> filenames_graph = LargeSpaaGraphs();

From 0e598b2d623daa0184ffb6998109d2e4f2d31c60 Mon Sep 17 00:00:00 2001
From: Raphael Steiner <raphael.steiner@huawei.com>
Date: Wed, 1 Apr 2026 15:10:17 +0200
Subject: [PATCH 57/57] reactivation of concept

---
 include/osp/concepts/directed_graph_concept.hpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/include/osp/concepts/directed_graph_concept.hpp b/include/osp/concepts/directed_graph_concept.hpp
index 09bc9900..aaa537ad 100644
--- a/include/osp/concepts/directed_graph_concept.hpp
+++ b/include/osp/concepts/directed_graph_concept.hpp
@@ -64,15 +64,13 @@ struct IsDirectedGraph<T,
                                    decltype(std::declval<T>().Children(std::declval<VertexIdxT<T>>())),
                                    decltype(std::declval<T>().InDegree(std::declval<VertexIdxT<T>>())),
                                    decltype(std::declval<T>().OutDegree(std::declval<VertexIdxT<T>>()))>>
-    : std::conjunction<
-                       IsForwardRangeOf<decltype(std::declval<T>().Vertices()), VertexIdxT<T>>,
+    : std::conjunction<IsForwardRangeOf<decltype(std::declval<T>().Vertices()), VertexIdxT<T>>,
                        std::is_integral<decltype(std::declval<T>().NumVertices())>,
                        std::is_integral<decltype(std::declval<T>().NumEdges())>,
-                       IsInputRangeOf<decltype(std::declval<T>().Parents(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>
-                    //    IsInputRangeOf<decltype(std::declval<T>().Children(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>,
-                    //    std::is_integral<decltype(std::declval<T>().InDegree(std::declval<VertexIdxT<T>>()))>,
-                    //    std::is_integral<decltype(std::declval<T>().OutDegree(std::declval<VertexIdxT<T>>()))>
-                       > {};
+                       IsInputRangeOf<decltype(std::declval<T>().Parents(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>,
+                       IsInputRangeOf<decltype(std::declval<T>().Children(std::declval<VertexIdxT<T>>())), VertexIdxT<T>>,
+                       std::is_integral<decltype(std::declval<T>().InDegree(std::declval<VertexIdxT<T>>()))>,
+                       std::is_integral<decltype(std::declval<T>().OutDegree(std::declval<VertexIdxT<T>>()))>> {};
 
 template <typename T>
 inline constexpr bool isDirectedGraphV = IsDirectedGraph<T>::value;