From d0cd9a4cbec4f69f4e0d93db689ea2de4195615f Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 4 Feb 2026 10:58:17 +0100 Subject: [PATCH 01/57] adding basic ssp sptrsv kernel (non-optimized) --- apps/CMakeLists.txt | 2 + apps/maxbsp_ssp_sptrsv.cpp | 108 ++++++++++++++++++ .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 73 ++++++++++++ 3 files changed, 183 insertions(+) create mode 100644 apps/maxbsp_ssp_sptrsv.cpp diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 97935a92..2c7cbb5e 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -55,5 +55,7 @@ endif() endif() +_add_executable( maxbsp_ssp_sptrsv ) + # Custom target to compile all the executables add_custom_target( build_executables DEPENDS ${executable_list} ) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp new file mode 100644 index 00000000..f89aac94 --- /dev/null +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -0,0 +1,108 @@ +/* + * maxbsp_ssp_sptrsv.cpp + * Demonstrates maxbsp scheduling with staleness=2, then runs SpTRSV with SSP kernel. 
+ */ + +#include +#include +#include +#include +#include "osp/auxiliary/sptrsv_simulator/sptrsv.hpp" +#include "osp/bsp/model/BspInstance.hpp" +#include "osp/bsp/model/BspSchedule.hpp" +#include "osp/bsp/model/MaxBspSchedule.hpp" +#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp" +#include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp" +#include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" +#include + +using namespace osp; + +int main(int argc, char* argv[]) { + // Accept matrix filename and iteration count as arguments + std::string filename = "../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx"; + int num_iterations = 1; + if (argc > 1) { + filename = argv[1]; + } + if (argc > 2) { + num_iterations = std::stoi(argv[2]); + } + + // Load matrix + Eigen::SparseMatrix lCsr; + bool matrixLoadSuccess = Eigen::loadMarket(lCsr, filename); + if (!matrixLoadSuccess) { + std::cerr << "Failed to read matrix from " << filename << std::endl; + return 1; + } + std::cout << "Loaded matrix of size " << lCsr.rows() << " x " << lCsr.cols() << " with " << lCsr.nonZeros() << " non-zeros.\n"; + + // Setup graph and architecture + SparseMatrixImp graph; + graph.SetCsr(&lCsr); + Eigen::SparseMatrix lCsc = lCsr; + graph.SetCsc(&lCsc); + BspArchitecture> architecture(16, 1, 500); // 16 processors + BspInstance> instance(graph, architecture); + + // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2) + GreedyVarianceSspScheduler> ssp_scheduler; + MaxBspSchedule> ssp_schedule(instance); + ssp_scheduler.ComputeSchedule(ssp_schedule); + + // Setup SpTRSV kernel + Sptrsv sptrsv_kernel(instance); + sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule); + + size_t n = static_cast(lCsc.cols()); + + // Benchmark SSP L-solve + double ssp_total_time = 0.0; + std::vector ssp_result(n, 0.0); + for (int iter = 0; iter < num_iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv_kernel.x_ = x.data(); + 
sptrsv_kernel.b_ = b.data(); + auto start = std::chrono::high_resolution_clock::now(); + sptrsv_kernel.SspLsolveStaleness2(); + auto end = std::chrono::high_resolution_clock::now(); + ssp_total_time += std::chrono::duration(end - start).count(); + if (iter == 0) ssp_result = std::vector(x.begin(), x.end()); + } + double ssp_avg_time = ssp_total_time / num_iterations; + + // Benchmark serial L-solve + double serial_total_time = 0.0; + std::vector serial_result(n, 0.0); + for (int iter = 0; iter < num_iterations; ++iter) { + std::vector x_serial(n, 0.0); + std::vector b_serial(n, 1.0); + sptrsv_kernel.x_ = x_serial.data(); + sptrsv_kernel.b_ = b_serial.data(); + auto start = std::chrono::high_resolution_clock::now(); + sptrsv_kernel.LsolveSerial(); + auto end = std::chrono::high_resolution_clock::now(); + serial_total_time += std::chrono::duration(end - start).count(); + if (iter == 0) serial_result = std::vector(x_serial.begin(), x_serial.end()); + } + double serial_avg_time = serial_total_time / num_iterations; + + // Compare results + double max_diff = 0.0; + for (size_t i = 0; i < n; ++i) { + double diff = std::abs(ssp_result[i] - serial_result[i]); + if (diff > max_diff) max_diff = diff; + } + std::cout << "Max difference between SSP and serial L-solve: " << max_diff << std::endl; + if (max_diff < 1e-10) { + std::cout << "SSP L-solve matches serial L-solve!" << std::endl; + } else { + std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl; + } + std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl; + std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl; + std::cout << "MaxBSP with staleness=2 and SSP SpTRSV executed." 
<< std::endl; + return 0; +} diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 436e3dd4..2371e351 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -24,6 +24,7 @@ limitations under the License. # include # include +# include # include # include # include @@ -36,6 +37,28 @@ limitations under the License. # include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" namespace osp { +// Portable cpu_relax definition +#if defined(__x86_64__) || defined(_M_X64) +#include +inline void cpu_relax() { _mm_pause(); } +#elif defined(__aarch64__) +inline void cpu_relax() { asm volatile("yield" ::: "memory"); } +#else +inline void cpu_relax() { std::this_thread::yield(); } +#endif +// SSPBarrierRaph for staleness-aware synchronization +class SSPBarrierRaph { +private: + alignas(64) std::atomic threadCounter{0U}; + void barrier_sleep() const {} +public: + void arrive() { threadCounter.fetch_add(1U, std::memory_order_release); } + void wait(std::size_t arr_token) { + while ((threadCounter.load(std::memory_order_relaxed) < arr_token) || (threadCounter.load(std::memory_order_acquire) < arr_token)) { + cpu_relax(); + } + } +}; template class Sptrsv { @@ -126,6 +149,8 @@ class Sptrsv { do { node--; vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back( + // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp --- + static_cast(node)); } while (node > 0); @@ -479,6 +504,54 @@ class Sptrsv { std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); } + // SSP Lsolve with staleness=2 (allowing at most one superstep of lag) + void SspLsolveStaleness2() { + constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference + const unsigned nthreads = instance_->NumberOfProcessors(); + std::vector> stepDone(numSupersteps_); + for (auto &counter : 
stepDone) { + counter.store(0U, std::memory_order_relaxed); + } + + auto *csr = instance_->GetComputationalDag().GetCSR(); + const auto *outer = csr->outerIndexPtr(); + const auto *inner = csr->innerIndexPtr(); + const auto *vals = csr->valuePtr(); + + #pragma omp parallel num_threads(nthreads) + { + const unsigned proc = static_cast(omp_get_thread_num()); + for (unsigned step = 0; step < numSupersteps_; ++step) { + if (step >= staleness) { + const unsigned waitStep = step - static_cast(staleness); + while (stepDone[waitStep].load(std::memory_order_acquire) < nthreads) { + cpu_relax(); + } + } + // Each thread processes its assigned node ranges for this superstep + const size_t boundsStrSize = boundsArrayL_[step][proc].size(); + for (size_t index = 0; index < boundsStrSize; index += 2) { + EigenIdxType lowerB = boundsArrayL_[step][proc][index]; + const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; + for (EigenIdxType node = lowerB; node <= upperB; ++node) { + // Initialize solution for this node + x_[node] = b_[node]; + // Perform lower-triangular solve for this node + for (EigenIdxType i = outer[node]; + i < outer[node + 1] - 1; + ++i) { + // Subtract contributions from previously solved nodes + x_[node] -= vals[i] * x_[inner[i]]; + } + // Divide by diagonal element to complete solve for this node + x_[node] /= vals[outer[node + 1] - 1]; + } + } + stepDone[step].fetch_add(1U, std::memory_order_release); + } + } + } + virtual ~Sptrsv() = default; }; From 1c8622e500dacf2cd652ef83ae337110eb167a37 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 4 Feb 2026 13:32:04 +0100 Subject: [PATCH 02/57] Improvements --- apps/maxbsp_ssp_sptrsv.cpp | 3 ++ .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 38 +++++++++++++++---- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index f89aac94..0377b7ce 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ 
-103,6 +103,9 @@ int main(int argc, char* argv[]) { } std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl; std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl; + if (ssp_avg_time > 0.0) { + std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_avg_time) << "x" << std::endl; + } std::cout << "MaxBSP with staleness=2 and SSP SpTRSV executed." << std::endl; return 0; } diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 2371e351..cc58fe76 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -25,11 +25,14 @@ limitations under the License. # include # include # include +# include # include # include # include +# include # include # include +# include # include # include "osp/bsp/model/BspInstance.hpp" @@ -91,6 +94,8 @@ class Sptrsv { std::vector>> boundsArrayL_; std::vector>> boundsArrayU_; + std::unique_ptr[]> stepDone_; + std::size_t stepDoneSize_ = 0U; Sptrsv() = default; @@ -109,6 +114,13 @@ class Sptrsv { schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); numSupersteps_ = schedule.NumberOfSupersteps(); + if (stepDoneSize_ != static_cast(numSupersteps_)) { + stepDone_ = std::make_unique[]>(numSupersteps_); + stepDoneSize_ = static_cast(numSupersteps_); + } + for (std::size_t i = 0; i < stepDoneSize_; ++i) { + stepDone_[i].store(0U, std::memory_order_relaxed); + } size_t numberOfVertices = instance_->GetComputationalDag().NumVertices(); # pragma omp parallel num_threads(2) @@ -508,9 +520,8 @@ class Sptrsv { void SspLsolveStaleness2() { constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference const unsigned nthreads = instance_->NumberOfProcessors(); - std::vector> stepDone(numSupersteps_); - for (auto &counter : stepDone) { - 
counter.store(0U, std::memory_order_relaxed); + for (std::size_t i = 0; i < stepDoneSize_; ++i) { + stepDone_[i].store(0U, std::memory_order_relaxed); } auto *csr = instance_->GetComputationalDag().GetCSR(); @@ -524,11 +535,24 @@ class Sptrsv { for (unsigned step = 0; step < numSupersteps_; ++step) { if (step >= staleness) { const unsigned waitStep = step - static_cast(staleness); - while (stepDone[waitStep].load(std::memory_order_acquire) < nthreads) { - cpu_relax(); + unsigned spinCount = 0U; + auto backoff = std::chrono::nanoseconds(50); + while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) { + if (spinCount < 2000U) { + cpu_relax(); + ++spinCount; + } else if (spinCount < 4000U) { + std::this_thread::yield(); + ++spinCount; + } else { + std::this_thread::sleep_for(backoff); + if (backoff < std::chrono::nanoseconds(500)) { + backoff *= 2; + } + } } + std::atomic_thread_fence(std::memory_order_acquire); } - // Each thread processes its assigned node ranges for this superstep const size_t boundsStrSize = boundsArrayL_[step][proc].size(); for (size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType lowerB = boundsArrayL_[step][proc][index]; @@ -547,7 +571,7 @@ class Sptrsv { x_[node] /= vals[outer[node + 1] - 1]; } } - stepDone[step].fetch_add(1U, std::memory_order_release); + stepDone_[step].fetch_add(1U, std::memory_order_release); } } } From d13bd4c570358fdaff83c34b26b7f5dcdc3465e3 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 4 Feb 2026 13:42:12 +0100 Subject: [PATCH 03/57] Make class for barrier functionality --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 81 +++++++++++-------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index cc58fe76..61123022 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -63,6 +63,49 
@@ class SSPBarrierRaph { } }; +class SspStalenessBarrier { + private: + std::unique_ptr[]> stepDone_; + std::size_t stepDoneSize_ = 0U; + + public: + void Reset(std::size_t numSupersteps) { + if (stepDoneSize_ != numSupersteps) { + stepDone_ = std::make_unique[]>(numSupersteps); + stepDoneSize_ = numSupersteps; + } + for (std::size_t i = 0; i < stepDoneSize_; ++i) { + stepDone_[i].store(0U, std::memory_order_relaxed); + } + } + + void WaitIfNeeded(unsigned step, unsigned staleness, unsigned nthreads) { + if (step < staleness) { + return; + } + const unsigned waitStep = step - staleness; + unsigned spinCount = 0U; + auto backoff = std::chrono::nanoseconds(50); + while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) { + if (spinCount < 2000U) { + cpu_relax(); + ++spinCount; + } else if (spinCount < 4000U) { + std::this_thread::yield(); + ++spinCount; + } else { + std::this_thread::sleep_for(backoff); + if (backoff < std::chrono::nanoseconds(500)) { + backoff *= 2; + } + } + } + std::atomic_thread_fence(std::memory_order_acquire); + } + + void Arrive(unsigned step) { stepDone_[step].fetch_add(1U, std::memory_order_release); } +}; + template class Sptrsv { using UVertType = typename SparseMatrixImp::VertexIdx; @@ -94,8 +137,7 @@ class Sptrsv { std::vector>> boundsArrayL_; std::vector>> boundsArrayU_; - std::unique_ptr[]> stepDone_; - std::size_t stepDoneSize_ = 0U; + SspStalenessBarrier sspBarrier_; Sptrsv() = default; @@ -114,13 +156,7 @@ class Sptrsv { schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); numSupersteps_ = schedule.NumberOfSupersteps(); - if (stepDoneSize_ != static_cast(numSupersteps_)) { - stepDone_ = std::make_unique[]>(numSupersteps_); - stepDoneSize_ = static_cast(numSupersteps_); - } - for (std::size_t i = 0; i < stepDoneSize_; ++i) { - stepDone_[i].store(0U, std::memory_order_relaxed); - } + sspBarrier_.Reset(static_cast(numSupersteps_)); size_t numberOfVertices = 
instance_->GetComputationalDag().NumVertices(); # pragma omp parallel num_threads(2) @@ -520,9 +556,7 @@ class Sptrsv { void SspLsolveStaleness2() { constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference const unsigned nthreads = instance_->NumberOfProcessors(); - for (std::size_t i = 0; i < stepDoneSize_; ++i) { - stepDone_[i].store(0U, std::memory_order_relaxed); - } + sspBarrier_.Reset(static_cast(numSupersteps_)); auto *csr = instance_->GetComputationalDag().GetCSR(); const auto *outer = csr->outerIndexPtr(); @@ -533,26 +567,7 @@ class Sptrsv { { const unsigned proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { - if (step >= staleness) { - const unsigned waitStep = step - static_cast(staleness); - unsigned spinCount = 0U; - auto backoff = std::chrono::nanoseconds(50); - while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) { - if (spinCount < 2000U) { - cpu_relax(); - ++spinCount; - } else if (spinCount < 4000U) { - std::this_thread::yield(); - ++spinCount; - } else { - std::this_thread::sleep_for(backoff); - if (backoff < std::chrono::nanoseconds(500)) { - backoff *= 2; - } - } - } - std::atomic_thread_fence(std::memory_order_acquire); - } + sspBarrier_.WaitIfNeeded(step, static_cast(staleness), nthreads); const size_t boundsStrSize = boundsArrayL_[step][proc].size(); for (size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType lowerB = boundsArrayL_[step][proc][index]; @@ -571,7 +586,7 @@ class Sptrsv { x_[node] /= vals[outer[node + 1] - 1]; } } - stepDone_[step].fetch_add(1U, std::memory_order_release); + sspBarrier_.Arrive(step); } } } From fba4e7cb06602138626d63d553e30fc186c79b31 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 4 Feb 2026 13:47:51 +0100 Subject: [PATCH 04/57] Adding comments --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git 
a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 61123022..de32d6c8 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -49,20 +49,9 @@ inline void cpu_relax() { asm volatile("yield" ::: "memory"); } #else inline void cpu_relax() { std::this_thread::yield(); } #endif -// SSPBarrierRaph for staleness-aware synchronization -class SSPBarrierRaph { -private: - alignas(64) std::atomic threadCounter{0U}; - void barrier_sleep() const {} -public: - void arrive() { threadCounter.fetch_add(1U, std::memory_order_release); } - void wait(std::size_t arr_token) { - while ((threadCounter.load(std::memory_order_relaxed) < arr_token) || (threadCounter.load(std::memory_order_acquire) < arr_token)) { - cpu_relax(); - } - } -}; +// Staleness-aware barrier for SSP: threads may run up to (staleness-1) steps ahead. +// Internally tracks per-step completion counts and uses adaptive backoff to limit spinning. class SspStalenessBarrier { private: std::unique_ptr[]> stepDone_; @@ -70,6 +59,7 @@ class SspStalenessBarrier { public: void Reset(std::size_t numSupersteps) { + // Reinitialize counters for a new schedule/run. if (stepDoneSize_ != numSupersteps) { stepDone_ = std::make_unique[]>(numSupersteps); stepDoneSize_ = numSupersteps; @@ -80,6 +70,7 @@ class SspStalenessBarrier { } void WaitIfNeeded(unsigned step, unsigned staleness, unsigned nthreads) { + // Enforce: step may start only when all threads completed (step - staleness). if (step < staleness) { return; } @@ -87,6 +78,7 @@ class SspStalenessBarrier { unsigned spinCount = 0U; auto backoff = std::chrono::nanoseconds(50); while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) { + // Adaptive backoff: spin -> yield -> short sleep to reduce contention. 
if (spinCount < 2000U) { cpu_relax(); ++spinCount; @@ -103,6 +95,7 @@ class SspStalenessBarrier { std::atomic_thread_fence(std::memory_order_acquire); } + // Mark completion of a superstep by this thread. void Arrive(unsigned step) { stepDone_[step].fetch_add(1U, std::memory_order_release); } }; @@ -552,10 +545,12 @@ class Sptrsv { std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); } - // SSP Lsolve with staleness=2 (allowing at most one superstep of lag) + // SSP Lsolve with staleness=2 (allowing at most one superstep of lag). + // Uses the staleness barrier to respect dependencies between supersteps. void SspLsolveStaleness2() { constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference const unsigned nthreads = instance_->NumberOfProcessors(); + // Reset per-step completion counters for this run. sspBarrier_.Reset(static_cast(numSupersteps_)); auto *csr = instance_->GetComputationalDag().GetCSR(); @@ -567,7 +562,9 @@ class Sptrsv { { const unsigned proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { + // Ensure we are not more than (staleness-1) supersteps ahead. sspBarrier_.WaitIfNeeded(step, static_cast(staleness), nthreads); + // Process nodes assigned to this (step, proc) pair. const size_t boundsStrSize = boundsArrayL_[step][proc].size(); for (size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType lowerB = boundsArrayL_[step][proc][index]; @@ -586,6 +583,7 @@ class Sptrsv { x_[node] /= vals[outer[node + 1] - 1]; } } + // Signal completion of this superstep for staleness tracking. 
sspBarrier_.Arrive(step); } } From 37f0d0aa65b9892b948924ff4be028c87e395799 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 4 Feb 2026 14:55:12 +0100 Subject: [PATCH 05/57] comparing against growlocal --- apps/maxbsp_ssp_sptrsv.cpp | 53 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 0377b7ce..dd1112d0 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -51,9 +51,13 @@ int main(int argc, char* argv[]) { MaxBspSchedule> ssp_schedule(instance); ssp_scheduler.ComputeSchedule(ssp_schedule); + // Create a non-SSP schedule using GrowLocalAutoCores + GrowLocalAutoCores> growlocal_scheduler; + BspSchedule> growlocal_schedule(instance); + growlocal_scheduler.ComputeSchedule(growlocal_schedule); + // Setup SpTRSV kernel Sptrsv sptrsv_kernel(instance); - sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule); size_t n = static_cast(lCsc.cols()); @@ -63,6 +67,7 @@ int main(int argc, char* argv[]) { for (int iter = 0; iter < num_iterations; ++iter) { std::vector x(n, 0.0); std::vector b(n, 1.0); + sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule); sptrsv_kernel.x_ = x.data(); sptrsv_kernel.b_ = b.data(); auto start = std::chrono::high_resolution_clock::now(); @@ -73,6 +78,23 @@ int main(int argc, char* argv[]) { } double ssp_avg_time = ssp_total_time / num_iterations; + // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation) + double growlocal_total_time = 0.0; + std::vector growlocal_result(n, 0.0); + for (int iter = 0; iter < num_iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv_kernel.SetupCsrNoPermutation(growlocal_schedule); + sptrsv_kernel.x_ = x.data(); + sptrsv_kernel.b_ = b.data(); + auto start = std::chrono::high_resolution_clock::now(); + sptrsv_kernel.LsolveNoPermutation(); + auto end = std::chrono::high_resolution_clock::now(); + growlocal_total_time += 
std::chrono::duration(end - start).count(); + if (iter == 0) growlocal_result = std::vector(x.begin(), x.end()); + } + double growlocal_avg_time = growlocal_total_time / num_iterations; + // Benchmark serial L-solve double serial_total_time = 0.0; std::vector serial_result(n, 0.0); @@ -101,11 +123,38 @@ int main(int argc, char* argv[]) { } else { std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl; } + double max_diff_growlocal = 0.0; + for (size_t i = 0; i < n; ++i) { + double diff = std::abs(growlocal_result[i] - serial_result[i]); + if (diff > max_diff_growlocal) max_diff_growlocal = diff; + } + std::cout << "Max difference between GrowLocalAutoCores and serial L-solve: " << max_diff_growlocal << std::endl; + if (max_diff_growlocal < 1e-10) { + std::cout << "GrowLocalAutoCores L-solve matches serial L-solve!" << std::endl; + } else { + std::cout << "GrowLocalAutoCores L-solve does NOT match serial L-solve!" << std::endl; + } + + double max_diff_ssp_growlocal = 0.0; + for (size_t i = 0; i < n; ++i) { + double diff = std::abs(ssp_result[i] - growlocal_result[i]); + if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff; + } + std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal << std::endl; + std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl; + std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time + << " seconds" << std::endl; std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl; if (ssp_avg_time > 0.0) { std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_avg_time) << "x" << std::endl; } - std::cout << "MaxBSP with staleness=2 and SSP SpTRSV executed." 
<< std::endl; + if (growlocal_avg_time > 0.0) { + std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl; + } + if (ssp_avg_time > 0.0) { + std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_avg_time) << "x" << std::endl; + } + std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl; return 0; } From 057ffb0f01ddf487858faa4adc5500599bb2563b Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 5 Feb 2026 14:07:32 +0100 Subject: [PATCH 06/57] move executable in cmake --- apps/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 2c7cbb5e..ef5823ee 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -36,16 +36,18 @@ if(Boost_FOUND) _add_executable( osp_turnus ) -_add_executable ( osp ) +_add_executable( osp ) configure_file(config/osp_config.json osp_config.json COPYONLY) -_add_executable ( bsp_test_suite ) +_add_executable( bsp_test_suite ) _add_executable( coarser_plotter ) if(Eigen3_FOUND) -_add_executable ( sptrsv_test_suite ) +_add_executable( sptrsv_test_suite ) + +_add_executable( maxbsp_ssp_sptrsv ) endif() if (COPT_FOUND) @@ -55,7 +57,5 @@ endif() endif() -_add_executable( maxbsp_ssp_sptrsv ) - # Custom target to compile all the executables add_custom_target( build_executables DEPENDS ${executable_list} ) From 866e6269358f62c6791820fca75c896de2789a16 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 5 Feb 2026 15:35:43 +0100 Subject: [PATCH 07/57] FlatBarrier --- .../WeakBarriers/flat_barrier.hpp | 93 +++++++++++++++++++ .../GrowLocalAutoCoresParallel.hpp | 3 +- include/osp/config/config.hpp | 25 +++++ tests/CMakeLists.txt | 2 + tests/weak_barrier.cpp | 32 +++++++ 5 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp create mode 100644 
include/osp/config/config.hpp create mode 100644 tests/weak_barrier.cpp diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp new file mode 100644 index 00000000..9b35ac8d --- /dev/null +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp @@ -0,0 +1,93 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include +#include + +#include "osp/config/config.hpp" + +namespace osp { + +// Portable cpu_relax definition +#if defined(__x86_64__) || defined(_M_X64) +# include + +inline void cpu_relax() { _mm_pause(); } +#elif defined(__aarch64__) +inline void cpu_relax() { asm volatile("yield" ::: "memory"); } +#else +inline void cpu_relax() { std::this_thread::yield(); } +#endif + +struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag { + std::atomic flag_; + int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic)]; + + static_assert(std::atomic::is_always_lock_free); + static_assert(sizeof(int8_t) == 1U); +}; + +/** + * @brief A weak synchronisation barrier which can be reused. + * Instatiate with number of threads. Each thread should call "Arrive" with its thread id to indicate that its work has been + * completed. Each thread can then call "Wait" to wait till all other threads have completed their work. 
+ * + * The barrier can be reset and reused after calling "Reset" for each thread. + * + * WARNING: The reset is NOT synchronised, thus a second FlatBarrier is required to synchronise the reset of the barrier. That is + * do NOT call "Reset" immediately after "Wait" as this could cause other threads not to see that the work has been completed. + * + */ +class FlatBarrier { + private: + std::vector flags_; + + public: + FlatBarrier(std::size_t numThreads) : flags_(std::vector(numThreads)) {}; + + inline void Arrive(std::size_t threadId); + inline void Wait() const; + inline void Reset(std::size_t threadId); + + FlatBarrier() = delete; + FlatBarrier(const FlatBarrier &) = delete; + FlatBarrier(FlatBarrier &&) = delete; + FlatBarrier &operator=(const FlatBarrier &) = delete; + FlatBarrier &operator=(FlatBarrier &&) = delete; + ~FlatBarrier() = default; +}; + +inline void FlatBarrier::Arrive(std::size_t threadId) { flags_[threadId].flag_.store(true, std::memory_order_relaxed); } + +inline void FlatBarrier::Wait() const { + for (const AlignedAtomicFlag &flag : flags_) { + std::size_t cntr = 0U; + while (not flag.flag_.load(std::memory_order_relaxed)) { + ++cntr; + if (cntr % 256U == 0U) { + cpu_relax(); + } + } + } +} + +inline void FlatBarrier::Reset(std::size_t threadId) { flags_[threadId].flag_.store(false, std::memory_order_relaxed); } + +} // end namespace osp diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp index 7f9ac6cb..5f7bcaaf 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp @@ -36,11 +36,10 @@ limitations under the License. 
#include "osp/auxiliary/misc.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" +#include "osp/config/config.hpp" namespace osp { -static constexpr std::size_t CACHE_LINE_SIZE = 64; - template struct GrowLocalAutoCoresParallelParams { VertT minSuperstepSize_ = 20; diff --git a/include/osp/config/config.hpp b/include/osp/config/config.hpp new file mode 100644 index 00000000..9cc6c5a8 --- /dev/null +++ b/include/osp/config/config.hpp @@ -0,0 +1,25 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +namespace osp { + +static constexpr std::size_t CACHE_LINE_SIZE = 64U; + +} // end namespace osp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ebc5c6cf..d6a8f8c2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -129,6 +129,8 @@ _add_test( bit_mask ) _add_test( hash_pair ) +_add_test( weak_barrier ) + ## io _add_test( filereader DATA ) diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp new file mode 100644 index 00000000..833f2111 --- /dev/null +++ b/tests/weak_barrier.cpp @@ -0,0 +1,32 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#define BOOST_TEST_MODULE WeakBarrierTests + +#include +#include +#include + +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp" + +using namespace osp; + +BOOST_AUTO_TEST_CASE(TestAlignedAtomicFlag) { + BOOST_CHECK_EQUAL(sizeof(AlignedAtomicFlag), 64U); + BOOST_CHECK_EQUAL(alignof(AlignedAtomicFlag), 64U); +} From 361d516c700275e93651f8fd21ebeb1c24a5e819 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 5 Feb 2026 16:37:10 +0100 Subject: [PATCH 08/57] initial barrier version --- .../WeakBarriers/flat_barrier.hpp | 5 +- tests/weak_barrier.cpp | 94 +++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp index 9b35ac8d..13beedba 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp @@ -37,7 +37,7 @@ inline void cpu_relax() { std::this_thread::yield(); } #endif struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag { - std::atomic flag_; + std::atomic flag_{false}; int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic)]; static_assert(std::atomic::is_always_lock_free); @@ -54,6 +54,7 @@ struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag { * WARNING: The reset is NOT synchronised, thus a second FlatBarrier is required to synchronise the reset of the barrier. 
That is * do NOT call "Reset" immediately after "Wait" as this could cause other threads not to see that the work has been completed. * + * WARNING: A thread calling "Wait" before calling "Arrive" with its thread id results in a deadlock. */ class FlatBarrier { private: @@ -81,7 +82,7 @@ inline void FlatBarrier::Wait() const { std::size_t cntr = 0U; while (not flag.flag_.load(std::memory_order_relaxed)) { ++cntr; - if (cntr % 256U == 0U) { + if (cntr % 128U == 0U) { cpu_relax(); } } diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp index 833f2111..08661f39 100644 --- a/tests/weak_barrier.cpp +++ b/tests/weak_barrier.cpp @@ -18,9 +18,13 @@ limitations under the License. #define BOOST_TEST_MODULE WeakBarrierTests +#include #include #include #include +#include +#include +#include #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp" @@ -30,3 +34,93 @@ BOOST_AUTO_TEST_CASE(TestAlignedAtomicFlag) { BOOST_CHECK_EQUAL(sizeof(AlignedAtomicFlag), 64U); BOOST_CHECK_EQUAL(alignof(AlignedAtomicFlag), 64U); } + +BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) { + constexpr std::size_t numThreads = 2U; + constexpr std::size_t numBarriers = 1024U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + std::array barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(cntr); + } + barrier[0].Arrive(threadId); + barrier[0].Wait(); + barrier[2].Reset(threadId); + barrier[1].Arrive(threadId); + barrier[1].Wait(); + barrier[0].Reset(threadId); + barrier[2].Arrive(threadId); + barrier[2].Wait(); + barrier[1].Reset(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, 
threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + for (std::size_t ind = 0U; ind < ans.size(); ++ind) { + BOOST_CHECK_EQUAL(ans[ind], ind / numThreads); + } +} + +BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) { + constexpr std::size_t numThreads = 128U; + constexpr std::size_t numBarriers = 8U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + std::array barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(cntr); + } + barrier[0].Arrive(threadId); + barrier[0].Wait(); + barrier[2].Reset(threadId); + barrier[1].Arrive(threadId); + barrier[1].Wait(); + barrier[0].Reset(threadId); + barrier[2].Arrive(threadId); + barrier[2].Wait(); + barrier[1].Reset(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + for (std::size_t ind = 0U; ind < ans.size(); ++ind) { + BOOST_CHECK_EQUAL(ans[ind], ind / numThreads); + } +} From 51a1ad136dbfb0d8cb2fd93a9e03cdc8c2251dea Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 5 Feb 2026 16:58:09 +0100 Subject: [PATCH 09/57] improved FlatBarrier --- .../WeakBarriers/flat_barrier.hpp | 23 +++++++++---------- tests/weak_barrier.cpp | 18 ++++----------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp index 13beedba..ca7f4f21 100644 --- 
a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp @@ -49,12 +49,10 @@ struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag { * Instatiate with number of threads. Each thread should call "Arrive" with its thread id to indicate that its work has been * completed. Each thread can then call "Wait" to wait till all other threads have completed their work. * - * The barrier can be reset and reused after calling "Reset" for each thread. + * WARNING: The barrier can be reused IF AND ONLY IF another synchronisation, i.e. through a second FlatBarrier, takes place in between + * the "Wait" and "Arrive". * - * WARNING: The reset is NOT synchronised, thus a second FlatBarrier is required to synchronise the reset of the barrier. That is - * do NOT call "Reset" immediately after "Wait" as this could cause other threads not to see that the work has been completed. - * - * WARNING: A thread calling "Wait" before calling "Arrive" with its thread id results in a deadlock. + * WARNING: A thread calling "Wait" before calling "Arrive" with its thread id is undefined behaviour and can result in a deadlock. 
*/ class FlatBarrier { private: @@ -64,8 +62,7 @@ class FlatBarrier { FlatBarrier(std::size_t numThreads) : flags_(std::vector(numThreads)) {}; inline void Arrive(std::size_t threadId); - inline void Wait() const; - inline void Reset(std::size_t threadId); + inline void Wait(std::size_t threadId) const; FlatBarrier() = delete; FlatBarrier(const FlatBarrier &) = delete; @@ -75,12 +72,16 @@ class FlatBarrier { ~FlatBarrier() = default; }; -inline void FlatBarrier::Arrive(std::size_t threadId) { flags_[threadId].flag_.store(true, std::memory_order_relaxed); } +inline void FlatBarrier::Arrive(std::size_t threadId) { + const bool oldVal = flags_[threadId].flag_.load(std::memory_order_relaxed); + flags_[threadId].flag_.store(!oldVal, std::memory_order_relaxed); +} -inline void FlatBarrier::Wait() const { +inline void FlatBarrier::Wait(std::size_t threadId) const { + const bool val = flags_[threadId].flag_.load(std::memory_order_relaxed); for (const AlignedAtomicFlag &flag : flags_) { std::size_t cntr = 0U; - while (not flag.flag_.load(std::memory_order_relaxed)) { + while (flag.flag_.load(std::memory_order_relaxed) != val) { ++cntr; if (cntr % 128U == 0U) { cpu_relax(); @@ -89,6 +90,4 @@ inline void FlatBarrier::Wait() const { } } -inline void FlatBarrier::Reset(std::size_t threadId) { flags_[threadId].flag_.store(false, std::memory_order_relaxed); } - } // end namespace osp diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp index 08661f39..9aafa6d5 100644 --- a/tests/weak_barrier.cpp +++ b/tests/weak_barrier.cpp @@ -55,14 +55,9 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) { ans.emplace_back(cntr); } barrier[0].Arrive(threadId); - barrier[0].Wait(); - barrier[2].Reset(threadId); + barrier[0].Wait(threadId); barrier[1].Arrive(threadId); - barrier[1].Wait(); - barrier[0].Reset(threadId); - barrier[2].Arrive(threadId); - barrier[2].Wait(); - barrier[1].Reset(threadId); + barrier[1].Wait(threadId); } }; @@ -100,14 +95,9 @@ 
BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) { ans.emplace_back(cntr); } barrier[0].Arrive(threadId); - barrier[0].Wait(); - barrier[2].Reset(threadId); + barrier[0].Wait(threadId); barrier[1].Arrive(threadId); - barrier[1].Wait(); - barrier[0].Reset(threadId); - barrier[2].Arrive(threadId); - barrier[2].Wait(); - barrier[1].Reset(threadId); + barrier[1].Wait(threadId); } }; From 60a9ca92ccee877250816ae03b0dddd86bb294b5 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 5 Feb 2026 17:09:31 +0100 Subject: [PATCH 10/57] small test fix --- tests/weak_barrier.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp index 9aafa6d5..c7cc0d45 100644 --- a/tests/weak_barrier.cpp +++ b/tests/weak_barrier.cpp @@ -44,7 +44,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) { std::mutex ans_mutex; - std::array barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}}; + std::array barrier{FlatBarrier{numThreads}, FlatBarrier{numThreads}}; std::vector threads(numThreads); From 85844c2af29694fc5e407841180aa5548b23f3eb Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Fri, 6 Feb 2026 08:40:30 +0100 Subject: [PATCH 11/57] fixed barrier and ssp test --- .../WeakBarriers/flat_barrier.hpp | 12 +- tests/weak_barrier.cpp | 104 +++++++++++++++++- 2 files changed, 108 insertions(+), 8 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp index ca7f4f21..7c4e586c 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp @@ -61,8 +61,8 @@ class FlatBarrier { public: FlatBarrier(std::size_t numThreads) : flags_(std::vector(numThreads)) {}; - inline void Arrive(std::size_t threadId); - inline void Wait(std::size_t threadId) const; + inline void Arrive(const std::size_t threadId); + 
inline void Wait(const std::size_t threadId) const; FlatBarrier() = delete; FlatBarrier(const FlatBarrier &) = delete; @@ -72,16 +72,16 @@ class FlatBarrier { ~FlatBarrier() = default; }; -inline void FlatBarrier::Arrive(std::size_t threadId) { +inline void FlatBarrier::Arrive(const std::size_t threadId) { const bool oldVal = flags_[threadId].flag_.load(std::memory_order_relaxed); - flags_[threadId].flag_.store(!oldVal, std::memory_order_relaxed); + flags_[threadId].flag_.store(!oldVal, std::memory_order_release); } -inline void FlatBarrier::Wait(std::size_t threadId) const { +inline void FlatBarrier::Wait(const std::size_t threadId) const { const bool val = flags_[threadId].flag_.load(std::memory_order_relaxed); for (const AlignedAtomicFlag &flag : flags_) { std::size_t cntr = 0U; - while (flag.flag_.load(std::memory_order_relaxed) != val) { + while (flag.flag_.load(std::memory_order_acquire) != val) { ++cntr; if (cntr % 128U == 0U) { cpu_relax(); diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp index c7cc0d45..8870b18c 100644 --- a/tests/weak_barrier.cpp +++ b/tests/weak_barrier.cpp @@ -48,7 +48,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) { std::vector threads(numThreads); - auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) { + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { { std::lock_guard lock(ans_mutex); @@ -88,7 +88,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) { std::vector threads(numThreads); - auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](std::size_t threadId) { + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { { std::lock_guard lock(ans_mutex); @@ -114,3 +114,103 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) { BOOST_CHECK_EQUAL(ans[ind], ind / numThreads); } } + 
+BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_2Threads) { + constexpr std::size_t numThreads = 2U; + constexpr std::size_t numBarriers = 1024U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + constexpr std::size_t numSync = 4U; + std::array barrier{ + FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}}; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + barrier[1U].Arrive(threadId); + barrier[2U].Arrive(threadId); + } + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + barrier[(cntr - 2U + numSync) % numSync].Wait(threadId); + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(threadId); + } + barrier[cntr % numSync].Arrive(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + + std::vector cntrs(numThreads, 0); + for (const std::size_t work : ans) { + const std::size_t current = ++cntrs[work]; + for (const std::size_t cntr : cntrs) { + BOOST_CHECK_GE(cntr, std::max(current, static_cast(2U)) - 2U); + } + } +} + +BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_128Threads) { + constexpr std::size_t numThreads = 128U; + constexpr std::size_t numBarriers = 32U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + constexpr std::size_t numSync = 4U; + std::array barrier{ + FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}, FlatBarrier{numThreads}}; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + barrier[1U].Arrive(threadId); + barrier[2U].Arrive(threadId); + } + + std::vector threads(numThreads); + + auto threadWork = 
[&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + barrier[(cntr - 2U + numSync) % numSync].Wait(threadId); + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(threadId); + } + barrier[cntr % numSync].Arrive(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + + std::vector cntrs(numThreads, 0); + for (const std::size_t work : ans) { + const std::size_t current = ++cntrs[work]; + for (const std::size_t cntr : cntrs) { + BOOST_CHECK_GE(cntr, std::max(current, static_cast(2U)) - 2U); + } + } +} From ca2e4ecd7aa3fd95a0cc37a1e99b45a3385b2602 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Fri, 6 Feb 2026 09:33:31 +0100 Subject: [PATCH 12/57] barrier with counter --- .../WeakBarriers/cpu_relax.hpp | 38 ++++ .../WeakBarriers/flat_barrier.hpp | 12 +- .../flat_checkpoint_counter_barrier.hpp | 82 +++++++++ tests/weak_barrier.cpp | 173 +++++++++++++++++- 4 files changed, 292 insertions(+), 13 deletions(-) create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp new file mode 100644 index 00000000..d9e5e268 --- /dev/null +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp @@ -0,0 +1,38 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include + +#if defined(__x86_64__) || defined(_M_X64) +# include +#endif + +namespace osp { + +// Portable cpu_relax definition +#if defined(__x86_64__) || defined(_M_X64) +inline void cpu_relax() { _mm_pause(); } +#elif defined(__aarch64__) +inline void cpu_relax() { asm volatile("yield" ::: "memory"); } +#else +inline void cpu_relax() { std::this_thread::yield(); } +#endif + +} // end namespace osp diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp index 7c4e586c..2de8adcc 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp @@ -21,21 +21,11 @@ limitations under the License. 
#include #include +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp" #include "osp/config/config.hpp" namespace osp { -// Portable cpu_relax definition -#if defined(__x86_64__) || defined(_M_X64) -# include - -inline void cpu_relax() { _mm_pause(); } -#elif defined(__aarch64__) -inline void cpu_relax() { asm volatile("yield" ::: "memory"); } -#else -inline void cpu_relax() { std::this_thread::yield(); } -#endif - struct alignas(CACHE_LINE_SIZE) AlignedAtomicFlag { std::atomic flag_{false}; int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic)]; diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp new file mode 100644 index 00000000..87607def --- /dev/null +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp @@ -0,0 +1,82 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#pragma once + +#include +#include +#include +#include + +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp" +#include "osp/config/config.hpp" + +namespace osp { + +struct alignas(CACHE_LINE_SIZE) AlignedAtomicCounter { + std::atomic cntr_{0U}; + int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic)]; + + static_assert(std::atomic::is_always_lock_free); + static_assert(sizeof(int8_t) == 1U); +}; + +class FlatCheckpointCounterBarrier { + private: + std::vector cntrs_; + mutable std::vector> cachedCntrs_; + + public: + FlatCheckpointCounterBarrier(std::size_t numThreads) + : cntrs_(std::vector(numThreads)), + cachedCntrs_(std::vector>(numThreads, std::vector(numThreads, 0U))) {}; + + inline void Arrive(const std::size_t threadId); + inline void Wait(const std::size_t threadId, const std::size_t diff) const; + + FlatCheckpointCounterBarrier() = delete; + FlatCheckpointCounterBarrier(const FlatCheckpointCounterBarrier &) = delete; + FlatCheckpointCounterBarrier(FlatCheckpointCounterBarrier &&) = delete; + FlatCheckpointCounterBarrier &operator=(const FlatCheckpointCounterBarrier &) = delete; + FlatCheckpointCounterBarrier &operator=(FlatCheckpointCounterBarrier &&) = delete; + ~FlatCheckpointCounterBarrier() = default; +}; + +inline void FlatCheckpointCounterBarrier::Arrive(const std::size_t threadId) { + const std::size_t curr = cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release) + 1U; + cachedCntrs_[threadId][threadId] = curr; +} + +inline void FlatCheckpointCounterBarrier::Wait(const std::size_t threadId, const std::size_t diff) const { + std::vector &localCachedCntrs = cachedCntrs_[threadId]; + + const std::size_t minVal = std::max(localCachedCntrs[threadId], diff) - diff; + + for (std::size_t ind = 0U; ind < cntrs_.size(); ++ind) { + std::size_t loopCntr = 0U; + while ((localCachedCntrs[ind] < minVal) + && ((localCachedCntrs[ind] = cntrs_[ind].cntr_.load(std::memory_order_acquire)) < minVal)) { + ++loopCntr; + if (loopCntr % 
128U == 0U) { + cpu_relax(); + } + } + } +} + +} // end namespace osp diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp index 8870b18c..c3bc2f34 100644 --- a/tests/weak_barrier.cpp +++ b/tests/weak_barrier.cpp @@ -27,6 +27,7 @@ limitations under the License. #include #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp" +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp" using namespace osp; @@ -77,7 +78,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_2Threads) { BOOST_AUTO_TEST_CASE(TestFlatBarrier_128Threads) { constexpr std::size_t numThreads = 128U; - constexpr std::size_t numBarriers = 8U; + constexpr std::size_t numBarriers = 16U; std::vector ans; ans.reserve(numThreads * numBarriers); @@ -167,7 +168,7 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_2Threads) { BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_128Threads) { constexpr std::size_t numThreads = 128U; - constexpr std::size_t numBarriers = 32U; + constexpr std::size_t numBarriers = 16U; std::vector ans; ans.reserve(numThreads * numBarriers); @@ -214,3 +215,171 @@ BOOST_AUTO_TEST_CASE(TestFlatBarrier_SSP_128Threads) { } } } + +BOOST_AUTO_TEST_CASE(TestAlignedAtomicCounter) { + BOOST_CHECK_EQUAL(sizeof(AlignedAtomicCounter), 64U); + BOOST_CHECK_EQUAL(alignof(AlignedAtomicCounter), 64U); +} + + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_2Threads) { + constexpr std::size_t numThreads = 2U; + constexpr std::size_t numBarriers = 1024U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrier barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(cntr); + } + barrier.Arrive(threadId); + barrier.Wait(threadId, 0U); + } + }; + + for (std::size_t threadId = 0U; threadId < 
numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + for (std::size_t ind = 0U; ind < ans.size(); ++ind) { + BOOST_CHECK_EQUAL(ans[ind], ind / numThreads); + } +} + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_128Threads) { + constexpr std::size_t numThreads = 128U; + constexpr std::size_t numBarriers = 16U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrier barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(cntr); + } + barrier.Arrive(threadId); + barrier.Wait(threadId, 0U); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + for (std::size_t ind = 0U; ind < ans.size(); ++ind) { + BOOST_CHECK_EQUAL(ans[ind], ind / numThreads); + } +} + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_SSP_2Threads) { + constexpr std::size_t numThreads = 2U; + constexpr std::size_t numBarriers = 1024U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrier barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + barrier.Wait(threadId, 1U); + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(threadId); + } + barrier.Arrive(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; 
++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + + std::vector cntrs(numThreads, 0); + for (const std::size_t work : ans) { + const std::size_t current = ++cntrs[work]; + for (const std::size_t cntr : cntrs) { + BOOST_CHECK_GE(cntr, std::max(current, static_cast(2U)) - 2U); + } + } +} + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_SSP_128Threads) { + constexpr std::size_t numThreads = 128U; + constexpr std::size_t numBarriers = 16U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrier barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + barrier.Wait(threadId, 1U); + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(threadId); + } + barrier.Arrive(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + + std::vector cntrs(numThreads, 0); + for (const std::size_t work : ans) { + const std::size_t current = ++cntrs[work]; + for (const std::size_t cntr : cntrs) { + BOOST_CHECK_GE(cntr, std::max(current, static_cast(2U)) - 2U); + } + } +} \ No newline at end of file From 82de6ded9060b9990e89daf32063f33bfc3223bf Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Fri, 6 Feb 2026 10:00:43 +0100 Subject: [PATCH 13/57] moved cpu_relax --- include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 
de32d6c8..d607a80a 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -35,20 +35,12 @@ limitations under the License. # include # include +# include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp" # include "osp/bsp/model/BspInstance.hpp" # include "osp/bsp/model/BspSchedule.hpp" # include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" namespace osp { -// Portable cpu_relax definition -#if defined(__x86_64__) || defined(_M_X64) -#include -inline void cpu_relax() { _mm_pause(); } -#elif defined(__aarch64__) -inline void cpu_relax() { asm volatile("yield" ::: "memory"); } -#else -inline void cpu_relax() { std::this_thread::yield(); } -#endif // Staleness-aware barrier for SSP: threads may run up to (staleness-1) steps ahead. // Internally tracks per-step completion counts and uses adaptive backoff to limit spinning. From 9c6bc82bbd58bcdf48509729694afbafb48009e7 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Fri, 6 Feb 2026 12:56:36 +0100 Subject: [PATCH 14/57] Adding barrier to ssp sptrsv / adding to benchmark app --- apps/maxbsp_ssp_sptrsv.cpp | 100 ++++++++--- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 99 ++++------- tests/weak_barrier.cpp | 163 ++++++++++++++++++ 3 files changed, 277 insertions(+), 85 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index dd1112d0..4c8ed159 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -19,15 +19,21 @@ using namespace osp; int main(int argc, char* argv[]) { - // Accept matrix filename and iteration count as arguments + // Accept matrix filename and iteration count as arguments (threads via OMP_NUM_THREADS or optional arg) std::string filename = "../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx"; int num_iterations = 1; + unsigned num_threads = 16U; if (argc > 1) { filename = argv[1]; } if (argc > 2) { num_iterations = std::stoi(argv[2]); } + if 
(const char *omp_env = std::getenv("OMP_NUM_THREADS")) { + num_threads = static_cast(std::stoul(omp_env)); + } else if (argc > 3) { + num_threads = static_cast(std::stoul(argv[3])); + } // Load matrix Eigen::SparseMatrix lCsr; @@ -43,7 +49,7 @@ int main(int argc, char* argv[]) { graph.SetCsr(&lCsr); Eigen::SparseMatrix lCsc = lCsr; graph.SetCsc(&lCsc); - BspArchitecture> architecture(16, 1, 500); // 16 processors + BspArchitecture> architecture(num_threads, 1, 500); // configurable processors BspInstance> instance(graph, architecture); // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2) @@ -61,22 +67,43 @@ int main(int argc, char* argv[]) { size_t n = static_cast(lCsc.cols()); - // Benchmark SSP L-solve - double ssp_total_time = 0.0; - std::vector ssp_result(n, 0.0); + // Benchmark SSP L-solve with cached barrier + double ssp_cached_total_time = 0.0; + std::vector ssp_cached_result(n, 0.0); + for (int iter = 0; iter < num_iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule); + sptrsv_kernel.x_ = x.data(); + sptrsv_kernel.b_ = b.data(); + FlatCheckpointCounterBarrierCached barrier(num_threads); + auto ops = Sptrsv::MakeBarrierOps(barrier); + auto start = std::chrono::high_resolution_clock::now(); + sptrsv_kernel.SspLsolveStaleness2(ops); + auto end = std::chrono::high_resolution_clock::now(); + ssp_cached_total_time += std::chrono::duration(end - start).count(); + if (iter == 0) ssp_cached_result = std::vector(x.begin(), x.end()); + } + double ssp_cached_avg_time = ssp_cached_total_time / num_iterations; + + // Benchmark SSP L-solve with flat barrier + double ssp_flat_total_time = 0.0; + std::vector ssp_flat_result(n, 0.0); for (int iter = 0; iter < num_iterations; ++iter) { std::vector x(n, 0.0); std::vector b(n, 1.0); sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule); sptrsv_kernel.x_ = x.data(); sptrsv_kernel.b_ = b.data(); + FlatCheckpointCounterBarrier 
barrier(num_threads); + auto ops = Sptrsv::MakeBarrierOps(barrier); auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness2(); + sptrsv_kernel.SspLsolveStaleness2(ops); auto end = std::chrono::high_resolution_clock::now(); - ssp_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) ssp_result = std::vector(x.begin(), x.end()); + ssp_flat_total_time += std::chrono::duration(end - start).count(); + if (iter == 0) ssp_flat_result = std::vector(x.begin(), x.end()); } - double ssp_avg_time = ssp_total_time / num_iterations; + double ssp_flat_avg_time = ssp_flat_total_time / num_iterations; // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation) double growlocal_total_time = 0.0; @@ -114,14 +141,26 @@ int main(int argc, char* argv[]) { // Compare results double max_diff = 0.0; for (size_t i = 0; i < n; ++i) { - double diff = std::abs(ssp_result[i] - serial_result[i]); + double diff = std::abs(ssp_cached_result[i] - serial_result[i]); if (diff > max_diff) max_diff = diff; } - std::cout << "Max difference between SSP and serial L-solve: " << max_diff << std::endl; + std::cout << "Max difference between SSP (cached barrier) and serial L-solve: " << max_diff << std::endl; if (max_diff < 1e-10) { - std::cout << "SSP L-solve matches serial L-solve!" << std::endl; + std::cout << "SSP (cached barrier) L-solve matches serial L-solve!" << std::endl; } else { - std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl; + std::cout << "SSP (cached barrier) L-solve does NOT match serial L-solve!" 
<< std::endl; + } + + double max_diff_flat = 0.0; + for (size_t i = 0; i < n; ++i) { + double diff = std::abs(ssp_flat_result[i] - serial_result[i]); + if (diff > max_diff_flat) max_diff_flat = diff; + } + std::cout << "Max difference between SSP (flat barrier) and serial L-solve: " << max_diff_flat << std::endl; + if (max_diff_flat < 1e-10) { + std::cout << "SSP (flat barrier) L-solve matches serial L-solve!" << std::endl; + } else { + std::cout << "SSP (flat barrier) L-solve does NOT match serial L-solve!" << std::endl; } double max_diff_growlocal = 0.0; for (size_t i = 0; i < n; ++i) { @@ -137,24 +176,43 @@ int main(int argc, char* argv[]) { double max_diff_ssp_growlocal = 0.0; for (size_t i = 0; i < n; ++i) { - double diff = std::abs(ssp_result[i] - growlocal_result[i]); + double diff = std::abs(ssp_cached_result[i] - growlocal_result[i]); if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff; } - std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal << std::endl; + std::cout << "Max difference between SSP (cached barrier) and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal + << std::endl; - std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_avg_time << " seconds" << std::endl; + double max_diff_ssp_flat_cached = 0.0; + for (size_t i = 0; i < n; ++i) { + double diff = std::abs(ssp_flat_result[i] - ssp_cached_result[i]); + if (diff > max_diff_ssp_flat_cached) max_diff_ssp_flat_cached = diff; + } + std::cout << "Max difference between SSP (flat barrier) and SSP (cached barrier): " << max_diff_ssp_flat_cached + << std::endl; + + std::cout << "Average SSP (cached barrier) L-solve time (" << num_iterations << " runs): " << ssp_cached_avg_time + << " seconds" << std::endl; + std::cout << "Average SSP (flat barrier) L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time + << " seconds" << std::endl; std::cout << "Average GrowLocalAutoCores L-solve time (" << 
num_iterations << " runs): " << growlocal_avg_time << " seconds" << std::endl; std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl; - if (ssp_avg_time > 0.0) { - std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_avg_time) << "x" << std::endl; + if (ssp_cached_avg_time > 0.0) { + std::cout << "Speedup (serial/SSP cached): " << (serial_avg_time / ssp_cached_avg_time) << "x" << std::endl; + } + if (ssp_flat_avg_time > 0.0) { + std::cout << "Speedup (serial/SSP flat): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl; } if (growlocal_avg_time > 0.0) { std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl; } - if (ssp_avg_time > 0.0) { - std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_avg_time) << "x" << std::endl; + if (ssp_cached_avg_time > 0.0) { + std::cout << "Speedup (GrowLocalAutoCores/SSP cached): " << (growlocal_avg_time / ssp_cached_avg_time) << "x" + << std::endl; + } + if (ssp_flat_avg_time > 0.0) { + std::cout << "Speedup (GrowLocalAutoCores/SSP flat): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl; } - std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl; + std::cout << "MaxBSP staleness=2 SSP (cached+flat) and GrowLocalAutoCores SpTRSV executed." << std::endl; return 0; } diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index d607a80a..7d08e32c 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -35,62 +35,14 @@ limitations under the License. 
# include # include -# include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp" +# include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp" +# include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp" # include "osp/bsp/model/BspInstance.hpp" # include "osp/bsp/model/BspSchedule.hpp" # include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" namespace osp { -// Staleness-aware barrier for SSP: threads may run up to (staleness-1) steps ahead. -// Internally tracks per-step completion counts and uses adaptive backoff to limit spinning. -class SspStalenessBarrier { - private: - std::unique_ptr[]> stepDone_; - std::size_t stepDoneSize_ = 0U; - - public: - void Reset(std::size_t numSupersteps) { - // Reinitialize counters for a new schedule/run. - if (stepDoneSize_ != numSupersteps) { - stepDone_ = std::make_unique[]>(numSupersteps); - stepDoneSize_ = numSupersteps; - } - for (std::size_t i = 0; i < stepDoneSize_; ++i) { - stepDone_[i].store(0U, std::memory_order_relaxed); - } - } - - void WaitIfNeeded(unsigned step, unsigned staleness, unsigned nthreads) { - // Enforce: step may start only when all threads completed (step - staleness). - if (step < staleness) { - return; - } - const unsigned waitStep = step - staleness; - unsigned spinCount = 0U; - auto backoff = std::chrono::nanoseconds(50); - while (stepDone_[waitStep].load(std::memory_order_relaxed) < nthreads) { - // Adaptive backoff: spin -> yield -> short sleep to reduce contention. - if (spinCount < 2000U) { - cpu_relax(); - ++spinCount; - } else if (spinCount < 4000U) { - std::this_thread::yield(); - ++spinCount; - } else { - std::this_thread::sleep_for(backoff); - if (backoff < std::chrono::nanoseconds(500)) { - backoff *= 2; - } - } - } - std::atomic_thread_fence(std::memory_order_acquire); - } - - // Mark completion of a superstep by this thread. 
- void Arrive(unsigned step) { stepDone_[step].fetch_add(1U, std::memory_order_release); } -}; - template class Sptrsv { using UVertType = typename SparseMatrixImp::VertexIdx; @@ -99,6 +51,23 @@ class Sptrsv { const BspInstance> *instance_; public: + struct BarrierOps { + void *ctx; + void (*arrive)(void *ctx, std::size_t threadId); + void (*wait)(void *ctx, std::size_t threadId, std::size_t diff); + }; + + template + static BarrierOps MakeBarrierOps(BarrierT &barrier) { + return BarrierOps{ + static_cast(&barrier), + [](void *ctx, std::size_t threadId) { + static_cast(ctx)->Arrive(threadId); + }, + [](void *ctx, std::size_t threadId, std::size_t diff) { + static_cast(ctx)->Wait(threadId, diff); + }}; + } std::vector val_; std::vector cscVal_; @@ -122,7 +91,6 @@ class Sptrsv { std::vector>> boundsArrayL_; std::vector>> boundsArrayU_; - SspStalenessBarrier sspBarrier_; Sptrsv() = default; @@ -141,7 +109,6 @@ class Sptrsv { schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); numSupersteps_ = schedule.NumberOfSupersteps(); - sspBarrier_.Reset(static_cast(numSupersteps_)); size_t numberOfVertices = instance_->GetComputationalDag().NumVertices(); # pragma omp parallel num_threads(2) @@ -538,12 +505,10 @@ class Sptrsv { std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); } // SSP Lsolve with staleness=2 (allowing at most one superstep of lag). - // Uses the staleness barrier to respect dependencies between supersteps. - void SspLsolveStaleness2() { + // Barrier operations are injected via function pointers. + void SspLsolveStaleness2(const BarrierOps &barrierOps) { constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference const unsigned nthreads = instance_->NumberOfProcessors(); - // Reset per-step completion counters for this run. 
- sspBarrier_.Reset(static_cast(numSupersteps_)); auto *csr = instance_->GetComputationalDag().GetCSR(); const auto *outer = csr->outerIndexPtr(); @@ -552,10 +517,10 @@ class Sptrsv { #pragma omp parallel num_threads(nthreads) { - const unsigned proc = static_cast(omp_get_thread_num()); + const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { - // Ensure we are not more than (staleness-1) supersteps ahead. - sspBarrier_.WaitIfNeeded(step, static_cast(staleness), nthreads); + // Enforce staleness window before starting this superstep. + barrierOps.wait(barrierOps.ctx, proc, staleness - 1U); // Process nodes assigned to this (step, proc) pair. const size_t boundsStrSize = boundsArrayL_[step][proc].size(); for (size_t index = 0; index < boundsStrSize; index += 2) { @@ -565,9 +530,7 @@ class Sptrsv { // Initialize solution for this node x_[node] = b_[node]; // Perform lower-triangular solve for this node - for (EigenIdxType i = outer[node]; - i < outer[node + 1] - 1; - ++i) { + for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { // Subtract contributions from previously solved nodes x_[node] -= vals[i] * x_[inner[i]]; } @@ -575,12 +538,20 @@ class Sptrsv { x_[node] /= vals[outer[node + 1] - 1]; } } - // Signal completion of this superstep for staleness tracking. - sspBarrier_.Arrive(step); + // Signal completion of this superstep. + barrierOps.arrive(barrierOps.ctx, proc); } } } + // Default SSP Lsolve uses the cached flat checkpoint counter barrier. 
+ void SspLsolveStaleness2() { + const unsigned nthreads = instance_->NumberOfProcessors(); + FlatCheckpointCounterBarrierCached barrier(nthreads); + const BarrierOps ops = MakeBarrierOps(barrier); + SspLsolveStaleness2(ops); + } + virtual ~Sptrsv() = default; }; diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp index c3bc2f34..22ad875a 100644 --- a/tests/weak_barrier.cpp +++ b/tests/weak_barrier.cpp @@ -28,6 +28,7 @@ limitations under the License. #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_barrier.hpp" #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp" +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp" using namespace osp; @@ -375,6 +376,168 @@ BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrier_SSP_128Threads) { BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + std::vector cntrs(numThreads, 0); + for (const std::size_t work : ans) { + const std::size_t current = ++cntrs[work]; + for (const std::size_t cntr : cntrs) { + BOOST_CHECK_GE(cntr, std::max(current, static_cast(2U)) - 2U); + } + } +} + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_2Threads) { + constexpr std::size_t numThreads = 2U; + constexpr std::size_t numBarriers = 1024U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrierCached barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(cntr); + } + barrier.Arrive(threadId); + barrier.Wait(threadId, 0U); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), 
numThreads * numBarriers); + for (std::size_t ind = 0U; ind < ans.size(); ++ind) { + BOOST_CHECK_EQUAL(ans[ind], ind / numThreads); + } +} + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_128Threads) { + constexpr std::size_t numThreads = 128U; + constexpr std::size_t numBarriers = 16U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrierCached barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(cntr); + } + barrier.Arrive(threadId); + barrier.Wait(threadId, 0U); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + for (std::size_t ind = 0U; ind < ans.size(); ++ind) { + BOOST_CHECK_EQUAL(ans[ind], ind / numThreads); + } +} + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_SSP_2Threads) { + constexpr std::size_t numThreads = 2U; + constexpr std::size_t numBarriers = 1024U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrierCached barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + barrier.Wait(threadId, 1U); + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(threadId); + } + barrier.Arrive(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + 
BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + + std::vector cntrs(numThreads, 0); + for (const std::size_t work : ans) { + const std::size_t current = ++cntrs[work]; + for (const std::size_t cntr : cntrs) { + BOOST_CHECK_GE(cntr, std::max(current, static_cast(2U)) - 2U); + } + } +} + +BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_SSP_128Threads) { + constexpr std::size_t numThreads = 128U; + constexpr std::size_t numBarriers = 16U; + + std::vector ans; + ans.reserve(numThreads * numBarriers); + + std::mutex ans_mutex; + + FlatCheckpointCounterBarrierCached barrier{numThreads}; + + std::vector threads(numThreads); + + auto threadWork = [&ans, &ans_mutex, numBarriers, &barrier](const std::size_t threadId) { + for (std::size_t cntr = 0U; cntr < numBarriers; ++cntr) { + barrier.Wait(threadId, 1U); + { + std::lock_guard lock(ans_mutex); + ans.emplace_back(threadId); + } + barrier.Arrive(threadId); + } + }; + + for (std::size_t threadId = 0U; threadId < numThreads; ++threadId) { + threads[threadId] = std::thread(threadWork, threadId); + } + + for (auto &thread : threads) { + thread.join(); + } + + BOOST_CHECK_EQUAL(ans.size(), numThreads * numBarriers); + std::vector cntrs(numThreads, 0); for (const std::size_t work : ans) { const std::size_t current = ++cntrs[work]; From c2bbfc9fa1f091b1c173d5dee85a17d35cb3d539 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Fri, 6 Feb 2026 12:57:10 +0100 Subject: [PATCH 15/57] add different barrier implementation --- ...flat_checkpoint_counter_barrier_cached.hpp | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp new file mode 100644 index 00000000..bd5d7fab --- 
/dev/null +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp @@ -0,0 +1,95 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include +#include +#include + +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp" +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp" + +namespace osp { + +class FlatCheckpointCounterBarrierCached { + private: + std::vector cntrs_; + // Change vs flat_checkpoint_counter_barrier.hpp: flatten 2D cache into 1D array + // to improve locality and avoid nested vector indirections. + std::vector cachedCntrs_; + // Keep explicit thread count for fast index math instead of cntrs_.size(). + std::size_t numThreads_ = 0U; + + inline std::size_t &Cached(std::size_t row, std::size_t col) { + // Helper to map (row, col) to flat index. + return cachedCntrs_[row * numThreads_ + col]; + } + + inline const std::size_t &Cached(std::size_t row, std::size_t col) const { + // Const helper for the same flat index mapping. + return cachedCntrs_[row * numThreads_ + col]; + } + + public: + FlatCheckpointCounterBarrierCached(std::size_t numThreads) + : cntrs_(std::vector(numThreads)), + // Allocate one contiguous block instead of vector-of-vectors. 
+ cachedCntrs_(numThreads * numThreads, 0U), + numThreads_(numThreads) {} + + inline void Arrive(const std::size_t threadId); + inline void Wait(const std::size_t threadId, const std::size_t diff) const; + + FlatCheckpointCounterBarrierCached() = delete; + FlatCheckpointCounterBarrierCached(const FlatCheckpointCounterBarrierCached &) = delete; + FlatCheckpointCounterBarrierCached(FlatCheckpointCounterBarrierCached &&) = delete; + FlatCheckpointCounterBarrierCached &operator=(const FlatCheckpointCounterBarrierCached &) = delete; + FlatCheckpointCounterBarrierCached &operator=(FlatCheckpointCounterBarrierCached &&) = delete; + ~FlatCheckpointCounterBarrierCached() = default; +}; + +inline void FlatCheckpointCounterBarrierCached::Arrive(const std::size_t threadId) { + const std::size_t curr = cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release) + 1U; + // Update cached counter via flat indexing helper. + Cached(threadId, threadId) = curr; +} + +inline void FlatCheckpointCounterBarrierCached::Wait(const std::size_t threadId, const std::size_t diff) const { + // Compute row base once for flat cache; avoids vector-of-vectors access. + const std::size_t base = threadId * numThreads_; + // Cast away const instead of marking cachedCntrs_ mutable in this class. + std::size_t *localCached = const_cast(cachedCntrs_.data() + base); + const std::size_t localThreadVal = localCached[threadId]; + const std::size_t minVal = std::max(localThreadVal, diff) - diff; + // Hoist data pointer and use numThreads_ instead of cntrs_.size(). 
+ const AlignedAtomicCounter *cntrs = cntrs_.data(); + + for (std::size_t ind = 0U; ind < numThreads_; ++ind) { + std::size_t loopCntr = 0U; + while ((localCached[ind] < minVal) + && ((localCached[ind] = cntrs[ind].cntr_.load(std::memory_order_acquire)) < minVal)) { + ++loopCntr; + if (loopCntr % 128U == 0U) { + cpu_relax(); + } + } + } +} + +} // end namespace osp From d2b4a07a3b68ffe5c2ad84922dfa9435e2ac944e Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Fri, 6 Feb 2026 15:04:15 +0100 Subject: [PATCH 16/57] Corrections and cleaning --- apps/maxbsp_ssp_sptrsv.cpp | 88 ++++++------------- ...flat_checkpoint_counter_barrier_cached.hpp | 2 +- 2 files changed, 30 insertions(+), 60 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 4c8ed159..40c62ce3 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -67,26 +67,7 @@ int main(int argc, char* argv[]) { size_t n = static_cast(lCsc.cols()); - // Benchmark SSP L-solve with cached barrier - double ssp_cached_total_time = 0.0; - std::vector ssp_cached_result(n, 0.0); - for (int iter = 0; iter < num_iterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule); - sptrsv_kernel.x_ = x.data(); - sptrsv_kernel.b_ = b.data(); - FlatCheckpointCounterBarrierCached barrier(num_threads); - auto ops = Sptrsv::MakeBarrierOps(barrier); - auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness2(ops); - auto end = std::chrono::high_resolution_clock::now(); - ssp_cached_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) ssp_cached_result = std::vector(x.begin(), x.end()); - } - double ssp_cached_avg_time = ssp_cached_total_time / num_iterations; - - // Benchmark SSP L-solve with flat barrier + // Benchmark SSP L-solve double ssp_flat_total_time = 0.0; std::vector ssp_flat_result(n, 0.0); for (int iter = 0; iter < num_iterations; ++iter) { 
@@ -139,80 +120,69 @@ int main(int argc, char* argv[]) { double serial_avg_time = serial_total_time / num_iterations; // Compare results - double max_diff = 0.0; - for (size_t i = 0; i < n; ++i) { - double diff = std::abs(ssp_cached_result[i] - serial_result[i]); - if (diff > max_diff) max_diff = diff; - } - std::cout << "Max difference between SSP (cached barrier) and serial L-solve: " << max_diff << std::endl; - if (max_diff < 1e-10) { - std::cout << "SSP (cached barrier) L-solve matches serial L-solve!" << std::endl; - } else { - std::cout << "SSP (cached barrier) L-solve does NOT match serial L-solve!" << std::endl; - } - double max_diff_flat = 0.0; + double frobNorm = 0.0; for (size_t i = 0; i < n; ++i) { double diff = std::abs(ssp_flat_result[i] - serial_result[i]); if (diff > max_diff_flat) max_diff_flat = diff; + frobNorm += diff * diff; } - std::cout << "Max difference between SSP (flat barrier) and serial L-solve: " << max_diff_flat << std::endl; - if (max_diff_flat < 1e-10) { - std::cout << "SSP (flat barrier) L-solve matches serial L-solve!" << std::endl; + frobNorm = std::sqrt(frobNorm); + std::cout << "Frobenius norm of difference: " << frobNorm << std::endl; + std::cout << "Max difference between SSP and serial L-solve: " << max_diff_flat << std::endl; + if (frobNorm <= 1e-30 || max_diff_flat < 1e-10 * frobNorm) { + std::cout << "SSP L-solve matches serial L-solve!" << std::endl; } else { - std::cout << "SSP (flat barrier) L-solve does NOT match serial L-solve!" << std::endl; + std::cout << "SSP L-solve does NOT match serial L-solve!" 
<< std::endl; + std::cout << "Relative error: " << (max_diff_flat / frobNorm) << std::endl; } double max_diff_growlocal = 0.0; + double frobNormGrowlocal = 0.0; for (size_t i = 0; i < n; ++i) { double diff = std::abs(growlocal_result[i] - serial_result[i]); if (diff > max_diff_growlocal) max_diff_growlocal = diff; + frobNormGrowlocal += diff * diff; } + frobNormGrowlocal = std::sqrt(frobNormGrowlocal); std::cout << "Max difference between GrowLocalAutoCores and serial L-solve: " << max_diff_growlocal << std::endl; - if (max_diff_growlocal < 1e-10) { + if (frobNormGrowlocal <= 1e-30 || max_diff_growlocal < 1e-10 * frobNormGrowlocal) { std::cout << "GrowLocalAutoCores L-solve matches serial L-solve!" << std::endl; } else { std::cout << "GrowLocalAutoCores L-solve does NOT match serial L-solve!" << std::endl; + std::cout << "Relative error: " << (max_diff_growlocal / frobNormGrowlocal) << std::endl; } double max_diff_ssp_growlocal = 0.0; + double frobNormSspGrowlocal = 0.0; for (size_t i = 0; i < n; ++i) { - double diff = std::abs(ssp_cached_result[i] - growlocal_result[i]); + double diff = std::abs(ssp_flat_result[i] - growlocal_result[i]); if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff; + frobNormSspGrowlocal += diff * diff; } - std::cout << "Max difference between SSP (cached barrier) and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal + frobNormSspGrowlocal = std::sqrt(frobNormSspGrowlocal); + std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal << std::endl; - - double max_diff_ssp_flat_cached = 0.0; - for (size_t i = 0; i < n; ++i) { - double diff = std::abs(ssp_flat_result[i] - ssp_cached_result[i]); - if (diff > max_diff_ssp_flat_cached) max_diff_ssp_flat_cached = diff; + if (frobNormSspGrowlocal <= 1e-30 || max_diff_ssp_growlocal < 1e-10 * frobNormSspGrowlocal) { + std::cout << "SSP L-solve matches GrowLocalAutoCores L-solve!" 
<< std::endl; + } else { + std::cout << "SSP L-solve does NOT match GrowLocalAutoCores L-solve!" << std::endl; + std::cout << "Relative error: " << (max_diff_ssp_growlocal / frobNormSspGrowlocal) << std::endl; } - std::cout << "Max difference between SSP (flat barrier) and SSP (cached barrier): " << max_diff_ssp_flat_cached - << std::endl; - std::cout << "Average SSP (cached barrier) L-solve time (" << num_iterations << " runs): " << ssp_cached_avg_time - << " seconds" << std::endl; - std::cout << "Average SSP (flat barrier) L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time + std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time << " seconds" << std::endl; std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time << " seconds" << std::endl; std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl; - if (ssp_cached_avg_time > 0.0) { - std::cout << "Speedup (serial/SSP cached): " << (serial_avg_time / ssp_cached_avg_time) << "x" << std::endl; - } if (ssp_flat_avg_time > 0.0) { - std::cout << "Speedup (serial/SSP flat): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl; + std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl; } if (growlocal_avg_time > 0.0) { std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl; } - if (ssp_cached_avg_time > 0.0) { - std::cout << "Speedup (GrowLocalAutoCores/SSP cached): " << (growlocal_avg_time / ssp_cached_avg_time) << "x" - << std::endl; - } if (ssp_flat_avg_time > 0.0) { - std::cout << "Speedup (GrowLocalAutoCores/SSP flat): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl; + std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl; } - std::cout << "MaxBSP 
staleness=2 SSP (cached+flat) and GrowLocalAutoCores SpTRSV executed." << std::endl; + std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl; return 0; } diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp index bd5d7fab..76cd5e4a 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp @@ -92,4 +92,4 @@ inline void FlatCheckpointCounterBarrierCached::Wait(const std::size_t threadId, } } -} // end namespace osp +} \ No newline at end of file From bc18d26b8957512cc59f2eed47e0bf1925d17a61 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Fri, 6 Feb 2026 15:48:21 +0100 Subject: [PATCH 17/57] removed false sharing --- .../WeakBarriers/flat_checkpoint_counter_barrier.hpp | 10 +++++++++- tests/weak_barrier.cpp | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp index 87607def..df3b53f1 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp @@ -28,6 +28,13 @@ limitations under the License. 
namespace osp { +constexpr std::size_t RoundUpToCacheLine(std::size_t num) { + std::size_t size = ((num * sizeof(std::size_t) + CACHE_LINE_SIZE - 1U) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE; + std::size_t ans = (size + sizeof(std::size_t) - 1U) / sizeof(std::size_t); + + return ans; +} + struct alignas(CACHE_LINE_SIZE) AlignedAtomicCounter { std::atomic cntr_{0U}; int8_t pad[CACHE_LINE_SIZE - sizeof(std::atomic)]; @@ -44,7 +51,8 @@ class FlatCheckpointCounterBarrier { public: FlatCheckpointCounterBarrier(std::size_t numThreads) : cntrs_(std::vector(numThreads)), - cachedCntrs_(std::vector>(numThreads, std::vector(numThreads, 0U))) {}; + cachedCntrs_( + std::vector>(numThreads, std::vector(RoundUpToCacheLine(numThreads), 0U))) {}; inline void Arrive(const std::size_t threadId); inline void Wait(const std::size_t threadId, const std::size_t diff) const; diff --git a/tests/weak_barrier.cpp b/tests/weak_barrier.cpp index 22ad875a..a5d8ad01 100644 --- a/tests/weak_barrier.cpp +++ b/tests/weak_barrier.cpp @@ -545,4 +545,16 @@ BOOST_AUTO_TEST_CASE(TestFlatCheckpointCounterBarrierCached_SSP_128Threads) { BOOST_CHECK_GE(cntr, std::max(current, static_cast(2U)) - 2U); } } +} + +BOOST_AUTO_TEST_CASE(TestVectorPadding) { + for (std::size_t i = 0U; i < 257; ++i) { + const std::size_t numCacheLines = (i * sizeof(std::size_t) + CACHE_LINE_SIZE - 1U) / CACHE_LINE_SIZE; + const std::size_t ans = RoundUpToCacheLine(i); + + BOOST_CHECK_LE(numCacheLines * CACHE_LINE_SIZE, ans * sizeof(std::size_t)); + if (ans > 0U) { + BOOST_CHECK_GT(numCacheLines * CACHE_LINE_SIZE, (ans - 1U) * sizeof(std::size_t)); + } + } } \ No newline at end of file From c454e8a2f49ebfbcb3a757e39a19a93f4f0d52da Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 12 Feb 2026 16:51:10 +0100 Subject: [PATCH 18/57] added SSP grow local --- apps/maxbsp_ssp_sptrsv.cpp | 194 ++++++++++++++++++++++++------------- 1 file changed, 124 insertions(+), 70 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp 
b/apps/maxbsp_ssp_sptrsv.cpp index 40c62ce3..c18f4f7d 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -3,22 +3,57 @@ * Demonstrates maxbsp scheduling with staleness=2, then runs SpTRSV with SSP kernel. */ -#include -#include #include +#include +#include #include +#include + #include "osp/auxiliary/sptrsv_simulator/sptrsv.hpp" #include "osp/bsp/model/BspInstance.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/model/MaxBspSchedule.hpp" -#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp" +#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp" +#include "osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp" #include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" -#include using namespace osp; -int main(int argc, char* argv[]) { +#define EPSILON 1e-20 + +double L2NormalisedDiff(const std::vector &v, const std::vector &w) { + assert(v.size() == w.size()); + double l2diff = 0.0; + double frobNorm = 0.0; + for (std::size_t i = 0U; i < v.size(); ++i) { + const double absdiff = std::abs(v[i] - w[i]); + l2diff += absdiff * absdiff; + + const double vAbs = std::abs(v[i]); + const double wAbs = std::abs(w[i]); + + frobNorm += ((vAbs * vAbs) + (wAbs * wAbs)) / 2.0; + } + l2diff = std::sqrt(l2diff); + frobNorm = std::sqrt(frobNorm); + const double ratio = l2diff / (frobNorm + EPSILON); + return ratio; +} + +double LInftyNormalisedDiff(const std::vector &v, const std::vector &w) { + double diff = 0.0; + for (std::size_t i = 0U; i < v.size(); ++i) { + const double absdiff = std::abs(v[i] - w[i]); + const double vAbs = std::abs(v[i]); + const double wAbs = std::abs(w[i]); + + diff = std::max(diff, 2 * absdiff / (vAbs + wAbs + EPSILON)); + } + return diff; +} + +int main(int argc, char *argv[]) { // Accept matrix filename and iteration count as arguments (threads via OMP_NUM_THREADS or optional arg) std::string 
filename = "../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx"; int num_iterations = 1; @@ -49,13 +84,18 @@ int main(int argc, char* argv[]) { graph.SetCsr(&lCsr); Eigen::SparseMatrix lCsc = lCsr; graph.SetCsc(&lCsc); - BspArchitecture> architecture(num_threads, 1, 500); // configurable processors + BspArchitecture> architecture(num_threads, 1, 500); // configurable processors BspInstance> instance(graph, architecture); // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2) - GreedyVarianceSspScheduler> ssp_scheduler; - MaxBspSchedule> ssp_schedule(instance); - ssp_scheduler.ComputeSchedule(ssp_schedule); + GreedyVarianceSspScheduler> ssp_var_scheduler; + MaxBspSchedule> ssp_var_schedule(instance); + ssp_var_scheduler.ComputeSchedule(ssp_var_schedule); + + // Create SSP-aware schedule using GrowLocalMaxBsp (staleness=2) + GrowLocalSSP> ssp_gl_scheduler; + MaxBspSchedule> ssp_gl_schedule(instance); + ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule); // Create a non-SSP schedule using GrowLocalAutoCores GrowLocalAutoCores> growlocal_scheduler; @@ -67,13 +107,13 @@ int main(int argc, char* argv[]) { size_t n = static_cast(lCsc.cols()); - // Benchmark SSP L-solve - double ssp_flat_total_time = 0.0; - std::vector ssp_flat_result(n, 0.0); + // Benchmark SSP Variance L-solve + double ssp_var_flat_total_time = 0.0; + std::vector ssp_var_flat_result(n, 0.0); for (int iter = 0; iter < num_iterations; ++iter) { std::vector x(n, 0.0); std::vector b(n, 1.0); - sptrsv_kernel.SetupCsrNoPermutation(ssp_schedule); + sptrsv_kernel.SetupCsrNoPermutation(ssp_var_schedule); sptrsv_kernel.x_ = x.data(); sptrsv_kernel.b_ = b.data(); FlatCheckpointCounterBarrier barrier(num_threads); @@ -81,10 +121,33 @@ int main(int argc, char* argv[]) { auto start = std::chrono::high_resolution_clock::now(); sptrsv_kernel.SspLsolveStaleness2(ops); auto end = std::chrono::high_resolution_clock::now(); - ssp_flat_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) 
ssp_flat_result = std::vector(x.begin(), x.end()); + ssp_var_flat_total_time += std::chrono::duration(end - start).count(); + if (iter == 0) { + ssp_var_flat_result = std::vector(x.begin(), x.end()); + } } - double ssp_flat_avg_time = ssp_flat_total_time / num_iterations; + double ssp_var_flat_avg_time = ssp_var_flat_total_time / num_iterations; + + // Benchmark SSP GrowLocal L-solve + double ssp_gl_flat_total_time = 0.0; + std::vector ssp_gl_flat_result(n, 0.0); + for (int iter = 0; iter < num_iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv_kernel.SetupCsrNoPermutation(ssp_gl_schedule); + sptrsv_kernel.x_ = x.data(); + sptrsv_kernel.b_ = b.data(); + FlatCheckpointCounterBarrier barrier(num_threads); + auto ops = Sptrsv::MakeBarrierOps(barrier); + auto start = std::chrono::high_resolution_clock::now(); + sptrsv_kernel.SspLsolveStaleness2(ops); + auto end = std::chrono::high_resolution_clock::now(); + ssp_gl_flat_total_time += std::chrono::duration(end - start).count(); + if (iter == 0) { + ssp_gl_flat_result = std::vector(x.begin(), x.end()); + } + } + double ssp_gl_flat_avg_time = ssp_gl_flat_total_time / num_iterations; // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation) double growlocal_total_time = 0.0; @@ -99,7 +162,9 @@ int main(int argc, char* argv[]) { sptrsv_kernel.LsolveNoPermutation(); auto end = std::chrono::high_resolution_clock::now(); growlocal_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) growlocal_result = std::vector(x.begin(), x.end()); + if (iter == 0) { + growlocal_result = std::vector(x.begin(), x.end()); + } } double growlocal_avg_time = growlocal_total_time / num_iterations; @@ -115,74 +180,63 @@ int main(int argc, char* argv[]) { sptrsv_kernel.LsolveSerial(); auto end = std::chrono::high_resolution_clock::now(); serial_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) serial_result = std::vector(x_serial.begin(), 
x_serial.end()); + if (iter == 0) { + serial_result = std::vector(x_serial.begin(), x_serial.end()); + } } double serial_avg_time = serial_total_time / num_iterations; // Compare results - double max_diff_flat = 0.0; - double frobNorm = 0.0; - for (size_t i = 0; i < n; ++i) { - double diff = std::abs(ssp_flat_result[i] - serial_result[i]); - if (diff > max_diff_flat) max_diff_flat = diff; - frobNorm += diff * diff; - } - frobNorm = std::sqrt(frobNorm); - std::cout << "Frobenius norm of difference: " << frobNorm << std::endl; - std::cout << "Max difference between SSP and serial L-solve: " << max_diff_flat << std::endl; - if (frobNorm <= 1e-30 || max_diff_flat < 1e-10 * frobNorm) { - std::cout << "SSP L-solve matches serial L-solve!" << std::endl; - } else { - std::cout << "SSP L-solve does NOT match serial L-solve!" << std::endl; - std::cout << "Relative error: " << (max_diff_flat / frobNorm) << std::endl; - } - double max_diff_growlocal = 0.0; - double frobNormGrowlocal = 0.0; - for (size_t i = 0; i < n; ++i) { - double diff = std::abs(growlocal_result[i] - serial_result[i]); - if (diff > max_diff_growlocal) max_diff_growlocal = diff; - frobNormGrowlocal += diff * diff; - } - frobNormGrowlocal = std::sqrt(frobNormGrowlocal); - std::cout << "Max difference between GrowLocalAutoCores and serial L-solve: " << max_diff_growlocal << std::endl; - if (frobNormGrowlocal <= 1e-30 || max_diff_growlocal < 1e-10 * frobNormGrowlocal) { - std::cout << "GrowLocalAutoCores L-solve matches serial L-solve!" << std::endl; + const double varDiff = LInftyNormalisedDiff(ssp_var_flat_result, serial_result); + + std::cout << "Max relative difference between SSP Variance and serial L-solve: " << varDiff << std::endl; + if (varDiff < EPSILON) { + std::cout << "SSP Variance L-solve matches serial L-solve!" << std::endl; } else { - std::cout << "GrowLocalAutoCores L-solve does NOT match serial L-solve!" 
<< std::endl; - std::cout << "Relative error: " << (max_diff_growlocal / frobNormGrowlocal) << std::endl; + std::cout << "SSP Variance L-solve does NOT match serial L-solve!" << std::endl; } - double max_diff_ssp_growlocal = 0.0; - double frobNormSspGrowlocal = 0.0; - for (size_t i = 0; i < n; ++i) { - double diff = std::abs(ssp_flat_result[i] - growlocal_result[i]); - if (diff > max_diff_ssp_growlocal) max_diff_ssp_growlocal = diff; - frobNormSspGrowlocal += diff * diff; + const double GLSSPDiff = LInftyNormalisedDiff(ssp_gl_flat_result, serial_result); + + std::cout << "Max relative difference between SSP GrowLocal and serial L-solve: " << GLSSPDiff << std::endl; + if (GLSSPDiff < EPSILON) { + std::cout << "SSP GrowLocal L-solve matches serial L-solve!" << std::endl; + } else { + std::cout << "SSP GrowLocal L-solve does NOT match serial L-solve!" << std::endl; } - frobNormSspGrowlocal = std::sqrt(frobNormSspGrowlocal); - std::cout << "Max difference between SSP and GrowLocalAutoCores L-solve: " << max_diff_ssp_growlocal - << std::endl; - if (frobNormSspGrowlocal <= 1e-30 || max_diff_ssp_growlocal < 1e-10 * frobNormSspGrowlocal) { - std::cout << "SSP L-solve matches GrowLocalAutoCores L-solve!" << std::endl; + + const double GLPDiff = LInftyNormalisedDiff(growlocal_result, serial_result); + + std::cout << "Max relative difference between GrowLocal and serial L-solve: " << GLPDiff << std::endl; + if (GLPDiff < EPSILON) { + std::cout << "GrowLocal L-solve matches serial L-solve!" << std::endl; } else { - std::cout << "SSP L-solve does NOT match GrowLocalAutoCores L-solve!" << std::endl; - std::cout << "Relative error: " << (max_diff_ssp_growlocal / frobNormSspGrowlocal) << std::endl; + std::cout << "GrowLocal L-solve does NOT match serial L-solve!" 
<< std::endl; } - std::cout << "Average SSP L-solve time (" << num_iterations << " runs): " << ssp_flat_avg_time - << " seconds" << std::endl; - std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time - << " seconds" << std::endl; - std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl; - if (ssp_flat_avg_time > 0.0) { - std::cout << "Speedup (serial/SSP): " << (serial_avg_time / ssp_flat_avg_time) << "x" << std::endl; + std::cout << "Average SSP Variance L-solve time (" << num_iterations << " runs): " << ssp_var_flat_avg_time << " seconds" + << std::endl; + std::cout << "Average SSP GrowLocal L-solve time (" << num_iterations << " runs): " << ssp_gl_flat_avg_time << " seconds" + << std::endl; + std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time << " seconds" + << std::endl; + std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl << std::endl; + + if (ssp_var_flat_avg_time > 0.0) { + std::cout << "Speedup (serial/SSP Var): " << (serial_avg_time / ssp_var_flat_avg_time) << "x" << std::endl; + } + if (ssp_gl_flat_avg_time > 0.0) { + std::cout << "Speedup (serial/SSP GL): " << (serial_avg_time / ssp_gl_flat_avg_time) << "x" << std::endl; } if (growlocal_avg_time > 0.0) { std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl; } - if (ssp_flat_avg_time > 0.0) { - std::cout << "Speedup (GrowLocalAutoCores/SSP): " << (growlocal_avg_time / ssp_flat_avg_time) << "x" << std::endl; + if (ssp_var_flat_avg_time > 0.0) { + std::cout << "Speedup (GrowLocalAutoCores/SSP Var): " << (growlocal_avg_time / ssp_var_flat_avg_time) << "x" << std::endl; + } + if (ssp_gl_flat_avg_time > 0.0) { + std::cout << "Speedup (GrowLocalAutoCores/SSP GL): " << (growlocal_avg_time / 
ssp_gl_flat_avg_time) << "x" << std::endl; } - std::cout << "MaxBSP staleness=2 SSP and GrowLocalAutoCores SpTRSV executed." << std::endl; + return 0; } From b32f4dc40bc5302d5a9bc2872a79fd61c1d798b4 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 12 Feb 2026 17:11:22 +0100 Subject: [PATCH 19/57] changed splitting of work between supersteps --- include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp index 8ce849c6..23462bd8 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp @@ -71,7 +71,7 @@ typename std::deque>::difference_type GrowLocalSSP::m typename std::deque::difference_type lengthNext = std::distance(nextSuperstepReady.cbegin(), nextSuperstepReady.cend()); - typename std::deque::difference_type ans = ((lengthCurrently + lengthNext + 2) / 3) * 2; + typename std::deque::difference_type ans = ((lengthCurrently + lengthNext + 1) / 2); return ans; } From 27f0edd551e45eb6868ef4735e5e1494b64f664f Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Fri, 13 Feb 2026 13:59:20 +0100 Subject: [PATCH 20/57] aligned allocators --- .../WeakBarriers/aligned_allocator.hpp | 70 +++++++++++++++++ .../flat_checkpoint_counter_barrier.hpp | 14 ++-- tests/CMakeLists.txt | 2 + tests/aligned_allocator.cpp | 77 +++++++++++++++++++ 4 files changed, 157 insertions(+), 6 deletions(-) create mode 100644 include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp create mode 100644 tests/aligned_allocator.cpp diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp new file mode 100644 index 00000000..b5103a91 --- /dev/null +++ 
b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp @@ -0,0 +1,70 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include +#include +#include + +namespace osp { + +template +struct AlignedAllocator { + static_assert(alignment > 0U, "Alignment must be a positive integer."); + static_assert((alignment & (alignment - 1U)) == 0U, "Alignment must be a power of two."); + static_assert(alignment % alignof(T) == 0U, "Alignment must be a multiple of the alignment of the type."); + + using value_type = T; + + template + struct rebind { + using other = AlignedAllocator; + }; + + AlignedAllocator() noexcept = default; + + template + AlignedAllocator(const AlignedAllocator &) noexcept {} + + inline T *allocate(std::size_t size) { return reinterpret_cast(std::aligned_alloc(alignment, size * sizeof(T))); } + + inline void deallocate(T *p, [[maybe_unused]] std::size_t size) { std::free(p); } + + template + inline void construct(U *p, Args &&...args) { + new (static_cast(p)) U(std::forward(args)...); + } + + template + inline void destroy(U *p) noexcept { + p->~U(); + } +}; + +template +constexpr bool operator==(const AlignedAllocator &, const AlignedAllocator &) noexcept { + return (T_alignment == U_alignment); +} + +template +constexpr bool operator!=(const AlignedAllocator &, const AlignedAllocator &) noexcept { + return 
(T_alignment != U_alignment); +} + +} // end namespace osp diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp index df3b53f1..5b25acd3 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp @@ -23,6 +23,7 @@ limitations under the License. #include #include +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp" #include "osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp" #include "osp/config/config.hpp" @@ -46,13 +47,14 @@ struct alignas(CACHE_LINE_SIZE) AlignedAtomicCounter { class FlatCheckpointCounterBarrier { private: std::vector cntrs_; - mutable std::vector> cachedCntrs_; + mutable std::vector>> cachedCntrs_; public: FlatCheckpointCounterBarrier(std::size_t numThreads) : cntrs_(std::vector(numThreads)), - cachedCntrs_( - std::vector>(numThreads, std::vector(RoundUpToCacheLine(numThreads), 0U))) {}; + cachedCntrs_(std::vector>>( + numThreads, + std::vector>(RoundUpToCacheLine(numThreads), 0U))) {}; inline void Arrive(const std::size_t threadId); inline void Wait(const std::size_t threadId, const std::size_t diff) const; @@ -66,12 +68,12 @@ class FlatCheckpointCounterBarrier { }; inline void FlatCheckpointCounterBarrier::Arrive(const std::size_t threadId) { - const std::size_t curr = cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release) + 1U; - cachedCntrs_[threadId][threadId] = curr; + cntrs_[threadId].cntr_.fetch_add(1U, std::memory_order_release); + ++cachedCntrs_[threadId][threadId]; } inline void FlatCheckpointCounterBarrier::Wait(const std::size_t threadId, const std::size_t diff) const { - std::vector &localCachedCntrs = cachedCntrs_[threadId]; + std::vector> &localCachedCntrs = cachedCntrs_[threadId]; const std::size_t minVal = 
std::max(localCachedCntrs[threadId], diff) - diff; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d6a8f8c2..7bf3aed5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -131,6 +131,8 @@ _add_test( hash_pair ) _add_test( weak_barrier ) +_add_test( aligned_allocator ) + ## io _add_test( filereader DATA ) diff --git a/tests/aligned_allocator.cpp b/tests/aligned_allocator.cpp new file mode 100644 index 00000000..6f03257d --- /dev/null +++ b/tests/aligned_allocator.cpp @@ -0,0 +1,77 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#define BOOST_TEST_MODULE AlignedAllocatorTests + +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp" + +#include +#include + +using namespace osp; + +BOOST_AUTO_TEST_CASE(TestAlignedAllocation32) { + constexpr std::size_t alignment = 32U; + + std::vector> vec(7, 7U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + + for (unsigned i = 0U; i < 2048U; ++i) { + vec.emplace_back(i); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + } + + vec.resize(8000U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + vec.resize(5U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); +} + +BOOST_AUTO_TEST_CASE(TestAlignedAllocation16) { + constexpr std::size_t alignment = 16U; + + std::vector> vec(7, 7U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + + for (unsigned i = 0U; i < 2048U; ++i) { + vec.emplace_back(i); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + } + + vec.resize(8000U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + vec.resize(5U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); +} + +BOOST_AUTO_TEST_CASE(TestAlignedAllocation64) { + constexpr std::size_t alignment = 64U; + + std::vector> vec(7, 7U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + + for (unsigned i = 0U; i < 2048U; ++i) { + vec.emplace_back('a'); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + } + + vec.resize(8000U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); + vec.resize(5U); + BOOST_CHECK_EQUAL(reinterpret_cast(static_cast(vec.data())) % alignment, 0U); +} From ef2abf8578db0239b125985f6d59a9a0a4fdde61 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 19 Feb 2026 08:42:23 +0100 Subject: 
[PATCH 21/57] compact sparse graph mtx file reader --- ...tx_to_compact_sparse_graph_file_reader.hpp | 138 ++++++++++++++++++ .../adj_list_impl/compact_sparse_graph.hpp | 4 +- tests/filereader.cpp | 71 +++++++++ 3 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp diff --git a/include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp b/include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp new file mode 100644 index 00000000..1730f594 --- /dev/null +++ b/include/osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp @@ -0,0 +1,138 @@ +/* +Copyright 2026 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#pragma once + +#include +#include + +#include "osp/auxiliary/io/mtx_graph_file_reader.hpp" +#include "osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp" + +namespace osp { +namespace file_reader { + +template <> +bool ReadComputationalDagMartixMarketFormat< + CompactSparseGraph>( std::ifstream &infile, CompactSparseGraph + &graph) { + using GraphT + = CompactSparseGraph; + using VertexT = VertexIdxT; + + std::vector> edges; + std::string line; + + // Skip comments or empty lines (robustly) + while (std::getline(infile, line)) { + if (line.empty() || line[0] == '%') { + continue; + } + + // Null byte check + if (line.find('\0') != std::string::npos) { + std::cerr << "Error: Null byte detected in header line.\n"; + return false; + } + + if (line.size() > MAX_LINE_LENGTH) { + std::cerr << "Error: Line too long, possible malformed or malicious file.\n"; + return false; + } + break; // We found the actual header line + } + + if (infile.eof()) { + std::cerr << "Error: Unexpected end of file while reading header.\n"; + return false; + } + + VertexT mRow = 0; + VertexT mCol = 0; + std::size_t nEntries = 0; + + std::istringstream headerStream(line); + if (!(headerStream >> mRow >> mCol >> nEntries) || mRow <= 0 || mCol <= 0 || mRow != mCol) { + std::cerr << "Error: Invalid header or non-square matrix.\n"; + return false; + } + + const VertexT numNodes = mRow; + + std::size_t entriesRead = 0; + while (entriesRead < nEntries && std::getline(infile, line)) { + if (line.empty() || line[0] == '%') { + continue; + } + if (line.size() > MAX_LINE_LENGTH) { + std::cerr << "Error: Line too long.\n"; + return false; + } + + std::istringstream entryStream(line); + VertexT row = std::numeric_limits::max(); + VertexT col = std::numeric_limits::max(); + double val = 0.0; + + if (!(entryStream >> row >> col >> val)) { + std::cerr << "Error: Malformed matrix entry.\n"; + return false; + } + + row -= 1; + col -= 1; // Convert to 0-based + + if (row >= mRow || 
col >= mCol) { + std::cerr << "Error: Matrix entry out of bounds.\n"; + return false; + } + + if (row < col) { + std::cerr << "Error: Expected lower-triangular matrix.\n"; + return false; + } + + if (row != col) { + edges.emplace_back(col, row); + } + + ++entriesRead; + } + + if (entriesRead != nEntries) { + std::cerr << "Error: Incomplete matrix entries.\n"; + return false; + } + + while (std::getline(infile, line)) { + if (!line.empty() && line[0] != '%') { + std::cerr << "Error: Extra data after matrix content.\n"; + return false; + } + } + + graph = GraphT(numNodes, edges); + + return true; +} + +} // namespace file_reader +} // namespace osp diff --git a/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp b/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp index 9d4614fb..e7488bc4 100644 --- a/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp +++ b/include/osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp @@ -831,7 +831,7 @@ class CompactSparseGraph { template inline std::enable_if_t VertexCommWeight(const VertexIdx) const { - return static_cast(0); + return static_cast(1); } template @@ -841,7 +841,7 @@ class CompactSparseGraph { template inline std::enable_if_t VertexMemWeight(const VertexIdx) const { - return static_cast(0); + return static_cast(1); } template diff --git a/tests/filereader.cpp b/tests/filereader.cpp index 0f6c0917..85347b3b 100644 --- a/tests/filereader.cpp +++ b/tests/filereader.cpp @@ -25,6 +25,7 @@ limitations under the License. 
#include "osp/auxiliary/io/dot_graph_file_reader.hpp" #include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/auxiliary/io/mtx_graph_file_reader.hpp" +#include "osp/auxiliary/io/mtx_to_compact_sparse_graph_file_reader.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" #include "osp/graph_implementations/boost_graphs/boost_graph.hpp" @@ -169,6 +170,76 @@ BOOST_AUTO_TEST_CASE(TestMtxBoostGraph) { BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(7).begin(), graph.Children(7).end(), c7.begin(), c7.end()); } +BOOST_AUTO_TEST_CASE(TestMtxCompactSparseGraph) { + // Getting root git directory + std::filesystem::path cwd = std::filesystem::current_path(); + std::cout << cwd << std::endl; + while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) { + cwd = cwd.parent_path(); + std::cout << cwd << std::endl; + } + + CompactSparseGraph + graph; + + bool status + = file_reader::ReadComputationalDagMartixMarketFormat((cwd / "data/mtx_tests/ErdosRenyi_8_19_A.mtx").string(), graph); + + std::cout << "STATUS:" << status << std::endl; + BOOST_CHECK(status); + BOOST_CHECK_EQUAL(graph.NumVertices(), 8); + BOOST_CHECK_EQUAL(graph.NumEdges(), 19); + + // ---- Node 0 + std::vector p0{}; + std::vector c0{2, 3, 4, 5, 6}; + + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(0).begin(), graph.Parents(0).end(), p0.begin(), p0.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(0).begin(), graph.Children(0).end(), c0.begin(), c0.end()); + + // ---- Node 1 + std::vector p1{}; + std::vector c1{2, 3, 5, 6}; + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(1).begin(), graph.Parents(1).end(), p1.begin(), p1.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(1).begin(), graph.Children(1).end(), c1.begin(), c1.end()); + + // ---- Node 2 + std::vector p2{0, 1}; + std::vector c2{3, 5}; + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(2).begin(), 
graph.Parents(2).end(), p2.begin(), p2.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(2).begin(), graph.Children(2).end(), c2.begin(), c2.end()); + + // ---- Node 3 + std::vector p3{0, 1, 2}; + std::vector c3{4, 5, 6, 7}; + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(3).begin(), graph.Parents(3).end(), p3.begin(), p3.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(3).begin(), graph.Children(3).end(), c3.begin(), c3.end()); + + // ---- Node 4 + std::vector p4{0, 3}; + std::vector c4{5, 6, 7}; + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(4).begin(), graph.Parents(4).end(), p4.begin(), p4.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(4).begin(), graph.Children(4).end(), c4.begin(), c4.end()); + + // ---- Node 5 + std::vector p5{0, 1, 2, 3, 4}; + std::vector c5{}; + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(5).begin(), graph.Parents(5).end(), p5.begin(), p5.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(5).begin(), graph.Children(5).end(), c5.begin(), c5.end()); + + // ---- Node 6 + std::vector p6{0, 1, 3, 4}; + std::vector c6{7}; + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(6).begin(), graph.Parents(6).end(), p6.begin(), p6.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(6).begin(), graph.Children(6).end(), c6.begin(), c6.end()); + + // ---- Node 7 + std::vector p7{3, 4, 6}; + std::vector c7{}; + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(7).begin(), graph.Parents(7).end(), p7.begin(), p7.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(7).begin(), graph.Children(7).end(), c7.begin(), c7.end()); +} + BOOST_AUTO_TEST_CASE(TestBicgstab) { // Getting root git directory std::filesystem::path cwd = std::filesystem::current_path(); From 63827f5194709e4ecdff2529be8f87e30ee5af92 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 19 Feb 2026 08:46:27 +0100 Subject: [PATCH 22/57] vertex weight test --- tests/filereader.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/filereader.cpp 
b/tests/filereader.cpp index 85347b3b..2d809458 100644 --- a/tests/filereader.cpp +++ b/tests/filereader.cpp @@ -196,48 +196,56 @@ BOOST_AUTO_TEST_CASE(TestMtxCompactSparseGraph) { BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(0).begin(), graph.Parents(0).end(), p0.begin(), p0.end()); BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(0).begin(), graph.Children(0).end(), c0.begin(), c0.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(0), p0.size() + 1); // ---- Node 1 std::vector p1{}; std::vector c1{2, 3, 5, 6}; BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(1).begin(), graph.Parents(1).end(), p1.begin(), p1.end()); BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(1).begin(), graph.Children(1).end(), c1.begin(), c1.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(1), p1.size() + 1); // ---- Node 2 std::vector p2{0, 1}; std::vector c2{3, 5}; BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(2).begin(), graph.Parents(2).end(), p2.begin(), p2.end()); BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(2).begin(), graph.Children(2).end(), c2.begin(), c2.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(2), p2.size() + 1); // ---- Node 3 std::vector p3{0, 1, 2}; std::vector c3{4, 5, 6, 7}; BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(3).begin(), graph.Parents(3).end(), p3.begin(), p3.end()); BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(3).begin(), graph.Children(3).end(), c3.begin(), c3.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(3), p3.size() + 1); // ---- Node 4 std::vector p4{0, 3}; std::vector c4{5, 6, 7}; BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(4).begin(), graph.Parents(4).end(), p4.begin(), p4.end()); BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(4).begin(), graph.Children(4).end(), c4.begin(), c4.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(4), p4.size() + 1); // ---- Node 5 std::vector p5{0, 1, 2, 3, 4}; std::vector c5{}; BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(5).begin(), graph.Parents(5).end(), p5.begin(), p5.end()); 
BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(5).begin(), graph.Children(5).end(), c5.begin(), c5.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(5), p5.size() + 1); // ---- Node 6 std::vector p6{0, 1, 3, 4}; std::vector c6{7}; BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(6).begin(), graph.Parents(6).end(), p6.begin(), p6.end()); BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(6).begin(), graph.Children(6).end(), c6.begin(), c6.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(6), p6.size() + 1); // ---- Node 7 std::vector p7{3, 4, 6}; std::vector c7{}; BOOST_CHECK_EQUAL_COLLECTIONS(graph.Parents(7).begin(), graph.Parents(7).end(), p7.begin(), p7.end()); BOOST_CHECK_EQUAL_COLLECTIONS(graph.Children(7).begin(), graph.Children(7).end(), c7.begin(), c7.end()); + BOOST_CHECK_EQUAL(graph.VertexWorkWeight(7), p7.size() + 1); } BOOST_AUTO_TEST_CASE(TestBicgstab) { From fc2a31504527bbcdb02bf128b4dd87be0c1dcd26 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 19 Feb 2026 13:39:18 +0100 Subject: [PATCH 23/57] fixed allocation length --- .../WeakBarriers/aligned_allocator.hpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp index b5103a91..906f87a2 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/aligned_allocator.hpp @@ -42,9 +42,8 @@ struct AlignedAllocator { template AlignedAllocator(const AlignedAllocator &) noexcept {} - inline T *allocate(std::size_t size) { return reinterpret_cast(std::aligned_alloc(alignment, size * sizeof(T))); } - - inline void deallocate(T *p, [[maybe_unused]] std::size_t size) { std::free(p); } + inline T *allocate(std::size_t size); + inline void deallocate(T *p, [[maybe_unused]] std::size_t size); template inline void construct(U *p, Args &&...args) { @@ -57,6 
+56,17 @@ struct AlignedAllocator { } }; +template +inline T *AlignedAllocator::allocate(std::size_t size) { + std::size_t allocationSize = ((size * sizeof(T) + alignment - 1U) / alignment) * alignment; + return reinterpret_cast(std::aligned_alloc(alignment, allocationSize)); +} + +template +inline void AlignedAllocator::deallocate(T *p, [[maybe_unused]] std::size_t size) { + std::free(p); +} + template constexpr bool operator==(const AlignedAllocator &, const AlignedAllocator &) noexcept { return (T_alignment == U_alignment); From 33ed5d553427c0e908c218cc50cb9f1a525d8a51 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Thu, 19 Feb 2026 13:54:46 +0100 Subject: [PATCH 24/57] clean ssp sptrsv --- apps/maxbsp_ssp_sptrsv.cpp | 8 ++--- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 35 +++---------------- 2 files changed, 7 insertions(+), 36 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index c18f4f7d..7d0153ac 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -116,10 +116,8 @@ int main(int argc, char *argv[]) { sptrsv_kernel.SetupCsrNoPermutation(ssp_var_schedule); sptrsv_kernel.x_ = x.data(); sptrsv_kernel.b_ = b.data(); - FlatCheckpointCounterBarrier barrier(num_threads); - auto ops = Sptrsv::MakeBarrierOps(barrier); auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness2(ops); + sptrsv_kernel.SspLsolveStaleness2(); auto end = std::chrono::high_resolution_clock::now(); ssp_var_flat_total_time += std::chrono::duration(end - start).count(); if (iter == 0) { @@ -137,10 +135,8 @@ int main(int argc, char *argv[]) { sptrsv_kernel.SetupCsrNoPermutation(ssp_gl_schedule); sptrsv_kernel.x_ = x.data(); sptrsv_kernel.b_ = b.data(); - FlatCheckpointCounterBarrier barrier(num_threads); - auto ops = Sptrsv::MakeBarrierOps(barrier); auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness2(ops); + sptrsv_kernel.SspLsolveStaleness2(); 
auto end = std::chrono::high_resolution_clock::now(); ssp_gl_flat_total_time += std::chrono::duration(end - start).count(); if (iter == 0) { diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 7d08e32c..22dcc294 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -36,7 +36,6 @@ limitations under the License. # include # include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp" -# include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier_cached.hpp" # include "osp/bsp/model/BspInstance.hpp" # include "osp/bsp/model/BspSchedule.hpp" # include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" @@ -51,23 +50,6 @@ class Sptrsv { const BspInstance> *instance_; public: - struct BarrierOps { - void *ctx; - void (*arrive)(void *ctx, std::size_t threadId); - void (*wait)(void *ctx, std::size_t threadId, std::size_t diff); - }; - - template - static BarrierOps MakeBarrierOps(BarrierT &barrier) { - return BarrierOps{ - static_cast(&barrier), - [](void *ctx, std::size_t threadId) { - static_cast(ctx)->Arrive(threadId); - }, - [](void *ctx, std::size_t threadId, std::size_t diff) { - static_cast(ctx)->Wait(threadId, diff); - }}; - } std::vector val_; std::vector cscVal_; @@ -505,10 +487,11 @@ class Sptrsv { std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); } // SSP Lsolve with staleness=2 (allowing at most one superstep of lag). - // Barrier operations are injected via function pointers. - void SspLsolveStaleness2(const BarrierOps &barrierOps) { + // Uses FlatCheckpointCounterBarrier created internally. 
+ void SspLsolveStaleness2() { constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference const unsigned nthreads = instance_->NumberOfProcessors(); + FlatCheckpointCounterBarrier barrier(nthreads); auto *csr = instance_->GetComputationalDag().GetCSR(); const auto *outer = csr->outerIndexPtr(); @@ -520,7 +503,7 @@ class Sptrsv { const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { // Enforce staleness window before starting this superstep. - barrierOps.wait(barrierOps.ctx, proc, staleness - 1U); + barrier.Wait(proc, staleness - 1U); // Process nodes assigned to this (step, proc) pair. const size_t boundsStrSize = boundsArrayL_[step][proc].size(); for (size_t index = 0; index < boundsStrSize; index += 2) { @@ -539,19 +522,11 @@ class Sptrsv { } } // Signal completion of this superstep. - barrierOps.arrive(barrierOps.ctx, proc); + barrier.Arrive(proc); } } } - // Default SSP Lsolve uses the cached flat checkpoint counter barrier. 
- void SspLsolveStaleness2() { - const unsigned nthreads = instance_->NumberOfProcessors(); - FlatCheckpointCounterBarrierCached barrier(nthreads); - const BarrierOps ops = MakeBarrierOps(barrier); - SspLsolveStaleness2(ops); - } - virtual ~Sptrsv() = default; }; From 9fc69e94307f55cb351443fe717f083db04903d3 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 19 Feb 2026 14:20:30 +0100 Subject: [PATCH 25/57] made staleness a parameter --- apps/maxbsp_ssp_sptrsv.cpp | 12 ++++--- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 4 +-- .../GreedySchedulers/GrowLocalMaxBsp.hpp | 36 +++++++++---------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 7d0153ac..065cf572 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -20,7 +20,7 @@ using namespace osp; -#define EPSILON 1e-20 +#define EPSILON 1e-50 double L2NormalisedDiff(const std::vector &v, const std::vector &w) { assert(v.size() == w.size()); @@ -87,13 +87,15 @@ int main(int argc, char *argv[]) { BspArchitecture> architecture(num_threads, 1, 500); // configurable processors BspInstance> instance(graph, architecture); + constexpr unsigned staleness = 2U; + // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2) GreedyVarianceSspScheduler> ssp_var_scheduler; MaxBspSchedule> ssp_var_schedule(instance); - ssp_var_scheduler.ComputeSchedule(ssp_var_schedule); + ssp_var_scheduler.ComputeSspSchedule(ssp_var_schedule, staleness); // Create SSP-aware schedule using GrowLocalMaxBsp (staleness=2) - GrowLocalSSP> ssp_gl_scheduler; + GrowLocalSSP, staleness> ssp_gl_scheduler; MaxBspSchedule> ssp_gl_schedule(instance); ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule); @@ -117,7 +119,7 @@ int main(int argc, char *argv[]) { sptrsv_kernel.x_ = x.data(); sptrsv_kernel.b_ = b.data(); auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness2(); + 
sptrsv_kernel.SspLsolveStaleness(); auto end = std::chrono::high_resolution_clock::now(); ssp_var_flat_total_time += std::chrono::duration(end - start).count(); if (iter == 0) { @@ -136,7 +138,7 @@ int main(int argc, char *argv[]) { sptrsv_kernel.x_ = x.data(); sptrsv_kernel.b_ = b.data(); auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness2(); + sptrsv_kernel.SspLsolveStaleness(); auto end = std::chrono::high_resolution_clock::now(); ssp_gl_flat_total_time += std::chrono::duration(end - start).count(); if (iter == 0) { diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 22dcc294..0fbc80c5 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -488,8 +488,8 @@ class Sptrsv { // SSP Lsolve with staleness=2 (allowing at most one superstep of lag). // Uses FlatCheckpointCounterBarrier created internally. - void SspLsolveStaleness2() { - constexpr std::size_t staleness = 2U; // Maximum allowed superstep difference + template + void SspLsolveStaleness() { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp index cfd8f85f..ad088a4c 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp @@ -39,7 +39,7 @@ struct GrowLocalSSPParams { WeightT syncCostMultiplierParallelCheck_ = 4; }; -template +template class GrowLocalSSP : public MaxBspScheduler { static_assert(isDirectedGraphV); static_assert(hasVertexWeightsV); @@ -47,7 +47,7 @@ class GrowLocalSSP : public MaxBspScheduler { private: using VertexType = VertexIdxT; - static constexpr unsigned staleness{2U}; + static constexpr unsigned staleness{staleness_t}; 
GrowLocalSSPParams, VWorkwT> params_; /*! Vertices ready in current superstep */ @@ -89,18 +89,18 @@ class GrowLocalSSP : public MaxBspScheduler { std::string GetScheduleName() const override { return "GrowLocalSSP"; } }; -template -inline GrowLocalSSPParams, VWorkwT> &GrowLocalSSP::GetParameters() { +template +inline GrowLocalSSPParams, VWorkwT> &GrowLocalSSP::GetParameters() { return params_; } -template -inline const GrowLocalSSPParams, VWorkwT> &GrowLocalSSP::GetParameters() const { +template +inline const GrowLocalSSPParams, VWorkwT> &GrowLocalSSP::GetParameters() const { return params_; } -template -void GrowLocalSSP::Init(const unsigned numProcs) { +template +void GrowLocalSSP::Init(const unsigned numProcs) { currentlyReady_.clear(); for (auto &stepFutureReady : futureReady_) { @@ -125,8 +125,8 @@ void GrowLocalSSP::Init(const unsigned numProcs) { } } -template -void GrowLocalSSP::ReleaseMemory() { +template +void GrowLocalSSP::ReleaseMemory() { currentlyReady_.clear(); currentlyReady_.shrink_to_fit(); @@ -159,8 +159,8 @@ void GrowLocalSSP::ReleaseMemory() { } } -template -inline typename std::deque>::difference_type GrowLocalSSP::MaxAllReadyUsage( +template +inline typename std::deque>::difference_type GrowLocalSSP::MaxAllReadyUsage( const std::deque> &currentlyReady, const std::deque> &nextSuperstepReady) const { if constexpr (staleness == 1U) { return std::distance(currentlyReady.cbegin(), currentlyReady.cend()); @@ -176,8 +176,8 @@ inline typename std::deque>::difference_type GrowLocalSSP -bool GrowLocalSSP::ChanceToFinish(const unsigned superStep) const { +template +bool GrowLocalSSP::ChanceToFinish(const unsigned superStep) const { bool ans = std::all_of(futureReady_.cbegin(), futureReady_.cend(), [](const auto &deq) { return deq.empty(); }); if (ans) { @@ -204,13 +204,13 @@ bool GrowLocalSSP::ChanceToFinish(const unsigned superStep) const { return ans; } -template -ReturnStatus GrowLocalSSP::ComputeSchedule(BspSchedule &schedule) { +template +ReturnStatus 
GrowLocalSSP::ComputeSchedule(BspSchedule &schedule) { return MaxBspScheduler::ComputeSchedule(schedule); } -template -ReturnStatus GrowLocalSSP::ComputeSchedule(MaxBspSchedule &schedule) { +template +ReturnStatus GrowLocalSSP::ComputeSchedule(MaxBspSchedule &schedule) { const BspInstance &instance = schedule.GetInstance(); const GraphT &graph = instance.GetComputationalDag(); const VertexType numVertices = graph.NumVertices(); From ae1c62b5a601c7b980e254dad8927bcee746780c Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 19 Feb 2026 14:23:21 +0100 Subject: [PATCH 26/57] fix text --- apps/maxbsp_ssp_sptrsv.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 065cf572..a9b3416c 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -89,12 +89,12 @@ int main(int argc, char *argv[]) { constexpr unsigned staleness = 2U; - // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness=2) + // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness) GreedyVarianceSspScheduler> ssp_var_scheduler; MaxBspSchedule> ssp_var_schedule(instance); ssp_var_scheduler.ComputeSspSchedule(ssp_var_schedule, staleness); - // Create SSP-aware schedule using GrowLocalMaxBsp (staleness=2) + // Create SSP-aware schedule using GrowLocalMaxBsp (staleness) GrowLocalSSP, staleness> ssp_gl_scheduler; MaxBspSchedule> ssp_gl_schedule(instance); ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule); From e4460ee30fb06dae208299a2a68594c86d31dcec Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Thu, 19 Feb 2026 14:42:20 +0100 Subject: [PATCH 27/57] improved parameter --- include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp index ad088a4c..64a5b97f 100644 --- 
a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp @@ -34,7 +34,7 @@ namespace osp { template struct GrowLocalSSPParams { - VertT minSuperstepSize_ = 10; + VertT minSuperstepSize_ = 20; WeightT syncCostMultiplierMinSuperstepWeight_ = 1; WeightT syncCostMultiplierParallelCheck_ = 4; }; From 8f4c8031722b9b07b45955cc49ff33ba5eac8fb7 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Fri, 20 Feb 2026 12:09:27 +0100 Subject: [PATCH 28/57] Benchmark for ssp --- apps/maxbsp_ssp_sptrsv.cpp | 630 ++++++++++++++++++++++++++----------- 1 file changed, 454 insertions(+), 176 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index a9b3416c..57ab6ae8 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -1,11 +1,30 @@ /* * maxbsp_ssp_sptrsv.cpp - * Demonstrates maxbsp scheduling with staleness=2, then runs SpTRSV with SSP kernel. + * Benchmark for SpTRSV using: + * - variance_ssp + * - growlocal_ssp + * - growlocal + * - eigen_serial + * + * Outputs per-iteration runtime rows to CSV: + * graph,Algorithm,processors,time to compute schedule,schedule supersteps, + * schedule synchronization costs,staleness,runtime */ #include +#include #include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include #include #include @@ -20,25 +39,80 @@ using namespace osp; -#define EPSILON 1e-50 +namespace { -double L2NormalisedDiff(const std::vector &v, const std::vector &w) { - assert(v.size() == w.size()); - double l2diff = 0.0; - double frobNorm = 0.0; - for (std::size_t i = 0U; i < v.size(); ++i) { - const double absdiff = std::abs(v[i] - w[i]); - l2diff += absdiff * absdiff; +constexpr double EPSILON = 1e-12; +constexpr unsigned kDefaultStaleness = 2U; - const double vAbs = std::abs(v[i]); - const double wAbs = std::abs(w[i]); +enum class Algorithm { + VarianceSsp, + GrowLocalSsp, + 
GrowLocal, + EigenSerial +}; + +struct Args { + std::string inputPath; + std::string outputCsv = "sptrsv_benchmark.csv"; + int iterations = 100; + unsigned processors = 16U; + std::set algorithms; +}; + +struct CsvRow { + std::string graph; + std::string algorithm; + unsigned processors; + double scheduleTimeSeconds; + unsigned supersteps; + double scheduleSyncCosts; + unsigned staleness; + double runtimeSeconds; +}; + +struct SummaryKey { + std::string graph; + std::string algorithm; + unsigned processors; + unsigned staleness; + + bool operator<(const SummaryKey &other) const { + if (graph != other.graph) { + return graph < other.graph; + } + if (algorithm != other.algorithm) { + return algorithm < other.algorithm; + } + if (processors != other.processors) { + return processors < other.processors; + } + return staleness < other.staleness; + } +}; + +struct SummaryAgg { + double scheduleTimeSeconds = 0.0; + unsigned supersteps = 0U; + double scheduleSyncCosts = 0.0; + double sumLogRuntime = 0.0; + std::size_t samples = 0U; +}; - frobNorm += ((vAbs * vAbs) + (wAbs * wAbs)) / 2.0; +std::string CsvEscape(const std::string &s) { + if (s.find(',') == std::string::npos && s.find('"') == std::string::npos && s.find('\n') == std::string::npos + && s.find('\r') == std::string::npos) { + return s; } - l2diff = std::sqrt(l2diff); - frobNorm = std::sqrt(frobNorm); - const double ratio = l2diff / (frobNorm + EPSILON); - return ratio; + std::string out = "\""; + for (const char c : s) { + if (c == '"') { + out += "\"\""; + } else { + out.push_back(c); + } + } + out += "\""; + return out; } double LInftyNormalisedDiff(const std::vector &v, const std::vector &w) { @@ -47,194 +121,398 @@ double LInftyNormalisedDiff(const std::vector &v, const std::vector 1) { - filename = argv[1]; - } - if (argc > 2) { - num_iterations = std::stoi(argv[2]); - } - if (const char *omp_env = std::getenv("OMP_NUM_THREADS")) { - num_threads = static_cast(std::stoul(omp_env)); - } else if (argc > 3) { - 
num_threads = static_cast(std::stoul(argv[3])); - } - - // Load matrix - Eigen::SparseMatrix lCsr; - bool matrixLoadSuccess = Eigen::loadMarket(lCsr, filename); - if (!matrixLoadSuccess) { - std::cerr << "Failed to read matrix from " << filename << std::endl; - return 1; +void PrintUsage(const char *prog) { + std::cout << "Usage:\n" + << " " << prog + << " --input [--output ] [--iterations ] [--processors

]\n" + << " [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n" + << "Examples:\n" + << " " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n" + << " " << prog + << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n"; +} + +bool ParseArgs(int argc, char *argv[], Args &args) { + if (const char *ompEnv = std::getenv("OMP_NUM_THREADS")) { + args.processors = static_cast(std::stoul(ompEnv)); } - std::cout << "Loaded matrix of size " << lCsr.rows() << " x " << lCsr.cols() << " with " << lCsr.nonZeros() << " non-zeros.\n"; - - // Setup graph and architecture - SparseMatrixImp graph; - graph.SetCsr(&lCsr); - Eigen::SparseMatrix lCsc = lCsr; - graph.SetCsc(&lCsc); - BspArchitecture> architecture(num_threads, 1, 500); // configurable processors - BspInstance> instance(graph, architecture); - - constexpr unsigned staleness = 2U; - - // Create SSP-aware schedule using GreedyVarianceSspScheduler (staleness) - GreedyVarianceSspScheduler> ssp_var_scheduler; - MaxBspSchedule> ssp_var_schedule(instance); - ssp_var_scheduler.ComputeSspSchedule(ssp_var_schedule, staleness); - - // Create SSP-aware schedule using GrowLocalMaxBsp (staleness) - GrowLocalSSP, staleness> ssp_gl_scheduler; - MaxBspSchedule> ssp_gl_schedule(instance); - ssp_gl_scheduler.ComputeSchedule(ssp_gl_schedule); - - // Create a non-SSP schedule using GrowLocalAutoCores - GrowLocalAutoCores> growlocal_scheduler; - BspSchedule> growlocal_schedule(instance); - growlocal_scheduler.ComputeSchedule(growlocal_schedule); - - // Setup SpTRSV kernel - Sptrsv sptrsv_kernel(instance); - - size_t n = static_cast(lCsc.cols()); - - // Benchmark SSP Variance L-solve - double ssp_var_flat_total_time = 0.0; - std::vector ssp_var_flat_result(n, 0.0); - for (int iter = 0; iter < num_iterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv_kernel.SetupCsrNoPermutation(ssp_var_schedule); - 
sptrsv_kernel.x_ = x.data(); - sptrsv_kernel.b_ = b.data(); - auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness(); - auto end = std::chrono::high_resolution_clock::now(); - ssp_var_flat_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) { - ssp_var_flat_result = std::vector(x.begin(), x.end()); + + for (int i = 1; i < argc; ++i) { + const std::string flag = argv[i]; + + const bool needsValue + = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors"); + if (needsValue && i + 1 >= argc) { + std::cerr << "Missing value for " << flag << "\n"; + return false; } - } - double ssp_var_flat_avg_time = ssp_var_flat_total_time / num_iterations; - - // Benchmark SSP GrowLocal L-solve - double ssp_gl_flat_total_time = 0.0; - std::vector ssp_gl_flat_result(n, 0.0); - for (int iter = 0; iter < num_iterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv_kernel.SetupCsrNoPermutation(ssp_gl_schedule); - sptrsv_kernel.x_ = x.data(); - sptrsv_kernel.b_ = b.data(); - auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.SspLsolveStaleness(); - auto end = std::chrono::high_resolution_clock::now(); - ssp_gl_flat_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) { - ssp_gl_flat_result = std::vector(x.begin(), x.end()); + + if (flag == "--input") { + args.inputPath = argv[++i]; + } else if (flag == "--output") { + args.outputCsv = argv[++i]; + } else if (flag == "--iterations") { + args.iterations = std::stoi(argv[++i]); + } else if (flag == "--processors") { + args.processors = static_cast(std::stoul(argv[++i])); + } else if (flag == "--variance-ssp") { + args.algorithms.insert(Algorithm::VarianceSsp); + } else if (flag == "--growlocal-ssp") { + args.algorithms.insert(Algorithm::GrowLocalSsp); + } else if (flag == "--growlocal") { + args.algorithms.insert(Algorithm::GrowLocal); + } else if (flag == "--eigen-serial") { + 
args.algorithms.insert(Algorithm::EigenSerial); + } else if (flag == "--all") { + args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, Algorithm::GrowLocal, Algorithm::EigenSerial}; + } else if (flag == "--help" || flag == "-h") { + PrintUsage(argv[0]); + return false; + } else { + std::cerr << "Unknown option: " << flag << "\n"; + return false; } } - double ssp_gl_flat_avg_time = ssp_gl_flat_total_time / num_iterations; - - // Benchmark GrowLocalAutoCores schedule with non-SSP L-solve (no permutation) - double growlocal_total_time = 0.0; - std::vector growlocal_result(n, 0.0); - for (int iter = 0; iter < num_iterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv_kernel.SetupCsrNoPermutation(growlocal_schedule); - sptrsv_kernel.x_ = x.data(); - sptrsv_kernel.b_ = b.data(); - auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.LsolveNoPermutation(); - auto end = std::chrono::high_resolution_clock::now(); - growlocal_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) { - growlocal_result = std::vector(x.begin(), x.end()); + + if (args.inputPath.empty()) { + std::cerr << "--input is required\n"; + return false; + } + if (args.iterations <= 0) { + std::cerr << "--iterations must be > 0\n"; + return false; + } + if (args.processors == 0U) { + std::cerr << "--processors must be > 0\n"; + return false; + } + if (args.algorithms.empty()) { + std::cerr << "No algorithm selected. 
Use --all or explicit flags.\n"; + return false; + } + + return true; +} + +std::vector CollectInputGraphs(const std::string &inputPath) { + std::vector inputs; + const std::filesystem::path p(inputPath); + + if (!std::filesystem::exists(p)) { + throw std::runtime_error("Input path does not exist: " + inputPath); + } + + if (std::filesystem::is_regular_file(p)) { + if (p.extension() == ".mtx") { + inputs.push_back(p); } + return inputs; } - double growlocal_avg_time = growlocal_total_time / num_iterations; - - // Benchmark serial L-solve - double serial_total_time = 0.0; - std::vector serial_result(n, 0.0); - for (int iter = 0; iter < num_iterations; ++iter) { - std::vector x_serial(n, 0.0); - std::vector b_serial(n, 1.0); - sptrsv_kernel.x_ = x_serial.data(); - sptrsv_kernel.b_ = b_serial.data(); - auto start = std::chrono::high_resolution_clock::now(); - sptrsv_kernel.LsolveSerial(); - auto end = std::chrono::high_resolution_clock::now(); - serial_total_time += std::chrono::duration(end - start).count(); - if (iter == 0) { - serial_result = std::vector(x_serial.begin(), x_serial.end()); + + if (std::filesystem::is_directory(p)) { + for (const auto &entry : std::filesystem::recursive_directory_iterator(p)) { + if (!entry.is_regular_file()) { + continue; + } + if (entry.path().extension() == ".mtx") { + inputs.push_back(entry.path()); + } } } - double serial_avg_time = serial_total_time / num_iterations; - // Compare results - const double varDiff = LInftyNormalisedDiff(ssp_var_flat_result, serial_result); + std::sort(inputs.begin(), inputs.end()); + return inputs; +} + +void EnsureCsvHeader(std::ofstream &csv) { + csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness,RuntimeSeconds\n"; +} - std::cout << "Max relative difference between SSP Variance and serial L-solve: " << varDiff << std::endl; - if (varDiff < EPSILON) { - std::cout << "SSP Variance L-solve matches serial L-solve!" 
<< std::endl; - } else { - std::cout << "SSP Variance L-solve does NOT match serial L-solve!" << std::endl; +void EnsureSummaryCsvHeader(std::ofstream &csv) { + csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness," + "RuntimeSamples,RuntimeGeometricMeanSeconds\n"; +} + +void WriteCsvRow(std::ofstream &csv, const CsvRow &row) { + csv << CsvEscape(row.graph) << "," << row.algorithm << "," << row.processors << "," << row.scheduleTimeSeconds << "," + << row.supersteps << "," << row.scheduleSyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "\n"; +} + +std::string BuildSummaryCsvPath(const std::string &detailPath) { + const std::filesystem::path p(detailPath); + const std::string stem = p.stem().string(); + const std::string ext = p.has_extension() ? p.extension().string() : std::string(".csv"); + const std::filesystem::path summary = p.parent_path() / (stem + "_summary" + ext); + return summary.string(); +} + +std::string FormatExperimentStartTimestampForFilename() { + const std::time_t now = std::time(nullptr); + std::tm localTm{}; +#ifdef _WIN32 + localtime_s(&localTm, &now); +#else + localtime_r(&now, &localTm); +#endif + std::ostringstream oss; + oss << std::put_time(&localTm, "%d-%m-%Y_%H%M"); + return oss.str(); +} + +std::string BuildTimestampedCsvPath(const std::string &basePath, const std::string &timestamp) { + const std::filesystem::path p(basePath); + const std::string stem = p.stem().string(); + const std::string ext = p.has_extension() ? 
p.extension().string() : std::string(".csv"); + const std::filesystem::path out = p.parent_path() / (stem + "_" + timestamp + ext); + return out.string(); +} + +template +double ComputeScheduleSyncCosts(const BspInstance> &instance, const ScheduleT &schedule) { + if (schedule.NumberOfSupersteps() == 0U) { + return 0.0; } + return static_cast(schedule.NumberOfSupersteps() - 1U) * static_cast(instance.SynchronisationCosts()); +} + +} // namespace - const double GLSSPDiff = LInftyNormalisedDiff(ssp_gl_flat_result, serial_result); +int main(int argc, char *argv[]) { + const std::string experimentStart = FormatExperimentStartTimestampForFilename(); - std::cout << "Max relative difference between SSP GrowLocal and serial L-solve: " << GLSSPDiff << std::endl; - if (GLSSPDiff < EPSILON) { - std::cout << "SSP GrowLocal L-solve matches serial L-solve!" << std::endl; - } else { - std::cout << "SSP GrowLocal L-solve does NOT match serial L-solve!" << std::endl; + Args args; + if (!ParseArgs(argc, argv, args)) { + PrintUsage(argv[0]); + return 1; } - const double GLPDiff = LInftyNormalisedDiff(growlocal_result, serial_result); + std::vector graphFiles; + try { + graphFiles = CollectInputGraphs(args.inputPath); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } - std::cout << "Max relative difference between GrowLocal and serial L-solve: " << GLPDiff << std::endl; - if (GLPDiff < EPSILON) { - std::cout << "GrowLocal L-solve matches serial L-solve!" << std::endl; - } else { - std::cout << "GrowLocal L-solve does NOT match serial L-solve!" 
<< std::endl; + if (graphFiles.empty()) { + std::cerr << "No .mtx files found at input path: " << args.inputPath << std::endl; + return 1; } - std::cout << "Average SSP Variance L-solve time (" << num_iterations << " runs): " << ssp_var_flat_avg_time << " seconds" - << std::endl; - std::cout << "Average SSP GrowLocal L-solve time (" << num_iterations << " runs): " << ssp_gl_flat_avg_time << " seconds" - << std::endl; - std::cout << "Average GrowLocalAutoCores L-solve time (" << num_iterations << " runs): " << growlocal_avg_time << " seconds" - << std::endl; - std::cout << "Average serial L-solve time (" << num_iterations << " runs): " << serial_avg_time << " seconds" << std::endl << std::endl; + const std::string detailCsvPath = BuildTimestampedCsvPath(args.outputCsv, experimentStart); + std::ofstream csv(detailCsvPath, std::ios::out | std::ios::trunc); + if (!csv.is_open()) { + std::cerr << "Failed to open CSV output: " << detailCsvPath << std::endl; + return 1; + } + EnsureCsvHeader(csv); - if (ssp_var_flat_avg_time > 0.0) { - std::cout << "Speedup (serial/SSP Var): " << (serial_avg_time / ssp_var_flat_avg_time) << "x" << std::endl; + const std::string summaryCsvPath = BuildSummaryCsvPath(detailCsvPath); + std::ofstream summaryCsv(summaryCsvPath, std::ios::out | std::ios::trunc); + if (!summaryCsv.is_open()) { + std::cerr << "Failed to open summary CSV output: " << summaryCsvPath << std::endl; + return 1; } - if (ssp_gl_flat_avg_time > 0.0) { - std::cout << "Speedup (serial/SSP GL): " << (serial_avg_time / ssp_gl_flat_avg_time) << "x" << std::endl; + EnsureSummaryCsvHeader(summaryCsv); + + std::cout << "Running benchmark on " << graphFiles.size() << " graph(s), iterations=" << args.iterations + << ", processors=" << args.processors << std::endl; + std::cout << "Experiment id timestamp: " << experimentStart << std::endl; + + std::vector bufferedRows; + bufferedRows.reserve(graphFiles.size() * args.algorithms.size() * static_cast(args.iterations)); + + for (const 
auto &graphPath : graphFiles) { + const std::string graphName = graphPath.filename().string(); + + Eigen::SparseMatrix lCsr; + if (!Eigen::loadMarket(lCsr, graphPath.string())) { + std::cerr << "Failed to load matrix: " << graphPath << std::endl; + continue; + } + + Eigen::SparseMatrix lCsc = lCsr; + + SparseMatrixImp graph; + graph.SetCsr(&lCsr); + graph.SetCsc(&lCsc); + + BspArchitecture> architecture(args.processors, 1, 500); + BspInstance> instance(graph, architecture); + + Sptrsv sptrsv(instance); + const std::size_t n = static_cast(lCsr.cols()); + + std::vector serialRefX(n, 0.0); + std::vector serialB(n, 1.0); + sptrsv.x_ = serialRefX.data(); + sptrsv.b_ = serialB.data(); + sptrsv.LsolveSerial(); + + std::cout << "Graph: " << graphName << " (" << lCsr.rows() << "x" << lCsr.cols() << ", nnz=" << lCsr.nonZeros() << ")\n"; + + if (args.algorithms.count(Algorithm::VarianceSsp) > 0U) { + GreedyVarianceSspScheduler> scheduler; + MaxBspSchedule> schedule(instance); + + const auto t0 = std::chrono::high_resolution_clock::now(); + scheduler.ComputeSspSchedule(schedule, kDefaultStaleness); + const auto t1 = std::chrono::high_resolution_clock::now(); + const double scheduleTime = std::chrono::duration(t1 - t0).count(); + + sptrsv.SetupCsrNoPermutation(schedule); + const unsigned supersteps = schedule.NumberOfSupersteps(); + const double syncCosts = ComputeScheduleSyncCosts(instance, schedule); + + for (int iter = 0; iter < args.iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv.x_ = x.data(); + sptrsv.b_ = b.data(); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.SspLsolveStaleness(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter == 0) { + const double diff = LInftyNormalisedDiff(x, serialRefX); + std::cout << " variance_ssp first-run max relative diff vs serial: " << diff << std::endl; + } + + 
bufferedRows.push_back(CsvRow{graphName, + "variance_ssp", + args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtime}); + } + } + + if (args.algorithms.count(Algorithm::GrowLocalSsp) > 0U) { + GrowLocalSSP, kDefaultStaleness> scheduler; + MaxBspSchedule> schedule(instance); + + const auto t0 = std::chrono::high_resolution_clock::now(); + scheduler.ComputeSchedule(schedule); + const auto t1 = std::chrono::high_resolution_clock::now(); + const double scheduleTime = std::chrono::duration(t1 - t0).count(); + + sptrsv.SetupCsrNoPermutation(schedule); + const unsigned supersteps = schedule.NumberOfSupersteps(); + const double syncCosts = ComputeScheduleSyncCosts(instance, schedule); + + for (int iter = 0; iter < args.iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv.x_ = x.data(); + sptrsv.b_ = b.data(); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.SspLsolveStaleness(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter == 0) { + const double diff = LInftyNormalisedDiff(x, serialRefX); + std::cout << " growlocal_ssp first-run max relative diff vs serial: " << diff << std::endl; + } + + bufferedRows.push_back(CsvRow{graphName, + "growlocal_ssp", + args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtime}); + } + } + + if (args.algorithms.count(Algorithm::GrowLocal) > 0U) { + GrowLocalAutoCores> scheduler; + BspSchedule> schedule(instance); + + const auto t0 = std::chrono::high_resolution_clock::now(); + scheduler.ComputeSchedule(schedule); + const auto t1 = std::chrono::high_resolution_clock::now(); + const double scheduleTime = std::chrono::duration(t1 - t0).count(); + + sptrsv.SetupCsrNoPermutation(schedule); + const unsigned supersteps = schedule.NumberOfSupersteps(); + const double syncCosts = ComputeScheduleSyncCosts(instance, schedule); + + for (int iter = 0; 
iter < args.iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv.x_ = x.data(); + sptrsv.b_ = b.data(); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.LsolveNoPermutation(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter == 0) { + const double diff = LInftyNormalisedDiff(x, serialRefX); + std::cout << " growlocal first-run max relative diff vs serial: " << diff << std::endl; + } + + bufferedRows.push_back(CsvRow{ + graphName, "growlocal", args.processors, scheduleTime, supersteps, syncCosts, 1U, runtime}); + } + } + + if (args.algorithms.count(Algorithm::EigenSerial) > 0U) { + for (int iter = 0; iter < args.iterations; ++iter) { + std::vector x(n, 0.0); + std::vector b(n, 1.0); + sptrsv.x_ = x.data(); + sptrsv.b_ = b.data(); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.LsolveSerial(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + bufferedRows.push_back(CsvRow{graphName, "eigen_serial", 1U, 0.0, 1U, 0.0, 0U, runtime}); + } + } } - if (growlocal_avg_time > 0.0) { - std::cout << "Speedup (serial/GrowLocalAutoCores): " << (serial_avg_time / growlocal_avg_time) << "x" << std::endl; + + for (const CsvRow &row : bufferedRows) { + WriteCsvRow(csv, row); } - if (ssp_var_flat_avg_time > 0.0) { - std::cout << "Speedup (GrowLocalAutoCores/SSP Var): " << (growlocal_avg_time / ssp_var_flat_avg_time) << "x" << std::endl; + + std::map summary; + constexpr double kMinRuntime = 1e-15; + for (const CsvRow &row : bufferedRows) { + SummaryKey key{row.graph, row.algorithm, row.processors, row.staleness}; + SummaryAgg &agg = summary[key]; + if (agg.samples == 0U) { + agg.scheduleTimeSeconds = row.scheduleTimeSeconds; + agg.supersteps = row.supersteps; + agg.scheduleSyncCosts = row.scheduleSyncCosts; + } + agg.sumLogRuntime += 
std::log(std::max(row.runtimeSeconds, kMinRuntime)); + ++agg.samples; } - if (ssp_gl_flat_avg_time > 0.0) { - std::cout << "Speedup (GrowLocalAutoCores/SSP GL): " << (growlocal_avg_time / ssp_gl_flat_avg_time) << "x" << std::endl; + + for (const auto &[key, agg] : summary) { + const double geomean = std::exp(agg.sumLogRuntime / static_cast(agg.samples)); + summaryCsv << CsvEscape(key.graph) << "," << key.algorithm << "," << key.processors << "," << agg.scheduleTimeSeconds + << "," << agg.supersteps << "," << agg.scheduleSyncCosts << "," << key.staleness + << "," << agg.samples << "," << geomean << "\n"; } + std::cout << "Benchmark complete. CSV written to: " << detailCsvPath << std::endl; + std::cout << "Summary CSV written to: " << summaryCsvPath << std::endl; return 0; } From 65d169b0514e20c984fef99c9556767f80f0beb2 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Fri, 20 Feb 2026 14:34:04 +0100 Subject: [PATCH 29/57] benchmark additions and checkpoints --- apps/maxbsp_ssp_sptrsv.cpp | 179 ++++++++++++++++++++++++------------- 1 file changed, 118 insertions(+), 61 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 57ab6ae8..988653be 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -43,12 +44,15 @@ namespace { constexpr double EPSILON = 1e-12; constexpr unsigned kDefaultStaleness = 2U; +constexpr int defaultSynchronisationCosts = 500; + +constexpr int preMeasureIterations = 2; enum class Algorithm { VarianceSsp, GrowLocalSsp, GrowLocal, - EigenSerial + Serial }; struct Args { @@ -65,9 +69,10 @@ struct CsvRow { unsigned processors; double scheduleTimeSeconds; unsigned supersteps; - double scheduleSyncCosts; + int SyncCosts; unsigned staleness; double runtimeSeconds; + bool correctness; }; struct SummaryKey { @@ -93,9 +98,10 @@ struct SummaryKey { struct SummaryAgg { double scheduleTimeSeconds = 0.0; unsigned 
supersteps = 0U; - double scheduleSyncCosts = 0.0; + int SyncCosts = 0; double sumLogRuntime = 0.0; std::size_t samples = 0U; + bool correctness = false; }; std::string CsvEscape(const std::string &s) { @@ -167,9 +173,9 @@ bool ParseArgs(int argc, char *argv[], Args &args) { } else if (flag == "--growlocal") { args.algorithms.insert(Algorithm::GrowLocal); } else if (flag == "--eigen-serial") { - args.algorithms.insert(Algorithm::EigenSerial); + args.algorithms.insert(Algorithm::Serial); } else if (flag == "--all") { - args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, Algorithm::GrowLocal, Algorithm::EigenSerial}; + args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, Algorithm::GrowLocal, Algorithm::Serial}; } else if (flag == "--help" || flag == "-h") { PrintUsage(argv[0]); return false; @@ -201,7 +207,11 @@ bool ParseArgs(int argc, char *argv[], Args &args) { std::vector CollectInputGraphs(const std::string &inputPath) { std::vector inputs; - const std::filesystem::path p(inputPath); + std::filesystem::path p(inputPath); + + while (std::filesystem::exists(p) && std::filesystem::is_symlink(p)) { + p = std::filesystem::read_symlink(p); + } if (!std::filesystem::exists(p)) { throw std::runtime_error("Input path does not exist: " + inputPath); @@ -211,16 +221,18 @@ std::vector CollectInputGraphs(const std::string &inputPa if (p.extension() == ".mtx") { inputs.push_back(p); } - return inputs; - } - - if (std::filesystem::is_directory(p)) { + } else if (std::filesystem::is_directory(p)) { for (const auto &entry : std::filesystem::recursive_directory_iterator(p)) { - if (!entry.is_regular_file()) { + auto entryPath = entry.path(); + while (std::filesystem::exists(entryPath) && std::filesystem::is_symlink(entryPath)) { + entryPath = std::filesystem::read_symlink(entryPath); + } + + if (!std::filesystem::is_regular_file(entryPath)) { continue; } - if (entry.path().extension() == ".mtx") { - inputs.push_back(entry.path()); + if 
(entryPath.extension() == ".mtx") { + inputs.push_back(entryPath); } } } @@ -230,17 +242,17 @@ std::vector CollectInputGraphs(const std::string &inputPa } void EnsureCsvHeader(std::ofstream &csv) { - csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness,RuntimeSeconds\n"; + csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness,RuntimeSeconds,Correctness\n"; } void EnsureSummaryCsvHeader(std::ofstream &csv) { - csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,ScheduleSynchronizationCosts,Staleness," - "RuntimeSamples,RuntimeGeometricMeanSeconds\n"; + csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness," + "RuntimeSamples,RuntimeGeometricMeanSeconds,Correctness\n"; } void WriteCsvRow(std::ofstream &csv, const CsvRow &row) { csv << CsvEscape(row.graph) << "," << row.algorithm << "," << row.processors << "," << row.scheduleTimeSeconds << "," - << row.supersteps << "," << row.scheduleSyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "\n"; + << row.supersteps << "," << row.SyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "," << row.correctness << "\n"; } std::string BuildSummaryCsvPath(const std::string &detailPath) { @@ -272,12 +284,8 @@ std::string BuildTimestampedCsvPath(const std::string &basePath, const std::stri return out.string(); } -template -double ComputeScheduleSyncCosts(const BspInstance> &instance, const ScheduleT &schedule) { - if (schedule.NumberOfSupersteps() == 0U) { - return 0.0; - } - return static_cast(schedule.NumberOfSupersteps() - 1U) * static_cast(instance.SynchronisationCosts()); +int ComputeSyncCosts(const BspInstance> &instance) { + return instance.GetArchitecture().SynchronisationCosts(); } } // namespace @@ -326,6 +334,7 @@ int main(int argc, char *argv[]) { std::vector bufferedRows; bufferedRows.reserve(graphFiles.size() * 
args.algorithms.size() * static_cast(args.iterations)); + typename std::vector::difference_type writtenEntries = 0U; for (const auto &graphPath : graphFiles) { const std::string graphName = graphPath.filename().string(); @@ -342,7 +351,7 @@ int main(int argc, char *argv[]) { graph.SetCsr(&lCsr); graph.SetCsc(&lCsc); - BspArchitecture> architecture(args.processors, 1, 500); + BspArchitecture> architecture(args.processors, 1, defaultSynchronisationCosts); BspInstance> instance(graph, architecture); Sptrsv sptrsv(instance); @@ -367,9 +376,10 @@ int main(int argc, char *argv[]) { sptrsv.SetupCsrNoPermutation(schedule); const unsigned supersteps = schedule.NumberOfSupersteps(); - const double syncCosts = ComputeScheduleSyncCosts(instance, schedule); + const int syncCosts = ComputeSyncCosts(instance); - for (int iter = 0; iter < args.iterations; ++iter) { + bool correct = false; + for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { std::vector x(n, 0.0); std::vector b(n, 1.0); sptrsv.x_ = x.data(); @@ -382,17 +392,26 @@ int main(int argc, char *argv[]) { if (iter == 0) { const double diff = LInftyNormalisedDiff(x, serialRefX); - std::cout << " variance_ssp first-run max relative diff vs serial: " << diff << std::endl; + correct = (diff < EPSILON); + std::cout << " Variance_SSP first-run max relative diff vs serial: " << diff << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Variance_SSP", + args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtime, + correct}); } + } - bufferedRows.push_back(CsvRow{graphName, - "variance_ssp", - args.processors, - scheduleTime, - supersteps, - syncCosts, - kDefaultStaleness, - runtime}); + for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { + WriteCsvRow(csv, *it); + ++writtenEntries; } } @@ -407,9 +426,10 @@ int main(int argc, char *argv[]) { 
sptrsv.SetupCsrNoPermutation(schedule); const unsigned supersteps = schedule.NumberOfSupersteps(); - const double syncCosts = ComputeScheduleSyncCosts(instance, schedule); + const int syncCosts = ComputeSyncCosts(instance); - for (int iter = 0; iter < args.iterations; ++iter) { + bool correct = false; + for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { std::vector x(n, 0.0); std::vector b(n, 1.0); sptrsv.x_ = x.data(); @@ -422,17 +442,26 @@ int main(int argc, char *argv[]) { if (iter == 0) { const double diff = LInftyNormalisedDiff(x, serialRefX); - std::cout << " growlocal_ssp first-run max relative diff vs serial: " << diff << std::endl; + correct = (diff < EPSILON); + std::cout << " Growlocal_SSP first-run max relative diff vs serial: " << diff << std::endl; } - bufferedRows.push_back(CsvRow{graphName, - "growlocal_ssp", - args.processors, - scheduleTime, - supersteps, - syncCosts, - kDefaultStaleness, - runtime}); + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Growlocal_SSP", + args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtime, + correct}); + } + } + + for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { + WriteCsvRow(csv, *it); + ++writtenEntries; } } @@ -447,9 +476,10 @@ int main(int argc, char *argv[]) { sptrsv.SetupCsrNoPermutation(schedule); const unsigned supersteps = schedule.NumberOfSupersteps(); - const double syncCosts = ComputeScheduleSyncCosts(instance, schedule); + const int syncCosts = ComputeSyncCosts(instance); - for (int iter = 0; iter < args.iterations; ++iter) { + bool correct; + for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { std::vector x(n, 0.0); std::vector b(n, 1.0); sptrsv.x_ = x.data(); @@ -462,16 +492,31 @@ int main(int argc, char *argv[]) { if (iter == 0) { const double diff = LInftyNormalisedDiff(x, serialRefX); - std::cout << " growlocal first-run max 
relative diff vs serial: " << diff << std::endl; + correct = (diff < EPSILON); + std::cout << " Growlocal first-run max relative diff vs serial: " << diff << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Growlocal", + args.processors, + scheduleTime, + supersteps, + syncCosts, + 1U, + runtime, + correct}); } + } - bufferedRows.push_back(CsvRow{ - graphName, "growlocal", args.processors, scheduleTime, supersteps, syncCosts, 1U, runtime}); + for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { + WriteCsvRow(csv, *it); + ++writtenEntries; } } - if (args.algorithms.count(Algorithm::EigenSerial) > 0U) { - for (int iter = 0; iter < args.iterations; ++iter) { + if (args.algorithms.count(Algorithm::Serial) > 0U) { + for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { std::vector x(n, 0.0); std::vector b(n, 1.0); sptrsv.x_ = x.data(); @@ -482,13 +527,24 @@ int main(int argc, char *argv[]) { const auto e = std::chrono::high_resolution_clock::now(); const double runtime = std::chrono::duration(e - s).count(); - bufferedRows.push_back(CsvRow{graphName, "eigen_serial", 1U, 0.0, 1U, 0.0, 0U, runtime}); + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Serial", + 1U, + 0.0, + 1U, + 0, + 1U, + runtime, + true}); + } } - } - } - for (const CsvRow &row : bufferedRows) { - WriteCsvRow(csv, row); + for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { + WriteCsvRow(csv, *it); + ++writtenEntries; + } + } } std::map summary; @@ -499,7 +555,8 @@ int main(int argc, char *argv[]) { if (agg.samples == 0U) { agg.scheduleTimeSeconds = row.scheduleTimeSeconds; agg.supersteps = row.supersteps; - agg.scheduleSyncCosts = row.scheduleSyncCosts; + agg.SyncCosts = row.SyncCosts; + agg.correctness = row.correctness; } agg.sumLogRuntime += std::log(std::max(row.runtimeSeconds, kMinRuntime)); 
++agg.samples; @@ -508,8 +565,8 @@ int main(int argc, char *argv[]) { for (const auto &[key, agg] : summary) { const double geomean = std::exp(agg.sumLogRuntime / static_cast(agg.samples)); summaryCsv << CsvEscape(key.graph) << "," << key.algorithm << "," << key.processors << "," << agg.scheduleTimeSeconds - << "," << agg.supersteps << "," << agg.scheduleSyncCosts << "," << key.staleness - << "," << agg.samples << "," << geomean << "\n"; + << "," << agg.supersteps << "," << agg.SyncCosts << "," << key.staleness + << "," << agg.samples << "," << geomean << "," << agg.correctness << "\n"; } std::cout << "Benchmark complete. CSV written to: " << detailCsvPath << std::endl; From 7ad26e7191a3e8c7202382914bdcc46cfa2aabb7 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Mon, 23 Feb 2026 17:06:52 +0100 Subject: [PATCH 30/57] wait at barrier only if there is something to compute --- include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 0fbc80c5..d2461e7f 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -502,10 +502,12 @@ class Sptrsv { { const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { - // Enforce staleness window before starting this superstep. - barrier.Wait(proc, staleness - 1U); // Process nodes assigned to this (step, proc) pair. const size_t boundsStrSize = boundsArrayL_[step][proc].size(); + // Enforce staleness window before starting this superstep. 
+ if (boundsStrSize > 0U) { + barrier.Wait(proc, staleness - 1U); + } for (size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType lowerB = boundsArrayL_[step][proc][index]; const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; From 54c492887bc749c913bc43d7aa928fcdb01908c9 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 24 Feb 2026 09:43:05 +0100 Subject: [PATCH 31/57] improved busy waiting --- .../sptrsv_simulator/WeakBarriers/cpu_relax.hpp | 12 +++++------- .../WeakBarriers/flat_checkpoint_counter_barrier.hpp | 6 +----- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp index d9e5e268..7b1c79ca 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/cpu_relax.hpp @@ -18,21 +18,19 @@ limitations under the License. #pragma once -#include - -#if defined(__x86_64__) || defined(_M_X64) -# include -#endif - namespace osp { // Portable cpu_relax definition #if defined(__x86_64__) || defined(_M_X64) +# include inline void cpu_relax() { _mm_pause(); } #elif defined(__aarch64__) +inline void cpu_relax() { asm volatile("isb" ::: "memory"); } +#elif defined(__arm__) inline void cpu_relax() { asm volatile("yield" ::: "memory"); } #else -inline void cpu_relax() { std::this_thread::yield(); } +#include +inline void cpu_relax() { std::atomic_signal_fence(std::memory_order_acquire); } #endif } // end namespace osp diff --git a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp index 5b25acd3..533f6845 100644 --- a/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp @@ 
-78,13 +78,9 @@ inline void FlatCheckpointCounterBarrier::Wait(const std::size_t threadId, const const std::size_t minVal = std::max(localCachedCntrs[threadId], diff) - diff; for (std::size_t ind = 0U; ind < cntrs_.size(); ++ind) { - std::size_t loopCntr = 0U; while ((localCachedCntrs[ind] < minVal) && ((localCachedCntrs[ind] = cntrs_[ind].cntr_.load(std::memory_order_acquire)) < minVal)) { - ++loopCntr; - if (loopCntr % 128U == 0U) { - cpu_relax(); - } + cpu_relax(); } } } From 707ea5ea2bc6a44c9b2fe889420af224cdb7fb13 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 11 Mar 2026 11:13:48 +0100 Subject: [PATCH 32/57] Adding usolve ssp sptsv --- apps/maxbsp_ssp_sptrsv.cpp | 272 ++++++++++++++---- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 42 +++ 2 files changed, 256 insertions(+), 58 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 988653be..272411ba 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -1,6 +1,6 @@ /* * maxbsp_ssp_sptrsv.cpp - * Benchmark for SpTRSV using: + * Benchmark for SpTRSV (Lsolve + Usolve) using: * - variance_ssp * - growlocal_ssp * - growlocal @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -60,6 +61,7 @@ struct Args { std::string outputCsv = "sptrsv_benchmark.csv"; int iterations = 100; unsigned processors = 16U; + bool runUsolve = true; std::set algorithms; }; @@ -135,12 +137,29 @@ double LInftyNormalisedDiff(const std::vector &v, const std::vector [--output ] [--iterations ] [--processors

]\n" + << " --input [--output ] [--iterations ] [--processors

] [--run-usolve <0|1>]\n" << " [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n" << "Examples:\n" << " " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n" << " " << prog - << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n"; + << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --run-usolve 0 --variance-ssp --growlocal-ssp --growlocal\n"; +} + +bool ParseBoolValue(const std::string &value, bool &parsed) { + std::string normalised = value; + std::transform(normalised.begin(), normalised.end(), normalised.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + + if (normalised == "1" || normalised == "true" || normalised == "yes" || normalised == "on") { + parsed = true; + return true; + } + if (normalised == "0" || normalised == "false" || normalised == "no" || normalised == "off") { + parsed = false; + return true; + } + return false; } bool ParseArgs(int argc, char *argv[], Args &args) { @@ -151,8 +170,8 @@ bool ParseArgs(int argc, char *argv[], Args &args) { for (int i = 1; i < argc; ++i) { const std::string flag = argv[i]; - const bool needsValue - = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors"); + const bool needsValue = (flag == "--input" || flag == "--output" || flag == "--iterations" + || flag == "--processors" || flag == "--run-usolve"); if (needsValue && i + 1 >= argc) { std::cerr << "Missing value for " << flag << "\n"; return false; @@ -166,6 +185,13 @@ bool ParseArgs(int argc, char *argv[], Args &args) { args.iterations = std::stoi(argv[++i]); } else if (flag == "--processors") { args.processors = static_cast(std::stoul(argv[++i])); + } else if (flag == "--run-usolve") { + bool parsed = false; + if (!ParseBoolValue(argv[++i], parsed)) { + std::cerr << "Invalid value for --run-usolve. 
Use 0/1, false/true, no/yes, or off/on.\n"; + return false; + } + args.runUsolve = parsed; } else if (flag == "--variance-ssp") { args.algorithms.insert(Algorithm::VarianceSsp); } else if (flag == "--growlocal-ssp") { @@ -329,11 +355,12 @@ int main(int argc, char *argv[]) { EnsureSummaryCsvHeader(summaryCsv); std::cout << "Running benchmark on " << graphFiles.size() << " graph(s), iterations=" << args.iterations - << ", processors=" << args.processors << std::endl; + << ", processors=" << args.processors << ", run-usolve=" << (args.runUsolve ? "1" : "0") << std::endl; std::cout << "Experiment id timestamp: " << experimentStart << std::endl; std::vector bufferedRows; - bufferedRows.reserve(graphFiles.size() * args.algorithms.size() * static_cast(args.iterations)); + bufferedRows.reserve((args.runUsolve ? 2U : 1U) * graphFiles.size() * args.algorithms.size() + * static_cast(args.iterations)); typename std::vector::difference_type writtenEntries = 0U; for (const auto &graphPath : graphFiles) { @@ -357,12 +384,21 @@ int main(int argc, char *argv[]) { Sptrsv sptrsv(instance); const std::size_t n = static_cast(lCsr.cols()); - std::vector serialRefX(n, 0.0); - std::vector serialB(n, 1.0); - sptrsv.x_ = serialRefX.data(); - sptrsv.b_ = serialB.data(); + std::vector serialRefXL(n, 0.0); + std::vector serialBL(n, 1.0); + sptrsv.x_ = serialRefXL.data(); + sptrsv.b_ = serialBL.data(); sptrsv.LsolveSerial(); + std::vector serialRefXU; + if (args.runUsolve) { + std::vector serialBU(n, 1.0); + serialRefXU.assign(n, 0.0); + sptrsv.x_ = serialRefXU.data(); + sptrsv.b_ = serialBU.data(); + sptrsv.UsolveSerial(); + } + std::cout << "Graph: " << graphName << " (" << lCsr.rows() << "x" << lCsr.cols() << ", nnz=" << lCsr.nonZeros() << ")\n"; if (args.algorithms.count(Algorithm::VarianceSsp) > 0U) { @@ -378,22 +414,23 @@ int main(int argc, char *argv[]) { const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); - bool correct = false; + 
bool correctL = false; + bool correctU = false; for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv.x_ = x.data(); - sptrsv.b_ = b.data(); + std::vector xL(n, 0.0); + std::vector bL(n, 1.0); + sptrsv.x_ = xL.data(); + sptrsv.b_ = bL.data(); - const auto s = std::chrono::high_resolution_clock::now(); + const auto sL = std::chrono::high_resolution_clock::now(); sptrsv.SspLsolveStaleness(); - const auto e = std::chrono::high_resolution_clock::now(); - const double runtime = std::chrono::duration(e - s).count(); + const auto eL = std::chrono::high_resolution_clock::now(); + const double runtimeL = std::chrono::duration(eL - sL).count(); if (iter == 0) { - const double diff = LInftyNormalisedDiff(x, serialRefX); - correct = (diff < EPSILON); - std::cout << " Variance_SSP first-run max relative diff vs serial: " << diff << std::endl; + const double diffL = LInftyNormalisedDiff(xL, serialRefXL); + correctL = (diffL < EPSILON); + std::cout << " Variance_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl; } if (iter >= preMeasureIterations) { @@ -404,8 +441,39 @@ int main(int argc, char *argv[]) { supersteps, syncCosts, kDefaultStaleness, - runtime, - correct}); + runtimeL, + correctL}); + } + + if (args.runUsolve) { + std::vector xU(n, 0.0); + std::vector bU(n, 1.0); + sptrsv.x_ = xU.data(); + sptrsv.b_ = bU.data(); + + const auto sU = std::chrono::high_resolution_clock::now(); + sptrsv.SspUsolveStaleness(); + const auto eU = std::chrono::high_resolution_clock::now(); + const double runtimeU = std::chrono::duration(eU - sU).count(); + + if (iter == 0) { + const double diffU = LInftyNormalisedDiff(xU, serialRefXU); + correctU = (diffU < EPSILON); + std::cout << " Variance_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU + << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Variance_SSP_Usolve", + 
args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtimeU, + correctU}); + } } } @@ -428,22 +496,23 @@ int main(int argc, char *argv[]) { const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); - bool correct = false; + bool correctL = false; + bool correctU = false; for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv.x_ = x.data(); - sptrsv.b_ = b.data(); + std::vector xL(n, 0.0); + std::vector bL(n, 1.0); + sptrsv.x_ = xL.data(); + sptrsv.b_ = bL.data(); - const auto s = std::chrono::high_resolution_clock::now(); + const auto sL = std::chrono::high_resolution_clock::now(); sptrsv.SspLsolveStaleness(); - const auto e = std::chrono::high_resolution_clock::now(); - const double runtime = std::chrono::duration(e - s).count(); + const auto eL = std::chrono::high_resolution_clock::now(); + const double runtimeL = std::chrono::duration(eL - sL).count(); if (iter == 0) { - const double diff = LInftyNormalisedDiff(x, serialRefX); - correct = (diff < EPSILON); - std::cout << " Growlocal_SSP first-run max relative diff vs serial: " << diff << std::endl; + const double diffL = LInftyNormalisedDiff(xL, serialRefXL); + correctL = (diffL < EPSILON); + std::cout << " Growlocal_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl; } if (iter >= preMeasureIterations) { @@ -454,8 +523,39 @@ int main(int argc, char *argv[]) { supersteps, syncCosts, kDefaultStaleness, - runtime, - correct}); + runtimeL, + correctL}); + } + + if (args.runUsolve) { + std::vector xU(n, 0.0); + std::vector bU(n, 1.0); + sptrsv.x_ = xU.data(); + sptrsv.b_ = bU.data(); + + const auto sU = std::chrono::high_resolution_clock::now(); + sptrsv.SspUsolveStaleness(); + const auto eU = std::chrono::high_resolution_clock::now(); + const double runtimeU = std::chrono::duration(eU - sU).count(); + + if (iter == 0) { + const 
double diffU = LInftyNormalisedDiff(xU, serialRefXU); + correctU = (diffU < EPSILON); + std::cout << " Growlocal_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU + << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Growlocal_SSP_Usolve", + args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtimeU, + correctU}); + } } } @@ -478,22 +578,23 @@ int main(int argc, char *argv[]) { const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); - bool correct; + bool correctL = false; + bool correctU = false; for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv.x_ = x.data(); - sptrsv.b_ = b.data(); + std::vector xL(n, 0.0); + std::vector bL(n, 1.0); + sptrsv.x_ = xL.data(); + sptrsv.b_ = bL.data(); - const auto s = std::chrono::high_resolution_clock::now(); + const auto sL = std::chrono::high_resolution_clock::now(); sptrsv.LsolveNoPermutation(); - const auto e = std::chrono::high_resolution_clock::now(); - const double runtime = std::chrono::duration(e - s).count(); + const auto eL = std::chrono::high_resolution_clock::now(); + const double runtimeL = std::chrono::duration(eL - sL).count(); if (iter == 0) { - const double diff = LInftyNormalisedDiff(x, serialRefX); - correct = (diff < EPSILON); - std::cout << " Growlocal first-run max relative diff vs serial: " << diff << std::endl; + const double diffL = LInftyNormalisedDiff(xL, serialRefXL); + correctL = (diffL < EPSILON); + std::cout << " Growlocal first-run max relative diff vs serial lsolve: " << diffL << std::endl; } if (iter >= preMeasureIterations) { @@ -504,8 +605,39 @@ int main(int argc, char *argv[]) { supersteps, syncCosts, 1U, - runtime, - correct}); + runtimeL, + correctL}); + } + + if (args.runUsolve) { + std::vector xU(n, 0.0); + std::vector bU(n, 1.0); + sptrsv.x_ = 
xU.data(); + sptrsv.b_ = bU.data(); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.UsolveNoPermutation(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter == 0) { + const double diff = LInftyNormalisedDiff(xU, serialRefXU); + correctU = (diff < EPSILON); + std::cout << " Growlocal_Usolve first-run max relative diff vs serial usolve: " << diff + << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Growlocal_Usolve", + args.processors, + scheduleTime, + supersteps, + syncCosts, + 1U, + runtime, + correctU}); + } } } @@ -517,15 +649,15 @@ int main(int argc, char *argv[]) { if (args.algorithms.count(Algorithm::Serial) > 0U) { for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 0.0); - std::vector b(n, 1.0); - sptrsv.x_ = x.data(); - sptrsv.b_ = b.data(); + std::vector xL(n, 0.0); + std::vector bL(n, 1.0); + sptrsv.x_ = xL.data(); + sptrsv.b_ = bL.data(); - const auto s = std::chrono::high_resolution_clock::now(); + const auto sL = std::chrono::high_resolution_clock::now(); sptrsv.LsolveSerial(); - const auto e = std::chrono::high_resolution_clock::now(); - const double runtime = std::chrono::duration(e - s).count(); + const auto eL = std::chrono::high_resolution_clock::now(); + const double runtimeL = std::chrono::duration(eL - sL).count(); if (iter >= preMeasureIterations) { bufferedRows.emplace_back(CsvRow{graphName, @@ -535,9 +667,33 @@ int main(int argc, char *argv[]) { 1U, 0, 1U, - runtime, + runtimeL, true}); } + + if (args.runUsolve) { + std::vector xU(n, 0.0); + std::vector bU(n, 1.0); + sptrsv.x_ = xU.data(); + sptrsv.b_ = bU.data(); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.UsolveSerial(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter >= 
preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Serial_Usolve", + 1U, + 0.0, + 1U, + 0, + 1U, + runtime, + true}); + } + } } for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index d2461e7f..5c34bac6 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -529,6 +529,48 @@ class Sptrsv { } } + // SSP Usolve with configurable staleness. + // Uses FlatCheckpointCounterBarrier created internally. + template + void SspUsolveStaleness() { + const unsigned nthreads = instance_->NumberOfProcessors(); + FlatCheckpointCounterBarrier barrier(nthreads); + + auto *csc = instance_->GetComputationalDag().GetCSC(); + const auto *outer = csc->outerIndexPtr(); + const auto *inner = csc->innerIndexPtr(); + const auto *vals = csc->valuePtr(); + + #pragma omp parallel num_threads(nthreads) + { + const std::size_t proc = static_cast(omp_get_thread_num()); + unsigned step = numSupersteps_; + do { + step--; + const size_t boundsStrSize = boundsArrayU_[step][proc].size(); + if (boundsStrSize > 0U) { + barrier.Wait(proc, staleness - 1U); + } + + for (size_t index = 0; index < boundsStrSize; index += 2) { + EigenIdxType node = boundsArrayU_[step][proc][index] + 1; + const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1]; + + do { + node--; + x_[node] = b_[node]; + for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { + x_[node] -= vals[i] * x_[inner[i]]; + } + x_[node] /= vals[outer[node]]; + } while (node != lowerB); + } + + barrier.Arrive(proc); + } while (step != 0); + } + } + virtual ~Sptrsv() = default; }; From 649fe9c4a57a5c17538c289dd08b7ab9a5dcc097 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Mon, 16 Mar 2026 09:35:50 +0100 Subject: [PATCH 33/57] Reverted benchmark and in-place kernels 
added --- apps/maxbsp_ssp_sptrsv.cpp | 272 ++++-------------- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 41 +++ 2 files changed, 94 insertions(+), 219 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 272411ba..079c7d3a 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -1,6 +1,6 @@ /* * maxbsp_ssp_sptrsv.cpp - * Benchmark for SpTRSV (Lsolve + Usolve) using: + * Benchmark for SpTRSV using: * - variance_ssp * - growlocal_ssp * - growlocal @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -61,7 +60,6 @@ struct Args { std::string outputCsv = "sptrsv_benchmark.csv"; int iterations = 100; unsigned processors = 16U; - bool runUsolve = true; std::set algorithms; }; @@ -137,29 +135,12 @@ double LInftyNormalisedDiff(const std::vector &v, const std::vector [--output ] [--iterations ] [--processors

] [--run-usolve <0|1>]\n" + << " --input [--output ] [--iterations ] [--processors

]\n" << " [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n" << "Examples:\n" << " " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n" << " " << prog - << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --run-usolve 0 --variance-ssp --growlocal-ssp --growlocal\n"; -} - -bool ParseBoolValue(const std::string &value, bool &parsed) { - std::string normalised = value; - std::transform(normalised.begin(), normalised.end(), normalised.begin(), [](unsigned char c) { - return static_cast(std::tolower(c)); - }); - - if (normalised == "1" || normalised == "true" || normalised == "yes" || normalised == "on") { - parsed = true; - return true; - } - if (normalised == "0" || normalised == "false" || normalised == "no" || normalised == "off") { - parsed = false; - return true; - } - return false; + << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n"; } bool ParseArgs(int argc, char *argv[], Args &args) { @@ -170,8 +151,8 @@ bool ParseArgs(int argc, char *argv[], Args &args) { for (int i = 1; i < argc; ++i) { const std::string flag = argv[i]; - const bool needsValue = (flag == "--input" || flag == "--output" || flag == "--iterations" - || flag == "--processors" || flag == "--run-usolve"); + const bool needsValue + = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors"); if (needsValue && i + 1 >= argc) { std::cerr << "Missing value for " << flag << "\n"; return false; @@ -185,13 +166,6 @@ bool ParseArgs(int argc, char *argv[], Args &args) { args.iterations = std::stoi(argv[++i]); } else if (flag == "--processors") { args.processors = static_cast(std::stoul(argv[++i])); - } else if (flag == "--run-usolve") { - bool parsed = false; - if (!ParseBoolValue(argv[++i], parsed)) { - std::cerr << "Invalid value for --run-usolve. 
Use 0/1, false/true, no/yes, or off/on.\n"; - return false; - } - args.runUsolve = parsed; } else if (flag == "--variance-ssp") { args.algorithms.insert(Algorithm::VarianceSsp); } else if (flag == "--growlocal-ssp") { @@ -355,12 +329,11 @@ int main(int argc, char *argv[]) { EnsureSummaryCsvHeader(summaryCsv); std::cout << "Running benchmark on " << graphFiles.size() << " graph(s), iterations=" << args.iterations - << ", processors=" << args.processors << ", run-usolve=" << (args.runUsolve ? "1" : "0") << std::endl; + << ", processors=" << args.processors << std::endl; std::cout << "Experiment id timestamp: " << experimentStart << std::endl; std::vector bufferedRows; - bufferedRows.reserve((args.runUsolve ? 2U : 1U) * graphFiles.size() * args.algorithms.size() - * static_cast(args.iterations)); + bufferedRows.reserve(graphFiles.size() * args.algorithms.size() * static_cast(args.iterations)); typename std::vector::difference_type writtenEntries = 0U; for (const auto &graphPath : graphFiles) { @@ -384,20 +357,9 @@ int main(int argc, char *argv[]) { Sptrsv sptrsv(instance); const std::size_t n = static_cast(lCsr.cols()); - std::vector serialRefXL(n, 0.0); - std::vector serialBL(n, 1.0); - sptrsv.x_ = serialRefXL.data(); - sptrsv.b_ = serialBL.data(); - sptrsv.LsolveSerial(); - - std::vector serialRefXU; - if (args.runUsolve) { - std::vector serialBU(n, 1.0); - serialRefXU.assign(n, 0.0); - sptrsv.x_ = serialRefXU.data(); - sptrsv.b_ = serialBU.data(); - sptrsv.UsolveSerial(); - } + std::vector serialRefX(n, 1.0); + sptrsv.x_ = serialRefX.data(); + sptrsv.LsolveSerialInPlace(); std::cout << "Graph: " << graphName << " (" << lCsr.rows() << "x" << lCsr.cols() << ", nnz=" << lCsr.nonZeros() << ")\n"; @@ -414,23 +376,20 @@ int main(int argc, char *argv[]) { const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); - bool correctL = false; - bool correctU = false; + bool correct = false; for (int iter = 0; iter < 
args.iterations + preMeasureIterations; ++iter) { - std::vector xL(n, 0.0); - std::vector bL(n, 1.0); - sptrsv.x_ = xL.data(); - sptrsv.b_ = bL.data(); + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); - const auto sL = std::chrono::high_resolution_clock::now(); - sptrsv.SspLsolveStaleness(); - const auto eL = std::chrono::high_resolution_clock::now(); - const double runtimeL = std::chrono::duration(eL - sL).count(); + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.SspLsolveStalenessInPlace(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); if (iter == 0) { - const double diffL = LInftyNormalisedDiff(xL, serialRefXL); - correctL = (diffL < EPSILON); - std::cout << " Variance_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl; + const double diff = LInftyNormalisedDiff(x, serialRefX); + correct = (diff < EPSILON); + std::cout << " Variance_SSP first-run max relative diff vs serial: " << diff << std::endl; } if (iter >= preMeasureIterations) { @@ -441,39 +400,8 @@ int main(int argc, char *argv[]) { supersteps, syncCosts, kDefaultStaleness, - runtimeL, - correctL}); - } - - if (args.runUsolve) { - std::vector xU(n, 0.0); - std::vector bU(n, 1.0); - sptrsv.x_ = xU.data(); - sptrsv.b_ = bU.data(); - - const auto sU = std::chrono::high_resolution_clock::now(); - sptrsv.SspUsolveStaleness(); - const auto eU = std::chrono::high_resolution_clock::now(); - const double runtimeU = std::chrono::duration(eU - sU).count(); - - if (iter == 0) { - const double diffU = LInftyNormalisedDiff(xU, serialRefXU); - correctU = (diffU < EPSILON); - std::cout << " Variance_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU - << std::endl; - } - - if (iter >= preMeasureIterations) { - bufferedRows.emplace_back(CsvRow{graphName, - "Variance_SSP_Usolve", - args.processors, - scheduleTime, - supersteps, - syncCosts, - kDefaultStaleness, - runtimeU, - correctU}); 
- } + runtime, + correct}); } } @@ -496,23 +424,20 @@ int main(int argc, char *argv[]) { const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); - bool correctL = false; - bool correctU = false; + bool correct = false; for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector xL(n, 0.0); - std::vector bL(n, 1.0); - sptrsv.x_ = xL.data(); - sptrsv.b_ = bL.data(); + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); - const auto sL = std::chrono::high_resolution_clock::now(); - sptrsv.SspLsolveStaleness(); - const auto eL = std::chrono::high_resolution_clock::now(); - const double runtimeL = std::chrono::duration(eL - sL).count(); + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.SspLsolveStalenessInPlace(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); if (iter == 0) { - const double diffL = LInftyNormalisedDiff(xL, serialRefXL); - correctL = (diffL < EPSILON); - std::cout << " Growlocal_SSP first-run max relative diff vs serial lsolve: " << diffL << std::endl; + const double diff = LInftyNormalisedDiff(x, serialRefX); + correct = (diff < EPSILON); + std::cout << " Growlocal_SSP first-run max relative diff vs serial: " << diff << std::endl; } if (iter >= preMeasureIterations) { @@ -523,39 +448,8 @@ int main(int argc, char *argv[]) { supersteps, syncCosts, kDefaultStaleness, - runtimeL, - correctL}); - } - - if (args.runUsolve) { - std::vector xU(n, 0.0); - std::vector bU(n, 1.0); - sptrsv.x_ = xU.data(); - sptrsv.b_ = bU.data(); - - const auto sU = std::chrono::high_resolution_clock::now(); - sptrsv.SspUsolveStaleness(); - const auto eU = std::chrono::high_resolution_clock::now(); - const double runtimeU = std::chrono::duration(eU - sU).count(); - - if (iter == 0) { - const double diffU = LInftyNormalisedDiff(xU, serialRefXU); - correctU = (diffU < EPSILON); - std::cout << " 
Growlocal_SSP_Usolve first-run max relative diff vs serial usolve: " << diffU - << std::endl; - } - - if (iter >= preMeasureIterations) { - bufferedRows.emplace_back(CsvRow{graphName, - "Growlocal_SSP_Usolve", - args.processors, - scheduleTime, - supersteps, - syncCosts, - kDefaultStaleness, - runtimeU, - correctU}); - } + runtime, + correct}); } } @@ -578,23 +472,20 @@ int main(int argc, char *argv[]) { const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); - bool correctL = false; - bool correctU = false; + bool correct; for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector xL(n, 0.0); - std::vector bL(n, 1.0); - sptrsv.x_ = xL.data(); - sptrsv.b_ = bL.data(); + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); - const auto sL = std::chrono::high_resolution_clock::now(); - sptrsv.LsolveNoPermutation(); - const auto eL = std::chrono::high_resolution_clock::now(); - const double runtimeL = std::chrono::duration(eL - sL).count(); + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.LsolveNoPermutationInPlace(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); if (iter == 0) { - const double diffL = LInftyNormalisedDiff(xL, serialRefXL); - correctL = (diffL < EPSILON); - std::cout << " Growlocal first-run max relative diff vs serial lsolve: " << diffL << std::endl; + const double diff = LInftyNormalisedDiff(x, serialRefX); + correct = (diff < EPSILON); + std::cout << " Growlocal first-run max relative diff vs serial: " << diff << std::endl; } if (iter >= preMeasureIterations) { @@ -605,39 +496,8 @@ int main(int argc, char *argv[]) { supersteps, syncCosts, 1U, - runtimeL, - correctL}); - } - - if (args.runUsolve) { - std::vector xU(n, 0.0); - std::vector bU(n, 1.0); - sptrsv.x_ = xU.data(); - sptrsv.b_ = bU.data(); - - const auto s = std::chrono::high_resolution_clock::now(); - 
sptrsv.UsolveNoPermutation(); - const auto e = std::chrono::high_resolution_clock::now(); - const double runtime = std::chrono::duration(e - s).count(); - - if (iter == 0) { - const double diff = LInftyNormalisedDiff(xU, serialRefXU); - correctU = (diff < EPSILON); - std::cout << " Growlocal_Usolve first-run max relative diff vs serial usolve: " << diff - << std::endl; - } - - if (iter >= preMeasureIterations) { - bufferedRows.emplace_back(CsvRow{graphName, - "Growlocal_Usolve", - args.processors, - scheduleTime, - supersteps, - syncCosts, - 1U, - runtime, - correctU}); - } + runtime, + correct}); } } @@ -649,15 +509,13 @@ int main(int argc, char *argv[]) { if (args.algorithms.count(Algorithm::Serial) > 0U) { for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector xL(n, 0.0); - std::vector bL(n, 1.0); - sptrsv.x_ = xL.data(); - sptrsv.b_ = bL.data(); + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); - const auto sL = std::chrono::high_resolution_clock::now(); - sptrsv.LsolveSerial(); - const auto eL = std::chrono::high_resolution_clock::now(); - const double runtimeL = std::chrono::duration(eL - sL).count(); + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.LsolveSerialInPlace(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); if (iter >= preMeasureIterations) { bufferedRows.emplace_back(CsvRow{graphName, @@ -667,33 +525,9 @@ int main(int argc, char *argv[]) { 1U, 0, 1U, - runtimeL, + runtime, true}); } - - if (args.runUsolve) { - std::vector xU(n, 0.0); - std::vector bU(n, 1.0); - sptrsv.x_ = xU.data(); - sptrsv.b_ = bU.data(); - - const auto s = std::chrono::high_resolution_clock::now(); - sptrsv.UsolveSerial(); - const auto e = std::chrono::high_resolution_clock::now(); - const double runtime = std::chrono::duration(e - s).count(); - - if (iter >= preMeasureIterations) { - bufferedRows.emplace_back(CsvRow{graphName, - "Serial_Usolve", - 
1U, - 0.0, - 1U, - 0, - 1U, - runtime, - true}); - } - } } for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 5c34bac6..3323c07a 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -529,6 +529,47 @@ class Sptrsv { } } + // SSP Lsolve in-place with staleness=2 (allowing at most one superstep of lag). + // Uses FlatCheckpointCounterBarrier created internally. + template + void SspLsolveStalenessInPlace() { + const unsigned nthreads = instance_->NumberOfProcessors(); + FlatCheckpointCounterBarrier barrier(nthreads); + + auto *csr = instance_->GetComputationalDag().GetCSR(); + const auto *outer = csr->outerIndexPtr(); + const auto *inner = csr->innerIndexPtr(); + const auto *vals = csr->valuePtr(); + + #pragma omp parallel num_threads(nthreads) + { + const std::size_t proc = static_cast(omp_get_thread_num()); + for (unsigned step = 0; step < numSupersteps_; ++step) { + // Process nodes assigned to this (step, proc) pair. + const size_t boundsStrSize = boundsArrayL_[step][proc].size(); + // Enforce staleness window before starting this superstep. + if (boundsStrSize > 0U) { + barrier.Wait(proc, staleness - 1U); + } + for (size_t index = 0; index < boundsStrSize; index += 2) { + EigenIdxType lowerB = boundsArrayL_[step][proc][index]; + const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; + for (EigenIdxType node = lowerB; node <= upperB; ++node) { + // Perform lower-triangular solve for this node + for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { + // Subtract contributions from previously solved nodes + x_[node] -= vals[i] * x_[inner[i]]; + } + // Divide by diagonal element to complete solve for this node + x_[node] /= vals[outer[node + 1] - 1]; + } + } + // Signal completion of this superstep. 
+ barrier.Arrive(proc); + } + } + } + // SSP Usolve with configurable staleness. // Uses FlatCheckpointCounterBarrier created internally. template From ff5e15f209ef6446e0b8cbbd065fc5038ce0f129 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 17 Mar 2026 11:56:52 +0100 Subject: [PATCH 34/57] making const --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 150 +++++++++--------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 3323c07a..906e8850 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -131,7 +131,7 @@ class Sptrsv { do { node--; vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back( - // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp --- + // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp --- static_cast(node)); } while (node > 0); @@ -240,39 +240,43 @@ class Sptrsv { } void LsolveSerial() { + const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; ++i) { x_[i] = b_[i]; - for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i]; - j < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1; - ++j) { - x_[i] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[j] - * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[j]]; + for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { + x_[i] -= valPtr[j] * x_[inner[j]]; } - x_[i] /= 
(*(instance_->GetComputationalDag().GetCSR())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1]; + x_[i] /= valPtr[outer[i + 1] - 1]; } } void UsolveSerial() { - EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); + const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + + const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); EigenIdxType i = numberOfVertices; do { i--; x_[i] = b_[i]; - for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i] + 1; - j < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i + 1]; - ++j) { - x_[i] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[j] - * x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[j]]; + for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { + x_[i] -= valPtr[j] * x_[inner[j]]; } - x_[i] /= (*(instance_->GetComputationalDag().GetCSC())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i]]; + x_[i] /= valPtr[outer[i]]; } while (i != 0); } void LsolveNoPermutationInPlace() { + const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { const size_t proc = static_cast(omp_get_thread_num()); @@ -284,14 +288,10 @@ class Sptrsv { const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; for (EigenIdxType node = lowerB; node <= upperB; ++node) { - for (EigenIdxType i = 
(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node]; - i < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1; - ++i) { - x_[node] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[i] - * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[i]]; + for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { + x_[node] -= valPtr[i] * x_[inner[i]]; } - x_[node] /= (*(instance_->GetComputationalDag().GetCSR())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1]; + x_[node] /= valPtr[outer[node + 1] - 1]; } } # pragma omp barrier @@ -300,6 +300,10 @@ class Sptrsv { } void UsolveNoPermutationInPlace() { + const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { // Process each superstep starting from the last one (opposite of lsolve) @@ -314,14 +318,10 @@ class Sptrsv { do { node--; - for (EigenIdxType i = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node] + 1; - i < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node + 1]; - ++i) { - x_[node] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[i] - * x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[i]]; + for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { + x_[node] -= valPtr[i] * x_[inner[i]]; } - x_[node] /= (*(instance_->GetComputationalDag().GetCSC())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node]]; + x_[node] /= valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -330,6 +330,10 @@ class Sptrsv { } void LsolveNoPermutation() { + const EigenIdxType *outer = 
(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { const size_t proc = static_cast(omp_get_thread_num()); @@ -342,14 +346,10 @@ class Sptrsv { for (EigenIdxType node = lowerB; node <= upperB; ++node) { x_[node] = b_[node]; - for (EigenIdxType i = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node]; - i < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1; - ++i) { - x_[node] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[i] - * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[i]]; + for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { + x_[node] -= valPtr[i] * x_[inner[i]]; } - x_[node] /= (*(instance_->GetComputationalDag().GetCSR())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[node + 1] - 1]; + x_[node] /= valPtr[outer[node + 1] - 1]; } } # pragma omp barrier @@ -358,6 +358,10 @@ class Sptrsv { } void UsolveNoPermutation() { + const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { // Process each superstep starting from the last one (opposite of lsolve) @@ -373,14 +377,10 @@ class Sptrsv { do { node--; x_[node] = b_[node]; - for (EigenIdxType i = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node] + 1; - i < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node + 1]; - ++i) { - x_[node] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[i] - * 
x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[i]]; + for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { + x_[node] -= valPtr[i] * x_[inner[i]]; } - x_[node] /= (*(instance_->GetComputationalDag().GetCSC())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[node]]; + x_[node] /= valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -389,32 +389,32 @@ class Sptrsv { } void LsolveSerialInPlace() { - EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); + const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + + const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; ++i) { - for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i]; - j < (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1; - ++j) { - x_[i] -= (*(instance_->GetComputationalDag().GetCSR())).valuePtr()[j] - * x_[(*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr()[j]]; + for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { + x_[i] -= valPtr[j] * x_[inner[j]]; } - x_[i] /= (*(instance_->GetComputationalDag().GetCSR())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr()[i + 1] - 1]; + x_[i] /= valPtr[outer[i + 1] - 1]; } } void UsolveSerialInPlace() { - EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); + const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + + const EigenIdxType 
numberOfVertices = static_cast(instance_->NumberOfVertices()); EigenIdxType i = numberOfVertices; do { i--; - for (EigenIdxType j = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i] + 1; - j < (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i + 1]; - ++j) { - x_[i] -= (*(instance_->GetComputationalDag().GetCSC())).valuePtr()[j] - * x_[(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr()[j]]; + for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { + x_[i] -= valPtr[j] * x_[inner[j]]; } - x_[i] /= (*(instance_->GetComputationalDag().GetCSC())) - .valuePtr()[(*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr()[i]]; + x_[i] /= valPtr[outer[i]]; } while (i != 0); } @@ -493,12 +493,12 @@ class Sptrsv { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); - auto *csr = instance_->GetComputationalDag().GetCSR(); - const auto *outer = csr->outerIndexPtr(); - const auto *inner = csr->innerIndexPtr(); - const auto *vals = csr->valuePtr(); + const auto *csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *outer = csr->outerIndexPtr(); + const EigenIdxType *inner = csr->innerIndexPtr(); + const double *vals = csr->valuePtr(); - #pragma omp parallel num_threads(nthreads) +# pragma omp parallel num_threads(nthreads) { const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { @@ -536,12 +536,12 @@ class Sptrsv { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); - auto *csr = instance_->GetComputationalDag().GetCSR(); - const auto *outer = csr->outerIndexPtr(); - const auto *inner = csr->innerIndexPtr(); - const auto *vals = csr->valuePtr(); + const auto *csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *outer = csr->outerIndexPtr(); + const EigenIdxType *inner = csr->innerIndexPtr(); + const double *vals = 
csr->valuePtr(); - #pragma omp parallel num_threads(nthreads) +# pragma omp parallel num_threads(nthreads) { const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { @@ -577,12 +577,12 @@ class Sptrsv { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); - auto *csc = instance_->GetComputationalDag().GetCSC(); - const auto *outer = csc->outerIndexPtr(); - const auto *inner = csc->innerIndexPtr(); - const auto *vals = csc->valuePtr(); + const auto *csc = instance_->GetComputationalDag().GetCSC(); + const EigenIdxType *outer = csc->outerIndexPtr(); + const EigenIdxType *inner = csc->innerIndexPtr(); + const double *vals = csc->valuePtr(); - #pragma omp parallel num_threads(nthreads) +# pragma omp parallel num_threads(nthreads) { const std::size_t proc = static_cast(omp_get_thread_num()); unsigned step = numSupersteps_; From 8f584922aed1e1c45ca41f33b30a9d112e030d69 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 17 Mar 2026 12:04:13 +0100 Subject: [PATCH 35/57] inplace reset --- apps/maxbsp_ssp_sptrsv.cpp | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 079c7d3a..69ed2c57 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -14,8 +14,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -134,8 +134,7 @@ double LInftyNormalisedDiff(const std::vector &v, const std::vector [--output ] [--iterations ] [--processors

]\n" + << " " << prog << " --input [--output ] [--iterations ] [--processors

]\n" << " [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n" << "Examples:\n" << " " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n" @@ -290,6 +289,12 @@ int ComputeSyncCosts(const BspInstance> &instance) { } // namespace +void resetOnes(std::vector &x) { + for (double &val : x) { + val = 1.0; + } +} + int main(int argc, char *argv[]) { const std::string experimentStart = FormatExperimentStartTimestampForFilename(); @@ -377,9 +382,10 @@ int main(int argc, char *argv[]) { const int syncCosts = ComputeSyncCosts(instance); bool correct = false; + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 1.0); - sptrsv.x_ = x.data(); + resetOnes(x); const auto s = std::chrono::high_resolution_clock::now(); sptrsv.SspLsolveStalenessInPlace(); @@ -425,9 +431,10 @@ int main(int argc, char *argv[]) { const int syncCosts = ComputeSyncCosts(instance); bool correct = false; + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 1.0); - sptrsv.x_ = x.data(); + resetOnes(x); const auto s = std::chrono::high_resolution_clock::now(); sptrsv.SspLsolveStalenessInPlace(); @@ -473,9 +480,10 @@ int main(int argc, char *argv[]) { const int syncCosts = ComputeSyncCosts(instance); bool correct; + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 1.0); - sptrsv.x_ = x.data(); + resetOnes(x); const auto s = std::chrono::high_resolution_clock::now(); sptrsv.LsolveNoPermutationInPlace(); @@ -508,9 +516,10 @@ int main(int argc, char *argv[]) { } if (args.algorithms.count(Algorithm::Serial) > 0U) { + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { - std::vector x(n, 1.0); - sptrsv.x_ = x.data(); + 
resetOnes(x); const auto s = std::chrono::high_resolution_clock::now(); sptrsv.LsolveSerialInPlace(); @@ -555,8 +564,8 @@ int main(int argc, char *argv[]) { for (const auto &[key, agg] : summary) { const double geomean = std::exp(agg.sumLogRuntime / static_cast(agg.samples)); summaryCsv << CsvEscape(key.graph) << "," << key.algorithm << "," << key.processors << "," << agg.scheduleTimeSeconds - << "," << agg.supersteps << "," << agg.SyncCosts << "," << key.staleness - << "," << agg.samples << "," << geomean << "," << agg.correctness << "\n"; + << "," << agg.supersteps << "," << agg.SyncCosts << "," << key.staleness << "," << agg.samples << "," + << geomean << "," << agg.correctness << "\n"; } std::cout << "Benchmark complete. CSV written to: " << detailCsvPath << std::endl; From 4b97007414abbe4ff4e90ca43fba0064272dd843 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 17 Mar 2026 14:22:04 +0100 Subject: [PATCH 36/57] make eigen matrix compressed --- apps/maxbsp_ssp_sptrsv.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 69ed2c57..4ad3565e 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -349,8 +349,10 @@ int main(int argc, char *argv[]) { std::cerr << "Failed to load matrix: " << graphPath << std::endl; continue; } + lCsr.makeCompressed(); Eigen::SparseMatrix lCsc = lCsr; + lCsc.makeCompressed(); SparseMatrixImp graph; graph.SetCsr(&lCsr); From 1f506b3454ac6b611e45aa9dd1b9a9fe4d50c58a Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 18 Mar 2026 08:59:50 +0100 Subject: [PATCH 37/57] const everywhere --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 192 ++++++++++-------- 1 file changed, 107 insertions(+), 85 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 906e8850..e02e5968 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ 
b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -239,43 +239,48 @@ class Sptrsv { rowPtr_.push_back(colIdx_.size()); } - void LsolveSerial() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); - const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + void LsolveSerial() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + double *const x = x_; + const double *const b = b_; - EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); + const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; ++i) { - x_[i] = b_[i]; + x[i] = b[i]; for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { - x_[i] -= valPtr[j] * x_[inner[j]]; + x[i] -= valPtr[j] * x[inner[j]]; } - x_[i] /= valPtr[outer[i + 1] - 1]; + x[i] /= valPtr[outer[i + 1] - 1]; } } - void UsolveSerial() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); - const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + void UsolveSerial() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + double *const x = x_; + const double *const b = b_; const EigenIdxType numberOfVertices = 
static_cast(instance_->NumberOfVertices()); EigenIdxType i = numberOfVertices; do { i--; - x_[i] = b_[i]; + x[i] = b[i]; for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { - x_[i] -= valPtr[j] * x_[inner[j]]; + x[i] -= valPtr[j] * x[inner[j]]; } - x_[i] /= valPtr[outer[i]]; + x[i] /= valPtr[outer[i]]; } while (i != 0); } - void LsolveNoPermutationInPlace() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); - const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + void LsolveNoPermutationInPlace() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + double *const x = x_; # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { @@ -289,9 +294,9 @@ class Sptrsv { for (EigenIdxType node = lowerB; node <= upperB; ++node) { for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - x_[node] -= valPtr[i] * x_[inner[i]]; + x[node] -= valPtr[i] * x[inner[i]]; } - x_[node] /= valPtr[outer[node + 1] - 1]; + x[node] /= valPtr[outer[node + 1] - 1]; } } # pragma omp barrier @@ -299,10 +304,11 @@ class Sptrsv { } } - void UsolveNoPermutationInPlace() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); - const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + void UsolveNoPermutationInPlace() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *const inner = 
(*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + double *const x = x_; # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { @@ -319,9 +325,9 @@ class Sptrsv { do { node--; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - x_[node] -= valPtr[i] * x_[inner[i]]; + x[node] -= valPtr[i] * x[inner[i]]; } - x_[node] /= valPtr[outer[node]]; + x[node] /= valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -329,10 +335,12 @@ class Sptrsv { } } - void LsolveNoPermutation() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); - const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + void LsolveNoPermutation() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + double *const x = x_; + const double *const b = b_; # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { @@ -345,11 +353,11 @@ class Sptrsv { const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; for (EigenIdxType node = lowerB; node <= upperB; ++node) { - x_[node] = b_[node]; + x[node] = b[node]; for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - x_[node] -= valPtr[i] * x_[inner[i]]; + x[node] -= valPtr[i] * x[inner[i]]; } - x_[node] /= valPtr[outer[node + 1] - 1]; + x[node] /= valPtr[outer[node + 1] - 1]; } } # pragma omp barrier @@ -357,10 +365,12 @@ class Sptrsv { } } - void UsolveNoPermutation() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); - const 
EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + void UsolveNoPermutation() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + double *const x = x_; + const double *const b = b_; # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { @@ -376,11 +386,11 @@ class Sptrsv { do { node--; - x_[node] = b_[node]; + x[node] = b[node]; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - x_[node] -= valPtr[i] * x_[inner[i]]; + x[node] -= valPtr[i] * x[inner[i]]; } - x_[node] /= valPtr[outer[node]]; + x[node] /= valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -388,37 +398,41 @@ class Sptrsv { } } - void LsolveSerialInPlace() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); - const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + void LsolveSerialInPlace() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); + double *const x = x_; const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; ++i) { for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { - x_[i] -= valPtr[j] * x_[inner[j]]; + x[i] -= valPtr[j] * x[inner[j]]; } - x_[i] /= valPtr[outer[i + 1] - 1]; + x[i] /= 
valPtr[outer[i + 1] - 1]; } } - void UsolveSerialInPlace() { - const EigenIdxType *outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); - const EigenIdxType *inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); - const double *valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + void UsolveSerialInPlace() const { + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSC())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSC())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSC())).valuePtr(); + double *const x = x_; const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); EigenIdxType i = numberOfVertices; do { i--; for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { - x_[i] -= valPtr[j] * x_[inner[j]]; + x[i] -= valPtr[j] * x[inner[j]]; } - x_[i] /= valPtr[outer[i]]; + x[i] /= valPtr[outer[i]]; } while (i != 0); } - void LsolveWithPermutationInPlace() { + void LsolveWithPermutationInPlace() const { + double *const x = x_; + # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { for (unsigned step = 0; step < numSupersteps_; step++) { @@ -426,10 +440,10 @@ class Sptrsv { const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc]; for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) { for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - x_[rowIdx] -= val_[i] * x_[colIdx_[i]]; + x[rowIdx] -= val_[i] * x[colIdx_[i]]; } - x_[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1]; + x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1]; } # pragma omp barrier @@ -437,19 +451,22 @@ class Sptrsv { } } - void LsolveWithPermutation() { + void LsolveWithPermutation() const { + double *const x = x_; + const double *const b = b_; + # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { for (unsigned step = 0; 
step < numSupersteps_; step++) { const size_t proc = static_cast(omp_get_thread_num()); const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc]; for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) { - x_[rowIdx] = b_[rowIdx]; + x[rowIdx] = b[rowIdx]; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - x_[rowIdx] -= val_[i] * x_[colIdx_[i]]; + x[rowIdx] -= val_[i] * x[colIdx_[i]]; } - x_[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1]; + x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1]; } # pragma omp barrier @@ -458,7 +475,7 @@ class Sptrsv { } void ResetX() { - EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); + const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; i++) { x_[i] = 1.0; } @@ -484,19 +501,21 @@ class Sptrsv { } } - std::size_t GetNumberOfVertices() { return instance_->NumberOfVertices(); } + std::size_t GetNumberOfVertices() const { return instance_->NumberOfVertices(); } // SSP Lsolve with staleness=2 (allowing at most one superstep of lag). // Uses FlatCheckpointCounterBarrier created internally. 
template - void SspLsolveStaleness() { + void SspLsolveStaleness() const { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); - const auto *csr = instance_->GetComputationalDag().GetCSR(); - const EigenIdxType *outer = csr->outerIndexPtr(); - const EigenIdxType *inner = csr->innerIndexPtr(); - const double *vals = csr->valuePtr(); + const auto *const csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *const outer = csr->outerIndexPtr(); + const EigenIdxType *const inner = csr->innerIndexPtr(); + const double *const vals = csr->valuePtr(); + double *const x = x_; + const double *const b = b_; # pragma omp parallel num_threads(nthreads) { @@ -513,14 +532,14 @@ class Sptrsv { const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; for (EigenIdxType node = lowerB; node <= upperB; ++node) { // Initialize solution for this node - x_[node] = b_[node]; + x[node] = b[node]; // Perform lower-triangular solve for this node for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { // Subtract contributions from previously solved nodes - x_[node] -= vals[i] * x_[inner[i]]; + x[node] -= vals[i] * x[inner[i]]; } // Divide by diagonal element to complete solve for this node - x_[node] /= vals[outer[node + 1] - 1]; + x[node] /= vals[outer[node + 1] - 1]; } } // Signal completion of this superstep. @@ -532,14 +551,15 @@ class Sptrsv { // SSP Lsolve in-place with staleness=2 (allowing at most one superstep of lag). // Uses FlatCheckpointCounterBarrier created internally. 
template - void SspLsolveStalenessInPlace() { + void SspLsolveStalenessInPlace() const { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); - const auto *csr = instance_->GetComputationalDag().GetCSR(); - const EigenIdxType *outer = csr->outerIndexPtr(); - const EigenIdxType *inner = csr->innerIndexPtr(); - const double *vals = csr->valuePtr(); + const auto *const csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *const outer = csr->outerIndexPtr(); + const EigenIdxType *const inner = csr->innerIndexPtr(); + const double *const vals = csr->valuePtr(); + double *const x = x_; # pragma omp parallel num_threads(nthreads) { @@ -558,10 +578,10 @@ class Sptrsv { // Perform lower-triangular solve for this node for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { // Subtract contributions from previously solved nodes - x_[node] -= vals[i] * x_[inner[i]]; + x[node] -= vals[i] * x[inner[i]]; } // Divide by diagonal element to complete solve for this node - x_[node] /= vals[outer[node + 1] - 1]; + x[node] /= vals[outer[node + 1] - 1]; } } // Signal completion of this superstep. @@ -573,14 +593,16 @@ class Sptrsv { // SSP Usolve with configurable staleness. // Uses FlatCheckpointCounterBarrier created internally. 
template - void SspUsolveStaleness() { + void SspUsolveStaleness() const { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); - const auto *csc = instance_->GetComputationalDag().GetCSC(); - const EigenIdxType *outer = csc->outerIndexPtr(); - const EigenIdxType *inner = csc->innerIndexPtr(); - const double *vals = csc->valuePtr(); + const auto *const csc = instance_->GetComputationalDag().GetCSC(); + const EigenIdxType *const outer = csc->outerIndexPtr(); + const EigenIdxType *const inner = csc->innerIndexPtr(); + const double *const vals = csc->valuePtr(); + double *const x = x_; + const double *const b = b_; # pragma omp parallel num_threads(nthreads) { @@ -599,11 +621,11 @@ class Sptrsv { do { node--; - x_[node] = b_[node]; + x[node] = b[node]; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - x_[node] -= vals[i] * x_[inner[i]]; + x[node] -= vals[i] * x[inner[i]]; } - x_[node] /= vals[outer[node]]; + x[node] /= vals[outer[node]]; } while (node != lowerB); } From ebfa82adbdd2f417a12a0cebf6eff8761cae621a Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 18 Mar 2026 16:05:22 +0100 Subject: [PATCH 38/57] Add localaccumulator for inner loop of sptrsv kernels --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 69 +++++++++++-------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index e02e5968..b992614b 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -249,10 +249,11 @@ class Sptrsv { const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; ++i) { x[i] = b[i]; + double acc = 0.0; for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { - x[i] -= valPtr[j] * x[inner[j]]; + acc += valPtr[j] * x[inner[j]]; } - 
x[i] /= valPtr[outer[i + 1] - 1]; + x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1]; } } @@ -269,10 +270,11 @@ class Sptrsv { do { i--; x[i] = b[i]; + double acc = 0.0; for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { - x[i] -= valPtr[j] * x[inner[j]]; + acc += valPtr[j] * x[inner[j]]; } - x[i] /= valPtr[outer[i]]; + x[i] = (x[i] - acc) / valPtr[outer[i]]; } while (i != 0); } @@ -293,10 +295,11 @@ class Sptrsv { const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; for (EigenIdxType node = lowerB; node <= upperB; ++node) { + double acc = 0.0; for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - x[node] -= valPtr[i] * x[inner[i]]; + acc += valPtr[i] * x[inner[i]]; } - x[node] /= valPtr[outer[node + 1] - 1]; + x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1]; } } # pragma omp barrier @@ -324,10 +327,11 @@ class Sptrsv { do { node--; + double acc = 0.0; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - x[node] -= valPtr[i] * x[inner[i]]; + acc += valPtr[i] * x[inner[i]]; } - x[node] /= valPtr[outer[node]]; + x[node] = (x[node] - acc) / valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -354,10 +358,11 @@ class Sptrsv { for (EigenIdxType node = lowerB; node <= upperB; ++node) { x[node] = b[node]; + double acc = 0.0; for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - x[node] -= valPtr[i] * x[inner[i]]; + acc += valPtr[i] * x[inner[i]]; } - x[node] /= valPtr[outer[node + 1] - 1]; + x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1]; } } # pragma omp barrier @@ -387,10 +392,11 @@ class Sptrsv { do { node--; x[node] = b[node]; + double acc = 0.0; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - x[node] -= valPtr[i] * x[inner[i]]; + acc += valPtr[i] * x[inner[i]]; } - x[node] /= valPtr[outer[node]]; + x[node] = (x[node] - acc) / valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -406,10 +412,11 @@ class Sptrsv { const 
EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; ++i) { + double acc = 0.0; for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { - x[i] -= valPtr[j] * x[inner[j]]; + acc += valPtr[j] * x[inner[j]]; } - x[i] /= valPtr[outer[i + 1] - 1]; + x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1]; } } @@ -423,10 +430,11 @@ class Sptrsv { EigenIdxType i = numberOfVertices; do { i--; + double acc = 0.0; for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { - x[i] -= valPtr[j] * x[inner[j]]; + acc += valPtr[j] * x[inner[j]]; } - x[i] /= valPtr[outer[i]]; + x[i] = (x[i] - acc) / valPtr[outer[i]]; } while (i != 0); } @@ -439,11 +447,12 @@ class Sptrsv { const size_t proc = static_cast(omp_get_thread_num()); const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc]; for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) { + double acc = 0.0; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - x[rowIdx] -= val_[i] * x[colIdx_[i]]; + acc += val_[i] * x[colIdx_[i]]; } - x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1]; + x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; } # pragma omp barrier @@ -462,11 +471,12 @@ class Sptrsv { const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc]; for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) { x[rowIdx] = b[rowIdx]; + double acc = 0.0; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - x[rowIdx] -= val_[i] * x[colIdx_[i]]; + acc += val_[i] * x[colIdx_[i]]; } - x[rowIdx] /= val_[rowPtr_[rowIdx + 1] - 1]; + x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; } # pragma omp barrier @@ -533,13 +543,14 @@ class Sptrsv { for (EigenIdxType node = lowerB; node <= upperB; ++node) { // Initialize solution for this node x[node] = b[node]; + double acc = 0.0; // Perform lower-triangular solve for this 
node for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - // Subtract contributions from previously solved nodes - x[node] -= vals[i] * x[inner[i]]; + // Accumulate contributions from previously solved nodes + acc += vals[i] * x[inner[i]]; } // Divide by diagonal element to complete solve for this node - x[node] /= vals[outer[node + 1] - 1]; + x[node] = (x[node] - acc) / vals[outer[node + 1] - 1]; } } // Signal completion of this superstep. @@ -575,13 +586,14 @@ class Sptrsv { EigenIdxType lowerB = boundsArrayL_[step][proc][index]; const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; for (EigenIdxType node = lowerB; node <= upperB; ++node) { + double acc = 0.0; // Perform lower-triangular solve for this node for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - // Subtract contributions from previously solved nodes - x[node] -= vals[i] * x[inner[i]]; + // Accumulate contributions from previously solved nodes + acc += vals[i] * x[inner[i]]; } // Divide by diagonal element to complete solve for this node - x[node] /= vals[outer[node + 1] - 1]; + x[node] = (x[node] - acc) / vals[outer[node + 1] - 1]; } } // Signal completion of this superstep. 
@@ -622,10 +634,11 @@ class Sptrsv { do { node--; x[node] = b[node]; + double acc = 0.0; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - x[node] -= vals[i] * x[inner[i]]; + acc += vals[i] * x[inner[i]]; } - x[node] /= vals[outer[node]]; + x[node] = (x[node] - acc) / vals[outer[node]]; } while (node != lowerB); } From 2a371ca008eb53ffa1987a0dc384ca4759ce60e5 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Mon, 23 Mar 2026 09:41:39 +0100 Subject: [PATCH 39/57] Loop Processor Permutation --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 104 +++++++++++++++++- tests/sptrsv.cpp | 14 ++- 2 files changed, 110 insertions(+), 8 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index b992614b..4e8e2f98 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -27,12 +27,14 @@ limitations under the License. # include # include # include +# include # include # include # include # include # include # include +# include # include # include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp" @@ -59,6 +61,9 @@ class Sptrsv { std::vector rowIdx_; std::vector colPtr_; + std::vector> procStepPtr_; + std::vector> procStepNum_; + std::vector> stepProcPtr_; std::vector> stepProcNum_; @@ -166,6 +171,91 @@ class Sptrsv { } } + void SetupCsrWithPermutationLoopProcessors(const BspSchedule> &schedule, std::vector &perm) { + const auto *const csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *const outer = csr->outerIndexPtr(); + const EigenIdxType *const inner = csr->innerIndexPtr(); + const double *const values = csr->valuePtr(); + + const SparseMatrixImp &graph = instance_->GetComputationalDag(); + assert(static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) <= static_cast(std::numeric_limits::max())); + const unsigned numVert = static_cast(graph.NumVertices()); 
+ numSupersteps_ = schedule.NumberOfSupersteps(); + const unsigned numProcs = instance_->NumberOfProcessors(); + + perm = std::vector(numVert, 0U); + + val_ = std::vector(static_cast(csr->nonZeros())); + colIdx_ = std::vector(static_cast(csr->nonZeros())); + rowPtr_ = std::vector(numVert + 1U, 0U); + + procStepPtr_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + procStepNum_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + + for (const auto vert : graph.Vertices()) { + const unsigned whichStep = schedule.AssignedSuperstep(vert); + const unsigned whichProc = schedule.AssignedProcessor(vert); + + perm[vert] = procStepNum_[whichProc][whichStep]++; // offsets + } + + unsigned accNode = 0U; + for (unsigned step = 0U; step < numSupersteps_; ++step) { + for (unsigned proc = 0U; proc < numProcs; ++proc) { + procStepPtr_[proc][step] = accNode; + accNode += procStepNum_[proc][step]; + } + } + + for (const auto vert : graph.Vertices()) { + perm[vert] += procStepPtr_[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)]; + } + + std::vector> entryAccumulation = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + + for (const auto vert : graph.Vertices()) { + const unsigned whichStep = schedule.AssignedSuperstep(vert); + const unsigned whichProc = schedule.AssignedProcessor(vert); + + rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep]; + entryAccumulation[whichProc][whichStep] += static_cast(graph.InDegree(vert)) + 1; + } + + unsigned accEntry = 0U; + for (unsigned step = 0U; step < numSupersteps_; ++step) { + for (unsigned proc = 0U; proc < numProcs; ++proc) { + unsigned temp = entryAccumulation[proc][step]; + entryAccumulation[proc][step] = accEntry; + accEntry += temp; + } + } + rowPtr_[numVert] = accEntry; + assert(static_cast(accEntry) == static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) ); + + for (const auto vert : graph.Vertices()) { + rowPtr_[perm[vert]] += 
entryAccumulation[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)]; + } + + for (const auto vert : graph.Vertices()) { + std::vector> parents; + parents.reserve(graph.InDegree(vert)); + for (EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) { + parents.emplace_back(perm[static_cast(inner[edge])], static_cast(edge)); + } + std::sort(parents.begin(), parents.end()); + + const unsigned permVert = perm[vert]; + UVertType location = rowPtr_[permVert]; + for (const auto [permPar, edgeIdx] : parents) { + colIdx_[location] = permPar; + val_[location] = values[edgeIdx]; + ++location; + } + colIdx_[location] = permVert; + val_[location] = values[outer[vert + 1] - 1]; + } + } + void SetupCsrWithPermutation(const BspSchedule> &schedule, std::vector &perm) { std::vector permInv(perm.size()); for (size_t i = 0; i < perm.size(); i++) { @@ -443,10 +533,10 @@ class Sptrsv { # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { + const size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; step++) { - const size_t proc = static_cast(omp_get_thread_num()); - const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc]; - for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) { + const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; + for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { double acc = 0.0; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { acc += val_[i] * x[colIdx_[i]]; @@ -491,12 +581,14 @@ class Sptrsv { } } - void PermuteXVector(const std::vector &perm) { + template + void PermuteXVector(const std::vector &perm) { + static_assert(std::is_integral_v); std::vector vecPerm(perm.size()); - for (size_t i = 0; i < perm.size(); i++) { + for (IntegralType i = 0; i < perm.size(); i++) { vecPerm[i] = x_[perm[i]]; } - for (size_t i = 0; i < perm.size(); i++) { + 
for (IntegralType i = 0; i < perm.size(); i++) { x_[i] = vecPerm[i]; } } diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp index 59605ae8..d11ffc77 100644 --- a/tests/sptrsv.cpp +++ b/tests/sptrsv.cpp @@ -227,8 +227,18 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) { BOOST_CHECK(CompareVectors(uXRef, uXOsp)); // Lsolve in-place With PERMUTATION - std::vector perm = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS); - sim.SetupCsrWithPermutation(scheduleCs, perm); + std::vector perm;// = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS); + sim.SetupCsrWithPermutationLoopProcessors(scheduleCs, perm); + std::vector permCheck(graph.NumVertices(), false); + BOOST_CHECK_EQUAL(permCheck.size(), perm.size()); + for (const auto vert : graph.Vertices()) { + BOOST_CHECK(not permCheck[perm[vert]]); + permCheck[perm[vert]] = true; + } + for (const bool val : permCheck) { + BOOST_CHECK(val); + } + // Comparisson with osp serial in place L solve // Eigen From 57c8365f2606795bc38064ca7d25d282f6ebabfc Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Mon, 23 Mar 2026 10:35:28 +0100 Subject: [PATCH 40/57] Processor First Permutation --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 115 ++++++++++++++++++ tests/sptrsv.cpp | 31 +++++ 2 files changed, 146 insertions(+) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 4e8e2f98..21d2ed29 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -64,6 +64,8 @@ class Sptrsv { std::vector> procStepPtr_; std::vector> procStepNum_; + std::vector procFirstStepPtr_; + std::vector> stepProcPtr_; std::vector> stepProcNum_; @@ -256,6 +258,95 @@ class Sptrsv { } } + void SetupCsrWithPermutationProcessorsFirst(const BspSchedule> &schedule, std::vector &perm) { + const auto *const csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *const outer = csr->outerIndexPtr(); + const EigenIdxType 
*const inner = csr->innerIndexPtr(); + const double *const values = csr->valuePtr(); + + const SparseMatrixImp &graph = instance_->GetComputationalDag(); + assert(static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) <= static_cast(std::numeric_limits::max())); + const unsigned numVert = static_cast(graph.NumVertices()); + numSupersteps_ = schedule.NumberOfSupersteps(); + const unsigned numProcs = instance_->NumberOfProcessors(); + + perm = std::vector(numVert, 0U); + + val_ = std::vector(static_cast(csr->nonZeros())); + colIdx_ = std::vector(static_cast(csr->nonZeros())); + rowPtr_ = std::vector(numVert + 1U, 0U); + + procFirstStepPtr_ = std::vector(0U); + procFirstStepPtr_.reserve(numProcs + numSupersteps_ + 1U); + + procStepNum_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + + for (const auto vert : graph.Vertices()) { + const unsigned whichStep = schedule.AssignedSuperstep(vert); + const unsigned whichProc = schedule.AssignedProcessor(vert); + + perm[vert] = procStepNum_[whichProc][whichStep]++; // offsets + } + + unsigned accNode = 0U; + for (unsigned proc = 0U; proc < numProcs; ++proc) { + for (unsigned step = 0U; step < numSupersteps_; ++step) { + procFirstStepPtr_.emplace_back(accNode); + accNode += procStepNum_[proc][step]; + } + } + procFirstStepPtr_.emplace_back(accNode); + + + for (const auto vert : graph.Vertices()) { + perm[vert] += procFirstStepPtr_[schedule.AssignedProcessor(vert) * numSupersteps_ + schedule.AssignedSuperstep(vert)]; + } + + std::vector> entryAccumulation = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + + for (const auto vert : graph.Vertices()) { + const unsigned whichStep = schedule.AssignedSuperstep(vert); + const unsigned whichProc = schedule.AssignedProcessor(vert); + + rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep]; + entryAccumulation[whichProc][whichStep] += static_cast(graph.InDegree(vert)) + 1; + } + + unsigned accEntry = 0U; + for (unsigned proc = 0U; proc < numProcs; 
++proc) { + for (unsigned step = 0U; step < numSupersteps_; ++step) { + unsigned temp = entryAccumulation[proc][step]; + entryAccumulation[proc][step] = accEntry; + accEntry += temp; + } + } + rowPtr_[numVert] = accEntry; + assert(static_cast(accEntry) == static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) ); + + for (const auto vert : graph.Vertices()) { + rowPtr_[perm[vert]] += entryAccumulation[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)]; + } + + for (const auto vert : graph.Vertices()) { + std::vector> parents; + parents.reserve(graph.InDegree(vert)); + for (EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) { + parents.emplace_back(perm[static_cast(inner[edge])], static_cast(edge)); + } + std::sort(parents.begin(), parents.end()); + + const unsigned permVert = perm[vert]; + UVertType location = rowPtr_[permVert]; + for (const auto [permPar, edgeIdx] : parents) { + colIdx_[location] = permPar; + val_[location] = values[edgeIdx]; + ++location; + } + colIdx_[location] = permVert; + val_[location] = values[outer[vert + 1] - 1]; + } + } + void SetupCsrWithPermutation(const BspSchedule> &schedule, std::vector &perm) { std::vector permInv(perm.size()); for (size_t i = 0; i < perm.size(); i++) { @@ -545,6 +636,30 @@ class Sptrsv { x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; } +# pragma omp barrier + } + } + } + + void LsolveWithProcFirstPermutationInPlace() const { + double *const x = x_; + +# pragma omp parallel num_threads(instance_->NumberOfProcessors()) + { + const unsigned proc = static_cast(omp_get_thread_num()); + const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_); + for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); stepPtr != endStepPtr;) { + UVertType rowIdx = *stepPtr; + const UVertType endRowIdx = *(++stepPtr); + for (; rowIdx != endRowIdx; ++rowIdx) { + double acc = 0.0; + for (UVertType i = rowPtr_[rowIdx]; i < 
rowPtr_[rowIdx + 1] - 1; i++) { + acc += val_[i] * x[colIdx_[i]]; + } + + x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; + } + # pragma omp barrier } } diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp index d11ffc77..034fc271 100644 --- a/tests/sptrsv.cpp +++ b/tests/sptrsv.cpp @@ -255,6 +255,37 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) { sim.PermuteXVector(perm); BOOST_CHECK(CompareVectors(lXRef, lXOsp)); + + + + + sim.SetupCsrWithPermutationProcessorsFirst(scheduleCs, perm); + permCheck = std::vector(graph.NumVertices(), false); + BOOST_CHECK_EQUAL(permCheck.size(), perm.size()); + for (const auto vert : graph.Vertices()) { + BOOST_CHECK(not permCheck[perm[vert]]); + permCheck[perm[vert]] = true; + } + for (const bool val : permCheck) { + BOOST_CHECK(val); + } + + + // Comparisson with osp serial in place L solve + // Eigen + lBRef.setConstant(0.1); + lXRef.setConstant(0.1); + lXRef = lView.solve(lBRef); + // OSP + lXOsp.setConstant(0.1); + lBOsp.setZero(); // this will not be used as x will take the values that already has instead of the b values + sim.x_ = &lXOsp[0]; + sim.b_ = &lBOsp[0]; + // sim.permute_x_vector(perm); + sim.LsolveWithProcFirstPermutationInPlace(); + + sim.PermuteXVector(perm); + BOOST_CHECK(CompareVectors(lXRef, lXOsp)); } #endif From 8bd04d2a33363cb452dea6c73d11eb8d81b3cc6a Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Mon, 23 Mar 2026 11:28:35 +0100 Subject: [PATCH 41/57] Loop Processor SSP kernel --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 38 +++++++++++++++- tests/sptrsv.cpp | 44 +++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 21d2ed29..656c8fe7 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -673,8 +673,8 @@ class Sptrsv { { for (unsigned step = 0; step < numSupersteps_; step++) { const size_t 
proc = static_cast(omp_get_thread_num()); - const UVertType upperLimit = stepProcPtr_[step][proc] + stepProcNum_[step][proc]; - for (UVertType rowIdx = stepProcPtr_[step][proc]; rowIdx < upperLimit; rowIdx++) { + const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; + for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { x[rowIdx] = b[rowIdx]; double acc = 0.0; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { @@ -689,6 +689,40 @@ class Sptrsv { } } + template + void SspLsolveStalenessInPlaceWithPermutation() const { + const unsigned nthreads = instance_->NumberOfProcessors(); + FlatCheckpointCounterBarrier barrier(nthreads); + + const auto *const csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *const outer = csr->outerIndexPtr(); + const EigenIdxType *const inner = csr->innerIndexPtr(); + const double *const vals = csr->valuePtr(); + double *const x = x_; + +# pragma omp parallel num_threads(nthreads) + { + const std::size_t proc = static_cast(omp_get_thread_num()); + for (unsigned step = 0; step < numSupersteps_; ++step) { + if (procStepNum_[proc][step] > 0U) { + barrier.Wait(proc, staleness - 1U); + } + + const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; + for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { + double acc = 0.0; + for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { + acc += val_[i] * x[colIdx_[i]]; + } + + x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; + } + // Signal completion of this superstep. 
+ barrier.Arrive(proc); + } + } + } + void ResetX() { const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; i++) { diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp index 034fc271..e85c7a9c 100644 --- a/tests/sptrsv.cpp +++ b/tests/sptrsv.cpp @@ -37,6 +37,8 @@ limitations under the License. # include "osp/graph_algorithms/directed_graph_util.hpp" # include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" +# include "osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp" + using namespace osp; bool CompareVectors(Eigen::VectorXd &v1, Eigen::VectorXd &v2) { @@ -286,6 +288,48 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) { sim.PermuteXVector(perm); BOOST_CHECK(CompareVectors(lXRef, lXOsp)); + + + + + + GrowLocalSSP, 2U> schedulerSSP; + MaxBspSchedule> scheduleSSP(instance); + + schedulerSSP.ComputeSchedule(scheduleSSP); + + sim.SetupCsrWithPermutationLoopProcessors(scheduleSSP, perm); + permCheck = std::vector(graph.NumVertices(), false); + BOOST_CHECK_EQUAL(permCheck.size(), perm.size()); + for (const auto vert : graph.Vertices()) { + BOOST_CHECK(not permCheck[perm[vert]]); + permCheck[perm[vert]] = true; + } + for (const bool val : permCheck) { + BOOST_CHECK(val); + } + + + // Comparisson with osp serial in place L solve + // Eigen + lBRef.setConstant(0.1); + lXRef.setConstant(0.1); + lXRef = lView.solve(lBRef); + // OSP + lXOsp.setConstant(0.1); + lBOsp.setZero(); // this will not be used as x will take the values that already has instead of the b values + sim.x_ = &lXOsp[0]; + sim.b_ = &lBOsp[0]; + // sim.permute_x_vector(perm); + sim.SspLsolveStalenessInPlaceWithPermutation<2U>(); + + sim.PermuteXVector(perm); + BOOST_CHECK(CompareVectors(lXRef, lXOsp)); + + + + + } #endif From c7eef33c9db2103b524cfa8e5ef9289df3222e1e Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Mon, 23 Mar 2026 11:35:39 +0100 Subject: [PATCH 42/57] SSP Proc first SpTrSV kernels and tests --- 
.../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 39 ++++++++++++++++++- tests/sptrsv.cpp | 28 ++++++++++++- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 656c8fe7..d7deb63b 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -690,7 +690,7 @@ class Sptrsv { } template - void SspLsolveStalenessInPlaceWithPermutation() const { + void SspLsolveStalenessWithPermutationInPlace() const { const unsigned nthreads = instance_->NumberOfProcessors(); FlatCheckpointCounterBarrier barrier(nthreads); @@ -723,6 +723,43 @@ class Sptrsv { } } + template + void SspLsolveStalenessWithProcFirstPermutationInPlace() const { + const unsigned nthreads = instance_->NumberOfProcessors(); + FlatCheckpointCounterBarrier barrier(nthreads); + + const auto *const csr = instance_->GetComputationalDag().GetCSR(); + const EigenIdxType *const outer = csr->outerIndexPtr(); + const EigenIdxType *const inner = csr->innerIndexPtr(); + const double *const vals = csr->valuePtr(); + double *const x = x_; + +# pragma omp parallel num_threads(nthreads) + { + const unsigned proc = static_cast(omp_get_thread_num()); + const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_); + for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); stepPtr != endStepPtr;) { + UVertType rowIdx = *stepPtr; + const UVertType endRowIdx = *(++stepPtr); + + if (rowIdx != endRowIdx) { + barrier.Wait(proc, staleness - 1U); + } + + for (; rowIdx != endRowIdx; ++rowIdx) { + double acc = 0.0; + for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { + acc += val_[i] * x[colIdx_[i]]; + } + + x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; + } + // Signal completion of this superstep. 
+ barrier.Arrive(proc); + } + } + } + void ResetX() { const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); for (EigenIdxType i = 0; i < numberOfVertices; i++) { diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp index e85c7a9c..355a36d7 100644 --- a/tests/sptrsv.cpp +++ b/tests/sptrsv.cpp @@ -321,7 +321,7 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) { sim.x_ = &lXOsp[0]; sim.b_ = &lBOsp[0]; // sim.permute_x_vector(perm); - sim.SspLsolveStalenessInPlaceWithPermutation<2U>(); + sim.SspLsolveStalenessWithPermutationInPlace<2U>(); sim.PermuteXVector(perm); BOOST_CHECK(CompareVectors(lXRef, lXOsp)); @@ -329,7 +329,33 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) { + sim.SetupCsrWithPermutationProcessorsFirst(scheduleSSP, perm); + permCheck = std::vector(graph.NumVertices(), false); + BOOST_CHECK_EQUAL(permCheck.size(), perm.size()); + for (const auto vert : graph.Vertices()) { + BOOST_CHECK(not permCheck[perm[vert]]); + permCheck[perm[vert]] = true; + } + for (const bool val : permCheck) { + BOOST_CHECK(val); + } + + + // Comparisson with osp serial in place L solve + // Eigen + lBRef.setConstant(0.1); + lXRef.setConstant(0.1); + lXRef = lView.solve(lBRef); + // OSP + lXOsp.setConstant(0.1); + lBOsp.setZero(); // this will not be used as x will take the values that already has instead of the b values + sim.x_ = &lXOsp[0]; + sim.b_ = &lBOsp[0]; + // sim.permute_x_vector(perm); + sim.SspLsolveStalenessWithProcFirstPermutationInPlace<2U>(); + sim.PermuteXVector(perm); + BOOST_CHECK(CompareVectors(lXRef, lXOsp)); } #endif From d0a974cb4f45e23de9388b2774f8266ee5b90381 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Mon, 23 Mar 2026 14:33:23 +0100 Subject: [PATCH 43/57] update ssp bench with permutation --- apps/maxbsp_ssp_sptrsv.cpp | 185 +++++++++++++++++++++++++++++++++++-- 1 file changed, 178 insertions(+), 7 deletions(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 4ad3565e..70b99822 100644 --- 
a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -51,7 +51,10 @@ constexpr int preMeasureIterations = 2; enum class Algorithm { VarianceSsp, GrowLocalSsp, + GrowLocalSspPermSteps, + GrowLocalSspPermProcs, GrowLocal, + GrowLocalPermSteps, Serial }; @@ -135,11 +138,13 @@ double LInftyNormalisedDiff(const std::vector &v, const std::vector [--output ] [--iterations ] [--processors

]\n" - << " [--variance-ssp] [--growlocal-ssp] [--growlocal] [--eigen-serial] [--all]\n\n" + << " [--variance-ssp] [--growlocal-ssp] [--growlocal-ssp-perm-step] [--growlocal-ssp-perm-proc] [--growlocal] " + "[--growlocal-perm-step] [--eigen-serial] [--all]\n\n" << "Examples:\n" << " " << prog << " --input ../data/mtx_tests/ErdosRenyi_2k_14k_A.mtx --all\n" << " " << prog - << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp --growlocal\n"; + << " --input ../data/mtx_tests --output bench.csv --iterations 100 --processors 16 --variance-ssp --growlocal-ssp " + "--growlocal\n"; } bool ParseArgs(int argc, char *argv[], Args &args) { @@ -150,8 +155,7 @@ bool ParseArgs(int argc, char *argv[], Args &args) { for (int i = 1; i < argc; ++i) { const std::string flag = argv[i]; - const bool needsValue - = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors"); + const bool needsValue = (flag == "--input" || flag == "--output" || flag == "--iterations" || flag == "--processors"); if (needsValue && i + 1 >= argc) { std::cerr << "Missing value for " << flag << "\n"; return false; @@ -169,12 +173,24 @@ bool ParseArgs(int argc, char *argv[], Args &args) { args.algorithms.insert(Algorithm::VarianceSsp); } else if (flag == "--growlocal-ssp") { args.algorithms.insert(Algorithm::GrowLocalSsp); + } else if (flag == "--growlocal-ssp-perm-step") { + args.algorithms.insert(Algorithm::GrowLocalSspPermSteps); + } else if (flag == "--growlocal-ssp-perm-proc") { + args.algorithms.insert(Algorithm::GrowLocalSspPermProcs); } else if (flag == "--growlocal") { args.algorithms.insert(Algorithm::GrowLocal); + } else if (flag == "--growlocal-perm-step") { + args.algorithms.insert(Algorithm::GrowLocalPermSteps); } else if (flag == "--eigen-serial") { args.algorithms.insert(Algorithm::Serial); } else if (flag == "--all") { - args.algorithms = {Algorithm::VarianceSsp, Algorithm::GrowLocalSsp, 
Algorithm::GrowLocal, Algorithm::Serial}; + args.algorithms = {Algorithm::VarianceSsp, + Algorithm::GrowLocalSsp, + Algorithm::GrowLocalSspPermProcs, + Algorithm::GrowLocalSspPermSteps, + Algorithm::GrowLocal, + Algorithm::GrowLocalPermSteps, + Algorithm::Serial}; } else if (flag == "--help" || flag == "-h") { PrintUsage(argv[0]); return false; @@ -241,7 +257,8 @@ std::vector CollectInputGraphs(const std::string &inputPa } void EnsureCsvHeader(std::ofstream &csv) { - csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness,RuntimeSeconds,Correctness\n"; + csv << "Graph,Algorithm,Processors,ScheduleTimeSeconds,ScheduleSupersteps,SynchronizationCosts,Staleness,RuntimeSeconds," + "Correctness\n"; } void EnsureSummaryCsvHeader(std::ofstream &csv) { @@ -251,7 +268,8 @@ void EnsureSummaryCsvHeader(std::ofstream &csv) { void WriteCsvRow(std::ofstream &csv, const CsvRow &row) { csv << CsvEscape(row.graph) << "," << row.algorithm << "," << row.processors << "," << row.scheduleTimeSeconds << "," - << row.supersteps << "," << row.SyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "," << row.correctness << "\n"; + << row.supersteps << "," << row.SyncCosts << "," << row.staleness << "," << row.runtimeSeconds << "," << row.correctness + << "\n"; } std::string BuildSummaryCsvPath(const std::string &detailPath) { @@ -468,6 +486,108 @@ int main(int argc, char *argv[]) { } } + if (args.algorithms.count(Algorithm::GrowLocalSspPermSteps) > 0U) { + GrowLocalSSP, kDefaultStaleness> scheduler; + MaxBspSchedule> schedule(instance); + + const auto t0 = std::chrono::high_resolution_clock::now(); + scheduler.ComputeSchedule(schedule); + const auto t1 = std::chrono::high_resolution_clock::now(); + const double scheduleTime = std::chrono::duration(t1 - t0).count(); + + std::vector perm; + sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm); + const unsigned supersteps = schedule.NumberOfSupersteps(); + const int syncCosts 
= ComputeSyncCosts(instance); + + bool correct = false; + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); + for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { + resetOnes(x); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.SspLsolveStalenessWithPermutationInPlace(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter == 0) { + sptrsv.PermuteXVector(perm); + const double diff = LInftyNormalisedDiff(x, serialRefX); + correct = (diff < EPSILON); + std::cout << " Growlocal_SSP_Perm_Step first-run max relative diff vs serial: " << diff << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Growlocal_SSP_Perm_Step", + args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtime, + correct}); + } + } + + for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { + WriteCsvRow(csv, *it); + ++writtenEntries; + } + } + + if (args.algorithms.count(Algorithm::GrowLocalSspPermProcs) > 0U) { + GrowLocalSSP, kDefaultStaleness> scheduler; + MaxBspSchedule> schedule(instance); + + const auto t0 = std::chrono::high_resolution_clock::now(); + scheduler.ComputeSchedule(schedule); + const auto t1 = std::chrono::high_resolution_clock::now(); + const double scheduleTime = std::chrono::duration(t1 - t0).count(); + + std::vector perm; + sptrsv.SetupCsrWithPermutationProcessorsFirst(schedule, perm); + const unsigned supersteps = schedule.NumberOfSupersteps(); + const int syncCosts = ComputeSyncCosts(instance); + + bool correct = false; + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); + for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { + resetOnes(x); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.SspLsolveStalenessWithProcFirstPermutationInPlace(); + const auto e = 
std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter == 0) { + sptrsv.PermuteXVector(perm); + const double diff = LInftyNormalisedDiff(x, serialRefX); + correct = (diff < EPSILON); + std::cout << " Growlocal_SSP_Perm_Proc first-run max relative diff vs serial: " << diff << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Growlocal_SSP_Perm_Proc", + args.processors, + scheduleTime, + supersteps, + syncCosts, + kDefaultStaleness, + runtime, + correct}); + } + } + + for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { + WriteCsvRow(csv, *it); + ++writtenEntries; + } + } + if (args.algorithms.count(Algorithm::GrowLocal) > 0U) { GrowLocalAutoCores> scheduler; BspSchedule> schedule(instance); @@ -517,6 +637,57 @@ int main(int argc, char *argv[]) { } } + if (args.algorithms.count(Algorithm::GrowLocalPermSteps) > 0U) { + GrowLocalAutoCores> scheduler; + BspSchedule> schedule(instance); + + const auto t0 = std::chrono::high_resolution_clock::now(); + scheduler.ComputeSchedule(schedule); + const auto t1 = std::chrono::high_resolution_clock::now(); + const double scheduleTime = std::chrono::duration(t1 - t0).count(); + + std::vector perm; + sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm); + const unsigned supersteps = schedule.NumberOfSupersteps(); + const int syncCosts = ComputeSyncCosts(instance); + + bool correct; + std::vector x(n, 1.0); + sptrsv.x_ = x.data(); + for (int iter = 0; iter < args.iterations + preMeasureIterations; ++iter) { + resetOnes(x); + + const auto s = std::chrono::high_resolution_clock::now(); + sptrsv.LsolveWithPermutationInPlace(); + const auto e = std::chrono::high_resolution_clock::now(); + const double runtime = std::chrono::duration(e - s).count(); + + if (iter == 0) { + sptrsv.PermuteXVector(perm); + const double diff = LInftyNormalisedDiff(x, serialRefX); + correct 
= (diff < EPSILON); + std::cout << " Growlocal_Perm_Step first-run max relative diff vs serial: " << diff << std::endl; + } + + if (iter >= preMeasureIterations) { + bufferedRows.emplace_back(CsvRow{graphName, + "Growlocal_Perm_Step", + args.processors, + scheduleTime, + supersteps, + syncCosts, + 1U, + runtime, + correct}); + } + } + + for (auto it = std::next(bufferedRows.cbegin(), writtenEntries); it != bufferedRows.cend(); ++it) { + WriteCsvRow(csv, *it); + ++writtenEntries; + } + } + if (args.algorithms.count(Algorithm::Serial) > 0U) { std::vector x(n, 1.0); sptrsv.x_ = x.data(); From f167feb7bc568eb765cd523711f0df407b6e2eb7 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Mon, 23 Mar 2026 15:15:46 +0100 Subject: [PATCH 44/57] missing references --- include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index d7deb63b..b221918f 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -248,7 +248,7 @@ class Sptrsv { const unsigned permVert = perm[vert]; UVertType location = rowPtr_[permVert]; - for (const auto [permPar, edgeIdx] : parents) { + for (const auto &[permPar, edgeIdx] : parents) { colIdx_[location] = permPar; val_[location] = values[edgeIdx]; ++location; @@ -337,7 +337,7 @@ class Sptrsv { const unsigned permVert = perm[vert]; UVertType location = rowPtr_[permVert]; - for (const auto [permPar, edgeIdx] : parents) { + for (const auto &[permPar, edgeIdx] : parents) { colIdx_[location] = permPar; val_[location] = values[edgeIdx]; ++location; From 2298bb789b5f8eee55962087b2a5a9386b20c2df Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 24 Mar 2026 14:50:27 +0100 Subject: [PATCH 45/57] Sparse kernels progress --- apps/maxbsp_ssp_sptrsv.cpp | 6 +- .../StatsModules/BspSptrsvStatsModule.hpp | 4 +- 
.../StringToScheduler/get_coarser.hpp | 2 +- .../sptrsv_simulator/ScheduleNodePermuter.hpp | 16 +- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 227 ++++++------------ .../sptrsv_simulator/sptrsv_kernels.hpp | 160 ++++++++++++ include/osp/bsp/model/BspSchedule.hpp | 4 +- .../GreedySchedulers/GrowLocalMaxBsp.hpp | 2 +- .../GreedySchedulers/RandomGreedy.hpp | 2 +- include/osp/coarser/Sarkar/SarkarMul.hpp | 4 +- include/osp/coarser/SquashA/SquashAMul.hpp | 2 +- include/osp/coarser/coarser_util.hpp | 4 +- .../coarser/top_order/top_order_coarser.hpp | 10 +- .../osp/concepts/directed_graph_concept.hpp | 12 +- .../osp/graph_algorithms/cuthill_mckee.hpp | 2 +- .../directed_graph_path_util.hpp | 2 +- .../eigen_sparse_iterator.hpp | 17 +- .../eigen_matrix_adapter/sparse_matrix.hpp | 13 +- tests/sparse_matrix_impl.cpp | 12 +- tests/sptrsv.cpp | 2 +- 20 files changed, 302 insertions(+), 201 deletions(-) create mode 100644 include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index 70b99822..a15e24f5 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -495,7 +495,7 @@ int main(int argc, char *argv[]) { const auto t1 = std::chrono::high_resolution_clock::now(); const double scheduleTime = std::chrono::duration(t1 - t0).count(); - std::vector perm; + std::vector::VertexIdx> perm; sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm); const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); @@ -546,7 +546,7 @@ int main(int argc, char *argv[]) { const auto t1 = std::chrono::high_resolution_clock::now(); const double scheduleTime = std::chrono::duration(t1 - t0).count(); - std::vector perm; + std::vector::VertexIdx> perm; sptrsv.SetupCsrWithPermutationProcessorsFirst(schedule, perm); const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); @@ -646,7 +646,7 @@ int main(int 
argc, char *argv[]) { const auto t1 = std::chrono::high_resolution_clock::now(); const double scheduleTime = std::chrono::duration(t1 - t0).count(); - std::vector perm; + std::vector::VertexIdx> perm; sptrsv.SetupCsrWithPermutationLoopProcessors(schedule, perm); const unsigned supersteps = schedule.NumberOfSupersteps(); const int syncCosts = ComputeSyncCosts(instance); diff --git a/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp b/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp index 969bc114..79bf53ae 100644 --- a/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp +++ b/apps/test_suite_runner/StatsModules/BspSptrsvStatsModule.hpp @@ -93,11 +93,13 @@ class BspSptrsvStatsModule : public IStatisticModule { || std::is_same_v>>) { using IndexT = std::conditional_t>>, int32_t, int64_t>; + using UndexT + = std::conditional_t>>, uint32_t, uint64_t>; auto instance = schedule.GetInstance(); Sptrsv sim{instance}; - std::vector perm; + std::vector perm; if (mode_ == NO_PERMUTE) { sim.SetupCsrNoPermutation(schedule); diff --git a/apps/test_suite_runner/StringToScheduler/get_coarser.hpp b/apps/test_suite_runner/StringToScheduler/get_coarser.hpp index e0455870..9d60b8d6 100644 --- a/apps/test_suite_runner/StringToScheduler/get_coarser.hpp +++ b/apps/test_suite_runner/StringToScheduler/get_coarser.hpp @@ -88,7 +88,7 @@ std::unique_ptr> GetCoarserByName(const ConfigParse .value_or(std::numeric_limits>::max())); coarserPtr->SetCommunicationThreshold(paramsPt.get_optional>("communication_threshold") .value_or(std::numeric_limits>::max())); - coarserPtr->SetSuperNodeSizeThreshold(paramsPt.get_optional("super_node_size_threshold").value_or(10)); + coarserPtr->SetSuperNodeSizeThreshold(paramsPt.get_optional>("super_node_size_threshold").value_or(10)); coarserPtr->SetNodeDistThreshold(paramsPt.get_optional("node_dist_threshold").value_or(10)); } }; diff --git a/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp 
b/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp index 3378c5b9..b5f8e577 100644 --- a/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/ScheduleNodePermuter.hpp @@ -34,25 +34,25 @@ enum ScheduleNodePermutationModes { LOOP_PROCESSORS, SNAKE_PROCESSORS, PROCESSOR * * @param sched BSP Schedule * @param mode ordering of processors - * @return std::vector vec[prev_node_name] = new_node_name(location) + * @return std::vector> vec[prev_node_name] = new_node_name(location) */ template -std::vector ScheduleNodePermuterBasic(const BspSchedule &sched, - const ScheduleNodePermutationModes mode = LOOP_PROCESSORS) { +std::vector> ScheduleNodePermuterBasic(const BspSchedule &sched, + const ScheduleNodePermutationModes mode = LOOP_PROCESSORS) { // superstep, processor, nodes - std::vector>> allocation( + std::vector>>> allocation( sched.NumberOfSupersteps(), - std::vector>(sched.GetInstance().NumberOfProcessors(), std::vector({}))); - for (size_t node = 0; node < sched.GetInstance().NumberOfVertices(); node++) { + std::vector>>(sched.GetInstance().NumberOfProcessors(), std::vector>({}))); + for (VertexIdxT node = 0; node < sched.GetInstance().NumberOfVertices(); node++) { allocation[sched.AssignedSuperstep(node)][sched.AssignedProcessor(node)].emplace_back(node); } // reordering and allocating into permutation - std::vector permutation(sched.GetInstance().NumberOfVertices()); + std::vector> permutation(sched.GetInstance().NumberOfVertices()); if (mode == LOOP_PROCESSORS || mode == SNAKE_PROCESSORS) { bool forward = true; - size_t counter = 0; + VertexIdxT counter = 0; for (auto stepIt = allocation.begin(); stepIt != allocation.cend(); stepIt++) { if (forward) { for (auto procIt = stepIt->begin(); procIt != stepIt->cend(); procIt++) { diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index b221918f..6de9a89e 100644 --- 
a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -41,6 +41,7 @@ limitations under the License. # include "osp/bsp/model/BspInstance.hpp" # include "osp/bsp/model/BspSchedule.hpp" # include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" +# include "osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp" namespace osp { @@ -61,13 +62,13 @@ class Sptrsv { std::vector rowIdx_; std::vector colPtr_; - std::vector> procStepPtr_; - std::vector> procStepNum_; + std::vector> procStepPtr_; + std::vector> procStepNum_; - std::vector procFirstStepPtr_; + std::vector procFirstStepPtr_; - std::vector> stepProcPtr_; - std::vector> stepProcNum_; + std::vector> stepProcPtr_; + std::vector> stepProcNum_; double *x_; const double *b_; @@ -98,14 +99,14 @@ class Sptrsv { schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); numSupersteps_ = schedule.NumberOfSupersteps(); - size_t numberOfVertices = instance_->GetComputationalDag().NumVertices(); + UVertType numberOfVertices = instance_->GetComputationalDag().NumVertices(); # pragma omp parallel num_threads(2) { int id = omp_get_thread_num(); switch (id) { case 0: { - for (size_t node = 0; node < numberOfVertices; ++node) { + for (UVertType node = 0; node < numberOfVertices; ++node) { vectorStepProcessorVertices_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back( static_cast(node)); } @@ -116,7 +117,7 @@ class Sptrsv { EigenIdxType start = vectorStepProcessorVertices_[step][proc][0]; EigenIdxType prev = vectorStepProcessorVertices_[step][proc][0]; - for (size_t i = 1; i < vectorStepProcessorVertices_[step][proc].size(); ++i) { + for (UVertType i = 1; i < vectorStepProcessorVertices_[step][proc].size(); ++i) { if (vectorStepProcessorVertices_[step][proc][i] != prev + 1) { boundsArrayL_[step][proc].push_back(start); boundsArrayL_[step][proc].push_back(prev); @@ -134,7 +135,7 @@ class Sptrsv { break; 
} case 1: { - size_t node = numberOfVertices; + UVertType node = numberOfVertices; do { node--; vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back( @@ -149,7 +150,7 @@ class Sptrsv { EigenIdxType startU = static_cast(vectorStepProcessorVerticesU_[step][proc][0]); EigenIdxType prevU = static_cast(vectorStepProcessorVerticesU_[step][proc][0]); - for (size_t i = 1; i < vectorStepProcessorVerticesU_[step][proc].size(); ++i) { + for (UVertType i = 1; i < vectorStepProcessorVerticesU_[step][proc].size(); ++i) { if (static_cast(vectorStepProcessorVerticesU_[step][proc][i]) != prevU - 1) { boundsArrayU_[step][proc].push_back(startU); boundsArrayU_[step][proc].push_back(prevU); @@ -173,26 +174,26 @@ class Sptrsv { } } - void SetupCsrWithPermutationLoopProcessors(const BspSchedule> &schedule, std::vector &perm) { + void SetupCsrWithPermutationLoopProcessors(const BspSchedule> &schedule, std::vector &perm) { const auto *const csr = instance_->GetComputationalDag().GetCSR(); const EigenIdxType *const outer = csr->outerIndexPtr(); const EigenIdxType *const inner = csr->innerIndexPtr(); const double *const values = csr->valuePtr(); const SparseMatrixImp &graph = instance_->GetComputationalDag(); - assert(static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) <= static_cast(std::numeric_limits::max())); - const unsigned numVert = static_cast(graph.NumVertices()); + assert(static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) <= static_cast(std::numeric_limits::max())); + const UVertType numVert = static_cast(graph.NumVertices()); numSupersteps_ = schedule.NumberOfSupersteps(); const unsigned numProcs = instance_->NumberOfProcessors(); - perm = std::vector(numVert, 0U); + perm = std::vector(numVert, 0U); - val_ = std::vector(static_cast(csr->nonZeros())); - colIdx_ = std::vector(static_cast(csr->nonZeros())); + val_ = std::vector(static_cast(csr->nonZeros())); + colIdx_ = 
std::vector(static_cast(csr->nonZeros())); rowPtr_ = std::vector(numVert + 1U, 0U); - procStepPtr_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); - procStepNum_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + procStepPtr_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + procStepNum_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); for (const auto vert : graph.Vertices()) { const unsigned whichStep = schedule.AssignedSuperstep(vert); @@ -201,7 +202,7 @@ class Sptrsv { perm[vert] = procStepNum_[whichProc][whichStep]++; // offsets } - unsigned accNode = 0U; + UVertType accNode = 0U; for (unsigned step = 0U; step < numSupersteps_; ++step) { for (unsigned proc = 0U; proc < numProcs; ++proc) { procStepPtr_[proc][step] = accNode; @@ -213,20 +214,20 @@ class Sptrsv { perm[vert] += procStepPtr_[schedule.AssignedProcessor(vert)][schedule.AssignedSuperstep(vert)]; } - std::vector> entryAccumulation = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + std::vector> entryAccumulation = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); for (const auto vert : graph.Vertices()) { const unsigned whichStep = schedule.AssignedSuperstep(vert); const unsigned whichProc = schedule.AssignedProcessor(vert); rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep]; - entryAccumulation[whichProc][whichStep] += static_cast(graph.InDegree(vert)) + 1; + entryAccumulation[whichProc][whichStep] += static_cast(graph.InDegree(vert)) + 1; } - unsigned accEntry = 0U; + UVertType accEntry = 0U; for (unsigned step = 0U; step < numSupersteps_; ++step) { for (unsigned proc = 0U; proc < numProcs; ++proc) { - unsigned temp = entryAccumulation[proc][step]; + UVertType temp = entryAccumulation[proc][step]; entryAccumulation[proc][step] = accEntry; accEntry += temp; } @@ -239,14 +240,14 @@ class Sptrsv { } for (const auto vert : graph.Vertices()) { - std::vector> parents; + std::vector> parents; parents.reserve(graph.InDegree(vert)); for 
(EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) { - parents.emplace_back(perm[static_cast(inner[edge])], static_cast(edge)); + parents.emplace_back(perm[static_cast(inner[edge])], static_cast(edge)); } std::sort(parents.begin(), parents.end()); - const unsigned permVert = perm[vert]; + const UVertType permVert = perm[vert]; UVertType location = rowPtr_[permVert]; for (const auto &[permPar, edgeIdx] : parents) { colIdx_[location] = permPar; @@ -258,28 +259,28 @@ class Sptrsv { } } - void SetupCsrWithPermutationProcessorsFirst(const BspSchedule> &schedule, std::vector &perm) { + void SetupCsrWithPermutationProcessorsFirst(const BspSchedule> &schedule, std::vector &perm) { const auto *const csr = instance_->GetComputationalDag().GetCSR(); const EigenIdxType *const outer = csr->outerIndexPtr(); const EigenIdxType *const inner = csr->innerIndexPtr(); const double *const values = csr->valuePtr(); const SparseMatrixImp &graph = instance_->GetComputationalDag(); - assert(static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) <= static_cast(std::numeric_limits::max())); - const unsigned numVert = static_cast(graph.NumVertices()); + assert(static_cast(graph.NumVertices()) + static_cast(graph.NumEdges()) <= static_cast(std::numeric_limits::max())); + const UVertType numVert = static_cast(graph.NumVertices()); numSupersteps_ = schedule.NumberOfSupersteps(); const unsigned numProcs = instance_->NumberOfProcessors(); - perm = std::vector(numVert, 0U); + perm = std::vector(numVert, 0U); - val_ = std::vector(static_cast(csr->nonZeros())); - colIdx_ = std::vector(static_cast(csr->nonZeros())); + val_ = std::vector(static_cast(csr->nonZeros())); + colIdx_ = std::vector(static_cast(csr->nonZeros())); rowPtr_ = std::vector(numVert + 1U, 0U); - procFirstStepPtr_ = std::vector(0U); + procFirstStepPtr_ = std::vector(0U); procFirstStepPtr_.reserve(numProcs + numSupersteps_ + 1U); - procStepNum_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + 
procStepNum_ = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); for (const auto vert : graph.Vertices()) { const unsigned whichStep = schedule.AssignedSuperstep(vert); @@ -288,7 +289,7 @@ class Sptrsv { perm[vert] = procStepNum_[whichProc][whichStep]++; // offsets } - unsigned accNode = 0U; + UVertType accNode = 0U; for (unsigned proc = 0U; proc < numProcs; ++proc) { for (unsigned step = 0U; step < numSupersteps_; ++step) { procFirstStepPtr_.emplace_back(accNode); @@ -302,20 +303,20 @@ class Sptrsv { perm[vert] += procFirstStepPtr_[schedule.AssignedProcessor(vert) * numSupersteps_ + schedule.AssignedSuperstep(vert)]; } - std::vector> entryAccumulation = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); + std::vector> entryAccumulation = std::vector>(numProcs, std::vector(numSupersteps_, 0U)); for (const auto vert : graph.Vertices()) { const unsigned whichStep = schedule.AssignedSuperstep(vert); const unsigned whichProc = schedule.AssignedProcessor(vert); rowPtr_[perm[vert]] = entryAccumulation[whichProc][whichStep]; - entryAccumulation[whichProc][whichStep] += static_cast(graph.InDegree(vert)) + 1; + entryAccumulation[whichProc][whichStep] += static_cast(graph.InDegree(vert)) + 1; } - unsigned accEntry = 0U; + UVertType accEntry = 0U; for (unsigned proc = 0U; proc < numProcs; ++proc) { for (unsigned step = 0U; step < numSupersteps_; ++step) { - unsigned temp = entryAccumulation[proc][step]; + UVertType temp = entryAccumulation[proc][step]; entryAccumulation[proc][step] = accEntry; accEntry += temp; } @@ -328,14 +329,14 @@ class Sptrsv { } for (const auto vert : graph.Vertices()) { - std::vector> parents; + std::vector> parents; parents.reserve(graph.InDegree(vert)); for (EigenIdxType edge = outer[vert]; edge < outer[vert + 1] - 1; ++edge) { - parents.emplace_back(perm[static_cast(inner[edge])], static_cast(edge)); + parents.emplace_back(perm[static_cast(inner[edge])], static_cast(edge)); } std::sort(parents.begin(), parents.end()); - const unsigned 
permVert = perm[vert]; + const UVertType permVert = perm[vert]; UVertType location = rowPtr_[permVert]; for (const auto &[permPar, edgeIdx] : parents) { colIdx_[location] = permPar; @@ -347,25 +348,25 @@ class Sptrsv { } } - void SetupCsrWithPermutation(const BspSchedule> &schedule, std::vector &perm) { - std::vector permInv(perm.size()); - for (size_t i = 0; i < perm.size(); i++) { + void SetupCsrWithPermutation(const BspSchedule> &schedule, std::vector &perm) { + std::vector permInv(perm.size()); + for (UVertType i = 0; i < perm.size(); i++) { permInv[perm[i]] = i; } numSupersteps_ = schedule.NumberOfSupersteps(); val_.clear(); - val_.reserve(static_cast(instance_->GetComputationalDag().GetCSR()->nonZeros())); + val_.reserve(static_cast(instance_->GetComputationalDag().GetCSR()->nonZeros())); colIdx_.clear(); - colIdx_.reserve(static_cast(instance_->GetComputationalDag().GetCSR()->nonZeros())); + colIdx_.reserve(static_cast(instance_->GetComputationalDag().GetCSR()->nonZeros())); rowPtr_.clear(); rowPtr_.reserve(instance_->NumberOfVertices() + 1); stepProcPtr_ - = std::vector>(numSupersteps_, std::vector(instance_->NumberOfProcessors(), 0)); + = std::vector>(numSupersteps_, std::vector(instance_->NumberOfProcessors(), 0)); stepProcNum_ = schedule.NumAssignedNodesPerSuperstepProcessor(); @@ -385,10 +386,10 @@ class Sptrsv { } } - stepProcPtr_[currentStep][currentProcessor] = static_cast(rowPtr_.size()); + stepProcPtr_[currentStep][currentProcessor] = static_cast(rowPtr_.size()); } - rowPtr_.push_back(colIdx_.size()); + rowPtr_.push_back(static_cast(colIdx_.size())); std::set parents; @@ -403,7 +404,7 @@ class Sptrsv { const auto *outer = instance_->GetComputationalDag().GetCSR()->outerIndexPtr(); for (UVertType parInd = static_cast(outer[node]); parInd < static_cast(outer[node + 1] - 1); ++parInd) { - if (static_cast(instance_->GetComputationalDag().GetCSR()->innerIndexPtr()[parInd]) == permInv[par]) { + if 
(static_cast(instance_->GetComputationalDag().GetCSR()->innerIndexPtr()[parInd]) == permInv[par]) { val_.push_back(instance_->GetComputationalDag().GetCSR()->valuePtr()[parInd]); found++; } @@ -417,7 +418,7 @@ class Sptrsv { ->valuePtr()[instance_->GetComputationalDag().GetCSR()->outerIndexPtr()[node + 1] - 1]); } - rowPtr_.push_back(colIdx_.size()); + rowPtr_.push_back(static_cast(colIdx_.size())); } void LsolveSerial() const { @@ -426,16 +427,9 @@ class Sptrsv { const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); double *const x = x_; const double *const b = b_; - const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); - for (EigenIdxType i = 0; i < numberOfVertices; ++i) { - x[i] = b[i]; - double acc = 0.0; - for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { - acc += valPtr[j] * x[inner[j]]; - } - x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1]; - } + + SpLTrSvSerial(numberOfVertices, x, b, outer, inner, valPtr); } void UsolveSerial() const { @@ -465,27 +459,7 @@ class Sptrsv { const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); double *const x = x_; -# pragma omp parallel num_threads(instance_->NumberOfProcessors()) - { - const size_t proc = static_cast(omp_get_thread_num()); - for (unsigned step = 0; step < numSupersteps_; ++step) { - const size_t boundsStrSize = boundsArrayL_[step][proc].size(); - - for (size_t index = 0; index < boundsStrSize; index += 2) { - EigenIdxType lowerB = boundsArrayL_[step][proc][index]; - const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; - - for (EigenIdxType node = lowerB; node <= upperB; ++node) { - double acc = 0.0; - for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - acc += valPtr[i] * x[inner[i]]; - } - x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1]; - } - } -# pragma omp barrier - } - } + SpLTrSvBSPParallelInPlace(x, outer, inner, valPtr, boundsArrayL_); } void 
UsolveNoPermutationInPlace() const { @@ -497,12 +471,12 @@ class Sptrsv { # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { // Process each superstep starting from the last one (opposite of lsolve) - const size_t proc = static_cast(omp_get_thread_num()); + const std::size_t proc = static_cast(omp_get_thread_num()); unsigned step = numSupersteps_; do { step--; - const size_t boundsStrSize = boundsArrayU_[step][proc].size(); - for (size_t index = 0; index < boundsStrSize; index += 2) { + const std::size_t boundsStrSize = boundsArrayU_[step][proc].size(); + for (std::size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType node = boundsArrayU_[step][proc][index] + 1; const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1]; @@ -527,28 +501,7 @@ class Sptrsv { double *const x = x_; const double *const b = b_; -# pragma omp parallel num_threads(instance_->NumberOfProcessors()) - { - const size_t proc = static_cast(omp_get_thread_num()); - for (unsigned step = 0; step < numSupersteps_; ++step) { - const size_t boundsStrSize = boundsArrayL_[step][proc].size(); - - for (size_t index = 0; index < boundsStrSize; index += 2) { - EigenIdxType lowerB = boundsArrayL_[step][proc][index]; - const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; - - for (EigenIdxType node = lowerB; node <= upperB; ++node) { - x[node] = b[node]; - double acc = 0.0; - for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - acc += valPtr[i] * x[inner[i]]; - } - x[node] = (x[node] - acc) / valPtr[outer[node + 1] - 1]; - } - } -# pragma omp barrier - } - } + SpLTrSvBSPParallel(x, b, outer, inner, valPtr, boundsArrayL_); } void UsolveNoPermutation() const { @@ -561,12 +514,12 @@ class Sptrsv { # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { // Process each superstep starting from the last one (opposite of lsolve) - const size_t proc = static_cast(omp_get_thread_num()); + const std::size_t proc = 
static_cast(omp_get_thread_num()); unsigned step = numSupersteps_; do { step--; - const size_t boundsStrSize = boundsArrayU_[step][proc].size(); - for (size_t index = 0; index < boundsStrSize; index += 2) { + const std::size_t boundsStrSize = boundsArrayU_[step][proc].size(); + for (std::size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType node = boundsArrayU_[step][proc][index] + 1; const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1]; @@ -590,15 +543,9 @@ class Sptrsv { const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); double *const x = x_; - const EigenIdxType numberOfVertices = static_cast(instance_->NumberOfVertices()); - for (EigenIdxType i = 0; i < numberOfVertices; ++i) { - double acc = 0.0; - for (EigenIdxType j = outer[i]; j < outer[i + 1] - 1; ++j) { - acc += valPtr[j] * x[inner[j]]; - } - x[i] = (x[i] - acc) / valPtr[outer[i + 1] - 1]; - } + + SpLTrSvSerialInPlace(numberOfVertices, x, outer, inner, valPtr); } void UsolveSerialInPlace() const { @@ -624,7 +571,7 @@ class Sptrsv { # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { - const size_t proc = static_cast(omp_get_thread_num()); + const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; step++) { const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { @@ -644,25 +591,7 @@ class Sptrsv { void LsolveWithProcFirstPermutationInPlace() const { double *const x = x_; -# pragma omp parallel num_threads(instance_->NumberOfProcessors()) - { - const unsigned proc = static_cast(omp_get_thread_num()); - const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_); - for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); 
stepPtr != endStepPtr;) { - UVertType rowIdx = *stepPtr; - const UVertType endRowIdx = *(++stepPtr); - for (; rowIdx != endRowIdx; ++rowIdx) { - double acc = 0.0; - for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - acc += val_[i] * x[colIdx_[i]]; - } - - x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; - } - -# pragma omp barrier - } - } + SpLTrSvProcPermBSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_); } void LsolveWithPermutation() const { @@ -672,7 +601,7 @@ class Sptrsv { # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { for (unsigned step = 0; step < numSupersteps_; step++) { - const size_t proc = static_cast(omp_get_thread_num()); + const std::size_t proc = static_cast(omp_get_thread_num()); const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { x[rowIdx] = b[rowIdx]; @@ -779,17 +708,17 @@ class Sptrsv { } } - void PermuteXVectorInverse(const std::vector &perm) { + void PermuteXVectorInverse(const std::vector &perm) { std::vector vecUnperm(perm.size()); - for (size_t i = 0; i < perm.size(); i++) { + for (UVertType i = 0; i < perm.size(); i++) { vecUnperm[perm[i]] = x_[i]; } - for (size_t i = 0; i < perm.size(); i++) { + for (UVertType i = 0; i < perm.size(); i++) { x_[i] = vecUnperm[i]; } } - std::size_t GetNumberOfVertices() const { return instance_->NumberOfVertices(); } + UVertType GetNumberOfVertices() const { return instance_->NumberOfVertices(); } // SSP Lsolve with staleness=2 (allowing at most one superstep of lag). // Uses FlatCheckpointCounterBarrier created internally. @@ -810,12 +739,12 @@ class Sptrsv { const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { // Process nodes assigned to this (step, proc) pair. 
- const size_t boundsStrSize = boundsArrayL_[step][proc].size(); + const std::size_t boundsStrSize = boundsArrayL_[step][proc].size(); // Enforce staleness window before starting this superstep. if (boundsStrSize > 0U) { barrier.Wait(proc, staleness - 1U); } - for (size_t index = 0; index < boundsStrSize; index += 2) { + for (std::size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType lowerB = boundsArrayL_[step][proc][index]; const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; for (EigenIdxType node = lowerB; node <= upperB; ++node) { @@ -855,12 +784,12 @@ class Sptrsv { const std::size_t proc = static_cast(omp_get_thread_num()); for (unsigned step = 0; step < numSupersteps_; ++step) { // Process nodes assigned to this (step, proc) pair. - const size_t boundsStrSize = boundsArrayL_[step][proc].size(); + const std::size_t boundsStrSize = boundsArrayL_[step][proc].size(); // Enforce staleness window before starting this superstep. if (boundsStrSize > 0U) { barrier.Wait(proc, staleness - 1U); } - for (size_t index = 0; index < boundsStrSize; index += 2) { + for (std::size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType lowerB = boundsArrayL_[step][proc][index]; const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; for (EigenIdxType node = lowerB; node <= upperB; ++node) { @@ -900,12 +829,12 @@ class Sptrsv { unsigned step = numSupersteps_; do { step--; - const size_t boundsStrSize = boundsArrayU_[step][proc].size(); + const std::size_t boundsStrSize = boundsArrayU_[step][proc].size(); if (boundsStrSize > 0U) { barrier.Wait(proc, staleness - 1U); } - for (size_t index = 0; index < boundsStrSize; index += 2) { + for (std::size_t index = 0; index < boundsStrSize; index += 2) { EigenIdxType node = boundsArrayU_[step][proc][index] + 1; const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1]; diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp 
b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp new file mode 100644 index 00000000..26739e6e --- /dev/null +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp @@ -0,0 +1,160 @@ +/* +Copyright 2026 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include + +#include +#include +#include + +#include "osp/auxiliary/sptrsv_simulator/WeakBarriers/flat_checkpoint_counter_barrier.hpp" + +namespace osp { + +template +void SpLTrSvSerial(const IdxType N, + double *__restrict__ const x, + const double *__restrict__ const b, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val) { + static_assert(std::is_integral_v); + + for (IdxType row = 0; row < N; ++row) { + double acc = b[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + x[row] = acc / val[outer[row + 1] - 1]; + } +} + +template +void SpLTrSvSerialInPlace(const IdxType N, + double *__restrict__ const x, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val) { + static_assert(std::is_integral_v); + + for (IdxType row = 0; row < N; ++row) { + double acc = x[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= 
val[entryIdx] * x[inner[entryIdx]]; + } + x[row] = acc / val[outer[row + 1] - 1]; + } +} + +template +void SpLTrSvBSPParallel(double *__restrict__ const x, + const double *__restrict__ const b, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const std::vector>> &BoundsStepProcIdx) { + static_assert(std::is_integral_v); + +#pragma omp parallel num_threads(BoundsStepProcIdx[0U].size()) + { + const std::size_t proc = static_cast(omp_get_thread_num()); + const std::size_t numSuperSteps = BoundsStepProcIdx.size(); + + for (std::size_t step = 0U; step < numSuperSteps; ++step) { + const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + for (std::size_t idx = 0U; idx < ubIdx; ++idx) { + IdxType row = BoundsStepProcIdx[step][proc][idx]; + const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + for (; row <= ubRow; ++row) { + double acc = b[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + x[row] = acc / val[outer[row + 1] - 1]; + } + } +#pragma omp barrier + } + } +} + +template +void SpLTrSvBSPParallelInPlace(double *__restrict__ const x, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const std::vector>> &BoundsStepProcIdx) { + static_assert(std::is_integral_v); + +#pragma omp parallel num_threads(BoundsStepProcIdx[0U].size()) + { + const std::size_t proc = static_cast(omp_get_thread_num()); + const std::size_t numSuperSteps = BoundsStepProcIdx.size(); + + for (std::size_t step = 0U; step < numSuperSteps; ++step) { + const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + for (std::size_t idx = 0U; idx < ubIdx; ++idx) { + IdxType row = BoundsStepProcIdx[step][proc][idx]; + const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + for (; row <= ubRow; ++row) { + double acc = x[row]; + for (IdxType 
entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + x[row] = acc / val[outer[row + 1] - 1]; + } + } +#pragma omp barrier + } + } +} + +template +void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const unsigned numProcs, + const unsigned numSuperSteps, + const std::vector &procStepPtr) { + static_assert(std::is_integral_v); + +#pragma omp parallel num_threads(numProcs) + { + const unsigned proc = static_cast(omp_get_thread_num()); + const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); + for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + IdxType row = *stepPtr; + const IdxType endRow = *(++stepPtr); + for (; row != endRow; ++row) { + double acc = x[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + + x[row] = acc / val[outer[row + 1] - 1]; + } + +#pragma omp barrier + } + } +} + +} // end namespace osp diff --git a/include/osp/bsp/model/BspSchedule.hpp b/include/osp/bsp/model/BspSchedule.hpp index d132c267..d5abeb0b 100644 --- a/include/osp/bsp/model/BspSchedule.hpp +++ b/include/osp/bsp/model/BspSchedule.hpp @@ -553,8 +553,8 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval * * @return A 2D vector containing the number of nodes assigned to each processor in each superstep. 
*/ - [[nodiscard]] std::vector> NumAssignedNodesPerSuperstepProcessor() const { - std::vector> num(numberOfSupersteps_, std::vector(instance_->NumberOfProcessors(), 0)); + [[nodiscard]] std::vector> NumAssignedNodesPerSuperstepProcessor() const { + std::vector> num(numberOfSupersteps_, std::vector(instance_->NumberOfProcessors(), 0)); for (const auto &v : instance_->Vertices()) { num[nodeToSuperstepAssignment_[v]][nodeToProcessorAssignment_[v]] += 1; diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp index 64a5b97f..b9bd1ced 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalMaxBsp.hpp @@ -469,7 +469,7 @@ ReturnStatus GrowLocalSSP::ComputeSchedule(MaxBspSchedule(bestNewAssignments[proc].size()); for (const VertexType &node : bestNewAssignments[proc]) { schedule.SetAssignedProcessor(node, proc); diff --git a/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp b/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp index 6297a7ba..6f9f5164 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp @@ -69,7 +69,7 @@ class RandomGreedy : public Scheduler { bool fewSources = next.size() < instance.NumberOfProcessors() ? 
true : false; unsigned failCounter = 0; while (!next.empty() && failCounter < 20) { - std::uniform_int_distribution randNodeIdx(0, next.size() - 1); + std::uniform_int_distribution randNodeIdx(0, static_cast(next.size() - 1)); VertexType nodeInd = randNodeIdx(g); const auto &node = next[nodeInd]; bool processorSet = false; diff --git a/include/osp/coarser/Sarkar/SarkarMul.hpp b/include/osp/coarser/Sarkar/SarkarMul.hpp index 86793b99..195ee6a2 100644 --- a/include/osp/coarser/Sarkar/SarkarMul.hpp +++ b/include/osp/coarser/Sarkar/SarkarMul.hpp @@ -122,7 +122,7 @@ ReturnStatus SarkarMul::RunSingleContractionMode(VertexIdx if (firstCoarsen_) { currentNumVertices = MultilevelCoarser::GetOriginalGraph()->NumVertices(); } else { - currentNumVertices = MultilevelCoarser::dagHistory_.back()->NumVertices(); + currentNumVertices = static_cast>(MultilevelCoarser::dagHistory_.back()->NumVertices()); } GraphTCoarse coarsenedDag; @@ -145,7 +145,7 @@ ReturnStatus SarkarMul::RunSingleContractionMode(VertexIdx status = std::max( status, MultilevelCoarser::AddContraction(std::move(contractionMap), std::move(coarsenedDag))); - VertexIdxT newNumVertices = MultilevelCoarser::dagHistory_.back()->NumVertices(); + VertexIdxT newNumVertices = static_cast>(MultilevelCoarser::dagHistory_.back()->NumVertices()); diffVertices = currentNumVertices - newNumVertices; return status; diff --git a/include/osp/coarser/SquashA/SquashAMul.hpp b/include/osp/coarser/SquashA/SquashAMul.hpp index 2d0c85fb..8a3fbd32 100644 --- a/include/osp/coarser/SquashA/SquashAMul.hpp +++ b/include/osp/coarser/SquashA/SquashAMul.hpp @@ -97,7 +97,7 @@ ReturnStatus SquashAMul::RunContractions() { status = std::max( status, MultilevelCoarser::AddContraction(std::move(contractionMap), std::move(coarsenedDag))); - VertexIdxT newNumVertices = MultilevelCoarser::dagHistory_.back()->NumVertices(); + VertexIdxT newNumVertices = static_cast>(MultilevelCoarser::dagHistory_.back()->NumVertices()); if (newNumVertices == 
currentNumVertices) { noChangeInARow++; diff --git a/include/osp/coarser/coarser_util.hpp b/include/osp/coarser/coarser_util.hpp index aec1a48e..1f5a0203 100644 --- a/include/osp/coarser/coarser_util.hpp +++ b/include/osp/coarser/coarser_util.hpp @@ -490,9 +490,9 @@ bool PullBackSchedule(const BspSchedule &scheduleIn, template bool PullBackSchedule(const BspSchedule &scheduleIn, - const std::vector> &reverseVertexMap, + const std::vector> &reverseVertexMap, BspSchedule &scheduleOut) { - for (unsigned idx = 0; idx < reverseVertexMap.size(); ++idx) { + for (const auto &idx : scheduleOut.GetInstance().GetComputationalDag().Vertices()) { const auto &v = reverseVertexMap[idx]; scheduleOut.SetAssignedSuperstep(idx, scheduleIn.AssignedSuperstep(v)); diff --git a/include/osp/coarser/top_order/top_order_coarser.hpp b/include/osp/coarser/top_order/top_order_coarser.hpp index d3b2d9cc..59567653 100644 --- a/include/osp/coarser/top_order/top_order_coarser.hpp +++ b/include/osp/coarser/top_order/top_order_coarser.hpp @@ -43,7 +43,7 @@ class TopOrderCoarser : public Coarser { VMemwT currentMemory_ = 0; VWorkwT currentWork_ = 0; VCommwT currentCommunication_ = 0; - VertexType currentSuperNodeIdx_ = 0; + VertexIdxT currentSuperNodeIdx_ = 0; void FinishSuperNodeAddEdges(const GraphTIn &dagIn, GraphTOut &dagOut, @@ -56,8 +56,8 @@ class TopOrderCoarser : public Coarser { for (const auto &node : nodes) { if constexpr (hasEdgeWeightsV && hasEdgeWeightsV) { for (const auto &inEdge : InEdges(node, dagIn)) { - const VertexType parentRev = reverseVertexMap[Source(inEdge, dagIn)]; - if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits::max()) { + const VertexIdxT parentRev = reverseVertexMap[Source(inEdge, dagIn)]; + if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits>::max()) { auto pair = EdgeDesc(parentRev, currentSuperNodeIdx_, dagOut); if (pair.second) { dagOut.SetEdgeCommWeight(pair.first, dagOut.EdgeCommWeight(pair.first) + 
dagIn.EdgeCommWeight(inEdge)); @@ -68,8 +68,8 @@ class TopOrderCoarser : public Coarser { } } else { for (const auto &parent : dagIn.Parents(node)) { - const VertexType parentRev = reverseVertexMap[parent]; - if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits::max()) { + const VertexIdxT parentRev = reverseVertexMap[parent]; + if (parentRev != currentSuperNodeIdx_ && parentRev != std::numeric_limits>::max()) { if (not Edge(parentRev, currentSuperNodeIdx_, dagOut)) { dagOut.AddEdge(parentRev, currentSuperNodeIdx_); } diff --git a/include/osp/concepts/directed_graph_concept.hpp b/include/osp/concepts/directed_graph_concept.hpp index aaa537ad..09bc9900 100644 --- a/include/osp/concepts/directed_graph_concept.hpp +++ b/include/osp/concepts/directed_graph_concept.hpp @@ -64,13 +64,15 @@ struct IsDirectedGraph().Children(std::declval>())), decltype(std::declval().InDegree(std::declval>())), decltype(std::declval().OutDegree(std::declval>()))>> - : std::conjunction().Vertices()), VertexIdxT>, + : std::conjunction< + IsForwardRangeOf().Vertices()), VertexIdxT>, std::is_integral().NumVertices())>, std::is_integral().NumEdges())>, - IsInputRangeOf().Parents(std::declval>())), VertexIdxT>, - IsInputRangeOf().Children(std::declval>())), VertexIdxT>, - std::is_integral().InDegree(std::declval>()))>, - std::is_integral().OutDegree(std::declval>()))>> {}; + IsInputRangeOf().Parents(std::declval>())), VertexIdxT> + // IsInputRangeOf().Children(std::declval>())), VertexIdxT>, + // std::is_integral().InDegree(std::declval>()))>, + // std::is_integral().OutDegree(std::declval>()))> + > {}; template inline constexpr bool isDirectedGraphV = IsDirectedGraph::value; diff --git a/include/osp/graph_algorithms/cuthill_mckee.hpp b/include/osp/graph_algorithms/cuthill_mckee.hpp index 6470d17d..c1dcc2bb 100644 --- a/include/osp/graph_algorithms/cuthill_mckee.hpp +++ b/include/osp/graph_algorithms/cuthill_mckee.hpp @@ -205,7 +205,7 @@ std::vector> 
CuthillMckeeUndirected(const GraphT &dag, bool s } } - nodeCounter += currentLevel.size(); + nodeCounter += static_cast(currentLevel.size()); if (nodePriority.empty()) { // the dag has more than one connected components diff --git a/include/osp/graph_algorithms/directed_graph_path_util.hpp b/include/osp/graph_algorithms/directed_graph_path_util.hpp index 37733275..16423bc6 100644 --- a/include/osp/graph_algorithms/directed_graph_path_util.hpp +++ b/include/osp/graph_algorithms/directed_graph_path_util.hpp @@ -132,7 +132,7 @@ std::size_t LongestPath(const GraphT &graph) { if (visitCounter[child] == graph.InDegree(child)) { bfsQueue.push(child); distances[child] = distances[current] + 1; - maxEdgecount = std::max(maxEdgecount, distances[child]); + maxEdgecount = std::max(maxEdgecount, static_cast(distances[child])); } } } diff --git a/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp b/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp index b621ee3b..227a6724 100644 --- a/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp +++ b/include/osp/graph_implementations/eigen_matrix_adapter/eigen_sparse_iterator.hpp @@ -30,6 +30,7 @@ class EigenCSRRange { public: using CSRMatrix = Eigen::SparseMatrix; using Inner = typename CSRMatrix::InnerIterator; + using UsignedType = std::make_unsigned_t; class Iterator { Inner it_; @@ -41,10 +42,10 @@ class EigenCSRRange { } public: - using value_type = std::size_t; + using value_type = UsignedType; using reference = value_type; using pointer = void; - using difference_type = std::ptrdiff_t; + using difference_type = UsignedType; using iterator_category = std::input_iterator_tag; Iterator() = default; @@ -58,7 +59,7 @@ class EigenCSRRange { Iterator(const CSRMatrix &mat, EigenIdxType idx) : it_(mat, idx) { SkipDiagonal(); } - reference operator*() const { return static_cast(it_.col()); } + reference operator*() const { return 
static_cast(it_.col()); } Iterator &operator++() { ++it_; @@ -93,9 +94,11 @@ class EigenCSCRange { const Graph &graph_; EigenIdxType index_; - public: + + public: using CSCMatrix = Eigen::SparseMatrix; using Inner = typename CSCMatrix::InnerIterator; + using UsignedType = std::make_unsigned_t; class Iterator { Inner it_; @@ -107,10 +110,10 @@ class EigenCSCRange { } public: - using value_type = std::size_t; + using value_type = UsignedType; using reference = value_type; using pointer = void; - using difference_type = std::ptrdiff_t; + using difference_type = UsignedType; using iterator_category = std::input_iterator_tag; Iterator() = default; @@ -124,7 +127,7 @@ class EigenCSCRange { Iterator(const CSCMatrix &mat, EigenIdxType idx) : it_(mat, idx) { SkipDiagonal(); } - reference operator*() const { return static_cast(it_.row()); } + reference operator*() const { return static_cast(it_.row()); } Iterator &operator++() { ++it_; diff --git a/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp b/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp index 96bdad19..d3233467 100644 --- a/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp +++ b/include/osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp @@ -20,6 +20,9 @@ limitations under the License. 
#ifdef EIGEN_FOUND +# include +# include + # include # include "eigen_sparse_iterator.hpp" @@ -47,7 +50,7 @@ class SparseMatrixImp { public: // Vertex index type must match Eigen's StorageIndex (signed 32-bit) - using VertexIdx = size_t; + using VertexIdx = std::make_unsigned_t; // Required graph trait aliases (used in concept checks) using VertexWorkWeightType = EigenIdxType; @@ -70,10 +73,10 @@ class SparseMatrixImp { const MatrixCSC *GetCSC() const { return lCscP_; } /// @brief Number of vertices = number of rows in the matrix - size_t NumVertices() const noexcept { return static_cast(lCsrP_->rows()); } + VertexIdx NumVertices() const noexcept { return static_cast(lCsrP_->rows()); } /// @brief Return a range over all vertices [0, NumVertices) - auto Vertices() const { return osp::IntegralRange(NumVertices()); } + auto Vertices() const { return osp::IntegralRange(NumVertices()); } /// @brief Number of edges = total non-zeros minus diagonal elements VertexIdx NumEdges() const noexcept { return static_cast(lCsrP_->nonZeros() - lCsrP_->rows()); } @@ -116,8 +119,6 @@ class SparseMatrixImp { using SparseMatrixGraphInt32T = SparseMatrixImp; using SparseMatrixGraphInt64T = SparseMatrixImp; -static_assert(isDirectedGraphEdgeDescV>, "SparseMatrix must satisfy the directed_graph_edge_desc concept"); - // Verify that SparseMatrixImp satisfies the directed graph concept static_assert(isDirectedGraphV>, "SparseMatrix must satisfy directed_graph_concept"); @@ -130,6 +131,8 @@ static_assert(hasVertexWeightsV>, "CompactSparseGraph m static_assert(isComputationalDagTypedVerticesV>, "CompactSparseGraph must satisfy the is_computation_dag concept"); +static_assert(isDirectedGraphEdgeDescV>, "SparseMatrix must satisfy the directed_graph_edge_desc concept"); + } // namespace osp #endif diff --git a/tests/sparse_matrix_impl.cpp b/tests/sparse_matrix_impl.cpp index 291f4aeb..8e928df9 100644 --- a/tests/sparse_matrix_impl.cpp +++ b/tests/sparse_matrix_impl.cpp @@ -136,7 +136,7 @@ 
BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter1) { size_t idx = 0; - for (const long unsigned int &v : graph.Vertices()) { + for (const auto &v : graph.Vertices()) { BOOST_CHECK_EQUAL(v, vertices[idx++]); size_t i = 0; @@ -234,6 +234,8 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) { lCsc = lCsr; SparseMatrixImp graph; + using UVertType = VertexIdxT>; + graph.SetCsr(&lCsr); graph.SetCsc(&lCsc); @@ -244,7 +246,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) { BOOST_CHECK_EQUAL(static_cast(graph.NumEdges()), graph2.NumEdges()); for (const auto &vert : graph2.Vertices()) { - auto chldren = graph.Children(vert); + auto chldren = graph.Children(static_cast(vert)); auto chldren2 = graph2.Children(vert); auto it = chldren.begin(); auto it_other = chldren.begin(); @@ -269,7 +271,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) { ++it_other; ++it2; } - BOOST_CHECK_EQUAL(cntr, graph.OutDegree(vert)); + BOOST_CHECK_EQUAL(static_cast(cntr), graph.OutDegree(static_cast(vert))); BOOST_CHECK_EQUAL(cntr, graph1.OutDegree(vert)); BOOST_CHECK_EQUAL(cntr, graph2.OutDegree(vert)); BOOST_CHECK(it == end); @@ -278,7 +280,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) { } for (const auto &vert : graph2.Vertices()) { - auto parents = graph.Parents(vert); + auto parents = graph.Parents(static_cast(vert)); auto parents2 = graph2.Parents(vert); auto it = parents.begin(); auto it_other = parents.begin(); @@ -301,7 +303,7 @@ BOOST_AUTO_TEST_CASE(TestSparseMatrixAdapter2) { ++it; ++it2; } - BOOST_CHECK_EQUAL(cntr, graph.InDegree(vert)); + BOOST_CHECK_EQUAL(static_cast(cntr), graph.InDegree(static_cast(vert))); BOOST_CHECK_EQUAL(cntr, graph1.InDegree(vert)); BOOST_CHECK_EQUAL(cntr, graph2.InDegree(vert)); BOOST_CHECK(it == end); diff --git a/tests/sptrsv.cpp b/tests/sptrsv.cpp index 355a36d7..d235aea0 100644 --- a/tests/sptrsv.cpp +++ b/tests/sptrsv.cpp @@ -229,7 +229,7 @@ BOOST_AUTO_TEST_CASE(TestEigenSptrsv) { BOOST_CHECK(CompareVectors(uXRef, uXOsp)); // Lsolve in-place With 
PERMUTATION - std::vector perm;// = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS); + std::vector::VertexIdx> perm;// = ScheduleNodePermuterBasic(scheduleCs, LOOP_PROCESSORS); sim.SetupCsrWithPermutationLoopProcessors(scheduleCs, perm); std::vector permCheck(graph.NumVertices(), false); BOOST_CHECK_EQUAL(permCheck.size(), perm.size()); From 019bf4445755e519851f6673b958863a0ea24520 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 24 Mar 2026 15:03:21 +0100 Subject: [PATCH 46/57] ssp sptrsv kernel --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 32 +----- .../sptrsv_simulator/sptrsv_kernels.hpp | 107 ++++++++++++++++++ 2 files changed, 108 insertions(+), 31 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 6de9a89e..76b59a28 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -654,39 +654,9 @@ class Sptrsv { template void SspLsolveStalenessWithProcFirstPermutationInPlace() const { - const unsigned nthreads = instance_->NumberOfProcessors(); - FlatCheckpointCounterBarrier barrier(nthreads); - - const auto *const csr = instance_->GetComputationalDag().GetCSR(); - const EigenIdxType *const outer = csr->outerIndexPtr(); - const EigenIdxType *const inner = csr->innerIndexPtr(); - const double *const vals = csr->valuePtr(); double *const x = x_; -# pragma omp parallel num_threads(nthreads) - { - const unsigned proc = static_cast(omp_get_thread_num()); - const auto endStepPtr = std::next(procFirstStepPtr_.cbegin(), (proc + 1U) * numSupersteps_); - for (auto stepPtr = std::next(procFirstStepPtr_.cbegin(), proc * numSupersteps_); stepPtr != endStepPtr;) { - UVertType rowIdx = *stepPtr; - const UVertType endRowIdx = *(++stepPtr); - - if (rowIdx != endRowIdx) { - barrier.Wait(proc, staleness - 1U); - } - - for (; rowIdx != endRowIdx; ++rowIdx) { - double acc = 0.0; - for (UVertType i = rowPtr_[rowIdx]; i < 
rowPtr_[rowIdx + 1] - 1; i++) { - acc += val_[i] * x[colIdx_[i]]; - } - - x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; - } - // Signal completion of this superstep. - barrier.Arrive(proc); - } - } + SpLTrSvProcPermSSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_); } void ResetX() { diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp index 26739e6e..a6cec829 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp @@ -126,6 +126,38 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x, } } +template +void SpLTrSvProcPermBSPParallel(double *__restrict__ const x, + const double *__restrict__ const b, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const unsigned numProcs, + const unsigned numSuperSteps, + const std::vector &procStepPtr) { + static_assert(std::is_integral_v); + +#pragma omp parallel num_threads(numProcs) + { + const unsigned proc = static_cast(omp_get_thread_num()); + const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); + for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + IdxType row = *stepPtr; + const IdxType endRow = *(++stepPtr); + for (; row != endRow; ++row) { + double acc = b[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + + x[row] = acc / val[outer[row + 1] - 1]; + } + +#pragma omp barrier + } + } +} + template void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x, const IdxType *__restrict__ const outer, @@ -157,4 +189,79 @@ void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x, } } +template +void 
SpLTrSvProcPermSSPParallel(double *__restrict__ const x, + const double *__restrict__ const b, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const unsigned numProcs, + const unsigned numSuperSteps, + const std::vector &procStepPtr) { + static_assert(std::is_integral_v); + + FlatCheckpointCounterBarrier barrier(numProcs); +#pragma omp parallel num_threads(numProcs) + { + const unsigned proc = static_cast(omp_get_thread_num()); + const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); + for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + IdxType row = *stepPtr; + const IdxType endRow = *(++stepPtr); + + if (row != endRow) { + barrier.Wait(proc, staleness - 1U); + } + + for (; row != endRow; ++row) { + double acc = b[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; entryIdx++) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + + x[row] = acc / val[outer[row + 1] - 1]; + } + // Signal completion of this superstep. 
+ barrier.Arrive(proc); + } + } +} + +template +void SpLTrSvProcPermSSPParallelInPlace(double *__restrict__ const x, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const unsigned numProcs, + const unsigned numSuperSteps, + const std::vector &procStepPtr) { + static_assert(std::is_integral_v); + + FlatCheckpointCounterBarrier barrier(numProcs); +#pragma omp parallel num_threads(numProcs) + { + const unsigned proc = static_cast(omp_get_thread_num()); + const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); + for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + IdxType row = *stepPtr; + const IdxType endRow = *(++stepPtr); + + if (row != endRow) { + barrier.Wait(proc, staleness - 1U); + } + + for (; row != endRow; ++row) { + double acc = x[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; entryIdx++) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + + x[row] = acc / val[outer[row + 1] - 1]; + } + // Signal completion of this superstep. + barrier.Arrive(proc); + } + } +} + } // end namespace osp From a310d833f2b74814cf5b782712c3c6edf994c40b Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 24 Mar 2026 15:24:30 +0100 Subject: [PATCH 47/57] SSP SpTrSV kernels --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 80 ++----------------- .../sptrsv_simulator/sptrsv_kernels.hpp | 73 ++++++++++++++++- 2 files changed, 79 insertions(+), 74 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 76b59a28..62e1696f 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -694,89 +694,25 @@ class Sptrsv { // Uses FlatCheckpointCounterBarrier created internally. 
template void SspLsolveStaleness() const { - const unsigned nthreads = instance_->NumberOfProcessors(); - FlatCheckpointCounterBarrier barrier(nthreads); - - const auto *const csr = instance_->GetComputationalDag().GetCSR(); - const EigenIdxType *const outer = csr->outerIndexPtr(); - const EigenIdxType *const inner = csr->innerIndexPtr(); - const double *const vals = csr->valuePtr(); + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); double *const x = x_; const double *const b = b_; -# pragma omp parallel num_threads(nthreads) - { - const std::size_t proc = static_cast(omp_get_thread_num()); - for (unsigned step = 0; step < numSupersteps_; ++step) { - // Process nodes assigned to this (step, proc) pair. - const std::size_t boundsStrSize = boundsArrayL_[step][proc].size(); - // Enforce staleness window before starting this superstep. - if (boundsStrSize > 0U) { - barrier.Wait(proc, staleness - 1U); - } - for (std::size_t index = 0; index < boundsStrSize; index += 2) { - EigenIdxType lowerB = boundsArrayL_[step][proc][index]; - const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; - for (EigenIdxType node = lowerB; node <= upperB; ++node) { - // Initialize solution for this node - x[node] = b[node]; - double acc = 0.0; - // Perform lower-triangular solve for this node - for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - // Accumulate contributions from previously solved nodes - acc += vals[i] * x[inner[i]]; - } - // Divide by diagonal element to complete solve for this node - x[node] = (x[node] - acc) / vals[outer[node + 1] - 1]; - } - } - // Signal completion of this superstep. 
- barrier.Arrive(proc); - } - } + SpLTrSvSSPParallel(x, b, outer, inner, valPtr, boundsArrayL_); } // SSP Lsolve in-place with staleness=2 (allowing at most one superstep of lag). // Uses FlatCheckpointCounterBarrier created internally. template void SspLsolveStalenessInPlace() const { - const unsigned nthreads = instance_->NumberOfProcessors(); - FlatCheckpointCounterBarrier barrier(nthreads); - - const auto *const csr = instance_->GetComputationalDag().GetCSR(); - const EigenIdxType *const outer = csr->outerIndexPtr(); - const EigenIdxType *const inner = csr->innerIndexPtr(); - const double *const vals = csr->valuePtr(); + const EigenIdxType *const outer = (*(instance_->GetComputationalDag().GetCSR())).outerIndexPtr(); + const EigenIdxType *const inner = (*(instance_->GetComputationalDag().GetCSR())).innerIndexPtr(); + const double *const valPtr = (*(instance_->GetComputationalDag().GetCSR())).valuePtr(); double *const x = x_; -# pragma omp parallel num_threads(nthreads) - { - const std::size_t proc = static_cast(omp_get_thread_num()); - for (unsigned step = 0; step < numSupersteps_; ++step) { - // Process nodes assigned to this (step, proc) pair. - const std::size_t boundsStrSize = boundsArrayL_[step][proc].size(); - // Enforce staleness window before starting this superstep. 
- if (boundsStrSize > 0U) { - barrier.Wait(proc, staleness - 1U); - } - for (std::size_t index = 0; index < boundsStrSize; index += 2) { - EigenIdxType lowerB = boundsArrayL_[step][proc][index]; - const EigenIdxType upperB = boundsArrayL_[step][proc][index + 1]; - for (EigenIdxType node = lowerB; node <= upperB; ++node) { - double acc = 0.0; - // Perform lower-triangular solve for this node - for (EigenIdxType i = outer[node]; i < outer[node + 1] - 1; ++i) { - // Accumulate contributions from previously solved nodes - acc += vals[i] * x[inner[i]]; - } - // Divide by diagonal element to complete solve for this node - x[node] = (x[node] - acc) / vals[outer[node + 1] - 1]; - } - } - // Signal completion of this superstep. - barrier.Arrive(proc); - } - } + SpLTrSvSSPParallelInPlace(x, outer, inner, valPtr, boundsArrayL_); } // SSP Usolve with configurable staleness. diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp index a6cec829..d5a18fdc 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp @@ -126,6 +126,77 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x, } } +template +void SpLTrSvSSPParallel(double *__restrict__ const x, + const double *__restrict__ const b, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const std::vector>> &BoundsStepProcIdx) { + static_assert(std::is_integral_v); + + const std::size_t nthreads = BoundsStepProcIdx[0U].size(); + FlatCheckpointCounterBarrier barrier(nthreads); + +#pragma omp parallel num_threads(nthreads) + { + const std::size_t proc = static_cast(omp_get_thread_num()); + for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) { + const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + if (ubIdx > 0U) { + barrier.Wait(proc, staleness - 1U); + } + for 
(std::size_t idx = 0; idx < ubIdx; ++idx) { + IdxType row = BoundsStepProcIdx[step][proc][idx]; + const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + for (; row <= ubRow; ++row) { + double acc = b[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + x[row] = acc / val[outer[row + 1] - 1]; + } + } + barrier.Arrive(proc); + } + } +} + +template +void SpLTrSvSSPParallelInPlace(double *__restrict__ const x, + const IdxType *__restrict__ const outer, + const IdxType *__restrict__ const inner, + const double *__restrict__ const val, + const std::vector>> &BoundsStepProcIdx) { + static_assert(std::is_integral_v); + + const std::size_t nthreads = BoundsStepProcIdx[0U].size(); + FlatCheckpointCounterBarrier barrier(nthreads); + +#pragma omp parallel num_threads(nthreads) + { + const std::size_t proc = static_cast(omp_get_thread_num()); + for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) { + const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + if (ubIdx > 0U) { + barrier.Wait(proc, staleness - 1U); + } + for (std::size_t idx = 0; idx < ubIdx; ++idx) { + IdxType row = BoundsStepProcIdx[step][proc][idx]; + const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + for (; row <= ubRow; ++row) { + double acc = x[row]; + for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { + acc -= val[entryIdx] * x[inner[entryIdx]]; + } + x[row] = acc / val[outer[row + 1] - 1]; + } + } + barrier.Arrive(proc); + } + } +} + template void SpLTrSvProcPermBSPParallel(double *__restrict__ const x, const double *__restrict__ const b, @@ -221,7 +292,6 @@ void SpLTrSvProcPermSSPParallel(double *__restrict__ const x, x[row] = acc / val[outer[row + 1] - 1]; } - // Signal completion of this superstep. 
barrier.Arrive(proc); } } @@ -258,7 +328,6 @@ void SpLTrSvProcPermSSPParallelInPlace(double *__restrict__ const x, x[row] = acc / val[outer[row + 1] - 1]; } - // Signal completion of this superstep. barrier.Arrive(proc); } } From 64cd7d7dc6a3503483fc59e01d72775cc090daf7 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 24 Mar 2026 15:37:53 +0100 Subject: [PATCH 48/57] changed to pointer --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 4 ++-- .../sptrsv_simulator/sptrsv_kernels.hpp | 24 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 62e1696f..84166644 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -591,7 +591,7 @@ class Sptrsv { void LsolveWithProcFirstPermutationInPlace() const { double *const x = x_; - SpLTrSvProcPermBSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_); + SpLTrSvProcPermBSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_.data()); } void LsolveWithPermutation() const { @@ -656,7 +656,7 @@ class Sptrsv { void SspLsolveStalenessWithProcFirstPermutationInPlace() const { double *const x = x_; - SpLTrSvProcPermSSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_); + SpLTrSvProcPermSSPParallelInPlace(x, rowPtr_.data(), colIdx_.data(), val_.data(), instance_->NumberOfProcessors(), numSupersteps_, procFirstStepPtr_.data()); } void ResetX() { diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp index d5a18fdc..ccf323be 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp +++ 
b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp @@ -205,14 +205,14 @@ void SpLTrSvProcPermBSPParallel(double *__restrict__ const x, const double *__restrict__ const val, const unsigned numProcs, const unsigned numSuperSteps, - const std::vector &procStepPtr) { + const IdxType *__restrict__ const procStepPtr) { static_assert(std::is_integral_v); #pragma omp parallel num_threads(numProcs) { const unsigned proc = static_cast(omp_get_thread_num()); - const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); - for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps); + for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) { IdxType row = *stepPtr; const IdxType endRow = *(++stepPtr); for (; row != endRow; ++row) { @@ -236,14 +236,14 @@ void SpLTrSvProcPermBSPParallelInPlace(double *__restrict__ const x, const double *__restrict__ const val, const unsigned numProcs, const unsigned numSuperSteps, - const std::vector &procStepPtr) { + const IdxType *__restrict__ const procStepPtr) { static_assert(std::is_integral_v); #pragma omp parallel num_threads(numProcs) { const unsigned proc = static_cast(omp_get_thread_num()); - const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); - for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps); + for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) { IdxType row = *stepPtr; const IdxType endRow = *(++stepPtr); for (; row != endRow; ++row) { @@ -268,15 +268,15 @@ void SpLTrSvProcPermSSPParallel(double *__restrict__ const x, const double *__restrict__ const val, const unsigned numProcs, const unsigned numSuperSteps, - const std::vector &procStepPtr) 
{ + const IdxType *__restrict__ const procStepPtr) { static_assert(std::is_integral_v); FlatCheckpointCounterBarrier barrier(numProcs); #pragma omp parallel num_threads(numProcs) { const unsigned proc = static_cast(omp_get_thread_num()); - const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); - for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps); + for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) { IdxType row = *stepPtr; const IdxType endRow = *(++stepPtr); @@ -304,15 +304,15 @@ void SpLTrSvProcPermSSPParallelInPlace(double *__restrict__ const x, const double *__restrict__ const val, const unsigned numProcs, const unsigned numSuperSteps, - const std::vector &procStepPtr) { + const IdxType *__restrict__ const procStepPtr) { static_assert(std::is_integral_v); FlatCheckpointCounterBarrier barrier(numProcs); #pragma omp parallel num_threads(numProcs) { const unsigned proc = static_cast(omp_get_thread_num()); - const auto endStepPtr = std::next(procStepPtr.cbegin(), (proc + 1U) * numSuperSteps); - for (auto stepPtr = std::next(procStepPtr.cbegin(), proc * numSuperSteps); stepPtr != endStepPtr;) { + const IdxType *const endStepPtr = procStepPtr + ((proc + 1U) * numSuperSteps); + for (const IdxType *stepPtr = procStepPtr + (proc * numSuperSteps); stepPtr != endStepPtr;) { IdxType row = *stepPtr; const IdxType endRow = *(++stepPtr); From 7cc544b6fedc988a40cba82595e5a62ed37b2cad Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 24 Mar 2026 15:44:08 +0100 Subject: [PATCH 49/57] initialised accumulator --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 84166644..4dcc9047 
100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -444,12 +444,11 @@ class Sptrsv { EigenIdxType i = numberOfVertices; do { i--; - x[i] = b[i]; - double acc = 0.0; + double acc = b[i]; for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { - acc += valPtr[j] * x[inner[j]]; + acc -= valPtr[j] * x[inner[j]]; } - x[i] = (x[i] - acc) / valPtr[outer[i]]; + x[i] = acc / valPtr[outer[i]]; } while (i != 0); } @@ -482,11 +481,11 @@ class Sptrsv { do { node--; - double acc = 0.0; + double acc = x[node]; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - acc += valPtr[i] * x[inner[i]]; + acc -= valPtr[i] * x[inner[i]]; } - x[node] = (x[node] - acc) / valPtr[outer[node]]; + x[node] = acc / valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -525,12 +524,11 @@ class Sptrsv { do { node--; - x[node] = b[node]; - double acc = 0.0; + double acc = b[node]; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - acc += valPtr[i] * x[inner[i]]; + acc -= valPtr[i] * x[inner[i]]; } - x[node] = (x[node] - acc) / valPtr[outer[node]]; + x[node] = acc / valPtr[outer[node]]; } while (node != lowerB); } # pragma omp barrier @@ -558,11 +556,11 @@ class Sptrsv { EigenIdxType i = numberOfVertices; do { i--; - double acc = 0.0; + double acc = x[i]; for (EigenIdxType j = outer[i] + 1; j < outer[i + 1]; ++j) { - acc += valPtr[j] * x[inner[j]]; + acc -= valPtr[j] * x[inner[j]]; } - x[i] = (x[i] - acc) / valPtr[outer[i]]; + x[i] = acc / valPtr[outer[i]]; } while (i != 0); } @@ -575,12 +573,12 @@ class Sptrsv { for (unsigned step = 0; step < numSupersteps_; step++) { const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { - double acc = 0.0; + double acc = x[rowIdx]; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - acc += val_[i] * 
x[colIdx_[i]]; + acc -= val_[i] * x[colIdx_[i]]; } - x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; + x[rowIdx] = acc / val_[rowPtr_[rowIdx + 1] - 1]; } # pragma omp barrier @@ -604,13 +602,12 @@ class Sptrsv { const std::size_t proc = static_cast(omp_get_thread_num()); const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { - x[rowIdx] = b[rowIdx]; - double acc = 0.0; + double acc = b[rowIdx]; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - acc += val_[i] * x[colIdx_[i]]; + acc -= val_[i] * x[colIdx_[i]]; } - x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; + x[rowIdx] = acc / val_[rowPtr_[rowIdx + 1] - 1]; } # pragma omp barrier @@ -639,12 +636,12 @@ class Sptrsv { const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { - double acc = 0.0; + double acc = x[rowIdx]; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { - acc += val_[i] * x[colIdx_[i]]; + acc -= val_[i] * x[colIdx_[i]]; } - x[rowIdx] = (x[rowIdx] - acc) / val_[rowPtr_[rowIdx + 1] - 1]; + x[rowIdx] = acc / val_[rowPtr_[rowIdx + 1] - 1]; } // Signal completion of this superstep. 
barrier.Arrive(proc); @@ -746,12 +743,11 @@ class Sptrsv { do { node--; - x[node] = b[node]; - double acc = 0.0; + double acc = b[node]; for (EigenIdxType i = outer[node] + 1; i < outer[node + 1]; ++i) { - acc += vals[i] * x[inner[i]]; + acc -= vals[i] * x[inner[i]]; } - x[node] = (x[node] - acc) / vals[outer[node]]; + x[node] = acc / vals[outer[node]]; } while (node != lowerB); } From f9eb927125b89786769b4d5179624f70d5d97dd6 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Tue, 24 Mar 2026 15:52:33 +0100 Subject: [PATCH 50/57] change of epsilon --- apps/maxbsp_ssp_sptrsv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/maxbsp_ssp_sptrsv.cpp b/apps/maxbsp_ssp_sptrsv.cpp index a15e24f5..1102f506 100644 --- a/apps/maxbsp_ssp_sptrsv.cpp +++ b/apps/maxbsp_ssp_sptrsv.cpp @@ -42,7 +42,7 @@ using namespace osp; namespace { -constexpr double EPSILON = 1e-12; +constexpr double EPSILON = 1e-8; constexpr unsigned kDefaultStaleness = 2U; constexpr int defaultSynchronisationCosts = 500; From 4d7fb9473f7b1c4d27c09b116b625f0a51afb534 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 25 Mar 2026 10:08:40 +0100 Subject: [PATCH 51/57] change lower triangular bounds to proc first --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 41 +++++++------- .../sptrsv_simulator/sptrsv_kernels.hpp | 56 +++++++++++-------- 2 files changed, 54 insertions(+), 43 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 4dcc9047..b3638534 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -75,7 +75,7 @@ class Sptrsv { unsigned numSupersteps_; - std::vector>> vectorStepProcessorVertices_; + std::vector>> vectorProcessorStepVerticesL_; std::vector>> vectorStepProcessorVerticesU_; std::vector ready_; @@ -87,14 +87,14 @@ class Sptrsv { Sptrsv(BspInstance> &inst) : instance_(&inst) {}; void SetupCsrNoPermutation(const 
BspSchedule> &schedule) { - vectorStepProcessorVertices_ = std::vector>>( - schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); + vectorProcessorStepVerticesL_ = std::vector>>( + schedule.GetInstance().NumberOfProcessors(), std::vector>(schedule.NumberOfSupersteps())); vectorStepProcessorVerticesU_ = std::vector>>( schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); boundsArrayL_ = std::vector>>( - schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); + schedule.GetInstance().NumberOfProcessors(), std::vector>(schedule.NumberOfSupersteps())); boundsArrayU_ = std::vector>>( schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); @@ -107,27 +107,30 @@ class Sptrsv { switch (id) { case 0: { for (UVertType node = 0; node < numberOfVertices; ++node) { - vectorStepProcessorVertices_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back( + vectorProcessorStepVerticesL_[schedule.AssignedProcessor(node)][schedule.AssignedSuperstep(node)].push_back( static_cast(node)); } - for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) { - for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) { - if (!vectorStepProcessorVertices_[step][proc].empty()) { - EigenIdxType start = vectorStepProcessorVertices_[step][proc][0]; - EigenIdxType prev = vectorStepProcessorVertices_[step][proc][0]; - - for (UVertType i = 1; i < vectorStepProcessorVertices_[step][proc].size(); ++i) { - if (vectorStepProcessorVertices_[step][proc][i] != prev + 1) { - boundsArrayL_[step][proc].push_back(start); - boundsArrayL_[step][proc].push_back(prev); - start = vectorStepProcessorVertices_[step][proc][i]; + for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) { + for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) { + const auto &vectorVerticesL = 
vectorProcessorStepVerticesL_[proc][step]; + auto &localBoundsArrayL_ = boundsArrayL_[proc][step]; + + if (!vectorVerticesL.empty()) { + EigenIdxType start = vectorVerticesL[0]; + EigenIdxType prev = vectorVerticesL[0]; + + for (UVertType i = 1; i < vectorVerticesL.size(); ++i) { + if (vectorVerticesL[i] != prev + 1) { + localBoundsArrayL_.push_back(start); + localBoundsArrayL_.push_back(prev); + start = vectorVerticesL[i]; } - prev = vectorStepProcessorVertices_[step][proc][i]; + prev = vectorVerticesL[i]; } - boundsArrayL_[step][proc].push_back(start); - boundsArrayL_[step][proc].push_back(prev); + localBoundsArrayL_.push_back(start); + localBoundsArrayL_.push_back(prev); } } } diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp index ccf323be..f2988213 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp @@ -69,19 +69,21 @@ void SpLTrSvBSPParallel(double *__restrict__ const x, const IdxType *__restrict__ const outer, const IdxType *__restrict__ const inner, const double *__restrict__ const val, - const std::vector>> &BoundsStepProcIdx) { + const std::vector>> &BoundsProcStepIdx) { static_assert(std::is_integral_v); -#pragma omp parallel num_threads(BoundsStepProcIdx[0U].size()) +#pragma omp parallel num_threads(BoundsProcStepIdx.size()) { const std::size_t proc = static_cast(omp_get_thread_num()); - const std::size_t numSuperSteps = BoundsStepProcIdx.size(); + const std::vector> &BoundsStepIdx = BoundsProcStepIdx[proc]; + const std::size_t numSuperSteps = BoundsStepIdx.size(); for (std::size_t step = 0U; step < numSuperSteps; ++step) { - const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + const std::vector &BoundIdx = BoundsStepIdx[step]; + const std::size_t ubIdx = BoundIdx.size(); for (std::size_t idx = 0U; idx < ubIdx; ++idx) { - IdxType row = BoundsStepProcIdx[step][proc][idx]; - const 
IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + IdxType row = BoundIdx[idx]; + const IdxType ubRow = BoundIdx[++idx]; for (; row <= ubRow; ++row) { double acc = b[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { @@ -100,19 +102,21 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x, const IdxType *__restrict__ const outer, const IdxType *__restrict__ const inner, const double *__restrict__ const val, - const std::vector>> &BoundsStepProcIdx) { + const std::vector>> &BoundsProcStepIdx) { static_assert(std::is_integral_v); -#pragma omp parallel num_threads(BoundsStepProcIdx[0U].size()) +#pragma omp parallel num_threads(BoundsProcStepIdx.size()) { const std::size_t proc = static_cast(omp_get_thread_num()); - const std::size_t numSuperSteps = BoundsStepProcIdx.size(); + const std::vector> &BoundsStepIdx = BoundsProcStepIdx[proc]; + const std::size_t numSuperSteps = BoundsStepIdx.size(); for (std::size_t step = 0U; step < numSuperSteps; ++step) { - const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + const std::vector &BoundIdx = BoundsStepIdx[step]; + const std::size_t ubIdx = BoundIdx.size(); for (std::size_t idx = 0U; idx < ubIdx; ++idx) { - IdxType row = BoundsStepProcIdx[step][proc][idx]; - const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + IdxType row = BoundIdx[idx]; + const IdxType ubRow = BoundIdx[++idx]; for (; row <= ubRow; ++row) { double acc = x[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { @@ -132,23 +136,25 @@ void SpLTrSvSSPParallel(double *__restrict__ const x, const IdxType *__restrict__ const outer, const IdxType *__restrict__ const inner, const double *__restrict__ const val, - const std::vector>> &BoundsStepProcIdx) { + const std::vector>> &BoundsProcStepIdx) { static_assert(std::is_integral_v); - const std::size_t nthreads = BoundsStepProcIdx[0U].size(); + const std::size_t nthreads = BoundsProcStepIdx.size(); 
FlatCheckpointCounterBarrier barrier(nthreads); #pragma omp parallel num_threads(nthreads) { const std::size_t proc = static_cast(omp_get_thread_num()); - for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) { - const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + const std::vector> &BoundsStepIdx = BoundsProcStepIdx[proc]; + for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) { + const std::vector &BoundsIdx = BoundsStepIdx[step]; + const std::size_t ubIdx = BoundsIdx.size(); if (ubIdx > 0U) { barrier.Wait(proc, staleness - 1U); } for (std::size_t idx = 0; idx < ubIdx; ++idx) { - IdxType row = BoundsStepProcIdx[step][proc][idx]; - const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + IdxType row = BoundsIdx[idx]; + const IdxType ubRow = BoundsIdx[++idx]; for (; row <= ubRow; ++row) { double acc = b[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { @@ -167,23 +173,25 @@ void SpLTrSvSSPParallelInPlace(double *__restrict__ const x, const IdxType *__restrict__ const outer, const IdxType *__restrict__ const inner, const double *__restrict__ const val, - const std::vector>> &BoundsStepProcIdx) { + const std::vector>> &BoundsProcStepIdx) { static_assert(std::is_integral_v); - const std::size_t nthreads = BoundsStepProcIdx[0U].size(); + const std::size_t nthreads = BoundsProcStepIdx.size(); FlatCheckpointCounterBarrier barrier(nthreads); #pragma omp parallel num_threads(nthreads) { const std::size_t proc = static_cast(omp_get_thread_num()); - for (std::size_t step = 0; step < BoundsStepProcIdx.size(); ++step) { - const std::size_t ubIdx = BoundsStepProcIdx[step][proc].size(); + const std::vector> &BoundsStepIdx = BoundsProcStepIdx[proc]; + for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) { + const std::vector &BoundsIdx = BoundsStepIdx[step]; + const std::size_t ubIdx = BoundsIdx.size(); if (ubIdx > 0U) { barrier.Wait(proc, staleness - 1U); } for (std::size_t idx = 0; idx < 
ubIdx; ++idx) { - IdxType row = BoundsStepProcIdx[step][proc][idx]; - const IdxType ubRow = BoundsStepProcIdx[step][proc][++idx]; + IdxType row = BoundsIdx[idx]; + const IdxType ubRow = BoundsIdx[++idx]; for (; row <= ubRow; ++row) { double acc = x[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { From 6ee7cbad8238ec8142d1dff1f1ef567aab3cf75a Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 25 Mar 2026 10:29:35 +0100 Subject: [PATCH 52/57] u solve swap processor and superstep order (for locality) --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 71 +++++++++++-------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index b3638534..2871be86 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -76,7 +76,7 @@ class Sptrsv { unsigned numSupersteps_; std::vector>> vectorProcessorStepVerticesL_; - std::vector>> vectorStepProcessorVerticesU_; + std::vector>> vectorProcessorStepVerticesU_; std::vector ready_; std::vector>> boundsArrayL_; @@ -90,13 +90,13 @@ class Sptrsv { vectorProcessorStepVerticesL_ = std::vector>>( schedule.GetInstance().NumberOfProcessors(), std::vector>(schedule.NumberOfSupersteps())); - vectorStepProcessorVerticesU_ = std::vector>>( - schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); + vectorProcessorStepVerticesU_ = std::vector>>( + schedule.GetInstance().NumberOfProcessors(), std::vector>(schedule.NumberOfSupersteps())); boundsArrayL_ = std::vector>>( schedule.GetInstance().NumberOfProcessors(), std::vector>(schedule.NumberOfSupersteps())); boundsArrayU_ = std::vector>>( - schedule.NumberOfSupersteps(), std::vector>(schedule.GetInstance().NumberOfProcessors())); + schedule.GetInstance().NumberOfProcessors(), std::vector>(schedule.NumberOfSupersteps())); numSupersteps_ = 
schedule.NumberOfSupersteps(); UVertType numberOfVertices = instance_->GetComputationalDag().NumVertices(); @@ -141,29 +141,32 @@ class Sptrsv { UVertType node = numberOfVertices; do { node--; - vectorStepProcessorVerticesU_[schedule.AssignedSuperstep(node)][schedule.AssignedProcessor(node)].push_back( + vectorProcessorStepVerticesU_[schedule.AssignedProcessor(node)][schedule.AssignedSuperstep(node)].push_back( // --- SSP SpTRSV kernel integration from BspSptrsvCSR.hpp/cpp --- static_cast(node)); } while (node > 0); - for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) { - for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) { - if (!vectorStepProcessorVerticesU_[step][proc].empty()) { - EigenIdxType startU = static_cast(vectorStepProcessorVerticesU_[step][proc][0]); - EigenIdxType prevU = static_cast(vectorStepProcessorVerticesU_[step][proc][0]); - - for (UVertType i = 1; i < vectorStepProcessorVerticesU_[step][proc].size(); ++i) { - if (static_cast(vectorStepProcessorVerticesU_[step][proc][i]) != prevU - 1) { - boundsArrayU_[step][proc].push_back(startU); - boundsArrayU_[step][proc].push_back(prevU); - startU = static_cast(vectorStepProcessorVerticesU_[step][proc][i]); + for (unsigned int proc = 0; proc < instance_->NumberOfProcessors(); ++proc) { + for (unsigned int step = 0; step < schedule.NumberOfSupersteps(); ++step) { + const auto &vectorVerticesU = vectorProcessorStepVerticesU_[proc][step]; + auto &localBoundsArrayU = boundsArrayU_[proc][step]; + + if (!vectorVerticesU.empty()) { + EigenIdxType startU = static_cast(vectorVerticesU[0]); + EigenIdxType prevU = static_cast(vectorVerticesU[0]); + + for (UVertType i = 1; i < vectorVerticesU.size(); ++i) { + if (static_cast(vectorVerticesU[i]) != prevU - 1) { + localBoundsArrayU.push_back(startU); + localBoundsArrayU.push_back(prevU); + startU = static_cast(vectorVerticesU[i]); } - prevU = static_cast(vectorStepProcessorVerticesU_[step][proc][i]); + prevU = 
static_cast(vectorVerticesU[i]); } - boundsArrayU_[step][proc].push_back(startU); - boundsArrayU_[step][proc].push_back(prevU); + localBoundsArrayU.push_back(startU); + localBoundsArrayU.push_back(prevU); } } } @@ -474,13 +477,15 @@ class Sptrsv { { // Process each superstep starting from the last one (opposite of lsolve) const std::size_t proc = static_cast(omp_get_thread_num()); + const auto& procLocalBoundsArrayU = boundsArrayU_[proc]; unsigned step = numSupersteps_; do { step--; - const std::size_t boundsStrSize = boundsArrayU_[step][proc].size(); - for (std::size_t index = 0; index < boundsStrSize; index += 2) { - EigenIdxType node = boundsArrayU_[step][proc][index] + 1; - const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1]; + const auto &localBoundsArrayU = procLocalBoundsArrayU[step]; + const std::size_t boundsStrSize = localBoundsArrayU.size(); + for (std::size_t index = 0; index < boundsStrSize; ++index) { + EigenIdxType node = localBoundsArrayU[index] + 1; + const EigenIdxType lowerB = localBoundsArrayU[++index]; do { node--; @@ -517,13 +522,15 @@ class Sptrsv { { // Process each superstep starting from the last one (opposite of lsolve) const std::size_t proc = static_cast(omp_get_thread_num()); + const auto &procLocalBoundsArrayU = boundsArrayU_[proc]; unsigned step = numSupersteps_; do { step--; - const std::size_t boundsStrSize = boundsArrayU_[step][proc].size(); - for (std::size_t index = 0; index < boundsStrSize; index += 2) { - EigenIdxType node = boundsArrayU_[step][proc][index] + 1; - const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1]; + const auto &localBoundsArrayU = procLocalBoundsArrayU[step]; + const std::size_t boundsStrSize = localBoundsArrayU.size(); + for (std::size_t index = 0; index < boundsStrSize; ++index) { + EigenIdxType node = localBoundsArrayU[index] + 1; + const EigenIdxType lowerB = localBoundsArrayU[++index]; do { node--; @@ -732,17 +739,19 @@ class Sptrsv { # pragma omp parallel num_threads(nthreads) 
{ const std::size_t proc = static_cast(omp_get_thread_num()); + const auto &procLocalBoundsArrayU = boundsArrayU_[proc]; unsigned step = numSupersteps_; do { step--; - const std::size_t boundsStrSize = boundsArrayU_[step][proc].size(); + const auto &localBoundsArrayU = procLocalBoundsArrayU[step]; + const std::size_t boundsStrSize = localBoundsArrayU.size(); if (boundsStrSize > 0U) { barrier.Wait(proc, staleness - 1U); } - for (std::size_t index = 0; index < boundsStrSize; index += 2) { - EigenIdxType node = boundsArrayU_[step][proc][index] + 1; - const EigenIdxType lowerB = boundsArrayU_[step][proc][index + 1]; + for (std::size_t index = 0; index < boundsStrSize; ++index) { + EigenIdxType node = localBoundsArrayU[index] + 1; + const EigenIdxType lowerB = localBoundsArrayU[++index]; do { node--; From 58060450607f231306be3a3613569ef1f1353ba7 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 25 Mar 2026 11:40:00 +0100 Subject: [PATCH 53/57] small sptrsv kernel optimisations --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 2871be86..90cc2f28 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -580,9 +580,12 @@ class Sptrsv { # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { const std::size_t proc = static_cast(omp_get_thread_num()); + const auto &stepPtr = procStepPtr_[proc]; + const auto &stepNum = procStepNum_[proc]; + for (unsigned step = 0; step < numSupersteps_; step++) { - const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; - for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { + const UVertType upperLimit = stepPtr[step] + stepNum[step]; + for (UVertType rowIdx = stepPtr[step]; rowIdx < upperLimit; rowIdx++) { 
double acc = x[rowIdx]; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { acc -= val_[i] * x[colIdx_[i]]; @@ -608,10 +611,13 @@ class Sptrsv { # pragma omp parallel num_threads(instance_->NumberOfProcessors()) { + const std::size_t proc = static_cast(omp_get_thread_num()); + const auto &stepPtr = procStepPtr_[proc]; + const auto &stepNum = procStepNum_[proc]; + for (unsigned step = 0; step < numSupersteps_; step++) { - const std::size_t proc = static_cast(omp_get_thread_num()); - const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; - for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { + const UVertType upperLimit = stepPtr[step] + stepNum[step]; + for (UVertType rowIdx = stepPtr[step]; rowIdx < upperLimit; rowIdx++) { double acc = b[rowIdx]; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { acc -= val_[i] * x[colIdx_[i]]; @@ -639,13 +645,18 @@ class Sptrsv { # pragma omp parallel num_threads(nthreads) { const std::size_t proc = static_cast(omp_get_thread_num()); + const auto &stepPtr = procStepPtr_[proc]; + const auto &stepNum = procStepNum_[proc]; + for (unsigned step = 0; step < numSupersteps_; ++step) { - if (procStepNum_[proc][step] > 0U) { + UVertType rowIdx = stepPtr[step]; + const UVertType upperLimit = stepPtr[step] + stepNum[step]; + + if (rowIdx != upperLimit) { barrier.Wait(proc, staleness - 1U); } - const UVertType upperLimit = procStepPtr_[proc][step] + procStepNum_[proc][step]; - for (UVertType rowIdx = procStepPtr_[proc][step]; rowIdx < upperLimit; rowIdx++) { + for (; rowIdx < upperLimit; rowIdx++) { double acc = x[rowIdx]; for (UVertType i = rowPtr_[rowIdx]; i < rowPtr_[rowIdx + 1] - 1; i++) { acc -= val_[i] * x[colIdx_[i]]; From 72817ba73553e92f6b163421fac1836a4ce232e8 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 25 Mar 2026 13:03:13 +0100 Subject: [PATCH 54/57] sptrsv iterator optimisations --- 
.../sptrsv_simulator/sptrsv_kernels.hpp | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp index f2988213..fcbcf424 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv_kernels.hpp @@ -79,11 +79,11 @@ void SpLTrSvBSPParallel(double *__restrict__ const x, const std::size_t numSuperSteps = BoundsStepIdx.size(); for (std::size_t step = 0U; step < numSuperSteps; ++step) { - const std::vector &BoundIdx = BoundsStepIdx[step]; - const std::size_t ubIdx = BoundIdx.size(); - for (std::size_t idx = 0U; idx < ubIdx; ++idx) { - IdxType row = BoundIdx[idx]; - const IdxType ubRow = BoundIdx[++idx]; + const std::vector &BoundsIdx = BoundsStepIdx[step]; + const auto idxItEnd = BoundsIdx.cend(); + for (auto idxIt = BoundsIdx.cbegin(); idxIt != idxItEnd; ++idxIt) { + IdxType row = *idxIt; + const IdxType ubRow = *(++idxIt); for (; row <= ubRow; ++row) { double acc = b[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { @@ -112,11 +112,11 @@ void SpLTrSvBSPParallelInPlace(double *__restrict__ const x, const std::size_t numSuperSteps = BoundsStepIdx.size(); for (std::size_t step = 0U; step < numSuperSteps; ++step) { - const std::vector &BoundIdx = BoundsStepIdx[step]; - const std::size_t ubIdx = BoundIdx.size(); - for (std::size_t idx = 0U; idx < ubIdx; ++idx) { - IdxType row = BoundIdx[idx]; - const IdxType ubRow = BoundIdx[++idx]; + const std::vector &BoundsIdx = BoundsStepIdx[step]; + const auto idxItEnd = BoundsIdx.cend(); + for (auto idxIt = BoundsIdx.cbegin(); idxIt != idxItEnd; ++idxIt) { + IdxType row = *idxIt; + const IdxType ubRow = *(++idxIt); for (; row <= ubRow; ++row) { double acc = x[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { @@ -148,13 +148,16 @@ void 
SpLTrSvSSPParallel(double *__restrict__ const x, const std::vector> &BoundsStepIdx = BoundsProcStepIdx[proc]; for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) { const std::vector &BoundsIdx = BoundsStepIdx[step]; - const std::size_t ubIdx = BoundsIdx.size(); - if (ubIdx > 0U) { + auto idxIt = BoundsIdx.cbegin(); + const auto idxItEnd = BoundsIdx.cend(); + + if (idxIt != idxItEnd) { barrier.Wait(proc, staleness - 1U); } - for (std::size_t idx = 0; idx < ubIdx; ++idx) { - IdxType row = BoundsIdx[idx]; - const IdxType ubRow = BoundsIdx[++idx]; + + for (; idxIt != idxItEnd; ++idxIt) { + IdxType row = *idxIt; + const IdxType ubRow = *(++idxIt); for (; row <= ubRow; ++row) { double acc = b[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { @@ -185,13 +188,16 @@ void SpLTrSvSSPParallelInPlace(double *__restrict__ const x, const std::vector> &BoundsStepIdx = BoundsProcStepIdx[proc]; for (std::size_t step = 0; step < BoundsStepIdx.size(); ++step) { const std::vector &BoundsIdx = BoundsStepIdx[step]; - const std::size_t ubIdx = BoundsIdx.size(); - if (ubIdx > 0U) { + auto idxIt = BoundsIdx.cbegin(); + const auto idxItEnd = BoundsIdx.cend(); + + if (idxIt != idxItEnd) { barrier.Wait(proc, staleness - 1U); } - for (std::size_t idx = 0; idx < ubIdx; ++idx) { - IdxType row = BoundsIdx[idx]; - const IdxType ubRow = BoundsIdx[++idx]; + + for (; idxIt != idxItEnd; ++idxIt) { + IdxType row = *idxIt; + const IdxType ubRow = *(++idxIt); for (; row <= ubRow; ++row) { double acc = x[row]; for (IdxType entryIdx = outer[row]; entryIdx < outer[row + 1] - 1; ++entryIdx) { From 5c71d4a23450ba2294bb20d5586fea2711e501bf Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 25 Mar 2026 13:18:06 +0100 Subject: [PATCH 55/57] more sptrsv iterator optimisations --- .../osp/auxiliary/sptrsv_simulator/sptrsv.hpp | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git 
a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 90cc2f28..c246243a 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -482,10 +482,10 @@ class Sptrsv { do { step--; const auto &localBoundsArrayU = procLocalBoundsArrayU[step]; - const std::size_t boundsStrSize = localBoundsArrayU.size(); - for (std::size_t index = 0; index < boundsStrSize; ++index) { - EigenIdxType node = localBoundsArrayU[index] + 1; - const EigenIdxType lowerB = localBoundsArrayU[++index]; + const auto idxItEnd = localBoundsArrayU.cend(); + for (auto idxIt = localBoundsArrayU.cbegin(); idxIt != idxItEnd; ++idxIt) { + EigenIdxType node = (*idxIt) + 1; + const EigenIdxType lowerB = *(++idxIt); do { node--; @@ -528,9 +528,10 @@ class Sptrsv { step--; const auto &localBoundsArrayU = procLocalBoundsArrayU[step]; const std::size_t boundsStrSize = localBoundsArrayU.size(); - for (std::size_t index = 0; index < boundsStrSize; ++index) { - EigenIdxType node = localBoundsArrayU[index] + 1; - const EigenIdxType lowerB = localBoundsArrayU[++index]; + const auto idxItEnd = localBoundsArrayU.cend(); + for (auto idxIt = localBoundsArrayU.cbegin(); idxIt != idxItEnd; ++idxIt) { + EigenIdxType node = (*idxIt) + 1; + const EigenIdxType lowerB = *(++idxIt); do { node--; @@ -755,14 +756,16 @@ class Sptrsv { do { step--; const auto &localBoundsArrayU = procLocalBoundsArrayU[step]; - const std::size_t boundsStrSize = localBoundsArrayU.size(); - if (boundsStrSize > 0U) { + auto idxIt = localBoundsArrayU.cbegin(); + const auto idxItEnd = localBoundsArrayU.cend(); + + if (idxIt != idxItEnd) { barrier.Wait(proc, staleness - 1U); } - for (std::size_t index = 0; index < boundsStrSize; ++index) { - EigenIdxType node = localBoundsArrayU[index] + 1; - const EigenIdxType lowerB = localBoundsArrayU[++index]; + for (; idxIt != idxItEnd; ++idxIt) { + EigenIdxType node = (*idxIt) + 1; + const 
EigenIdxType lowerB = *(++idxIt); do { node--; From d3e6adef7d802cdc058ea05088d81926794903b9 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 1 Apr 2026 14:58:18 +0200 Subject: [PATCH 56/57] removed pedantic warnings --- tests/kl_lambda.cpp | 4 ++-- tests/kl_total.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/kl_lambda.cpp b/tests/kl_lambda.cpp index 8acaf427..6d985d25 100644 --- a/tests/kl_lambda.cpp +++ b/tests/kl_lambda.cpp @@ -356,7 +356,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_incremental_update_test) { CheckEqualLambdaMap(lambda_map, kl_6.GetCommCostF().nodeLambdaMap_); CheckEqualAffinityTable(affinity, kl_6.GetAffinityTable(), nodes_to_check); -}; +} BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_test) { using graph = ComputationalDagEdgeIdxVectorImplDefIntT; @@ -486,7 +486,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_test) { CheckEqualLambdaMap(lambda_map, kl_6.GetCommCostF().nodeLambdaMap_); CheckEqualAffinityTable(affinity, kl_6.GetAffinityTable(), nodes_to_check); -}; +} BOOST_AUTO_TEST_CASE(KlLambdaImproverInnerLoopPenaltyTest) { using Graph = ComputationalDagEdgeIdxVectorImplDefIntT; diff --git a/tests/kl_total.cpp b/tests/kl_total.cpp index 5b48d12d..f0d1e25f 100644 --- a/tests/kl_total.cpp +++ b/tests/kl_total.cpp @@ -887,7 +887,7 @@ BOOST_AUTO_TEST_CASE(KlImprover_incremental_update_test) { nodes_to_check.erase(v3); CheckEqualAffinityTable(affinity, kl_6.GetAffinityTable(), nodes_to_check); -}; +} // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs) { // std::vector filenames_graph = LargeSpaaGraphs(); From 0e598b2d623daa0184ffb6998109d2e4f2d31c60 Mon Sep 17 00:00:00 2001 From: Raphael Steiner Date: Wed, 1 Apr 2026 15:10:17 +0200 Subject: [PATCH 57/57] reactivation of concept --- include/osp/concepts/directed_graph_concept.hpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/osp/concepts/directed_graph_concept.hpp 
b/include/osp/concepts/directed_graph_concept.hpp index 09bc9900..aaa537ad 100644 --- a/include/osp/concepts/directed_graph_concept.hpp +++ b/include/osp/concepts/directed_graph_concept.hpp @@ -64,15 +64,13 @@ struct IsDirectedGraph().Children(std::declval>())), decltype(std::declval().InDegree(std::declval>())), decltype(std::declval().OutDegree(std::declval>()))>> - : std::conjunction< - IsForwardRangeOf().Vertices()), VertexIdxT>, + : std::conjunction().Vertices()), VertexIdxT>, std::is_integral().NumVertices())>, std::is_integral().NumEdges())>, - IsInputRangeOf().Parents(std::declval>())), VertexIdxT> - // IsInputRangeOf().Children(std::declval>())), VertexIdxT>, - // std::is_integral().InDegree(std::declval>()))>, - // std::is_integral().OutDegree(std::declval>()))> - > {}; + IsInputRangeOf().Parents(std::declval>())), VertexIdxT>, + IsInputRangeOf().Children(std::declval>())), VertexIdxT>, + std::is_integral().InDegree(std::declval>()))>, + std::is_integral().OutDegree(std::declval>()))>> {}; template inline constexpr bool isDirectedGraphV = IsDirectedGraph::value;