diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e8cb55..9c0194f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,15 +20,15 @@ option(ENABLE_WARNINGS "Enable warnings" OFF)
# Check for TBB
if(NOT MSVC AND NOT DISABLE_PAR)
- find_package(TBB QUIET)
- if(TBB_FOUND)
- message(STATUS "TBB found. Enabling parallel execution.")
- else()
- message(STATUS "TBB not found. Disabling parallel execution.")
- set(DISABLE_PAR ON)
- endif()
+ find_package(TBB QUIET)
+ if(TBB_FOUND)
+ message(STATUS "TBB found. Enabling parallel execution.")
+ else()
+ message(STATUS "TBB not found. Disabling parallel execution.")
+ set(DISABLE_PAR ON)
+ endif()
elseif(DISABLE_PAR)
- message(STATUS "DISABLE_PAR set. Disabling parallel execution.")
+ message(STATUS "DISABLE_PAR set. Disabling parallel execution.")
endif()
# Create the ctrack library
@@ -40,41 +40,41 @@ target_include_directories(ctrack INTERFACE
# Configure ctrack based on TBB availability
if(DISABLE_PAR)
- target_compile_definitions(ctrack INTERFACE CTRACK_DISABLE_EXECUTION_POLICY)
+ target_compile_definitions(ctrack INTERFACE CTRACK_DISABLE_EXECUTION_POLICY)
elseif(NOT MSVC AND TBB_FOUND)
- target_link_libraries(ctrack INTERFACE TBB::tbb)
+ target_link_libraries(ctrack INTERFACE TBB::tbb)
endif()
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(ENABLE_WARNINGS)
- if (NOT MSVC)
- include(cmake/add_warning.cmake)
- include(cmake/warnings.cmake)
- endif()
+ if (NOT MSVC)
+ include(cmake/add_warning.cmake)
+ include(cmake/warnings.cmake)
+ endif()
endif()
# Add the examples subdirectory if not disabled
if(NOT DISABLE_EXAMPLES)
- add_subdirectory(examples)
+ add_subdirectory(examples)
else()
- message(STATUS "Building examples disabled.")
+ message(STATUS "Building examples disabled.")
endif()
# Add the benchmark subdirectory if enabled
if(BUILD_BENCHMARK)
- add_subdirectory(benchmark)
- message(STATUS "Building benchmark enabled.")
+ add_subdirectory(benchmark)
+ message(STATUS "Building benchmark enabled.")
else()
- message(STATUS "Building benchmark disabled.")
+ message(STATUS "Building benchmark disabled.")
endif()
# Add the test subdirectory if enabled
if(BUILD_TESTS)
- add_subdirectory(test)
- enable_testing()
- message(STATUS "Building tests enabled.")
+ add_subdirectory(test)
+ enable_testing()
+ message(STATUS "Building tests enabled.")
else()
- message(STATUS "Building tests disabled.")
+ message(STATUS "Building tests disabled.")
endif()
# Installation
@@ -109,4 +109,4 @@ install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/ctrackConfig.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/ctrackConfigVersion.cmake"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ctrack
-)
\ No newline at end of file
+)
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 1e014d6..9d04305 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -1,24 +1,29 @@
-add_executable(ctrack_benchmark ctrack_benchmark.cpp)
-target_link_libraries(ctrack_benchmark PRIVATE ctrack)
-
-# Enable threading support
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-find_package(Threads REQUIRED)
-target_link_libraries(ctrack_benchmark PRIVATE Threads::Threads)
-
-# Add filesystem library if needed (for older compilers)
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+# Helper macro — avoids repetition
+macro(add_ctrack_benchmark target_name clock_define)
+ add_executable(${target_name} ctrack_benchmark.cpp)
+ target_link_libraries(${target_name} PRIVATE ctrack)
+ target_compile_options(${target_name} PRIVATE -O3)
+ if(NOT "${clock_define}" STREQUAL "")
+ target_compile_definitions(${target_name} PRIVATE ${clock_define})
+ endif()
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+ find_package(Threads REQUIRED)
+ target_link_libraries(${target_name} PRIVATE Threads::Threads)
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
- target_link_libraries(ctrack_benchmark PRIVATE stdc++fs)
+ target_link_libraries(${target_name} PRIVATE stdc++fs)
endif()
-elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
- target_link_libraries(ctrack_benchmark PRIVATE c++fs)
+ target_link_libraries(${target_name} PRIVATE c++fs)
endif()
-endif()
-
-# Set output directory
-set_target_properties(ctrack_benchmark
- PROPERTIES
+ endif()
+ set_target_properties(${target_name} PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark"
-)
\ No newline at end of file
+ )
+endmacro()
+
+add_ctrack_benchmark(ctrack_benchmark "")
+add_ctrack_benchmark(ctrack_benchmark_rdtsc CTRACK_CLOCK_RDTSC)
+add_ctrack_benchmark(ctrack_benchmark_rdtscp CTRACK_CLOCK_RDTSCP)
+add_ctrack_benchmark(ctrack_benchmark_rdtscp_lfence CTRACK_CLOCK_RDTSCP_LFENCE)
diff --git a/benchmark/bench_results.svg b/benchmark/bench_results.svg
new file mode 100644
index 0000000..36068fa
--- /dev/null
+++ b/benchmark/bench_results.svg
@@ -0,0 +1,114 @@
+
diff --git a/benchmark/ctrack_benchmark.cpp b/benchmark/ctrack_benchmark.cpp
index e6ff4be..75c60f2 100644
--- a/benchmark/ctrack_benchmark.cpp
+++ b/benchmark/ctrack_benchmark.cpp
@@ -1,811 +1,870 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#ifdef _WIN32
-#include
-#include
-#else
-#include
-#include
-#endif
-
-// Configuration
-struct BenchmarkConfig
-{
- size_t total_events = 50'000'000; // Default 50 million events
- size_t thread_count = std::thread::hardware_concurrency();
- bool record_baseline = false;
- bool compare_baseline = false;
- std::string baseline_file = "ctrack_baseline.json";
- bool verbose = false;
-};
-
-// Baseline data structure
-struct BaselineData
-{
- double accuracy_error_percent;
- double accuracy_error_ms_per_event;
- double overhead_percent;
- double overhead_ms;
- double overhead_ns_per_event;
- double memory_bytes_per_event;
- double calculation_time_ms;
- double peak_calc_memory_mb;
- size_t total_events;
- size_t thread_count;
- std::string timestamp;
- std::string platform;
-};
-
-// Global config
-BenchmarkConfig g_config;
-
-// Get current memory usage in bytes
-size_t get_memory_usage()
-{
-#ifdef _WIN32
- PROCESS_MEMORY_COUNTERS_EX pmc;
- GetProcessMemoryInfo(GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS *)&pmc, sizeof(pmc));
- return pmc.WorkingSetSize;
-#else
- struct rusage usage;
- getrusage(RUSAGE_SELF, &usage);
- return usage.ru_maxrss * 1024; // Convert KB to bytes on Linux
-#endif
-}
-
-// Precise busy wait function - waits for specified nanoseconds
-void busy_wait_ns(int64_t nanoseconds)
-{
- auto start = std::chrono::high_resolution_clock::now();
- auto target_duration = std::chrono::nanoseconds(nanoseconds);
-
- while (true)
- {
- auto now = std::chrono::high_resolution_clock::now();
- auto elapsed = now - start;
- if (elapsed >= target_duration)
- {
- break;
- }
- }
-}
-
-// Benchmark functions with predictable timing
-void leaf_function(int depth)
-{
- CTRACK_NAME("leaf_function");
- // Busy wait for 1 microsecond (1000 ns)
- busy_wait_ns(1000);
-}
-
-void level_3_function(int depth)
-{
- CTRACK_NAME("level_3_function");
- // Busy wait for 500 ns
- busy_wait_ns(500);
-
- // Call leaf function twice
- leaf_function(depth + 1);
- leaf_function(depth + 1);
-}
-
-void level_2_function(int depth, int iterations)
-{
- CTRACK_NAME("level_2_function");
- // Busy wait for 300 ns
- busy_wait_ns(300);
-
- for (int i = 0; i < iterations; ++i)
- {
- level_3_function(depth + 1);
- }
-}
-
-void level_1_function(int iterations)
-{
- CTRACK_NAME("level_1_function");
- // Busy wait for 200 ns
- busy_wait_ns(200);
-
- level_2_function(1, iterations);
-}
-
-// Version without CTRACK for overhead measurement
-void leaf_function_no_track(int depth)
-{
- busy_wait_ns(1000);
-}
-
-void level_3_function_no_track(int depth)
-{
- busy_wait_ns(500);
- leaf_function_no_track(depth + 1);
- leaf_function_no_track(depth + 1);
-}
-
-void level_2_function_no_track(int depth, int iterations)
-{
- busy_wait_ns(300);
- for (int i = 0; i < iterations; ++i)
- {
- level_3_function_no_track(depth + 1);
- }
-}
-
-void level_1_function_no_track(int iterations)
-{
- busy_wait_ns(200);
- level_2_function_no_track(1, iterations);
-}
-
-// Worker thread function
-void benchmark_worker(size_t events_per_thread, std::atomic &start_flag)
-{
- // Wait for start signal
- while (!start_flag.load())
- {
- std::this_thread::yield();
- }
-
- // Calculate iterations to reach target event count
- // Each level_1 call generates: 1 + 1 + iterations * (1 + 2) events
- // For iterations=10: 1 + 1 + 10 * 3 = 32 events per call
- const int iterations = 10;
- const int events_per_call = 2 + iterations * 3;
- size_t calls_needed = events_per_thread / events_per_call;
-
- for (size_t i = 0; i < calls_needed; ++i)
- {
- level_1_function(iterations);
- }
-}
-
-// Worker thread function without tracking
-void benchmark_worker_no_track(size_t events_per_thread, std::atomic &start_flag)
-{
- while (!start_flag.load())
- {
- std::this_thread::yield();
- }
-
- const int iterations = 10;
- const int events_per_call = 2 + iterations * 3;
- size_t calls_needed = events_per_thread / events_per_call;
-
- for (size_t i = 0; i < calls_needed; ++i)
- {
- level_1_function_no_track(iterations);
- }
-}
-
-// Parse timing from CTRACK results string for a specific function
-double parse_function_timing(const std::string &results, const std::string &function_name)
-{
- // Look for the Details section first
- size_t details_pos = results.find("Details");
- if (details_pos == std::string::npos)
- {
- return -1.0; // Details section not found
- }
-
- // Look for the function name after the Details section
- size_t func_pos = results.find(function_name, details_pos);
- if (func_pos == std::string::npos)
- {
- return -1.0; // Function not found in Details section
- }
-
- // Find the line containing this function in the Details section
- size_t line_start = results.rfind('\n', func_pos);
- if (line_start == std::string::npos)
- line_start = details_pos;
- else
- line_start++; // Skip the newline
-
- size_t line_end = results.find('\n', func_pos);
- if (line_end == std::string::npos)
- line_end = results.length();
-
- std::string line = results.substr(line_start, line_end - line_start);
-
- // Look for the "time acc" column value (4th column after filename, function, line)
- // Split by | and find the 4th field
- std::vector fields;
- std::istringstream iss(line);
- std::string field;
-
- while (std::getline(iss, field, '|'))
- {
- // Trim whitespace
- field.erase(0, field.find_first_not_of(" \t"));
- field.erase(field.find_last_not_of(" \t") + 1);
- if (!field.empty())
- {
- fields.push_back(field);
- }
- }
-
- // The time acc should be in the 4th field (0-indexed: filename=0, function=1, line=2, time_acc=3)
- if (fields.size() > 3)
- {
- std::string time_acc = fields[3];
-
- // Parse value and unit from time_acc (e.g., "2.09 ms")
- std::istringstream time_iss(time_acc);
- double value;
- std::string unit;
-
- if (time_iss >> value >> unit)
- {
- // Convert to nanoseconds based on unit
- if (unit == "s")
- return value * 1e9;
- else if (unit == "ms")
- return value * 1e6;
- else if (unit == "mcs")
- return value * 1e3;
- else if (unit == "ns")
- return value;
- }
- }
-
- return -1.0; // Could not parse
-}
-
-// Measure accuracy by comparing known timings with CTRACK measurements
-std::pair measure_accuracy()
-{
- std::cout << "\n=== Measuring Accuracy ===" << std::endl;
-
- // Clear any previous tracking data by getting and discarding results
- ctrack::result_as_string();
-
- // Run a controlled test with known timings
- const int test_iterations = 100;
- for (int i = 0; i < test_iterations; ++i)
- {
- level_1_function(10);
- }
-
- // Get results
- auto results = ctrack::result_as_string();
-
- // Expected timings per iteration (in nanoseconds):
- // leaf_function: 1000ns (called 20 times per iteration) = 20,000ns total per iteration
- // level_3_function: 500ns + 2*1000ns = 2500ns (called 10 times per iteration) = 25,000ns total per iteration
- // level_2_function: 300ns + 10*2500ns = 25,300ns (called 1 time per iteration) = 25,300ns total per iteration
- // level_1_function: 200ns + 25,300ns = 25,500ns (called 1 time per iteration) = 25,500ns total per iteration
-
- struct ExpectedTiming
- {
- std::string name;
- double expected_total_ns;
- int call_count;
- };
-
- std::vector expected_timings = {
- {"leaf_function", 1000.0 * 20 * test_iterations, 20 * test_iterations},
- {"level_3_function", 2500.0 * 10 * test_iterations, 10 * test_iterations},
- {"level_2_function", 25300.0 * 1 * test_iterations, 1 * test_iterations},
- {"level_1_function", 25500.0 * 1 * test_iterations, 1 * test_iterations}};
-
- double total_expected_time = 0.0;
- double total_actual_time = 0.0;
- double max_absolute_error = 0.0;
-
- if (g_config.verbose)
- {
- std::cout << "Function accuracy analysis:" << std::endl;
- }
-
- for (const auto &timing : expected_timings)
- {
- double actual_ns = parse_function_timing(results, timing.name);
- if (actual_ns > 0)
- {
- double expected_ns = timing.expected_total_ns;
- double absolute_error = std::abs(actual_ns - expected_ns);
- double percent_error = (absolute_error / expected_ns) * 100.0;
-
- total_expected_time += expected_ns;
- total_actual_time += actual_ns;
- max_absolute_error = (std::max)(max_absolute_error, absolute_error);
-
- if (g_config.verbose)
- {
- std::cout << " " << timing.name << ": expected " << expected_ns / 1e6 << " ms, got "
- << actual_ns / 1e6 << " ms (error: " << percent_error << "%)" << std::endl;
- }
- }
- else if (g_config.verbose)
- {
- std::cout << " " << timing.name << ": could not parse timing" << std::endl;
- }
- }
-
- double overall_error_percent = 0.0;
- double overall_error_ms = 0.0;
-
- if (total_expected_time > 0)
- {
- double total_absolute_error = std::abs(total_actual_time - total_expected_time);
- overall_error_percent = (total_absolute_error / total_expected_time) * 100.0;
-
- // Calculate total number of events across all functions
- double total_events = 0;
- for (const auto &timing : expected_timings)
- {
- total_events += timing.call_count;
- }
-
- // Convert to milliseconds per event
- overall_error_ms = (total_absolute_error / 1e6) / total_events; // Convert to milliseconds per event
- }
-
- if (g_config.verbose)
- {
- std::cout << "Overall accuracy error: " << overall_error_percent << "% (" << overall_error_ms << " ms per event)" << std::endl;
- }
-
- return {overall_error_percent, overall_error_ms};
-}
-
-// Measure overhead by comparing with and without CTRACK
-std::tuple measure_overhead()
-{
- std::cout << "\n=== Measuring Overhead ===" << std::endl;
-
- const size_t overhead_events = 1'000'000; // 1M events for overhead test
- size_t events_per_thread = overhead_events / g_config.thread_count;
-
- // Measure without CTRACK
- auto start_no_track = std::chrono::high_resolution_clock::now();
- {
- std::vector threads;
- std::atomic start_flag{false};
-
- for (size_t i = 0; i < g_config.thread_count; ++i)
- {
- threads.emplace_back(benchmark_worker_no_track, events_per_thread, std::ref(start_flag));
- }
-
- start_flag = true;
-
- for (auto &t : threads)
- {
- t.join();
- }
- }
- auto end_no_track = std::chrono::high_resolution_clock::now();
- auto duration_no_track = std::chrono::duration_cast(end_no_track - start_no_track).count();
-
- // Clear tracking data by getting and discarding results
- ctrack::result_as_string();
-
- // Measure with CTRACK
- auto start_track = std::chrono::high_resolution_clock::now();
- {
- std::vector threads;
- std::atomic start_flag{false};
-
- for (size_t i = 0; i < g_config.thread_count; ++i)
- {
- threads.emplace_back(benchmark_worker, events_per_thread, std::ref(start_flag));
- }
-
- start_flag = true;
-
- for (auto &t : threads)
- {
- t.join();
- }
- }
- auto end_track = std::chrono::high_resolution_clock::now();
- auto duration_track = std::chrono::duration_cast(end_track - start_track).count();
-
- double overhead_percent = ((double)(duration_track - duration_no_track) / duration_no_track) * 100.0;
- double overhead_ms = (duration_track - duration_no_track) / 1000.0; // Convert microseconds to milliseconds
- double overhead_ns_per_event = ((duration_track - duration_no_track) * 1000.0) / overhead_events; // nanoseconds per event
-
- if (g_config.verbose)
- {
- std::cout << "Without CTRACK: " << duration_no_track << " µs" << std::endl;
- std::cout << "With CTRACK: " << duration_track << " µs" << std::endl;
- std::cout << "Overhead: " << overhead_percent << "% (" << overhead_ms << " ms total, "
- << overhead_ns_per_event << " ns per event)" << std::endl;
- }
-
- return {overhead_percent, overhead_ms, overhead_ns_per_event};
-}
-
-// Measure memory usage and calculation time
-std::tuple measure_memory_and_calculation_time()
-{
- std::cout << "\n=== Measuring Memory Usage and Calculation Time ===" << std::endl;
-
- // Clear any previous tracking data by getting and discarding results
- ctrack::result_as_string();
-
- // Measure initial memory
- size_t initial_memory = get_memory_usage();
-
- // Generate events
- size_t events_per_thread = g_config.total_events / g_config.thread_count;
-
- if (g_config.verbose)
- {
- std::cout << "Generating " << g_config.total_events << " events across "
- << g_config.thread_count << " threads..." << std::endl;
- }
-
- auto gen_start = std::chrono::high_resolution_clock::now();
- {
- std::vector threads;
- std::atomic start_flag{false};
-
- for (size_t i = 0; i < g_config.thread_count; ++i)
- {
- threads.emplace_back(benchmark_worker, events_per_thread, std::ref(start_flag));
- }
-
- start_flag = true;
-
- for (auto &t : threads)
- {
- t.join();
- }
- }
- auto gen_end = std::chrono::high_resolution_clock::now();
-
- // Measure memory after event generation
- size_t post_event_memory = get_memory_usage();
- size_t memory_used = post_event_memory - initial_memory;
- double bytes_per_event = (double)memory_used / g_config.total_events;
-
- if (g_config.verbose)
- {
- auto gen_duration = std::chrono::duration_cast(gen_end - gen_start).count();
- std::cout << "Event generation took: " << gen_duration << " ms" << std::endl;
- std::cout << "Memory used: " << memory_used / (1024.0 * 1024.0) << " MB" << std::endl;
- std::cout << "Memory per event: " << bytes_per_event << " bytes" << std::endl;
- }
-
- // Measure calculation time and peak memory usage
- std::atomic monitoring{true};
- std::atomic peak_memory{post_event_memory};
-
- // Start memory monitoring thread
- std::thread monitor_thread([&monitoring, &peak_memory, initial_memory]()
- {
- while (monitoring.load()) {
- size_t current_memory = get_memory_usage();
- size_t current_peak = peak_memory.load();
- while (current_memory > current_peak &&
- !peak_memory.compare_exchange_weak(current_peak, current_memory)) {}
- std::this_thread::sleep_for(std::chrono::milliseconds(10)); // Poll every 10ms
- } });
-
- auto calc_start = std::chrono::high_resolution_clock::now();
- auto results = ctrack::result_as_string();
- auto calc_end = std::chrono::high_resolution_clock::now();
-
- // Stop monitoring
- monitoring = false;
- monitor_thread.join();
-
- auto calc_duration = std::chrono::duration_cast(calc_end - calc_start).count() / 1000.0;
- double peak_calc_memory_mb = (peak_memory.load() - initial_memory) / (1024.0 * 1024.0);
-
- if (g_config.verbose)
- {
- std::cout << "Result calculation took: " << calc_duration << " ms" << std::endl;
- std::cout << "Peak memory during calculation: " << peak_calc_memory_mb << " MB" << std::endl;
- }
-
- return {bytes_per_event, calc_duration, peak_calc_memory_mb};
-}
-
-// Save baseline to file
-void save_baseline(const BaselineData &data)
-{
- std::ofstream file(g_config.baseline_file);
- if (!file)
- {
- std::cerr << "Error: Could not open baseline file for writing: " << g_config.baseline_file << std::endl;
- return;
- }
-
- // Simple JSON format
- file << "{\n";
- file << " \"accuracy_error_percent\": " << data.accuracy_error_percent << ",\n";
- file << " \"accuracy_error_ms_per_event\": " << data.accuracy_error_ms_per_event << ",\n";
- file << " \"overhead_percent\": " << data.overhead_percent << ",\n";
- file << " \"overhead_ms\": " << data.overhead_ms << ",\n";
- file << " \"overhead_ns_per_event\": " << data.overhead_ns_per_event << ",\n";
- file << " \"memory_bytes_per_event\": " << data.memory_bytes_per_event << ",\n";
- file << " \"calculation_time_ms\": " << data.calculation_time_ms << ",\n";
- file << " \"peak_calc_memory_mb\": " << data.peak_calc_memory_mb << ",\n";
- file << " \"total_events\": " << data.total_events << ",\n";
- file << " \"thread_count\": " << data.thread_count << ",\n";
- file << " \"timestamp\": \"" << data.timestamp << "\",\n";
- file << " \"platform\": \"" << data.platform << "\"\n";
- file << "}\n";
-
- std::cout << "\nBaseline saved to: " << g_config.baseline_file << std::endl;
-}
-
-// Load baseline from file
-bool load_baseline(BaselineData &data)
-{
- std::ifstream file(g_config.baseline_file);
- if (!file)
- {
- return false;
- }
-
- // Simple JSON parsing (production code would use a proper JSON library)
- std::string line;
- while (std::getline(file, line))
- {
- if (line.find("\"accuracy_error_percent\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.accuracy_error_percent = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"accuracy_error_ms_per_event\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.accuracy_error_ms_per_event = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"overhead_percent\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.overhead_percent = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"overhead_ms\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.overhead_ms = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"overhead_ns_per_event\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.overhead_ns_per_event = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"memory_bytes_per_event\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.memory_bytes_per_event = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"calculation_time_ms\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.calculation_time_ms = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"peak_calc_memory_mb\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.peak_calc_memory_mb = std::stod(line.substr(pos, end - pos));
- }
- else if (line.find("\"total_events\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.total_events = std::stoull(line.substr(pos, end - pos));
- }
- else if (line.find("\"thread_count\":") != std::string::npos)
- {
- size_t pos = line.find(": ") + 2;
- size_t end = line.find(",", pos);
- data.thread_count = std::stoull(line.substr(pos, end - pos));
- }
- }
-
- return true;
-}
-
-// Compare current results with baseline
-void compare_with_baseline(const BaselineData ¤t)
-{
- BaselineData baseline;
- if (!load_baseline(baseline))
- {
- std::cerr << "Error: Could not load baseline file: " << g_config.baseline_file << std::endl;
- return;
- }
-
- std::cout << "\n=== Baseline Comparison ===" << std::endl;
- std::cout << std::fixed << std::setprecision(2);
- auto print_comparison = [](const std::string &metric, double baseline_val, double current_val, bool lower_is_better = true)
- {
- double diff = current_val - baseline_val;
- double percent_change = (diff / baseline_val) * 100.0;
-
- std::string direction = (diff > 0) ? "increased" : "decreased";
- std::string indicator = (lower_is_better ? (diff > 0 ? "worse" : "better") : (diff > 0 ? "better" : "worse"));
-
- std::cout << metric << ":\n";
- std::cout << " Baseline: " << baseline_val << "\n";
- std::cout << " Current: " << current_val << "\n";
- std::cout << " Change: " << indicator << " - " << std::abs(percent_change) << "% " << direction << "\n\n";
- };
-
- print_comparison("Accuracy Error %", baseline.accuracy_error_percent, current.accuracy_error_percent);
- print_comparison("Accuracy Error (ms/event)", baseline.accuracy_error_ms_per_event, current.accuracy_error_ms_per_event);
- print_comparison("Overhead %", std::abs(baseline.overhead_percent), std::abs(current.overhead_percent));
- print_comparison("Overhead Time (ms)", std::abs(baseline.overhead_ms), std::abs(current.overhead_ms));
- print_comparison("Overhead per Event (ns)", baseline.overhead_ns_per_event, current.overhead_ns_per_event);
- print_comparison("Memory/Event (bytes)", baseline.memory_bytes_per_event, current.memory_bytes_per_event);
- print_comparison("Calculation Time (ms)", baseline.calculation_time_ms, current.calculation_time_ms);
- print_comparison("Peak Calc Memory (MB)", baseline.peak_calc_memory_mb, current.peak_calc_memory_mb);
-}
-
-// Get platform string
-std::string get_platform()
-{
-#ifdef _WIN32
- return "Windows";
-#elif __APPLE__
- return "macOS";
-#elif __linux__
- return "Linux";
-#else
- return "Unknown";
-#endif
-}
-
-// Get current timestamp
-std::string get_timestamp()
-{
- auto now = std::chrono::system_clock::now();
- auto time_t = std::chrono::system_clock::to_time_t(now);
- std::stringstream ss;
-#ifdef _WIN32
- struct tm time_info;
- localtime_s(&time_info, &time_t);
- ss << std::put_time(&time_info, "%Y-%m-%d %H:%M:%S");
-#else
- ss << std::put_time(std::localtime(&time_t), "%Y-%m-%d %H:%M:%S");
-#endif
- return ss.str();
-}
-
-// Print usage
-void print_usage(const char *program_name)
-{
- std::cout << "Usage: " << program_name << " [options]\n";
- std::cout << "Options:\n";
- std::cout << " --events Number of events to generate (default: 50000000)\n";
- std::cout << " --threads Number of threads to use (default: hardware concurrency)\n";
- std::cout << " --baseline Baseline file path (default: ctrack_baseline.json)\n";
- std::cout << " --record-baseline Record current results as baseline\n";
- std::cout << " --compare-baseline Compare results with baseline\n";
- std::cout << " --verbose Enable verbose output\n";
- std::cout << " --help Show this help message\n";
-}
-
-// Parse command line arguments
-bool parse_args(int argc, char *argv[])
-{
- for (int i = 1; i < argc; ++i)
- {
- std::string arg = argv[i];
-
- if (arg == "--help")
- {
- print_usage(argv[0]);
- return false;
- }
- else if (arg == "--events" && i + 1 < argc)
- {
- g_config.total_events = std::stoull(argv[++i]);
- }
- else if (arg == "--threads" && i + 1 < argc)
- {
- g_config.thread_count = std::stoull(argv[++i]);
- }
- else if (arg == "--baseline" && i + 1 < argc)
- {
- g_config.baseline_file = argv[++i];
- }
- else if (arg == "--record-baseline")
- {
- g_config.record_baseline = true;
- }
- else if (arg == "--compare-baseline")
- {
- g_config.compare_baseline = true;
- }
- else if (arg == "--verbose")
- {
- g_config.verbose = true;
- }
- else
- {
- std::cerr << "Unknown option: " << arg << std::endl;
- print_usage(argv[0]);
- return false;
- }
- }
-
- return true;
-}
-
-int main(int argc, char *argv[])
-{
- if (!parse_args(argc, argv))
- {
- return 1;
- }
-
- std::cout << "CTRACK Comprehensive Benchmark\n";
- std::cout << "==============================\n";
- std::cout << "Total events: " << g_config.total_events << "\n";
- std::cout << "Thread count: " << g_config.thread_count << "\n";
- std::cout << "Events per thread: " << g_config.total_events / g_config.thread_count << "\n";
-
- // Run benchmarks
- auto [accuracy_error_percent, accuracy_error_ms_per_event] = measure_accuracy();
- auto [overhead_percent, overhead_ms, overhead_ns_per_event] = measure_overhead();
- auto [bytes_per_event, calc_time, peak_calc_memory] = measure_memory_and_calculation_time();
-
- // Prepare results
- BaselineData current_data;
- current_data.accuracy_error_percent = accuracy_error_percent;
- current_data.accuracy_error_ms_per_event = accuracy_error_ms_per_event;
- current_data.overhead_percent = overhead_percent;
- current_data.overhead_ms = overhead_ms;
- current_data.overhead_ns_per_event = overhead_ns_per_event;
- current_data.memory_bytes_per_event = bytes_per_event;
- current_data.calculation_time_ms = calc_time;
- current_data.peak_calc_memory_mb = peak_calc_memory;
- current_data.total_events = g_config.total_events;
- current_data.thread_count = g_config.thread_count;
- current_data.timestamp = get_timestamp();
- current_data.platform = get_platform();
-
- // Print summary
- std::cout << "\n=== Benchmark Results ===" << std::endl;
- std::cout << std::fixed << std::setprecision(2);
- std::cout << "Accuracy error: " << accuracy_error_percent << "% (" << accuracy_error_ms_per_event << " ms per event)" << std::endl;
- std::cout << "Overhead: " << overhead_percent << "% (" << overhead_ms << " ms total, "
- << overhead_ns_per_event << " ns per event)" << std::endl;
- std::cout << "Memory per event: " << bytes_per_event << " bytes" << std::endl;
- std::cout << "Calculation time: " << calc_time << " ms" << std::endl;
- std::cout << "Peak calculation memory: " << peak_calc_memory << " MB" << std::endl;
-
- // Handle baseline operations
- if (g_config.record_baseline)
- {
- save_baseline(current_data);
- }
-
- if (g_config.compare_baseline)
- {
- compare_with_baseline(current_data);
- }
-
- return 0;
-}
\ No newline at end of file
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef _WIN32
+#include
+#include
+#else
+#include
+#include
+#endif
+
+// Prevent the compiler from inlining or collapsing calls across call-sites.
+// With -O3 the _no_track helpers would otherwise be fully inlined into the
+// worker loop, letting the optimiser merge/eliminate busy-wait iterations and
+// producing artificially low (even negative) overhead measurements.
+#if defined(_MSC_VER)
+#define BENCHMARK_NOINLINE __declspec(noinline)
+#else
+#define BENCHMARK_NOINLINE __attribute__((noinline))
+#endif
+
+// ---------------------------------------------------------------------------
+// Orthogonal wall-clock: does NOT share the vDSO/TSC path used by either
+// std::chrono or ctrack's internal clocks, so it can measure overhead without
+// self-measurement bias regardless of which ctrack clock variant is compiled.
+//
+// On Windows we fall back to QueryPerformanceCounter which goes through the
+// HAL and is independent of both RDTSC and the C++ runtime clock.
+// ---------------------------------------------------------------------------
+inline int64_t raw_clock_ns()
+{
+#ifdef _WIN32
+ LARGE_INTEGER freq, cnt;
+ QueryPerformanceFrequency(&freq);
+ QueryPerformanceCounter(&cnt);
+ return static_cast(cnt.QuadPart * 1'000'000'000LL / freq.QuadPart);
+#else
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+ return static_cast(ts.tv_sec) * 1'000'000'000LL + ts.tv_nsec;
+#endif
+}
+
+
+// Configuration
+struct BenchmarkConfig
+{
+ size_t total_events = 50'000'000; // Default 50 million events
+ size_t thread_count = std::thread::hardware_concurrency();
+ bool record_baseline = false;
+ bool compare_baseline = false;
+ std::string baseline_file = "ctrack_baseline.json";
+ bool verbose = false;
+};
+
+// Baseline data structure
+struct BaselineData
+{
+ double accuracy_error_percent;
+ double accuracy_error_us_per_event;
+ double overhead_percent;
+ double overhead_ms;
+ double overhead_ns_per_event;
+ double memory_bytes_per_event;
+ double calculation_time_ms;
+ double peak_calc_memory_mb;
+ size_t total_events;
+ size_t thread_count;
+ std::string timestamp;
+ std::string platform;
+};
+
+// Global config
+BenchmarkConfig g_config;
+
+// Get current memory usage in bytes
+size_t get_memory_usage()
+{
+#ifdef _WIN32
+ PROCESS_MEMORY_COUNTERS_EX pmc;
+ GetProcessMemoryInfo(GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS *)&pmc, sizeof(pmc));
+ return pmc.WorkingSetSize;
+#else
+ struct rusage usage;
+ getrusage(RUSAGE_SELF, &usage);
+ return usage.ru_maxrss * 1024; // Convert KB to bytes on Linux
+#endif
+}
+
+// Precise busy wait function - waits for specified nanoseconds
+BENCHMARK_NOINLINE void busy_wait_ns(int64_t nanoseconds)
+{
+ auto start = std::chrono::high_resolution_clock::now();
+ auto target_duration = std::chrono::nanoseconds(nanoseconds);
+
+ while (true)
+ {
+ auto now = std::chrono::high_resolution_clock::now();
+ auto elapsed = now - start;
+ if (elapsed >= target_duration)
+ {
+ break;
+ }
+ }
+}
+
+// Benchmark functions with predictable timing
+void leaf_function(int depth)
+{
+ CTRACK_NAME("leaf_function");
+ // Busy wait for 1 microsecond (1000 ns)
+ busy_wait_ns(1000);
+}
+
+void level_3_function(int depth)
+{
+ CTRACK_NAME("level_3_function");
+ // Busy wait for 500 ns
+ busy_wait_ns(500);
+
+ // Call leaf function twice
+ leaf_function(depth + 1);
+ leaf_function(depth + 1);
+}
+
+void level_2_function(int depth, int iterations)
+{
+ CTRACK_NAME("level_2_function");
+ // Busy wait for 300 ns
+ busy_wait_ns(300);
+
+ for (int i = 0; i < iterations; ++i)
+ {
+ level_3_function(depth + 1);
+ }
+}
+
+void level_1_function(int iterations)
+{
+ CTRACK_NAME("level_1_function");
+ // Busy wait for 200 ns
+ busy_wait_ns(200);
+
+ level_2_function(1, iterations);
+}
+
+// Version without CTRACK for overhead measurement
+BENCHMARK_NOINLINE void leaf_function_no_track(int depth)
+{
+ busy_wait_ns(1000);
+}
+
+BENCHMARK_NOINLINE void level_3_function_no_track(int depth)
+{
+ busy_wait_ns(500);
+ leaf_function_no_track(depth + 1);
+ leaf_function_no_track(depth + 1);
+}
+
+BENCHMARK_NOINLINE void level_2_function_no_track(int depth, int iterations)
+{
+ busy_wait_ns(300);
+ for (int i = 0; i < iterations; ++i)
+ {
+ level_3_function_no_track(depth + 1);
+ }
+}
+
+BENCHMARK_NOINLINE void level_1_function_no_track(int iterations)
+{
+ busy_wait_ns(200);
+ level_2_function_no_track(1, iterations);
+}
+
+// Worker thread function
+void benchmark_worker(size_t events_per_thread, std::atomic &start_flag)
+{
+ // Wait for start signal
+ while (!start_flag.load())
+ {
+ std::this_thread::yield();
+ }
+
+ // Calculate iterations to reach target event count
+ // Each level_1 call generates: 1 + 1 + iterations * (1 + 2) events
+ // For iterations=10: 1 + 1 + 10 * 3 = 32 events per call
+ const int iterations = 10;
+ const int events_per_call = 2 + iterations * 3;
+ size_t calls_needed = events_per_thread / events_per_call;
+
+ for (size_t i = 0; i < calls_needed; ++i)
+ {
+ level_1_function(iterations);
+ }
+}
+
+// Worker thread function without tracking
+void benchmark_worker_no_track(size_t events_per_thread, std::atomic &start_flag)
+{
+ while (!start_flag.load())
+ {
+ std::this_thread::yield();
+ }
+
+ const int iterations = 10;
+ const int events_per_call = 2 + iterations * 3;
+ size_t calls_needed = events_per_thread / events_per_call;
+
+ for (size_t i = 0; i < calls_needed; ++i)
+ {
+ level_1_function_no_track(iterations);
+ }
+}
+
+// Parse timing from CTRACK results string for a specific function
+double parse_function_timing(const std::string &results, const std::string &function_name)
+{
+ // Look for the Details section first
+ size_t details_pos = results.find("Details");
+ if (details_pos == std::string::npos)
+ {
+ return -1.0; // Details section not found
+ }
+
+ // Look for the function name after the Details section
+ size_t func_pos = results.find(function_name, details_pos);
+ if (func_pos == std::string::npos)
+ {
+ return -1.0; // Function not found in Details section
+ }
+
+ // Find the line containing this function in the Details section
+ size_t line_start = results.rfind('\n', func_pos);
+ if (line_start == std::string::npos)
+ line_start = details_pos;
+ else
+ line_start++; // Skip the newline
+
+ size_t line_end = results.find('\n', func_pos);
+ if (line_end == std::string::npos)
+ line_end = results.length();
+
+ std::string line = results.substr(line_start, line_end - line_start);
+
+ // Look for the "time acc" column value (4th column after filename, function, line)
+ // Split by | and find the 4th field
+ std::vector fields;
+ std::istringstream iss(line);
+ std::string field;
+
+ while (std::getline(iss, field, '|'))
+ {
+ // Trim whitespace
+ field.erase(0, field.find_first_not_of(" \t"));
+ field.erase(field.find_last_not_of(" \t") + 1);
+ if (!field.empty())
+ {
+ fields.push_back(field);
+ }
+ }
+
+ // The time acc should be in the 4th field (0-indexed: filename=0, function=1, line=2, time_acc=3)
+ if (fields.size() > 3)
+ {
+ std::string time_acc = fields[3];
+
+ // Parse value and unit from time_acc (e.g., "2.09 ms")
+ std::istringstream time_iss(time_acc);
+ double value;
+ std::string unit;
+
+ if (time_iss >> value >> unit)
+ {
+ // Convert to nanoseconds based on unit
+ if (unit == "s")
+ return value * 1e9;
+ else if (unit == "ms")
+ return value * 1e6;
+ else if (unit == "us")
+ return value * 1e3;
+ else if (unit == "ns")
+ return value;
+ }
+ }
+
+ return -1.0; // Could not parse
+}
+
+// Measure accuracy by comparing known timings with CTRACK measurements
+std::pair measure_accuracy()
+{
+ std::cout << "\n=== Measuring Accuracy ===" << std::endl;
+
+ // Clear any previous tracking data by getting and discarding results
+ ctrack::result_as_string();
+
+ // Run a controlled test with known timings
+ const int test_iterations = 100;
+ for (int i = 0; i < test_iterations; ++i)
+ {
+ level_1_function(10);
+ }
+
+ // Get results
+ auto results = ctrack::result_as_string();
+
+ // Expected timings per iteration (in nanoseconds):
+ // leaf_function: 1000ns (called 20 times per iteration) = 20,000ns total per iteration
+ // level_3_function: 500ns + 2*1000ns = 2500ns (called 10 times per iteration) = 25,000ns total per iteration
+ // level_2_function: 300ns + 10*2500ns = 25,300ns (called 1 time per iteration) = 25,300ns total per iteration
+ // level_1_function: 200ns + 25,300ns = 25,500ns (called 1 time per iteration) = 25,500ns total per iteration
+
+ struct ExpectedTiming
+ {
+ std::string name;
+ double expected_total_ns;
+ int call_count;
+ };
+
+ std::vector expected_timings = {
+ {"leaf_function", 1000.0 * 20 * test_iterations, 20 * test_iterations},
+ {"level_3_function", 2500.0 * 10 * test_iterations, 10 * test_iterations},
+ {"level_2_function", 25300.0 * 1 * test_iterations, 1 * test_iterations},
+ {"level_1_function", 25500.0 * 1 * test_iterations, 1 * test_iterations}};
+
+ double total_expected_time = 0.0;
+ double total_actual_time = 0.0;
+ double max_absolute_error = 0.0;
+
+ if (g_config.verbose)
+ {
+ std::cout << "Function accuracy analysis:" << std::endl;
+ }
+
+ for (const auto &timing : expected_timings)
+ {
+ double actual_ns = parse_function_timing(results, timing.name);
+ if (actual_ns > 0)
+ {
+ double expected_ns = timing.expected_total_ns;
+ double absolute_error = std::abs(actual_ns - expected_ns);
+ double percent_error = (absolute_error / expected_ns) * 100.0;
+
+ total_expected_time += expected_ns;
+ total_actual_time += actual_ns;
+ max_absolute_error = (std::max)(max_absolute_error, absolute_error);
+
+ if (g_config.verbose)
+ {
+ std::cout << " " << timing.name << ": expected " << expected_ns / 1e6 << " ms, got "
+ << actual_ns / 1e6 << " ms (error: " << percent_error << "%)" << std::endl;
+ }
+ }
+ else if (g_config.verbose)
+ {
+ std::cout << " " << timing.name << ": could not parse timing" << std::endl;
+ }
+ }
+
+ double overall_error_percent = 0.0;
+ double overall_error_ms = 0.0;
+
+ if (total_expected_time > 0)
+ {
+ double total_absolute_error = std::abs(total_actual_time - total_expected_time);
+ overall_error_percent = (total_absolute_error / total_expected_time) * 100.0;
+
+ // Calculate total number of events across all functions
+ double total_events = 0;
+ for (const auto &timing : expected_timings)
+ {
+ total_events += timing.call_count;
+ }
+
+ // Convert to milliseconds per event
+ overall_error_ms = (total_absolute_error / 1e3) / total_events; // Convert to us per event
+ }
+
+ if (g_config.verbose)
+ {
+ std::cout << "Overall accuracy error: " << overall_error_percent << "% (" << overall_error_ms << " ms per event)" << std::endl;
+ }
+
+ return {overall_error_percent, overall_error_ms};
+}
+
+// ---------------------------------------------------------------------------
+// measure_overhead: uses raw_clock_ns() (CLOCK_MONOTONIC_RAW / QPC) so the
+// outer timer is orthogonal to whatever clock ctrack uses internally.
+// This eliminates the vDSO-cache self-measurement bias that made the chrono
+// build appear to have artificially low overhead.
+// ---------------------------------------------------------------------------
+std::tuple measure_overhead()
+{
+ std::cout << "\n=== Measuring Overhead ===" << std::endl;
+
+ const size_t overhead_events = 1'000'000;
+ size_t events_per_thread = overhead_events / g_config.thread_count;
+
+ // Helper: spawn threads, wait for join, return nothing (timing done outside)
+ auto run_variant = [&](bool with_track)
+ {
+ std::vector threads;
+ std::atomic start_flag{false};
+ for (size_t i = 0; i < g_config.thread_count; ++i)
+ {
+ if (with_track)
+ threads.emplace_back(benchmark_worker, events_per_thread, std::ref(start_flag));
+ else
+ threads.emplace_back(benchmark_worker_no_track, events_per_thread, std::ref(start_flag));
+ }
+ start_flag = true;
+ for (auto &t : threads) t.join();
+ // NOTE: result_as_string() is intentionally NOT called here.
+ // It must stay outside the timed window.
+ };
+
+ // Warmup
+ run_variant(false);
+ ctrack::result_as_string(); // clear accumulated state
+ run_variant(true);
+ ctrack::result_as_string(); // clear accumulated state
+
+ // Multi-trial with alternating order
+ const int NUM_TRIALS = 5;
+ std::vector no_track_times, track_times;
+
+ for (int trial = 0; trial < NUM_TRIALS; ++trial)
+ {
+ bool no_track_first = (trial % 2 == 0);
+
+ // measure(with_track): clear ctrack state BEFORE t0, time pure work,
+ // discard results AFTER t1.
+ auto measure = [&](bool with_track) -> double
+ {
+ // Pre-clear: outside timed window
+ ctrack::result_as_string();
+
+ int64_t t0 = raw_clock_ns(); // ← CLOCK_MONOTONIC_RAW / QPC
+ run_variant(with_track);
+ int64_t t1 = raw_clock_ns(); // ← CLOCK_MONOTONIC_RAW / QPC
+
+ // Post-clear: outside timed window
+ if (with_track) ctrack::result_as_string();
+
+ return static_cast(t1 - t0) / 1'000.0; // ns → µs
+ };
+
+ if (no_track_first)
+ {
+ no_track_times.push_back(measure(false));
+ track_times .push_back(measure(true));
+ }
+ else
+ {
+ track_times .push_back(measure(true));
+ no_track_times.push_back(measure(false));
+ }
+ }
+
+ // Median to reject scheduler outliers
+ auto median = [](std::vector v) -> double
+ {
+ std::sort(v.begin(), v.end());
+ return v[v.size() / 2];
+ };
+
+ double dur_no_track = median(no_track_times);
+ double dur_track = median(track_times);
+ double raw_diff = dur_track - dur_no_track; // µs
+ double clamped_diff = std::max(0.0, raw_diff);
+
+ double overhead_percent = (clamped_diff / dur_no_track) * 100.0;
+ double overhead_ms = clamped_diff / 1'000.0;
+ double overhead_ns_per_event = (clamped_diff * 1'000.0) / overhead_events;
+
+ if (g_config.verbose)
+ {
+ std::cout << "Without ctrack (median): " << dur_no_track << " µs\n";
+ std::cout << "With ctrack (median): " << dur_track << " µs\n";
+ if (raw_diff < 0)
+ std::cout << "Raw diff: " << raw_diff << " µs (negative — clamped to 0, measurement noise)\n";
+ std::cout << "Overhead: " << overhead_percent << "% ("
+ << overhead_ms << " ms, " << overhead_ns_per_event << " ns/event)\n";
+ }
+
+ return {overhead_percent, overhead_ms, overhead_ns_per_event};
+}
+
+std::tuple measure_memory_and_calculation_time()
+{
+ std::cout << "\n=== Measuring Memory Usage and Calculation Time ===" << std::endl;
+ ctrack::result_as_string();
+ size_t initial_memory = get_memory_usage();
+ size_t events_per_thread = g_config.total_events / g_config.thread_count;
+
+ if (g_config.verbose)
+ {
+ std::cout << "Generating " << g_config.total_events << " events across "
+ << g_config.thread_count << " threads..." << std::endl;
+ }
+
+ auto gen_start = std::chrono::high_resolution_clock::now();
+ {
+ std::vector threads;
+ std::atomic start_flag{false};
+
+ for (size_t i = 0; i < g_config.thread_count; ++i)
+ {
+ threads.emplace_back(benchmark_worker, events_per_thread, std::ref(start_flag));
+ }
+
+ start_flag = true;
+
+ for (auto &t : threads)
+ {
+ t.join();
+ }
+ }
+ auto gen_end = std::chrono::high_resolution_clock::now();
+
+ // Measure memory after event generation
+ size_t post_event_memory = get_memory_usage();
+ size_t memory_used = post_event_memory - initial_memory;
+ double bytes_per_event = (double)memory_used / g_config.total_events;
+
+ if (g_config.verbose)
+ {
+ auto gen_duration = std::chrono::duration_cast(gen_end - gen_start).count();
+ std::cout << "Event generation took: " << gen_duration << " ms" << std::endl;
+ std::cout << "Memory used: " << memory_used / (1024.0 * 1024.0) << " MB" << std::endl;
+ std::cout << "Memory per event: " << bytes_per_event << " bytes" << std::endl;
+ }
+
+ // Measure calculation time and peak memory usage
+ std::atomic monitoring{true};
+ std::atomic peak_memory{post_event_memory};
+
+ // Start memory monitoring thread
+ std::thread monitor_thread([&monitoring, &peak_memory, initial_memory]()
+ {
+ while (monitoring.load()) {
+ size_t current_memory = get_memory_usage();
+ size_t current_peak = peak_memory.load();
+ while (current_memory > current_peak &&
+ !peak_memory.compare_exchange_weak(current_peak, current_memory)) {}
+ std::this_thread::sleep_for(std::chrono::milliseconds(10)); // Poll every 10ms
+ } });
+
+ auto calc_start = std::chrono::high_resolution_clock::now();
+ auto results = ctrack::result_as_string();
+ auto calc_end = std::chrono::high_resolution_clock::now();
+
+ // Stop monitoring
+ monitoring = false;
+ monitor_thread.join();
+
+ auto calc_duration = std::chrono::duration_cast(calc_end - calc_start).count() / 1000.0;
+ double peak_calc_memory_mb = (peak_memory.load() - initial_memory) / (1024.0 * 1024.0);
+
+ if (g_config.verbose)
+ {
+ std::cout << "Result calculation took: " << calc_duration << " ms" << std::endl;
+ std::cout << "Peak memory during calculation: " << peak_calc_memory_mb << " MB" << std::endl;
+ }
+
+ return {bytes_per_event, calc_duration, peak_calc_memory_mb};
+}
+
+// Save baseline to file
+void save_baseline(const BaselineData &data)
+{
+ std::ofstream file(g_config.baseline_file);
+ if (!file)
+ {
+ std::cerr << "Error: Could not open baseline file for writing: " << g_config.baseline_file << std::endl;
+ return;
+ }
+
+ // Simple JSON format
+ file << "{\n";
+ file << " \"accuracy_error_percent\": " << data.accuracy_error_percent << ",\n";
+ file << " \"accuracy_error_ms_per_event\": " << data.accuracy_error_us_per_event << ",\n";
+ file << " \"overhead_percent\": " << data.overhead_percent << ",\n";
+ file << " \"overhead_ms\": " << data.overhead_ms << ",\n";
+ file << " \"overhead_ns_per_event\": " << data.overhead_ns_per_event << ",\n";
+ file << " \"memory_bytes_per_event\": " << data.memory_bytes_per_event << ",\n";
+ file << " \"calculation_time_ms\": " << data.calculation_time_ms << ",\n";
+ file << " \"peak_calc_memory_mb\": " << data.peak_calc_memory_mb << ",\n";
+ file << " \"total_events\": " << data.total_events << ",\n";
+ file << " \"thread_count\": " << data.thread_count << ",\n";
+ file << " \"timestamp\": \"" << data.timestamp << "\",\n";
+ file << " \"platform\": \"" << data.platform << "\"\n";
+ file << "}\n";
+
+ std::cout << "\nBaseline saved to: " << g_config.baseline_file << std::endl;
+}
+
+// Load baseline from file
+bool load_baseline(BaselineData &data)
+{
+ std::ifstream file(g_config.baseline_file);
+ if (!file)
+ {
+ return false;
+ }
+
+ // Simple JSON parsing (production code would use a proper JSON library)
+ std::string line;
+ while (std::getline(file, line))
+ {
+ if (line.find("\"accuracy_error_percent\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.accuracy_error_percent = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"accuracy_error_ms_per_event\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.accuracy_error_us_per_event = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"overhead_percent\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.overhead_percent = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"overhead_ms\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.overhead_ms = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"overhead_ns_per_event\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.overhead_ns_per_event = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"memory_bytes_per_event\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.memory_bytes_per_event = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"calculation_time_ms\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.calculation_time_ms = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"peak_calc_memory_mb\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.peak_calc_memory_mb = std::stod(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"total_events\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.total_events = std::stoull(line.substr(pos, end - pos));
+ }
+ else if (line.find("\"thread_count\":") != std::string::npos)
+ {
+ size_t pos = line.find(": ") + 2;
+ size_t end = line.find(",", pos);
+ data.thread_count = std::stoull(line.substr(pos, end - pos));
+ }
+ }
+
+ return true;
+}
+
+// Compare current results with baseline
+void compare_with_baseline(const BaselineData ¤t)
+{
+ BaselineData baseline;
+ if (!load_baseline(baseline))
+ {
+ std::cerr << "Error: Could not load baseline file: " << g_config.baseline_file << std::endl;
+ return;
+ }
+
+ std::cout << "\n=== Baseline Comparison ===" << std::endl;
+ std::cout << std::fixed << std::setprecision(2);
+ auto print_comparison = [](const std::string &metric, double baseline_val, double current_val, bool lower_is_better = true)
+ {
+ double diff = current_val - baseline_val;
+ double percent_change = (diff / baseline_val) * 100.0;
+
+ std::string direction = (diff > 0) ? "increased" : "decreased";
+ std::string indicator = (lower_is_better ? (diff > 0 ? "worse" : "better") : (diff > 0 ? "better" : "worse"));
+
+ std::cout << metric << ":\n";
+ std::cout << " Baseline: " << baseline_val << "\n";
+ std::cout << " Current: " << current_val << "\n";
+ std::cout << " Change: " << indicator << " - " << std::abs(percent_change) << "% " << direction << "\n\n";
+ };
+
+ print_comparison("Accuracy Error %", baseline.accuracy_error_percent, current.accuracy_error_percent);
+ print_comparison("Accuracy Error (ms/event)", baseline.accuracy_error_us_per_event, current.accuracy_error_us_per_event);
+ print_comparison("Overhead %", std::abs(baseline.overhead_percent), std::abs(current.overhead_percent));
+ print_comparison("Overhead Time (ms)", std::abs(baseline.overhead_ms), std::abs(current.overhead_ms));
+ print_comparison("Overhead per Event (ns)", baseline.overhead_ns_per_event, current.overhead_ns_per_event);
+ print_comparison("Memory/Event (bytes)", baseline.memory_bytes_per_event, current.memory_bytes_per_event);
+ print_comparison("Calculation Time (ms)", baseline.calculation_time_ms, current.calculation_time_ms);
+ print_comparison("Peak Calc Memory (MB)", baseline.peak_calc_memory_mb, current.peak_calc_memory_mb);
+}
+
+// Get platform string
+std::string get_platform()
+{
+#ifdef _WIN32
+ return "Windows";
+#elif __APPLE__
+ return "macOS";
+#elif __linux__
+ return "Linux";
+#else
+ return "Unknown";
+#endif
+}
+
+// Get current timestamp
+std::string get_timestamp()
+{
+ auto now = std::chrono::system_clock::now();
+ auto time_t = std::chrono::system_clock::to_time_t(now);
+ std::stringstream ss;
+#ifdef _WIN32
+ struct tm time_info;
+ localtime_s(&time_info, &time_t);
+ ss << std::put_time(&time_info, "%Y-%m-%d %H:%M:%S");
+#else
+ ss << std::put_time(std::localtime(&time_t), "%Y-%m-%d %H:%M:%S");
+#endif
+ return ss.str();
+}
+
+// Print usage
+void print_usage(const char *program_name)
+{
+ std::cout << "Usage: " << program_name << " [options]\n";
+ std::cout << "Options:\n";
+ std::cout << " --events Number of events to generate (default: 50000000)\n";
+ std::cout << " --threads Number of threads to use (default: hardware concurrency)\n";
+ std::cout << " --baseline Baseline file path (default: ctrack_baseline.json)\n";
+ std::cout << " --record-baseline Record current results as baseline\n";
+ std::cout << " --compare-baseline Compare results with baseline\n";
+ std::cout << " --verbose Enable verbose output\n";
+ std::cout << " --help Show this help message\n";
+}
+
+// Parse command line arguments
+bool parse_args(int argc, char *argv[])
+{
+ for (int i = 1; i < argc; ++i)
+ {
+ std::string arg = argv[i];
+
+ if (arg == "--help")
+ {
+ print_usage(argv[0]);
+ return false;
+ }
+ else if (arg == "--events" && i + 1 < argc)
+ {
+ g_config.total_events = std::stoull(argv[++i]);
+ }
+ else if (arg == "--threads" && i + 1 < argc)
+ {
+ g_config.thread_count = std::stoull(argv[++i]);
+ }
+ else if (arg == "--baseline" && i + 1 < argc)
+ {
+ g_config.baseline_file = argv[++i];
+ }
+ else if (arg == "--record-baseline")
+ {
+ g_config.record_baseline = true;
+ }
+ else if (arg == "--compare-baseline")
+ {
+ g_config.compare_baseline = true;
+ }
+ else if (arg == "--verbose")
+ {
+ g_config.verbose = true;
+ }
+ else
+ {
+ std::cerr << "Unknown option: " << arg << std::endl;
+ print_usage(argv[0]);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int main(int argc, char *argv[])
+{
+ if (!parse_args(argc, argv))
+ {
+ return 1;
+ }
+
+ std::cout << "CTRACK Comprehensive Benchmark\n";
+ std::cout << "==============================\n";
+ std::cout << "Total events: " << g_config.total_events << "\n";
+ std::cout << "Thread count: " << g_config.thread_count << "\n";
+ std::cout << "Events per thread: " << g_config.total_events / g_config.thread_count << "\n";
+
+ // Run benchmarks
+ auto [accuracy_error_percent, accuracy_error_us_per_event] = measure_accuracy();
+ auto [overhead_percent, overhead_ms, overhead_ns_per_event] = measure_overhead();
+ auto [bytes_per_event, calc_time, peak_calc_memory] = measure_memory_and_calculation_time();
+
+ // Prepare results
+ BaselineData current_data;
+ current_data.accuracy_error_percent = accuracy_error_percent;
+ current_data.accuracy_error_us_per_event = accuracy_error_us_per_event;
+ current_data.overhead_percent = overhead_percent;
+ current_data.overhead_ms = overhead_ms;
+ current_data.overhead_ns_per_event = overhead_ns_per_event;
+ current_data.memory_bytes_per_event = bytes_per_event;
+ current_data.calculation_time_ms = calc_time;
+ current_data.peak_calc_memory_mb = peak_calc_memory;
+ current_data.total_events = g_config.total_events;
+ current_data.thread_count = g_config.thread_count;
+ current_data.timestamp = get_timestamp();
+ current_data.platform = get_platform();
+
+ // Print summary
+ std::cout << "\n=== Benchmark Results ===" << std::endl;
+ std::cout << std::fixed << std::setprecision(2);
+ std::cout << "Accuracy error: " << accuracy_error_percent << "% (" << accuracy_error_us_per_event << " us per event)" << std::endl;
+ std::cout << "Overhead: " << overhead_percent << "% (" << overhead_ms << " ms total, "
+ << overhead_ns_per_event << " ns per event)" << std::endl;
+ std::cout << "Memory per event: " << bytes_per_event << " bytes" << std::endl;
+ std::cout << "Calculation time: " << calc_time << " ms" << std::endl;
+ std::cout << "Peak calculation memory: " << peak_calc_memory << " MB" << std::endl;
+
+ // Handle baseline operations
+ if (g_config.record_baseline)
+ {
+ save_baseline(current_data);
+ }
+
+ if (g_config.compare_baseline)
+ {
+ compare_with_baseline(current_data);
+ }
+
+ return 0;
+}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 2e82e61..ea3cc76 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,4 +1,5 @@
+#add_compile_definitions(CTRACK_CLOCK_RDTSC) # ""=chrono
# Create executables for each example
add_executable(basic_singlethreaded basic_singlethreaded.cpp)
add_executable(multithreaded_prime_counter multithreaded_prime_counter.cpp)
diff --git a/examples/basic_singlethreaded.cpp b/examples/basic_singlethreaded.cpp
index 27fcd81..802d314 100644
--- a/examples/basic_singlethreaded.cpp
+++ b/examples/basic_singlethreaded.cpp
@@ -43,4 +43,4 @@ int main() {
ctrack::result_print();
//std::cout << ctrack::result_as_string() << std::endl;
return 0;
-}
\ No newline at end of file
+}
diff --git a/include/ctrack.hpp b/include/ctrack.hpp
index 52d309c..527504c 100644
--- a/include/ctrack.hpp
+++ b/include/ctrack.hpp
@@ -27,6 +27,7 @@
#include
#include
#include
+#include
#define CTRACK_VERSION_MAJOR 1
#define CTRACK_VERSION_MINOR 1
@@ -38,8 +39,8 @@
// Create a string version
#define CTRACK_VERSION_STRING \
- TOSTRING(CTRACK_VERSION_MAJOR) \
- "_" TOSTRING(CTRACK_VERSION_MINOR) "_" TOSTRING(CTRACK_VERSION_PATCH)
+TOSTRING(CTRACK_VERSION_MAJOR) \
+"_" TOSTRING(CTRACK_VERSION_MINOR) "_" TOSTRING(CTRACK_VERSION_PATCH)
// Use the version string as the namespace name
#define CTRACK_VERSION_NAMESPACE v##CTRACK_VERSION_MAJOR##_##CTRACK_VERSION_MINOR##_##CTRACK_VERSION_PATCH
@@ -47,1211 +48,1483 @@
namespace ctrack
{
- inline namespace CTRACK_VERSION_NAMESPACE
- {
+
+// Cross-platform inline + intrinsic shims (to survive from compiler optim)
+#if defined(_MSC_VER)
+#define CTRACK_ALWAYS_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define CTRACK_ALWAYS_INLINE inline __attribute__((always_inline))
+#else
+#define CTRACK_ALWAYS_INLINE inline
+#endif
+
+// TSC clock backends (x86_64 only)
+// otherwise, only Clock_Chrono compiles
+#if defined(__x86_64__) || defined(_M_X64)
+
+#if defined(_MSC_VER)
+#include
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include
+#else
+#include
+#include
+#endif
+
+// TSC -> ns conversion state
+// Defined once inside EventHandler constructor. Then read only
+inline double cycles_per_ns = 3000.0; // 3Ghz
+inline std::chrono::system_clock::time_point tsc_anchor_system{};
+inline uint64_t tsc_anchor_cycles = 0;
+
+// CPUID wrapper
+inline void ctrack_cpuid(
+ uint32_t leaf,
+ uint32_t subleaf,
+ uint32_t& eax,
+ uint32_t& ebx,
+ uint32_t& ecx,
+ uint32_t& edx)
+{
+#if defined(_MSC_VER)
+ int regs[4];
+ __cpuidex(regs, static_cast(leaf), static_cast(subleaf));
+ eax = regs[0]; ebx = regs[1]; ecx = regs[2]; edx = regs[3];
+#else
+ __cpuid_count(leaf, subleaf, eax, ebx, ecx, edx);
+#endif
+}
+
+// C1: CPUID 0x15, exact TSC frequency. Intel Skylake+ (2015+)
+inline double tsc_ghz_from_cpuid_15h() {
+ uint32_t a, b, c, d;
+ ctrack_cpuid(0, 0, a, b, c, d);
+ if (a < 0x15) return 0.0;
+
+ ctrack_cpuid(0x15, 0, a, b, c, d);
+ // EAX = denominator, EBX = numerator, ECX = core crystal Hz
+ if (a == 0 || b == 0 || c == 0) return 0.0;
+ return (static_cast(c) * b / a) / 1e9;
+}
+
+// C2: CPUID 0x16, base frequency in MHz. Intel Haswell+ (2013+)
+inline double tsc_ghz_from_cpuid_16h() {
+ uint32_t a, b, c, d;
+ ctrack_cpuid(0, 0, a, b, c, d);
+ if (a < 0x16) return 0.0;
+
+ ctrack_cpuid(0x16, 0, a, b, c, d);
+ uint32_t base_mhz = a & 0xFFFF;
+ if (base_mhz == 0) return 0.0;
+ return static_cast(base_mhz) / 1000.0;
+}
+
+// C3 (Linux): intel_pstate base_frequency Intel CPU only
+inline double tsc_ghz_from_sysfs_base() {
+#if defined(__linux__)
+ std::ifstream f("/sys/devices/system/cpu/cpu0/cpufreq/base_frequency");
+ if (!f) return 0.0;
+ double khz;
+ if (!(f >> khz) || khz <= 0.0) return 0.0;
+ return khz / 1e6;
+#else
+ return 0.0;
+#endif
+}
+
+// C4 (Windows): registry ~MHz, set at boot from CPUID
+inline double tsc_ghz_from_windows_registry() {
+#if defined(_WIN32)
+ HKEY key;
+ if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
+ "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0, KEY_READ, &key) != ERROR_SUCCESS)
+ return 0.0;
+ DWORD mhz = 0, size = sizeof(DWORD);
+ LONG status = RegQueryValueExA(key, "~MHz", nullptr, nullptr, reinterpret_cast(&mhz), &size);
+ RegCloseKey(key);
+ if (status != ERROR_SUCCESS || mhz == 0) return 0.0;
+ return static_cast(mhz) / 1000.0;
+#else
+ return 0.0;
+#endif
+}
+
+// Calibration fallback: lightweight runtime calibration (~3ms)
+//
+// Last-resort fallback for AMD bare-metal and virtualized environments
+// where no static frequency source is available. Three 1ms samples,
+// median wins. This is the *only* path that pays a startup cost; users
+// on Intel hardware will exit at C1 or C2 before reaching here.
+inline double tsc_ghz_from_calibration() {
+ constexpr int N = 3;
+ double samples[N];
+
+ for (int i = 0; i < N; ++i) {
+ auto wall_t0 = std::chrono::steady_clock::now();
+ uint64_t tsc_t0 = __rdtsc();
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ uint64_t tsc_t1 = __rdtsc();
+ auto wall_t1 = std::chrono::steady_clock::now();
+
+ double ns = std::chrono::duration(wall_t1 - wall_t0).count();
+ if (ns <= 0.0) { samples[i] = 0.0; continue; }
+ samples[i] = static_cast(tsc_t1 - tsc_t0) / ns; // cycles/ns = GHz
+ }
+
+ std::sort(samples, samples + N);
+ return samples[N / 2]; // median rejects the worst scheduler hiccup
+}
+
+// Master calibration: try sources in order, abort if all fail
+inline void calibrate_tsc() {
+ double ghz = tsc_ghz_from_cpuid_15h();
+ if (ghz <= 0.0) ghz = tsc_ghz_from_cpuid_16h();
+ if (ghz <= 0.0) ghz = tsc_ghz_from_sysfs_base();
+ if (ghz <= 0.0) ghz = tsc_ghz_from_windows_registry();
+ if (ghz <= 0.0) ghz = tsc_ghz_from_calibration();
+
+ if (ghz <= 0.0) {
+ std::cerr <<
+ "[ctrack] FATAL: TSC clock backend selected at compile time but no usable frequency source found.\n"
+ "[ctrack] Rebuild without CTRACK_CLOCK_RDTSC / RDTSCP / RDTSCP_LFENCE to use the chrono fallback.\n";
+ std::abort();
+ }
+
+ cycles_per_ns = ghz;
+ tsc_anchor_cycles = __rdtsc();
+ tsc_anchor_system = std::chrono::system_clock::now();
+}
+
+inline uint_fast64_t cycles_to_ns(uint64_t cycles) {
+ return static_cast(cycles / cycles_per_ns);
+}
+
+inline std::string cycles_to_timestring(uint64_t tp) {
+ int64_t delta_cycles = static_cast(tp) - static_cast(tsc_anchor_cycles);
+ auto delta_ns = std::chrono::nanoseconds(static_cast(delta_cycles / cycles_per_ns));
+ auto system_tp = tsc_anchor_system + delta_ns;
+ auto tt = std::chrono::system_clock::to_time_t(system_tp);
+ std::tm tm{};
+#if defined(_WIN32)
+ localtime_s(&tm, &tt);
+#else
+ localtime_r(&tt, &tm);
+#endif
+ std::ostringstream oss;
+ oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S");
+ return oss.str();
+}
+
+#if defined(CTRACK_CLOCK_RDTSC)
+struct Clock_RDTSC {
+ using time_point = uint64_t;
+ CTRACK_ALWAYS_INLINE static time_point NOW() { return __rdtsc(); }
+ static inline uint_fast64_t duration_ns(time_point s, time_point e) { return cycles_to_ns(e - s); }
+ static inline std::string to_string(const time_point &tp) { return cycles_to_timestring(tp); }
+};
+using ActiveClock = Clock_RDTSC;
+#elif defined(CTRACK_CLOCK_RDTSCP)
+struct Clock_RDTSCP {
+ using time_point = uint64_t;
+ CTRACK_ALWAYS_INLINE static time_point NOW() { unsigned int aux; return __rdtscp(&aux); }
+ static inline uint_fast64_t duration_ns(time_point s, time_point e) { return cycles_to_ns(e - s); }
+ static inline std::string to_string(const time_point &tp) { return cycles_to_timestring(tp); }
+};
+using ActiveClock = Clock_RDTSCP;
+#elif defined(CTRACK_CLOCK_RDTSCP_LFENCE)
+struct Clock_RDTSCP_LFENCE {
+ using time_point = uint64_t;
+ CTRACK_ALWAYS_INLINE static time_point NOW() { _mm_lfence(); unsigned int aux; return __rdtscp(&aux); }
+ static inline uint_fast64_t duration_ns(time_point s, time_point e) { return cycles_to_ns(e - s); }
+ static inline std::string to_string(const time_point &tp) { return cycles_to_timestring(tp); }
+};
+using ActiveClock = Clock_RDTSCP_LFENCE;
+#endif
+
+#else // not x86_64
+
+// Hard-fail at compile time if a TSC backend is requested on a non-x86 build.
+#if defined(CTRACK_CLOCK_RDTSC) || defined(CTRACK_CLOCK_RDTSCP) || defined(CTRACK_CLOCK_RDTSCP_LFENCE)
+#error "CTRACK_CLOCK_RDTSC* requires x86_64. Remove the macro to use Clock_Chrono."
+#endif
+
+#endif // x86_64
+
+// ── Chrono fallback (default if no TSC backend selected) ─────────────────
+#if !defined(CTRACK_CLOCK_RDTSC) && !defined(CTRACK_CLOCK_RDTSCP) && !defined(CTRACK_CLOCK_RDTSCP_LFENCE)
+struct Clock_Chrono {
+ using time_point = std::chrono::high_resolution_clock::time_point;
+ CTRACK_ALWAYS_INLINE static time_point NOW() {
+ return std::chrono::high_resolution_clock::now();
+ }
+ static inline uint_fast64_t duration_ns(time_point s, time_point e) {
+ return std::chrono::duration_cast(e - s).count();
+ }
+ static inline std::string to_string(const time_point &tp) {
+ auto system_tp = std::chrono::system_clock::now() +
+ std::chrono::duration_cast(
+ tp - std::chrono::high_resolution_clock::now());
+ auto tt = std::chrono::system_clock::to_time_t(system_tp);
+ std::tm tm{};
+#if defined(_WIN32)
+ localtime_s(&tm, &tt);
+#else
+ localtime_r(&tt, &tm);
+#endif
+ std::ostringstream oss;
+ oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S");
+ return oss.str();
+ }
+};
+using ActiveClock = Clock_Chrono;
+#endif // chrono
+
+
+inline namespace CTRACK_VERSION_NAMESPACE
+{
#ifndef CTRACK_DISABLE_EXECUTION_POLICY
- constexpr auto execution_policy = std::execution::par_unseq;
+constexpr auto execution_policy = std::execution::par_unseq;
#define OPT_EXEC_POLICY execution_policy,
#else
#define OPT_EXEC_POLICY
#endif
- template
- auto sum_field(const std::vector &vec, Field T::*field)
- {
- using FieldType = std::decay_t().*field)>;
- return std::transform_reduce(
- OPT_EXEC_POLICY
- vec.begin(),
- vec.end(),
- FieldType{},
- std::plus<>(),
- [field](const auto &item)
- { return item.*field; });
- }
-
- template
- auto sum_squared_field(const std::vector &values, Field T::*field)
- {
- using FieldType = std::decay_t().*field)>;
- return std::transform_reduce(
- OPT_EXEC_POLICY
- values.begin(),
- values.end(),
- FieldType{},
- std::plus<>(),
- [field](const T &v)
- {
- return (v.*field) * (v.*field);
- });
- }
-
- template
- double calculate_std_dev_field(std::vector &values, Field T::*field, const double mean)
- {
- double res = std::transform_reduce(
- OPT_EXEC_POLICY
- values.begin(),
- values.end(),
- 0.0,
- std::plus<>(),
- [mean, field](const T &v)
- {
- return std::pow(static_cast(v.*field) - mean, 2);
- });
-
- return sqrt(res / values.size());
- }
-
- template
- auto get_distinct_field_values(const std::vector &vec, Field T::*field)
- {
- std::set().*field)>> distinct_values;
-
- std::transform(vec.begin(), vec.end(),
- std::inserter(distinct_values, distinct_values.end()),
- [field](const T *item)
- { return item->*field; });
- return distinct_values;
- }
-
- template
- auto get_distinct_field_values(const std::vector &vec, Field T::*field)
- {
- std::set().*field)>> distinct_values;
-
- std::transform(vec.begin(), vec.end(),
- std::inserter(distinct_values, distinct_values.end()),
- [field](const T &item)
- { return item.*field; });
- return distinct_values;
- }
-
- template
- size_t count_distinct_field_values(const std::vector &vec, Field T::*field)
- {
- return get_distinct_field_values(vec, field).size();
- }
-
- template
- void order_pointer_vector_by_field(std::vector &vec, MemberType StructType::*member, bool asc = true)
- {
- std::sort(OPT_EXEC_POLICY vec.begin(), vec.end(),
- [member, asc](const StructType *a, const StructType *b)
- {
- if (asc)
- return (a->*member) < (b->*member);
- else
- return (a->*member) > (b->*member);
- });
- }
-
- template
- size_t countAllEvents(const std::deque> &events)
- {
- return std::transform_reduce(
- OPT_EXEC_POLICY
- events.begin(),
- events.end(),
- size_t(0),
- std::plus<>(),
- [](const auto &vec)
- {
- return vec.size();
- });
- }
-
- struct ColorScheme
- {
- std::string border_color;
- std::string header_color;
- std::string top_header_color;
- std::string row_color;
-
- ColorScheme(const std::string &border,
- const std::string &header,
- const std::string &top_header,
- const std::string &row)
- : border_color(border),
- header_color(header),
- top_header_color(top_header),
- row_color(row) {}
- };
-
- static inline const ColorScheme default_colors{
- "\033[38;5;24m", // Darker Blue (Border)
- "\033[1;38;5;135m", // Purple (Header)
- "\033[1;38;5;92m", // Darker Purple (Top Header)
- "\033[38;5;39m" // Light Blue (Row)
- };
-
- // Alternate color scheme (still nice to read on terminals)
- static inline const ColorScheme alternate_colors{
- "\033[38;5;28m", // Dark Green (Border)
- "\033[1;38;5;208m", // Orange (Header)
- "\033[1;38;5;130m", // Dark Orange (Top Header)
- "\033[38;5;71m" // Light Green (Row)
- };
-
- class BeautifulTable
- {
- private:
- std::vector> top_header;
- std::vector header;
- std::vector> rows;
- std::vector columnWidths;
- bool useColor;
- ColorScheme colors;
- static inline const std::string RESET_COLOR = "\033[0m";
-
- void updateColumnWidths(const std::vector &row)
- {
- for (size_t i = 0; i < row.size(); ++i)
- {
- if (i >= columnWidths.size())
- {
- columnWidths.push_back(row[i].length());
- }
- else
- {
- columnWidths[i] = std::max(columnWidths[i], row[i].length());
- }
- }
- }
-
- template
- void printHorizontalLine(StreamType &stream) const
- {
- if (useColor)
- stream << colors.border_color;
- stream << "+";
- for (size_t width : columnWidths)
- {
- stream << std::string(width + 2, '-') << "+";
- }
- if (useColor)
- stream << RESET_COLOR;
- stream << "\n";
- }
-
- template
- void printRow(StreamType &stream, const std::vector &row, const std::string &color, bool center = false) const
- {
- if (useColor)
- stream << colors.border_color;
- stream << "|";
- if (useColor)
- stream << RESET_COLOR << color;
- for (size_t i = 0; i < row.size(); ++i)
- {
- if (center)
- {
- size_t padding = columnWidths[i] - row[i].length();
- size_t leftPadding = padding / 2;
- size_t rightPadding = padding - leftPadding;
- stream << std::string(leftPadding + 1, ' ') << row[i] << std::string(rightPadding + 1, ' ');
- }
- else
- {
- stream << " " << std::setw(static_cast(columnWidths[i])) << std::right << row[i] << " ";
- }
- if (useColor)
- stream << RESET_COLOR << colors.border_color;
- stream << "|";
- if (useColor)
- stream << RESET_COLOR << color;
- }
- if (useColor)
- stream << RESET_COLOR;
- stream << "\n";
- }
-
- template
- void printRow(StreamType &stream, const std::vector> &row, const std::string &color) const
- {
- if (useColor)
- stream << colors.border_color;
- stream << "|";
- if (useColor)
- stream << RESET_COLOR << color;
- int y = 0;
- for (size_t i = 0; i < row.size(); ++i)
- {
- size_t sum = row[i].second - 1;
- for (int x = y; x < y + row[i].second; x++)
- {
- sum += columnWidths[x] + 2;
- }
- y += row[i].second;
-
- size_t textWidth = row[i].first.length();
- size_t totalPadding = sum - textWidth;
- size_t leftPadding = totalPadding / 2;
- size_t rightPadding = totalPadding - leftPadding;
-
- // Print left padding
- stream << std::string(leftPadding, ' ');
-
- // Print text
- stream << row[i].first;
-
- // Print right padding
- stream << std::string(rightPadding, ' ');
- if (useColor)
- stream << RESET_COLOR << colors.border_color;
- stream << "|";
- if (useColor)
- stream << RESET_COLOR << color;
- }
- if (useColor)
- stream << RESET_COLOR;
- stream << "\n";
- }
-
- public:
- BeautifulTable(const std::vector &headerColumns, bool enableColor = false, const ColorScheme &colors = default_colors, const std::vector> &top_header = {})
- : top_header(top_header), header(headerColumns), useColor(enableColor), colors(colors)
- {
- updateColumnWidths(header);
- }
-
- void addRow(const std::vector &row)
- {
- if (row.size() != header.size())
- {
- throw std::invalid_argument("Row size must match header size");
- }
- rows.push_back(row);
- updateColumnWidths(row);
- }
-
- template
- void print(StreamType &stream) const
- {
- if (top_header.size() > 0)
- {
- printHorizontalLine(stream);
- printRow(stream, top_header, colors.top_header_color);
- }
- printHorizontalLine(stream);
- printRow(stream, header, colors.header_color, true);
- printHorizontalLine(stream);
- for (const auto &row : rows)
- {
- printRow(stream, row, colors.row_color);
- printHorizontalLine(stream);
- }
- }
-
- template
- static inline std::string table_string(const T &value)
- {
- std::ostringstream oss;
- oss << value;
- return oss.str();
- }
-
- static inline std::string table_time(uint_fast64_t nanoseconds)
- {
- return table_time(static_cast(nanoseconds));
- }
-
- static inline std::string table_time(double nanoseconds)
- {
- const char *units[] = {"ns", "mcs", "ms", "s"};
- int unit = 0;
- double value = static_cast(nanoseconds);
- while (value >= 1000 && unit < 3)
- {
- value /= 1000;
- unit++;
- }
- std::ostringstream oss;
- oss << std::fixed << std::setprecision(2) << value << " " << units[unit];
- return oss.str();
- }
-
- static inline std::string table_percentage(uint_fast64_t value, uint_fast64_t total)
- {
- if (total == 0)
- {
- return "nan%";
- }
-
- // Calculate the percentage
- double percentage = (static_cast(value) / total) * 100.0;
-
- // Format the percentage as a string with 2 decimal places
- std::ostringstream ss;
- ss << std::fixed << std::setprecision(2) << percentage << "%";
-
- return ss.str();
- }
-
- static inline std::string table_timepoint(const std::chrono::high_resolution_clock::time_point &tp)
- {
- auto system_tp = std::chrono::system_clock::now() +
- std::chrono::duration_cast(
- tp - std::chrono::high_resolution_clock::now());
-
- auto tt = std::chrono::system_clock::to_time_t(system_tp);
- std::tm tm{};
+template
+auto sum_field(const std::vector &vec, Field T::*field)
+{
+ using FieldType = std::decay_t().*field)>;
+ return std::transform_reduce(
+ OPT_EXEC_POLICY
+ vec.begin(),
+ vec.end(),
+ FieldType{},
+ std::plus<>(),
+ [field](const auto &item)
+ { return item.*field; }
+ );
+}
-#if defined(_WIN32)
- localtime_s(&tm, &tt);
-#else
- localtime_r(&tt, &tm);
+template
+auto sum_squared_field(const std::vector &values, Field T::*field)
+{
+ using FieldType = std::decay_t().*field)>;
+ return std::transform_reduce(
+ OPT_EXEC_POLICY
+ values.begin(),
+ values.end(),
+ FieldType{},
+ std::plus<>(),
+ [field](const T &v)
+ {
+ return (v.*field) * (v.*field);
+ }
+ );
+}
+
+template
+double calculate_std_dev_field(std::vector &values, Field T::*field, const double mean)
+{
+ double res = std::transform_reduce(
+ OPT_EXEC_POLICY
+ values.begin(),
+ values.end(),
+ 0.0,
+ std::plus<>(),
+ [mean, field](const T &v)
+ {
+ return std::pow(static_cast(v.*field) - mean, 2);
+ }
+ );
+
+ return sqrt(res / values.size());
+}
+
+template
+auto get_distinct_field_values(const std::vector &vec, Field T::*field)
+{
+ std::set().*field)>> distinct_values;
+
+ std::transform(
+ vec.begin(), vec.end(),
+ std::inserter(distinct_values, distinct_values.end()),
+ [field](const T *item)
+ { return item->*field; }
+ );
+ return distinct_values;
+}
+
+template
+auto get_distinct_field_values(const std::vector &vec, Field T::*field)
+{
+ std::set().*field)>> distinct_values;
+
+ std::transform(
+ vec.begin(), vec.end(),
+ std::inserter(distinct_values, distinct_values.end()),
+ [field](const T &item)
+ { return item.*field; }
+ );
+ return distinct_values;
+}
+
+template
+size_t count_distinct_field_values(const std::vector &vec, Field T::*field)
+{
+ return get_distinct_field_values(vec, field).size();
+}
+
+template
+void order_pointer_vector_by_field(std::vector &vec, MemberType StructType::*member, bool asc = true)
+{
+ std::sort(
+ OPT_EXEC_POLICY vec.begin(), vec.end(),
+ [member, asc](const StructType *a, const StructType *b){
+ if (asc)
+ return (a->*member) < (b->*member);
+ else
+ return (a->*member) > (b->*member);
+ });
+}
+
+template
+size_t countAllEvents(const std::deque> &events)
+{
+ return std::transform_reduce(
+ OPT_EXEC_POLICY
+ events.begin(),
+ events.end(),
+ size_t(0),
+ std::plus<>(),
+ [](const auto &vec)
+ {
+ return vec.size();
+ });
+}
+
+struct ColorScheme
+{
+ std::string border_color;
+ std::string header_color;
+ std::string top_header_color;
+ std::string row_color;
+
+ ColorScheme(const std::string &border,
+ const std::string &header,
+ const std::string &top_header,
+ const std::string &row)
+ : border_color(border),
+ header_color(header),
+ top_header_color(top_header),
+ row_color(row)
+ {}
+};
+
+static inline const ColorScheme default_colors{
+ "\033[38;5;24m", // Darker Blue (Border)
+ "\033[1;38;5;135m", // Purple (Header)
+ "\033[1;38;5;92m", // Darker Purple (Top Header)
+ "\033[38;5;39m" // Light Blue (Row)
+};
+
+// Alternate color scheme (still nice to read on terminals)
+static inline const ColorScheme alternate_colors{
+ "\033[38;5;28m", // Dark Green (Border)
+ "\033[1;38;5;208m", // Orange (Header)
+ "\033[1;38;5;130m", // Dark Orange (Top Header)
+ "\033[38;5;71m" // Light Green (Row)
+};
+
+class BeautifulTable
+{
+private:
+ std::vector> top_header;
+ std::vector header;
+ std::vector> rows;
+ std::vector columnWidths;
+ bool useColor;
+ ColorScheme colors;
+ static inline const std::string RESET_COLOR = "\033[0m";
+
+ void updateColumnWidths(const std::vector &row)
+ {
+ for (size_t i = 0; i < row.size(); ++i)
+ {
+ if (i >= columnWidths.size())
+ {
+ columnWidths.push_back(row[i].length());
+ }
+ else
+ {
+ columnWidths[i] = std::max(columnWidths[i], row[i].length());
+ }
+ }
+ }
+
+ template
+ void printHorizontalLine(StreamType &stream) const
+ {
+ if (useColor)
+ stream << colors.border_color;
+ stream << "+";
+ for (size_t width : columnWidths)
+ {
+ stream << std::string(width + 2, '-') << "+";
+ }
+ if (useColor)
+ stream << RESET_COLOR;
+ stream << "\n";
+ }
+
+ template
+ void printRow(StreamType &stream, const std::vector &row, const std::string &color, bool center = false) const
+ {
+ if (useColor)
+ stream << colors.border_color;
+ stream << "|";
+ if (useColor)
+ stream << RESET_COLOR << color;
+ for (size_t i = 0; i < row.size(); ++i)
+ {
+ if (center)
+ {
+ size_t padding = columnWidths[i] - row[i].length();
+ size_t leftPadding = padding / 2;
+ size_t rightPadding = padding - leftPadding;
+ stream << std::string(leftPadding + 1, ' ') << row[i] << std::string(rightPadding + 1, ' ');
+ }
+ else
+ {
+ stream << " " << std::setw(static_cast(columnWidths[i])) << std::right << row[i] << " ";
+ }
+ if (useColor)
+ stream << RESET_COLOR << colors.border_color;
+ stream << "|";
+ if (useColor)
+ stream << RESET_COLOR << color;
+ }
+ if (useColor)
+ stream << RESET_COLOR;
+ stream << "\n";
+ }
+
+ template
+ void printRow(StreamType &stream, const std::vector> &row, const std::string &color) const
+ {
+ if (useColor)
+ stream << colors.border_color;
+ stream << "|";
+ if (useColor)
+ stream << RESET_COLOR << color;
+ int y = 0;
+ for (size_t i = 0; i < row.size(); ++i)
+ {
+ size_t sum = row[i].second - 1;
+ for (int x = y; x < y + row[i].second; x++)
+ {
+ sum += columnWidths[x] + 2;
+ }
+ y += row[i].second;
+
+ size_t textWidth = row[i].first.length();
+ size_t totalPadding = sum - textWidth;
+ size_t leftPadding = totalPadding / 2;
+ size_t rightPadding = totalPadding - leftPadding;
+
+ // Print left padding
+ stream << std::string(leftPadding, ' ');
+
+ // Print text
+ stream << row[i].first;
+
+ // Print right padding
+ stream << std::string(rightPadding, ' ');
+ if (useColor)
+ stream << RESET_COLOR << colors.border_color;
+ stream << "|";
+ if (useColor)
+ stream << RESET_COLOR << color;
+ }
+ if (useColor)
+ stream << RESET_COLOR;
+ stream << "\n";
+ }
+
+public:
+ BeautifulTable(const std::vector &headerColumns, bool enableColor = false, const ColorScheme &colors = default_colors, const std::vector> &top_header = {})
+ : top_header(top_header), header(headerColumns), useColor(enableColor), colors(colors)
+ {
+ updateColumnWidths(header);
+ }
+
+ void addRow(const std::vector &row)
+ {
+ if (row.size() != header.size())
+ {
+ throw std::invalid_argument("Row size must match header size");
+ }
+ rows.push_back(row);
+ updateColumnWidths(row);
+ }
+
+ template
+ void print(StreamType &stream) const
+ {
+ if (top_header.size() > 0)
+ {
+ printHorizontalLine(stream);
+ printRow(stream, top_header, colors.top_header_color);
+ }
+ printHorizontalLine(stream);
+ printRow(stream, header, colors.header_color, true);
+ printHorizontalLine(stream);
+ for (const auto &row : rows)
+ {
+ printRow(stream, row, colors.row_color);
+ printHorizontalLine(stream);
+ }
+ }
+
+ template
+ static inline std::string table_string(const T &value)
+ {
+ std::ostringstream oss;
+ oss << value;
+ return oss.str();
+ }
+
+ static inline std::string table_time(uint_fast64_t nanoseconds)
+ {
+ return table_time(static_cast(nanoseconds));
+ }
+
+ static inline std::string table_time(double nanoseconds)
+ {
+ const char *units[] = {"ns", "us", "ms", "s"};
+ int unit = 0;
+ double value = static_cast(nanoseconds);
+ while (value >= 1000 && unit < 3)
+ {
+ value /= 1000;
+ unit++;
+ }
+ std::ostringstream oss;
+ oss << std::fixed << std::setprecision(2) << value << " " << units[unit];
+ return oss.str();
+ }
+
+ static inline std::string table_percentage(uint_fast64_t value, uint_fast64_t total)
+ {
+ if (total == 0)
+ {
+ return "nan%";
+ }
+
+ // Calculate the percentage
+ double percentage = (static_cast(value) / total) * 100.0;
+
+ // Format the percentage as a string with 2 decimal places
+ std::ostringstream ss;
+ ss << std::fixed << std::setprecision(2) << percentage << "%";
+
+ return ss.str();
+ }
+
+ static inline std::string table_timepoint(const ActiveClock::time_point &tp)
+ {
+ return ActiveClock::to_string(tp);
+ }
+
+ static inline std::string stable_shortenPath(const std::string &fullPath, size_t maxLength = 35)
+ {
+ namespace fs = std::filesystem;
+
+ fs::path path(fullPath);
+ std::string filename = path.filename().string();
+
+ if (filename.length() <= maxLength)
+ {
+ return filename;
+ }
+
+ // If filename is too long, truncate it and add ...
+ return filename.substr(0, maxLength - 3) + "...";
+ }
+
+ using bt = BeautifulTable;
+};
+
+
+
+
+
+
+struct Event
+{
+ ActiveClock::time_point start_time;
+ ActiveClock::time_point end_time;
+ int line;
+ int thread_id;
+ std::string_view filename;
+ std::string_view function;
+ unsigned int event_id;
+
+ Event(const ActiveClock::time_point &start_time, const ActiveClock::time_point &end_time, const std::string_view filename, const int line, const std::string_view function, const int thread_id, const unsigned int event_id)
+ : start_time(start_time), end_time(end_time), line(line), thread_id(thread_id), filename(filename), function(function), event_id(event_id)
+ {}
+};
+
+struct Simple_Event
+{
+ uint_fast64_t duration = 0;
+ ActiveClock::time_point start_time{};
+ int_fast64_t unique_id = 0;
+ ActiveClock::time_point end_time{};
+ Simple_Event(const ActiveClock::time_point &start_time, const ActiveClock::time_point &end_time, const uint_fast64_t duration, const int_fast64_t unique_id) : duration(duration), start_time(start_time), unique_id(unique_id), end_time(end_time) {}
+ Simple_Event() {}
+};
+
+inline bool cmp_simple_event_by_duration_asc(const Simple_Event &a, const Simple_Event &b)
+{
+ return a.duration < b.duration;
+}
+inline bool cmp_simple_event_by_start_time_asc(const Simple_Event &a, const Simple_Event &b)
+{
+ return a.start_time < b.start_time;
+}
+
+inline uint_fast64_t get_unique_event_id(unsigned int thread_id, unsigned int event_id)
+{
+ uint_fast64_t uniqueId = static_cast(thread_id);
+ uniqueId = uniqueId << 32;
+ uniqueId += static_cast(event_id);
+ return uniqueId;
+}
+
+inline std::vector create_simple_events(const std::vector &events)
+{
+ std::vector simple_events{};
+ simple_events.resize(events.size());
+ std::transform(
+ OPT_EXEC_POLICY
+ events.begin(),
+ events.end(),
+ simple_events.begin(),
+ [](const Event &event)
+ {
+ Simple_Event simple_event(event.start_time, event.end_time, ActiveClock::duration_ns(event.start_time, event.end_time), get_unique_event_id(event.thread_id, event.event_id));
+ return simple_event;
+ });
+ return simple_events;
+}
+
+inline std::vector create_simple_events(const std::vector &events)
+{
+ std::vector simple_events{};
+ simple_events.resize(events.size());
+ std::transform(
+ OPT_EXEC_POLICY
+ events.begin(),
+ events.end(),
+ simple_events.begin(),
+ [](const Event *event){
+ Simple_Event simple_event(event->start_time, event->end_time, ActiveClock::duration_ns(event->start_time, event->end_time), get_unique_event_id(event->thread_id, event->event_id));
+ return simple_event;
+ });
+ return simple_events;
+}
+
+// requires already sorted
+inline std::vector sorted_create_grouped_simple_events(const std::vector &events)
+{
+ std::vector result{};
+ if (events.size() == 0)
+ return result;
+ result.push_back(events[0]);
+ unsigned int current_idx = 0;
+
+ for (size_t i = 1; i < events.size(); i++)
+ {
+ if (result[current_idx].end_time >= events[i].start_time)
+ {
+ result[current_idx].end_time = std::max(result[current_idx].end_time, events[i].end_time);
+ }
+ else
+ {
+ result.push_back(events[i]);
+ current_idx++;
+ }
+ }
+
+ for (auto &entry : result)
+ {
+ entry.duration = ActiveClock::duration_ns(entry.start_time, entry.end_time);
+ }
+
+ return result;
+}
+
+inline std::vector load_child_events_simple(const std::vector &parent_events_simple,
+ const std::unordered_map &events_map,
+ const std::unordered_map> &child_graph)
+{
+ std::vector child_events{};
+
+ for (const auto &simple_parent_event : parent_events_simple)
+ {
+ auto it = child_graph.find(simple_parent_event.unique_id);
+ if (it != child_graph.end())
+ {
+ auto &parent_event = events_map.at(simple_parent_event.unique_id);
+ for (auto &child_id : it->second)
+ {
+ auto &child_event = events_map.at(child_id);
+ if (child_event->filename == parent_event->filename &&
+ child_event->function == parent_event->function &&
+ child_event->line == parent_event->line)
+ continue;
+
+ child_events.push_back(child_event);
+ }
+ }
+ }
+
+ return create_simple_events(child_events);
+};
+
+class EventGroup
+{
+public:
+ void calculateStats(unsigned int non_center_percent, const std::unordered_map &events_map, const std::unordered_map> &child_graph)
+ {
+ if (all_events.size() == 0)
+ return;
+
+
+ auto all_events_simple = create_simple_events(all_events);
+ std::sort(OPT_EXEC_POLICY all_events_simple.begin(), all_events_simple.end(), cmp_simple_event_by_duration_asc);
+ all_cnt = static_cast(all_events_simple.size());
+ const double factor = (1.0 / static_cast(all_cnt));
+
+ auto all_child_events_simple = load_child_events_simple(all_events_simple, events_map, child_graph);
+
+ all_time_acc = sum_field(all_events_simple, &Simple_Event::duration);
+
+ const double all_mean = all_time_acc * factor;
+ if (std::fpclassify(all_mean) == FP_ZERO)
+ return;
+
+ all_st = calculate_std_dev_field(all_events_simple, &Simple_Event::duration, all_mean); // std::sqrt(all_variance);
+ all_cv = all_st / all_mean;
+
+ all_thread_cnt = static_cast(get_distinct_field_values(all_events, &Event::thread_id).size());
+ unsigned int amount_non_center = all_cnt * non_center_percent / 100;
+
+ fastest_range = non_center_percent;
+ slowest_range = 100 - non_center_percent;
+
+ std::vector fastest_events_simple, slowest_events_simple, center_events_simple;
+ fastest_events_simple.reserve(amount_non_center);
+ slowest_events_simple.reserve(amount_non_center);
+ if (all_cnt > 2)
+ center_events_simple.reserve(all_cnt - 2 * amount_non_center);
+
+ for (unsigned int i = 0; i < all_events_simple.size(); i++)
+ {
+ if (i < amount_non_center)
+ {
+ fastest_events_simple.push_back(all_events_simple[i]);
+ }
+ else if (i >= all_cnt - amount_non_center)
+ {
+ slowest_events_simple.push_back(all_events_simple[i]);
+ }
+ else
+ {
+ center_events_simple.push_back(all_events_simple[i]);
+ }
+ }
+ if (amount_non_center > 0)
+ {
+ // fastest
+ fastest_min = fastest_events_simple[0].duration;
+ fastest_mean = sum_field(fastest_events_simple, &Simple_Event::duration) / static_cast(amount_non_center);
+
+ // slowest
+ slowest_max = slowest_events_simple[slowest_events_simple.size() - 1].duration;
+ slowest_mean = sum_field(slowest_events_simple, &Simple_Event::duration) / static_cast(amount_non_center);
+ }
+
+ // center
+ center_min = center_events_simple[0].duration;
+ center_max = center_events_simple[center_events_simple.size() - 1].duration;
+ center_mean = sum_field(center_events_simple, &Simple_Event::duration) / static_cast(center_events_simple.size());
+ if (center_events_simple.size() % 2 == 1)
+ center_med = center_events_simple[center_events_simple.size() / 2].duration;
+ else
+ center_med = (center_events_simple[center_events_simple.size() / 2].duration + center_events_simple[center_events_simple.size() / 2 - 1].duration) / 2;
+
+ auto center_child_events_simple = load_child_events_simple(center_events_simple, events_map, child_graph);
+
+ std::sort(OPT_EXEC_POLICY center_events_simple.begin(), center_events_simple.end(), cmp_simple_event_by_start_time_asc);
+ center_grouped = sorted_create_grouped_simple_events(center_events_simple);
+ center_time_active = sum_field(center_grouped, &Simple_Event::duration);
+
+ std::sort(OPT_EXEC_POLICY center_child_events_simple.begin(), center_child_events_simple.end(), cmp_simple_event_by_start_time_asc);
+ auto center_child_events_grouped = sorted_create_grouped_simple_events(center_child_events_simple);
+ center_time_active_exclusive = center_time_active - sum_field(center_child_events_grouped, &Simple_Event::duration);
+
+ std::sort(OPT_EXEC_POLICY all_events_simple.begin(), all_events_simple.end(), cmp_simple_event_by_start_time_asc);
+ all_grouped = sorted_create_grouped_simple_events(all_events_simple);
+ all_time_active = sum_field(all_grouped, &Simple_Event::duration);
+
+ std::sort(OPT_EXEC_POLICY all_child_events_simple.begin(), all_child_events_simple.end(), cmp_simple_event_by_start_time_asc);
+ auto all_child_events_grouped = sorted_create_grouped_simple_events(all_child_events_simple);
+ all_time_active_exclusive = all_time_active - sum_field(all_child_events_grouped, &Simple_Event::duration);
+ }
+
+ // all_group
+
+ double all_cv = 0.0;
+ double all_st = 0.0;
+
+ unsigned int all_cnt = 0;
+ uint_fast64_t all_time_acc = 0;
+ uint_fast64_t all_time_active = 0;
+ uint_fast64_t all_time_active_exclusive = 0;
+ unsigned int all_thread_cnt = 0;
+ std::vector all_grouped = {};
+ std::vector all_events = {};
+
+ // fastest_group
+ unsigned int fastest_range = 0;
+ uint_fast64_t fastest_min = 0;
+ double fastest_mean = 0.0;
+
+ // slowest group
+ unsigned int slowest_range = 0;
+ uint_fast64_t slowest_max = 0;
+ double slowest_mean = 0.0;
+
+ // center group
+
+ uint_fast64_t center_min = 0;
+ uint_fast64_t center_max = 0;
+ uint_fast64_t center_med = 0;
+ double center_mean = 0;
+ uint_fast64_t center_time_active = 0;
+ uint_fast64_t center_time_active_exclusive = 0;
+ std::vector center_grouped = {};
+
+ std::string filename = {};
+ std::string function_name = {};
+ int line = 0;
+
+private:
+};
+
+typedef std::vector t_events;
+typedef std::map> sub_events;
+
+struct store
+{
+ inline static std::atomic write_events_locked = false;
+ inline static std::mutex event_mutex;
+ inline static ActiveClock::time_point track_start_time = ActiveClock::NOW();
+ inline static std::atomic store_clear_cnt = 0;
+
+ inline static std::atomic thread_cnt = -1;
+ inline static std::deque a_events{};
+ inline static std::deque a_sub_events{};
+
+ inline static std::deque a_current_event_id{}, a_current_event_cnt{}, a_string_id{};
+
+ inline static std::deque a_thread_ids{};
+};
+inline thread_local t_events *event_ptr = nullptr;
+inline thread_local sub_events *sub_events_ptr = nullptr;
+
+inline thread_local unsigned int *current_event_id = nullptr;
+inline thread_local unsigned int *current_event_cnt = nullptr;
+inline thread_local unsigned int *string_id = nullptr;
+
+inline thread_local int *thread_id = nullptr;
+
+typedef std::map line_result;
+typedef std::map function_result;
+typedef std::map filename_result;
+
+struct ctrack_result_settings
+{
+ unsigned int non_center_percent = 1;
+ double min_percent_active_exclusive = 0.0; // between 0-100
+ double percent_exclude_fastest_active_exclusive = 0.0; // between 0-100
+};
+
+struct summary_row
+{
+ std::string filename;
+ std::string function_name;
+ int line{};
+ int calls{};
+ double percent_ae_bracket{}; // ae[center]% by configuration
+ double percent_ae_all{}; // ae[0-100]%
+ std::chrono::nanoseconds time_ae_all{};
+ std::chrono::nanoseconds time_a_all{};
+};
+
+struct summary_table
+{
+ std::vector rows;
+};
+
+struct detail_stats
+{
+ // Info fields
+ std::string filename;
+ std::string function_name;
+ int line{};
+ std::chrono::nanoseconds time_acc{}; // Simple sum of all execution times (can exceed wall clock in MT)
+ std::chrono::nanoseconds sd{}; // Standard deviation
+ double cv{}; // Coefficient of variation (sd/mean)
+ int calls{}; // Total number of calls
+ int threads{}; // Number of different threads that called this function
+
+ // Summary-like fields (for unified access)
+ double percent_ae_bracket{}; // ae[center]% as percentage of total time
+ double percent_ae_all{}; // ae[0-100]% as percentage of total time
+ std::chrono::nanoseconds time_ae_all{}; // Active exclusive time (wall clock minus child functions)
+ std::chrono::nanoseconds time_a_all{}; // Active time (actual wall clock time, handles MT overlap)
+
+ // Fastest/Center/Slowest stats
+ std::chrono::nanoseconds fastest_min{};
+ std::chrono::nanoseconds fastest_mean{};
+ std::chrono::nanoseconds center_min{};
+ std::chrono::nanoseconds center_mean{};
+ std::chrono::nanoseconds center_med{};
+ std::chrono::nanoseconds center_time_a{}; // Active time for center range
+ std::chrono::nanoseconds center_time_ae{}; // Active exclusive time for center range
+ std::chrono::nanoseconds center_max{};
+ std::chrono::nanoseconds slowest_mean{};
+ std::chrono::nanoseconds slowest_max{};
+
+ // Percentile ranges for reference
+ unsigned int fastest_range{};
+ unsigned int slowest_range{};
+};
+
+struct detail_table
+{
+ std::vector rows;
+};
+
+struct ctrack_result_tables
+{
+ // Meta information
+ ActiveClock::time_point start_time;
+ ActiveClock::time_point end_time;
+ std::chrono::nanoseconds time_total{};
+ std::chrono::nanoseconds time_ctracked{};
+
+ // Table data
+ summary_table summary;
+ detail_table details;
+
+ // Settings used
+ ctrack_result_settings settings;
+};
+
+class ctrack_result
+{
+public:
+ ctrack_result(const ctrack_result_settings &settings, const ActiveClock::time_point &track_start_time, const ActiveClock::time_point &track_end_time) : settings(settings), track_start_time(track_start_time), track_end_time(track_end_time)
+ {
+ time_total = ActiveClock::duration_ns(track_start_time, track_end_time);
+ center_intervall_str = "[" + std::to_string(settings.non_center_percent) + "-" + std::to_string(100 - settings.non_center_percent) + "]";
+ }
+
+ template
+ void get_summary_table(StreamType &stream, bool use_color = false)
+ {
+ BeautifulTable info({
+ "Start",
+ "End",
+ "time total",
+ "time ctracked",
+ "time ctracked %",
+ }, use_color, alternate_colors);
+
+ info.addRow({BeautifulTable::table_timepoint(tables.start_time),
+ BeautifulTable::table_timepoint(tables.end_time),
+ BeautifulTable::table_time(static_cast(tables.time_total.count())),
+ BeautifulTable::table_time(static_cast(tables.time_ctracked.count())),
+ BeautifulTable::table_percentage(static_cast(tables.time_ctracked.count()), static_cast(tables.time_total.count()))});
+
+ info.print(stream);
+ BeautifulTable table({
+ "filename",
+ "function",
+ "line",
+ "calls",
+ "ae" + center_intervall_str + "%",
+ "ae[0-100]%",
+ "time ae[0-100]",
+ "time a[0-100]"}, use_color, alternate_colors);
+
+ for (const auto &row : tables.summary.rows)
+ {
+ table.addRow({
+ BeautifulTable::stable_shortenPath(row.filename),
+ row.function_name,
+ BeautifulTable::table_string(row.line),
+ BeautifulTable::table_string(row.calls),
+ BeautifulTable::table_percentage(static_cast(row.percent_ae_bracket * tables.time_total.count() / 100.0), tables.time_total.count()),
+ BeautifulTable::table_percentage(static_cast(row.percent_ae_all * tables.time_total.count() / 100.0), tables.time_total.count()),
+ BeautifulTable::table_time(static_cast(row.time_ae_all.count())),
+ BeautifulTable::table_time(static_cast(row.time_a_all.count()))
+ });
+ }
+
+ table.print(stream);
+ }
+
+ template
+ void get_detail_table(StreamType &stream, bool use_color = false, bool reverse_vector = false)
+ {
+ auto details_copy = tables.details.rows;
+ if (reverse_vector)
+ {
+ std::reverse(details_copy.begin(), details_copy.end());
+ }
+ for (int i = static_cast(details_copy.size()) - 1; i >= 0; i--)
+ {
+ const auto &detail = details_copy[i];
+
+ BeautifulTable info({"filename", "function", "line", "time acc", "sd", "cv", "calls", "threads"},
+ use_color, default_colors);
+ info.addRow({
+ BeautifulTable::stable_shortenPath(detail.filename),
+ detail.function_name,
+ BeautifulTable::table_string(detail.line),
+ BeautifulTable::table_time(static_cast(detail.time_acc.count())),
+ BeautifulTable::table_time(static_cast(detail.sd.count())),
+ BeautifulTable::table_string(detail.cv),
+ BeautifulTable::table_string(detail.calls),
+ BeautifulTable::table_string(detail.threads)
+ });
+
+ const auto fastest_header = "fastest[0-" + std::to_string(detail.fastest_range) + "]%";
+ const auto center_header = "center" + center_intervall_str + "%";
+ const auto slowest_header = "slowest[" + std::to_string(detail.slowest_range) + "-100]%";
+
+ BeautifulTable table(
+ {"min", "mean", "min", "mean", "med", "time a", "time ae", "max", "mean", "max"},
+ use_color,
+ default_colors,
+ {
+ {fastest_header, 2},
+ {center_header, 6},
+ {slowest_header, 2}
+ }
+ );
+
+ table.addRow({
+ BeautifulTable::table_time(static_cast(detail.fastest_min.count())),
+ BeautifulTable::table_time(static_cast(detail.fastest_mean.count())),
+ BeautifulTable::table_time(static_cast(detail.center_min.count())),
+ BeautifulTable::table_time(static_cast(detail.center_mean.count())),
+ BeautifulTable::table_time(static_cast(detail.center_med.count())),
+ BeautifulTable::table_time(static_cast(detail.center_time_a.count())),
+ BeautifulTable::table_time(static_cast(detail.center_time_ae.count())),
+ BeautifulTable::table_time(static_cast(detail.center_max.count())),
+ BeautifulTable::table_time(static_cast(detail.slowest_mean.count())),
+ BeautifulTable::table_time(static_cast(detail.slowest_max.count()))
+ });
+ info.print(stream);
+ table.print(stream);
+
+ stream << std::endl;
+ }
+ }
+
+ void calculate_stats()
+ {
+ std::vector grouped_events{};
+ for (auto &[filename, filename_entry] : f_res)
+ {
+ ctracked_files++;
+ for (auto &[function, function_entry] : filename_entry)
+ {
+ ctracked_functions++;
+ for (auto &[line, line_entry] : function_entry)
+ {
+ ctracked_uses++;
+ line_entry.filename = filename;
+ line_entry.function_name = function;
+ line_entry.line = line;
+ line_entry.calculateStats(settings.non_center_percent, a_events, child_graph);
+ sorted_events.push_back(&line_entry);
+ grouped_events.insert(grouped_events.end(), line_entry.all_grouped.begin(), line_entry.all_grouped.end());
+ }
+ }
+ }
+
+ std::sort(OPT_EXEC_POLICY grouped_events.begin(), grouped_events.end(), cmp_simple_event_by_start_time_asc);
+ auto all_grouped = sorted_create_grouped_simple_events(grouped_events);
+ sum_time_active_exclusive = sum_field(all_grouped, &Simple_Event::duration);
+
+ order_pointer_vector_by_field(sorted_events, &EventGroup::all_time_active_exclusive, false);
+
+ int fastest_events = static_cast(sorted_events.size() * settings.percent_exclude_fastest_active_exclusive / 100);
+ // remove fastest keep in mind fastest elements are at the back
+ if (fastest_events > 0)
+ sorted_events.erase(sorted_events.end() - fastest_events, sorted_events.end());
+
+ uint_fast64_t min_time_active_exclusive = static_cast