From bd85696f8c83653ec4ba7ee6af9ffac22a54c976 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui <yann.hamdaoui@datadoghq.com>
Date: Tue, 16 Jun 2026 10:57:13 +0200
Subject: [PATCH 1/6] test(trace-normalization): add microbenchmarks for
 tag/metric/truncate normalization

Extend the existing criterion bench to cover normalization helpers that run
on every ingested span but were previously unmeasured: the per-char UTF-8 state
machines `normalize_tag` (ASCII / mixed-unicode / over-length) and
`normalize_metric_name`, plus `truncate_utf8` (UTF-8 boundary walk-back) and
`normalize_span_start_duration` (quantifying the SystemTime read on the year-2000 path).

Adds a `bench-internals` feature, mirroring `libdd-sampling`, that exposes thin
`*_bench_wrapper` shims around the otherwise-private `normalize_metric_name` and
`truncate_utf8`, so the default-feature public API is unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 libdd-trace-normalization/Cargo.toml          |   5 +
 .../benches/normalization_utils.rs            | 160 +++++++++++++++++-
 .../src/normalize_utils.rs                    |  18 ++
 3 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/libdd-trace-normalization/Cargo.toml b/libdd-trace-normalization/Cargo.toml
index 1579a5e299..e510084532 100644
--- a/libdd-trace-normalization/Cargo.toml
+++ b/libdd-trace-normalization/Cargo.toml
@@ -19,6 +19,10 @@ arbitrary = { version = "1.3", features = ["derive"], optional = true }
 
 [features]
 fuzzing = ["arbitrary"]
+# Exposes thin public `*_bench_wrapper` shims (e.g. for `normalize_metric_name`, `truncate_utf8`)
+# so benchmarks can reach otherwise-internal functions. The benchmarked functions themselves are
+# left untouched. Not intended for downstream consumers — enable only when running benches.
+bench-internals = []
 
 [dev-dependencies]
 rand = "0.8.5"
@@ -29,3 +33,4 @@ criterion = "0.5"
 name = "normalization_utils"
 harness = false
 path = "benches/normalization_utils.rs"
+required-features = ["bench-internals"]
diff --git a/libdd-trace-normalization/benches/normalization_utils.rs b/libdd-trace-normalization/benches/normalization_utils.rs
index 6c69ff495f..35afe30f93 100644
--- a/libdd-trace-normalization/benches/normalization_utils.rs
+++ b/libdd-trace-normalization/benches/normalization_utils.rs
@@ -6,7 +6,10 @@ use criterion::Throughput::Elements;
 use criterion::{
     criterion_group, criterion_main, BatchSize, BenchmarkGroup, BenchmarkId, Criterion,
 };
-use libdd_trace_normalization::normalize_utils::{normalize_name, normalize_service};
+use libdd_trace_normalization::normalize_utils::{
+    normalize_metric_name_bench_wrapper, normalize_name, normalize_service,
+    normalize_span_start_duration, normalize_tag, truncate_utf8_bench_wrapper,
+};
 use libdd_trace_normalization::normalizer::normalize_trace;
 use libdd_trace_protobuf::pb;
 use std::hint::black_box;
@@ -142,10 +145,163 @@ fn normalize_span_bench(c: &mut Criterion) {
     );
 }
 
+/// `normalize_tag` runs on every ingested tag key/value. It is the heaviest normalization
+/// function: a nested loop combining an ASCII fast-path with per-codepoint UTF-8 scanning and a
+/// char-class state machine. We exercise realistic tag values plus the unicode and over-length
+/// paths that defeat the ASCII fast-path.
+fn normalize_tag_bench(c: &mut Criterion) {
+    let group = c.benchmark_group("normalization/normalize_tag");
+    let cases = &[
+        // Empty input: measures the early-return baseline.
+        "",
+        // Already-clean realistic tag values: ASCII fast-path only.
+        "ascii:http.method:get",
+        "ascii:env:production",
+        "ascii:resource:get_/api/v1/users/{id}",
+        // Mixed: needs the illegal-char state machine but stays ASCII.
+        "mixed:Some Service Name!!",
+        // Unicode tag values: exercise the codepoint-scanning slow path.
+        "unicode:café-Über-Sérvice",
+        "unicode:Data🐨dog🐶 繋がっ⛰てて",
+        // Over-length (> MAX_TAG_LEN = 200): forces the loop to run to the codepoint cap.
+        "over-length-ascii:over_length_ascii_value_that_keeps_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going",
+    ];
+    normalize_fnmut_string(group, cases, 1000, "normalize_tag", normalize_tag);
+}
+
+/// `normalize_metric_name` runs on every span name. Similar complexity to `normalize_tag` with a
+/// one-byte look-behind (`last_written_char`) to collapse separators.
+fn normalize_metric_name_bench(c: &mut Criterion) {
+    let group = c.benchmark_group("normalization/normalize_metric_name");
+    let cases = &[
+        // Empty input: measures the early-return baseline.
+        "",
+        // Already-clean span names.
+        "http.request",
+        "django.controller",
+        // Names needing separator collapsing / illegal-char replacement.
+        "GET /some/raclette",
+        "rails.action_controller.process",
+        // Over-length (> MAX_NAME_LEN = 100).
+        "Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.",
+    ];
+    normalize_fnmut_string(
+        group,
+        cases,
+        1000,
+        "normalize_metric_name",
+        normalize_metric_name_bench_wrapper,
+    );
+}
+
+/// `truncate_utf8` is called before every name/service/type normalization to enforce a byte
+/// limit while preserving UTF-8 boundaries. We bench the over-length cases (where it actually does
+/// work) at the real limits used in the code, including a multi-byte boundary that must be walked
+/// back.
+fn truncate_utf8_bench(c: &mut Criterion) {
+    let group = c.benchmark_group("normalization/truncate_utf8");
+    // MAX_SERVICE_LEN / MAX_NAME_LEN / MAX_TYPE_LEN are all 100 in the source.
+    const LIMIT: usize = 100;
+    let ascii_over = "a".repeat(256);
+    // Multi-byte chars (3 bytes each) so the limit falls mid-codepoint and must be walked back.
+    let unicode_over = "繋".repeat(128);
+    let cases: &[(&str, &str)] = &[
+        ("over-length-ascii", ascii_over.as_str()),
+        ("over-length-unicode", unicode_over.as_str()),
+    ];
+
+    normalize_fnmut_string_with(
+        group,
+        cases,
+        1000,
+        "truncate_utf8",
+        move |s: &mut String| truncate_utf8_bench_wrapper(s, LIMIT),
+    );
+}
+
+/// `normalize_span_start_duration` runs on every span and, in the case where the start
+/// timestamp predates the year-2000 cutoff, performs a `SystemTime` read. We bench in a tight loop
+/// to confirm that read isn't a meaningful per-span tax. The "clean" case skips the clock; the
+/// "needs-clock" case forces the `SystemTime::elapsed()` path.
+fn normalize_span_start_duration_bench(c: &mut Criterion) {
+    let mut group = c.benchmark_group("normalization/normalize_span_start_duration");
+    group.throughput(Elements(1000));
+    group.warm_up_time(Duration::from_secs(1));
+    group.measurement_time(Duration::from_secs(2));
+    group.sample_size(200);
+    group.sampling_mode(criterion::SamplingMode::Flat);
+
+    // (start, duration): valid recent timestamp (no clock read) vs. a too-old start that forces
+    // the SystemTime read.
+    let cases: &[(&str, i64, i64)] = &[
+        ("clean", 1_448_466_874_000_000_000, 10_000_000),
+        ("needs-clock", 0, 10_000_000),
+    ];
+
+    for (label, start, duration) in cases {
+        group.bench_with_input(
+            BenchmarkId::new("normalize_span_start_duration", label),
+            &(*start, *duration),
+            |b, &(start, duration)| {
+                b.iter(|| {
+                    let mut s = black_box(start);
+                    let mut d = black_box(duration);
+                    normalize_span_start_duration(black_box(&mut s), black_box(&mut d));
+                    black_box((s, d));
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+/// Like [`normalize_fnmut_string`] but takes labelled cases (label, input) so over-length inputs
+/// don't need to be displayed verbatim in benchmark ids.
+#[inline]
+fn normalize_fnmut_string_with<F>(
+    mut group: BenchmarkGroup<WallTime>,
+    cases: &[(&str, &str)],
+    elements: usize,
+    function_name: &str,
+    mut function: F,
+) where
+    F: FnMut(&mut String),
+{
+    group.throughput(Elements(elements as u64));
+    group.warm_up_time(Duration::from_secs(1));
+    group.measurement_time(Duration::from_secs(2));
+    group.sample_size(200);
+    group.sampling_mode(criterion::SamplingMode::Flat);
+
+    for (label, case) in cases {
+        group.bench_with_input(BenchmarkId::new(function_name, label), *case, |b, case| {
+            b.iter_batched_ref(
+                || {
+                    let mut strings = Vec::with_capacity(elements);
+                    (0..elements).for_each(|_| strings.push(case.to_owned()));
+                    strings
+                },
+                |strings| {
+                    #[allow(clippy::unit_arg)]
+                    strings.iter_mut().for_each(|string| {
+                        black_box(function(black_box(string)));
+                    });
+                },
+                BatchSize::LargeInput,
+            )
+        });
+    }
+    group.finish();
+}
+
 criterion_group!(
     benches,
     normalize_service_bench,
     normalize_name_bench,
-    normalize_span_bench
+    normalize_span_bench,
+    normalize_tag_bench,
+    normalize_metric_name_bench,
+    truncate_utf8_bench,
+    normalize_span_start_duration_bench
 );
 criterion_main!(benches);
diff --git a/libdd-trace-normalization/src/normalize_utils.rs b/libdd-trace-normalization/src/normalize_utils.rs
index b70093c817..0c188cdded 100644
--- a/libdd-trace-normalization/src/normalize_utils.rs
+++ b/libdd-trace-normalization/src/normalize_utils.rs
@@ -272,6 +272,15 @@ fn normalize_metric_name(name: &mut String) {
     bytes.truncate(write_cursor);
 }
 
+/// Wrapper exposing [`normalize_metric_name`] for benchmarks only (see the `bench-internals`
+/// feature). Not part of the public API; the benchmarked function itself is left untouched.
+#[cfg(feature = "bench-internals")]
+#[doc(hidden)]
+#[inline(always)]
+pub fn normalize_metric_name_bench_wrapper(name: &mut String) {
+    normalize_metric_name(name)
+}
+
 // truncate_utf8 truncates the given string to make sure it uses less than limit bytes.
 // If the last character is a utf8 character that would be split, it removes it
 // entirely to make sure the resulting string is not broken.
@@ -280,6 +289,15 @@ pub(crate) fn truncate_utf8(s: &mut String, limit: usize) {
     s.truncate(boundary);
 }
 
+/// Wrapper exposing [`truncate_utf8`] for benchmarks only (see the `bench-internals` feature).
+/// Not part of the public API; the benchmarked function itself is left untouched.
+#[cfg(feature = "bench-internals")]
+#[doc(hidden)]
+#[inline(always)]
+pub fn truncate_utf8_bench_wrapper(s: &mut String, limit: usize) {
+    truncate_utf8(s, limit)
+}
+
 #[cfg(test)]
 mod tests {
 

From ed81de7e054960ede6a7721231fb6ba0128ff058 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui <yann.hamdaoui@datadoghq.com>
Date: Tue, 16 Jun 2026 15:25:02 +0200
Subject: [PATCH 2/6] doc: reword Cargo.toml comment

---
 libdd-trace-normalization/Cargo.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libdd-trace-normalization/Cargo.toml b/libdd-trace-normalization/Cargo.toml
index e510084532..23d2c372aa 100644
--- a/libdd-trace-normalization/Cargo.toml
+++ b/libdd-trace-normalization/Cargo.toml
@@ -19,9 +19,9 @@ arbitrary = { version = "1.3", features = ["derive"], optional = true }
 
 [features]
 fuzzing = ["arbitrary"]
-# Exposes thin public `*_bench_wrapper` shims (e.g. for `normalize_metric_name`, `truncate_utf8`)
-# so benchmarks can reach otherwise-internal functions. The benchmarked functions themselves are
-# left untouched. Not intended for downstream consumers — enable only when running benches.
+# Exposes thin public `*_bench_wrapper` shims so benchmarks can reach
+# otherwise-internal functions. The benchmarked functions themselves are left
+# untouched. Enable only when running benches.
 bench-internals = []
 
 [dev-dependencies]

From 5a09b1fcf82905eb25681d25f55d9fdb16e752be Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui <yann.hamdaoui@datadoghq.com>
Date: Tue, 16 Jun 2026 15:44:47 +0200
Subject: [PATCH 3/6] test(trace-normalization): batch
 normalize_span_start_duration bench

Convert the bench from a single-call `b.iter` to the batched
`iter_batched_ref` + 1000-element inner loop used by the other benches in
this file. The previous form set `throughput(Elements(1000))` and
`SamplingMode::Flat` but measured one call per iteration, so the throughput
number was meaningless and the ns-scale "clean" path was swamped by timer
overhead.

The batch is rebuilt in untimed setup because the function mutates its
inputs in place: on the year-2000 path the first call rewrites `start` to a
recent timestamp, which would make a second call on the same value skip the
clock branch.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../benches/normalization_utils.rs            | 24 +++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/libdd-trace-normalization/benches/normalization_utils.rs b/libdd-trace-normalization/benches/normalization_utils.rs
index 35afe30f93..d89a25a487 100644
--- a/libdd-trace-normalization/benches/normalization_utils.rs
+++ b/libdd-trace-normalization/benches/normalization_utils.rs
@@ -225,7 +225,13 @@ fn truncate_utf8_bench(c: &mut Criterion) {
 /// "needs-clock" case forces the `SystemTime::elapsed()` path.
 fn normalize_span_start_duration_bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("normalization/normalize_span_start_duration");
-    group.throughput(Elements(1000));
+    // Each measured iteration normalizes a batch of `ELEMENTS` spans so the per-span cost (a few
+    // integer ops, or a `SystemTime` read on the year-2000 path) isn't swamped by timer overhead.
+    // The batch is rebuilt fresh in (untimed) setup because the function mutates its inputs in
+    // place: on the "needs-clock" path the first call rewrites `start` to a recent timestamp, which
+    // would make a second call on the same value skip the clock branch.
+    const ELEMENTS: usize = 1000;
+    group.throughput(Elements(ELEMENTS as u64));
     group.warm_up_time(Duration::from_secs(1));
     group.measurement_time(Duration::from_secs(2));
     group.sample_size(200);
@@ -243,12 +249,16 @@ fn normalize_span_start_duration_bench(c: &mut Criterion) {
             BenchmarkId::new("normalize_span_start_duration", label),
             &(*start, *duration),
             |b, &(start, duration)| {
-                b.iter(|| {
-                    let mut s = black_box(start);
-                    let mut d = black_box(duration);
-                    normalize_span_start_duration(black_box(&mut s), black_box(&mut d));
-                    black_box((s, d));
-                });
+                b.iter_batched_ref(
+                    || vec![(start, duration); ELEMENTS],
+                    |pairs| {
+                        pairs.iter_mut().for_each(|(s, d)| {
+                            normalize_span_start_duration(black_box(s), black_box(d));
+                        });
+                        black_box(pairs);
+                    },
+                    BatchSize::LargeInput,
+                )
             },
         );
     }

From 7834e6f9020a2234f683009242a0f00198ae5cd9 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui <yann.hamdaoui@datadoghq.com>
Date: Tue, 16 Jun 2026 15:50:44 +0200
Subject: [PATCH 4/6] test: wrap call in black_box to keep it from being optimized away

---
 libdd-trace-normalization/benches/normalization_utils.rs | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libdd-trace-normalization/benches/normalization_utils.rs b/libdd-trace-normalization/benches/normalization_utils.rs
index d89a25a487..33cae1175a 100644
--- a/libdd-trace-normalization/benches/normalization_utils.rs
+++ b/libdd-trace-normalization/benches/normalization_utils.rs
@@ -252,10 +252,9 @@ fn normalize_span_start_duration_bench(c: &mut Criterion) {
                 b.iter_batched_ref(
                     || vec![(start, duration); ELEMENTS],
                     |pairs| {
-                        pairs.iter_mut().for_each(|(s, d)| {
-                            normalize_span_start_duration(black_box(s), black_box(d));
-                        });
-                        black_box(pairs);
+                        for (s, d) in pairs {
+                            black_box(normalize_span_start_duration(black_box(s), black_box(d)));
+                        }
                     },
                     BatchSize::LargeInput,
                 )

From ee82929d01bef38f59f36cb2f37c43122075fab2 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui <yann.hamdaoui@datadoghq.com>
Date: Tue, 16 Jun 2026 17:40:24 +0200
Subject: [PATCH 5/6] ci(trace-normalization): enable bench-internals feature
 in CI benchmark script

The normalization_utils bench target uses required-features = ["bench-internals"],
so Cargo silently skips it unless that feature is explicitly activated. Add
libdd-trace-normalization/bench-internals to the --features list in
run_benchmarks_ci.sh so the new (and existing) normalization benchmarks
appear in CI results.
---
 benchmark/run_benchmarks_ci.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/run_benchmarks_ci.sh b/benchmark/run_benchmarks_ci.sh
index 49fdc38ee5..751fef487e 100755
--- a/benchmark/run_benchmarks_ci.sh
+++ b/benchmark/run_benchmarks_ci.sh
@@ -22,7 +22,7 @@ pushd "${PROJECT_DIR}" > /dev/null
 
 # Run benchmarks
 message "Running benchmarks"
-cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span,libdd-sampling/bench-internals -- --warm-up-time 1 --measurement-time 5 --sample-size=200
+cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span,libdd-sampling/bench-internals,libdd-trace-normalization/bench-internals -- --warm-up-time 1 --measurement-time 5 --sample-size=200
 message "Finished running benchmarks"
 
 # Copy the benchmark results to the output directory

From 302189ae0766790d31b1762febd682a7f74eee4e Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui <yann.hamdaoui@datadoghq.com>
Date: Tue, 16 Jun 2026 17:43:16 +0200
Subject: [PATCH 6/6] fix: clippy warning (remove useless black box)

---
 libdd-trace-normalization/benches/normalization_utils.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libdd-trace-normalization/benches/normalization_utils.rs b/libdd-trace-normalization/benches/normalization_utils.rs
index 33cae1175a..1fbeb2e9d5 100644
--- a/libdd-trace-normalization/benches/normalization_utils.rs
+++ b/libdd-trace-normalization/benches/normalization_utils.rs
@@ -253,7 +253,7 @@ fn normalize_span_start_duration_bench(c: &mut Criterion) {
                     || vec![(start, duration); ELEMENTS],
                     |pairs| {
                         for (s, d) in pairs {
-                            black_box(normalize_span_start_duration(black_box(s), black_box(d)));
+                            normalize_span_start_duration(black_box(s), black_box(d));
                         }
                     },
                     BatchSize::LargeInput,