diff --git a/benchmark/run_benchmarks_ci.sh b/benchmark/run_benchmarks_ci.sh
index 49fdc38ee5..751fef487e 100755
--- a/benchmark/run_benchmarks_ci.sh
+++ b/benchmark/run_benchmarks_ci.sh
@@ -22,7 +22,7 @@ pushd "${PROJECT_DIR}" > /dev/null
 
 # Run benchmarks
 message "Running benchmarks"
-cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span,libdd-sampling/bench-internals -- --warm-up-time 1 --measurement-time 5 --sample-size=200
+cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span,libdd-sampling/bench-internals,libdd-trace-normalization/bench-internals -- --warm-up-time 1 --measurement-time 5 --sample-size=200
 message "Finished running benchmarks"
 
 # Copy the benchmark results to the output directory
diff --git a/libdd-trace-normalization/Cargo.toml b/libdd-trace-normalization/Cargo.toml
index 1579a5e299..23d2c372aa 100644
--- a/libdd-trace-normalization/Cargo.toml
+++ b/libdd-trace-normalization/Cargo.toml
@@ -19,6 +19,10 @@ arbitrary = { version = "1.3", features = ["derive"], optional = true }
 
 [features]
 fuzzing = ["arbitrary"]
+# Exposes thin public `*_bench_wrapper` shims so benchmarks can reach
+# otherwise-internal functions. The benchmarked functions themselves are left
+# untouched. Enable only when running benches.
+bench-internals = []
 
 [dev-dependencies]
 rand = "0.8.5"
@@ -29,3 +33,4 @@ criterion = "0.5"
 name = "normalization_utils"
 harness = false
 path = "benches/normalization_utils.rs"
+required-features = ["bench-internals"]
diff --git a/libdd-trace-normalization/benches/normalization_utils.rs b/libdd-trace-normalization/benches/normalization_utils.rs
index 6c69ff495f..1fbeb2e9d5 100644
--- a/libdd-trace-normalization/benches/normalization_utils.rs
+++ b/libdd-trace-normalization/benches/normalization_utils.rs
@@ -6,7 +6,10 @@ use criterion::Throughput::Elements;
 use criterion::{
     criterion_group, criterion_main, BatchSize, BenchmarkGroup, BenchmarkId, Criterion,
 };
-use libdd_trace_normalization::normalize_utils::{normalize_name, normalize_service};
+use libdd_trace_normalization::normalize_utils::{
+    normalize_metric_name_bench_wrapper, normalize_name, normalize_service,
+    normalize_span_start_duration, normalize_tag, truncate_utf8_bench_wrapper,
+};
 use libdd_trace_normalization::normalizer::normalize_trace;
 use libdd_trace_protobuf::pb;
 use std::hint::black_box;
@@ -142,10 +145,172 @@ fn normalize_span_bench(c: &mut Criterion) {
     );
 }
 
+/// Benchmarks `normalize_tag` across a spread of representative inputs: the empty string,
+/// clean ASCII tag values, ASCII that contains upper case / spaces / punctuation, non-ASCII
+/// text, and one over-length value. Each input is handed to the shared string-benchmark helper,
+/// which is asked to process 1000 elements per measured iteration.
+fn normalize_tag_bench(c: &mut Criterion) {
+    let inputs = &[
+        // Empty string: the cheapest possible call.
+        "",
+        // Clean ASCII values shaped like real tags.
+        "ascii:http.method:get",
+        "ascii:env:production",
+        "ascii:resource:get_/api/v1/users/{id}",
+        // ASCII containing upper case, spaces and punctuation.
+        "mixed:Some Service Name!!",
+        // Non-ASCII text: accented Latin, emoji and Japanese.
+        "unicode:café-Über-Sérvice",
+        "unicode:Data🐨dog🐶 繋がっ⛰てて",
+        // Over-length input (the cap is MAX_TAG_LEN = 200 per the original note; not shown here).
+        "over-length-ascii:over_length_ascii_value_that_keeps_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going_and_going",
+    ];
+    let tag_group = c.benchmark_group("normalization/normalize_tag");
+    normalize_fnmut_string(tag_group, inputs, 1000, "normalize_tag", normalize_tag);
+}
+
+/// Benchmarks `normalize_metric_name` (reached through its `bench-internals` wrapper) on the
+/// empty string, names in dotted form, names with other characters, and an over-length name.
+fn normalize_metric_name_bench(c: &mut Criterion) {
+    let inputs = &[
+        // Empty string: the cheapest possible call.
+        "",
+        // Dotted lower-case names.
+        "http.request",
+        "django.controller",
+        // A name with upper case, a space and slashes, then a dotted name with underscores.
+        "GET /some/raclette",
+        "rails.action_controller.process",
+        // Over-length input (the cap is MAX_NAME_LEN = 100 per the original note; not shown here).
+        "Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.Too-Long-.",
+    ];
+    let metric_group = c.benchmark_group("normalization/normalize_metric_name");
+    normalize_fnmut_string(
+        metric_group,
+        inputs,
+        1000,
+        "normalize_metric_name",
+        normalize_metric_name_bench_wrapper,
+    );
+}
+
+/// Benchmarks the `truncate_utf8` wrapper on inputs longer than the byte limit, so every call
+/// has to shorten its string. One input is pure ASCII (256 bytes); the other repeats a 3-byte
+/// character (384 bytes), so byte offset 100 is not a char boundary because 100 is not a
+/// multiple of 3. Benchmark ids use the short labels rather than the inputs themselves.
+fn truncate_utf8_bench(c: &mut Criterion) {
+    // Byte limit for every call; the original note states the service/name/type caps are 100.
+    const LIMIT: usize = 100;
+    // `繋` encodes to 3 bytes in UTF-8, which puts the limit in the middle of a codepoint.
+    let wide_input = "繋".repeat(128);
+    let narrow_input = "a".repeat(256);
+    let labelled: &[(&str, &str)] = &[
+        ("over-length-ascii", &narrow_input[..]),
+        ("over-length-unicode", &wide_input[..]),
+    ];
+    let truncate_group = c.benchmark_group("normalization/truncate_utf8");
+
+    normalize_fnmut_string_with(
+        truncate_group,
+        labelled,
+        1000,
+        "truncate_utf8",
+        |text: &mut String| truncate_utf8_bench_wrapper(text, LIMIT),
+    );
+}
+
+/// Benchmarks `normalize_span_start_duration` over batches of `(start, duration)` pairs. The
+/// "clean" case uses a start timestamp from 2015, which is expected to skip the clock; the
+/// "needs-clock" case uses a start of 0, which is expected to take the year-2000 fallback and
+/// read `SystemTime`. Comparing the two shows what that read costs per span.
+fn normalize_span_start_duration_bench(c: &mut Criterion) {
+    // Spans per measured iteration. Batching keeps the small per-span cost from being swamped
+    // by timer overhead. The (untimed) setup rebuilds the batch every time because the function
+    // mutates its arguments: once a too-old `start` has been rewritten to a recent value, a
+    // second call on the same pair would no longer take the clock branch.
+    const ELEMENTS: usize = 1000;
+
+    // Named scenarios as (label, start, duration).
+    let scenarios: &[(&str, i64, i64)] = &[
+        ("clean", 1_448_466_874_000_000_000, 10_000_000),
+        ("needs-clock", 0, 10_000_000),
+    ];
+
+    let mut span_group = c.benchmark_group("normalization/normalize_span_start_duration");
+    span_group
+        .throughput(Elements(ELEMENTS as u64))
+        .warm_up_time(Duration::from_secs(1))
+        .measurement_time(Duration::from_secs(2))
+        .sample_size(200)
+        .sampling_mode(criterion::SamplingMode::Flat);
+
+    for &(label, start, duration) in scenarios {
+        span_group.bench_with_input(
+            BenchmarkId::new("normalize_span_start_duration", label),
+            &(start, duration),
+            |bencher, &(start_ns, duration_ns)| {
+                bencher.iter_batched_ref(
+                    || vec![(start_ns, duration_ns); ELEMENTS],
+                    |batch| {
+                        batch.iter_mut().for_each(|(s, d)| {
+                            normalize_span_start_duration(black_box(s), black_box(d));
+                        });
+                    },
+                    BatchSize::LargeInput,
+                )
+            },
+        );
+    }
+    span_group.finish();
+}
+
+/// Variant of [`normalize_fnmut_string`] whose cases are `(label, input)` pairs: the label,
+/// not the input, becomes the benchmark id parameter, so long inputs stay out of the ids.
+///
+/// Each measured iteration applies `function` once to every one of `elements` fresh copies of
+/// the input; the copies are made in the untimed setup closure and passed to it by `&mut`.
+#[inline]
+fn normalize_fnmut_string_with<F>(
+    mut group: BenchmarkGroup<WallTime>,
+    cases: &[(&str, &str)],
+    elements: usize,
+    function_name: &str,
+    mut function: F,
+) where
+    F: FnMut(&mut String),
+{
+    group
+        .throughput(Elements(elements as u64))
+        .warm_up_time(Duration::from_secs(1))
+        .measurement_time(Duration::from_secs(2))
+        .sample_size(200)
+        .sampling_mode(criterion::SamplingMode::Flat);
+
+    for &(label, input) in cases {
+        group.bench_with_input(BenchmarkId::new(function_name, label), input, |bencher, text| {
+            bencher.iter_batched_ref(
+                || (0..elements).map(|_| text.to_owned()).collect::<Vec<String>>(),
+                |batch| {
+                    #[allow(clippy::unit_arg)]
+                    batch.iter_mut().for_each(|owned| {
+                        black_box(function(black_box(owned)));
+                    });
+                },
+                BatchSize::LargeInput,
+            )
+        });
+    }
+    group.finish();
+}
+
 criterion_group!(
     benches,
     normalize_service_bench,
     normalize_name_bench,
-    normalize_span_bench
+    normalize_span_bench,
+    normalize_tag_bench,
+    normalize_metric_name_bench,
+    truncate_utf8_bench,
+    normalize_span_start_duration_bench
 );
 criterion_main!(benches);
diff --git a/libdd-trace-normalization/src/normalize_utils.rs b/libdd-trace-normalization/src/normalize_utils.rs
index b70093c817..0c188cdded 100644
--- a/libdd-trace-normalization/src/normalize_utils.rs
+++ b/libdd-trace-normalization/src/normalize_utils.rs
@@ -272,6 +272,15 @@ fn normalize_metric_name(name: &mut String) {
     bytes.truncate(write_cursor);
 }
 
+/// Benchmark-only entry point that forwards to the private [`normalize_metric_name`]; compiled
+/// only with the `bench-internals` feature and hidden from docs, so it is not public API.
+#[cfg(feature = "bench-internals")]
+#[inline(always)]
+#[doc(hidden)]
+pub fn normalize_metric_name_bench_wrapper(name: &mut String) {
+    normalize_metric_name(name);
+}
+
 // truncate_utf8 truncates the given string to make sure it uses less than limit bytes.
 // If the last character is a utf8 character that would be split, it removes it
 // entirely to make sure the resulting string is not broken.
@@ -280,6 +289,15 @@ pub(crate) fn truncate_utf8(s: &mut String, limit: usize) {
     s.truncate(boundary);
 }
 
+/// Benchmark-only entry point that forwards to the crate-private [`truncate_utf8`]; compiled
+/// only with the `bench-internals` feature and hidden from docs, so it is not public API.
+#[cfg(feature = "bench-internals")]
+#[inline(always)]
+#[doc(hidden)]
+pub fn truncate_utf8_bench_wrapper(s: &mut String, limit: usize) {
+    truncate_utf8(s, limit);
+}
+
 #[cfg(test)]
 mod tests {