sid732 · sid732 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,9 @@ First release.
   estimate, following each provider's documented tokenization.
 - `route_pdf`, which routes each page to text, OCR, or vision and reports the
   tokens saved versus sending every page as an image.
+- Routed text is normalized — control characters other than newline and tab
+  (e.g. PDF discretionary hyphens) are stripped and CR/CRLF become LF — while
+  classification still runs on the raw text layer.
 - `localctx` command-line interface.
 - A `local-context-router` Agent Skill for Claude Code and Codex.
 

diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py
@@ -17,6 +17,7 @@
 from .ocr import ocr_png_text, run_ocr
 from .pdf import Pdf, classify_pdf
 from .router import route_pdf
+from .text import clean_text
 from .tokens import (
     claude_image_tokens,
     estimate_text_tokens,
@@ -40,6 +41,7 @@
     "claude_image_tokens",
     "classify_pdf",
     "classify_text",
+    "clean_text",
     "compute_signals",
     "estimate_text_tokens",
     "is_vision_worthy",

diff --git a/src/localcontextrouter/router.py b/src/localcontextrouter/router.py
@@ -17,6 +17,7 @@
 from .models import PageClass, PageRoute, RouteResult, Source, TokenEstimate
 from .ocr import ocr_png_text
 from .pdf import Pdf
+from .text import clean_text
 from .tokens import claude_image_tokens, estimate_text_tokens
 
 
@@ -31,10 +32,10 @@ def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult:
 
             if classification.page_class is PageClass.DIGITAL:
                 source = Source.VISION if is_vision_worthy(features)[0] else Source.TEXT
-                page_text = text
+                page_text = clean_text(text)
             else:
                 source = Source.OCR
-                page_text = ocr_png_text(pdf.render_page_png(index, scale=render_scale))
+                page_text = clean_text(ocr_png_text(pdf.render_page_png(index, scale=render_scale)))
 
             # text_tokens reflects the text we would actually send (OCR output for
             # scanned pages), so the reported savings are honest.

diff --git a/src/localcontextrouter/text.py b/src/localcontextrouter/text.py
@@ -0,0 +1,23 @@
+"""Text normalization for routed output.
+
+Applied to the text a page contributes to the model — not before
+classification, which relies on seeing control and replacement characters to
+spot a broken text layer.
+"""
+
+from __future__ import annotations
+
+import unicodedata
+
+_KEEP = {"\n", "\t"}
+
+
+def clean_text(text: str) -> str:
+    """Normalize line endings and drop stray control characters.
+
+    PDFs sometimes encode discretionary hyphens and other artifacts as control
+    characters (e.g. U+0002). CR and CRLF become ``\\n``; every remaining ``Cc``
+    character other than ``\\n`` and ``\\t`` is deleted, joining the text around it.
+    """
+    text = text.replace("\r\n", "\n").replace("\r", "\n")
+    return "".join(char for char in text if char in _KEEP or unicodedata.category(char) != "Cc")
diff --git a/tests/test_router.py b/tests/test_router.py
@@ -6,7 +6,9 @@
 import pytest
 
 from localcontextrouter.models import Source
+from localcontextrouter.pdf import Pdf
 from localcontextrouter.router import route_pdf
+from localcontextrouter.text import clean_text
 
 PROSE = (
     "The quarterly report summarizes revenue, expenses, and net income for the "
@@ -31,6 +33,16 @@ def test_text_page_reports_token_savings(make_text_pdf: Callable[..., Path]) ->
     assert result.tokens_saved == page.tokens.saved > 0
 
 
+def test_output_text_is_cleaned(make_text_pdf: Callable[..., Path]) -> None:
+    pdf_path = make_text_pdf(PROSE)
+    result = route_pdf(pdf_path)
+    with Pdf(pdf_path) as pdf:
+        raw = pdf.page_text(0)
+    # The router emits cleaned text; classification still runs on the raw layer.
+    assert result.pages[0].text == clean_text(raw)
+    assert "\r" not in result.pages[0].text
+
+
 def test_routes_table_page_to_vision(make_table_pdf: Callable[..., Path]) -> None:
     result = route_pdf(make_table_pdf())
     page = result.pages[0]

diff --git a/tests/test_text.py b/tests/test_text.py
@@ -0,0 +1,27 @@
+"""Tests for output text normalization."""
+
+from localcontextrouter.text import clean_text
+
+
+def test_strips_control_characters() -> None:
+    # U+0002 is how some PDFs encode a discretionary hyphen.
+    assert clean_text("observ\x02ability") == "observability"
+    assert clean_text("a\x00b\x07c") == "abc"
+
+
+def test_keeps_newlines_and_tabs() -> None:
+    assert clean_text("a\tb\nc") == "a\tb\nc"
+
+
+def test_normalizes_line_endings() -> None:
+    assert clean_text("a\r\nb\rc") == "a\nb\nc"
+
+
+def test_leaves_clean_text_untouched() -> None:
+    text = "Revenue rose. Net income up.\nAll figures in USD."
+    assert clean_text(text) == text
+
+
+def test_preserves_unicode_punctuation() -> None:
+    # Bullets and accents are not control characters and must survive.
+    assert clean_text("• café") == "• café"