diff --git a/CHANGELOG.md b/CHANGELOG.md index b02f87c..733d7e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,9 @@ First release. estimate, following each provider's documented tokenization. - `route_pdf`, which routes each page to text, OCR, or vision and reports the tokens saved versus sending every page as an image. +- Routed text is normalized — stray control characters (e.g. PDF discretionary + hyphens) are stripped and line endings collapsed — while classification still + runs on the raw text layer. - `localctx` command-line interface. - A `local-context-router` Agent Skill for Claude Code and Codex. diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py index 4d555d2..0e1d024 100644 --- a/src/localcontextrouter/__init__.py +++ b/src/localcontextrouter/__init__.py @@ -17,6 +17,7 @@ from .ocr import ocr_png_text, run_ocr from .pdf import Pdf, classify_pdf from .router import route_pdf +from .text import clean_text from .tokens import ( claude_image_tokens, estimate_text_tokens, @@ -40,6 +41,7 @@ "claude_image_tokens", "classify_pdf", "classify_text", + "clean_text", "compute_signals", "estimate_text_tokens", "is_vision_worthy", diff --git a/src/localcontextrouter/router.py b/src/localcontextrouter/router.py index d6a0fdf..1869bdf 100644 --- a/src/localcontextrouter/router.py +++ b/src/localcontextrouter/router.py @@ -17,6 +17,7 @@ from .models import PageClass, PageRoute, RouteResult, Source, TokenEstimate from .ocr import ocr_png_text from .pdf import Pdf +from .text import clean_text from .tokens import claude_image_tokens, estimate_text_tokens @@ -31,10 +32,10 @@ def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult: if classification.page_class is PageClass.DIGITAL: source = Source.VISION if is_vision_worthy(features)[0] else Source.TEXT - page_text = text + page_text = clean_text(text) else: source = Source.OCR - page_text = ocr_png_text(pdf.render_page_png(index, scale=render_scale)) + page_text = clean_text(ocr_png_text(pdf.render_page_png(index, scale=render_scale))) # text_tokens reflects the text we would actually send (OCR output for # scanned pages), so the reported savings are honest. diff --git a/src/localcontextrouter/text.py b/src/localcontextrouter/text.py new file mode 100644 index 0000000..ebf8a95 --- /dev/null +++ b/src/localcontextrouter/text.py @@ -0,0 +1,23 @@ +"""Text normalization for routed output. + +Applied to the text a page contributes to the model — not before +classification, which relies on seeing control and replacement characters to +spot a broken text layer. +""" + +from __future__ import annotations + +import unicodedata + +_KEEP = {"\n", "\t"} + + +def clean_text(text: str) -> str: + """Normalize line endings and drop stray control characters. + + PDFs sometimes encode discretionary hyphens and other artifacts as control + characters (e.g. U+0002), which would otherwise leak into the model's input. + Newlines and tabs are preserved; CR and CRLF collapse to ``\\n``. + """ + text = text.replace("\r\n", "\n").replace("\r", "\n") + return "".join(char for char in text if char in _KEEP or unicodedata.category(char) != "Cc") diff --git a/tests/test_router.py b/tests/test_router.py index 1b0270f..a114e08 100644 --- a/tests/test_router.py +++ b/tests/test_router.py @@ -6,7 +6,9 @@ import pytest from localcontextrouter.models import Source +from localcontextrouter.pdf import Pdf from localcontextrouter.router import route_pdf +from localcontextrouter.text import clean_text PROSE = ( "The quarterly report summarizes revenue, expenses, and net income for the " @@ -31,6 +33,16 @@ def test_text_page_reports_token_savings(make_text_pdf: Callable[..., Path]) -> assert result.tokens_saved == page.tokens.saved > 0 +def test_output_text_is_cleaned(make_text_pdf: Callable[..., Path]) -> None: + pdf_path = make_text_pdf(PROSE) + result = route_pdf(pdf_path) + with Pdf(pdf_path) as pdf: + raw = pdf.page_text(0) + # The router emits cleaned text; classification still runs on the raw layer. + assert result.pages[0].text == clean_text(raw) + assert "\r" not in result.pages[0].text + + def test_routes_table_page_to_vision(make_table_pdf: Callable[..., Path]) -> None: result = route_pdf(make_table_pdf()) page = result.pages[0] diff --git a/tests/test_text.py b/tests/test_text.py new file mode 100644 index 0000000..7e63ee5 --- /dev/null +++ b/tests/test_text.py @@ -0,0 +1,27 @@ +"""Tests for output text normalization.""" + +from localcontextrouter.text import clean_text + + +def test_strips_control_characters() -> None: + # U+0002 is how some PDFs encode a discretionary hyphen. + assert clean_text("observ\x02ability") == "observability" + assert clean_text("a\x00b\x07c") == "abc" + + +def test_keeps_newlines_and_tabs() -> None: + assert clean_text("a\tb\nc") == "a\tb\nc" + + +def test_normalizes_line_endings() -> None: + assert clean_text("a\r\nb\rc") == "a\nb\nc" + + +def test_leaves_clean_text_untouched() -> None: + text = "Revenue rose. Net income up.\nAll figures in USD." + assert clean_text(text) == text + + +def test_preserves_unicode_punctuation() -> None: + # Bullets and accents are not control characters and must survive. + assert clean_text("• café") == "• café"