Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ First release.
estimate, following each provider's documented tokenization.
- `route_pdf`, which routes each page to text, OCR, or vision and reports the
tokens saved versus sending every page as an image.
- Routed text is normalized — stray control characters (e.g. PDF discretionary
hyphens) are stripped and CR/CRLF line endings are normalized to `\n` — while
classification still runs on the raw text layer.
- `localctx` command-line interface.
- A `local-context-router` Agent Skill for Claude Code and Codex.

Expand Down
2 changes: 2 additions & 0 deletions src/localcontextrouter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .ocr import ocr_png_text, run_ocr
from .pdf import Pdf, classify_pdf
from .router import route_pdf
from .text import clean_text
from .tokens import (
claude_image_tokens,
estimate_text_tokens,
Expand All @@ -40,6 +41,7 @@
"claude_image_tokens",
"classify_pdf",
"classify_text",
"clean_text",
"compute_signals",
"estimate_text_tokens",
"is_vision_worthy",
Expand Down
5 changes: 3 additions & 2 deletions src/localcontextrouter/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .models import PageClass, PageRoute, RouteResult, Source, TokenEstimate
from .ocr import ocr_png_text
from .pdf import Pdf
from .text import clean_text
from .tokens import claude_image_tokens, estimate_text_tokens


Expand All @@ -31,10 +32,10 @@ def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult:

if classification.page_class is PageClass.DIGITAL:
source = Source.VISION if is_vision_worthy(features)[0] else Source.TEXT
page_text = text
page_text = clean_text(text)
else:
source = Source.OCR
page_text = ocr_png_text(pdf.render_page_png(index, scale=render_scale))
page_text = clean_text(ocr_png_text(pdf.render_page_png(index, scale=render_scale)))

# text_tokens reflects the text we would actually send (OCR output for
# scanned pages), so the reported savings are honest.
Expand Down
23 changes: 23 additions & 0 deletions src/localcontextrouter/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Text normalization for routed output.

Applied to the text a page contributes to the model — not before
classification, which relies on seeing control and replacement characters to
spot a broken text layer.
"""

from __future__ import annotations

import unicodedata

_KEEP = {"\n", "\t"}


def clean_text(text: str) -> str:
"""Normalize line endings and drop stray control characters.

PDFs sometimes encode discretionary hyphens and other artifacts as control
characters (e.g. U+0002), which would otherwise leak into the model's input.
Newlines and tabs are preserved; CR and CRLF collapse to ``\\n``.
"""
text = text.replace("\r\n", "\n").replace("\r", "\n")
return "".join(char for char in text if char in _KEEP or unicodedata.category(char) != "Cc")
12 changes: 12 additions & 0 deletions tests/test_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
import pytest

from localcontextrouter.models import Source
from localcontextrouter.pdf import Pdf
from localcontextrouter.router import route_pdf
from localcontextrouter.text import clean_text

PROSE = (
"The quarterly report summarizes revenue, expenses, and net income for the "
Expand All @@ -31,6 +33,16 @@ def test_text_page_reports_token_savings(make_text_pdf: Callable[..., Path]) ->
assert result.tokens_saved == page.tokens.saved > 0


def test_output_text_is_cleaned(make_text_pdf: Callable[..., Path]) -> None:
    """The text emitted for a routed page equals the cleaned raw text layer."""
    document = make_text_pdf(PROSE)
    routed = route_pdf(document)
    with Pdf(document) as handle:
        raw_layer = handle.page_text(0)
    # The router emits cleaned text; classification still runs on the raw layer.
    emitted = routed.pages[0].text
    assert emitted == clean_text(raw_layer)
    assert "\r" not in emitted


def test_routes_table_page_to_vision(make_table_pdf: Callable[..., Path]) -> None:
result = route_pdf(make_table_pdf())
page = result.pages[0]
Expand Down
27 changes: 27 additions & 0 deletions tests/test_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Tests for output text normalization."""

from localcontextrouter.text import clean_text


def test_strips_control_characters() -> None:
    """Control characters are removed and the surrounding text is joined."""
    cases = (
        # U+0002 is how some PDFs encode a discretionary hyphen.
        ("observ\x02ability", "observability"),
        ("a\x00b\x07c", "abc"),
    )
    for raw, expected in cases:
        assert clean_text(raw) == expected


def test_keeps_newlines_and_tabs() -> None:
    """Tab and newline pass through cleaning unchanged."""
    sample = "a\tb\nc"
    cleaned = clean_text(sample)
    assert cleaned == sample


def test_normalizes_line_endings() -> None:
    """CRLF and a lone CR each become a single newline."""
    mixed_endings = "a\r\nb\rc"
    cleaned = clean_text(mixed_endings)
    assert cleaned == "a\nb\nc"


def test_leaves_clean_text_untouched() -> None:
    """Text with no control characters or CRs comes back unchanged."""
    already_clean = "Revenue rose. Net income up.\nAll figures in USD."
    cleaned = clean_text(already_clean)
    assert cleaned == already_clean


def test_preserves_unicode_punctuation() -> None:
    """Non-ASCII punctuation and accented letters survive cleaning."""
    # Bullets and accents are not control characters and must survive.
    sample = "• café"
    cleaned = clean_text(sample)
    assert cleaned == sample
Loading