@dataclass(frozen=True)
class PageFeatures:
    """Cheap layout measurements for one page, taken from its content objects."""

    width: float
    """Width of the page, in PDF points."""

    height: float
    """Height of the page, in PDF points."""

    image_count: int
    """How many raster image objects the page contains."""

    image_coverage: float
    """Share of the page area occupied by raster images, in 0...1."""

    path_count: int
    """How many vector path objects (lines, curves, fills) the page contains."""

    path_coverage: float
    """Share of the page area occupied by vector paths, in 0...1."""


@dataclass(frozen=True)
class TokenEstimate:
    """Token cost of one page sent as extracted text compared with as an image."""

    text_tokens: int
    image_tokens: int

    @property
    def saved(self) -> int:
        """Return the tokens spared by sending text rather than the image.

        The value is floored at zero, so a page whose text costs more than
        its image reports no saving instead of a negative one.
        """
        difference = self.image_tokens - self.text_tokens
        return difference if difference > 0 else 0


@dataclass(frozen=True)
class PageRoute:
    """How a single page was routed, plus the content and cost that go with it."""

    index: int
    classification: Classification
    source: Source
    text: str
    tokens: TokenEstimate
def page_features(self, index: int) -> PageFeatures:
    """Summarize the image and vector-path content of the page at ``index``.

    Counts the raster image objects and vector path objects on the page and
    reports, for each kind, the summed bounding-box area as a fraction of
    the page area. Charts and diagrams emit many vector paths rather than
    raster images, so the path signals catch content an image count misses.

    The coverage values are an upper-bound approximation: bounding boxes are
    summed without removing overlap, so the raw sum can exceed the page
    area. Each ratio is therefore clamped to ``1.0`` to honour the 0...1
    range that ``PageFeatures`` documents. A page with zero area reports
    ``0.0`` coverage.

    Args:
        index: Zero-based page index into the open document.

    Returns:
        The page size, object counts, and clamped coverage fractions.
    """
    page = self._doc[index]
    try:
        width, height = page.get_size()
        page_area = width * height

        def coverage(covered_area: float) -> float:
            # Guard the degenerate zero-area page, then clamp: overlapping or
            # oversized objects would otherwise push the ratio above 1.
            if not page_area:
                return 0.0
            return min(1.0, covered_area / page_area)

        image_count = path_count = 0
        image_area = path_area = 0.0
        for obj in page.get_objects():
            left, bottom, right, top = obj.get_bounds()
            # abs() keeps the area non-negative even if a box is reported
            # with swapped edges.
            area = abs((right - left) * (top - bottom))
            if obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE:
                image_count += 1
                image_area += area
            elif obj.type == pdfium_c.FPDF_PAGEOBJ_PATH:
                path_count += 1
                path_area += area
        return PageFeatures(
            width=width,
            height=height,
            image_count=image_count,
            image_coverage=coverage(image_area),
            path_count=path_count,
            path_coverage=coverage(path_area),
        )
    finally:
        # Release the page handle even when feature extraction raises.
        page.close()
#: Vector-path count at or above which a page is treated as a table or diagram
#: (charts and ruled tables emit many line/curve objects).
MIN_VISION_PATHS = 25

#: Raster-image coverage fraction at or above which a page counts as figure-heavy.
MIN_VISION_IMAGE_COVERAGE = 0.40

#: Vector-path coverage fraction at or above which a page is taken to hold a
#: large filled chart or diagram.
MIN_VISION_PATH_COVERAGE = 0.30


def is_vision_worthy(features: PageFeatures) -> tuple[bool, str]:
    """Decide whether a page belongs with a vision model and explain why.

    The checks run in a fixed order (image coverage, then vector coverage,
    then vector-path count) and the first threshold that is met determines
    the reason string.

    Returns:
        ``(True, reason)`` when any threshold is met, otherwise
        ``(False, reason)`` stating that the text layer is faithful.
    """
    if features.image_coverage >= MIN_VISION_IMAGE_COVERAGE:
        worthy = True
        reason = (
            f"{features.image_coverage:.0%} image coverage "
            f"(>= {MIN_VISION_IMAGE_COVERAGE:.0%}); figure-heavy"
        )
    elif features.path_coverage >= MIN_VISION_PATH_COVERAGE:
        worthy = True
        reason = (
            f"{features.path_coverage:.0%} vector coverage "
            f"(>= {MIN_VISION_PATH_COVERAGE:.0%}); large chart or diagram"
        )
    elif features.path_count >= MIN_VISION_PATHS:
        worthy = True
        reason = f"{features.path_count} vector paths (>= {MIN_VISION_PATHS}); table or diagram"
    else:
        worthy = False
        reason = "no dominant visual structure; text is faithful"
    return worthy, reason
+- OpenAI's tile models bill a flat 85 tokens at ``detail="low"``; at ``"high"`` + they fit the image to 2048x2048, scale the short side to 768, and bill + 85 + 170 per 512x512 tile. +- Text is approximated at ~4 characters per token. +""" + +from __future__ import annotations + +import math + +_PATCH = 28 +_CLAUDE_MAX_TOKENS = 1568 +_CLAUDE_MAX_EDGE = 1568 +_CLAUDE_HIRES_MAX_TOKENS = 4784 +_CLAUDE_HIRES_MAX_EDGE = 2576 + +_CHARS_PER_TOKEN = 4 + + +def estimate_text_tokens(text: str) -> int: + """Estimate tokens for a block of text (~4 characters per token).""" + return math.ceil(len(text) / _CHARS_PER_TOKEN) + + +def _fit_long_edge(width: float, height: float, max_edge: int) -> tuple[float, float]: + long_edge = max(width, height) + if long_edge <= max_edge: + return width, height + scale = max_edge / long_edge + return width * scale, height * scale + + +def claude_image_tokens(width: float, height: float, *, high_res: bool = False) -> int: + """Estimate Claude image tokens for an image of the given pixel size.""" + if width <= 0 or height <= 0: + return 0 + max_edge = _CLAUDE_HIRES_MAX_EDGE if high_res else _CLAUDE_MAX_EDGE + cap = _CLAUDE_HIRES_MAX_TOKENS if high_res else _CLAUDE_MAX_TOKENS + fitted_w, fitted_h = _fit_long_edge(width, height, max_edge) + patches = math.ceil(fitted_w / _PATCH) * math.ceil(fitted_h / _PATCH) + return min(patches, cap) + + +def openai_image_tokens(width: float, height: float, *, detail: str = "high") -> int: + """Estimate OpenAI tile-model image tokens (GPT-4o / GPT-4.1 family).""" + if detail == "low": + return 85 + if width <= 0 or height <= 0: + return 0 + # Fit within 2048x2048, then scale the shortest side to 768. 
+ fitted_w, fitted_h = _fit_long_edge(width, height, 2048) + short_edge = min(fitted_w, fitted_h) + if short_edge > 768: + scale = 768 / short_edge + fitted_w, fitted_h = fitted_w * scale, fitted_h * scale + tiles = math.ceil(fitted_w / 512) * math.ceil(fitted_h / 512) + return 85 + 170 * tiles From 925d1cb5371374b047665cf3cc4c45a36e71459d Mon Sep 17 00:00:00 2001 From: Siddharth Nashikkar Date: Fri, 19 Jun 2026 13:03:58 -0400 Subject: [PATCH 5/6] feat(core): route vision-worthy pages and report savings route_pdf now sends visually-dominant pages to vision and attaches a token estimate to every page, so RouteResult.tokens_saved shows the cost avoided. --- src/localcontextrouter/__init__.py | 14 ++++++++++++ src/localcontextrouter/router.py | 35 +++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py index e8a2f1e..767f375 100644 --- a/src/localcontextrouter/__init__.py +++ b/src/localcontextrouter/__init__.py @@ -1,19 +1,27 @@ """LocalContextRouter — cheapest faithful path for documents bound for a multimodal LLM.""" from .classify import classify_text, compute_signals +from .detect import is_vision_worthy from .models import ( BoundingBox, Classification, OcrLine, PageClass, + PageFeatures, PageRoute, PageSignals, RouteResult, Source, + TokenEstimate, ) from .ocr import ocr_png_text, run_ocr from .pdf import Pdf, classify_pdf from .router import route_pdf +from .tokens import ( + claude_image_tokens, + estimate_text_tokens, + openai_image_tokens, +) __version__ = "0.0.0" @@ -22,15 +30,21 @@ "Classification", "OcrLine", "PageClass", + "PageFeatures", "PageRoute", "PageSignals", "Pdf", "RouteResult", "Source", + "TokenEstimate", + "claude_image_tokens", "classify_pdf", "classify_text", "compute_signals", + "estimate_text_tokens", + "is_vision_worthy", "ocr_png_text", + "openai_image_tokens", "route_pdf", "run_ocr", "__version__", diff --git 
def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult:
    """Route each page of the PDF at ``path`` and collect the per-page outcome.

    A page classified as digital keeps its extracted text and is marked for
    vision when the layout detector flags it, otherwise for text. Any other
    page is rendered at ``render_scale`` and its text comes from OCR. Every
    page also carries a token estimate.

    Args:
        path: Location of the PDF to route.
        render_scale: Render scale used for OCR and for sizing the image in
            the image-token estimate.

    Returns:
        A ``RouteResult`` holding one ``PageRoute`` per page, in page order.
    """
    routed: list[PageRoute] = []
    with Pdf(path) as pdf:
        for page_index in range(len(pdf)):
            embedded_text = pdf.page_text(page_index)
            classification = classify_text(embedded_text)
            features = pdf.page_features(page_index)

            if classification.page_class is not PageClass.DIGITAL:
                source = Source.OCR
                rendered = pdf.render_page_png(page_index, scale=render_scale)
                content = ocr_png_text(rendered)
            else:
                content = embedded_text
                vision_worthy = is_vision_worthy(features)[0]
                source = Source.VISION if vision_worthy else Source.TEXT

            # The text cost is measured on the content actually kept (OCR
            # output for scanned pages). The image cost uses the rendered
            # pixel size: page size in points times the render scale.
            text_cost = estimate_text_tokens(content)
            image_cost = claude_image_tokens(
                features.width * render_scale, features.height * render_scale
            )
            estimate = TokenEstimate(text_tokens=text_cost, image_tokens=image_cost)
            routed.append(
                PageRoute(
                    index=page_index,
                    classification=classification,
                    source=source,
                    text=content,
                    tokens=estimate,
                )
            )
    return RouteResult(routed)
def _features(
    *,
    image_count: int = 0,
    image_coverage: float = 0.0,
    path_count: int = 0,
    path_coverage: float = 0.0,
) -> PageFeatures:
    """Build synthetic features for a 600x800 page; every signal defaults to zero."""
    signals = {
        "image_count": image_count,
        "image_coverage": image_coverage,
        "path_count": path_count,
        "path_coverage": path_coverage,
    }
    return PageFeatures(width=600, height=800, **signals)


def test_plain_page_is_not_vision_worthy() -> None:
    # A few stray paths stay below every threshold.
    verdict = is_vision_worthy(_features(path_count=3))
    assert verdict[0] is False


def test_figure_heavy_page_is_vision_worthy() -> None:
    flagged, why = is_vision_worthy(_features(image_count=1, image_coverage=0.6))
    assert flagged is True
    assert "image" in why


def test_large_vector_chart_is_vision_worthy() -> None:
    flagged, why = is_vision_worthy(_features(path_count=5, path_coverage=0.5))
    assert flagged is True
    assert any(word in why for word in ("chart", "diagram"))


def test_many_paths_is_vision_worthy() -> None:
    flagged, why = is_vision_worthy(_features(path_count=40))
    assert flagged is True
    assert any(word in why for word in ("table", "diagram"))


def test_table_pdf_features_trip_detector(make_table_pdf: Callable[..., Path]) -> None:
    # Features extracted from a generated ruled-table page must trip the detector.
    table_path = make_table_pdf()
    with Pdf(table_path) as pdf:
        extracted = pdf.page_features(0)
    assert extracted.path_count >= 25
    assert is_vision_worthy(extracted)[0] is True


def test_prose_pdf_is_not_vision_worthy(make_text_pdf: Callable[..., Path]) -> None:
    # A generated prose-only page must not be flagged.
    prose_path = make_text_pdf("Plain prose with several sentences of body text. " * 3)
    with Pdf(prose_path) as pdf:
        extracted = pdf.page_features(0)
    assert is_vision_worthy(extracted)[0] is False
def test_estimate_text_tokens() -> None:
    # Roughly four characters per token, rounded up.
    for text, expected in (("", 0), ("abcd", 1), ("abcde", 2)):
        assert estimate_text_tokens(text) == expected


def test_claude_tokens_match_documented_patch_counts() -> None:
    # 28x28 patches: ceil(w/28) * ceil(h/28).
    for side, expected in ((1000, 1296), (1092, 1521)):
        assert claude_image_tokens(side, side) == expected


def test_claude_tokens_capped_for_large_images() -> None:
    standard = claude_image_tokens(4000, 4000)
    high_res = claude_image_tokens(8000, 8000, high_res=True)
    assert standard == 1568
    assert high_res == 4784


def test_claude_high_res_three_megapixel_page() -> None:
    # 2000x1500 fits under the 2576 px high-res edge, so no downscale.
    tokens = claude_image_tokens(2000, 1500, high_res=True)
    assert tokens == 3888


def test_claude_tokens_zero_for_empty() -> None:
    tokens = claude_image_tokens(0, 100)
    assert tokens == 0


def test_openai_low_detail_is_flat() -> None:
    tokens = openai_image_tokens(4000, 4000, detail="low")
    assert tokens == 85


def test_openai_high_detail_match_documented_examples() -> None:
    for (width, height), expected in (((1024, 1024), 765), ((2048, 4096), 1105)):
        assert openai_image_tokens(width, height) == expected