Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/localcontextrouter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
"""LocalContextRouter — cheapest faithful path for documents bound for a multimodal LLM."""

from .classify import classify_text, compute_signals
from .detect import is_vision_worthy
from .models import (
BoundingBox,
Classification,
OcrLine,
PageClass,
PageFeatures,
PageRoute,
PageSignals,
RouteResult,
Source,
TokenEstimate,
)
from .ocr import ocr_png_text, run_ocr
from .pdf import Pdf, classify_pdf
from .router import route_pdf
from .tokens import (
claude_image_tokens,
estimate_text_tokens,
openai_image_tokens,
)

__version__ = "0.0.0"

Expand All @@ -22,15 +30,21 @@
"Classification",
"OcrLine",
"PageClass",
"PageFeatures",
"PageRoute",
"PageSignals",
"Pdf",
"RouteResult",
"Source",
"TokenEstimate",
"claude_image_tokens",
"classify_pdf",
"classify_text",
"compute_signals",
"estimate_text_tokens",
"is_vision_worthy",
"ocr_png_text",
"openai_image_tokens",
"route_pdf",
"run_ocr",
"__version__",
Expand Down
39 changes: 39 additions & 0 deletions src/localcontextrouter/detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Decide whether a page should go to a vision model rather than as text.

Some pages carry a perfectly good text layer yet still lose their meaning when
flattened to text: tables, charts, diagrams, and figure-heavy layouts. Those are
worth the vision-token cost. This module decides that from cheap layout features
(:class:`~.models.PageFeatures`) — no rendering and no ML.
"""

from __future__ import annotations

from .models import PageFeatures

#: A page with at least this many vector paths is treated as a table or diagram.
#: Charts and ruled tables emit many line/curve objects.
#: The threshold is inclusive: it is compared with ``>=``.
MIN_VISION_PATHS = 25

#: A page with at least this fraction covered by raster images is figure-heavy.
#: Expressed as a fraction of the page area; the threshold is inclusive.
MIN_VISION_IMAGE_COVERAGE = 0.40

#: A page with at least this fraction covered by vector paths holds a large
#: filled chart or diagram.
#: Expressed as a fraction of the page area; the threshold is inclusive.
MIN_VISION_PATH_COVERAGE = 0.30


def is_vision_worthy(features: PageFeatures) -> tuple[bool, str]:
    """Decide whether a page belongs on the vision path and explain why.

    The checks run in a fixed order and the first one that fires wins:
    raster-image coverage, then vector-path coverage, then vector-path count.
    The second element of the result is a human-readable reason for the
    decision, including the case where no check fires.
    """
    # Raster images dominating the page: figure-heavy layout.
    image_share = features.image_coverage
    if image_share >= MIN_VISION_IMAGE_COVERAGE:
        reason = (
            f"{image_share:.0%} image coverage "
            f"(>= {MIN_VISION_IMAGE_COVERAGE:.0%}); figure-heavy"
        )
        return True, reason

    # Vector artwork dominating the page: a large filled chart or diagram.
    path_share = features.path_coverage
    if path_share >= MIN_VISION_PATH_COVERAGE:
        reason = (
            f"{path_share:.0%} vector coverage "
            f"(>= {MIN_VISION_PATH_COVERAGE:.0%}); large chart or diagram"
        )
        return True, reason

    # Many individual strokes, even if each is thin: ruled table or diagram.
    stroke_total = features.path_count
    if stroke_total >= MIN_VISION_PATHS:
        reason = f"{stroke_total} vector paths (>= {MIN_VISION_PATHS}); table or diagram"
        return True, reason

    return False, "no dominant visual structure; text is faithful"
53 changes: 51 additions & 2 deletions src/localcontextrouter/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,23 +65,63 @@ class OcrLine:


class Source(str, Enum):
    """Where a page's final content should come from.

    Members subclass ``str``, so each one compares equal to its plain string
    value and can be looked up from that value (``Source("ocr")``).
    """

    TEXT = "text"
    """Extracted directly from the embedded text layer."""

    OCR = "ocr"
    """Produced by on-device OCR after rendering the page."""

    VISION = "vision"
    """Send the page to a vision model — its meaning lives in the visuals."""


@dataclass(frozen=True)
class PageFeatures:
    """Layout signals for a page, derived from its content objects.

    An immutable record of the page size together with how many raster images
    and vector paths the page holds and how much of its area each kind covers.
    """

    width: float
    """Page width in PDF points."""

    height: float
    """Page height in PDF points."""

    image_count: int
    """Number of raster image objects on the page."""

    image_coverage: float
    """Fraction of the page area covered by raster images, in 0...1."""

    path_count: int
    """Number of vector path objects (lines, curves, fills)."""

    path_coverage: float
    """Fraction of the page area covered by vector paths, in 0...1."""


@dataclass(frozen=True)
class TokenEstimate:
    """Estimated token cost of one page on the text path and on the image path."""

    text_tokens: int
    image_tokens: int

    @property
    def saved(self) -> int:
        """Tokens spared by sending text rather than the page image, floored at zero."""
        difference = self.image_tokens - self.text_tokens
        # Text that costs more than the image saves nothing; never report a negative.
        return difference if difference > 0 else 0


@dataclass(frozen=True)
class PageRoute:
    """The routing outcome for one page.

    Records which page it is, how its text layer was classified, which source
    its content should come from, the text that goes with it, and the
    estimated token cost of the text and image paths.
    """

    # Position of the page within the document.
    index: int
    # Classification of the page's extracted text layer.
    classification: Classification
    # Chosen content source: text layer, OCR, or a vision model.
    source: Source
    # Text associated with the page (text layer, or OCR output for OCR pages).
    text: str
    # Estimated cost of this page as text versus as an image.
    tokens: TokenEstimate


@dataclass(frozen=True)
Expand All @@ -94,3 +134,12 @@ class RouteResult:
def text(self) -> str:
    """Return every page's text in page order, separated by blank lines."""
    page_texts = [route.text for route in self.pages]
    return "\n\n".join(page_texts)

@property
def tokens_saved(self) -> int:
    """Sum the per-page savings compared with sending every page as an image.

    Pages routed to a vision model are skipped: they go out as images, so
    there is nothing saved on them. Only text and OCR pages contribute.
    """
    total = 0
    for route in self.pages:
        if route.source is Source.VISION:
            continue
        total += route.tokens.saved
    return total
36 changes: 35 additions & 1 deletion src/localcontextrouter/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
from pathlib import Path

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

from .classify import classify_text
from .models import Classification
from .models import Classification, PageFeatures


class Pdf:
Expand Down Expand Up @@ -49,6 +50,39 @@ def page_texts(self) -> Iterator[str]:
for index in range(len(self)):
yield self.page_text(index)

def page_features(self, index: int) -> PageFeatures:
    """Summarize the page's image and vector-path content for routing.

    Counts raster image and vector path objects and the fraction of the page
    each covers. Charts and diagrams emit many vector paths rather than raster
    images, so the path signals catch content that an image count misses.

    Coverage is the summed bounding-box area of the matching objects divided
    by the page area. Overlapping objects are each counted in full, so the raw
    ratio can exceed 1; it is capped at 1.0 to stay inside the 0...1 range that
    ``PageFeatures`` documents. A page with zero area reports 0.0 coverage.
    """
    page = self._doc[index]
    try:
        width, height = page.get_size()
        page_area = width * height
        image_count = path_count = 0
        image_area = path_area = 0.0
        for obj in page.get_objects():
            is_image = obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE
            is_path = obj.type == pdfium_c.FPDF_PAGEOBJ_PATH
            if not (is_image or is_path):
                # Other object kinds feed neither signal; skip the bounds lookup.
                continue
            left, bottom, right, top = obj.get_bounds()
            area = abs((right - left) * (top - bottom))
            if is_image:
                image_count += 1
                image_area += area
            else:
                path_count += 1
                path_area += area
        return PageFeatures(
            width=width,
            height=height,
            image_count=image_count,
            image_coverage=min(1.0, image_area / page_area) if page_area else 0.0,
            path_count=path_count,
            path_coverage=min(1.0, path_area / page_area) if page_area else 0.0,
        )
    finally:
        # Release the page handle even if object inspection fails.
        page.close()

def render_page_png(self, index: int, scale: float = 2.0) -> bytes:
"""Render the page at ``index`` to PNG bytes.

Expand Down
35 changes: 27 additions & 8 deletions src/localcontextrouter/router.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,48 @@
"""Route each PDF page to the cheapest faithful source: text, OCR, or vision.

- Digital pages keep their extracted text, unless their meaning lives in visuals
  (tables, charts, diagrams) — those go to a vision model.
- Scanned or garbled pages are rendered and sent to OCR.

Every page carries a token estimate so the savings of avoiding the image path
are visible.
"""

from __future__ import annotations

from pathlib import Path

from .classify import classify_text
from .models import PageClass, PageRoute, RouteResult, Source
from .detect import is_vision_worthy
from .models import PageClass, PageRoute, RouteResult, Source, TokenEstimate
from .ocr import ocr_png_text
from .pdf import Pdf
from .tokens import claude_image_tokens, estimate_text_tokens


def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult:
    """Route every page of a PDF and return per-page content, source, and tokens.

    Each page's text layer is extracted and classified. A digital page keeps
    that text and is routed to ``Source.TEXT``, or to ``Source.VISION`` when its
    layout features mark it as vision-worthy. Any other page is rendered at
    ``render_scale`` and run through OCR, and is routed to ``Source.OCR``.

    Every page also gets a token estimate comparing the text that would be sent
    with the page rendered as an image at ``render_scale``.
    """
    pages: list[PageRoute] = []
    with Pdf(path) as pdf:
        for index in range(len(pdf)):
            text = pdf.page_text(index)
            classification = classify_text(text)
            features = pdf.page_features(index)

            if classification.page_class is PageClass.DIGITAL:
                vision_worthy, _reason = is_vision_worthy(features)
                source = Source.VISION if vision_worthy else Source.TEXT
                page_text = text
            else:
                source = Source.OCR
                page_text = ocr_png_text(pdf.render_page_png(index, scale=render_scale))

            # text_tokens reflects the text we would actually send (OCR output for
            # scanned pages), so the reported savings are honest.
            estimate = TokenEstimate(
                text_tokens=estimate_text_tokens(page_text),
                image_tokens=claude_image_tokens(
                    features.width * render_scale, features.height * render_scale
                ),
            )
            pages.append(PageRoute(index, classification, source, page_text, estimate))
    return RouteResult(pages)
65 changes: 65 additions & 0 deletions src/localcontextrouter/tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Estimate the token cost of a page as extracted text versus as an image.

The numbers are estimates that follow each provider's documented tokenization so
the router can report the savings of routing a page to text instead of vision.

- Claude tokenizes images in 28x28 pixel patches, downscaling so the long edge
fits a cap (1568 px / 1568 tokens for most models; 2576 px / 4784 tokens for
the high-resolution models).
- OpenAI's tile models bill a flat 85 tokens at ``detail="low"``; at ``"high"``
they fit the image to 2048x2048, scale the short side to 768, and bill
85 + 170 per 512x512 tile.
- Text is approximated at ~4 characters per token.
"""

from __future__ import annotations

import math

#: Side length, in pixels, of the square patches Claude tokenizes images in.
_PATCH = 28
#: Token cap and long-edge pixel cap used for most Claude models.
_CLAUDE_MAX_TOKENS = 1568
_CLAUDE_MAX_EDGE = 1568
#: Token cap and long-edge pixel cap used for the high-resolution Claude models.
_CLAUDE_HIRES_MAX_TOKENS = 4784
_CLAUDE_HIRES_MAX_EDGE = 2576

#: Approximate number of characters per token when estimating text cost.
_CHARS_PER_TOKEN = 4


def estimate_text_tokens(text: str) -> int:
    """Approximate the token count of ``text`` at about four characters per token.

    The character count is divided by the per-token ratio and rounded up, so
    any non-empty text costs at least one token and empty text costs zero.
    """
    whole, leftover = divmod(len(text), _CHARS_PER_TOKEN)
    return whole + 1 if leftover else whole


def _fit_long_edge(width: float, height: float, max_edge: int) -> tuple[float, float]:
long_edge = max(width, height)
if long_edge <= max_edge:
return width, height
scale = max_edge / long_edge
return width * scale, height * scale


def claude_image_tokens(width: float, height: float, *, high_res: bool = False) -> int:
    """Approximate how many tokens Claude bills for an image of this pixel size.

    The image is first shrunk so its long edge fits the model's pixel cap, then
    counted in whole patches across and down, and the product is limited to the
    model's token cap. ``high_res`` selects the high-resolution caps. A size
    with a non-positive edge costs zero tokens.
    """
    if width <= 0 or height <= 0:
        return 0
    if high_res:
        edge_cap, token_cap = _CLAUDE_HIRES_MAX_EDGE, _CLAUDE_HIRES_MAX_TOKENS
    else:
        edge_cap, token_cap = _CLAUDE_MAX_EDGE, _CLAUDE_MAX_TOKENS
    scaled_w, scaled_h = _fit_long_edge(width, height, edge_cap)
    columns = math.ceil(scaled_w / _PATCH)
    rows = math.ceil(scaled_h / _PATCH)
    return min(columns * rows, token_cap)


def openai_image_tokens(width: float, height: float, *, detail: str = "high") -> int:
    """Approximate OpenAI tile-model image tokens (GPT-4o / GPT-4.1 family).

    ``detail="low"`` is a flat 85 tokens regardless of size. Any other value is
    treated as high detail: the image is fitted inside 2048x2048, shrunk so its
    short side is at most 768, and billed 85 tokens plus 170 per 512x512 tile.
    A high-detail size with a non-positive edge costs zero tokens.
    """
    if detail == "low":
        return 85
    if width <= 0 or height <= 0:
        return 0
    # Step 1: bring the long edge inside the 2048 px bounding square.
    tile_w, tile_h = _fit_long_edge(width, height, 2048)
    # Step 2: shrink (never enlarge) so the short side is 768 px.
    shortest = min(tile_w, tile_h)
    if shortest > 768:
        shrink = 768 / shortest
        tile_w *= shrink
        tile_h *= shrink
    tile_count = math.ceil(tile_w / 512) * math.ceil(tile_h / 512)
    return 85 + 170 * tile_count
25 changes: 25 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,31 @@ def _make(text: str, pages: int = 1, name: str = "text.pdf") -> Path:
return _make


@pytest.fixture
def make_table_pdf(tmp_path: Path) -> Callable[..., Path]:
    """Provide a factory that writes a one-page PDF with body text and a ruled grid.

    The sentence of body text gives the page a real text layer, so it stays
    ``DIGITAL``; the grid's many ruling lines are what make it vision-worthy.
    """

    def _make(rows: int = 20, cols: int = 6, name: str = "table.pdf") -> Path:
        doc = FPDF()
        doc.add_page()
        doc.set_font("Helvetica", size=10)
        doc.multi_cell(0, 6, "Financial summary table with quarterly figures follows. " * 2)

        grid_top = 60
        grid_bottom = grid_top + rows * 8
        # Horizontal rules: one per row boundary, 8 units apart.
        for y in range(grid_top, grid_bottom + 1, 8):
            doc.line(10, y, 200, y)
        # Vertical rules: one per column boundary, 30 units apart.
        for x in range(10, 10 + cols * 30 + 1, 30):
            doc.line(x, grid_top, x, grid_bottom)

        target = tmp_path / name
        doc.output(str(target))
        return target

    return _make


@pytest.fixture
def make_image_pdf(tmp_path: Path) -> Callable[..., Path]:
"""Return a factory that writes an image-only PDF (no text layer)."""
Expand Down
Loading
Loading