Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,22 @@ jobs:
run: swift build -c release
- name: Test
run: swift test

integration:
name: Integration (OCR pipeline)
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12"
cache: pip
- name: Build OCR binary
run: swift build -c release
working-directory: ocr
- name: Install
run: python -m pip install -e ".[dev]"
- name: Integration tests
run: pytest -m integration -q
env:
LCR_OCR_BIN: ${{ github.workspace }}/ocr/.build/release/lcr-ocr
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ classifiers = [
dynamic = ["version"]
dependencies = [
"pypdfium2>=4.30",
"pillow>=10.1",
]

[project.urls]
Expand Down Expand Up @@ -64,3 +65,6 @@ ignore_missing_imports = true
[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-ra"
markers = [
"integration: tests that invoke the built lcr-ocr binary",
]
21 changes: 20 additions & 1 deletion src/localcontextrouter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,37 @@
"""LocalContextRouter — cheapest faithful path for documents bound for a multimodal LLM."""

from .classify import classify_text, compute_signals
from .models import Classification, PageClass, PageSignals
from .models import (
BoundingBox,
Classification,
OcrLine,
PageClass,
PageRoute,
PageSignals,
RouteResult,
Source,
)
from .ocr import ocr_png_text, run_ocr
from .pdf import Pdf, classify_pdf
from .router import route_pdf

__version__ = "0.0.0"

__all__ = [
"BoundingBox",
"Classification",
"OcrLine",
"PageClass",
"PageRoute",
"PageSignals",
"Pdf",
"RouteResult",
"Source",
"classify_pdf",
"classify_text",
"compute_signals",
"ocr_png_text",
"route_pdf",
"run_ocr",
"__version__",
]
51 changes: 51 additions & 0 deletions src/localcontextrouter/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,54 @@ class Classification:
page_class: PageClass
signals: PageSignals
reason: str


@dataclass(frozen=True)
class BoundingBox:
    """Rectangle in normalized coordinates.

    The origin is the top-left corner, and every field is a fraction in 0...1.
    """

    # Position of the box.
    x: float
    y: float

    # Extent of the box.
    width: float
    height: float


@dataclass(frozen=True)
class OcrLine:
    """One line of text as recognized by the OCR binary."""

    # The recognized text of the line.
    text: str
    # Confidence value reported for this line.
    confidence: float
    # Where the line sits, as a normalized box.
    bounding_box: BoundingBox


class Source(str, Enum):
    """Origin of a page's final text."""

    # Extracted directly from the embedded text layer.
    TEXT = "text"

    # Produced by on-device OCR after rendering the page.
    OCR = "ocr"


@dataclass(frozen=True)
class PageRoute:
    """Routing outcome for a single page: classification, source, and text."""

    # Index of the page this outcome belongs to.
    index: int
    # Classification assigned to the page.
    classification: Classification
    # Which source the final text came from.
    source: Source
    # The page's final text.
    text: str


@dataclass(frozen=True)
class RouteResult:
    """Routing outcome for an entire document."""

    # Per-page outcomes, in the order they were supplied.
    pages: list[PageRoute]

    @property
    def text(self) -> str:
        """Text of every page, in order, separated by one blank line."""
        page_texts = [route.text for route in self.pages]
        return "\n\n".join(page_texts)
116 changes: 116 additions & 0 deletions src/localcontextrouter/ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""Bridge to the on-device ``lcr-ocr`` binary.

The Swift binary does the actual recognition (Apple Vision); this module finds
it, invokes it, and parses its JSON output into :class:`~.models.OcrLine`.
"""

from __future__ import annotations

import json
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

from .models import BoundingBox, OcrLine

#: Environment variable that, if set, overrides where the binary is found.
BINARY_ENV_VAR = "LCR_OCR_BIN"

_BINARY_NAME = "lcr-ocr"
# Dev fallback: the binary built from the bundled Swift package in this repo.
_DEV_BINARY = Path(__file__).resolve().parents[2] / "ocr" / ".build" / "release" / _BINARY_NAME


class OcrBinaryNotFound(RuntimeError):
    """Signals that no ``lcr-ocr`` binary could be located."""


class OcrError(RuntimeError):
    """Signals that the ``lcr-ocr`` binary exited with an error."""


def locate_binary() -> Path:
    """Locate the ``lcr-ocr`` binary.

    Resolution order: the ``LCR_OCR_BIN`` environment variable, then ``PATH``,
    then the binary built from the bundled Swift package.

    A non-empty ``LCR_OCR_BIN`` is treated as an explicit override: if it does
    not point to a regular file, :class:`OcrBinaryNotFound` is raised rather
    than falling through to the other candidates. The same exception is raised
    when none of the candidates exists.
    """
    override = os.environ.get(BINARY_ENV_VAR)
    if override:
        path = Path(override)
        # ``is_file`` rather than ``exists``: a directory can never be run, so
        # reject it here instead of failing later inside ``subprocess``.
        if not path.is_file():
            raise OcrBinaryNotFound(f"{BINARY_ENV_VAR} points to a missing file: {path}")
        return path

    on_path = shutil.which(_BINARY_NAME)
    if on_path:
        return Path(on_path)

    # Same reasoning as above: only a regular file counts as a built binary.
    if _DEV_BINARY.is_file():
        return _DEV_BINARY

    raise OcrBinaryNotFound(
        f"could not find '{_BINARY_NAME}'. Build it with 'swift build -c release' in "
        f"the ocr/ directory, or set {BINARY_ENV_VAR} to its path."
    )


def parse_ocr_lines(payload: str) -> list[OcrLine]:
    """Decode the binary's ``--json`` output into :class:`OcrLine` objects.

    ``payload`` is read as a JSON array. Each item must provide ``text``,
    ``confidence`` and a ``boundingBox`` mapping with ``x``, ``y``, ``width``
    and ``height``; the numeric fields are converted with ``float``.
    """
    lines: list[OcrLine] = []
    for entry in json.loads(payload):
        # Fields are read in the order text -> confidence -> box.
        text = entry["text"]
        confidence = float(entry["confidence"])
        box = entry["boundingBox"]
        bounding_box = BoundingBox(
            x=float(box["x"]),
            y=float(box["y"]),
            width=float(box["width"]),
            height=float(box["height"]),
        )
        lines.append(OcrLine(text=text, confidence=confidence, bounding_box=bounding_box))
    return lines


def run_ocr(
    image_path: str | Path,
    *,
    fast: bool = False,
    languages: list[str] | None = None,
    correction: bool = True,
) -> list[OcrLine]:
    """Run the binary on an image file and return the recognized lines.

    The binary is always invoked with ``--json``. ``fast`` adds ``--fast``,
    ``correction=False`` adds ``--no-correction``, and a non-empty
    ``languages`` list is passed as a single comma-joined ``--lang`` value.

    Raises :class:`OcrError` when the process exits with a non-zero status;
    the message carries the exit code and the stripped stderr.
    """
    args = [str(locate_binary()), str(image_path), "--json"]
    if fast:
        args.append("--fast")
    if not correction:
        args.append("--no-correction")
    if languages:
        args += ["--lang", ",".join(languages)]

    # Decode explicitly as UTF-8 (the standard JSON interchange encoding)
    # instead of the locale default that ``text=True`` alone would use, so
    # non-ASCII recognized text is not corrupted under a non-UTF-8 locale.
    result = subprocess.run(
        args,
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=False,
    )
    if result.returncode != 0:
        raise OcrError(f"lcr-ocr exited with {result.returncode}: {result.stderr.strip()}")
    return parse_ocr_lines(result.stdout)


def ocr_png_text(
    png: bytes,
    *,
    fast: bool = False,
    languages: list[str] | None = None,
    correction: bool = True,
    min_confidence: float = 0.0,
) -> str:
    """OCR PNG bytes and return the recognized text, one line per row.

    The bytes are written to a temporary ``.png`` file, which is handed to
    :func:`run_ocr` by path. Lines whose confidence is below
    ``min_confidence`` are left out — handy for discarding the low-confidence
    glyphs that icons and logos tend to produce.
    """
    with tempfile.NamedTemporaryFile(suffix=".png") as image_file:
        image_file.write(png)
        # Flush so the data is written out before the path is handed to OCR.
        image_file.flush()
        recognized = run_ocr(
            image_file.name,
            fast=fast,
            languages=languages,
            correction=correction,
        )
    kept = [line.text for line in recognized if line.confidence >= min_confidence]
    return "\n".join(kept)
17 changes: 17 additions & 0 deletions src/localcontextrouter/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from __future__ import annotations

import io
from collections.abc import Iterator
from pathlib import Path

Expand Down Expand Up @@ -48,6 +49,22 @@ def page_texts(self) -> Iterator[str]:
for index in range(len(self)):
yield self.page_text(index)

def render_page_png(self, index: int, scale: float = 2.0) -> bytes:
    """Render the page at ``index`` to PNG bytes.

    ``scale`` multiplies the native size (2.0 ~= 144 DPI), trading speed for
    OCR accuracy on small text. Used to feed image-only pages to OCR.

    The page handle is closed even when rendering or encoding fails; the
    bitmap, once created, is closed before the page.
    """
    page = self._doc[index]
    try:
        # Rendering happens inside the outer ``try`` so that a failure here
        # still releases the page handle.
        bitmap = page.render(scale=scale)
        try:
            buffer = io.BytesIO()
            bitmap.to_pil().save(buffer, format="PNG")
            return buffer.getvalue()
        finally:
            bitmap.close()
    finally:
        page.close()

def close(self) -> None:
    """Close the document object held in ``self._doc``."""
    document = self._doc
    document.close()

Expand Down
29 changes: 29 additions & 0 deletions src/localcontextrouter/router.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Route each PDF page to the cheapest faithful text source.

Digital pages keep their extracted text; scanned or garbled pages are rendered
and sent to OCR. Vision routing and token accounting are added in a later phase.
"""

from __future__ import annotations

from pathlib import Path

from .classify import classify_text
from .models import PageClass, PageRoute, RouteResult, Source
from .ocr import ocr_png_text
from .pdf import Pdf


def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult:
    """Route each page of the PDF at ``path`` and collect its text and source.

    A page classified as ``PageClass.DIGITAL`` keeps its extracted text
    (``Source.TEXT``). Any other page is rendered to PNG at ``render_scale``
    and its text comes from OCR (``Source.OCR``).
    """
    routed: list[PageRoute] = []
    with Pdf(path) as pdf:
        for page_number in range(len(pdf)):
            extracted = pdf.page_text(page_number)
            verdict = classify_text(extracted)
            if verdict.page_class is not PageClass.DIGITAL:
                # Not a digital page: render it and take the text from OCR.
                rendered = pdf.render_page_png(page_number, scale=render_scale)
                route = PageRoute(page_number, verdict, Source.OCR, ocr_png_text(rendered))
            else:
                route = PageRoute(page_number, verdict, Source.TEXT, extracted)
            routed.append(route)
    return RouteResult(routed)
56 changes: 56 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Shared test fixtures: a binary guard and PDF builders."""

from collections.abc import Callable
from pathlib import Path

import pytest
from fpdf import FPDF
from PIL import Image, ImageDraw, ImageFont

from localcontextrouter.ocr import OcrBinaryNotFound, locate_binary


@pytest.fixture
def lcr_binary() -> Path:
    """Provide the lcr-ocr binary path; skip the test if it cannot be located."""
    try:
        binary = locate_binary()
    except OcrBinaryNotFound:
        # ``pytest.skip`` raises, so control never reaches the return below.
        pytest.skip("lcr-ocr binary not built")
    return binary


@pytest.fixture
def make_text_pdf(tmp_path: Path) -> Callable[..., Path]:
    """Provide a factory that writes a PDF carrying a real text layer."""

    def _build(text: str, pages: int = 1, name: str = "text.pdf") -> Path:
        # Every page gets the same ``text``; the file lands in ``tmp_path``.
        document = FPDF()
        document.set_font("Helvetica", size=12)
        for _ in range(pages):
            document.add_page()
            document.multi_cell(0, 8, text)
        destination = tmp_path / name
        document.output(str(destination))
        return destination

    return _build


@pytest.fixture
def make_image_pdf(tmp_path: Path) -> Callable[..., Path]:
    """Provide a factory that writes an image-only PDF (no text layer)."""

    def _build(text: str, name: str = "scan.pdf") -> Path:
        # Draw the text onto a white canvas and save it as a PNG.
        canvas = Image.new("RGB", (900, 220), "white")
        font = ImageFont.load_default(size=56)
        ImageDraw.Draw(canvas).text((25, 80), text, fill="black", font=font)
        picture_path = tmp_path / "scan.png"
        canvas.save(picture_path)

        # Place the PNG on a single page so the PDF contains only the image.
        document = FPDF()
        document.add_page()
        document.image(str(picture_path), x=10, y=10, w=180)
        destination = tmp_path / name
        document.output(str(destination))
        return destination

    return _build
Loading
Loading