sid732 · sid732 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -58,3 +58,22 @@ jobs:
         run: swift build -c release
       - name: Test
         run: swift test
+
+  integration:
+    name: Integration (OCR pipeline)
+    runs-on: macos-latest  # lcr-ocr does its recognition with Apple Vision, so build and run on macOS
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+      - name: Build OCR binary
+        run: swift build -c release  # expected output: ocr/.build/release/lcr-ocr
+        working-directory: ocr
+      - name: Install
+        run: python -m pip install -e ".[dev]"
+      - name: Integration tests
+        run: pytest -m integration -q  # only the tests carrying the "integration" marker
+        env:  # point locate_binary() at the binary built above instead of relying on PATH
+          LCR_OCR_BIN: ${{ github.workspace }}/ocr/.build/release/lcr-ocr
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ classifiers = [
 dynamic = ["version"]
 dependencies = [
     "pypdfium2>=4.30",
+    "pillow>=10.1",
 ]
 
 [project.urls]
@@ -64,3 +65,6 @@ ignore_missing_imports = true
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-ra"
+markers = [
+    "integration: tests that invoke the built lcr-ocr binary",
+]
diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py
@@ -1,18 +1,37 @@
 """LocalContextRouter — cheapest faithful path for documents bound for a multimodal LLM."""
 
 from .classify import classify_text, compute_signals
-from .models import Classification, PageClass, PageSignals
+from .models import (
+    BoundingBox,
+    Classification,
+    OcrLine,
+    PageClass,
+    PageRoute,
+    PageSignals,
+    RouteResult,
+    Source,
+)
+from .ocr import ocr_png_text, run_ocr
 from .pdf import Pdf, classify_pdf
+from .router import route_pdf
 
 __version__ = "0.0.0"
 
 __all__ = [
+    "BoundingBox",
     "Classification",
+    "OcrLine",
     "PageClass",
+    "PageRoute",
     "PageSignals",
     "Pdf",
+    "RouteResult",
+    "Source",
     "classify_pdf",
     "classify_text",
     "compute_signals",
+    "ocr_png_text",
+    "route_pdf",
+    "run_ocr",
     "__version__",
 ]
diff --git a/src/localcontextrouter/models.py b/src/localcontextrouter/models.py
@@ -43,3 +43,54 @@ class Classification:
     page_class: PageClass
     signals: PageSignals
     reason: str
+
+
+@dataclass(frozen=True)
+class BoundingBox:
+    """Immutable normalized box with a top-left origin; all values are fractions in 0...1."""
+
+    x: float  # NOTE(review): presumably the left edge of the box -- confirm against lcr-ocr
+    y: float  # NOTE(review): presumably the top edge, given the top-left origin -- confirm
+    width: float
+    height: float
+
+
+@dataclass(frozen=True)
+class OcrLine:
+    """One recognized line, as built by ``parse_ocr_lines`` from the binary's JSON output."""
+
+    text: str
+    confidence: float  # NOTE(review): presumably a 0...1 score -- confirm against lcr-ocr
+    bounding_box: BoundingBox
+
+
+class Source(str, Enum):
+    """Origin of a page's final text; members are also ``str`` and equal their values."""
+
+    TEXT = "text"
+    """Extracted directly from the embedded text layer."""
+
+    OCR = "ocr"
+    """Produced by on-device OCR after rendering the page."""
+
+
+@dataclass(frozen=True)
+class PageRoute:
+    """The routing outcome for one page: its classification, source, and text."""
+
+    index: int  # position of the page within the document, starting at 0
+    classification: Classification
+    source: Source
+    text: str  # the embedded text for Source.TEXT, the OCR output for Source.OCR
+
+
+@dataclass(frozen=True)
+class RouteResult:
+    """The routing outcome for a whole document."""
+
+    pages: list[PageRoute]  # NOTE(review): frozen blocks reassignment, not edits to the list
+
+    @property
+    def text(self) -> str:
+        """The text of every page, in list order, separated by blank lines."""
+        return "\n\n".join(page.text for page in self.pages)
diff --git a/src/localcontextrouter/ocr.py b/src/localcontextrouter/ocr.py
@@ -0,0 +1,116 @@
+"""Bridge to the on-device ``lcr-ocr`` binary.
+
+The Swift binary does the actual recognition (Apple Vision); this module finds
+it, invokes it, and parses its JSON output into :class:`~.models.OcrLine`.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+from .models import BoundingBox, OcrLine
+
+#: Environment variable that, if set, overrides where the binary is found.
+BINARY_ENV_VAR = "LCR_OCR_BIN"
+
+_BINARY_NAME = "lcr-ocr"
+# Dev fallback: the binary built from the bundled Swift package in this repo.
+_DEV_BINARY = Path(__file__).resolve().parents[2] / "ocr" / ".build" / "release" / _BINARY_NAME
+
+
+class OcrBinaryNotFound(RuntimeError):
+    """Raised by :func:`locate_binary` when no usable ``lcr-ocr`` binary is found."""
+
+
+class OcrError(RuntimeError):
+    """Raised by :func:`run_ocr` when ``lcr-ocr`` exits with a non-zero status."""
+
+
+def locate_binary() -> Path:
+    """Locate the ``lcr-ocr`` binary.
+
+    Resolution order: the ``LCR_OCR_BIN`` environment variable, then ``PATH``,
+    then the binary built from the bundled Swift package. Raises
+    :class:`OcrBinaryNotFound` if the override is not a file or nothing is found.
+    """
+    override = os.environ.get(BINARY_ENV_VAR)
+    if override:
+        path = Path(override)
+        # A directory also "exists" but can never be executed, so insist on a file.
+        if not path.is_file():
+            raise OcrBinaryNotFound(f"{BINARY_ENV_VAR} points to a missing file: {path}")
+        return path
+
+    on_path = shutil.which(_BINARY_NAME)
+    if on_path:
+        return Path(on_path)
+    if _DEV_BINARY.is_file():
+        return _DEV_BINARY
+    raise OcrBinaryNotFound(
+        f"could not find '{_BINARY_NAME}'. Build it with 'swift build -c release' in "
+        f"the ocr/ directory, or set {BINARY_ENV_VAR} to its path."
+    )
+
+
+def parse_ocr_lines(payload: str) -> list[OcrLine]:
+    """Turn the JSON printed by ``lcr-ocr --json`` into :class:`OcrLine` objects."""
+    lines: list[OcrLine] = []
+    for entry in json.loads(payload):
+        # Each entry carries "text", "confidence" and a nested "boundingBox" object.
+        text, confidence = entry["text"], float(entry["confidence"])
+        box = entry["boundingBox"]
+        bounds = BoundingBox(
+            x=float(box["x"]),
+            y=float(box["y"]),
+            width=float(box["width"]),
+            height=float(box["height"]),
+        )
+        lines.append(OcrLine(text=text, confidence=confidence, bounding_box=bounds))
+    return lines
+
+
+def run_ocr(
+    image_path: str | Path,
+    *,
+    fast: bool = False,
+    languages: list[str] | None = None,
+    correction: bool = True,
+) -> list[OcrLine]:
+    """Run ``lcr-ocr`` on an image and return its lines; raise :class:`OcrError` if it fails."""
+    args = [str(locate_binary()), str(image_path), "--json"]
+    if fast:
+        args.append("--fast")
+    if not correction:
+        args.append("--no-correction")
+    if languages:
+        args += ["--lang", ",".join(languages)]
+    # Decode as UTF-8 explicitly: ``text=True`` would use the locale's encoding instead.
+    result = subprocess.run(args, capture_output=True, encoding="utf-8", check=False)
+    if result.returncode != 0:
+        raise OcrError(f"lcr-ocr exited with {result.returncode}: {result.stderr.strip()}")
+    return parse_ocr_lines(result.stdout)
+
+
+def ocr_png_text(
+    png: bytes,
+    *,
+    fast: bool = False,
+    languages: list[str] | None = None,
+    correction: bool = True,
+    min_confidence: float = 0.0,
+) -> str:
+    """OCR PNG bytes and return the recognized lines joined by newlines.
+
+    Lines under ``min_confidence`` are dropped, which filters icon and logo noise.
+    """
+    with tempfile.NamedTemporaryFile(suffix=".png") as scratch:
+        scratch.write(png)
+        scratch.flush()
+        recognized = run_ocr(scratch.name, fast=fast, languages=languages, correction=correction)
+    kept = [entry.text for entry in recognized if entry.confidence >= min_confidence]
+    return "\n".join(kept)
diff --git a/src/localcontextrouter/pdf.py b/src/localcontextrouter/pdf.py
@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+import io
 from collections.abc import Iterator
 from pathlib import Path
 
@@ -48,6 +49,22 @@ def page_texts(self) -> Iterator[str]:
         for index in range(len(self)):
             yield self.page_text(index)
 
+    def render_page_png(self, index: int, scale: float = 2.0) -> bytes:
+        """Render the page at ``index`` to PNG bytes, e.g. to feed an image-only page to OCR.
+
+        ``scale`` multiplies the native size (2.0 ~= 144 DPI), trading speed for OCR accuracy.
+        """
+        page, buffer = self._doc[index], io.BytesIO()
+        try:  # entered before rendering so the page is closed even if ``render`` raises
+            bitmap = page.render(scale=scale)
+            try:
+                bitmap.to_pil().save(buffer, format="PNG")
+                return buffer.getvalue()
+            finally:
+                bitmap.close()
+        finally:
+            page.close()
+
     def close(self) -> None:
         self._doc.close()
 

diff --git a/src/localcontextrouter/router.py b/src/localcontextrouter/router.py
@@ -0,0 +1,29 @@
+"""Route each PDF page to the cheapest faithful text source.
+
+Digital pages keep their extracted text; scanned or garbled pages are rendered
+and sent to OCR. Vision routing and token accounting are added in a later phase.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from .classify import classify_text
+from .models import PageClass, PageRoute, RouteResult, Source
+from .ocr import ocr_png_text
+from .pdf import Pdf
+
+
+def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult:
+    """Route each page of a PDF: keep digital text, OCR a render of everything else."""
+    routed: list[PageRoute] = []
+    with Pdf(path) as pdf:
+        for index in range(len(pdf)):
+            text = pdf.page_text(index)
+            classification = classify_text(text)
+            source = Source.TEXT
+            if classification.page_class is not PageClass.DIGITAL:
+                source = Source.OCR
+                text = ocr_png_text(pdf.render_page_png(index, scale=render_scale))
+            routed.append(PageRoute(index, classification, source, text))
+    return RouteResult(routed)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,56 @@
+"""Shared test fixtures: a binary guard and PDF builders."""
+
+from collections.abc import Callable
+from pathlib import Path
+
+import pytest
+from fpdf import FPDF
+from PIL import Image, ImageDraw, ImageFont
+
+from localcontextrouter.ocr import OcrBinaryNotFound, locate_binary
+
+
+@pytest.fixture
+def lcr_binary() -> Path:
+    """Path found by ``locate_binary``; skips the requesting test if no binary is found."""
+    try:
+        return locate_binary()
+    except OcrBinaryNotFound:  # NOTE(review): also raised for a bad LCR_OCR_BIN, so CI would skip
+        pytest.skip("lcr-ocr binary not built")
+
+
+@pytest.fixture
+def make_text_pdf(tmp_path: Path) -> Callable[..., Path]:
+    """Provide a builder for PDFs whose pages carry a genuine text layer."""
+
+    def _build(text: str, pages: int = 1, name: str = "text.pdf") -> Path:
+        document = FPDF()
+        document.set_font("Helvetica", size=12)
+        for _ in range(pages):
+            document.add_page()
+            document.multi_cell(0, 8, text)
+        target = tmp_path / name
+        document.output(str(target))
+        return target
+
+    return _build
+
+
+@pytest.fixture
+def make_image_pdf(tmp_path: Path) -> Callable[..., Path]:
+    """Provide a builder for scan-like PDFs: the text exists only as pixels in an image."""
+
+    def _build(text: str, name: str = "scan.pdf") -> Path:
+        canvas = Image.new("RGB", (900, 220), "white")
+        font = ImageFont.load_default(size=56)
+        ImageDraw.Draw(canvas).text((25, 80), text, fill="black", font=font)
+        png_path = tmp_path / "scan.png"
+        canvas.save(png_path)
+        document = FPDF()
+        document.add_page()
+        document.image(str(png_path), x=10, y=10, w=180)
+        target = tmp_path / name
+        document.output(str(target))
+        return target
+
+    return _build