diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 80c8ba2..c637cd8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,3 +58,22 @@ jobs: run: swift build -c release - name: Test run: swift test + + integration: + name: Integration (OCR pipeline) + runs-on: macos-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + - name: Build OCR binary + run: swift build -c release + working-directory: ocr + - name: Install + run: python -m pip install -e ".[dev]" + - name: Integration tests + run: pytest -m integration -q + env: + LCR_OCR_BIN: ${{ github.workspace }}/ocr/.build/release/lcr-ocr diff --git a/pyproject.toml b/pyproject.toml index 9a1c4be..52fc952 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ classifiers = [ dynamic = ["version"] dependencies = [ "pypdfium2>=4.30", + "pillow>=10.1", ] [project.urls] @@ -64,3 +65,6 @@ ignore_missing_imports = true [tool.pytest.ini_options] testpaths = ["tests"] addopts = "-ra" +markers = [ + "integration: tests that invoke the built lcr-ocr binary", +] diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py index 9cb89dc..e8a2f1e 100644 --- a/src/localcontextrouter/__init__.py +++ b/src/localcontextrouter/__init__.py @@ -1,18 +1,37 @@ """LocalContextRouter — cheapest faithful path for documents bound for a multimodal LLM.""" from .classify import classify_text, compute_signals -from .models import Classification, PageClass, PageSignals +from .models import ( + BoundingBox, + Classification, + OcrLine, + PageClass, + PageRoute, + PageSignals, + RouteResult, + Source, +) +from .ocr import ocr_png_text, run_ocr from .pdf import Pdf, classify_pdf +from .router import route_pdf __version__ = "0.0.0" __all__ = [ + "BoundingBox", "Classification", + "OcrLine", "PageClass", + "PageRoute", "PageSignals", "Pdf", + "RouteResult", + "Source", "classify_pdf", "classify_text", 
@dataclass(frozen=True)
class BoundingBox:
    """Rectangle in normalized coordinates with the origin at the top-left.

    Every field is a fraction in the range 0...1.
    """

    x: float
    y: float
    width: float
    height: float


@dataclass(frozen=True)
class OcrLine:
    """One line of text reported by the OCR binary, with its confidence and box."""

    text: str
    confidence: float
    bounding_box: BoundingBox


class Source(str, Enum):
    """Origin of the text that was finally chosen for a page."""

    TEXT = "text"
    """The text came straight from the embedded text layer."""

    OCR = "ocr"
    """The text came from on-device OCR of the rendered page."""


@dataclass(frozen=True)
class PageRoute:
    """Routing outcome for a single page.

    Bundles the page index, the classification that drove the decision, the
    source the text was taken from, and the text itself.
    """

    index: int
    classification: Classification
    source: Source
    text: str


@dataclass(frozen=True)
class RouteResult:
    """Routing outcome for an entire document: one :class:`PageRoute` per page."""

    pages: list[PageRoute]

    @property
    def text(self) -> str:
        """Return the text of every page, in list order, separated by blank lines."""
        page_texts = [route.text for route in self.pages]
        separator = "\n\n"
        return separator.join(page_texts)
#: Environment variable that, if set, overrides where the binary is found.
BINARY_ENV_VAR = "LCR_OCR_BIN"

_BINARY_NAME = "lcr-ocr"
# Dev fallback: the binary built from the bundled Swift package in this repo.
_DEV_BINARY = Path(__file__).resolve().parents[2] / "ocr" / ".build" / "release" / _BINARY_NAME


class OcrBinaryNotFound(RuntimeError):
    """Raised when the ``lcr-ocr`` binary cannot be located."""


class OcrError(RuntimeError):
    """Raised when the ``lcr-ocr`` binary exits with an error."""


def locate_binary() -> Path:
    """Locate the ``lcr-ocr`` binary.

    Resolution order:

    1. the path in the ``LCR_OCR_BIN`` environment variable (``~`` is expanded);
    2. the first ``lcr-ocr`` found on ``PATH``;
    3. the release build of the bundled Swift package.

    An empty ``LCR_OCR_BIN`` is treated as unset. A non-empty value is
    authoritative: if it does not name a regular file the lookup fails
    immediately instead of falling through to ``PATH``.

    Returns:
        The path of the binary to execute.

    Raises:
        OcrBinaryNotFound: If the override does not name a file, or no
            candidate exists anywhere in the resolution order.
    """
    override = os.environ.get(BINARY_ENV_VAR)
    if override:
        # Values coming from CI YAML or .env files never pass through a shell,
        # so a leading ``~`` has to be expanded here.
        path = Path(override).expanduser()
        # ``exists()`` would also accept a directory, which only fails later
        # (and far less helpfully) when the path is executed.
        if not path.is_file():
            raise OcrBinaryNotFound(f"{BINARY_ENV_VAR} points to a missing file: {path}")
        return path

    on_path = shutil.which(_BINARY_NAME)
    if on_path:
        return Path(on_path)

    if _DEV_BINARY.is_file():
        return _DEV_BINARY

    raise OcrBinaryNotFound(
        f"could not find '{_BINARY_NAME}'. Build it with 'swift build -c release' in "
        f"the ocr/ directory, or set {BINARY_ENV_VAR} to its path."
    )


def _bounding_box_from(raw: dict) -> BoundingBox:
    """Build a :class:`BoundingBox` from one ``boundingBox`` JSON object."""
    return BoundingBox(
        x=float(raw["x"]),
        y=float(raw["y"]),
        width=float(raw["width"]),
        height=float(raw["height"]),
    )


def parse_ocr_lines(payload: str) -> list[OcrLine]:
    """Parse the binary's ``--json`` output into :class:`OcrLine` objects.

    ``payload`` must be a JSON array whose entries carry ``text``,
    ``confidence`` and ``boundingBox`` (with ``x``, ``y``, ``width`` and
    ``height``). Numeric fields are coerced with ``float``.

    Returns:
        One :class:`OcrLine` per array entry, in payload order.

    Raises:
        json.JSONDecodeError: If ``payload`` is not valid JSON.
        KeyError: If an entry lacks one of the keys listed above.
    """
    return [
        OcrLine(
            text=item["text"],
            confidence=float(item["confidence"]),
            bounding_box=_bounding_box_from(item["boundingBox"]),
        )
        for item in json.loads(payload)
    ]


def run_ocr(
    image_path: str | Path,
    *,
    fast: bool = False,
    languages: list[str] | None = None,
    correction: bool = True,
) -> list[OcrLine]:
    """Run the binary on an image file and return the recognized lines.

    Args:
        image_path: Image file passed to the binary as its first argument.
        fast: When true, ``--fast`` is appended to the command line.
        languages: When non-empty, passed as ``--lang`` joined with commas.
        correction: When false, ``--no-correction`` is appended.

    Returns:
        The lines parsed from the binary's standard output.

    Raises:
        OcrBinaryNotFound: If the binary cannot be located.
        OcrError: If the binary exits with a non-zero status; the message
            carries the exit status and the stripped standard error.
    """
    args = [str(locate_binary()), str(image_path), "--json"]
    if fast:
        args.append("--fast")
    if not correction:
        args.append("--no-correction")
    if languages:
        args += ["--lang", ",".join(languages)]

    # Decode explicitly as UTF-8 (assumes the binary emits UTF-8 JSON, the
    # usual encoding for JSON) rather than whatever the locale prefers, so
    # non-ASCII text survives under a C/ASCII locale.
    result = subprocess.run(
        args, capture_output=True, text=True, encoding="utf-8", check=False
    )
    if result.returncode != 0:
        raise OcrError(f"lcr-ocr exited with {result.returncode}: {result.stderr.strip()}")
    return parse_ocr_lines(result.stdout)
def render_page_png(self, index: int, scale: float = 2.0) -> bytes:
    """Render the page at ``index`` to PNG bytes.

    ``scale`` multiplies the native size (2.0 ~= 144 DPI), trading speed for
    OCR accuracy on small text. Used to feed image-only pages to OCR.

    Args:
        index: Position of the page in the underlying document.
        scale: Render scale forwarded to the page's ``render`` call.

    Returns:
        The rendered page encoded as PNG.
    """
    page = self._doc[index]
    try:
        bitmap = page.render(scale=scale)
        try:
            buffer = io.BytesIO()
            bitmap.to_pil().save(buffer, format="PNG")
            return buffer.getvalue()
        finally:
            bitmap.close()
    finally:
        # Release the page handle on every path, including when rendering
        # itself fails and no bitmap was ever created.
        page.close()
def route_pdf(path: str | Path, *, render_scale: float = 2.0) -> RouteResult:
    """Classify each page of the PDF at ``path`` and collect its text.

    A page classified as ``PageClass.DIGITAL`` keeps the text read from the
    PDF and is tagged ``Source.TEXT``. Every other page is rendered to PNG at
    ``render_scale``, passed to OCR, and tagged ``Source.OCR``.

    Returns:
        A :class:`RouteResult` holding one :class:`PageRoute` per page, in
        page order.
    """
    routed: list[PageRoute] = []
    with Pdf(path) as document:
        page_count = len(document)
        for page_number in range(page_count):
            embedded = document.page_text(page_number)
            verdict = classify_text(embedded)
            if verdict.page_class is not PageClass.DIGITAL:
                rendered = document.render_page_png(page_number, scale=render_scale)
                source, content = Source.OCR, ocr_png_text(rendered)
            else:
                source, content = Source.TEXT, embedded
            routed.append(PageRoute(page_number, verdict, source, content))
    return RouteResult(routed)
SAMPLE_JSON = (
    '[{"text":"Hello","confidence":0.9,"boundingBox":{"x":0.1,"y":0.2,"width":0.3,"height":0.4}}]'
)


def test_parse_ocr_lines() -> None:
    """A one-entry payload yields exactly one line carrying the payload's values."""
    parsed = parse_ocr_lines(SAMPLE_JSON)
    assert len(parsed) == 1
    only = parsed[0]
    box = only.bounding_box
    assert only.text == "Hello"
    assert only.confidence == pytest.approx(0.9)
    assert box.x == pytest.approx(0.1)
    assert box.height == pytest.approx(0.4)


def test_parse_empty_payload() -> None:
    """An empty JSON array parses to an empty list."""
    parsed = parse_ocr_lines("[]")
    assert parsed == []


def test_locate_binary_env_pointing_at_missing_file(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """An override naming a path that does not exist is reported as not found."""
    missing = tmp_path / "nope"
    monkeypatch.setenv(BINARY_ENV_VAR, str(missing))
    with pytest.raises(OcrBinaryNotFound):
        locate_binary()


def test_locate_binary_prefers_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """An override naming an existing file is returned as-is."""
    stand_in = tmp_path / "lcr-ocr"
    stand_in.write_text("#!/bin/sh\n")
    monkeypatch.setenv(BINARY_ENV_VAR, str(stand_in))
    located = locate_binary()
    assert located == stand_in
PROSE = (
    "The quarterly report summarizes revenue, expenses, and net income for the "
    "period ending in March. All figures are stated in thousands of US dollars."
)


def test_routes_digital_page_to_text(make_text_pdf: Callable[..., Path]) -> None:
    """A PDF with a real text layer is routed to ``Source.TEXT`` with its text kept."""
    pdf_path = make_text_pdf(PROSE)
    routed = route_pdf(pdf_path)
    assert len(routed.pages) == 1
    only_page = routed.pages[0]
    assert only_page.source is Source.TEXT
    assert "revenue" in only_page.text
    assert "revenue" in routed.text


@pytest.mark.integration
def test_routes_scanned_page_to_ocr(lcr_binary: Path, make_image_pdf: Callable[..., Path]) -> None:
    """An image-only PDF is routed to ``Source.OCR`` and the drawn word is recovered."""
    pdf_path = make_image_pdf("SCANNED INVOICE 2026")
    routed = route_pdf(pdf_path)
    assert len(routed.pages) == 1
    only_page = routed.pages[0]
    assert only_page.source is Source.OCR
    recognized = only_page.text.upper()
    assert "INVOICE" in recognized