From 2fe6c8804621cdda85d6745473fe33d8689785b3 Mon Sep 17 00:00:00 2001
From: Siddharth Nashikkar
Date: Thu, 18 Jun 2026 12:12:11 -0400
Subject: [PATCH 1/3] build(deps): add pypdfium2 for PDF text extraction

Use pypdfium2 (permissively licensed, ships its own native library) so
there is no system poppler dependency. Add fpdf2 as a dev dependency for
test fixtures and a mypy override for the stub-less binding.
---
 pyproject.toml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7fcffd8..9a1c4be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,9 @@ classifiers = [
     "Topic :: Text Processing",
 ]
 dynamic = ["version"]
-dependencies = []
+dependencies = [
+    "pypdfium2>=4.30",
+]
 
 [project.urls]
 Homepage = "https://github.com/sid732/LocalContextRouter"
@@ -33,6 +35,7 @@ dev = [
     "pytest-cov>=5.0",
     "ruff>=0.6",
     "mypy>=1.11",
+    "fpdf2>=2.7",
 ]
 
 [tool.hatch.version]
@@ -54,6 +57,10 @@ python_version = "3.10"
 strict = true
 files = ["src"]
 
+[[tool.mypy.overrides]]
+module = ["pypdfium2.*"]
+ignore_missing_imports = true
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-ra"
From e48c0d68a45171d40238f149a227e7f7b73eb055 Mon Sep 17 00:00:00 2001
From: Siddharth Nashikkar
Date: Thu, 18 Jun 2026 12:12:11 -0400
Subject: [PATCH 2/3] feat(core): extract and classify PDF page text

Add the Pdf handle (context-managed pypdfium2 document) and
classify_pdf, which pulls each page's embedded text layer and runs it
through the page classifier.
---
 src/localcontextrouter/__init__.py |  3 ++
 src/localcontextrouter/pdf.py      | 67 ++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 src/localcontextrouter/pdf.py

diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py
index 3b024a8..9cb89dc 100644
--- a/src/localcontextrouter/__init__.py
+++ b/src/localcontextrouter/__init__.py
@@ -2,6 +2,7 @@
 
 from .classify import classify_text, compute_signals
 from .models import Classification, PageClass, PageSignals
+from .pdf import Pdf, classify_pdf
 
 __version__ = "0.0.0"
 
@@ -9,6 +10,8 @@
     "Classification",
     "PageClass",
     "PageSignals",
+    "Pdf",
+    "classify_pdf",
     "classify_text",
     "compute_signals",
     "__version__",
diff --git a/src/localcontextrouter/pdf.py b/src/localcontextrouter/pdf.py
new file mode 100644
index 0000000..4f4750c
--- /dev/null
+++ b/src/localcontextrouter/pdf.py
@@ -0,0 +1,67 @@
+"""Read PDFs and pull each page's embedded text layer using pypdfium2.
+
+pypdfium2 is a permissively licensed binding to PDFium that ships its own native
+library, so there is no system dependency (no poppler) to install alongside the
+package.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from pathlib import Path
+
+import pypdfium2 as pdfium
+
+from .classify import classify_text
+from .models import Classification
+
+
+class Pdf:
+    """A read-only handle over a PDF document.
+
+    Use as a context manager so the native document is always released::
+
+        with Pdf(path) as pdf:
+            for text in pdf.page_texts():
+                ...
+    """
+
+    def __init__(self, path: str | Path) -> None:
+        self.path = Path(path)
+        self._doc = pdfium.PdfDocument(str(self.path))
+
+    def __len__(self) -> int:
+        return len(self._doc)
+
+    def page_text(self, index: int) -> str:
+        """Return the embedded text of the page at ``index``."""
+        page = self._doc[index]
+        try:
+            # Nested so the page is released even if get_textpage() raises.
+            textpage = page.get_textpage()
+            try:
+                return str(textpage.get_text_bounded())
+            finally:
+                textpage.close()
+        finally:
+            page.close()
+
+    def page_texts(self) -> Iterator[str]:
+        """Yield the embedded text of every page in order."""
+        for index in range(len(self)):
+            yield self.page_text(index)
+
+    def close(self) -> None:
+        self._doc.close()
+
+    def __enter__(self) -> Pdf:
+        return self
+
+    def __exit__(self, *exc: object) -> None:
+        self.close()
+
+
+def classify_pdf(path: str | Path) -> list[Classification]:
+    """Classify every page of a PDF from its extracted text layer."""
+    with Pdf(path) as pdf:
+        return [classify_text(text) for text in pdf.page_texts()]
From 9f5a5ed62ee0b5ea072e89c1a5e721ceb04d8cb3 Mon Sep 17 00:00:00 2001
From: Siddharth Nashikkar
Date: Thu, 18 Jun 2026 12:12:11 -0400
Subject: [PATCH 3/3] test(core): cover PDF extraction and classification

Build PDFs with fpdf2 and assert text extraction, page iteration,
digital text pages, and a blank page classified as scanned.
---
 tests/test_pdf.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 tests/test_pdf.py

diff --git a/tests/test_pdf.py b/tests/test_pdf.py
new file mode 100644
index 0000000..5a34e5c
--- /dev/null
+++ b/tests/test_pdf.py
@@ -0,0 +1,57 @@
+"""Tests for PDF text extraction and per-page classification."""
+
+from pathlib import Path
+
+from fpdf import FPDF
+
+from localcontextrouter import PageClass
+from localcontextrouter.pdf import Pdf, classify_pdf
+
+PROSE = (
+    "The quarterly report summarizes revenue, expenses, and net income for the "
+    "period ending in March. All figures are stated in thousands of US dollars."
+)
+
+
+def _text_pdf(path: Path, text: str, pages: int = 1) -> Path:
+    pdf = FPDF()
+    pdf.set_font("Helvetica", size=12)
+    for _ in range(pages):
+        pdf.add_page()
+        pdf.multi_cell(0, 8, text)
+    pdf.output(str(path))
+    return path
+
+
+def _blank_pdf(path: Path, pages: int = 1) -> Path:
+    pdf = FPDF()
+    for _ in range(pages):
+        pdf.add_page()
+    pdf.output(str(path))
+    return path
+
+
+def test_extracts_text(tmp_path: Path) -> None:
+    pdf_path = _text_pdf(tmp_path / "report.pdf", PROSE)
+    with Pdf(pdf_path) as pdf:
+        assert len(pdf) == 1
+        assert "revenue" in pdf.page_text(0)
+
+
+def test_page_texts_iterates_every_page(tmp_path: Path) -> None:
+    pdf_path = _text_pdf(tmp_path / "multi.pdf", PROSE, pages=3)
+    with Pdf(pdf_path) as pdf:
+        assert len(list(pdf.page_texts())) == 3
+
+
+def test_classify_pdf_marks_text_pages_digital(tmp_path: Path) -> None:
+    pdf_path = _text_pdf(tmp_path / "report.pdf", PROSE, pages=2)
+    results = classify_pdf(pdf_path)
+    assert len(results) == 2
+    assert all(result.page_class is PageClass.DIGITAL for result in results)
+
+
+def test_classify_pdf_marks_blank_page_scanned(tmp_path: Path) -> None:
+    pdf_path = _blank_pdf(tmp_path / "blank.pdf")
+    [result] = classify_pdf(pdf_path)
+    assert result.page_class is PageClass.SCANNED