sid732 · sid732 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,9 @@ classifiers = [
     "Topic :: Text Processing",
 ]
 dynamic = ["version"]
-dependencies = []
+dependencies = [
+    "pypdfium2>=4.30",
+]
 
 [project.urls]
 Homepage = "https://github.com/sid732/LocalContextRouter"
@@ -33,6 +35,7 @@ dev = [
     "pytest-cov>=5.0",
     "ruff>=0.6",
     "mypy>=1.11",
+    "fpdf2>=2.7",
 ]
 
 [tool.hatch.version]
@@ -54,6 +57,10 @@ python_version = "3.10"
 strict = true
 files = ["src"]
 
+[[tool.mypy.overrides]]
+module = ["pypdfium2.*"]
+ignore_missing_imports = true
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-ra"
diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py
@@ -2,13 +2,16 @@
 
 from .classify import classify_text, compute_signals
 from .models import Classification, PageClass, PageSignals
+from .pdf import Pdf, classify_pdf
 
 __version__ = "0.0.0"
 
 __all__ = [
     "Classification",
     "PageClass",
     "PageSignals",
+    "Pdf",
+    "classify_pdf",
     "classify_text",
     "compute_signals",
     "__version__",

diff --git a/src/localcontextrouter/pdf.py b/src/localcontextrouter/pdf.py
@@ -0,0 +1,87 @@
+"""Read PDFs and pull each page's embedded text layer using pypdfium2.
+
+pypdfium2 is a permissively licensed binding to PDFium that ships its own native
+library, so there is no system dependency (no poppler) to install alongside the
+package.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from pathlib import Path
+
+import pypdfium2 as pdfium
+
+from .classify import classify_text
+from .models import Classification
+
+
+class Pdf:
+    """A read-only handle over a PDF document.
+
+    Use as a context manager so the native document is always released::
+
+        with Pdf(path) as pdf:
+            for text in pdf.page_texts():
+                ...
+    """
+
+    def __init__(self, path: str | Path) -> None:
+        """Open the PDF at ``path`` with pypdfium2.
+
+        The document stays open until :meth:`close` is called or the
+        ``with`` block exits.
+        """
+        self.path = Path(path)
+        self._doc = pdfium.PdfDocument(str(self.path))
+
+    def __len__(self) -> int:
+        """Return the number of pages in the document."""
+        return len(self._doc)
+
+    def page_text(self, index: int) -> str:
+        """Return the embedded text of the page at ``index``.
+
+        The page and its text page are closed before returning, including
+        when extraction raises.
+        """
+        page = self._doc[index]
+        # The outer ``finally`` guards the page on its own: if building the
+        # text page fails, the page handle must still be released.
+        try:
+            textpage = page.get_textpage()
+            try:
+                return str(textpage.get_text_bounded())
+            finally:
+                textpage.close()
+        finally:
+            page.close()
+
+    def page_texts(self) -> Iterator[str]:
+        """Yield the embedded text of every page in order."""
+        for index in range(len(self)):
+            yield self.page_text(index)
+
+    def close(self) -> None:
+        """Close the underlying pypdfium2 document."""
+        self._doc.close()
+
+    def __enter__(self) -> Pdf:
+        """Return this handle for use in a ``with`` statement."""
+        return self
+
+    def __exit__(self, *exc: object) -> None:
+        """Close the document on leaving the ``with`` block.
+
+        Returns ``None``, so an exception raised in the block propagates.
+        """
+        self.close()
+
+
+def classify_pdf(path: str | Path) -> list[Classification]:
+    """Classify every page of a PDF from its extracted text layer.
+
+    Returns one ``Classification`` per page, in page order.
+    """
+    with Pdf(path) as pdf:
+        return [classify_text(text) for text in pdf.page_texts()]
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
@@ -0,0 +1,66 @@
+"""Tests for PDF text extraction and per-page classification."""
+
+from pathlib import Path
+
+from fpdf import FPDF
+
+from localcontextrouter import PageClass
+from localcontextrouter.pdf import Pdf, classify_pdf
+
+PROSE = (
+    "The quarterly report summarizes revenue, expenses, and net income for the "
+    "period ending in March. All figures are stated in thousands of US dollars."
+)
+
+
+def _text_pdf(path: Path, text: str, pages: int = 1) -> Path:
+    """Write ``pages`` pages that each carry ``text`` to ``path`` and return it."""
+    document = FPDF()
+    document.set_font("Helvetica", size=12)
+    for _page in range(pages):
+        document.add_page()
+        document.multi_cell(0, 8, text)
+    document.output(str(path))
+    return path
+
+
+def _blank_pdf(path: Path, pages: int = 1) -> Path:
+    """Write ``pages`` pages with no content to ``path`` and return it."""
+    document = FPDF()
+    for _page in range(pages):
+        document.add_page()
+    document.output(str(path))
+    return path
+
+
+def test_extracts_text(tmp_path: Path) -> None:
+    """A one-page text PDF reports one page whose text contains the prose."""
+    source = _text_pdf(tmp_path / "report.pdf", PROSE)
+    with Pdf(source) as handle:
+        assert len(handle) == 1
+        first_page = handle.page_text(0)
+        assert "revenue" in first_page
+
+
+def test_page_texts_iterates_every_page(tmp_path: Path) -> None:
+    """``page_texts`` yields one string per page."""
+    source = _text_pdf(tmp_path / "multi.pdf", PROSE, pages=3)
+    with Pdf(source) as handle:
+        texts = list(handle.page_texts())
+        assert len(texts) == 3
+
+
+def test_classify_pdf_marks_text_pages_digital(tmp_path: Path) -> None:
+    """Every page that carries prose is classified as digital."""
+    source = _text_pdf(tmp_path / "report.pdf", PROSE, pages=2)
+    outcomes = classify_pdf(source)
+    assert len(outcomes) == 2
+    for outcome in outcomes:
+        assert outcome.page_class is PageClass.DIGITAL
+
+
+def test_classify_pdf_marks_blank_page_scanned(tmp_path: Path) -> None:
+    """A page without any text is classified as scanned."""
+    source = _blank_pdf(tmp_path / "blank.pdf")
+    (only,) = classify_pdf(source)
+    assert only.page_class is PageClass.SCANNED