From 2fe6c8804621cdda85d6745473fe33d8689785b3 Mon Sep 17 00:00:00 2001
From: Siddharth Nashikkar <siddharth.nashikkar@yahoo.com>
Date: Thu, 18 Jun 2026 12:12:11 -0400
Subject: [PATCH 1/3] build(deps): add pypdfium2 for PDF text extraction

Use pypdfium2 (permissively licensed, ships its own native library) so
there is no system poppler dependency. Add fpdf2 as a dev dependency for
test fixtures and a mypy override for the stub-less binding.
---
 pyproject.toml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7fcffd8..9a1c4be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,9 @@ classifiers = [
     "Topic :: Text Processing",
 ]
 dynamic = ["version"]
-dependencies = []
+dependencies = [
+    "pypdfium2>=4.30",
+]
 
 [project.urls]
 Homepage = "https://github.com/sid732/LocalContextRouter"
@@ -33,6 +35,7 @@ dev = [
     "pytest-cov>=5.0",
     "ruff>=0.6",
     "mypy>=1.11",
+    "fpdf2>=2.7",
 ]
 
 [tool.hatch.version]
@@ -54,6 +57,10 @@ python_version = "3.10"
 strict = true
 files = ["src"]
 
+[[tool.mypy.overrides]]
+module = ["pypdfium2.*"]
+ignore_missing_imports = true
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-ra"

From e48c0d68a45171d40238f149a227e7f7b73eb055 Mon Sep 17 00:00:00 2001
From: Siddharth Nashikkar <siddharth.nashikkar@yahoo.com>
Date: Thu, 18 Jun 2026 12:12:11 -0400
Subject: [PATCH 2/3] feat(core): extract and classify PDF page text

Add the Pdf handle (context-managed pypdfium2 document) and classify_pdf,
which pulls each page's embedded text layer and runs it through the page
classifier.
---
 src/localcontextrouter/__init__.py |  3 ++
 src/localcontextrouter/pdf.py      | 81 ++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 src/localcontextrouter/pdf.py

diff --git a/src/localcontextrouter/__init__.py b/src/localcontextrouter/__init__.py
index 3b024a8..9cb89dc 100644
--- a/src/localcontextrouter/__init__.py
+++ b/src/localcontextrouter/__init__.py
@@ -2,6 +2,7 @@
 
 from .classify import classify_text, compute_signals
 from .models import Classification, PageClass, PageSignals
+from .pdf import Pdf, classify_pdf
 
 __version__ = "0.0.0"
 
@@ -9,6 +10,8 @@
     "Classification",
     "PageClass",
     "PageSignals",
+    "Pdf",
+    "classify_pdf",
     "classify_text",
     "compute_signals",
     "__version__",
diff --git a/src/localcontextrouter/pdf.py b/src/localcontextrouter/pdf.py
new file mode 100644
index 0000000..4f4750c
--- /dev/null
+++ b/src/localcontextrouter/pdf.py
@@ -0,0 +1,81 @@
+"""Read PDFs and pull each page's embedded text layer using pypdfium2.
+
+pypdfium2 is a permissively licensed binding to PDFium that ships its own native
+library, so there is no system dependency (no poppler) to install alongside the
+package.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from pathlib import Path
+
+import pypdfium2 as pdfium
+
+from .classify import classify_text
+from .models import Classification
+
+
+class Pdf:
+    """A read-only handle over a PDF document.
+
+    The document is opened with pypdfium2 when the handle is constructed and
+    stays open until :meth:`close` is called. Use as a context manager so the
+    native document is always released::
+
+        with Pdf(path) as pdf:
+            for text in pdf.page_texts():
+                ...
+    """
+
+    def __init__(self, path: str | Path) -> None:
+        """Open the PDF at ``path``."""
+        self.path = Path(path)
+        self._doc = pdfium.PdfDocument(str(self.path))
+
+    def __len__(self) -> int:
+        """Return the number of pages in the document."""
+        return len(self._doc)
+
+    def page_text(self, index: int) -> str:
+        """Return the embedded text of the page at ``index``."""
+        page = self._doc[index]
+        try:
+            # Load the text page inside the ``try`` so the page handle is still
+            # closed if loading the text layer fails.
+            textpage = page.get_textpage()
+            try:
+                # NOTE(review): the ``str()`` call presumably exists because mypy
+                # treats the stub-less binding's return value as ``Any``.
+                return str(textpage.get_text_bounded())
+            finally:
+                textpage.close()
+        finally:
+            page.close()
+
+    def page_texts(self) -> Iterator[str]:
+        """Yield the embedded text of every page in order."""
+        for index in range(len(self)):
+            yield self.page_text(index)
+
+    def close(self) -> None:
+        """Close the underlying pypdfium2 document."""
+        self._doc.close()
+
+    def __enter__(self) -> Pdf:
+        """Return the handle itself for use in a ``with`` block."""
+        return self
+
+    def __exit__(self, *exc: object) -> None:
+        """Close the document; exceptions from the block are not suppressed."""
+        self.close()
+
+
+def classify_pdf(path: str | Path) -> list[Classification]:
+    """Classify every page of a PDF from its extracted text layer.
+
+    Returns one :class:`Classification` per page, in page order. The document
+    is closed before returning.
+    """
+    with Pdf(path) as pdf:
+        return [classify_text(text) for text in pdf.page_texts()]

From 9f5a5ed62ee0b5ea072e89c1a5e721ceb04d8cb3 Mon Sep 17 00:00:00 2001
From: Siddharth Nashikkar <siddharth.nashikkar@yahoo.com>
Date: Thu, 18 Jun 2026 12:12:11 -0400
Subject: [PATCH 3/3] test(core): cover PDF extraction and classification

Build PDFs with fpdf2 and assert text extraction, page iteration, digital
text pages, and a blank page classified as scanned.
---
 tests/test_pdf.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 tests/test_pdf.py

diff --git a/tests/test_pdf.py b/tests/test_pdf.py
new file mode 100644
index 0000000..5a34e5c
--- /dev/null
+++ b/tests/test_pdf.py
@@ -0,0 +1,67 @@
+"""Tests for PDF text extraction and per-page classification."""
+
+from pathlib import Path
+
+from fpdf import FPDF
+
+from localcontextrouter import PageClass
+from localcontextrouter.pdf import Pdf, classify_pdf
+
+PROSE = (
+    "The quarterly report summarizes revenue, expenses, and net income for the "
+    "period ending in March. All figures are stated in thousands of US dollars."
+)
+
+
+def _text_pdf(path: Path, text: str, pages: int = 1) -> Path:
+    """Write a PDF to ``path`` with ``pages`` pages that each contain ``text``."""
+    doc = FPDF()
+    doc.set_font("Helvetica", size=12)
+    for _ in range(pages):
+        doc.add_page()
+        doc.multi_cell(0, 8, text)
+    doc.output(str(path))
+    return path
+
+
+def _blank_pdf(path: Path, pages: int = 1) -> Path:
+    """Write a PDF to ``path`` with ``pages`` pages and no text drawn on them."""
+    doc = FPDF()
+    for _ in range(pages):
+        doc.add_page()
+    doc.output(str(path))
+    return path
+
+
+def test_extracts_text(tmp_path: Path) -> None:
+    """A single-page PDF reports one page and exposes its embedded text."""
+    source = _text_pdf(tmp_path / "report.pdf", PROSE)
+    with Pdf(source) as handle:
+        assert len(handle) == 1
+        first_page = handle.page_text(0)
+        assert "revenue" in first_page
+
+
+def test_page_texts_iterates_every_page(tmp_path: Path) -> None:
+    """``page_texts`` yields one string per page."""
+    source = _text_pdf(tmp_path / "multi.pdf", PROSE, pages=3)
+    with Pdf(source) as handle:
+        texts = list(handle.page_texts())
+        assert len(texts) == 3
+
+
+def test_classify_pdf_marks_text_pages_digital(tmp_path: Path) -> None:
+    """Every page carrying prose text is classified as digital."""
+    source = _text_pdf(tmp_path / "report.pdf", PROSE, pages=2)
+    outcomes = classify_pdf(source)
+    assert len(outcomes) == 2
+    for outcome in outcomes:
+        assert outcome.page_class is PageClass.DIGITAL
+
+
+def test_classify_pdf_marks_blank_page_scanned(tmp_path: Path) -> None:
+    """A single blank page is classified as scanned."""
+    source = _blank_pdf(tmp_path / "blank.pdf")
+    (outcome,) = classify_pdf(source)
+    assert outcome.page_class is PageClass.SCANNED