Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ classifiers = [
"Topic :: Text Processing",
]
dynamic = ["version"]
dependencies = []
dependencies = [
"pypdfium2>=4.30",
]

[project.urls]
Homepage = "https://github.com/sid732/LocalContextRouter"
Expand All @@ -33,6 +35,7 @@ dev = [
"pytest-cov>=5.0",
"ruff>=0.6",
"mypy>=1.11",
"fpdf2>=2.7",
]

[tool.hatch.version]
Expand All @@ -54,6 +57,10 @@ python_version = "3.10"
strict = true
files = ["src"]

[[tool.mypy.overrides]]
module = ["pypdfium2.*"]
ignore_missing_imports = true

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-ra"
3 changes: 3 additions & 0 deletions src/localcontextrouter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@

from .classify import classify_text, compute_signals
from .models import Classification, PageClass, PageSignals
from .pdf import Pdf, classify_pdf

__version__ = "0.0.0"

__all__ = [
"Classification",
"PageClass",
"PageSignals",
"Pdf",
"classify_pdf",
"classify_text",
"compute_signals",
"__version__",
Expand Down
64 changes: 64 additions & 0 deletions src/localcontextrouter/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""Read PDFs and pull each page's embedded text layer using pypdfium2.

pypdfium2 is a permissively licensed binding to PDFium that ships its own native
library, so there is no system dependency (no poppler) to install alongside the
package.
"""

from __future__ import annotations

from collections.abc import Iterator
from pathlib import Path

import pypdfium2 as pdfium

from .classify import classify_text
from .models import Classification


class Pdf:
    """A read-only handle over a PDF document.

    The document is opened with pypdfium2 when the object is constructed and
    stays open until :meth:`close` is called. Use as a context manager so the
    native document is always released::

        with Pdf(path) as pdf:
            for text in pdf.page_texts():
                ...
    """

    def __init__(self, path: str | Path) -> None:
        """Open the PDF at ``path`` (a string or ``Path``)."""
        self.path = Path(path)
        self._doc = pdfium.PdfDocument(str(self.path))

    def __len__(self) -> int:
        """Return the length of the underlying document (its page count)."""
        return len(self._doc)

    def page_text(self, index: int) -> str:
        """Return the embedded text of the page at ``index``.

        Both the page handle and its text page are closed before returning,
        including when text extraction fails part-way through.
        """
        page = self._doc[index]
        try:
            # get_textpage() sits inside the outer try so that a failure here
            # still releases the page handle instead of leaking it.
            textpage = page.get_textpage()
            try:
                return str(textpage.get_text_bounded())
            finally:
                # The text page is closed before the page it belongs to.
                textpage.close()
        finally:
            page.close()

    def page_texts(self) -> Iterator[str]:
        """Yield the embedded text of every page in order."""
        for index in range(len(self)):
            yield self.page_text(index)

    def close(self) -> None:
        """Release the underlying document."""
        self._doc.close()

    def __enter__(self) -> Pdf:
        """Return this handle for use inside a ``with`` block."""
        return self

    def __exit__(self, *exc: object) -> None:
        """Close the document on leaving the ``with`` block; exceptions propagate."""
        self.close()


def classify_pdf(path: str | Path) -> list[Classification]:
    """Return one classification per page, based on each page's text layer.

    The PDF at ``path`` is opened for the duration of the call and closed
    again before the results are returned.
    """
    classifications: list[Classification] = []
    with Pdf(path) as document:
        for page_text in document.page_texts():
            classifications.append(classify_text(page_text))
    return classifications
57 changes: 57 additions & 0 deletions tests/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Tests for PDF text extraction and per-page classification."""

from pathlib import Path

from fpdf import FPDF

from localcontextrouter import PageClass
from localcontextrouter.pdf import Pdf, classify_pdf

# Sample paragraph of ordinary English prose that the tests write into
# generated PDFs as their text layer.
PROSE = (
"The quarterly report summarizes revenue, expenses, and net income for the "
"period ending in March. All figures are stated in thousands of US dollars."
)


def _text_pdf(path: Path, text: str, pages: int = 1) -> Path:
    """Write a PDF of ``pages`` pages, each holding ``text``, and return ``path``."""
    document = FPDF()
    document.set_font("Helvetica", size=12)
    remaining = pages
    while remaining > 0:
        document.add_page()
        document.multi_cell(0, 8, text)
        remaining -= 1
    target = str(path)
    document.output(target)
    return path


def _blank_pdf(path: Path, pages: int = 1) -> Path:
    """Write a PDF of ``pages`` pages with nothing drawn on them and return ``path``."""
    document = FPDF()
    remaining = pages
    while remaining > 0:
        document.add_page()
        remaining -= 1
    target = str(path)
    document.output(target)
    return path


def test_extracts_text(tmp_path: Path) -> None:
    """A one-page text PDF reports length 1 and yields its words back."""
    source = _text_pdf(tmp_path / "report.pdf", PROSE)
    with Pdf(source) as document:
        page_count = len(document)
        assert page_count == 1
        extracted = document.page_text(0)
        assert "revenue" in extracted


def test_page_texts_iterates_every_page(tmp_path: Path) -> None:
    """``page_texts`` yields exactly one item per page of a three-page PDF."""
    source = _text_pdf(tmp_path / "multi.pdf", PROSE, pages=3)
    with Pdf(source) as document:
        yielded = 0
        for _ in document.page_texts():
            yielded += 1
        assert yielded == 3


def test_classify_pdf_marks_text_pages_digital(tmp_path: Path) -> None:
    """Every page of a two-page text PDF is classified as DIGITAL."""
    source = _text_pdf(tmp_path / "report.pdf", PROSE, pages=2)
    classifications = classify_pdf(source)
    assert len(classifications) == 2
    for classification in classifications:
        assert classification.page_class is PageClass.DIGITAL


def test_classify_pdf_marks_blank_page_scanned(tmp_path: Path) -> None:
    """A single blank page, having no text layer content, is classified as SCANNED."""
    source = _blank_pdf(tmp_path / "blank.pdf")
    classifications = classify_pdf(source)
    # Tuple-unpacking requires exactly one classification to be returned.
    (only,) = classifications
    assert only.page_class is PageClass.SCANNED
Loading