From de67c7b0b1cd540bdcf4d254c54fa0f2045ddcc8 Mon Sep 17 00:00:00 2001
From: Vishesh 'ironeagle' Bangotra
Date: Fri, 2 Jan 2026 18:59:36 +0530
Subject: [PATCH] feat(pdf): add PDF client, scraper, parser, and end-to-end tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
---
Review note: FileSystemPDFClient.fetch now accepts str or Path, because
PDFScraper forwards its source as a plain str. The blob id on the client.py
index line predates this change and should be regenerated.

 omniread/__init__.py       |  6 ++++++
 omniread/pdf/__init__.py   |  9 +++++++++
 omniread/pdf/client.py     | 38 ++++++++++++++++++++++++++++++++++++++
 omniread/pdf/parser.py     | 26 ++++++++++++++++++++++++++
 omniread/pdf/scraper.py    | 32 ++++++++++++++++++++++++++++++++
 tests/conftest.py          | 24 ++++++++++++++++++++++++
 tests/mocks/pdf/simple.pdf | 32 ++++++++++++++++++++++++++++++++
 tests/test_pdf_simple.py   | 37 +++++++++++++++++++++++++++++++++++++
 8 files changed, 204 insertions(+)
 create mode 100644 omniread/pdf/__init__.py
 create mode 100644 omniread/pdf/client.py
 create mode 100644 omniread/pdf/parser.py
 create mode 100644 omniread/pdf/scraper.py
 create mode 100644 tests/mocks/pdf/simple.pdf
 create mode 100644 tests/test_pdf_simple.py

diff --git a/omniread/__init__.py b/omniread/__init__.py
index ffb41ac..df7ce37 100644
--- a/omniread/__init__.py
+++ b/omniread/__init__.py
@@ -1,5 +1,6 @@
 from .core import Content, ContentType
 from .html import HTMLScraper, HTMLParser
+from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
 
 __all__ = [
     # core
@@ -9,4 +10,9 @@ __all__ = [
     # html
     "HTMLScraper",
     "HTMLParser",
+
+    # pdf
+    "FileSystemPDFClient",
+    "PDFScraper",
+    "PDFParser",
 ]
diff --git a/omniread/pdf/__init__.py b/omniread/pdf/__init__.py
new file mode 100644
index 0000000..b7281aa
--- /dev/null
+++ b/omniread/pdf/__init__.py
@@ -0,0 +1,9 @@
+from .client import FileSystemPDFClient
+from .scraper import PDFScraper
+from .parser import PDFParser
+
+__all__ = [
+    "FileSystemPDFClient",
+    "PDFScraper",
+    "PDFParser",
+]
diff --git a/omniread/pdf/client.py b/omniread/pdf/client.py
new file mode 100644
index 0000000..595a294
--- /dev/null
+++ b/omniread/pdf/client.py
@@ -0,0 +1,38 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Union
+
+
+class BasePDFClient(ABC):
+    """
+    Abstract client responsible for retrieving PDF bytes
+    from a specific backing store (filesystem, S3, FTP, etc).
+    """
+
+    @abstractmethod
+    def fetch(self, source: str) -> bytes:
+        """
+        Fetch raw PDF bytes from the given source.
+        """
+        raise NotImplementedError
+
+
+class FileSystemPDFClient(BasePDFClient):
+    """
+    PDF client that reads from the local filesystem.
+    """
+
+    def fetch(self, path: Union[str, Path]) -> bytes:
+        """
+        Read raw PDF bytes from a local file given as str or Path
+        (PDFScraper forwards its source argument as a plain str).
+        """
+        path = Path(path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"PDF not found: {path}")
+
+        if not path.is_file():
+            raise ValueError(f"Path is not a file: {path}")
+
+        return path.read_bytes()
diff --git a/omniread/pdf/parser.py b/omniread/pdf/parser.py
new file mode 100644
index 0000000..f46a2f5
--- /dev/null
+++ b/omniread/pdf/parser.py
@@ -0,0 +1,26 @@
+from typing import Generic, TypeVar
+from abc import abstractmethod
+
+from omniread.core.content import ContentType
+from omniread.core.parser import BaseParser
+
+T = TypeVar("T")
+
+
+class PDFParser(BaseParser[T], Generic[T]):
+    """
+    Base PDF parser.
+
+    Concrete implementations must define:
+    - the output type T
+    - the parsing strategy
+    """
+
+    supported_types = {ContentType.PDF}
+
+    @abstractmethod
+    def parse(self) -> T:
+        """
+        Parse PDF content into a structured output.
+        """
+        raise NotImplementedError
diff --git a/omniread/pdf/scraper.py b/omniread/pdf/scraper.py
new file mode 100644
index 0000000..f561567
--- /dev/null
+++ b/omniread/pdf/scraper.py
@@ -0,0 +1,32 @@
+from typing import Any, Mapping, Optional
+
+from omniread.core.content import Content, ContentType
+from omniread.core.scraper import BaseScraper
+from omniread.pdf.client import BasePDFClient
+
+
+class PDFScraper(BaseScraper):
+    """
+    Scraper for PDF sources.
+
+    Delegates byte retrieval to a PDF client and normalizes
+    output into Content.
+    """
+
+    def __init__(self, *, client: BasePDFClient):
+        self._client = client
+
+    def fetch(
+        self,
+        source: str,
+        *,
+        metadata: Optional[Mapping[str, Any]] = None,
+    ) -> Content:
+        raw = self._client.fetch(source)
+
+        return Content(
+            raw=raw,
+            source=source,
+            content_type=ContentType.PDF,
+            metadata=dict(metadata) if metadata else None,
+        )
diff --git a/tests/conftest.py b/tests/conftest.py
index b53a401..23d7425 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,9 +6,12 @@ from jinja2 import Environment, BaseLoader
 
 from omniread.core.content import ContentType
 from omniread.html.scraper import HTMLScraper
+from omniread.pdf.client import FileSystemPDFClient
+from omniread.pdf.scraper import PDFScraper
 
 
 MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
+MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
 
 
 def render_html(template_path, data_path) -> bytes:
@@ -57,3 +60,24 @@ def http_scraper() -> HTMLScraper:
     client = httpx.Client(transport=transport)
 
     return HTMLScraper(client=client)
+
+
+class MockPDFClient(FileSystemPDFClient):
+    """
+    Test-only PDF client that routes logical identifiers
+    to fixture files.
+    """
+
+    def fetch(self, source: str) -> bytes:
+        if source in ["simple"]:
+            source = MOCK_PDF_DIR / f"{source}.pdf"
+        else:
+            raise FileNotFoundError(f"No mock PDF route for '{source}'")
+
+        return super().fetch(source)
+
+
+@pytest.fixture
+def pdf_scraper() -> PDFScraper:
+    client = MockPDFClient()
+    return PDFScraper(client=client)
diff --git a/tests/mocks/pdf/simple.pdf b/tests/mocks/pdf/simple.pdf
new file mode 100644
index 0000000..c0f11a1
--- /dev/null
+++ b/tests/mocks/pdf/simple.pdf
@@ -0,0 +1,32 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
+endobj
+4 0 obj
+<< /Length 44 >>
+stream
+BT
+/F1 12 Tf
+72 720 Td
+(Simple PDF Test) Tj
+ET
+endstream
+endobj
+xref
+0 5
+0000000000 65535 f
+0000000010 00000 n
+0000000061 00000 n
+0000000116 00000 n
+0000000203 00000 n
+trailer
+<< /Size 5 /Root 1 0 R >>
+startxref
+300
+%%EOF
diff --git a/tests/test_pdf_simple.py b/tests/test_pdf_simple.py
new file mode 100644
index 0000000..552c8a5
--- /dev/null
+++ b/tests/test_pdf_simple.py
@@ -0,0 +1,37 @@
+from typing import Literal
+from pydantic import BaseModel
+
+from omniread.pdf import PDFParser
+from omniread.core.content import Content
+
+
+class ParsedPDF(BaseModel):
+    size_bytes: int
+    magic: Literal[b"%PDF"]
+
+
+class SimplePDFParser(PDFParser[ParsedPDF]):
+    def parse(self) -> ParsedPDF:
+        raw = self.content.raw
+
+        if not raw.startswith(b"%PDF"):
+            raise ValueError("Not a valid PDF")
+
+        return ParsedPDF(
+            size_bytes=len(raw),
+            magic=b"%PDF",
+        )
+
+
+def test_end_to_end_pdf_simple(pdf_scraper):
+    # --- Scrape (identifier-based, routed in conftest)
+    content: Content = pdf_scraper.fetch("simple")
+
+    assert content.raw.startswith(b"%PDF")
+
+    # --- Parse
+    parser = SimplePDFParser(content)
+    result = parser.parse()
+
+    assert result.magic == b"%PDF"
+    assert result.size_bytes > 100