feat(pdf): add PDF client, scraper, parser, and end-to-end tests

- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
2026-01-02 18:59:36 +05:30
parent 390eb22e1b
commit de67c7b0b1
8 changed files with 198 additions and 0 deletions
--- a/omniread/__init__.py
+++ b/omniread/__init__.py
@@ -1,5 +1,6 @@
 from .core import Content, ContentType
 from .html import HTMLScraper, HTMLParser
 from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
 __all__ = [
    # core
@@ -9,4 +10,9 @@ __all__ = [
    # html
    "HTMLScraper",
    "HTMLParser",
    # pdf
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
 ]
--- a/omniread/pdf/__init__.py
+++ b/omniread/pdf/__init__.py
@@ -0,0 +1,9 @@
 from .client import FileSystemPDFClient
 from .scraper import PDFScraper
 from .parser import PDFParser
 # Public names of this package: exactly the three classes imported above.
 __all__ = [
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
 ]
--- a/omniread/pdf/client.py
+++ b/omniread/pdf/client.py
@@ -0,0 +1,32 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Union
 class BasePDFClient(ABC):
    """
    Abstract client responsible for retrieving PDF bytes
    from a specific backing store (filesystem, S3, FTP, etc).
    """
    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Fetch raw PDF bytes from the given source.

        Args:
            source: Identifier of the PDF in the backing store; how it is
                interpreted is up to the concrete client.

        Returns:
            The raw bytes of the PDF.
        """
        raise NotImplementedError
 class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.
    """
    def fetch(self, path: Union[str, Path]) -> bytes:
        """
        Read a PDF file from disk and return its raw bytes.

        Args:
            path: Location of the file, given either as a string or as a
                ``Path``. Strings are accepted because the base-class
                contract declares the source as ``str``.

        Returns:
            The complete contents of the file.

        Raises:
            FileNotFoundError: If nothing exists at ``path``.
            ValueError: If ``path`` exists but is not a file
                (for example, a directory).
        """
        # Coerce up front: a plain str has no .exists()/.is_file()/.read_bytes(),
        # so string sources would otherwise fail with AttributeError.
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {path}")
        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")
        return path.read_bytes()
--- a/omniread/pdf/parser.py
+++ b/omniread/pdf/parser.py
@@ -0,0 +1,26 @@
 from typing import Generic, TypeVar
 from abc import abstractmethod
 from omniread.core.content import ContentType
 from omniread.core.parser import BaseParser
 T = TypeVar("T")
 class PDFParser(BaseParser[T], Generic[T]):
    """
    Abstract parser for PDF content, generic over its output type ``T``.

    A concrete subclass picks ``T`` and supplies the actual extraction
    strategy by implementing ``parse``.
    """
    # Only PDF content is listed as supported by this parser family.
    supported_types = {ContentType.PDF}
    @abstractmethod
    def parse(self) -> T:
        """
        Turn the PDF content into a structured value of type ``T``.
        """
        raise NotImplementedError()
--- a/omniread/pdf/scraper.py
+++ b/omniread/pdf/scraper.py
@@ -0,0 +1,32 @@
 from typing import Any, Mapping, Optional
 from omniread.core.content import Content, ContentType
 from omniread.core.scraper import BaseScraper
 from omniread.pdf.client import BasePDFClient
 class PDFScraper(BaseScraper):
    """
    Scraper that turns a PDF source into a ``Content`` object.

    The bytes themselves are obtained from the injected PDF client; this
    class only wraps them, together with the source and optional metadata,
    in a ``Content`` tagged as PDF.
    """
    def __init__(self, *, client: BasePDFClient):
        # Keyword-only so the dependency is always named at the call site.
        self._client = client
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Retrieve ``source`` through the client and wrap it as ``Content``.

        Args:
            source: Identifier handed unchanged to the client's ``fetch``.
            metadata: Optional mapping attached to the result. It is copied
                into a new ``dict``; a missing or empty mapping becomes ``None``.

        Returns:
            A ``Content`` holding the fetched bytes with content type PDF.
        """
        pdf_bytes = self._client.fetch(source)
        extra = dict(metadata) if metadata else None
        return Content(
            raw=pdf_bytes,
            source=source,
            content_type=ContentType.PDF,
            metadata=extra,
        )
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,9 +6,12 @@ from jinja2 import Environment, BaseLoader
 from omniread.core.content import ContentType
 from omniread.html.scraper import HTMLScraper
 from omniread.pdf.client import FileSystemPDFClient
 from omniread.pdf.scraper import PDFScraper
 MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
 MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
 def render_html(template_path, data_path) -> bytes:
@@ -57,3 +60,24 @@ def http_scraper() -> HTMLScraper:
    client = httpx.Client(transport=transport)
    return HTMLScraper(client=client)
 class MockPDFClient(FileSystemPDFClient):
    """
    PDF client for tests: maps a logical identifier to a fixture
    file under ``MOCK_PDF_DIR`` and reads it via the filesystem client.
    """
    def fetch(self, source: str) -> bytes:
        """
        Return the bytes of the fixture registered for ``source``.

        Raises:
            FileNotFoundError: If ``source`` is not a known identifier.
        """
        # Reject unknown identifiers before touching the filesystem.
        if source not in ("simple",):
            raise FileNotFoundError(f"No mock PDF route for '{source}'")
        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
        return super().fetch(fixture_path)
@pytest.fixture
 def pdf_scraper() -> PDFScraper:
    client = MockPDFClient()
    return PDFScraper(client=client)
--- a/tests/mocks/pdf/simple.pdf
+++ b/tests/mocks/pdf/simple.pdf
@@ -0,0 +1,32 @@
 %PDF-1.4
 1 0 obj
 << /Type /Catalog /Pages 2 0 R >>
 endobj
 2 0 obj
 << /Type /Pages /Kids [3 0 R] /Count 1 >>
 endobj
 3 0 obj
 << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
 endobj
 4 0 obj
 << /Length 44 >>
 stream
 BT
 /F1 12 Tf
 72 720 Td
 (Simple PDF Test) Tj
 ET
 endstream
 endobj
 xref
 0 5
 0000000000 65535 f
 0000000010 00000 n
 0000000061 00000 n
 0000000116 00000 n
 0000000203 00000 n
 trailer
 << /Size 5 /Root 1 0 R >>
 startxref
 300
 %%EOF
--- a/tests/test_pdf_simple.py
+++ b/tests/test_pdf_simple.py
@@ -0,0 +1,37 @@
 from typing import Literal
 from pydantic import BaseModel
 from omniread.pdf import PDFParser
 from omniread.core.content import Content
 # Typed result returned by SimplePDFParser.parse in this test module.
 class ParsedPDF(BaseModel):
    # Length in bytes of the raw PDF content.
    size_bytes: int
    # Leading magic marker; the Literal type admits only b"%PDF".
    magic: Literal[b"%PDF"]
 class SimplePDFParser(PDFParser[ParsedPDF]):
    """
    Minimal parser used by the end-to-end test: it checks the PDF magic
    marker and reports the payload size.
    """
    def parse(self) -> ParsedPDF:
        """
        Build a ``ParsedPDF`` from ``self.content.raw``.

        Raises:
            ValueError: If the raw content does not start with ``b"%PDF"``.
        """
        payload = self.content.raw
        if payload.startswith(b"%PDF"):
            return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")
        raise ValueError("Not a valid PDF")
 def test_end_to_end_pdf_simple(pdf_scraper):
    """Scrape the "simple" fixture and parse it into a typed ParsedPDF."""
    # Scrape step: "simple" is a logical identifier resolved by the fixture's client.
    scraped: Content = pdf_scraper.fetch("simple")
    assert scraped.raw.startswith(b"%PDF")
    # Parse step: run the scraped content through the test parser.
    parsed = SimplePDFParser(scraped).parse()
    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100