From de67c7b0b1cd540bdcf4d254c54fa0f2045ddcc8 Mon Sep 17 00:00:00 2001
From: Vishesh 'ironeagle' Bangotra <aetoskia@gmail.com>
Date: Fri, 2 Jan 2026 18:59:36 +0530
Subject: [PATCH] feat(pdf): add PDF client, scraper, parser, and end-to-end
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
---
 omniread/__init__.py       |  6 ++++++
 omniread/pdf/__init__.py   |  9 +++++++++
 omniread/pdf/client.py     | 32 ++++++++++++++++++++++++++++++++
 omniread/pdf/parser.py     | 26 ++++++++++++++++++++++++++
 omniread/pdf/scraper.py    | 32 ++++++++++++++++++++++++++++++++
 tests/conftest.py          | 24 ++++++++++++++++++++++++
 tests/mocks/pdf/simple.pdf | 32 ++++++++++++++++++++++++++++++++
 tests/test_pdf_simple.py   | 37 +++++++++++++++++++++++++++++++++++++
 8 files changed, 198 insertions(+)
 create mode 100644 omniread/pdf/__init__.py
 create mode 100644 omniread/pdf/client.py
 create mode 100644 omniread/pdf/parser.py
 create mode 100644 omniread/pdf/scraper.py
 create mode 100644 tests/mocks/pdf/simple.pdf
 create mode 100644 tests/test_pdf_simple.py

diff --git a/omniread/__init__.py b/omniread/__init__.py
index ffb41ac..df7ce37 100644
--- a/omniread/__init__.py
+++ b/omniread/__init__.py
@@ -1,5 +1,6 @@
 from .core import Content, ContentType
 from .html import HTMLScraper, HTMLParser
+from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
 
 __all__ = [
     # core
@@ -9,4 +10,9 @@ __all__ = [
     # html
     "HTMLScraper",
     "HTMLParser",
+
+    # pdf
+    "FileSystemPDFClient",
+    "PDFScraper",
+    "PDFParser",
 ]
diff --git a/omniread/pdf/__init__.py b/omniread/pdf/__init__.py
new file mode 100644
index 0000000..b7281aa
--- /dev/null
+++ b/omniread/pdf/__init__.py
@@ -0,0 +1,9 @@
+from .client import FileSystemPDFClient
+from .scraper import PDFScraper
+from .parser import PDFParser
+
+__all__ = [
+    "FileSystemPDFClient",
+    "PDFScraper",
+    "PDFParser",
+]
diff --git a/omniread/pdf/client.py b/omniread/pdf/client.py
new file mode 100644
index 0000000..595a294
--- /dev/null
+++ b/omniread/pdf/client.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+
+class BasePDFClient(ABC):
+    """
+    Abstract client responsible for retrieving PDF bytes
+    from a specific backing store (filesystem, S3, FTP, etc.).
+    """
+
+    @abstractmethod
+    def fetch(self, source: str) -> bytes:
+        """
+        Return the raw PDF bytes for ``source``; subclasses must override.
+        """
+        raise NotImplementedError
+
+
+class FileSystemPDFClient(BasePDFClient):
+    """PDF client that reads from the local filesystem."""
+
+    def fetch(self, path: "str | Path") -> bytes:
+        """Return the file's bytes; ``path`` may be a ``str`` or a ``Path``.
+
+        Raises FileNotFoundError if it is missing, ValueError if it is not a file.
+        """
+        path = Path(path)  # PDFScraper and the base contract pass plain strings
+        if not path.exists():
+            raise FileNotFoundError(f"PDF not found: {path}")
+        if not path.is_file():
+            raise ValueError(f"Path is not a file: {path}")
+        return path.read_bytes()
diff --git a/omniread/pdf/parser.py b/omniread/pdf/parser.py
new file mode 100644
index 0000000..f46a2f5
--- /dev/null
+++ b/omniread/pdf/parser.py
@@ -0,0 +1,26 @@
+from typing import Generic, TypeVar
+from abc import abstractmethod
+
+from omniread.core.content import ContentType
+from omniread.core.parser import BaseParser
+
+T = TypeVar("T")
+
+
+class PDFParser(BaseParser[T], Generic[T]):
+    """
+    Abstract base for PDF parsers, generic over the output type T.
+
+    Concrete implementations must define:
+    - the output type T
+    - the parsing strategy (by overriding parse())
+    """
+
+    supported_types = {ContentType.PDF}  # the only content type declared here
+
+    @abstractmethod
+    def parse(self) -> T:
+        """
+        Parse PDF content into a structured output of type T.
+        """
+        raise NotImplementedError
diff --git a/omniread/pdf/scraper.py b/omniread/pdf/scraper.py
new file mode 100644
index 0000000..f561567
--- /dev/null
+++ b/omniread/pdf/scraper.py
@@ -0,0 +1,32 @@
+from typing import Any, Mapping, Optional
+
+from omniread.core.content import Content, ContentType
+from omniread.core.scraper import BaseScraper
+from omniread.pdf.client import BasePDFClient
+
+
+class PDFScraper(BaseScraper):
+    """
+    Scraper for PDF sources.
+
+    Delegates byte retrieval to an injected BasePDFClient and wraps
+    the result in a Content tagged ContentType.PDF.
+    """
+
+    def __init__(self, *, client: BasePDFClient) -> None:
+        self._client = client
+
+    def fetch(
+        self,
+        source: str,
+        *,
+        metadata: Optional[Mapping[str, Any]] = None,
+    ) -> Content:
+        raw = self._client.fetch(source)
+        # A None or empty mapping is stored as None; otherwise a dict copy is stored.
+        return Content(
+            raw=raw,
+            source=source,
+            content_type=ContentType.PDF,
+            metadata=dict(metadata) if metadata else None,
+        )
diff --git a/tests/conftest.py b/tests/conftest.py
index b53a401..23d7425 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,9 +6,12 @@ from jinja2 import Environment, BaseLoader
 
 from omniread.core.content import ContentType
 from omniread.html.scraper import HTMLScraper
+from omniread.pdf.client import FileSystemPDFClient
+from omniread.pdf.scraper import PDFScraper
 
 
 MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
+MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
 
 
 def render_html(template_path, data_path) -> bytes:
@@ -57,3 +60,24 @@ def http_scraper() -> HTMLScraper:
     client = httpx.Client(transport=transport)
 
     return HTMLScraper(client=client)
+
+
+class MockPDFClient(FileSystemPDFClient):
+    """
+    Filesystem client for tests: maps a logical identifier onto a
+    fixture under MOCK_PDF_DIR and rejects anything unrouted.
+    """
+
+    def fetch(self, source: str) -> bytes:
+        # Guard first so unknown identifiers never reach the filesystem.
+        if source not in ("simple",):
+            raise FileNotFoundError(f"No mock PDF route for '{source}'")
+
+        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
+        return super().fetch(fixture_path)
+
+
+@pytest.fixture
+def pdf_scraper() -> PDFScraper:
+    """Provide a PDFScraper whose client is the fixture-routing MockPDFClient."""
+    return PDFScraper(client=MockPDFClient())
diff --git a/tests/mocks/pdf/simple.pdf b/tests/mocks/pdf/simple.pdf
new file mode 100644
index 0000000..c0f11a1
--- /dev/null
+++ b/tests/mocks/pdf/simple.pdf
@@ -0,0 +1,32 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
+endobj
+4 0 obj
+<< /Length 44 >>
+stream
+BT
+/F1 12 Tf
+72 720 Td
+(Simple PDF Test) Tj
+ET
+endstream
+endobj
+xref
+0 5
+0000000000 65535 f
+0000000010 00000 n
+0000000061 00000 n
+0000000116 00000 n
+0000000203 00000 n
+trailer
+<< /Size 5 /Root 1 0 R >>
+startxref
+300
+%%EOF
diff --git a/tests/test_pdf_simple.py b/tests/test_pdf_simple.py
new file mode 100644
index 0000000..552c8a5
--- /dev/null
+++ b/tests/test_pdf_simple.py
@@ -0,0 +1,37 @@
+from typing import Literal
+from pydantic import BaseModel
+
+from omniread.pdf import PDFParser
+from omniread.core.content import Content
+
+
+class ParsedPDF(BaseModel):
+    size_bytes: int  # total length of the raw PDF payload, in bytes
+    magic: Literal[b"%PDF"]  # the header marker the parser checked for
+
+
+class SimplePDFParser(PDFParser[ParsedPDF]):
+    """Minimal parser: checks the %PDF header and reports the payload size."""
+
+    def parse(self) -> ParsedPDF:
+        """Return a ParsedPDF for the content; raise ValueError on a bad header."""
+        payload = self.content.raw
+        header = b"%PDF"
+        if not payload.startswith(header):
+            raise ValueError("Not a valid PDF")
+
+        return ParsedPDF(size_bytes=len(payload), magic=header)
+
+
+def test_end_to_end_pdf_simple(pdf_scraper):
+    """Scrape the "simple" fixture through the mock client, then parse it."""
+    # Scrape: the identifier is resolved to a fixture file in conftest.
+    scraped: Content = pdf_scraper.fetch("simple")
+
+    assert scraped.raw.startswith(b"%PDF")
+
+    # Parse: the typed result should echo the header and a plausible size.
+    parsed = SimplePDFParser(scraped).parse()
+
+    assert parsed.magic == b"%PDF"
+    assert parsed.size_bytes > 100