feat(pdf): add PDF client, scraper, parser, and end-to-end tests

- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
2026-01-02 18:59:36 +05:30
parent 390eb22e1b
commit de67c7b0b1
8 changed files with 198 additions and 0 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,9 +6,12 @@ from jinja2 import Environment, BaseLoader

 from omniread.core.content import ContentType
 from omniread.html.scraper import HTMLScraper
+from omniread.pdf.client import FileSystemPDFClient
+from omniread.pdf.scraper import PDFScraper


 MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
+MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"


 def render_html(template_path, data_path) -> bytes:
@@ -57,3 +60,24 @@ def http_scraper() -> HTMLScraper:
    client = httpx.Client(transport=transport)

    return HTMLScraper(client=client)
+
+
+class MockPDFClient(FileSystemPDFClient):
+    """
+    Test-only PDF client that routes logical identifiers
+    to fixture files.
+    """
+
+    def fetch(self, source: str) -> bytes:
+        if source in ["simple"]:
+            source = MOCK_PDF_DIR / f"{source}.pdf"
+        else:
+            raise FileNotFoundError(f"No mock PDF route for '{source}'")
+
+        return super().fetch(source)
+
+
+@pytest.fixture
+def pdf_scraper() -> PDFScraper:
+    client = MockPDFClient()
+    return PDFScraper(client=client)