feat(pdf): add PDF client, scraper, parser, and end-to-end tests

- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
2026-01-02 18:59:36 +05:30
parent 390eb22e1b
commit de67c7b0b1
8 changed files with 198 additions and 0 deletions
--- a/tests/test_pdf_simple.py
+++ b/tests/test_pdf_simple.py
@@ -0,0 +1,37 @@
+from typing import Literal
+from pydantic import BaseModel
+
+from omniread.pdf import PDFParser
+from omniread.core.content import Content
+
+
+class ParsedPDF(BaseModel):  # typed output of SimplePDFParser.parse() below
+    size_bytes: int  # byte length of the raw content (len(raw) in the parser)
+    magic: Literal[b"%PDF"]  # typed so that only the bytes b"%PDF" are valid
+
+
+class SimplePDFParser(PDFParser[ParsedPDF]):
+    """Test parser: checks the PDF signature and reports the byte size."""
+
+    def parse(self) -> ParsedPDF:
+        """Return a ParsedPDF; raise ValueError if raw lacks the %PDF prefix."""
+        payload = self.content.raw
+        signature = b"%PDF"
+
+        if payload.startswith(signature):
+            return ParsedPDF(size_bytes=len(payload), magic=signature)
+        raise ValueError("Not a valid PDF")
+
+
+def test_end_to_end_pdf_simple(pdf_scraper):
+    """Scrape the "simple" PDF through the fixture, then parse the result."""
+    signature = b"%PDF"
+
+    # Scrape step: "simple" is an identifier, routed in conftest.
+    fetched: Content = pdf_scraper.fetch("simple")
+    assert fetched.raw.startswith(signature)
+
+    # Parse step: run the typed parser over the scraped content.
+    parsed = SimplePDFParser(fetched).parse()
+    assert parsed.magic == signature
+    assert parsed.size_bytes > 100