feat(pdf): add PDF client, scraper, parser, and end-to-end tests

- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
This commit is contained in:
2026-01-02 18:59:36 +05:30
parent 390eb22e1b
commit de67c7b0b1
8 changed files with 198 additions and 0 deletions

37
tests/test_pdf_simple.py Normal file
View File

@@ -0,0 +1,37 @@
from typing import Literal
from pydantic import BaseModel
from omniread.pdf import PDFParser
from omniread.core.content import Content
class ParsedPDF(BaseModel):
    """Typed result model returned by ``SimplePDFParser.parse``."""

    # Length of the raw PDF payload, in bytes.
    size_bytes: int
    # Header marker; validation only admits the exact bytes b"%PDF".
    magic: Literal[b"%PDF"]
class SimplePDFParser(PDFParser[ParsedPDF]):
    """Minimal parser that checks the PDF header and reports the payload size."""

    def parse(self) -> ParsedPDF:
        """Build a ``ParsedPDF`` from the raw bytes held in ``self.content``.

        Returns:
            A ``ParsedPDF`` carrying the payload length and the ``b"%PDF"``
            marker.

        Raises:
            ValueError: If the raw payload does not begin with ``b"%PDF"``.
        """
        payload = self.content.raw
        has_pdf_header = payload.startswith(b"%PDF")
        if has_pdf_header:
            return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")
        raise ValueError("Not a valid PDF")
def test_end_to_end_pdf_simple(pdf_scraper):
    """Scrape the "simple" PDF, parse it, and check the typed output."""
    # Scrape step: fetch by identifier (routing is set up in conftest).
    scraped: Content = pdf_scraper.fetch("simple")
    assert scraped.raw.startswith(b"%PDF")

    # Parse step: run the scraped content through the parser in one go.
    parsed = SimplePDFParser(scraped).parse()
    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100