feat(pdf): add PDF client, scraper, parser, and end-to-end tests
- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
This commit is contained in:
37
tests/test_pdf_simple.py
Normal file
37
tests/test_pdf_simple.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from typing import Literal
|
||||
from pydantic import BaseModel
|
||||
|
||||
from omniread.pdf import PDFParser
|
||||
from omniread.core.content import Content
|
||||
|
||||
|
||||
class ParsedPDF(BaseModel):
    """Typed result model for the simple PDF end-to-end test.

    Instances are built by ``SimplePDFParser.parse`` in this file.
    """

    # Length of the raw payload, in bytes (filled from ``len(raw)``).
    size_bytes: int
    # Leading signature of the payload; the only accepted value is b"%PDF".
    magic: Literal[b"%PDF"]
|
||||
|
||||
|
||||
class SimplePDFParser(PDFParser[ParsedPDF]):
    """Minimal parser that checks the PDF signature and reports the size."""

    def parse(self) -> ParsedPDF:
        """Build a ``ParsedPDF`` from the raw payload of ``self.content``.

        Returns:
            A ``ParsedPDF`` holding the payload length and the ``b"%PDF"``
            magic marker.

        Raises:
            ValueError: If the payload does not begin with ``b"%PDF"``.
        """
        payload = self.content.raw

        # Success path first: a payload carrying the PDF signature is
        # summarised straight away.
        if payload.startswith(b"%PDF"):
            return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")

        raise ValueError("Not a valid PDF")
|
||||
|
||||
|
||||
def test_end_to_end_pdf_simple(pdf_scraper):
    """Fetch the "simple" PDF via the scraper fixture, then parse it."""
    # Scrape step: "simple" is an identifier, resolved by routing set up in
    # conftest.
    fetched: Content = pdf_scraper.fetch("simple")
    assert fetched.raw.startswith(b"%PDF")

    # Parse step: run the typed parser over the fetched content.
    parsed = SimplePDFParser(fetched).parse()
    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100
|
||||
Reference in New Issue
Block a user