from typing import Literal

from pydantic import BaseModel

from omniread import (
    # core
    Content,
    # pdf
    PDFParser,
)


class ParsedPDF(BaseModel):
    """Minimal parse result used by the end-to-end PDF test."""

    # Number of bytes in the raw payload that was parsed.
    size_bytes: int
    # Header marker; the only accepted value is the literal b"%PDF".
    magic: Literal[b"%PDF"]


class SimplePDFParser(PDFParser[ParsedPDF]):
    """Toy parser that only checks the PDF header and measures the payload."""

    def parse(self) -> ParsedPDF:
        """Build a ``ParsedPDF`` from ``self.content.raw``.

        Returns:
            A ``ParsedPDF`` carrying the payload length and the ``b"%PDF"``
            marker.

        Raises:
            ValueError: If the raw payload does not begin with ``b"%PDF"``.
        """
        # NOTE(review): ``self.content`` is presumably set by the PDFParser
        # base constructor from the Content passed in -- confirm in omniread.
        payload = self.content.raw
        if payload.startswith(b"%PDF"):
            return ParsedPDF(
                size_bytes=len(payload),
                magic=b"%PDF",
            )
        raise ValueError("Not a valid PDF")


def test_end_to_end_pdf_simple(pdf_scraper):
    """Fetch the "simple" PDF via the scraper fixture, then parse it."""
    # --- Scrape (identifier-based, routed in conftest)
    fetched: Content = pdf_scraper.fetch("simple")
    assert fetched.raw.startswith(b"%PDF")

    # --- Parse
    parsed = SimplePDFParser(fetched).parse()

    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100