- Replace deep imports with top-level omniread exports in tests
- Ensure tests validate only the supported public API surface
- Align HTML and PDF tests with documented library usage
42 lines
861 B
Python
42 lines
861 B
Python
from typing import Literal
|
|
from pydantic import BaseModel
|
|
|
|
from omniread import (
|
|
# core
|
|
Content,
|
|
|
|
# pdf
|
|
PDFParser,
|
|
)
|
|
|
|
class ParsedPDF(BaseModel):
    """Minimal parse result used by the PDF end-to-end test in this module."""

    # Length of the raw PDF payload, in bytes.
    size_bytes: int
    # File signature; the annotation restricts it to the literal b"%PDF".
    magic: Literal[b"%PDF"]
|
|
|
|
|
|
class SimplePDFParser(PDFParser[ParsedPDF]):
    """Test parser that checks the PDF signature and reports the payload size."""

    def parse(self) -> ParsedPDF:
        """Build a ``ParsedPDF`` from ``self.content.raw``.

        Returns:
            A ``ParsedPDF`` carrying the byte length of the raw payload and
            the ``b"%PDF"`` signature.

        Raises:
            ValueError: If the raw bytes do not begin with ``b"%PDF"``.
        """
        payload = self.content.raw

        # Happy path first: a payload with the expected signature is summarised.
        if payload.startswith(b"%PDF"):
            return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")

        raise ValueError("Not a valid PDF")
|
|
|
|
|
|
def test_end_to_end_pdf_simple(pdf_scraper):
    """Fetch the "simple" PDF through the scraper fixture and parse it."""
    # Scrape step: "simple" is an identifier (per the original note, routing
    # to the actual document is handled in conftest).
    fetched: Content = pdf_scraper.fetch("simple")
    assert fetched.raw.startswith(b"%PDF")

    # Parse step: run the module's minimal parser over the fetched content.
    parsed = SimplePDFParser(fetched).parse()
    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100
|