feat(pdf): add PDF client, scraper, parser, and end-to-end tests
- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
This commit is contained in:
@@ -6,9 +6,12 @@ from jinja2 import Environment, BaseLoader
|
||||
|
||||
from omniread.core.content import ContentType
|
||||
from omniread.html.scraper import HTMLScraper
|
||||
from omniread.pdf.client import FileSystemPDFClient
|
||||
from omniread.pdf.scraper import PDFScraper
|
||||
|
||||
|
||||
MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
|
||||
MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
|
||||
|
||||
|
||||
def render_html(template_path, data_path) -> bytes:
|
||||
@@ -57,3 +60,24 @@ def http_scraper() -> HTMLScraper:
|
||||
client = httpx.Client(transport=transport)
|
||||
|
||||
return HTMLScraper(client=client)
|
||||
|
||||
|
||||
class MockPDFClient(FileSystemPDFClient):
    """
    PDF client used only by the test suite.

    Callers pass a short logical name; the name is mapped onto a
    ``<name>.pdf`` fixture under ``MOCK_PDF_DIR`` and the resulting path is
    handed to the parent client's ``fetch``.
    """

    def fetch(self, source: str) -> bytes:
        """
        Resolve a logical fixture name and fetch the matching PDF.

        Args:
            source: Logical identifier of the fixture (currently only
                ``"simple"`` is routed).

        Returns:
            Whatever the parent ``fetch`` returns for the fixture path.

        Raises:
            FileNotFoundError: If ``source`` has no mock route.
        """
        # Reject unknown identifiers up front so a typo in a test fails
        # with an explicit routing error.
        if source not in ["simple"]:
            raise FileNotFoundError(f"No mock PDF route for '{source}'")

        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
        return super().fetch(fixture_path)
|
||||
|
||||
|
||||
@pytest.fixture
def pdf_scraper() -> PDFScraper:
    """Provide a ``PDFScraper`` backed by the fixture-routing ``MockPDFClient``."""
    return PDFScraper(client=MockPDFClient())
|
||||
|
||||
32
tests/mocks/pdf/simple.pdf
Normal file
32
tests/mocks/pdf/simple.pdf
Normal file
@@ -0,0 +1,32 @@
|
||||
%PDF-1.4
|
||||
1 0 obj
|
||||
<< /Type /Catalog /Pages 2 0 R >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
|
||||
endobj
|
||||
4 0 obj
|
||||
<< /Length 44 >>
|
||||
stream
|
||||
BT
|
||||
/F1 12 Tf
|
||||
72 720 Td
|
||||
(Simple PDF Test) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 5
|
||||
0000000000 65535 f
|
||||
0000000010 00000 n
|
||||
0000000061 00000 n
|
||||
0000000116 00000 n
|
||||
0000000203 00000 n
|
||||
trailer
|
||||
<< /Size 5 /Root 1 0 R >>
|
||||
startxref
|
||||
300
|
||||
%%EOF
|
||||
37
tests/test_pdf_simple.py
Normal file
37
tests/test_pdf_simple.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from typing import Literal
|
||||
from pydantic import BaseModel
|
||||
|
||||
from omniread.pdf import PDFParser
|
||||
from omniread.core.content import Content
|
||||
|
||||
|
||||
class ParsedPDF(BaseModel):
    """Typed result produced by the PDF parser in this test module."""

    # Number of bytes in the parsed payload.
    size_bytes: int

    # Constrained to the exact byte string b"%PDF".
    magic: Literal[b"%PDF"]
|
||||
|
||||
|
||||
class SimplePDFParser(PDFParser[ParsedPDF]):
    """Parser that checks the ``%PDF`` header and reports the payload size."""

    def parse(self) -> ParsedPDF:
        """
        Build a ``ParsedPDF`` from the raw bytes of ``self.content``.

        Returns:
            A ``ParsedPDF`` holding the payload length and the ``%PDF`` magic.

        Raises:
            ValueError: If the raw bytes do not start with ``b"%PDF"``.
        """
        payload = self.content.raw

        if payload.startswith(b"%PDF"):
            return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")

        raise ValueError("Not a valid PDF")
|
||||
|
||||
|
||||
def test_end_to_end_pdf_simple(pdf_scraper):
    """Scrape the ``simple`` fixture, parse it, and check the typed result."""
    # Scrape step: "simple" is a logical identifier, routed in conftest.
    scraped: Content = pdf_scraper.fetch("simple")
    assert scraped.raw.startswith(b"%PDF")

    # Parse step: run the scraped content through the parser.
    parsed = SimplePDFParser(scraped).parse()

    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100
|
||||
Reference in New Issue
Block a user