feat(pdf): add PDF client, scraper, parser, and end-to-end tests

- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
This commit is contained in:
2026-01-02 18:59:36 +05:30
parent 390eb22e1b
commit de67c7b0b1
8 changed files with 198 additions and 0 deletions

View File

@@ -1,5 +1,6 @@
from .core import Content, ContentType from .core import Content, ContentType
from .html import HTMLScraper, HTMLParser from .html import HTMLScraper, HTMLParser
from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
__all__ = [ __all__ = [
# core # core
@@ -9,4 +10,9 @@ __all__ = [
# html # html
"HTMLScraper", "HTMLScraper",
"HTMLParser", "HTMLParser",
# pdf
"FileSystemPDFClient",
"PDFScraper",
"PDFParser",
] ]

9
omniread/pdf/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
from .client import FileSystemPDFClient
from .scraper import PDFScraper
from .parser import PDFParser
# Names re-exported as the public surface of this package; each one is
# imported from a sibling module above.
__all__ = [
"FileSystemPDFClient",
"PDFScraper",
"PDFParser",
]

32
omniread/pdf/client.py Normal file
View File

@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from pathlib import Path
class BasePDFClient(ABC):
    """
    Abstract client responsible for retrieving PDF bytes
    from a specific backing store (filesystem, S3, FTP, etc).
    """

    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Fetch raw PDF bytes from the given source.

        Args:
            source: Identifier of the PDF to retrieve; its meaning is
                defined by the concrete client.

        Returns:
            The raw bytes of the PDF.
        """
        raise NotImplementedError


class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.
    """

    def fetch(self, path: "str | Path") -> bytes:
        """
        Read and return the bytes of the file at ``path``.

        Args:
            path: Location of the PDF on disk, given either as a plain
                string (the form the base-class contract uses) or as a
                ``pathlib.Path``.

        Returns:
            The complete file contents as bytes.

        Raises:
            FileNotFoundError: If nothing exists at ``path``.
            ValueError: If ``path`` exists but is not a regular file.
        """
        # The base contract passes ``source`` as ``str``; normalise it so
        # the Path-only methods below work for both strings and Paths.
        file_path = Path(path)

        if not file_path.exists():
            raise FileNotFoundError(f"PDF not found: {file_path}")

        if not file_path.is_file():
            raise ValueError(f"Path is not a file: {file_path}")

        return file_path.read_bytes()

26
omniread/pdf/parser.py Normal file
View File

@@ -0,0 +1,26 @@
from typing import Generic, TypeVar
from abc import abstractmethod
from omniread.core.content import ContentType
from omniread.core.parser import BaseParser
T = TypeVar("T")
class PDFParser(BaseParser[T], Generic[T]):
    """
    Abstract base class for parsers of PDF content.

    A concrete subclass fixes the output type ``T`` and supplies the
    parsing strategy by implementing ``parse``.
    """

    # Content types this parser declares support for: PDF only.
    supported_types = {ContentType.PDF}

    @abstractmethod
    def parse(self) -> T:
        """
        Convert the PDF content into a structured value of type ``T``.

        Must be overridden by subclasses.
        """
        raise NotImplementedError

32
omniread/pdf/scraper.py Normal file
View File

@@ -0,0 +1,32 @@
from typing import Any, Mapping, Optional
from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper
from omniread.pdf.client import BasePDFClient
class PDFScraper(BaseScraper):
    """
    Scraper that produces ``Content`` objects for PDF sources.

    Retrieving the bytes is left to the injected PDF client; this class
    only packages the result as ``Content`` tagged ``ContentType.PDF``.
    """

    def __init__(self, *, client: BasePDFClient):
        """Store the PDF client used for every subsequent fetch."""
        self._client = client

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch ``source`` through the client and wrap it in ``Content``.

        Args:
            source: Identifier handed unchanged to the client.
            metadata: Optional mapping attached to the result. It is
                copied into a new ``dict``; a missing or empty mapping
                is stored as ``None``.

        Returns:
            A ``Content`` holding the raw bytes, the source and the
            PDF content type.
        """
        pdf_bytes = self._client.fetch(source)
        # Copy so the Content does not share the caller's mapping.
        extra = dict(metadata) if metadata else None
        return Content(
            raw=pdf_bytes,
            source=source,
            content_type=ContentType.PDF,
            metadata=extra,
        )

View File

@@ -6,9 +6,12 @@ from jinja2 import Environment, BaseLoader
from omniread.core.content import ContentType from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper from omniread.html.scraper import HTMLScraper
from omniread.pdf.client import FileSystemPDFClient
from omniread.pdf.scraper import PDFScraper
MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html" MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
def render_html(template_path, data_path) -> bytes: def render_html(template_path, data_path) -> bytes:
@@ -57,3 +60,24 @@ def http_scraper() -> HTMLScraper:
client = httpx.Client(transport=transport) client = httpx.Client(transport=transport)
return HTMLScraper(client=client) return HTMLScraper(client=client)
class MockPDFClient(FileSystemPDFClient):
    """
    PDF client for tests only.

    Maps a logical identifier to the fixture file of the same name
    under ``MOCK_PDF_DIR`` and reads it via the filesystem client.
    """

    def fetch(self, source: str) -> bytes:
        """
        Return the bytes of the fixture registered for ``source``.

        Raises:
            FileNotFoundError: If ``source`` is not a known identifier.
        """
        known_routes = ("simple",)
        if source not in known_routes:
            raise FileNotFoundError(f"No mock PDF route for '{source}'")

        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
        return super().fetch(fixture_path)
@pytest.fixture
def pdf_scraper() -> PDFScraper:
    """Provide a ``PDFScraper`` backed by a fresh ``MockPDFClient``."""
    return PDFScraper(client=MockPDFClient())

View File

@@ -0,0 +1,32 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
72 720 Td
(Simple PDF Test) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000010 00000 n
0000000061 00000 n
0000000116 00000 n
0000000203 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
300
%%EOF

37
tests/test_pdf_simple.py Normal file
View File

@@ -0,0 +1,37 @@
from typing import Literal
from pydantic import BaseModel
from omniread.pdf import PDFParser
from omniread.core.content import Content
class ParsedPDF(BaseModel):
    """Typed output model used by the PDF parser test in this module."""

    # Size of the PDF payload, in bytes.
    size_bytes: int
    # Magic marker; the annotation admits only the literal b"%PDF".
    magic: Literal[b"%PDF"]
class SimplePDFParser(PDFParser[ParsedPDF]):
    """Minimal parser: checks the ``%PDF`` prefix and reports the size."""

    def parse(self) -> ParsedPDF:
        """
        Build a ``ParsedPDF`` from the raw content bytes.

        Raises:
            ValueError: If the raw bytes do not start with ``b"%PDF"``.
        """
        payload = self.content.raw

        if payload.startswith(b"%PDF"):
            return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")

        raise ValueError("Not a valid PDF")
def test_end_to_end_pdf_simple(pdf_scraper):
    """Scrape the "simple" fixture, parse it, and check the typed result."""
    # Step 1: scrape. "simple" is a logical identifier that conftest
    # routes to a fixture file.
    scraped: Content = pdf_scraper.fetch("simple")
    assert scraped.raw.startswith(b"%PDF")

    # Step 2: parse the scraped content into the typed model.
    parsed = SimplePDFParser(scraped).parse()
    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100