- Introduce PDF submodule with client, scraper, and generic parser - Add filesystem PDF client and test-only mock routing - Add end-to-end PDF scrape → parse tests with typed output - Mirror HTML module architecture for consistency - Expose PDF primitives via omniread public API
33 lines
772 B
Python
33 lines
772 B
Python
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
|
|
|
|
class BasePDFClient(ABC):
    """
    Contract for objects that load a PDF document as raw bytes.

    Each concrete subclass targets one backing store (filesystem, S3,
    FTP, etc) and supplies its own ``fetch`` implementation.
    """

    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Return the raw bytes of the PDF identified by ``source``.

        Subclasses must override this method; the base implementation
        only raises ``NotImplementedError``.
        """
        raise NotImplementedError
|
|
|
|
|
|
class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.
    """

    def fetch(self, path: "str | Path") -> bytes:
        """
        Read and return the raw bytes of the PDF stored at ``path``.

        Args:
            path: Location of the PDF on disk. Plain strings are accepted
                as well as ``Path`` objects, so the client can be called
                with the ``str`` source that ``BasePDFClient.fetch``
                declares.

        Returns:
            The complete file contents as bytes.

        Raises:
            FileNotFoundError: If nothing exists at ``path``.
            ValueError: If ``path`` exists but is not a regular file
                (for example, a directory).
        """
        # Normalise first: a bare str has no .exists() / .is_file() /
        # .read_bytes(), so it would otherwise fail with AttributeError.
        pdf_path = Path(path)

        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        if not pdf_path.is_file():
            raise ValueError(f"Path is not a file: {pdf_path}")

        return pdf_path.read_bytes()
|