Files
omniread/omniread/pdf/client.py
Vishesh 'ironeagle' Bangotra de67c7b0b1 feat(pdf): add PDF client, scraper, parser, and end-to-end tests
- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
2026-01-02 18:59:36 +05:30

33 lines
772 B
Python

from abc import ABC, abstractmethod
from pathlib import Path
class BasePDFClient(ABC):
    """
    Interface for objects that load the raw bytes of a PDF document.

    Each concrete subclass targets one backing store (filesystem, S3,
    FTP, etc.) and implements :meth:`fetch` for it.
    """

    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Return the raw PDF bytes identified by ``source``.

        Subclasses must override this method; the base implementation
        only raises ``NotImplementedError``.
        """
        raise NotImplementedError()
class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.
    """

    def fetch(self, path: "str | Path") -> bytes:
        """
        Read and return the raw bytes of the file at ``path``.

        Args:
            path: Location of the PDF on the local filesystem. Both plain
                strings (as declared by the base ``fetch`` contract) and
                ``pathlib.Path`` objects are accepted.

        Returns:
            The complete file contents as bytes.

        Raises:
            FileNotFoundError: If nothing exists at ``path``.
            ValueError: If ``path`` exists but is not a regular file.
        """
        # The base class advertises a ``str`` source, so normalise here;
        # calling ``.exists()`` on a raw string would raise AttributeError.
        path = Path(path)

        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {path}")
        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")
        return path.read_bytes()