Files
omniread/tests/conftest.py
Vishesh 'ironeagle' Bangotra de67c7b0b1 feat(pdf): add PDF client, scraper, parser, and end-to-end tests
- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
2026-01-02 18:59:36 +05:30

84 lines
2.1 KiB
Python

import json
import pytest
import httpx
from pathlib import Path
from jinja2 import Environment, BaseLoader
from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper
from omniread.pdf.client import FileSystemPDFClient
from omniread.pdf.scraper import PDFScraper
# Fixture directories, located relative to the directory containing this file.
MOCK_HTML_DIR = Path(__file__).parent.joinpath("mocks", "html")
MOCK_PDF_DIR = Path(__file__).parent.joinpath("mocks", "pdf")
def render_html(template_path, data_path) -> bytes:
    """
    Render a Jinja template file using a JSON file as its context.

    Both files are read as UTF-8. The JSON document is expanded into
    keyword arguments for the template, so it must decode to a mapping.
    Autoescaping is disabled, so values are inserted verbatim.

    Returns the rendered document encoded as UTF-8 bytes.
    """
    # Read the template before the data so a missing template is reported first.
    source_text = Path(template_path).read_text(encoding="utf-8")
    context = json.loads(Path(data_path).read_text(encoding="utf-8"))

    jinja_env = Environment(loader=BaseLoader(), autoescape=False)
    return jinja_env.from_string(source_text).render(**context).encode("utf-8")
def mock_transport(request: httpx.Request) -> httpx.Response:
    """
    httpx MockTransport handler.

    Requests for ``/simple`` and ``/table`` are answered with the rendered
    HTML fixture of the same name (status 200, HTML content type).
    Any other path receives a 404 response with the body ``Not Found``.
    """
    route = request.url.path

    if route in ['/simple', '/table']:
        # The last path segment doubles as the fixture file stem.
        fixture_name = route.rpartition("/")[2]
        body = render_html(
            MOCK_HTML_DIR / f"{fixture_name}.html.jinja",
            MOCK_HTML_DIR / f"{fixture_name}.json",
        )
        return httpx.Response(
            status_code=200,
            headers={"Content-Type": ContentType.HTML.value},
            content=body,
            request=request,
        )

    # Unknown route.
    return httpx.Response(
        status_code=404,
        content=b"Not Found",
        request=request,
    )
@pytest.fixture
def http_scraper() -> HTMLScraper:
    """
    Provide an HTMLScraper whose httpx client is backed by ``mock_transport``,
    so requests are answered by the mock handler instead of a real server.
    """
    mocked_client = httpx.Client(transport=httpx.MockTransport(mock_transport))
    return HTMLScraper(client=mocked_client)
class MockPDFClient(FileSystemPDFClient):
    """
    Test-only PDF client that routes logical identifiers
    to fixture files.
    """

    def fetch(self, source: str) -> bytes:
        """
        Map a known logical identifier to ``<identifier>.pdf`` under
        MOCK_PDF_DIR and delegate the read to the parent client.

        Raises FileNotFoundError when no route exists for ``source``.
        """
        # Reject unknown identifiers first; "simple" is the only route.
        if source not in ["simple"]:
            raise FileNotFoundError(f"No mock PDF route for '{source}'")

        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
        return super().fetch(fixture_path)
@pytest.fixture
def pdf_scraper() -> PDFScraper:
    """
    Provide a PDFScraper that uses MockPDFClient as its client.
    """
    return PDFScraper(client=MockPDFClient())