feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers

- Add path-based request routing to the httpx MockTransport handler
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
This commit is contained in:
2026-01-02 18:31:34 +05:30
parent fa14a79ec9
commit 07293e4651
8 changed files with 156 additions and 31 deletions

View File

@@ -1,33 +1,57 @@
import json
import pytest
import httpx
from pathlib import Path
from jinja2 import Environment, BaseLoader
from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper
# Minimal static HTML page kept inline as a bytes fixture: a <title>, a
# description <meta> tag, one <div id="content"> and one external link.
TEST_HTML = b"""
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="description" content="Simple test page">
</head>
<body>
<div id="content">Hello World</div>
<a href="https://example.com">Link</a>
</body>
</html>
"""
# Directory named "mocks" located next to this file; paths to the
# ``*.html.jinja`` templates and ``*.json`` data files are built from it.
MOCK_DIR = Path(__file__).parent / "mocks"
def render_html(template_path, data_path) -> bytes:
    """Render a Jinja template file with JSON data and return UTF-8 bytes.

    Args:
        template_path: Path to a file holding Jinja template source
            (read as UTF-8).
        data_path: Path to a JSON file (read as UTF-8); its decoded
            content is unpacked as keyword arguments into the template
            context.

    Returns:
        The rendered template text encoded as UTF-8.
    """
    source = Path(template_path).read_text(encoding="utf-8")

    with Path(data_path).open(encoding="utf-8") as handle:
        context = json.load(handle)

    # The template is compiled from a string, so the loader is only a
    # placeholder; autoescaping is switched off explicitly.
    jinja_env = Environment(loader=BaseLoader(), autoescape=False)
    return jinja_env.from_string(source).render(**context).encode("utf-8")
def mock_transport(request: httpx.Request) -> httpx.Response:
    """Answer a mocked HTTP request based on the request URL path.

    Handler intended for ``httpx.MockTransport``.

    Args:
        request: The outgoing request intercepted by the mock transport.

    Returns:
        For ``/simple`` and ``/table``: a 200 response whose body is the
        matching Jinja template under ``MOCK_DIR`` rendered with the
        matching JSON data file, with ``Content-Type`` set to
        ``ContentType.HTML.value``. For any other path: a 404 response
        with body ``b"Not Found"``.
    """
    # URL path -> fixture base name; each fixture is the pair
    # "<name>.html.jinja" + "<name>.json" inside MOCK_DIR.
    fixtures = {
        "/simple": "simple",
        "/table": "table",
    }

    fixture = fixtures.get(request.url.path)
    if fixture is None:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )

    body = render_html(
        MOCK_DIR / f"{fixture}.html.jinja",
        MOCK_DIR / f"{fixture}.json",
    )

    # Only the rendered fixture is sent as the body; passing ``content``
    # twice (static page plus rendered page) is a syntax error.
    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=body,
        request=request,
    )
@@ -38,8 +62,4 @@ def http_scraper() -> HTMLScraper:
client = httpx.Client(transport=transport)
# Patch scraper to use our mocked client
scraper = HTMLScraper()
scraper._client = client # intentional test-only override
return scraper
return HTMLScraper(client=client)