feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers
- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
This commit is contained in:
@@ -1,33 +1,57 @@
|
||||
import json
|
||||
import pytest
|
||||
import httpx
|
||||
from pathlib import Path
|
||||
from jinja2 import Environment, BaseLoader
|
||||
|
||||
from omniread.core.content import ContentType
|
||||
from omniread.html.scraper import HTMLScraper
|
||||
|
||||
|
||||
# Static HTML document kept as a raw bytes literal: a title, a meta
# description, a "content" div and a single link.
# NOTE(review): the `|` / `||||` lines inside the literal look like
# diff-viewer residue rather than intended markup — confirm against the
# repository before relying on the exact bytes.
TEST_HTML = b"""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Test Page</title>
|
||||
<meta name="description" content="Simple test page">
|
||||
</head>
|
||||
<body>
|
||||
<div id="content">Hello World</div>
|
||||
<a href="https://example.com">Link</a>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
# "mocks" directory located next to this file; the mock routes load their
# Jinja templates and JSON data files from here.
MOCK_DIR = Path(__file__).parent / "mocks"
|
||||
|
||||
|
||||
def render_html(template_path, data_path) -> bytes:
    """Render a Jinja template file with JSON data and return UTF-8 bytes.

    Args:
        template_path: Location of the Jinja template, read as UTF-8 text.
        data_path: Location of a JSON file, read as UTF-8 text. The decoded
            value is unpacked as keyword arguments into the template, so it
            has to be a JSON object.

    Returns:
        The rendered document encoded as UTF-8.
    """
    source = Path(template_path).read_text(encoding="utf-8")
    context = json.loads(Path(data_path).read_text(encoding="utf-8"))

    # Autoescaping is off: context values are inserted into the markup as-is.
    jinja_env = Environment(loader=BaseLoader(), autoescape=False)
    return jinja_env.from_string(source).render(**context).encode("utf-8")
|
||||
|
||||
|
||||
def mock_transport(request: httpx.Request) -> httpx.Response:
    """httpx MockTransport handler that routes on the request path.

    ``/simple`` and ``/table`` are answered with HTML rendered from the
    matching template/data pair under ``MOCK_DIR``. Every other path gets
    a 404 response with the body ``b"Not Found"``.

    Args:
        request: The outgoing request intercepted by the mock transport.

    Returns:
        A 200 response carrying the rendered HTML with the HTML content
        type header, or a 404 response for unknown paths.
    """
    # Route table: request path -> (template file, JSON data file).
    routes = {
        "/simple": ("simple.html.jinja", "simple.json"),
        "/table": ("table.html.jinja", "table.json"),
    }

    fixture = routes.get(request.url.path)
    if fixture is None:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )

    template_name, data_name = fixture
    content = render_html(MOCK_DIR / template_name, MOCK_DIR / data_name)

    # Only the routed content is sent; passing `content=` twice (once with
    # the static page) is a repeated keyword argument and does not parse.
    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=content,
        request=request,
    )
|
||||
|
||||
@@ -38,8 +62,4 @@ def http_scraper() -> HTMLScraper:
|
||||
|
||||
client = httpx.Client(transport=transport)
|
||||
|
||||
# Patch scraper to use our mocked client
|
||||
scraper = HTMLScraper()
|
||||
scraper._client = client # intentional test-only override
|
||||
|
||||
return scraper
|
||||
return HTMLScraper(client=client)
|
||||
|
||||
Reference in New Issue
Block a user