feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers

- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
This commit is contained in:
2026-01-02 18:31:34 +05:30
parent fa14a79ec9
commit 07293e4651
8 changed files with 156 additions and 31 deletions

View File

@@ -1,33 +1,57 @@
import json
import pytest
import httpx
from pathlib import Path
from jinja2 import Environment, BaseLoader
from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper
# Static HTML fixture returned by the mock transport. NOTE(review): the diff
# below replaces this constant with per-endpoint Jinja-rendered fixtures
# (`content=TEST_HTML` -> `content=content`), so this appears to be the
# legacy fixture — confirm whether it can be deleted once unused.
TEST_HTML = b"""
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="description" content="Simple test page">
</head>
<body>
<div id="content">Hello World</div>
<a href="https://example.com">Link</a>
</body>
</html>
"""
# Directory holding the mock fixtures: *.html.jinja templates plus the
# matching *.json files that supply their template variables.
MOCK_DIR = Path(__file__).parent / "mocks"
def render_html(template_path: Path | str, data_path: Path | str) -> bytes:
    """Render a Jinja HTML template with JSON-supplied variables.

    Args:
        template_path: Path to a Jinja template file (read as UTF-8).
        data_path: Path to a JSON file whose top-level object provides
            the template variables.

    Returns:
        The rendered document encoded as UTF-8 bytes, suitable for use
        as an httpx mock response body.
    """
    template_text = Path(template_path).read_text(encoding="utf-8")
    data = json.loads(Path(data_path).read_text(encoding="utf-8"))
    # autoescape stays off on purpose: fixture data is trusted literal
    # text and the tests assert on the exact rendered markup.
    env = Environment(
        loader=BaseLoader(),
        autoescape=False,
    )
    template = env.from_string(template_text)
    return template.render(**data).encode("utf-8")
# Routing table for the mock transport: URL path -> (template, data) fixture.
_ROUTES = {
    "/simple": ("simple.html.jinja", "simple.json"),
    "/table": ("table.html.jinja", "table.json"),
}


def mock_transport(request: httpx.Request) -> httpx.Response:
    """
    httpx MockTransport handler.

    Known endpoint paths return a 200 with the corresponding rendered
    HTML fixture and an HTML Content-Type; any other path returns 404.
    """
    route = _ROUTES.get(request.url.path)
    if route is None:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )
    template_name, data_name = route
    content = render_html(MOCK_DIR / template_name, MOCK_DIR / data_name)
    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=content,
        request=request,
    )
@@ -38,8 +62,4 @@ def http_scraper() -> HTMLScraper:
client = httpx.Client(transport=transport)
# Patch scraper to use our mocked client
scraper = HTMLScraper()
scraper._client = client # intentional test-only override
return scraper
return HTMLScraper(client=client)

View File

@@ -0,0 +1,11 @@
{# Fixture template for the /simple mock endpoint; variables come from tests/mocks/simple.json. #}
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<div id="content">{{ content }}</div>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

7
tests/mocks/simple.json Normal file
View File

@@ -0,0 +1,7 @@
{
"title": "Test Page",
"description": "Simple test page",
"content": "Hello World",
"link_url": "https://example.com",
"link_text": "Link"
}

View File

@@ -0,0 +1,31 @@
{# Fixture template for the /table mock endpoint; variables come from tests/mocks/table.json. #}
{# `columns` renders the <thead> header row; `rows` is a list of cell lists for <tbody>. #}
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<h1>{{ heading }}</h1>
<table id="{{ table_id }}">
<thead>
<tr>
{% for col in columns %}
<th>{{ col }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
{% for row in rows %}
<tr>
{% for cell in row %}
<td>{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

14
tests/mocks/table.json Normal file
View File

@@ -0,0 +1,14 @@
{
"title": "Table Test Page",
"description": "HTML page with a table for parsing tests",
"heading": "Sample Table",
"table_id": "data-table",
"columns": ["Name", "Age", "City"],
"rows": [
["Alice", "30", "London"],
["Bob", "25", "New York"],
["Charlie", "35", "Berlin"]
],
"link_url": "https://example.org/details",
"link_text": "Details"
}

View File

@@ -7,26 +7,26 @@ from omniread.html.parser import HTMLParser
from omniread.core.content import Content
class ParsedSimpleHTML(BaseModel):
    """Typed output contract for parsing the simple fixture page."""

    # Page <title> text, or None when absent.
    title: Optional[str]
    # Content of the <meta name="description"> tag, or None.
    description: Optional[str]
    # Text of the <div id="content"> element, or None.
    content: Optional[str]
    # First anchor on the page, or None.
    link: Optional[str]
class TestHTMLParser(HTMLParser[ParsedHTML]):
class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
"""
Concrete HTML parser with explicit Pydantic return type.
Parser focused on high-level page semantics.
"""
def parse(self) -> ParsedHTML:
def parse(self) -> ParsedSimpleHTML:
soup = self._soup
meta = self.parse_meta()
content_div = soup.find("div", id="content")
link_tag: Tag | None = soup.find("a")
return ParsedHTML(
return ParsedSimpleHTML(
title=meta["title"],
description=meta["meta"].get("description"),
content=self.parse_div(content_div) if content_div else None,
@@ -34,16 +34,13 @@ class TestHTMLParser(HTMLParser[ParsedHTML]):
)
def test_end_to_end_html_simple(http_scraper):
    """End-to-end: scrape /simple through the mocked transport, then parse."""
    # --- Scrape (real scraper, mocked transport)
    content: Content = http_scraper.fetch("https://test.local/simple")
    # --- Parse
    parser = SimpleHTMLParser(content)
    result = parser.parse()
    # --- Assertions: expected values mirror tests/mocks/simple.json
    assert isinstance(result, ParsedSimpleHTML)
    assert result.title == "Test Page"
    assert result.description == "Simple test page"

44
tests/test_html_table.py Normal file
View File

@@ -0,0 +1,44 @@
from typing import Optional
from pydantic import BaseModel
from omniread.html.parser import HTMLParser
from omniread.core.content import Content
class ParsedTableHTML(BaseModel):
    """Typed output contract for table extraction.

    ``title`` is the page <title> text (None when the page has no title);
    ``table`` is the header row followed by the body rows, every cell as a
    string, or an empty list when no <table> was found.
    """

    title: Optional[str]
    table: list[list[str]]
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """Extracts the page title and the first <table> into ParsedTableHTML."""

    def parse(self) -> ParsedTableHTML:
        document = self._soup

        # Title: stripped text of <title>, or None when the tag is missing.
        title_tag = document.title
        page_title = title_tag.string.strip() if title_tag else None

        # Table: parse the first <table> found; no table yields an empty list.
        first_table = document.find("table")
        rows = self.parse_table(first_table) if first_table else []

        return ParsedTableHTML(title=page_title, table=rows)
def test_end_to_end_html_table(http_scraper):
    """End-to-end table extraction; expected values mirror tests/mocks/table.json."""
    scraped: Content = http_scraper.fetch("https://test.local/table")
    result = TableHTMLParser(scraped).parse()

    assert isinstance(result, ParsedTableHTML)
    assert result.title == "Table Test Page"

    expected_rows = [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]
    assert result.table == expected_rows