feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers

- Add path-based routing to the httpx MockTransport handler (`/simple`, `/table`; any other path returns 404)
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
This commit is contained in:
2026-01-02 18:31:34 +05:30
parent fa14a79ec9
commit 07293e4651
8 changed files with 156 additions and 31 deletions

View File

@@ -1,6 +1,7 @@
httpx==0.27.0 httpx==0.27.0
beautifulsoup4==4.12.0 beautifulsoup4==4.12.0
pydantic==2.12.3 pydantic==2.12.3
jinja2==3.1.6
# lxml==5.2.0 # lxml==5.2.0
pytest==7.4.0 pytest==7.4.0

View File

@@ -1,33 +1,57 @@
import json
import pytest import pytest
import httpx import httpx
from pathlib import Path
from jinja2 import Environment, BaseLoader
from omniread.core.content import ContentType from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper from omniread.html.scraper import HTMLScraper
TEST_HTML = b""" MOCK_DIR = Path(__file__).parent / "mocks"
<!DOCTYPE html>
<html>
<head> def render_html(template_path, data_path) -> bytes:
<title>Test Page</title> template_text = Path(template_path).read_text(encoding="utf-8")
<meta name="description" content="Simple test page"> data = json.loads(Path(data_path).read_text(encoding="utf-8"))
</head>
<body> env = Environment(
<div id="content">Hello World</div> loader=BaseLoader(),
<a href="https://example.com">Link</a> autoescape=False,
</body> )
</html> template = env.from_string(template_text)
"""
rendered = template.render(**data)
return rendered.encode("utf-8")
def mock_transport(request: httpx.Request) -> httpx.Response: def mock_transport(request: httpx.Request) -> httpx.Response:
""" """
httpx MockTransport handler. httpx MockTransport handler.
""" """
path = request.url.path
if path == "/simple":
content = render_html(
MOCK_DIR / "simple.html.jinja",
MOCK_DIR / "simple.json",
)
elif path == "/table":
content = render_html(
MOCK_DIR / "table.html.jinja",
MOCK_DIR / "table.json",
)
else:
return httpx.Response(
status_code=404,
content=b"Not Found",
request=request,
)
return httpx.Response( return httpx.Response(
status_code=200, status_code=200,
headers={"Content-Type": ContentType.HTML.value}, headers={"Content-Type": ContentType.HTML.value},
content=TEST_HTML, content=content,
request=request, request=request,
) )
@@ -38,8 +62,4 @@ def http_scraper() -> HTMLScraper:
client = httpx.Client(transport=transport) client = httpx.Client(transport=transport)
# Patch scraper to use our mocked client return HTMLScraper(client=client)
scraper = HTMLScraper()
scraper._client = client # intentional test-only override
return scraper

View File

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<div id="content">{{ content }}</div>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

7
tests/mocks/simple.json Normal file
View File

@@ -0,0 +1,7 @@
{
"title": "Test Page",
"description": "Simple test page",
"content": "Hello World",
"link_url": "https://example.com",
"link_text": "Link"
}

View File

@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<h1>{{ heading }}</h1>
<table id="{{ table_id }}">
<thead>
<tr>
{% for col in columns %}
<th>{{ col }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
{% for row in rows %}
<tr>
{% for cell in row %}
<td>{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

14
tests/mocks/table.json Normal file
View File

@@ -0,0 +1,14 @@
{
"title": "Table Test Page",
"description": "HTML page with a table for parsing tests",
"heading": "Sample Table",
"table_id": "data-table",
"columns": ["Name", "Age", "City"],
"rows": [
["Alice", "30", "London"],
["Bob", "25", "New York"],
["Charlie", "35", "Berlin"]
],
"link_url": "https://example.org/details",
"link_text": "Details"
}

View File

@@ -7,26 +7,26 @@ from omniread.html.parser import HTMLParser
from omniread.core.content import Content from omniread.core.content import Content
class ParsedHTML(BaseModel): class ParsedSimpleHTML(BaseModel):
title: Optional[str] title: Optional[str]
description: Optional[str] description: Optional[str]
content: Optional[str] content: Optional[str]
link: Optional[str] link: Optional[str]
class TestHTMLParser(HTMLParser[ParsedHTML]): class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
""" """
Concrete HTML parser with explicit Pydantic return type. Parser focused on high-level page semantics.
""" """
def parse(self) -> ParsedHTML: def parse(self) -> ParsedSimpleHTML:
soup = self._soup soup = self._soup
meta = self.parse_meta() meta = self.parse_meta()
content_div = soup.find("div", id="content") content_div = soup.find("div", id="content")
link_tag: Tag | None = soup.find("a") link_tag: Tag | None = soup.find("a")
return ParsedHTML( return ParsedSimpleHTML(
title=meta["title"], title=meta["title"],
description=meta["meta"].get("description"), description=meta["meta"].get("description"),
content=self.parse_div(content_div) if content_div else None, content=self.parse_div(content_div) if content_div else None,
@@ -34,16 +34,13 @@ class TestHTMLParser(HTMLParser[ParsedHTML]):
) )
def test_end_to_end_html_scrape_and_parse(http_scraper): def test_end_to_end_html_simple(http_scraper):
# --- Scrape (real scraper, mocked transport) content: Content = http_scraper.fetch("https://test.local/simple")
content: Content = http_scraper.fetch("https://test.local")
# --- Parse parser = SimpleHTMLParser(content)
parser = TestHTMLParser(content)
result = parser.parse() result = parser.parse()
# --- Assertions assert isinstance(result, ParsedSimpleHTML)
assert isinstance(result, ParsedHTML)
assert result.title == "Test Page" assert result.title == "Test Page"
assert result.description == "Simple test page" assert result.description == "Simple test page"

44
tests/test_html_table.py Normal file
View File

@@ -0,0 +1,44 @@
from typing import Optional
from pydantic import BaseModel
from omniread.html.parser import HTMLParser
from omniread.core.content import Content
class ParsedTableHTML(BaseModel):
    """Typed result produced by the table-oriented HTML parser."""

    # Page title text; None when the parser finds no usable title.
    title: Optional[str]
    # Table contents as a list of rows, each row a list of cell strings.
    table: list[list[str]]
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.

    Produces a ``ParsedTableHTML`` holding the page title and the contents
    of the first ``<table>`` element found in the document.
    """

    def parse(self) -> ParsedTableHTML:
        """
        Build a ``ParsedTableHTML`` from the parsed document.

        Returns:
            A model whose ``title`` is the stripped ``<title>`` text (or
            ``None`` when there is no title or it has no single text node)
            and whose ``table`` is whatever ``self.parse_table`` returns for
            the first ``<table>`` tag, or ``[]`` when the page has no table.
        """
        soup = self._soup

        # ``Tag.string`` is None for an empty <title> or one that contains
        # more than one child node; guard so ``.strip()`` is never called
        # on None.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None

        table_tag = soup.find("table")

        return ParsedTableHTML(
            title=title_text.strip() if title_text is not None else None,
            # parse_table is provided outside this block (presumably by
            # HTMLParser -- confirm); only invoked when a table exists.
            table=self.parse_table(table_tag) if table_tag is not None else [],
        )
def test_end_to_end_html_table(http_scraper):
    """Fetch the ``/table`` page, parse it, and check the typed result."""
    expected_table = [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]

    fetched: Content = http_scraper.fetch("https://test.local/table")
    parsed = TableHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedTableHTML)
    assert parsed.title == "Table Test Page"
    assert parsed.table == expected_table