feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers
- Add smart httpx MockTransport routing based on endpoint paths - Render HTML fixtures via Jinja templates populated from JSON data - Introduce explicit, typed HTML parsers for semantic and table-based content - Add end-to-end tests covering scraper → content → parser → Pydantic models - Enforce explicit output contracts and avoid default dict-based parsing
This commit is contained in:
@@ -1,33 +1,57 @@
|
||||
import json
|
||||
import pytest
|
||||
import httpx
|
||||
from pathlib import Path
|
||||
from jinja2 import Environment, BaseLoader
|
||||
|
||||
from omniread.core.content import ContentType
|
||||
from omniread.html.scraper import HTMLScraper
|
||||
|
||||
|
||||
# Minimal static HTML document kept as an inline fixture.
# NOTE(review): mock_transport now renders its responses from the Jinja
# fixtures in MOCK_DIR, so this constant looks unused after the routing
# change — confirm before removing.
TEST_HTML = b"""
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="description" content="Simple test page">
</head>
<body>
<div id="content">Hello World</div>
<a href="https://example.com">Link</a>
</body>
</html>
"""

# Directory holding the Jinja templates and JSON data files used to render
# mock HTML pages (tests/mocks/ next to this test module).
MOCK_DIR = Path(__file__).parent / "mocks"
|
||||
|
||||
|
||||
def render_html(template_path: Path | str, data_path: Path | str) -> bytes:
    """Render a Jinja HTML template using context loaded from a JSON file.

    Args:
        template_path: Path to the Jinja template file.
        data_path: Path to a JSON file whose top-level object supplies the
            template variables.

    Returns:
        The rendered HTML encoded as UTF-8 bytes, ready to serve as a mock
        HTTP response body.
    """
    template_text = Path(template_path).read_text(encoding="utf-8")
    data = json.loads(Path(data_path).read_text(encoding="utf-8"))

    # autoescape stays off on purpose: the fixture data is trusted test
    # input, and escaping would corrupt the raw markup the parsers expect.
    env = Environment(
        loader=BaseLoader(),
        autoescape=False,
    )
    template = env.from_string(template_text)

    rendered = template.render(**data)
    return rendered.encode("utf-8")
|
||||
|
||||
|
||||
def mock_transport(request: httpx.Request) -> httpx.Response:
    """httpx MockTransport handler.

    Routes by URL path to a rendered HTML fixture:
        /simple -> mocks/simple.html.jinja + mocks/simple.json
        /table  -> mocks/table.html.jinja + mocks/table.json
    Any other path yields a plain 404 response.
    """
    path = request.url.path

    # Map each known endpoint to its (template, data) fixture pair so that
    # adding a new mocked route is a one-line change.
    routes = {
        "/simple": ("simple.html.jinja", "simple.json"),
        "/table": ("table.html.jinja", "table.json"),
    }

    fixture = routes.get(path)
    if fixture is None:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )

    template_name, data_name = fixture
    # Single content= keyword here: the response body always comes from the
    # rendered fixture, never from the inline TEST_HTML constant.
    content = render_html(MOCK_DIR / template_name, MOCK_DIR / data_name)

    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=content,
        request=request,
    )
|
||||
|
||||
@@ -38,8 +62,4 @@ def http_scraper() -> HTMLScraper:
|
||||
|
||||
client = httpx.Client(transport=transport)
|
||||
|
||||
# Patch scraper to use our mocked client
|
||||
scraper = HTMLScraper()
|
||||
scraper._client = client # intentional test-only override
|
||||
|
||||
return scraper
|
||||
return HTMLScraper(client=client)
|
||||
|
||||
11
tests/mocks/simple.html.jinja
Normal file
11
tests/mocks/simple.html.jinja
Normal file
@@ -0,0 +1,11 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>{{ title }}</title>
|
||||
<meta name="description" content="{{ description }}">
|
||||
</head>
|
||||
<body>
|
||||
<div id="content">{{ content }}</div>
|
||||
<a href="{{ link_url }}">{{ link_text }}</a>
|
||||
</body>
|
||||
</html>
|
||||
7
tests/mocks/simple.json
Normal file
7
tests/mocks/simple.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"title": "Test Page",
|
||||
"description": "Simple test page",
|
||||
"content": "Hello World",
|
||||
"link_url": "https://example.com",
|
||||
"link_text": "Link"
|
||||
}
|
||||
31
tests/mocks/table.html.jinja
Normal file
31
tests/mocks/table.html.jinja
Normal file
@@ -0,0 +1,31 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>{{ title }}</title>
|
||||
<meta name="description" content="{{ description }}">
|
||||
</head>
|
||||
<body>
|
||||
<h1>{{ heading }}</h1>
|
||||
|
||||
<table id="{{ table_id }}">
|
||||
<thead>
|
||||
<tr>
|
||||
{% for col in columns %}
|
||||
<th>{{ col }}</th>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for row in rows %}
|
||||
<tr>
|
||||
{% for cell in row %}
|
||||
<td>{{ cell }}</td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<a href="{{ link_url }}">{{ link_text }}</a>
|
||||
</body>
|
||||
</html>
|
||||
14
tests/mocks/table.json
Normal file
14
tests/mocks/table.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"title": "Table Test Page",
|
||||
"description": "HTML page with a table for parsing tests",
|
||||
"heading": "Sample Table",
|
||||
"table_id": "data-table",
|
||||
"columns": ["Name", "Age", "City"],
|
||||
"rows": [
|
||||
["Alice", "30", "London"],
|
||||
["Bob", "25", "New York"],
|
||||
["Charlie", "35", "Berlin"]
|
||||
],
|
||||
"link_url": "https://example.org/details",
|
||||
"link_text": "Details"
|
||||
}
|
||||
@@ -7,26 +7,26 @@ from omniread.html.parser import HTMLParser
|
||||
from omniread.core.content import Content
|
||||
|
||||
|
||||
class ParsedSimpleHTML(BaseModel):
    """Typed output contract for SimpleHTMLParser.

    Every field is optional because the corresponding element may be
    absent from the scraped document.
    """

    # <title> text, if the page has one.
    title: Optional[str]
    # Content of <meta name="description">, if present.
    description: Optional[str]
    # Text of the <div id="content"> element, if present.
    content: Optional[str]
    # Extracted from the first anchor tag, if present.
    link: Optional[str]
|
||||
|
||||
|
||||
class TestHTMLParser(HTMLParser[ParsedHTML]):
|
||||
class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
|
||||
"""
|
||||
Concrete HTML parser with explicit Pydantic return type.
|
||||
Parser focused on high-level page semantics.
|
||||
"""
|
||||
|
||||
def parse(self) -> ParsedHTML:
|
||||
def parse(self) -> ParsedSimpleHTML:
|
||||
soup = self._soup
|
||||
meta = self.parse_meta()
|
||||
|
||||
content_div = soup.find("div", id="content")
|
||||
link_tag: Tag | None = soup.find("a")
|
||||
|
||||
return ParsedHTML(
|
||||
return ParsedSimpleHTML(
|
||||
title=meta["title"],
|
||||
description=meta["meta"].get("description"),
|
||||
content=self.parse_div(content_div) if content_div else None,
|
||||
@@ -34,16 +34,13 @@ class TestHTMLParser(HTMLParser[ParsedHTML]):
|
||||
)
|
||||
|
||||
|
||||
def test_end_to_end_html_simple(http_scraper):
    """End-to-end: scrape /simple via the mocked transport, parse it into
    the typed ParsedSimpleHTML contract, and check the key fields."""
    # Scrape through the real scraper; only the transport is mocked.
    fetched: Content = http_scraper.fetch("https://test.local/simple")

    # Parse into the explicit output model.
    parsed = SimpleHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedSimpleHTML)
    assert parsed.title == "Test Page"
    assert parsed.description == "Simple test page"
|
||||
44
tests/test_html_table.py
Normal file
44
tests/test_html_table.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from omniread.html.parser import HTMLParser
|
||||
from omniread.core.content import Content
|
||||
|
||||
|
||||
class ParsedTableHTML(BaseModel):
    """Typed output contract for TableHTMLParser."""

    # <title> text, if the page has one.
    title: Optional[str]
    # Table contents as rows of cell strings, as extracted by parse_table;
    # empty when the page has no <table>.
    table: list[list[str]]
|
||||
|
||||
|
||||
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.
    """

    def parse(self) -> ParsedTableHTML:
        """Extract the page title and the first <table> as rows of cells.

        Returns:
            ParsedTableHTML with ``title`` (None when absent) and ``table``
            (empty list when the page has no table).
        """
        soup = self._soup

        table_tag = soup.find("table")

        # soup.title.string can be None even when <title> exists (empty tag
        # or mixed children); guard it so .strip() cannot raise
        # AttributeError.
        title_text = soup.title.string if soup.title else None

        return ParsedTableHTML(
            title=title_text.strip() if title_text else None,
            table=self.parse_table(table_tag) if table_tag else [],
        )
|
||||
|
||||
|
||||
def test_end_to_end_html_table(http_scraper):
    """End-to-end: scrape /table via the mocked transport and verify the
    typed table extraction against the JSON fixture data."""
    fetched: Content = http_scraper.fetch("https://test.local/table")

    parsed = TableHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedTableHTML)
    assert parsed.title == "Table Test Page"

    expected_rows = [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]
    assert parsed.table == expected_rows
|
||||
Reference in New Issue
Block a user