feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers

- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
This commit is contained in:
2026-01-02 18:31:34 +05:30
parent fa14a79ec9
commit 07293e4651
8 changed files with 156 additions and 31 deletions

48
tests/test_html_simple.py Normal file
View File

@@ -0,0 +1,48 @@
from typing import Optional
from pydantic import BaseModel
from bs4 import Tag
from omniread.html.parser import HTMLParser
from omniread.core.content import Content
class ParsedSimpleHTML(BaseModel):
    """Typed output contract returned by ``SimpleHTMLParser.parse``.

    Each field is declared as ``Optional[str]`` with no default value in
    this class; ``SimpleHTMLParser.parse`` always passes all four fields
    explicitly.
    """

    # Populated from the "title" entry of the parser's parse_meta() result.
    title: Optional[str]

    # Populated from the "description" entry of the parsed meta mapping.
    description: Optional[str]

    # Result of parse_div() on the <div id="content"> element; None when
    # that element is not found.
    content: Optional[str]

    # Result of parse_link() on the first <a> element; None when no anchor
    # is found.
    link: Optional[str]
class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
    """
    Parser that extracts high-level page semantics into ``ParsedSimpleHTML``.
    """

    def parse(self) -> ParsedSimpleHTML:
        """
        Build a ``ParsedSimpleHTML`` from the parsed document.

        Reads the title and description from ``self.parse_meta()``, the
        body text from the ``<div id="content">`` element and the link
        from the first ``<a>`` element. ``content`` and ``link`` are
        ``None`` when the corresponding element is absent.

        NOTE(review): ``parse_meta``, ``parse_div`` and ``parse_link`` are
        inherited helpers not visible here; their exact return shapes
        should be confirmed against ``HTMLParser``.
        """
        document = self._soup
        page_meta = self.parse_meta()

        body_div = document.find("div", id="content")
        first_anchor: Tag | None = document.find("a")

        # "title" is looked up directly (a missing key raises), whereas a
        # missing "description" falls back to None via .get().
        page_title = page_meta["title"]
        page_description = page_meta["meta"].get("description")

        body_text = None
        if body_div:
            body_text = self.parse_div(body_div)

        href = None
        if first_anchor:
            href = self.parse_link(first_anchor)

        return ParsedSimpleHTML(
            title=page_title,
            description=page_description,
            content=body_text,
            link=href,
        )
def test_end_to_end_html_simple(http_scraper):
    """
    Fetch the simple test page, parse it, and verify every extracted field.

    Exercises the chain scraper -> ``Content`` -> ``SimpleHTMLParser`` ->
    ``ParsedSimpleHTML``.

    NOTE(review): ``http_scraper`` is presumably a pytest fixture serving
    the page from a mocked transport; confirm in the test configuration.
    """
    fetched: Content = http_scraper.fetch("https://test.local/simple")

    parsed = SimpleHTMLParser(fetched).parse()

    # The parser must return the typed model, not a plain dict.
    assert isinstance(parsed, ParsedSimpleHTML)

    assert parsed.title == "Test Page"
    assert parsed.description == "Simple test page"
    assert parsed.content == "Hello World"
    assert parsed.link == "https://example.com"