feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers
- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
This commit is contained in:
44
tests/test_html_table.py
Normal file
44
tests/test_html_table.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from omniread.html.parser import HTMLParser
|
||||
from omniread.core.content import Content
|
||||
|
||||
|
||||
class ParsedTableHTML(BaseModel):
    # Typed output contract produced by TableHTMLParser.parse().

    # Text of the page title; None is an accepted value.
    title: Optional[str]

    # Table contents as a list of rows, each row a list of cell strings.
    table: list[list[str]]
|
||||
|
||||
|
||||
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.

    Produces a ``ParsedTableHTML`` holding the document title (when one is
    available) and the rows extracted from a ``<table>`` element.
    """

    def parse(self) -> ParsedTableHTML:
        """Build a ``ParsedTableHTML`` from the parsed document.

        Returns:
            A model whose ``title`` is the stripped title string, or ``None``
            when there is no title tag or the tag has no string value, and
            whose ``table`` is the result of ``self.parse_table`` on the tag
            returned by ``soup.find("table")``, or ``[]`` when no such tag
            is found.
        """
        # NOTE(review): assumes self._soup is a BeautifulSoup document set up
        # by the HTMLParser base class -- confirm against omniread.html.parser.
        soup = self._soup

        # A title tag can exist while its ``.string`` is None (e.g. an empty
        # <title>), so guard before stripping to avoid an AttributeError.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        title = title_text.strip() if title_text is not None else None

        table_tag = soup.find("table")
        table = self.parse_table(table_tag) if table_tag is not None else []

        return ParsedTableHTML(title=title, table=table)
|
||||
|
||||
|
||||
def test_end_to_end_html_table(http_scraper):
    """Fetch the table page, parse it, and check the typed result end to end."""
    expected_rows = [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]

    fetched: Content = http_scraper.fetch("https://test.local/table")
    parsed = TableHTMLParser(fetched).parse()

    # The parser must hand back the explicit model, not a plain dict.
    assert isinstance(parsed, ParsedTableHTML)

    assert parsed.title == "Table Test Page"
    assert parsed.table == expected_rows
|
||||
Reference in New Issue
Block a user