Files
omniread/tests/test_html_table.py
Vishesh 'ironeagle' Bangotra 07293e4651 feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers
- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
2026-01-02 18:31:34 +05:30

45 lines
1.1 KiB
Python

from typing import Optional
from pydantic import BaseModel
from omniread.html.parser import HTMLParser
from omniread.core.content import Content
class ParsedTableHTML(BaseModel):
    """Typed output contract for a page parsed by the table-oriented parser."""

    # Text of the page title; None when the parser found no title.
    title: Optional[str]
    # Table contents as a list of rows, each row a list of cell strings.
    table: list[list[str]]
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.

    Produces a ParsedTableHTML carrying the page title (when present) and
    the rows of the first <table> element found in the document.
    """

    def parse(self) -> ParsedTableHTML:
        """
        Build the typed result from the parsed document.

        Returns:
            A ParsedTableHTML whose ``title`` is the stripped title text, or
            None when the document has no <title> or the title exposes no
            single text node, and whose ``table`` is the result of
            ``self.parse_table`` for the first <table>, or an empty list
            when the document has no table.
        """
        # NOTE(review): presumably a BeautifulSoup tree prepared by the
        # HTMLParser base class — confirm against omniread.html.parser.
        soup = self._soup

        title_tag = soup.title
        # ``.string`` is None for an empty <title> or one with several child
        # nodes; guard it so strip() is never called on None.
        title_text = title_tag.string if title_tag else None
        title = title_text.strip() if title_text is not None else None

        table_tag = soup.find("table")
        rows = self.parse_table(table_tag) if table_tag else []

        return ParsedTableHTML(title=title, table=rows)
def test_end_to_end_html_table(http_scraper):
    """
    Fetch the table page through the ``http_scraper`` fixture, parse it with
    TableHTMLParser, and verify the typed result's title and table rows.
    """
    expected_rows = [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]

    page: Content = http_scraper.fetch("https://test.local/table")
    parsed = TableHTMLParser(page).parse()

    # The parser must return the explicit model type, not a plain dict.
    assert isinstance(parsed, ParsedTableHTML)
    assert parsed.title == "Table Test Page"
    assert parsed.table == expected_rows