from typing import Optional

from pydantic import BaseModel

from omniread.html.parser import HTMLParser
from omniread.core.content import Content


class ParsedTableHTML(BaseModel):
    """Structured result produced by ``TableHTMLParser.parse``."""

    # Whitespace-stripped text of the document's <title>; None when the
    # document has no <title> or the title has no single text string.
    title: Optional[str]
    # Rows of the table as lists of strings; [] when no <table> was found.
    table: list[list[str]]


class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.
    """

    def parse(self) -> ParsedTableHTML:
        """Extract the page title and a ``<table>`` from the parsed document.

        Returns:
            A ``ParsedTableHTML`` whose ``title`` is the stripped title text
            (or ``None`` when unavailable) and whose ``table`` is whatever
            ``self.parse_table`` returns for the tag found by
            ``soup.find("table")``, or ``[]`` when no such tag exists.
        """
        # NOTE(review): `_soup` and `parse_table` are not defined in this
        # class; presumably they come from HTMLParser and `_soup` is a
        # BeautifulSoup document -- confirm against the base class.
        soup = self._soup

        title_tag = soup.title
        raw_title = title_tag.string if title_tag else None
        # `.string` can be None even when a <title> tag exists (e.g. an
        # empty tag or one with several child nodes); calling .strip() on it
        # unguarded would raise AttributeError, so fall back to None.
        title = raw_title.strip() if raw_title is not None else None

        table_tag = soup.find("table")
        rows = self.parse_table(table_tag) if table_tag else []

        return ParsedTableHTML(title=title, table=rows)


def test_end_to_end_html_table(http_scraper):
    """End-to-end check: fetch a page, parse it, and verify title and rows.

    ``http_scraper`` is a fixture supplied outside this file; it is expected
    to return a ``Content`` object for the requested URL.
    """
    content: Content = http_scraper.fetch("https://test.local/table")

    parser = TableHTMLParser(content)
    result = parser.parse()

    assert isinstance(result, ParsedTableHTML)
    assert result.title == "Table Test Page"
    assert result.table == [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]