- Replace deep imports with top-level omniread exports in tests - Ensure tests validate only the supported public API surface - Align HTML and PDF tests with documented library usage
50 lines
1.1 KiB
Python
50 lines
1.1 KiB
Python
from typing import Optional
|
|
|
|
from pydantic import BaseModel
|
|
|
|
from omniread import (
|
|
# core
|
|
Content,
|
|
|
|
# html
|
|
HTMLParser,
|
|
)
|
|
|
|
|
|
class ParsedTableHTML(BaseModel):
    """Structured result produced by parsing an HTML page with a table."""

    # Text of the page title, or None when no title is available.
    title: Optional[str]
    # Table contents as a list of rows; each row is a list of cell strings.
    table: list[list[str]]
|
|
|
|
|
|
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.

    Produces a ``ParsedTableHTML`` holding the page title and the contents
    of the first ``<table>`` element found in the document.
    """

    def parse(self) -> ParsedTableHTML:
        """
        Extract the page title and the first table from the document.

        Returns:
            A ``ParsedTableHTML`` whose ``title`` is the stripped text of the
            ``<title>`` tag (``None`` when the tag is missing or exposes no
            single string) and whose ``table`` is whatever
            ``self.parse_table`` returns for the first ``<table>`` tag (an
            empty list when the document has no table).
        """
        # NOTE(review): assumes self._soup is a BeautifulSoup document, as the
        # ``.title`` / ``.find`` usage suggests -- confirm against HTMLParser.
        soup = self._soup

        # ``.string`` is None for an empty <title> or one with several
        # children; guard it so ``strip`` never runs on None.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        title = title_text.strip() if title_text is not None else None

        table_tag = soup.find("table")
        rows = self.parse_table(table_tag) if table_tag else []

        return ParsedTableHTML(title=title, table=rows)
|
|
|
|
|
|
def test_end_to_end_html_table(http_scraper):
    """Fetch the table test page and check the parsed title and rows."""
    expected_rows = [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]

    # Fetch through the scraper fixture, then parse with the table parser.
    page: Content = http_scraper.fetch("https://test.local/table")
    parsed = TableHTMLParser(page).parse()

    assert isinstance(parsed, ParsedTableHTML)
    assert parsed.title == "Table Test Page"
    assert parsed.table == expected_rows
|