from typing import Optional

from pydantic import BaseModel
from bs4 import Tag

from omniread.html.parser import HTMLParser
from omniread.core.content import Content


class ParsedHTML(BaseModel):
    """Typed result returned by ``TestHTMLParser.parse``.

    Each field may be ``None``; ``content`` and ``link`` are ``None`` when
    the corresponding element is not found in the document.
    """

    title: Optional[str]
    description: Optional[str]
    content: Optional[str]
    link: Optional[str]


class TestHTMLParser(HTMLParser[ParsedHTML]):
    """
    Concrete HTML parser with explicit Pydantic return type.
    """

    # The class name starts with "Test", so pytest's default discovery would
    # try to collect this helper as a test class and emit a
    # PytestCollectionWarning (it is constructed with arguments). Opt out
    # explicitly; the test function below is still collected.
    __test__ = False

    def parse(self) -> ParsedHTML:
        """Build a ``ParsedHTML`` from the parsed document.

        Title and description come from ``parse_meta()``; the content is
        taken from the ``<div id="content">`` element and the link from the
        first ``<a>`` element, each left as ``None`` when absent.
        """
        soup = self._soup
        meta = self.parse_meta()

        content_div = soup.find("div", id="content")
        link_tag: Tag | None = soup.find("a")

        return ParsedHTML(
            title=meta["title"],
            description=meta["meta"].get("description"),
            # Compare against None explicitly: "not found" is the only case
            # that should skip the helper call.
            content=self.parse_div(content_div) if content_div is not None else None,
            link=self.parse_link(link_tag) if link_tag is not None else None,
        )


def test_end_to_end_html_scrape_and_parse(http_scraper):
    """Fetch a page through ``http_scraper`` and check the parsed fields."""
    # --- Scrape (real scraper, mocked transport)
    content: Content = http_scraper.fetch("https://test.local")

    # --- Parse
    parser = TestHTMLParser(content)
    result = parser.parse()

    # --- Assertions
    assert isinstance(result, ParsedHTML)
    assert result.title == "Test Page"
    assert result.description == "Simple test page"
    assert result.content == "Hello World"
    assert result.link == "https://example.com"