from typing import Optional

from bs4 import Tag
from pydantic import BaseModel

from omniread.core.content import Content
from omniread.html.parser import HTMLParser


class ParsedSimpleHTML(BaseModel):
    """Structured result returned by ``SimpleHTMLParser.parse``.

    Every field is typed as nullable; ``parse`` passes ``None`` for
    ``content`` or ``link`` when the matching element is absent.
    """

    # Value stored under the "title" key of the parsed metadata.
    title: Optional[str]
    # The "description" entry of the metadata's "meta" mapping, if any.
    description: Optional[str]
    # Output of ``parse_div`` for the ``<div id="content">`` element.
    content: Optional[str]
    # Output of ``parse_link`` for the first ``<a>`` element found.
    link: Optional[str]


class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
    """
    Parser that reduces a page to its title, description, main content
    block and first link.
    """

    def parse(self) -> ParsedSimpleHTML:
        """Build a ``ParsedSimpleHTML`` from the parser's soup.

        Relies on the inherited ``_soup`` attribute and the inherited
        ``parse_meta``, ``parse_div`` and ``parse_link`` helpers.

        Returns:
            The populated model. ``content`` and ``link`` are ``None``
            when the corresponding element is not found.

        Raises:
            KeyError: If the metadata mapping lacks a "title" or "meta" key.
        """
        document = self._soup
        metadata = self.parse_meta()

        # ``find`` yields the first matching element, or None when absent.
        body_div = document.find("div", id="content")
        first_anchor: Tag | None = document.find("a")

        # Metadata lookups come first so a missing key raises before any
        # element parsing is attempted.
        page_title = metadata["title"]
        page_description = metadata["meta"].get("description")

        body_text = None
        if body_div:
            body_text = self.parse_div(body_div)

        href = None
        if first_anchor:
            href = self.parse_link(first_anchor)

        return ParsedSimpleHTML(
            title=page_title,
            description=page_description,
            content=body_text,
            link=href,
        )


def test_end_to_end_html_simple(http_scraper):
    """Fetch the simple test page, parse it, and check every field.

    ``http_scraper`` is presumably a pytest fixture that serves
    ``https://test.local/simple`` without real network access -- confirm
    against the project's conftest.
    """
    fetched: Content = http_scraper.fetch("https://test.local/simple")

    parsed = SimpleHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedSimpleHTML)

    assert parsed.title == "Test Page"
    assert parsed.description == "Simple test page"
    assert parsed.content == "Hello World"
    assert parsed.link == "https://example.com"