- Replace deep imports with top-level omniread exports in tests
- Ensure tests validate only the supported public API surface
- Align HTML and PDF tests with documented library usage
54 lines
1.3 KiB
Python
54 lines
1.3 KiB
Python
from typing import Optional
|
|
|
|
from pydantic import BaseModel
|
|
from bs4 import Tag
|
|
|
|
from omniread import (
|
|
# core
|
|
Content,
|
|
|
|
# html
|
|
HTMLParser,
|
|
)
|
|
|
|
|
|
class ParsedSimpleHTML(BaseModel):
    """Structured result produced by ``SimpleHTMLParser.parse``.

    Every field is nullable and defaults to ``None``.  The defaults are
    spelled out explicitly because pydantic v1 implies a ``None`` default
    for ``Optional[X]`` while pydantic v2 treats such a field as required;
    with an explicit default the model behaves the same under both.
    """

    # Value of the "title" entry in the parser's metadata mapping.
    title: Optional[str] = None
    # Value of the "description" entry inside the metadata's "meta" mapping.
    description: Optional[str] = None
    # Parsed contents of the ``<div id="content">`` element, when present.
    content: Optional[str] = None
    # Parsed form of the first ``<a>`` element in the document, when present.
    link: Optional[str] = None
|
|
|
|
|
|
class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
    """
    HTML parser that extracts a page's high-level semantics.
    """

    def parse(self) -> ParsedSimpleHTML:
        """Collect title, description, main content and first link.

        Returns:
            A ``ParsedSimpleHTML`` whose ``content`` is ``None`` when the
            document has no ``<div id="content">`` and whose ``link`` is
            ``None`` when the document has no ``<a>`` element.
        """
        document = self._soup
        # The result is indexed with "title" and "meta" below; its exact
        # schema is defined by the base class -- TODO confirm.
        metadata = self.parse_meta()

        body_div = document.find("div", id="content")
        anchor: Tag | None = document.find("a")

        page_title = metadata["title"]
        page_description = metadata["meta"].get("description")

        # Only hand elements to the base-class helpers when they were found.
        body_text = None
        if body_div:
            body_text = self.parse_div(body_div)

        href = None
        if anchor:
            href = self.parse_link(anchor)

        return ParsedSimpleHTML(
            title=page_title,
            description=page_description,
            content=body_text,
            link=href,
        )
|
|
|
|
|
|
def test_end_to_end_html_simple(http_scraper):
    """Fetch the simple fixture page and verify every parsed field."""
    fetched: Content = http_scraper.fetch("https://test.local/simple")

    # Scrape -> parse in a single step; the parser instance is not reused.
    parsed = SimpleHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedSimpleHTML)

    # Expected values mirror the fixture page served for this URL.
    assert parsed.title == "Test Page"
    assert parsed.description == "Simple test page"
    assert parsed.content == "Hello World"
    assert parsed.link == "https://example.com"
|