diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..055b4ee
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,60 @@
+"""Shared pytest fixtures for the omniread test suite."""
+
+from typing import Iterator
+
+import pytest
+import httpx
+
+from omniread.core.content import ContentType
+from omniread.html.scraper import HTMLScraper
+
+
+# Minimal HTML document served by the mocked transport. It carries the
+# elements the end-to-end test asserts on: a title, a description, a
+# ``#content`` div and a single link.
+# NOTE(review): the description is assumed to be read from
+# ``<meta name="description" content=...>``; confirm against parse_meta().
+TEST_HTML = b"""
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page</title>
+    <meta name="description" content="Simple test page">
+</head>
+<body>
+    <div id="content">Hello World</div>
+    <a href="https://example.com">Link</a>
+</body>
+</html>
+"""
+
+
+def mock_transport(request: httpx.Request) -> httpx.Response:
+    """
+    httpx MockTransport handler.
+
+    Answers every request with a 200 response whose body is ``TEST_HTML``
+    and whose ``Content-Type`` header is ``ContentType.HTML.value``.
+    """
+    return httpx.Response(
+        status_code=200,
+        headers={"Content-Type": ContentType.HTML.value},
+        content=TEST_HTML,
+        request=request,
+    )
+
+
+@pytest.fixture
+def http_scraper() -> Iterator[HTMLScraper]:
+    """
+    Yield an ``HTMLScraper`` whose client is backed by ``mock_transport``.
+
+    The mocked client is closed once the requesting test has finished.
+    """
+    transport = httpx.MockTransport(mock_transport)
+
+    # The context manager closes the client during fixture teardown.
+    with httpx.Client(transport=transport) as client:
+        # Patch scraper to use our mocked client
+        scraper = HTMLScraper()
+        scraper._client = client  # intentional test-only override
+
+        yield scraper
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..a26b075
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,59 @@
+from typing import Optional
+
+from pydantic import BaseModel
+from bs4 import Tag
+
+from omniread.html.parser import HTMLParser
+from omniread.core.content import Content
+
+
+class ParsedHTML(BaseModel):
+    """Typed result returned by ``TestHTMLParser.parse``."""
+
+    title: Optional[str]
+    description: Optional[str]
+    content: Optional[str]
+    link: Optional[str]
+
+
+class TestHTMLParser(HTMLParser[ParsedHTML]):
+    """
+    Concrete HTML parser with explicit Pydantic return type.
+    """
+
+    # The ``Test`` prefix matches pytest's class-collection pattern; opt out
+    # so this helper is not treated as a test class.
+    __test__ = False
+
+    def parse(self) -> ParsedHTML:
+        """Build a ``ParsedHTML`` from the page metadata, div and link."""
+        soup = self._soup
+        meta = self.parse_meta()
+
+        content_div = soup.find("div", id="content")
+        link_tag: Optional[Tag] = soup.find("a")
+
+        return ParsedHTML(
+            title=meta["title"],
+            description=meta["meta"].get("description"),
+            content=self.parse_div(content_div) if content_div is not None else None,
+            link=self.parse_link(link_tag) if link_tag is not None else None,
+        )
+
+
+def test_end_to_end_html_scrape_and_parse(http_scraper):
+    """Scrape the mocked page and parse it into ``ParsedHTML``."""
+    # --- Scrape (real scraper, mocked transport)
+    content: Content = http_scraper.fetch("https://test.local")
+
+    # --- Parse
+    parser = TestHTMLParser(content)
+    result = parser.parse()
+
+    # --- Assertions
+    assert isinstance(result, ParsedHTML)
+
+    assert result.title == "Test Page"
+    assert result.description == "Simple test page"
+    assert result.content == "Hello World"
+    assert result.link == "https://example.com"