diff --git a/requirements.txt b/requirements.txt index 3a227ed..a356612 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ httpx==0.27.0 beautifulsoup4==4.12.0 pydantic==2.12.3 +jinja2==3.1.6 # lxml==5.2.0 pytest==7.4.0 diff --git a/tests/conftest.py b/tests/conftest.py index 055b4ee..c9eca80 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,33 +1,57 @@ +import json import pytest import httpx +from pathlib import Path +from jinja2 import Environment, BaseLoader from omniread.core.content import ContentType from omniread.html.scraper import HTMLScraper -TEST_HTML = b""" - - - - Test Page - - - -
Hello World
- Link - - -""" +MOCK_DIR = Path(__file__).parent / "mocks" + + +def render_html(template_path, data_path) -> bytes: + template_text = Path(template_path).read_text(encoding="utf-8") + data = json.loads(Path(data_path).read_text(encoding="utf-8")) + + env = Environment( + loader=BaseLoader(), + autoescape=False, + ) + template = env.from_string(template_text) + + rendered = template.render(**data) + return rendered.encode("utf-8") def mock_transport(request: httpx.Request) -> httpx.Response: """ httpx MockTransport handler. """ + path = request.url.path + + if path == "/simple": + content = render_html( + MOCK_DIR / "simple.html.jinja", + MOCK_DIR / "simple.json", + ) + elif path == "/table": + content = render_html( + MOCK_DIR / "table.html.jinja", + MOCK_DIR / "table.json", + ) + else: + return httpx.Response( + status_code=404, + content=b"Not Found", + request=request, + ) + return httpx.Response( status_code=200, headers={"Content-Type": ContentType.HTML.value}, - content=TEST_HTML, + content=content, request=request, ) @@ -38,8 +62,4 @@ def http_scraper() -> HTMLScraper: client = httpx.Client(transport=transport) - # Patch scraper to use our mocked client - scraper = HTMLScraper() - scraper._client = client # intentional test-only override - - return scraper + return HTMLScraper(client=client) diff --git a/tests/mocks/simple.html.jinja b/tests/mocks/simple.html.jinja new file mode 100644 index 0000000..a3b25d9 --- /dev/null +++ b/tests/mocks/simple.html.jinja @@ -0,0 +1,11 @@ + + + + {{ title }} + + + +
{{ content }}
+ {{ link_text }} + + diff --git a/tests/mocks/simple.json b/tests/mocks/simple.json new file mode 100644 index 0000000..fba6f6f --- /dev/null +++ b/tests/mocks/simple.json @@ -0,0 +1,7 @@ +{ + "title": "Test Page", + "description": "Simple test page", + "content": "Hello World", + "link_url": "https://example.com", + "link_text": "Link" +} diff --git a/tests/mocks/table.html.jinja b/tests/mocks/table.html.jinja new file mode 100644 index 0000000..9bcba3c --- /dev/null +++ b/tests/mocks/table.html.jinja @@ -0,0 +1,31 @@ + + + + {{ title }} + + + +

{{ heading }}

+ + + + + {% for col in columns %} + + {% endfor %} + + + + {% for row in rows %} + + {% for cell in row %} + + {% endfor %} + + {% endfor %} + +
{{ col }}
{{ cell }}
+ + {{ link_text }} + + diff --git a/tests/mocks/table.json b/tests/mocks/table.json new file mode 100644 index 0000000..ea0815b --- /dev/null +++ b/tests/mocks/table.json @@ -0,0 +1,14 @@ +{ + "title": "Table Test Page", + "description": "HTML page with a table for parsing tests", + "heading": "Sample Table", + "table_id": "data-table", + "columns": ["Name", "Age", "City"], + "rows": [ + ["Alice", "30", "London"], + ["Bob", "25", "New York"], + ["Charlie", "35", "Berlin"] + ], + "link_url": "https://example.org/details", + "link_text": "Details" +} diff --git a/tests/test_html.py b/tests/test_html_simple.py similarity index 65% rename from tests/test_html.py rename to tests/test_html_simple.py index a26b075..c5129d2 100644 --- a/tests/test_html.py +++ b/tests/test_html_simple.py @@ -7,26 +7,26 @@ from omniread.html.parser import HTMLParser from omniread.core.content import Content -class ParsedHTML(BaseModel): +class ParsedSimpleHTML(BaseModel): title: Optional[str] description: Optional[str] content: Optional[str] link: Optional[str] -class TestHTMLParser(HTMLParser[ParsedHTML]): +class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]): """ - Concrete HTML parser with explicit Pydantic return type. + Parser focused on high-level page semantics. """ - def parse(self) -> ParsedHTML: + def parse(self) -> ParsedSimpleHTML: soup = self._soup meta = self.parse_meta() content_div = soup.find("div", id="content") link_tag: Tag | None = soup.find("a") - return ParsedHTML( + return ParsedSimpleHTML( title=meta["title"], description=meta["meta"].get("description"), content=self.parse_div(content_div) if content_div else None, @@ -34,16 +34,13 @@ class TestHTMLParser(HTMLParser[ParsedHTML]): ) -def test_end_to_end_html_scrape_and_parse(http_scraper): - # --- Scrape (real scraper, mocked transport) - content: Content = http_scraper.fetch("https://test.local") +def test_end_to_end_html_simple(http_scraper): + content: Content = http_scraper.fetch("https://test.local/simple") - # --- Parse - parser = TestHTMLParser(content) + parser = SimpleHTMLParser(content) result = parser.parse() - # --- Assertions - assert isinstance(result, ParsedHTML) + assert isinstance(result, ParsedSimpleHTML) assert result.title == "Test Page" assert result.description == "Simple test page" diff --git a/tests/test_html_table.py b/tests/test_html_table.py new file mode 100644 index 0000000..e7b998a --- /dev/null +++ b/tests/test_html_table.py @@ -0,0 +1,44 @@ +from typing import Optional + +from pydantic import BaseModel + +from omniread.html.parser import HTMLParser +from omniread.core.content import Content + + +class ParsedTableHTML(BaseModel): + title: Optional[str] + table: list[list[str]] + + +class TableHTMLParser(HTMLParser[ParsedTableHTML]): + """ + Parser focused on extracting tabular data. + """ + + def parse(self) -> ParsedTableHTML: + soup = self._soup + + table_tag = soup.find("table") + + return ParsedTableHTML( + title=soup.title.string.strip() if soup.title else None, + table=self.parse_table(table_tag) if table_tag else [], + ) + + +def test_end_to_end_html_table(http_scraper): + content: Content = http_scraper.fetch("https://test.local/table") + + parser = TableHTMLParser(content) + result = parser.parse() + + assert isinstance(result, ParsedTableHTML) + + assert result.title == "Table Test Page" + assert result.table == [ + ["Name", "Age", "City"], + ["Alice", "30", "London"], + ["Bob", "25", "New York"], + ["Charlie", "35", "Berlin"], + ]