diff --git a/requirements.txt b/requirements.txt
index 3a227ed..a356612 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
httpx==0.27.0
beautifulsoup4==4.12.0
pydantic==2.12.3
+jinja2==3.1.6
# lxml==5.2.0
pytest==7.4.0
diff --git a/tests/conftest.py b/tests/conftest.py
index 055b4ee..c9eca80 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,33 +1,57 @@
+import json
import pytest
import httpx
+from pathlib import Path
+from jinja2 import Environment, BaseLoader
from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper
-TEST_HTML = b"""
-
-
-
- Test Page
-
-
-
- Hello World
- Link
-
-
-"""
+MOCK_DIR = Path(__file__).parent / "mocks"
+
+
+def render_html(template_path, data_path) -> bytes:
+    """Render a Jinja template file with a JSON context and return UTF-8 bytes."""
+    source = Path(template_path).read_text(encoding="utf-8")
+    context = json.loads(Path(data_path).read_text(encoding="utf-8"))
+
+    # The template is compiled from text, so a bare BaseLoader is sufficient;
+    # autoescape stays off so context values are inserted verbatim.
+    jinja_env = Environment(loader=BaseLoader(), autoescape=False)
+    compiled = jinja_env.from_string(source)
+
+    html = compiled.render(**context)
+    return html.encode("utf-8")
def mock_transport(request: httpx.Request) -> httpx.Response:
"""
httpx MockTransport handler.
"""
+ path = request.url.path
+ # Known routes are rendered from the template/JSON pairs under MOCK_DIR.
+ if path == "/simple":
+ content = render_html(
+ MOCK_DIR / "simple.html.jinja",
+ MOCK_DIR / "simple.json",
+ )
+ elif path == "/table":
+ content = render_html(
+ MOCK_DIR / "table.html.jinja",
+ MOCK_DIR / "table.json",
+ )
+ else:
+ return httpx.Response(
+ status_code=404,
+ content=b"Not Found",
+ request=request,
+ )
+ # Reached only for a known route; unknown paths already returned the 404.
return httpx.Response(
status_code=200,
headers={"Content-Type": ContentType.HTML.value},
- content=TEST_HTML,
+ content=content,
request=request,
)
@@ -38,8 +62,4 @@ def http_scraper() -> HTMLScraper:
client = httpx.Client(transport=transport)
- # Patch scraper to use our mocked client
- scraper = HTMLScraper()
- scraper._client = client # intentional test-only override
-
- return scraper
+ return HTMLScraper(client=client)
diff --git a/tests/mocks/simple.html.jinja b/tests/mocks/simple.html.jinja
new file mode 100644
index 0000000..a3b25d9
--- /dev/null
+++ b/tests/mocks/simple.html.jinja
@@ -0,0 +1,11 @@
+
+
+
+ {{ title }}
+
+
+
+ {{ content }}
+ {{ link_text }}
+
+
diff --git a/tests/mocks/simple.json b/tests/mocks/simple.json
new file mode 100644
index 0000000..fba6f6f
--- /dev/null
+++ b/tests/mocks/simple.json
@@ -0,0 +1,7 @@
+{
+ "title": "Test Page",
+ "description": "Simple test page",
+ "content": "Hello World",
+ "link_url": "https://example.com",
+ "link_text": "Link"
+}
diff --git a/tests/mocks/table.html.jinja b/tests/mocks/table.html.jinja
new file mode 100644
index 0000000..9bcba3c
--- /dev/null
+++ b/tests/mocks/table.html.jinja
@@ -0,0 +1,31 @@
+
+
+
+ {{ title }}
+
+
+
+ {{ heading }}
+
+
+
+
+ {% for col in columns %}
+ | {{ col }} |
+ {% endfor %}
+
+
+
+ {% for row in rows %}
+
+ {% for cell in row %}
+ | {{ cell }} |
+ {% endfor %}
+
+ {% endfor %}
+
+
+
+ {{ link_text }}
+
+
diff --git a/tests/mocks/table.json b/tests/mocks/table.json
new file mode 100644
index 0000000..ea0815b
--- /dev/null
+++ b/tests/mocks/table.json
@@ -0,0 +1,14 @@
+{
+ "title": "Table Test Page",
+ "description": "HTML page with a table for parsing tests",
+ "heading": "Sample Table",
+ "table_id": "data-table",
+ "columns": ["Name", "Age", "City"],
+ "rows": [
+ ["Alice", "30", "London"],
+ ["Bob", "25", "New York"],
+ ["Charlie", "35", "Berlin"]
+ ],
+ "link_url": "https://example.org/details",
+ "link_text": "Details"
+}
diff --git a/tests/test_html.py b/tests/test_html_simple.py
similarity index 65%
rename from tests/test_html.py
rename to tests/test_html_simple.py
index a26b075..c5129d2 100644
--- a/tests/test_html.py
+++ b/tests/test_html_simple.py
@@ -7,26 +7,26 @@ from omniread.html.parser import HTMLParser
from omniread.core.content import Content
-class ParsedHTML(BaseModel):
+class ParsedSimpleHTML(BaseModel):  # result model returned by SimpleHTMLParser.parse()
title: Optional[str]
description: Optional[str]
content: Optional[str]
link: Optional[str]
-class TestHTMLParser(HTMLParser[ParsedHTML]):
+class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
"""
- Concrete HTML parser with explicit Pydantic return type.
+ Parser focused on high-level page semantics.
"""
- def parse(self) -> ParsedHTML:
+ def parse(self) -> ParsedSimpleHTML:
soup = self._soup
meta = self.parse_meta()
content_div = soup.find("div", id="content")
link_tag: Tag | None = soup.find("a")
- return ParsedHTML(
+ return ParsedSimpleHTML(
title=meta["title"],
description=meta["meta"].get("description"),
content=self.parse_div(content_div) if content_div else None,
@@ -34,16 +34,13 @@ class TestHTMLParser(HTMLParser[ParsedHTML]):
)
-def test_end_to_end_html_scrape_and_parse(http_scraper):
- # --- Scrape (real scraper, mocked transport)
- content: Content = http_scraper.fetch("https://test.local")
+def test_end_to_end_html_simple(http_scraper):
+ content: Content = http_scraper.fetch("https://test.local/simple")
- # --- Parse
- parser = TestHTMLParser(content)
+ parser = SimpleHTMLParser(content)
result = parser.parse()
- # --- Assertions
- assert isinstance(result, ParsedHTML)
+ assert isinstance(result, ParsedSimpleHTML)
assert result.title == "Test Page"
assert result.description == "Simple test page"
diff --git a/tests/test_html_table.py b/tests/test_html_table.py
new file mode 100644
index 0000000..e7b998a
--- /dev/null
+++ b/tests/test_html_table.py
@@ -0,0 +1,44 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+from omniread.html.parser import HTMLParser
+from omniread.core.content import Content
+
+
+class ParsedTableHTML(BaseModel):  # result model returned by TableHTMLParser.parse()
+ title: Optional[str]  # stripped <title> text; None when the page has no <title>
+ table: list[list[str]]  # parsed table rows; [] when no <table> is found
+
+
+class TableHTMLParser(HTMLParser[ParsedTableHTML]):
+    """Parser focused on extracting tabular data."""
+
+    def parse(self) -> ParsedTableHTML:
+        """Return the stripped page title (or None) and the first table's rows."""
+        soup = self._soup
+        table_tag = soup.find("table")
+
+        # `.string` is None for an empty or multi-child <title>; never strip None.
+        title_text = soup.title.string if soup.title else None
+        return ParsedTableHTML(
+            title=title_text.strip() if title_text is not None else None,
+            table=self.parse_table(table_tag) if table_tag else [],
+        )
+
+
+def test_end_to_end_html_table(http_scraper):
+    """Fetch the mocked /table page and check the parsed title and rows."""
+    expected_rows = [
+        ["Name", "Age", "City"],
+        ["Alice", "30", "London"],
+        ["Bob", "25", "New York"],
+        ["Charlie", "35", "Berlin"],
+    ]
+
+    page: Content = http_scraper.fetch("https://test.local/table")
+    parsed = TableHTMLParser(page).parse()
+
+    assert isinstance(parsed, ParsedTableHTML)
+    assert parsed.title == "Table Test Page"
+    assert parsed.table == expected_rows