feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers

- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
simple test case
2026-01-02 18:31:34 +05:30 · 2026-01-02 18:20:03 +05:30 · 2026-01-02 18:19:47 +05:30 · 2026-01-02 18:08:46 +05:30 · 2026-01-02 18:08:37 +05:30
10 changed files with 250 additions and 13 deletions
--- a/omniread/html/scraper.py
+++ b/omniread/html/scraper.py
@@ -1,13 +1,13 @@
 import httpx
 from typing import Any, Mapping, Optional

-from omniread.core.content import Content
+from omniread.core.content import Content, ContentType
 from omniread.core.scraper import BaseScraper


 class HTMLScraper(BaseScraper):
    """
-    Base HTTP scraper using httpx.
+    Base HTML scraper using httpx.

    Fetches raw bytes and metadata only.
    """
@@ -15,13 +15,31 @@ class HTMLScraper(BaseScraper):
    def __init__(
        self,
        *,
+        client: httpx.Client | None = None,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
-        self.timeout = timeout
-        self.headers = dict(headers) if headers else {}
-        self.follow_redirects = follow_redirects
+        """
+        Build the scraper around an httpx client.
+
+        When `client` is supplied it is used as-is; `timeout`, `headers` and
+        `follow_redirects` are only applied when the scraper has to construct
+        its own `httpx.Client`.
+        """
+        # NOTE(review): a client constructed here is not closed anywhere in
+        # the visible code - confirm who owns its lifetime.
+        self._client = client or httpx.Client(
+            timeout=timeout,
+            headers=headers,
+            follow_redirects=follow_redirects,
+        )
+        # Content type this scraper checks responses against and reports
+        # on the Content it returns.
+        self.content_type = ContentType.HTML
+
+    def validate_content_type(
+        self,
+        response: httpx.Response,
+    ):
+        """
+        Reject responses whose media type is not the scraper's content type.
+
+        Raises ValueError when the Content-Type header is absent or empty,
+        or when its media type differs from `self.content_type.value`.
+        Header parameters (anything after the first ";") are ignored and the
+        comparison is case-insensitive on the header side.
+        """
+        header_value = response.headers.get("Content-Type")
+        if not header_value:
+            raise ValueError("Missing Content-Type header")
+
+        # Keep only the part before the first ";" so parameters such as a
+        # charset do not affect the comparison.
+        media_type, _, _ = header_value.partition(";")
+        if media_type.strip().lower() == self.content_type.value:
+            return
+
+        raise ValueError(
+            f"Expected HTML content, got '{header_value}'"
+        )

    def fetch(
        self,
@@ -29,20 +47,17 @@ class HTMLScraper(BaseScraper):
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
-        with httpx.Client(
-            timeout=self.timeout,
-            headers=self.headers,
-            follow_redirects=self.follow_redirects,
-        ) as client:
-            response = client.get(source)
-            response.raise_for_status()
+        response = self._client.get(source)
+        response.raise_for_status()
+        self.validate_content_type(response)

        return Content(
            raw=response.content,
            source=source,
-            content_type=response.headers.get("Content-Type"),
+            content_type=self.content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
+                **(metadata or {}),
            },
        )
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 httpx==0.27.0
 beautifulsoup4==4.12.0
+pydantic==2.12.3
+jinja2==3.1.6
 # lxml==5.2.0

 pytest==7.4.0
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,65 @@
+import json
+from pathlib import Path
+from typing import Iterator
+
+import httpx
+import pytest
+from jinja2 import Environment, BaseLoader
+
+from omniread.core.content import ContentType
+from omniread.html.scraper import HTMLScraper
+
+
+MOCK_DIR = Path(__file__).parent / "mocks"
+
+
+def render_html(template_path, data_path) -> bytes:
+    """
+    Render a Jinja template file with variables loaded from a JSON file.
+
+    Both files are read as UTF-8. The top-level keys of the JSON document
+    are passed to the template as variables, and the rendered markup is
+    returned encoded as UTF-8.
+    """
+    source = Path(template_path).read_text(encoding="utf-8")
+    with Path(data_path).open(encoding="utf-8") as handle:
+        context = json.load(handle)
+
+    # Autoescaping stays off: fixture values are inserted verbatim.
+    jinja_env = Environment(loader=BaseLoader(), autoescape=False)
+    html = jinja_env.from_string(source).render(**context)
+    return html.encode("utf-8")
+
+
+def mock_transport(request: httpx.Request) -> httpx.Response:
+    """
+    httpx MockTransport handler.
+
+    Answers /simple and /table with their rendered fixture (status 200 and
+    the HTML content type); every other path gets a plain 404 response.
+    """
+    # Request path -> (template file, data file) inside MOCK_DIR.
+    fixtures = {
+        "/simple": ("simple.html.jinja", "simple.json"),
+        "/table": ("table.html.jinja", "table.json"),
+    }
+    fixture = fixtures.get(request.url.path)
+
+    if fixture is None:
+        return httpx.Response(
+            status_code=404,
+            content=b"Not Found",
+            request=request,
+        )
+
+    template_name, data_name = fixture
+    body = render_html(MOCK_DIR / template_name, MOCK_DIR / data_name)
+
+    return httpx.Response(
+        status_code=200,
+        headers={"Content-Type": ContentType.HTML.value},
+        content=body,
+        request=request,
+    )
+
+
+@pytest.fixture
+def http_scraper() -> Iterator[HTMLScraper]:
+    """
+    Provide an HTMLScraper whose requests are answered by `mock_transport`.
+
+    The underlying httpx client is closed when the test finishes.
+    """
+    transport = httpx.MockTransport(mock_transport)
+
+    # Yield inside the context manager so the client is closed at fixture
+    # teardown instead of being left unclosed after every test.
+    with httpx.Client(transport=transport) as client:
+        yield HTMLScraper(client=client)
--- a/tests/mocks/simple.html.jinja
+++ b/tests/mocks/simple.html.jinja
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>{{ title }}</title>
+    <meta name="description" content="{{ description }}">
+  </head>
+  <body>
+    <div id="content">{{ content }}</div>
+    <a href="{{ link_url }}">{{ link_text }}</a>
+  </body>
+</html>
--- a/tests/mocks/simple.json
+++ b/tests/mocks/simple.json
@@ -0,0 +1,7 @@
+{
+  "title": "Test Page",
+  "description": "Simple test page",
+  "content": "Hello World",
+  "link_url": "https://example.com",
+  "link_text": "Link"
+}
--- a/tests/mocks/table.html.jinja
+++ b/tests/mocks/table.html.jinja
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>{{ title }}</title>
+    <meta name="description" content="{{ description }}">
+  </head>
+  <body>
+    <h1>{{ heading }}</h1>
+
+    <table id="{{ table_id }}">
+      <thead>
+        <tr>
+          {% for col in columns %}
+          <th>{{ col }}</th>
+          {% endfor %}
+        </tr>
+      </thead>
+      <tbody>
+        {% for row in rows %}
+        <tr>
+          {% for cell in row %}
+          <td>{{ cell }}</td>
+          {% endfor %}
+        </tr>
+        {% endfor %}
+      </tbody>
+    </table>
+
+    <a href="{{ link_url }}">{{ link_text }}</a>
+  </body>
+</html>
--- a/tests/mocks/table.json
+++ b/tests/mocks/table.json
@@ -0,0 +1,14 @@
+{
+  "title": "Table Test Page",
+  "description": "HTML page with a table for parsing tests",
+  "heading": "Sample Table",
+  "table_id": "data-table",
+  "columns": ["Name", "Age", "City"],
+  "rows": [
+    ["Alice", "30", "London"],
+    ["Bob", "25", "New York"],
+    ["Charlie", "35", "Berlin"]
+  ],
+  "link_url": "https://example.org/details",
+  "link_text": "Details"
+}
--- a/tests/test_html_simple.py
+++ b/tests/test_html_simple.py
@@ -0,0 +1,48 @@
+from typing import Optional
+
+from pydantic import BaseModel
+from bs4 import Tag
+
+from omniread.html.parser import HTMLParser
+from omniread.core.content import Content
+
+
+# Typed result produced by SimpleHTMLParser.parse(). No defaults are declared,
+# so every field has to be passed explicitly (None is accepted).
+class ParsedSimpleHTML(BaseModel):
+    # Page title as reported by the parser's parse_meta().
+    title: Optional[str]
+    # Value of the "description" meta entry, if any.
+    description: Optional[str]
+    # Parsed contents of the <div id="content"> element, if present.
+    content: Optional[str]
+    # Parsed first <a> element of the page, if present.
+    link: Optional[str]
+
+
+class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
+    """
+    Parser focused on high-level page semantics.
+    """
+
+    def parse(self) -> ParsedSimpleHTML:
+        """
+        Collect the title, description, main content and first link.
+
+        `content` and `link` are None when the page has no
+        <div id="content"> or no <a> element respectively.
+        """
+        document = self._soup
+        page_meta = self.parse_meta()
+
+        body_div = document.find("div", id="content")
+        anchor: Tag | None = document.find("a")
+
+        title = page_meta["title"]
+        description = page_meta["meta"].get("description")
+        content = self.parse_div(body_div) if body_div else None
+        link = self.parse_link(anchor) if anchor else None
+
+        return ParsedSimpleHTML(
+            title=title,
+            description=description,
+            content=content,
+            link=link,
+        )
+
+
+def test_end_to_end_html_simple(http_scraper):
+    """Fetch /simple through the mock transport and check every parsed field."""
+    fetched: Content = http_scraper.fetch("https://test.local/simple")
+    parsed = SimpleHTMLParser(fetched).parse()
+
+    assert isinstance(parsed, ParsedSimpleHTML)
+
+    # Expected values come from the simple.json fixture.
+    expected = {
+        "title": "Test Page",
+        "description": "Simple test page",
+        "content": "Hello World",
+        "link": "https://example.com",
+    }
+    for field, value in expected.items():
+        assert getattr(parsed, field) == value
--- a/tests/test_html_table.py
+++ b/tests/test_html_table.py
@@ -0,0 +1,44 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+from omniread.html.parser import HTMLParser
+from omniread.core.content import Content
+
+
+# Typed result produced by TableHTMLParser.parse().
+class ParsedTableHTML(BaseModel):
+    # Stripped <title> text, or None when the page has no <title>.
+    title: Optional[str]
+    # Output of parse_table() for the page's table; [] when there is no <table>.
+    table: list[list[str]]
+
+
+class TableHTMLParser(HTMLParser[ParsedTableHTML]):
+    """
+    Parser focused on extracting tabular data.
+    """
+
+    def parse(self) -> ParsedTableHTML:
+        """
+        Extract the page title and the first table.
+
+        The title is None when there is no <title> tag or when the tag has
+        no single string to report; the table is an empty list when the
+        page has no <table>.
+        """
+        soup = self._soup
+
+        table_tag = soup.find("table")
+
+        # `.string` is None for an empty <title> or one with several
+        # children, so guard it before stripping rather than letting
+        # `.strip()` raise AttributeError.
+        title_text = soup.title.string if soup.title else None
+
+        return ParsedTableHTML(
+            title=title_text.strip() if title_text is not None else None,
+            table=self.parse_table(table_tag) if table_tag else [],
+        )
+
+
+def test_end_to_end_html_table(http_scraper):
+    """Fetch /table through the mock transport and check the title and table."""
+    fetched: Content = http_scraper.fetch("https://test.local/table")
+    parsed = TableHTMLParser(fetched).parse()
+
+    assert isinstance(parsed, ParsedTableHTML)
+
+    # Header row first, then the data rows, mirroring table.json.
+    header = ["Name", "Age", "City"]
+    body = [
+        ["Alice", "30", "London"],
+        ["Bob", "25", "New York"],
+        ["Charlie", "35", "Berlin"],
+    ]
+
+    assert parsed.title == "Table Test Page"
+    assert parsed.table == [header, *body]
Author	SHA1	Message	Date
Vishesh 'ironeagle' Bangotra	07293e4651	feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers - Add smart httpx MockTransport routing based on endpoint paths - Render HTML fixtures via Jinja templates populated from JSON data - Introduce explicit, typed HTML parsers for semantic and table-based content - Add end-to-end tests covering scraper → content → parser → Pydantic models - Enforce explicit output contracts and avoid default dict-based parsing	2026-01-02 18:31:34 +05:30
Vishesh 'ironeagle' Bangotra	fa14a79ec9	simple test case	2026-01-02 18:20:03 +05:30
Vishesh 'ironeagle' Bangotra	55245cf241	added validation for content type	2026-01-02 18:19:47 +05:30
Vishesh 'ironeagle' Bangotra	202329e190	refactor(html-scraper): normalize Content-Type and inject httpx client - Inject httpx.Client for testability and reuse - Validate and normalize Content-Type header before returning Content - Emit ContentType.HTML instead of raw header strings - Avoid per-request client creation - Preserve metadata while allowing caller overrides	2026-01-02 18:08:46 +05:30
Vishesh 'ironeagle' Bangotra	f59024ddd5	added pydantic	2026-01-02 18:08:37 +05:30