Compare commits

...

5 Commits

SHA1 Message Date
07293e4651 feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers
- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
2026-01-02 18:31:34 +05:30
fa14a79ec9 simple test case 2026-01-02 18:20:03 +05:30
55245cf241 added validation for content type 2026-01-02 18:19:47 +05:30
202329e190 refactor(html-scraper): normalize Content-Type and inject httpx client
- Inject httpx.Client for testability and reuse
- Validate and normalize Content-Type header before returning Content
- Emit ContentType.HTML instead of raw header strings
- Avoid per-request client creation
- Preserve metadata while allowing caller overrides
2026-01-02 18:08:46 +05:30
f59024ddd5 added pydantic 2026-01-02 18:08:37 +05:30
10 changed files with 250 additions and 13 deletions

omniread/html/scraper.py (modified)

@@ -1,13 +1,13 @@
 import httpx
 from typing import Any, Mapping, Optional

-from omniread.core.content import Content
+from omniread.core.content import Content, ContentType
 from omniread.core.scraper import BaseScraper


 class HTMLScraper(BaseScraper):
     """
-    Base HTTP scraper using httpx.
+    Base HTML scraper using httpx.

     Fetches raw bytes and metadata only.
     """
@@ -15,13 +15,31 @@ class HTMLScraper(BaseScraper):
     def __init__(
         self,
         *,
+        client: httpx.Client | None = None,
         timeout: float = 15.0,
         headers: Optional[Mapping[str, str]] = None,
         follow_redirects: bool = True,
     ):
-        self.timeout = timeout
-        self.headers = dict(headers) if headers else {}
-        self.follow_redirects = follow_redirects
+        self._client = client or httpx.Client(
+            timeout=timeout,
+            headers=headers,
+            follow_redirects=follow_redirects,
+        )
+        self.content_type = ContentType.HTML
+
+    def validate_content_type(
+        self,
+        response: httpx.Response,
+    ):
+        raw_ct = response.headers.get("Content-Type")
+        if not raw_ct:
+            raise ValueError("Missing Content-Type header")
+
+        base_ct = raw_ct.split(";", 1)[0].strip().lower()
+        if base_ct != self.content_type.value:
+            raise ValueError(
+                f"Expected HTML content, got '{raw_ct}'"
+            )

     def fetch(
         self,
@@ -29,20 +47,17 @@ class HTMLScraper(BaseScraper):
         *,
         metadata: Optional[Mapping[str, Any]] = None,
     ) -> Content:
-        with httpx.Client(
-            timeout=self.timeout,
-            headers=self.headers,
-            follow_redirects=self.follow_redirects,
-        ) as client:
-            response = client.get(source)
-            response.raise_for_status()
+        response = self._client.get(source)
+        response.raise_for_status()
+        self.validate_content_type(response)

         return Content(
             raw=response.content,
             source=source,
-            content_type=response.headers.get("Content-Type"),
+            content_type=self.content_type,
             metadata={
                 "status_code": response.status_code,
                 "headers": dict(response.headers),
+                **(metadata or {}),
             },
         )
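
For context, the Content and ContentType types imported above live in omniread.core.content, which is not part of this comparison. Below is a minimal sketch of what the diff implies they look like; the pydantic base, the field names, and the exact enum value are assumptions inferred from the call sites, not the module's actual contents.

# Sketch only: omniread/core/content.py as implied by this diff.
# The pydantic base and the "text/html" enum value are assumptions.
from enum import Enum
from typing import Any, Mapping
from pydantic import BaseModel

class ContentType(str, Enum):
    # validate_content_type compares the normalized header against
    # self.content_type.value, so "text/html; charset=utf-8" must
    # normalize to this value.
    HTML = "text/html"

class Content(BaseModel):
    raw: bytes
    source: str
    content_type: ContentType
    metadata: Mapping[str, Any] = {}

Under this reading, validate_content_type strips any parameters from the header ("text/html; charset=utf-8" becomes "text/html") before comparing it to ContentType.HTML.value.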

requirements.txt (modified)

@@ -1,5 +1,7 @@
@@ -1,5 +1,7 @@
 httpx==0.27.0
 beautifulsoup4==4.12.0
+pydantic==2.12.3
+jinja2==3.1.6
 # lxml==5.2.0
 pytest==7.4.0

tests/__init__.py (new file, empty)

tests/conftest.py (new file, 65 lines)

@@ -0,0 +1,65 @@
import json
import pytest
import httpx
from pathlib import Path
from jinja2 import Environment, BaseLoader

from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper

MOCK_DIR = Path(__file__).parent / "mocks"


def render_html(template_path, data_path) -> bytes:
    template_text = Path(template_path).read_text(encoding="utf-8")
    data = json.loads(Path(data_path).read_text(encoding="utf-8"))

    env = Environment(
        loader=BaseLoader(),
        autoescape=False,
    )
    template = env.from_string(template_text)
    rendered = template.render(**data)

    return rendered.encode("utf-8")


def mock_transport(request: httpx.Request) -> httpx.Response:
    """
    httpx MockTransport handler.
    """
    path = request.url.path

    if path == "/simple":
        content = render_html(
            MOCK_DIR / "simple.html.jinja",
            MOCK_DIR / "simple.json",
        )
    elif path == "/table":
        content = render_html(
            MOCK_DIR / "table.html.jinja",
            MOCK_DIR / "table.json",
        )
    else:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )

    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=content,
        request=request,
    )


@pytest.fixture
def http_scraper() -> HTMLScraper:
    transport = httpx.MockTransport(mock_transport)
    client = httpx.Client(transport=transport)
    return HTMLScraper(client=client)
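
Note that mock_transport routes purely on request.url.path, so the host in a fetched URL is arbitrary and no network traffic ever occurs. A hypothetical standalone use of the same wiring (the host name here is made up):

# Hypothetical usage outside pytest; any host works because
# MockTransport intercepts the request before it reaches the network.
transport = httpx.MockTransport(mock_transport)
scraper = HTMLScraper(client=httpx.Client(transport=transport))
content = scraper.fetch("https://any-host.example/simple")  # rendered from simple.html.jinja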

tests/mocks/simple.html.jinja (new file, 11 lines)

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<div id="content">{{ content }}</div>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

tests/mocks/simple.json (new file, 7 lines)

@@ -0,0 +1,7 @@
{
"title": "Test Page",
"description": "Simple test page",
"content": "Hello World",
"link_url": "https://example.com",
"link_text": "Link"
}

tests/mocks/table.html.jinja (new file, 31 lines)

@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<h1>{{ heading }}</h1>
<table id="{{ table_id }}">
<thead>
<tr>
{% for col in columns %}
<th>{{ col }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
{% for row in rows %}
<tr>
{% for cell in row %}
<td>{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

tests/mocks/table.json (new file, 14 lines)

@@ -0,0 +1,14 @@
{
"title": "Table Test Page",
"description": "HTML page with a table for parsing tests",
"heading": "Sample Table",
"table_id": "data-table",
"columns": ["Name", "Age", "City"],
"rows": [
["Alice", "30", "London"],
["Bob", "25", "New York"],
["Charlie", "35", "Berlin"]
],
"link_url": "https://example.org/details",
"link_text": "Details"
}

tests/test_html_simple.py (new file, 48 lines)

@@ -0,0 +1,48 @@
from typing import Optional

from pydantic import BaseModel
from bs4 import Tag

from omniread.html.parser import HTMLParser
from omniread.core.content import Content


class ParsedSimpleHTML(BaseModel):
    title: Optional[str]
    description: Optional[str]
    content: Optional[str]
    link: Optional[str]


class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
    """
    Parser focused on high-level page semantics.
    """

    def parse(self) -> ParsedSimpleHTML:
        soup = self._soup

        meta = self.parse_meta()
        content_div = soup.find("div", id="content")
        link_tag: Tag | None = soup.find("a")

        return ParsedSimpleHTML(
            title=meta["title"],
            description=meta["meta"].get("description"),
            content=self.parse_div(content_div) if content_div else None,
            link=self.parse_link(link_tag) if link_tag else None,
        )


def test_end_to_end_html_simple(http_scraper):
    content: Content = http_scraper.fetch("https://test.local/simple")

    parser = SimpleHTMLParser(content)
    result = parser.parse()

    assert isinstance(result, ParsedSimpleHTML)
    assert result.title == "Test Page"
    assert result.description == "Simple test page"
    assert result.content == "Hello World"
    assert result.link == "https://example.com"

tests/test_html_table.py (new file, 44 lines)

@@ -0,0 +1,44 @@
from typing import Optional

from pydantic import BaseModel

from omniread.html.parser import HTMLParser
from omniread.core.content import Content


class ParsedTableHTML(BaseModel):
    title: Optional[str]
    table: list[list[str]]


class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.
    """

    def parse(self) -> ParsedTableHTML:
        soup = self._soup

        table_tag = soup.find("table")

        return ParsedTableHTML(
            title=soup.title.string.strip() if soup.title else None,
            table=self.parse_table(table_tag) if table_tag else [],
        )


def test_end_to_end_html_table(http_scraper):
    content: Content = http_scraper.fetch("https://test.local/table")

    parser = TableHTMLParser(content)
    result = parser.parse()

    assert isinstance(result, ParsedTableHTML)
    assert result.title == "Table Test Page"
    assert result.table == [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]
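
Both test modules subclass HTMLParser from omniread.html.parser, which is outside this comparison. Below is a rough sketch of the generic base class the tests appear to assume, with every method body inferred from its call sites above; the signatures and return shapes are guesses, not the library's actual API.

# Sketch only: omniread/html/parser.py as implied by the tests.
# Method names come from the call sites; bodies are assumptions.
from typing import Any, Generic, TypeVar

from bs4 import BeautifulSoup, Tag

from omniread.core.content import Content

T = TypeVar("T")


class HTMLParser(Generic[T]):
    def __init__(self, content: Content):
        self._content = content
        self._soup = BeautifulSoup(content.raw, "html.parser")

    def parse(self) -> T:
        # Subclasses return a typed Pydantic model.
        raise NotImplementedError

    def parse_meta(self) -> dict[str, Any]:
        # Shape inferred from SimpleHTMLParser:
        # {"title": ..., "meta": {name: content, ...}}.
        title_tag = self._soup.title
        title = title_tag.string.strip() if title_tag and title_tag.string else None
        meta = {
            tag["name"]: tag.get("content", "")
            for tag in self._soup.find_all("meta")
            if tag.get("name")
        }
        return {"title": title, "meta": meta}

    def parse_div(self, tag: Tag) -> str:
        return tag.get_text(strip=True)

    def parse_link(self, tag: Tag):
        return tag.get("href")

    def parse_table(self, tag: Tag) -> list[list[str]]:
        # Header row plus body rows, matching the assertions in
        # test_end_to_end_html_table.
        return [
            [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
            for row in tag.find_all("tr")
        ]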