Compare commits

..

5 Commits

Author SHA1 Message Date
07293e4651 feat(testing): add end-to-end HTML scraping and parsing tests with typed parsers
- Add smart httpx MockTransport routing based on endpoint paths
- Render HTML fixtures via Jinja templates populated from JSON data
- Introduce explicit, typed HTML parsers for semantic and table-based content
- Add end-to-end tests covering scraper → content → parser → Pydantic models
- Enforce explicit output contracts and avoid default dict-based parsing
2026-01-02 18:31:34 +05:30
fa14a79ec9 simple test case 2026-01-02 18:20:03 +05:30
55245cf241 added validation for content type 2026-01-02 18:19:47 +05:30
202329e190 refactor(html-scraper): normalize Content-Type and inject httpx client
- Inject httpx.Client for testability and reuse
- Validate and normalize Content-Type header before returning Content
- Emit ContentType.HTML instead of raw header strings
- Avoid per-request client creation
- Preserve metadata while allowing caller overrides
2026-01-02 18:08:46 +05:30
f59024ddd5 added pydantic 2026-01-02 18:08:37 +05:30
10 changed files with 250 additions and 13 deletions

View File

@@ -1,13 +1,13 @@
import httpx
from typing import Any, Mapping, Optional
from omniread.core.content import Content
from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper
class HTMLScraper(BaseScraper):
"""
Base HTTP scraper using httpx.
Base HTML scraper using httpx.
Fetches raw bytes and metadata only.
"""
@@ -15,13 +15,31 @@ class HTMLScraper(BaseScraper):
def __init__(
    self,
    *,
    client: httpx.Client | None = None,
    timeout: float = 15.0,
    headers: Optional[Mapping[str, str]] = None,
    follow_redirects: bool = True,
):
    """
    Store the scraper settings and bind the HTTP client used for requests.

    Args:
        client: Pre-built client to use. When omitted, a client is
            created here from ``timeout``, ``headers`` and
            ``follow_redirects``.
        timeout: Timeout handed to the client created here.
        headers: Headers handed to the client created here; a dict copy
            is also kept on ``self.headers``.
        follow_redirects: Redirect policy handed to the client created
            here.

    Note:
        ``timeout``, ``headers`` and ``follow_redirects`` are only passed
        to the client built in this constructor. A caller-supplied
        ``client`` is stored as-is and is not reconfigured here.
    """
    self.timeout = timeout
    self.headers = dict(headers) if headers else {}
    self.follow_redirects = follow_redirects

    # Explicit None check: "was a client supplied?" must not depend on the
    # truthiness of whatever object the caller injected (e.g. a test double).
    if client is None:
        client = httpx.Client(
            timeout=timeout,
            headers=headers,
            follow_redirects=follow_redirects,
        )
    self._client = client

    # Content type this scraper expects from responses and reports to callers.
    self.content_type = ContentType.HTML
def validate_content_type(
    self,
    response: httpx.Response,
):
    """
    Check that the response declares the content type this scraper expects.

    Only the media type is compared: anything after the first ``;``
    (for example a charset parameter) is discarded, and the remainder is
    trimmed and lower-cased before comparison with
    ``self.content_type.value``.

    Raises:
        ValueError: If the ``Content-Type`` header is missing or empty, or
            if its media type differs from the expected one.
    """
    raw_ct = response.headers.get("Content-Type")
    if not raw_ct:
        raise ValueError("Missing Content-Type header")

    # Keep only the part before the first ";" -- parameters are irrelevant here.
    media_type, _, _ = raw_ct.partition(";")
    if media_type.strip().lower() == self.content_type.value:
        return

    raise ValueError(
        f"Expected HTML content, got '{raw_ct}'"
    )
def fetch(
self,
@@ -29,20 +47,17 @@ class HTMLScraper(BaseScraper):
*,
metadata: Optional[Mapping[str, Any]] = None,
) -> Content:
with httpx.Client(
timeout=self.timeout,
headers=self.headers,
follow_redirects=self.follow_redirects,
) as client:
response = client.get(source)
response.raise_for_status()
response = self._client.get(source)
response.raise_for_status()
self.validate_content_type(response)
return Content(
raw=response.content,
source=source,
content_type=response.headers.get("Content-Type"),
content_type=self.content_type,
metadata={
"status_code": response.status_code,
"headers": dict(response.headers),
**(metadata or {}),
},
)

View File

@@ -1,5 +1,7 @@
httpx==0.27.0
beautifulsoup4==4.12.0
pydantic==2.12.3
jinja2==3.1.6
# lxml==5.2.0
pytest==7.4.0

0
tests/__init__.py Normal file
View File

65
tests/conftest.py Normal file
View File

@@ -0,0 +1,65 @@
import json
from collections.abc import Iterator
from pathlib import Path

import httpx
import pytest
from jinja2 import BaseLoader, Environment

from omniread.core.content import ContentType
from omniread.html.scraper import HTMLScraper
# Directory (next to this file) holding the template and JSON data files
# that the mock transport renders into response bodies.
MOCK_DIR = Path(__file__).parent / "mocks"
def render_html(template_path, data_path) -> bytes:
    """
    Render a Jinja template file using values loaded from a JSON file.

    Both files are read as UTF-8. The top-level keys of the JSON document
    are passed to the template as keyword arguments, and the rendered text
    is returned encoded as UTF-8 bytes.
    """
    template_source = Path(template_path).read_text(encoding="utf-8")
    context = json.loads(Path(data_path).read_text(encoding="utf-8"))

    # Autoescaping is off, so values are inserted into the markup verbatim.
    jinja_env = Environment(loader=BaseLoader(), autoescape=False)
    compiled = jinja_env.from_string(template_source)
    return compiled.render(**context).encode("utf-8")
def mock_transport(request: httpx.Request) -> httpx.Response:
    """
    httpx MockTransport handler that routes on the request path.

    ``/simple`` and ``/table`` are answered with a 200 response whose body
    is rendered from the matching template/data pair in ``MOCK_DIR`` and
    whose ``Content-Type`` is ``ContentType.HTML.value``. Every other path
    gets a 404 response with the body ``b"Not Found"``.
    """
    # Request path -> (template file, data file) inside MOCK_DIR.
    routes = {
        "/simple": ("simple.html.jinja", "simple.json"),
        "/table": ("table.html.jinja", "table.json"),
    }

    fixture = routes.get(request.url.path)
    if fixture is None:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )

    template_name, data_name = fixture
    body = render_html(MOCK_DIR / template_name, MOCK_DIR / data_name)
    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=body,
        request=request,
    )
@pytest.fixture
def http_scraper() -> Iterator[HTMLScraper]:
    """
    Provide an HTMLScraper whose client is backed by the mock transport.

    The client is used as a context manager so it is closed when the test
    that requested the fixture finishes, instead of being left open.
    """
    transport = httpx.MockTransport(mock_transport)
    with httpx.Client(transport=transport) as client:
        yield HTMLScraper(client=client)

View File

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<div id="content">{{ content }}</div>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

7
tests/mocks/simple.json Normal file
View File

@@ -0,0 +1,7 @@
{
"title": "Test Page",
"description": "Simple test page",
"content": "Hello World",
"link_url": "https://example.com",
"link_text": "Link"
}

View File

@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ title }}</title>
<meta name="description" content="{{ description }}">
</head>
<body>
<h1>{{ heading }}</h1>
<table id="{{ table_id }}">
<thead>
<tr>
{% for col in columns %}
<th>{{ col }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
{% for row in rows %}
<tr>
{% for cell in row %}
<td>{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
<a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>

14
tests/mocks/table.json Normal file
View File

@@ -0,0 +1,14 @@
{
"title": "Table Test Page",
"description": "HTML page with a table for parsing tests",
"heading": "Sample Table",
"table_id": "data-table",
"columns": ["Name", "Age", "City"],
"rows": [
["Alice", "30", "London"],
["Bob", "25", "New York"],
["Charlie", "35", "Berlin"]
],
"link_url": "https://example.org/details",
"link_text": "Details"
}

48
tests/test_html_simple.py Normal file
View File

@@ -0,0 +1,48 @@
from typing import Optional
from pydantic import BaseModel
from bs4 import Tag
from omniread.html.parser import HTMLParser
from omniread.core.content import Content
class ParsedSimpleHTML(BaseModel):
    """
    Typed result of parsing the simple test page.

    Every field must be supplied, but each one may be ``None``.
    """

    title: str | None
    description: str | None
    content: str | None
    link: str | None
class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
    """
    Parser that extracts page-level semantics into ParsedSimpleHTML.
    """

    def parse(self) -> ParsedSimpleHTML:
        """
        Build a ParsedSimpleHTML from the parsed document.

        The title and description come from ``parse_meta()``. The content
        comes from the ``div`` with id ``content`` and the link from the
        first ``a`` tag; each is ``None`` when its element is not found.
        """
        document = self._soup
        page_meta = self.parse_meta()

        body_div = document.find("div", id="content")
        anchor: Tag | None = document.find("a")

        # Field values are computed in the same order as the model's fields.
        title = page_meta["title"]
        description = page_meta["meta"].get("description")

        body_text = None
        if body_div:
            body_text = self.parse_div(body_div)

        href = None
        if anchor:
            href = self.parse_link(anchor)

        return ParsedSimpleHTML(
            title=title,
            description=description,
            content=body_text,
            link=href,
        )
def test_end_to_end_html_simple(http_scraper):
    """Fetch the /simple mock page and check every parsed field."""
    fetched: Content = http_scraper.fetch("https://test.local/simple")
    parsed = SimpleHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedSimpleHTML)

    actual = (parsed.title, parsed.description, parsed.content, parsed.link)
    expected = (
        "Test Page",
        "Simple test page",
        "Hello World",
        "https://example.com",
    )
    assert actual == expected

44
tests/test_html_table.py Normal file
View File

@@ -0,0 +1,44 @@
from typing import Optional
from pydantic import BaseModel
from omniread.html.parser import HTMLParser
from omniread.core.content import Content
class ParsedTableHTML(BaseModel):
    """
    Typed result of parsing the table test page.

    ``title`` must be supplied but may be ``None``; ``table`` is a list of
    rows, each row being a list of cell strings.
    """

    title: str | None
    table: list[list[str]]
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.
    """

    def parse(self) -> ParsedTableHTML:
        """
        Build a ParsedTableHTML from the parsed document.

        ``title`` is the stripped text of the ``<title>`` tag, or ``None``
        when the tag is absent or has no single string. ``table`` is the
        result of ``parse_table`` on the first ``<table>``, or an empty
        list when the document has no table.
        """
        soup = self._soup
        table_tag = soup.find("table")

        # ``.string`` is None for an empty <title> or one containing nested
        # markup, so guard it before calling ``strip()``.
        title_text = soup.title.string if soup.title else None
        title = title_text.strip() if title_text is not None else None

        return ParsedTableHTML(
            title=title,
            table=self.parse_table(table_tag) if table_tag else [],
        )
def test_end_to_end_html_table(http_scraper):
    """Fetch the /table mock page and check the parsed title and rows."""
    fetched: Content = http_scraper.fetch("https://test.local/table")
    parsed = TableHTMLParser(fetched).parse()

    header_row = ["Name", "Age", "City"]
    body_rows = [
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]

    assert isinstance(parsed, ParsedTableHTML)
    assert parsed.title == "Table Test Page"
    assert parsed.table == [header_row, *body_rows]