refactor(html-scraper): normalize Content-Type and inject httpx client

- Inject httpx.Client for testability and reuse
- Stop passing the raw Content-Type header through as the Content's content_type (the response headers remain available under metadata["headers"])
- Emit ContentType.HTML instead of raw header strings
- Avoid per-request client creation
- Preserve metadata while allowing caller overrides
This commit is contained in:
2026-01-02 18:08:46 +05:30
parent f59024ddd5
commit 202329e190

View File

@@ -1,13 +1,13 @@
import httpx
from typing import Any, Mapping, Optional
from omniread.core.content import Content
from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper
class HTMLScraper(BaseScraper):
"""
Base HTTP scraper using httpx.
Base HTML scraper using httpx.
Fetches raw bytes and metadata only.
"""
@@ -15,13 +15,16 @@ class HTMLScraper(BaseScraper):
def __init__(
    self,
    *,
    client: httpx.Client | None = None,
    timeout: float = 15.0,
    headers: Optional[Mapping[str, str]] = None,
    follow_redirects: bool = True,
):
    """
    Configure the scraper and the httpx client it sends requests with.

    Args:
        client: Pre-built client to reuse. When given, it is used as-is;
            ``timeout``, ``headers`` and ``follow_redirects`` are only
            recorded on the instance and are not applied to it.
        timeout: Timeout handed to the internally created client.
        headers: Default headers for the internally created client. The
            mapping is copied into a plain dict (empty when omitted).
        follow_redirects: Redirect setting for the internally created
            client.
    """
    self.timeout = timeout
    self.headers = dict(headers) if headers else {}
    self.follow_redirects = follow_redirects

    # Compare against None instead of relying on truthiness, so an injected
    # client (or a test double) that evaluates as falsy is still honoured.
    if client is None:
        # NOTE(review): nothing visible here closes a client created by the
        # scraper itself -- confirm whether a close()/context-manager hook
        # exists elsewhere.
        client = httpx.Client(
            timeout=timeout,
            headers=self.headers,
            follow_redirects=follow_redirects,
        )
    self._client = client
def fetch(
self,
@@ -29,20 +32,16 @@ class HTMLScraper(BaseScraper):
*,
metadata: Optional[Mapping[str, Any]] = None,
) -> Content:
with httpx.Client(
timeout=self.timeout,
headers=self.headers,
follow_redirects=self.follow_redirects,
) as client:
response = client.get(source)
response = self._client.get(source)
response.raise_for_status()
return Content(
raw=response.content,
source=source,
content_type=response.headers.get("Content-Type"),
content_type=ContentType.HTML,
metadata={
"status_code": response.status_code,
"headers": dict(response.headers),
**(metadata or {}),
},
)