refactor(html-scraper): normalize Content-Type and inject httpx client

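- Accept an optional injected httpx.Client for testability and reuse (when a client is supplied, the timeout, headers, and follow_redirects arguments are not applied to it)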
- Normalize the content type reported on the returned Content (the response's Content-Type header is not inspected or validated in fetch)
- Emit ContentType.HTML instead of raw header strings
- Avoid per-request client creation
- Merge caller-supplied metadata into the returned Content; caller keys take precedence over the built-in status_code and headers entries
This commit is contained in:
2026-01-02 18:08:46 +05:30
parent f59024ddd5
commit 202329e190

View File

@@ -1,13 +1,13 @@
import httpx import httpx
from typing import Any, Mapping, Optional from typing import Any, Mapping, Optional
from omniread.core.content import Content from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper from omniread.core.scraper import BaseScraper
class HTMLScraper(BaseScraper): class HTMLScraper(BaseScraper):
""" """
Base HTTP scraper using httpx. Base HTML scraper using httpx.
Fetches raw bytes and metadata only. Fetches raw bytes and metadata only.
""" """
@@ -15,13 +15,16 @@ class HTMLScraper(BaseScraper):
def __init__( def __init__(
self, self,
*, *,
client: httpx.Client | None = None,
timeout: float = 15.0, timeout: float = 15.0,
headers: Optional[Mapping[str, str]] = None, headers: Optional[Mapping[str, str]] = None,
follow_redirects: bool = True, follow_redirects: bool = True,
): ):
self.timeout = timeout self._client = client or httpx.Client(
self.headers = dict(headers) if headers else {} timeout=timeout,
self.follow_redirects = follow_redirects headers=headers,
follow_redirects=follow_redirects,
)
def fetch( def fetch(
self, self,
@@ -29,20 +32,16 @@ class HTMLScraper(BaseScraper):
*, *,
metadata: Optional[Mapping[str, Any]] = None, metadata: Optional[Mapping[str, Any]] = None,
) -> Content: ) -> Content:
with httpx.Client( response = self._client.get(source)
timeout=self.timeout,
headers=self.headers,
follow_redirects=self.follow_redirects,
) as client:
response = client.get(source)
response.raise_for_status() response.raise_for_status()
return Content( return Content(
raw=response.content, raw=response.content,
source=source, source=source,
content_type=response.headers.get("Content-Type"), content_type=ContentType.HTML,
metadata={ metadata={
"status_code": response.status_code, "status_code": response.status_code,
"headers": dict(response.headers), "headers": dict(response.headers),
**(metadata or {}),
}, },
) )