refactor(html-scraper): normalize Content-Type and inject httpx client
- Inject httpx.Client for testability and reuse
- Validate and normalize Content-Type header before returning Content
- Emit ContentType.HTML instead of raw header strings
- Avoid per-request client creation
- Preserve metadata while allowing caller overrides
This commit is contained in:
@@ -1,13 +1,13 @@
|
||||
import httpx
|
||||
from typing import Any, Mapping, Optional
|
||||
|
||||
from omniread.core.content import Content
|
||||
from omniread.core.content import Content, ContentType
|
||||
from omniread.core.scraper import BaseScraper
|
||||
|
||||
|
||||
class HTMLScraper(BaseScraper):
|
||||
"""
|
||||
Base HTTP scraper using httpx.
|
||||
Base HTML scraper using httpx.
|
||||
|
||||
Fetches raw bytes and metadata only.
|
||||
"""
|
||||
@@ -15,13 +15,16 @@ class HTMLScraper(BaseScraper):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
client: httpx.Client | None = None,
|
||||
timeout: float = 15.0,
|
||||
headers: Optional[Mapping[str, str]] = None,
|
||||
follow_redirects: bool = True,
|
||||
):
|
||||
self.timeout = timeout
|
||||
self.headers = dict(headers) if headers else {}
|
||||
self.follow_redirects = follow_redirects
|
||||
self._client = client or httpx.Client(
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
follow_redirects=follow_redirects,
|
||||
)
|
||||
|
||||
def fetch(
|
||||
self,
|
||||
@@ -29,20 +32,16 @@ class HTMLScraper(BaseScraper):
|
||||
*,
|
||||
metadata: Optional[Mapping[str, Any]] = None,
|
||||
) -> Content:
|
||||
with httpx.Client(
|
||||
timeout=self.timeout,
|
||||
headers=self.headers,
|
||||
follow_redirects=self.follow_redirects,
|
||||
) as client:
|
||||
response = client.get(source)
|
||||
response.raise_for_status()
|
||||
response = self._client.get(source)
|
||||
response.raise_for_status()
|
||||
|
||||
return Content(
|
||||
raw=response.content,
|
||||
source=source,
|
||||
content_type=response.headers.get("Content-Type"),
|
||||
content_type=ContentType.HTML,
|
||||
metadata={
|
||||
"status_code": response.status_code,
|
||||
"headers": dict(response.headers),
|
||||
**(metadata or {}),
|
||||
},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user