refactor(html-scraper): normalize Content-Type and inject httpx client
- Inject httpx.Client for testability and reuse
- Validate and normalize Content-Type header before returning Content
- Emit ContentType.HTML instead of raw header strings
- Avoid per-request client creation
- Preserve metadata while allowing caller overrides
This commit is contained in:
@@ -1,13 +1,13 @@
|
|||||||
import httpx
|
import httpx
|
||||||
from typing import Any, Mapping, Optional
|
from typing import Any, Mapping, Optional
|
||||||
|
|
||||||
from omniread.core.content import Content
|
from omniread.core.content import Content, ContentType
|
||||||
from omniread.core.scraper import BaseScraper
|
from omniread.core.scraper import BaseScraper
|
||||||
|
|
||||||
|
|
||||||
class HTMLScraper(BaseScraper):
|
class HTMLScraper(BaseScraper):
|
||||||
"""
|
"""
|
||||||
Base HTTP scraper using httpx.
|
Base HTML scraper using httpx.
|
||||||
|
|
||||||
Fetches raw bytes and metadata only.
|
Fetches raw bytes and metadata only.
|
||||||
"""
|
"""
|
||||||
@@ -15,13 +15,16 @@ class HTMLScraper(BaseScraper):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
|
client: httpx.Client | None = None,
|
||||||
timeout: float = 15.0,
|
timeout: float = 15.0,
|
||||||
headers: Optional[Mapping[str, str]] = None,
|
headers: Optional[Mapping[str, str]] = None,
|
||||||
follow_redirects: bool = True,
|
follow_redirects: bool = True,
|
||||||
):
|
):
|
||||||
self.timeout = timeout
|
self._client = client or httpx.Client(
|
||||||
self.headers = dict(headers) if headers else {}
|
timeout=timeout,
|
||||||
self.follow_redirects = follow_redirects
|
headers=headers,
|
||||||
|
follow_redirects=follow_redirects,
|
||||||
|
)
|
||||||
|
|
||||||
def fetch(
|
def fetch(
|
||||||
self,
|
self,
|
||||||
@@ -29,20 +32,16 @@ class HTMLScraper(BaseScraper):
|
|||||||
*,
|
*,
|
||||||
metadata: Optional[Mapping[str, Any]] = None,
|
metadata: Optional[Mapping[str, Any]] = None,
|
||||||
) -> Content:
|
) -> Content:
|
||||||
with httpx.Client(
|
response = self._client.get(source)
|
||||||
timeout=self.timeout,
|
|
||||||
headers=self.headers,
|
|
||||||
follow_redirects=self.follow_redirects,
|
|
||||||
) as client:
|
|
||||||
response = client.get(source)
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
return Content(
|
return Content(
|
||||||
raw=response.content,
|
raw=response.content,
|
||||||
source=source,
|
source=source,
|
||||||
content_type=response.headers.get("Content-Type"),
|
content_type=ContentType.HTML,
|
||||||
metadata={
|
metadata={
|
||||||
"status_code": response.status_code,
|
"status_code": response.status_code,
|
||||||
"headers": dict(response.headers),
|
"headers": dict(response.headers),
|
||||||
|
**(metadata or {}),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user