diff --git a/omniread/html/scraper.py b/omniread/html/scraper.py
index cb8e903..22234c1 100644
--- a/omniread/html/scraper.py
+++ b/omniread/html/scraper.py
@@ -1,13 +1,13 @@
import httpx
from typing import Any, Mapping, Optional
-from omniread.core.content import Content
+from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper
class HTMLScraper(BaseScraper):
"""
- Base HTTP scraper using httpx.
+ Base HTML scraper using httpx.
Fetches raw bytes and metadata only.
"""
@@ -15,13 +15,16 @@ class HTMLScraper(BaseScraper):
def __init__(
self,
*,
+ client: httpx.Client | None = None,
timeout: float = 15.0,
headers: Optional[Mapping[str, str]] = None,
follow_redirects: bool = True,
):
- self.timeout = timeout
- self.headers = dict(headers) if headers else {}
- self.follow_redirects = follow_redirects
+ self._client = client or httpx.Client(
+ timeout=timeout,
+ headers=headers,
+ follow_redirects=follow_redirects,
+ )
def fetch(
self,
@@ -29,20 +32,16 @@ class HTMLScraper(BaseScraper):
*,
metadata: Optional[Mapping[str, Any]] = None,
) -> Content:
- with httpx.Client(
- timeout=self.timeout,
- headers=self.headers,
- follow_redirects=self.follow_redirects,
- ) as client:
- response = client.get(source)
- response.raise_for_status()
+ response = self._client.get(source)
+ response.raise_for_status()
return Content(
raw=response.content,
source=source,
- content_type=response.headers.get("Content-Type"),
+ content_type=ContentType.HTML,
metadata={
"status_code": response.status_code,
"headers": dict(response.headers),
+ **(metadata or {}),
},
)