diff --git a/omniread/html/scraper.py b/omniread/html/scraper.py
index cb8e903..22234c1 100644
--- a/omniread/html/scraper.py
+++ b/omniread/html/scraper.py
@@ -1,13 +1,13 @@
 import httpx
 from typing import Any, Mapping, Optional
 
-from omniread.core.content import Content
+from omniread.core.content import Content, ContentType
 from omniread.core.scraper import BaseScraper
 
 
 class HTMLScraper(BaseScraper):
     """
-    Base HTTP scraper using httpx.
+    Base HTML scraper using httpx.
 
     Fetches raw bytes and metadata only.
     """
@@ -15,13 +15,16 @@ class HTMLScraper(BaseScraper):
     def __init__(
         self,
         *,
+        client: httpx.Client | None = None,
         timeout: float = 15.0,
         headers: Optional[Mapping[str, str]] = None,
         follow_redirects: bool = True,
     ):
-        self.timeout = timeout
-        self.headers = dict(headers) if headers else {}
-        self.follow_redirects = follow_redirects
+        self._client = client or httpx.Client(
+            timeout=timeout,
+            headers=headers,
+            follow_redirects=follow_redirects,
+        )
 
     def fetch(
         self,
@@ -29,20 +32,16 @@ class HTMLScraper(BaseScraper):
         *,
         metadata: Optional[Mapping[str, Any]] = None,
     ) -> Content:
-        with httpx.Client(
-            timeout=self.timeout,
-            headers=self.headers,
-            follow_redirects=self.follow_redirects,
-        ) as client:
-            response = client.get(source)
-            response.raise_for_status()
+        response = self._client.get(source)
+        response.raise_for_status()
 
         return Content(
             raw=response.content,
             source=source,
-            content_type=response.headers.get("Content-Type"),
+            content_type=ContentType.HTML,
             metadata={
                 "status_code": response.status_code,
                 "headers": dict(response.headers),
+                **(metadata or {}),
             },
         )