From 202329e190d53b2e78833cf7bc4b4656982c854c Mon Sep 17 00:00:00 2001 From: Vishesh 'ironeagle' Bangotra Date: Fri, 2 Jan 2026 18:08:46 +0530 Subject: [PATCH] refactor(html-scraper): normalize Content-Type and inject httpx client - Inject httpx.Client for testability and reuse - Normalize the content type reported on the returned Content (the response's Content-Type header is not validated; it is still available under metadata["headers"]) - Emit ContentType.HTML instead of raw header strings - Avoid per-request client creation - Preserve metadata while allowing caller overrides --- omniread/html/scraper.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/omniread/html/scraper.py b/omniread/html/scraper.py index cb8e903..22234c1 100644 --- a/omniread/html/scraper.py +++ b/omniread/html/scraper.py @@ -1,13 +1,13 @@ import httpx from typing import Any, Mapping, Optional -from omniread.core.content import Content +from omniread.core.content import Content, ContentType from omniread.core.scraper import BaseScraper class HTMLScraper(BaseScraper): """ - Base HTTP scraper using httpx. + Base HTML scraper using httpx. Fetches raw bytes and metadata only. 
""" @@ -15,13 +15,16 @@ class HTMLScraper(BaseScraper): def __init__( self, *, + client: httpx.Client | None = None, timeout: float = 15.0, headers: Optional[Mapping[str, str]] = None, follow_redirects: bool = True, ): - self.timeout = timeout - self.headers = dict(headers) if headers else {} - self.follow_redirects = follow_redirects + self._client = client or httpx.Client( + timeout=timeout, + headers=headers, + follow_redirects=follow_redirects, + ) def fetch( self, @@ -29,20 +32,16 @@ class HTMLScraper(BaseScraper): *, metadata: Optional[Mapping[str, Any]] = None, ) -> Content: - with httpx.Client( - timeout=self.timeout, - headers=self.headers, - follow_redirects=self.follow_redirects, - ) as client: - response = client.get(source) - response.raise_for_status() + response = self._client.get(source) + response.raise_for_status() return Content( raw=response.content, source=source, - content_type=response.headers.get("Content-Type"), + content_type=ContentType.HTML, metadata={ "status_code": response.status_code, "headers": dict(response.headers), + **(metadata or {}), }, )