from typing import Any, Mapping, Optional

import httpx

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTML scraper using httpx.

    Fetches raw bytes and metadata only; the response body is not decoded
    or parsed here.

    The scraper can be used as a context manager. On exit (or on an explicit
    ``close()``) it closes the underlying ``httpx.Client`` only if the
    scraper created that client itself; an injected client stays under the
    caller's control.
    """

    def __init__(
        self,
        *,
        client: Optional[httpx.Client] = None,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ) -> None:
        """
        Create the scraper.

        Args:
            client: Pre-configured client to reuse. When given, ``timeout``,
                ``headers`` and ``follow_redirects`` are ignored and the
                caller remains responsible for closing the client.
            timeout: Timeout passed to the internally created client.
            headers: Default headers passed to the internally created client.
            follow_redirects: Redirect policy passed to the internally
                created client.
        """
        # Remember who owns the client so close() never shuts down a client
        # the caller may still be using elsewhere.
        self._owns_client = client is None
        if client is None:
            client = httpx.Client(
                timeout=timeout,
                headers=headers,
                follow_redirects=follow_redirects,
            )
        self._client = client
        self.content_type = ContentType.HTML

    def close(self) -> None:
        """Close the underlying client if this scraper created it."""
        if self._owns_client:
            self._client.close()

    def __enter__(self) -> "HTMLScraper":
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Release the owned client; exceptions are not suppressed."""
        self.close()

    def validate_content_type(
        self,
        response: httpx.Response,
    ) -> None:
        """
        Check that the response declares the expected content type.

        Only the media type is compared: parameters after the first ``;``
        (such as ``charset``) are dropped, and the remainder is stripped and
        lower-cased before comparison with ``self.content_type.value``.

        Args:
            response: The response whose ``Content-Type`` header is checked.

        Raises:
            ValueError: If the header is missing or empty, or if its media
                type differs from ``self.content_type.value``.
        """
        raw_ct = response.headers.get("Content-Type")
        if not raw_ct:
            raise ValueError("Missing Content-Type header")

        # NOTE(review): assumes ContentType.HTML.value is a lower-case media
        # type string (e.g. "text/html") -- confirm against ContentType.
        base_ct = raw_ct.split(";", 1)[0].strip().lower()
        if base_ct != self.content_type.value:
            raise ValueError(
                f"Expected HTML content, got '{raw_ct}'"
            )

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        GET ``source`` and wrap the raw response body in a ``Content``.

        Args:
            source: URL to request.
            metadata: Extra entries merged into the result's metadata. They
                are unpacked last, so a caller-supplied ``"status_code"`` or
                ``"headers"`` key overrides the value taken from the
                response.

        Returns:
            A ``Content`` holding the response bytes, the source URL, this
            scraper's content type and the merged metadata.

        Raises:
            ValueError: If the ``Content-Type`` check fails.

        Errors raised by the client's ``get`` call or by
        ``response.raise_for_status()`` are propagated unchanged.
        """
        response = self._client.get(source)
        response.raise_for_status()
        self.validate_content_type(response)

        return Content(
            raw=response.content,
            source=source,
            content_type=self.content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
                **(metadata or {}),
            },
        )