Files
omniread/omniread/html/scraper.py

64 lines
1.7 KiB
Python

import httpx
from typing import Any, Mapping, Optional
from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper
class HTMLScraper(BaseScraper):
    """
    Base HTML scraper using httpx.

    Fetches raw bytes and metadata only.

    The scraper can be used as a context manager. If it created its own
    ``httpx.Client`` (no ``client`` argument was given), that client is
    closed on exit or by calling :meth:`close`. A client supplied by the
    caller is never closed here; its lifetime stays with the caller.
    """

    def __init__(
        self,
        *,
        client: httpx.Client | None = None,
        timeout: float = 15.0,
        headers: Mapping[str, str] | None = None,
        follow_redirects: bool = True,
    ) -> None:
        """
        Create the scraper.

        Args:
            client: Pre-configured client to reuse. When given, ``timeout``,
                ``headers`` and ``follow_redirects`` are ignored.
            timeout: Timeout passed to the internally created client.
            headers: Default headers passed to the internally created client.
            follow_redirects: Redirect policy passed to the internally
                created client.
        """
        # Track ownership so close() only releases a client we created.
        self._owns_client = client is None
        if client is None:
            client = httpx.Client(
                timeout=timeout,
                headers=headers,
                follow_redirects=follow_redirects,
            )
        self._client = client
        self.content_type = ContentType.HTML

    def close(self) -> None:
        """Close the underlying client if this scraper created it."""
        if self._owns_client:
            self._client.close()

    def __enter__(self) -> "HTMLScraper":
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Release the owned client; exceptions are not suppressed."""
        self.close()

    def validate_content_type(
        self,
        response: httpx.Response,
    ) -> None:
        """
        Check that the response declares the expected content type.

        Only the media type is compared: parameters after ``;`` (such as a
        charset) are dropped, and the value is stripped and lower-cased
        before comparison with ``self.content_type.value``.

        Raises:
            ValueError: If the ``Content-Type`` header is missing or empty,
                or its media type differs from the expected one.
        """
        raw_ct = response.headers.get("Content-Type")
        if not raw_ct:
            raise ValueError("Missing Content-Type header")

        base_ct = raw_ct.split(";", 1)[0].strip().lower()
        if base_ct != self.content_type.value:
            raise ValueError(
                f"Expected HTML content, got '{raw_ct}'"
            )

    def fetch(
        self,
        source: str,
        *,
        metadata: Mapping[str, Any] | None = None,
    ) -> Content:
        """
        GET ``source`` and wrap the raw response body in a ``Content``.

        Args:
            source: URL to request.
            metadata: Extra entries merged into the result's metadata. They
                are applied last, so they override the ``status_code`` and
                ``headers`` keys when the names collide.

        Returns:
            A ``Content`` holding the response bytes, the source URL, this
            scraper's content type, and the merged metadata.

        Raises:
            httpx.HTTPStatusError: Propagated from ``raise_for_status()``
                for error status codes.
            ValueError: If the content-type validation fails.
        """
        response = self._client.get(source)
        response.raise_for_status()
        self.validate_content_type(response)

        return Content(
            raw=response.content,
            source=source,
            content_type=self.content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
                **(metadata or {}),
            },
        )