# omniread/omniread/html/scraper.py

import httpx
from typing import Any, Mapping, Optional

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTML scraper using httpx.

    Fetches raw bytes and metadata only.

    The scraper can be used as a context manager so that a client it
    created itself is closed deterministically::

        with HTMLScraper() as scraper:
            content = scraper.fetch("https://example.com")
    """

    def __init__(
        self,
        *,
        client: Optional[httpx.Client] = None,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ) -> None:
        """
        Create the scraper.

        Args:
            client: Existing ``httpx.Client`` to use. When supplied,
                ``timeout``, ``headers`` and ``follow_redirects`` are
                ignored and the caller stays responsible for closing it.
            timeout: Timeout passed to the internally created client.
            headers: Default headers passed to the internally created client.
            follow_redirects: Redirect policy passed to the internally
                created client.
        """
        # Remember who created the client so close() never shuts down a
        # client that the caller may still be using elsewhere.
        self._owns_client = client is None
        if client is None:
            client = httpx.Client(
                timeout=timeout,
                headers=headers,
                follow_redirects=follow_redirects,
            )
        self._client = client
        self.content_type = ContentType.HTML

    def close(self) -> None:
        """
        Close the underlying client if this scraper created it.

        A client passed in by the caller is left open.
        """
        if self._owns_client:
            self._client.close()

    def __enter__(self) -> "HTMLScraper":
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Release the owned client on leaving the ``with`` block."""
        self.close()

    def validate_content_type(
        self,
        response: httpx.Response,
    ) -> None:
        """
        Check that the response's Content-Type matches ``self.content_type``.

        Only the media type is compared; parameters after the first ``;``
        (such as ``charset``) are ignored, as are case and surrounding
        whitespace.

        Args:
            response: Response whose headers are inspected.

        Raises:
            ValueError: If the header is missing or empty, or if its media
                type differs from ``self.content_type.value``.
        """
        raw_ct = response.headers.get("Content-Type")
        if not raw_ct:
            raise ValueError("Missing Content-Type header")

        # "text/html; charset=utf-8" -> "text/html"
        base_ct = raw_ct.split(";", 1)[0].strip().lower()
        if base_ct != self.content_type.value:
            raise ValueError(
                f"Expected HTML content, got '{raw_ct}'"
            )

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        GET ``source`` and wrap the raw response body in a ``Content``.

        Args:
            source: URL to request.
            metadata: Extra entries merged into the returned metadata. On a
                key collision these take precedence over the
                ``status_code`` and ``headers`` entries recorded here.

        Returns:
            A ``Content`` holding the undecoded response bytes, the source
            URL, this scraper's content type, and the merged metadata.

        Raises:
            httpx.HTTPStatusError: If the response has an error status
                (raised by ``response.raise_for_status()``).
            ValueError: If the Content-Type check fails.
        """
        response = self._client.get(source)
        response.raise_for_status()
        self.validate_content_type(response)

        return Content(
            raw=response.content,
            source=source,
            content_type=self.content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
                # Caller-supplied entries come last so they win on collision.
                **(metadata or {}),
            },
        )