64 lines
1.7 KiB
Python
64 lines
1.7 KiB
Python
import httpx
|
|
from typing import Any, Mapping, Optional
|
|
|
|
from omniread.core.content import Content, ContentType
|
|
from omniread.core.scraper import BaseScraper
|
|
|
|
|
|
class HTMLScraper(BaseScraper):
    """
    Base HTML scraper using httpx.

    Fetches raw bytes and metadata only.

    The scraper performs a GET request through an ``httpx.Client``, checks
    that the response declares the expected content type, and wraps the
    response body in a ``Content`` object. No parsing of the body is done
    here.

    If no client is supplied, the scraper creates one and owns it; call
    ``close()`` (or use the scraper as a context manager) to release it.
    A client supplied by the caller is never closed by the scraper.
    """

    def __init__(
        self,
        *,
        client: httpx.Client | None = None,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        """
        Create the scraper.

        Args:
            client: Pre-configured client to use. When given, ``timeout``,
                ``headers`` and ``follow_redirects`` are ignored because the
                client already carries its own configuration.
            timeout: Timeout passed to the internally created client.
            headers: Default headers passed to the internally created client.
            follow_redirects: Redirect policy passed to the internally
                created client.
        """
        # Track ownership so close() only tears down a client built here.
        self._owns_client = client is None

        # Explicit None check: a caller-supplied client must be honoured even
        # if the object happens to evaluate as falsy (e.g. a test double).
        if client is None:
            client = httpx.Client(
                timeout=timeout,
                headers=headers,
                follow_redirects=follow_redirects,
            )

        self._client = client
        self.content_type = ContentType.HTML

    def close(self) -> None:
        """
        Release the underlying client if this scraper created it.

        A client passed in by the caller is left open; its lifetime remains
        the caller's responsibility.
        """
        if self._owns_client:
            self._client.close()

    def __enter__(self) -> "HTMLScraper":
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type: object, exc_value: object, traceback: object) -> None:
        """Close the owned client on leaving the ``with`` block."""
        self.close()

    def validate_content_type(
        self,
        response: httpx.Response,
    ) -> None:
        """
        Ensure the response declares the content type this scraper expects.

        Only the media type is compared; parameters after the first ``;``
        (such as ``charset``) are ignored, and the comparison is
        case-insensitive on the header side.

        Args:
            response: The response whose ``Content-Type`` header is checked.

        Raises:
            ValueError: If the header is missing or empty, or if its media
                type differs from ``self.content_type.value``.
        """
        raw_ct = response.headers.get("Content-Type")
        if not raw_ct:
            raise ValueError("Missing Content-Type header")

        # Drop parameters ("; charset=utf-8") and normalise case/whitespace.
        # NOTE(review): assumes ContentType.HTML.value is a lowercase media
        # type string such as "text/html" -- confirm against ContentType.
        base_ct = raw_ct.split(";", 1)[0].strip().lower()
        if base_ct != self.content_type.value:
            raise ValueError(
                f"Expected HTML content, got '{raw_ct}'"
            )

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch ``source`` over HTTP and return its raw body as ``Content``.

        Args:
            source: URL to request with GET.
            metadata: Extra metadata merged into the result. Keys given here
                override the built-in ``status_code`` and ``headers`` entries
                because they are applied last.

        Returns:
            A ``Content`` carrying the response bytes, the original
            ``source``, this scraper's content type, and a metadata dict.

        Raises:
            httpx.HTTPStatusError: Propagated from ``raise_for_status()`` for
                an error status response.
            ValueError: If the response content type is missing or is not
                the expected one.
        """
        response = self._client.get(source)
        response.raise_for_status()
        self.validate_content_type(response)

        return Content(
            raw=response.content,
            source=source,
            content_type=self.content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
                **(metadata or {}),
            },
        )
|