Files
omniread/omniread/html/scraper.py

49 lines
1.2 KiB
Python

import httpx
from typing import Any, Mapping, Optional
from omniread.core.content import Content
from omniread.core.scraper import BaseScraper
class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only; the response body is passed on
    as-is without being decoded or parsed here.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        """
        Store the HTTP client settings used by every ``fetch`` call.

        Args:
            timeout: Value forwarded as ``timeout`` to ``httpx.Client``.
            headers: Optional request headers. They are copied into a new
                dict, so later changes to the caller's mapping do not
                affect this scraper.
            follow_redirects: Value forwarded as ``follow_redirects`` to
                ``httpx.Client``.
        """
        self.timeout = timeout
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Issue a GET request for ``source`` and wrap the response in ``Content``.

        Args:
            source: URL to request.
            metadata: Optional extra entries to include in the returned
                content's metadata. The ``status_code`` and ``headers``
                keys are always set from the HTTP response and override
                caller-supplied entries with the same names.

        Returns:
            A ``Content`` built from the raw response body, the requested
            ``source``, the response's ``Content-Type`` header (``None``
            when the header is absent) and the merged metadata.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an
            unsuccessful response (``httpx.HTTPStatusError`` according to
            the httpx documentation), plus any error raised by the
            request itself.
        """
        # A fresh client per call; the ``with`` block closes it on exit.
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

        # Start from the caller's metadata (previously ignored), then apply
        # the response-derived keys last so they always reflect the real
        # HTTP exchange.
        merged_metadata = dict(metadata) if metadata else {}
        merged_metadata["status_code"] = response.status_code
        merged_metadata["headers"] = dict(response.headers)

        return Content(
            raw=response.content,
            source=source,
            content_type=response.headers.get("Content-Type"),
            metadata=merged_metadata,
        )