49 lines
1.2 KiB
Python
49 lines
1.2 KiB
Python
import httpx
|
|
from typing import Any, Mapping, Optional
|
|
|
|
from omniread.core.content import Content
|
|
from omniread.core.scraper import BaseScraper
|
|
|
|
|
|
class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only: the response body is handed to
    ``Content`` as-is, without decoding or parsing.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ) -> None:
        """
        Store the HTTP settings used for every ``fetch`` call.

        Args:
            timeout: Timeout value passed to ``httpx.Client``.
            headers: Optional request headers sent with every request.
                The mapping is copied, so later changes to the caller's
                object do not affect this scraper.
            follow_redirects: Passed to ``httpx.Client`` to control whether
                redirects are followed.
        """
        self.timeout = timeout
        # Copy into a plain dict so the scraper owns its own header mapping.
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Issue a GET request for ``source`` and wrap the response in ``Content``.

        Args:
            source: URL to request.
            metadata: Optional caller-supplied metadata. Its entries are
                copied into the returned ``Content.metadata``. The
                ``"status_code"`` and ``"headers"`` keys are reserved for
                values taken from the HTTP response and take precedence
                over caller entries with the same name.

        Returns:
            A ``Content`` carrying the raw response body, the requested
            ``source``, the response ``Content-Type`` header (``None`` when
            the header is absent) and the merged metadata.

        Raises:
            httpx.HTTPStatusError: Raised by ``raise_for_status()`` for
                4xx/5xx responses.
            httpx.RequestError: Presumably propagated from ``client.get``
                on transport failures (timeouts, connection errors); this
                method does not catch anything.
        """
        # A fresh client per call; the context manager closes it on exit.
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

        # Start from the caller's metadata (previously accepted but silently
        # dropped), then write the response-derived keys last so they always
        # describe the actual HTTP response.
        merged_metadata = dict(metadata) if metadata else {}
        merged_metadata["status_code"] = response.status_code
        merged_metadata["headers"] = dict(response.headers)

        return Content(
            raw=response.content,
            source=source,
            content_type=response.headers.get("Content-Type"),
            metadata=merged_metadata,
        )
|