Added validation for the response Content-Type header
This commit is contained in:
@@ -25,6 +25,21 @@ class HTMLScraper(BaseScraper):
|
||||
headers=headers,
|
||||
follow_redirects=follow_redirects,
|
||||
)
|
||||
self.content_type = ContentType.HTML
|
||||
|
||||
def validate_content_type(
    self,
    response: httpx.Response,
):
    """Check that the response declares this scraper's content type.

    The ``Content-Type`` header is read from ``response``. Anything after
    the first ``;`` (parameters such as ``charset``) is dropped, and the
    remaining media type is trimmed and lower-cased before it is compared
    with ``self.content_type.value``.

    Args:
        response: The HTTP response whose headers are inspected.

    Raises:
        ValueError: If the header is absent or empty, or if its media type
            differs from ``self.content_type.value``.
    """
    header_value = response.headers.get("Content-Type")
    if header_value:
        # Keep only the media type; header parameters are ignored.
        media_type, _, _ = header_value.partition(";")
        if media_type.strip().lower() == self.content_type.value:
            return
        # The message reports the raw header so the caller sees exactly
        # what the server sent.
        raise ValueError(f"Expected HTML content, got '{header_value}'")
    raise ValueError("Missing Content-Type header")
|
||||
|
||||
def fetch(
|
||||
self,
|
||||
@@ -34,11 +49,12 @@ class HTMLScraper(BaseScraper):
|
||||
) -> Content:
|
||||
response = self._client.get(source)
|
||||
response.raise_for_status()
|
||||
self.validate_content_type(response)
|
||||
|
||||
return Content(
|
||||
raw=response.content,
|
||||
source=source,
|
||||
content_type=ContentType.HTML,
|
||||
content_type=self.content_type,
|
||||
metadata={
|
||||
"status_code": response.status_code,
|
||||
"headers": dict(response.headers),
|
||||
|
||||
Reference in New Issue
Block a user