diff --git a/omniread/html/scraper.py b/omniread/html/scraper.py
index 22234c1..9d9de23 100644
--- a/omniread/html/scraper.py
+++ b/omniread/html/scraper.py
@@ -25,6 +25,21 @@ class HTMLScraper(BaseScraper):
headers=headers,
follow_redirects=follow_redirects,
)
+ self.content_type = ContentType.HTML
+
+    def validate_content_type(self, response: httpx.Response):
+        """Raise ValueError unless the response's Content-Type matches ``self.content_type``."""
+        header_value = response.headers.get("Content-Type")
+        if not header_value:
+            raise ValueError("Missing Content-Type header")
+
+        # Compare only the media type: drop ";"-separated parameters, trim, lowercase.
+        media_type = header_value.partition(";")[0].strip().lower()
+        if media_type == self.content_type.value:
+            return
+
+        # Report the header exactly as received, parameters included.
+        raise ValueError(f"Expected HTML content, got '{header_value}'")
def fetch(
self,
@@ -34,11 +49,12 @@ class HTMLScraper(BaseScraper):
) -> Content:
response = self._client.get(source)
response.raise_for_status()
+ self.validate_content_type(response)
return Content(
raw=response.content,
source=source,
- content_type=ContentType.HTML,
+ content_type=self.content_type,
metadata={
"status_code": response.status_code,
"headers": dict(response.headers),