def validate_content_type(
    self,
    response: "httpx.Response",
) -> None:
    """Check that the response's Content-Type matches this scraper's type.

    The ``Content-Type`` header is read from ``response.headers``. Anything
    after the first ``;`` (for example a ``charset`` parameter) is ignored,
    and the remaining media type is compared case-insensitively against
    ``self.content_type.value``.

    Args:
        response: The HTTP response whose headers are inspected.

    Raises:
        ValueError: If the header is missing or empty, or if its media type
            differs from ``self.content_type.value``.
    """
    raw_ct = response.headers.get("Content-Type")
    if not raw_ct:
        raise ValueError("Missing Content-Type header")

    # Keep only the media type itself; parameters follow the first ";".
    base_ct = raw_ct.partition(";")[0].strip().lower()

    # Media types are case-insensitive, so normalize the expected value the
    # same way as the received one instead of lower-casing only one side.
    # NOTE(review): assumes content_type.value is a MIME string such as
    # "text/html" -- confirm against the ContentType enum.
    expected_ct = str(self.content_type.value).strip().lower()

    if base_ct != expected_ct:
        # The message reports the raw header so callers see exactly what
        # the server sent, parameters included.
        raise ValueError(
            f"Expected HTML content, got '{raw_ct}'"
        )