Compare commits
5 Commits
32ee43e77a
...
07293e4651
| Author | SHA1 | Date | |
|---|---|---|---|
| 07293e4651 | |||
| fa14a79ec9 | |||
| 55245cf241 | |||
| 202329e190 | |||
| f59024ddd5 |
@@ -1,13 +1,13 @@
|
|||||||
import httpx
|
import httpx
|
||||||
from typing import Any, Mapping, Optional
|
from typing import Any, Mapping, Optional
|
||||||
|
|
||||||
from omniread.core.content import Content
|
from omniread.core.content import Content, ContentType
|
||||||
from omniread.core.scraper import BaseScraper
|
from omniread.core.scraper import BaseScraper
|
||||||
|
|
||||||
|
|
||||||
class HTMLScraper(BaseScraper):
|
class HTMLScraper(BaseScraper):
|
||||||
"""
|
"""
|
||||||
Base HTTP scraper using httpx.
|
Base HTML scraper using httpx.
|
||||||
|
|
||||||
Fetches raw bytes and metadata only.
|
Fetches raw bytes and metadata only.
|
||||||
"""
|
"""
|
||||||
@@ -15,13 +15,31 @@ class HTMLScraper(BaseScraper):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
|
client: httpx.Client | None = None,
|
||||||
timeout: float = 15.0,
|
timeout: float = 15.0,
|
||||||
headers: Optional[Mapping[str, str]] = None,
|
headers: Optional[Mapping[str, str]] = None,
|
||||||
follow_redirects: bool = True,
|
follow_redirects: bool = True,
|
||||||
):
|
):
|
||||||
self.timeout = timeout
|
self._client = client or httpx.Client(
|
||||||
self.headers = dict(headers) if headers else {}
|
timeout=timeout,
|
||||||
self.follow_redirects = follow_redirects
|
headers=headers,
|
||||||
|
follow_redirects=follow_redirects,
|
||||||
|
)
|
||||||
|
self.content_type = ContentType.HTML
|
||||||
|
|
||||||
|
def validate_content_type(
    self,
    response: httpx.Response,
):
    """
    Check that the response declares the content type this scraper expects.

    The ``Content-Type`` header is reduced to its media type: everything
    from the first ``;`` onwards is dropped, then the remainder is stripped
    and lower-cased. The result is compared with ``self.content_type.value``.

    Raises:
        ValueError: If the header is missing or empty, or if the media type
            differs from the expected value.
    """
    raw_ct = response.headers.get("Content-Type")
    if not raw_ct:
        raise ValueError("Missing Content-Type header")

    # Header parameters (e.g. a charset) play no part in the comparison.
    media_type, _, _ = raw_ct.partition(";")
    media_type = media_type.strip().lower()

    if media_type == self.content_type.value:
        return

    raise ValueError(
        f"Expected HTML content, got '{raw_ct}'"
    )
|
||||||
|
|
||||||
def fetch(
|
def fetch(
|
||||||
self,
|
self,
|
||||||
@@ -29,20 +47,17 @@ class HTMLScraper(BaseScraper):
|
|||||||
*,
|
*,
|
||||||
metadata: Optional[Mapping[str, Any]] = None,
|
metadata: Optional[Mapping[str, Any]] = None,
|
||||||
) -> Content:
|
) -> Content:
|
||||||
with httpx.Client(
|
response = self._client.get(source)
|
||||||
timeout=self.timeout,
|
response.raise_for_status()
|
||||||
headers=self.headers,
|
self.validate_content_type(response)
|
||||||
follow_redirects=self.follow_redirects,
|
|
||||||
) as client:
|
|
||||||
response = client.get(source)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
return Content(
|
return Content(
|
||||||
raw=response.content,
|
raw=response.content,
|
||||||
source=source,
|
source=source,
|
||||||
content_type=response.headers.get("Content-Type"),
|
content_type=self.content_type,
|
||||||
metadata={
|
metadata={
|
||||||
"status_code": response.status_code,
|
"status_code": response.status_code,
|
||||||
"headers": dict(response.headers),
|
"headers": dict(response.headers),
|
||||||
|
**(metadata or {}),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
httpx==0.27.0
|
httpx==0.27.0
|
||||||
beautifulsoup4==4.12.0
|
beautifulsoup4==4.12.0
|
||||||
|
pydantic==2.12.3
|
||||||
|
jinja2==3.1.6
|
||||||
# lxml==5.2.0
|
# lxml==5.2.0
|
||||||
|
|
||||||
pytest==7.4.0
|
pytest==7.4.0
|
||||||
|
|||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
65
tests/conftest.py
Normal file
65
tests/conftest.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
import httpx
|
||||||
|
from pathlib import Path
|
||||||
|
from jinja2 import Environment, BaseLoader
|
||||||
|
|
||||||
|
from omniread.core.content import ContentType
|
||||||
|
from omniread.html.scraper import HTMLScraper
|
||||||
|
|
||||||
|
|
||||||
|
MOCK_DIR = Path(__file__).parent / "mocks"
|
||||||
|
|
||||||
|
|
||||||
|
def render_html(template_path, data_path) -> bytes:
    """
    Render a Jinja template file with data from a JSON file.

    Both files are read as UTF-8. The decoded JSON document is unpacked into
    the template as keyword variables, and the rendered text is returned
    encoded as UTF-8 bytes. Autoescaping is switched off.
    """
    template_source = Path(template_path).read_text(encoding="utf-8")
    context = json.loads(Path(data_path).read_text(encoding="utf-8"))

    # The template text is handed over directly through from_string().
    environment = Environment(loader=BaseLoader(), autoescape=False)
    html = environment.from_string(template_source).render(**context)

    return html.encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def mock_transport(request: httpx.Request) -> httpx.Response:
    """
    httpx MockTransport handler.

    Looks at the request path only. ``/simple`` and ``/table`` are answered
    with status 200, a ``Content-Type`` header of ``ContentType.HTML.value``
    and the rendered fixture as the body. Any other path gets a 404 with
    the body ``Not Found``.
    """
    # Request path -> (template file, data file), both inside MOCK_DIR.
    fixtures = {
        "/simple": ("simple.html.jinja", "simple.json"),
        "/table": ("table.html.jinja", "table.json"),
    }

    fixture = fixtures.get(request.url.path)
    if fixture is None:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )

    template_name, data_name = fixture
    body = render_html(MOCK_DIR / template_name, MOCK_DIR / data_name)

    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=body,
        request=request,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def http_scraper(request: pytest.FixtureRequest) -> HTMLScraper:
    """
    Provide an HTMLScraper whose HTTP client is backed by the mock transport.

    No real network traffic takes place: every request made through the
    returned scraper is answered by ``mock_transport``.

    Args:
        request: pytest's built-in fixture request, used only to register
            the client teardown.

    Returns:
        An HTMLScraper wrapping the mocked ``httpx.Client``.
    """
    transport = httpx.MockTransport(mock_transport)
    client = httpx.Client(transport=transport)

    # The fixture creates the client, so it is responsible for closing it;
    # nothing visible in this change closes an injected client.
    request.addfinalizer(client.close)

    return HTMLScraper(client=client)
|
||||||
11
tests/mocks/simple.html.jinja
Normal file
11
tests/mocks/simple.html.jinja
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>{{ title }}</title>
|
||||||
|
<meta name="description" content="{{ description }}">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="content">{{ content }}</div>
|
||||||
|
<a href="{{ link_url }}">{{ link_text }}</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
7
tests/mocks/simple.json
Normal file
7
tests/mocks/simple.json
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"title": "Test Page",
|
||||||
|
"description": "Simple test page",
|
||||||
|
"content": "Hello World",
|
||||||
|
"link_url": "https://example.com",
|
||||||
|
"link_text": "Link"
|
||||||
|
}
|
||||||
31
tests/mocks/table.html.jinja
Normal file
31
tests/mocks/table.html.jinja
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>{{ title }}</title>
|
||||||
|
<meta name="description" content="{{ description }}">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>{{ heading }}</h1>
|
||||||
|
|
||||||
|
<table id="{{ table_id }}">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
{% for col in columns %}
|
||||||
|
<th>{{ col }}</th>
|
||||||
|
{% endfor %}
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for row in rows %}
|
||||||
|
<tr>
|
||||||
|
{% for cell in row %}
|
||||||
|
<td>{{ cell }}</td>
|
||||||
|
{% endfor %}
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<a href="{{ link_url }}">{{ link_text }}</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
14
tests/mocks/table.json
Normal file
14
tests/mocks/table.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"title": "Table Test Page",
|
||||||
|
"description": "HTML page with a table for parsing tests",
|
||||||
|
"heading": "Sample Table",
|
||||||
|
"table_id": "data-table",
|
||||||
|
"columns": ["Name", "Age", "City"],
|
||||||
|
"rows": [
|
||||||
|
["Alice", "30", "London"],
|
||||||
|
["Bob", "25", "New York"],
|
||||||
|
["Charlie", "35", "Berlin"]
|
||||||
|
],
|
||||||
|
"link_url": "https://example.org/details",
|
||||||
|
"link_text": "Details"
|
||||||
|
}
|
||||||
48
tests/test_html_simple.py
Normal file
48
tests/test_html_simple.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from bs4 import Tag
|
||||||
|
|
||||||
|
from omniread.html.parser import HTMLParser
|
||||||
|
from omniread.core.content import Content
|
||||||
|
|
||||||
|
|
||||||
|
class ParsedSimpleHTML(BaseModel):
    """
    Result model for the simple HTML page test.

    Every field has no default, so it must be supplied, but may be ``None``.
    """

    title: str | None
    description: str | None
    content: str | None
    link: str | None
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
    """
    Parser focused on high-level page semantics.

    Collects the page title, the ``description`` meta entry, the ``div``
    with id ``content`` and the first ``a`` element of the document.
    """

    def parse(self) -> ParsedSimpleHTML:
        """
        Build a ``ParsedSimpleHTML`` from the parsed document.

        ``content`` and ``link`` are ``None`` when the corresponding element
        is not found; otherwise they hold whatever ``parse_div`` and
        ``parse_link`` return for it.
        """
        document = self._soup
        page_meta = self.parse_meta()

        body_div = document.find("div", id="content")
        first_anchor: Tag | None = document.find("a")

        title = page_meta["title"]
        description = page_meta["meta"].get("description")

        body_text = None
        if body_div:
            body_text = self.parse_div(body_div)

        href = None
        if first_anchor:
            href = self.parse_link(first_anchor)

        return ParsedSimpleHTML(
            title=title,
            description=description,
            content=body_text,
            link=href,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_end_to_end_html_simple(http_scraper):
    """Fetch the ``/simple`` page, parse it, and check every extracted field."""
    expected_fields = {
        "title": "Test Page",
        "description": "Simple test page",
        "content": "Hello World",
        "link": "https://example.com",
    }

    fetched: Content = http_scraper.fetch("https://test.local/simple")
    parsed = SimpleHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedSimpleHTML)

    # Checked in declaration order so the first mismatch reported matches
    # the order of the model's fields.
    for field_name, expected_value in expected_fields.items():
        assert getattr(parsed, field_name) == expected_value
|
||||||
44
tests/test_html_table.py
Normal file
44
tests/test_html_table.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from omniread.html.parser import HTMLParser
|
||||||
|
from omniread.core.content import Content
|
||||||
|
|
||||||
|
|
||||||
|
class ParsedTableHTML(BaseModel):
    """
    Result model for the table HTML page test.

    ``title`` must be supplied but may be ``None``; ``table`` is a list of
    rows, each row being a list of cell strings.
    """

    title: str | None
    table: list[list[str]]
|
||||||
|
|
||||||
|
|
||||||
|
class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.

    Produces the page title and the contents of the first ``table`` element.
    """

    def parse(self) -> ParsedTableHTML:
        """
        Build a ``ParsedTableHTML`` from the parsed document.

        Returns:
            A model whose ``title`` is the stripped text of ``<title>``, or
            ``None`` when the element is missing or has no single string
            child. Its ``table`` is the result of ``parse_table`` for the
            first ``<table>``, or an empty list when there is none.
        """
        soup = self._soup

        # Tag.string is None for an empty <title> or one with nested markup,
        # so guard it before calling strip().
        title_tag = soup.title
        title_text = title_tag.string if title_tag else None

        table_tag = soup.find("table")

        return ParsedTableHTML(
            title=title_text.strip() if title_text is not None else None,
            table=self.parse_table(table_tag) if table_tag else [],
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_end_to_end_html_table(http_scraper):
    """Fetch the ``/table`` page, parse it, and check the title and all rows."""
    expected_rows = [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]

    fetched: Content = http_scraper.fetch("https://test.local/table")
    parsed = TableHTMLParser(fetched).parse()

    assert isinstance(parsed, ParsedTableHTML)

    assert parsed.title == "Table Test Page"
    assert parsed.table == expected_rows
|
||||||
Reference in New Issue
Block a user