"""
|
|
HTML parser base implementations for OmniRead.
|
|
|
|
This module provides reusable HTML parsing utilities built on top of
|
|
the abstract parser contracts defined in `omniread.core.parser`.
|
|
|
|
It supplies:
|
|
- Content-type enforcement for HTML inputs
|
|
- BeautifulSoup initialization and lifecycle management
|
|
- Common helper methods for extracting structured data from HTML elements
|
|
|
|
Concrete parsers must subclass `HTMLParser` and implement the `parse()` method
|
|
to return a structured representation appropriate for their use case.
|
|
"""
|
|
|
|
from abc import abstractmethod
from typing import Any, Generic, Optional, TypeVar

from bs4 import BeautifulSoup, Tag

from omniread.core.content import Content, ContentType
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    This class extends the core `BaseParser` with HTML-specific behavior,
    including DOM parsing via BeautifulSoup and reusable, pure helpers for
    extracting structured data from common HTML elements.

    Characteristics:
    - Accepts only HTML content
    - Owns a parsed BeautifulSoup DOM tree
    - Provides pure helper utilities for common HTML structures

    Concrete subclasses must:
    - Define the output type `T`
    - Implement the `parse()` method
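
    Example (illustrative sketch; `TitleParser` and `html_content` are
    hypothetical names, not part of the library):

        class TitleParser(HTMLParser[str]):
            def parse(self) -> str:
                return self.parse_meta()["title"] or ""

        parser = TitleParser(html_content)  # html_content: an HTML `Content`
        title = parser.parse()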
"""
|
|
|
|
    supported_types = {ContentType.HTML}
    """Set of content types supported by this parser (HTML only)."""

    def __init__(self, content: Content, features: str = "html.parser"):
        """
        Initialize the HTML parser.

        Args:
            content: HTML content to be parsed.
            features: BeautifulSoup parser backend to use
                (e.g., 'html.parser', 'lxml').

        Raises:
            ValueError: If the content payload is empty.
        """
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------

    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.

        Implementations must fully interpret the HTML DOM and return
        a deterministic, structured result.

        Returns:
            Parsed representation of type `T`.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------

    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        """
        Extract normalized text from a `<div>` element.

        Args:
            div: BeautifulSoup tag representing a `<div>`.
            separator: String used to separate text nodes.

        Returns:
            Flattened, whitespace-normalized text content.
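
        Example (illustrative doctest-style sketch):

            >>> from bs4 import BeautifulSoup
            >>> div = BeautifulSoup("<div> a <b>b</b> </div>", "html.parser").div
            >>> HTMLParser.parse_div(div)
            'a b'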
"""
|
|
return div.get_text(separator=separator, strip=True)
|
|
|
|
    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        """
        Extract the hyperlink reference from an `<a>` element.

        Args:
            a: BeautifulSoup tag representing an anchor.

        Returns:
            The value of the `href` attribute, or None if absent.
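
        Example (illustrative doctest-style sketch):

            >>> a = BeautifulSoup('<a href="/docs">Docs</a>', "html.parser").a
            >>> HTMLParser.parse_link(a)
            '/docs'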
"""
|
|
return a.get("href")
|
|
|
|
    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        """
        Parse an HTML table into a 2D list of strings.

        Args:
            table: BeautifulSoup tag representing a `<table>`.

        Returns:
            A list of rows, where each row is a list of cell text values.
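
        Example (illustrative doctest-style sketch):

            >>> html = "<table><tr><th>A</th></tr><tr><td>1</td></tr></table>"
            >>> HTMLParser.parse_table(BeautifulSoup(html, "html.parser").table)
            [['A'], ['1']]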
"""
|
|
rows: list[list[str]] = []
|
|
for tr in table.find_all("tr"):
|
|
cells = [
|
|
cell.get_text(strip=True)
|
|
for cell in tr.find_all(["td", "th"])
|
|
]
|
|
if cells:
|
|
rows.append(cells)
|
|
return rows
|
|
|
|
    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        """
        Build a BeautifulSoup DOM tree from raw HTML content.

        Returns:
            Parsed BeautifulSoup document tree.

        Raises:
            ValueError: If the content payload is empty.
        """
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        """
        Extract high-level metadata from the HTML document.

        This includes:
        - Document title
        - `<meta>` tag name/property → content mappings

        Returns:
            Dictionary containing extracted metadata.
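
        Example (illustrative sketch; `MyParser` is a hypothetical concrete
        subclass and `html_content` an HTML `Content` instance):

            parser = MyParser(html_content)
            parser.parse_meta()
            # -> {"title": "Page title", "meta": {"description": "...", ...}}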
"""
|
|
soup = self._soup
|
|
|
|
title = soup.title.string.strip() if soup.title and soup.title.string else None
|
|
|
|
meta = {
|
|
tag.get("name") or tag.get("property"): tag.get("content")
|
|
for tag in soup.find_all("meta")
|
|
if tag.get("content")
|
|
}
|
|
|
|
return {
|
|
"title": title,
|
|
"meta": meta,
|
|
}
|