Files
omniread/omniread/html/parser.py
Vishesh 'ironeagle' Bangotra 7f1b0d9c10 docs: add contract-oriented docstrings across core, html, and pdf layers
- docs(core): document Content and ContentType canonical models
- docs(core): define BaseParser contract and parsing semantics
- docs(core): define BaseScraper contract and acquisition semantics
- docs(html): document HTML package purpose and scope
- docs(html): add HTMLParser base with DOM helpers and contracts
- docs(html): add HTTP-based HTMLScraper with content-type enforcement
- docs(pdf): document PDF package structure and public pipeline
- docs(pdf): add BasePDFClient abstraction and filesystem implementation
- docs(pdf): add PDFParser base contract for binary parsing
- docs(pdf): add PDFScraper coordinating client and Content normalization
- docs(api): expand top-level omniread module with install instructions and examples
2026-01-09 15:51:22 +05:30

178 lines
5.0 KiB
Python

"""
HTML parser base implementations for OmniRead.
This module provides reusable HTML parsing utilities built on top of
the abstract parser contracts defined in `omniread.core.parser`.
It supplies:
- Content-type enforcement for HTML inputs
- BeautifulSoup initialization and lifecycle management
- Common helper methods for extracting structured data from HTML elements
Concrete parsers must subclass `HTMLParser` and implement the `parse()` method
to return a structured representation appropriate for their use case.
"""
from typing import Any, Generic, TypeVar, Optional
from abc import abstractmethod
from bs4 import BeautifulSoup, Tag
from omniread.core.content import ContentType, Content
from omniread.core.parser import BaseParser
T = TypeVar("T")
class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    This class extends the core `BaseParser` with HTML-specific behavior,
    including DOM parsing via BeautifulSoup and reusable extraction helpers.

    Characteristics:
    - Accepts only HTML content (`supported_types` is `{ContentType.HTML}`)
    - Owns a parsed BeautifulSoup DOM tree (`self._soup`)
    - Provides pure helper utilities for common HTML structures

    Concrete subclasses must:
    - Define the output type `T`
    - Implement the `parse()` method
    """

    # Set of content types supported by this parser (HTML only).
    supported_types = {ContentType.HTML}

    def __init__(self, content: Content, features: str = "html.parser"):
        """
        Initialize the HTML parser.

        Args:
            content: HTML content to be parsed.
            features: BeautifulSoup parser backend to use
                (e.g., 'html.parser', 'lxml').

        Raises:
            ValueError: If the content payload is empty.
        """
        super().__init__(content)
        self._features = features
        # Parse eagerly so construction fails fast on empty/invalid input.
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------
    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.

        Implementations must fully interpret the HTML DOM and return
        a deterministic, structured output.

        Returns:
            Parsed representation of type `T`.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------
    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        """
        Extract normalized text from a `<div>` element.

        Args:
            div: BeautifulSoup tag representing a `<div>`.
            separator: String used to separate text nodes.

        Returns:
            Flattened, whitespace-normalized text content.
        """
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        """
        Extract the hyperlink reference from an `<a>` element.

        Args:
            a: BeautifulSoup tag representing an anchor.

        Returns:
            The value of the `href` attribute, or None if absent.
            NOTE(review): for multi-valued attributes BeautifulSoup may
            return a list; `href` is single-valued in practice, so the
            declared `Optional[str]` holds for well-formed anchors.
        """
        return a.get("href")

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        """
        Parse an HTML table into a 2D list of strings.

        Args:
            table: BeautifulSoup tag representing a `<table>`.

        Returns:
            A list of rows, where each row is a list of cell text values.
            Rows with no `<td>`/`<th>` cells are omitted.
        """
        rows: list[list[str]] = []
        for tr in table.find_all("tr"):
            cells = [
                cell.get_text(strip=True)
                for cell in tr.find_all(["td", "th"])
            ]
            if cells:
                rows.append(cells)
        return rows

    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------
    def _get_soup(self) -> BeautifulSoup:
        """
        Build a BeautifulSoup DOM tree from raw HTML content.

        Returns:
            Parsed BeautifulSoup document tree.

        Raises:
            ValueError: If the content payload is empty.
        """
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        """
        Extract high-level metadata from the HTML document.

        This includes:
        - Document title
        - `<meta>` tag name/property -> content mappings

        Tags that carry a `content` attribute but no `name` or `property`
        (e.g. bare `<meta content="...">`) are skipped; previously they
        collapsed onto a single `None` key, overwriting one another.

        Returns:
            Dictionary with keys "title" (str or None) and "meta"
            (dict mapping name/property to content).
        """
        soup = self._soup
        title = soup.title.string.strip() if soup.title and soup.title.string else None
        # Require both a usable key and a content value before including
        # the tag; the walrus binds the key once per tag.
        meta = {
            key: tag.get("content")
            for tag in soup.find_all("meta")
            if tag.get("content") and (key := tag.get("name") or tag.get("property"))
        }
        return {
            "title": title,
            "meta": meta,
        }