"""
|
|
HTML parser base implementations for OmniRead.
|
|
|
|
This module provides reusable HTML parsing utilities built on top of
|
|
the abstract parser contracts defined in `omniread.core.parser`.
|
|
|
|
It supplies:
|
|
- Content-type enforcement for HTML inputs
|
|
- BeautifulSoup initialization and lifecycle management
|
|
- Common helper methods for extracting structured data from HTML elements
|
|
|
|
Concrete parsers must subclass `HTMLParser` and implement the `parse()` method
|
|
to return a structured representation appropriate for their use case.
|
|
"""
|
|
|
|
from abc import abstractmethod
from typing import Any, Generic, Optional, TypeVar

from bs4 import BeautifulSoup, Tag

from omniread.core.content import Content, ContentType
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    This class extends the core `BaseParser` with HTML-specific behavior,
    including DOM parsing via BeautifulSoup and reusable, pure helpers for
    extracting structured data from common HTML elements.

    Characteristics:
    - Accepts only HTML content
    - Owns a parsed BeautifulSoup DOM tree
    - Provides pure helper utilities for common HTML structures

    Concrete subclasses must:
    - Define the output type `T`
    - Implement the `parse()` method
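
    Example (illustrative sketch; `TitleParser` and `html_content` are
    hypothetical names, not part of the library):

        class TitleParser(HTMLParser[str]):
            def parse(self) -> str:
                return self.parse_meta()["title"] or ""

        parser = TitleParser(html_content)  # html_content: an HTML `Content`
        title = parser.parse()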
"""
|
|
|
|
    supported_types = {ContentType.HTML}
    """Set of content types supported by this parser (HTML only)."""

    def __init__(self, content: Content, features: str = "html.parser"):
        """
        Initialize the HTML parser.

        Args:
            content: HTML content to be parsed.
            features: BeautifulSoup parser backend to use
                (e.g., 'html.parser', 'lxml').

        Raises:
            ValueError: If the content payload is empty.
        """
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------

    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.

        Implementations must fully interpret the HTML DOM and return
        a deterministic, structured result.

        Returns:
            Parsed representation of type `T`.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------

    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        """
        Extract normalized text from a `<div>` element.

        Args:
            div: BeautifulSoup tag representing a `<div>`.
            separator: String used to separate text nodes.

        Returns:
            Flattened, whitespace-normalized text content.
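
        Example (illustrative doctest-style sketch):

            >>> from bs4 import BeautifulSoup
            >>> div = BeautifulSoup("<div> a <b>b</b> </div>", "html.parser").div
            >>> HTMLParser.parse_div(div)
            'a b'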
"""
|
|
return div.get_text(separator=separator, strip=True)
|
|
|
|
    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        """
        Extract the hyperlink reference from an `<a>` element.

        Args:
            a: BeautifulSoup tag representing an anchor.

        Returns:
            The value of the `href` attribute, or None if absent.
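
        Example (illustrative doctest-style sketch):

            >>> a = BeautifulSoup('<a href="/docs">Docs</a>', "html.parser").a
            >>> HTMLParser.parse_link(a)
            '/docs'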
"""
|
|
return a.get("href")
|
|
|
|
    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        """
        Parse an HTML table into a 2D list of strings.

        Args:
            table: BeautifulSoup tag representing a `<table>`.

        Returns:
            A list of rows, where each row is a list of cell text values.
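
        Example (illustrative doctest-style sketch):

            >>> html = "<table><tr><th>A</th></tr><tr><td>1</td></tr></table>"
            >>> HTMLParser.parse_table(BeautifulSoup(html, "html.parser").table)
            [['A'], ['1']]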
"""
|
|
rows: list[list[str]] = []
|
|
for tr in table.find_all("tr"):
|
|
cells = [
|
|
cell.get_text(strip=True)
|
|
for cell in tr.find_all(["td", "th"])
|
|
]
|
|
if cells:
|
|
rows.append(cells)
|
|
return rows
|
|
|
|
    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        """
        Build a BeautifulSoup DOM tree from raw HTML content.

        Returns:
            Parsed BeautifulSoup document tree.

        Raises:
            ValueError: If the content payload is empty.
        """
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        """
        Extract high-level metadata from the HTML document.

        This includes:
        - Document title
        - `<meta>` tag name/property → content mappings

        Returns:
            Dictionary containing extracted metadata.
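
        Example (illustrative sketch; `MyParser` is a hypothetical concrete
        subclass and `html_content` an HTML `Content` instance):

            parser = MyParser(html_content)
            parser.parse_meta()
            # -> {"title": "Page title", "meta": {"description": "...", ...}}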
"""
|
|
soup = self._soup
|
|
|
|
title = soup.title.string.strip() if soup.title and soup.title.string else None
|
|
|
|
meta = {
|
|
tag.get("name") or tag.get("property"): tag.get("content")
|
|
for tag in soup.find_all("meta")
|
|
if tag.get("content")
|
|
}
|
|
|
|
return {
|
|
"title": title,
|
|
"meta": meta,
|
|
}
|