{ "module": "omniread.html.parser", "content": { "path": "omniread.html.parser", "docstring": "# Summary\n\nHTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.", "objects": { "Any": { "name": "Any", "kind": "alias", "path": "omniread.html.parser.Any", "signature": "", "docstring": null }, "Generic": { "name": "Generic", "kind": "alias", "path": "omniread.html.parser.Generic", "signature": "", "docstring": null }, "TypeVar": { "name": "TypeVar", "kind": "alias", "path": "omniread.html.parser.TypeVar", "signature": "", "docstring": null }, "Optional": { "name": "Optional", "kind": "alias", "path": "omniread.html.parser.Optional", "signature": "", "docstring": null }, "abstractmethod": { "name": "abstractmethod", "kind": "alias", "path": "omniread.html.parser.abstractmethod", "signature": "", "docstring": null }, "BeautifulSoup": { "name": "BeautifulSoup", "kind": "alias", "path": "omniread.html.parser.BeautifulSoup", "signature": "", "docstring": null }, "Tag": { "name": "Tag", "kind": "alias", "path": "omniread.html.parser.Tag", "signature": "", "docstring": null }, "ContentType": { "name": "ContentType", "kind": "class", "path": "omniread.html.parser.ContentType", "signature": "", "docstring": "Supported MIME types for extracted content.\n\nNotes:\n **Guarantees:**\n\n - This enum represents the declared or inferred media type of the\n content source.\n - It is primarily used for routing content to the appropriate\n parser or downstream consumer.", "members": { "HTML": { "name": "HTML", "kind": 
"attribute", "path": "omniread.html.parser.ContentType.HTML", "signature": "", "docstring": "HTML document content." }, "PDF": { "name": "PDF", "kind": "attribute", "path": "omniread.html.parser.ContentType.PDF", "signature": "", "docstring": "PDF document content." }, "JSON": { "name": "JSON", "kind": "attribute", "path": "omniread.html.parser.ContentType.JSON", "signature": "", "docstring": "JSON document content." }, "XML": { "name": "XML", "kind": "attribute", "path": "omniread.html.parser.ContentType.XML", "signature": "", "docstring": "XML document content." } } }, "Content": { "name": "Content", "kind": "class", "path": "omniread.html.parser.Content", "signature": "", "docstring": "Normalized representation of extracted content.\n\nNotes:\n **Responsibilities:**\n\n - A `Content` instance represents a raw content payload along with\n minimal contextual metadata describing its origin and type.\n - This class is the primary exchange format between scrapers,\n parsers, and downstream consumers.", "members": { "raw": { "name": "raw", "kind": "attribute", "path": "omniread.html.parser.Content.raw", "signature": "", "docstring": "Raw content bytes as retrieved from the source." }, "source": { "name": "source", "kind": "attribute", "path": "omniread.html.parser.Content.source", "signature": "", "docstring": "Identifier of the content origin (URL, file path, or logical name)." }, "content_type": { "name": "content_type", "kind": "attribute", "path": "omniread.html.parser.Content.content_type", "signature": "", "docstring": "Optional MIME type of the content, if known." }, "metadata": { "name": "metadata", "kind": "attribute", "path": "omniread.html.parser.Content.metadata", "signature": "", "docstring": "Optional, implementation-defined metadata associated with the content (e.g., headers, encoding hints, extraction notes)." 
} } }, "BaseParser": { "name": "BaseParser", "kind": "class", "path": "omniread.html.parser.BaseParser", "signature": "", "docstring": "Base interface for all parsers.\n\nNotes:\n **Guarantees:**\n\n - A parser is a self-contained object that owns the `Content` it is\n responsible for interpreting.\n - Consumers may rely on early validation of content compatibility\n and type-stable return values from `parse()`.\n\n **Responsibilities:**\n\n - Implementations must declare supported content types via `supported_types`.\n - Implementations must raise parsing-specific exceptions from `parse()`.\n - Implementations must remain deterministic for a given input.", "members": { "supported_types": { "name": "supported_types", "kind": "attribute", "path": "omniread.html.parser.BaseParser.supported_types", "signature": "", "docstring": "Set of content types supported by this parser. An empty set indicates that the parser is content-type agnostic." }, "content": { "name": "content", "kind": "attribute", "path": "omniread.html.parser.BaseParser.content", "signature": "", "docstring": null }, "parse": { "name": "parse", "kind": "function", "path": "omniread.html.parser.BaseParser.parse", "signature": "", "docstring": "Parse the owned content into structured output.\n\nReturns:\n T:\n Parsed, structured representation.\n\nRaises:\n Exception:\n Parsing-specific errors as defined by the implementation.\n\nNotes:\n **Responsibilities:**\n\n - Implementations must fully consume the provided content and\n return a deterministic, structured output." }, "supports": { "name": "supports", "kind": "function", "path": "omniread.html.parser.BaseParser.supports", "signature": "", "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n bool:\n True if the content type is supported; False otherwise." 
} } }, "T": { "name": "T", "kind": "attribute", "path": "omniread.html.parser.T", "signature": null, "docstring": null }, "HTMLParser": { "name": "HTMLParser", "kind": "class", "path": "omniread.html.parser.HTMLParser", "signature": "", "docstring": "Base HTML parser.\n\nNotes:\n **Responsibilities:**\n\n - This class extends the core `BaseParser` with HTML-specific behavior,\n including DOM parsing via BeautifulSoup and reusable extraction helpers.\n - Provides reusable helpers for HTML extraction. Concrete parsers must\n explicitly define the return type.\n\n **Guarantees:**\n\n - Accepts only HTML content.\n - Owns a parsed BeautifulSoup DOM tree.\n - Provides pure helper utilities for common HTML structures.\n\n **Constraints:**\n\n - Concrete subclasses must define the output type `T` and implement\n the `parse()` method.", "members": { "supported_types": { "name": "supported_types", "kind": "attribute", "path": "omniread.html.parser.HTMLParser.supported_types", "signature": null, "docstring": "Set of content types supported by this parser (HTML only)." }, "parse": { "name": "parse", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse", "signature": "", "docstring": "Fully parse the HTML content into structured output.\n\nReturns:\n T:\n Parsed representation of type `T`.\n\nNotes:\n **Responsibilities:**\n\n - Implementations must fully interpret the HTML DOM and return a\n deterministic, structured output." }, "parse_div": { "name": "parse_div", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_div", "signature": "", "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n div (Tag):\n BeautifulSoup tag representing a `<div>`.\n separator (str, optional):\n String used to separate text nodes.\n\nReturns:\n str:\n Flattened, whitespace-normalized text content." }, "parse_link": { "name": "parse_link", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_link", "signature": "", "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n a (Tag):\n BeautifulSoup tag representing an anchor.\n\nReturns:\n Optional[str]:\n The value of the `href` attribute, or None if absent." }, "parse_table": { "name": "parse_table", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_table", "signature": "", "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n table (Tag):\n BeautifulSoup tag representing a `<table>`.\n\nReturns:\n list[list[str]]:\n A list of rows, where each row is a list of cell text values." }, "parse_meta": { "name": "parse_meta", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_meta", "signature": "", "docstring": "Extract high-level metadata from the HTML document.\n\nReturns:\n dict[str, Any]:\n Dictionary containing extracted metadata.\n\nNotes:\n **Responsibilities:**\n\n - Extract high-level metadata from the HTML document.\n - This includes: Document title, `<meta>` tag name/property to\n content mappings." } } }, "list": { "name": "list", "kind": "alias", "path": "omniread.html.parser.list", "signature": "", "docstring": null }, "dict": { "name": "dict", "kind": "alias", "path": "omniread.html.parser.dict", "signature": "", "docstring": null } } } }