{ "module": "omniread.html.parser", "content": { "path": "omniread.html.parser", "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.", "objects": { "Any": { "name": "Any", "kind": "alias", "path": "omniread.html.parser.Any", "signature": "", "docstring": null }, "Generic": { "name": "Generic", "kind": "alias", "path": "omniread.html.parser.Generic", "signature": "", "docstring": null }, "TypeVar": { "name": "TypeVar", "kind": "alias", "path": "omniread.html.parser.TypeVar", "signature": "", "docstring": null }, "Optional": { "name": "Optional", "kind": "alias", "path": "omniread.html.parser.Optional", "signature": "", "docstring": null }, "abstractmethod": { "name": "abstractmethod", "kind": "alias", "path": "omniread.html.parser.abstractmethod", "signature": "", "docstring": null }, "BeautifulSoup": { "name": "BeautifulSoup", "kind": "alias", "path": "omniread.html.parser.BeautifulSoup", "signature": "", "docstring": null }, "Tag": { "name": "Tag", "kind": "alias", "path": "omniread.html.parser.Tag", "signature": "", "docstring": null }, "ContentType": { "name": "ContentType", "kind": "class", "path": "omniread.html.parser.ContentType", "signature": "",
"docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", "members": { "HTML": { "name": "HTML", "kind": "attribute", "path": "omniread.html.parser.ContentType.HTML", "signature": "", "docstring": "HTML document content." }, "PDF": { "name": "PDF", "kind": "attribute", "path": "omniread.html.parser.ContentType.PDF", "signature": "", "docstring": "PDF document content." }, "JSON": { "name": "JSON", "kind": "attribute", "path": "omniread.html.parser.ContentType.JSON", "signature": "", "docstring": "JSON document content." }, "XML": { "name": "XML", "kind": "attribute", "path": "omniread.html.parser.ContentType.XML", "signature": "", "docstring": "XML document content." } } }, "Content": { "name": "Content", "kind": "class", "path": "omniread.html.parser.Content", "signature": "", "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", "members": { "raw": { "name": "raw", "kind": "attribute", "path": "omniread.html.parser.Content.raw", "signature": "", "docstring": null }, "source": { "name": "source", "kind": "attribute", "path": "omniread.html.parser.Content.source", "signature": "", "docstring": null }, "content_type": { "name": "content_type", "kind": "attribute", "path": "omniread.html.parser.Content.content_type", "signature": "", "docstring": null }, "metadata": { "name": "metadata", "kind": "attribute", "path": 
"omniread.html.parser.Content.metadata", "signature": "", "docstring": null } } }, "BaseParser": { "name": "BaseParser", "kind": "class", "path": "omniread.html.parser.BaseParser", "signature": "", "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", "members": { "supported_types": { "name": "supported_types", "kind": "attribute", "path": "omniread.html.parser.BaseParser.supported_types", "signature": "", "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." }, "content": { "name": "content", "kind": "attribute", "path": "omniread.html.parser.BaseParser.content", "signature": "", "docstring": null }, "parse": { "name": "parse", "kind": "function", "path": "omniread.html.parser.BaseParser.parse", "signature": "", "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." }, "supports": { "name": "supports", "kind": "function", "path": "omniread.html.parser.BaseParser.supports", "signature": "", "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." 
} } }, "T": { "name": "T", "kind": "attribute", "path": "omniread.html.parser.T", "signature": null, "docstring": null }, "HTMLParser": { "name": "HTMLParser", "kind": "class", "path": "omniread.html.parser.HTMLParser", "signature": "", "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method", "members": { "supported_types": { "name": "supported_types", "kind": "attribute", "path": "omniread.html.parser.HTMLParser.supported_types", "signature": null, "docstring": "Set of content types supported by this parser (HTML only)." }, "parse": { "name": "parse", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse", "signature": "", "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`." }, "parse_div": { "name": "parse_div", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_div", "signature": "", "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n div: BeautifulSoup tag representing a `<div>`.\n separator: String used to separate text nodes.\n\nReturns:\n Flattened, whitespace-normalized text content." }, "parse_link": { "name": "parse_link", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_link", "signature": "", "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n a: BeautifulSoup tag representing an anchor.\n\nReturns:\n The value of the `href` attribute, or None if absent." }, "parse_table": { "name": "parse_table", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_table", "signature": "", "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n table: BeautifulSoup tag representing a `<table>`.\n\nReturns:\n A list of rows, where each row is a list of cell text values." }, "parse_meta": { "name": "parse_meta", "kind": "function", "path": "omniread.html.parser.HTMLParser.parse_meta", "signature": "", "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `<meta>` tag name/property → content mappings\n\nReturns:\n Dictionary containing extracted metadata." } } }, "list": { "name": "list", "kind": "alias", "path": "omniread.html.parser.list", "signature": "", "docstring": null }, "dict": { "name": "dict", "kind": "alias", "path": "omniread.html.parser.dict", "signature": "", "docstring": null } } } }