omniread/mcp_docs/modules/omniread.html.parser.json

{
  "module": "omniread.html.parser",
  "content": {
    "path": "omniread.html.parser",
    "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.",
    "objects": {
      "Any": {
        "name": "Any",
        "kind": "alias",
        "path": "omniread.html.parser.Any",
        "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
        "docstring": null
      },
      "Generic": {
        "name": "Generic",
        "kind": "alias",
        "path": "omniread.html.parser.Generic",
        "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
        "docstring": null
      },
      "TypeVar": {
        "name": "TypeVar",
        "kind": "alias",
        "path": "omniread.html.parser.TypeVar",
        "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
        "docstring": null
      },
      "Optional": {
        "name": "Optional",
        "kind": "alias",
        "path": "omniread.html.parser.Optional",
        "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
        "docstring": null
      },
      "abstractmethod": {
        "name": "abstractmethod",
        "kind": "alias",
        "path": "omniread.html.parser.abstractmethod",
        "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
        "docstring": null
      },
      "BeautifulSoup": {
        "name": "BeautifulSoup",
        "kind": "alias",
        "path": "omniread.html.parser.BeautifulSoup",
        "signature": "<bound method Alias.signature of Alias('BeautifulSoup', 'bs4.BeautifulSoup')>",
        "docstring": null
      },
      "Tag": {
        "name": "Tag",
        "kind": "alias",
        "path": "omniread.html.parser.Tag",
        "signature": "<bound method Alias.signature of Alias('Tag', 'bs4.Tag')>",
        "docstring": null
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.html.parser.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.HTML",
            "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.PDF",
            "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.JSON",
            "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.XML",
            "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
            "docstring": "XML document content."
          }
        }
      },
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.html.parser.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.raw",
            "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.source",
            "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.metadata",
            "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
            "docstring": null
          }
        }
      },
      "BaseParser": {
        "name": "BaseParser",
        "kind": "class",
        "path": "omniread.html.parser.BaseParser",
        "signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
        "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.html.parser.BaseParser.supported_types",
            "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
            "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
          },
          "content": {
            "name": "content",
            "kind": "attribute",
            "path": "omniread.html.parser.BaseParser.content",
            "signature": "<bound method Alias.signature of Alias('content', 'omniread.core.parser.BaseParser.content')>",
            "docstring": null
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.html.parser.BaseParser.parse",
            "signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
            "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
          },
          "supports": {
            "name": "supports",
            "kind": "function",
            "path": "omniread.html.parser.BaseParser.supports",
            "signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
            "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
          }
        }
      },
      "T": {
        "name": "T",
        "kind": "attribute",
        "path": "omniread.html.parser.T",
        "signature": null,
        "docstring": null
      },
      "HTMLParser": {
        "name": "HTMLParser",
        "kind": "class",
        "path": "omniread.html.parser.HTMLParser",
        "signature": "<bound method Class.signature of Class('HTMLParser', 27, 177)>",
        "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.html.parser.HTMLParser.supported_types",
            "signature": null,
            "docstring": "Set of content types supported by this parser (HTML only)."
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse",
            "signature": "<bound method Function.signature of Function('parse', 70, 81)>",
            "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`."
          },
          "parse_div": {
            "name": "parse_div",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_div",
            "signature": "<bound method Function.signature of Function('parse_div', 87, 99)>",
            "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n    div: BeautifulSoup tag representing a `<div>`.\n    separator: String used to separate text nodes.\n\nReturns:\n    Flattened, whitespace-normalized text content."
          },
          "parse_link": {
            "name": "parse_link",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_link",
            "signature": "<bound method Function.signature of Function('parse_link', 101, 112)>",
            "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n    a: BeautifulSoup tag representing an anchor.\n\nReturns:\n    The value of the `href` attribute, or None if absent."
          },
          "parse_table": {
            "name": "parse_table",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_table",
            "signature": "<bound method Function.signature of Function('parse_table', 114, 133)>",
            "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n    table: BeautifulSoup tag representing a `<table>`.\n\nReturns:\n    A list of rows, where each row is a list of cell text values."
          },
          "parse_meta": {
            "name": "parse_meta",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_meta",
            "signature": "<bound method Function.signature of Function('parse_meta', 153, 177)>",
            "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `<meta>` tag name/property → content mappings\n\nReturns:\n    Dictionary containing extracted metadata."
          }
        }
      },
      "list": {
        "name": "list",
        "kind": "alias",
        "path": "omniread.html.parser.list",
        "signature": "<bound method Alias.signature of Alias('list', 'typing.list')>",
        "docstring": null
      },
      "dict": {
        "name": "dict",
        "kind": "alias",
        "path": "omniread.html.parser.dict",
        "signature": "<bound method Alias.signature of Alias('dict', 'typing.dict')>",
        "docstring": null
      }
    }
  }
}