{ "module": "omniread.core.parser", "content": { "path": "omniread.core.parser", "docstring": "Abstract parsing contracts for OmniRead.\n\nThis module defines the **format-agnostic parser interface** used to transform\nraw content into structured, typed representations.\n\nParsers are responsible for:\n- Interpreting a single `Content` instance\n- Validating compatibility with the content type\n- Producing a structured output suitable for downstream consumers\n\nParsers are not responsible for:\n- Fetching or acquiring content\n- Performing retries or error recovery\n- Managing multiple content sources", "objects": { "ABC": { "name": "ABC", "kind": "alias", "path": "omniread.core.parser.ABC", "signature": "", "docstring": null }, "abstractmethod": { "name": "abstractmethod", "kind": "alias", "path": "omniread.core.parser.abstractmethod", "signature": "", "docstring": null }, "Generic": { "name": "Generic", "kind": "alias", "path": "omniread.core.parser.Generic", "signature": "", "docstring": null }, "TypeVar": { "name": "TypeVar", "kind": "alias", "path": "omniread.core.parser.TypeVar", "signature": "", "docstring": null }, "Set": { "name": "Set", "kind": "alias", "path": "omniread.core.parser.Set", "signature": "", "docstring": null }, "Content": { "name": "Content", "kind": "class", "path": "omniread.core.parser.Content", "signature": "", "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", "members": { "raw": { "name": "raw", "kind": "attribute", "path": "omniread.core.parser.Content.raw", "signature": "", "docstring": null }, "source": { "name": "source", "kind": "attribute", "path": "omniread.core.parser.Content.source", "signature": "", "docstring": null }, "content_type": { "name": "content_type", "kind": "attribute", "path": "omniread.core.parser.Content.content_type", "signature": "", "docstring": null }, "metadata": { "name": "metadata", "kind": "attribute", "path": "omniread.core.parser.Content.metadata", "signature": "", "docstring": null } } }, "ContentType": { "name": "ContentType", "kind": "class", "path": "omniread.core.parser.ContentType", "signature": "", "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", "members": { "HTML": { "name": "HTML", "kind": "attribute", "path": "omniread.core.parser.ContentType.HTML", "signature": "", "docstring": "HTML document content." }, "PDF": { "name": "PDF", "kind": "attribute", "path": "omniread.core.parser.ContentType.PDF", "signature": "", "docstring": "PDF document content." }, "JSON": { "name": "JSON", "kind": "attribute", "path": "omniread.core.parser.ContentType.JSON", "signature": "", "docstring": "JSON document content." }, "XML": { "name": "XML", "kind": "attribute", "path": "omniread.core.parser.ContentType.XML", "signature": "", "docstring": "XML document content."
} } }, "T": { "name": "T", "kind": "attribute", "path": "omniread.core.parser.T", "signature": null, "docstring": null }, "BaseParser": { "name": "BaseParser", "kind": "class", "path": "omniread.core.parser.BaseParser", "signature": "", "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", "members": { "supported_types": { "name": "supported_types", "kind": "attribute", "path": "omniread.core.parser.BaseParser.supported_types", "signature": null, "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." }, "content": { "name": "content", "kind": "attribute", "path": "omniread.core.parser.BaseParser.content", "signature": null, "docstring": null }, "parse": { "name": "parse", "kind": "function", "path": "omniread.core.parser.BaseParser.parse", "signature": "", "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." }, "supports": { "name": "supports", "kind": "function", "path": "omniread.core.parser.BaseParser.supports", "signature": "", "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." } } } } } }