From a34ed1f98cfec1765cc46457a8973cf9bdd3873b Mon Sep 17 00:00:00 2001 From: Vishesh 'ironeagle' Bangotra Date: Thu, 22 Jan 2026 16:39:19 +0530 Subject: [PATCH] using doc-forge --- .gitignore | 3 +- docforge.nav.yml | 16 + docs/{ => omniread}/core/content.md | 2 + docs/{ => omniread}/core/index.md | 2 + docs/{ => omniread}/core/parser.md | 2 + docs/{ => omniread}/core/scraper.md | 2 + docs/{ => omniread}/html/index.md | 2 + docs/{ => omniread}/html/parser.md | 2 + docs/{ => omniread}/html/scraper.md | 2 + docs/{ => omniread}/index.md | 2 + docs/{ => omniread}/pdf/client.md | 2 + docs/{ => omniread}/pdf/index.md | 2 + docs/{ => omniread}/pdf/parser.md | 2 + docs/{ => omniread}/pdf/scraper.md | 2 + generate_docs.py | 46 - mcp_docs/index.json | 6 + mcp_docs/modules/omniread.core.content.json | 118 ++ mcp_docs/modules/omniread.core.json | 513 ++++++ mcp_docs/modules/omniread.core.parser.json | 162 ++ mcp_docs/modules/omniread.core.scraper.json | 97 ++ mcp_docs/modules/omniread.html.json | 488 ++++++ mcp_docs/modules/omniread.html.parser.json | 241 +++ mcp_docs/modules/omniread.html.scraper.json | 157 ++ mcp_docs/modules/omniread.json | 1639 +++++++++++++++++++ mcp_docs/modules/omniread.pdf.client.json | 69 + mcp_docs/modules/omniread.pdf.json | 419 +++++ mcp_docs/modules/omniread.pdf.parser.json | 134 ++ mcp_docs/modules/omniread.pdf.scraper.json | 152 ++ mcp_docs/nav.json | 50 + mkdocs.yml | 79 +- omniread/__init__.py | 21 + omniread/__init__.pyi | 13 + omniread/core/__init__.pyi | 10 + omniread/core/content.pyi | 15 + omniread/core/parser.pyi | 13 + omniread/core/scraper.pyi | 7 + omniread/html/__init__.pyi | 4 + omniread/html/parser.pyi | 18 + omniread/html/scraper.pyi | 10 + omniread/pdf/__init__.pyi | 5 + omniread/pdf/client.py | 3 +- omniread/pdf/client.pyi | 10 + omniread/pdf/parser.pyi | 11 + omniread/pdf/scraper.py | 2 +- omniread/pdf/scraper.pyi | 8 + requirements.txt | 19 - 46 files changed, 4475 insertions(+), 107 deletions(-) create mode 100644 
docforge.nav.yml rename docs/{ => omniread}/core/content.md (70%) rename docs/{ => omniread}/core/index.md (69%) rename docs/{ => omniread}/core/parser.md (71%) rename docs/{ => omniread}/core/scraper.md (70%) rename docs/{ => omniread}/html/index.md (69%) rename docs/{ => omniread}/html/parser.md (71%) rename docs/{ => omniread}/html/scraper.md (70%) rename docs/{ => omniread}/index.md (52%) rename docs/{ => omniread}/pdf/client.md (70%) rename docs/{ => omniread}/pdf/index.md (70%) rename docs/{ => omniread}/pdf/parser.md (70%) rename docs/{ => omniread}/pdf/scraper.md (69%) delete mode 100644 generate_docs.py create mode 100644 mcp_docs/index.json create mode 100644 mcp_docs/modules/omniread.core.content.json create mode 100644 mcp_docs/modules/omniread.core.json create mode 100644 mcp_docs/modules/omniread.core.parser.json create mode 100644 mcp_docs/modules/omniread.core.scraper.json create mode 100644 mcp_docs/modules/omniread.html.json create mode 100644 mcp_docs/modules/omniread.html.parser.json create mode 100644 mcp_docs/modules/omniread.html.scraper.json create mode 100644 mcp_docs/modules/omniread.json create mode 100644 mcp_docs/modules/omniread.pdf.client.json create mode 100644 mcp_docs/modules/omniread.pdf.json create mode 100644 mcp_docs/modules/omniread.pdf.parser.json create mode 100644 mcp_docs/modules/omniread.pdf.scraper.json create mode 100644 mcp_docs/nav.json create mode 100644 omniread/__init__.pyi create mode 100644 omniread/core/__init__.pyi create mode 100644 omniread/core/content.pyi create mode 100644 omniread/core/parser.pyi create mode 100644 omniread/core/scraper.pyi create mode 100644 omniread/html/__init__.pyi create mode 100644 omniread/html/parser.pyi create mode 100644 omniread/html/scraper.pyi create mode 100644 omniread/pdf/__init__.pyi create mode 100644 omniread/pdf/client.pyi create mode 100644 omniread/pdf/parser.pyi create mode 100644 omniread/pdf/scraper.pyi delete mode 100644 requirements.txt diff --git a/.gitignore 
b/.gitignore index e9c3bd7..13b61a5 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,5 @@ Thumbs.db *.swp *.swo *~ -*.tmp \ No newline at end of file +*.tmp +site diff --git a/docforge.nav.yml b/docforge.nav.yml new file mode 100644 index 0000000..3fe52f3 --- /dev/null +++ b/docforge.nav.yml @@ -0,0 +1,16 @@ +home: omniread/index.md +groups: + Core API: + - omniread/core/index.md + - omniread/core/content.md + - omniread/core/parser.md + - omniread/core/scraper.md + HTML Handling: + - omniread/html/index.md + - omniread/html/parser.md + - omniread/html/scraper.md + PDF Handling: + - omniread/pdf/index.md + - omniread/pdf/client.md + - omniread/pdf/parser.md + - omniread/pdf/scraper.md diff --git a/docs/core/content.md b/docs/omniread/core/content.md similarity index 70% rename from docs/core/content.md rename to docs/omniread/core/content.md index 1ab7be9..85c22e9 100644 --- a/docs/core/content.md +++ b/docs/omniread/core/content.md @@ -1 +1,3 @@ +# Content + ::: omniread.core.content diff --git a/docs/core/index.md b/docs/omniread/core/index.md similarity index 69% rename from docs/core/index.md rename to docs/omniread/core/index.md index cf537df..ddc8ee0 100644 --- a/docs/core/index.md +++ b/docs/omniread/core/index.md @@ -1 +1,3 @@ +# Core + ::: omniread.core diff --git a/docs/core/parser.md b/docs/omniread/core/parser.md similarity index 71% rename from docs/core/parser.md rename to docs/omniread/core/parser.md index 403dae0..db6d680 100644 --- a/docs/core/parser.md +++ b/docs/omniread/core/parser.md @@ -1 +1,3 @@ +# Parser + ::: omniread.core.parser diff --git a/docs/core/scraper.md b/docs/omniread/core/scraper.md similarity index 70% rename from docs/core/scraper.md rename to docs/omniread/core/scraper.md index 4fee994..17ccde2 100644 --- a/docs/core/scraper.md +++ b/docs/omniread/core/scraper.md @@ -1 +1,3 @@ +# Scraper + ::: omniread.core.scraper diff --git a/docs/html/index.md b/docs/omniread/html/index.md similarity index 69% rename from 
docs/html/index.md rename to docs/omniread/html/index.md index 79491c2..22eee94 100644 --- a/docs/html/index.md +++ b/docs/omniread/html/index.md @@ -1 +1,3 @@ +# Html + ::: omniread.html diff --git a/docs/html/parser.md b/docs/omniread/html/parser.md similarity index 71% rename from docs/html/parser.md rename to docs/omniread/html/parser.md index a000adf..03beb98 100644 --- a/docs/html/parser.md +++ b/docs/omniread/html/parser.md @@ -1 +1,3 @@ +# Parser + ::: omniread.html.parser diff --git a/docs/html/scraper.md b/docs/omniread/html/scraper.md similarity index 70% rename from docs/html/scraper.md rename to docs/omniread/html/scraper.md index f82248e..bd32ee1 100644 --- a/docs/html/scraper.md +++ b/docs/omniread/html/scraper.md @@ -1 +1,3 @@ +# Scraper + ::: omniread.html.scraper diff --git a/docs/index.md b/docs/omniread/index.md similarity index 52% rename from docs/index.md rename to docs/omniread/index.md index 77e9bbb..0bb27d4 100644 --- a/docs/index.md +++ b/docs/omniread/index.md @@ -1 +1,3 @@ +# Omniread + ::: omniread diff --git a/docs/pdf/client.md b/docs/omniread/pdf/client.md similarity index 70% rename from docs/pdf/client.md rename to docs/omniread/pdf/client.md index 347ffcc..22271f1 100644 --- a/docs/pdf/client.md +++ b/docs/omniread/pdf/client.md @@ -1 +1,3 @@ +# Client + ::: omniread.pdf.client diff --git a/docs/pdf/index.md b/docs/omniread/pdf/index.md similarity index 70% rename from docs/pdf/index.md rename to docs/omniread/pdf/index.md index 6948b1d..3177c4f 100644 --- a/docs/pdf/index.md +++ b/docs/omniread/pdf/index.md @@ -1 +1,3 @@ +# Pdf + ::: omniread.pdf diff --git a/docs/pdf/parser.md b/docs/omniread/pdf/parser.md similarity index 70% rename from docs/pdf/parser.md rename to docs/omniread/pdf/parser.md index f49a28b..b1125bc 100644 --- a/docs/pdf/parser.md +++ b/docs/omniread/pdf/parser.md @@ -1 +1,3 @@ +# Parser + ::: omniread.pdf.parser diff --git a/docs/pdf/scraper.md b/docs/omniread/pdf/scraper.md similarity index 69% rename from 
docs/pdf/scraper.md rename to docs/omniread/pdf/scraper.md index 9788cff..a94daec 100644 --- a/docs/pdf/scraper.md +++ b/docs/omniread/pdf/scraper.md @@ -1 +1,3 @@ +# Scraper + ::: omniread.pdf.scraper diff --git a/generate_docs.py b/generate_docs.py deleted file mode 100644 index bb93ddd..0000000 --- a/generate_docs.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Programmatic MkDocs build script for OmniRead. - -This script builds (or serves) the documentation by invoking MkDocs -*as a Python library*, not via shell commands. - -Requirements: -- mkdocs -- mkdocs-material -- mkdocstrings[python] - -Usage: - python generate_docs.py - python generate_docs.py --serve -""" - -import sys -from pathlib import Path - -from mkdocs.commands import build as mkdocs_build -from mkdocs.commands import serve as mkdocs_serve -from mkdocs.config import load_config - - -PROJECT_ROOT = Path(__file__).resolve().parent -MKDOCS_YML = PROJECT_ROOT / "mkdocs.yml" - - -def main() -> None: - if not MKDOCS_YML.exists(): - raise FileNotFoundError("mkdocs.yml not found at project root") - - # Load MkDocs configuration programmatically - config = load_config(str(MKDOCS_YML)) - - # Decide mode - if "--serve" in sys.argv: - # Live-reload development server - mkdocs_serve.serve(config) - else: - # Static site build - mkdocs_build.build(config) - - -if __name__ == "__main__": - main() diff --git a/mcp_docs/index.json b/mcp_docs/index.json new file mode 100644 index 0000000..48b0a19 --- /dev/null +++ b/mcp_docs/index.json @@ -0,0 +1,6 @@ +{ + "project": "omniread", + "type": "docforge-model", + "modules_count": 12, + "source": "docforge" +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.core.content.json b/mcp_docs/modules/omniread.core.content.json new file mode 100644 index 0000000..811112d --- /dev/null +++ b/mcp_docs/modules/omniread.core.content.json @@ -0,0 +1,118 @@ +{ + "module": "omniread.core.content", + "content": { + "path": "omniread.core.content", + "docstring": "Canonical 
content models for OmniRead.\n\nThis module defines the **format-agnostic content representation** used across\nall parsers and scrapers in OmniRead.\n\nThe models defined here represent *what* was extracted, not *how* it was\nretrieved or parsed. Format-specific behavior and metadata must not alter\nthe semantic meaning of these models.", + "objects": { + "Enum": { + "name": "Enum", + "kind": "alias", + "path": "omniread.core.content.Enum", + "signature": "", + "docstring": null + }, + "dataclass": { + "name": "dataclass", + "kind": "alias", + "path": "omniread.core.content.dataclass", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.core.content.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.core.content.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.core.content.Optional", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.content.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.content.ContentType.HTML", + "signature": null, + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.content.ContentType.PDF", + "signature": null, + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.content.ContentType.JSON", + "signature": null, + "docstring": "JSON document content." 
+ }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.content.ContentType.XML", + "signature": null, + "docstring": "XML document content." + } + } + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.content.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.content.Content.raw", + "signature": null, + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.content.Content.source", + "signature": null, + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.content.Content.content_type", + "signature": null, + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.content.Content.metadata", + "signature": null, + "docstring": null + } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.core.json b/mcp_docs/modules/omniread.core.json new file mode 100644 index 0000000..068f792 --- /dev/null +++ b/mcp_docs/modules/omniread.core.json @@ -0,0 +1,513 @@ +{ + "module": "omniread.core", + "content": { + "path": "omniread.core", + "docstring": "Core domain contracts for OmniRead.\n\nThis package defines the 
**format-agnostic domain layer** of OmniRead.\nIt exposes canonical content models and abstract interfaces that are\nimplemented by format-specific modules (HTML, PDF, etc.).\n\nPublic exports from this package are considered **stable contracts** and\nare safe for downstream consumers to depend on.\n\nSubmodules:\n- content: Canonical content models and enums\n- parser: Abstract parsing contracts\n- scraper: Abstract scraping contracts\n\nFormat-specific behavior must not be introduced at this layer.", + "objects": { + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.ContentType", + 
"signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.core.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.core.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." 
+ }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.core.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.core.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.core.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.core.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.core.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." + } + } + }, + "content": { + "name": "content", + "kind": "module", + "path": "omniread.core.content", + "signature": null, + "docstring": "Canonical content models for OmniRead.\n\nThis module defines the **format-agnostic content representation** used across\nall parsers and scrapers in OmniRead.\n\nThe models defined here represent *what* was extracted, not *how* it was\nretrieved or parsed. 
Format-specific behavior and metadata must not alter\nthe semantic meaning of these models.", + "members": { + "Enum": { + "name": "Enum", + "kind": "alias", + "path": "omniread.core.content.Enum", + "signature": "", + "docstring": null + }, + "dataclass": { + "name": "dataclass", + "kind": "alias", + "path": "omniread.core.content.dataclass", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.core.content.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.core.content.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.core.content.Optional", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.content.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.content.ContentType.HTML", + "signature": null, + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.content.ContentType.PDF", + "signature": null, + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.content.ContentType.JSON", + "signature": null, + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.content.ContentType.XML", + "signature": null, + "docstring": "XML document content." 
+ } + } + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.content.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.content.Content.raw", + "signature": null, + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.content.Content.source", + "signature": null, + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.content.Content.content_type", + "signature": null, + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.content.Content.metadata", + "signature": null, + "docstring": null + } + } + } + } + }, + "parser": { + "name": "parser", + "kind": "module", + "path": "omniread.core.parser", + "signature": null, + "docstring": "Abstract parsing contracts for OmniRead.\n\nThis module defines the **format-agnostic parser interface** used to transform\nraw content into structured, typed representations.\n\nParsers are responsible for:\n- Interpreting a single `Content` instance\n- Validating compatibility with the content type\n- Producing a structured output suitable for downstream consumers\n\nParsers are not responsible for:\n- Fetching or acquiring content\n- 
Performing retries or error recovery\n- Managing multiple content sources", + "members": { + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.core.parser.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.core.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.core.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.core.parser.TypeVar", + "signature": "", + "docstring": null + }, + "Set": { + "name": "Set", + "kind": "alias", + "path": "omniread.core.parser.Set", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.parser.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.parser.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.parser.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.parser.Content.content_type", + "signature": 
"", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.parser.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." 
+ } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.core.parser.T", + "signature": null, + "docstring": null + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.core.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.core.parser.BaseParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.core.parser.BaseParser.content", + "signature": null, + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.core.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.core.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." 
+ } + } + } + } + }, + "scraper": { + "name": "scraper", + "kind": "module", + "path": "omniread.core.scraper", + "signature": null, + "docstring": "Abstract scraping contracts for OmniRead.\n\nThis module defines the **format-agnostic scraper interface** responsible for\nacquiring raw content from external sources.\n\nScrapers are responsible for:\n- Locating and retrieving raw content bytes\n- Attaching minimal contextual metadata\n- Returning normalized `Content` objects\n\nScrapers are explicitly NOT responsible for:\n- Parsing or interpreting content\n- Inferring structure or semantics\n- Performing content-type specific processing\n\nAll interpretation must be delegated to parsers.", + "members": { + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.core.scraper.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.core.scraper.abstractmethod", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.core.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.core.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.core.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n 
content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.core.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.core.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.core.parser.json b/mcp_docs/modules/omniread.core.parser.json new file mode 100644 index 0000000..d30bb09 --- /dev/null +++ b/mcp_docs/modules/omniread.core.parser.json @@ -0,0 +1,162 @@ +{ + "module": "omniread.core.parser", + "content": { + "path": "omniread.core.parser", + "docstring": "Abstract parsing contracts for OmniRead.\n\nThis module defines the **format-agnostic parser interface** used to transform\nraw content into structured, typed representations.\n\nParsers are responsible for:\n- Interpreting a single `Content` instance\n- Validating compatibility with the content type\n- Producing a structured output suitable for downstream consumers\n\nParsers are not responsible for:\n- Fetching or acquiring content\n- Performing retries or error recovery\n- Managing multiple content sources", + "objects": { + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.core.parser.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.core.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.core.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.core.parser.TypeVar", + "signature": "", + "docstring": null + }, + "Set": { + "name": "Set", + "kind": "alias", + "path": "omniread.core.parser.Set", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.parser.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- 
Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.parser.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.parser.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.parser.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.parser.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." 
+ }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.core.parser.T", + "signature": null, + "docstring": null + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.core.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.core.parser.BaseParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.core.parser.BaseParser.content", + "signature": null, + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.core.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." 
+ }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.core.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." + } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.core.scraper.json b/mcp_docs/modules/omniread.core.scraper.json new file mode 100644 index 0000000..88ebbc9 --- /dev/null +++ b/mcp_docs/modules/omniread.core.scraper.json @@ -0,0 +1,97 @@ +{ + "module": "omniread.core.scraper", + "content": { + "path": "omniread.core.scraper", + "docstring": "Abstract scraping contracts for OmniRead.\n\nThis module defines the **format-agnostic scraper interface** responsible for\nacquiring raw content from external sources.\n\nScrapers are responsible for:\n- Locating and retrieving raw content bytes\n- Attaching minimal contextual metadata\n- Returning normalized `Content` objects\n\nScrapers are explicitly NOT responsible for:\n- Parsing or interpreting content\n- Inferring structure or semantics\n- Performing content-type specific processing\n\nAll interpretation must be delegated to parsers.", + "objects": { + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.core.scraper.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.core.scraper.abstractmethod", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.core.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.core.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.core.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + 
"path": "omniread.core.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.core.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.core.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.html.json b/mcp_docs/modules/omniread.html.json new file mode 100644 index 0000000..0c443d3 --- /dev/null +++ b/mcp_docs/modules/omniread.html.json @@ -0,0 +1,488 @@ +{ + "module": "omniread.html", + "content": { + "path": "omniread.html", + "docstring": "HTML format implementation for OmniRead.\n\nThis package provides **HTML-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nIt includes:\n- HTML parsers that interpret HTML content\n- HTML scrapers that retrieve HTML documents\n\nThis package:\n- Implements, but does not redefine, core contracts\n- May contain HTML-specific behavior and edge-case handling\n- Produces canonical content models defined in `omniread.core.content`\n\nConsumers should depend on `omniread.core` interfaces wherever possible and\nuse this package only when HTML-specific behavior is required.", + "objects": { + "HTMLScraper": { + "name": "HTMLScraper", + "kind": "class", + "path": "omniread.html.HTMLScraper", + "signature": "", + "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses", + "members": { + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.HTMLScraper.content_type", + "signature": "", + "docstring": null + }, + "validate_content_type": { + "name": "validate_content_type", + "kind": "function", + "path": "omniread.html.HTMLScraper.validate_content_type", + "signature": "", + "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n response: HTTP response returned by 
`httpx`.\n\nRaises:\n ValueError: If the `Content-Type` header is missing or does not\n indicate HTML content." + }, + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.HTMLScraper.fetch", + "signature": "", + "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n source: URL of the HTML document.\n metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw HTML bytes\n - Source URL\n - HTML content type\n - HTTP response metadata\n\nRaises:\n httpx.HTTPError: If the HTTP request fails.\n ValueError: If the response is not valid HTML." + } + } + }, + "HTMLParser": { + "name": "HTMLParser", + "kind": "class", + "path": "omniread.html.HTMLParser", + "signature": "", + "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.HTMLParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser (HTML only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.HTMLParser.parse", + "signature": "", + "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`." 
+ }, + "parse_div": { + "name": "parse_div", + "kind": "function", + "path": "omniread.html.HTMLParser.parse_div", + "signature": "", + "docstring": "Extract normalized text from a `
` element.\n\nArgs:\n div: BeautifulSoup tag representing a `
`.\n separator: String used to separate text nodes.\n\nReturns:\n Flattened, whitespace-normalized text content." + }, + "parse_link": { + "name": "parse_link", + "kind": "function", + "path": "omniread.html.HTMLParser.parse_link", + "signature": "", + "docstring": "Extract the hyperlink reference from an `` element.\n\nArgs:\n a: BeautifulSoup tag representing an anchor.\n\nReturns:\n The value of the `href` attribute, or None if absent." + }, + "parse_table": { + "name": "parse_table", + "kind": "function", + "path": "omniread.html.HTMLParser.parse_table", + "signature": "", + "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n table: BeautifulSoup tag representing a ``.\n\nReturns:\n A list of rows, where each row is a list of cell text values." + }, + "parse_meta": { + "name": "parse_meta", + "kind": "function", + "path": "omniread.html.HTMLParser.parse_meta", + "signature": "", + "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `` tag name/property → content mappings\n\nReturns:\n Dictionary containing extracted metadata." 
+ } + } + }, + "parser": { + "name": "parser", + "kind": "module", + "path": "omniread.html.parser", + "signature": null, + "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.", + "members": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.html.parser.Any", + "signature": "", + "docstring": null + }, + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.html.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.html.parser.TypeVar", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.html.parser.Optional", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.html.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "BeautifulSoup": { + "name": "BeautifulSoup", + "kind": "alias", + "path": "omniread.html.parser.BeautifulSoup", + "signature": "", + "docstring": null + }, + "Tag": { + "name": "Tag", + "kind": "alias", + "path": "omniread.html.parser.Tag", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.html.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. 
It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.html.parser.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.html.parser.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.html.parser.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.parser.Content.content_type", + "signature": "", + 
"docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.html.parser.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.html.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.parser.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.html.parser.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.html.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." 
+ } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.html.parser.T", + "signature": null, + "docstring": null + }, + "HTMLParser": { + "name": "HTMLParser", + "kind": "class", + "path": "omniread.html.parser.HTMLParser", + "signature": "", + "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.parser.HTMLParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser (HTML only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse", + "signature": "", + "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`." + }, + "parse_div": { + "name": "parse_div", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_div", + "signature": "", + "docstring": "Extract normalized text from a `
` element.\n\nArgs:\n div: BeautifulSoup tag representing a `
`.\n separator: String used to separate text nodes.\n\nReturns:\n Flattened, whitespace-normalized text content." + }, + "parse_link": { + "name": "parse_link", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_link", + "signature": "", + "docstring": "Extract the hyperlink reference from an `` element.\n\nArgs:\n a: BeautifulSoup tag representing an anchor.\n\nReturns:\n The value of the `href` attribute, or None if absent." + }, + "parse_table": { + "name": "parse_table", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_table", + "signature": "", + "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n table: BeautifulSoup tag representing a `
`.\n\nReturns:\n A list of rows, where each row is a list of cell text values." + }, + "parse_meta": { + "name": "parse_meta", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_meta", + "signature": "", + "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `` tag name/property → content mappings\n\nReturns:\n Dictionary containing extracted metadata." + } + } + }, + "list": { + "name": "list", + "kind": "alias", + "path": "omniread.html.parser.list", + "signature": "", + "docstring": null + }, + "dict": { + "name": "dict", + "kind": "alias", + "path": "omniread.html.parser.dict", + "signature": "", + "docstring": null + } + } + }, + "scraper": { + "name": "scraper", + "kind": "module", + "path": "omniread.html.scraper", + "signature": null, + "docstring": "HTML scraping implementation for OmniRead.\n\nThis module provides an HTTP-based scraper for retrieving HTML documents.\nIt implements the core `BaseScraper` contract using `httpx` as the transport\nlayer.\n\nThis scraper is responsible for:\n- Fetching raw HTML bytes over HTTP(S)\n- Validating response content type\n- Attaching HTTP metadata to the returned content\n\nThis scraper is not responsible for:\n- Parsing or interpreting HTML\n- Retrying failed requests\n- Managing crawl policies or rate limiting", + "members": { + "httpx": { + "name": "httpx", + "kind": "alias", + "path": "omniread.html.scraper.httpx", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.html.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.html.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.html.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": 
"omniread.html.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.html.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.html.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.html.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.html.scraper.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." 
+ }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.html.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + }, + "HTMLScraper": { + "name": "HTMLScraper", + "kind": "class", + "path": "omniread.html.scraper.HTMLScraper", + "signature": "", + "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses", + "members": { + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.scraper.HTMLScraper.content_type", + "signature": null, + "docstring": null + }, + "validate_content_type": { + "name": "validate_content_type", + "kind": "function", + "path": "omniread.html.scraper.HTMLScraper.validate_content_type", + "signature": "", + "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n response: HTTP response returned by `httpx`.\n\nRaises:\n ValueError: If the `Content-Type` header is missing or does not\n indicate HTML content." + }, + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.scraper.HTMLScraper.fetch", + "signature": "", + "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n source: URL of the HTML document.\n metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw HTML bytes\n - Source URL\n - HTML content type\n - HTTP response metadata\n\nRaises:\n httpx.HTTPError: If the HTTP request fails.\n ValueError: If the response is not valid HTML." 
+ } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.html.parser.json b/mcp_docs/modules/omniread.html.parser.json new file mode 100644 index 0000000..147bfdc --- /dev/null +++ b/mcp_docs/modules/omniread.html.parser.json @@ -0,0 +1,241 @@ +{ + "module": "omniread.html.parser", + "content": { + "path": "omniread.html.parser", + "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.", + "objects": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.html.parser.Any", + "signature": "", + "docstring": null + }, + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.html.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.html.parser.TypeVar", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.html.parser.Optional", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.html.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "BeautifulSoup": { + "name": "BeautifulSoup", + "kind": "alias", + "path": "omniread.html.parser.BeautifulSoup", + "signature": "", + "docstring": null + }, + "Tag": { + "name": "Tag", + "kind": "alias", + "path": "omniread.html.parser.Tag", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": 
"class", + "path": "omniread.html.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.html.parser.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.html.parser.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": 
"omniread.html.parser.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.parser.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.html.parser.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.html.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.parser.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.html.parser.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." 
+ }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.html.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." + } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.html.parser.T", + "signature": null, + "docstring": null + }, + "HTMLParser": { + "name": "HTMLParser", + "kind": "class", + "path": "omniread.html.parser.HTMLParser", + "signature": "", + "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.parser.HTMLParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser (HTML only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse", + "signature": "", + "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`." + }, + "parse_div": { + "name": "parse_div", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_div", + "signature": "", + "docstring": "Extract normalized text from a `
`.\n\nReturns:\n A list of rows, where each row is a list of cell text values." + }, + "parse_meta": { + "name": "parse_meta", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_meta", + "signature": "", + "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `` tag name/property → content mappings\n\nReturns:\n Dictionary containing extracted metadata." + } + } + }, + "list": { + "name": "list", + "kind": "alias", + "path": "omniread.html.parser.list", + "signature": "", + "docstring": null + }, + "dict": { + "name": "dict", + "kind": "alias", + "path": "omniread.html.parser.dict", + "signature": "", + "docstring": null + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.html.scraper.json b/mcp_docs/modules/omniread.html.scraper.json new file mode 100644 index 0000000..b360df9 --- /dev/null +++ b/mcp_docs/modules/omniread.html.scraper.json @@ -0,0 +1,157 @@ +{ + "module": "omniread.html.scraper", + "content": { + "path": "omniread.html.scraper", + "docstring": "HTML scraping implementation for OmniRead.\n\nThis module provides an HTTP-based scraper for retrieving HTML documents.\nIt implements the core `BaseScraper` contract using `httpx` as the transport\nlayer.\n\nThis scraper is responsible for:\n- Fetching raw HTML bytes over HTTP(S)\n- Validating response content type\n- Attaching HTTP metadata to the returned content\n\nThis scraper is not responsible for:\n- Parsing or interpreting HTML\n- Retrying failed requests\n- Managing crawl policies or rate limiting", + "objects": { + "httpx": { + "name": "httpx", + "kind": "alias", + "path": "omniread.html.scraper.httpx", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.html.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.html.scraper.Mapping", + "signature": "", + 
"docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.html.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.html.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.html.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.html.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.html.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.html.scraper.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. 
It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.html.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + }, + "HTMLScraper": { + "name": "HTMLScraper", + "kind": "class", + "path": "omniread.html.scraper.HTMLScraper", + "signature": "", + "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses", + "members": { + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.scraper.HTMLScraper.content_type", + "signature": null, + "docstring": null + }, + "validate_content_type": { + "name": "validate_content_type", + "kind": "function", + "path": "omniread.html.scraper.HTMLScraper.validate_content_type", + "signature": "", + "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n response: HTTP response returned by `httpx`.\n\nRaises:\n ValueError: If the `Content-Type` header is missing or does not\n indicate HTML content." + }, + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.scraper.HTMLScraper.fetch", + "signature": "", + "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n source: URL of the HTML document.\n metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw HTML bytes\n - Source URL\n - HTML content type\n - HTTP response metadata\n\nRaises:\n httpx.HTTPError: If the HTTP request fails.\n ValueError: If the response is not valid HTML." 
+ } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.json b/mcp_docs/modules/omniread.json new file mode 100644 index 0000000..a321fcf --- /dev/null +++ b/mcp_docs/modules/omniread.json @@ -0,0 +1,1639 @@ +{ + "module": "omniread", + "content": { + "path": "omniread", + "docstring": "OmniRead — format-agnostic content acquisition and parsing framework.\n\nOmniRead provides a **cleanly layered architecture** for fetching, parsing,\nand normalizing content from heterogeneous sources such as HTML documents\nand PDF files.\n\nThe library is structured around three core concepts:\n\n1. **Content**\n A canonical, format-agnostic container representing raw content bytes\n and minimal contextual metadata.\n\n2. **Scrapers**\n Components responsible for *acquiring* raw content from a source\n (HTTP, filesystem, object storage, etc.). Scrapers never interpret\n content.\n\n3. **Parsers**\n Components responsible for *interpreting* acquired content and\n converting it into structured, typed representations.\n\nOmniRead deliberately separates these responsibilities to ensure:\n- Clear boundaries between IO and interpretation\n- Replaceable implementations per format\n- Predictable, testable behavior\n\n----------------------------------------------------------------------\nInstallation\n----------------------------------------------------------------------\n\nInstall OmniRead using pip:\n\n pip install omniread\n\nOr with Poetry:\n\n poetry add omniread\n\n----------------------------------------------------------------------\nBasic Usage\n----------------------------------------------------------------------\n\nHTML example:\n\n from omniread import HTMLScraper, HTMLParser\n\n scraper = HTMLScraper()\n content = scraper.fetch(\"https://example.com\")\n\n class TitleParser(HTMLParser[str]):\n def parse(self) -> str:\n return self._soup.title.string\n\n parser = TitleParser(content)\n title = parser.parse()\n\nPDF example:\n\n from omniread 
import FileSystemPDFClient, PDFScraper, PDFParser\n from pathlib import Path\n\n client = FileSystemPDFClient()\n scraper = PDFScraper(client=client)\n content = scraper.fetch(Path(\"document.pdf\"))\n\n class TextPDFParser(PDFParser[str]):\n def parse(self) -> str:\n # implement PDF text extraction\n ...\n\n parser = TextPDFParser(content)\n result = parser.parse()\n\n----------------------------------------------------------------------\nPublic API Surface\n----------------------------------------------------------------------\n\nThis module re-exports the **recommended public entry points** of OmniRead.\n\nConsumers are encouraged to import from this namespace rather than from\nformat-specific submodules directly, unless advanced customization is\nrequired.\n\nCore:\n- Content\n- ContentType\n\nHTML:\n- HTMLScraper\n- HTMLParser\n\nPDF:\n- FileSystemPDFClient\n- PDFScraper\n- PDFParser\n\n## Core Philosophy\n\n`OmniRead` is designed as a **decoupled content engine**:\n\n1. **Separation of Concerns**: Scrapers *fetch*, Parsers *interpret*. Neither knows about the other.\n2. **Normalized Exchange**: All components communicate via the `Content` model, ensuring a consistent contract.\n3. 
**Format Agnosticism**: The core logic is independent of whether the input is HTML, PDF, or JSON.\n\n## Documentation Design\n\nFor those extending `OmniRead`, follow these \"AI-Native\" docstring principles:\n\n### For Humans\n- **Clear Contracts**: Explicitly state what a component is and is NOT responsible for.\n- **Runnable Examples**: Include small, logical snippets in the package `__init__.py`.\n\n### For LLMs\n- **Structured Models**: Use dataclasses and enums for core data to ensure clean MCP JSON representation.\n- **Type Safety**: All public APIs must be fully typed and have corresponding `.pyi` stubs.\n- **Detailed Raises**: Include `: description` pairs in the `Raises` section to help agents handle errors gracefully.", + "objects": { + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": 
"attribute", + "path": "omniread.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.ContentType.XML", + "signature": "", + "docstring": "XML document content." 
+ } + } + }, + "HTMLScraper": { + "name": "HTMLScraper", + "kind": "class", + "path": "omniread.HTMLScraper", + "signature": "", + "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses", + "members": { + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.HTMLScraper.content_type", + "signature": "", + "docstring": null + }, + "validate_content_type": { + "name": "validate_content_type", + "kind": "function", + "path": "omniread.HTMLScraper.validate_content_type", + "signature": "", + "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n response: HTTP response returned by `httpx`.\n\nRaises:\n ValueError: If the `Content-Type` header is missing or does not\n indicate HTML content." + }, + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.HTMLScraper.fetch", + "signature": "", + "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n source: URL of the HTML document.\n metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw HTML bytes\n - Source URL\n - HTML content type\n - HTTP response metadata\n\nRaises:\n httpx.HTTPError: If the HTTP request fails.\n ValueError: If the response is not valid HTML." 
+ } + } + }, + "HTMLParser": { + "name": "HTMLParser", + "kind": "class", + "path": "omniread.HTMLParser", + "signature": "", + "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.HTMLParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser (HTML only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.HTMLParser.parse", + "signature": "", + "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`." + }, + "parse_div": { + "name": "parse_div", + "kind": "function", + "path": "omniread.HTMLParser.parse_div", + "signature": "", + "docstring": "Extract normalized text from a `
`.\n\nReturns:\n A list of rows, where each row is a list of cell text values." + }, + "parse_meta": { + "name": "parse_meta", + "kind": "function", + "path": "omniread.HTMLParser.parse_meta", + "signature": "", + "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `` tag name/property → content mappings\n\nReturns:\n Dictionary containing extracted metadata." + } + } + }, + "FileSystemPDFClient": { + "name": "FileSystemPDFClient", + "kind": "class", + "path": "omniread.FileSystemPDFClient", + "signature": "", + "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.FileSystemPDFClient.fetch", + "signature": "", + "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file." 
+ } + } + }, + "PDFScraper": { + "name": "PDFScraper", + "kind": "class", + "path": "omniread.PDFScraper", + "signature": "", + "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.PDFScraper.fetch", + "signature": "", + "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client." + } + } + }, + "PDFParser": { + "name": "PDFParser", + "kind": "class", + "path": "omniread.PDFParser", + "signature": "", + "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.PDFParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser (PDF only)." 
+ }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.PDFParser.parse", + "signature": "", + "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + } + } + }, + "core": { + "name": "core", + "kind": "module", + "path": "omniread.core", + "signature": null, + "docstring": "Core domain contracts for OmniRead.\n\nThis package defines the **format-agnostic domain layer** of OmniRead.\nIt exposes canonical content models and abstract interfaces that are\nimplemented by format-specific modules (HTML, PDF, etc.).\n\nPublic exports from this package are considered **stable contracts** and\nare safe for downstream consumers to depend on.\n\nSubmodules:\n- content: Canonical content models and enums\n- parser: Abstract parsing contracts\n- scraper: Abstract scraping contracts\n\nFormat-specific behavior must not be introduced at this layer.", + "members": { + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.Content.raw", + 
"signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.ContentType.XML", + "signature": "", + "docstring": "XML document content." 
+ } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.core.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.core.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.core.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.core.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.core.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.core.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.core.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." + } + } + }, + "content": { + "name": "content", + "kind": "module", + "path": "omniread.core.content", + "signature": null, + "docstring": "Canonical content models for OmniRead.\n\nThis module defines the **format-agnostic content representation** used across\nall parsers and scrapers in OmniRead.\n\nThe models defined here represent *what* was extracted, not *how* it was\nretrieved or parsed. 
Format-specific behavior and metadata must not alter\nthe semantic meaning of these models.", + "members": { + "Enum": { + "name": "Enum", + "kind": "alias", + "path": "omniread.core.content.Enum", + "signature": "", + "docstring": null + }, + "dataclass": { + "name": "dataclass", + "kind": "alias", + "path": "omniread.core.content.dataclass", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.core.content.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.core.content.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.core.content.Optional", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.content.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.content.ContentType.HTML", + "signature": null, + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.content.ContentType.PDF", + "signature": null, + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.content.ContentType.JSON", + "signature": null, + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.content.ContentType.XML", + "signature": null, + "docstring": "XML document content." 
+ } + } + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.content.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.content.Content.raw", + "signature": null, + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.content.Content.source", + "signature": null, + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.content.Content.content_type", + "signature": null, + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.content.Content.metadata", + "signature": null, + "docstring": null + } + } + } + } + }, + "parser": { + "name": "parser", + "kind": "module", + "path": "omniread.core.parser", + "signature": null, + "docstring": "Abstract parsing contracts for OmniRead.\n\nThis module defines the **format-agnostic parser interface** used to transform\nraw content into structured, typed representations.\n\nParsers are responsible for:\n- Interpreting a single `Content` instance\n- Validating compatibility with the content type\n- Producing a structured output suitable for downstream consumers\n\nParsers are not responsible for:\n- Fetching or acquiring content\n- 
Performing retries or error recovery\n- Managing multiple content sources", + "members": { + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.core.parser.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.core.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.core.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.core.parser.TypeVar", + "signature": "", + "docstring": null + }, + "Set": { + "name": "Set", + "kind": "alias", + "path": "omniread.core.parser.Set", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.parser.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.parser.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.parser.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.parser.Content.content_type", + "signature": 
"", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.parser.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.core.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.core.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." 
+ } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.core.parser.T", + "signature": null, + "docstring": null + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.core.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.core.parser.BaseParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.core.parser.BaseParser.content", + "signature": null, + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.core.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.core.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." 
+ } + } + } + } + }, + "scraper": { + "name": "scraper", + "kind": "module", + "path": "omniread.core.scraper", + "signature": null, + "docstring": "Abstract scraping contracts for OmniRead.\n\nThis module defines the **format-agnostic scraper interface** responsible for\nacquiring raw content from external sources.\n\nScrapers are responsible for:\n- Locating and retrieving raw content bytes\n- Attaching minimal contextual metadata\n- Returning normalized `Content` objects\n\nScrapers are explicitly NOT responsible for:\n- Parsing or interpreting content\n- Inferring structure or semantics\n- Performing content-type specific processing\n\nAll interpretation must be delegated to parsers.", + "members": { + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.core.scraper.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.core.scraper.abstractmethod", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.core.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.core.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.core.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.core.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n 
content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.core.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.core.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.core.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.core.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.core.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.core.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + } + } + } + } + }, + "html": { + "name": "html", + "kind": "module", + "path": "omniread.html", + "signature": null, + "docstring": "HTML format implementation for OmniRead.\n\nThis package provides **HTML-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nIt includes:\n- HTML parsers that interpret HTML content\n- HTML scrapers that retrieve HTML documents\n\nThis package:\n- Implements, but does not redefine, core contracts\n- May contain HTML-specific behavior and edge-case handling\n- Produces canonical content models defined in `omniread.core.content`\n\nConsumers should depend on `omniread.core` interfaces wherever possible and\nuse this package only when HTML-specific behavior is required.", + "members": { + "HTMLScraper": { + "name": "HTMLScraper", + "kind": "class", + "path": "omniread.html.HTMLScraper", + "signature": "", + "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses", + "members": { + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.HTMLScraper.content_type", + "signature": "", + "docstring": null + }, + "validate_content_type": { + "name": "validate_content_type", + "kind": "function", + "path": "omniread.html.HTMLScraper.validate_content_type", + "signature": "", + "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n response: HTTP response returned by `httpx`.\n\nRaises:\n ValueError: If the `Content-Type` header is missing or does not\n indicate HTML content." 
+ }, + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.HTMLScraper.fetch", + "signature": "", + "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n source: URL of the HTML document.\n metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw HTML bytes\n - Source URL\n - HTML content type\n - HTTP response metadata\n\nRaises:\n httpx.HTTPError: If the HTTP request fails.\n ValueError: If the response is not valid HTML." + } + } + }, + "HTMLParser": { + "name": "HTMLParser", + "kind": "class", + "path": "omniread.html.HTMLParser", + "signature": "", + "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.HTMLParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser (HTML only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.HTMLParser.parse", + "signature": "", + "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`." + }, + "parse_div": { + "name": "parse_div", + "kind": "function", + "path": "omniread.html.HTMLParser.parse_div", + "signature": "", + "docstring": "Extract normalized text from a `
`.\n\nReturns:\n A list of rows, where each row is a list of cell text values." + }, + "parse_meta": { + "name": "parse_meta", + "kind": "function", + "path": "omniread.html.HTMLParser.parse_meta", + "signature": "", + "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `` tag name/property → content mappings\n\nReturns:\n Dictionary containing extracted metadata." + } + } + }, + "parser": { + "name": "parser", + "kind": "module", + "path": "omniread.html.parser", + "signature": null, + "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.", + "members": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.html.parser.Any", + "signature": "", + "docstring": null + }, + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.html.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.html.parser.TypeVar", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.html.parser.Optional", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.html.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "BeautifulSoup": { + "name": "BeautifulSoup", + "kind": "alias", + "path": "omniread.html.parser.BeautifulSoup", + "signature": "", + "docstring": null + }, + "Tag": { + 
"name": "Tag", + "kind": "alias", + "path": "omniread.html.parser.Tag", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.html.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.html.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." 
+ } + } + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.html.parser.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.html.parser.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.html.parser.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.parser.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.html.parser.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.html.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + 
"supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.parser.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.html.parser.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.html.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." 
+ } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.html.parser.T", + "signature": null, + "docstring": null + }, + "HTMLParser": { + "name": "HTMLParser", + "kind": "class", + "path": "omniread.html.parser.HTMLParser", + "signature": "", + "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.html.parser.HTMLParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser (HTML only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse", + "signature": "", + "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`." + }, + "parse_div": { + "name": "parse_div", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_div", + "signature": "", + "docstring": "Extract normalized text from a `
`.\n\nReturns:\n A list of rows, where each row is a list of cell text values." + }, + "parse_meta": { + "name": "parse_meta", + "kind": "function", + "path": "omniread.html.parser.HTMLParser.parse_meta", + "signature": "", + "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `` tag name/property → content mappings\n\nReturns:\n Dictionary containing extracted metadata." + } + } + }, + "list": { + "name": "list", + "kind": "alias", + "path": "omniread.html.parser.list", + "signature": "", + "docstring": null + }, + "dict": { + "name": "dict", + "kind": "alias", + "path": "omniread.html.parser.dict", + "signature": "", + "docstring": null + } + } + }, + "scraper": { + "name": "scraper", + "kind": "module", + "path": "omniread.html.scraper", + "signature": null, + "docstring": "HTML scraping implementation for OmniRead.\n\nThis module provides an HTTP-based scraper for retrieving HTML documents.\nIt implements the core `BaseScraper` contract using `httpx` as the transport\nlayer.\n\nThis scraper is responsible for:\n- Fetching raw HTML bytes over HTTP(S)\n- Validating response content type\n- Attaching HTTP metadata to the returned content\n\nThis scraper is not responsible for:\n- Parsing or interpreting HTML\n- Retrying failed requests\n- Managing crawl policies or rate limiting", + "members": { + "httpx": { + "name": "httpx", + "kind": "alias", + "path": "omniread.html.scraper.httpx", + "signature": "", + "docstring": null + }, + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.html.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.html.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.html.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": 
"omniread.html.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.html.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.html.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.html.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.html.scraper.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." 
+ }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.html.scraper.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.html.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + }, + "HTMLScraper": { + "name": "HTMLScraper", + "kind": "class", + "path": "omniread.html.scraper.HTMLScraper", + "signature": "", + "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses", + "members": { + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.html.scraper.HTMLScraper.content_type", + "signature": null, + "docstring": null + }, + "validate_content_type": { + "name": "validate_content_type", + "kind": "function", + "path": "omniread.html.scraper.HTMLScraper.validate_content_type", + "signature": "", + "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n response: HTTP response returned by `httpx`.\n\nRaises:\n ValueError: If the `Content-Type` header is missing or does not\n indicate HTML content." + }, + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.html.scraper.HTMLScraper.fetch", + "signature": "", + "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n source: URL of the HTML document.\n metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw HTML bytes\n - Source URL\n - HTML content type\n - HTTP response metadata\n\nRaises:\n httpx.HTTPError: If the HTTP request fails.\n ValueError: If the response is not valid HTML." 
+ } + } + } + } + } + } + }, + "pdf": { + "name": "pdf", + "kind": "module", + "path": "omniread.pdf", + "signature": null, + "docstring": "PDF format implementation for OmniRead.\n\nThis package provides **PDF-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nUnlike HTML, PDF handling requires an explicit client layer for document\naccess. This package therefore includes:\n- PDF clients for acquiring raw PDF data\n- PDF scrapers that coordinate client access\n- PDF parsers that extract structured content from PDF binaries\n\nPublic exports from this package represent the supported PDF pipeline\nand are safe for consumers to import directly when working with PDFs.", + "members": { + "FileSystemPDFClient": { + "name": "FileSystemPDFClient", + "kind": "class", + "path": "omniread.pdf.FileSystemPDFClient", + "signature": "", + "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.FileSystemPDFClient.fetch", + "signature": "", + "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file." 
+ } + } + }, + "PDFScraper": { + "name": "PDFScraper", + "kind": "class", + "path": "omniread.pdf.PDFScraper", + "signature": "", + "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.PDFScraper.fetch", + "signature": "", + "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client." + } + } + }, + "PDFParser": { + "name": "PDFParser", + "kind": "class", + "path": "omniread.pdf.PDFParser", + "signature": "", + "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.PDFParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser (PDF only)." 
+ }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.PDFParser.parse", + "signature": "", + "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + } + } + }, + "client": { + "name": "client", + "kind": "module", + "path": "omniread.pdf.client", + "signature": null, + "docstring": "PDF client abstractions for OmniRead.\n\nThis module defines the **client layer** responsible for retrieving raw PDF\nbytes from a concrete backing store.\n\nClients provide low-level access to PDF binaries and are intentionally\ndecoupled from scraping and parsing logic. They do not perform validation,\ninterpretation, or content extraction.\n\nTypical backing stores include:\n- Local filesystems\n- Object storage (S3, GCS, etc.)\n- Network file systems", + "members": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.pdf.client.Any", + "signature": "", + "docstring": null + }, + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.pdf.client.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.pdf.client.abstractmethod", + "signature": "", + "docstring": null + }, + "Path": { + "name": "Path", + "kind": "alias", + "path": "omniread.pdf.client.Path", + "signature": "", + "docstring": null + }, + "BasePDFClient": { + "name": "BasePDFClient", + "kind": "class", + "path": "omniread.pdf.client.BasePDFClient", + "signature": "", + "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific 
errors on failure", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.client.BasePDFClient.fetch", + "signature": "", + "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation." + } + } + }, + "FileSystemPDFClient": { + "name": "FileSystemPDFClient", + "kind": "class", + "path": "omniread.pdf.client.FileSystemPDFClient", + "signature": "", + "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.client.FileSystemPDFClient.fetch", + "signature": "", + "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file." 
+ } + } + } + } + }, + "parser": { + "name": "parser", + "kind": "module", + "path": "omniread.pdf.parser", + "signature": null, + "docstring": "PDF parser base implementations for OmniRead.\n\nThis module defines the **PDF-specific parser contract**, extending the\nformat-agnostic `BaseParser` with constraints appropriate for PDF content.\n\nPDF parsers are responsible for interpreting binary PDF data and producing\nstructured representations suitable for downstream consumption.", + "members": { + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.pdf.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.pdf.parser.TypeVar", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.pdf.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.pdf.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." 
+ } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.pdf.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.parser.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.pdf.parser.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.pdf.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." 
+ } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.pdf.parser.T", + "signature": null, + "docstring": null + }, + "PDFParser": { + "name": "PDFParser", + "kind": "class", + "path": "omniread.pdf.parser.PDFParser", + "signature": "", + "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.parser.PDFParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser (PDF only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.parser.PDFParser.parse", + "signature": "", + "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." 
+ } + } + } + } + }, + "scraper": { + "name": "scraper", + "kind": "module", + "path": "omniread.pdf.scraper", + "signature": null, + "docstring": "PDF scraping implementation for OmniRead.\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.", + "members": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.pdf.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.pdf.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.pdf.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.pdf.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { 
+ "name": "content_type", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.pdf.scraper.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.pdf.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + }, + "BasePDFClient": { + "name": "BasePDFClient", + "kind": "class", + "path": "omniread.pdf.scraper.BasePDFClient", + "signature": "", + "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.BasePDFClient.fetch", + "signature": "", + "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation." + } + } + }, + "PDFScraper": { + "name": "PDFScraper", + "kind": "class", + "path": "omniread.pdf.scraper.PDFScraper", + "signature": "", + "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.PDFScraper.fetch", + "signature": "", + "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client." 
+ } + } + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.pdf.client.json b/mcp_docs/modules/omniread.pdf.client.json new file mode 100644 index 0000000..127ef5a --- /dev/null +++ b/mcp_docs/modules/omniread.pdf.client.json @@ -0,0 +1,69 @@ +{ + "module": "omniread.pdf.client", + "content": { + "path": "omniread.pdf.client", + "docstring": "PDF client abstractions for OmniRead.\n\nThis module defines the **client layer** responsible for retrieving raw PDF\nbytes from a concrete backing store.\n\nClients provide low-level access to PDF binaries and are intentionally\ndecoupled from scraping and parsing logic. They do not perform validation,\ninterpretation, or content extraction.\n\nTypical backing stores include:\n- Local filesystems\n- Object storage (S3, GCS, etc.)\n- Network file systems", + "objects": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.pdf.client.Any", + "signature": "", + "docstring": null + }, + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.pdf.client.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.pdf.client.abstractmethod", + "signature": "", + "docstring": null + }, + "Path": { + "name": "Path", + "kind": "alias", + "path": "omniread.pdf.client.Path", + "signature": "", + "docstring": null + }, + "BasePDFClient": { + "name": "BasePDFClient", + "kind": "class", + "path": "omniread.pdf.client.BasePDFClient", + "signature": "", + "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.client.BasePDFClient.fetch", + "signature": "", + 
"docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation." + } + } + }, + "FileSystemPDFClient": { + "name": "FileSystemPDFClient", + "kind": "class", + "path": "omniread.pdf.client.FileSystemPDFClient", + "signature": "", + "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.client.FileSystemPDFClient.fetch", + "signature": "", + "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file." + } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.pdf.json b/mcp_docs/modules/omniread.pdf.json new file mode 100644 index 0000000..9067c6d --- /dev/null +++ b/mcp_docs/modules/omniread.pdf.json @@ -0,0 +1,419 @@ +{ + "module": "omniread.pdf", + "content": { + "path": "omniread.pdf", + "docstring": "PDF format implementation for OmniRead.\n\nThis package provides **PDF-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nUnlike HTML, PDF handling requires an explicit client layer for document\naccess. 
This package therefore includes:\n- PDF clients for acquiring raw PDF data\n- PDF scrapers that coordinate client access\n- PDF parsers that extract structured content from PDF binaries\n\nPublic exports from this package represent the supported PDF pipeline\nand are safe for consumers to import directly when working with PDFs.", + "objects": { + "FileSystemPDFClient": { + "name": "FileSystemPDFClient", + "kind": "class", + "path": "omniread.pdf.FileSystemPDFClient", + "signature": "", + "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.FileSystemPDFClient.fetch", + "signature": "", + "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file." + } + } + }, + "PDFScraper": { + "name": "PDFScraper", + "kind": "class", + "path": "omniread.pdf.PDFScraper", + "signature": "", + "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.PDFScraper.fetch", + "signature": "", + "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client." 
+ } + } + }, + "PDFParser": { + "name": "PDFParser", + "kind": "class", + "path": "omniread.pdf.PDFParser", + "signature": "", + "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.PDFParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser (PDF only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.PDFParser.parse", + "signature": "", + "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + } + } + }, + "client": { + "name": "client", + "kind": "module", + "path": "omniread.pdf.client", + "signature": null, + "docstring": "PDF client abstractions for OmniRead.\n\nThis module defines the **client layer** responsible for retrieving raw PDF\nbytes from a concrete backing store.\n\nClients provide low-level access to PDF binaries and are intentionally\ndecoupled from scraping and parsing logic. 
They do not perform validation,\ninterpretation, or content extraction.\n\nTypical backing stores include:\n- Local filesystems\n- Object storage (S3, GCS, etc.)\n- Network file systems", + "members": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.pdf.client.Any", + "signature": "", + "docstring": null + }, + "ABC": { + "name": "ABC", + "kind": "alias", + "path": "omniread.pdf.client.ABC", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.pdf.client.abstractmethod", + "signature": "", + "docstring": null + }, + "Path": { + "name": "Path", + "kind": "alias", + "path": "omniread.pdf.client.Path", + "signature": "", + "docstring": null + }, + "BasePDFClient": { + "name": "BasePDFClient", + "kind": "class", + "path": "omniread.pdf.client.BasePDFClient", + "signature": "", + "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.client.BasePDFClient.fetch", + "signature": "", + "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation." 
+ } + } + }, + "FileSystemPDFClient": { + "name": "FileSystemPDFClient", + "kind": "class", + "path": "omniread.pdf.client.FileSystemPDFClient", + "signature": "", + "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.client.FileSystemPDFClient.fetch", + "signature": "", + "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file." + } + } + } + } + }, + "parser": { + "name": "parser", + "kind": "module", + "path": "omniread.pdf.parser", + "signature": null, + "docstring": "PDF parser base implementations for OmniRead.\n\nThis module defines the **PDF-specific parser contract**, extending the\nformat-agnostic `BaseParser` with constraints appropriate for PDF content.\n\nPDF parsers are responsible for interpreting binary PDF data and producing\nstructured representations suitable for downstream consumption.", + "members": { + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.pdf.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.pdf.parser.TypeVar", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.pdf.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.pdf.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. 
It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.pdf.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.parser.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." 
+ }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.pdf.parser.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.pdf.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." + } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.pdf.parser.T", + "signature": null, + "docstring": null + }, + "PDFParser": { + "name": "PDFParser", + "kind": "class", + "path": "omniread.pdf.parser.PDFParser", + "signature": "", + "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.parser.PDFParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser (PDF only)." 
+ }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.parser.PDFParser.parse", + "signature": "", + "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." + } + } + } + } + }, + "scraper": { + "name": "scraper", + "kind": "module", + "path": "omniread.pdf.scraper", + "signature": null, + "docstring": "PDF scraping implementation for OmniRead.\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.", + "members": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.pdf.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.pdf.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.pdf.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.pdf.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, 
implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.pdf.scraper.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.XML", + "signature": "", + "docstring": "XML document content." 
+ } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.pdf.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + }, + "BasePDFClient": { + "name": "BasePDFClient", + "kind": "class", + "path": "omniread.pdf.scraper.BasePDFClient", + "signature": "", + "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.BasePDFClient.fetch", + "signature": "", + "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation." + } + } + }, + "PDFScraper": { + "name": "PDFScraper", + "kind": "class", + "path": "omniread.pdf.scraper.PDFScraper", + "signature": "", + "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.PDFScraper.fetch", + "signature": "", + "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client." 
+ } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.pdf.parser.json b/mcp_docs/modules/omniread.pdf.parser.json new file mode 100644 index 0000000..471c058 --- /dev/null +++ b/mcp_docs/modules/omniread.pdf.parser.json @@ -0,0 +1,134 @@ +{ + "module": "omniread.pdf.parser", + "content": { + "path": "omniread.pdf.parser", + "docstring": "PDF parser base implementations for OmniRead.\n\nThis module defines the **PDF-specific parser contract**, extending the\nformat-agnostic `BaseParser` with constraints appropriate for PDF content.\n\nPDF parsers are responsible for interpreting binary PDF data and producing\nstructured representations suitable for downstream consumption.", + "objects": { + "Generic": { + "name": "Generic", + "kind": "alias", + "path": "omniread.pdf.parser.Generic", + "signature": "", + "docstring": null + }, + "TypeVar": { + "name": "TypeVar", + "kind": "alias", + "path": "omniread.pdf.parser.TypeVar", + "signature": "", + "docstring": null + }, + "abstractmethod": { + "name": "abstractmethod", + "kind": "alias", + "path": "omniread.pdf.parser.abstractmethod", + "signature": "", + "docstring": null + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.pdf.parser.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." 
+ }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.pdf.parser.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseParser": { + "name": "BaseParser", + "kind": "class", + "path": "omniread.pdf.parser.BaseParser", + "signature": "", + "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.parser.BaseParser.supported_types", + "signature": "", + "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic." + }, + "content": { + "name": "content", + "kind": "attribute", + "path": "omniread.pdf.parser.BaseParser.content", + "signature": "", + "docstring": null + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.parser.BaseParser.parse", + "signature": "", + "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." 
+ }, + "supports": { + "name": "supports", + "kind": "function", + "path": "omniread.pdf.parser.BaseParser.supports", + "signature": "", + "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise." + } + } + }, + "T": { + "name": "T", + "kind": "attribute", + "path": "omniread.pdf.parser.T", + "signature": null, + "docstring": null + }, + "PDFParser": { + "name": "PDFParser", + "kind": "class", + "path": "omniread.pdf.parser.PDFParser", + "signature": "", + "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method", + "members": { + "supported_types": { + "name": "supported_types", + "kind": "attribute", + "path": "omniread.pdf.parser.PDFParser.supported_types", + "signature": null, + "docstring": "Set of content types supported by this parser (PDF only)." + }, + "parse": { + "name": "parse", + "kind": "function", + "path": "omniread.pdf.parser.PDFParser.parse", + "signature": "", + "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation." 
+ } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/modules/omniread.pdf.scraper.json b/mcp_docs/modules/omniread.pdf.scraper.json new file mode 100644 index 0000000..3d5756f --- /dev/null +++ b/mcp_docs/modules/omniread.pdf.scraper.json @@ -0,0 +1,152 @@ +{ + "module": "omniread.pdf.scraper", + "content": { + "path": "omniread.pdf.scraper", + "docstring": "PDF scraping implementation for OmniRead.\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.", + "objects": { + "Any": { + "name": "Any", + "kind": "alias", + "path": "omniread.pdf.scraper.Any", + "signature": "", + "docstring": null + }, + "Mapping": { + "name": "Mapping", + "kind": "alias", + "path": "omniread.pdf.scraper.Mapping", + "signature": "", + "docstring": null + }, + "Optional": { + "name": "Optional", + "kind": "alias", + "path": "omniread.pdf.scraper.Optional", + "signature": "", + "docstring": null + }, + "Content": { + "name": "Content", + "kind": "class", + "path": "omniread.pdf.scraper.Content", + "signature": "", + "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).", + "members": { + "raw": { + "name": "raw", + "kind": "attribute", + "path": 
"omniread.pdf.scraper.Content.raw", + "signature": "", + "docstring": null + }, + "source": { + "name": "source", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.source", + "signature": "", + "docstring": null + }, + "content_type": { + "name": "content_type", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.content_type", + "signature": "", + "docstring": null + }, + "metadata": { + "name": "metadata", + "kind": "attribute", + "path": "omniread.pdf.scraper.Content.metadata", + "signature": "", + "docstring": null + } + } + }, + "ContentType": { + "name": "ContentType", + "kind": "class", + "path": "omniread.pdf.scraper.ContentType", + "signature": "", + "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.", + "members": { + "HTML": { + "name": "HTML", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.HTML", + "signature": "", + "docstring": "HTML document content." + }, + "PDF": { + "name": "PDF", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.PDF", + "signature": "", + "docstring": "PDF document content." + }, + "JSON": { + "name": "JSON", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.JSON", + "signature": "", + "docstring": "JSON document content." + }, + "XML": { + "name": "XML", + "kind": "attribute", + "path": "omniread.pdf.scraper.ContentType.XML", + "signature": "", + "docstring": "XML document content." + } + } + }, + "BaseScraper": { + "name": "BaseScraper", + "kind": "class", + "path": "omniread.pdf.scraper.BaseScraper", + "signature": "", + "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. 
It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.BaseScraper.fetch", + "signature": "", + "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation." 
+ } + } + }, + "BasePDFClient": { + "name": "BasePDFClient", + "kind": "class", + "path": "omniread.pdf.scraper.BasePDFClient", + "signature": "", + "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.BasePDFClient.fetch", + "signature": "", + "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation." + } + } + }, + "PDFScraper": { + "name": "PDFScraper", + "kind": "class", + "path": "omniread.pdf.scraper.PDFScraper", + "signature": "", + "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata", + "members": { + "fetch": { + "name": "fetch", + "kind": "function", + "path": "omniread.pdf.scraper.PDFScraper.fetch", + "signature": "", + "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client." 
+ } + } + } + } + } +} \ No newline at end of file diff --git a/mcp_docs/nav.json b/mcp_docs/nav.json new file mode 100644 index 0000000..61b75cc --- /dev/null +++ b/mcp_docs/nav.json @@ -0,0 +1,50 @@ +[ + { + "module": "omniread", + "resource": "doc://modules/omniread" + }, + { + "module": "omniread.core", + "resource": "doc://modules/omniread.core" + }, + { + "module": "omniread.core.content", + "resource": "doc://modules/omniread.core.content" + }, + { + "module": "omniread.core.parser", + "resource": "doc://modules/omniread.core.parser" + }, + { + "module": "omniread.core.scraper", + "resource": "doc://modules/omniread.core.scraper" + }, + { + "module": "omniread.html", + "resource": "doc://modules/omniread.html" + }, + { + "module": "omniread.html.parser", + "resource": "doc://modules/omniread.html.parser" + }, + { + "module": "omniread.html.scraper", + "resource": "doc://modules/omniread.html.scraper" + }, + { + "module": "omniread.pdf", + "resource": "doc://modules/omniread.pdf" + }, + { + "module": "omniread.pdf.client", + "resource": "doc://modules/omniread.pdf.client" + }, + { + "module": "omniread.pdf.parser", + "resource": "doc://modules/omniread.pdf.parser" + }, + { + "module": "omniread.pdf.scraper", + "resource": "doc://modules/omniread.pdf.scraper" + } +] \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 2a736d4..1cf8fee 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,50 +4,51 @@ site_description: Format-agnostic document reading, parsing, and scraping framew theme: name: material palette: - - scheme: slate - primary: deep purple - accent: cyan + - scheme: slate + primary: deep purple + accent: cyan font: text: Inter code: JetBrains Mono features: - - navigation.tabs - - navigation.expand - - navigation.top - - navigation.instant - - content.code.copy - - content.code.annotate + - navigation.tabs + - navigation.expand + - navigation.top + - navigation.instant + - content.code.copy + - content.code.annotate plugins: - - search - - 
mkdocstrings: - handlers: - python: - paths: ["."] - options: - docstring_style: google - show_source: false - show_signature_annotations: true - separate_signature: true - merge_init_into_class: true - inherited_members: true - annotations_path: brief - show_root_heading: true - group_by_category: true +- search +- mkdocstrings: + handlers: + python: + paths: + - . + options: + docstring_style: google + show_source: false + show_signature_annotations: true + separate_signature: true + merge_init_into_class: true + inherited_members: true + annotations_path: brief + show_root_heading: true + group_by_category: true nav: - - Home: index.md - - - Core (Contracts): - - Content Models: core/content.md - - Parsers: core/parser.md - - Scrapers: core/scraper.md - - - HTML Implementation: - - HTML Parser: html/parser.md - - HTML Scraper: html/scraper.md - - - PDF Implementation: - - PDF Client: pdf/client.md - - PDF Parser: pdf/parser.md - - PDF Scraper: pdf/scraper.md +- Home: omniread/index.md +- Core API: + - omniread/core/index.md + - omniread/core/content.md + - omniread/core/parser.md + - omniread/core/scraper.md +- HTML Handling: + - omniread/html/index.md + - omniread/html/parser.md + - omniread/html/scraper.md +- PDF Handling: + - omniread/pdf/index.md + - omniread/pdf/client.md + - omniread/pdf/parser.md + - omniread/pdf/scraper.md diff --git a/omniread/__init__.py b/omniread/__init__.py index 6d9b3a9..8108e09 100644 --- a/omniread/__init__.py +++ b/omniread/__init__.py @@ -94,6 +94,27 @@ PDF: - FileSystemPDFClient - PDFScraper - PDFParser + +## Core Philosophy + +`OmniRead` is designed as a **decoupled content engine**: + +1. **Separation of Concerns**: Scrapers *fetch*, Parsers *interpret*. Neither knows about the other. +2. **Normalized Exchange**: All components communicate via the `Content` model, ensuring a consistent contract. +3. **Format Agnosticism**: The core logic is independent of whether the input is HTML, PDF, or JSON. 
+ +## Documentation Design + +For those extending `OmniRead`, follow these "AI-Native" docstring principles: + +### For Humans +- **Clear Contracts**: Explicitly state what a component is and is NOT responsible for. +- **Runnable Examples**: Include small, logical snippets in the package `__init__.py`. + +### For LLMs +- **Structured Models**: Use dataclasses and enums for core data to ensure clean MCP JSON representation. +- **Type Safety**: All public APIs must be fully typed and have corresponding `.pyi` stubs. +- **Detailed Raises**: Include `ExceptionType: description` pairs in the `Raises` section to help agents handle errors gracefully. """ from .core import Content, ContentType diff --git a/omniread/__init__.pyi b/omniread/__init__.pyi new file mode 100644 index 0000000..0980e93 --- /dev/null +++ b/omniread/__init__.pyi @@ -0,0 +1,13 @@ +from .core import Content, ContentType +from .html import HTMLScraper, HTMLParser +from .pdf import FileSystemPDFClient, PDFScraper, PDFParser + +__all__ = [ + "Content", + "ContentType", + "HTMLScraper", + "HTMLParser", + "FileSystemPDFClient", + "PDFScraper", + "PDFParser", +] diff --git a/omniread/core/__init__.pyi b/omniread/core/__init__.pyi new file mode 100644 index 0000000..e1f1c1c --- /dev/null +++ b/omniread/core/__init__.pyi @@ -0,0 +1,10 @@ +from .content import Content, ContentType +from .parser import BaseParser +from .scraper import BaseScraper + +__all__ = [ + "Content", + "ContentType", + "BaseParser", + "BaseScraper", +] diff --git a/omniread/core/content.pyi b/omniread/core/content.pyi new file mode 100644 index 0000000..5606462 --- /dev/null +++ b/omniread/core/content.pyi @@ -0,0 +1,15 @@ +from enum import Enum +from typing import Any, Mapping, Optional + +class ContentType(str, Enum): + HTML = "text/html" + PDF = "application/pdf" + JSON = "application/json" + XML = "application/xml" + +class Content: + raw: bytes + source: str + content_type: Optional[ContentType] + metadata: Optional[Mapping[str, Any]] + def
__init__(self, raw: bytes, source: str, content_type: Optional[ContentType] = ..., metadata: Optional[Mapping[str, Any]] = ...) -> None: ... diff --git a/omniread/core/parser.pyi b/omniread/core/parser.pyi new file mode 100644 index 0000000..c76e60f --- /dev/null +++ b/omniread/core/parser.pyi @@ -0,0 +1,13 @@ +from abc import ABC, abstractmethod +from typing import Generic, TypeVar, Set +from .content import Content, ContentType + +T = TypeVar("T") + +class BaseParser(ABC, Generic[T]): + supported_types: Set[ContentType] + content: Content + def __init__(self, content: Content) -> None: ... + @abstractmethod + def parse(self) -> T: ... + def supports(self) -> bool: ... diff --git a/omniread/core/scraper.pyi b/omniread/core/scraper.pyi new file mode 100644 index 0000000..cf0ee5f --- /dev/null +++ b/omniread/core/scraper.pyi @@ -0,0 +1,7 @@ +from abc import ABC, abstractmethod +from typing import Any, Mapping, Optional +from .content import Content + +class BaseScraper(ABC): + @abstractmethod + def fetch(self, source: str, *, metadata: Optional[Mapping[str, Any]] = ...) -> Content: ... diff --git a/omniread/html/__init__.pyi b/omniread/html/__init__.pyi new file mode 100644 index 0000000..e52c56d --- /dev/null +++ b/omniread/html/__init__.pyi @@ -0,0 +1,4 @@ +from .scraper import HTMLScraper +from .parser import HTMLParser + +__all__ = ["HTMLScraper", "HTMLParser"] diff --git a/omniread/html/parser.pyi b/omniread/html/parser.pyi new file mode 100644 index 0000000..78d8fad --- /dev/null +++ b/omniread/html/parser.pyi @@ -0,0 +1,18 @@ +from typing import Any, Generic, TypeVar, Optional +from bs4 import BeautifulSoup, Tag +from omniread.core.content import ContentType, Content +from omniread.core.parser import BaseParser + +T = TypeVar("T") + +class HTMLParser(BaseParser[T], Generic[T]): + supported_types: set[ContentType] + def __init__(self, content: Content, features: str = ...) -> None: ... + def parse(self) -> T: ...
+ @staticmethod + def parse_div(div: Tag, *, separator: str = ...) -> str: ... + @staticmethod + def parse_link(a: Tag) -> Optional[str]: ... + @staticmethod + def parse_table(table: Tag) -> list[list[str]]: ... + def parse_meta(self) -> dict[str, Any]: ... diff --git a/omniread/html/scraper.pyi b/omniread/html/scraper.pyi new file mode 100644 index 0000000..2249bca --- /dev/null +++ b/omniread/html/scraper.pyi @@ -0,0 +1,10 @@ +import httpx +from typing import Any, Mapping, Optional +from omniread.core.content import Content, ContentType +from omniread.core.scraper import BaseScraper + +class HTMLScraper(BaseScraper): + content_type: ContentType + def __init__(self, *, client: Optional[httpx.Client] = ..., timeout: float = ..., headers: Optional[Mapping[str, str]] = ..., follow_redirects: bool = ...) -> None: ... + def validate_content_type(self, response: httpx.Response) -> None: ... + def fetch(self, source: str, *, metadata: Optional[Mapping[str, Any]] = ...) -> Content: ... diff --git a/omniread/pdf/__init__.pyi b/omniread/pdf/__init__.pyi new file mode 100644 index 0000000..bfd206e --- /dev/null +++ b/omniread/pdf/__init__.pyi @@ -0,0 +1,5 @@ +from .client import FileSystemPDFClient +from .scraper import PDFScraper +from .parser import PDFParser + +__all__ = ["FileSystemPDFClient", "PDFScraper", "PDFParser"] diff --git a/omniread/pdf/client.py b/omniread/pdf/client.py index 686821c..c1de901 100644 --- a/omniread/pdf/client.py +++ b/omniread/pdf/client.py @@ -14,6 +14,7 @@ Typical backing stores include: - Network file systems """ +from typing import Any from abc import ABC, abstractmethod from pathlib import Path @@ -30,7 +31,7 @@ class BasePDFClient(ABC): """ @abstractmethod - def fetch(self, source: str) -> bytes: + def fetch(self, source: Any) -> bytes: """ Fetch raw PDF bytes from the given source. 
diff --git a/omniread/pdf/client.pyi b/omniread/pdf/client.pyi new file mode 100644 index 0000000..896f87a --- /dev/null +++ b/omniread/pdf/client.pyi @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any + +class BasePDFClient(ABC): + @abstractmethod + def fetch(self, source: Any) -> bytes: ... + +class FileSystemPDFClient(BasePDFClient): + def fetch(self, source: Path | str) -> bytes: ... diff --git a/omniread/pdf/parser.pyi b/omniread/pdf/parser.pyi new file mode 100644 index 0000000..79439f7 --- /dev/null +++ b/omniread/pdf/parser.pyi @@ -0,0 +1,11 @@ +from abc import abstractmethod +from typing import Generic, TypeVar +from omniread.core.content import ContentType +from omniread.core.parser import BaseParser + +T = TypeVar("T") + +class PDFParser(BaseParser[T], Generic[T]): + supported_types: set[ContentType] + @abstractmethod + def parse(self) -> T: ... diff --git a/omniread/pdf/scraper.py b/omniread/pdf/scraper.py index 9198679..7446ef0 100644 --- a/omniread/pdf/scraper.py +++ b/omniread/pdf/scraper.py @@ -39,7 +39,7 @@ class PDFScraper(BaseScraper): def fetch( self, - source: str, + source: Any, *, metadata: Optional[Mapping[str, Any]] = None, ) -> Content: diff --git a/omniread/pdf/scraper.pyi b/omniread/pdf/scraper.pyi new file mode 100644 index 0000000..cb4ba75 --- /dev/null +++ b/omniread/pdf/scraper.pyi @@ -0,0 +1,8 @@ +from typing import Any, Mapping, Optional +from omniread.core.content import Content, ContentType +from omniread.core.scraper import BaseScraper +from .client import BasePDFClient + +class PDFScraper(BaseScraper): + def __init__(self, *, client: BasePDFClient) -> None: ... + def fetch(self, source: Any, *, metadata: Optional[Mapping[str, Any]] = ...) -> Content: ... 
diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 3f4e28f..0000000 --- a/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -httpx==0.27.0 -beautifulsoup4==4.12.0 -pydantic==2.12.3 -jinja2==3.1.6 -# lxml==5.2.0 - -# Test Packages -pytest==7.4.0 -pytest-asyncio==0.21.0 -pytest-cov==4.1.0 - -# Doc Packages -mkdocs==1.6.1 -mkdocs-material==9.6.23 -neoteroi-mkdocs==1.1.3 -pymdown-extensions==10.16.1 -mkdocs-swagger-ui-tag==0.7.2 -mkdocstrings==1.0.0 -mkdocstrings-python==2.0.1