2026-01-22 11:27:57 +00:00
46 changed files with 4475 additions and 107 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -37,4 +37,5 @@ Thumbs.db
 *.swp
 *.swo
 *~
-*.tmp
+*.tmp
 site
--- a/docforge.nav.yml
+++ b/docforge.nav.yml
@@ -0,0 +1,16 @@
 home: omniread/index.md
 groups:
  Core API:
    - omniread/core/index.md
    - omniread/core/content.md
    - omniread/core/parser.md
    - omniread/core/scraper.md
  HTML Handling:
    - omniread/html/index.md
    - omniread/html/parser.md
    - omniread/html/scraper.md
  PDF Handling:
    - omniread/pdf/index.md
    - omniread/pdf/client.md
    - omniread/pdf/parser.md
    - omniread/pdf/scraper.md
--- a/docs/omniread/core/content.md
+++ b/docs/omniread/core/content.md
@@ -1 +1,3 @@
 # Content
 ::: omniread.core.content
--- a/docs/omniread/core/index.md
+++ b/docs/omniread/core/index.md
@@ -1 +1,3 @@
 # Core
 ::: omniread.core
--- a/docs/omniread/core/parser.md
+++ b/docs/omniread/core/parser.md
@@ -1 +1,3 @@
 # Parser
 ::: omniread.core.parser
--- a/docs/omniread/core/scraper.md
+++ b/docs/omniread/core/scraper.md
@@ -1 +1,3 @@
 # Scraper
 ::: omniread.core.scraper
--- a/docs/omniread/html/index.md
+++ b/docs/omniread/html/index.md
@@ -1 +1,3 @@
 # HTML
 ::: omniread.html
--- a/docs/omniread/html/parser.md
+++ b/docs/omniread/html/parser.md
@@ -1 +1,3 @@
 # Parser
 ::: omniread.html.parser
--- a/docs/omniread/html/scraper.md
+++ b/docs/omniread/html/scraper.md
@@ -1 +1,3 @@
 # Scraper
 ::: omniread.html.scraper
--- a/docs/omniread/index.md
+++ b/docs/omniread/index.md
@@ -1 +1,3 @@
 # OmniRead
 ::: omniread
--- a/docs/omniread/pdf/client.md
+++ b/docs/omniread/pdf/client.md
@@ -1 +1,3 @@
 # Client
 ::: omniread.pdf.client
--- a/docs/omniread/pdf/index.md
+++ b/docs/omniread/pdf/index.md
@@ -1 +1,3 @@
 # PDF
 ::: omniread.pdf
--- a/docs/omniread/pdf/parser.md
+++ b/docs/omniread/pdf/parser.md
@@ -1 +1,3 @@
 # Parser
 ::: omniread.pdf.parser
--- a/docs/omniread/pdf/scraper.md
+++ b/docs/omniread/pdf/scraper.md
@@ -1 +1,3 @@
 # Scraper
 ::: omniread.pdf.scraper
--- a/generate_docs.py
+++ b/generate_docs.py
@@ -1,46 +0,0 @@
 """
 Programmatic MkDocs build script for OmniRead.
 This script builds (or serves) the documentation by invoking MkDocs
 *as a Python library*, not via shell commands.
 Requirements:
 - mkdocs
 - mkdocs-material
 - mkdocstrings[python]
 Usage:
    python generate_docs.py
    python generate_docs.py --serve
 """
 import sys
 from pathlib import Path
 from mkdocs.commands import build as mkdocs_build
 from mkdocs.commands import serve as mkdocs_serve
 from mkdocs.config import load_config
 # Directory containing this script; used as the project root.
 PROJECT_ROOT = Path(__file__).resolve().parent
 # MkDocs configuration file expected directly under the project root.
 MKDOCS_YML = PROJECT_ROOT / "mkdocs.yml"
 def main() -> None:
    """Build the documentation site, or serve it when ``--serve`` is given.

    The mode is chosen by looking for the literal ``--serve`` flag in
    ``sys.argv``; any other invocation performs a static build.

    Raises:
        FileNotFoundError: If the file at ``MKDOCS_YML`` does not exist.
    """
    if not MKDOCS_YML.exists():
        raise FileNotFoundError("mkdocs.yml not found at project root")
    config_path = str(MKDOCS_YML)
    if "--serve" in sys.argv:
        # Live-reload development server. serve() expects the path of the
        # config file (it loads and reloads the configuration itself), not an
        # already-loaded config object.
        mkdocs_serve.serve(config_file=config_path)
        return
    # Static site build: build() operates on a loaded configuration object.
    mkdocs_build.build(load_config(config_path))
 # Run main() only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/mcp_docs/index.json
+++ b/mcp_docs/index.json
@@ -0,0 +1,6 @@
 {
  "project": "omniread",
  "type": "docforge-model",
  "modules_count": 12,
  "source": "docforge"
 }
--- a/mcp_docs/modules/omniread.core.content.json
+++ b/mcp_docs/modules/omniread.core.content.json
@@ -0,0 +1,118 @@
 {
  "module": "omniread.core.content",
  "content": {
    "path": "omniread.core.content",
    "docstring": "Canonical content models for OmniRead.\n\nThis module defines the **format-agnostic content representation** used across\nall parsers and scrapers in OmniRead.\n\nThe models defined here represent *what* was extracted, not *how* it was\nretrieved or parsed. Format-specific behavior and metadata must not alter\nthe semantic meaning of these models.",
    "objects": {
      "Enum": {
        "name": "Enum",
        "kind": "alias",
        "path": "omniread.core.content.Enum",
        "signature": "<bound method Alias.signature of Alias('Enum', 'enum.Enum')>",
        "docstring": null
      },
      "dataclass": {
        "name": "dataclass",
        "kind": "alias",
        "path": "omniread.core.content.dataclass",
        "signature": "<bound method Alias.signature of Alias('dataclass', 'dataclasses.dataclass')>",
        "docstring": null
      },
      "Any": {
        "name": "Any",
        "kind": "alias",
        "path": "omniread.core.content.Any",
        "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
        "docstring": null
      },
      "Mapping": {
        "name": "Mapping",
        "kind": "alias",
        "path": "omniread.core.content.Mapping",
        "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
        "docstring": null
      },
      "Optional": {
        "name": "Optional",
        "kind": "alias",
        "path": "omniread.core.content.Optional",
        "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
        "docstring": null
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.core.content.ContentType",
        "signature": "<bound method Class.signature of Class('ContentType', 17, 36)>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.core.content.ContentType.HTML",
            "signature": null,
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.core.content.ContentType.PDF",
            "signature": null,
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.core.content.ContentType.JSON",
            "signature": null,
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.core.content.ContentType.XML",
            "signature": null,
            "docstring": "XML document content."
          }
        }
      },
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.core.content.Content",
        "signature": "<bound method Class.signature of Class('Content', 39, 63)>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.core.content.Content.raw",
            "signature": null,
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.core.content.Content.source",
            "signature": null,
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.core.content.Content.content_type",
            "signature": null,
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.core.content.Content.metadata",
            "signature": null,
            "docstring": null
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.core.json
+++ b/mcp_docs/modules/omniread.core.json
@@ -0,0 +1,513 @@
 {
  "module": "omniread.core",
  "content": {
    "path": "omniread.core",
    "docstring": "Core domain contracts for OmniRead.\n\nThis package defines the **format-agnostic domain layer** of OmniRead.\nIt exposes canonical content models and abstract interfaces that are\nimplemented by format-specific modules (HTML, PDF, etc.).\n\nPublic exports from this package are considered **stable contracts** and\nare safe for downstream consumers to depend on.\n\nSubmodules:\n- content: Canonical content models and enums\n- parser: Abstract parsing contracts\n- scraper: Abstract scraping contracts\n\nFormat-specific behavior must not be introduced at this layer.",
    "objects": {
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.core.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.core.Content.raw",
            "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.core.Content.source",
            "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.core.Content.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.core.Content.metadata",
            "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
            "docstring": null
          }
        }
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.core.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.core.ContentType.HTML",
            "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.core.ContentType.PDF",
            "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.core.ContentType.JSON",
            "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.core.ContentType.XML",
            "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
            "docstring": "XML document content."
          }
        }
      },
      "BaseParser": {
        "name": "BaseParser",
        "kind": "class",
        "path": "omniread.core.BaseParser",
        "signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
        "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.core.BaseParser.supported_types",
            "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
            "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
          },
          "content": {
            "name": "content",
            "kind": "attribute",
            "path": "omniread.core.BaseParser.content",
            "signature": "<bound method Alias.signature of Alias('content', 'omniread.core.parser.BaseParser.content')>",
            "docstring": null
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.core.BaseParser.parse",
            "signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
            "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
          },
          "supports": {
            "name": "supports",
            "kind": "function",
            "path": "omniread.core.BaseParser.supports",
            "signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
            "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
          }
        }
      },
      "BaseScraper": {
        "name": "BaseScraper",
        "kind": "class",
        "path": "omniread.core.BaseScraper",
        "signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
        "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.core.BaseScraper.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
            "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content object containing raw bytes and metadata.\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
          }
        }
      },
      "content": {
        "name": "content",
        "kind": "module",
        "path": "omniread.core.content",
        "signature": null,
        "docstring": "Canonical content models for OmniRead.\n\nThis module defines the **format-agnostic content representation** used across\nall parsers and scrapers in OmniRead.\n\nThe models defined here represent *what* was extracted, not *how* it was\nretrieved or parsed. Format-specific behavior and metadata must not alter\nthe semantic meaning of these models.",
        "members": {
          "Enum": {
            "name": "Enum",
            "kind": "alias",
            "path": "omniread.core.content.Enum",
            "signature": "<bound method Alias.signature of Alias('Enum', 'enum.Enum')>",
            "docstring": null
          },
          "dataclass": {
            "name": "dataclass",
            "kind": "alias",
            "path": "omniread.core.content.dataclass",
            "signature": "<bound method Alias.signature of Alias('dataclass', 'dataclasses.dataclass')>",
            "docstring": null
          },
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.core.content.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "Mapping": {
            "name": "Mapping",
            "kind": "alias",
            "path": "omniread.core.content.Mapping",
            "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
            "docstring": null
          },
          "Optional": {
            "name": "Optional",
            "kind": "alias",
            "path": "omniread.core.content.Optional",
            "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
            "docstring": null
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.core.content.ContentType",
            "signature": "<bound method Class.signature of Class('ContentType', 17, 36)>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.core.content.ContentType.HTML",
                "signature": null,
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.core.content.ContentType.PDF",
                "signature": null,
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.core.content.ContentType.JSON",
                "signature": null,
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.core.content.ContentType.XML",
                "signature": null,
                "docstring": "XML document content."
              }
            }
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.core.content.Content",
            "signature": "<bound method Class.signature of Class('Content', 39, 63)>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.core.content.Content.raw",
                "signature": null,
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.core.content.Content.source",
                "signature": null,
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.core.content.Content.content_type",
                "signature": null,
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.core.content.Content.metadata",
                "signature": null,
                "docstring": null
              }
            }
          }
        }
      },
      "parser": {
        "name": "parser",
        "kind": "module",
        "path": "omniread.core.parser",
        "signature": null,
        "docstring": "Abstract parsing contracts for OmniRead.\n\nThis module defines the **format-agnostic parser interface** used to transform\nraw content into structured, typed representations.\n\nParsers are responsible for:\n- Interpreting a single `Content` instance\n- Validating compatibility with the content type\n- Producing a structured output suitable for downstream consumers\n\nParsers are not responsible for:\n- Fetching or acquiring content\n- Performing retries or error recovery\n- Managing multiple content sources",
        "members": {
          "ABC": {
            "name": "ABC",
            "kind": "alias",
            "path": "omniread.core.parser.ABC",
            "signature": "<bound method Alias.signature of Alias('ABC', 'abc.ABC')>",
            "docstring": null
          },
          "abstractmethod": {
            "name": "abstractmethod",
            "kind": "alias",
            "path": "omniread.core.parser.abstractmethod",
            "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
            "docstring": null
          },
          "Generic": {
            "name": "Generic",
            "kind": "alias",
            "path": "omniread.core.parser.Generic",
            "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
            "docstring": null
          },
          "TypeVar": {
            "name": "TypeVar",
            "kind": "alias",
            "path": "omniread.core.parser.TypeVar",
            "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
            "docstring": null
          },
          "Set": {
            "name": "Set",
            "kind": "alias",
            "path": "omniread.core.parser.Set",
            "signature": "<bound method Alias.signature of Alias('Set', 'typing.Set')>",
            "docstring": null
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.core.parser.Content",
            "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.core.parser.Content.raw",
                "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.core.parser.Content.source",
                "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.core.parser.Content.content_type",
                "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.core.parser.Content.metadata",
                "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
                "docstring": null
              }
            }
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.core.parser.ContentType",
            "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.core.parser.ContentType.HTML",
                "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.core.parser.ContentType.PDF",
                "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.core.parser.ContentType.JSON",
                "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.core.parser.ContentType.XML",
                "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
                "docstring": "XML document content."
              }
            }
          },
          "T": {
            "name": "T",
            "kind": "attribute",
            "path": "omniread.core.parser.T",
            "signature": null,
            "docstring": null
          },
          "BaseParser": {
            "name": "BaseParser",
            "kind": "class",
            "path": "omniread.core.parser.BaseParser",
            "signature": "<bound method Class.signature of Class('BaseParser', 26, 98)>",
            "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
            "members": {
              "supported_types": {
                "name": "supported_types",
                "kind": "attribute",
                "path": "omniread.core.parser.BaseParser.supported_types",
                "signature": null,
                "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
              },
              "content": {
                "name": "content",
                "kind": "attribute",
                "path": "omniread.core.parser.BaseParser.content",
                "signature": null,
                "docstring": null
              },
              "parse": {
                "name": "parse",
                "kind": "function",
                "path": "omniread.core.parser.BaseParser.parse",
                "signature": "<bound method Function.signature of Function('parse', 68, 82)>",
                "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
              },
              "supports": {
                "name": "supports",
                "kind": "function",
                "path": "omniread.core.parser.BaseParser.supports",
                "signature": "<bound method Function.signature of Function('supports', 84, 98)>",
                "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
              }
            }
          }
        }
      },
      "scraper": {
        "name": "scraper",
        "kind": "module",
        "path": "omniread.core.scraper",
        "signature": null,
        "docstring": "Abstract scraping contracts for OmniRead.\n\nThis module defines the **format-agnostic scraper interface** responsible for\nacquiring raw content from external sources.\n\nScrapers are responsible for:\n- Locating and retrieving raw content bytes\n- Attaching minimal contextual metadata\n- Returning normalized `Content` objects\n\nScrapers are explicitly NOT responsible for:\n- Parsing or interpreting content\n- Inferring structure or semantics\n- Performing content-type specific processing\n\nAll interpretation must be delegated to parsers.",
        "members": {
          "ABC": {
            "name": "ABC",
            "kind": "alias",
            "path": "omniread.core.scraper.ABC",
            "signature": "<bound method Alias.signature of Alias('ABC', 'abc.ABC')>",
            "docstring": null
          },
          "abstractmethod": {
            "name": "abstractmethod",
            "kind": "alias",
            "path": "omniread.core.scraper.abstractmethod",
            "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
            "docstring": null
          },
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.core.scraper.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "Mapping": {
            "name": "Mapping",
            "kind": "alias",
            "path": "omniread.core.scraper.Mapping",
            "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
            "docstring": null
          },
          "Optional": {
            "name": "Optional",
            "kind": "alias",
            "path": "omniread.core.scraper.Optional",
            "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
            "docstring": null
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.core.scraper.Content",
            "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.core.scraper.Content.raw",
                "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.core.scraper.Content.source",
                "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.core.scraper.Content.content_type",
                "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.core.scraper.Content.metadata",
                "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
                "docstring": null
              }
            }
          },
          "BaseScraper": {
            "name": "BaseScraper",
            "kind": "class",
            "path": "omniread.core.scraper.BaseScraper",
            "signature": "<bound method Class.signature of Class('BaseScraper', 26, 75)>",
            "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.core.scraper.BaseScraper.fetch",
                "signature": "<bound method Function.signature of Function('fetch', 49, 75)>",
                "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content object containing raw bytes and metadata.\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
              }
            }
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.core.parser.json
+++ b/mcp_docs/modules/omniread.core.parser.json
@@ -0,0 +1,162 @@
 {
  "module": "omniread.core.parser",
  "content": {
    "path": "omniread.core.parser",
    "docstring": "Abstract parsing contracts for OmniRead.\n\nThis module defines the **format-agnostic parser interface** used to transform\nraw content into structured, typed representations.\n\nParsers are responsible for:\n- Interpreting a single `Content` instance\n- Validating compatibility with the content type\n- Producing a structured output suitable for downstream consumers\n\nParsers are not responsible for:\n- Fetching or acquiring content\n- Performing retries or error recovery\n- Managing multiple content sources",
    "objects": {
      "ABC": {
        "name": "ABC",
        "kind": "alias",
        "path": "omniread.core.parser.ABC",
        "signature": "<bound method Alias.signature of Alias('ABC', 'abc.ABC')>",
        "docstring": null
      },
      "abstractmethod": {
        "name": "abstractmethod",
        "kind": "alias",
        "path": "omniread.core.parser.abstractmethod",
        "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
        "docstring": null
      },
      "Generic": {
        "name": "Generic",
        "kind": "alias",
        "path": "omniread.core.parser.Generic",
        "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
        "docstring": null
      },
      "TypeVar": {
        "name": "TypeVar",
        "kind": "alias",
        "path": "omniread.core.parser.TypeVar",
        "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
        "docstring": null
      },
      "Set": {
        "name": "Set",
        "kind": "alias",
        "path": "omniread.core.parser.Set",
        "signature": "<bound method Alias.signature of Alias('Set', 'typing.Set')>",
        "docstring": null
      },
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.core.parser.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.core.parser.Content.raw",
            "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.core.parser.Content.source",
            "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.core.parser.Content.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.core.parser.Content.metadata",
            "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
            "docstring": null
          }
        }
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.core.parser.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.core.parser.ContentType.HTML",
            "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.core.parser.ContentType.PDF",
            "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.core.parser.ContentType.JSON",
            "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.core.parser.ContentType.XML",
            "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
            "docstring": "XML document content."
          }
        }
      },
      "T": {
        "name": "T",
        "kind": "attribute",
        "path": "omniread.core.parser.T",
        "signature": null,
        "docstring": null
      },
      "BaseParser": {
        "name": "BaseParser",
        "kind": "class",
        "path": "omniread.core.parser.BaseParser",
        "signature": "<bound method Class.signature of Class('BaseParser', 26, 98)>",
        "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.core.parser.BaseParser.supported_types",
            "signature": null,
            "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
          },
          "content": {
            "name": "content",
            "kind": "attribute",
            "path": "omniread.core.parser.BaseParser.content",
            "signature": null,
            "docstring": null
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.core.parser.BaseParser.parse",
            "signature": "<bound method Function.signature of Function('parse', 68, 82)>",
            "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
          },
          "supports": {
            "name": "supports",
            "kind": "function",
            "path": "omniread.core.parser.BaseParser.supports",
            "signature": "<bound method Function.signature of Function('supports', 84, 98)>",
            "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.core.scraper.json
+++ b/mcp_docs/modules/omniread.core.scraper.json
@@ -0,0 +1,97 @@
 {
  "module": "omniread.core.scraper",
  "content": {
    "path": "omniread.core.scraper",
    "docstring": "Abstract scraping contracts for OmniRead.\n\nThis module defines the **format-agnostic scraper interface** responsible for\nacquiring raw content from external sources.\n\nScrapers are responsible for:\n- Locating and retrieving raw content bytes\n- Attaching minimal contextual metadata\n- Returning normalized `Content` objects\n\nScrapers are explicitly NOT responsible for:\n- Parsing or interpreting content\n- Inferring structure or semantics\n- Performing content-type specific processing\n\nAll interpretation must be delegated to parsers.",
    "objects": {
      "ABC": {
        "name": "ABC",
        "kind": "alias",
        "path": "omniread.core.scraper.ABC",
        "signature": "<bound method Alias.signature of Alias('ABC', 'abc.ABC')>",
        "docstring": null
      },
      "abstractmethod": {
        "name": "abstractmethod",
        "kind": "alias",
        "path": "omniread.core.scraper.abstractmethod",
        "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
        "docstring": null
      },
      "Any": {
        "name": "Any",
        "kind": "alias",
        "path": "omniread.core.scraper.Any",
        "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
        "docstring": null
      },
      "Mapping": {
        "name": "Mapping",
        "kind": "alias",
        "path": "omniread.core.scraper.Mapping",
        "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
        "docstring": null
      },
      "Optional": {
        "name": "Optional",
        "kind": "alias",
        "path": "omniread.core.scraper.Optional",
        "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
        "docstring": null
      },
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.core.scraper.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.core.scraper.Content.raw",
            "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.core.scraper.Content.source",
            "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.core.scraper.Content.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.core.scraper.Content.metadata",
            "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
            "docstring": null
          }
        }
      },
      "BaseScraper": {
        "name": "BaseScraper",
        "kind": "class",
        "path": "omniread.core.scraper.BaseScraper",
        "signature": "<bound method Class.signature of Class('BaseScraper', 26, 75)>",
        "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.core.scraper.BaseScraper.fetch",
            "signature": "<bound method Function.signature of Function('fetch', 49, 75)>",
            "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content object containing raw bytes and metadata.\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.html.json
+++ b/mcp_docs/modules/omniread.html.json
@@ -0,0 +1,488 @@
 {
  "module": "omniread.html",
  "content": {
    "path": "omniread.html",
    "docstring": "HTML format implementation for OmniRead.\n\nThis package provides **HTML-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nIt includes:\n- HTML parsers that interpret HTML content\n- HTML scrapers that retrieve HTML documents\n\nThis package:\n- Implements, but does not redefine, core contracts\n- May contain HTML-specific behavior and edge-case handling\n- Produces canonical content models defined in `omniread.core.content`\n\nConsumers should depend on `omniread.core` interfaces wherever possible and\nuse this package only when HTML-specific behavior is required.",
    "objects": {
      "HTMLScraper": {
        "name": "HTMLScraper",
        "kind": "class",
        "path": "omniread.html.HTMLScraper",
        "signature": "<bound method Alias.signature of Alias('HTMLScraper', 'omniread.html.scraper.HTMLScraper')>",
        "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses",
        "members": {
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.html.HTMLScraper.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.html.scraper.HTMLScraper.content_type')>",
            "docstring": null
          },
          "validate_content_type": {
            "name": "validate_content_type",
            "kind": "function",
            "path": "omniread.html.HTMLScraper.validate_content_type",
            "signature": "<bound method Alias.signature of Alias('validate_content_type', 'omniread.html.scraper.HTMLScraper.validate_content_type')>",
            "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n    response: HTTP response returned by `httpx`.\n\nRaises:\n    ValueError: If the `Content-Type` header is missing or does not\n        indicate HTML content."
          },
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.html.HTMLScraper.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.html.scraper.HTMLScraper.fetch')>",
            "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n    source: URL of the HTML document.\n    metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw HTML bytes\n    - Source URL\n    - HTML content type\n    - HTTP response metadata\n\nRaises:\n    httpx.HTTPError: If the HTTP request fails.\n    ValueError: If the response is not valid HTML."
          }
        }
      },
      "HTMLParser": {
        "name": "HTMLParser",
        "kind": "class",
        "path": "omniread.html.HTMLParser",
        "signature": "<bound method Alias.signature of Alias('HTMLParser', 'omniread.html.parser.HTMLParser')>",
        "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.html.HTMLParser.supported_types",
            "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.html.parser.HTMLParser.supported_types')>",
            "docstring": "Set of content types supported by this parser (HTML only)."
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse",
            "signature": "<bound method Alias.signature of Alias('parse', 'omniread.html.parser.HTMLParser.parse')>",
            "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`."
          },
          "parse_div": {
            "name": "parse_div",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_div",
            "signature": "<bound method Alias.signature of Alias('parse_div', 'omniread.html.parser.HTMLParser.parse_div')>",
            "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n    div: BeautifulSoup tag representing a `<div>`.\n    separator: String used to separate text nodes.\n\nReturns:\n    Flattened, whitespace-normalized text content."
          },
          "parse_link": {
            "name": "parse_link",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_link",
            "signature": "<bound method Alias.signature of Alias('parse_link', 'omniread.html.parser.HTMLParser.parse_link')>",
            "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n    a: BeautifulSoup tag representing an anchor.\n\nReturns:\n    The value of the `href` attribute, or None if absent."
          },
          "parse_table": {
            "name": "parse_table",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_table",
            "signature": "<bound method Alias.signature of Alias('parse_table', 'omniread.html.parser.HTMLParser.parse_table')>",
            "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n    table: BeautifulSoup tag representing a `<table>`.\n\nReturns:\n    A list of rows, where each row is a list of cell text values."
          },
          "parse_meta": {
            "name": "parse_meta",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_meta",
            "signature": "<bound method Alias.signature of Alias('parse_meta', 'omniread.html.parser.HTMLParser.parse_meta')>",
            "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `<meta>` tag name/property → content mappings\n\nReturns:\n    Dictionary containing extracted metadata."
          }
        }
      },
      "parser": {
        "name": "parser",
        "kind": "module",
        "path": "omniread.html.parser",
        "signature": null,
        "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.",
        "members": {
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.html.parser.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "Generic": {
            "name": "Generic",
            "kind": "alias",
            "path": "omniread.html.parser.Generic",
            "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
            "docstring": null
          },
          "TypeVar": {
            "name": "TypeVar",
            "kind": "alias",
            "path": "omniread.html.parser.TypeVar",
            "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
            "docstring": null
          },
          "Optional": {
            "name": "Optional",
            "kind": "alias",
            "path": "omniread.html.parser.Optional",
            "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
            "docstring": null
          },
          "abstractmethod": {
            "name": "abstractmethod",
            "kind": "alias",
            "path": "omniread.html.parser.abstractmethod",
            "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
            "docstring": null
          },
          "BeautifulSoup": {
            "name": "BeautifulSoup",
            "kind": "alias",
            "path": "omniread.html.parser.BeautifulSoup",
            "signature": "<bound method Alias.signature of Alias('BeautifulSoup', 'bs4.BeautifulSoup')>",
            "docstring": null
          },
          "Tag": {
            "name": "Tag",
            "kind": "alias",
            "path": "omniread.html.parser.Tag",
            "signature": "<bound method Alias.signature of Alias('Tag', 'bs4.Tag')>",
            "docstring": null
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.html.parser.ContentType",
            "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.HTML",
                "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.PDF",
                "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.JSON",
                "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.XML",
                "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
                "docstring": "XML document content."
              }
            }
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.html.parser.Content",
            "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.raw",
                "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.source",
                "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.content_type",
                "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.metadata",
                "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
                "docstring": null
              }
            }
          },
          "BaseParser": {
            "name": "BaseParser",
            "kind": "class",
            "path": "omniread.html.parser.BaseParser",
            "signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
            "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
            "members": {
              "supported_types": {
                "name": "supported_types",
                "kind": "attribute",
                "path": "omniread.html.parser.BaseParser.supported_types",
                "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
                "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
              },
              "content": {
                "name": "content",
                "kind": "attribute",
                "path": "omniread.html.parser.BaseParser.content",
                "signature": "<bound method Alias.signature of Alias('content', 'omniread.core.parser.BaseParser.content')>",
                "docstring": null
              },
              "parse": {
                "name": "parse",
                "kind": "function",
                "path": "omniread.html.parser.BaseParser.parse",
                "signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
                "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
              },
              "supports": {
                "name": "supports",
                "kind": "function",
                "path": "omniread.html.parser.BaseParser.supports",
                "signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
                "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
              }
            }
          },
          "T": {
            "name": "T",
            "kind": "attribute",
            "path": "omniread.html.parser.T",
            "signature": null,
            "docstring": null
          },
          "HTMLParser": {
            "name": "HTMLParser",
            "kind": "class",
            "path": "omniread.html.parser.HTMLParser",
            "signature": "<bound method Class.signature of Class('HTMLParser', 27, 177)>",
            "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method",
            "members": {
              "supported_types": {
                "name": "supported_types",
                "kind": "attribute",
                "path": "omniread.html.parser.HTMLParser.supported_types",
                "signature": null,
                "docstring": "Set of content types supported by this parser (HTML only)."
              },
              "parse": {
                "name": "parse",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse",
                "signature": "<bound method Function.signature of Function('parse', 70, 81)>",
                "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`."
              },
              "parse_div": {
                "name": "parse_div",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_div",
                "signature": "<bound method Function.signature of Function('parse_div', 87, 99)>",
                "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n    div: BeautifulSoup tag representing a `<div>`.\n    separator: String used to separate text nodes.\n\nReturns:\n    Flattened, whitespace-normalized text content."
              },
              "parse_link": {
                "name": "parse_link",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_link",
                "signature": "<bound method Function.signature of Function('parse_link', 101, 112)>",
                "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n    a: BeautifulSoup tag representing an anchor.\n\nReturns:\n    The value of the `href` attribute, or None if absent."
              },
              "parse_table": {
                "name": "parse_table",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_table",
                "signature": "<bound method Function.signature of Function('parse_table', 114, 133)>",
                "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n    table: BeautifulSoup tag representing a `<table>`.\n\nReturns:\n    A list of rows, where each row is a list of cell text values."
              },
              "parse_meta": {
                "name": "parse_meta",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_meta",
                "signature": "<bound method Function.signature of Function('parse_meta', 153, 177)>",
                "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `<meta>` tag name/property → content mappings\n\nReturns:\n    Dictionary containing extracted metadata."
              }
            }
          },
          "list": {
            "name": "list",
            "kind": "alias",
            "path": "omniread.html.parser.list",
            "signature": "<bound method Alias.signature of Alias('list', 'typing.list')>",
            "docstring": null
          },
          "dict": {
            "name": "dict",
            "kind": "alias",
            "path": "omniread.html.parser.dict",
            "signature": "<bound method Alias.signature of Alias('dict', 'typing.dict')>",
            "docstring": null
          }
        }
      },
      "scraper": {
        "name": "scraper",
        "kind": "module",
        "path": "omniread.html.scraper",
        "signature": null,
        "docstring": "HTML scraping implementation for OmniRead.\n\nThis module provides an HTTP-based scraper for retrieving HTML documents.\nIt implements the core `BaseScraper` contract using `httpx` as the transport\nlayer.\n\nThis scraper is responsible for:\n- Fetching raw HTML bytes over HTTP(S)\n- Validating response content type\n- Attaching HTTP metadata to the returned content\n\nThis scraper is not responsible for:\n- Parsing or interpreting HTML\n- Retrying failed requests\n- Managing crawl policies or rate limiting",
        "members": {
          "httpx": {
            "name": "httpx",
            "kind": "alias",
            "path": "omniread.html.scraper.httpx",
            "signature": "<bound method Alias.signature of Alias('httpx', 'httpx')>",
            "docstring": null
          },
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.html.scraper.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "Mapping": {
            "name": "Mapping",
            "kind": "alias",
            "path": "omniread.html.scraper.Mapping",
            "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
            "docstring": null
          },
          "Optional": {
            "name": "Optional",
            "kind": "alias",
            "path": "omniread.html.scraper.Optional",
            "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
            "docstring": null
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.html.scraper.Content",
            "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.raw",
                "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.source",
                "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.content_type",
                "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.metadata",
                "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
                "docstring": null
              }
            }
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.html.scraper.ContentType",
            "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.HTML",
                "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.PDF",
                "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.JSON",
                "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.XML",
                "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
                "docstring": "XML document content."
              }
            }
          },
          "BaseScraper": {
            "name": "BaseScraper",
            "kind": "class",
            "path": "omniread.html.scraper.BaseScraper",
            "signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
            "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.html.scraper.BaseScraper.fetch",
                "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
                "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content object containing raw bytes and metadata.\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
              }
            }
          },
          "HTMLScraper": {
            "name": "HTMLScraper",
            "kind": "class",
            "path": "omniread.html.scraper.HTMLScraper",
            "signature": "<bound method Class.signature of Class('HTMLScraper', 26, 134)>",
            "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses",
            "members": {
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.html.scraper.HTMLScraper.content_type",
                "signature": null,
                "docstring": null
              },
              "validate_content_type": {
                "name": "validate_content_type",
                "kind": "function",
                "path": "omniread.html.scraper.HTMLScraper.validate_content_type",
                "signature": "<bound method Function.signature of Function('validate_content_type', 71, 94)>",
                "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n    response: HTTP response returned by `httpx`.\n\nRaises:\n    ValueError: If the `Content-Type` header is missing or does not\n        indicate HTML content."
              },
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.html.scraper.HTMLScraper.fetch",
                "signature": "<bound method Function.signature of Function('fetch', 96, 134)>",
                "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n    source: URL of the HTML document.\n    metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw HTML bytes\n    - Source URL\n    - HTML content type\n    - HTTP response metadata\n\nRaises:\n    httpx.HTTPError: If the HTTP request fails.\n    ValueError: If the response is not valid HTML."
              }
            }
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.html.parser.json
+++ b/mcp_docs/modules/omniread.html.parser.json
@@ -0,0 +1,241 @@
 {
  "module": "omniread.html.parser",
  "content": {
    "path": "omniread.html.parser",
    "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.",
    "objects": {
      "Any": {
        "name": "Any",
        "kind": "alias",
        "path": "omniread.html.parser.Any",
        "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
        "docstring": null
      },
      "Generic": {
        "name": "Generic",
        "kind": "alias",
        "path": "omniread.html.parser.Generic",
        "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
        "docstring": null
      },
      "TypeVar": {
        "name": "TypeVar",
        "kind": "alias",
        "path": "omniread.html.parser.TypeVar",
        "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
        "docstring": null
      },
      "Optional": {
        "name": "Optional",
        "kind": "alias",
        "path": "omniread.html.parser.Optional",
        "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
        "docstring": null
      },
      "abstractmethod": {
        "name": "abstractmethod",
        "kind": "alias",
        "path": "omniread.html.parser.abstractmethod",
        "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
        "docstring": null
      },
      "BeautifulSoup": {
        "name": "BeautifulSoup",
        "kind": "alias",
        "path": "omniread.html.parser.BeautifulSoup",
        "signature": "<bound method Alias.signature of Alias('BeautifulSoup', 'bs4.BeautifulSoup')>",
        "docstring": null
      },
      "Tag": {
        "name": "Tag",
        "kind": "alias",
        "path": "omniread.html.parser.Tag",
        "signature": "<bound method Alias.signature of Alias('Tag', 'bs4.Tag')>",
        "docstring": null
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.html.parser.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.HTML",
            "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.PDF",
            "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.JSON",
            "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.html.parser.ContentType.XML",
            "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
            "docstring": "XML document content."
          }
        }
      },
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.html.parser.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.raw",
            "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.source",
            "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.html.parser.Content.metadata",
            "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
            "docstring": null
          }
        }
      },
      "BaseParser": {
        "name": "BaseParser",
        "kind": "class",
        "path": "omniread.html.parser.BaseParser",
        "signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
        "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.html.parser.BaseParser.supported_types",
            "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
            "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
          },
          "content": {
            "name": "content",
            "kind": "attribute",
            "path": "omniread.html.parser.BaseParser.content",
            "signature": "<bound method Alias.signature of Alias('content', 'omniread.core.parser.BaseParser.content')>",
            "docstring": null
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.html.parser.BaseParser.parse",
            "signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
            "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
          },
          "supports": {
            "name": "supports",
            "kind": "function",
            "path": "omniread.html.parser.BaseParser.supports",
            "signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
            "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
          }
        }
      },
      "T": {
        "name": "T",
        "kind": "attribute",
        "path": "omniread.html.parser.T",
        "signature": null,
        "docstring": null
      },
      "HTMLParser": {
        "name": "HTMLParser",
        "kind": "class",
        "path": "omniread.html.parser.HTMLParser",
        "signature": "<bound method Class.signature of Class('HTMLParser', 27, 177)>",
        "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.html.parser.HTMLParser.supported_types",
            "signature": null,
            "docstring": "Set of content types supported by this parser (HTML only)."
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse",
            "signature": "<bound method Function.signature of Function('parse', 70, 81)>",
            "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`."
          },
          "parse_div": {
            "name": "parse_div",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_div",
            "signature": "<bound method Function.signature of Function('parse_div', 87, 99)>",
            "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n    div: BeautifulSoup tag representing a `<div>`.\n    separator: String used to separate text nodes.\n\nReturns:\n    Flattened, whitespace-normalized text content."
          },
          "parse_link": {
            "name": "parse_link",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_link",
            "signature": "<bound method Function.signature of Function('parse_link', 101, 112)>",
            "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n    a: BeautifulSoup tag representing an anchor.\n\nReturns:\n    The value of the `href` attribute, or None if absent."
          },
          "parse_table": {
            "name": "parse_table",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_table",
            "signature": "<bound method Function.signature of Function('parse_table', 114, 133)>",
            "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n    table: BeautifulSoup tag representing a `<table>`.\n\nReturns:\n    A list of rows, where each row is a list of cell text values."
          },
          "parse_meta": {
            "name": "parse_meta",
            "kind": "function",
            "path": "omniread.html.parser.HTMLParser.parse_meta",
            "signature": "<bound method Function.signature of Function('parse_meta', 153, 177)>",
            "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `<meta>` tag name/property → content mappings\n\nReturns:\n    Dictionary containing extracted metadata."
          }
        }
      },
      "list": {
        "name": "list",
        "kind": "alias",
        "path": "omniread.html.parser.list",
        "signature": "<bound method Alias.signature of Alias('list', 'typing.list')>",
        "docstring": null
      },
      "dict": {
        "name": "dict",
        "kind": "alias",
        "path": "omniread.html.parser.dict",
        "signature": "<bound method Alias.signature of Alias('dict', 'typing.dict')>",
        "docstring": null
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.html.scraper.json
+++ b/mcp_docs/modules/omniread.html.scraper.json
@@ -0,0 +1,157 @@
 {
  "module": "omniread.html.scraper",
  "content": {
    "path": "omniread.html.scraper",
    "docstring": "HTML scraping implementation for OmniRead.\n\nThis module provides an HTTP-based scraper for retrieving HTML documents.\nIt implements the core `BaseScraper` contract using `httpx` as the transport\nlayer.\n\nThis scraper is responsible for:\n- Fetching raw HTML bytes over HTTP(S)\n- Validating response content type\n- Attaching HTTP metadata to the returned content\n\nThis scraper is not responsible for:\n- Parsing or interpreting HTML\n- Retrying failed requests\n- Managing crawl policies or rate limiting",
    "objects": {
      "httpx": {
        "name": "httpx",
        "kind": "alias",
        "path": "omniread.html.scraper.httpx",
        "signature": "<bound method Alias.signature of Alias('httpx', 'httpx')>",
        "docstring": null
      },
      "Any": {
        "name": "Any",
        "kind": "alias",
        "path": "omniread.html.scraper.Any",
        "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
        "docstring": null
      },
      "Mapping": {
        "name": "Mapping",
        "kind": "alias",
        "path": "omniread.html.scraper.Mapping",
        "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
        "docstring": null
      },
      "Optional": {
        "name": "Optional",
        "kind": "alias",
        "path": "omniread.html.scraper.Optional",
        "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
        "docstring": null
      },
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.html.scraper.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.html.scraper.Content.raw",
            "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.html.scraper.Content.source",
            "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.html.scraper.Content.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.html.scraper.Content.metadata",
            "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
            "docstring": null
          }
        }
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.html.scraper.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.html.scraper.ContentType.HTML",
            "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.html.scraper.ContentType.PDF",
            "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.html.scraper.ContentType.JSON",
            "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.html.scraper.ContentType.XML",
            "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
            "docstring": "XML document content."
          }
        }
      },
      "BaseScraper": {
        "name": "BaseScraper",
        "kind": "class",
        "path": "omniread.html.scraper.BaseScraper",
        "signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
        "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.html.scraper.BaseScraper.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
            "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content object containing raw bytes and metadata.\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
          }
        }
      },
      "HTMLScraper": {
        "name": "HTMLScraper",
        "kind": "class",
        "path": "omniread.html.scraper.HTMLScraper",
        "signature": "<bound method Class.signature of Class('HTMLScraper', 26, 134)>",
        "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses",
        "members": {
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.html.scraper.HTMLScraper.content_type",
            "signature": null,
            "docstring": null
          },
          "validate_content_type": {
            "name": "validate_content_type",
            "kind": "function",
            "path": "omniread.html.scraper.HTMLScraper.validate_content_type",
            "signature": "<bound method Function.signature of Function('validate_content_type', 71, 94)>",
            "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n    response: HTTP response returned by `httpx`.\n\nRaises:\n    ValueError: If the `Content-Type` header is missing or does not\n        indicate HTML content."
          },
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.html.scraper.HTMLScraper.fetch",
            "signature": "<bound method Function.signature of Function('fetch', 96, 134)>",
            "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n    source: URL of the HTML document.\n    metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw HTML bytes\n    - Source URL\n    - HTML content type\n    - HTTP response metadata\n\nRaises:\n    httpx.HTTPError: If the HTTP request fails.\n    ValueError: If the response is not valid HTML."
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.json
+++ b/mcp_docs/modules/omniread.json
--- a/mcp_docs/modules/omniread.pdf.client.json
+++ b/mcp_docs/modules/omniread.pdf.client.json
@@ -0,0 +1,69 @@
 {
  "module": "omniread.pdf.client",
  "content": {
    "path": "omniread.pdf.client",
    "docstring": "PDF client abstractions for OmniRead.\n\nThis module defines the **client layer** responsible for retrieving raw PDF\nbytes from a concrete backing store.\n\nClients provide low-level access to PDF binaries and are intentionally\ndecoupled from scraping and parsing logic. They do not perform validation,\ninterpretation, or content extraction.\n\nTypical backing stores include:\n- Local filesystems\n- Object storage (S3, GCS, etc.)\n- Network file systems",
    "objects": {
      "Any": {
        "name": "Any",
        "kind": "alias",
        "path": "omniread.pdf.client.Any",
        "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
        "docstring": null
      },
      "ABC": {
        "name": "ABC",
        "kind": "alias",
        "path": "omniread.pdf.client.ABC",
        "signature": "<bound method Alias.signature of Alias('ABC', 'abc.ABC')>",
        "docstring": null
      },
      "abstractmethod": {
        "name": "abstractmethod",
        "kind": "alias",
        "path": "omniread.pdf.client.abstractmethod",
        "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
        "docstring": null
      },
      "Path": {
        "name": "Path",
        "kind": "alias",
        "path": "omniread.pdf.client.Path",
        "signature": "<bound method Alias.signature of Alias('Path', 'pathlib.Path')>",
        "docstring": null
      },
      "BasePDFClient": {
        "name": "BasePDFClient",
        "kind": "class",
        "path": "omniread.pdf.client.BasePDFClient",
        "signature": "<bound method Class.signature of Class('BasePDFClient', 22, 48)>",
        "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.client.BasePDFClient.fetch",
            "signature": "<bound method Function.signature of Function('fetch', 33, 48)>",
            "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n    source: Identifier of the PDF location, such as a file path,\n        object storage key, or remote reference.\n\nReturns:\n    Raw PDF bytes.\n\nRaises:\n    Exception: Retrieval-specific errors defined by the implementation."
          }
        }
      },
      "FileSystemPDFClient": {
        "name": "FileSystemPDFClient",
        "kind": "class",
        "path": "omniread.pdf.client.FileSystemPDFClient",
        "signature": "<bound method Class.signature of Class('FileSystemPDFClient', 51, 80)>",
        "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.client.FileSystemPDFClient.fetch",
            "signature": "<bound method Function.signature of Function('fetch', 59, 80)>",
            "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n    path: Filesystem path to the PDF file.\n\nReturns:\n    Raw PDF bytes.\n\nRaises:\n    FileNotFoundError: If the path does not exist.\n    ValueError: If the path exists but is not a file."
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.pdf.json
+++ b/mcp_docs/modules/omniread.pdf.json
@@ -0,0 +1,419 @@
 {
  "module": "omniread.pdf",
  "content": {
    "path": "omniread.pdf",
    "docstring": "PDF format implementation for OmniRead.\n\nThis package provides **PDF-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nUnlike HTML, PDF handling requires an explicit client layer for document\naccess. This package therefore includes:\n- PDF clients for acquiring raw PDF data\n- PDF scrapers that coordinate client access\n- PDF parsers that extract structured content from PDF binaries\n\nPublic exports from this package represent the supported PDF pipeline\nand are safe for consumers to import directly when working with PDFs.",
    "objects": {
      "FileSystemPDFClient": {
        "name": "FileSystemPDFClient",
        "kind": "class",
        "path": "omniread.pdf.FileSystemPDFClient",
        "signature": "<bound method Alias.signature of Alias('FileSystemPDFClient', 'omniread.pdf.client.FileSystemPDFClient')>",
        "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.FileSystemPDFClient.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.pdf.client.FileSystemPDFClient.fetch')>",
            "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n    path: Filesystem path to the PDF file.\n\nReturns:\n    Raw PDF bytes.\n\nRaises:\n    FileNotFoundError: If the path does not exist.\n    ValueError: If the path exists but is not a file."
          }
        }
      },
      "PDFScraper": {
        "name": "PDFScraper",
        "kind": "class",
        "path": "omniread.pdf.PDFScraper",
        "signature": "<bound method Alias.signature of Alias('PDFScraper', 'omniread.pdf.scraper.PDFScraper')>",
        "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.PDFScraper.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.pdf.scraper.PDFScraper.fetch')>",
            "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n    source: Identifier of the PDF source as understood by the\n        configured PDF client.\n    metadata: Optional metadata to attach to the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw PDF bytes\n    - Source identifier\n    - PDF content type\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors raised by the PDF client."
          }
        }
      },
      "PDFParser": {
        "name": "PDFParser",
        "kind": "class",
        "path": "omniread.pdf.PDFParser",
        "signature": "<bound method Alias.signature of Alias('PDFParser', 'omniread.pdf.parser.PDFParser')>",
        "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must:\n- Define the output type `T`\n- Implement the `parse()` method",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.pdf.PDFParser.supported_types",
            "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.pdf.parser.PDFParser.supported_types')>",
            "docstring": "Set of content types supported by this parser (PDF only)."
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.pdf.PDFParser.parse",
            "signature": "<bound method Alias.signature of Alias('parse', 'omniread.pdf.parser.PDFParser.parse')>",
            "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
          }
        }
      },
      "client": {
        "name": "client",
        "kind": "module",
        "path": "omniread.pdf.client",
        "signature": null,
        "docstring": "PDF client abstractions for OmniRead.\n\nThis module defines the **client layer** responsible for retrieving raw PDF\nbytes from a concrete backing store.\n\nClients provide low-level access to PDF binaries and are intentionally\ndecoupled from scraping and parsing logic. They do not perform validation,\ninterpretation, or content extraction.\n\nTypical backing stores include:\n- Local filesystems\n- Object storage (S3, GCS, etc.)\n- Network file systems",
        "members": {
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.pdf.client.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "ABC": {
            "name": "ABC",
            "kind": "alias",
            "path": "omniread.pdf.client.ABC",
            "signature": "<bound method Alias.signature of Alias('ABC', 'abc.ABC')>",
            "docstring": null
          },
          "abstractmethod": {
            "name": "abstractmethod",
            "kind": "alias",
            "path": "omniread.pdf.client.abstractmethod",
            "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
            "docstring": null
          },
          "Path": {
            "name": "Path",
            "kind": "alias",
            "path": "omniread.pdf.client.Path",
            "signature": "<bound method Alias.signature of Alias('Path', 'pathlib.Path')>",
            "docstring": null
          },
          "BasePDFClient": {
            "name": "BasePDFClient",
            "kind": "class",
            "path": "omniread.pdf.client.BasePDFClient",
            "signature": "<bound method Class.signature of Class('BasePDFClient', 22, 48)>",
            "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.pdf.client.BasePDFClient.fetch",
                "signature": "<bound method Function.signature of Function('fetch', 33, 48)>",
                "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n    source: Identifier of the PDF location, such as a file path,\n        object storage key, or remote reference.\n\nReturns:\n    Raw PDF bytes.\n\nRaises:\n    Exception: Retrieval-specific errors defined by the implementation."
              }
            }
          },
          "FileSystemPDFClient": {
            "name": "FileSystemPDFClient",
            "kind": "class",
            "path": "omniread.pdf.client.FileSystemPDFClient",
            "signature": "<bound method Class.signature of Class('FileSystemPDFClient', 51, 80)>",
            "docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.pdf.client.FileSystemPDFClient.fetch",
                "signature": "<bound method Function.signature of Function('fetch', 59, 80)>",
                "docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n    path: Filesystem path to the PDF file.\n\nReturns:\n    Raw PDF bytes.\n\nRaises:\n    FileNotFoundError: If the path does not exist.\n    ValueError: If the path exists but is not a file."
              }
            }
          }
        }
      },
      "parser": {
        "name": "parser",
        "kind": "module",
        "path": "omniread.pdf.parser",
        "signature": null,
        "docstring": "PDF parser base implementations for OmniRead.\n\nThis module defines the **PDF-specific parser contract**, extending the\nformat-agnostic `BaseParser` with constraints appropriate for PDF content.\n\nPDF parsers are responsible for interpreting binary PDF data and producing\nstructured representations suitable for downstream consumption.",
        "members": {
          "Generic": {
            "name": "Generic",
            "kind": "alias",
            "path": "omniread.pdf.parser.Generic",
            "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
            "docstring": null
          },
          "TypeVar": {
            "name": "TypeVar",
            "kind": "alias",
            "path": "omniread.pdf.parser.TypeVar",
            "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
            "docstring": null
          },
          "abstractmethod": {
            "name": "abstractmethod",
            "kind": "alias",
            "path": "omniread.pdf.parser.abstractmethod",
            "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
            "docstring": null
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.pdf.parser.ContentType",
            "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.pdf.parser.ContentType.HTML",
                "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.pdf.parser.ContentType.PDF",
                "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.pdf.parser.ContentType.JSON",
                "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.pdf.parser.ContentType.XML",
                "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
                "docstring": "XML document content."
              }
            }
          },
          "BaseParser": {
            "name": "BaseParser",
            "kind": "class",
            "path": "omniread.pdf.parser.BaseParser",
            "signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
            "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
            "members": {
              "supported_types": {
                "name": "supported_types",
                "kind": "attribute",
                "path": "omniread.pdf.parser.BaseParser.supported_types",
                "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
                "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
              },
              "content": {
                "name": "content",
                "kind": "attribute",
                "path": "omniread.pdf.parser.BaseParser.content",
                "signature": "<bound method Alias.signature of Alias('content', 'omniread.core.parser.BaseParser.content')>",
                "docstring": null
              },
              "parse": {
                "name": "parse",
                "kind": "function",
                "path": "omniread.pdf.parser.BaseParser.parse",
                "signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
                "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
              },
              "supports": {
                "name": "supports",
                "kind": "function",
                "path": "omniread.pdf.parser.BaseParser.supports",
                "signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
                "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
              }
            }
          },
          "T": {
            "name": "T",
            "kind": "attribute",
            "path": "omniread.pdf.parser.T",
            "signature": null,
            "docstring": null
          },
          "PDFParser": {
            "name": "PDFParser",
            "kind": "class",
            "path": "omniread.pdf.parser.PDFParser",
            "signature": "<bound method Class.signature of Class('PDFParser', 20, 49)>",
            "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must:\n- Define the output type `T`\n- Implement the `parse()` method",
            "members": {
              "supported_types": {
                "name": "supported_types",
                "kind": "attribute",
                "path": "omniread.pdf.parser.PDFParser.supported_types",
                "signature": null,
                "docstring": "Set of content types supported by this parser (PDF only)."
              },
              "parse": {
                "name": "parse",
                "kind": "function",
                "path": "omniread.pdf.parser.PDFParser.parse",
                "signature": "<bound method Function.signature of Function('parse', 35, 49)>",
                "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
              }
            }
          }
        }
      },
      "scraper": {
        "name": "scraper",
        "kind": "module",
        "path": "omniread.pdf.scraper",
        "signature": null,
        "docstring": "PDF scraping implementation for OmniRead.\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.",
        "members": {
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.pdf.scraper.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "Mapping": {
            "name": "Mapping",
            "kind": "alias",
            "path": "omniread.pdf.scraper.Mapping",
            "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
            "docstring": null
          },
          "Optional": {
            "name": "Optional",
            "kind": "alias",
            "path": "omniread.pdf.scraper.Optional",
            "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
            "docstring": null
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.pdf.scraper.Content",
            "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.Content.raw",
                "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.Content.source",
                "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.Content.content_type",
                "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.Content.metadata",
                "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
                "docstring": null
              }
            }
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.pdf.scraper.ContentType",
            "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.ContentType.HTML",
                "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.ContentType.PDF",
                "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.ContentType.JSON",
                "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.pdf.scraper.ContentType.XML",
                "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
                "docstring": "XML document content."
              }
            }
          },
          "BaseScraper": {
            "name": "BaseScraper",
            "kind": "class",
            "path": "omniread.pdf.scraper.BaseScraper",
            "signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
            "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.pdf.scraper.BaseScraper.fetch",
                "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
                "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content object containing raw bytes and metadata.\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
              }
            }
          },
          "BasePDFClient": {
            "name": "BasePDFClient",
            "kind": "class",
            "path": "omniread.pdf.scraper.BasePDFClient",
            "signature": "<bound method Alias.signature of Alias('BasePDFClient', 'omniread.pdf.client.BasePDFClient')>",
            "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.pdf.scraper.BasePDFClient.fetch",
                "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.pdf.client.BasePDFClient.fetch')>",
                "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n    source: Identifier of the PDF location, such as a file path,\n        object storage key, or remote reference.\n\nReturns:\n    Raw PDF bytes.\n\nRaises:\n    Exception: Retrieval-specific errors defined by the implementation."
              }
            }
          },
          "PDFScraper": {
            "name": "PDFScraper",
            "kind": "class",
            "path": "omniread.pdf.scraper.PDFScraper",
            "signature": "<bound method Class.signature of Class('PDFScraper', 18, 71)>",
            "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.pdf.scraper.PDFScraper.fetch",
                "signature": "<bound method Function.signature of Function('fetch', 40, 71)>",
                "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n    source: Identifier of the PDF source as understood by the\n        configured PDF client.\n    metadata: Optional metadata to attach to the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw PDF bytes\n    - Source identifier\n    - PDF content type\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors raised by the PDF client."
              }
            }
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.pdf.parser.json
+++ b/mcp_docs/modules/omniread.pdf.parser.json
@@ -0,0 +1,134 @@
 {
  "module": "omniread.pdf.parser",
  "content": {
    "path": "omniread.pdf.parser",
    "docstring": "PDF parser base implementations for OmniRead.\n\nThis module defines the **PDF-specific parser contract**, extending the\nformat-agnostic `BaseParser` with constraints appropriate for PDF content.\n\nPDF parsers are responsible for interpreting binary PDF data and producing\nstructured representations suitable for downstream consumption.",
    "objects": {
      "Generic": {
        "name": "Generic",
        "kind": "alias",
        "path": "omniread.pdf.parser.Generic",
        "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
        "docstring": null
      },
      "TypeVar": {
        "name": "TypeVar",
        "kind": "alias",
        "path": "omniread.pdf.parser.TypeVar",
        "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
        "docstring": null
      },
      "abstractmethod": {
        "name": "abstractmethod",
        "kind": "alias",
        "path": "omniread.pdf.parser.abstractmethod",
        "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
        "docstring": null
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.pdf.parser.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.pdf.parser.ContentType.HTML",
            "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.pdf.parser.ContentType.PDF",
            "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.pdf.parser.ContentType.JSON",
            "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.pdf.parser.ContentType.XML",
            "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
            "docstring": "XML document content."
          }
        }
      },
      "BaseParser": {
        "name": "BaseParser",
        "kind": "class",
        "path": "omniread.pdf.parser.BaseParser",
        "signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
        "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.pdf.parser.BaseParser.supported_types",
            "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
            "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
          },
          "content": {
            "name": "content",
            "kind": "attribute",
            "path": "omniread.pdf.parser.BaseParser.content",
            "signature": "<bound method Alias.signature of Alias('content', 'omniread.core.parser.BaseParser.content')>",
            "docstring": null
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.pdf.parser.BaseParser.parse",
            "signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
            "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
          },
          "supports": {
            "name": "supports",
            "kind": "function",
            "path": "omniread.pdf.parser.BaseParser.supports",
            "signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
            "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
          }
        }
      },
      "T": {
        "name": "T",
        "kind": "attribute",
        "path": "omniread.pdf.parser.T",
        "signature": null,
        "docstring": null
      },
      "PDFParser": {
        "name": "PDFParser",
        "kind": "class",
        "path": "omniread.pdf.parser.PDFParser",
        "signature": "<bound method Class.signature of Class('PDFParser', 20, 49)>",
        "docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.pdf.parser.PDFParser.supported_types",
            "signature": null,
            "docstring": "Set of content types supported by this parser (PDF only)."
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.pdf.parser.PDFParser.parse",
            "signature": "<bound method Function.signature of Function('parse', 35, 49)>",
            "docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/modules/omniread.pdf.scraper.json
+++ b/mcp_docs/modules/omniread.pdf.scraper.json
@@ -0,0 +1,152 @@
 {
  "module": "omniread.pdf.scraper",
  "content": {
    "path": "omniread.pdf.scraper",
    "docstring": "PDF scraping implementation for OmniRead.\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.",
    "objects": {
      "Any": {
        "name": "Any",
        "kind": "alias",
        "path": "omniread.pdf.scraper.Any",
        "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
        "docstring": null
      },
      "Mapping": {
        "name": "Mapping",
        "kind": "alias",
        "path": "omniread.pdf.scraper.Mapping",
        "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
        "docstring": null
      },
      "Optional": {
        "name": "Optional",
        "kind": "alias",
        "path": "omniread.pdf.scraper.Optional",
        "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
        "docstring": null
      },
      "Content": {
        "name": "Content",
        "kind": "class",
        "path": "omniread.pdf.scraper.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
        "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
        "members": {
          "raw": {
            "name": "raw",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.Content.raw",
            "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
            "docstring": null
          },
          "source": {
            "name": "source",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.Content.source",
            "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
            "docstring": null
          },
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.Content.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
            "docstring": null
          },
          "metadata": {
            "name": "metadata",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.Content.metadata",
            "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
            "docstring": null
          }
        }
      },
      "ContentType": {
        "name": "ContentType",
        "kind": "class",
        "path": "omniread.pdf.scraper.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
        "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.ContentType.HTML",
            "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
            "docstring": "HTML document content."
          },
          "PDF": {
            "name": "PDF",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.ContentType.PDF",
            "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
            "docstring": "PDF document content."
          },
          "JSON": {
            "name": "JSON",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.ContentType.JSON",
            "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
            "docstring": "JSON document content."
          },
          "XML": {
            "name": "XML",
            "kind": "attribute",
            "path": "omniread.pdf.scraper.ContentType.XML",
            "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
            "docstring": "XML document content."
          }
        }
      },
      "BaseScraper": {
        "name": "BaseScraper",
        "kind": "class",
        "path": "omniread.pdf.scraper.BaseScraper",
        "signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
        "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.scraper.BaseScraper.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
            "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content object containing raw bytes and metadata.\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
          }
        }
      },
      "BasePDFClient": {
        "name": "BasePDFClient",
        "kind": "class",
        "path": "omniread.pdf.scraper.BasePDFClient",
        "signature": "<bound method Alias.signature of Alias('BasePDFClient', 'omniread.pdf.client.BasePDFClient')>",
        "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.scraper.BasePDFClient.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.pdf.client.BasePDFClient.fetch')>",
            "docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n    source: Identifier of the PDF location, such as a file path,\n        object storage key, or remote reference.\n\nReturns:\n    Raw PDF bytes.\n\nRaises:\n    Exception: Retrieval-specific errors defined by the implementation."
          }
        }
      },
      "PDFScraper": {
        "name": "PDFScraper",
        "kind": "class",
        "path": "omniread.pdf.scraper.PDFScraper",
        "signature": "<bound method Class.signature of Class('PDFScraper', 18, 71)>",
        "docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.scraper.PDFScraper.fetch",
            "signature": "<bound method Function.signature of Function('fetch', 40, 71)>",
            "docstring": "Fetch a PDF document from the given source.\n\nArgs:\n    source: Identifier of the PDF source as understood by the\n        configured PDF client.\n    metadata: Optional metadata to attach to the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw PDF bytes\n    - Source identifier\n    - PDF content type\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors raised by the PDF client."
          }
        }
      }
    }
  }
 }
--- a/mcp_docs/nav.json
+++ b/mcp_docs/nav.json
@@ -0,0 +1,50 @@
 [
  {
    "module": "omniread",
    "resource": "doc://modules/omniread"
  },
  {
    "module": "omniread.core",
    "resource": "doc://modules/omniread.core"
  },
  {
    "module": "omniread.core.content",
    "resource": "doc://modules/omniread.core.content"
  },
  {
    "module": "omniread.core.parser",
    "resource": "doc://modules/omniread.core.parser"
  },
  {
    "module": "omniread.core.scraper",
    "resource": "doc://modules/omniread.core.scraper"
  },
  {
    "module": "omniread.html",
    "resource": "doc://modules/omniread.html"
  },
  {
    "module": "omniread.html.parser",
    "resource": "doc://modules/omniread.html.parser"
  },
  {
    "module": "omniread.html.scraper",
    "resource": "doc://modules/omniread.html.scraper"
  },
  {
    "module": "omniread.pdf",
    "resource": "doc://modules/omniread.pdf"
  },
  {
    "module": "omniread.pdf.client",
    "resource": "doc://modules/omniread.pdf.client"
  },
  {
    "module": "omniread.pdf.parser",
    "resource": "doc://modules/omniread.pdf.parser"
  },
  {
    "module": "omniread.pdf.scraper",
    "resource": "doc://modules/omniread.pdf.scraper"
  }
 ]
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -4,50 +4,51 @@ site_description: Format-agnostic document reading, parsing, and scraping framew
 theme:
  name: material
  palette:
-    - scheme: slate
+  - scheme: slate
-      primary: deep purple
+    primary: deep purple
-      accent: cyan
+    accent: cyan
  font:
    text: Inter
    code: JetBrains Mono
  features:
-    - navigation.tabs
+  - navigation.tabs
-    - navigation.expand
+  - navigation.expand
-    - navigation.top
+  - navigation.top
-    - navigation.instant
+  - navigation.instant
-    - content.code.copy
+  - content.code.copy
-    - content.code.annotate
+  - content.code.annotate
 plugins:
-  - search
+- search
-  - mkdocstrings:
+- mkdocstrings:
-      handlers:
+    handlers:
-        python:
+      python:
-          paths: ["."]
+        paths:
-          options:
+        - .
-            docstring_style: google
+        options:
-            show_source: false
+          docstring_style: google
-            show_signature_annotations: true
+          show_source: false
-            separate_signature: true
+          show_signature_annotations: true
-            merge_init_into_class: true
+          separate_signature: true
-            inherited_members: true
+          merge_init_into_class: true
-            annotations_path: brief
+          inherited_members: true
-            show_root_heading: true
+          annotations_path: brief
-            group_by_category: true
+          show_root_heading: true
          group_by_category: true
 nav:
-  - Home: index.md
+- Home: omniread/index.md
-
+- Core API:
-  - Core (Contracts):
+  - omniread/core/index.md
-      - Content Models: core/content.md
+  - omniread/core/content.md
-      - Parsers: core/parser.md
+  - omniread/core/parser.md
-      - Scrapers: core/scraper.md
+  - omniread/core/scraper.md
-
+- HTML Handling:
-  - HTML Implementation:
+  - omniread/html/index.md
-      - HTML Parser: html/parser.md
+  - omniread/html/parser.md
-      - HTML Scraper: html/scraper.md
+  - omniread/html/scraper.md
-
+- PDF Handling:
-  - PDF Implementation:
+  - omniread/pdf/index.md
-      - PDF Client: pdf/client.md
+  - omniread/pdf/client.md
-      - PDF Parser: pdf/parser.md
+  - omniread/pdf/parser.md
-      - PDF Scraper: pdf/scraper.md
+  - omniread/pdf/scraper.md
--- a/omniread/init.py
+++ b/omniread/init.py
@@ -94,6 +94,27 @@ PDF:
 - FileSystemPDFClient
 - PDFScraper
 - PDFParser
 ## Core Philosophy
 `OmniRead` is designed as a **decoupled content engine**:
 1. **Separation of Concerns**: Scrapers *fetch*, Parsers *interpret*. Neither knows about the other.
 2. **Normalized Exchange**: All components communicate via the `Content` model, ensuring a consistent contract.
 3. **Format Agnosticism**: The core logic is independent of whether the input is HTML, PDF, or JSON.
 ## Documentation Design
 For those extending `OmniRead`, follow these "AI-Native" docstring principles:
 ### For Humans
 - **Clear Contracts**: Explicitly state what a component is and is NOT responsible for.
 - **Runnable Examples**: Include small, logical snippets in the package `__init__.py`.
 ### For LLMs
 - **Structured Models**: Use dataclasses and enums for core data to ensure clean MCP JSON representation.
 - **Type Safety**: All public APIs must be fully typed and have corresponding `.pyi` stubs.
 - **Detailed Raises**: Include `ExceptionType: description` pairs in the `Raises` section to help agents handle errors gracefully.
 """
 from .core import Content, ContentType
--- a/omniread/init.pyi
+++ b/omniread/init.pyi
@@ -0,0 +1,13 @@
 from .core import Content, ContentType
 from .html import HTMLScraper, HTMLParser
 from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
 __all__ = [
    "Content",
    "ContentType",
    "HTMLScraper",
    "HTMLParser",
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
 ]
--- a/omniread/core/init.pyi
+++ b/omniread/core/init.pyi
@@ -0,0 +1,10 @@
 from .content import Content, ContentType
 from .parser import BaseParser
 from .scraper import BaseScraper
 __all__ = [
    "Content",
    "ContentType",
    "BaseParser",
    "BaseScraper",
 ]
--- a/omniread/core/content.pyi
+++ b/omniread/core/content.pyi
@@ -0,0 +1,15 @@
 from enum import Enum
 from typing import Any, Mapping, Optional
 class ContentType(str, Enum):
    """Supported MIME types for extracted content.

    Each member is both a `str` and an `Enum` value holding the MIME type
    string. The type is primarily used for routing content to the
    appropriate parser or downstream consumer.
    """
    HTML = "text/html"  # HTML document content.
    PDF = "application/pdf"  # PDF document content.
    JSON = "application/json"  # JSON document content.
    XML = "application/xml"  # XML document content.
 class Content:
    """Normalized representation of extracted content.

    A raw byte payload plus minimal metadata describing its origin and
    type. This is the exchange format between scrapers, parsers, and
    downstream consumers.
    """
    # Raw content bytes as retrieved from the source.
    raw: bytes
    # Identifier of the content origin (URL, file path, or logical name).
    source: str
    # MIME type of the content, if known; otherwise None.
    content_type: Optional[ContentType]
    # Optional, implementation-defined metadata (e.g., headers, encoding hints).
    metadata: Optional[Mapping[str, Any]]
    # `content_type` and `metadata` are optional arguments; their default
    # values are elided (`...`) in this stub.
    def __init__(self, raw: bytes, source: str, content_type: Optional[ContentType] = ..., metadata: Optional[Mapping[str, Any]] = ...) -> None: ...
--- a/omniread/core/parser.pyi
+++ b/omniread/core/parser.pyi
@@ -0,0 +1,13 @@
 from abc import ABC, abstractmethod
 from typing import Generic, TypeVar, Set
 from .content import Content, ContentType
 # Type of the structured output returned by a parser's `parse()`.
 T = TypeVar("T")
 class BaseParser(ABC, Generic[T]):
    """Base interface for all parsers.

    A parser is a self-contained object that owns the `Content` it is
    responsible for interpreting and produces an output of type `T`.
    """
    # Content types supported by this parser; an empty set indicates that
    # the parser is content-type agnostic.
    supported_types: Set[ContentType]
    # The content owned by this parser.
    content: Content
    # Takes the content this parser is responsible for interpreting.
    def __init__(self, content: Content) -> None: ...
    # Parse the owned content into a structured output of type `T`.
    # Implementations raise parsing-specific errors.
    @abstractmethod
    def parse(self) -> T: ...
    # Return True if the content's type is supported; False otherwise.
    def supports(self) -> bool: ...
--- a/omniread/core/scraper.pyi
+++ b/omniread/core/scraper.pyi
@@ -0,0 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Mapping, Optional
 from .content import Content
 class BaseScraper(ABC):
    """Base interface for all scrapers.

    A scraper is responsible only for fetching raw content (bytes) from a
    source and returning it as a `Content` object; it must not interpret
    or parse that content.
    """
    # Fetch raw content from `source` (URL, file path, S3 URI, etc.).
    # `metadata` is keyword-only and carries optional hints for the scraper
    # (headers, auth, etc.). Implementations raise retrieval-specific errors.
    @abstractmethod
    def fetch(self, source: str, *, metadata: Optional[Mapping[str, Any]] = ...) -> Content: ...
--- a/omniread/html/init.pyi
+++ b/omniread/html/init.pyi
@@ -0,0 +1,4 @@
 from .scraper import HTMLScraper
 from .parser import HTMLParser
 __all__ = ["HTMLScraper", "HTMLParser"]
--- a/omniread/html/parser.pyi
+++ b/omniread/html/parser.pyi
@@ -0,0 +1,18 @@
 # NOTE: `list` and `dict` are builtins and are not exported by `typing`;
 # the annotations below use the builtin generics directly, so no import
 # is needed for them.
 from typing import Any, Generic, TypeVar, Optional
 from bs4 import BeautifulSoup, Tag
 from omniread.core.content import ContentType, Content
 from omniread.core.parser import BaseParser
 # Type of the structured output returned by `HTMLParser.parse()`.
 T = TypeVar("T")
 class HTMLParser(BaseParser[T], Generic[T]):
    """HTML-specific parser built on `BaseParser`, generic over output type `T`.

    Besides `parse()`, it exposes helpers that operate on BeautifulSoup
    `Tag` objects.

    NOTE(review): helper semantics below are inferred from the signatures
    only; confirm against `omniread/html/parser.py`.
    """
    # Content types supported by this parser.
    supported_types: set[ContentType]
    # `content` is the content to interpret; `features` is an optional string,
    # presumably the BeautifulSoup parser backend name (TODO confirm).
    def __init__(self, content: Content, features: str = ...) -> None: ...
    # Parse the owned content into a structured output of type `T`.
    def parse(self) -> T: ...
    # Convert a tag to a string; `separator` is keyword-only and presumably
    # joins the extracted text fragments (TODO confirm).
    @staticmethod
    def parse_div(div: Tag, *, separator: str = ...) -> str: ...
    # Return a string for an anchor tag, or None; presumably the link target
    # when one is present (TODO confirm).
    @staticmethod
    def parse_link(a: Tag) -> Optional[str]: ...
    # Convert a table tag to a list of string lists; presumably one inner
    # list of cell texts per row (TODO confirm).
    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]: ...
    # Return a string-keyed mapping; presumably metadata extracted from the
    # parsed document (TODO confirm).
    def parse_meta(self) -> dict[str, Any]: ...
--- a/omniread/html/scraper.pyi
+++ b/omniread/html/scraper.pyi
@@ -0,0 +1,10 @@
 import httpx
 from typing import Any, Mapping, Optional
 from omniread.core.content import Content, ContentType
 from omniread.core.scraper import BaseScraper
 class HTMLScraper(BaseScraper):
    content_type: ContentType
    def __init__(self, *, client: Optional[httpx.Client] = ..., timeout: float = ..., headers: Optional[Mapping[str, str]] = ..., follow_redirects: bool = ...) -> None: ...
    def validate_content_type(self, response: httpx.Response) -> None: ...
    def fetch(self, source: str, *, metadata: Optional[Mapping[str, Any]] = ...) -> Content: ...
--- a/omniread/pdf/__init__.pyi
+++ b/omniread/pdf/__init__.pyi
@@ -0,0 +1,5 @@
 from .client import FileSystemPDFClient
 from .scraper import PDFScraper
 from .parser import PDFParser
 __all__ = ["FileSystemPDFClient", "PDFScraper", "PDFParser"]
--- a/omniread/pdf/client.py
+++ b/omniread/pdf/client.py
@@ -14,6 +14,7 @@ Typical backing stores include:
 - Network file systems
 """
 from typing import Any
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -30,7 +31,7 @@ class BasePDFClient(ABC):
    """
    @abstractmethod
-    def fetch(self, source: str) -> bytes:
+    def fetch(self, source: Any) -> bytes:
        """
        Fetch raw PDF bytes from the given source.
--- a/omniread/pdf/client.pyi
+++ b/omniread/pdf/client.pyi
@@ -0,0 +1,10 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any
 class BasePDFClient(ABC):
    @abstractmethod
    def fetch(self, source: Any) -> bytes: ...
 class FileSystemPDFClient(BasePDFClient):
    def fetch(self, source: Path | str) -> bytes: ...
--- a/omniread/pdf/parser.pyi
+++ b/omniread/pdf/parser.pyi
@@ -0,0 +1,11 @@
 from abc import abstractmethod
 from typing import Generic, TypeVar
 from omniread.core.content import ContentType
 from omniread.core.parser import BaseParser
 T = TypeVar("T")
 class PDFParser(BaseParser[T], Generic[T]):
    supported_types: set[ContentType]
    @abstractmethod
    def parse(self) -> T: ...
--- a/omniread/pdf/scraper.py
+++ b/omniread/pdf/scraper.py
@@ -39,7 +39,7 @@ class PDFScraper(BaseScraper):
    def fetch(
        self,
-        source: str,
+        source: Any,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
--- a/omniread/pdf/scraper.pyi
+++ b/omniread/pdf/scraper.pyi
@@ -0,0 +1,8 @@
 from typing import Any, Mapping, Optional
 from omniread.core.content import Content, ContentType
 from omniread.core.scraper import BaseScraper
 from .client import BasePDFClient
 class PDFScraper(BaseScraper):
    def __init__(self, *, client: BasePDFClient) -> None: ...
    def fetch(self, source: Any, *, metadata: Optional[Mapping[str, Any]] = ...) -> Content: ...
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,19 +0,0 @@
 httpx==0.27.0
 beautifulsoup4==4.12.0
 pydantic==2.12.3
 jinja2==3.1.6
 # lxml==5.2.0
 # Test Packages
 pytest==7.4.0
 pytest-asyncio==0.21.0
 pytest-cov==4.1.0
 # Doc Packages
 mkdocs==1.6.1
 mkdocs-material==9.6.23
 neoteroi-mkdocs==1.1.3
 pymdown-extensions==10.16.1
 mkdocs-swagger-ui-tag==0.7.2
 mkdocstrings==1.0.0
 mkdocstrings-python==2.0.1
 *.swp
 *.swo
 *~
 *.tmp
+site
@@ -1 +1,3 @@
+# Content
+
 ::: omniread.core.content
@@ -1 +1,3 @@
+# Core
+
 ::: omniread.core
@@ -1 +1,3 @@
+# Parser
+
 ::: omniread.core.parser
@@ -1 +1,3 @@
+# Scraper
+
 ::: omniread.core.scraper
@@ -1 +1,3 @@
+# Html
+
 ::: omniread.html
@@ -1 +1,3 @@
+# Parser
+
 ::: omniread.html.parser
@@ -1 +1,3 @@
+# Scraper
+
 ::: omniread.html.scraper
@@ -1 +1,3 @@
+# Client
+
 ::: omniread.pdf.client
@@ -1 +1,3 @@
+# Parser
+
 ::: omniread.pdf.parser