Updated docstrings and added README.md

2026-03-08 17:59:56 +05:30
parent 0fbf0ca0f0
commit de7d04eb1a
26 changed files with 546 additions and 406 deletions
--- a/mcp_docs/modules/omniread.pdf.scraper.json
+++ b/mcp_docs/modules/omniread.pdf.scraper.json
@@ -2,7 +2,7 @@
  "module": "omniread.pdf.scraper",
  "content": {
    "path": "omniread.pdf.scraper",
-    "docstring": "PDF scraping implementation for OmniRead.\n\n---\n\n## Summary\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.",
+    "docstring": "# Summary\n\nPDF scraping implementation for OmniRead.\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.",
    "objects": {
      "Any": {
        "name": "Any",
@@ -30,7 +30,7 @@
        "kind": "class",
        "path": "omniread.pdf.scraper.Content",
        "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
-        "docstring": "Normalized representation of extracted content.\n\nNotes:\n    **Responsibilities:**\n\n        - A `Content` instance represents a raw content payload along with minimal contextual metadata describing its origin and type\n        - This class is the primary exchange format between Scrapers, Parsers, and Downstream consumers",
+        "docstring": "Normalized representation of extracted content.\n\nNotes:\n    **Responsibilities:**\n\n        - A `Content` instance represents a raw content payload along with\n          minimal contextual metadata describing its origin and type.\n        - This class is the primary exchange format between scrapers,\n          parsers, and downstream consumers.",
        "members": {
          "raw": {
            "name": "raw",
@@ -67,7 +67,7 @@
        "kind": "class",
        "path": "omniread.pdf.scraper.ContentType",
        "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
-        "docstring": "Supported MIME types for extracted content.\n\nNotes:\n    **Guarantees:**\n\n        - This enum represents the declared or inferred media type of the content source\n        - It is primarily used for routing content to the appropriate parser or downstream consumer",
+        "docstring": "Supported MIME types for extracted content.\n\nNotes:\n    **Guarantees:**\n\n        - This enum represents the declared or inferred media type of the\n          content source.\n        - It is primarily used for routing content to the appropriate\n          parser or downstream consumer.",
        "members": {
          "HTML": {
            "name": "HTML",
@@ -104,14 +104,14 @@
        "kind": "class",
        "path": "omniread.pdf.scraper.BaseScraper",
        "signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
-        "docstring": "Base interface for all scrapers.\n\nNotes:\n    **Responsibilities:**\n\n        - A scraper is responsible ONLY for fetching raw content (bytes) from a source. It must not interpret or parse it\n        - A scraper is a stateless acquisition component that retrieves raw content from a source and returns it as a `Content` object\n        - Scrapers define how content is obtained, not what the content means\n        - Implementations may vary in transport mechanism, authentication strategy, retry and backoff behavior\n\n    **Constraints:**\n\n        - Implementations must not parse content, modify content semantics, or couple scraping logic to a specific parser",
+        "docstring": "Base interface for all scrapers.\n\nNotes:\n    **Responsibilities:**\n\n        - A scraper is responsible ONLY for fetching raw content (bytes)\n          from a source. It must not interpret or parse it.\n        - A scraper is a stateless acquisition component that retrieves raw\n          content from a source and returns it as a `Content` object.\n        - Scrapers define how content is obtained, not what the content means.\n        - Implementations may vary in transport mechanism, authentication\n          strategy, retry and backoff behavior.\n\n    **Constraints:**\n\n        - Implementations must not parse content, modify content semantics,\n          or couple scraping logic to a specific parser.",
        "members": {
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.pdf.scraper.BaseScraper.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
-            "docstring": "Fetch raw content from the given source.\n\nArgs:\n    source (str):\n        Location identifier (URL, file path, S3 URI, etc.)\n    metadata (Optional[Mapping[str, Any]], optional):\n        Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    Content:\n        Content object containing raw bytes and metadata.\n\nRaises:\n    Exception:\n        Retrieval-specific errors as defined by the implementation.\n\nNotes:\n    **Responsibilities:**\n\n        - Implementations must retrieve the content referenced by `source` and return it as raw bytes wrapped in a `Content` object"
+            "docstring": "Fetch raw content from the given source.\n\nArgs:\n    source (str):\n        Location identifier (URL, file path, S3 URI, etc.).\n\n    metadata (Optional[Mapping[str, Any]], optional):\n        Optional hints for the scraper (headers, auth, etc.).\n\nReturns:\n    Content:\n        Content object containing raw bytes and metadata.\n\nRaises:\n    Exception:\n        Retrieval-specific errors as defined by the implementation.\n\nNotes:\n    **Responsibilities:**\n\n        - Implementations must retrieve the content referenced by `source`\n          and return it as raw bytes wrapped in a `Content` object."
          }
        }
      },
@@ -120,7 +120,7 @@
        "kind": "class",
        "path": "omniread.pdf.scraper.BasePDFClient",
        "signature": "<bound method Alias.signature of Alias('BasePDFClient', 'omniread.pdf.client.BasePDFClient')>",
-        "docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nNotes:\n    **Responsibilities:**\n\n        - Implementations must accept a source identifier appropriate to the backing store, return the full PDF binary payload, and raise retrieval-specific errors on failure",
+        "docstring": "Abstract client responsible for retrieving PDF bytes.\n\nRetrieves bytes from a specific backing store (filesystem, S3, FTP, etc.).\n\nNotes:\n    **Responsibilities:**\n\n        - Implementations must accept a source identifier appropriate to\n          the backing store.\n        - Implementations must return the full PDF binary payload.\n        - Implementations must raise retrieval-specific errors on failure.",
        "members": {
          "fetch": {
            "name": "fetch",
@@ -135,8 +135,8 @@
        "name": "PDFScraper",
        "kind": "class",
        "path": "omniread.pdf.scraper.PDFScraper",
-        "signature": "<bound method Class.signature of Class('PDFScraper', 22, 77)>",
-        "docstring": "Scraper for PDF sources.\n\nNotes:\n    **Responsibilities:**\n\n        - Delegates byte retrieval to a PDF client and normalizes output into Content\n        - Preserves caller-provided metadata\n\n    **Constraints:**\n    \n        - The scraper: Does not perform parsing or interpretation, does not assume a specific storage backend",
+        "signature": "<bound method Class.signature of Class('PDFScraper', 20, 77)>",
+        "docstring": "Scraper for PDF sources.\n\nNotes:\n    **Responsibilities:**\n\n        - Delegates byte retrieval to a PDF client and normalizes output\n          into `Content`.\n        - Preserves caller-provided metadata.\n\n    **Constraints:**\n\n        - The scraper does not perform parsing or interpretation.\n        - The scraper does not assume a specific storage backend.",
        "members": {
          "fetch": {
            "name": "fetch",