This commit is contained in:
2026-03-08 00:41:28 +05:30
parent 5842e6a227
commit 0fbf0ca0f0
14 changed files with 430 additions and 398 deletions

View File

@@ -2,21 +2,21 @@
"module": "omniread.pdf",
"content": {
"path": "omniread.pdf",
"docstring": "PDF format implementation for OmniRead.\n\nThis package provides **PDF-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nUnlike HTML, PDF handling requires an explicit client layer for document\naccess. This package therefore includes:\n- PDF clients for acquiring raw PDF data\n- PDF scrapers that coordinate client access\n- PDF parsers that extract structured content from PDF binaries\n\nPublic exports from this package represent the supported PDF pipeline\nand are safe for consumers to import directly when working with PDFs.",
"docstring": "PDF format implementation for OmniRead.\n\n---\n\n## Summary\n\nThis package provides **PDF-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nUnlike HTML, PDF handling requires an explicit client layer for document\naccess. This package therefore includes:\n- PDF clients for acquiring raw PDF data\n- PDF scrapers that coordinate client access\n- PDF parsers that extract structured content from PDF binaries\n\nPublic exports from this package represent the supported PDF pipeline\nand are safe for consumers to import directly when working with PDFs.\n\n---\n\n## Public API\n\n FileSystemPDFClient\n PDFScraper\n PDFParser\n\n---",
"objects": {
"FileSystemPDFClient": {
"name": "FileSystemPDFClient",
"kind": "class",
"path": "omniread.pdf.FileSystemPDFClient",
"signature": "<bound method Alias.signature of Alias('FileSystemPDFClient', 'omniread.pdf.client.FileSystemPDFClient')>",
"docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.",
"docstring": "PDF client that reads from the local filesystem.\n\nNotes:\n **Guarantees:**\n\n - This client reads PDF files directly from the disk and returns their raw binary contents",
"members": {
"fetch": {
"name": "fetch",
"kind": "function",
"path": "omniread.pdf.FileSystemPDFClient.fetch",
"signature": "<bound method Alias.signature of Alias('fetch', 'omniread.pdf.client.FileSystemPDFClient.fetch')>",
"docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file."
"docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path (Path):\n Filesystem path to the PDF file.\n\nReturns:\n bytes:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError:\n If the path does not exist.\n ValueError:\n If the path exists but is not a file."
}
}
},
@@ -25,14 +25,14 @@
"kind": "class",
"path": "omniread.pdf.PDFScraper",
"signature": "<bound method Alias.signature of Alias('PDFScraper', 'omniread.pdf.scraper.PDFScraper')>",
"docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata",
"docstring": "Scraper for PDF sources.\n\nNotes:\n **Responsibilities:**\n\n - Delegates byte retrieval to a PDF client and normalizes output into Content\n - Preserves caller-provided metadata\n\n **Constraints:**\n \n - The scraper: Does not perform parsing or interpretation, does not assume a specific storage backend",
"members": {
"fetch": {
"name": "fetch",
"kind": "function",
"path": "omniread.pdf.PDFScraper.fetch",
"signature": "<bound method Alias.signature of Alias('fetch', 'omniread.pdf.scraper.PDFScraper.fetch')>",
"docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client."
"docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source (Any):\n Identifier of the PDF source as understood by the configured PDF client.\n metadata (Optional[Mapping[str, Any]], optional):\n Optional metadata to attach to the returned content.\n\nReturns:\n Content:\n A `Content` instance containing raw PDF bytes, source identifier, PDF content type, and optional metadata.\n\nRaises:\n Exception:\n Retrieval-specific errors raised by the PDF client."
}
}
},
@@ -41,7 +41,7 @@
"kind": "class",
"path": "omniread.pdf.PDFParser",
"signature": "<bound method Alias.signature of Alias('PDFParser', 'omniread.pdf.parser.PDFParser')>",
"docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method",
"docstring": "Base PDF parser.\n\nNotes:\n **Responsibilities:**\n\n - This class enforces PDF content-type compatibility and provides the extension point for implementing concrete PDF parsing strategies\n\n **Constraints:**\n\n - Concrete implementations must: Define the output type `T`, implement the `parse()` method",
"members": {
"supported_types": {
"name": "supported_types",
@@ -55,7 +55,7 @@
"kind": "function",
"path": "omniread.pdf.PDFParser.parse",
"signature": "<bound method Alias.signature of Alias('parse', 'omniread.pdf.parser.PDFParser.parse')>",
"docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation."
"docstring": "Parse PDF content into a structured output.\n\nReturns:\n T:\n Parsed representation of type `T`.\n\nRaises:\n Exception:\n Parsing-specific errors as defined by the implementation.\n\nNotes:\n **Responsibilities:**\n\n - Implementations must fully interpret the PDF binary payload and return a deterministic, structured output"
}
}
},
@@ -64,7 +64,7 @@
"kind": "module",
"path": "omniread.pdf.client",
"signature": null,
"docstring": "PDF client abstractions for OmniRead.\n\nThis module defines the **client layer** responsible for retrieving raw PDF\nbytes from a concrete backing store.\n\nClients provide low-level access to PDF binaries and are intentionally\ndecoupled from scraping and parsing logic. They do not perform validation,\ninterpretation, or content extraction.\n\nTypical backing stores include:\n- Local filesystems\n- Object storage (S3, GCS, etc.)\n- Network file systems",
"docstring": "PDF client abstractions for OmniRead.\n\n---\n\n## Summary\n\nThis module defines the **client layer** responsible for retrieving raw PDF\nbytes from a concrete backing store.\n\nClients provide low-level access to PDF binaries and are intentionally\ndecoupled from scraping and parsing logic. They do not perform validation,\ninterpretation, or content extraction.\n\nTypical backing stores include:\n- Local filesystems\n- Object storage (S3, GCS, etc.)\n- Network file systems",
"members": {
"Any": {
"name": "Any",
@@ -98,15 +98,15 @@
"name": "BasePDFClient",
"kind": "class",
"path": "omniread.pdf.client.BasePDFClient",
"signature": "<bound method Class.signature of Class('BasePDFClient', 22, 48)>",
"docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure",
"signature": "<bound method Class.signature of Class('BasePDFClient', 26, 54)>",
"docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nNotes:\n **Responsibilities:**\n\n - Implementations must accept a source identifier appropriate to the backing store, return the full PDF binary payload, and raise retrieval-specific errors on failure",
"members": {
"fetch": {
"name": "fetch",
"kind": "function",
"path": "omniread.pdf.client.BasePDFClient.fetch",
"signature": "<bound method Function.signature of Function('fetch', 33, 48)>",
"docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation."
"signature": "<bound method Function.signature of Function('fetch', 37, 54)>",
"docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source (Any):\n Identifier of the PDF location, such as a file path, object storage key, or remote reference.\n\nReturns:\n bytes:\n Raw PDF bytes.\n\nRaises:\n Exception:\n Retrieval-specific errors defined by the implementation."
}
}
},
@@ -114,15 +114,15 @@
"name": "FileSystemPDFClient",
"kind": "class",
"path": "omniread.pdf.client.FileSystemPDFClient",
"signature": "<bound method Class.signature of Class('FileSystemPDFClient', 51, 80)>",
"docstring": "PDF client that reads from the local filesystem.\n\nThis client reads PDF files directly from the disk and returns their raw\nbinary contents.",
"signature": "<bound method Class.signature of Class('FileSystemPDFClient', 57, 92)>",
"docstring": "PDF client that reads from the local filesystem.\n\nNotes:\n **Guarantees:**\n\n - This client reads PDF files directly from the disk and returns their raw binary contents",
"members": {
"fetch": {
"name": "fetch",
"kind": "function",
"path": "omniread.pdf.client.FileSystemPDFClient.fetch",
"signature": "<bound method Function.signature of Function('fetch', 59, 80)>",
"docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path: Filesystem path to the PDF file.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError: If the path does not exist.\n ValueError: If the path exists but is not a file."
"signature": "<bound method Function.signature of Function('fetch', 67, 92)>",
"docstring": "Read a PDF file from the local filesystem.\n\nArgs:\n path (Path):\n Filesystem path to the PDF file.\n\nReturns:\n bytes:\n Raw PDF bytes.\n\nRaises:\n FileNotFoundError:\n If the path does not exist.\n ValueError:\n If the path exists but is not a file."
}
}
}
@@ -133,7 +133,7 @@
"kind": "module",
"path": "omniread.pdf.parser",
"signature": null,
"docstring": "PDF parser base implementations for OmniRead.\n\nThis module defines the **PDF-specific parser contract**, extending the\nformat-agnostic `BaseParser` with constraints appropriate for PDF content.\n\nPDF parsers are responsible for interpreting binary PDF data and producing\nstructured representations suitable for downstream consumption.",
"docstring": "PDF parser base implementations for OmniRead.\n\n---\n\n## Summary\n\nThis module defines the **PDF-specific parser contract**, extending the\nformat-agnostic `BaseParser` with constraints appropriate for PDF content.\n\nPDF parsers are responsible for interpreting binary PDF data and producing\nstructured representations suitable for downstream consumption.",
"members": {
"Generic": {
"name": "Generic",
@@ -161,7 +161,7 @@
"kind": "class",
"path": "omniread.pdf.parser.ContentType",
"signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
"docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
"docstring": "Supported MIME types for extracted content.\n\nNotes:\n **Guarantees:**\n\n - This enum represents the declared or inferred media type of the content source\n - It is primarily used for routing content to the appropriate parser or downstream consumer",
"members": {
"HTML": {
"name": "HTML",
@@ -198,14 +198,14 @@
"kind": "class",
"path": "omniread.pdf.parser.BaseParser",
"signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
"docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
"docstring": "Base interface for all parsers.\n\nNotes:\n **Guarantees:**\n\n - A parser is a self-contained object that owns the Content it is responsible for interpreting\n - Consumers may rely on early validation of content compatibility and type-stable return values from `parse()`\n\n **Responsibilities:**\n\n - Implementations must declare supported content types via `supported_types`\n - Implementations must raise parsing-specific exceptions from `parse()`\n - Implementations must remain deterministic for a given input",
"members": {
"supported_types": {
"name": "supported_types",
"kind": "attribute",
"path": "omniread.pdf.parser.BaseParser.supported_types",
"signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
"docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
"docstring": "Set of content types supported by this parser. An empty set indicates that the parser is content-type agnostic."
},
"content": {
"name": "content",
@@ -219,14 +219,14 @@
"kind": "function",
"path": "omniread.pdf.parser.BaseParser.parse",
"signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
"docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed, structured representation.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation."
"docstring": "Parse the owned content into structured output.\n\nReturns:\n T:\n Parsed, structured representation.\n\nRaises:\n Exception:\n Parsing-specific errors as defined by the implementation.\n\nNotes:\n **Responsibilities:**\n\n - Implementations must fully consume the provided content and return a deterministic, structured output"
},
"supports": {
"name": "supports",
"kind": "function",
"path": "omniread.pdf.parser.BaseParser.supports",
"signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
"docstring": "Check whether this parser supports the content's type.\n\nReturns:\n True if the content type is supported; False otherwise."
"docstring": "Check whether this parser supports the content's type.\n\nReturns:\n bool:\n True if the content type is supported; False otherwise."
}
}
},
@@ -241,8 +241,8 @@
"name": "PDFParser",
"kind": "class",
"path": "omniread.pdf.parser.PDFParser",
"signature": "<bound method Class.signature of Class('PDFParser', 20, 49)>",
"docstring": "Base PDF parser.\n\nThis class enforces PDF content-type compatibility and provides the\nextension point for implementing concrete PDF parsing strategies.\n\nConcrete implementations must define:\n- Define the output type `T`\n- Implement the `parse()` method",
"signature": "<bound method Class.signature of Class('PDFParser', 24, 61)>",
"docstring": "Base PDF parser.\n\nNotes:\n **Responsibilities:**\n\n - This class enforces PDF content-type compatibility and provides the extension point for implementing concrete PDF parsing strategies\n\n **Constraints:**\n\n - Concrete implementations must: Define the output type `T`, implement the `parse()` method",
"members": {
"supported_types": {
"name": "supported_types",
@@ -255,8 +255,8 @@
"name": "parse",
"kind": "function",
"path": "omniread.pdf.parser.PDFParser.parse",
"signature": "<bound method Function.signature of Function('parse', 35, 49)>",
"docstring": "Parse PDF content into a structured output.\n\nImplementations must fully interpret the PDF binary payload and\nreturn a deterministic, structured output.\n\nReturns:\n Parsed representation of type `T`.\n\nRaises:\n Exception: Parsing-specific errors as defined by the implementation."
"signature": "<bound method Function.signature of Function('parse', 43, 61)>",
"docstring": "Parse PDF content into a structured output.\n\nReturns:\n T:\n Parsed representation of type `T`.\n\nRaises:\n Exception:\n Parsing-specific errors as defined by the implementation.\n\nNotes:\n **Responsibilities:**\n\n - Implementations must fully interpret the PDF binary payload and return a deterministic, structured output"
}
}
}
@@ -267,7 +267,7 @@
"kind": "module",
"path": "omniread.pdf.scraper",
"signature": null,
"docstring": "PDF scraping implementation for OmniRead.\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.",
"docstring": "PDF scraping implementation for OmniRead.\n\n---\n\n## Summary\n\nThis module provides a PDF-specific scraper that coordinates PDF byte\nretrieval via a client and normalizes the result into a `Content` object.\n\nThe scraper implements the core `BaseScraper` contract while delegating\nall storage and access concerns to a `BasePDFClient` implementation.",
"members": {
"Any": {
"name": "Any",
@@ -295,35 +295,35 @@
"kind": "class",
"path": "omniread.pdf.scraper.Content",
"signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
"docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n raw: Raw content bytes as retrieved from the source.\n source: Identifier of the content origin (URL, file path, or logical name).\n content_type: Optional MIME type of the content, if known.\n metadata: Optional, implementation-defined metadata associated with\n the content (e.g., headers, encoding hints, extraction notes).",
"docstring": "Normalized representation of extracted content.\n\nNotes:\n **Responsibilities:**\n\n - A `Content` instance represents a raw content payload along with minimal contextual metadata describing its origin and type\n - This class is the primary exchange format between Scrapers, Parsers, and Downstream consumers",
"members": {
"raw": {
"name": "raw",
"kind": "attribute",
"path": "omniread.pdf.scraper.Content.raw",
"signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
"docstring": null
"docstring": "Raw content bytes as retrieved from the source."
},
"source": {
"name": "source",
"kind": "attribute",
"path": "omniread.pdf.scraper.Content.source",
"signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
"docstring": null
"docstring": "Identifier of the content origin (URL, file path, or logical name)."
},
"content_type": {
"name": "content_type",
"kind": "attribute",
"path": "omniread.pdf.scraper.Content.content_type",
"signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
"docstring": null
"docstring": "Optional MIME type of the content, if known."
},
"metadata": {
"name": "metadata",
"kind": "attribute",
"path": "omniread.pdf.scraper.Content.metadata",
"signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
"docstring": null
"docstring": "Optional, implementation-defined metadata associated with the content (e.g., headers, encoding hints, extraction notes)."
}
}
},
@@ -332,7 +332,7 @@
"kind": "class",
"path": "omniread.pdf.scraper.ContentType",
"signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
"docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
"docstring": "Supported MIME types for extracted content.\n\nNotes:\n **Guarantees:**\n\n - This enum represents the declared or inferred media type of the content source\n - It is primarily used for routing content to the appropriate parser or downstream consumer",
"members": {
"HTML": {
"name": "HTML",
@@ -369,14 +369,14 @@
"kind": "class",
"path": "omniread.pdf.scraper.BaseScraper",
"signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
"docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
"docstring": "Base interface for all scrapers.\n\nNotes:\n **Responsibilities:**\n\n - A scraper is responsible ONLY for fetching raw content (bytes) from a source. It must not interpret or parse it\n - A scraper is a stateless acquisition component that retrieves raw content from a source and returns it as a `Content` object\n - Scrapers define how content is obtained, not what the content means\n - Implementations may vary in transport mechanism, authentication strategy, retry and backoff behavior\n\n **Constraints:**\n\n - Implementations must not parse content, modify content semantics, or couple scraping logic to a specific parser",
"members": {
"fetch": {
"name": "fetch",
"kind": "function",
"path": "omniread.pdf.scraper.BaseScraper.fetch",
"signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
"docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n source: Location identifier (URL, file path, S3 URI, etc.)\n metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content object containing raw bytes and metadata.\n - Raw content bytes\n - Source identifier\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors as defined by the implementation."
"docstring": "Fetch raw content from the given source.\n\nArgs:\n source (str):\n Location identifier (URL, file path, S3 URI, etc.)\n metadata (Optional[Mapping[str, Any]], optional):\n Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n Content:\n Content object containing raw bytes and metadata.\n\nRaises:\n Exception:\n Retrieval-specific errors as defined by the implementation.\n\nNotes:\n **Responsibilities:**\n\n - Implementations must retrieve the content referenced by `source` and return it as raw bytes wrapped in a `Content` object"
}
}
},
@@ -385,14 +385,14 @@
"kind": "class",
"path": "omniread.pdf.scraper.BasePDFClient",
"signature": "<bound method Alias.signature of Alias('BasePDFClient', 'omniread.pdf.client.BasePDFClient')>",
"docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nImplementations must:\n- Accept a source identifier appropriate to the backing store\n- Return the full PDF binary payload\n- Raise retrieval-specific errors on failure",
"docstring": "Abstract client responsible for retrieving PDF bytes\nfrom a specific backing store (filesystem, S3, FTP, etc.).\n\nNotes:\n **Responsibilities:**\n\n - Implementations must accept a source identifier appropriate to the backing store, return the full PDF binary payload, and raise retrieval-specific errors on failure",
"members": {
"fetch": {
"name": "fetch",
"kind": "function",
"path": "omniread.pdf.scraper.BasePDFClient.fetch",
"signature": "<bound method Alias.signature of Alias('fetch', 'omniread.pdf.client.BasePDFClient.fetch')>",
"docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source: Identifier of the PDF location, such as a file path,\n object storage key, or remote reference.\n\nReturns:\n Raw PDF bytes.\n\nRaises:\n Exception: Retrieval-specific errors defined by the implementation."
"docstring": "Fetch raw PDF bytes from the given source.\n\nArgs:\n source (Any):\n Identifier of the PDF location, such as a file path, object storage key, or remote reference.\n\nReturns:\n bytes:\n Raw PDF bytes.\n\nRaises:\n Exception:\n Retrieval-specific errors defined by the implementation."
}
}
},
@@ -400,15 +400,15 @@
"name": "PDFScraper",
"kind": "class",
"path": "omniread.pdf.scraper.PDFScraper",
"signature": "<bound method Class.signature of Class('PDFScraper', 18, 71)>",
"docstring": "Scraper for PDF sources.\n\nDelegates byte retrieval to a PDF client and normalizes\noutput into Content.\n\nThe scraper:\n- Does not perform parsing or interpretation\n- Does not assume a specific storage backend\n- Preserves caller-provided metadata",
"signature": "<bound method Class.signature of Class('PDFScraper', 22, 77)>",
"docstring": "Scraper for PDF sources.\n\nNotes:\n **Responsibilities:**\n\n - Delegates byte retrieval to a PDF client and normalizes output into Content\n - Preserves caller-provided metadata\n\n **Constraints:**\n \n - The scraper: Does not perform parsing or interpretation, does not assume a specific storage backend",
"members": {
"fetch": {
"name": "fetch",
"kind": "function",
"path": "omniread.pdf.scraper.PDFScraper.fetch",
"signature": "<bound method Function.signature of Function('fetch', 40, 71)>",
"docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source: Identifier of the PDF source as understood by the\n configured PDF client.\n metadata: Optional metadata to attach to the returned content.\n\nReturns:\n A `Content` instance containing:\n - Raw PDF bytes\n - Source identifier\n - PDF content type\n - Optional metadata\n\nRaises:\n Exception: Retrieval-specific errors raised by the PDF client."
"signature": "<bound method Function.signature of Function('fetch', 47, 77)>",
"docstring": "Fetch a PDF document from the given source.\n\nArgs:\n source (Any):\n Identifier of the PDF source as understood by the configured PDF client.\n metadata (Optional[Mapping[str, Any]], optional):\n Optional metadata to attach to the returned content.\n\nReturns:\n Content:\n A `Content` instance containing raw PDF bytes, source identifier, PDF content type, and optional metadata.\n\nRaises:\n Exception:\n Retrieval-specific errors raised by the PDF client."
}
}
}