From 5842e6a2274dfa85782b5403bdca01e4dbfe0234 Mon Sep 17 00:00:00 2001 From: Vishesh 'ironeagle' Bangotra Date: Sun, 8 Mar 2026 00:29:25 +0530 Subject: [PATCH] google styled doc --- omniread/__init__.py | 61 +++++++++----------------- omniread/core/__init__.py | 13 ++++++ omniread/core/content.py | 44 ++++++++++++------- omniread/core/parser.py | 50 +++++++++++++--------- omniread/core/scraper.py | 49 ++++++++++----------- omniread/html/__init__.py | 13 ++++++ omniread/html/parser.py | 90 ++++++++++++++++++++++++--------------- omniread/html/scraper.py | 61 ++++++++++++++------------ omniread/pdf/__init__.py | 14 ++++++ omniread/pdf/client.py | 40 +++++++++++------ omniread/pdf/parser.py | 34 ++++++++++----- omniread/pdf/scraper.py | 38 ++++++++++------- 12 files changed, 304 insertions(+), 203 deletions(-) diff --git a/omniread/__init__.py b/omniread/__init__.py index 8108e09..5706dac 100644 --- a/omniread/__init__.py +++ b/omniread/__init__.py @@ -1,33 +1,28 @@ """ OmniRead — format-agnostic content acquisition and parsing framework. +--- + +## Summary + OmniRead provides a **cleanly layered architecture** for fetching, parsing, and normalizing content from heterogeneous sources such as HTML documents and PDF files. The library is structured around three core concepts: -1. **Content** - A canonical, format-agnostic container representing raw content bytes - and minimal contextual metadata. - -2. **Scrapers** - Components responsible for *acquiring* raw content from a source - (HTTP, filesystem, object storage, etc.). Scrapers never interpret - content. - -3. **Parsers** - Components responsible for *interpreting* acquired content and - converting it into structured, typed representations. +1. **Content**: A canonical, format-agnostic container representing raw content bytes and minimal contextual metadata. +2. **Scrapers**: Components responsible for *acquiring* raw content from a source (HTTP, filesystem, object storage, etc.). Scrapers never interpret content. 
+3. **Parsers**: Components responsible for *interpreting* acquired content and converting it into structured, typed representations. OmniRead deliberately separates these responsibilities to ensure: - Clear boundaries between IO and interpretation - Replaceable implementations per format - Predictable, testable behavior ----------------------------------------------------------------------- -Installation ----------------------------------------------------------------------- +--- + +## Installation Install OmniRead using pip: @@ -37,9 +32,9 @@ Or with Poetry: poetry add omniread ----------------------------------------------------------------------- -Basic Usage ----------------------------------------------------------------------- +--- + +## Quick start HTML example: @@ -72,49 +67,35 @@ PDF example: parser = TextPDFParser(content) result = parser.parse() ----------------------------------------------------------------------- -Public API Surface ----------------------------------------------------------------------- +--- + +## Public API This module re-exports the **recommended public entry points** of OmniRead. - Consumers are encouraged to import from this namespace rather than from format-specific submodules directly, unless advanced customization is required. -Core: +**Core:** - Content - ContentType -HTML: +**HTML:** - HTMLScraper - HTMLParser -PDF: +**PDF:** - FileSystemPDFClient - PDFScraper - PDFParser -## Core Philosophy - +**Core Philosophy:** `OmniRead` is designed as a **decoupled content engine**: - 1. **Separation of Concerns**: Scrapers *fetch*, Parsers *interpret*. Neither knows about the other. 2. **Normalized Exchange**: All components communicate via the `Content` model, ensuring a consistent contract. 3. **Format Agnosticism**: The core logic is independent of whether the input is HTML, PDF, or JSON. 
-## Documentation Design - -For those extending `OmniRead`, follow these "AI-Native" docstring principles: - -### For Humans -- **Clear Contracts**: Explicitly state what a component is and is NOT responsible for. -- **Runnable Examples**: Include small, logical snippets in the package `__init__.py`. - -### For LLMs -- **Structured Models**: Use dataclasses and enums for core data to ensure clean MCP JSON representation. -- **Type Safety**: All public APIs must be fully typed and have corresponding `.pyi` stubs. -- **Detailed Raises**: Include `: description` pairs in the `Raises` section to help agents handle errors gracefully. +--- """ from .core import Content, ContentType diff --git a/omniread/core/__init__.py b/omniread/core/__init__.py index ea9f4b4..3b5c778 100644 --- a/omniread/core/__init__.py +++ b/omniread/core/__init__.py @@ -1,6 +1,10 @@ """ Core domain contracts for OmniRead. +--- + +## Summary + This package defines the **format-agnostic domain layer** of OmniRead. It exposes canonical content models and abstract interfaces that are implemented by format-specific modules (HTML, PDF, etc.). @@ -14,6 +18,15 @@ Submodules: - scraper: Abstract scraping contracts Format-specific behavior must not be introduced at this layer. + +--- + +## Public API + + Content + ContentType + +--- """ from .content import Content, ContentType diff --git a/omniread/core/content.py b/omniread/core/content.py index 2bc1af1..18a50a1 100644 --- a/omniread/core/content.py +++ b/omniread/core/content.py @@ -1,6 +1,10 @@ """ Canonical content models for OmniRead. +--- + +## Summary + This module defines the **format-agnostic content representation** used across all parsers and scrapers in OmniRead. @@ -18,9 +22,11 @@ class ContentType(str, Enum): """ Supported MIME types for extracted content. - This enum represents the declared or inferred media type of the content - source. It is primarily used for routing content to the appropriate - parser or downstream consumer. 
+ Notes: + **Guarantees:** + + - This enum represents the declared or inferred media type of the content source + - It is primarily used for routing content to the appropriate parser or downstream consumer """ HTML = "text/html" @@ -41,23 +47,29 @@ class Content: """ Normalized representation of extracted content. - A `Content` instance represents a raw content payload along with minimal - contextual metadata describing its origin and type. + Notes: + **Responsibilities:** - This class is the **primary exchange format** between: - - Scrapers - - Parsers - - Downstream consumers - - Attributes: - raw: Raw content bytes as retrieved from the source. - source: Identifier of the content origin (URL, file path, or logical name). - content_type: Optional MIME type of the content, if known. - metadata: Optional, implementation-defined metadata associated with - the content (e.g., headers, encoding hints, extraction notes). + - A `Content` instance represents a raw content payload along with minimal contextual metadata describing its origin and type + - This class is the primary exchange format between Scrapers, Parsers, and Downstream consumers """ raw: bytes + """ + Raw content bytes as retrieved from the source. + """ + source: str + """ + Identifier of the content origin (URL, file path, or logical name). + """ + content_type: Optional[ContentType] = None + """ + Optional MIME type of the content, if known. + """ + metadata: Optional[Mapping[str, Any]] = None + """ + Optional, implementation-defined metadata associated with the content (e.g., headers, encoding hints, extraction notes). + """ diff --git a/omniread/core/parser.py b/omniread/core/parser.py index 4f0bc08..4ac3a05 100644 --- a/omniread/core/parser.py +++ b/omniread/core/parser.py @@ -1,6 +1,10 @@ """ Abstract parsing contracts for OmniRead. +--- + +## Summary + This module defines the **format-agnostic parser interface** used to transform raw content into structured, typed representations. 
@@ -27,23 +31,22 @@ class BaseParser(ABC, Generic[T]): """ Base interface for all parsers. - A parser is a self-contained object that owns the Content - it is responsible for interpreting. + Notes: + **Guarantees:** - Implementations must: - - Declare supported content types via `supported_types` - - Raise parsing-specific exceptions from `parse()` - - Remain deterministic for a given input + - A parser is a self-contained object that owns the Content it is responsible for interpreting + - Consumers may rely on early validation of content compatibility and type-stable return values from `parse()` - Consumers may rely on: - - Early validation of content compatibility - - Type-stable return values from `parse()` + **Responsibilities:** + + - Implementations must declare supported content types via `supported_types` + - Implementations must raise parsing-specific exceptions from `parse()` + - Implementations must remain deterministic for a given input """ supported_types: Set[ContentType] = set() - """Set of content types supported by this parser. - - An empty set indicates that the parser is content-type agnostic. + """ + Set of content types supported by this parser. An empty set indicates that the parser is content-type agnostic. """ def __init__(self, content: Content): @@ -51,10 +54,12 @@ class BaseParser(ABC, Generic[T]): Initialize the parser with content to be parsed. Args: - content: Content instance to be parsed. + content (Content): + Content instance to be parsed. Raises: - ValueError: If the content type is not supported by this parser. + ValueError: + If the content type is not supported by this parser. """ self.content = content @@ -70,14 +75,18 @@ class BaseParser(ABC, Generic[T]): """ Parse the owned content into structured output. - Implementations must fully consume the provided content and - return a deterministic, structured output. - Returns: - Parsed, structured representation. + T: + Parsed, structured representation. 
Raises: - Exception: Parsing-specific errors as defined by the implementation. + Exception: + Parsing-specific errors as defined by the implementation. + + Notes: + **Responsibilities:** + + - Implementations must fully consume the provided content and return a deterministic, structured output """ raise NotImplementedError @@ -86,7 +95,8 @@ class BaseParser(ABC, Generic[T]): Check whether this parser supports the content's type. Returns: - True if the content type is supported; False otherwise. + bool: + True if the content type is supported; False otherwise. """ if not self.supported_types: diff --git a/omniread/core/scraper.py b/omniread/core/scraper.py index 910dfe2..931cdb6 100644 --- a/omniread/core/scraper.py +++ b/omniread/core/scraper.py @@ -1,6 +1,10 @@ """ Abstract scraping contracts for OmniRead. +--- + +## Summary + This module defines the **format-agnostic scraper interface** responsible for acquiring raw content from external sources. @@ -27,23 +31,17 @@ class BaseScraper(ABC): """ Base interface for all scrapers. - A scraper is responsible ONLY for fetching raw content - (bytes) from a source. It must not interpret or parse it. + Notes: + **Responsibilities:** - A scraper is a **stateless acquisition component** that retrieves raw - content from a source and returns it as a `Content` object. + - A scraper is responsible ONLY for fetching raw content (bytes) from a source. It must not interpret or parse it + - A scraper is a stateless acquisition component that retrieves raw content from a source and returns it as a `Content` object + - Scrapers define how content is obtained, not what the content means + - Implementations may vary in transport mechanism, authentication strategy, retry and backoff behavior - Scrapers define *how content is obtained*, not *what the content means*. 
+ **Constraints:** - Implementations may vary in: - - Transport mechanism (HTTP, filesystem, cloud storage) - - Authentication strategy - - Retry and backoff behavior - - Implementations must not: - - Parse content - - Modify content semantics - - Couple scraping logic to a specific parser + - Implementations must not parse content, modify content semantics, or couple scraping logic to a specific parser """ @abstractmethod @@ -56,20 +54,23 @@ class BaseScraper(ABC): """ Fetch raw content from the given source. - Implementations must retrieve the content referenced by `source` - and return it as raw bytes wrapped in a `Content` object. - Args: - source: Location identifier (URL, file path, S3 URI, etc.) - metadata: Optional hints for the scraper (headers, auth, etc.) + source (str): + Location identifier (URL, file path, S3 URI, etc.) + metadata (Optional[Mapping[str, Any]], optional): + Optional hints for the scraper (headers, auth, etc.) Returns: - Content object containing raw bytes and metadata. - - Raw content bytes - - Source identifier - - Optional metadata + Content: + Content object containing raw bytes and metadata. Raises: - Exception: Retrieval-specific errors as defined by the implementation. + Exception: + Retrieval-specific errors as defined by the implementation. + + Notes: + **Responsibilities:** + + - Implementations must retrieve the content referenced by `source` and return it as raw bytes wrapped in a `Content` object """ raise NotImplementedError diff --git a/omniread/html/__init__.py b/omniread/html/__init__.py index 4ef38b9..8ad87b5 100644 --- a/omniread/html/__init__.py +++ b/omniread/html/__init__.py @@ -1,6 +1,10 @@ """ HTML format implementation for OmniRead. +--- + +## Summary + This package provides **HTML-specific implementations** of the core OmniRead contracts defined in `omniread.core`. 
@@ -15,6 +19,15 @@ This package: Consumers should depend on `omniread.core` interfaces wherever possible and use this package only when HTML-specific behavior is required. + +--- + +## Public API + + HTMLScraper + HTMLParser + +--- """ diff --git a/omniread/html/parser.py b/omniread/html/parser.py index 06e25e6..0413249 100644 --- a/omniread/html/parser.py +++ b/omniread/html/parser.py @@ -1,6 +1,10 @@ """ HTML parser base implementations for OmniRead. +--- + +## Summary + This module provides reusable HTML parsing utilities built on top of the abstract parser contracts defined in `omniread.core.parser`. @@ -28,36 +32,39 @@ class HTMLParser(BaseParser[T], Generic[T]): """ Base HTML parser. - This class extends the core `BaseParser` with HTML-specific behavior, - including DOM parsing via BeautifulSoup and reusable extraction helpers. + Notes: + **Responsibilities:** - Provides reusable helpers for HTML extraction. - Concrete parsers must explicitly define the return type. + - This class extends the core `BaseParser` with HTML-specific behavior, including DOM parsing via BeautifulSoup and reusable extraction helpers + - Provides reusable helpers for HTML extraction. Concrete parsers must explicitly define the return type - Characteristics: - - Accepts only HTML content - - Owns a parsed BeautifulSoup DOM tree - - Provides pure helper utilities for common HTML structures + **Guarantees:** - Concrete subclasses must: - - Define the output type `T` - - Implement the `parse()` method + - Characteristics: Accepts only HTML content, owns a parsed BeautifulSoup DOM tree, provides pure helper utilities for common HTML structures + + **Constraints:** + + - Concrete subclasses must define the output type `T` and implement the `parse()` method """ supported_types = {ContentType.HTML} - """Set of content types supported by this parser (HTML only).""" + """ + Set of content types supported by this parser (HTML only). 
+ """ def __init__(self, content: Content, features: str = "html.parser"): """ Initialize the HTML parser. Args: - content: HTML content to be parsed. - features: BeautifulSoup parser backend to use - (e.g., 'html.parser', 'lxml'). + content (Content): + HTML content to be parsed. + features (str, optional): + BeautifulSoup parser backend to use (e.g., 'html.parser', 'lxml'). Raises: - ValueError: If the content is empty or not valid HTML. + ValueError: + If the content is empty or not valid HTML. """ super().__init__(content) self._features = features @@ -72,11 +79,14 @@ class HTMLParser(BaseParser[T], Generic[T]): """ Fully parse the HTML content into structured output. - Implementations must fully interpret the HTML DOM and return - a deterministic, structured output. - Returns: - Parsed representation of type `T`. + T: + Parsed representation of type `T`. + + Notes: + **Responsibilities:** + + - Implementations must fully interpret the HTML DOM and return a deterministic, structured output """ raise NotImplementedError @@ -90,11 +100,14 @@ class HTMLParser(BaseParser[T], Generic[T]): Extract normalized text from a `
<div>` element. Args: - div: BeautifulSoup tag representing a `<div>`. - separator: String used to separate text nodes. + div (Tag): + BeautifulSoup tag representing a `<div>
`. + separator (str, optional): + String used to separate text nodes. Returns: - Flattened, whitespace-normalized text content. + str: + Flattened, whitespace-normalized text content. """ return div.get_text(separator=separator, strip=True) @@ -104,10 +117,12 @@ class HTMLParser(BaseParser[T], Generic[T]): Extract the hyperlink reference from an `<a>` element. Args: - a: BeautifulSoup tag representing an anchor. + a (Tag): + BeautifulSoup tag representing an anchor. Returns: - The value of the `href` attribute, or None if absent. + Optional[str]: + The value of the `href` attribute, or None if absent. """ return a.get("href") @@ -117,10 +132,12 @@ class HTMLParser(BaseParser[T], Generic[T]): Parse an HTML table into a 2D list of strings. Args: - table: BeautifulSoup tag representing a `<table>`. + table (Tag): + BeautifulSoup tag representing a `<table>
`. Returns: - A list of rows, where each row is a list of cell text values. + list[list[str]]: + A list of rows, where each row is a list of cell text values. """ rows: list[list[str]] = [] for tr in table.find_all("tr"): @@ -141,10 +158,12 @@ class HTMLParser(BaseParser[T], Generic[T]): Build a BeautifulSoup DOM tree from raw HTML content. Returns: - Parsed BeautifulSoup document tree. + BeautifulSoup: + Parsed BeautifulSoup document tree. Raises: - ValueError: If the content payload is empty. + ValueError: + If the content payload is empty. """ if not self.content.raw: raise ValueError("Empty HTML content") @@ -154,12 +173,15 @@ class HTMLParser(BaseParser[T], Generic[T]): """ Extract high-level metadata from the HTML document. - This includes: - - Document title - - `` tag name/property → content mappings - Returns: - Dictionary containing extracted metadata. + dict[str, Any]: + Dictionary containing extracted metadata. + + Notes: + **Responsibilities:** + + - Extract high-level metadata from the HTML document + - This includes: Document title, `` tag name/property → content mappings """ soup = self._soup diff --git a/omniread/html/scraper.py b/omniread/html/scraper.py index 58115b0..4df5c80 100644 --- a/omniread/html/scraper.py +++ b/omniread/html/scraper.py @@ -1,6 +1,10 @@ """ HTML scraping implementation for OmniRead. +--- + +## Summary + This module provides an HTTP-based scraper for retrieving HTML documents. It implements the core `BaseScraper` contract using `httpx` as the transport layer. @@ -27,19 +31,15 @@ class HTMLScraper(BaseScraper): """ Base HTML scraper using httpx. - This scraper retrieves HTML documents over HTTP(S) and returns them - as raw content wrapped in a `Content` object. + Notes: + **Responsibilities:** - Fetches raw bytes and metadata only. 
- The scraper: - - Uses `httpx.Client` for HTTP requests - - Enforces an HTML content type - - Preserves HTTP response metadata + - This scraper retrieves HTML documents over HTTP(S) and returns them as raw content wrapped in a `Content` object + - Fetches raw bytes and metadata only. The scraper uses `httpx.Client` for HTTP requests, enforces an HTML content type, preserves HTTP response metadata - The scraper does not: - - Parse HTML - - Perform retries or backoff - - Handle non-HTML responses + **Constraints:** + + - The scraper does not: Parse HTML, perform retries or backoff, handle non-HTML responses """ def __init__( @@ -54,11 +54,14 @@ class HTMLScraper(BaseScraper): Initialize the HTML scraper. Args: - client: Optional pre-configured `httpx.Client`. If omitted, - a client is created internally. - timeout: Request timeout in seconds. - headers: Optional default HTTP headers. - follow_redirects: Whether to follow HTTP redirects. + client (httpx.Client | None, optional): + Optional pre-configured `httpx.Client`. If omitted, a client is created internally. + timeout (float, optional): + Request timeout in seconds. + headers (Optional[Mapping[str, str]], optional): + Optional default HTTP headers. + follow_redirects (bool, optional): + Whether to follow HTTP redirects. """ self._client = client or httpx.Client( @@ -76,11 +79,12 @@ class HTMLScraper(BaseScraper): Validate that the HTTP response contains HTML content. Args: - response: HTTP response returned by `httpx`. + response (httpx.Response): + HTTP response returned by `httpx`. Raises: - ValueError: If the `Content-Type` header is missing or does not - indicate HTML content. + ValueError: + If the `Content-Type` header is missing or does not indicate HTML content. """ raw_ct = response.headers.get("Content-Type") @@ -103,19 +107,20 @@ class HTMLScraper(BaseScraper): Fetch an HTML document from the given source. Args: - source: URL of the HTML document. 
- metadata: Optional metadata to be merged into the returned content. + source (str): + URL of the HTML document. + metadata (Optional[Mapping[str, Any]], optional): + Optional metadata to be merged into the returned content. Returns: - A `Content` instance containing: - - Raw HTML bytes - - Source URL - - HTML content type - - HTTP response metadata + Content: + A `Content` instance containing raw HTML bytes, source URL, HTML content type, and HTTP response metadata. Raises: - httpx.HTTPError: If the HTTP request fails. - ValueError: If the response is not valid HTML. + httpx.HTTPError: + If the HTTP request fails. + ValueError: + If the response is not valid HTML. """ response = self._client.get(source) diff --git a/omniread/pdf/__init__.py b/omniread/pdf/__init__.py index d924554..f690d23 100644 --- a/omniread/pdf/__init__.py +++ b/omniread/pdf/__init__.py @@ -1,6 +1,10 @@ """ PDF format implementation for OmniRead. +--- + +## Summary + This package provides **PDF-specific implementations** of the core OmniRead contracts defined in `omniread.core`. @@ -12,6 +16,16 @@ access. This package therefore includes: Public exports from this package represent the supported PDF pipeline and are safe for consumers to import directly when working with PDFs. + +--- + +## Public API + + FileSystemPDFClient + PDFScraper + PDFParser + +--- """ from .client import FileSystemPDFClient diff --git a/omniread/pdf/client.py b/omniread/pdf/client.py index c1de901..fff515d 100644 --- a/omniread/pdf/client.py +++ b/omniread/pdf/client.py @@ -1,6 +1,10 @@ """ PDF client abstractions for OmniRead. +--- + +## Summary + This module defines the **client layer** responsible for retrieving raw PDF bytes from a concrete backing store. @@ -24,10 +28,10 @@ class BasePDFClient(ABC): Abstract client responsible for retrieving PDF bytes from a specific backing store (filesystem, S3, FTP, etc.). 
- Implementations must: - - Accept a source identifier appropriate to the backing store - - Return the full PDF binary payload - - Raise retrieval-specific errors on failure + Notes: + **Responsibilities:** + + - Implementations must accept a source identifier appropriate to the backing store, return the full PDF binary payload, and raise retrieval-specific errors on failure """ @abstractmethod @@ -36,14 +40,16 @@ class BasePDFClient(ABC): Fetch raw PDF bytes from the given source. Args: - source: Identifier of the PDF location, such as a file path, - object storage key, or remote reference. + source (Any): + Identifier of the PDF location, such as a file path, object storage key, or remote reference. Returns: - Raw PDF bytes. + bytes: + Raw PDF bytes. Raises: - Exception: Retrieval-specific errors defined by the implementation. + Exception: + Retrieval-specific errors defined by the implementation. """ raise NotImplementedError @@ -52,8 +58,10 @@ class FileSystemPDFClient(BasePDFClient): """ PDF client that reads from the local filesystem. - This client reads PDF files directly from the disk and returns their raw - binary contents. + Notes: + **Guarantees:** + + - This client reads PDF files directly from the disk and returns their raw binary contents """ def fetch(self, path: Path) -> bytes: @@ -61,14 +69,18 @@ class FileSystemPDFClient(BasePDFClient): Read a PDF file from the local filesystem. Args: - path: Filesystem path to the PDF file. + path (Path): + Filesystem path to the PDF file. Returns: - Raw PDF bytes. + bytes: + Raw PDF bytes. Raises: - FileNotFoundError: If the path does not exist. - ValueError: If the path exists but is not a file. + FileNotFoundError: + If the path does not exist. + ValueError: + If the path exists but is not a file. 
""" if not path.exists(): diff --git a/omniread/pdf/parser.py b/omniread/pdf/parser.py index 65c4b5a..465fe9c 100644 --- a/omniread/pdf/parser.py +++ b/omniread/pdf/parser.py @@ -1,6 +1,10 @@ """ PDF parser base implementations for OmniRead. +--- + +## Summary + This module defines the **PDF-specific parser contract**, extending the format-agnostic `BaseParser` with constraints appropriate for PDF content. @@ -21,29 +25,37 @@ class PDFParser(BaseParser[T], Generic[T]): """ Base PDF parser. - This class enforces PDF content-type compatibility and provides the - extension point for implementing concrete PDF parsing strategies. + Notes: + **Responsibilities:** - Concrete implementations must define: - - Define the output type `T` - - Implement the `parse()` method + - This class enforces PDF content-type compatibility and provides the extension point for implementing concrete PDF parsing strategies + + **Constraints:** + + - Concrete implementations must: Define the output type `T`, implement the `parse()` method """ supported_types = {ContentType.PDF} - """Set of content types supported by this parser (PDF only).""" + """ + Set of content types supported by this parser (PDF only). + """ @abstractmethod def parse(self) -> T: """ Parse PDF content into a structured output. - Implementations must fully interpret the PDF binary payload and - return a deterministic, structured output. - Returns: - Parsed representation of type `T`. + T: + Parsed representation of type `T`. Raises: - Exception: Parsing-specific errors as defined by the implementation. + Exception: + Parsing-specific errors as defined by the implementation. 
+ + Notes: + **Responsibilities:** + + - Implementations must fully interpret the PDF binary payload and return a deterministic, structured output """ raise NotImplementedError diff --git a/omniread/pdf/scraper.py b/omniread/pdf/scraper.py index 7446ef0..83052bc 100644 --- a/omniread/pdf/scraper.py +++ b/omniread/pdf/scraper.py @@ -1,6 +1,10 @@ """ PDF scraping implementation for OmniRead. +--- + +## Summary + This module provides a PDF-specific scraper that coordinates PDF byte retrieval via a client and normalizes the result into a `Content` object. @@ -19,13 +23,15 @@ class PDFScraper(BaseScraper): """ Scraper for PDF sources. - Delegates byte retrieval to a PDF client and normalizes - output into Content. + Notes: + **Responsibilities:** - The scraper: - - Does not perform parsing or interpretation - - Does not assume a specific storage backend - - Preserves caller-provided metadata + - Delegates byte retrieval to a PDF client and normalizes output into Content + - Preserves caller-provided metadata + + **Constraints:** + + - The scraper: Does not perform parsing or interpretation, does not assume a specific storage backend """ def __init__(self, *, client: BasePDFClient): @@ -33,7 +39,8 @@ class PDFScraper(BaseScraper): Initialize the PDF scraper. Args: - client: PDF client responsible for retrieving raw PDF bytes. + client (BasePDFClient): + PDF client responsible for retrieving raw PDF bytes. """ self._client = client @@ -47,19 +54,18 @@ class PDFScraper(BaseScraper): Fetch a PDF document from the given source. Args: - source: Identifier of the PDF source as understood by the - configured PDF client. - metadata: Optional metadata to attach to the returned content. + source (Any): + Identifier of the PDF source as understood by the configured PDF client. + metadata (Optional[Mapping[str, Any]], optional): + Optional metadata to attach to the returned content. 
Returns: - A `Content` instance containing: - - Raw PDF bytes - - Source identifier - - PDF content type - - Optional metadata + Content: + A `Content` instance containing raw PDF bytes, source identifier, PDF content type, and optional metadata. Raises: - Exception: Retrieval-specific errors raised by the PDF client. + Exception: + Retrieval-specific errors raised by the PDF client. """ raw = self._client.fetch(source)