diff --git a/omniread/__init__.py b/omniread/__init__.py index 8108e09..5706dac 100644 --- a/omniread/__init__.py +++ b/omniread/__init__.py @@ -1,33 +1,28 @@ """ OmniRead — format-agnostic content acquisition and parsing framework. +--- + +## Summary + OmniRead provides a **cleanly layered architecture** for fetching, parsing, and normalizing content from heterogeneous sources such as HTML documents and PDF files. The library is structured around three core concepts: -1. **Content** - A canonical, format-agnostic container representing raw content bytes - and minimal contextual metadata. - -2. **Scrapers** - Components responsible for *acquiring* raw content from a source - (HTTP, filesystem, object storage, etc.). Scrapers never interpret - content. - -3. **Parsers** - Components responsible for *interpreting* acquired content and - converting it into structured, typed representations. +1. **Content**: A canonical, format-agnostic container representing raw content bytes and minimal contextual metadata. +2. **Scrapers**: Components responsible for *acquiring* raw content from a source (HTTP, filesystem, object storage, etc.). Scrapers never interpret content. +3. **Parsers**: Components responsible for *interpreting* acquired content and converting it into structured, typed representations. 
OmniRead deliberately separates these responsibilities to ensure: - Clear boundaries between IO and interpretation - Replaceable implementations per format - Predictable, testable behavior ----------------------------------------------------------------------- -Installation ----------------------------------------------------------------------- +--- + +## Installation Install OmniRead using pip: @@ -37,9 +32,9 @@ Or with Poetry: poetry add omniread ----------------------------------------------------------------------- -Basic Usage ----------------------------------------------------------------------- +--- + +## Quick start HTML example: @@ -72,49 +67,35 @@ PDF example: parser = TextPDFParser(content) result = parser.parse() ----------------------------------------------------------------------- -Public API Surface ----------------------------------------------------------------------- +--- + +## Public API This module re-exports the **recommended public entry points** of OmniRead. - Consumers are encouraged to import from this namespace rather than from format-specific submodules directly, unless advanced customization is required. -Core: +**Core:** - Content - ContentType -HTML: +**HTML:** - HTMLScraper - HTMLParser -PDF: +**PDF:** - FileSystemPDFClient - PDFScraper - PDFParser -## Core Philosophy - +**Core Philosophy:** `OmniRead` is designed as a **decoupled content engine**: - 1. **Separation of Concerns**: Scrapers *fetch*, Parsers *interpret*. Neither knows about the other. 2. **Normalized Exchange**: All components communicate via the `Content` model, ensuring a consistent contract. 3. **Format Agnosticism**: The core logic is independent of whether the input is HTML, PDF, or JSON. -## Documentation Design - -For those extending `OmniRead`, follow these "AI-Native" docstring principles: - -### For Humans -- **Clear Contracts**: Explicitly state what a component is and is NOT responsible for. 
-- **Runnable Examples**: Include small, logical snippets in the package `__init__.py`. - -### For LLMs -- **Structured Models**: Use dataclasses and enums for core data to ensure clean MCP JSON representation. -- **Type Safety**: All public APIs must be fully typed and have corresponding `.pyi` stubs. -- **Detailed Raises**: Include `: description` pairs in the `Raises` section to help agents handle errors gracefully. +--- """ from .core import Content, ContentType diff --git a/omniread/core/__init__.py b/omniread/core/__init__.py index ea9f4b4..3b5c778 100644 --- a/omniread/core/__init__.py +++ b/omniread/core/__init__.py @@ -1,6 +1,10 @@ """ Core domain contracts for OmniRead. +--- + +## Summary + This package defines the **format-agnostic domain layer** of OmniRead. It exposes canonical content models and abstract interfaces that are implemented by format-specific modules (HTML, PDF, etc.). @@ -14,6 +18,15 @@ Submodules: - scraper: Abstract scraping contracts Format-specific behavior must not be introduced at this layer. + +--- + +## Public API + + Content + ContentType + +--- """ from .content import Content, ContentType diff --git a/omniread/core/content.py b/omniread/core/content.py index 2bc1af1..18a50a1 100644 --- a/omniread/core/content.py +++ b/omniread/core/content.py @@ -1,6 +1,10 @@ """ Canonical content models for OmniRead. +--- + +## Summary + This module defines the **format-agnostic content representation** used across all parsers and scrapers in OmniRead. @@ -18,9 +22,11 @@ class ContentType(str, Enum): """ Supported MIME types for extracted content. - This enum represents the declared or inferred media type of the content - source. It is primarily used for routing content to the appropriate - parser or downstream consumer. 
+ Notes: + **Guarantees:** + + - This enum represents the declared or inferred media type of the content source + - It is primarily used for routing content to the appropriate parser or downstream consumer """ HTML = "text/html" @@ -41,23 +47,29 @@ class Content: """ Normalized representation of extracted content. - A `Content` instance represents a raw content payload along with minimal - contextual metadata describing its origin and type. + Notes: + **Responsibilities:** - This class is the **primary exchange format** between: - - Scrapers - - Parsers - - Downstream consumers - - Attributes: - raw: Raw content bytes as retrieved from the source. - source: Identifier of the content origin (URL, file path, or logical name). - content_type: Optional MIME type of the content, if known. - metadata: Optional, implementation-defined metadata associated with - the content (e.g., headers, encoding hints, extraction notes). + - A `Content` instance represents a raw content payload along with minimal contextual metadata describing its origin and type + - This class is the primary exchange format between Scrapers, Parsers, and Downstream consumers """ raw: bytes + """ + Raw content bytes as retrieved from the source. + """ + source: str + """ + Identifier of the content origin (URL, file path, or logical name). + """ + content_type: Optional[ContentType] = None + """ + Optional MIME type of the content, if known. + """ + metadata: Optional[Mapping[str, Any]] = None + """ + Optional, implementation-defined metadata associated with the content (e.g., headers, encoding hints, extraction notes). + """ diff --git a/omniread/core/parser.py b/omniread/core/parser.py index 4f0bc08..4ac3a05 100644 --- a/omniread/core/parser.py +++ b/omniread/core/parser.py @@ -1,6 +1,10 @@ """ Abstract parsing contracts for OmniRead. +--- + +## Summary + This module defines the **format-agnostic parser interface** used to transform raw content into structured, typed representations. 
@@ -27,23 +31,22 @@ class BaseParser(ABC, Generic[T]): """ Base interface for all parsers. - A parser is a self-contained object that owns the Content - it is responsible for interpreting. + Notes: + **Guarantees:** - Implementations must: - - Declare supported content types via `supported_types` - - Raise parsing-specific exceptions from `parse()` - - Remain deterministic for a given input + - A parser is a self-contained object that owns the Content it is responsible for interpreting + - Consumers may rely on early validation of content compatibility and type-stable return values from `parse()` - Consumers may rely on: - - Early validation of content compatibility - - Type-stable return values from `parse()` + **Responsibilities:** + + - Implementations must declare supported content types via `supported_types` + - Implementations must raise parsing-specific exceptions from `parse()` + - Implementations must remain deterministic for a given input """ supported_types: Set[ContentType] = set() - """Set of content types supported by this parser. - - An empty set indicates that the parser is content-type agnostic. + """ + Set of content types supported by this parser. An empty set indicates that the parser is content-type agnostic. """ def __init__(self, content: Content): @@ -51,10 +54,12 @@ class BaseParser(ABC, Generic[T]): Initialize the parser with content to be parsed. Args: - content: Content instance to be parsed. + content (Content): + Content instance to be parsed. Raises: - ValueError: If the content type is not supported by this parser. + ValueError: + If the content type is not supported by this parser. """ self.content = content @@ -70,14 +75,18 @@ class BaseParser(ABC, Generic[T]): """ Parse the owned content into structured output. - Implementations must fully consume the provided content and - return a deterministic, structured output. - Returns: - Parsed, structured representation. + T: + Parsed, structured representation. 
Raises: - Exception: Parsing-specific errors as defined by the implementation. + Exception: + Parsing-specific errors as defined by the implementation. + + Notes: + **Responsibilities:** + + - Implementations must fully consume the provided content and return a deterministic, structured output """ raise NotImplementedError @@ -86,7 +95,8 @@ class BaseParser(ABC, Generic[T]): Check whether this parser supports the content's type. Returns: - True if the content type is supported; False otherwise. + bool: + True if the content type is supported; False otherwise. """ if not self.supported_types: diff --git a/omniread/core/scraper.py b/omniread/core/scraper.py index 910dfe2..931cdb6 100644 --- a/omniread/core/scraper.py +++ b/omniread/core/scraper.py @@ -1,6 +1,10 @@ """ Abstract scraping contracts for OmniRead. +--- + +## Summary + This module defines the **format-agnostic scraper interface** responsible for acquiring raw content from external sources. @@ -27,23 +31,17 @@ class BaseScraper(ABC): """ Base interface for all scrapers. - A scraper is responsible ONLY for fetching raw content - (bytes) from a source. It must not interpret or parse it. + Notes: + **Responsibilities:** - A scraper is a **stateless acquisition component** that retrieves raw - content from a source and returns it as a `Content` object. + - A scraper is responsible ONLY for fetching raw content (bytes) from a source. It must not interpret or parse it + - A scraper is a stateless acquisition component that retrieves raw content from a source and returns it as a `Content` object + - Scrapers define how content is obtained, not what the content means + - Implementations may vary in transport mechanism, authentication strategy, and retry and backoff behavior - Scrapers define *how content is obtained*, not *what the content means*. 
+ **Constraints:** - Implementations may vary in: - - Transport mechanism (HTTP, filesystem, cloud storage) - - Authentication strategy - - Retry and backoff behavior - - Implementations must not: - - Parse content - - Modify content semantics - - Couple scraping logic to a specific parser + - Implementations must not parse content, modify content semantics, or couple scraping logic to a specific parser """ @abstractmethod @@ -56,20 +54,23 @@ class BaseScraper(ABC): """ Fetch raw content from the given source. - Implementations must retrieve the content referenced by `source` - and return it as raw bytes wrapped in a `Content` object. - Args: - source: Location identifier (URL, file path, S3 URI, etc.) - metadata: Optional hints for the scraper (headers, auth, etc.) + source (str): + Location identifier (URL, file path, S3 URI, etc.) + metadata (Optional[Mapping[str, Any]], optional): + Optional hints for the scraper (headers, auth, etc.) Returns: - Content object containing raw bytes and metadata. - - Raw content bytes - - Source identifier - - Optional metadata + Content: + Content object containing raw bytes and metadata. Raises: - Exception: Retrieval-specific errors as defined by the implementation. + Exception: + Retrieval-specific errors as defined by the implementation. + + Notes: + **Responsibilities:** + + - Implementations must retrieve the content referenced by `source` and return it as raw bytes wrapped in a `Content` object """ raise NotImplementedError diff --git a/omniread/html/__init__.py b/omniread/html/__init__.py index 4ef38b9..8ad87b5 100644 --- a/omniread/html/__init__.py +++ b/omniread/html/__init__.py @@ -1,6 +1,10 @@ """ HTML format implementation for OmniRead. +--- + +## Summary + This package provides **HTML-specific implementations** of the core OmniRead contracts defined in `omniread.core`. 
@@ -15,6 +19,15 @@ This package: Consumers should depend on `omniread.core` interfaces wherever possible and use this package only when HTML-specific behavior is required. + +--- + +## Public API + + HTMLScraper + HTMLParser + +--- """ diff --git a/omniread/html/parser.py b/omniread/html/parser.py index 06e25e6..0413249 100644 --- a/omniread/html/parser.py +++ b/omniread/html/parser.py @@ -1,6 +1,10 @@ """ HTML parser base implementations for OmniRead. +--- + +## Summary + This module provides reusable HTML parsing utilities built on top of the abstract parser contracts defined in `omniread.core.parser`. @@ -28,36 +32,39 @@ class HTMLParser(BaseParser[T], Generic[T]): """ Base HTML parser. - This class extends the core `BaseParser` with HTML-specific behavior, - including DOM parsing via BeautifulSoup and reusable extraction helpers. + Notes: + **Responsibilities:** - Provides reusable helpers for HTML extraction. - Concrete parsers must explicitly define the return type. + - This class extends the core `BaseParser` with HTML-specific behavior, including DOM parsing via BeautifulSoup and reusable extraction helpers + - Provides reusable helpers for HTML extraction. Concrete parsers must explicitly define the return type - Characteristics: - - Accepts only HTML content - - Owns a parsed BeautifulSoup DOM tree - - Provides pure helper utilities for common HTML structures + **Guarantees:** - Concrete subclasses must: - - Define the output type `T` - - Implement the `parse()` method + - Accepts only HTML content, owns a parsed BeautifulSoup DOM tree, and provides pure helper utilities for common HTML structures + + **Constraints:** + + - Concrete subclasses must define the output type `T` and implement the `parse()` method """ supported_types = {ContentType.HTML} - """Set of content types supported by this parser (HTML only).""" + """ + Set of content types supported by this parser (HTML only). 
+ """ def __init__(self, content: Content, features: str = "html.parser"): """ Initialize the HTML parser. Args: - content: HTML content to be parsed. - features: BeautifulSoup parser backend to use - (e.g., 'html.parser', 'lxml'). + content (Content): + HTML content to be parsed. + features (str, optional): + BeautifulSoup parser backend to use (e.g., 'html.parser', 'lxml'). Raises: - ValueError: If the content is empty or not valid HTML. + ValueError: + If the content is empty or not valid HTML. """ super().__init__(content) self._features = features @@ -72,11 +79,14 @@ class HTMLParser(BaseParser[T], Generic[T]): """ Fully parse the HTML content into structured output. - Implementations must fully interpret the HTML DOM and return - a deterministic, structured output. - Returns: - Parsed representation of type `T`. + T: + Parsed representation of type `T`. + + Notes: + **Responsibilities:** + + - Implementations must fully interpret the HTML DOM and return a deterministic, structured output """ raise NotImplementedError @@ -90,11 +100,14 @@ class HTMLParser(BaseParser[T], Generic[T]): Extract normalized text from a `