diff --git a/omniread/__init__.py b/omniread/__init__.py index df7ce37..6d9b3a9 100644 --- a/omniread/__init__.py +++ b/omniread/__init__.py @@ -1,3 +1,101 @@ +""" +OmniRead — format-agnostic content acquisition and parsing framework. + +OmniRead provides a **cleanly layered architecture** for fetching, parsing, +and normalizing content from heterogeneous sources such as HTML documents +and PDF files. + +The library is structured around three core concepts: + +1. **Content** + A canonical, format-agnostic container representing raw content bytes + and minimal contextual metadata. + +2. **Scrapers** + Components responsible for *acquiring* raw content from a source + (HTTP, filesystem, object storage, etc.). Scrapers never interpret + content. + +3. **Parsers** + Components responsible for *interpreting* acquired content and + converting it into structured, typed representations. + +OmniRead deliberately separates these responsibilities to ensure: +- Clear boundaries between IO and interpretation +- Replaceable implementations per format +- Predictable, testable behavior + +---------------------------------------------------------------------- +Installation +---------------------------------------------------------------------- + +Install OmniRead using pip: + + pip install omniread + +Or with Poetry: + + poetry add omniread + +---------------------------------------------------------------------- +Basic Usage +---------------------------------------------------------------------- + +HTML example: + + from omniread import HTMLScraper, HTMLParser + + scraper = HTMLScraper() + content = scraper.fetch("https://example.com") + + class TitleParser(HTMLParser[str]): + def parse(self) -> str: + return self._soup.title.string + + parser = TitleParser(content) + title = parser.parse() + +PDF example: + + from omniread import FileSystemPDFClient, PDFScraper, PDFParser + from pathlib import Path + + client = FileSystemPDFClient() + scraper = PDFScraper(client=client) + content 
= scraper.fetch(Path("document.pdf")) + + class TextPDFParser(PDFParser[str]): + def parse(self) -> str: + # implement PDF text extraction + ... + + parser = TextPDFParser(content) + result = parser.parse() + +---------------------------------------------------------------------- +Public API Surface +---------------------------------------------------------------------- + +This module re-exports the **recommended public entry points** of OmniRead. + +Consumers are encouraged to import from this namespace rather than from +format-specific submodules directly, unless advanced customization is +required. + +Core: +- Content +- ContentType + +HTML: +- HTMLScraper +- HTMLParser + +PDF: +- FileSystemPDFClient +- PDFScraper +- PDFParser +""" + from .core import Content, ContentType from .html import HTMLScraper, HTMLParser from .pdf import FileSystemPDFClient, PDFScraper, PDFParser diff --git a/omniread/core/__init__.py b/omniread/core/__init__.py index c64acc6..ea9f4b4 100644 --- a/omniread/core/__init__.py +++ b/omniread/core/__init__.py @@ -1,3 +1,21 @@ +""" +Core domain contracts for OmniRead. + +This package defines the **format-agnostic domain layer** of OmniRead. +It exposes canonical content models and abstract interfaces that are +implemented by format-specific modules (HTML, PDF, etc.). + +Public exports from this package are considered **stable contracts** and +are safe for downstream consumers to depend on. + +Submodules: +- content: Canonical content models and enums +- parser: Abstract parsing contracts +- scraper: Abstract scraping contracts + +Format-specific behavior must not be introduced at this layer. +""" + from .content import Content, ContentType __all__ = [ diff --git a/omniread/core/content.py b/omniread/core/content.py index a301739..2bc1af1 100644 --- a/omniread/core/content.py +++ b/omniread/core/content.py @@ -1,17 +1,62 @@ +""" +Canonical content models for OmniRead. 
+ +This module defines the **format-agnostic content representation** used across +all parsers and scrapers in OmniRead. + +The models defined here represent *what* was extracted, not *how* it was +retrieved or parsed. Format-specific behavior and metadata must not alter +the semantic meaning of these models. +""" + from enum import Enum from dataclasses import dataclass from typing import Any, Mapping, Optional class ContentType(str, Enum): + """ + Supported MIME types for extracted content. + + This enum represents the declared or inferred media type of the content + source. It is primarily used for routing content to the appropriate + parser or downstream consumer. + """ + HTML = "text/html" + """HTML document content.""" + PDF = "application/pdf" + """PDF document content.""" + JSON = "application/json" + """JSON document content.""" + XML = "application/xml" + """XML document content.""" @dataclass(slots=True) class Content: + """ + Normalized representation of extracted content. + + A `Content` instance represents a raw content payload along with minimal + contextual metadata describing its origin and type. + + This class is the **primary exchange format** between: + - Scrapers + - Parsers + - Downstream consumers + + Attributes: + raw: Raw content bytes as retrieved from the source. + source: Identifier of the content origin (URL, file path, or logical name). + content_type: Optional MIME type of the content, if known. + metadata: Optional, implementation-defined metadata associated with + the content (e.g., headers, encoding hints, extraction notes). + """ + raw: bytes source: str content_type: Optional[ContentType] = None diff --git a/omniread/core/parser.py b/omniread/core/parser.py index 3426672..4f0bc08 100644 --- a/omniread/core/parser.py +++ b/omniread/core/parser.py @@ -1,3 +1,20 @@ +""" +Abstract parsing contracts for OmniRead. 
+ +This module defines the **format-agnostic parser interface** used to transform +raw content into structured, typed representations. + +Parsers are responsible for: +- Interpreting a single `Content` instance +- Validating compatibility with the content type +- Producing a structured output suitable for downstream consumers + +Parsers are not responsible for: +- Fetching or acquiring content +- Performing retries or error recovery +- Managing multiple content sources +""" + from abc import ABC, abstractmethod from typing import Generic, TypeVar, Set @@ -12,11 +29,34 @@ class BaseParser(ABC, Generic[T]): A parser is a self-contained object that owns the Content it is responsible for interpreting. + + Implementations must: + - Declare supported content types via `supported_types` + - Raise parsing-specific exceptions from `parse()` + - Remain deterministic for a given input + + Consumers may rely on: + - Early validation of content compatibility + - Type-stable return values from `parse()` """ supported_types: Set[ContentType] = set() + """Set of content types supported by this parser. + + An empty set indicates that the parser is content-type agnostic. + """ def __init__(self, content: Content): + """ + Initialize the parser with content to be parsed. + + Args: + content: Content instance to be parsed. + + Raises: + ValueError: If the content type is not supported by this parser. + """ + self.content = content if not self.supports(): @@ -30,15 +70,25 @@ class BaseParser(ABC, Generic[T]): """ Parse the owned content into structured output. + Implementations must fully consume the provided content and + return a deterministic, structured output. + Returns: Parsed, structured representation. + + Raises: + Exception: Parsing-specific errors as defined by the implementation. """ raise NotImplementedError def supports(self) -> bool: """ Check whether this parser supports the content's type. + + Returns: + True if the content type is supported; False otherwise. 
""" + if not self.supported_types: return True diff --git a/omniread/core/scraper.py b/omniread/core/scraper.py index d1ec9e7..910dfe2 100644 --- a/omniread/core/scraper.py +++ b/omniread/core/scraper.py @@ -1,3 +1,22 @@ +""" +Abstract scraping contracts for OmniRead. + +This module defines the **format-agnostic scraper interface** responsible for +acquiring raw content from external sources. + +Scrapers are responsible for: +- Locating and retrieving raw content bytes +- Attaching minimal contextual metadata +- Returning normalized `Content` objects + +Scrapers are explicitly NOT responsible for: +- Parsing or interpreting content +- Inferring structure or semantics +- Performing content-type specific processing + +All interpretation must be delegated to parsers. +""" + from abc import ABC, abstractmethod from typing import Any, Mapping, Optional @@ -10,6 +29,21 @@ class BaseScraper(ABC): A scraper is responsible ONLY for fetching raw content (bytes) from a source. It must not interpret or parse it. + + A scraper is a **stateless acquisition component** that retrieves raw + content from a source and returns it as a `Content` object. + + Scrapers define *how content is obtained*, not *what the content means*. + + Implementations may vary in: + - Transport mechanism (HTTP, filesystem, cloud storage) + - Authentication strategy + - Retry and backoff behavior + + Implementations must not: + - Parse content + - Modify content semantics + - Couple scraping logic to a specific parser """ @abstractmethod @@ -22,11 +56,20 @@ class BaseScraper(ABC): """ Fetch raw content from the given source. + Implementations must retrieve the content referenced by `source` + and return it as raw bytes wrapped in a `Content` object. + Args: source: Location identifier (URL, file path, S3 URI, etc.) metadata: Optional hints for the scraper (headers, auth, etc.) Returns: Content object containing raw bytes and metadata. 
+ The returned Content bundles: + - Raw content bytes and source identifier + - Optional scraper-supplied metadata + + Raises: + Exception: Retrieval-specific errors as defined by the implementation. """ raise NotImplementedError diff --git a/omniread/html/__init__.py b/omniread/html/__init__.py index 0199c55..4ef38b9 100644 --- a/omniread/html/__init__.py +++ b/omniread/html/__init__.py @@ -1,3 +1,23 @@ +""" +HTML format implementation for OmniRead. + +This package provides **HTML-specific implementations** of the core OmniRead +contracts defined in `omniread.core`. + +It includes: +- HTML parsers that interpret HTML content +- HTML scrapers that retrieve HTML documents + +This package: +- Implements, but does not redefine, core contracts +- May contain HTML-specific behavior and edge-case handling +- Produces canonical content models defined in `omniread.core.content` + +Consumers should depend on `omniread.core` interfaces wherever possible and +use this package only when HTML-specific behavior is required. +""" + + from .scraper import HTMLScraper from .parser import HTMLParser diff --git a/omniread/html/parser.py b/omniread/html/parser.py index faf2e52..06e25e6 100644 --- a/omniread/html/parser.py +++ b/omniread/html/parser.py @@ -1,6 +1,22 @@ -from typing import Any, Generic, TypeVar, Optional +""" +HTML parser base implementations for OmniRead. + +This module provides reusable HTML parsing utilities built on top of +the abstract parser contracts defined in `omniread.core.parser`. + +It supplies: +- Content-type enforcement for HTML inputs +- BeautifulSoup initialization and lifecycle management +- Common helper methods for extracting structured data from HTML elements + +Concrete parsers must subclass `HTMLParser` and implement the `parse()` method +to return a structured representation appropriate for their use case.
+""" + +from typing import Any, Generic, TypeVar, Optional from abc import abstractmethod + from bs4 import BeautifulSoup, Tag from omniread.core.content import ContentType, Content @@ -13,13 +28,37 @@ class HTMLParser(BaseParser[T], Generic[T]): """ Base HTML parser. + This class extends the core `BaseParser` with HTML-specific behavior, + including DOM parsing via BeautifulSoup and reusable extraction helpers. + Provides reusable helpers for HTML extraction. Concrete parsers must explicitly define the return type. + + Characteristics: + - Accepts only HTML content + - Owns a parsed BeautifulSoup DOM tree + - Provides pure helper utilities for common HTML structures + + Concrete subclasses must: + - Define the output type `T` + - Implement the `parse()` method """ supported_types = {ContentType.HTML} + """Set of content types supported by this parser (HTML only).""" def __init__(self, content: Content, features: str = "html.parser"): + """ + Initialize the HTML parser. + + Args: + content: HTML content to be parsed. + features: BeautifulSoup parser backend to use + (e.g., 'html.parser', 'lxml'). + + Raises: + ValueError: If the content is empty or not valid HTML. + """ super().__init__(content) self._features = features self._soup = self._get_soup() @@ -32,6 +71,12 @@ class HTMLParser(BaseParser[T], Generic[T]): def parse(self) -> T: """ Fully parse the HTML content into structured output. + + Implementations must fully interpret the HTML DOM and return + a deterministic, structured output. + + Returns: + Parsed representation of type `T`. """ raise NotImplementedError @@ -41,14 +86,42 @@ class HTMLParser(BaseParser[T], Generic[T]): @staticmethod def parse_div(div: Tag, *, separator: str = " ") -> str: + """ + Extract normalized text from a `