diff --git a/omniread/__init__.py b/omniread/__init__.py index df7ce37..6d9b3a9 100644 --- a/omniread/__init__.py +++ b/omniread/__init__.py @@ -1,3 +1,101 @@ +""" +OmniRead — format-agnostic content acquisition and parsing framework. + +OmniRead provides a **cleanly layered architecture** for fetching, parsing, +and normalizing content from heterogeneous sources such as HTML documents +and PDF files. + +The library is structured around three core concepts: + +1. **Content** + A canonical, format-agnostic container representing raw content bytes + and minimal contextual metadata. + +2. **Scrapers** + Components responsible for *acquiring* raw content from a source + (HTTP, filesystem, object storage, etc.). Scrapers never interpret + content. + +3. **Parsers** + Components responsible for *interpreting* acquired content and + converting it into structured, typed representations. + +OmniRead deliberately separates these responsibilities to ensure: +- Clear boundaries between IO and interpretation +- Replaceable implementations per format +- Predictable, testable behavior + +---------------------------------------------------------------------- +Installation +---------------------------------------------------------------------- + +Install OmniRead using pip: + + pip install omniread + +Or with Poetry: + + poetry add omniread + +---------------------------------------------------------------------- +Basic Usage +---------------------------------------------------------------------- + +HTML example: + + from omniread import HTMLScraper, HTMLParser + + scraper = HTMLScraper() + content = scraper.fetch("https://example.com") + + class TitleParser(HTMLParser[str]): + def parse(self) -> str: + return self._soup.title.string + + parser = TitleParser(content) + title = parser.parse() + +PDF example: + + from omniread import FileSystemPDFClient, PDFScraper, PDFParser + from pathlib import Path + + client = FileSystemPDFClient() + scraper = PDFScraper(client=client) + content 
= scraper.fetch(Path("document.pdf")) + + class TextPDFParser(PDFParser[str]): + def parse(self) -> str: + # implement PDF text extraction + ... + + parser = TextPDFParser(content) + result = parser.parse() + +---------------------------------------------------------------------- +Public API Surface +---------------------------------------------------------------------- + +This module re-exports the **recommended public entry points** of OmniRead. + +Consumers are encouraged to import from this namespace rather than from +format-specific submodules directly, unless advanced customization is +required. + +Core: +- Content +- ContentType + +HTML: +- HTMLScraper +- HTMLParser + +PDF: +- FileSystemPDFClient +- PDFScraper +- PDFParser +""" + from .core import Content, ContentType from .html import HTMLScraper, HTMLParser from .pdf import FileSystemPDFClient, PDFScraper, PDFParser diff --git a/omniread/core/__init__.py b/omniread/core/__init__.py index c64acc6..ea9f4b4 100644 --- a/omniread/core/__init__.py +++ b/omniread/core/__init__.py @@ -1,3 +1,21 @@ +""" +Core domain contracts for OmniRead. + +This package defines the **format-agnostic domain layer** of OmniRead. +It exposes canonical content models and abstract interfaces that are +implemented by format-specific modules (HTML, PDF, etc.). + +Public exports from this package are considered **stable contracts** and +are safe for downstream consumers to depend on. + +Submodules: +- content: Canonical content models and enums +- parser: Abstract parsing contracts +- scraper: Abstract scraping contracts + +Format-specific behavior must not be introduced at this layer. +""" + from .content import Content, ContentType __all__ = [ diff --git a/omniread/core/content.py b/omniread/core/content.py index a301739..2bc1af1 100644 --- a/omniread/core/content.py +++ b/omniread/core/content.py @@ -1,17 +1,62 @@ +""" +Canonical content models for OmniRead. 
+ +This module defines the **format-agnostic content representation** used across +all parsers and scrapers in OmniRead. + +The models defined here represent *what* was extracted, not *how* it was +retrieved or parsed. Format-specific behavior and metadata must not alter +the semantic meaning of these models. +""" + from enum import Enum from dataclasses import dataclass from typing import Any, Mapping, Optional class ContentType(str, Enum): + """ + Supported MIME types for extracted content. + + This enum represents the declared or inferred media type of the content + source. It is primarily used for routing content to the appropriate + parser or downstream consumer. + """ + HTML = "text/html" + """HTML document content.""" + PDF = "application/pdf" + """PDF document content.""" + JSON = "application/json" + """JSON document content.""" + XML = "application/xml" + """XML document content.""" @dataclass(slots=True) class Content: + """ + Normalized representation of extracted content. + + A `Content` instance represents a raw content payload along with minimal + contextual metadata describing its origin and type. + + This class is the **primary exchange format** between: + - Scrapers + - Parsers + - Downstream consumers + + Attributes: + raw: Raw content bytes as retrieved from the source. + source: Identifier of the content origin (URL, file path, or logical name). + content_type: Optional MIME type of the content, if known. + metadata: Optional, implementation-defined metadata associated with + the content (e.g., headers, encoding hints, extraction notes). + """ + raw: bytes source: str content_type: Optional[ContentType] = None diff --git a/omniread/core/parser.py b/omniread/core/parser.py index 3426672..4f0bc08 100644 --- a/omniread/core/parser.py +++ b/omniread/core/parser.py @@ -1,3 +1,20 @@ +""" +Abstract parsing contracts for OmniRead. 
+ +This module defines the **format-agnostic parser interface** used to transform +raw content into structured, typed representations. + +Parsers are responsible for: +- Interpreting a single `Content` instance +- Validating compatibility with the content type +- Producing a structured output suitable for downstream consumers + +Parsers are not responsible for: +- Fetching or acquiring content +- Performing retries or error recovery +- Managing multiple content sources +""" + from abc import ABC, abstractmethod from typing import Generic, TypeVar, Set @@ -12,11 +29,34 @@ class BaseParser(ABC, Generic[T]): A parser is a self-contained object that owns the Content it is responsible for interpreting. + + Implementations must: + - Declare supported content types via `supported_types` + - Raise parsing-specific exceptions from `parse()` + - Remain deterministic for a given input + + Consumers may rely on: + - Early validation of content compatibility + - Type-stable return values from `parse()` """ supported_types: Set[ContentType] = set() + """Set of content types supported by this parser. + + An empty set indicates that the parser is content-type agnostic. + """ def __init__(self, content: Content): + """ + Initialize the parser with content to be parsed. + + Args: + content: Content instance to be parsed. + + Raises: + ValueError: If the content type is not supported by this parser. + """ + self.content = content if not self.supports(): @@ -30,15 +70,25 @@ class BaseParser(ABC, Generic[T]): """ Parse the owned content into structured output. + Implementations must fully consume the provided content and + return a deterministic, structured output. + Returns: Parsed, structured representation. + + Raises: + Exception: Parsing-specific errors as defined by the implementation. """ raise NotImplementedError def supports(self) -> bool: """ Check whether this parser supports the content's type. + + Returns: + True if the content type is supported; False otherwise. 
""" + if not self.supported_types: return True diff --git a/omniread/core/scraper.py b/omniread/core/scraper.py index d1ec9e7..910dfe2 100644 --- a/omniread/core/scraper.py +++ b/omniread/core/scraper.py @@ -1,3 +1,22 @@ +""" +Abstract scraping contracts for OmniRead. + +This module defines the **format-agnostic scraper interface** responsible for +acquiring raw content from external sources. + +Scrapers are responsible for: +- Locating and retrieving raw content bytes +- Attaching minimal contextual metadata +- Returning normalized `Content` objects + +Scrapers are explicitly NOT responsible for: +- Parsing or interpreting content +- Inferring structure or semantics +- Performing content-type specific processing + +All interpretation must be delegated to parsers. +""" + from abc import ABC, abstractmethod from typing import Any, Mapping, Optional @@ -10,6 +29,21 @@ class BaseScraper(ABC): A scraper is responsible ONLY for fetching raw content (bytes) from a source. It must not interpret or parse it. + + A scraper is a **stateless acquisition component** that retrieves raw + content from a source and returns it as a `Content` object. + + Scrapers define *how content is obtained*, not *what the content means*. + + Implementations may vary in: + - Transport mechanism (HTTP, filesystem, cloud storage) + - Authentication strategy + - Retry and backoff behavior + + Implementations must not: + - Parse content + - Modify content semantics + - Couple scraping logic to a specific parser """ @abstractmethod @@ -22,11 +56,20 @@ class BaseScraper(ABC): """ Fetch raw content from the given source. + Implementations must retrieve the content referenced by `source` + and return it as raw bytes wrapped in a `Content` object. + Args: source: Location identifier (URL, file path, S3 URI, etc.) metadata: Optional hints for the scraper (headers, auth, etc.) Returns: Content object containing raw bytes and metadata. 
+ - Raw content bytes + - Source identifier + - Optional metadata + + Raises: + Exception: Retrieval-specific errors as defined by the implementation. """ raise NotImplementedError diff --git a/omniread/html/__init__.py b/omniread/html/__init__.py index 0199c55..4ef38b9 100644 --- a/omniread/html/__init__.py +++ b/omniread/html/__init__.py @@ -1,3 +1,23 @@ +""" +HTML format implementation for OmniRead. + +This package provides **HTML-specific implementations** of the core OmniRead +contracts defined in `omniread.core`. + +It includes: +- HTML parsers that interpret HTML content +- HTML scrapers that retrieve HTML documents + +This package: +- Implements, but does not redefine, core contracts +- May contain HTML-specific behavior and edge-case handling +- Produces canonical content models defined in `omniread.core.content` + +Consumers should depend on `omniread.core` interfaces wherever possible and +use this package only when HTML-specific behavior is required. +""" + + from .scraper import HTMLScraper from .parser import HTMLParser diff --git a/omniread/html/parser.py b/omniread/html/parser.py index faf2e52..06e25e6 100644 --- a/omniread/html/parser.py +++ b/omniread/html/parser.py @@ -1,6 +1,21 @@ -from typing import Any, Generic, TypeVar, Optional +""" +HTML parser base implementations for OmniRead. +This module provides reusable HTML parsing utilities built on top of +the abstract parser contracts defined in `omniread.core.parser`. + +It supplies: +- Content-type enforcement for HTML inputs +- BeautifulSoup initialization and lifecycle management +- Common helper methods for extracting structured data from HTML elements + +Concrete parsers must subclass `HTMLParser` and implement the `parse()` method +to return a structured representation appropriate for their use case. 
+""" + +from typing import Any, Generic, TypeVar, Optional from abc import abstractmethod + from bs4 import BeautifulSoup, Tag from omniread.core.content import ContentType, Content @@ -13,13 +28,37 @@ class HTMLParser(BaseParser[T], Generic[T]): """ Base HTML parser. + This class extends the core `BaseParser` with HTML-specific behavior, + including DOM parsing via BeautifulSoup and reusable extraction helpers. + Provides reusable helpers for HTML extraction. Concrete parsers must explicitly define the return type. + + Characteristics: + - Accepts only HTML content + - Owns a parsed BeautifulSoup DOM tree + - Provides pure helper utilities for common HTML structures + + Concrete subclasses must: + - Define the output type `T` + - Implement the `parse()` method """ supported_types = {ContentType.HTML} + """Set of content types supported by this parser (HTML only).""" def __init__(self, content: Content, features: str = "html.parser"): + """ + Initialize the HTML parser. + + Args: + content: HTML content to be parsed. + features: BeautifulSoup parser backend to use + (e.g., 'html.parser', 'lxml'). + + Raises: + ValueError: If the content is empty or not valid HTML. + """ super().__init__(content) self._features = features self._soup = self._get_soup() @@ -32,6 +71,12 @@ class HTMLParser(BaseParser[T], Generic[T]): def parse(self) -> T: """ Fully parse the HTML content into structured output. + + Implementations must fully interpret the HTML DOM and return + a deterministic, structured output. + + Returns: + Parsed representation of type `T`. """ raise NotImplementedError @@ -41,14 +86,42 @@ class HTMLParser(BaseParser[T], Generic[T]): @staticmethod def parse_div(div: Tag, *, separator: str = " ") -> str: + """ + Extract normalized text from a `
<div>` element. + + Args: + div: BeautifulSoup tag representing a `<div>
`. + separator: String used to separate text nodes. + + Returns: + Flattened, whitespace-normalized text content. + """ return div.get_text(separator=separator, strip=True) @staticmethod def parse_link(a: Tag) -> Optional[str]: + """ + Extract the hyperlink reference from an `` element. + + Args: + a: BeautifulSoup tag representing an anchor. + + Returns: + The value of the `href` attribute, or None if absent. + """ return a.get("href") @staticmethod def parse_table(table: Tag) -> list[list[str]]: + """ + Parse an HTML table into a 2D list of strings. + + Args: + table: BeautifulSoup tag representing a ``. + + Returns: + A list of rows, where each row is a list of cell text values. + """ rows: list[list[str]] = [] for tr in table.find_all("tr"): cells = [ @@ -64,11 +137,30 @@ class HTMLParser(BaseParser[T], Generic[T]): # ---------------------------- def _get_soup(self) -> BeautifulSoup: + """ + Build a BeautifulSoup DOM tree from raw HTML content. + + Returns: + Parsed BeautifulSoup document tree. + + Raises: + ValueError: If the content payload is empty. + """ if not self.content.raw: raise ValueError("Empty HTML content") return BeautifulSoup(self.content.raw, features=self._features) def parse_meta(self) -> dict[str, Any]: + """ + Extract high-level metadata from the HTML document. + + This includes: + - Document title + - `` tag name/property → content mappings + + Returns: + Dictionary containing extracted metadata. + """ soup = self._soup title = soup.title.string.strip() if soup.title and soup.title.string else None diff --git a/omniread/html/scraper.py b/omniread/html/scraper.py index 9d9de23..58115b0 100644 --- a/omniread/html/scraper.py +++ b/omniread/html/scraper.py @@ -1,3 +1,21 @@ +""" +HTML scraping implementation for OmniRead. + +This module provides an HTTP-based scraper for retrieving HTML documents. +It implements the core `BaseScraper` contract using `httpx` as the transport +layer. 
+ +This scraper is responsible for: +- Fetching raw HTML bytes over HTTP(S) +- Validating response content type +- Attaching HTTP metadata to the returned content + +This scraper is not responsible for: +- Parsing or interpreting HTML +- Retrying failed requests +- Managing crawl policies or rate limiting +""" + import httpx from typing import Any, Mapping, Optional @@ -9,7 +27,19 @@ class HTMLScraper(BaseScraper): """ Base HTML scraper using httpx. + This scraper retrieves HTML documents over HTTP(S) and returns them + as raw content wrapped in a `Content` object. + Fetches raw bytes and metadata only. + The scraper: + - Uses `httpx.Client` for HTTP requests + - Enforces an HTML content type + - Preserves HTTP response metadata + + The scraper does not: + - Parse HTML + - Perform retries or backoff + - Handle non-HTML responses """ def __init__( @@ -20,6 +50,17 @@ class HTMLScraper(BaseScraper): headers: Optional[Mapping[str, str]] = None, follow_redirects: bool = True, ): + """ + Initialize the HTML scraper. + + Args: + client: Optional pre-configured `httpx.Client`. If omitted, + a client is created internally. + timeout: Request timeout in seconds. + headers: Optional default HTTP headers. + follow_redirects: Whether to follow HTTP redirects. + """ + self._client = client or httpx.Client( timeout=timeout, headers=headers, @@ -31,6 +72,17 @@ class HTMLScraper(BaseScraper): self, response: httpx.Response, ): + """ + Validate that the HTTP response contains HTML content. + + Args: + response: HTTP response returned by `httpx`. + + Raises: + ValueError: If the `Content-Type` header is missing or does not + indicate HTML content. + """ + raw_ct = response.headers.get("Content-Type") if not raw_ct: raise ValueError("Missing Content-Type header") @@ -47,6 +99,25 @@ class HTMLScraper(BaseScraper): *, metadata: Optional[Mapping[str, Any]] = None, ) -> Content: + """ + Fetch an HTML document from the given source. + + Args: + source: URL of the HTML document. 
+ metadata: Optional metadata to be merged into the returned content. + + Returns: + A `Content` instance containing: + - Raw HTML bytes + - Source URL + - HTML content type + - HTTP response metadata + + Raises: + httpx.HTTPError: If the HTTP request fails. + ValueError: If the response is not valid HTML. + """ + response = self._client.get(source) response.raise_for_status() self.validate_content_type(response) diff --git a/omniread/pdf/__init__.py b/omniread/pdf/__init__.py index b7281aa..d924554 100644 --- a/omniread/pdf/__init__.py +++ b/omniread/pdf/__init__.py @@ -1,3 +1,19 @@ +""" +PDF format implementation for OmniRead. + +This package provides **PDF-specific implementations** of the core OmniRead +contracts defined in `omniread.core`. + +Unlike HTML, PDF handling requires an explicit client layer for document +access. This package therefore includes: +- PDF clients for acquiring raw PDF data +- PDF scrapers that coordinate client access +- PDF parsers that extract structured content from PDF binaries + +Public exports from this package represent the supported PDF pipeline +and are safe for consumers to import directly when working with PDFs. +""" + from .client import FileSystemPDFClient from .scraper import PDFScraper from .parser import PDFParser diff --git a/omniread/pdf/client.py b/omniread/pdf/client.py index 595a294..686821c 100644 --- a/omniread/pdf/client.py +++ b/omniread/pdf/client.py @@ -1,3 +1,19 @@ +""" +PDF client abstractions for OmniRead. + +This module defines the **client layer** responsible for retrieving raw PDF +bytes from a concrete backing store. + +Clients provide low-level access to PDF binaries and are intentionally +decoupled from scraping and parsing logic. They do not perform validation, +interpretation, or content extraction. + +Typical backing stores include: +- Local filesystems +- Object storage (S3, GCS, etc.) 
+- Network file systems +""" + from abc import ABC, abstractmethod from pathlib import Path @@ -5,13 +21,28 @@ from pathlib import Path class BasePDFClient(ABC): """ Abstract client responsible for retrieving PDF bytes - from a specific backing store (filesystem, S3, FTP, etc). + from a specific backing store (filesystem, S3, FTP, etc.). + + Implementations must: + - Accept a source identifier appropriate to the backing store + - Return the full PDF binary payload + - Raise retrieval-specific errors on failure """ @abstractmethod def fetch(self, source: str) -> bytes: """ Fetch raw PDF bytes from the given source. + + Args: + source: Identifier of the PDF location, such as a file path, + object storage key, or remote reference. + + Returns: + Raw PDF bytes. + + Raises: + Exception: Retrieval-specific errors defined by the implementation. """ raise NotImplementedError @@ -19,9 +50,25 @@ class BasePDFClient(ABC): class FileSystemPDFClient(BasePDFClient): """ PDF client that reads from the local filesystem. + + This client reads PDF files directly from the disk and returns their raw + binary contents. """ def fetch(self, path: Path) -> bytes: + """ + Read a PDF file from the local filesystem. + + Args: + path: Filesystem path to the PDF file. + + Returns: + Raw PDF bytes. + + Raises: + FileNotFoundError: If the path does not exist. + ValueError: If the path exists but is not a file. + """ if not path.exists(): raise FileNotFoundError(f"PDF not found: {path}") diff --git a/omniread/pdf/parser.py b/omniread/pdf/parser.py index f46a2f5..65c4b5a 100644 --- a/omniread/pdf/parser.py +++ b/omniread/pdf/parser.py @@ -1,3 +1,13 @@ +""" +PDF parser base implementations for OmniRead. + +This module defines the **PDF-specific parser contract**, extending the +format-agnostic `BaseParser` with constraints appropriate for PDF content. + +PDF parsers are responsible for interpreting binary PDF data and producing +structured representations suitable for downstream consumption. 
+""" + from typing import Generic, TypeVar from abc import abstractmethod @@ -11,16 +21,29 @@ class PDFParser(BaseParser[T], Generic[T]): """ Base PDF parser. + This class enforces PDF content-type compatibility and provides the + extension point for implementing concrete PDF parsing strategies. + Concrete implementations must define: - - the output type T - - the parsing strategy + - Define the output type `T` + - Implement the `parse()` method """ supported_types = {ContentType.PDF} + """Set of content types supported by this parser (PDF only).""" @abstractmethod def parse(self) -> T: """ Parse PDF content into a structured output. + + Implementations must fully interpret the PDF binary payload and + return a deterministic, structured output. + + Returns: + Parsed representation of type `T`. + + Raises: + Exception: Parsing-specific errors as defined by the implementation. """ raise NotImplementedError diff --git a/omniread/pdf/scraper.py b/omniread/pdf/scraper.py index f561567..9198679 100644 --- a/omniread/pdf/scraper.py +++ b/omniread/pdf/scraper.py @@ -1,3 +1,13 @@ +""" +PDF scraping implementation for OmniRead. + +This module provides a PDF-specific scraper that coordinates PDF byte +retrieval via a client and normalizes the result into a `Content` object. + +The scraper implements the core `BaseScraper` contract while delegating +all storage and access concerns to a `BasePDFClient` implementation. +""" + from typing import Any, Mapping, Optional from omniread.core.content import Content, ContentType @@ -11,9 +21,20 @@ class PDFScraper(BaseScraper): Delegates byte retrieval to a PDF client and normalizes output into Content. + + The scraper: + - Does not perform parsing or interpretation + - Does not assume a specific storage backend + - Preserves caller-provided metadata """ def __init__(self, *, client: BasePDFClient): + """ + Initialize the PDF scraper. + + Args: + client: PDF client responsible for retrieving raw PDF bytes. 
+ """ self._client = client def fetch( @@ -22,6 +43,24 @@ class PDFScraper(BaseScraper): *, metadata: Optional[Mapping[str, Any]] = None, ) -> Content: + """ + Fetch a PDF document from the given source. + + Args: + source: Identifier of the PDF source as understood by the + configured PDF client. + metadata: Optional metadata to attach to the returned content. + + Returns: + A `Content` instance containing: + - Raw PDF bytes + - Source identifier + - PDF content type + - Optional metadata + + Raises: + Exception: Retrieval-specific errors raised by the PDF client. + """ raw = self._client.fetch(source) return Content(