# omniread/omniread/__init__.py

"""
OmniRead — format-agnostic content acquisition and parsing framework.

OmniRead provides a **cleanly layered architecture** for fetching, parsing,
and normalizing content from heterogeneous sources such as HTML documents
and PDF files.

The library is structured around three core concepts:

1. **Content**
   A canonical, format-agnostic container representing raw content bytes
   and minimal contextual metadata.

2. **Scrapers**
   Components responsible for *acquiring* raw content from a source
   (HTTP, filesystem, object storage, etc.). Scrapers never interpret
   content.

3. **Parsers**
   Components responsible for *interpreting* acquired content and
   converting it into structured, typed representations.

OmniRead deliberately separates these responsibilities to ensure:

- Clear boundaries between IO and interpretation
- Replaceable implementations per format
- Predictable, testable behavior

----------------------------------------------------------------------
Installation
----------------------------------------------------------------------

Install OmniRead using pip:

    pip install omniread

Or with Poetry:

    poetry add omniread

----------------------------------------------------------------------
Basic Usage
----------------------------------------------------------------------

HTML example:

    from omniread import HTMLScraper, HTMLParser

    scraper = HTMLScraper()
    content = scraper.fetch("https://example.com")

    class TitleParser(HTMLParser[str]):
        def parse(self) -> str:
            return self._soup.title.string

    parser = TitleParser(content)
    title = parser.parse()

PDF example:

    from omniread import FileSystemPDFClient, PDFScraper, PDFParser
    from pathlib import Path

    client = FileSystemPDFClient()
    scraper = PDFScraper(client=client)
    content = scraper.fetch(Path("document.pdf"))

    class TextPDFParser(PDFParser[str]):
        def parse(self) -> str:
            # implement PDF text extraction
            ...

    parser = TextPDFParser(content)
    result = parser.parse()

----------------------------------------------------------------------
Public API Surface
----------------------------------------------------------------------

This module re-exports the **recommended public entry points** of OmniRead.

Consumers are encouraged to import from this namespace rather than from
format-specific submodules directly, unless advanced customization is
required.

Core:

- Content
- ContentType

HTML:

- HTMLScraper
- HTMLParser

PDF:

- FileSystemPDFClient
- PDFScraper
- PDFParser
"""

from .core import Content, ContentType
from .html import HTMLScraper, HTMLParser
from .pdf import FileSystemPDFClient, PDFScraper, PDFParser

# Names exported by a star-import of this package, one row per subpackage
# they are imported from. Kept as a plain literal list of strings.
__all__ = [
    "Content", "ContentType",  # .core
    "HTMLScraper", "HTMLParser",  # .html
    "FileSystemPDFClient", "PDFScraper", "PDFParser",  # .pdf
]