docs: add contract-oriented docstrings across core, html, and pdf layers

- docs(core): document Content and ContentType canonical models
- docs(core): define BaseParser contract and parsing semantics
- docs(core): define BaseScraper contract and acquisition semantics
- docs(html): document HTML package purpose and scope
- docs(html): add HTMLParser base with DOM helpers and contracts
- docs(html): add HTTP-based HTMLScraper with content-type enforcement
- docs(pdf): document PDF package structure and public pipeline
- docs(pdf): add BasePDFClient abstraction and filesystem implementation
- docs(pdf): add PDFParser base contract for binary parsing
- docs(pdf): add PDFScraper coordinating client and Content normalization
- docs(api): expand top-level omniread module with install instructions and examples
2026-01-09 15:51:22 +05:30
parent b2173f3ef0
commit 7f1b0d9c10
12 changed files with 566 additions and 4 deletions
--- a/omniread/__init__.py
+++ b/omniread/__init__.py
@@ -1,3 +1,101 @@
+"""
+OmniRead — format-agnostic content acquisition and parsing framework.
+
+OmniRead provides a **cleanly layered architecture** for fetching, parsing,
+and normalizing content from heterogeneous sources such as HTML documents
+and PDF files.
+
+The library is structured around three core concepts:
+
+1. **Content**
+   A canonical, format-agnostic container representing raw content bytes
+   and minimal contextual metadata.
+
+2. **Scrapers**
+   Components responsible for *acquiring* raw content from a source
+   (HTTP, filesystem, object storage, etc.). Scrapers never interpret
+   content.
+
+3. **Parsers**
+   Components responsible for *interpreting* acquired content and
+   converting it into structured, typed representations.
+
+OmniRead deliberately separates these responsibilities to ensure:
+- Clear boundaries between IO and interpretation
+- Replaceable implementations per format
+- Predictable, testable behavior
+
+----------------------------------------------------------------------
+Installation
+----------------------------------------------------------------------
+
+Install OmniRead using pip:
+
+    pip install omniread
+
+Or with Poetry:
+
+    poetry add omniread
+
+----------------------------------------------------------------------
+Basic Usage
+----------------------------------------------------------------------
+
+HTML example:
+
+    from omniread import HTMLScraper, HTMLParser
+
+    scraper = HTMLScraper()
+    content = scraper.fetch("https://example.com")
+
+    class TitleParser(HTMLParser[str]):
+        def parse(self) -> str:
+            return self._soup.title.string
+
+    parser = TitleParser(content)
+    title = parser.parse()
+
+PDF example:
+
+    from omniread import FileSystemPDFClient, PDFScraper, PDFParser
+    from pathlib import Path
+
+    client = FileSystemPDFClient()
+    scraper = PDFScraper(client=client)
+    content = scraper.fetch(Path("document.pdf"))
+
+    class TextPDFParser(PDFParser[str]):
+        def parse(self) -> str:
+            # implement PDF text extraction
+            ...
+
+    parser = TextPDFParser(content)
+    result = parser.parse()
+
+----------------------------------------------------------------------
+Public API Surface
+----------------------------------------------------------------------
+
+This module re-exports the **recommended public entry points** of OmniRead.
+
+Consumers are encouraged to import from this namespace rather than from
+format-specific submodules directly, unless advanced customization is
+required.
+
+Core:
+- Content
+- ContentType
+
+HTML:
+- HTMLScraper
+- HTMLParser
+
+PDF:
+- FileSystemPDFClient
+- PDFScraper
+- PDFParser
+"""
+
 from .core import Content, ContentType
 from .html import HTMLScraper, HTMLParser
 from .pdf import FileSystemPDFClient, PDFScraper, PDFParser