Compare commits
14 Commits
32ee43e77a ... 0.0.1

| SHA1 |
|---|
| 6808538485 |
| fc29f49d41 |
| 3d6655084f |
| 5af411020c |
| 7f1b0d9c10 |
| b2173f3ef0 |
| de67c7b0b1 |
| 390eb22e1b |
| 358abc9b36 |
| 07293e4651 |
| fa14a79ec9 |
| 55245cf241 |
| 202329e190 |
| f59024ddd5 |
.drone.yml (new file, 129 lines)
@@ -0,0 +1,129 @@
---
kind: pipeline
type: docker
name: build-and-publish-pypi

platform:
  os: linux
  arch: arm64

workspace:
  path: /drone/src

steps:
  - name: check-version
    image: curlimages/curl:latest
    environment:
      PIP_REPO_URL:
        from_secret: PIP_REPO_URL
      PIP_USERNAME:
        from_secret: PIP_USERNAME
      PIP_PASSWORD:
        from_secret: PIP_PASSWORD
    commands:
      - PACKAGE_NAME=$(grep -E '^name\s*=' pyproject.toml | head -1 | cut -d'"' -f2)
      - VERSION=$(grep -E '^version\s*=' pyproject.toml | head -1 | cut -d'"' -f2)
      - echo "🔍 Checking if $PACKAGE_NAME==$VERSION exists on $PIP_REPO_URL ..."
      - |
        if curl -fsSL -u "$PIP_USERNAME:$PIP_PASSWORD" "$PIP_REPO_URL/simple/$PACKAGE_NAME/" | grep -q "$VERSION"; then
          echo "✅ $PACKAGE_NAME==$VERSION already exists — skipping build."
          exit 78
        else
          echo "🆕 New version detected: $PACKAGE_NAME==$VERSION"
        fi

  - name: build-package
    image: python:3.13-slim
    commands:
      - pip install --upgrade pip build
      - echo "📦 Building Python package..."
      - python -m build
      - ls -l dist

  - name: upload-to-private-pypi
    image: python:3.13-slim
    environment:
      PIP_REPO_URL:
        from_secret: PIP_REPO_URL
      PIP_USERNAME:
        from_secret: PIP_USERNAME
      PIP_PASSWORD:
        from_secret: PIP_PASSWORD
    commands:
      - pip install --upgrade twine
      - echo "🚀 Uploading to private PyPI at $PIP_REPO_URL ..."
      - |
        twine upload \
          --repository-url "$PIP_REPO_URL" \
          -u "$PIP_USERNAME" \
          -p "$PIP_PASSWORD" \
          dist/*

trigger:
  event:
    - tag

---
kind: pipeline
type: docker
name: backfill-pypi-from-tags

platform:
  os: linux
  arch: arm64

workspace:
  path: /drone/src

steps:
  - name: fetch-tags
    image: alpine/git
    commands:
      - git fetch --tags --force

  - name: build-and-upload-missing
    image: python:3.13-slim
    environment:
      PIP_REPO_URL:
        from_secret: PIP_REPO_URL
      PIP_USERNAME:
        from_secret: PIP_USERNAME
      PIP_PASSWORD:
        from_secret: PIP_PASSWORD
    commands:
      - apt-get update
      - apt-get install -y git curl ca-certificates
      - pip install --upgrade pip build twine
      - |
        set -e

        PACKAGE_NAME=$(grep -E '^name\s*=' pyproject.toml | cut -d'"' -f2)
        echo "📦 Package: $PACKAGE_NAME"

        for TAG in $(git tag --sort=version:refname); do
          VERSION="$TAG"
          echo "🔁 Version: $VERSION"

          if curl -fsSL -u "$PIP_USERNAME:$PIP_PASSWORD" \
            "$PIP_REPO_URL/simple/$PACKAGE_NAME/" | grep -q "$VERSION"; then
            echo "⏭️ Exists, skipping"
            continue
          fi

          git checkout --force "$TAG"

          echo "🏗️ Building $VERSION"
          rm -rf dist
          python -m build

          echo "⬆️ Uploading $VERSION"
          twine upload \
            --repository-url "$PIP_REPO_URL" \
            -u "$PIP_USERNAME" \
            -p "$PIP_PASSWORD" \
            dist/*
        done

trigger:
  event:
    - custom
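The check-version step above runs as plain shell inside the pipeline; for local debugging the same check can be reproduced with the project's own HTTP stack. Below is a minimal Python sketch, assuming a PEP 503 style "simple" index and a pyproject.toml that declares name and version as quoted strings; the function name is illustrative and not part of the pipeline. The `exit 78` in the pipeline relies on Drone's convention of treating that exit code as "skip the remaining steps" rather than as a failure.

```python
# Minimal sketch of the check-version logic, assuming a PEP 503 "simple"
# index and quoted name/version lines in pyproject.toml (hypothetical helper).
import re
from pathlib import Path

import httpx  # already a project dependency


def version_already_published(index_url: str, username: str, password: str) -> bool:
    text = Path("pyproject.toml").read_text(encoding="utf-8")
    name = re.search(r'^name\s*=\s*"([^"]+)"', text, re.M).group(1)
    version = re.search(r'^version\s*=\s*"([^"]+)"', text, re.M).group(1)

    # The simple index page lists every released file for the package;
    # a plain substring check mirrors the `grep -q "$VERSION"` in the pipeline.
    resp = httpx.get(f"{index_url}/simple/{name}/", auth=(username, password))
    resp.raise_for_status()
    return version in resp.text
```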
docs/core/content.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.core.content

docs/core/index.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.core

docs/core/parser.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.core.parser

docs/core/scraper.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.core.scraper

docs/html/index.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.html

docs/html/parser.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.html.parser

docs/html/scraper.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.html.scraper

docs/index.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread

docs/pdf/client.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.pdf.client

docs/pdf/index.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.pdf

docs/pdf/parser.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.pdf.parser

docs/pdf/scraper.md (new file, 1 line)
@@ -0,0 +1 @@
::: omniread.pdf.scraper
generate_docs.py (new file, 46 lines)
@@ -0,0 +1,46 @@
"""
Programmatic MkDocs build script for OmniRead.

This script builds (or serves) the documentation by invoking MkDocs
*as a Python library*, not via shell commands.

Requirements:
- mkdocs
- mkdocs-material
- mkdocstrings[python]

Usage:
    python generate_docs.py
    python generate_docs.py --serve
"""

import sys
from pathlib import Path

from mkdocs.commands import build as mkdocs_build
from mkdocs.commands import serve as mkdocs_serve
from mkdocs.config import load_config


PROJECT_ROOT = Path(__file__).resolve().parent
MKDOCS_YML = PROJECT_ROOT / "mkdocs.yml"


def main() -> None:
    if not MKDOCS_YML.exists():
        raise FileNotFoundError("mkdocs.yml not found at project root")

    # Load MkDocs configuration programmatically
    config = load_config(str(MKDOCS_YML))

    # Decide mode
    if "--serve" in sys.argv:
        # Live-reload development server
        mkdocs_serve.serve(config)
    else:
        # Static site build
        mkdocs_build.build(config)


if __name__ == "__main__":
    main()
mkdocs.yml (new file, 53 lines)
@@ -0,0 +1,53 @@
site_name: Aetoskia OmniRead
site_description: Format-agnostic document reading, parsing, and scraping framework

theme:
  name: material
  palette:
    - scheme: slate
      primary: deep purple
      accent: cyan
  font:
    text: Inter
    code: JetBrains Mono
  features:
    - navigation.tabs
    - navigation.expand
    - navigation.top
    - navigation.instant
    - content.code.copy
    - content.code.annotate

plugins:
  - search
  - mkdocstrings:
      handlers:
        python:
          paths: ["."]
          options:
            docstring_style: google
            show_source: false
            show_signature_annotations: true
            separate_signature: true
            merge_init_into_class: true
            inherited_members: true
            annotations_path: brief
            show_root_heading: true
            group_by_category: true

nav:
  - Home: index.md

  - Core (Contracts):
      - Content Models: core/content.md
      - Parsers: core/parser.md
      - Scrapers: core/scraper.md

  - HTML Implementation:
      - HTML Parser: html/parser.md
      - HTML Scraper: html/scraper.md

  - PDF Implementation:
      - PDF Client: pdf/client.md
      - PDF Parser: pdf/parser.md
      - PDF Scraper: pdf/scraper.md
omniread/__init__.py (new file, 116 lines)
@@ -0,0 +1,116 @@
"""
OmniRead — format-agnostic content acquisition and parsing framework.

OmniRead provides a **cleanly layered architecture** for fetching, parsing,
and normalizing content from heterogeneous sources such as HTML documents
and PDF files.

The library is structured around three core concepts:

1. **Content**
   A canonical, format-agnostic container representing raw content bytes
   and minimal contextual metadata.

2. **Scrapers**
   Components responsible for *acquiring* raw content from a source
   (HTTP, filesystem, object storage, etc.). Scrapers never interpret
   content.

3. **Parsers**
   Components responsible for *interpreting* acquired content and
   converting it into structured, typed representations.

OmniRead deliberately separates these responsibilities to ensure:
- Clear boundaries between IO and interpretation
- Replaceable implementations per format
- Predictable, testable behavior

----------------------------------------------------------------------
Installation
----------------------------------------------------------------------

Install OmniRead using pip:

    pip install omniread

Or with Poetry:

    poetry add omniread

----------------------------------------------------------------------
Basic Usage
----------------------------------------------------------------------

HTML example:

    from omniread import HTMLScraper, HTMLParser

    scraper = HTMLScraper()
    content = scraper.fetch("https://example.com")

    class TitleParser(HTMLParser[str]):
        def parse(self) -> str:
            return self._soup.title.string

    parser = TitleParser(content)
    title = parser.parse()

PDF example:

    from omniread import FileSystemPDFClient, PDFScraper, PDFParser
    from pathlib import Path

    client = FileSystemPDFClient()
    scraper = PDFScraper(client=client)
    content = scraper.fetch(Path("document.pdf"))

    class TextPDFParser(PDFParser[str]):
        def parse(self) -> str:
            # implement PDF text extraction
            ...

    parser = TextPDFParser(content)
    result = parser.parse()

----------------------------------------------------------------------
Public API Surface
----------------------------------------------------------------------

This module re-exports the **recommended public entry points** of OmniRead.

Consumers are encouraged to import from this namespace rather than from
format-specific submodules directly, unless advanced customization is
required.

Core:
- Content
- ContentType

HTML:
- HTMLScraper
- HTMLParser

PDF:
- FileSystemPDFClient
- PDFScraper
- PDFParser
"""

from .core import Content, ContentType
from .html import HTMLScraper, HTMLParser
from .pdf import FileSystemPDFClient, PDFScraper, PDFParser

__all__ = [
    # core
    "Content",
    "ContentType",

    # html
    "HTMLScraper",
    "HTMLParser",

    # pdf
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
]
omniread/core/__init__.py (new file, 24 lines)
@@ -0,0 +1,24 @@
"""
Core domain contracts for OmniRead.

This package defines the **format-agnostic domain layer** of OmniRead.
It exposes canonical content models and abstract interfaces that are
implemented by format-specific modules (HTML, PDF, etc.).

Public exports from this package are considered **stable contracts** and
are safe for downstream consumers to depend on.

Submodules:
- content: Canonical content models and enums
- parser: Abstract parsing contracts
- scraper: Abstract scraping contracts

Format-specific behavior must not be introduced at this layer.
"""

from .content import Content, ContentType

__all__ = [
    "Content",
    "ContentType",
]
omniread/core/content.py
@@ -1,17 +1,62 @@
"""
Canonical content models for OmniRead.

This module defines the **format-agnostic content representation** used across
all parsers and scrapers in OmniRead.

The models defined here represent *what* was extracted, not *how* it was
retrieved or parsed. Format-specific behavior and metadata must not alter
the semantic meaning of these models.
"""

from enum import Enum
from dataclasses import dataclass
from typing import Any, Mapping, Optional


class ContentType(str, Enum):
    """
    Supported MIME types for extracted content.

    This enum represents the declared or inferred media type of the content
    source. It is primarily used for routing content to the appropriate
    parser or downstream consumer.
    """

    HTML = "text/html"
    """HTML document content."""

    PDF = "application/pdf"
    """PDF document content."""

    JSON = "application/json"
    """JSON document content."""

    XML = "application/xml"
    """XML document content."""


@dataclass(slots=True)
class Content:
    """
    Normalized representation of extracted content.

    A `Content` instance represents a raw content payload along with minimal
    contextual metadata describing its origin and type.

    This class is the **primary exchange format** between:
    - Scrapers
    - Parsers
    - Downstream consumers

    Attributes:
        raw: Raw content bytes as retrieved from the source.
        source: Identifier of the content origin (URL, file path, or logical name).
        content_type: Optional MIME type of the content, if known.
        metadata: Optional, implementation-defined metadata associated with
            the content (e.g., headers, encoding hints, extraction notes).
    """

    raw: bytes
    source: str
    content_type: Optional[ContentType] = None
omniread/core/parser.py
@@ -1,3 +1,20 @@
"""
Abstract parsing contracts for OmniRead.

This module defines the **format-agnostic parser interface** used to transform
raw content into structured, typed representations.

Parsers are responsible for:
- Interpreting a single `Content` instance
- Validating compatibility with the content type
- Producing a structured output suitable for downstream consumers

Parsers are not responsible for:
- Fetching or acquiring content
- Performing retries or error recovery
- Managing multiple content sources
"""

from abc import ABC, abstractmethod
from typing import Generic, TypeVar, Set

@@ -12,11 +29,34 @@ class BaseParser(ABC, Generic[T]):
    A parser is a self-contained object that owns the Content
    it is responsible for interpreting.

    Implementations must:
    - Declare supported content types via `supported_types`
    - Raise parsing-specific exceptions from `parse()`
    - Remain deterministic for a given input

    Consumers may rely on:
    - Early validation of content compatibility
    - Type-stable return values from `parse()`
    """

    supported_types: Set[ContentType] = set()
    """Set of content types supported by this parser.

    An empty set indicates that the parser is content-type agnostic.
    """

    def __init__(self, content: Content):
        """
        Initialize the parser with content to be parsed.

        Args:
            content: Content instance to be parsed.

        Raises:
            ValueError: If the content type is not supported by this parser.
        """

        self.content = content

        if not self.supports():

@@ -30,15 +70,25 @@ class BaseParser(ABC, Generic[T]):
        """
        Parse the owned content into structured output.

        Implementations must fully consume the provided content and
        return a deterministic, structured output.

        Returns:
            Parsed, structured representation.

        Raises:
            Exception: Parsing-specific errors as defined by the implementation.
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.

        Returns:
            True if the content type is supported; False otherwise.
        """

        if not self.supported_types:
            return True
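To make the contract concrete, a minimal parser written against `BaseParser` could look like the sketch below; the JSON parser and its output shape are illustrative and are not part of this change.

```python
# Hypothetical example of the BaseParser contract: a parser that declares
# its supported content type and returns a plain dict from JSON bytes.
import json
from typing import Any

from omniread.core.content import ContentType
from omniread.core.parser import BaseParser


class JSONParser(BaseParser[dict[str, Any]]):
    # Restrict this parser to JSON content; BaseParser.supports() checks this set.
    supported_types = {ContentType.JSON}

    def parse(self) -> dict[str, Any]:
        # Deterministic for a given input, as the contract requires.
        return json.loads(self.content.raw)
```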
omniread/core/scraper.py
@@ -1,3 +1,22 @@
"""
Abstract scraping contracts for OmniRead.

This module defines the **format-agnostic scraper interface** responsible for
acquiring raw content from external sources.

Scrapers are responsible for:
- Locating and retrieving raw content bytes
- Attaching minimal contextual metadata
- Returning normalized `Content` objects

Scrapers are explicitly NOT responsible for:
- Parsing or interpreting content
- Inferring structure or semantics
- Performing content-type specific processing

All interpretation must be delegated to parsers.
"""

from abc import ABC, abstractmethod
from typing import Any, Mapping, Optional

@@ -10,6 +29,21 @@ class BaseScraper(ABC):
    A scraper is responsible ONLY for fetching raw content
    (bytes) from a source. It must not interpret or parse it.

    A scraper is a **stateless acquisition component** that retrieves raw
    content from a source and returns it as a `Content` object.

    Scrapers define *how content is obtained*, not *what the content means*.

    Implementations may vary in:
    - Transport mechanism (HTTP, filesystem, cloud storage)
    - Authentication strategy
    - Retry and backoff behavior

    Implementations must not:
    - Parse content
    - Modify content semantics
    - Couple scraping logic to a specific parser
    """

    @abstractmethod

@@ -22,11 +56,20 @@ class BaseScraper(ABC):
        """
        Fetch raw content from the given source.

        Implementations must retrieve the content referenced by `source`
        and return it as raw bytes wrapped in a `Content` object.

        Args:
            source: Location identifier (URL, file path, S3 URI, etc.)
            metadata: Optional hints for the scraper (headers, auth, etc.)

        Returns:
            Content object containing raw bytes and metadata.
            - Raw content bytes
            - Source identifier
            - Optional metadata

        Raises:
            Exception: Retrieval-specific errors as defined by the implementation.
        """
        raise NotImplementedError
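A minimal implementation of this contract, sketched for a local file source; the class name and error handling are illustrative and not part of this change.

```python
# Hypothetical example of the BaseScraper contract: acquisition only,
# no parsing or interpretation of the bytes that are read.
from pathlib import Path
from typing import Any, Mapping, Optional

from omniread.core.content import Content
from omniread.core.scraper import BaseScraper


class FileScraper(BaseScraper):
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        # Read raw bytes and wrap them in the canonical exchange format.
        raw = Path(source).read_bytes()
        return Content(
            raw=raw,
            source=source,
            metadata=dict(metadata) if metadata else None,
        )
```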
omniread/html/__init__.py (new file, 27 lines)
@@ -0,0 +1,27 @@
"""
HTML format implementation for OmniRead.

This package provides **HTML-specific implementations** of the core OmniRead
contracts defined in `omniread.core`.

It includes:
- HTML parsers that interpret HTML content
- HTML scrapers that retrieve HTML documents

This package:
- Implements, but does not redefine, core contracts
- May contain HTML-specific behavior and edge-case handling
- Produces canonical content models defined in `omniread.core.content`

Consumers should depend on `omniread.core` interfaces wherever possible and
use this package only when HTML-specific behavior is required.
"""


from .scraper import HTMLScraper
from .parser import HTMLParser

__all__ = [
    "HTMLScraper",
    "HTMLParser",
]
omniread/html/parser.py
@@ -1,6 +1,21 @@
"""
HTML parser base implementations for OmniRead.

This module provides reusable HTML parsing utilities built on top of
the abstract parser contracts defined in `omniread.core.parser`.

It supplies:
- Content-type enforcement for HTML inputs
- BeautifulSoup initialization and lifecycle management
- Common helper methods for extracting structured data from HTML elements

Concrete parsers must subclass `HTMLParser` and implement the `parse()` method
to return a structured representation appropriate for their use case.
"""

from typing import Any, Generic, TypeVar, Optional
from abc import abstractmethod

from bs4 import BeautifulSoup, Tag

from omniread.core.content import ContentType, Content

@@ -13,13 +28,37 @@ class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    This class extends the core `BaseParser` with HTML-specific behavior,
    including DOM parsing via BeautifulSoup and reusable extraction helpers.

    Provides reusable helpers for HTML extraction.
    Concrete parsers must explicitly define the return type.

    Characteristics:
    - Accepts only HTML content
    - Owns a parsed BeautifulSoup DOM tree
    - Provides pure helper utilities for common HTML structures

    Concrete subclasses must:
    - Define the output type `T`
    - Implement the `parse()` method
    """

    supported_types = {ContentType.HTML}
    """Set of content types supported by this parser (HTML only)."""

    def __init__(self, content: Content, features: str = "html.parser"):
        """
        Initialize the HTML parser.

        Args:
            content: HTML content to be parsed.
            features: BeautifulSoup parser backend to use
                (e.g., 'html.parser', 'lxml').

        Raises:
            ValueError: If the content is empty or not valid HTML.
        """
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

@@ -32,6 +71,12 @@ class HTMLParser(BaseParser[T], Generic[T]):
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.

        Implementations must fully interpret the HTML DOM and return
        a deterministic, structured output.

        Returns:
            Parsed representation of type `T`.
        """
        raise NotImplementedError

@@ -41,14 +86,42 @@ class HTMLParser(BaseParser[T], Generic[T]):
    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        """
        Extract normalized text from a `<div>` element.

        Args:
            div: BeautifulSoup tag representing a `<div>`.
            separator: String used to separate text nodes.

        Returns:
            Flattened, whitespace-normalized text content.
        """
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        """
        Extract the hyperlink reference from an `<a>` element.

        Args:
            a: BeautifulSoup tag representing an anchor.

        Returns:
            The value of the `href` attribute, or None if absent.
        """
        return a.get("href")

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        """
        Parse an HTML table into a 2D list of strings.

        Args:
            table: BeautifulSoup tag representing a `<table>`.

        Returns:
            A list of rows, where each row is a list of cell text values.
        """
        rows: list[list[str]] = []
        for tr in table.find_all("tr"):
            cells = [

@@ -64,11 +137,30 @@ class HTMLParser(BaseParser[T], Generic[T]):
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        """
        Build a BeautifulSoup DOM tree from raw HTML content.

        Returns:
            Parsed BeautifulSoup document tree.

        Raises:
            ValueError: If the content payload is empty.
        """
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        """
        Extract high-level metadata from the HTML document.

        This includes:
        - Document title
        - `<meta>` tag name/property → content mappings

        Returns:
            Dictionary containing extracted metadata.
        """
        soup = self._soup

        title = soup.title.string.strip() if soup.title and soup.title.string else None
omniread/html/scraper.py
@@ -1,27 +1,97 @@
"""
HTML scraping implementation for OmniRead.

This module provides an HTTP-based scraper for retrieving HTML documents.
It implements the core `BaseScraper` contract using `httpx` as the transport
layer.

This scraper is responsible for:
- Fetching raw HTML bytes over HTTP(S)
- Validating response content type
- Attaching HTTP metadata to the returned content

This scraper is not responsible for:
- Parsing or interpreting HTML
- Retrying failed requests
- Managing crawl policies or rate limiting
"""

import httpx
from typing import Any, Mapping, Optional

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTML scraper using httpx.

    This scraper retrieves HTML documents over HTTP(S) and returns them
    as raw content wrapped in a `Content` object.

    Fetches raw bytes and metadata only.
    The scraper:
    - Uses `httpx.Client` for HTTP requests
    - Enforces an HTML content type
    - Preserves HTTP response metadata

    The scraper does not:
    - Parse HTML
    - Perform retries or backoff
    - Handle non-HTML responses
    """

    def __init__(
        self,
        *,
        client: httpx.Client | None = None,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        """
        Initialize the HTML scraper.

        Args:
            client: Optional pre-configured `httpx.Client`. If omitted,
                a client is created internally.
            timeout: Request timeout in seconds.
            headers: Optional default HTTP headers.
            follow_redirects: Whether to follow HTTP redirects.
        """

        self._client = client or httpx.Client(
            timeout=timeout,
            headers=headers,
            follow_redirects=follow_redirects,
        )
        self.content_type = ContentType.HTML

    def validate_content_type(
        self,
        response: httpx.Response,
    ):
        """
        Validate that the HTTP response contains HTML content.

        Args:
            response: HTTP response returned by `httpx`.

        Raises:
            ValueError: If the `Content-Type` header is missing or does not
                indicate HTML content.
        """

        raw_ct = response.headers.get("Content-Type")
        if not raw_ct:
            raise ValueError("Missing Content-Type header")

        base_ct = raw_ct.split(";", 1)[0].strip().lower()
        if base_ct != self.content_type.value:
            raise ValueError(
                f"Expected HTML content, got '{raw_ct}'"
            )

    def fetch(
        self,

@@ -29,20 +99,36 @@ class HTMLScraper(BaseScraper):
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch an HTML document from the given source.

        Args:
            source: URL of the HTML document.
            metadata: Optional metadata to be merged into the returned content.

        Returns:
            A `Content` instance containing:
            - Raw HTML bytes
            - Source URL
            - HTML content type
            - HTTP response metadata

        Raises:
            httpx.HTTPError: If the HTTP request fails.
            ValueError: If the response is not valid HTML.
        """

        response = self._client.get(source)
        response.raise_for_status()
        self.validate_content_type(response)

        return Content(
            raw=response.content,
            source=source,
            content_type=self.content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
                **(metadata or {}),
            },
        )
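Because the constructor now accepts an injected `httpx.Client`, the transport can be swapped without subclassing the scraper; the tests in conftest.py use exactly this hook with a `MockTransport`. A brief usage sketch, with header value and URL chosen purely for illustration:

```python
# Brief usage sketch: inject a pre-configured httpx.Client into HTMLScraper.
import httpx

from omniread import HTMLScraper

client = httpx.Client(
    headers={"User-Agent": "omniread-example"},  # illustrative header value
    follow_redirects=True,
)
scraper = HTMLScraper(client=client)
content = scraper.fetch("https://example.com")  # raises ValueError on non-HTML responses
```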
omniread/pdf/__init__.py (new file, 25 lines)
@@ -0,0 +1,25 @@
"""
PDF format implementation for OmniRead.

This package provides **PDF-specific implementations** of the core OmniRead
contracts defined in `omniread.core`.

Unlike HTML, PDF handling requires an explicit client layer for document
access. This package therefore includes:
- PDF clients for acquiring raw PDF data
- PDF scrapers that coordinate client access
- PDF parsers that extract structured content from PDF binaries

Public exports from this package represent the supported PDF pipeline
and are safe for consumers to import directly when working with PDFs.
"""

from .client import FileSystemPDFClient
from .scraper import PDFScraper
from .parser import PDFParser

__all__ = [
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
]
omniread/pdf/client.py (new file, 79 lines)
@@ -0,0 +1,79 @@
"""
PDF client abstractions for OmniRead.

This module defines the **client layer** responsible for retrieving raw PDF
bytes from a concrete backing store.

Clients provide low-level access to PDF binaries and are intentionally
decoupled from scraping and parsing logic. They do not perform validation,
interpretation, or content extraction.

Typical backing stores include:
- Local filesystems
- Object storage (S3, GCS, etc.)
- Network file systems
"""

from abc import ABC, abstractmethod
from pathlib import Path


class BasePDFClient(ABC):
    """
    Abstract client responsible for retrieving PDF bytes
    from a specific backing store (filesystem, S3, FTP, etc.).

    Implementations must:
    - Accept a source identifier appropriate to the backing store
    - Return the full PDF binary payload
    - Raise retrieval-specific errors on failure
    """

    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Fetch raw PDF bytes from the given source.

        Args:
            source: Identifier of the PDF location, such as a file path,
                object storage key, or remote reference.

        Returns:
            Raw PDF bytes.

        Raises:
            Exception: Retrieval-specific errors defined by the implementation.
        """
        raise NotImplementedError


class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.

    This client reads PDF files directly from the disk and returns their raw
    binary contents.
    """

    def fetch(self, path: Path) -> bytes:
        """
        Read a PDF file from the local filesystem.

        Args:
            path: Filesystem path to the PDF file.

        Returns:
            Raw PDF bytes.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the path exists but is not a file.
        """

        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {path}")

        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")

        return path.read_bytes()
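The module docstring lists object storage among the typical backing stores. A minimal S3-backed client could look like the sketch below, assuming `boto3` and a bucket-plus-key addressing scheme; neither is part of this change.

```python
# Hypothetical S3-backed client for illustration only; the bucket/key
# addressing convention is an assumption, not part of the diff.
import boto3

from omniread.pdf.client import BasePDFClient


class S3PDFClient(BasePDFClient):
    def __init__(self, bucket: str):
        self._bucket = bucket
        self._s3 = boto3.client("s3")

    def fetch(self, source: str) -> bytes:
        # `source` is treated as the object key inside the configured bucket.
        obj = self._s3.get_object(Bucket=self._bucket, Key=source)
        return obj["Body"].read()
```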
omniread/pdf/parser.py (new file, 49 lines)
@@ -0,0 +1,49 @@
"""
PDF parser base implementations for OmniRead.

This module defines the **PDF-specific parser contract**, extending the
format-agnostic `BaseParser` with constraints appropriate for PDF content.

PDF parsers are responsible for interpreting binary PDF data and producing
structured representations suitable for downstream consumption.
"""

from typing import Generic, TypeVar
from abc import abstractmethod

from omniread.core.content import ContentType
from omniread.core.parser import BaseParser

T = TypeVar("T")


class PDFParser(BaseParser[T], Generic[T]):
    """
    Base PDF parser.

    This class enforces PDF content-type compatibility and provides the
    extension point for implementing concrete PDF parsing strategies.

    Concrete implementations must:
    - Define the output type `T`
    - Implement the `parse()` method
    """

    supported_types = {ContentType.PDF}
    """Set of content types supported by this parser (PDF only)."""

    @abstractmethod
    def parse(self) -> T:
        """
        Parse PDF content into a structured output.

        Implementations must fully interpret the PDF binary payload and
        return a deterministic, structured output.

        Returns:
            Parsed representation of type `T`.

        Raises:
            Exception: Parsing-specific errors as defined by the implementation.
        """
        raise NotImplementedError
omniread/pdf/scraper.py (new file, 71 lines)
@@ -0,0 +1,71 @@
"""
PDF scraping implementation for OmniRead.

This module provides a PDF-specific scraper that coordinates PDF byte
retrieval via a client and normalizes the result into a `Content` object.

The scraper implements the core `BaseScraper` contract while delegating
all storage and access concerns to a `BasePDFClient` implementation.
"""

from typing import Any, Mapping, Optional

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper
from omniread.pdf.client import BasePDFClient


class PDFScraper(BaseScraper):
    """
    Scraper for PDF sources.

    Delegates byte retrieval to a PDF client and normalizes
    output into Content.

    The scraper:
    - Does not perform parsing or interpretation
    - Does not assume a specific storage backend
    - Preserves caller-provided metadata
    """

    def __init__(self, *, client: BasePDFClient):
        """
        Initialize the PDF scraper.

        Args:
            client: PDF client responsible for retrieving raw PDF bytes.
        """
        self._client = client

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch a PDF document from the given source.

        Args:
            source: Identifier of the PDF source as understood by the
                configured PDF client.
            metadata: Optional metadata to attach to the returned content.

        Returns:
            A `Content` instance containing:
            - Raw PDF bytes
            - Source identifier
            - PDF content type
            - Optional metadata

        Raises:
            Exception: Retrieval-specific errors raised by the PDF client.
        """
        raw = self._client.fetch(source)

        return Content(
            raw=raw,
            source=source,
            content_type=ContentType.PDF,
            metadata=dict(metadata) if metadata else None,
        )
requirements.txt
@@ -1,7 +1,19 @@
httpx==0.27.0
beautifulsoup4==4.12.0
pydantic==2.12.3
jinja2==3.1.6
# lxml==5.2.0

# Test Packages
pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0

# Doc Packages
mkdocs==1.6.1
mkdocs-material==9.6.23
neoteroi-mkdocs==1.1.3
pymdown-extensions==10.16.1
mkdocs-swagger-ui-tag==0.7.2
mkdocstrings==1.0.0
mkdocstrings-python==2.0.1
tests/__init__.py (new file, empty)
tests/conftest.py (new file, 90 lines)
@@ -0,0 +1,90 @@
import json
import pytest
import httpx
from pathlib import Path
from jinja2 import Environment, BaseLoader

from omniread import (
    # core
    ContentType,

    # html
    HTMLScraper,

    # pdf
    FileSystemPDFClient,
    PDFScraper,
)


MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"


def render_html(template_path, data_path) -> bytes:
    template_text = Path(template_path).read_text(encoding="utf-8")
    data = json.loads(Path(data_path).read_text(encoding="utf-8"))

    env = Environment(
        loader=BaseLoader(),
        autoescape=False,
    )
    template = env.from_string(template_text)

    rendered = template.render(**data)
    return rendered.encode("utf-8")


def mock_transport(request: httpx.Request) -> httpx.Response:
    """
    httpx MockTransport handler.
    """
    path = request.url.path
    if path not in ['/simple', '/table']:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )
    endpoint = path.split("/")[-1]
    content = render_html(
        MOCK_HTML_DIR / f"{endpoint}.html.jinja",
        MOCK_HTML_DIR / f"{endpoint}.json",
    )

    return httpx.Response(
        status_code=200,
        headers={"Content-Type": ContentType.HTML.value},
        content=content,
        request=request,
    )


@pytest.fixture
def http_scraper() -> HTMLScraper:
    transport = httpx.MockTransport(mock_transport)

    client = httpx.Client(transport=transport)

    return HTMLScraper(client=client)


class MockPDFClient(FileSystemPDFClient):
    """
    Test-only PDF client that routes logical identifiers
    to fixture files.
    """

    def fetch(self, source: str) -> bytes:
        if source in ["simple"]:
            source = MOCK_PDF_DIR / f"{source}.pdf"
        else:
            raise FileNotFoundError(f"No mock PDF route for '{source}'")

        return super().fetch(source)


@pytest.fixture
def pdf_scraper() -> PDFScraper:
    client = MockPDFClient()
    return PDFScraper(client=client)
tests/mocks/html/simple.html.jinja (new file, 11 lines)
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
    <title>{{ title }}</title>
    <meta name="description" content="{{ description }}">
</head>
<body>
    <div id="content">{{ content }}</div>
    <a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>
tests/mocks/html/simple.json (new file, 7 lines)
@@ -0,0 +1,7 @@
{
    "title": "Test Page",
    "description": "Simple test page",
    "content": "Hello World",
    "link_url": "https://example.com",
    "link_text": "Link"
}
tests/mocks/html/table.html.jinja (new file, 31 lines)
@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html>
<head>
    <title>{{ title }}</title>
    <meta name="description" content="{{ description }}">
</head>
<body>
    <h1>{{ heading }}</h1>

    <table id="{{ table_id }}">
        <thead>
            <tr>
                {% for col in columns %}
                <th>{{ col }}</th>
                {% endfor %}
            </tr>
        </thead>
        <tbody>
            {% for row in rows %}
            <tr>
                {% for cell in row %}
                <td>{{ cell }}</td>
                {% endfor %}
            </tr>
            {% endfor %}
        </tbody>
    </table>

    <a href="{{ link_url }}">{{ link_text }}</a>
</body>
</html>
tests/mocks/html/table.json (new file, 14 lines)
@@ -0,0 +1,14 @@
{
    "title": "Table Test Page",
    "description": "HTML page with a table for parsing tests",
    "heading": "Sample Table",
    "table_id": "data-table",
    "columns": ["Name", "Age", "City"],
    "rows": [
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"]
    ],
    "link_url": "https://example.org/details",
    "link_text": "Details"
}
tests/mocks/pdf/simple.pdf (new file, 32 lines)
@@ -0,0 +1,32 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
72 720 Td
(Simple PDF Test) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000010 00000 n
0000000061 00000 n
0000000116 00000 n
0000000203 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
300
%%EOF
tests/test_html_simple.py (new file, 53 lines)
@@ -0,0 +1,53 @@
from typing import Optional

from pydantic import BaseModel
from bs4 import Tag

from omniread import (
    # core
    Content,

    # html
    HTMLParser,
)


class ParsedSimpleHTML(BaseModel):
    title: Optional[str]
    description: Optional[str]
    content: Optional[str]
    link: Optional[str]


class SimpleHTMLParser(HTMLParser[ParsedSimpleHTML]):
    """
    Parser focused on high-level page semantics.
    """

    def parse(self) -> ParsedSimpleHTML:
        soup = self._soup
        meta = self.parse_meta()

        content_div = soup.find("div", id="content")
        link_tag: Tag | None = soup.find("a")

        return ParsedSimpleHTML(
            title=meta["title"],
            description=meta["meta"].get("description"),
            content=self.parse_div(content_div) if content_div else None,
            link=self.parse_link(link_tag) if link_tag else None,
        )


def test_end_to_end_html_simple(http_scraper):
    content: Content = http_scraper.fetch("https://test.local/simple")

    parser = SimpleHTMLParser(content)
    result = parser.parse()

    assert isinstance(result, ParsedSimpleHTML)

    assert result.title == "Test Page"
    assert result.description == "Simple test page"
    assert result.content == "Hello World"
    assert result.link == "https://example.com"
tests/test_html_table.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from typing import Optional

from pydantic import BaseModel

from omniread import (
    # core
    Content,

    # html
    HTMLParser,
)


class ParsedTableHTML(BaseModel):
    title: Optional[str]
    table: list[list[str]]


class TableHTMLParser(HTMLParser[ParsedTableHTML]):
    """
    Parser focused on extracting tabular data.
    """

    def parse(self) -> ParsedTableHTML:
        soup = self._soup

        table_tag = soup.find("table")

        return ParsedTableHTML(
            title=soup.title.string.strip() if soup.title else None,
            table=self.parse_table(table_tag) if table_tag else [],
        )


def test_end_to_end_html_table(http_scraper):
    content: Content = http_scraper.fetch("https://test.local/table")

    parser = TableHTMLParser(content)
    result = parser.parse()

    assert isinstance(result, ParsedTableHTML)

    assert result.title == "Table Test Page"
    assert result.table == [
        ["Name", "Age", "City"],
        ["Alice", "30", "London"],
        ["Bob", "25", "New York"],
        ["Charlie", "35", "Berlin"],
    ]
tests/test_pdf_simple.py (new file, 41 lines)
@@ -0,0 +1,41 @@
from typing import Literal
from pydantic import BaseModel

from omniread import (
    # core
    Content,

    # pdf
    PDFParser,
)


class ParsedPDF(BaseModel):
    size_bytes: int
    magic: Literal[b"%PDF"]


class SimplePDFParser(PDFParser[ParsedPDF]):
    def parse(self) -> ParsedPDF:
        raw = self.content.raw

        if not raw.startswith(b"%PDF"):
            raise ValueError("Not a valid PDF")

        return ParsedPDF(
            size_bytes=len(raw),
            magic=b"%PDF",
        )


def test_end_to_end_pdf_simple(pdf_scraper):
    # --- Scrape (identifier-based, routed in conftest)
    content: Content = pdf_scraper.fetch("simple")

    assert content.raw.startswith(b"%PDF")

    # --- Parse
    parser = SimplePDFParser(content)
    result = parser.parse()

    assert result.magic == b"%PDF"
    assert result.size_bytes > 100