# omniread/omniread/core/content.py

"""
Canonical content models for OmniRead.

---

## Summary

This module defines the **format-agnostic content representation** used across
all parsers and scrapers in OmniRead.

The models defined here represent *what* was extracted, not *how* it was
retrieved or parsed. Format-specific behavior and metadata must not alter
the semantic meaning of these models.
"""

from enum import Enum
from dataclasses import dataclass
from typing import Any, Mapping, Optional


class ContentType(str, Enum):
    """
    Media (MIME) types that a piece of extracted content can be tagged with.

    Notes:
        **Guarantees:**

            - Every member's value is the MIME string of its format; since the enum also derives from `str`, a member compares equal to that plain string
            - A member records the declared or inferred media type of the content source
            - Its main use is routing content to the matching parser or downstream consumer
    """

    HTML = "text/html"
    """Content is an HTML document."""

    PDF = "application/pdf"
    """Content is a PDF document."""

    JSON = "application/json"
    """Content is a JSON document."""

    XML = "application/xml"
    """Content is an XML document."""


@dataclass(slots=True)
class Content:
    """
    Format-agnostic container for a single extracted payload.

    Notes:
        **Responsibilities:**

            - Bundles the raw payload bytes with the small amount of context needed to interpret them: where they came from and, when known, their media type
            - Acts as the common exchange format passed between Scrapers, Parsers, and Downstream consumers

        **Construction:**

            - `raw` and `source` are required; `content_type` and `metadata` both default to `None`
    """

    raw: bytes
    """Payload bytes exactly as they were retrieved from the source."""

    source: str
    """Where the payload came from: a URL, a file path, or a logical name."""

    content_type: Optional[ContentType] = None
    """MIME type of the payload when known; `None` otherwise."""

    metadata: Optional[Mapping[str, Any]] = None
    """
    Implementation-defined extras attached to the payload (e.g., headers, encoding hints, extraction notes); `None` when absent.
    """