76 lines
1.9 KiB
Python
76 lines
1.9 KiB
Python
"""
Canonical content models for OmniRead.

---

## Summary

This module defines the **format-agnostic content representation** used across
all parsers and scrapers in OmniRead.

The models defined here represent *what* was extracted, not *how* it was
retrieved or parsed. Format-specific behavior and metadata must not alter
the semantic meaning of these models.
"""
|
|
|
|
from enum import Enum
|
|
from dataclasses import dataclass
|
|
from typing import Any, Mapping, Optional
|
|
|
|
|
|
class ContentType(str, Enum):
    """
    MIME types recognized for extracted content.

    Notes:
        **Guarantees:**

        - Each member names the media type that was declared for, or inferred from, the content source
        - The enum is mainly used to route content to the matching parser or downstream consumer
    """

    HTML = "text/html"
    """Content of an HTML document."""

    PDF = "application/pdf"
    """Content of a PDF document."""

    JSON = "application/json"
    """Content of a JSON document."""

    XML = "application/xml"
    """Content of an XML document."""
|
|
|
|
|
|
@dataclass(slots=True)
class Content:
    """
    Format-neutral container for a piece of extracted content.

    Notes:
        **Responsibilities:**

        - Holds a raw content payload together with the minimal contextual metadata that describes its origin and type
        - Serves as the primary exchange format between Scrapers, Parsers, and Downstream consumers
    """

    raw: bytes
    """
    The raw bytes of the content, as retrieved from the source.
    """

    source: str
    """
    Where the content came from: a URL, a file path, or a logical name.
    """

    content_type: Optional[ContentType] = None
    """
    MIME type of the content when it is known; `None` otherwise.
    """

    metadata: Optional[Mapping[str, Any]] = None
    """
    Implementation-defined metadata attached to the content, if any (e.g., headers, encoding hints, extraction notes).
    """
|