109 lines
2.9 KiB
Python
109 lines
2.9 KiB
Python
"""
Abstract parsing contracts for OmniRead.

---

## Summary

This module defines the **format-agnostic parser interface** used to transform
raw content into structured, typed representations.

Parsers are responsible for:

- Interpreting a single `Content` instance
- Validating compatibility with the content type
- Producing a structured output suitable for downstream consumers

Parsers are not responsible for:

- Fetching or acquiring content
- Performing retries or error recovery
- Managing multiple content sources
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Generic, TypeVar, Set
|
|
|
|
from .content import Content, ContentType
|
|
|
|
T = TypeVar("T")


class BaseParser(ABC, Generic[T]):
    """
    Base interface for all parsers.

    A parser wraps exactly one `Content` instance, checks at construction
    time that the content's type is one it can handle, and exposes `parse()`
    to turn that content into a structured value of type `T`.

    Notes:
        **Guarantees:**

        - A parser is a self-contained object that owns the Content it is
          responsible for interpreting
        - Consumers may rely on early validation of content compatibility
          and type-stable return values from `parse()`

        **Responsibilities:**

        - Implementations must declare supported content types via
          `supported_types`
        - Implementations must raise parsing-specific exceptions from
          `parse()`
        - Implementations must remain deterministic for a given input
    """

    # Class-level default, shared by every parser class that does not
    # override it. Subclasses should assign their own set instead of mutating
    # this one in place, or the change becomes visible to all of them.
    supported_types: Set[ContentType] = set()
    """
    Set of content types supported by this parser. An empty set indicates
    that the parser is content-type agnostic.
    """

    def __init__(self, content: Content):
        """
        Initialize the parser with content to be parsed.

        Args:
            content (Content):
                Content instance to be parsed.

        Raises:
            ValueError:
                If the content type is not supported by this parser.
        """
        # Must be stored before validating: `supports()` reads `self.content`.
        self.content = content

        if not self.supports():
            raise ValueError(
                f"{self.__class__.__name__} does not support content type "
                f"{content.content_type!r}"
            )

    @abstractmethod
    def parse(self) -> T:
        """
        Parse the owned content into structured output.

        Returns:
            T:
                Parsed, structured representation.

        Raises:
            Exception:
                Parsing-specific errors as defined by the implementation.

        Notes:
            **Responsibilities:**

            - Implementations must fully consume the provided content and
              return a deterministic, structured output
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.

        Returns:
            bool:
                True if the content type is supported; False otherwise.
        """
        # An empty `supported_types` means "accept anything"; the content's
        # type is not inspected at all in that case.
        if not self.supported_types:
            return True

        # A parser restricted to specific types rejects untyped content.
        if self.content.content_type is None:
            return False

        return self.content.content_type in self.supported_types
|