Files
omniread/omniread/core/parser.py

109 lines
2.9 KiB
Python

"""
Abstract parsing contracts for OmniRead.
---
## Summary
This module defines the **format-agnostic parser interface** used to transform
raw content into structured, typed representations.
Parsers are responsible for:
- Interpreting a single `Content` instance
- Validating compatibility with the content type
- Producing a structured output suitable for downstream consumers
Parsers are not responsible for:
- Fetching or acquiring content
- Performing retries or error recovery
- Managing multiple content sources
"""
from abc import ABC, abstractmethod
from typing import Generic, TypeVar, Set
from .content import Content, ContentType
# Type variable for the structured result type that a parser's `parse()` returns.
T = TypeVar("T")
class BaseParser(ABC, Generic[T]):
    """
    Abstract base class that every parser derives from.

    Notes:
        **Guarantees:**

        - Each parser instance holds the single `Content` object it was constructed with and is responsible for interpreting it
        - Content compatibility is checked at construction time, and `parse()` returns a value of the parser's declared type `T`

        **Responsibilities:**

        - Subclasses declare the content types they accept through `supported_types`
        - Subclasses signal failures from `parse()` with parsing-specific exceptions
        - Subclasses produce the same output for the same input
    """

    supported_types: Set[ContentType] = set()
    """
    Content types this parser accepts. When the set is empty, the parser accepts content of any type.
    """

    def __init__(self, content: Content):
        """
        Bind the parser to the content it will parse.

        Args:
            content (Content):
                The content instance this parser will interpret.

        Raises:
            ValueError:
                If `supports()` reports that the content's type is not handled by this parser.
        """
        # Stored before the check because `supports()` reads `self.content`.
        self.content = content
        if self.supports():
            return
        raise ValueError(
            f"{self.__class__.__name__} does not support content type "
            f"{content.content_type!r}"
        )

    @abstractmethod
    def parse(self) -> T:
        """
        Convert the owned content into its structured form.

        Returns:
            T:
                The structured result of parsing.

        Raises:
            Exception:
                Parsing-specific errors; the concrete types are chosen by the implementation.

        Notes:
            **Responsibilities:**

            - Implementations consume the whole of the provided content and return a deterministic, structured result
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Report whether the owned content's type is one this parser accepts.

        Returns:
            bool:
                True when `supported_types` is empty, or when the content's type is not None and is a member of `supported_types`; False otherwise.
        """
        accepted = self.supported_types
        if not accepted:
            # An empty set means the parser is content-type agnostic.
            return True
        declared = self.content.content_type
        # Content without a declared type never matches a non-empty set.
        return declared is not None and declared in accepted