omniread: add basic modules

2025-12-31 14:28:50 +05:30
parent c0959cb8d1
commit 32ee43e77a
10 changed files with 410 additions and 0 deletions

omniread/__init__.py Normal file (+0 lines)

omniread/core/content.py Normal file (+18 lines)

@@ -0,0 +1,18 @@
from enum import Enum
from dataclasses import dataclass
from typing import Any, Mapping, Optional


class ContentType(str, Enum):
    HTML = "text/html"
    PDF = "application/pdf"
    JSON = "application/json"
    XML = "application/xml"


# NOTE: slots=True on dataclasses requires Python 3.10+.
@dataclass(slots=True)
class Content:
    raw: bytes
    source: str
    content_type: Optional[ContentType] = None
    metadata: Optional[Mapping[str, Any]] = None
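
A quick sketch of constructing a Content by hand, e.g. wrapping a local file (the path and metadata values are illustrative, not part of this commit):

from pathlib import Path

from omniread.core.content import Content, ContentType

# Bundle raw bytes with where they came from and what they claim to be.
raw = Path("page.html").read_bytes()
content = Content(
    raw=raw,
    source="page.html",
    content_type=ContentType.HTML,
    metadata={"origin": "local-disk"},
)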

omniread/core/parser.py Normal file (+48 lines)

@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Generic, Set, TypeVar

from .content import Content, ContentType

T = TypeVar("T")


class BaseParser(ABC, Generic[T]):
    """
    Base interface for all parsers.

    A parser is a self-contained object that owns the Content
    it is responsible for interpreting.
    """

    supported_types: Set[ContentType] = set()

    def __init__(self, content: Content):
        self.content = content
        if not self.supports():
            raise ValueError(
                f"{self.__class__.__name__} does not support content type "
                f"{content.content_type!r}"
            )

    @abstractmethod
    def parse(self) -> T:
        """
        Parse the owned content into structured output.

        Returns:
            Parsed, structured representation.
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.
        """
        if not self.supported_types:
            return True
        if self.content.content_type is None:
            return False
        return self.content.content_type in self.supported_types
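
A minimal sketch of a concrete parser built on this interface; JSONParser and its output shape are illustrative, not part of this commit:

import json
from typing import Any

from omniread.core.content import Content, ContentType
from omniread.core.parser import BaseParser

class JSONParser(BaseParser[dict[str, Any]]):
    # Restrict this parser to JSON; BaseParser.__init__ calls supports()
    # and rejects any Content whose type is not in this set.
    supported_types = {ContentType.JSON}

    def parse(self) -> dict[str, Any]:
        # Decode the owned raw bytes into a structured dict.
        return json.loads(self.content.raw)

content = Content(raw=b'{"a": 1}', source="inline", content_type=ContentType.JSON)
print(JSONParser(content).parse())  # {'a': 1}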

omniread/core/scraper.py Normal file (+32 lines)

@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import Any, Mapping, Optional

from .content import Content


class BaseScraper(ABC):
    """
    Base interface for all scrapers.

    A scraper is responsible ONLY for fetching raw content
    (bytes) from a source. It must not interpret or parse it.
    """

    @abstractmethod
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch raw content from the given source.

        Args:
            source: Location identifier (URL, file path, S3 URI, etc.)
            metadata: Optional hints for the scraper (headers, auth, etc.)

        Returns:
            Content object containing raw bytes and metadata.
        """
        raise NotImplementedError
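
To show how this contract is meant to be satisfied, here is a hedged sketch of a local-file scraper (FileScraper is hypothetical, not in this commit):

from pathlib import Path
from typing import Any, Mapping, Optional

from omniread.core.content import Content
from omniread.core.scraper import BaseScraper

class FileScraper(BaseScraper):
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        # Fetch bytes only; deciding what they mean is a parser's job.
        path = Path(source)
        return Content(
            raw=path.read_bytes(),
            source=source,
            metadata={"size_bytes": path.stat().st_size, **dict(metadata or {})},
        )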

omniread/html/parser.py Normal file (+85 lines)

@@ -0,0 +1,85 @@
from abc import abstractmethod
from typing import Any, Optional, TypeVar

from bs4 import BeautifulSoup, Tag

from omniread.core.content import Content, ContentType
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T]):
    """
    Base HTML parser.

    Provides reusable helpers for HTML extraction.
    Concrete parsers must explicitly define the return type.
    """

    supported_types = {ContentType.HTML}

    def __init__(self, content: Content, features: str = "html.parser"):
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------
    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------
    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        return a.get("href")

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        rows: list[list[str]] = []
        for tr in table.find_all("tr"):
            cells = [
                cell.get_text(strip=True)
                for cell in tr.find_all(["td", "th"])
            ]
            if cells:
                rows.append(cells)
        return rows

    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------
    def _get_soup(self) -> BeautifulSoup:
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        soup = self._soup
        title = soup.title.string.strip() if soup.title and soup.title.string else None
        meta = {
            key: tag.get("content")
            for tag in soup.find_all("meta")
            # Skip tags without a usable key so None never becomes a dict key.
            if (key := tag.get("name") or tag.get("property")) and tag.get("content")
        }
        return {
            "title": title,
            "meta": meta,
        }
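
A hedged sketch of a concrete subclass wiring these helpers together (PageSummaryParser is illustrative only, not part of this commit):

from typing import Any

from omniread.core.content import Content, ContentType
from omniread.html.parser import HTMLParser

class PageSummaryParser(HTMLParser[dict[str, Any]]):
    def parse(self) -> dict[str, Any]:
        # Collect every resolvable href alongside the page metadata.
        links = [
            href
            for a in self._soup.find_all("a")
            if (href := self.parse_link(a))
        ]
        return {**self.parse_meta(), "links": links}

html = b'<html><head><title>Hi</title></head><body><a href="/x">x</a></body></html>'
content = Content(raw=html, source="inline", content_type=ContentType.HTML)
print(PageSummaryParser(content).parse())
# {'title': 'Hi', 'meta': {}, 'links': ['/x']}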

omniread/html/scraper.py Normal file (+48 lines)

@@ -0,0 +1,48 @@
from typing import Any, Mapping, Optional

import httpx

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        self.timeout = timeout
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

            # Normalize "text/html; charset=utf-8" down to the bare media type
            # so it matches the ContentType enum that Content expects.
            media_type = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
            try:
                content_type: Optional[ContentType] = ContentType(media_type)
            except ValueError:
                content_type = None

            return Content(
                raw=response.content,
                source=source,
                content_type=content_type,
                metadata={
                    "status_code": response.status_code,
                    "headers": dict(response.headers),
                },
            )
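
Putting the two layers together, a usage sketch (the URL is illustrative; PageSummaryParser is the hypothetical parser sketched above):

from omniread.html.scraper import HTMLScraper

# The scraper fetches bytes; a concrete HTMLParser subclass interprets them.
scraper = HTMLScraper(timeout=10.0, headers={"User-Agent": "omniread/0.0.1"})
content = scraper.fetch("https://example.com")
summary = PageSummaryParser(content).parse()
print(summary["title"], len(summary["links"]))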

pyproject.toml Normal file (+172 lines)

@@ -0,0 +1,172 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "omniread"
version = "0.0.1"
description = "Composable content ingestion framework with pluggable scrapers and parsers for HTML, PDF, and structured data"
readme = "README.md"
requires-python = ">=3.10"  # omniread.core.content uses dataclass(slots=True), a 3.10+ feature
license = { text = "MIT" }
authors = [
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
]
maintainers = [
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
]
keywords = [
"scraping",
"parsing",
"content-ingestion",
"html",
"pdf",
"document-processing",
"pipeline",
"typed",
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Security",
"Topic :: Software Development :: Libraries :: Python Modules",
"Typing :: Typed",
]
dependencies = [
"httpx>=0.27.0",
"beautifulsoup4>=4.12.0",
# "lxml>=5.0.0",
"pypdf>=4.0.0",
]
[project.optional-dependencies]
dev = [
"pytest>=7.4.0",
"pytest-asyncio>=0.21.0",
"pytest-cov>=4.1.0",
"black>=23.0.0",
"ruff>=0.1.0",
"mypy>=1.5.0",
"pre-commit>=3.4.0",
]
all = [
"omniread[dev,fastapi]",
]
[project.urls]
Homepage = "https://git.aetoskia.com/aetos/omniread"
Documentation = "https://git.aetoskia.com/aetos/omniread#readme"
Repository = "https://git.aetoskia.com/aetos/omniread.git"
Issues = "https://git.aetoskia.com/aetos/omniread/issues"
Versions = "https://git.aetoskia.com/aetos/omniread/tags"
[tool.setuptools]
packages = { find = { include = ["omniread*"] } }
[tool.setuptools.package-data]
omniread = ["py.typed"]
[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
"--strict-markers",
"--strict-config",
"--cov=omniread",
"--cov-report=term-missing",
"--cov-report=html",
"--cov-report=xml",
]
[tool.black]
line-length = 88
target-version = ["py310", "py311", "py312", "py313"]
include = '\.pyi?$'
extend-exclude = '''
/(
\.eggs
| \.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| build
| dist
)/
'''
[tool.ruff]
line-length = 88
target-version = "py310"
select = [
"E",
"W",
"F",
"I",
"B",
"C4",
"UP",
]
ignore = [
"E501",
"B008",
"C901",
]
[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]
[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = false
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
follow_imports = "normal"
strict_optional = true
[[tool.mypy.overrides]]
module = [
"bs4.*",
"httpx.*",
]
ignore_missing_imports = true
[tool.coverage.run]
source = ["omniread"]
omit = [
"*/tests/*",
"*/test_*.py",
]
[tool.coverage.report]
exclude_lines = [
"pragma: no cover",
"def __repr__",
"if __name__ == .__main__.:",
"raise AssertionError",
"raise NotImplementedError",
"if TYPE_CHECKING:",
"@abstractmethod",
]

requirements.txt Normal file (+7 lines)

@@ -0,0 +1,7 @@
httpx==0.27.0
beautifulsoup4==4.12.0
# lxml==5.2.0
pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0