omniread: add basic modules
omniread/__init__.py (new file, empty)
omniread/core/__init__.py (new file, empty)
omniread/core/content.py (new file)
@@ -0,0 +1,18 @@
from enum import Enum
from dataclasses import dataclass
from typing import Any, Mapping, Optional


class ContentType(str, Enum):
    HTML = "text/html"
    PDF = "application/pdf"
    JSON = "application/json"
    XML = "application/xml"


@dataclass(slots=True)
class Content:
    raw: bytes
    source: str
    content_type: Optional[ContentType] = None
    metadata: Optional[Mapping[str, Any]] = None
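
Usage note (not part of the diff): a minimal sketch of building a Content by hand; the values are illustrative.

from omniread.core.content import Content, ContentType

# Hypothetical example values; any bytes/source pair works.
page = Content(
    raw=b"<html><title>Hi</title></html>",
    source="https://example.com/",
    content_type=ContentType.HTML,
    metadata={"fetched_via": "manual"},
)
assert page.content_type is ContentType.HTML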
omniread/core/parser.py (new file)
@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Generic, TypeVar, Set

from .content import Content, ContentType

T = TypeVar("T")


class BaseParser(ABC, Generic[T]):
    """
    Base interface for all parsers.

    A parser is a self-contained object that owns the Content
    it is responsible for interpreting.
    """

    supported_types: Set[ContentType] = set()

    def __init__(self, content: Content):
        self.content = content

        if not self.supports():
            raise ValueError(
                f"{self.__class__.__name__} does not support content type "
                f"{content.content_type!r}"
            )

    @abstractmethod
    def parse(self) -> T:
        """
        Parse the owned content into structured output.

        Returns:
            Parsed, structured representation.
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.
        """
        if not self.supported_types:
            return True

        if self.content.content_type is None:
            return False

        return self.content.content_type in self.supported_types
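
For orientation, a sketch of a concrete parser under this interface; JSONParser here is hypothetical, not part of this commit.

import json
from typing import Any

from omniread.core.content import Content, ContentType
from omniread.core.parser import BaseParser


class JSONParser(BaseParser[dict[str, Any]]):
    """Hypothetical parser: decodes the owned bytes as a JSON object."""

    supported_types = {ContentType.JSON}

    def parse(self) -> dict[str, Any]:
        return json.loads(self.content.raw)


content = Content(raw=b'{"a": 1}', source="inline", content_type=ContentType.JSON)
print(JSONParser(content).parse())  # {'a': 1}

Because supported_types is declared, constructing JSONParser with any other content type raises ValueError from BaseParser.__init__.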
omniread/core/scraper.py (new file)
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import Any, Mapping, Optional

from .content import Content


class BaseScraper(ABC):
    """
    Base interface for all scrapers.

    A scraper is responsible ONLY for fetching raw content
    (bytes) from a source. It must not interpret or parse it.
    """

    @abstractmethod
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch raw content from the given source.

        Args:
            source: Location identifier (URL, file path, S3 URI, etc.)
            metadata: Optional hints for the scraper (headers, auth, etc.)

        Returns:
            Content object containing raw bytes and metadata.
        """
        raise NotImplementedError
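
A sketch of a non-HTTP implementation, to show the contract's scope; FileScraper is hypothetical and not part of this commit.

from pathlib import Path
from typing import Any, Mapping, Optional

from omniread.core.content import Content
from omniread.core.scraper import BaseScraper


class FileScraper(BaseScraper):
    """Hypothetical scraper: reads raw bytes from the local filesystem."""

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        path = Path(source)
        # Fetch only: no interpretation, so content_type stays None.
        return Content(
            raw=path.read_bytes(),
            source=source,
            metadata={"size": path.stat().st_size, **(metadata or {})},
        )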
omniread/html/__init__.py (new file, empty)
omniread/html/parser.py (new file)
@@ -0,0 +1,89 @@
from abc import abstractmethod
from typing import Any, Generic, TypeVar, Optional

from bs4 import BeautifulSoup, Tag

from omniread.core.content import ContentType, Content
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    Provides reusable helpers for HTML extraction.
    Concrete parsers must explicitly define the return type.
    """

    supported_types = {ContentType.HTML}

    def __init__(self, content: Content, features: str = "html.parser"):
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------

    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------

    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        # Tag.get can return a list for multi-valued attributes;
        # href is single-valued, so anything else maps to None.
        href = a.get("href")
        return href if isinstance(href, str) else None

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        rows: list[list[str]] = []
        for tr in table.find_all("tr"):
            cells = [
                cell.get_text(strip=True)
                for cell in tr.find_all(["td", "th"])
            ]
            if cells:
                rows.append(cells)
        return rows

    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        soup = self._soup

        title = soup.title.string.strip() if soup.title and soup.title.string else None

        meta = {
            tag.get("name") or tag.get("property"): tag.get("content")
            for tag in soup.find_all("meta")
            # Skip tags with no usable key (e.g. <meta charset=...>).
            if tag.get("content") and (tag.get("name") or tag.get("property"))
        }

        return {
            "title": title,
            "meta": meta,
        }
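
A sketch of a concrete subclass wiring the helpers together; PageSummaryParser is hypothetical, not part of this commit.

from typing import Any

from omniread.core.content import Content, ContentType
from omniread.html.parser import HTMLParser


class PageSummaryParser(HTMLParser[dict[str, Any]]):
    """Hypothetical parser: page metadata plus all outgoing links."""

    def parse(self) -> dict[str, Any]:
        summary = self.parse_meta()
        summary["links"] = [
            href
            for a in self._soup.find_all("a")
            if (href := self.parse_link(a)) is not None
        ]
        return summary


html = b'<html><title>Demo</title><body><a href="/x">x</a></body></html>'
content = Content(raw=html, source="inline", content_type=ContentType.HTML)
print(PageSummaryParser(content).parse())
# {'title': 'Demo', 'meta': {}, 'links': ['/x']}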
omniread/html/scraper.py (new file)
@@ -0,0 +1,57 @@
from typing import Any, Mapping, Optional

import httpx

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        self.timeout = timeout
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

        # Normalize the MIME type (strip charset parameters) so it can be
        # mapped onto ContentType; unknown types fall back to None.
        mime = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
        try:
            content_type: Optional[ContentType] = ContentType(mime)
        except ValueError:
            content_type = None

        return Content(
            raw=response.content,
            source=source,
            content_type=content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
            },
        )
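
End-to-end, the scraper/parser split looks like this; MetaParser and the URL are illustrative, not part of this commit.

from typing import Any

from omniread.html.parser import HTMLParser
from omniread.html.scraper import HTMLScraper


class MetaParser(HTMLParser[dict[str, Any]]):
    """Hypothetical parser exposing the built-in parse_meta helper."""

    def parse(self) -> dict[str, Any]:
        return self.parse_meta()


scraper = HTMLScraper(timeout=10.0, headers={"User-Agent": "omniread-demo"})
content = scraper.fetch("https://example.com/")  # fetching: raw bytes only
print(MetaParser(content).parse()["title"])      # parsing: interpretation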
pyproject.toml (new file)
@@ -0,0 +1,171 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "omniread"
version = "0.0.1"
description = "Composable content ingestion framework with pluggable scrapers and parsers for HTML, PDF, and structured data"
readme = "README.md"
# dataclass(slots=True) in omniread/core/content.py requires Python 3.10+.
requires-python = ">=3.10"
license = { text = "MIT" }

authors = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]
maintainers = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]

keywords = [
    "scraping",
    "parsing",
    "content-ingestion",
    "html",
    "pdf",
    "document-processing",
    "pipeline",
    "typed",
]

classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Typing :: Typed",
]

dependencies = [
    "httpx>=0.27.0",
    "beautifulsoup4>=4.12.0",
    # "lxml>=5.0.0",
    "pypdf>=4.0.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "pytest-asyncio>=0.21.0",
    "pytest-cov>=4.1.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
    "mypy>=1.5.0",
    "pre-commit>=3.4.0",
]

all = [
    "omniread[dev]",
]

[project.urls]
Homepage = "https://git.aetoskia.com/aetos/omniread"
Documentation = "https://git.aetoskia.com/aetos/omniread#readme"
Repository = "https://git.aetoskia.com/aetos/omniread.git"
Issues = "https://git.aetoskia.com/aetos/omniread/issues"
Versions = "https://git.aetoskia.com/aetos/omniread/tags"

[tool.setuptools]
packages = { find = { include = ["omniread*"] } }

[tool.setuptools.package-data]
omniread = ["py.typed"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "--strict-markers",
    "--strict-config",
    "--cov=omniread",
    "--cov-report=term-missing",
    "--cov-report=html",
    "--cov-report=xml",
]

[tool.black]
line-length = 88
target-version = ["py310", "py311", "py312", "py313"]
include = '\.pyi?$'
extend-exclude = '''
/(
    \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

[tool.ruff]
line-length = 88
target-version = "py310"
select = [
    "E",
    "W",
    "F",
    "I",
    "B",
    "C4",
    "UP",
]
ignore = [
    "E501",
    "B008",
    "C901",
]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = false
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
follow_imports = "normal"
strict_optional = true

[[tool.mypy.overrides]]
module = [
    "bs4.*",
    "httpx.*",
]
ignore_missing_imports = true

[tool.coverage.run]
source = ["omniread"]
omit = [
    "*/tests/*",
    "*/test_*.py",
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "if __name__ == .__main__.:",
    "raise AssertionError",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
    "@abstractmethod",
]
requirements.txt (new file)
@@ -0,0 +1,7 @@
httpx==0.27.0
beautifulsoup4==4.12.0
# lxml==5.2.0

pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0