omniread: add basic modules

omniread/__init__.py (new file, 0 lines)
omniread/core/__init__.py (new file, 0 lines)

omniread/core/content.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from enum import Enum
from dataclasses import dataclass
from typing import Any, Mapping, Optional


class ContentType(str, Enum):
    HTML = "text/html"
    PDF = "application/pdf"
    JSON = "application/json"
    XML = "application/xml"


@dataclass(slots=True)
class Content:
    raw: bytes
    source: str
    content_type: Optional[ContentType] = None
    metadata: Optional[Mapping[str, Any]] = None
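
A quick sketch of how Content is meant to be constructed (the values are hypothetical, not part of this commit; note that ContentType doubles as its MIME string because it subclasses str):

from omniread.core.content import Content, ContentType

# Hypothetical values for illustration only.
page = Content(
    raw=b"<html><title>Hi</title></html>",
    source="https://example.com/",
    content_type=ContentType.HTML,
)

assert page.content_type == ContentType.HTML
assert page.content_type.value == "text/html"  # str-backed enum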

omniread/core/parser.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Generic, TypeVar, Set

from .content import Content, ContentType

T = TypeVar("T")


class BaseParser(ABC, Generic[T]):
    """
    Base interface for all parsers.

    A parser is a self-contained object that owns the Content
    it is responsible for interpreting.
    """

    supported_types: Set[ContentType] = set()

    def __init__(self, content: Content):
        self.content = content

        if not self.supports():
            raise ValueError(
                f"{self.__class__.__name__} does not support content type "
                f"{content.content_type!r}"
            )

    @abstractmethod
    def parse(self) -> T:
        """
        Parse the owned content into structured output.

        Returns:
            Parsed, structured representation.
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.
        """
        if not self.supported_types:
            return True

        if self.content.content_type is None:
            return False

        return self.content.content_type in self.supported_types
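
To make the contract concrete, a minimal subclass might look like the following sketch (JSONParser is a hypothetical example, not part of this commit). Because __init__ calls supports(), constructing it with non-JSON Content raises ValueError immediately:

import json
from typing import Any

from omniread.core.content import ContentType
from omniread.core.parser import BaseParser


class JSONParser(BaseParser[Any]):
    """Hypothetical parser: decodes the owned raw bytes as JSON."""

    supported_types = {ContentType.JSON}

    def parse(self) -> Any:
        return json.loads(self.content.raw)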

omniread/core/scraper.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import Any, Mapping, Optional

from .content import Content


class BaseScraper(ABC):
    """
    Base interface for all scrapers.

    A scraper is responsible ONLY for fetching raw content
    (bytes) from a source. It must not interpret or parse it.
    """

    @abstractmethod
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch raw content from the given source.

        Args:
            source: Location identifier (URL, file path, S3 URI, etc.)
            metadata: Optional hints for the scraper (headers, auth, etc.)

        Returns:
            Content object containing raw bytes and metadata.
        """
        raise NotImplementedError
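
As an illustration of the fetch-only contract, a local-filesystem scraper could be as small as this sketch (FileScraper is hypothetical, not part of this commit):

from pathlib import Path
from typing import Any, Mapping, Optional

from omniread.core.content import Content
from omniread.core.scraper import BaseScraper


class FileScraper(BaseScraper):
    """Hypothetical scraper: reads raw bytes from a local path."""

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        path = Path(source)
        # Fetch only: no decoding, no parsing, no content-type sniffing.
        return Content(
            raw=path.read_bytes(),
            source=source,
            metadata={"size": path.stat().st_size, **(metadata or {})},
        )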

omniread/html/__init__.py (new file, 0 lines)

omniread/html/parser.py (new file, 85 lines)
@@ -0,0 +1,85 @@
from typing import Any, Generic, TypeVar, Optional

from abc import abstractmethod
from bs4 import BeautifulSoup, Tag

from omniread.core.content import ContentType, Content
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    Provides reusable helpers for HTML extraction.
    Concrete parsers must explicitly define the return type.
    """

    supported_types = {ContentType.HTML}

    def __init__(self, content: Content, features: str = "html.parser"):
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------

    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------

    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        return a.get("href")

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        rows: list[list[str]] = []
        for tr in table.find_all("tr"):
            cells = [
                cell.get_text(strip=True)
                for cell in tr.find_all(["td", "th"])
            ]
            if cells:
                rows.append(cells)
        return rows

    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        soup = self._soup

        title = soup.title.string.strip() if soup.title and soup.title.string else None

        # Require a name/property key as well as content, so meta tags like
        # <meta http-equiv="refresh" content="..."> cannot produce a None key.
        meta = {
            tag.get("name") or tag.get("property"): tag.get("content")
            for tag in soup.find_all("meta")
            if (tag.get("name") or tag.get("property")) and tag.get("content")
        }

        return {
            "title": title,
            "meta": meta,
        }
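
A concrete parser built on these helpers might look like the following sketch (LinkListParser is hypothetical, not part of this commit):

from omniread.html.parser import HTMLParser


class LinkListParser(HTMLParser[list[str]]):
    """Hypothetical parser: collects every href on the page."""

    def parse(self) -> list[str]:
        return [
            href
            for a in self._soup.find_all("a")
            if (href := self.parse_link(a)) is not None
        ]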

omniread/html/scraper.py (new file, 56 lines)
@@ -0,0 +1,56 @@
import httpx
from typing import Any, Mapping, Optional

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        self.timeout = timeout
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

        # Normalize a raw header like "text/html; charset=utf-8" into a
        # ContentType member; fall back to None for unrecognized MIME types.
        mime = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
        try:
            content_type: Optional[ContentType] = ContentType(mime)
        except ValueError:
            content_type = None

        return Content(
            raw=response.content,
            source=source,
            content_type=content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
            },
        )
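
Wiring the two halves together, an end-to-end flow might look like this sketch (TitleParser and the URL are hypothetical; the scraper only fetches, the parser owns and interprets the result):

from typing import Optional

from omniread.html.parser import HTMLParser
from omniread.html.scraper import HTMLScraper


class TitleParser(HTMLParser[Optional[str]]):
    """Hypothetical parser: returns just the page <title>."""

    def parse(self) -> Optional[str]:
        return self.parse_meta()["title"]


scraper = HTMLScraper(timeout=10.0, headers={"User-Agent": "omniread/0.0.1"})
content = scraper.fetch("https://example.com/")  # bytes + metadata only
title = TitleParser(content).parse()             # raises ValueError if not HTML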

pyproject.toml (new file, 171 lines)
@@ -0,0 +1,171 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "omniread"
version = "0.0.1"
description = "Composable content ingestion framework with pluggable scrapers and parsers for HTML, PDF, and structured data"
readme = "README.md"
# dataclass(slots=True) in omniread.core.content requires Python 3.10+
requires-python = ">=3.10"
license = { text = "MIT" }

authors = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]
maintainers = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]

keywords = [
    "scraping",
    "parsing",
    "content-ingestion",
    "html",
    "pdf",
    "document-processing",
    "pipeline",
    "typed",
]

classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Text Processing :: Markup :: HTML",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Typing :: Typed",
]

dependencies = [
    "httpx>=0.27.0",
    "beautifulsoup4>=4.12.0",
    # "lxml>=5.0.0",
    "pypdf>=4.0.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "pytest-asyncio>=0.21.0",
    "pytest-cov>=4.1.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
    "mypy>=1.5.0",
    "pre-commit>=3.4.0",
]

all = [
    "omniread[dev]",
]

[project.urls]
Homepage = "https://git.aetoskia.com/aetos/omniread"
Documentation = "https://git.aetoskia.com/aetos/omniread#readme"
Repository = "https://git.aetoskia.com/aetos/omniread.git"
Issues = "https://git.aetoskia.com/aetos/omniread/issues"
Versions = "https://git.aetoskia.com/aetos/omniread/tags"

[tool.setuptools]
packages = { find = { include = ["omniread*"] } }

[tool.setuptools.package-data]
omniread = ["py.typed"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "--strict-markers",
    "--strict-config",
    "--cov=omniread",
    "--cov-report=term-missing",
    "--cov-report=html",
    "--cov-report=xml",
]

[tool.black]
line-length = 88
target-version = ["py310", "py311", "py312", "py313"]
include = '\.pyi?$'
extend-exclude = '''
/(
    \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

[tool.ruff]
line-length = 88
target-version = "py310"
select = [
    "E",
    "W",
    "F",
    "I",
    "B",
    "C4",
    "UP",
]
ignore = [
    "E501",
    "B008",
    "C901",
]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = false
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
follow_imports = "normal"
strict_optional = true

[[tool.mypy.overrides]]
module = [
    "bs4.*",
]
ignore_missing_imports = true

[tool.coverage.run]
source = ["omniread"]
omit = [
    "*/tests/*",
    "*/test_*.py",
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "if __name__ == .__main__.:",
    "raise AssertionError",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
    "@abstractmethod",
]

requirements.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
httpx==0.27.0
beautifulsoup4==4.12.0
# lxml==5.2.0

pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0