omniread: add basic modules
omniread/__init__.py (new file, empty)
omniread/core/__init__.py (new file, empty)
omniread/core/content.py (new file)
@@ -0,0 +1,18 @@
from enum import Enum
from dataclasses import dataclass
from typing import Any, Mapping, Optional


class ContentType(str, Enum):
    HTML = "text/html"
    PDF = "application/pdf"
    JSON = "application/json"
    XML = "application/xml"


@dataclass(slots=True)
class Content:
    raw: bytes
    source: str
    content_type: Optional[ContentType] = None
    metadata: Optional[Mapping[str, Any]] = None
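
Usage note (not part of the diff): a minimal sketch of building a Content by hand; the values are illustrative.

from omniread.core.content import Content, ContentType

# Hypothetical example values; any bytes/source pair works.
page = Content(
    raw=b"<html><title>Hi</title></html>",
    source="https://example.com/",
    content_type=ContentType.HTML,
    metadata={"fetched_via": "manual"},
)
assert page.content_type is ContentType.HTML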
omniread/core/parser.py (new file)
@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Generic, TypeVar, Set

from .content import Content, ContentType

T = TypeVar("T")


class BaseParser(ABC, Generic[T]):
    """
    Base interface for all parsers.

    A parser is a self-contained object that owns the Content
    it is responsible for interpreting.
    """

    supported_types: Set[ContentType] = set()

    def __init__(self, content: Content):
        self.content = content

        if not self.supports():
            raise ValueError(
                f"{self.__class__.__name__} does not support content type "
                f"{content.content_type!r}"
            )

    @abstractmethod
    def parse(self) -> T:
        """
        Parse the owned content into structured output.

        Returns:
            Parsed, structured representation.
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.
        """
        if not self.supported_types:
            return True

        if self.content.content_type is None:
            return False

        return self.content.content_type in self.supported_types
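
For orientation, a sketch of a concrete parser under this interface; JSONParser here is hypothetical, not part of this commit.

import json
from typing import Any

from omniread.core.content import Content, ContentType
from omniread.core.parser import BaseParser


class JSONParser(BaseParser[dict[str, Any]]):
    """Hypothetical parser: decodes the owned bytes as a JSON object."""

    supported_types = {ContentType.JSON}

    def parse(self) -> dict[str, Any]:
        return json.loads(self.content.raw)


content = Content(raw=b'{"a": 1}', source="inline", content_type=ContentType.JSON)
print(JSONParser(content).parse())  # {'a': 1}

Because supported_types is declared, constructing JSONParser with any other content type raises ValueError from BaseParser.__init__.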
omniread/core/scraper.py (new file)
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import Any, Mapping, Optional

from .content import Content


class BaseScraper(ABC):
    """
    Base interface for all scrapers.

    A scraper is responsible ONLY for fetching raw content
    (bytes) from a source. It must not interpret or parse it.
    """

    @abstractmethod
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch raw content from the given source.

        Args:
            source: Location identifier (URL, file path, S3 URI, etc.)
            metadata: Optional hints for the scraper (headers, auth, etc.)

        Returns:
            Content object containing raw bytes and metadata.
        """
        raise NotImplementedError
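
A sketch of a non-HTTP implementation, to show the contract's scope; FileScraper is hypothetical and not part of this commit.

from pathlib import Path
from typing import Any, Mapping, Optional

from omniread.core.content import Content
from omniread.core.scraper import BaseScraper


class FileScraper(BaseScraper):
    """Hypothetical scraper: reads raw bytes from the local filesystem."""

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        path = Path(source)
        # Fetch only: no interpretation, so content_type stays None.
        return Content(
            raw=path.read_bytes(),
            source=source,
            metadata={"size": path.stat().st_size, **(metadata or {})},
        )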
omniread/html/__init__.py (new file, empty)
omniread/html/parser.py (new file)
@@ -0,0 +1,89 @@
from abc import abstractmethod
from typing import Any, Generic, TypeVar, Optional

from bs4 import BeautifulSoup, Tag

from omniread.core.content import ContentType, Content
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    Provides reusable helpers for HTML extraction.
    Concrete parsers must explicitly define the return type.
    """

    supported_types = {ContentType.HTML}

    def __init__(self, content: Content, features: str = "html.parser"):
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------

    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------

    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        # Tag.get can return a list for multi-valued attributes;
        # href is single-valued, so anything else maps to None.
        href = a.get("href")
        return href if isinstance(href, str) else None

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        rows: list[list[str]] = []
        for tr in table.find_all("tr"):
            cells = [
                cell.get_text(strip=True)
                for cell in tr.find_all(["td", "th"])
            ]
            if cells:
                rows.append(cells)
        return rows

    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        soup = self._soup

        title = soup.title.string.strip() if soup.title and soup.title.string else None

        meta = {
            tag.get("name") or tag.get("property"): tag.get("content")
            for tag in soup.find_all("meta")
            # Skip tags with no usable key (e.g. <meta charset=...>).
            if tag.get("content") and (tag.get("name") or tag.get("property"))
        }

        return {
            "title": title,
            "meta": meta,
        }
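
A sketch of a concrete subclass wiring the helpers together; PageSummaryParser is hypothetical, not part of this commit.

from typing import Any

from omniread.core.content import Content, ContentType
from omniread.html.parser import HTMLParser


class PageSummaryParser(HTMLParser[dict[str, Any]]):
    """Hypothetical parser: page metadata plus all outgoing links."""

    def parse(self) -> dict[str, Any]:
        summary = self.parse_meta()
        summary["links"] = [
            href
            for a in self._soup.find_all("a")
            if (href := self.parse_link(a)) is not None
        ]
        return summary


html = b'<html><title>Demo</title><body><a href="/x">x</a></body></html>'
content = Content(raw=html, source="inline", content_type=ContentType.HTML)
print(PageSummaryParser(content).parse())
# {'title': 'Demo', 'meta': {}, 'links': ['/x']}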
omniread/html/scraper.py (new file)
@@ -0,0 +1,57 @@
from typing import Any, Mapping, Optional

import httpx

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        self.timeout = timeout
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

        # Normalize the MIME type (strip charset parameters) so it can be
        # mapped onto ContentType; unknown types fall back to None.
        mime = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
        try:
            content_type: Optional[ContentType] = ContentType(mime)
        except ValueError:
            content_type = None

        return Content(
            raw=response.content,
            source=source,
            content_type=content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
            },
        )
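
End-to-end, the scraper/parser split looks like this; MetaParser and the URL are illustrative, not part of this commit.

from typing import Any

from omniread.html.parser import HTMLParser
from omniread.html.scraper import HTMLScraper


class MetaParser(HTMLParser[dict[str, Any]]):
    """Hypothetical parser exposing the built-in parse_meta helper."""

    def parse(self) -> dict[str, Any]:
        return self.parse_meta()


scraper = HTMLScraper(timeout=10.0, headers={"User-Agent": "omniread-demo"})
content = scraper.fetch("https://example.com/")  # fetching: raw bytes only
print(MetaParser(content).parse()["title"])      # parsing: interpretation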
pyproject.toml (new file)
@@ -0,0 +1,171 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "omniread"
version = "0.0.1"
description = "Composable content ingestion framework with pluggable scrapers and parsers for HTML, PDF, and structured data"
readme = "README.md"
# dataclass(slots=True) in omniread/core/content.py requires Python 3.10+.
requires-python = ">=3.10"
license = { text = "MIT" }

authors = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]
maintainers = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]

keywords = [
    "scraping",
    "parsing",
    "content-ingestion",
    "html",
    "pdf",
    "document-processing",
    "pipeline",
    "typed",
]

classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Typing :: Typed",
]

dependencies = [
    "httpx>=0.27.0",
    "beautifulsoup4>=4.12.0",
    # "lxml>=5.0.0",
    "pypdf>=4.0.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "pytest-asyncio>=0.21.0",
    "pytest-cov>=4.1.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
    "mypy>=1.5.0",
    "pre-commit>=3.4.0",
]

all = [
    "omniread[dev]",
]

[project.urls]
Homepage = "https://git.aetoskia.com/aetos/omniread"
Documentation = "https://git.aetoskia.com/aetos/omniread#readme"
Repository = "https://git.aetoskia.com/aetos/omniread.git"
Issues = "https://git.aetoskia.com/aetos/omniread/issues"
Versions = "https://git.aetoskia.com/aetos/omniread/tags"

[tool.setuptools]
packages = { find = { include = ["omniread*"] } }

[tool.setuptools.package-data]
omniread = ["py.typed"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "--strict-markers",
    "--strict-config",
    "--cov=omniread",
    "--cov-report=term-missing",
    "--cov-report=html",
    "--cov-report=xml",
]

[tool.black]
line-length = 88
target-version = ["py310", "py311", "py312", "py313"]
include = '\.pyi?$'
extend-exclude = '''
/(
    \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

[tool.ruff]
line-length = 88
target-version = "py310"
select = [
    "E",
    "W",
    "F",
    "I",
    "B",
    "C4",
    "UP",
]
ignore = [
    "E501",
    "B008",
    "C901",
]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = false
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
follow_imports = "normal"
strict_optional = true

[[tool.mypy.overrides]]
module = [
    "bs4.*",
    "httpx.*",
]
ignore_missing_imports = true

[tool.coverage.run]
source = ["omniread"]
omit = [
    "*/tests/*",
    "*/test_*.py",
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "if __name__ == .__main__.:",
    "raise AssertionError",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
    "@abstractmethod",
]
requirements.txt (new file)
@@ -0,0 +1,7 @@
httpx==0.27.0
beautifulsoup4==4.12.0
# lxml==5.2.0

pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0