omniread: add basic modules

omniread/__init__.py (new file, 0 lines)
omniread/core/__init__.py (new file, 0 lines)

omniread/core/content.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from enum import Enum
from dataclasses import dataclass
from typing import Any, Mapping, Optional


class ContentType(str, Enum):
    HTML = "text/html"
    PDF = "application/pdf"
    JSON = "application/json"
    XML = "application/xml"


@dataclass(slots=True)
class Content:
    raw: bytes
    source: str
    content_type: Optional[ContentType] = None
    metadata: Optional[Mapping[str, Any]] = None
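
A quick sketch of how Content is meant to be constructed (the values are hypothetical, not part of this commit; note that ContentType doubles as its MIME string because it subclasses str):

from omniread.core.content import Content, ContentType

# Hypothetical values for illustration only.
page = Content(
    raw=b"<html><title>Hi</title></html>",
    source="https://example.com/",
    content_type=ContentType.HTML,
)

assert page.content_type == ContentType.HTML
assert page.content_type.value == "text/html"  # str-backed enum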

omniread/core/parser.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Generic, TypeVar, Set

from .content import Content, ContentType

T = TypeVar("T")


class BaseParser(ABC, Generic[T]):
    """
    Base interface for all parsers.

    A parser is a self-contained object that owns the Content
    it is responsible for interpreting.
    """

    supported_types: Set[ContentType] = set()

    def __init__(self, content: Content):
        self.content = content

        if not self.supports():
            raise ValueError(
                f"{self.__class__.__name__} does not support content type "
                f"{content.content_type!r}"
            )

    @abstractmethod
    def parse(self) -> T:
        """
        Parse the owned content into structured output.

        Returns:
            Parsed, structured representation.
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.
        """
        if not self.supported_types:
            return True

        if self.content.content_type is None:
            return False

        return self.content.content_type in self.supported_types
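
To make the contract concrete, a minimal subclass might look like the following sketch (JSONParser is a hypothetical example, not part of this commit). Because __init__ calls supports(), constructing it with non-JSON Content raises ValueError immediately:

import json
from typing import Any

from omniread.core.content import ContentType
from omniread.core.parser import BaseParser


class JSONParser(BaseParser[Any]):
    """Hypothetical parser: decodes the owned raw bytes as JSON."""

    supported_types = {ContentType.JSON}

    def parse(self) -> Any:
        return json.loads(self.content.raw)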

omniread/core/scraper.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import Any, Mapping, Optional

from .content import Content


class BaseScraper(ABC):
    """
    Base interface for all scrapers.

    A scraper is responsible ONLY for fetching raw content
    (bytes) from a source. It must not interpret or parse it.
    """

    @abstractmethod
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch raw content from the given source.

        Args:
            source: Location identifier (URL, file path, S3 URI, etc.)
            metadata: Optional hints for the scraper (headers, auth, etc.)

        Returns:
            Content object containing raw bytes and metadata.
        """
        raise NotImplementedError
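
As an illustration of the fetch-only contract, a local-filesystem scraper could be as small as this sketch (FileScraper is hypothetical, not part of this commit):

from pathlib import Path
from typing import Any, Mapping, Optional

from omniread.core.content import Content
from omniread.core.scraper import BaseScraper


class FileScraper(BaseScraper):
    """Hypothetical scraper: reads raw bytes from a local path."""

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        path = Path(source)
        # Fetch only: no decoding, no parsing, no content-type sniffing.
        return Content(
            raw=path.read_bytes(),
            source=source,
            metadata={"size": path.stat().st_size, **(metadata or {})},
        )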

omniread/html/__init__.py (new file, 0 lines)

omniread/html/parser.py (new file, 85 lines)
@@ -0,0 +1,85 @@
from typing import Any, Generic, TypeVar, Optional

from abc import abstractmethod
from bs4 import BeautifulSoup, Tag

from omniread.core.content import ContentType, Content
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    Provides reusable helpers for HTML extraction.
    Concrete parsers must explicitly define the return type.
    """

    supported_types = {ContentType.HTML}

    def __init__(self, content: Content, features: str = "html.parser"):
        super().__init__(content)
        self._features = features
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------

    @abstractmethod
    def parse(self) -> T:
        """
        Fully parse the HTML content into structured output.
        """
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------

    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        return a.get("href")

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        rows: list[list[str]] = []
        for tr in table.find_all("tr"):
            cells = [
                cell.get_text(strip=True)
                for cell in tr.find_all(["td", "th"])
            ]
            if cells:
                rows.append(cells)
        return rows

    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        soup = self._soup

        title = soup.title.string.strip() if soup.title and soup.title.string else None

        # Require a name/property key as well as content, so meta tags like
        # <meta http-equiv="refresh" content="..."> cannot produce a None key.
        meta = {
            tag.get("name") or tag.get("property"): tag.get("content")
            for tag in soup.find_all("meta")
            if (tag.get("name") or tag.get("property")) and tag.get("content")
        }

        return {
            "title": title,
            "meta": meta,
        }
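
A concrete parser built on these helpers might look like the following sketch (LinkListParser is hypothetical, not part of this commit):

from omniread.html.parser import HTMLParser


class LinkListParser(HTMLParser[list[str]]):
    """Hypothetical parser: collects every href on the page."""

    def parse(self) -> list[str]:
        return [
            href
            for a in self._soup.find_all("a")
            if (href := self.parse_link(a)) is not None
        ]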

omniread/html/scraper.py (new file, 56 lines)
@@ -0,0 +1,56 @@
import httpx
from typing import Any, Mapping, Optional

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        self.timeout = timeout
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

        # Normalize a raw header like "text/html; charset=utf-8" into a
        # ContentType member; fall back to None for unrecognized MIME types.
        mime = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
        try:
            content_type: Optional[ContentType] = ContentType(mime)
        except ValueError:
            content_type = None

        return Content(
            raw=response.content,
            source=source,
            content_type=content_type,
            metadata={
                "status_code": response.status_code,
                "headers": dict(response.headers),
            },
        )
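
Wiring the two halves together, an end-to-end flow might look like this sketch (TitleParser and the URL are hypothetical; the scraper only fetches, the parser owns and interprets the result):

from typing import Optional

from omniread.html.parser import HTMLParser
from omniread.html.scraper import HTMLScraper


class TitleParser(HTMLParser[Optional[str]]):
    """Hypothetical parser: returns just the page <title>."""

    def parse(self) -> Optional[str]:
        return self.parse_meta()["title"]


scraper = HTMLScraper(timeout=10.0, headers={"User-Agent": "omniread/0.0.1"})
content = scraper.fetch("https://example.com/")  # bytes + metadata only
title = TitleParser(content).parse()             # raises ValueError if not HTML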

pyproject.toml (new file, 171 lines)
@@ -0,0 +1,171 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "omniread"
version = "0.0.1"
description = "Composable content ingestion framework with pluggable scrapers and parsers for HTML, PDF, and structured data"
readme = "README.md"
# dataclass(slots=True) in omniread.core.content requires Python 3.10+
requires-python = ">=3.10"
license = { text = "MIT" }

authors = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]
maintainers = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]

keywords = [
    "scraping",
    "parsing",
    "content-ingestion",
    "html",
    "pdf",
    "document-processing",
    "pipeline",
    "typed",
]

classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Text Processing :: Markup :: HTML",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Typing :: Typed",
]

dependencies = [
    "httpx>=0.27.0",
    "beautifulsoup4>=4.12.0",
    # "lxml>=5.0.0",
    "pypdf>=4.0.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "pytest-asyncio>=0.21.0",
    "pytest-cov>=4.1.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
    "mypy>=1.5.0",
    "pre-commit>=3.4.0",
]

all = [
    "omniread[dev]",
]

[project.urls]
Homepage = "https://git.aetoskia.com/aetos/omniread"
Documentation = "https://git.aetoskia.com/aetos/omniread#readme"
Repository = "https://git.aetoskia.com/aetos/omniread.git"
Issues = "https://git.aetoskia.com/aetos/omniread/issues"
Versions = "https://git.aetoskia.com/aetos/omniread/tags"

[tool.setuptools]
packages = { find = { include = ["omniread*"] } }

[tool.setuptools.package-data]
omniread = ["py.typed"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "--strict-markers",
    "--strict-config",
    "--cov=omniread",
    "--cov-report=term-missing",
    "--cov-report=html",
    "--cov-report=xml",
]

[tool.black]
line-length = 88
target-version = ["py310", "py311", "py312", "py313"]
include = '\.pyi?$'
extend-exclude = '''
/(
    \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

[tool.ruff]
line-length = 88
target-version = "py310"
select = [
    "E",
    "W",
    "F",
    "I",
    "B",
    "C4",
    "UP",
]
ignore = [
    "E501",
    "B008",
    "C901",
]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = false
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
follow_imports = "normal"
strict_optional = true

[[tool.mypy.overrides]]
module = [
    "bs4.*",
]
ignore_missing_imports = true

[tool.coverage.run]
source = ["omniread"]
omit = [
    "*/tests/*",
    "*/test_*.py",
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "if __name__ == .__main__.:",
    "raise AssertionError",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
    "@abstractmethod",
]

requirements.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
httpx==0.27.0
beautifulsoup4==4.12.0
# lxml==5.2.0

pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0