From 32ee43e77a9987bffba38b177ed3226c0617faf2 Mon Sep 17 00:00:00 2001 From: Vishesh 'ironeagle' Bangotra Date: Wed, 31 Dec 2025 14:28:50 +0530 Subject: [PATCH] omni read basic modules --- omniread/__init__.py | 0 omniread/core/__init__.py | 0 omniread/core/content.py | 18 ++++ omniread/core/parser.py | 48 +++++++++++ omniread/core/scraper.py | 32 +++++++ omniread/html/__init__.py | 0 omniread/html/parser.py | 85 +++++++++++++++++++ omniread/html/scraper.py | 48 +++++++++++ pyproject.toml | 172 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 7 ++ 10 files changed, 410 insertions(+) create mode 100644 omniread/__init__.py create mode 100644 omniread/core/__init__.py create mode 100644 omniread/core/content.py create mode 100644 omniread/core/parser.py create mode 100644 omniread/core/scraper.py create mode 100644 omniread/html/__init__.py create mode 100644 omniread/html/parser.py create mode 100644 omniread/html/scraper.py create mode 100644 pyproject.toml create mode 100644 requirements.txt diff --git a/omniread/__init__.py b/omniread/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/omniread/core/__init__.py b/omniread/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/omniread/core/content.py b/omniread/core/content.py new file mode 100644 index 0000000..a301739 --- /dev/null +++ b/omniread/core/content.py @@ -0,0 +1,18 @@ +from enum import Enum +from dataclasses import dataclass +from typing import Any, Mapping, Optional + + +class ContentType(str, Enum): + HTML = "text/html" + PDF = "application/pdf" + JSON = "application/json" + XML = "application/xml" + + +@dataclass(slots=True) +class Content: + raw: bytes + source: str + content_type: Optional[ContentType] = None + metadata: Optional[Mapping[str, Any]] = None diff --git a/omniread/core/parser.py b/omniread/core/parser.py new file mode 100644 index 0000000..3426672 --- /dev/null +++ b/omniread/core/parser.py @@ -0,0 +1,48 @@ +from abc import ABC, 
# File: omniread/core/parser.py

from abc import ABC, abstractmethod
from typing import ClassVar, Generic, Set, TypeVar

from .content import Content, ContentType

T = TypeVar("T")


class BaseParser(ABC, Generic[T]):
    """
    Base interface for all parsers.

    A parser is a self-contained object that owns the Content
    it is responsible for interpreting.

    Raises:
        ValueError: on construction, when the parser does not support
            the content's declared type (see ``supports``).
    """

    # Class-level contract: which MIME types a concrete parser accepts.
    # Empty means "type-agnostic". FIX: annotated as ClassVar so strict
    # mypy (configured in this repo) treats this shared, mutable default
    # as a class attribute rather than a per-instance field.
    supported_types: ClassVar[Set[ContentType]] = set()

    def __init__(self, content: Content):
        self.content = content

        # Fail fast: refuse to own content this parser cannot interpret.
        if not self.supports():
            raise ValueError(
                f"{self.__class__.__name__} does not support content type "
                f"{content.content_type!r}"
            )

    @abstractmethod
    def parse(self) -> T:
        """
        Parse the owned content into structured output.

        Returns:
            Parsed, structured representation.
        """
        raise NotImplementedError

    def supports(self) -> bool:
        """
        Check whether this parser supports the content's type.

        An empty ``supported_types`` set accepts everything; otherwise
        the content must carry a known, matching ContentType.
        """
        if not self.supported_types:
            return True

        if self.content.content_type is None:
            return False

        return self.content.content_type in self.supported_types
# File: omniread/core/scraper.py

from abc import ABC, abstractmethod
from typing import Any, Mapping, Optional

from .content import Content


class BaseScraper(ABC):
    """
    Base interface for all scrapers.

    A scraper is responsible ONLY for fetching raw content
    (bytes) from a source. It must not interpret or parse it.
    """

    @abstractmethod
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch raw content from the given source.

        Args:
            source: Location identifier (URL, file path, S3 URI, etc.)
            metadata: Optional hints for the scraper (headers, auth, etc.)

        Returns:
            Content object containing raw bytes and metadata.
        """
        raise NotImplementedError
# File: omniread/html/parser.py

from abc import abstractmethod
from typing import Any, Generic, Optional, TypeVar

from bs4 import BeautifulSoup, Tag

from omniread.core.content import Content, ContentType
from omniread.core.parser import BaseParser

T = TypeVar("T")


class HTMLParser(BaseParser[T], Generic[T]):
    """
    Base HTML parser.

    Provides reusable helpers for HTML extraction.
    Concrete parsers must explicitly define the return type.
    """

    supported_types = {ContentType.HTML}

    def __init__(self, content: Content, features: str = "html.parser"):
        super().__init__(content)
        self._features = features
        # Soup is built eagerly so construction fails on empty payloads.
        self._soup = self._get_soup()

    # ----------------------------
    # Contract
    # ----------------------------

    @abstractmethod
    def parse(self) -> T:
        """Fully parse the HTML content into structured output."""
        raise NotImplementedError

    # ----------------------------
    # Helpers (static / pure)
    # ----------------------------

    @staticmethod
    def parse_div(div: Tag, *, separator: str = " ") -> str:
        """Flatten a tag's text, joining fragments with *separator*."""
        return div.get_text(separator=separator, strip=True)

    @staticmethod
    def parse_link(a: Tag) -> Optional[str]:
        """Return the anchor's href attribute, or None when absent."""
        return a.get("href")

    @staticmethod
    def parse_table(table: Tag) -> list[list[str]]:
        """Extract a table as rows of stripped cell text; rows with no
        cells are skipped."""
        extracted: list[list[str]] = []
        for row in table.find_all("tr"):
            texts = [c.get_text(strip=True) for c in row.find_all(["td", "th"])]
            if texts:
                extracted.append(texts)
        return extracted

    # ----------------------------
    # Helpers (instance-level)
    # ----------------------------

    def _get_soup(self) -> BeautifulSoup:
        """Build a BeautifulSoup tree from the owned raw bytes."""
        if not self.content.raw:
            raise ValueError("Empty HTML content")
        return BeautifulSoup(self.content.raw, features=self._features)

    def parse_meta(self) -> dict[str, Any]:
        """Collect the page <title> and all named <meta> content values.

        Meta tags are keyed by ``name`` (falling back to ``property``);
        tags without a ``content`` attribute are ignored. Duplicate keys
        keep the last occurrence.
        """
        doc = self._soup

        page_title: Optional[str] = None
        if doc.title and doc.title.string:
            page_title = doc.title.string.strip()

        tags: dict[str, Any] = {}
        for m in doc.find_all("meta"):
            if m.get("content"):
                tags[m.get("name") or m.get("property")] = m.get("content")

        return {
            "title": page_title,
            "meta": tags,
        }
# File: omniread/html/scraper.py

import httpx
from typing import Any, Mapping, Optional

from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper


class HTMLScraper(BaseScraper):
    """
    Base HTTP scraper using httpx.

    Fetches raw bytes and metadata only.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headers: Optional[Mapping[str, str]] = None,
        follow_redirects: bool = True,
    ):
        self.timeout = timeout
        # Copy the caller's mapping so later mutation on their side
        # cannot leak into requests.
        self.headers = dict(headers) if headers else {}
        self.follow_redirects = follow_redirects

    @staticmethod
    def _normalize_content_type(raw: Optional[str]) -> Optional[ContentType]:
        """Map a raw Content-Type header to a ContentType member.

        Strips media-type parameters (e.g. ``; charset=utf-8``) and
        lowercases before matching. Returns None when the header is
        missing or the media type is not a known ContentType.
        """
        if not raw:
            return None
        media_type = raw.split(";", 1)[0].strip().lower()
        try:
            return ContentType(media_type)
        except ValueError:
            return None

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Fetch raw content from *source* over HTTP.

        Args:
            source: URL to GET.
            metadata: Optional hints (currently unused by this scraper).

        Returns:
            Content with the response body, normalized content type, and
            response metadata (status code, headers).

        Raises:
            httpx.HTTPStatusError: on non-2xx responses.
        """
        with httpx.Client(
            timeout=self.timeout,
            headers=self.headers,
            follow_redirects=self.follow_redirects,
        ) as client:
            response = client.get(source)
            response.raise_for_status()

            return Content(
                raw=response.content,
                source=source,
                # FIX: the raw header value (e.g. "text/html; charset=utf-8")
                # is a plain string, not a ContentType member, so parsers'
                # supports() checks against {ContentType.HTML} would fail.
                # Normalize it to the enum (None when unrecognized); the
                # original header is still available via metadata["headers"].
                content_type=self._normalize_content_type(
                    response.headers.get("Content-Type")
                ),
                metadata={
                    "status_code": response.status_code,
                    "headers": dict(response.headers),
                },
            )
# File: pyproject.toml

[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "omniread"
version = "0.0.1"
description = "Composable content ingestion framework with pluggable scrapers and parsers for HTML, PDF, and structured data"
readme = "README.md"
# FIX: dataclass(slots=True) in omniread/core/content.py requires 3.10+,
# so the 3.9 floor advertised here was unsatisfiable.
requires-python = ">=3.10"
license = { text = "MIT" }

authors = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]
maintainers = [
    { name = "Aetos Skia", email = "dev@aetoskia.com" }
]

keywords = [
    "scraping",
    "parsing",
    "content-ingestion",
    "html",
    "pdf",
    "document-processing",
    "pipeline",
    "typed",
]

classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    # NOTE: removed "Topic :: Security" — copy-paste residue from another
    # project; this package is a content-ingestion library.
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Typing :: Typed",
]

dependencies = [
    # FIX: the code imports httpx (omniread/html/scraper.py) and
    # requirements.txt pins httpx — "requests" was never used.
    "httpx>=0.27.0",
    "beautifulsoup4>=4.12.0",
#    "lxml>=5.0.0",
    "pypdf>=4.0.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "pytest-asyncio>=0.21.0",
    "pytest-cov>=4.1.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
    "mypy>=1.5.0",
    "pre-commit>=3.4.0",
]

all = [
    # FIX: referenced a non-existent "fastapi" extra; only "dev" is defined.
    "omniread[dev]",
]

[project.urls]
Homepage = "https://git.aetoskia.com/aetos/omniread"
Documentation = "https://git.aetoskia.com/aetos/omniread#readme"
Repository = "https://git.aetoskia.com/aetos/omniread.git"
Issues = "https://git.aetoskia.com/aetos/omniread/issues"
Versions = "https://git.aetoskia.com/aetos/omniread/tags"

[tool.setuptools]
packages = { find = { include = ["omniread*"] } }

[tool.setuptools.package-data]
omniread = ["py.typed"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "--strict-markers",
    "--strict-config",
    "--cov=omniread",
    "--cov-report=term-missing",
    "--cov-report=html",
    "--cov-report=xml",
]

[tool.black]
line-length = 88
# Aligned with requires-python >= 3.10.
target-version = ["py310", "py311", "py312", "py313"]
include = '\.pyi?$'
extend-exclude = '''
/(
    \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

[tool.ruff]
line-length = 88
target-version = "py310"
select = [
    "E",
    "W",
    "F",
    "I",
    "B",
    "C4",
    "UP",
]
ignore = [
    "E501",
    "B008",
    "C901",
]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = false
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
follow_imports = "normal"
strict_optional = true

[[tool.mypy.overrides]]
module = [
    # FIX: "jose.*" removed — python-jose is not a dependency of this
    # project (residue from another codebase). bs4 ships no type stubs.
    "bs4.*",
    "httpx.*",
]
ignore_missing_imports = true

[tool.coverage.run]
source = ["omniread"]
omit = [
    "*/tests/*",
    "*/test_*.py",
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "if __name__ == .__main__.:",
    "raise AssertionError",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
    "@abstractmethod",
]
# File: requirements.txt
# Pinned environment; keep versions in sync with pyproject.toml.

# Runtime
httpx==0.27.0
beautifulsoup4==4.12.0
# FIX: pypdf is declared as a runtime dependency in pyproject.toml but
# was missing here.
pypdf==4.0.0
# lxml==5.2.0

# Test
pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0