refactor(tests): use omniread public API instead of internal module imports

- Replace deep imports with top-level omniread exports in tests
- Ensure tests validate only the supported public API surface
- Align HTML and PDF tests with documented library usage
feat(pdf): add PDF client, scraper, parser, and end-to-end tests
2026-01-02 19:02:20 +05:30 · 2026-01-02 18:59:36 +05:30 · 2026-01-02 18:44:26 +05:30 · 2026-01-02 18:36:29 +05:30
16 changed files with 257 additions and 19 deletions
--- a/omniread/__init__.py
+++ b/omniread/__init__.py
@@ -0,0 +1,18 @@
 from .core import Content, ContentType
 from .html import HTMLScraper, HTMLParser
 from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
 # Names re-exported at the top level of the package, grouped by the
 # subpackage they are imported from.
 __all__ = [
    # core
    "Content",
    "ContentType",
    # html
    "HTMLScraper",
    "HTMLParser",
    # pdf
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
 ]
--- a/omniread/core/__init__.py
+++ b/omniread/core/__init__.py
@@ -0,0 +1,6 @@
 from .content import Content, ContentType
 # Names this subpackage exports; both are imported from .content.
 __all__ = [
    "Content",
    "ContentType",
 ]
--- a/omniread/html/__init__.py
+++ b/omniread/html/__init__.py
@@ -0,0 +1,7 @@
 from .scraper import HTMLScraper
 from .parser import HTMLParser
 # Names this subpackage exports, imported from .scraper and .parser.
 __all__ = [
    "HTMLScraper",
    "HTMLParser",
 ]
--- a/omniread/pdf/__init__.py
+++ b/omniread/pdf/__init__.py
@@ -0,0 +1,9 @@
 from .client import FileSystemPDFClient
 from .scraper import PDFScraper
 from .parser import PDFParser
 # Names this subpackage exports, imported from .client, .scraper and .parser.
 __all__ = [
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
 ]
--- a/omniread/pdf/client.py
+++ b/omniread/pdf/client.py
@@ -0,0 +1,32 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 class BasePDFClient(ABC):
    """
    Abstract source of raw PDF bytes.

    Each concrete client targets one backing store (filesystem, S3,
    FTP, etc.) and implements ``fetch`` for it.
    """
    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Return the raw bytes of the PDF identified by ``source``.

        How ``source`` is interpreted is left to the concrete client.
        """
        raise NotImplementedError
 class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.
    """
    def fetch(self, path: "str | Path") -> bytes:
        """
        Read the file at ``path`` and return its bytes.

        ``path`` may be a ``str`` or a ``Path``. The base-class
        signature declares a plain string, so the argument is
        normalized to ``Path`` before any filesystem call is made.

        Raises:
            FileNotFoundError: nothing exists at ``path``.
            ValueError: ``path`` exists but is not a regular file.
        """
        # Callers following the base contract pass ``str``; without this
        # coercion ``path.exists()`` would fail with AttributeError.
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {path}")
        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")
        return path.read_bytes()
--- a/omniread/pdf/parser.py
+++ b/omniread/pdf/parser.py
@@ -0,0 +1,26 @@
 from typing import Generic, TypeVar
 from abc import abstractmethod
 from omniread.core.content import ContentType
 from omniread.core.parser import BaseParser
 T = TypeVar("T")
 class PDFParser(BaseParser[T], Generic[T]):
    """
    Generic base class for parsers that consume PDF content.

    Subclasses pick the output type ``T`` and provide the parsing
    strategy by implementing ``parse``.
    """
    # Content types this parser declares support for.
    # NOTE(review): presumably consulted by BaseParser — confirm there.
    supported_types = {ContentType.PDF}
    @abstractmethod
    def parse(self) -> T:
        """
        Turn the PDF content into a structured value of type ``T``.

        Abstract: concrete parsers must override this method.
        """
        raise NotImplementedError
--- a/omniread/pdf/scraper.py
+++ b/omniread/pdf/scraper.py
@@ -0,0 +1,32 @@
 from typing import Any, Mapping, Optional
 from omniread.core.content import Content, ContentType
 from omniread.core.scraper import BaseScraper
 from omniread.pdf.client import BasePDFClient
 class PDFScraper(BaseScraper):
    """
    Scraper that turns a PDF source into a ``Content`` object.

    The bytes come from the injected PDF client; this class only wraps
    them together with the source and any caller-supplied metadata.
    """
    def __init__(self, *, client: BasePDFClient):
        # Keyword-only, so the backing client is always named by the caller.
        self._client = client
    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Retrieve ``source`` through the client and wrap it as PDF content.

        A non-empty ``metadata`` mapping is copied into a new ``dict``;
        ``None`` or an empty mapping is stored as ``None``.
        """
        payload = self._client.fetch(source)
        if metadata:
            extra = dict(metadata)
        else:
            extra = None
        return Content(
            raw=payload,
            source=source,
            content_type=ContentType.PDF,
            metadata=extra,
        )
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,11 +4,21 @@ import httpx
 from pathlib import Path
 from jinja2 import Environment, BaseLoader
-from omniread.core.content import ContentType
+from omniread import (
-from omniread.html.scraper import HTMLScraper
+    # core
    ContentType,
    # html
    HTMLScraper,
    # pdf
    FileSystemPDFClient,
    PDFScraper,
 )
-MOCK_DIR = Path(__file__).parent / "mocks"
+MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
 MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
 def render_html(template_path, data_path) -> bytes:
@@ -30,23 +40,17 @@ def mock_transport(request: httpx.Request) -> httpx.Response:
    httpx MockTransport handler.
    """
    path = request.url.path
-
+    if path not in ['/simple', '/table']:
    if path == "/simple":
        content = render_html(
            MOCK_DIR / "simple.html.jinja",
            MOCK_DIR / "simple.json",
        )
    elif path == "/table":
        content = render_html(
            MOCK_DIR / "table.html.jinja",
            MOCK_DIR / "table.json",
        )
    else:
        return httpx.Response(
            status_code=404,
            content=b"Not Found",
            request=request,
        )
    endpoint = path.split("/")[-1]
    content = render_html(
        MOCK_HTML_DIR / f"{endpoint}.html.jinja",
        MOCK_HTML_DIR / f"{endpoint}.json",
    )
    return httpx.Response(
        status_code=200,
@@ -63,3 +67,24 @@ def http_scraper() -> HTMLScraper:
    client = httpx.Client(transport=transport)
    return HTMLScraper(client=client)
 class MockPDFClient(FileSystemPDFClient):
    """
    PDF client for tests: maps a logical identifier onto a fixture
    file under ``MOCK_PDF_DIR`` and reads it via the parent class.
    """
    def fetch(self, source: str) -> bytes:
        """
        Return the bytes of the fixture PDF registered for ``source``.

        Raises FileNotFoundError for identifiers that have no route.
        """
        # Reject unknown identifiers up front; only "simple" is routed.
        if source not in ["simple"]:
            raise FileNotFoundError(f"No mock PDF route for '{source}'")
        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
        return super().fetch(fixture_path)
@pytest.fixture
def pdf_scraper() -> PDFScraper:
    """Provide a PDFScraper whose client is the test-only MockPDFClient."""
    return PDFScraper(client=MockPDFClient())
--- a/tests/mocks/html/simple.html.jinja
+++ b/tests/mocks/html/simple.html.jinja
--- a/tests/mocks/html/simple.json
+++ b/tests/mocks/html/simple.json
--- a/tests/mocks/html/table.html.jinja
+++ b/tests/mocks/html/table.html.jinja
--- a/tests/mocks/html/table.json
+++ b/tests/mocks/html/table.json
--- a/tests/mocks/pdf/simple.pdf
+++ b/tests/mocks/pdf/simple.pdf
@@ -0,0 +1,32 @@
 %PDF-1.4
 1 0 obj
 << /Type /Catalog /Pages 2 0 R >>
 endobj
 2 0 obj
 << /Type /Pages /Kids [3 0 R] /Count 1 >>
 endobj
 3 0 obj
 << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
 endobj
 4 0 obj
 << /Length 44 >>
 stream
 BT
 /F1 12 Tf
 72 720 Td
 (Simple PDF Test) Tj
 ET
 endstream
 endobj
 xref
 0 5
 0000000000 65535 f
 0000000010 00000 n
 0000000061 00000 n
 0000000116 00000 n
 0000000203 00000 n
 trailer
 << /Size 5 /Root 1 0 R >>
 startxref
 300
 %%EOF
--- a/tests/test_html_simple.py
+++ b/tests/test_html_simple.py
@@ -3,8 +3,13 @@ from typing import Optional
 from pydantic import BaseModel
 from bs4 import Tag
-from omniread.html.parser import HTMLParser
+from omniread import (
-from omniread.core.content import Content
+    # core
    Content,
    # html
    HTMLParser,
 )
 class ParsedSimpleHTML(BaseModel):
--- a/tests/test_html_table.py
+++ b/tests/test_html_table.py
@@ -2,8 +2,13 @@ from typing import Optional
 from pydantic import BaseModel
-from omniread.html.parser import HTMLParser
+from omniread import (
-from omniread.core.content import Content
+    # core
    Content,
    # html
    HTMLParser,
 )
 class ParsedTableHTML(BaseModel):
--- a/tests/test_pdf_simple.py
+++ b/tests/test_pdf_simple.py
@@ -0,0 +1,41 @@
 from typing import Literal
 from pydantic import BaseModel
 from omniread import (
    # core
    Content,
    # pdf
    PDFParser,
 )
 class ParsedPDF(BaseModel):
    # Typed result model returned by the PDF parser in this test module.
    # Length of the raw PDF payload, in bytes.
    size_bytes: int
    # PDF header signature; the only permitted value is b"%PDF".
    magic: Literal[b"%PDF"]
 class SimplePDFParser(PDFParser[ParsedPDF]):
    """
    Minimal parser: checks the PDF signature and reports the payload size.
    """
    def parse(self) -> ParsedPDF:
        """
        Build a ``ParsedPDF`` from ``self.content.raw``.

        Raises ValueError when the payload does not start with b"%PDF".
        """
        payload = self.content.raw
        has_signature = payload.startswith(b"%PDF")
        if not has_signature:
            raise ValueError("Not a valid PDF")
        return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")
 def test_end_to_end_pdf_simple(pdf_scraper):
    """Scrape the "simple" fixture PDF, parse it, and check the result."""
    # Scrape: "simple" is a logical identifier, routed to a file in conftest.
    scraped: Content = pdf_scraper.fetch("simple")
    assert scraped.raw.startswith(b"%PDF")
    # Parse the scraped content into the typed model.
    parsed = SimplePDFParser(scraped).parse()
    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100
Author	SHA1	Message	Date
Vishesh 'ironeagle' Bangotra	b2173f3ef0	refactor(tests): use omniread public API instead of internal module imports - Replace deep imports with top-level omniread exports in tests - Ensure tests validate only the supported public API surface - Align HTML and PDF tests with documented library usage	2026-01-02 19:02:20 +05:30
Vishesh 'ironeagle' Bangotra	de67c7b0b1	feat(pdf): add PDF client, scraper, parser, and end-to-end tests - Introduce PDF submodule with client, scraper, and generic parser - Add filesystem PDF client and test-only mock routing - Add end-to-end PDF scrape → parse tests with typed output - Mirror HTML module architecture for consistency - Expose PDF primitives via omniread public API	2026-01-02 18:59:36 +05:30
Vishesh 'ironeagle' Bangotra	390eb22e1b	moved html mocks to html sub folder and updated conftest.py to read from new location with better path and endpoint handling	2026-01-02 18:44:26 +05:30
Vishesh 'ironeagle' Bangotra	358abc9b36	feat(api): expose core and html primitives via top-level package exports - Re-export Content and ContentType from omniread.core - Re-export HTMLScraper and HTMLParser from omniread.html - Define explicit __all__ for stable public API surface	2026-01-02 18:36:29 +05:30