Compare commits

..

4 Commits

Author SHA1 Message Date
b2173f3ef0 refactor(tests): use omniread public API instead of internal module imports
- Replace deep imports with top-level omniread exports in tests
- Ensure tests validate only the supported public API surface
- Align HTML and PDF tests with documented library usage
2026-01-02 19:02:20 +05:30
de67c7b0b1 feat(pdf): add PDF client, scraper, parser, and end-to-end tests
- Introduce PDF submodule with client, scraper, and generic parser
- Add filesystem PDF client and test-only mock routing
- Add end-to-end PDF scrape → parse tests with typed output
- Mirror HTML module architecture for consistency
- Expose PDF primitives via omniread public API
2026-01-02 18:59:36 +05:30
390eb22e1b moved html mocks to html sub folder and updated conftest.py to read from new location with better path and endpoint handling 2026-01-02 18:44:26 +05:30
358abc9b36 feat(api): expose core and html primitives via top-level package exports
- Re-export Content and ContentType from omniread.core
- Re-export HTMLScraper and HTMLParser from omniread.html
- Define explicit __all__ for stable public API surface
2026-01-02 18:36:29 +05:30
16 changed files with 257 additions and 19 deletions

View File

@@ -0,0 +1,18 @@
from .core import Content, ContentType
from .html import HTMLScraper, HTMLParser
from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
# Public API of the top-level package: every name listed here is
# re-exported by the relative imports directly above.
__all__ = [
    # re-exported from .core
    "Content",
    "ContentType",
    # re-exported from .html
    "HTMLScraper",
    "HTMLParser",
    # re-exported from .pdf
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
]

View File

@@ -0,0 +1,6 @@
from .content import Content, ContentType
# Names re-exported from .content as this subpackage's public surface.
__all__ = [
    "Content",
    "ContentType",
]

View File

@@ -0,0 +1,7 @@
from .scraper import HTMLScraper
from .parser import HTMLParser
# Names re-exported from .scraper and .parser as this subpackage's
# public surface.
__all__ = [
    "HTMLScraper",
    "HTMLParser",
]

9
omniread/pdf/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
from .client import FileSystemPDFClient
from .scraper import PDFScraper
from .parser import PDFParser
# Names re-exported from .client, .scraper and .parser as this
# subpackage's public surface.
__all__ = [
    "FileSystemPDFClient",
    "PDFScraper",
    "PDFParser",
]

32
omniread/pdf/client.py Normal file
View File

@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from pathlib import Path
class BasePDFClient(ABC):
    """
    Abstract client responsible for retrieving PDF bytes
    from a specific backing store (filesystem, S3, FTP, etc).
    """

    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Fetch raw PDF bytes from the given source.

        Args:
            source: Identifier of the PDF within the backing store. How it
                is interpreted is up to the concrete client.

        Returns:
            The raw bytes of the PDF.
        """
        raise NotImplementedError


class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.
    """

    def fetch(self, path: "str | Path") -> bytes:
        """
        Read a PDF file from disk and return its bytes.

        Args:
            path: Location of the file, given either as a plain string
                (the form used by the ``BasePDFClient.fetch`` contract)
                or as a ``pathlib.Path``.

        Returns:
            The complete contents of the file.

        Raises:
            FileNotFoundError: If nothing exists at ``path``.
            ValueError: If ``path`` exists but is not a regular file.
        """
        # The abstract contract hands over a ``str``; normalise it so the
        # Path-only methods below work for both strings and Path objects.
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {path}")
        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")
        return path.read_bytes()

26
omniread/pdf/parser.py Normal file
View File

@@ -0,0 +1,26 @@
from typing import Generic, TypeVar
from abc import abstractmethod
from omniread.core.content import ContentType
from omniread.core.parser import BaseParser
# Type variable for the structured value a concrete parser returns.
T = TypeVar("T")


class PDFParser(BaseParser[T], Generic[T]):
    """
    Abstract parser for PDF content, generic over its result type ``T``.

    A concrete subclass is expected to:
    - bind ``T`` to the type it produces
    - implement ``parse`` with its own parsing strategy
    """

    # Content types this parser declares support for.
    # NOTE(review): presumably consumed by BaseParser - confirm there.
    supported_types = {ContentType.PDF}

    @abstractmethod
    def parse(self) -> T:
        """
        Convert the PDF content into a structured value of type ``T``.
        """
        raise NotImplementedError

32
omniread/pdf/scraper.py Normal file
View File

@@ -0,0 +1,32 @@
from typing import Any, Mapping, Optional
from omniread.core.content import Content, ContentType
from omniread.core.scraper import BaseScraper
from omniread.pdf.client import BasePDFClient
class PDFScraper(BaseScraper):
    """
    Scraper that turns a PDF source into a ``Content`` object.

    The bytes themselves are obtained from the injected PDF client;
    this class only wraps them, together with the source and optional
    metadata, into ``Content`` tagged as ``ContentType.PDF``.
    """

    def __init__(self, *, client: BasePDFClient):
        # Keyword-only so the backing client is always named at the call site.
        self._client = client

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Retrieve ``source`` through the client and wrap it as ``Content``.

        A non-empty ``metadata`` mapping is copied into a new ``dict``;
        a missing or empty mapping is passed on as ``None``.
        """
        pdf_bytes = self._client.fetch(source)

        # Copy so the returned Content does not share the caller's mapping.
        metadata_copy = None
        if metadata:
            metadata_copy = dict(metadata)

        return Content(
            raw=pdf_bytes,
            source=source,
            content_type=ContentType.PDF,
            metadata=metadata_copy,
        )

View File

@@ -4,11 +4,21 @@ import httpx
from pathlib import Path from pathlib import Path
from jinja2 import Environment, BaseLoader from jinja2 import Environment, BaseLoader
from omniread.core.content import ContentType from omniread import (
from omniread.html.scraper import HTMLScraper # core
ContentType,
# html
HTMLScraper,
# pdf
FileSystemPDFClient,
PDFScraper,
)
MOCK_DIR = Path(__file__).parent / "mocks" MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
def render_html(template_path, data_path) -> bytes: def render_html(template_path, data_path) -> bytes:
@@ -30,23 +40,17 @@ def mock_transport(request: httpx.Request) -> httpx.Response:
httpx MockTransport handler. httpx MockTransport handler.
""" """
path = request.url.path path = request.url.path
if path not in ['/simple', '/table']:
if path == "/simple":
content = render_html(
MOCK_DIR / "simple.html.jinja",
MOCK_DIR / "simple.json",
)
elif path == "/table":
content = render_html(
MOCK_DIR / "table.html.jinja",
MOCK_DIR / "table.json",
)
else:
return httpx.Response( return httpx.Response(
status_code=404, status_code=404,
content=b"Not Found", content=b"Not Found",
request=request, request=request,
) )
endpoint = path.split("/")[-1]
content = render_html(
MOCK_HTML_DIR / f"{endpoint}.html.jinja",
MOCK_HTML_DIR / f"{endpoint}.json",
)
return httpx.Response( return httpx.Response(
status_code=200, status_code=200,
@@ -63,3 +67,24 @@ def http_scraper() -> HTMLScraper:
client = httpx.Client(transport=transport) client = httpx.Client(transport=transport)
return HTMLScraper(client=client) return HTMLScraper(client=client)
class MockPDFClient(FileSystemPDFClient):
    """
    PDF client for tests only: maps a logical identifier onto the
    matching ``<identifier>.pdf`` fixture under ``MOCK_PDF_DIR``.
    """

    def fetch(self, source: str) -> bytes:
        # Reject anything that is not a known route before touching disk.
        if source not in ("simple",):
            raise FileNotFoundError(f"No mock PDF route for '{source}'")

        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
        return super().fetch(fixture_path)
@pytest.fixture
def pdf_scraper() -> PDFScraper:
    """Provide a PDFScraper whose client is a fresh MockPDFClient."""
    return PDFScraper(client=MockPDFClient())

View File

@@ -0,0 +1,32 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
72 720 Td
(Simple PDF Test) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000010 00000 n
0000000061 00000 n
0000000116 00000 n
0000000203 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
300
%%EOF

View File

@@ -3,8 +3,13 @@ from typing import Optional
from pydantic import BaseModel from pydantic import BaseModel
from bs4 import Tag from bs4 import Tag
from omniread.html.parser import HTMLParser from omniread import (
from omniread.core.content import Content # core
Content,
# html
HTMLParser,
)
class ParsedSimpleHTML(BaseModel): class ParsedSimpleHTML(BaseModel):

View File

@@ -2,8 +2,13 @@ from typing import Optional
from pydantic import BaseModel from pydantic import BaseModel
from omniread.html.parser import HTMLParser from omniread import (
from omniread.core.content import Content # core
Content,
# html
HTMLParser,
)
class ParsedTableHTML(BaseModel): class ParsedTableHTML(BaseModel):

41
tests/test_pdf_simple.py Normal file
View File

@@ -0,0 +1,41 @@
from typing import Literal
from pydantic import BaseModel
from omniread import (
# core
Content,
# pdf
PDFParser,
)
class ParsedPDF(BaseModel):
    # Typed result model returned by SimplePDFParser.parse().

    # Length of the raw PDF payload, in bytes.
    size_bytes: int
    # File signature; only the exact bytes b"%PDF" are accepted.
    magic: Literal[b"%PDF"]
class SimplePDFParser(PDFParser[ParsedPDF]):
    """Minimal parser that checks the PDF signature and reports the size."""

    def parse(self) -> ParsedPDF:
        """
        Return a ParsedPDF for the wrapped content.

        Raises ValueError when the raw bytes do not begin with b"%PDF".
        """
        payload = self.content.raw
        has_signature = payload.startswith(b"%PDF")
        if not has_signature:
            raise ValueError("Not a valid PDF")

        return ParsedPDF(magic=b"%PDF", size_bytes=len(payload))
def test_end_to_end_pdf_simple(pdf_scraper):
    """Scrape the "simple" fixture, then parse it into a ParsedPDF."""
    # Scrape step: "simple" is a logical identifier resolved by the
    # mock client defined in conftest.
    scraped: Content = pdf_scraper.fetch("simple")
    assert scraped.raw.startswith(b"%PDF")

    # Parse step: run the typed parser over the scraped content.
    parsed = SimplePDFParser(scraped).parse()
    assert parsed.magic == b"%PDF"
    assert parsed.size_bytes > 100