Compare commits
4 Commits
07293e4651
...
b2173f3ef0
| Author | SHA1 | Date | |
|---|---|---|---|
| b2173f3ef0 | |||
| de67c7b0b1 | |||
| 390eb22e1b | |||
| 358abc9b36 |
@@ -0,0 +1,18 @@
|
|||||||
|
from .core import Content, ContentType
|
||||||
|
from .html import HTMLScraper, HTMLParser
|
||||||
|
from .pdf import FileSystemPDFClient, PDFScraper, PDFParser
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# core
|
||||||
|
"Content",
|
||||||
|
"ContentType",
|
||||||
|
|
||||||
|
# html
|
||||||
|
"HTMLScraper",
|
||||||
|
"HTMLParser",
|
||||||
|
|
||||||
|
# pdf
|
||||||
|
"FileSystemPDFClient",
|
||||||
|
"PDFScraper",
|
||||||
|
"PDFParser",
|
||||||
|
]
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
from .content import Content, ContentType
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"Content",
|
||||||
|
"ContentType",
|
||||||
|
]
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
from .scraper import HTMLScraper
|
||||||
|
from .parser import HTMLParser
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"HTMLScraper",
|
||||||
|
"HTMLParser",
|
||||||
|
]
|
||||||
|
|||||||
9
omniread/pdf/__init__.py
Normal file
9
omniread/pdf/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
from .client import FileSystemPDFClient
|
||||||
|
from .scraper import PDFScraper
|
||||||
|
from .parser import PDFParser
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"FileSystemPDFClient",
|
||||||
|
"PDFScraper",
|
||||||
|
"PDFParser",
|
||||||
|
]
|
||||||
32
omniread/pdf/client.py
Normal file
32
omniread/pdf/client.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class BasePDFClient(ABC):
    """
    Abstract client responsible for retrieving PDF bytes
    from a specific backing store (filesystem, S3, FTP, etc).
    """

    @abstractmethod
    def fetch(self, source: str) -> bytes:
        """
        Fetch raw PDF bytes from the given source.

        Args:
            source: Backend-specific identifier of the PDF to retrieve.

        Returns:
            The raw bytes of the PDF.
        """
        raise NotImplementedError


class FileSystemPDFClient(BasePDFClient):
    """
    PDF client that reads from the local filesystem.
    """

    def fetch(self, path: "str | Path") -> bytes:
        """
        Read a PDF file from disk and return its raw bytes.

        Args:
            path: Location of the PDF, either as a ``Path`` or as a plain
                string (the form used by the ``BasePDFClient.fetch``
                contract).

        Returns:
            The full contents of the file.

        Raises:
            FileNotFoundError: If nothing exists at ``path``.
            ValueError: If ``path`` exists but is not a regular file.
        """
        # The abstract contract hands over a ``str``; normalise it so the
        # Path-only methods below work for both strings and Path objects.
        path = Path(path)

        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {path}")

        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")

        return path.read_bytes()
|
||||||
26
omniread/pdf/parser.py
Normal file
26
omniread/pdf/parser.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
from typing import Generic, TypeVar
|
||||||
|
from abc import abstractmethod
|
||||||
|
|
||||||
|
from omniread.core.content import ContentType
|
||||||
|
from omniread.core.parser import BaseParser
|
||||||
|
|
||||||
|
T = TypeVar("T")
|
||||||
|
|
||||||
|
|
||||||
|
class PDFParser(BaseParser[T], Generic[T]):
    """
    Abstract foundation for parsers that work on PDF content.

    A subclass is expected to:
    - pick the concrete result type ``T``
    - implement :meth:`parse` with its own parsing strategy
    """

    # Declares PDF as the only supported content type for this parser.
    supported_types = {ContentType.PDF}

    @abstractmethod
    def parse(self) -> T:
        """
        Convert the PDF content into a structured value of type ``T``.

        Must be overridden by concrete parsers.
        """
        raise NotImplementedError
|
||||||
32
omniread/pdf/scraper.py
Normal file
32
omniread/pdf/scraper.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
from typing import Any, Mapping, Optional
|
||||||
|
|
||||||
|
from omniread.core.content import Content, ContentType
|
||||||
|
from omniread.core.scraper import BaseScraper
|
||||||
|
from omniread.pdf.client import BasePDFClient
|
||||||
|
|
||||||
|
|
||||||
|
class PDFScraper(BaseScraper):
    """
    Scraper for PDF sources.

    Byte retrieval is handed off to the injected PDF client; the result
    is wrapped in a ``Content`` object tagged as PDF.
    """

    def __init__(self, *, client: BasePDFClient):
        """
        Args:
            client: Client used to obtain the raw PDF bytes.
        """
        self._client = client

    def fetch(
        self,
        source: str,
        *,
        metadata: Optional[Mapping[str, Any]] = None,
    ) -> Content:
        """
        Retrieve a PDF through the client and wrap it as ``Content``.

        Args:
            source: Identifier forwarded unchanged to the client.
            metadata: Optional extra information to attach. A falsy value
                (``None`` or an empty mapping) is stored as ``None``;
                otherwise a ``dict`` copy is stored.

        Returns:
            ``Content`` carrying the raw bytes, the source and
            ``ContentType.PDF``.
        """
        pdf_bytes = self._client.fetch(source)

        # Copy into a plain dict so the caller's mapping is not shared.
        if metadata:
            attached = dict(metadata)
        else:
            attached = None

        return Content(
            raw=pdf_bytes,
            source=source,
            content_type=ContentType.PDF,
            metadata=attached,
        )
|
||||||
@@ -4,11 +4,21 @@ import httpx
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from jinja2 import Environment, BaseLoader
|
from jinja2 import Environment, BaseLoader
|
||||||
|
|
||||||
from omniread.core.content import ContentType
|
from omniread import (
|
||||||
from omniread.html.scraper import HTMLScraper
|
# core
|
||||||
|
ContentType,
|
||||||
|
|
||||||
|
# html
|
||||||
|
HTMLScraper,
|
||||||
|
|
||||||
|
# pdf
|
||||||
|
FileSystemPDFClient,
|
||||||
|
PDFScraper,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
MOCK_DIR = Path(__file__).parent / "mocks"
|
MOCK_HTML_DIR = Path(__file__).parent / "mocks" / "html"
|
||||||
|
MOCK_PDF_DIR = Path(__file__).parent / "mocks" / "pdf"
|
||||||
|
|
||||||
|
|
||||||
def render_html(template_path, data_path) -> bytes:
|
def render_html(template_path, data_path) -> bytes:
|
||||||
@@ -30,23 +40,17 @@ def mock_transport(request: httpx.Request) -> httpx.Response:
|
|||||||
httpx MockTransport handler.
|
httpx MockTransport handler.
|
||||||
"""
|
"""
|
||||||
path = request.url.path
|
path = request.url.path
|
||||||
|
if path not in ['/simple', '/table']:
|
||||||
if path == "/simple":
|
|
||||||
content = render_html(
|
|
||||||
MOCK_DIR / "simple.html.jinja",
|
|
||||||
MOCK_DIR / "simple.json",
|
|
||||||
)
|
|
||||||
elif path == "/table":
|
|
||||||
content = render_html(
|
|
||||||
MOCK_DIR / "table.html.jinja",
|
|
||||||
MOCK_DIR / "table.json",
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return httpx.Response(
|
return httpx.Response(
|
||||||
status_code=404,
|
status_code=404,
|
||||||
content=b"Not Found",
|
content=b"Not Found",
|
||||||
request=request,
|
request=request,
|
||||||
)
|
)
|
||||||
|
endpoint = path.split("/")[-1]
|
||||||
|
content = render_html(
|
||||||
|
MOCK_HTML_DIR / f"{endpoint}.html.jinja",
|
||||||
|
MOCK_HTML_DIR / f"{endpoint}.json",
|
||||||
|
)
|
||||||
|
|
||||||
return httpx.Response(
|
return httpx.Response(
|
||||||
status_code=200,
|
status_code=200,
|
||||||
@@ -63,3 +67,24 @@ def http_scraper() -> HTMLScraper:
|
|||||||
client = httpx.Client(transport=transport)
|
client = httpx.Client(transport=transport)
|
||||||
|
|
||||||
return HTMLScraper(client=client)
|
return HTMLScraper(client=client)
|
||||||
|
|
||||||
|
|
||||||
|
class MockPDFClient(FileSystemPDFClient):
    """
    PDF client for tests: maps logical identifiers onto fixture
    files stored under ``MOCK_PDF_DIR``.
    """

    def fetch(self, source: str) -> bytes:
        """
        Resolve a known identifier to its fixture PDF and read it.

        Raises:
            FileNotFoundError: If ``source`` has no fixture route.
        """
        # Reject anything that is not an explicitly routed identifier.
        if source not in ["simple"]:
            raise FileNotFoundError(f"No mock PDF route for '{source}'")

        fixture_path = MOCK_PDF_DIR / f"{source}.pdf"
        return super().fetch(fixture_path)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def pdf_scraper() -> PDFScraper:
    """Provide a ``PDFScraper`` wired to the fixture-backed mock client."""
    return PDFScraper(client=MockPDFClient())
|
||||||
|
|||||||
32
tests/mocks/pdf/simple.pdf
Normal file
32
tests/mocks/pdf/simple.pdf
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
%PDF-1.4
|
||||||
|
1 0 obj
|
||||||
|
<< /Type /Catalog /Pages 2 0 R >>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||||
|
endobj
|
||||||
|
3 0 obj
|
||||||
|
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
|
||||||
|
endobj
|
||||||
|
4 0 obj
|
||||||
|
<< /Length 44 >>
|
||||||
|
stream
|
||||||
|
BT
|
||||||
|
/F1 12 Tf
|
||||||
|
72 720 Td
|
||||||
|
(Simple PDF Test) Tj
|
||||||
|
ET
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
xref
|
||||||
|
0 5
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000010 00000 n
|
||||||
|
0000000061 00000 n
|
||||||
|
0000000116 00000 n
|
||||||
|
0000000203 00000 n
|
||||||
|
trailer
|
||||||
|
<< /Size 5 /Root 1 0 R >>
|
||||||
|
startxref
|
||||||
|
300
|
||||||
|
%%EOF
|
||||||
@@ -3,8 +3,13 @@ from typing import Optional
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from bs4 import Tag
|
from bs4 import Tag
|
||||||
|
|
||||||
from omniread.html.parser import HTMLParser
|
from omniread import (
|
||||||
from omniread.core.content import Content
|
# core
|
||||||
|
Content,
|
||||||
|
|
||||||
|
# html
|
||||||
|
HTMLParser,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class ParsedSimpleHTML(BaseModel):
|
class ParsedSimpleHTML(BaseModel):
|
||||||
|
|||||||
@@ -2,8 +2,13 @@ from typing import Optional
|
|||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from omniread.html.parser import HTMLParser
|
from omniread import (
|
||||||
from omniread.core.content import Content
|
# core
|
||||||
|
Content,
|
||||||
|
|
||||||
|
# html
|
||||||
|
HTMLParser,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class ParsedTableHTML(BaseModel):
|
class ParsedTableHTML(BaseModel):
|
||||||
|
|||||||
41
tests/test_pdf_simple.py
Normal file
41
tests/test_pdf_simple.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
from typing import Literal
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from omniread import (
|
||||||
|
# core
|
||||||
|
Content,
|
||||||
|
|
||||||
|
# pdf
|
||||||
|
PDFParser,
|
||||||
|
)
|
||||||
|
|
||||||
|
class ParsedPDF(BaseModel):
    """Minimal structured result produced by the PDF parser under test."""

    # Total length of the raw PDF payload, in bytes.
    size_bytes: int
    # Leading signature of the payload; restricted to the literal b"%PDF".
    magic: Literal[b"%PDF"]
|
||||||
|
|
||||||
|
|
||||||
|
class SimplePDFParser(PDFParser[ParsedPDF]):
    """Test parser that checks the PDF signature and reports the size."""

    def parse(self) -> ParsedPDF:
        """
        Validate the payload header and summarise it.

        Raises:
            ValueError: If the raw content does not begin with ``%PDF``.
        """
        payload = self.content.raw
        looks_like_pdf = payload.startswith(b"%PDF")

        if not looks_like_pdf:
            raise ValueError("Not a valid PDF")

        return ParsedPDF(size_bytes=len(payload), magic=b"%PDF")
|
||||||
|
|
||||||
|
|
||||||
|
def test_end_to_end_pdf_simple(pdf_scraper):
    """Scrape the ``simple`` fixture PDF and parse it end to end."""
    # Scrape: "simple" is a logical identifier routed to a fixture in conftest.
    content: Content = pdf_scraper.fetch("simple")

    assert content.raw.startswith(b"%PDF")

    # Parse the scraped content with the minimal parser defined above.
    result = SimplePDFParser(content).parse()

    assert result.magic == b"%PDF"
    assert result.size_bytes > 100
|
||||||
Reference in New Issue
Block a user