lib init
This commit is contained in:
0
mail_intake/__init__.py
Normal file
0
mail_intake/__init__.py
Normal file
0
mail_intake/adapters/__init__.py
Normal file
0
mail_intake/adapters/__init__.py
Normal file
48
mail_intake/adapters/base.py
Normal file
48
mail_intake/adapters/base.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Iterator, Dict, Any
|
||||
|
||||
|
||||
class MailIntakeAdapter(ABC):
    """
    Provider-neutral contract for read-only mail access.

    A concrete adapter wraps a single mail provider and exposes three
    operations: enumerate message references, fetch one message, and
    fetch one thread. Provider-specific concepts are expected to stay
    inside the implementing class.
    """

    @abstractmethod
    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Yield one lightweight reference per message matching ``query``.

        Each yielded dictionary must contain at least the keys
        ``message_id`` and ``thread_id``, for example::

            {"message_id": "...", "thread_id": "..."}
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Return the full raw message identified by ``message_id``.

        The result is the provider-native payload (for Gmail, the
        message JSON); no normalization is applied at this layer.
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Return the full raw thread identified by ``thread_id``.

        The result is the provider-native thread payload.
        """
        raise NotImplementedError
|
||||
105
mail_intake/adapters/gmail.py
Normal file
105
mail_intake/adapters/gmail.py
Normal file
@@ -0,0 +1,105 @@
|
||||
from typing import Iterator, Dict, Any
|
||||
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.errors import HttpError
|
||||
|
||||
from mail_intake.adapters.base import MailIntakeAdapter
|
||||
from mail_intake.exceptions import MailIntakeAdapterError
|
||||
from mail_intake.auth.base import MailIntakeAuthProvider
|
||||
|
||||
|
||||
class MailIntakeGmailAdapter(MailIntakeAdapter):
    """
    Read-only adapter backed by the Gmail REST API.

    By design this is the single class that:
    - imports googleapiclient
    - knows Gmail request shapes
    - invokes .execute()

    It performs no parsing or normalization of what Gmail returns.
    """

    def __init__(
        self,
        auth_provider: MailIntakeAuthProvider,
        user_id: str = "me",
    ):
        self._user_id = user_id
        self._auth_provider = auth_provider
        # Populated on first access of the ``service`` property.
        self._service = None

    @property
    def service(self):
        """
        Gmail API client, built on first access and reused afterwards.

        Raises:
            MailIntakeAdapterError: if obtaining credentials or building
                the client fails for any reason.
        """
        if self._service is not None:
            return self._service

        try:
            credentials = self._auth_provider.get_credentials()
            self._service = build("gmail", "v1", credentials=credentials)
        except Exception as exc:
            raise MailIntakeAdapterError(
                "Failed to initialize Gmail service"
            ) from exc

        return self._service

    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Yield a reference for every message matching ``query``.

        Result pages are followed via ``list_next`` until it returns None.
        Each yielded item has the shape::

            {"message_id": "...", "thread_id": "..."}

        Raises:
            MailIntakeAdapterError: if the Gmail API reports an HttpError.
        """
        try:
            page_request = self.service.users().messages().list(
                userId=self._user_id, q=query
            )

            while page_request is not None:
                page = page_request.execute()

                # A page without a "messages" key contributes nothing.
                for entry in page.get("messages", []):
                    yield {
                        "message_id": entry["id"],
                        "thread_id": entry["threadId"],
                    }

                page_request = self.service.users().messages().list_next(
                    page_request, page
                )

        except HttpError as exc:
            raise MailIntakeAdapterError(
                "Gmail API error while listing messages"
            ) from exc

    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Return the raw Gmail message for ``message_id``.

        Raises:
            MailIntakeAdapterError: if the Gmail API reports an HttpError.
        """
        try:
            lookup = self.service.users().messages().get(
                userId=self._user_id, id=message_id
            )
            return lookup.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching message {message_id}"
            ) from exc

    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Return the raw Gmail thread for ``thread_id``.

        Raises:
            MailIntakeAdapterError: if the Gmail API reports an HttpError.
        """
        try:
            lookup = self.service.users().threads().get(
                userId=self._user_id, id=thread_id
            )
            return lookup.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching thread {thread_id}"
            ) from exc
|
||||
0
mail_intake/auth/__init__.py
Normal file
0
mail_intake/auth/__init__.py
Normal file
20
mail_intake/auth/base.py
Normal file
20
mail_intake/auth/base.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class MailIntakeAuthProvider(ABC):
    """
    Interface for anything that can supply provider credentials.

    Adapters are written against this abstraction so they never
    depend on a particular OAuth or credential implementation.
    """

    @abstractmethod
    def get_credentials(self):
        """
        Produce the provider-specific credentials object.

        Implementations are synchronous by design: they either return
        valid credentials or raise MailIntakeAuthError.
        """
        raise NotImplementedError
|
||||
81
mail_intake/auth/google.py
Normal file
81
mail_intake/auth/google.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import os
|
||||
import pickle
|
||||
from typing import Sequence
|
||||
|
||||
import google.auth.exceptions
|
||||
from google.auth.transport.requests import Request
|
||||
from google_auth_oauthlib.flow import InstalledAppFlow
|
||||
|
||||
from mail_intake.auth.base import MailIntakeAuthProvider
|
||||
from mail_intake.exceptions import MailIntakeAuthError
|
||||
|
||||
|
||||
class MailIntakeGoogleAuth(MailIntakeAuthProvider):
    """
    Google OAuth provider for Gmail access.

    Responsibilities:
    - Load cached credentials from disk
    - Refresh expired tokens when possible
    - Trigger interactive login only when strictly required

    This class is synchronous and intentionally state-light.
    """

    def __init__(
        self,
        credentials_path: str,
        token_path: str,
        scopes: Sequence[str],
    ):
        self.credentials_path = credentials_path
        self.token_path = token_path
        self.scopes = list(scopes)

    def get_credentials(self):
        """
        Return valid Google credentials, refreshing or logging in as needed.

        Order of attempts:
        1. Use the cached token file if it holds valid credentials.
        2. Refresh the cached credentials if they are expired and carry a
           refresh token.
        3. Otherwise run the interactive OAuth flow.

        Refreshed or newly obtained credentials are written back to
        ``token_path``.

        Raises:
            MailIntakeAuthError: if the client-secrets file is missing, the
                refresh fails for a reason other than a rejected refresh
                token, the OAuth flow fails, or the token file cannot be
                written.
        """
        creds = self._load_cached_credentials()
        if creds and creds.valid:
            return creds

        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
            except google.auth.exceptions.RefreshError:
                # The refresh token was rejected: fall back to login.
                creds = None
            except google.auth.exceptions.GoogleAuthError as exc:
                # E.g. a transport failure. Surface it as a library error
                # instead of leaking a third-party exception or opening a
                # browser because the network is down.
                raise MailIntakeAuthError(
                    "Failed to refresh Google credentials"
                ) from exc
        else:
            # Cached credentials that are invalid and cannot be refreshed
            # must never be returned; discard them so login runs.
            creds = None

        if not creds:
            creds = self._run_interactive_login()

        self._store_credentials(creds)
        return creds

    def _load_cached_credentials(self):
        """Return credentials unpickled from ``token_path``, or None."""
        if not os.path.exists(self.token_path):
            return None

        try:
            with open(self.token_path, "rb") as fh:
                # SECURITY: pickle.load can execute arbitrary code from the
                # file. ``token_path`` must only ever point at a file this
                # application wrote itself.
                return pickle.load(fh)
        except Exception:
            # Deliberately best-effort: an unreadable or corrupt cache is
            # treated the same as a missing one.
            return None

    def _run_interactive_login(self):
        """Run the installed-app OAuth flow and return fresh credentials."""
        if not os.path.exists(self.credentials_path):
            raise MailIntakeAuthError(
                f"Google credentials file not found: {self.credentials_path}"
            )

        try:
            flow = InstalledAppFlow.from_client_secrets_file(
                self.credentials_path,
                self.scopes,
            )
            return flow.run_local_server(port=0)
        except Exception as exc:
            raise MailIntakeAuthError(
                "Failed to complete Google OAuth flow"
            ) from exc

    def _store_credentials(self, creds) -> None:
        """Pickle ``creds`` to ``token_path`` for reuse on the next call."""
        try:
            with open(self.token_path, "wb") as fh:
                pickle.dump(creds, fh)
        except Exception as exc:
            raise MailIntakeAuthError(
                f"Failed to write token file: {self.token_path}"
            ) from exc
|
||||
20
mail_intake/config.py
Normal file
20
mail_intake/config.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MailIntakeConfig:
    """
    Global configuration for mail-intake.

    This configuration is intentionally explicit and immutable.
    No implicit environment reads or global state.

    Every field has a default, so ``MailIntakeConfig()`` is valid. Because
    the dataclass is frozen, fields cannot be reassigned after construction.
    """

    # Name of the mail provider to use; defaults to "gmail".
    # NOTE(review): no code visible alongside this class reads these
    # fields yet — confirm how they are consumed.
    provider: str = "gmail"
    # Provider-side user identifier; "me" matches the Gmail adapter default.
    user_id: str = "me"
    # Flag for read-only operation; enabled by default.
    readonly: bool = True

    # Provider-specific paths (optional at this layer)
    # Path to the provider credentials file, if any.
    credentials_path: Optional[str] = None
    # Path to the cached token file, if any.
    token_path: Optional[str] = None
|
||||
19
mail_intake/exceptions.py
Normal file
19
mail_intake/exceptions.py
Normal file
@@ -0,0 +1,19 @@
|
||||
class MailIntakeError(Exception):
    """
    Base exception for all mail-intake errors.

    Users of the library should catch this type (or subclasses)
    instead of provider-specific or third-party exceptions.

    It adds no behavior of its own; it exists only as a common root
    for the library's exception hierarchy.
    """
|
||||
|
||||
|
||||
class MailIntakeAuthError(MailIntakeError):
    """
    Authentication and credential-related failures.

    Raised by auth providers, e.g. when a credentials file is missing,
    an OAuth flow cannot be completed, or a token file cannot be written.
    """
|
||||
|
||||
|
||||
class MailIntakeAdapterError(MailIntakeError):
    """
    Errors raised by mail provider adapters.

    Used to wrap provider client failures, e.g. when the provider
    service cannot be initialized or an API call fails.
    """
|
||||
|
||||
|
||||
class MailIntakeParsingError(MailIntakeError):
    """
    Errors encountered while parsing message content.

    Raised, for example, when a message body cannot be decoded or a
    raw provider message cannot be turned into a message model.
    """
|
||||
0
mail_intake/ingestion/__init__.py
Normal file
0
mail_intake/ingestion/__init__.py
Normal file
99
mail_intake/ingestion/reader.py
Normal file
99
mail_intake/ingestion/reader.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from datetime import datetime
|
||||
from typing import Iterator, Dict, Any
|
||||
|
||||
from mail_intake.adapters.base import MailIntakeAdapter
|
||||
from mail_intake.models.message import MailIntakeMessage
|
||||
from mail_intake.models.thread import MailIntakeThread
|
||||
from mail_intake.parsers.headers import parse_headers, extract_sender
|
||||
from mail_intake.parsers.body import extract_body
|
||||
from mail_intake.parsers.subject import normalize_subject
|
||||
from mail_intake.exceptions import MailIntakeParsingError
|
||||
|
||||
|
||||
class MailIntakeReader:
    """
    High-level, read-only entry point for mail ingestion.

    Given an adapter, the reader drives the whole pipeline:
    - adapter calls
    - parsing
    - normalization
    - model construction

    It is meant to contain no provider-specific request logic; all
    provider access goes through the adapter.
    """

    def __init__(self, adapter: MailIntakeAdapter):
        self._adapter = adapter

    def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]:
        """
        Lazily yield a parsed message for every reference matching ``query``.

        Each message is fetched and parsed only when the consumer asks
        for the next item.
        """
        for reference in self._adapter.iter_message_refs(query):
            fetched = self._adapter.fetch_message(reference["message_id"])
            yield self._parse_message(fetched)

    def iter_threads(self, query: str) -> Iterator[MailIntakeThread]:
        """
        Return an iterator over threads built from messages matching ``query``.

        All matching messages are fetched and grouped by ``thread_id``
        before anything is returned. A thread's normalized subject is
        taken from the first message seen for that thread.
        """
        grouped: Dict[str, MailIntakeThread] = {}

        for reference in self._adapter.iter_message_refs(query):
            fetched = self._adapter.fetch_message(reference["message_id"])
            message = self._parse_message(fetched)

            key = message.thread_id
            if grouped.get(key) is None:
                grouped[key] = MailIntakeThread(
                    thread_id=key,
                    normalized_subject=normalize_subject(message.subject),
                )

            grouped[key].add_message(message)

        return iter(grouped.values())

    def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage:
        """
        Convert one raw provider message into a MailIntakeMessage.

        Raises:
            MailIntakeParsingError: if any step of the conversion fails;
                the original exception is attached as the cause.
        """
        try:
            msg_id = raw_message["id"]
            thr_id = raw_message["threadId"]

            # Gmail internalDate is milliseconds since epoch.
            # NOTE(review): fromtimestamp() without a tz argument yields a
            # naive datetime in the host's local time — confirm this is
            # intended rather than UTC.
            epoch_ms = int(raw_message.get("internalDate", 0))
            received_at = datetime.fromtimestamp(epoch_ms / 1000)

            payload = raw_message.get("payload", {})
            headers = parse_headers(payload.get("headers", []))
            sender_email, sender_name = extract_sender(headers)

            return MailIntakeMessage(
                message_id=msg_id,
                thread_id=thr_id,
                timestamp=received_at,
                from_email=sender_email,
                from_name=sender_name,
                subject=headers.get("subject", ""),
                body_text=extract_body(payload),
                snippet=raw_message.get("snippet", ""),
                raw_headers=headers,
            )

        except Exception as exc:
            raise MailIntakeParsingError(
                f"Failed to parse message {raw_message.get('id')}"
            ) from exc
|
||||
0
mail_intake/models/__init__.py
Normal file
0
mail_intake/models/__init__.py
Normal file
26
mail_intake/models/message.py
Normal file
26
mail_intake/models/message.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MailIntakeMessage:
    """
    Canonical internal representation of a single email message.

    This model is provider-agnostic and safe to persist.
    No Gmail-specific fields should appear here.

    The dataclass is frozen, so fields cannot be reassigned after
    construction.
    """

    # Identifier of the message and of the thread it belongs to.
    message_id: str
    thread_id: str
    # Message time.
    # NOTE(review): the reader builds this with datetime.fromtimestamp()
    # and no tz, i.e. a naive local-time value — confirm the intended
    # timezone convention.
    timestamp: datetime

    # Sender address; the header parser returns "" when no From header exists.
    from_email: str
    # Sender display name, or None when it cannot be determined.
    from_name: Optional[str]

    # Subject as found in the headers (not normalized).
    subject: str
    # Best-effort plain-text body.
    body_text: str
    # Short preview text supplied with the raw message.
    snippet: str

    # Parsed headers; the header parser lower-cases the keys.
    raw_headers: Dict[str, str]
|
||||
35
mail_intake/models/thread.py
Normal file
35
mail_intake/models/thread.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import List, Set
|
||||
|
||||
from mail_intake.models.message import MailIntakeMessage
|
||||
|
||||
|
||||
@dataclass
class MailIntakeThread:
    """
    Canonical internal model of an email thread.

    A thread collects the messages sharing one ``thread_id`` and keeps
    a few derived fields up to date as messages are added. Threads are
    the primary unit of reasoning for correspondence workflows
    (job applications, interviews, follow-ups, etc.).
    """

    thread_id: str
    normalized_subject: str

    # Sender addresses of the messages added so far.
    participants: Set[str] = field(default_factory=set)
    # Messages in the order they were added.
    messages: List[MailIntakeMessage] = field(default_factory=list)

    # Latest timestamp among the added messages; None while empty.
    last_activity_at: datetime | None = None

    def add_message(self, message: MailIntakeMessage) -> None:
        """
        Append ``message`` and refresh the derived fields.

        The sender is recorded as a participant unless its address is
        empty, and ``last_activity_at`` moves forward when the message
        is newer than anything seen so far.
        """
        self.messages.append(message)

        sender = message.from_email
        if sender:
            self.participants.add(sender)

        latest = self.last_activity_at
        if latest is None or message.timestamp > latest:
            self.last_activity_at = message.timestamp
|
||||
0
mail_intake/parsers/__init__.py
Normal file
0
mail_intake/parsers/__init__.py
Normal file
83
mail_intake/parsers/body.py
Normal file
83
mail_intake/parsers/body.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import base64
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from mail_intake.exceptions import MailIntakeParsingError
|
||||
|
||||
|
||||
def _decode_base64(data: str) -> str:
|
||||
"""
|
||||
Decode Gmail URL-safe base64 payload into UTF-8 text.
|
||||
"""
|
||||
try:
|
||||
padded = data.replace("-", "+").replace("_", "/")
|
||||
decoded = base64.b64decode(padded)
|
||||
return decoded.decode("utf-8", errors="replace")
|
||||
except Exception as exc:
|
||||
raise MailIntakeParsingError("Failed to decode message body") from exc
|
||||
|
||||
|
||||
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
|
||||
"""
|
||||
Extract text content from a single MIME part.
|
||||
"""
|
||||
mime_type = part.get("mimeType")
|
||||
body = part.get("body", {})
|
||||
data = body.get("data")
|
||||
|
||||
if not data:
|
||||
return None
|
||||
|
||||
text = _decode_base64(data)
|
||||
|
||||
if mime_type == "text/plain":
|
||||
return text
|
||||
|
||||
if mime_type == "text/html":
|
||||
# soup = BeautifulSoup(text, "lxml")
|
||||
soup = BeautifulSoup(text, "html.parser")
|
||||
return soup.get_text(separator="\n", strip=True)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _flatten_parts(parts: Any) -> list:
    """Return every MIME part under ``parts``, depth-first, in document order."""
    flat: list = []
    for part in parts or []:
        flat.append(part)
        flat.extend(_flatten_parts(part.get("parts")))
    return flat


def extract_body(payload: Dict[str, Any]) -> str:
    """
    Extract the best-effort message body from a Gmail payload.

    Priority:
    1. text/plain
    2. text/html (stripped to text)
    3. empty string (if nothing usable found)

    Multipart payloads are searched recursively, so text nested inside
    container parts (a part that itself has ``parts``) is found. The
    first non-empty part of each type wins.

    Raises:
        MailIntakeParsingError: if a candidate part cannot be decoded.
    """
    if not payload:
        return ""

    # Multipart message
    if "parts" in payload:
        text_plain = None
        text_html = None

        for part in _flatten_parts(payload.get("parts")):
            content = _extract_from_part(part)
            if not content:
                continue

            mime_type = part.get("mimeType")
            if mime_type == "text/plain" and text_plain is None:
                text_plain = content
            elif mime_type == "text/html" and text_html is None:
                text_html = content

        if text_plain:
            return text_plain
        if text_html:
            return text_html

    # Single-part message. HTML goes through the part extractor so it is
    # stripped to text, as the priority list above promises.
    if payload.get("mimeType") == "text/html":
        return _extract_from_part(payload) or ""

    data = (payload.get("body") or {}).get("data")
    if data:
        return _decode_base64(data)

    return ""
|
||||
58
mail_intake/parsers/headers.py
Normal file
58
mail_intake/parsers/headers.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
|
||||
def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
    """
    Flatten a Gmail-style header list into a dict keyed by lower-case name.

    Input:
        [
            {"name": "From", "value": "John Doe <john@example.com>"},
            {"name": "Subject", "value": "Re: Interview Update"},
            ...
        ]

    Output:
        {
            "from": "...",
            "subject": "...",
            ...
        }

    Entries with a missing or empty name, or with a value of None, are
    skipped. Values are stripped of surrounding whitespace. When a name
    occurs more than once, the last occurrence wins.
    """
    normalized: Dict[str, str] = {}

    for entry in raw_headers or []:
        key, val = entry.get("name"), entry.get("value")
        if key and val is not None:
            normalized[key.lower()] = val.strip()

    return normalized
|
||||
|
||||
|
||||
def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
    """
    Extract sender email and optional display name from headers.

    Recognized forms of the ``from`` header:
        Name <email@domain>
        email@domain

    Returns:
        (email, name)

    ``email`` is "" when there is no ``from`` header. ``name`` is None
    when no display name can be determined. Surrounding double quotes
    are removed from the display name.
    """
    from_header = headers.get("from")
    if not from_header:
        return "", None

    if "<" in from_header and ">" in from_header:
        name_part, _, remainder = from_header.partition("<")
        # Keep only the text up to the closing bracket so that anything
        # trailing it (e.g. "(comment)") does not end up in the address.
        email = remainder.partition(">")[0].strip()
        name = name_part.strip().strip('"') or None
        return email, name

    return from_header.strip(), None
|
||||
33
mail_intake/parsers/subject.py
Normal file
33
mail_intake/parsers/subject.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import re
|
||||
|
||||
|
||||
# Matches one leading reply/forward marker such as "Re:", "FW :" or "fwd:".
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to a form suitable for thread-level comparison.

    Steps:
    - Remove leading reply/forward prefixes (Re:, Fwd:, FW:), however
      many are stacked
    - Collapse runs of whitespace into single spaces
    - Leave the casing untouched

    A falsy subject yields "". The function is intentionally conservative.
    """
    if not subject:
        return ""

    remaining = subject.strip()

    # Peel prefixes off one at a time (e.g., Re: Fwd: Re:)
    prefix = _PREFIX_RE.match(remaining)
    while prefix is not None:
        remaining = remaining[prefix.end():].strip()
        prefix = _PREFIX_RE.match(remaining)

    # Normalize whitespace
    return " ".join(remaining.split())
|
||||
94
pyproject.toml
Normal file
94
pyproject.toml
Normal file
@@ -0,0 +1,94 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=68", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
|
||||
[project]
|
||||
name = "mail-intake"
|
||||
version = "0.0.1"
|
||||
description = "Structured mail ingestion and correspondence parsing with provider adapters (Gmail-first)."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = { text = "MIT" }
|
||||
|
||||
authors = [
|
||||
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
|
||||
]
|
||||
maintainers = [
|
||||
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
|
||||
]
|
||||
|
||||
|
||||
keywords = [
|
||||
"email",
|
||||
"gmail",
|
||||
"mail",
|
||||
"ingestion",
|
||||
"automation",
|
||||
"job-search",
|
||||
"correspondence",
|
||||
]
|
||||
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Topic :: Communications :: Email",
|
||||
"Topic :: Software Development :: Libraries",
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
# Gmail API stack
|
||||
"google-api-python-client>=2.120.0",
|
||||
"google-auth>=2.28.0",
|
||||
"google-auth-oauthlib>=1.2.0",
|
||||
|
||||
# Parsing
|
||||
"beautifulsoup4>=4.12.0",
|
||||
"lxml>=5.1.0",
|
||||
]
|
||||
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest>=8.0.0",
|
||||
"pytest-cov>=4.1.0",
|
||||
"ruff>=0.3.0",
|
||||
"mypy>=1.8.0",
|
||||
"types-beautifulsoup4",
|
||||
]
|
||||
|
||||
docs = [
|
||||
"mkdocs>=1.5.0",
|
||||
"mkdocs-material>=9.5.0",
|
||||
]
|
||||
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://git.aetoskia.com/aetos/mail-intake"
|
||||
Documentation = "https://git.aetoskia.com/aetos/mail-intake#readme"
|
||||
Repository = "https://git.aetoskia.com/aetos/mail-intake.git"
|
||||
Issues = "https://git.aetoskia.com/aetos/mail-intake/issues"
|
||||
Versions = "https://git.aetoskia.com/aetos/mail-intake/tags"
|
||||
|
||||
|
||||
# NOTE(review): the package files added alongside this config live at
# mail_intake/... in the repository root, not under src/. Confirm that the
# src/ layout configured below matches where the package actually lives;
# otherwise package discovery will find nothing to install.
[tool.setuptools]
package-dir = { "" = "src" }

[tool.setuptools.packages.find]
where = ["src"]
include = ["mail_intake*"]
|
||||
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py310"
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.10"
|
||||
strict = true
|
||||
ignore_missing_imports = true
|
||||
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
beautifulsoup4==4.12.0
|
||||
|
||||
pytest==7.4.0
|
||||
pytest-asyncio==0.21.0
|
||||
pytest-cov==4.1.0
|
||||
|
||||
types-beautifulsoup4
|
||||
|
||||
# Optional, useful locally
|
||||
ipython
|
||||
91
tests/unit/test_models.py
Normal file
91
tests/unit/test_models.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from mail_intake.models.message import MailIntakeMessage
|
||||
from mail_intake.models.thread import MailIntakeThread
|
||||
|
||||
|
||||
def test_message_is_immutable():
    """Assigning to a field of a frozen MailIntakeMessage must raise."""
    # Local import: this module does not import dataclasses at the top.
    from dataclasses import FrozenInstanceError

    msg = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        # Fixed value keeps the test deterministic.
        timestamp=datetime(2024, 1, 1, 12, 0, 0),
        from_email="alice@example.com",
        from_name="Alice",
        subject="Hello",
        body_text="Body",
        snippet="Snippet",
        raw_headers={"from": "Alice <alice@example.com>"},
    )

    # Catch only the frozen-dataclass error. A broad ``except Exception``
    # would also swallow the failure assertion itself, so the test could
    # never fail.
    try:
        msg.subject = "Changed"
    except FrozenInstanceError:
        return

    raise AssertionError("Message should be immutable")
|
||||
|
||||
|
||||
def test_thread_add_message_updates_participants_and_timestamp():
    """Adding messages grows the thread and advances last_activity_at."""
    earlier = datetime.utcnow()
    later = earlier + timedelta(minutes=5)

    opener = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        timestamp=earlier,
        from_email="alice@example.com",
        from_name="Alice",
        subject="Hello",
        body_text="Body",
        snippet="Snippet",
        raw_headers={},
    )
    reply = MailIntakeMessage(
        message_id="m2",
        thread_id="t1",
        timestamp=later,
        from_email="bob@example.com",
        from_name="Bob",
        subject="Re: Hello",
        body_text="Reply",
        snippet="Reply",
        raw_headers={},
    )

    conversation = MailIntakeThread(thread_id="t1", normalized_subject="Hello")

    # First message seeds the derived fields.
    conversation.add_message(opener)
    assert conversation.last_activity_at == earlier
    assert "alice@example.com" in conversation.participants

    # A newer message moves the activity marker and adds its sender.
    conversation.add_message(reply)
    assert conversation.last_activity_at == later
    assert "bob@example.com" in conversation.participants
    assert len(conversation.messages) == 2
|
||||
|
||||
|
||||
def test_thread_handles_messages_without_sender():
    """A message with an empty sender adds no participant but still counts as activity."""
    anonymous = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        timestamp=datetime.utcnow(),
        from_email="",
        from_name=None,
        subject="System Message",
        body_text="Body",
        snippet="Snippet",
        raw_headers={},
    )
    conversation = MailIntakeThread(
        thread_id="t1",
        normalized_subject="System Message",
    )

    conversation.add_message(anonymous)

    assert len(conversation.participants) == 0
    assert conversation.last_activity_at is not None
|
||||
128
tests/unit/test_parsers.py
Normal file
128
tests/unit/test_parsers.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import base64
|
||||
|
||||
from mail_intake.parsers.subject import normalize_subject
|
||||
from mail_intake.parsers.headers import parse_headers, extract_sender
|
||||
from mail_intake.parsers.body import extract_body
|
||||
|
||||
|
||||
def _b64(text: str) -> str:
|
||||
return base64.b64encode(text.encode("utf-8")).decode("utf-8")
|
||||
|
||||
|
||||
# --------------------
|
||||
# Subject parsing
|
||||
# --------------------
|
||||
|
||||
def test_normalize_subject_strips_common_prefixes():
    """Reply/forward prefixes are removed, including stacked ones."""
    expectations = {
        "Re: Interview Update": "Interview Update",
        "Fwd: Re: Offer Letter": "Offer Letter",
        "FW: Re: FW: Status": "Status",
    }

    for raw, cleaned in expectations.items():
        assert normalize_subject(raw) == cleaned
|
||||
|
||||
|
||||
def test_normalize_subject_preserves_content_and_case():
    """A subject without prefixes comes back unchanged, casing included."""
    untouched = "Interview Update – Backend Role"

    result = normalize_subject(untouched)

    assert result == untouched
|
||||
|
||||
|
||||
def test_normalize_subject_empty_and_none_safe():
    """Both an empty string and None normalize to the empty string."""
    assert normalize_subject("") == ""
    # The test name promises None-safety, so check it explicitly.
    assert normalize_subject(None) == ""
|
||||
|
||||
|
||||
# --------------------
|
||||
# Header parsing
|
||||
# --------------------
|
||||
|
||||
def test_parse_headers_lowercases_keys():
    """Header names are exposed as lower-case keys with their values intact."""
    parsed = parse_headers(
        [
            {"name": "From", "value": "Alice <alice@example.com>"},
            {"name": "Subject", "value": "Hello"},
        ]
    )

    assert parsed["from"] == "Alice <alice@example.com>"
    assert parsed["subject"] == "Hello"
|
||||
|
||||
|
||||
def test_parse_headers_ignores_invalid_entries():
    """Entries lacking a name or a value are dropped; valid ones are kept."""
    raw_headers = [
        {"name": "From", "value": "Bob <bob@example.com>"},
        {"name": None, "value": "X"},
        {"name": "X-Test", "value": None},
    ]

    headers = parse_headers(raw_headers)

    assert "from" in headers
    assert "x-test" not in headers
    # Comparing the whole result also proves the name=None entry left
    # nothing behind, which the membership checks alone do not show.
    assert headers == {"from": "Bob <bob@example.com>"}
|
||||
|
||||
|
||||
def test_extract_sender_with_name_and_email():
    """A 'Name <email>' header yields both the address and the display name."""
    sender = extract_sender({"from": "Alice Smith <alice@example.com>"})

    address, display_name = sender
    assert address == "alice@example.com"
    assert display_name == "Alice Smith"
|
||||
|
||||
|
||||
def test_extract_sender_email_only():
    """A bare address yields that address and no display name."""
    sender = extract_sender({"from": "bob@example.com"})

    address, display_name = sender
    assert address == "bob@example.com"
    assert display_name is None
|
||||
|
||||
|
||||
def test_extract_sender_missing_from():
    """Without a From header the result is an empty address and no name."""
    address, display_name = extract_sender({})

    assert address == ""
    assert display_name is None
|
||||
|
||||
|
||||
# --------------------
|
||||
# Body parsing
|
||||
# --------------------
|
||||
|
||||
def test_extract_body_prefers_text_plain():
    """With both variants present, the text/plain part wins even if listed second."""
    html_part = {
        "mimeType": "text/html",
        "body": {"data": _b64("<p>Hello <b>HTML</b></p>")},
    }
    plain_part = {
        "mimeType": "text/plain",
        "body": {"data": _b64("Hello TEXT")},
    }

    extracted = extract_body({"parts": [html_part, plain_part]})

    assert extracted == "Hello TEXT"
|
||||
|
||||
|
||||
def test_extract_body_falls_back_to_html():
    """When only an HTML part exists, its text content is returned."""
    html_only = {
        "parts": [
            {
                "mimeType": "text/html",
                "body": {"data": _b64("<p>Hello <b>World</b></p>")},
            }
        ]
    }

    extracted = extract_body(html_only)

    assert "Hello" in extracted
    assert "World" in extracted
|
||||
|
||||
|
||||
def test_extract_body_single_part():
    """A payload without parts is decoded straight from its own body data."""
    single = {"body": {"data": _b64("Single part body")}}

    extracted = extract_body(single)

    assert extracted == "Single part body"
|
||||
|
||||
|
||||
def test_extract_body_empty_payload():
    """An empty payload produces an empty body."""
    extracted = extract_body({})

    assert extracted == ""
|
||||
Reference in New Issue
Block a user