lib init
This commit is contained in:
0
mail_intake/__init__.py
Normal file
0
mail_intake/__init__.py
Normal file
0
mail_intake/adapters/__init__.py
Normal file
0
mail_intake/adapters/__init__.py
Normal file
48
mail_intake/adapters/base.py
Normal file
48
mail_intake/adapters/base.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Iterator, Dict, Any
|
||||||
|
|
||||||
|
|
||||||
|
class MailIntakeAdapter(ABC):
    """
    Provider-agnostic contract for read-only mail access.

    Each mail provider supplies one concrete subclass implementing the
    three operations below. Provider-specific concepts are meant to stay
    inside those subclasses and not leak to callers.
    """

    @abstractmethod
    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Yield lightweight references to the messages matching ``query``.

        Every yielded dictionary must contain at least the keys
        ``"message_id"`` and ``"thread_id"``, for example::

            {"message_id": "...", "thread_id": "..."}
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Return the full raw message identified by ``message_id``.

        The result is the provider-native payload (for instance the
        Gmail message JSON), not a parsed model.
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Return the full raw thread identified by ``thread_id``.

        The result is the provider-native thread payload.
        """
        raise NotImplementedError
|
||||||
105
mail_intake/adapters/gmail.py
Normal file
105
mail_intake/adapters/gmail.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
from typing import Iterator, Dict, Any
|
||||||
|
|
||||||
|
from googleapiclient.discovery import build
|
||||||
|
from googleapiclient.errors import HttpError
|
||||||
|
|
||||||
|
from mail_intake.adapters.base import MailIntakeAdapter
|
||||||
|
from mail_intake.exceptions import MailIntakeAdapterError
|
||||||
|
from mail_intake.auth.base import MailIntakeAuthProvider
|
||||||
|
|
||||||
|
|
||||||
|
class MailIntakeGmailAdapter(MailIntakeAdapter):
    """
    Read-only adapter backed by the Gmail REST API.

    By design this is meant to be the single class that imports
    ``googleapiclient``, knows Gmail request/response shapes, and calls
    ``.execute()``. Keep it thin: no parsing or business logic here.
    """

    def __init__(
        self,
        auth_provider: MailIntakeAuthProvider,
        user_id: str = "me",
    ):
        # The API client is created lazily on first use (see ``service``).
        self._service = None
        self._user_id = user_id
        self._auth_provider = auth_provider

    @property
    def service(self):
        """
        Return the Gmail API client, building and caching it on first access.

        Raises:
            MailIntakeAdapterError: if obtaining credentials or building
                the client fails for any reason.
        """
        if self._service is not None:
            return self._service

        try:
            credentials = self._auth_provider.get_credentials()
            self._service = build("gmail", "v1", credentials=credentials)
        except Exception as exc:
            raise MailIntakeAdapterError(
                "Failed to initialize Gmail service"
            ) from exc

        return self._service

    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Lazily yield a reference for every message matching ``query``.

        Result pages are requested one at a time while the iterator is
        consumed. Each yielded item has the form::

            {"message_id": "...", "thread_id": "..."}

        Raises:
            MailIntakeAdapterError: if a Gmail API call raises ``HttpError``.
        """
        try:
            page_request = (
                self.service.users()
                .messages()
                .list(userId=self._user_id, q=query)
            )

            # The loop ends as soon as ``list_next`` hands back None.
            while page_request is not None:
                page = page_request.execute()

                for entry in page.get("messages", []):
                    yield {
                        "message_id": entry["id"],
                        "thread_id": entry["threadId"],
                    }

                page_request = (
                    self.service.users()
                    .messages()
                    .list_next(page_request, page)
                )
        except HttpError as exc:
            raise MailIntakeAdapterError(
                "Gmail API error while listing messages"
            ) from exc

    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Fetch the raw Gmail message JSON for ``message_id``.

        Raises:
            MailIntakeAdapterError: if the Gmail API call raises ``HttpError``.
        """
        try:
            messages_api = self.service.users().messages()
            get_request = messages_api.get(userId=self._user_id, id=message_id)
            return get_request.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching message {message_id}"
            ) from exc

    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Fetch the raw Gmail thread JSON for ``thread_id``.

        Raises:
            MailIntakeAdapterError: if the Gmail API call raises ``HttpError``.
        """
        try:
            threads_api = self.service.users().threads()
            get_request = threads_api.get(userId=self._user_id, id=thread_id)
            return get_request.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching thread {thread_id}"
            ) from exc
|
||||||
0
mail_intake/auth/__init__.py
Normal file
0
mail_intake/auth/__init__.py
Normal file
20
mail_intake/auth/base.py
Normal file
20
mail_intake/auth/base.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
|
||||||
|
class MailIntakeAuthProvider(ABC):
    """
    Interface every authentication provider must implement.

    Mail adapters are written against this abstraction so they never
    depend on a concrete OAuth or credential implementation.
    """

    @abstractmethod
    def get_credentials(self):
        """
        Return the provider-specific credentials object.

        The call is synchronous by design. Implementations must either
        return valid credentials or raise ``MailIntakeAuthError``.
        """
        raise NotImplementedError
|
||||||
81
mail_intake/auth/google.py
Normal file
81
mail_intake/auth/google.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
from typing import Sequence
|
||||||
|
|
||||||
|
import google.auth.exceptions
|
||||||
|
from google.auth.transport.requests import Request
|
||||||
|
from google_auth_oauthlib.flow import InstalledAppFlow
|
||||||
|
|
||||||
|
from mail_intake.auth.base import MailIntakeAuthProvider
|
||||||
|
from mail_intake.exceptions import MailIntakeAuthError
|
||||||
|
|
||||||
|
|
||||||
|
class MailIntakeGoogleAuth(MailIntakeAuthProvider):
    """
    Google OAuth provider for Gmail access.

    Responsibilities:
    - Load cached credentials from disk
    - Refresh expired tokens when possible
    - Trigger interactive login only when strictly required

    This class is synchronous and intentionally state-light: the only
    state kept is the two file paths and the requested scopes.
    """

    def __init__(
        self,
        credentials_path: str,
        token_path: str,
        scopes: Sequence[str],
    ):
        self.credentials_path = credentials_path
        self.token_path = token_path
        # Copy so later mutation of the caller's sequence has no effect.
        self.scopes = list(scopes)

    def get_credentials(self):
        """
        Return valid Google credentials, refreshing or re-authorizing as needed.

        Order of attempts:
        1. Cached credentials from ``token_path`` that are still valid.
        2. Cached credentials that are expired and carry a refresh token:
           refresh them.
        3. Otherwise (no cache, unreadable cache, failed refresh, or cached
           credentials that are invalid and cannot be refreshed): run the
           interactive OAuth flow.

        Refreshed or newly obtained credentials are written back to
        ``token_path``.

        Raises:
            MailIntakeAuthError: if the client secrets file is missing, the
                OAuth flow fails, or the token file cannot be written.
        """
        creds = self._load_cached_credentials()

        if creds and creds.valid:
            return creds

        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
            except google.auth.exceptions.RefreshError:
                creds = None
        else:
            # Invalid credentials that cannot be refreshed must not be
            # returned (or re-persisted) as if they were usable; discard
            # them so the interactive flow below runs.
            creds = None

        if not creds:
            creds = self._run_interactive_login()

        self._store_credentials(creds)
        return creds

    def _load_cached_credentials(self):
        """Return credentials unpickled from ``token_path``, or None if unavailable."""
        if not os.path.exists(self.token_path):
            return None

        try:
            with open(self.token_path, "rb") as fh:
                # NOTE(review): pickle.load executes arbitrary code if the
                # token file has been tampered with. token_path must point
                # to a trusted, user-private location; consider moving to
                # a JSON token format.
                return pickle.load(fh)
        except Exception:
            # Best effort: an unreadable or corrupt cache is treated as
            # "no cached credentials" rather than a hard failure.
            return None

    def _run_interactive_login(self):
        """Run the installed-app OAuth flow and return the new credentials."""
        if not os.path.exists(self.credentials_path):
            raise MailIntakeAuthError(
                f"Google credentials file not found: {self.credentials_path}"
            )

        try:
            flow = InstalledAppFlow.from_client_secrets_file(
                self.credentials_path,
                self.scopes,
            )
            return flow.run_local_server(port=0)
        except Exception as exc:
            raise MailIntakeAuthError(
                "Failed to complete Google OAuth flow"
            ) from exc

    def _store_credentials(self, creds) -> None:
        """Persist ``creds`` to ``token_path`` for reuse by later calls."""
        try:
            with open(self.token_path, "wb") as fh:
                pickle.dump(creds, fh)
        except Exception as exc:
            raise MailIntakeAuthError(
                f"Failed to write token file: {self.token_path}"
            ) from exc
|
||||||
20
mail_intake/config.py
Normal file
20
mail_intake/config.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class MailIntakeConfig:
    """
    Immutable, explicit configuration for mail-intake.

    All values are passed in by the caller; this class itself performs
    no environment reads and holds no global state.
    """

    # Identifier of the mail provider to use.
    provider: str = "gmail"
    # Provider-side account identifier.
    # NOTE(review): presumably forwarded to the adapter's ``user_id`` - confirm.
    user_id: str = "me"
    # Read-only flag.
    # NOTE(review): the consumer of this flag is not visible here - confirm.
    readonly: bool = True

    # Provider-specific file paths; optional at this layer.
    credentials_path: Optional[str] = None
    token_path: Optional[str] = None
|
||||||
19
mail_intake/exceptions.py
Normal file
19
mail_intake/exceptions.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
class MailIntakeError(Exception):
    """
    Root of the mail-intake exception hierarchy.

    Library users should catch this type (or one of its subclasses)
    rather than provider-specific or third-party exceptions.
    """


class MailIntakeAuthError(MailIntakeError):
    """Raised for authentication and credential-related failures."""


class MailIntakeAdapterError(MailIntakeError):
    """Raised by mail provider adapters."""


class MailIntakeParsingError(MailIntakeError):
    """Raised when message content cannot be parsed."""
|
||||||
0
mail_intake/ingestion/__init__.py
Normal file
0
mail_intake/ingestion/__init__.py
Normal file
99
mail_intake/ingestion/reader.py
Normal file
99
mail_intake/ingestion/reader.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
from typing import Iterator, Dict, Any
|
||||||
|
|
||||||
|
from mail_intake.adapters.base import MailIntakeAdapter
|
||||||
|
from mail_intake.models.message import MailIntakeMessage
|
||||||
|
from mail_intake.models.thread import MailIntakeThread
|
||||||
|
from mail_intake.parsers.headers import parse_headers, extract_sender
|
||||||
|
from mail_intake.parsers.body import extract_body
|
||||||
|
from mail_intake.parsers.subject import normalize_subject
|
||||||
|
from mail_intake.exceptions import MailIntakeParsingError
|
||||||
|
|
||||||
|
|
||||||
|
class MailIntakeReader:
    """
    High-level read-only ingestion interface.

    This is the primary entry point users should interact with.
    It orchestrates:
    - adapter calls
    - parsing
    - normalization
    - model construction

    NOTE(review): ``_parse_message`` reads Gmail-shaped keys (``id``,
    ``threadId``, ``internalDate``, ``payload``), so the reader is not
    yet fully provider-agnostic.
    """

    def __init__(self, adapter: MailIntakeAdapter):
        self._adapter = adapter

    def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]:
        """
        Lazily yield parsed messages matching a provider query.

        Each message is fetched and parsed only when the iterator is
        advanced.

        Raises:
            MailIntakeParsingError: if a fetched message cannot be parsed.
        """
        for ref in self._adapter.iter_message_refs(query):
            raw = self._adapter.fetch_message(ref["message_id"])
            yield self._parse_message(raw)

    def iter_threads(self, query: str) -> Iterator[MailIntakeThread]:
        """
        Iterate over threads built from the messages matching a query.

        Messages are grouped by ``thread_id``. A thread contains only the
        messages that matched ``query``; the adapter's ``fetch_thread`` is
        not used. All matching messages are fetched and grouped before the
        iterator is returned, so this call is eager.

        The thread's ``normalized_subject`` is derived from the first
        message seen for that thread.
        """
        threads: Dict[str, MailIntakeThread] = {}

        # Reuse iter_messages so fetch and parse logic lives in one place.
        for message in self.iter_messages(query):
            thread = threads.get(message.thread_id)
            if thread is None:
                thread = MailIntakeThread(
                    thread_id=message.thread_id,
                    normalized_subject=normalize_subject(message.subject),
                )
                threads[message.thread_id] = thread

            thread.add_message(message)

        return iter(threads.values())

    def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage:
        """
        Parse a raw provider message into a MailIntakeMessage.

        The timestamp is a timezone-aware UTC ``datetime`` so the value
        does not depend on the host's local timezone. A missing
        ``internalDate`` is treated as 0 (the Unix epoch).

        Raises:
            MailIntakeParsingError: on any failure, with the original
                exception chained as the cause.
        """
        # Local import: the module top level only imports ``datetime``.
        from datetime import timezone

        try:
            message_id = raw_message["id"]
            thread_id = raw_message["threadId"]

            # Gmail internalDate is milliseconds since epoch
            timestamp_ms = int(raw_message.get("internalDate", 0))
            timestamp = datetime.fromtimestamp(
                timestamp_ms / 1000, tz=timezone.utc
            )

            payload = raw_message.get("payload", {})
            raw_headers_list = payload.get("headers", [])

            headers = parse_headers(raw_headers_list)
            from_email, from_name = extract_sender(headers)

            subject = headers.get("subject", "")
            body_text = extract_body(payload)
            snippet = raw_message.get("snippet", "")

            return MailIntakeMessage(
                message_id=message_id,
                thread_id=thread_id,
                timestamp=timestamp,
                from_email=from_email,
                from_name=from_name,
                subject=subject,
                body_text=body_text,
                snippet=snippet,
                raw_headers=headers,
            )

        except Exception as exc:
            # Guard the lookup so a non-dict payload still surfaces as a
            # MailIntakeParsingError instead of a secondary AttributeError.
            failed_id = (
                raw_message.get("id") if isinstance(raw_message, dict) else None
            )
            raise MailIntakeParsingError(
                f"Failed to parse message {failed_id}"
            ) from exc
|
||||||
0
mail_intake/models/__init__.py
Normal file
0
mail_intake/models/__init__.py
Normal file
26
mail_intake/models/message.py
Normal file
26
mail_intake/models/message.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Dict
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class MailIntakeMessage:
    """
    Provider-agnostic, immutable model of a single email message.

    Intended to be safe to persist; no Gmail-specific fields should
    be added here.
    """

    # Identifiers of the message and of the thread it belongs to.
    message_id: str
    thread_id: str
    # When the message was sent/received, as reported by the provider.
    timestamp: datetime

    # Sender address and, when available, display name.
    from_email: str
    from_name: Optional[str]

    # Subject line, extracted body text, and the provider's short preview.
    subject: str
    body_text: str
    snippet: str

    # Parsed message headers as a name -> value mapping.
    raw_headers: Dict[str, str]
|
||||||
35
mail_intake/models/thread.py
Normal file
35
mail_intake/models/thread.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Set
|
||||||
|
|
||||||
|
from mail_intake.models.message import MailIntakeMessage
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MailIntakeThread:
    """
    Provider-agnostic model of an email thread.

    Threads are the primary unit of reasoning for correspondence
    workflows (job applications, interviews, follow-ups, etc.).
    """

    thread_id: str
    normalized_subject: str

    # Distinct non-empty sender addresses of the messages added so far.
    participants: Set[str] = field(default_factory=set)
    # Messages in the order they were added.
    messages: List[MailIntakeMessage] = field(default_factory=list)

    # Latest message timestamp seen; stays None until a message is added.
    last_activity_at: datetime | None = None

    def add_message(self, message: MailIntakeMessage) -> None:
        """
        Append ``message`` and refresh the derived fields.

        The sender is recorded as a participant when it is non-empty,
        and ``last_activity_at`` moves forward if this message is newer
        than anything seen so far.
        """
        self.messages.append(message)

        sender = message.from_email
        if sender:
            self.participants.add(sender)

        previous_latest = self.last_activity_at
        if previous_latest is None or message.timestamp > previous_latest:
            self.last_activity_at = message.timestamp
|
||||||
0
mail_intake/parsers/__init__.py
Normal file
0
mail_intake/parsers/__init__.py
Normal file
83
mail_intake/parsers/body.py
Normal file
83
mail_intake/parsers/body.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
import base64
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from mail_intake.exceptions import MailIntakeParsingError
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_base64(data: str) -> str:
    """
    Decode a Gmail URL-safe base64 payload into UTF-8 text.

    The trailing ``=`` padding is restored before decoding because it is
    not guaranteed to be present in base64url data. Bytes that are not
    valid UTF-8 are replaced rather than raising.

    Raises:
        MailIntakeParsingError: if the data cannot be base64-decoded.
    """
    try:
        padded = data + "=" * (-len(data) % 4)
        raw_bytes = base64.urlsafe_b64decode(padded)
        return raw_bytes.decode("utf-8", errors="replace")
    except Exception as exc:
        raise MailIntakeParsingError("Failed to decode message body") from exc


def _html_to_text(html: str) -> str:
    """Strip markup from ``html`` and return its text fragments joined by newlines."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n", strip=True)


def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
    """
    Extract text content from a single (leaf) MIME part.

    Returns the decoded text for ``text/plain``, the markup-stripped text
    for ``text/html``, and None for any other MIME type or when the part
    carries no inline data.
    """
    mime_type = part.get("mimeType")
    # Skip non-text parts before decoding: their data is never used.
    if mime_type not in ("text/plain", "text/html"):
        return None

    body = part.get("body") or {}
    data = body.get("data")
    if not data:
        return None

    text = _decode_base64(data)
    if mime_type == "text/html":
        return _html_to_text(text)
    return text


def _collect_text_bodies(parts) -> "tuple[Optional[str], Optional[str]]":
    """
    Walk ``parts`` depth-first and return ``(first text/plain, first text/html)``.

    Parts that themselves contain ``parts`` (for example a
    multipart/alternative nested inside multipart/mixed) are descended
    into. Either element is None when no such content was found.
    """
    text_plain: Optional[str] = None
    text_html: Optional[str] = None

    for part in parts or []:
        nested_parts = part.get("parts")
        if nested_parts:
            nested_plain, nested_html = _collect_text_bodies(nested_parts)
            if text_plain is None:
                text_plain = nested_plain
            if text_html is None:
                text_html = nested_html
            continue

        content = _extract_from_part(part)
        if not content:
            continue

        mime_type = part.get("mimeType")
        if mime_type == "text/plain" and text_plain is None:
            text_plain = content
        elif mime_type == "text/html" and text_html is None:
            text_html = content

    return text_plain, text_html


def extract_body(payload: Dict[str, Any]) -> str:
    """
    Extract the best-effort message body from a Gmail payload.

    Priority:
    1. text/plain
    2. text/html (stripped to text)
    3. empty string (if nothing usable found)

    Multipart payloads are searched recursively. If no text part is found
    (or the payload is single-part), the payload's own ``body.data`` is
    decoded; it is stripped to text when the payload's MIME type is
    ``text/html``.

    Raises:
        MailIntakeParsingError: if body data cannot be decoded.
    """
    if not payload:
        return ""

    # Multipart message
    if "parts" in payload:
        text_plain, text_html = _collect_text_bodies(payload.get("parts"))
        if text_plain:
            return text_plain
        if text_html:
            return text_html

    # Single-part message
    body = payload.get("body") or {}
    data = body.get("data")
    if not data:
        return ""

    text = _decode_base64(data)
    if payload.get("mimeType") == "text/html":
        return _html_to_text(text)
    return text
|
||||||
58
mail_intake/parsers/headers.py
Normal file
58
mail_intake/parsers/headers.py
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
from typing import Dict, List, Tuple, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
    """
    Turn a Gmail-style header list into a dict keyed by lowercase name.

    Input:
        [
            {"name": "From", "value": "John Doe <john@example.com>"},
            {"name": "Subject", "value": "Re: Interview Update"},
            ...
        ]

    Output:
        {
            "from": "...",
            "subject": "...",
            ...
        }

    Entries with an empty/missing name or a ``None`` value are skipped,
    values are stripped of surrounding whitespace, and when a name
    occurs more than once the last occurrence wins. ``None`` input
    yields an empty dict.
    """
    return {
        entry["name"].lower(): entry["value"].strip()
        for entry in raw_headers or []
        if entry.get("name") and entry.get("value") is not None
    }
|
||||||
|
|
||||||
|
|
||||||
|
def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
    """
    Extract sender email and optional display name from headers.

    Recognised forms of the ``from`` header:
        Name <email@domain>
        "Quoted Name" <email@domain>
        email@domain

    The address is taken from the LAST ``<...>`` pair, so a display name
    that itself contains angle brackets and any trailing text after the
    closing ``>`` do not corrupt the address.

    Returns:
        (email, name)

    If the header is missing or empty, returns ``("", None)``. If no
    ``<...>`` pair is present, the whole stripped header is returned as
    the email. If name cannot be determined, name will be None.
    """
    from_header = headers.get("from")
    if not from_header:
        return "", None

    open_idx = from_header.rfind("<")
    close_idx = from_header.find(">", open_idx + 1) if open_idx != -1 else -1

    if open_idx != -1 and close_idx != -1:
        email = from_header[open_idx + 1:close_idx].strip()
        # Drop surrounding whitespace and the quotes of a quoted display name.
        name = from_header[:open_idx].strip().strip('"').strip() or None
        return email, name

    return from_header.strip(), None
|
||||||
33
mail_intake/parsers/subject.py
Normal file
33
mail_intake/parsers/subject.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
# Matches one leading reply/forward marker ("Re:", "FW:", "Fwd:"),
# case-insensitively, including whitespace around the colon.
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to a form suitable for thread-level comparison.

    Steps:
    - Remove leading reply/forward prefixes (Re:, Fwd:, FW:), repeatedly
    - Collapse runs of whitespace into single spaces
    - Leave the original casing untouched

    Deliberately conservative: nothing else is rewritten. A falsy
    subject yields an empty string.
    """
    if not subject:
        return ""

    remainder = subject.strip()

    # Peel stacked prefixes one at a time (e.g. "Re: Fwd: Re:").
    while (prefix := _PREFIX_RE.match(remainder)) is not None:
        remainder = remainder[prefix.end():].strip()

    return " ".join(remainder.split())
|
||||||
94
pyproject.toml
Normal file
94
pyproject.toml
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "mail-intake"
|
||||||
|
version = "0.0.1"
|
||||||
|
description = "Structured mail ingestion and correspondence parsing with provider adapters (Gmail-first)."
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.10"
|
||||||
|
license = { text = "MIT" }
|
||||||
|
|
||||||
|
authors = [
|
||||||
|
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
|
||||||
|
]
|
||||||
|
maintainers = [
|
||||||
|
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
keywords = [
|
||||||
|
"email",
|
||||||
|
"gmail",
|
||||||
|
"mail",
|
||||||
|
"ingestion",
|
||||||
|
"automation",
|
||||||
|
"job-search",
|
||||||
|
"correspondence",
|
||||||
|
]
|
||||||
|
|
||||||
|
classifiers = [
|
||||||
|
"Development Status :: 3 - Alpha",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Topic :: Communications :: Email",
|
||||||
|
"Topic :: Software Development :: Libraries",
|
||||||
|
]
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
# Gmail API stack
|
||||||
|
"google-api-python-client>=2.120.0",
|
||||||
|
"google-auth>=2.28.0",
|
||||||
|
"google-auth-oauthlib>=1.2.0",
|
||||||
|
|
||||||
|
# Parsing
|
||||||
|
"beautifulsoup4>=4.12.0",
|
||||||
|
"lxml>=5.1.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.0.0",
|
||||||
|
"pytest-cov>=4.1.0",
|
||||||
|
"ruff>=0.3.0",
|
||||||
|
"mypy>=1.8.0",
|
||||||
|
"types-beautifulsoup4",
|
||||||
|
]
|
||||||
|
|
||||||
|
docs = [
|
||||||
|
"mkdocs>=1.5.0",
|
||||||
|
"mkdocs-material>=9.5.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://git.aetoskia.com/aetos/mail-intake"
|
||||||
|
Documentation = "https://git.aetoskia.com/aetos/mail-intake#readme"
|
||||||
|
Repository = "https://git.aetoskia.com/aetos/mail-intake.git"
|
||||||
|
Issues = "https://git.aetoskia.com/aetos/mail-intake/issues"
|
||||||
|
Versions = "https://git.aetoskia.com/aetos/mail-intake/tags"
|
||||||
|
|
||||||
|
|
||||||
|
# The package lives at the repository root (mail_intake/), not under src/,
# so package discovery must start from the project root.
[tool.setuptools.packages.find]
where = ["."]
include = ["mail_intake*"]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 100
|
||||||
|
target-version = "py310"
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
python_version = "3.10"
|
||||||
|
strict = true
|
||||||
|
ignore_missing_imports = true
|
||||||
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
beautifulsoup4==4.12.0
|
||||||
|
|
||||||
|
pytest==7.4.0
|
||||||
|
pytest-asyncio==0.21.0
|
||||||
|
pytest-cov==4.1.0
|
||||||
|
|
||||||
|
types-beautifulsoup4
|
||||||
|
|
||||||
|
# Optional, useful locally
|
||||||
|
ipython
|
||||||
91
tests/unit/test_models.py
Normal file
91
tests/unit/test_models.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from mail_intake.models.message import MailIntakeMessage
|
||||||
|
from mail_intake.models.thread import MailIntakeThread
|
||||||
|
|
||||||
|
|
||||||
|
def test_message_is_immutable():
    """A frozen MailIntakeMessage must reject attribute assignment."""
    from dataclasses import FrozenInstanceError

    msg = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        # Fixed timestamp: deterministic, and avoids datetime.utcnow(),
        # which is deprecated since Python 3.12.
        timestamp=datetime(2024, 1, 1, 12, 0, 0),
        from_email="alice@example.com",
        from_name="Alice",
        subject="Hello",
        body_text="Body",
        snippet="Snippet",
        raw_headers={"from": "Alice <alice@example.com>"},
    )

    # Catch only FrozenInstanceError: a broad ``except Exception`` would
    # also swallow the AssertionError that signals the failure, making
    # the test pass unconditionally.
    try:
        msg.subject = "Changed"
    except FrozenInstanceError:
        pass
    else:
        raise AssertionError("Message should be immutable")

    assert msg.subject == "Hello"
|
||||||
|
|
||||||
|
|
||||||
|
def test_thread_add_message_updates_participants_and_timestamp():
    """Adding messages records each sender and advances last_activity_at."""
    # Fixed timestamps keep the test deterministic and avoid
    # datetime.utcnow(), which is deprecated since Python 3.12.
    t0 = datetime(2024, 1, 1, 12, 0, 0)
    t1 = t0 + timedelta(minutes=5)

    msg1 = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        timestamp=t0,
        from_email="alice@example.com",
        from_name="Alice",
        subject="Hello",
        body_text="Body",
        snippet="Snippet",
        raw_headers={},
    )

    msg2 = MailIntakeMessage(
        message_id="m2",
        thread_id="t1",
        timestamp=t1,
        from_email="bob@example.com",
        from_name="Bob",
        subject="Re: Hello",
        body_text="Reply",
        snippet="Reply",
        raw_headers={},
    )

    thread = MailIntakeThread(
        thread_id="t1",
        normalized_subject="Hello",
    )

    thread.add_message(msg1)
    assert thread.last_activity_at == t0
    assert "alice@example.com" in thread.participants

    thread.add_message(msg2)
    assert thread.last_activity_at == t1
    assert "bob@example.com" in thread.participants
    assert len(thread.messages) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_thread_handles_messages_without_sender():
    """A message with an empty sender adds no participant but still counts as activity."""
    # Fixed timestamp: deterministic, and avoids datetime.utcnow(),
    # which is deprecated since Python 3.12.
    sent_at = datetime(2024, 1, 1, 12, 0, 0)

    msg = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        timestamp=sent_at,
        from_email="",
        from_name=None,
        subject="System Message",
        body_text="Body",
        snippet="Snippet",
        raw_headers={},
    )

    thread = MailIntakeThread(
        thread_id="t1",
        normalized_subject="System Message",
    )

    thread.add_message(msg)

    assert len(thread.participants) == 0
    assert thread.last_activity_at is not None
    assert thread.last_activity_at == sent_at
|
||||||
128
tests/unit/test_parsers.py
Normal file
128
tests/unit/test_parsers.py
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
import base64
|
||||||
|
|
||||||
|
from mail_intake.parsers.subject import normalize_subject
|
||||||
|
from mail_intake.parsers.headers import parse_headers, extract_sender
|
||||||
|
from mail_intake.parsers.body import extract_body
|
||||||
|
|
||||||
|
|
||||||
|
def _b64(text: str) -> str:
|
||||||
|
return base64.b64encode(text.encode("utf-8")).decode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------
|
||||||
|
# Subject parsing
|
||||||
|
# --------------------
|
||||||
|
|
||||||
|
def test_normalize_subject_strips_common_prefixes():
    """Reply/forward prefixes are removed, including stacked ones."""
    expectations = [
        ("Re: Interview Update", "Interview Update"),
        ("Fwd: Re: Offer Letter", "Offer Letter"),
        ("FW: Re: FW: Status", "Status"),
    ]
    for raw_subject, cleaned in expectations:
        assert normalize_subject(raw_subject) == cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_subject_preserves_content_and_case():
    """A subject without prefixes comes back unchanged, casing included."""
    untouched = "Interview Update – Backend Role"
    result = normalize_subject(untouched)
    assert result == untouched
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_subject_empty_and_none_safe():
    """Both an empty string and None normalize to an empty string."""
    assert normalize_subject("") == ""
    # The test name promises None-safety; normalize_subject returns ""
    # for any falsy subject, so pin that behavior as well.
    assert normalize_subject(None) == ""
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------
|
||||||
|
# Header parsing
|
||||||
|
# --------------------
|
||||||
|
|
||||||
|
def test_parse_headers_lowercases_keys():
    """Header names become lowercase keys; values are kept."""
    parsed = parse_headers(
        [
            {"name": "From", "value": "Alice <alice@example.com>"},
            {"name": "Subject", "value": "Hello"},
        ]
    )

    assert parsed["from"] == "Alice <alice@example.com>"
    assert parsed["subject"] == "Hello"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_headers_ignores_invalid_entries():
    """Entries with a missing name or a None value are dropped."""
    valid_entry = {"name": "From", "value": "Bob <bob@example.com>"}
    nameless_entry = {"name": None, "value": "X"}
    valueless_entry = {"name": "X-Test", "value": None}

    parsed = parse_headers([valid_entry, nameless_entry, valueless_entry])

    assert "from" in parsed
    assert "x-test" not in parsed
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_sender_with_name_and_email():
    """A "Name <address>" header yields both the address and the name."""
    sender = extract_sender({"from": "Alice Smith <alice@example.com>"})
    address, display_name = sender

    assert address == "alice@example.com"
    assert display_name == "Alice Smith"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_sender_email_only():
    """A bare address yields the address and no display name."""
    sender = extract_sender({"from": "bob@example.com"})
    address, display_name = sender

    assert address == "bob@example.com"
    assert display_name is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_sender_missing_from():
    """Without a "from" header the result is an empty address and no name."""
    address, display_name = extract_sender({})

    assert address == ""
    assert display_name is None
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------
|
||||||
|
# Body parsing
|
||||||
|
# --------------------
|
||||||
|
|
||||||
|
def test_extract_body_prefers_text_plain():
    """text/plain wins over text/html even when the HTML part comes first."""
    html_part = {
        "mimeType": "text/html",
        "body": {"data": _b64("<p>Hello <b>HTML</b></p>")},
    }
    plain_part = {
        "mimeType": "text/plain",
        "body": {"data": _b64("Hello TEXT")},
    }

    extracted = extract_body({"parts": [html_part, plain_part]})

    assert extracted == "Hello TEXT"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_body_falls_back_to_html():
    """With only an HTML part, its text content is returned."""
    html_part = {
        "mimeType": "text/html",
        "body": {"data": _b64("<p>Hello <b>World</b></p>")},
    }

    extracted = extract_body({"parts": [html_part]})

    for expected_word in ("Hello", "World"):
        assert expected_word in extracted
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_body_single_part():
    """A payload without parts is decoded from its own body data."""
    single_part_payload = {"body": {"data": _b64("Single part body")}}

    extracted = extract_body(single_part_payload)

    assert extracted == "Single part body"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_body_empty_payload():
    """An empty payload produces an empty body."""
    extracted = extract_body({})
    assert extracted == ""
|
||||||
Reference in New Issue
Block a user