This commit is contained in:
2026-01-03 05:21:55 +05:30
parent 278f0a3d40
commit 412a9c7bec
22 changed files with 950 additions and 0 deletions

0
mail_intake/__init__.py Normal file
View File

View File

View File

@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Iterator, Dict, Any
class MailIntakeAdapter(ABC):
    """
    Contract that every mail-provider adapter has to satisfy.

    The surface is deliberately limited to what read-only mail
    ingestion needs. Provider-specific concepts are expected to stay
    inside the concrete subclasses and not leak past them.
    """

    @abstractmethod
    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Yield lightweight references to messages for ``query``.

        Every yielded dictionary must carry at least these keys:
        - message_id
        - thread_id

        Shape of a single yielded item:
        {
            "message_id": "...",
            "thread_id": "..."
        }
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Retrieve one complete raw message, addressed by ``message_id``.

        The return value is the provider-native message payload
        (for instance the Gmail message JSON).
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Retrieve one complete raw thread, addressed by ``thread_id``.

        The return value is the provider-native thread payload.
        """
        raise NotImplementedError

View File

@@ -0,0 +1,105 @@
from typing import Iterator, Dict, Any
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from mail_intake.adapters.base import MailIntakeAdapter
from mail_intake.exceptions import MailIntakeAdapterError
from mail_intake.auth.base import MailIntakeAuthProvider
class MailIntakeGmailAdapter(MailIntakeAdapter):
    """
    Read-only adapter backed by the Gmail API.

    By design this is the single class in which:
    - googleapiclient is imported
    - Gmail REST semantics are known
    - .execute() is called

    Keep it thin: it should do nothing beyond talking to Gmail.
    """

    def __init__(
        self,
        auth_provider: MailIntakeAuthProvider,
        user_id: str = "me",
    ):
        # The API client is created on first access to ``service``.
        self._service = None
        self._user_id = user_id
        self._auth_provider = auth_provider

    @property
    def service(self):
        """
        Gmail API client, built on first access and reused afterwards.

        Raises:
            MailIntakeAdapterError: if fetching credentials or building
                the client fails for any reason.
        """
        if self._service is not None:
            return self._service
        try:
            credentials = self._auth_provider.get_credentials()
            self._service = build("gmail", "v1", credentials=credentials)
        except Exception as exc:
            raise MailIntakeAdapterError(
                "Failed to initialize Gmail service"
            ) from exc
        return self._service

    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Yield a reference for every message that matches ``query``.

        Result pages are requested one after another until the client
        reports that there is no further page.

        Yields:
            {
                "message_id": "...",
                "thread_id": "..."
            }

        Raises:
            MailIntakeAdapterError: if the Gmail API raises HttpError.
        """
        try:
            pending = self.service.users().messages().list(
                userId=self._user_id, q=query
            )
            while True:
                if pending is None:
                    break
                page = pending.execute()
                for entry in page.get("messages", []):
                    yield {
                        "message_id": entry["id"],
                        "thread_id": entry["threadId"],
                    }
                # list_next hands back None once the last page was consumed.
                pending = self.service.users().messages().list_next(
                    pending, page
                )
        except HttpError as exc:
            raise MailIntakeAdapterError(
                "Gmail API error while listing messages"
            ) from exc

    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Return the raw Gmail payload of the message ``message_id``.

        Raises:
            MailIntakeAdapterError: if the Gmail API raises HttpError.
        """
        try:
            call = self.service.users().messages().get(
                userId=self._user_id, id=message_id
            )
            return call.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching message {message_id}"
            ) from exc

    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Return the raw Gmail payload of the thread ``thread_id``.

        Raises:
            MailIntakeAdapterError: if the Gmail API raises HttpError.
        """
        try:
            call = self.service.users().threads().get(
                userId=self._user_id, id=thread_id
            )
            return call.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching thread {thread_id}"
            ) from exc

View File

20
mail_intake/auth/base.py Normal file
View File

@@ -0,0 +1,20 @@
from abc import ABC, abstractmethod
class MailIntakeAuthProvider(ABC):
    """
    Abstract source of credentials.

    Mail adapters are written against this interface rather than
    against a concrete OAuth or credential implementation.
    """

    @abstractmethod
    def get_credentials(self):
        """
        Hand back the provider-specific credentials object.

        The call is synchronous by design. Implementations must either
        return valid credentials or raise MailIntakeAuthError.
        """
        raise NotImplementedError

View File

@@ -0,0 +1,81 @@
import os
import pickle
from typing import Sequence
import google.auth.exceptions
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from mail_intake.auth.base import MailIntakeAuthProvider
from mail_intake.exceptions import MailIntakeAuthError
class MailIntakeGoogleAuth(MailIntakeAuthProvider):
    """
    Google OAuth provider for Gmail access.

    Responsibilities:
    - Load cached credentials from disk
    - Refresh expired tokens when possible
    - Trigger interactive login only when strictly required

    This class is synchronous and intentionally state-light.
    """

    def __init__(
        self,
        credentials_path: str,
        token_path: str,
        scopes: Sequence[str],
    ):
        """
        Args:
            credentials_path: Path to the OAuth client secrets file used
                for the interactive login flow.
            token_path: Path of the pickle file in which credentials are
                cached between runs.
            scopes: OAuth scopes to request; copied into a list.
        """
        self.credentials_path = credentials_path
        self.token_path = token_path
        self.scopes = list(scopes)

    def get_credentials(self):
        """
        Return usable Google credentials.

        Order of attempts:
        1. Cached credentials from ``token_path`` if they are valid.
        2. A token refresh if the cached credentials are expired and
           carry a refresh token.
        3. The interactive local-server OAuth flow otherwise.

        Refreshed or newly obtained credentials are written back to
        ``token_path``.

        Raises:
            MailIntakeAuthError: if the client secrets file is missing,
                the OAuth flow fails, or the token file cannot be written.

        Note:
            Only ``RefreshError`` is handled during refresh; any other
            exception raised by ``creds.refresh`` propagates unchanged.
        """
        creds = None

        # Attempt to load cached credentials.
        # SECURITY NOTE: pickle.load executes code embedded in the file.
        # token_path must only ever point at a file written by this class
        # (or by a trusted user); never point it at untrusted data.
        if os.path.exists(self.token_path):
            try:
                with open(self.token_path, "rb") as fh:
                    creds = pickle.load(fh)
            except Exception:
                # A corrupt or unreadable cache is treated as "no cache".
                creds = None

        # Fast path: the cache is usable as-is, nothing to persist.
        if creds and creds.valid:
            return creds

        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
            except google.auth.exceptions.RefreshError:
                creds = None
        else:
            # Invalid credentials that cannot be refreshed (not expired, or
            # no refresh token) must not be returned: discard them so the
            # interactive login below runs instead.
            creds = None

        # Interactive login if refresh failed or creds missing/unusable
        if not creds:
            if not os.path.exists(self.credentials_path):
                raise MailIntakeAuthError(
                    f"Google credentials file not found: {self.credentials_path}"
                )
            try:
                flow = InstalledAppFlow.from_client_secrets_file(
                    self.credentials_path,
                    self.scopes,
                )
                creds = flow.run_local_server(port=0)
            except Exception as exc:
                raise MailIntakeAuthError(
                    "Failed to complete Google OAuth flow"
                ) from exc

        # Persist refreshed / new credentials
        try:
            with open(self.token_path, "wb") as fh:
                pickle.dump(creds, fh)
        except Exception as exc:
            raise MailIntakeAuthError(
                f"Failed to write token file: {self.token_path}"
            ) from exc

        return creds

20
mail_intake/config.py Normal file
View File

@@ -0,0 +1,20 @@
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class MailIntakeConfig:
    """
    Package-wide settings for mail-intake.

    The configuration is explicit and immutable on purpose: nothing is
    read implicitly from the environment and no global state is used.
    """

    # Name of the mail provider.
    provider: str = "gmail"
    # Provider-side user identifier.
    user_id: str = "me"
    # Read-only flag.
    readonly: bool = True

    # File locations that only some providers need; optional at this layer.
    credentials_path: Optional[str] = None
    token_path: Optional[str] = None

19
mail_intake/exceptions.py Normal file
View File

@@ -0,0 +1,19 @@
class MailIntakeError(Exception):
    """
    Root of the mail-intake exception hierarchy.

    Library users are expected to catch this type (or one of its
    subclasses) rather than provider-specific or third-party exceptions.
    """


class MailIntakeAuthError(MailIntakeError):
    """Raised for authentication and credential-related failures."""


class MailIntakeAdapterError(MailIntakeError):
    """Raised by mail provider adapters."""


class MailIntakeParsingError(MailIntakeError):
    """Raised when message content cannot be parsed."""

View File

View File

@@ -0,0 +1,99 @@
from datetime import datetime
from typing import Iterator, Dict, Any
from mail_intake.adapters.base import MailIntakeAdapter
from mail_intake.models.message import MailIntakeMessage
from mail_intake.models.thread import MailIntakeThread
from mail_intake.parsers.headers import parse_headers, extract_sender
from mail_intake.parsers.body import extract_body
from mail_intake.parsers.subject import normalize_subject
from mail_intake.exceptions import MailIntakeParsingError
class MailIntakeReader:
    """
    High-level, read-only entry point for mail ingestion.

    This is the object users are expected to work with. It ties together:
    - adapter calls
    - parsing
    - normalization
    - model construction

    It contains no provider-specific logic of its own.
    """

    def __init__(self, adapter: MailIntakeAdapter):
        self._adapter = adapter

    def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]:
        """
        Lazily yield one parsed message per reference the adapter
        returns for ``query``.
        """
        for ref in self._adapter.iter_message_refs(query):
            yield self._parse_message(
                self._adapter.fetch_message(ref["message_id"])
            )

    def iter_threads(self, query: str) -> Iterator[MailIntakeThread]:
        """
        Build threads out of the messages matching ``query``.

        All matching messages are fetched and grouped by thread_id
        first; the returned iterator then walks the completed threads.
        """
        grouped: Dict[str, MailIntakeThread] = {}

        for ref in self._adapter.iter_message_refs(query):
            parsed = self._parse_message(
                self._adapter.fetch_message(ref["message_id"])
            )
            key = parsed.thread_id

            # The first message seen for a thread determines its subject.
            if key not in grouped:
                grouped[key] = MailIntakeThread(
                    thread_id=key,
                    normalized_subject=normalize_subject(parsed.subject),
                )
            grouped[key].add_message(parsed)

        return iter(grouped.values())

    def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage:
        """
        Turn a raw provider message into a MailIntakeMessage.

        Raises:
            MailIntakeParsingError: if anything goes wrong while reading
                or converting the raw message.
        """
        try:
            msg_id = raw_message["id"]
            thr_id = raw_message["threadId"]

            # Gmail reports internalDate in milliseconds since the epoch;
            # a missing value falls back to 0.
            millis = int(raw_message.get("internalDate", 0))
            received_at = datetime.fromtimestamp(millis / 1000)

            payload = raw_message.get("payload", {})
            headers = parse_headers(payload.get("headers", []))
            sender_email, sender_name = extract_sender(headers)

            fields = {
                "message_id": msg_id,
                "thread_id": thr_id,
                "timestamp": received_at,
                "from_email": sender_email,
                "from_name": sender_name,
                "subject": headers.get("subject", ""),
                "body_text": extract_body(payload),
                "snippet": raw_message.get("snippet", ""),
                "raw_headers": headers,
            }
            return MailIntakeMessage(**fields)

        except Exception as exc:
            raise MailIntakeParsingError(
                f"Failed to parse message {raw_message.get('id')}"
            ) from exc

View File

View File

@@ -0,0 +1,26 @@
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Dict
@dataclass(frozen=True)
class MailIntakeMessage:
    """
    Provider-agnostic, canonical form of one email message.

    The model is meant to be safe to persist; Gmail-specific fields
    do not belong here.
    """

    # Identity
    message_id: str
    thread_id: str

    # When / who
    timestamp: datetime
    from_email: str
    from_name: Optional[str]

    # Content
    subject: str
    body_text: str
    snippet: str

    # Header name -> header value
    raw_headers: Dict[str, str]

View File

@@ -0,0 +1,35 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Set
from mail_intake.models.message import MailIntakeMessage
@dataclass
class MailIntakeThread:
    """
    Provider-agnostic, canonical form of an email thread.

    Correspondence workflows (job applications, interviews,
    follow-ups, etc.) reason primarily at the thread level.
    """

    thread_id: str
    normalized_subject: str

    # Derived state, maintained by add_message().
    participants: Set[str] = field(default_factory=set)
    messages: List[MailIntakeMessage] = field(default_factory=list)
    last_activity_at: datetime | None = None

    def add_message(self, message: MailIntakeMessage) -> None:
        """
        Append ``message`` and refresh the fields derived from it:
        the participant set and the latest-activity timestamp.
        """
        self.messages.append(message)

        sender = message.from_email
        if sender:
            self.participants.add(sender)

        latest = self.last_activity_at
        if latest is None or message.timestamp > latest:
            self.last_activity_at = message.timestamp

View File

View File

@@ -0,0 +1,83 @@
import base64
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from mail_intake.exceptions import MailIntakeParsingError
def _decode_base64(data: str) -> str:
"""
Decode Gmail URL-safe base64 payload into UTF-8 text.
"""
try:
padded = data.replace("-", "+").replace("_", "/")
decoded = base64.b64decode(padded)
return decoded.decode("utf-8", errors="replace")
except Exception as exc:
raise MailIntakeParsingError("Failed to decode message body") from exc
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
"""
Extract text content from a single MIME part.
"""
mime_type = part.get("mimeType")
body = part.get("body", {})
data = body.get("data")
if not data:
return None
text = _decode_base64(data)
if mime_type == "text/plain":
return text
if mime_type == "text/html":
# soup = BeautifulSoup(text, "lxml")
soup = BeautifulSoup(text, "html.parser")
return soup.get_text(separator="\n", strip=True)
return None
def _find_text_bodies(payload: Dict[str, Any]) -> tuple:
    """
    Walk ``payload["parts"]`` depth-first and return a
    ``(text_plain, text_html)`` pair holding the first non-empty body
    of each kind (``None`` where none was found).
    """
    text_plain: Optional[str] = None
    text_html: Optional[str] = None

    for part in payload.get("parts") or []:
        if part.get("parts"):
            # Container part (e.g. multipart/alternative nested inside
            # multipart/mixed): the text lives one level further down.
            nested_plain, nested_html = _find_text_bodies(part)
            text_plain = text_plain or nested_plain
            text_html = text_html or nested_html
            continue

        content = _extract_from_part(part)
        if not content:
            continue

        mime_type = part.get("mimeType")
        if mime_type == "text/plain" and text_plain is None:
            text_plain = content
        elif mime_type == "text/html" and text_html is None:
            text_html = content

    return text_plain, text_html


def extract_body(payload: Dict[str, Any]) -> str:
    """
    Extract the best-effort message body from a Gmail payload.

    Priority:
    1. text/plain
    2. text/html (stripped to text)
    3. empty string (if nothing usable found)

    Multipart payloads are searched recursively, so text nested inside
    container parts is found as well.

    Raises:
        MailIntakeParsingError: if body data cannot be decoded.
    """
    if not payload:
        return ""

    # Multipart message
    if "parts" in payload:
        text_plain, text_html = _find_text_bodies(payload)
        if text_plain:
            return text_plain
        if text_html:
            return text_html

    # Single-part message (or a multipart without usable text parts).
    # HTML goes through the same tag stripping as HTML parts do, so the
    # documented "stripped to text" priority holds here too.
    if payload.get("mimeType") == "text/html":
        return _extract_from_part(payload) or ""

    data = (payload.get("body") or {}).get("data")
    if data:
        return _decode_base64(data)

    return ""

View File

@@ -0,0 +1,58 @@
from typing import Dict, List, Tuple, Optional
def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
    """
    Flatten Gmail-style header entries into a single dictionary.

    Header names are lower-cased and values are stripped of surrounding
    whitespace. Entries without a name, or whose value is ``None``, are
    skipped. When a name occurs more than once the last entry wins.

    Input:
        [
            {"name": "From", "value": "John Doe <john@example.com>"},
            {"name": "Subject", "value": "Re: Interview Update"},
            ...
        ]

    Output:
        {
            "from": "...",
            "subject": "...",
            ...
        }
    """
    normalized: Dict[str, str] = {}

    for entry in raw_headers or []:
        key, val = entry.get("name"), entry.get("value")
        if key and val is not None:
            normalized[key.lower()] = val.strip()

    return normalized
def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
    """
    Extract sender email and optional display name from headers.

    Recognised forms of the ``from`` header:
        Name <email@domain>
        "Name" <email@domain>
        email@domain

    Returns:
        (email, name)

    If name cannot be determined, name will be None. A missing or
    empty ``from`` header yields ``("", None)``.
    """
    from_header = headers.get("from")
    if not from_header:
        return "", None

    # Anchor on the LAST "<" and the first ">" after it. This keeps a "<"
    # inside the display name, and any text trailing the closing ">",
    # out of the extracted address.
    open_idx = from_header.rfind("<")
    close_idx = from_header.find(">", open_idx + 1) if open_idx != -1 else -1

    if close_idx != -1:
        email = from_header[open_idx + 1:close_idx].strip()
        name = from_header[:open_idx].strip().strip('"') or None
        return email, name

    # No angle-bracket pair: treat the whole header as the address.
    return from_header.strip(), None

View File

@@ -0,0 +1,33 @@
import re
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to a form suited to thread-level comparison.

    Steps:
    - Remove leading reply/forward markers (Re:, Fwd:, FW:), however
      many are stacked
    - Collapse runs of whitespace into single spaces
    - Leave the casing untouched (no lowercasing)

    The function is conservative on purpose. An empty or ``None``
    subject yields an empty string.
    """
    if not subject:
        return ""

    remaining = subject.strip()

    # Peel off one leading marker per pass (handles "Re: Fwd: Re:").
    while (prefix := _PREFIX_RE.match(remaining)) is not None:
        remaining = remaining[prefix.end():].strip()

    # Collapse internal whitespace.
    return " ".join(remaining.split())