This commit is contained in:
2026-01-03 05:21:55 +05:30
parent 278f0a3d40
commit 412a9c7bec
22 changed files with 950 additions and 0 deletions

0
mail_intake/__init__.py Normal file
View File

View File

View File

@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Iterator, Dict, Any
class MailIntakeAdapter(ABC):
    """
    Contract every mail-provider adapter has to satisfy.

    Only the operations needed for read-only ingestion are declared
    here. Anything provider-specific must stay inside the concrete
    subclass and never surface through this interface.
    """

    @abstractmethod
    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Yield lightweight references for messages matching ``query``.

        Each yielded dictionary must contain at least the keys
        ``message_id`` and ``thread_id``, for example::

            {"message_id": "...", "thread_id": "..."}
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Retrieve the complete raw message identified by ``message_id``.

        The return value is the provider-native payload
        (for Gmail, the message JSON).
        """
        raise NotImplementedError

    @abstractmethod
    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Retrieve the complete raw thread identified by ``thread_id``.

        The return value is the provider-native thread payload.
        """
        raise NotImplementedError

View File

@@ -0,0 +1,105 @@
from typing import Iterator, Dict, Any
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from mail_intake.adapters.base import MailIntakeAdapter
from mail_intake.exceptions import MailIntakeAdapterError
from mail_intake.auth.base import MailIntakeAuthProvider
class MailIntakeGmailAdapter(MailIntakeAdapter):
    """
    Read-only adapter backed by the Gmail API.

    By design this is the single location that:
    - imports googleapiclient
    - knows Gmail REST semantics
    - calls .execute()

    Keep it thin; parsing and normalization belong elsewhere.
    """

    def __init__(
        self,
        auth_provider: MailIntakeAuthProvider,
        user_id: str = "me",
    ):
        self._auth_provider = auth_provider
        self._user_id = user_id
        # Built lazily on first access of ``service``.
        self._service = None

    @property
    def service(self):
        """
        Return the Gmail API client, building and caching it on first use.

        Any failure while obtaining credentials or building the client
        is re-raised as MailIntakeAdapterError.
        """
        if self._service is not None:
            return self._service
        try:
            credentials = self._auth_provider.get_credentials()
            self._service = build("gmail", "v1", credentials=credentials)
        except Exception as exc:
            raise MailIntakeAdapterError(
                "Failed to initialize Gmail service"
            ) from exc
        return self._service

    def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
        """
        Yield a reference for every message matching ``query``.

        Follows result pages via ``list_next`` until it returns None.
        Each yielded item has the form::

            {"message_id": "...", "thread_id": "..."}

        An HttpError is re-raised as MailIntakeAdapterError.
        """
        try:
            messages_resource = self.service.users().messages()
            page_request = messages_resource.list(userId=self._user_id, q=query)
            while page_request is not None:
                page = page_request.execute()
                for entry in page.get("messages", []):
                    reference = {
                        "message_id": entry["id"],
                        "thread_id": entry["threadId"],
                    }
                    yield reference
                messages_resource = self.service.users().messages()
                page_request = messages_resource.list_next(page_request, page)
        except HttpError as exc:
            raise MailIntakeAdapterError(
                "Gmail API error while listing messages"
            ) from exc

    def fetch_message(self, message_id: str) -> Dict[str, Any]:
        """
        Return the raw Gmail message payload for ``message_id``.

        An HttpError is re-raised as MailIntakeAdapterError.
        """
        try:
            messages_resource = self.service.users().messages()
            request = messages_resource.get(userId=self._user_id, id=message_id)
            return request.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching message {message_id}"
            ) from exc

    def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
        """
        Return the raw Gmail thread payload for ``thread_id``.

        An HttpError is re-raised as MailIntakeAdapterError.
        """
        try:
            threads_resource = self.service.users().threads()
            request = threads_resource.get(userId=self._user_id, id=thread_id)
            return request.execute()
        except HttpError as exc:
            raise MailIntakeAdapterError(
                f"Gmail API error while fetching thread {thread_id}"
            ) from exc

View File

20
mail_intake/auth/base.py Normal file
View File

@@ -0,0 +1,20 @@
from abc import ABC, abstractmethod
class MailIntakeAuthProvider(ABC):
    """
    Interface for objects that supply credentials to mail adapters.

    Adapters are written against this abstraction so that they never
    depend on a concrete OAuth or credential implementation.
    """

    @abstractmethod
    def get_credentials(self):
        """
        Produce the provider-specific credentials object.

        Implementations are synchronous by design: they either hand
        back valid credentials or raise MailIntakeAuthError.
        """
        raise NotImplementedError

View File

@@ -0,0 +1,81 @@
import os
import pickle
from typing import Sequence
import google.auth.exceptions
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from mail_intake.auth.base import MailIntakeAuthProvider
from mail_intake.exceptions import MailIntakeAuthError
class MailIntakeGoogleAuth(MailIntakeAuthProvider):
    """
    Google OAuth provider for Gmail access.

    Responsibilities:
    - Load cached credentials from disk
    - Refresh expired tokens when possible
    - Trigger interactive login only when strictly required

    This class is synchronous and intentionally state-light.
    """

    def __init__(
        self,
        credentials_path: str,
        token_path: str,
        scopes: Sequence[str],
    ):
        """
        Args:
            credentials_path: Path to the OAuth client-secrets file.
            token_path: Path of the pickle file used to cache tokens.
            scopes: OAuth scopes requested during interactive login.
        """
        self.credentials_path = credentials_path
        self.token_path = token_path
        # Copy so later mutation of the caller's sequence has no effect here.
        self.scopes = list(scopes)

    def get_credentials(self):
        """
        Return usable Google credentials, refreshing or logging in if needed.

        Order of attempts:
        1. Cached credentials from ``token_path`` (returned as-is if valid).
        2. Token refresh, when the cached credentials are expired and
           carry a refresh token.
        3. Interactive OAuth login via a local server.

        Refreshed or newly obtained credentials are written back to
        ``token_path``.

        Raises:
            MailIntakeAuthError: if the client-secrets file is missing,
                the OAuth flow fails, or the token file cannot be written.
        """
        creds = None

        # Attempt to load cached credentials.
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # token_path must only ever point at a file this process wrote.
        if os.path.exists(self.token_path):
            try:
                with open(self.token_path, "rb") as fh:
                    creds = pickle.load(fh)
            except Exception:
                # An unreadable/corrupt cache is treated as "no cache".
                creds = None

        if creds and creds.valid:
            return creds

        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
            except google.auth.exceptions.RefreshError:
                creds = None
        else:
            # Cached credentials that are invalid but cannot be refreshed
            # (no refresh token, or not marked expired) must not be handed
            # back to the caller; discard them and force a fresh login.
            creds = None

        # Interactive login if refresh failed or creds missing
        if not creds:
            if not os.path.exists(self.credentials_path):
                raise MailIntakeAuthError(
                    f"Google credentials file not found: {self.credentials_path}"
                )
            try:
                flow = InstalledAppFlow.from_client_secrets_file(
                    self.credentials_path,
                    self.scopes,
                )
                creds = flow.run_local_server(port=0)
            except Exception as exc:
                raise MailIntakeAuthError(
                    "Failed to complete Google OAuth flow"
                ) from exc

        # Persist refreshed / new credentials
        try:
            with open(self.token_path, "wb") as fh:
                pickle.dump(creds, fh)
        except Exception as exc:
            raise MailIntakeAuthError(
                f"Failed to write token file: {self.token_path}"
            ) from exc

        return creds

20
mail_intake/config.py Normal file
View File

@@ -0,0 +1,20 @@
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class MailIntakeConfig:
    """
    Immutable, explicit settings object for mail-intake.

    Every value is passed in by the caller; nothing is read from the
    environment and no global state is consulted.
    """

    # Mail provider identifier.
    provider: str = "gmail"
    # Account identifier handed to the provider.
    user_id: str = "me"
    # Read-only flag; defaults to True.
    readonly: bool = True

    # Provider-specific file locations; optional at this layer.
    credentials_path: Optional[str] = None
    token_path: Optional[str] = None

19
mail_intake/exceptions.py Normal file
View File

@@ -0,0 +1,19 @@
class MailIntakeError(Exception):
    """
    Root of the mail-intake exception hierarchy.

    Library users should catch this class (or one of its subclasses)
    rather than provider-specific or third-party exceptions.
    """


class MailIntakeAuthError(MailIntakeError):
    """Raised for authentication and credential problems."""


class MailIntakeAdapterError(MailIntakeError):
    """Raised by mail provider adapters when a provider call fails."""


class MailIntakeParsingError(MailIntakeError):
    """Raised when message content cannot be parsed."""

View File

View File

@@ -0,0 +1,99 @@
from datetime import datetime
from typing import Iterator, Dict, Any
from mail_intake.adapters.base import MailIntakeAdapter
from mail_intake.models.message import MailIntakeMessage
from mail_intake.models.thread import MailIntakeThread
from mail_intake.parsers.headers import parse_headers, extract_sender
from mail_intake.parsers.body import extract_body
from mail_intake.parsers.subject import normalize_subject
from mail_intake.exceptions import MailIntakeParsingError
class MailIntakeReader:
    """
    Primary, read-only entry point for consumers of the library.

    The reader coordinates:
    - adapter calls
    - parsing
    - normalization
    - model construction

    It contains no provider-specific logic of its own.
    """

    def __init__(self, adapter: MailIntakeAdapter):
        self._adapter = adapter

    def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]:
        """
        Lazily yield one parsed message per reference matching ``query``.
        """
        for reference in self._adapter.iter_message_refs(query):
            payload = self._adapter.fetch_message(reference["message_id"])
            yield self._parse_message(payload)

    def iter_threads(self, query: str) -> Iterator[MailIntakeThread]:
        """
        Return an iterator over threads built from messages matching ``query``.

        All matching messages are fetched and grouped by thread_id
        before the iterator is returned, so each thread is complete.
        """
        grouped: Dict[str, MailIntakeThread] = {}

        for reference in self._adapter.iter_message_refs(query):
            payload = self._adapter.fetch_message(reference["message_id"])
            parsed = self._parse_message(payload)

            existing = grouped.get(parsed.thread_id)
            if existing is None:
                # First message seen for this thread: it seeds the subject.
                existing = MailIntakeThread(
                    thread_id=parsed.thread_id,
                    normalized_subject=normalize_subject(parsed.subject),
                )
                grouped[parsed.thread_id] = existing
            existing.add_message(parsed)

        return iter(grouped.values())

    def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage:
        """
        Convert a raw provider message into a MailIntakeMessage.

        Any exception raised while parsing is re-raised as
        MailIntakeParsingError.
        """
        try:
            msg_id = raw_message["id"]
            thr_id = raw_message["threadId"]

            # Gmail internalDate is milliseconds since epoch
            epoch_ms = int(raw_message.get("internalDate", 0))
            received_at = datetime.fromtimestamp(epoch_ms / 1000)

            body_payload = raw_message.get("payload", {})
            header_map = parse_headers(body_payload.get("headers", []))
            sender_email, sender_name = extract_sender(header_map)

            fields = {
                "message_id": msg_id,
                "thread_id": thr_id,
                "timestamp": received_at,
                "from_email": sender_email,
                "from_name": sender_name,
                "subject": header_map.get("subject", ""),
                "body_text": extract_body(body_payload),
                "snippet": raw_message.get("snippet", ""),
                "raw_headers": header_map,
            }
            return MailIntakeMessage(**fields)
        except Exception as exc:
            raise MailIntakeParsingError(
                f"Failed to parse message {raw_message.get('id')}"
            ) from exc

View File

View File

@@ -0,0 +1,26 @@
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Dict
@dataclass(frozen=True)
class MailIntakeMessage:
    """
    Provider-agnostic, immutable model of one email message.

    This is the canonical internal representation and is safe to
    persist. Gmail-specific fields must not be added here.
    """

    # Identity
    message_id: str
    thread_id: str

    # When the message was received.
    timestamp: datetime

    # Sender: address plus optional display name.
    from_email: str
    from_name: Optional[str]

    # Content
    subject: str
    body_text: str
    snippet: str

    # Header name -> value mapping.
    raw_headers: Dict[str, str]

View File

@@ -0,0 +1,35 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Set
from mail_intake.models.message import MailIntakeMessage
@dataclass
class MailIntakeThread:
    """
    Provider-agnostic model of an email thread.

    Correspondence workflows (job applications, interviews,
    follow-ups, etc.) reason primarily at the thread level.
    """

    thread_id: str
    normalized_subject: str

    # Derived state, maintained by add_message().
    participants: Set[str] = field(default_factory=set)
    messages: List[MailIntakeMessage] = field(default_factory=list)
    last_activity_at: datetime | None = None

    def add_message(self, message: MailIntakeMessage) -> None:
        """
        Append ``message`` and refresh participants / last activity.
        """
        self.messages.append(message)

        sender = message.from_email
        if sender:
            self.participants.add(sender)

        stamp = message.timestamp
        previous = self.last_activity_at
        is_newer = previous is None or stamp > previous
        if is_newer:
            self.last_activity_at = stamp

View File

View File

@@ -0,0 +1,83 @@
import base64
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from mail_intake.exceptions import MailIntakeParsingError
def _decode_base64(data: str) -> str:
"""
Decode Gmail URL-safe base64 payload into UTF-8 text.
"""
try:
padded = data.replace("-", "+").replace("_", "/")
decoded = base64.b64decode(padded)
return decoded.decode("utf-8", errors="replace")
except Exception as exc:
raise MailIntakeParsingError("Failed to decode message body") from exc
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
"""
Extract text content from a single MIME part.
"""
mime_type = part.get("mimeType")
body = part.get("body", {})
data = body.get("data")
if not data:
return None
text = _decode_base64(data)
if mime_type == "text/plain":
return text
if mime_type == "text/html":
# soup = BeautifulSoup(text, "lxml")
soup = BeautifulSoup(text, "html.parser")
return soup.get_text(separator="\n", strip=True)
return None
def _collect_text_bodies(parts):
    """Depth-first search of MIME parts; return (first text/plain, first text/html)."""
    text_plain = None
    text_html = None
    for part in parts or []:
        nested = part.get("parts")
        if nested:
            # Container part (e.g. multipart/alternative inside
            # multipart/mixed): the text lives one level further down.
            nested_plain, nested_html = _collect_text_bodies(nested)
            if text_plain is None:
                text_plain = nested_plain
            if text_html is None:
                text_html = nested_html
        else:
            content = _extract_from_part(part)
            if content:
                mime_type = part.get("mimeType")
                if mime_type == "text/plain" and text_plain is None:
                    text_plain = content
                elif mime_type == "text/html" and text_html is None:
                    text_html = content
        if text_plain is not None:
            # text/plain has top priority; nothing later can beat it.
            break
    return text_plain, text_html


def extract_body(payload: Dict[str, Any]) -> str:
    """
    Extract the best-effort message body from a Gmail payload.

    Priority:
    1. text/plain
    2. text/html (stripped to text)
    3. empty string (if nothing usable found)

    Multipart payloads are searched recursively, so text nested inside
    container parts is found as well. A single-part ``text/html``
    payload is stripped to text like any HTML part.

    Raises:
        MailIntakeParsingError: if body data cannot be decoded.
    """
    if not payload:
        return ""

    # Multipart message
    if "parts" in payload:
        text_plain, text_html = _collect_text_bodies(payload.get("parts"))
        if text_plain:
            return text_plain
        if text_html:
            return text_html

    # Single-part message
    if payload.get("mimeType") == "text/html":
        return _extract_from_part(payload) or ""

    body = payload.get("body") or {}
    data = body.get("data")
    if data:
        return _decode_base64(data)
    return ""

View File

@@ -0,0 +1,58 @@
from typing import Dict, List, Tuple, Optional
def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
    """
    Turn a list of Gmail-style header entries into a flat dict.

    Input:
        [
            {"name": "From", "value": "John Doe <john@example.com>"},
            {"name": "Subject", "value": "Re: Interview Update"},
            ...
        ]

    Output:
        {
            "from": "...",
            "subject": "...",
            ...
        }

    Keys are lower-cased and values are stripped of surrounding
    whitespace. Entries with a missing name or a None value are
    skipped; when a name repeats, the last value wins.
    """
    normalized: Dict[str, str] = {}
    if not raw_headers:
        return normalized

    for entry in raw_headers:
        key = entry.get("name")
        val = entry.get("value")
        if key and val is not None:
            normalized[key.lower()] = val.strip()
    return normalized
def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
    """
    Extract sender email and optional display name from headers.

    Args:
        headers: Normalized headers with lower-cased keys; only the
            ``"from"`` entry is read.

    Returns:
        (email, name)

        If name cannot be determined, name will be None. If there is
        no From header, email is the empty string.
    """
    from_header = headers.get("from")
    if not from_header:
        return "", None

    # Common forms:
    #   Name <email@domain>
    #   email@domain
    start = from_header.find("<")
    end = from_header.find(">", start + 1)
    if start != -1 and end != -1:
        # Take only what sits between the brackets, so trailing text
        # after '>' (e.g. a "(comment)") never leaks into the address.
        email = from_header[start + 1:end].strip()
        name = from_header[:start].strip().strip('"') or None
        return email, name

    return from_header.strip(), None

View File

@@ -0,0 +1,33 @@
import re
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)


def normalize_subject(subject: str) -> str:
    """
    Produce a canonical form of an email subject for thread comparison.

    Steps:
    - Remove leading reply/forward markers (Re:, Fwd:, FW:), however
      many are stacked
    - Collapse runs of whitespace into single spaces
    - Leave the casing untouched

    The function is deliberately conservative.
    """
    if not subject:
        return ""

    remainder = subject.strip()

    # Peel off one prefix per pass (handles "Re: Fwd: Re: ...").
    while (prefix := _PREFIX_RE.match(remainder)) is not None:
        remainder = remainder[prefix.end():].strip()

    # Collapse internal whitespace.
    words = remainder.split()
    return " ".join(words)

94
pyproject.toml Normal file
View File

@@ -0,0 +1,94 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "mail-intake"
version = "0.0.1"
description = "Structured mail ingestion and correspondence parsing with provider adapters (Gmail-first)."
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
authors = [
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
]
maintainers = [
{ name = "Aetos Skia", email = "dev@aetoskia.com" }
]
keywords = [
"email",
"gmail",
"mail",
"ingestion",
"automation",
"job-search",
"correspondence",
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Communications :: Email",
"Topic :: Software Development :: Libraries",
]
dependencies = [
# Gmail API stack
"google-api-python-client>=2.120.0",
"google-auth>=2.28.0",
"google-auth-oauthlib>=1.2.0",
# Parsing
"beautifulsoup4>=4.12.0",
"lxml>=5.1.0",
]
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"pytest-cov>=4.1.0",
"ruff>=0.3.0",
"mypy>=1.8.0",
"types-beautifulsoup4",
]
docs = [
"mkdocs>=1.5.0",
"mkdocs-material>=9.5.0",
]
[project.urls]
Homepage = "https://git.aetoskia.com/aetos/mail-intake"
Documentation = "https://git.aetoskia.com/aetos/mail-intake#readme"
Repository = "https://git.aetoskia.com/aetos/mail-intake.git"
Issues = "https://git.aetoskia.com/aetos/mail-intake/issues"
Versions = "https://git.aetoskia.com/aetos/mail-intake/tags"
# The mail_intake package sits at the repository root (this commit adds
# mail_intake/__init__.py, not src/mail_intake/__init__.py), so package
# discovery has to start from the project root rather than a src/ directory.
[tool.setuptools.packages.find]
where = ["."]
include = ["mail_intake*"]
[tool.ruff]
line-length = 100
target-version = "py310"
[tool.mypy]
python_version = "3.10"
strict = true
ignore_missing_imports = true

10
requirements.txt Normal file
View File

@@ -0,0 +1,10 @@
# NOTE(review): the Google API packages listed under pyproject.toml's
# [project].dependencies are not pinned here — confirm whether this file is
# meant to be installable on its own.
beautifulsoup4==4.12.0
# NOTE(review): pytest is pinned to 7.4.0 here, while pyproject.toml's "dev"
# extra requires pytest>=8.0.0 — confirm which one is intended.
pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0
types-beautifulsoup4
# Optional, useful locally
ipython

91
tests/unit/test_models.py Normal file
View File

@@ -0,0 +1,91 @@
from datetime import datetime, timedelta
from mail_intake.models.message import MailIntakeMessage
from mail_intake.models.thread import MailIntakeThread
def test_message_is_immutable():
    """Assigning to a field of a frozen MailIntakeMessage must fail."""
    from dataclasses import FrozenInstanceError

    msg = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        # Fixed timestamp keeps the test deterministic.
        timestamp=datetime(2026, 1, 1, 12, 0, 0),
        from_email="alice@example.com",
        from_name="Alice",
        subject="Hello",
        body_text="Body",
        snippet="Snippet",
        raw_headers={"from": "Alice <alice@example.com>"},
    )

    # Catch only the frozen-dataclass error. A broad ``except Exception``
    # would also swallow the AssertionError used to signal failure, making
    # the test pass even if the model were mutable.
    try:
        msg.subject = "Changed"
    except FrozenInstanceError:
        return
    raise AssertionError("Message should be immutable")
def test_thread_add_message_updates_participants_and_timestamp():
    """Adding messages grows participants and advances last_activity_at."""
    earlier = datetime.utcnow()
    later = earlier + timedelta(minutes=5)

    first = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        timestamp=earlier,
        from_email="alice@example.com",
        from_name="Alice",
        subject="Hello",
        body_text="Body",
        snippet="Snippet",
        raw_headers={},
    )
    reply = MailIntakeMessage(
        message_id="m2",
        thread_id="t1",
        timestamp=later,
        from_email="bob@example.com",
        from_name="Bob",
        subject="Re: Hello",
        body_text="Reply",
        snippet="Reply",
        raw_headers={},
    )
    conversation = MailIntakeThread(thread_id="t1", normalized_subject="Hello")

    # After the first message only Alice is known.
    conversation.add_message(first)
    assert conversation.last_activity_at == earlier
    assert "alice@example.com" in conversation.participants

    # The later reply moves the activity marker and adds Bob.
    conversation.add_message(reply)
    assert conversation.last_activity_at == later
    assert "bob@example.com" in conversation.participants
    assert len(conversation.messages) == 2
def test_thread_handles_messages_without_sender():
    """A message with an empty sender adds no participant but still counts as activity."""
    senderless = MailIntakeMessage(
        message_id="m1",
        thread_id="t1",
        timestamp=datetime.utcnow(),
        from_email="",
        from_name=None,
        subject="System Message",
        body_text="Body",
        snippet="Snippet",
        raw_headers={},
    )
    conversation = MailIntakeThread(
        thread_id="t1",
        normalized_subject="System Message",
    )

    conversation.add_message(senderless)

    assert len(conversation.participants) == 0
    assert conversation.last_activity_at is not None

128
tests/unit/test_parsers.py Normal file
View File

@@ -0,0 +1,128 @@
import base64
from mail_intake.parsers.subject import normalize_subject
from mail_intake.parsers.headers import parse_headers, extract_sender
from mail_intake.parsers.body import extract_body
def _b64(text: str) -> str:
return base64.b64encode(text.encode("utf-8")).decode("utf-8")
# --------------------
# Subject parsing
# --------------------
def test_normalize_subject_strips_common_prefixes():
    """Single and stacked Re:/Fwd:/FW: prefixes are removed."""
    expectations = [
        ("Re: Interview Update", "Interview Update"),
        ("Fwd: Re: Offer Letter", "Offer Letter"),
        ("FW: Re: FW: Status", "Status"),
    ]
    for raw, cleaned in expectations:
        assert normalize_subject(raw) == cleaned
def test_normalize_subject_preserves_content_and_case():
    """A subject without prefixes comes back unchanged, casing included."""
    untouched = "Interview Update Backend Role"
    result = normalize_subject(untouched)
    assert result == untouched
def test_normalize_subject_empty_and_none_safe():
    """Both an empty string and None normalize to the empty string."""
    assert normalize_subject("") == ""
    # The test name promises None-safety, so exercise it explicitly.
    assert normalize_subject(None) == ""
# --------------------
# Header parsing
# --------------------
def test_parse_headers_lowercases_keys():
    """Header names are exposed under their lower-cased form."""
    parsed = parse_headers(
        [
            {"name": "From", "value": "Alice <alice@example.com>"},
            {"name": "Subject", "value": "Hello"},
        ]
    )

    assert parsed["from"] == "Alice <alice@example.com>"
    assert parsed["subject"] == "Hello"
def test_parse_headers_ignores_invalid_entries():
    """Entries lacking a name or carrying a None value are dropped."""
    valid = {"name": "From", "value": "Bob <bob@example.com>"}
    nameless = {"name": None, "value": "X"}
    valueless = {"name": "X-Test", "value": None}

    parsed = parse_headers([valid, nameless, valueless])

    assert "from" in parsed
    assert "x-test" not in parsed
def test_extract_sender_with_name_and_email():
    """'Name <address>' splits into address and display name."""
    address, display = extract_sender(
        {"from": "Alice Smith <alice@example.com>"}
    )

    assert address == "alice@example.com"
    assert display == "Alice Smith"
def test_extract_sender_email_only():
    """A bare address yields that address and no display name."""
    address, display = extract_sender({"from": "bob@example.com"})

    assert address == "bob@example.com"
    assert display is None
def test_extract_sender_missing_from():
    """Without a From header the result is an empty address and no name."""
    no_headers = {}
    address, display = extract_sender(no_headers)

    assert address == ""
    assert display is None
# --------------------
# Body parsing
# --------------------
def test_extract_body_prefers_text_plain():
    """text/plain wins over text/html even when the HTML part comes first."""
    html_part = {
        "mimeType": "text/html",
        "body": {"data": _b64("<p>Hello <b>HTML</b></p>")},
    }
    plain_part = {
        "mimeType": "text/plain",
        "body": {"data": _b64("Hello TEXT")},
    }

    extracted = extract_body({"parts": [html_part, plain_part]})

    assert extracted == "Hello TEXT"
def test_extract_body_falls_back_to_html():
    """With only an HTML part, its text content is returned."""
    html_only = {
        "mimeType": "text/html",
        "body": {"data": _b64("<p>Hello <b>World</b></p>")},
    }

    extracted = extract_body({"parts": [html_only]})

    assert "Hello" in extracted
    assert "World" in extracted
def test_extract_body_single_part():
    """A payload without parts is decoded straight from its body data."""
    encoded = _b64("Single part body")
    extracted = extract_body({"body": {"data": encoded}})

    assert extracted == "Single part body"
def test_extract_body_empty_payload():
    """An empty payload produces an empty body."""
    empty_payload = {}
    assert extract_body(empty_payload) == ""