docs(mail_intake): add comprehensive docstrings across ingestion, adapters, auth, and parsing layers
- docs(mail_intake/__init__.py): document module-based public API and usage patterns
- docs(mail_intake/ingestion/reader.py): document high-level ingestion orchestration
- docs(mail_intake/adapters/base.py): document adapter contract for mail providers
- docs(mail_intake/adapters/gmail.py): document Gmail adapter implementation and constraints
- docs(mail_intake/auth/base.py): document authentication provider contract
- docs(mail_intake/auth/google.py): document Google OAuth authentication provider
- docs(mail_intake/models/message.py): document canonical email message model
- docs(mail_intake/models/thread.py): document canonical email thread model
- docs(mail_intake/parsers/body.py): document message body extraction logic
- docs(mail_intake/parsers/headers.py): document message header normalization utilities
- docs(mail_intake/parsers/subject.py): document subject normalization utilities
- docs(mail_intake/config.py): document global configuration model
- docs(mail_intake/exceptions.py): document library exception hierarchy
This commit is contained in:
@@ -0,0 +1,126 @@
|
|||||||
|
"""
|
||||||
|
Mail Intake — provider-agnostic, read-only email ingestion framework.
|
||||||
|
|
||||||
|
Mail Intake is a **contract-first library** designed to ingest, parse, and
|
||||||
|
normalize email data from external providers (such as Gmail) into clean,
|
||||||
|
provider-agnostic domain models.
|
||||||
|
|
||||||
|
The library is intentionally structured around clear layers, each exposed
|
||||||
|
as a first-class module at the package root:
|
||||||
|
|
||||||
|
- adapters: provider-specific access (e.g. Gmail)
|
||||||
|
- auth: authentication providers and credential management
|
||||||
|
- parsers: extraction and normalization of message content
|
||||||
|
- ingestion: orchestration and high-level ingestion workflows
|
||||||
|
- models: canonical, provider-agnostic data representations
|
||||||
|
- config: explicit global configuration
|
||||||
|
- exceptions: library-defined error hierarchy
|
||||||
|
|
||||||
|
The package root acts as a **namespace**, not a facade. Consumers are
|
||||||
|
expected to import functionality explicitly from the appropriate module.
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
Installation
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
||||||
|
Install using pip:
|
||||||
|
|
||||||
|
pip install mail-intake
|
||||||
|
|
||||||
|
Or with Poetry:
|
||||||
|
|
||||||
|
poetry add mail-intake
|
||||||
|
|
||||||
|
Mail Intake is pure Python and has no runtime dependencies beyond those
|
||||||
|
required by the selected provider (for example, Google APIs for Gmail).
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
Basic Usage
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
||||||
|
Minimal Gmail ingestion example:
|
||||||
|
|
||||||
|
from mail_intake.ingestion import MailIntakeReader
|
||||||
|
from mail_intake.adapters import MailIntakeGmailAdapter
|
||||||
|
from mail_intake.auth import MailIntakeGoogleAuth
|
||||||
|
|
||||||
|
auth = MailIntakeGoogleAuth(
|
||||||
|
credentials_path="credentials.json",
|
||||||
|
token_path="token.pickle",
|
||||||
|
scopes=["https://www.googleapis.com/auth/gmail.readonly"],
|
||||||
|
)
|
||||||
|
|
||||||
|
adapter = MailIntakeGmailAdapter(auth_provider=auth)
|
||||||
|
reader = MailIntakeReader(adapter)
|
||||||
|
|
||||||
|
for message in reader.iter_messages("from:recruiter@example.com"):
|
||||||
|
print(message.subject, message.from_email)
|
||||||
|
|
||||||
|
Iterating over threads:
|
||||||
|
|
||||||
|
for thread in reader.iter_threads("subject:Interview"):
|
||||||
|
print(thread.normalized_subject, len(thread.messages))
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
Extensibility Model
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
||||||
|
Mail Intake is designed to be extensible via **public contracts** exposed
|
||||||
|
through its modules:
|
||||||
|
|
||||||
|
- Users MAY implement their own mail adapters by subclassing
|
||||||
|
`adapters.MailIntakeAdapter`
|
||||||
|
- Users MAY implement their own authentication providers by subclassing
|
||||||
|
`auth.MailIntakeAuthProvider`
|
||||||
|
|
||||||
|
Users SHOULD NOT subclass built-in adapter implementations. Built-in
|
||||||
|
adapters (such as Gmail) are reference implementations and may change
|
||||||
|
internally without notice.
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
Public API Surface
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
||||||
|
The supported public API consists of the following top-level modules:
|
||||||
|
|
||||||
|
- mail_intake.ingestion
|
||||||
|
- mail_intake.adapters
|
||||||
|
- mail_intake.auth
|
||||||
|
- mail_intake.parsers
|
||||||
|
- mail_intake.models
|
||||||
|
- mail_intake.config
|
||||||
|
- mail_intake.exceptions
|
||||||
|
|
||||||
|
Classes and functions should be imported explicitly from these modules.
|
||||||
|
No individual symbols are re-exported at the package root.
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
Design Guarantees
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
||||||
|
- Read-only access: no mutation of provider state
|
||||||
|
- Provider-agnostic domain models
|
||||||
|
- Explicit configuration and dependency injection
|
||||||
|
- No implicit global state or environment reads
|
||||||
|
- Deterministic, testable behavior
|
||||||
|
|
||||||
|
Mail Intake favors correctness, clarity, and explicitness over convenience
|
||||||
|
shortcuts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Expose each layer as a submodule of the package namespace.
#
# These must be explicit relative imports: inside a package ``__init__``,
# a bare ``import ingestion`` is an absolute import in Python 3 and does not
# resolve to ``mail_intake.ingestion``.
from . import ingestion
from . import adapters
from . import auth
# ``parsers`` is listed as a root-level public module in the module docstring,
# so it is imported and exported alongside the other layers.
from . import parsers
from . import models
from . import config
from . import exceptions

# Only submodules are exported; no individual symbols are re-exported here.
__all__ = [
    "ingestion",
    "adapters",
    "auth",
    "parsers",
    "models",
    "config",
    "exceptions",
]
|
||||||
|
|||||||
@@ -0,0 +1,28 @@
|
|||||||
|
"""
|
||||||
|
Mail provider adapter implementations for Mail Intake.
|
||||||
|
|
||||||
|
This package contains **adapter-layer implementations** responsible for
|
||||||
|
interfacing with external mail providers and exposing a normalized,
|
||||||
|
provider-agnostic contract to the rest of the system.
|
||||||
|
|
||||||
|
Adapters in this package:
|
||||||
|
- Implement the `MailIntakeAdapter` interface
|
||||||
|
- Encapsulate all provider-specific APIs and semantics
|
||||||
|
- Perform read-only access to mail data
|
||||||
|
- Return provider-native payloads without interpretation
|
||||||
|
|
||||||
|
Provider-specific logic **must not leak** outside of adapter implementations.
|
||||||
|
All parsing, normalization, and transformation must be handled by downstream
|
||||||
|
components.
|
||||||
|
|
||||||
|
Public adapters exported from this package are considered the supported
|
||||||
|
integration surface for mail providers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .base import MailIntakeAdapter
|
||||||
|
from .gmail import MailIntakeGmailAdapter
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MailIntakeAdapter",
|
||||||
|
"MailIntakeGmailAdapter",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,3 +1,14 @@
|
|||||||
|
"""
|
||||||
|
Mail provider adapter contracts for Mail Intake.
|
||||||
|
|
||||||
|
This module defines the **provider-agnostic adapter interface** used for
|
||||||
|
read-only mail ingestion.
|
||||||
|
|
||||||
|
Adapters encapsulate all provider-specific access logic and expose a
|
||||||
|
minimal, normalized contract to the rest of the system. No provider-specific
|
||||||
|
types or semantics should leak beyond implementations of this interface.
|
||||||
|
"""
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Iterator, Dict, Any
|
from typing import Iterator, Dict, Any
|
||||||
|
|
||||||
@@ -6,43 +17,60 @@ class MailIntakeAdapter(ABC):
|
|||||||
"""
|
"""
|
||||||
Base adapter interface for mail providers.
|
Base adapter interface for mail providers.
|
||||||
|
|
||||||
This interface defines the minimal contract required for
|
This interface defines the minimal contract required to:
|
||||||
read-only mail ingestion. No provider-specific concepts
|
- Discover messages matching a query
|
||||||
should leak beyond implementations of this class.
|
- Retrieve full message payloads
|
||||||
|
- Retrieve full thread payloads
|
||||||
|
|
||||||
|
Adapters are intentionally read-only and must not mutate provider state.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
|
def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
|
||||||
"""
|
"""
|
||||||
Iterate over lightweight message references.
|
Iterate over lightweight message references matching a query.
|
||||||
|
|
||||||
Must yield dictionaries containing at least:
|
Implementations must yield dictionaries containing at least:
|
||||||
- message_id
|
- ``message_id``: Provider-specific message identifier
|
||||||
- thread_id
|
- ``thread_id``: Provider-specific thread identifier
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: Provider-specific query string used to filter messages.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Dictionaries containing message and thread identifiers.
|
||||||
|
|
||||||
Example yield:
|
Example yield:
|
||||||
{
|
{
|
||||||
"message_id": "...",
|
"message_id": "...",
|
||||||
"thread_id": "..."
|
"thread_id": "..."
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def fetch_message(self, message_id: str) -> Dict[str, Any]:
|
def fetch_message(self, message_id: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Fetch a full raw message by message_id.
|
Fetch a full raw message by message identifier.
|
||||||
|
|
||||||
Returns the provider-native message payload
|
Args:
|
||||||
(e.g., Gmail message JSON).
|
message_id: Provider-specific message identifier.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Provider-native message payload
|
||||||
|
(e.g., Gmail message JSON structure).
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
|
def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Fetch a full raw thread by thread_id.
|
Fetch a full raw thread by thread identifier.
|
||||||
|
|
||||||
Returns the provider-native thread payload.
|
Args:
|
||||||
|
thread_id: Provider-specific thread identifier.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Provider-native thread payload.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|||||||
@@ -1,3 +1,17 @@
|
|||||||
|
"""
|
||||||
|
Gmail adapter implementation for Mail Intake.
|
||||||
|
|
||||||
|
This module provides a **Gmail-specific implementation** of the
|
||||||
|
`MailIntakeAdapter` contract.
|
||||||
|
|
||||||
|
It is the only place in the codebase where:
|
||||||
|
- `googleapiclient` is imported
|
||||||
|
- Gmail REST API semantics are known
|
||||||
|
- Low-level `.execute()` calls are made
|
||||||
|
|
||||||
|
All Gmail-specific behavior must be strictly contained within this module.
|
||||||
|
"""
|
||||||
|
|
||||||
from typing import Iterator, Dict, Any
|
from typing import Iterator, Dict, Any
|
||||||
|
|
||||||
from googleapiclient.discovery import build
|
from googleapiclient.discovery import build
|
||||||
@@ -12,12 +26,19 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
|
|||||||
"""
|
"""
|
||||||
Gmail read-only adapter.
|
Gmail read-only adapter.
|
||||||
|
|
||||||
|
This adapter implements the `MailIntakeAdapter` interface using the
|
||||||
|
Gmail REST API. It translates the generic mail intake contract into
|
||||||
|
Gmail-specific API calls.
|
||||||
|
|
||||||
This class is the ONLY place where:
|
This class is the ONLY place where:
|
||||||
- googleapiclient is imported
|
- googleapiclient is imported
|
||||||
- Gmail REST semantics are known
|
- Gmail REST semantics are known
|
||||||
- .execute() is called
|
- .execute() is called
|
||||||
|
|
||||||
It must remain thin and dumb by design.
|
Design constraints:
|
||||||
|
- Must remain thin and imperative
|
||||||
|
- Must not perform parsing or interpretation
|
||||||
|
- Must not expose Gmail-specific types beyond this class
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -25,12 +46,29 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
|
|||||||
auth_provider: MailIntakeAuthProvider,
|
auth_provider: MailIntakeAuthProvider,
|
||||||
user_id: str = "me",
|
user_id: str = "me",
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Initialize the Gmail adapter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
auth_provider: Authentication provider capable of supplying
|
||||||
|
valid Gmail API credentials.
|
||||||
|
user_id: Gmail user identifier. Defaults to `"me"`.
|
||||||
|
"""
|
||||||
self._auth_provider = auth_provider
|
self._auth_provider = auth_provider
|
||||||
self._user_id = user_id
|
self._user_id = user_id
|
||||||
self._service = None
|
self._service = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def service(self):
|
def service(self):
|
||||||
|
"""
|
||||||
|
Lazily initialize and return the Gmail API service client.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Initialized Gmail API service instance.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeAdapterError: If the Gmail service cannot be initialized.
|
||||||
|
"""
|
||||||
if self._service is None:
|
if self._service is None:
|
||||||
try:
|
try:
|
||||||
creds = self._auth_provider.get_credentials()
|
creds = self._auth_provider.get_credentials()
|
||||||
@@ -45,11 +83,16 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
|
|||||||
"""
|
"""
|
||||||
Iterate over message references matching the query.
|
Iterate over message references matching the query.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: Gmail search query string.
|
||||||
|
|
||||||
Yields:
|
Yields:
|
||||||
{
|
Dictionaries containing:
|
||||||
"message_id": "...",
|
- ``message_id``: Gmail message ID
|
||||||
"thread_id": "..."
|
- ``thread_id``: Gmail thread ID
|
||||||
}
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeAdapterError: If the Gmail API returns an error.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
request = (
|
request = (
|
||||||
@@ -79,6 +122,18 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
|
|||||||
) from exc
|
) from exc
|
||||||
|
|
||||||
def fetch_message(self, message_id: str) -> Dict[str, Any]:
|
def fetch_message(self, message_id: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Fetch a full Gmail message by message ID.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message_id: Gmail message identifier.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Provider-native Gmail message payload.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeAdapterError: If the Gmail API returns an error.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
return (
|
return (
|
||||||
self.service.users()
|
self.service.users()
|
||||||
@@ -92,6 +147,18 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
|
|||||||
) from exc
|
) from exc
|
||||||
|
|
||||||
def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
|
def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Fetch a full Gmail thread by thread ID.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
thread_id: Gmail thread identifier.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Provider-native Gmail thread payload.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeAdapterError: If the Gmail API returns an error.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
return (
|
return (
|
||||||
self.service.users()
|
self.service.users()
|
||||||
|
|||||||
@@ -0,0 +1,26 @@
|
|||||||
|
"""
|
||||||
|
Authentication provider implementations for Mail Intake.
|
||||||
|
|
||||||
|
This package defines the **authentication layer** used by mail adapters
|
||||||
|
to obtain provider-specific credentials.
|
||||||
|
|
||||||
|
It exposes:
|
||||||
|
- A stable, provider-agnostic authentication contract
|
||||||
|
- Concrete authentication providers for supported platforms
|
||||||
|
|
||||||
|
Authentication providers:
|
||||||
|
- Are responsible for credential acquisition and lifecycle management
|
||||||
|
- Are intentionally decoupled from adapter logic
|
||||||
|
- May be extended by users to support additional providers
|
||||||
|
|
||||||
|
Consumers should depend on the abstract interface and use concrete
|
||||||
|
implementations only where explicitly required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .base import MailIntakeAuthProvider
|
||||||
|
from .google import MailIntakeGoogleAuth
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MailIntakeAuthProvider",
|
||||||
|
"MailIntakeGoogleAuth",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,3 +1,14 @@
|
|||||||
|
"""
|
||||||
|
Authentication provider contracts for Mail Intake.
|
||||||
|
|
||||||
|
This module defines the **authentication abstraction layer** used by mail
|
||||||
|
adapters to obtain provider-specific credentials.
|
||||||
|
|
||||||
|
Authentication concerns are intentionally decoupled from adapter logic.
|
||||||
|
Adapters depend only on this interface and must not be aware of how
|
||||||
|
credentials are acquired, refreshed, or stored.
|
||||||
|
"""
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
|
||||||
@@ -7,6 +18,17 @@ class MailIntakeAuthProvider(ABC):
|
|||||||
|
|
||||||
Mail adapters depend on this interface, not on concrete
|
Mail adapters depend on this interface, not on concrete
|
||||||
OAuth or credential implementations.
|
OAuth or credential implementations.
|
||||||
|
|
||||||
|
Authentication providers encapsulate all logic required to acquire
|
||||||
|
valid credentials for a mail provider.
|
||||||
|
|
||||||
|
Implementations may involve:
|
||||||
|
- OAuth flows
|
||||||
|
- Service account credentials
|
||||||
|
- Token refresh logic
|
||||||
|
- Secure credential storage
|
||||||
|
|
||||||
|
Adapters must treat the returned credentials as opaque and provider-specific.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@@ -16,5 +38,13 @@ class MailIntakeAuthProvider(ABC):
|
|||||||
|
|
||||||
This method is synchronous by design and must either
|
This method is synchronous by design and must either
|
||||||
return valid credentials or raise MailIntakeAuthError.
|
return valid credentials or raise MailIntakeAuthError.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Provider-specific credentials object suitable for use by
|
||||||
|
the corresponding mail adapter.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeAuthError: Authentication failures encountered by the
|
||||||
|
implementation.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|||||||
@@ -1,3 +1,17 @@
|
|||||||
|
"""
|
||||||
|
Google authentication provider implementation for Mail Intake.
|
||||||
|
|
||||||
|
This module provides a **Google OAuth–based authentication provider**
|
||||||
|
used primarily for Gmail access.
|
||||||
|
|
||||||
|
It encapsulates all Google-specific authentication concerns, including:
|
||||||
|
- Credential loading and persistence
|
||||||
|
- Token refresh handling
|
||||||
|
- Interactive OAuth flow initiation
|
||||||
|
|
||||||
|
No Google authentication details should leak outside this module.
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from typing import Sequence
|
from typing import Sequence
|
||||||
@@ -14,12 +28,16 @@ class MailIntakeGoogleAuth(MailIntakeAuthProvider):
|
|||||||
"""
|
"""
|
||||||
Google OAuth provider for Gmail access.
|
Google OAuth provider for Gmail access.
|
||||||
|
|
||||||
Responsibilities:
|
This provider implements the `MailIntakeAuthProvider` interface using
|
||||||
- Load cached credentials from disk
|
Google's OAuth 2.0 flow and credential management libraries.
|
||||||
- Refresh expired tokens when possible
|
|
||||||
- Trigger interactive login only when strictly required
|
|
||||||
|
|
||||||
This class is synchronous and intentionally state-light.
|
Responsibilities:
|
||||||
|
- Load cached credentials from disk when available
|
||||||
|
- Refresh expired credentials when possible
|
||||||
|
- Initiate an interactive OAuth flow only when required
|
||||||
|
- Persist refreshed or newly obtained credentials
|
||||||
|
|
||||||
|
This class is synchronous by design and maintains minimal internal state.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -28,11 +46,36 @@ class MailIntakeGoogleAuth(MailIntakeAuthProvider):
|
|||||||
token_path: str,
|
token_path: str,
|
||||||
scopes: Sequence[str],
|
scopes: Sequence[str],
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Initialize the Google authentication provider.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
credentials_path: Path to the Google OAuth client secrets file.
|
||||||
|
token_path: Path where OAuth tokens will be cached.
|
||||||
|
scopes: OAuth scopes required for access.
|
||||||
|
"""
|
||||||
self.credentials_path = credentials_path
|
self.credentials_path = credentials_path
|
||||||
self.token_path = token_path
|
self.token_path = token_path
|
||||||
self.scopes = list(scopes)
|
self.scopes = list(scopes)
|
||||||
|
|
||||||
def get_credentials(self):
|
def get_credentials(self):
|
||||||
|
"""
|
||||||
|
Retrieve valid Google OAuth credentials.
|
||||||
|
|
||||||
|
This method attempts to:
|
||||||
|
1. Load cached credentials from disk
|
||||||
|
2. Refresh expired credentials when possible
|
||||||
|
3. Perform an interactive OAuth login as a fallback
|
||||||
|
4. Persist valid credentials for future use
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Google OAuth credentials object suitable for use with
|
||||||
|
Google API clients.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeAuthError: If credentials cannot be loaded, refreshed,
|
||||||
|
or obtained via interactive authentication.
|
||||||
|
"""
|
||||||
creds = None
|
creds = None
|
||||||
|
|
||||||
# Attempt to load cached credentials
|
# Attempt to load cached credentials
|
||||||
|
|||||||
@@ -1,3 +1,14 @@
|
|||||||
|
"""
|
||||||
|
Global configuration models for Mail Intake.
|
||||||
|
|
||||||
|
This module defines the **top-level configuration object** used to control
|
||||||
|
mail ingestion behavior across adapters, authentication providers, and
|
||||||
|
ingestion workflows.
|
||||||
|
|
||||||
|
Configuration is intentionally explicit, immutable, and free of implicit
|
||||||
|
environment reads to ensure predictability and testability.
|
||||||
|
"""
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@@ -9,12 +20,26 @@ class MailIntakeConfig:
|
|||||||
|
|
||||||
This configuration is intentionally explicit and immutable.
|
This configuration is intentionally explicit and immutable.
|
||||||
No implicit environment reads or global state.
|
No implicit environment reads or global state.
|
||||||
|
|
||||||
|
Design principles:
|
||||||
|
- Immutable once constructed
|
||||||
|
- Explicit configuration over implicit defaults
|
||||||
|
- No direct environment or filesystem access
|
||||||
|
|
||||||
|
This model is safe to pass across layers and suitable for serialization.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
provider: str = "gmail"
|
provider: str = "gmail"
|
||||||
user_id: str = "me"
|
"""Identifier of the mail provider to use (e.g., ``"gmail"``)."""
|
||||||
readonly: bool = True
|
|
||||||
|
user_id: str = "me"
|
||||||
|
"""Provider-specific user identifier. Defaults to the authenticated user."""
|
||||||
|
|
||||||
|
readonly: bool = True
|
||||||
|
"""Whether ingestion should operate in read-only mode."""
|
||||||
|
|
||||||
# Provider-specific paths (optional at this layer)
|
|
||||||
credentials_path: Optional[str] = None
|
credentials_path: Optional[str] = None
|
||||||
|
"""Optional path to provider credentials configuration."""
|
||||||
|
|
||||||
token_path: Optional[str] = None
|
token_path: Optional[str] = None
|
||||||
|
"""Optional path to persisted authentication tokens."""
|
||||||
|
|||||||
@@ -1,19 +1,49 @@
|
|||||||
|
"""
|
||||||
|
Exception hierarchy for Mail Intake.
|
||||||
|
|
||||||
|
This module defines the **canonical exception types** used throughout the
|
||||||
|
Mail Intake library.
|
||||||
|
|
||||||
|
All library-raised errors derive from `MailIntakeError`. Consumers are
|
||||||
|
encouraged to catch this base type (or specific subclasses) rather than
|
||||||
|
provider-specific or third-party exceptions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class MailIntakeError(Exception):
|
class MailIntakeError(Exception):
|
||||||
"""
|
"""
|
||||||
Base exception for all mail-intake errors.
|
Base exception for all Mail Intake errors.
|
||||||
|
|
||||||
Users of the library should catch this type (or subclasses)
|
This is the root of the Mail Intake exception hierarchy.
|
||||||
instead of provider-specific or third-party exceptions.
|
All errors raised by the library must derive from this class.
|
||||||
|
|
||||||
|
Consumers should generally catch this type when handling
|
||||||
|
library-level failures.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class MailIntakeAuthError(MailIntakeError):
|
class MailIntakeAuthError(MailIntakeError):
|
||||||
"""Authentication and credential-related failures."""
|
"""
|
||||||
|
Authentication and credential-related failures.
|
||||||
|
|
||||||
|
Raised when authentication providers are unable to acquire,
|
||||||
|
refresh, or persist valid credentials.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class MailIntakeAdapterError(MailIntakeError):
|
class MailIntakeAdapterError(MailIntakeError):
|
||||||
"""Errors raised by mail provider adapters."""
|
"""
|
||||||
|
Errors raised by mail provider adapters.
|
||||||
|
|
||||||
|
Raised when a provider adapter encounters API errors,
|
||||||
|
transport failures, or invalid provider responses.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class MailIntakeParsingError(MailIntakeError):
|
class MailIntakeParsingError(MailIntakeError):
|
||||||
"""Errors encountered while parsing message content."""
|
"""
|
||||||
|
Errors encountered while parsing message content.
|
||||||
|
|
||||||
|
Raised when raw provider payloads cannot be interpreted
|
||||||
|
or normalized into internal domain models.
|
||||||
|
"""
|
||||||
|
|||||||
@@ -0,0 +1,24 @@
|
|||||||
|
"""
|
||||||
|
Mail ingestion orchestration for Mail Intake.
|
||||||
|
|
||||||
|
This package contains **high-level ingestion components** responsible for
|
||||||
|
coordinating mail retrieval, parsing, normalization, and model construction.
|
||||||
|
|
||||||
|
It represents the **top of the ingestion pipeline** and is intended to be the
|
||||||
|
primary interaction surface for library consumers.
|
||||||
|
|
||||||
|
Components in this package:
|
||||||
|
- Are provider-agnostic
|
||||||
|
- Depend only on adapter and parser contracts
|
||||||
|
- Contain no provider-specific API logic
|
||||||
|
- Expose read-only ingestion workflows
|
||||||
|
|
||||||
|
Consumers are expected to construct a mail adapter and pass it to the
|
||||||
|
ingestion layer to begin processing messages and threads.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .reader import MailIntakeReader
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MailIntakeReader",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,3 +1,18 @@
|
|||||||
|
"""
|
||||||
|
High-level mail ingestion orchestration for Mail Intake.
|
||||||
|
|
||||||
|
This module provides the primary, provider-agnostic entry point for
|
||||||
|
reading and processing mail data.
|
||||||
|
|
||||||
|
It coordinates:
|
||||||
|
- Mail adapter access
|
||||||
|
- Message and thread iteration
|
||||||
|
- Header and body parsing
|
||||||
|
- Normalization and model construction
|
||||||
|
|
||||||
|
No provider-specific logic or API semantics are permitted in this layer.
|
||||||
|
"""
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Iterator, Dict, Any
|
from typing import Iterator, Dict, Any
|
||||||
|
|
||||||
@@ -14,22 +29,43 @@ class MailIntakeReader:
|
|||||||
"""
|
"""
|
||||||
High-level read-only ingestion interface.
|
High-level read-only ingestion interface.
|
||||||
|
|
||||||
This is the primary entry point users should interact with.
|
This class is the **primary entry point** for consumers of the Mail
|
||||||
It orchestrates:
|
Intake library.
|
||||||
- adapter calls
|
|
||||||
- parsing
|
|
||||||
- normalization
|
|
||||||
- model construction
|
|
||||||
|
|
||||||
No provider-specific logic exists here.
|
It orchestrates the full ingestion pipeline:
|
||||||
|
- Querying the adapter for message references
|
||||||
|
- Fetching raw provider messages
|
||||||
|
- Parsing and normalizing message data
|
||||||
|
- Constructing domain models
|
||||||
|
|
||||||
|
This class is intentionally:
|
||||||
|
- Provider-agnostic
|
||||||
|
- Stateless beyond iteration scope
|
||||||
|
- Read-only
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, adapter: MailIntakeAdapter):
|
def __init__(self, adapter: MailIntakeAdapter):
|
||||||
|
"""
|
||||||
|
Initialize the mail reader.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
adapter: Mail adapter implementation used to retrieve raw
|
||||||
|
messages and threads from a mail provider.
|
||||||
|
"""
|
||||||
self._adapter = adapter
|
self._adapter = adapter
|
||||||
|
|
||||||
def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]:
|
def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]:
|
||||||
"""
|
"""
|
||||||
Iterate over parsed messages matching a provider query.
|
Iterate over parsed messages matching a provider query.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: Provider-specific query string used to filter messages.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Fully parsed and normalized `MailIntakeMessage` instances.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeParsingError: If a message cannot be parsed.
|
||||||
"""
|
"""
|
||||||
for ref in self._adapter.iter_message_refs(query):
|
for ref in self._adapter.iter_message_refs(query):
|
||||||
raw = self._adapter.fetch_message(ref["message_id"])
|
raw = self._adapter.fetch_message(ref["message_id"])
|
||||||
@@ -39,7 +75,17 @@ class MailIntakeReader:
|
|||||||
"""
|
"""
|
||||||
Iterate over threads constructed from messages matching a query.
|
Iterate over threads constructed from messages matching a query.
|
||||||
|
|
||||||
Messages are grouped by thread_id and yielded as complete threads.
|
Messages are grouped by `thread_id` and yielded as complete thread
|
||||||
|
objects containing all associated messages.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: Provider-specific query string used to filter messages.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An iterator of `MailIntakeThread` instances.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeParsingError: If a message cannot be parsed.
|
||||||
"""
|
"""
|
||||||
threads: Dict[str, MailIntakeThread] = {}
|
threads: Dict[str, MailIntakeThread] = {}
|
||||||
|
|
||||||
@@ -61,7 +107,17 @@ class MailIntakeReader:
|
|||||||
|
|
||||||
def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage:
|
def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage:
|
||||||
"""
|
"""
|
||||||
Parse a raw provider message into a MailIntakeMessage.
|
Parse a raw provider message into a `MailIntakeMessage`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_message: Provider-native message payload.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A fully populated `MailIntakeMessage` instance.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeParsingError: If the message payload is missing required
|
||||||
|
fields or cannot be parsed.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
message_id = raw_message["id"]
|
message_id = raw_message["id"]
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
"""
|
||||||
|
Domain models for Mail Intake.
|
||||||
|
|
||||||
|
This package defines the **canonical, provider-agnostic data models**
|
||||||
|
used throughout the Mail Intake ingestion pipeline.
|
||||||
|
|
||||||
|
Models in this package:
|
||||||
|
- Represent fully parsed and normalized mail data
|
||||||
|
- Are safe to persist, serialize, and index
|
||||||
|
- Contain no provider-specific payloads or API semantics
|
||||||
|
- Serve as stable inputs for downstream processing and analysis
|
||||||
|
|
||||||
|
These models form the core internal data contract of the library.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .message import MailIntakeMessage
|
||||||
|
from .thread import MailIntakeThread
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MailIntakeMessage",
|
||||||
|
"MailIntakeThread",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,3 +1,14 @@
|
|||||||
|
"""
|
||||||
|
Message domain models for Mail Intake.
|
||||||
|
|
||||||
|
This module defines the **canonical, provider-agnostic representation**
|
||||||
|
of an individual email message as used internally by the Mail Intake
|
||||||
|
ingestion pipeline.
|
||||||
|
|
||||||
|
Models in this module are safe to persist and must not contain any
|
||||||
|
provider-specific fields or semantics.
|
||||||
|
"""
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Dict
|
from typing import Optional, Dict
|
||||||
@@ -8,19 +19,37 @@ class MailIntakeMessage:
|
|||||||
"""
|
"""
|
||||||
Canonical internal representation of a single email message.
|
Canonical internal representation of a single email message.
|
||||||
|
|
||||||
This model is provider-agnostic and safe to persist.
|
This model represents a fully parsed and normalized email message.
|
||||||
No Gmail-specific fields should appear here.
|
It is intentionally provider-agnostic and suitable for persistence,
|
||||||
|
indexing, and downstream processing.
|
||||||
|
|
||||||
|
Other than the opaque message and thread IDs, no provider-specific payloads or API semantics
|
||||||
|
should appear in this model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
message_id: str
|
message_id: str
|
||||||
|
"""Provider-specific message identifier."""
|
||||||
|
|
||||||
thread_id: str
|
thread_id: str
|
||||||
|
"""Provider-specific thread identifier to which this message belongs."""
|
||||||
|
|
||||||
timestamp: datetime
|
timestamp: datetime
|
||||||
|
"""Message timestamp as a timezone-naive UTC datetime."""
|
||||||
|
|
||||||
from_email: str
|
from_email: str
|
||||||
|
"""Sender email address."""
|
||||||
|
|
||||||
from_name: Optional[str]
|
from_name: Optional[str]
|
||||||
|
"""Optional human-readable sender name."""
|
||||||
|
|
||||||
subject: str
|
subject: str
|
||||||
|
"""Raw subject line of the message."""
|
||||||
|
|
||||||
body_text: str
|
body_text: str
|
||||||
|
"""Extracted plain-text body content of the message."""
|
||||||
|
|
||||||
snippet: str
|
snippet: str
|
||||||
|
"""Short provider-supplied preview snippet of the message."""
|
||||||
|
|
||||||
raw_headers: Dict[str, str]
|
raw_headers: Dict[str, str]
|
||||||
|
"""Normalized mapping of message headers (header name → value)."""
|
||||||
|
|||||||
@@ -1,3 +1,13 @@
|
|||||||
|
"""
|
||||||
|
Thread domain models for Mail Intake.
|
||||||
|
|
||||||
|
This module defines the **canonical, provider-agnostic representation**
|
||||||
|
of an email thread as used internally by the Mail Intake ingestion pipeline.
|
||||||
|
|
||||||
|
Threads group related messages and serve as the primary unit of reasoning
|
||||||
|
for higher-level correspondence workflows.
|
||||||
|
"""
|
||||||
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Set
|
from typing import List, Set
|
||||||
@@ -10,21 +20,40 @@ class MailIntakeThread:
|
|||||||
"""
|
"""
|
||||||
Canonical internal representation of an email thread.
|
Canonical internal representation of an email thread.
|
||||||
|
|
||||||
Threads are the primary unit of reasoning for correspondence
|
A thread groups multiple related messages under a single subject
|
||||||
workflows (job applications, interviews, follow-ups, etc.).
|
and participant set. It is designed to support reasoning over
|
||||||
|
conversational context such as job applications, interviews,
|
||||||
|
follow-ups, and ongoing discussions.
|
||||||
|
|
||||||
|
This model is provider-agnostic and safe to persist.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
thread_id: str
|
thread_id: str
|
||||||
|
"""Provider-specific thread identifier."""
|
||||||
|
|
||||||
normalized_subject: str
|
normalized_subject: str
|
||||||
|
"""Normalized subject line used to group related messages."""
|
||||||
|
|
||||||
participants: Set[str] = field(default_factory=set)
|
participants: Set[str] = field(default_factory=set)
|
||||||
|
"""Set of unique participant email addresses observed in the thread."""
|
||||||
|
|
||||||
messages: List[MailIntakeMessage] = field(default_factory=list)
|
messages: List[MailIntakeMessage] = field(default_factory=list)
|
||||||
|
"""Ordered list of messages belonging to this thread."""
|
||||||
|
|
||||||
last_activity_at: datetime | None = None
|
last_activity_at: datetime | None = None
|
||||||
|
"""Timestamp of the most recent message in the thread."""
|
||||||
|
|
||||||
def add_message(self, message: MailIntakeMessage) -> None:
|
def add_message(self, message: MailIntakeMessage) -> None:
|
||||||
"""
|
"""
|
||||||
Add a message to the thread and update derived fields.
|
Add a message to the thread and update derived fields.
|
||||||
|
|
||||||
|
This method:
|
||||||
|
- Appends the message to the thread
|
||||||
|
- Tracks unique participants
|
||||||
|
- Updates the last activity timestamp
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message: Parsed mail message to add to the thread.
|
||||||
"""
|
"""
|
||||||
self.messages.append(message)
|
self.messages.append(message)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,30 @@
|
|||||||
|
"""
|
||||||
|
Message parsing utilities for Mail Intake.
|
||||||
|
|
||||||
|
This package contains **provider-aware but adapter-agnostic parsing helpers**
|
||||||
|
used to extract and normalize structured information from raw mail payloads.
|
||||||
|
|
||||||
|
Parsers in this package are responsible for:
|
||||||
|
- Interpreting provider-native message structures
|
||||||
|
- Extracting meaningful fields such as headers, body text, and subjects
|
||||||
|
- Normalizing data into consistent internal representations
|
||||||
|
|
||||||
|
This package does not:
|
||||||
|
- Perform network or IO operations
|
||||||
|
- Contain provider API logic
|
||||||
|
- Construct domain models directly
|
||||||
|
|
||||||
|
Parsing functions are designed to be composable and are orchestrated by the
|
||||||
|
ingestion layer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .body import extract_body
|
||||||
|
from .headers import parse_headers, extract_sender
|
||||||
|
from .subject import normalize_subject
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"extract_body",
|
||||||
|
"parse_headers",
|
||||||
|
"extract_sender",
|
||||||
|
"normalize_subject",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,3 +1,13 @@
|
|||||||
|
"""
|
||||||
|
Message body extraction utilities for Mail Intake.
|
||||||
|
|
||||||
|
This module contains helper functions for extracting a best-effort
|
||||||
|
plain-text body from provider-native message payloads.
|
||||||
|
|
||||||
|
The logic is intentionally tolerant of malformed or partial data and
|
||||||
|
prefers human-readable text over fidelity to original formatting.
|
||||||
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
from typing import Dict, Any, Optional
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
@@ -9,6 +19,18 @@ from mail_intake.exceptions import MailIntakeParsingError
|
|||||||
def _decode_base64(data: str) -> str:
|
def _decode_base64(data: str) -> str:
|
||||||
"""
|
"""
|
||||||
Decode Gmail URL-safe base64 payload into UTF-8 text.
|
Decode Gmail URL-safe base64 payload into UTF-8 text.
|
||||||
|
|
||||||
|
Gmail message bodies are encoded using URL-safe base64, which may
|
||||||
|
omit padding and replaces ``+`` and ``/`` with ``-`` and ``_``.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: URL-safe base64-encoded string.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Decoded UTF-8 text with replacement for invalid characters.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MailIntakeParsingError: If decoding fails.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
padded = data.replace("-", "+").replace("_", "/")
|
padded = data.replace("-", "+").replace("_", "/")
|
||||||
@@ -21,6 +43,16 @@ def _decode_base64(data: str) -> str:
|
|||||||
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
|
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Extract text content from a single MIME part.
|
Extract text content from a single MIME part.
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- text/plain
|
||||||
|
- text/html (converted to plain text)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
part: MIME part dictionary from a provider payload.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Extracted plain-text content, or None if unsupported or empty.
|
||||||
"""
|
"""
|
||||||
mime_type = part.get("mimeType")
|
mime_type = part.get("mimeType")
|
||||||
body = part.get("body", {})
|
body = part.get("body", {})
|
||||||
@@ -49,7 +81,14 @@ def extract_body(payload: Dict[str, Any]) -> str:
|
|||||||
Priority:
|
Priority:
|
||||||
1. text/plain
|
1. text/plain
|
||||||
2. text/html (stripped to text)
|
2. text/html (stripped to text)
|
||||||
3. empty string (if nothing usable found)
|
3. Single-part body
|
||||||
|
4. empty string (if nothing usable found)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
payload: Provider-native message payload dictionary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Extracted plain-text message body.
|
||||||
"""
|
"""
|
||||||
if not payload:
|
if not payload:
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -1,3 +1,13 @@
|
|||||||
|
"""
|
||||||
|
Message header parsing utilities for Mail Intake.
|
||||||
|
|
||||||
|
This module provides helper functions for normalizing and extracting
|
||||||
|
useful information from provider-native message headers.
|
||||||
|
|
||||||
|
The functions here are intentionally simple and tolerant of malformed
|
||||||
|
or incomplete header data.
|
||||||
|
"""
|
||||||
|
|
||||||
from typing import Dict, List, Tuple, Optional
|
from typing import Dict, List, Tuple, Optional
|
||||||
|
|
||||||
|
|
||||||
@@ -5,19 +15,29 @@ def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
|
|||||||
"""
|
"""
|
||||||
Convert a list of Gmail-style headers into a normalized dict.
|
Convert a list of Gmail-style headers into a normalized dict.
|
||||||
|
|
||||||
Input:
|
Provider payloads (such as Gmail) typically represent headers as a list
|
||||||
[
|
of name/value mappings. This function normalizes them into a
|
||||||
{"name": "From", "value": "John Doe <john@example.com>"},
|
plain dictionary keyed by lowercase header names.
|
||||||
{"name": "Subject", "value": "Re: Interview Update"},
|
|
||||||
...
|
|
||||||
]
|
|
||||||
|
|
||||||
Output:
|
Args:
|
||||||
{
|
raw_headers: List of header dictionaries, each containing
|
||||||
"from": "...",
|
``name`` and ``value`` keys.
|
||||||
"subject": "...",
|
|
||||||
...
|
Returns:
|
||||||
}
|
Dictionary mapping lowercase header names to stripped values.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
Input:
|
||||||
|
[
|
||||||
|
{"name": "From", "value": "John Doe <john@example.com>"},
|
||||||
|
{"name": "Subject", "value": "Re: Interview Update"},
|
||||||
|
]
|
||||||
|
|
||||||
|
Output:
|
||||||
|
{
|
||||||
|
"from": "John Doe <john@example.com>",
|
||||||
|
"subject": "Re: Interview Update",
|
||||||
|
}
|
||||||
"""
|
"""
|
||||||
headers: Dict[str, str] = {}
|
headers: Dict[str, str] = {}
|
||||||
|
|
||||||
@@ -37,18 +57,27 @@ def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
|
|||||||
"""
|
"""
|
||||||
Extract sender email and optional display name from headers.
|
Extract sender email and optional display name from headers.
|
||||||
|
|
||||||
Returns:
|
This function parses the ``From`` header and attempts to extract:
|
||||||
(email, name)
|
- Sender email address
|
||||||
|
- Optional human-readable display name
|
||||||
|
|
||||||
If name cannot be determined, name will be None.
|
Args:
|
||||||
|
headers: Normalized header dictionary as returned by
|
||||||
|
:func:`parse_headers`.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A tuple ``(email, name)`` where:
|
||||||
|
- ``email`` is the sender email address, or ``""`` if no ``From`` header is present
|
||||||
|
- ``name`` is the display name, or ``None`` if unavailable
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
``"John Doe <john@example.com>"`` → ``("john@example.com", "John Doe")``
|
||||||
|
``"john@example.com"`` → ``("john@example.com", None)``
|
||||||
"""
|
"""
|
||||||
from_header = headers.get("from")
|
from_header = headers.get("from")
|
||||||
if not from_header:
|
if not from_header:
|
||||||
return "", None
|
return "", None
|
||||||
|
|
||||||
# Common forms:
|
|
||||||
# Name <email@domain>
|
|
||||||
# email@domain
|
|
||||||
if "<" in from_header and ">" in from_header:
|
if "<" in from_header and ">" in from_header:
|
||||||
name_part, email_part = from_header.split("<", 1)
|
name_part, email_part = from_header.split("<", 1)
|
||||||
email = email_part.rstrip(">").strip()
|
email = email_part.rstrip(">").strip()
|
||||||
|
|||||||
@@ -1,7 +1,18 @@
|
|||||||
|
"""
|
||||||
|
Subject line normalization utilities for Mail Intake.
|
||||||
|
|
||||||
|
This module provides helper functions for normalizing email subject lines
|
||||||
|
to enable reliable thread-level comparison and grouping.
|
||||||
|
|
||||||
|
Normalization is intentionally conservative to avoid altering semantic
|
||||||
|
meaning while removing common reply and forward prefixes.
|
||||||
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
|
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
|
||||||
|
"""Regular expression matching common reply/forward subject prefixes."""
|
||||||
|
|
||||||
|
|
||||||
def normalize_subject(subject: str) -> str:
|
def normalize_subject(subject: str) -> str:
|
||||||
@@ -9,11 +20,19 @@ def normalize_subject(subject: str) -> str:
|
|||||||
Normalize an email subject for thread-level comparison.
|
Normalize an email subject for thread-level comparison.
|
||||||
|
|
||||||
Operations:
|
Operations:
|
||||||
- Strip common prefixes (Re:, Fwd:, FW:)
|
- Strips common prefixes such as ``Re:``, ``Fwd:``, and ``FW:``
|
||||||
- Collapse whitespace
|
- Repeats prefix stripping to handle stacked prefixes
|
||||||
- Preserve original casing (no lowercasing)
|
- Collapses excessive whitespace
|
||||||
|
- Preserves original casing (no lowercasing)
|
||||||
|
|
||||||
This function is intentionally conservative.
|
This function is intentionally conservative and avoids aggressive
|
||||||
|
transformations that could alter the semantic meaning of the subject.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
subject: Raw subject line from a message header.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Normalized subject string suitable for thread grouping.
|
||||||
"""
|
"""
|
||||||
if not subject:
|
if not subject:
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
Reference in New Issue
Block a user