diff --git a/mail_intake/__init__.py b/mail_intake/__init__.py index e69de29..0db787b 100644 --- a/mail_intake/__init__.py +++ b/mail_intake/__init__.py @@ -0,0 +1,126 @@ +""" +Mail Intake — provider-agnostic, read-only email ingestion framework. + +Mail Intake is a **contract-first library** designed to ingest, parse, and +normalize email data from external providers (such as Gmail) into clean, +provider-agnostic domain models. + +The library is intentionally structured around clear layers, each exposed +as a first-class module at the package root: + +- adapters: provider-specific access (e.g. Gmail) +- auth: authentication providers and credential management +- parsers: extraction and normalization of message content +- ingestion: orchestration and high-level ingestion workflows +- models: canonical, provider-agnostic data representations +- config: explicit global configuration +- exceptions: library-defined error hierarchy + +The package root acts as a **namespace**, not a facade. Consumers are +expected to import functionality explicitly from the appropriate module. + +---------------------------------------------------------------------- +Installation +---------------------------------------------------------------------- + +Install using pip: + + pip install mail-intake + +Or with Poetry: + + poetry add mail-intake + +Mail Intake is pure Python and has no runtime dependencies beyond those +required by the selected provider (for example, Google APIs for Gmail). 
+ +---------------------------------------------------------------------- +Basic Usage +---------------------------------------------------------------------- + +Minimal Gmail ingestion example: + + from mail_intake.ingestion import MailIntakeReader + from mail_intake.adapters import MailIntakeGmailAdapter + from mail_intake.auth import MailIntakeGoogleAuth + + auth = MailIntakeGoogleAuth( + credentials_path="credentials.json", + token_path="token.pickle", + scopes=["https://www.googleapis.com/auth/gmail.readonly"], + ) + + adapter = MailIntakeGmailAdapter(auth_provider=auth) + reader = MailIntakeReader(adapter) + + for message in reader.iter_messages("from:recruiter@example.com"): + print(message.subject, message.from_email) + +Iterating over threads: + + for thread in reader.iter_threads("subject:Interview"): + print(thread.normalized_subject, len(thread.messages)) + +---------------------------------------------------------------------- +Extensibility Model +---------------------------------------------------------------------- + +Mail Intake is designed to be extensible via **public contracts** exposed +through its modules: + +- Users MAY implement their own mail adapters by subclassing + `adapters.MailIntakeAdapter` +- Users MAY implement their own authentication providers by subclassing + `auth.MailIntakeAuthProvider` + +Users SHOULD NOT subclass built-in adapter implementations. Built-in +adapters (such as Gmail) are reference implementations and may change +internally without notice. + +---------------------------------------------------------------------- +Public API Surface +---------------------------------------------------------------------- + +The supported public API consists of the following top-level modules: + +- mail_intake.ingestion +- mail_intake.adapters +- mail_intake.auth +- mail_intake.parsers +- mail_intake.models +- mail_intake.config +- mail_intake.exceptions + +Classes and functions should be imported explicitly from these modules. 
+No individual symbols are re-exported at the package root.
+
+----------------------------------------------------------------------
+Design Guarantees
+----------------------------------------------------------------------
+
+- Read-only access: no mutation of provider state
+- Provider-agnostic domain models
+- Explicit configuration and dependency injection
+- No implicit global state or environment reads
+- Deterministic, testable behavior
+
+Mail Intake favors correctness, clarity, and explicitness over convenience
+shortcuts.
+"""
+
+
+from . import ingestion
+from . import adapters
+from . import auth
+from . import models
+from . import config
+from . import exceptions
+
+__all__ = [
+    "ingestion",
+    "adapters",
+    "auth",
+    "models",
+    "config",
+    "exceptions",
+]
diff --git a/mail_intake/adapters/__init__.py b/mail_intake/adapters/__init__.py
index e69de29..908d78c 100644
--- a/mail_intake/adapters/__init__.py
+++ b/mail_intake/adapters/__init__.py
@@ -0,0 +1,28 @@
+"""
+Mail provider adapter implementations for Mail Intake.
+
+This package contains **adapter-layer implementations** responsible for
+interfacing with external mail providers and exposing a normalized,
+provider-agnostic contract to the rest of the system.
+
+Adapters in this package:
+- Implement the `MailIntakeAdapter` interface
+- Encapsulate all provider-specific APIs and semantics
+- Perform read-only access to mail data
+- Return provider-native payloads without interpretation
+
+Provider-specific logic **must not leak** outside of adapter implementations.
+All parsing, normalization, and transformation must be handled by downstream
+components.
+
+Public adapters exported from this package are considered the supported
+integration surface for mail providers.
+""" + +from .base import MailIntakeAdapter +from .gmail import MailIntakeGmailAdapter + +__all__ = [ + "MailIntakeAdapter", + "MailIntakeGmailAdapter", +] diff --git a/mail_intake/adapters/base.py b/mail_intake/adapters/base.py index e26e17d..f956e75 100644 --- a/mail_intake/adapters/base.py +++ b/mail_intake/adapters/base.py @@ -1,3 +1,14 @@ +""" +Mail provider adapter contracts for Mail Intake. + +This module defines the **provider-agnostic adapter interface** used for +read-only mail ingestion. + +Adapters encapsulate all provider-specific access logic and expose a +minimal, normalized contract to the rest of the system. No provider-specific +types or semantics should leak beyond implementations of this interface. +""" + from abc import ABC, abstractmethod from typing import Iterator, Dict, Any @@ -6,43 +17,60 @@ class MailIntakeAdapter(ABC): """ Base adapter interface for mail providers. - This interface defines the minimal contract required for - read-only mail ingestion. No provider-specific concepts - should leak beyond implementations of this class. + This interface defines the minimal contract required to: + - Discover messages matching a query + - Retrieve full message payloads + - Retrieve full thread payloads + + Adapters are intentionally read-only and must not mutate provider state. """ @abstractmethod def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]: """ - Iterate over lightweight message references. + Iterate over lightweight message references matching a query. - Must yield dictionaries containing at least: - - message_id - - thread_id + Implementations must yield dictionaries containing at least: + - ``message_id``: Provider-specific message identifier + - ``thread_id``: Provider-specific thread identifier + + Args: + query: Provider-specific query string used to filter messages. + + Yields: + Dictionaries containing message and thread identifiers. Example yield: - { - "message_id": "...", - "thread_id": "..." 
- } + { + "message_id": "...", + "thread_id": "..." + } """ raise NotImplementedError @abstractmethod def fetch_message(self, message_id: str) -> Dict[str, Any]: """ - Fetch a full raw message by message_id. + Fetch a full raw message by message identifier. - Returns the provider-native message payload - (e.g., Gmail message JSON). + Args: + message_id: Provider-specific message identifier. + + Returns: + Provider-native message payload + (e.g., Gmail message JSON structure). """ raise NotImplementedError @abstractmethod def fetch_thread(self, thread_id: str) -> Dict[str, Any]: """ - Fetch a full raw thread by thread_id. + Fetch a full raw thread by thread identifier. - Returns the provider-native thread payload. + Args: + thread_id: Provider-specific thread identifier. + + Returns: + Provider-native thread payload. """ raise NotImplementedError diff --git a/mail_intake/adapters/gmail.py b/mail_intake/adapters/gmail.py index d4c52dc..969dc86 100644 --- a/mail_intake/adapters/gmail.py +++ b/mail_intake/adapters/gmail.py @@ -1,3 +1,17 @@ +""" +Gmail adapter implementation for Mail Intake. + +This module provides a **Gmail-specific implementation** of the +`MailIntakeAdapter` contract. + +It is the only place in the codebase where: +- `googleapiclient` is imported +- Gmail REST API semantics are known +- Low-level `.execute()` calls are made + +All Gmail-specific behavior must be strictly contained within this module. +""" + from typing import Iterator, Dict, Any from googleapiclient.discovery import build @@ -12,12 +26,19 @@ class MailIntakeGmailAdapter(MailIntakeAdapter): """ Gmail read-only adapter. + This adapter implements the `MailIntakeAdapter` interface using the + Gmail REST API. It translates the generic mail intake contract into + Gmail-specific API calls. + This class is the ONLY place where: - googleapiclient is imported - Gmail REST semantics are known - .execute() is called - It must remain thin and dumb by design. 
+ Design constraints: + - Must remain thin and imperative + - Must not perform parsing or interpretation + - Must not expose Gmail-specific types beyond this class """ def __init__( @@ -25,12 +46,29 @@ class MailIntakeGmailAdapter(MailIntakeAdapter): auth_provider: MailIntakeAuthProvider, user_id: str = "me", ): + """ + Initialize the Gmail adapter. + + Args: + auth_provider: Authentication provider capable of supplying + valid Gmail API credentials. + user_id: Gmail user identifier. Defaults to `"me"`. + """ self._auth_provider = auth_provider self._user_id = user_id self._service = None @property def service(self): + """ + Lazily initialize and return the Gmail API service client. + + Returns: + Initialized Gmail API service instance. + + Raises: + MailIntakeAdapterError: If the Gmail service cannot be initialized. + """ if self._service is None: try: creds = self._auth_provider.get_credentials() @@ -45,11 +83,16 @@ class MailIntakeGmailAdapter(MailIntakeAdapter): """ Iterate over message references matching the query. + Args: + query: Gmail search query string. + Yields: - { - "message_id": "...", - "thread_id": "..." - } + Dictionaries containing: + - ``message_id``: Gmail message ID + - ``thread_id``: Gmail thread ID + + Raises: + MailIntakeAdapterError: If the Gmail API returns an error. """ try: request = ( @@ -79,6 +122,18 @@ class MailIntakeGmailAdapter(MailIntakeAdapter): ) from exc def fetch_message(self, message_id: str) -> Dict[str, Any]: + """ + Fetch a full Gmail message by message ID. + + Args: + message_id: Gmail message identifier. + + Returns: + Provider-native Gmail message payload. + + Raises: + MailIntakeAdapterError: If the Gmail API returns an error. + """ try: return ( self.service.users() @@ -92,6 +147,18 @@ class MailIntakeGmailAdapter(MailIntakeAdapter): ) from exc def fetch_thread(self, thread_id: str) -> Dict[str, Any]: + """ + Fetch a full Gmail thread by thread ID. + + Args: + thread_id: Gmail thread identifier. 
+ + Returns: + Provider-native Gmail thread payload. + + Raises: + MailIntakeAdapterError: If the Gmail API returns an error. + """ try: return ( self.service.users() diff --git a/mail_intake/auth/__init__.py b/mail_intake/auth/__init__.py index e69de29..342f482 100644 --- a/mail_intake/auth/__init__.py +++ b/mail_intake/auth/__init__.py @@ -0,0 +1,26 @@ +""" +Authentication provider implementations for Mail Intake. + +This package defines the **authentication layer** used by mail adapters +to obtain provider-specific credentials. + +It exposes: +- A stable, provider-agnostic authentication contract +- Concrete authentication providers for supported platforms + +Authentication providers: +- Are responsible for credential acquisition and lifecycle management +- Are intentionally decoupled from adapter logic +- May be extended by users to support additional providers + +Consumers should depend on the abstract interface and use concrete +implementations only where explicitly required. +""" + +from .base import MailIntakeAuthProvider +from .google import MailIntakeGoogleAuth + +__all__ = [ + "MailIntakeAuthProvider", + "MailIntakeGoogleAuth", +] diff --git a/mail_intake/auth/base.py b/mail_intake/auth/base.py index 526ff36..24cfcdb 100644 --- a/mail_intake/auth/base.py +++ b/mail_intake/auth/base.py @@ -1,3 +1,14 @@ +""" +Authentication provider contracts for Mail Intake. + +This module defines the **authentication abstraction layer** used by mail +adapters to obtain provider-specific credentials. + +Authentication concerns are intentionally decoupled from adapter logic. +Adapters depend only on this interface and must not be aware of how +credentials are acquired, refreshed, or stored. +""" + from abc import ABC, abstractmethod @@ -7,6 +18,17 @@ class MailIntakeAuthProvider(ABC): Mail adapters depend on this interface, not on concrete OAuth or credential implementations. 
+ + Authentication providers encapsulate all logic required to acquire + valid credentials for a mail provider. + + Implementations may involve: + - OAuth flows + - Service account credentials + - Token refresh logic + - Secure credential storage + + Adapters must treat the returned credentials as opaque and provider-specific. """ @abstractmethod @@ -16,5 +38,13 @@ class MailIntakeAuthProvider(ABC): This method is synchronous by design and must either return valid credentials or raise MailIntakeAuthError. + + Returns: + Provider-specific credentials object suitable for use by + the corresponding mail adapter. + + Raises: + MailIntakeAuthError: If valid credentials cannot be acquired by the + implementation. """ raise NotImplementedError diff --git a/mail_intake/auth/google.py b/mail_intake/auth/google.py index ec9a4f1..a6dcc3e 100644 --- a/mail_intake/auth/google.py +++ b/mail_intake/auth/google.py @@ -1,3 +1,17 @@ +""" +Google authentication provider implementation for Mail Intake. + +This module provides a **Google OAuth–based authentication provider** +used primarily for Gmail access. + +It encapsulates all Google-specific authentication concerns, including: +- Credential loading and persistence +- Token refresh handling +- Interactive OAuth flow initiation + +No Google authentication details should leak outside this module. +""" + import os import pickle from typing import Sequence @@ -14,12 +28,16 @@ class MailIntakeGoogleAuth(MailIntakeAuthProvider): """ Google OAuth provider for Gmail access. - Responsibilities: - - Load cached credentials from disk - - Refresh expired tokens when possible - - Trigger interactive login only when strictly required + This provider implements the `MailIntakeAuthProvider` interface using + Google's OAuth 2.0 flow and credential management libraries. - This class is synchronous and intentionally state-light. 
+ Responsibilities: + - Load cached credentials from disk when available + - Refresh expired credentials when possible + - Initiate an interactive OAuth flow only when required + - Persist refreshed or newly obtained credentials + + This class is synchronous by design and maintains a minimal internal state. """ def __init__( @@ -28,11 +46,36 @@ class MailIntakeGoogleAuth(MailIntakeAuthProvider): token_path: str, scopes: Sequence[str], ): + """ + Initialize the Google authentication provider. + + Args: + credentials_path: Path to the Google OAuth client secrets file. + token_path: Path where OAuth tokens will be cached. + scopes: OAuth scopes required for access. + """ self.credentials_path = credentials_path self.token_path = token_path self.scopes = list(scopes) def get_credentials(self): + """ + Retrieve valid Google OAuth credentials. + + This method attempts to: + 1. Load cached credentials from disk + 2. Refresh expired credentials when possible + 3. Perform an interactive OAuth login as a fallback + 4. Persist valid credentials for future use + + Returns: + Google OAuth credentials object suitable for use with + Google API clients. + + Raises: + MailIntakeAuthError: If credentials cannot be loaded, refreshed, + or obtained via interactive authentication. + """ creds = None # Attempt to load cached credentials diff --git a/mail_intake/config.py b/mail_intake/config.py index afd63c5..6898ceb 100644 --- a/mail_intake/config.py +++ b/mail_intake/config.py @@ -1,3 +1,14 @@ +""" +Global configuration models for Mail Intake. + +This module defines the **top-level configuration object** used to control +mail ingestion behavior across adapters, authentication providers, and +ingestion workflows. + +Configuration is intentionally explicit, immutable, and free of implicit +environment reads to ensure predictability and testability. 
+""" + from dataclasses import dataclass from typing import Optional @@ -9,12 +20,26 @@ class MailIntakeConfig: This configuration is intentionally explicit and immutable. No implicit environment reads or global state. + + Design principles: + - Immutable once constructed + - Explicit configuration over implicit defaults + - No direct environment or filesystem access + + This model is safe to pass across layers and suitable for serialization. """ provider: str = "gmail" - user_id: str = "me" - readonly: bool = True + """Identifier of the mail provider to use (e.g., ``"gmail"``).""" + + user_id: str = "me" + """Provider-specific user identifier. Defaults to the authenticated user.""" + + readonly: bool = True + """Whether ingestion should operate in read-only mode.""" - # Provider-specific paths (optional at this layer) credentials_path: Optional[str] = None + """Optional path to provider credentials configuration.""" + token_path: Optional[str] = None + """Optional path to persisted authentication tokens.""" diff --git a/mail_intake/exceptions.py b/mail_intake/exceptions.py index 43043f4..5bba3fb 100644 --- a/mail_intake/exceptions.py +++ b/mail_intake/exceptions.py @@ -1,19 +1,49 @@ +""" +Exception hierarchy for Mail Intake. + +This module defines the **canonical exception types** used throughout the +Mail Intake library. + +All library-raised errors derive from `MailIntakeError`. Consumers are +encouraged to catch this base type (or specific subclasses) rather than +provider-specific or third-party exceptions. +""" + + class MailIntakeError(Exception): """ - Base exception for all mail-intake errors. + Base exception for all Mail Intake errors. - Users of the library should catch this type (or subclasses) - instead of provider-specific or third-party exceptions. + This is the root of the Mail Intake exception hierarchy. + All errors raised by the library must derive from this class. 
+ + Consumers should generally catch this type when handling + library-level failures. """ class MailIntakeAuthError(MailIntakeError): - """Authentication and credential-related failures.""" + """ + Authentication and credential-related failures. + + Raised when authentication providers are unable to acquire, + refresh, or persist valid credentials. + """ class MailIntakeAdapterError(MailIntakeError): - """Errors raised by mail provider adapters.""" + """ + Errors raised by mail provider adapters. + + Raised when a provider adapter encounters API errors, + transport failures, or invalid provider responses. + """ class MailIntakeParsingError(MailIntakeError): - """Errors encountered while parsing message content.""" + """ + Errors encountered while parsing message content. + + Raised when raw provider payloads cannot be interpreted + or normalized into internal domain models. + """ diff --git a/mail_intake/ingestion/__init__.py b/mail_intake/ingestion/__init__.py index e69de29..54b10ac 100644 --- a/mail_intake/ingestion/__init__.py +++ b/mail_intake/ingestion/__init__.py @@ -0,0 +1,24 @@ +""" +Mail ingestion orchestration for Mail Intake. + +This package contains **high-level ingestion components** responsible for +coordinating mail retrieval, parsing, normalization, and model construction. + +It represents the **top of the ingestion pipeline** and is intended to be the +primary interaction surface for library consumers. + +Components in this package: +- Are provider-agnostic +- Depend only on adapter and parser contracts +- Contain no provider-specific API logic +- Expose read-only ingestion workflows + +Consumers are expected to construct a mail adapter and pass it to the +ingestion layer to begin processing messages and threads. 
+""" + +from .reader import MailIntakeReader + +__all__ = [ + "MailIntakeReader", +] diff --git a/mail_intake/ingestion/reader.py b/mail_intake/ingestion/reader.py index 186d296..6bec5b8 100644 --- a/mail_intake/ingestion/reader.py +++ b/mail_intake/ingestion/reader.py @@ -1,3 +1,18 @@ +""" +High-level mail ingestion orchestration for Mail Intake. + +This module provides the primary, provider-agnostic entry point for +reading and processing mail data. + +It coordinates: +- Mail adapter access +- Message and thread iteration +- Header and body parsing +- Normalization and model construction + +No provider-specific logic or API semantics are permitted in this layer. +""" + from datetime import datetime from typing import Iterator, Dict, Any @@ -14,22 +29,43 @@ class MailIntakeReader: """ High-level read-only ingestion interface. - This is the primary entry point users should interact with. - It orchestrates: - - adapter calls - - parsing - - normalization - - model construction + This class is the **primary entry point** for consumers of the Mail + Intake library. - No provider-specific logic exists here. + It orchestrates the full ingestion pipeline: + - Querying the adapter for message references + - Fetching raw provider messages + - Parsing and normalizing message data + - Constructing domain models + + This class is intentionally: + - Provider-agnostic + - Stateless beyond iteration scope + - Read-only """ def __init__(self, adapter: MailIntakeAdapter): + """ + Initialize the mail reader. + + Args: + adapter: Mail adapter implementation used to retrieve raw + messages and threads from a mail provider. + """ self._adapter = adapter def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]: """ Iterate over parsed messages matching a provider query. + + Args: + query: Provider-specific query string used to filter messages. + + Yields: + Fully parsed and normalized `MailIntakeMessage` instances. 
+ + Raises: + MailIntakeParsingError: If a message cannot be parsed. """ for ref in self._adapter.iter_message_refs(query): raw = self._adapter.fetch_message(ref["message_id"]) @@ -39,7 +75,17 @@ class MailIntakeReader: """ Iterate over threads constructed from messages matching a query. - Messages are grouped by thread_id and yielded as complete threads. + Messages are grouped by `thread_id` and yielded as complete thread + objects containing all associated messages. + + Args: + query: Provider-specific query string used to filter messages. + + Returns: + An iterator of `MailIntakeThread` instances. + + Raises: + MailIntakeParsingError: If a message cannot be parsed. """ threads: Dict[str, MailIntakeThread] = {} @@ -61,7 +107,17 @@ class MailIntakeReader: def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage: """ - Parse a raw provider message into a MailIntakeMessage. + Parse a raw provider message into a `MailIntakeMessage`. + + Args: + raw_message: Provider-native message payload. + + Returns: + A fully populated `MailIntakeMessage` instance. + + Raises: + MailIntakeParsingError: If the message payload is missing required + fields or cannot be parsed. """ try: message_id = raw_message["id"] diff --git a/mail_intake/models/__init__.py b/mail_intake/models/__init__.py index e69de29..47d17c3 100644 --- a/mail_intake/models/__init__.py +++ b/mail_intake/models/__init__.py @@ -0,0 +1,22 @@ +""" +Domain models for Mail Intake. + +This package defines the **canonical, provider-agnostic data models** +used throughout the Mail Intake ingestion pipeline. + +Models in this package: +- Represent fully parsed and normalized mail data +- Are safe to persist, serialize, and index +- Contain no provider-specific payloads or API semantics +- Serve as stable inputs for downstream processing and analysis + +These models form the core internal data contract of the library. 
+""" + +from .message import MailIntakeMessage +from .thread import MailIntakeThread + +__all__ = [ + "MailIntakeMessage", + "MailIntakeThread", +] diff --git a/mail_intake/models/message.py b/mail_intake/models/message.py index 2696d75..115b4eb 100644 --- a/mail_intake/models/message.py +++ b/mail_intake/models/message.py @@ -1,3 +1,14 @@ +""" +Message domain models for Mail Intake. + +This module defines the **canonical, provider-agnostic representation** +of an individual email message as used internally by the Mail Intake +ingestion pipeline. + +Models in this module are safe to persist and must not contain any +provider-specific fields or semantics. +""" + from dataclasses import dataclass from datetime import datetime from typing import Optional, Dict @@ -8,19 +19,37 @@ class MailIntakeMessage: """ Canonical internal representation of a single email message. - This model is provider-agnostic and safe to persist. - No Gmail-specific fields should appear here. + This model represents a fully parsed and normalized email message. + It is intentionally provider-agnostic and suitable for persistence, + indexing, and downstream processing. + + No provider-specific payloads or API semantics should appear in this + model; provider-issued identifiers are stored only as plain strings. 
""" message_id: str + """Provider-specific message identifier.""" + thread_id: str + """Provider-specific thread identifier to which this message belongs.""" + timestamp: datetime + """Message timestamp as a timezone-naive UTC datetime.""" from_email: str + """Sender email address.""" + from_name: Optional[str] + """Optional human-readable sender name.""" subject: str + """Raw subject line of the message.""" + body_text: str + """Extracted plain-text body content of the message.""" + snippet: str + """Short provider-supplied preview snippet of the message.""" raw_headers: Dict[str, str] + """Normalized mapping of message headers (header name → value).""" diff --git a/mail_intake/models/thread.py b/mail_intake/models/thread.py index ddcf757..6cd85c3 100644 --- a/mail_intake/models/thread.py +++ b/mail_intake/models/thread.py @@ -1,3 +1,13 @@ +""" +Thread domain models for Mail Intake. + +This module defines the **canonical, provider-agnostic representation** +of an email thread as used internally by the Mail Intake ingestion pipeline. + +Threads group related messages and serve as the primary unit of reasoning +for higher-level correspondence workflows. +""" + from dataclasses import dataclass, field from datetime import datetime from typing import List, Set @@ -10,21 +20,40 @@ class MailIntakeThread: """ Canonical internal representation of an email thread. - Threads are the primary unit of reasoning for correspondence - workflows (job applications, interviews, follow-ups, etc.). + A thread groups multiple related messages under a single subject + and participant set. It is designed to support reasoning over + conversational context such as job applications, interviews, + follow-ups, and ongoing discussions. + + This model is provider-agnostic and safe to persist. 
""" thread_id: str + """Provider-specific thread identifier.""" + normalized_subject: str + """Normalized subject line used to group related messages.""" participants: Set[str] = field(default_factory=set) + """Set of unique participant email addresses observed in the thread.""" + messages: List[MailIntakeMessage] = field(default_factory=list) + """Ordered list of messages belonging to this thread.""" last_activity_at: datetime | None = None + """Timestamp of the most recent message in the thread.""" def add_message(self, message: MailIntakeMessage) -> None: """ Add a message to the thread and update derived fields. + + This method: + - Appends the message to the thread + - Tracks unique participants + - Updates the last activity timestamp + + Args: + message: Parsed mail message to add to the thread. """ self.messages.append(message) diff --git a/mail_intake/parsers/__init__.py b/mail_intake/parsers/__init__.py index e69de29..f2d4855 100644 --- a/mail_intake/parsers/__init__.py +++ b/mail_intake/parsers/__init__.py @@ -0,0 +1,30 @@ +""" +Message parsing utilities for Mail Intake. + +This package contains **provider-aware but adapter-agnostic parsing helpers** +used to extract and normalize structured information from raw mail payloads. + +Parsers in this package are responsible for: +- Interpreting provider-native message structures +- Extracting meaningful fields such as headers, body text, and subjects +- Normalizing data into consistent internal representations + +This package does not: +- Perform network or IO operations +- Contain provider API logic +- Construct domain models directly + +Parsing functions are designed to be composable and are orchestrated by the +ingestion layer. 
+""" + +from .body import extract_body +from .headers import parse_headers, extract_sender +from .subject import normalize_subject + +__all__ = [ + "extract_body", + "parse_headers", + "extract_sender", + "normalize_subject", +] diff --git a/mail_intake/parsers/body.py b/mail_intake/parsers/body.py index 82e14de..2ef4877 100644 --- a/mail_intake/parsers/body.py +++ b/mail_intake/parsers/body.py @@ -1,3 +1,13 @@ +""" +Message body extraction utilities for Mail Intake. + +This module contains helper functions for extracting a best-effort +plain-text body from provider-native message payloads. + +The logic is intentionally tolerant of malformed or partial data and +prefers human-readable text over fidelity to original formatting. +""" + import base64 from typing import Dict, Any, Optional @@ -9,6 +19,18 @@ from mail_intake.exceptions import MailIntakeParsingError def _decode_base64(data: str) -> str: """ Decode Gmail URL-safe base64 payload into UTF-8 text. + + Gmail message bodies are encoded using URL-safe base64, which may + omit padding and use non-standard characters. + + Args: + data: URL-safe base64-encoded string. + + Returns: + Decoded UTF-8 text with replacement for invalid characters. + + Raises: + MailIntakeParsingError: If decoding fails. """ try: padded = data.replace("-", "+").replace("_", "/") @@ -21,6 +43,16 @@ def _decode_base64(data: str) -> str: def _extract_from_part(part: Dict[str, Any]) -> Optional[str]: """ Extract text content from a single MIME part. + + Supports: + - text/plain + - text/html (converted to plain text) + + Args: + part: MIME part dictionary from a provider payload. + + Returns: + Extracted plain-text content, or None if unsupported or empty. """ mime_type = part.get("mimeType") body = part.get("body", {}) @@ -49,7 +81,14 @@ def extract_body(payload: Dict[str, Any]) -> str: Priority: 1. text/plain 2. text/html (stripped to text) - 3. empty string (if nothing usable found) + 3. Single-part body + 4. 
empty string (if nothing usable found) + + Args: + payload: Provider-native message payload dictionary. + + Returns: + Extracted plain-text message body. """ if not payload: return "" diff --git a/mail_intake/parsers/headers.py b/mail_intake/parsers/headers.py index 7c0c929..74fdc87 100644 --- a/mail_intake/parsers/headers.py +++ b/mail_intake/parsers/headers.py @@ -1,3 +1,13 @@ +""" +Message header parsing utilities for Mail Intake. + +This module provides helper functions for normalizing and extracting +useful information from provider-native message headers. + +The functions here are intentionally simple and tolerant of malformed +or incomplete header data. +""" + from typing import Dict, List, Tuple, Optional @@ -5,19 +15,29 @@ def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]: """ Convert a list of Gmail-style headers into a normalized dict. - Input: - [ - {"name": "From", "value": "John Doe "}, - {"name": "Subject", "value": "Re: Interview Update"}, - ... - ] + Provider payloads (such as Gmail) typically represent headers as a list + of name/value mappings. This function normalizes them into a + case-insensitive dictionary keyed by lowercase header names. - Output: - { - "from": "...", - "subject": "...", - ... - } + Args: + raw_headers: List of header dictionaries, each containing + ``name`` and ``value`` keys. + + Returns: + Dictionary mapping lowercase header names to stripped values. + + Example: + Input: + [ + {"name": "From", "value": "John Doe "}, + {"name": "Subject", "value": "Re: Interview Update"}, + ] + + Output: + { + "from": "John Doe ", + "subject": "Re: Interview Update", + } """ headers: Dict[str, str] = {} @@ -37,18 +57,27 @@ def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]: """ Extract sender email and optional display name from headers. 
- Returns: - (email, name) + This function parses the ``From`` header and attempts to extract: + - Sender email address + - Optional human-readable display name - If name cannot be determined, name will be None. + Args: + headers: Normalized header dictionary as returned by + :func:`parse_headers`. + + Returns: + A tuple ``(email, name)`` where: + - ``email`` is the sender email address + - ``name`` is the display name, or ``None`` if unavailable + + Examples: + ``"John Doe "`` → ``("john@example.com", "John Doe")`` + ``"john@example.com"`` → ``("john@example.com", None)`` """ from_header = headers.get("from") if not from_header: return "", None - # Common forms: - # Name - # email@domain if "<" in from_header and ">" in from_header: name_part, email_part = from_header.split("<", 1) email = email_part.rstrip(">").strip() diff --git a/mail_intake/parsers/subject.py b/mail_intake/parsers/subject.py index df5041d..960a71f 100644 --- a/mail_intake/parsers/subject.py +++ b/mail_intake/parsers/subject.py @@ -1,7 +1,18 @@ +""" +Subject line normalization utilities for Mail Intake. + +This module provides helper functions for normalizing email subject lines +to enable reliable thread-level comparison and grouping. + +Normalization is intentionally conservative to avoid altering semantic +meaning while removing common reply and forward prefixes. +""" + import re _PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE) +"""Regular expression matching common reply/forward subject prefixes.""" def normalize_subject(subject: str) -> str: @@ -9,11 +20,19 @@ def normalize_subject(subject: str) -> str: Normalize an email subject for thread-level comparison. 
Operations: - - Strip common prefixes (Re:, Fwd:, FW:) - - Collapse whitespace - - Preserve original casing (no lowercasing) + - Strips common prefixes such as ``Re:``, ``Fwd:``, and ``FW:`` + - Repeats prefix stripping to handle stacked prefixes + - Collapses excessive whitespace + - Preserves original casing (no lowercasing) - This function is intentionally conservative. + This function is intentionally conservative and avoids aggressive + transformations that could alter the semantic meaning of the subject. + + Args: + subject: Raw subject line from a message header. + + Returns: + Normalized subject string suitable for thread grouping. """ if not subject: return ""