docs(mail_intake): add comprehensive docstrings across ingestion, adapters, auth, and parsing layers

- docs(mail_intake/__init__.py): document module-based public API and usage patterns
- docs(mail_intake/ingestion/reader.py): document high-level ingestion orchestration
- docs(mail_intake/adapters/base.py): document adapter contract for mail providers
- docs(mail_intake/adapters/gmail.py): document Gmail adapter implementation and constraints
- docs(mail_intake/auth/base.py): document authentication provider contract
- docs(mail_intake/auth/google.py): document Google OAuth authentication provider
- docs(mail_intake/models/message.py): document canonical email message model
- docs(mail_intake/models/thread.py): document canonical email thread model
- docs(mail_intake/parsers/body.py): document message body extraction logic
- docs(mail_intake/parsers/headers.py): document message header normalization utilities
- docs(mail_intake/parsers/subject.py): document subject normalization utilities
- docs(mail_intake/config.py): document global configuration model
- docs(mail_intake/exceptions.py): document library exception hierarchy
This commit is contained in:
2026-01-09 17:40:25 +05:30
parent dbfef295b8
commit f22af90e98
18 changed files with 751 additions and 71 deletions

View File

@@ -0,0 +1,30 @@
"""
Message parsing utilities for Mail Intake.
This package contains **provider-aware but adapter-agnostic parsing helpers**
used to extract and normalize structured information from raw mail payloads.
Parsers in this package are responsible for:
- Interpreting provider-native message structures
- Extracting meaningful fields such as headers, body text, and subjects
- Normalizing data into consistent internal representations
This package does not:
- Perform network or IO operations
- Contain provider API logic
- Construct domain models directly
Parsing functions are designed to be composable and are orchestrated by the
ingestion layer.
"""
from .body import extract_body
from .headers import parse_headers, extract_sender
from .subject import normalize_subject
# Public API of the parsers package: exactly the helpers imported above
# from the body, headers, and subject submodules.
__all__ = [
"extract_body",
"parse_headers",
"extract_sender",
"normalize_subject",
]

View File

@@ -1,3 +1,13 @@
"""
Message body extraction utilities for Mail Intake.
This module contains helper functions for extracting a best-effort
plain-text body from provider-native message payloads.
The logic is intentionally tolerant of malformed or partial data and
prefers human-readable text over fidelity to original formatting.
"""
import base64
from typing import Dict, Any, Optional
@@ -9,6 +19,18 @@ from mail_intake.exceptions import MailIntakeParsingError
def _decode_base64(data: str) -> str:
"""
Decode Gmail URL-safe base64 payload into UTF-8 text.
Gmail message bodies are encoded using URL-safe base64, which substitutes
``-`` and ``_`` for ``+`` and ``/`` and may omit trailing ``=`` padding.
Args:
data: URL-safe base64-encoded string.
Returns:
Decoded UTF-8 text with replacement for invalid characters.
Raises:
MailIntakeParsingError: If decoding fails.
"""
try:
padded = data.replace("-", "+").replace("_", "/")
@@ -21,6 +43,16 @@ def _decode_base64(data: str) -> str:
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
"""
Extract text content from a single MIME part.
Supports:
- text/plain
- text/html (converted to plain text)
Args:
part: MIME part dictionary from a provider payload.
Returns:
Extracted plain-text content, or None if unsupported or empty.
"""
mime_type = part.get("mimeType")
body = part.get("body", {})
@@ -49,7 +81,14 @@ def extract_body(payload: Dict[str, Any]) -> str:
Priority:
1. text/plain
2. text/html (stripped to text)
3. empty string (if nothing usable found)
3. Single-part body
4. empty string (if nothing usable found)
Args:
payload: Provider-native message payload dictionary.
Returns:
Extracted plain-text message body.
"""
if not payload:
return ""

View File

@@ -1,3 +1,13 @@
"""
Message header parsing utilities for Mail Intake.
This module provides helper functions for normalizing and extracting
useful information from provider-native message headers.
The functions here are intentionally simple and tolerant of malformed
or incomplete header data.
"""
from typing import Dict, List, Tuple, Optional
@@ -5,19 +15,29 @@ def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
"""
Convert a list of Gmail-style headers into a normalized dict.
Input:
[
{"name": "From", "value": "John Doe <john@example.com>"},
{"name": "Subject", "value": "Re: Interview Update"},
...
]
Provider payloads (such as Gmail) typically represent headers as a list
of name/value mappings. This function normalizes them into a plain
dictionary keyed by lowercase header names, so lookups must use lowercase keys.
Output:
{
"from": "...",
"subject": "...",
...
}
Args:
raw_headers: List of header dictionaries, each containing
``name`` and ``value`` keys.
Returns:
Dictionary mapping lowercase header names to stripped values.
Example:
Input:
[
{"name": "From", "value": "John Doe <john@example.com>"},
{"name": "Subject", "value": "Re: Interview Update"},
]
Output:
{
"from": "John Doe <john@example.com>",
"subject": "Re: Interview Update",
}
"""
headers: Dict[str, str] = {}
@@ -37,18 +57,27 @@ def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
"""
Extract sender email and optional display name from headers.
Returns:
(email, name)
This function parses the ``From`` header and attempts to extract:
- Sender email address
- Optional human-readable display name
If name cannot be determined, name will be None.
Args:
headers: Normalized header dictionary as returned by
:func:`parse_headers`.
Returns:
A tuple ``(email, name)`` where:
- ``email`` is the sender email address, or an empty string if the ``From`` header is missing or empty
- ``name`` is the display name, or ``None`` if unavailable
Examples:
``"John Doe <john@example.com>"`` → ``("john@example.com", "John Doe")``
``"john@example.com"`` → ``("john@example.com", None)``
"""
from_header = headers.get("from")
if not from_header:
return "", None
# Common forms:
# Name <email@domain>
# email@domain
if "<" in from_header and ">" in from_header:
name_part, email_part = from_header.split("<", 1)
email = email_part.rstrip(">").strip()

View File

@@ -1,7 +1,18 @@
"""
Subject line normalization utilities for Mail Intake.
This module provides helper functions for normalizing email subject lines
to enable reliable thread-level comparison and grouping.
Normalization is intentionally conservative to avoid altering semantic
meaning while removing common reply and forward prefixes.
"""
import re
# Anchored at the start of the string and compiled case-insensitively: matches
# a single "re", "fw", or "fwd" token, optional whitespace, a colon, and any
# whitespace that follows it.
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
"""Regular expression matching common reply/forward subject prefixes."""
def normalize_subject(subject: str) -> str:
@@ -9,11 +20,19 @@ def normalize_subject(subject: str) -> str:
Normalize an email subject for thread-level comparison.
Operations:
- Strip common prefixes (Re:, Fwd:, FW:)
- Collapse whitespace
- Preserve original casing (no lowercasing)
- Strips common prefixes such as ``Re:``, ``Fwd:``, and ``FW:``
- Repeats prefix stripping to handle stacked prefixes
- Collapses excessive whitespace
- Preserves original casing (no lowercasing)
This function is intentionally conservative.
This function is intentionally conservative and avoids aggressive
transformations that could alter the semantic meaning of the subject.
Args:
subject: Raw subject line from a message header.
Returns:
Normalized subject string suitable for thread grouping.
"""
if not subject:
return ""