docs(mail_intake): add comprehensive docstrings across ingestion, adapters, auth, and parsing layers

- docs(mail_intake/__init__.py): document module-based public API and usage patterns
- docs(mail_intake/ingestion/reader.py): document high-level ingestion orchestration
- docs(mail_intake/adapters/base.py): document adapter contract for mail providers
- docs(mail_intake/adapters/gmail.py): document Gmail adapter implementation and constraints
- docs(mail_intake/auth/base.py): document authentication provider contract
- docs(mail_intake/auth/google.py): document Google OAuth authentication provider
- docs(mail_intake/models/message.py): document canonical email message model
- docs(mail_intake/models/thread.py): document canonical email thread model
- docs(mail_intake/parsers/body.py): document message body extraction logic
- docs(mail_intake/parsers/headers.py): document message header normalization utilities
- docs(mail_intake/parsers/subject.py): document subject normalization utilities
- docs(mail_intake/config.py): document global configuration model
- docs(mail_intake/exceptions.py): document library exception hierarchy
2026-01-09 17:40:25 +05:30
parent dbfef295b8
commit f22af90e98
18 changed files with 751 additions and 71 deletions
--- a/mail_intake/parsers/__init__.py
+++ b/mail_intake/parsers/__init__.py
@@ -0,0 +1,30 @@
+"""
+Message parsing utilities for Mail Intake.
+
+This package contains **provider-aware but adapter-agnostic parsing helpers**
+used to extract and normalize structured information from raw mail payloads.
+
+Parsers in this package are responsible for:
+- Interpreting provider-native message structures
+- Extracting meaningful fields such as headers, body text, and subjects
+- Normalizing data into consistent internal representations
+
+This package does not:
+- Perform network or IO operations
+- Contain provider API logic
+- Construct domain models directly
+
+Parsing functions are designed to be composable and are orchestrated by the
+ingestion layer.
+"""
+
+from .body import extract_body
+from .headers import parse_headers, extract_sender
+from .subject import normalize_subject
+
+__all__ = [
+    "extract_body",
+    "parse_headers",
+    "extract_sender",
+    "normalize_subject",
+]
--- a/mail_intake/parsers/body.py
+++ b/mail_intake/parsers/body.py
@@ -1,3 +1,13 @@
+"""
+Message body extraction utilities for Mail Intake.
+
+This module contains helper functions for extracting a best-effort
+plain-text body from provider-native message payloads.
+
+The logic is intentionally tolerant of malformed or partial data and
+prefers human-readable text over fidelity to original formatting.
+"""
+
 import base64
 from typing import Dict, Any, Optional

@@ -9,6 +19,18 @@ from mail_intake.exceptions import MailIntakeParsingError
 def _decode_base64(data: str) -> str:
    """
    Decode Gmail URL-safe base64 payload into UTF-8 text.
+
+    Gmail message bodies are encoded using URL-safe base64, which uses
+    ``-`` and ``_`` in place of ``+`` and ``/`` and may omit padding.
+
+    Args:
+        data: URL-safe base64-encoded string.
+
+    Returns:
+        Decoded UTF-8 text with replacement for invalid characters.
+
+    Raises:
+        MailIntakeParsingError: If decoding fails.
    """
    try:
        padded = data.replace("-", "+").replace("_", "/")
@@ -21,6 +43,16 @@ def _decode_base64(data: str) -> str:
 def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
    """
    Extract text content from a single MIME part.
+
+    Supports:
+    - text/plain
+    - text/html (converted to plain text)
+
+    Args:
+        part: MIME part dictionary from a provider payload.
+
+    Returns:
+        Extracted plain-text content, or None if unsupported or empty.
    """
    mime_type = part.get("mimeType")
    body = part.get("body", {})
@@ -49,7 +81,14 @@ def extract_body(payload: Dict[str, Any]) -> str:
    Priority:
    1. text/plain
    2. text/html (stripped to text)
-    3. empty string (if nothing usable found)
+    3. Single-part body
+    4. Empty string (if nothing usable found)
+
+    Args:
+        payload: Provider-native message payload dictionary.
+
+    Returns:
+        Extracted plain-text message body.
    """
    if not payload:
        return ""
--- a/mail_intake/parsers/headers.py
+++ b/mail_intake/parsers/headers.py
@@ -1,3 +1,13 @@
+"""
+Message header parsing utilities for Mail Intake.
+
+This module provides helper functions for normalizing and extracting
+useful information from provider-native message headers.
+
+The functions here are intentionally simple and tolerant of malformed
+or incomplete header data.
+"""
+
 from typing import Dict, List, Tuple, Optional


@@ -5,19 +15,29 @@ def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
    """
    Convert a list of Gmail-style headers into a normalized dict.

-    Input:
-    [
-        {"name": "From", "value": "John Doe <john@example.com>"},
-        {"name": "Subject", "value": "Re: Interview Update"},
-        ...
-    ]
+    Provider payloads (such as Gmail) typically represent headers as a list
+    of name/value mappings. This function normalizes them into a plain
+    dictionary keyed by lowercase header names; look up keys in lowercase.

-    Output:
-    {
-        "from": "...",
-        "subject": "...",
-        ...
-    }
+    Args:
+        raw_headers: List of header dictionaries, each containing
+            ``name`` and ``value`` keys.
+
+    Returns:
+        Dictionary mapping lowercase header names to stripped values.
+
+    Example:
+        Input:
+            [
+                {"name": "From", "value": "John Doe <john@example.com>"},
+                {"name": "Subject", "value": "Re: Interview Update"},
+            ]
+
+        Output:
+            {
+                "from": "John Doe <john@example.com>",
+                "subject": "Re: Interview Update",
+            }
    """
    headers: Dict[str, str] = {}

@@ -37,18 +57,27 @@ def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
    """
    Extract sender email and optional display name from headers.

-    Returns:
-        (email, name)
+    This function parses the ``From`` header and attempts to extract:
+    - Sender email address
+    - Optional human-readable display name

-    If name cannot be determined, name will be None.
+    Args:
+        headers: Normalized header dictionary as returned by
+            :func:`parse_headers`.
+
+    Returns:
+        A tuple ``(email, name)`` where:
+        - ``email`` is the sender email address
+        - ``name`` is the display name, or ``None`` if unavailable
+
+    Examples:
+        ``"John Doe <john@example.com>"`` → ``("john@example.com", "John Doe")``
+        ``"john@example.com"`` → ``("john@example.com", None)``
    """
    from_header = headers.get("from")
    if not from_header:
        return "", None

-    # Common forms:
-    #   Name <email@domain>
-    #   email@domain
    if "<" in from_header and ">" in from_header:
        name_part, email_part = from_header.split("<", 1)
        email = email_part.rstrip(">").strip()
--- a/mail_intake/parsers/subject.py
+++ b/mail_intake/parsers/subject.py
@@ -1,7 +1,18 @@
+"""
+Subject line normalization utilities for Mail Intake.
+
+This module provides helper functions for normalizing email subject lines
+to enable reliable thread-level comparison and grouping.
+
+Normalization is intentionally conservative to avoid altering semantic
+meaning while removing common reply and forward prefixes.
+"""
+
 import re


 _PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
+"""Regular expression matching common reply/forward subject prefixes."""


 def normalize_subject(subject: str) -> str:
@@ -9,11 +20,19 @@ def normalize_subject(subject: str) -> str:
    Normalize an email subject for thread-level comparison.

    Operations:
-    - Strip common prefixes (Re:, Fwd:, FW:)
-    - Collapse whitespace
-    - Preserve original casing (no lowercasing)
+    - Strips common prefixes such as ``Re:``, ``Fwd:``, and ``FW:``
+    - Repeats prefix stripping to handle stacked prefixes
+    - Collapses excessive whitespace
+    - Preserves original casing (no lowercasing)

-    This function is intentionally conservative.
+    This function is intentionally conservative and avoids aggressive
+    transformations that could alter the semantic meaning of the subject.
+
+    Args:
+        subject: Raw subject line from a message header.
+
+    Returns:
+        Normalized subject string suitable for thread grouping.
    """
    if not subject:
        return ""