docs(mail_intake): add comprehensive docstrings across ingestion, adapters, auth, and parsing layers
- docs(mail_intake/__init__.py): document module-based public API and usage patterns
- docs(mail_intake/ingestion/reader.py): document high-level ingestion orchestration
- docs(mail_intake/adapters/base.py): document adapter contract for mail providers
- docs(mail_intake/adapters/gmail.py): document Gmail adapter implementation and constraints
- docs(mail_intake/auth/base.py): document authentication provider contract
- docs(mail_intake/auth/google.py): document Google OAuth authentication provider
- docs(mail_intake/models/message.py): document canonical email message model
- docs(mail_intake/models/thread.py): document canonical email thread model
- docs(mail_intake/parsers/body.py): document message body extraction logic
- docs(mail_intake/parsers/headers.py): document message header normalization utilities
- docs(mail_intake/parsers/subject.py): document subject normalization utilities
- docs(mail_intake/config.py): document global configuration model
- docs(mail_intake/exceptions.py): document library exception hierarchy
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
"""
|
||||
Message parsing utilities for Mail Intake.
|
||||
|
||||
This package contains **provider-aware but adapter-agnostic parsing helpers**
|
||||
used to extract and normalize structured information from raw mail payloads.
|
||||
|
||||
Parsers in this package are responsible for:
|
||||
- Interpreting provider-native message structures
|
||||
- Extracting meaningful fields such as headers, body text, and subjects
|
||||
- Normalizing data into consistent internal representations
|
||||
|
||||
This package does not:
|
||||
- Perform network or IO operations
|
||||
- Contain provider API logic
|
||||
- Construct domain models directly
|
||||
|
||||
Parsing functions are designed to be composable and are orchestrated by the
|
||||
ingestion layer.
|
||||
"""
|
||||
|
||||
from .body import extract_body
|
||||
from .headers import parse_headers, extract_sender
|
||||
from .subject import normalize_subject
|
||||
|
||||
__all__ = [
|
||||
"extract_body",
|
||||
"parse_headers",
|
||||
"extract_sender",
|
||||
"normalize_subject",
|
||||
]
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
"""
|
||||
Message body extraction utilities for Mail Intake.
|
||||
|
||||
This module contains helper functions for extracting a best-effort
|
||||
plain-text body from provider-native message payloads.
|
||||
|
||||
The logic is intentionally tolerant of malformed or partial data and
|
||||
prefers human-readable text over fidelity to original formatting.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
@@ -9,6 +19,18 @@ from mail_intake.exceptions import MailIntakeParsingError
|
||||
def _decode_base64(data: str) -> str:
|
||||
"""
|
||||
Decode Gmail URL-safe base64 payload into UTF-8 text.
|
||||
|
||||
Gmail message bodies are encoded using URL-safe base64, which may
|
||||
omit padding and use non-standard characters.
|
||||
|
||||
Args:
|
||||
data: URL-safe base64-encoded string.
|
||||
|
||||
Returns:
|
||||
Decoded UTF-8 text with replacement for invalid characters.
|
||||
|
||||
Raises:
|
||||
MailIntakeParsingError: If decoding fails.
|
||||
"""
|
||||
try:
|
||||
padded = data.replace("-", "+").replace("_", "/")
|
||||
@@ -21,6 +43,16 @@ def _decode_base64(data: str) -> str:
|
||||
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
|
||||
"""
|
||||
Extract text content from a single MIME part.
|
||||
|
||||
Supports:
|
||||
- text/plain
|
||||
- text/html (converted to plain text)
|
||||
|
||||
Args:
|
||||
part: MIME part dictionary from a provider payload.
|
||||
|
||||
Returns:
|
||||
Extracted plain-text content, or None if unsupported or empty.
|
||||
"""
|
||||
mime_type = part.get("mimeType")
|
||||
body = part.get("body", {})
|
||||
@@ -49,7 +81,14 @@ def extract_body(payload: Dict[str, Any]) -> str:
|
||||
Priority:
|
||||
1. text/plain
|
||||
2. text/html (stripped to text)
|
||||
3. empty string (if nothing usable found)
|
||||
3. Single-part body
|
||||
4. empty string (if nothing usable found)
|
||||
|
||||
Args:
|
||||
payload: Provider-native message payload dictionary.
|
||||
|
||||
Returns:
|
||||
Extracted plain-text message body.
|
||||
"""
|
||||
if not payload:
|
||||
return ""
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
"""
|
||||
Message header parsing utilities for Mail Intake.
|
||||
|
||||
This module provides helper functions for normalizing and extracting
|
||||
useful information from provider-native message headers.
|
||||
|
||||
The functions here are intentionally simple and tolerant of malformed
|
||||
or incomplete header data.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
|
||||
@@ -5,19 +15,29 @@ def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
|
||||
"""
|
||||
Convert a list of Gmail-style headers into a normalized dict.
|
||||
|
||||
Input:
|
||||
[
|
||||
{"name": "From", "value": "John Doe <john@example.com>"},
|
||||
{"name": "Subject", "value": "Re: Interview Update"},
|
||||
...
|
||||
]
|
||||
Provider payloads (such as Gmail) typically represent headers as a list
|
||||
of name/value mappings. This function normalizes them into a
|
||||
case-insensitive dictionary keyed by lowercase header names.
|
||||
|
||||
Output:
|
||||
{
|
||||
"from": "...",
|
||||
"subject": "...",
|
||||
...
|
||||
}
|
||||
Args:
|
||||
raw_headers: List of header dictionaries, each containing
|
||||
``name`` and ``value`` keys.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping lowercase header names to stripped values.
|
||||
|
||||
Example:
|
||||
Input:
|
||||
[
|
||||
{"name": "From", "value": "John Doe <john@example.com>"},
|
||||
{"name": "Subject", "value": "Re: Interview Update"},
|
||||
]
|
||||
|
||||
Output:
|
||||
{
|
||||
"from": "John Doe <john@example.com>",
|
||||
"subject": "Re: Interview Update",
|
||||
}
|
||||
"""
|
||||
headers: Dict[str, str] = {}
|
||||
|
||||
@@ -37,18 +57,27 @@ def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
|
||||
"""
|
||||
Extract sender email and optional display name from headers.
|
||||
|
||||
Returns:
|
||||
(email, name)
|
||||
This function parses the ``From`` header and attempts to extract:
|
||||
- Sender email address
|
||||
- Optional human-readable display name
|
||||
|
||||
If name cannot be determined, name will be None.
|
||||
Args:
|
||||
headers: Normalized header dictionary as returned by
|
||||
:func:`parse_headers`.
|
||||
|
||||
Returns:
|
||||
A tuple ``(email, name)`` where:
|
||||
- ``email`` is the sender email address
|
||||
- ``name`` is the display name, or ``None`` if unavailable
|
||||
|
||||
Examples:
|
||||
``"John Doe <john@example.com>"`` → ``("john@example.com", "John Doe")``
|
||||
``"john@example.com"`` → ``("john@example.com", None)``
|
||||
"""
|
||||
from_header = headers.get("from")
|
||||
if not from_header:
|
||||
return "", None
|
||||
|
||||
# Common forms:
|
||||
# Name <email@domain>
|
||||
# email@domain
|
||||
if "<" in from_header and ">" in from_header:
|
||||
name_part, email_part = from_header.split("<", 1)
|
||||
email = email_part.rstrip(">").strip()
|
||||
|
||||
@@ -1,7 +1,18 @@
|
||||
"""
|
||||
Subject line normalization utilities for Mail Intake.
|
||||
|
||||
This module provides helper functions for normalizing email subject lines
|
||||
to enable reliable thread-level comparison and grouping.
|
||||
|
||||
Normalization is intentionally conservative to avoid altering semantic
|
||||
meaning while removing common reply and forward prefixes.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
|
||||
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
|
||||
"""Regular expression matching common reply/forward subject prefixes."""
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
|
||||
@@ -9,11 +20,19 @@ def normalize_subject(subject: str) -> str:
|
||||
Normalize an email subject for thread-level comparison.
|
||||
|
||||
Operations:
|
||||
- Strip common prefixes (Re:, Fwd:, FW:)
|
||||
- Collapse whitespace
|
||||
- Preserve original casing (no lowercasing)
|
||||
- Strips common prefixes such as ``Re:``, ``Fwd:``, and ``FW:``
|
||||
- Repeats prefix stripping to handle stacked prefixes
|
||||
- Collapses excessive whitespace
|
||||
- Preserves original casing (no lowercasing)
|
||||
|
||||
This function is intentionally conservative.
|
||||
This function is intentionally conservative and avoids aggressive
|
||||
transformations that could alter the semantic meaning of the subject.
|
||||
|
||||
Args:
|
||||
subject: Raw subject line from a message header.
|
||||
|
||||
Returns:
|
||||
Normalized subject string suitable for thread grouping.
|
||||
"""
|
||||
if not subject:
|
||||
return ""
|
||||
|
||||
Reference in New Issue
Block a user