134 lines
3.2 KiB
Python
134 lines
3.2 KiB
Python
"""
|
|
# Summary

Message body extraction utilities for Mail Intake.

This module contains helper functions for extracting a best-effort
plain-text body from provider-native message payloads.

The logic is intentionally tolerant of malformed or partial data and
prefers human-readable text over fidelity to original formatting.
|
|
"""
|
|
|
|
import base64
|
|
from typing import Dict, Any, Optional
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from mail_intake.exceptions import MailIntakeParsingError
|
|
|
|
|
|
def _decode_base64(data: str) -> str:
|
|
"""
|
|
Decode Gmail URL-safe base64 payload into UTF-8 text.
|
|
|
|
Gmail message bodies are encoded using URL-safe base64, which may
|
|
omit padding and use non-standard characters.
|
|
|
|
Args:
|
|
data (str):
|
|
URL-safe base64-encoded string.
|
|
|
|
Returns:
|
|
str:
|
|
Decoded UTF-8 text with replacement for invalid characters.
|
|
|
|
Raises:
|
|
MailIntakeParsingError:
|
|
If decoding fails.
|
|
"""
|
|
try:
|
|
padded = data.replace("-", "+").replace("_", "/")
|
|
decoded = base64.b64decode(padded)
|
|
return decoded.decode("utf-8", errors="replace")
|
|
except Exception as exc:
|
|
raise MailIntakeParsingError("Failed to decode message body") from exc
|
|
|
|
|
|
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
|
|
"""
|
|
Extract text content from a single MIME part.
|
|
|
|
Supports:
|
|
|
|
- `text/plain`
|
|
- `text/html` (converted to plain text)
|
|
|
|
Args:
|
|
part (Dict[str, Any]):
|
|
MIME part dictionary from a provider payload.
|
|
|
|
Returns:
|
|
Optional[str]:
|
|
Extracted plain-text content, or `None` if unsupported or empty.
|
|
"""
|
|
mime_type = part.get("mimeType")
|
|
body = part.get("body", {})
|
|
data = body.get("data")
|
|
|
|
if not data:
|
|
return None
|
|
|
|
text = _decode_base64(data)
|
|
|
|
if mime_type == "text/plain":
|
|
return text
|
|
|
|
if mime_type == "text/html":
|
|
# soup = BeautifulSoup(text, "lxml")
|
|
soup = BeautifulSoup(text, "html.parser")
|
|
return soup.get_text(separator="\n", strip=True)
|
|
|
|
return None
|
|
|
|
|
|
def _collect_text_parts(parts: Any, found: Dict[str, str]) -> None:
    """
    Record the first non-empty `text/plain` and `text/html` bodies.

    Walks `parts` depth-first in document order, descending into nested
    multipart containers, and stores each match in `found` keyed by MIME
    type. The walk stops as soon as a `text/plain` body is found.

    Args:
        parts (Any):
            Value of a payload's `parts` field; anything other than a
            list or tuple is ignored.

        found (Dict[str, str]):
            Accumulator mapping MIME type to extracted text; mutated
            in place.
    """
    if not isinstance(parts, (list, tuple)):
        return

    for part in parts:
        if not isinstance(part, dict):
            continue

        mime_type = part.get("mimeType")
        # Only text parts are handed to the extractor, so attachments
        # are never decoded.
        if mime_type in ("text/plain", "text/html") and mime_type not in found:
            content = _extract_from_part(part)
            if content:
                found[mime_type] = content

        if "text/plain" not in found:
            # Nested containers keep their children under their own
            # "parts" key.
            _collect_text_parts(part.get("parts"), found)

        if "text/plain" in found:
            # Highest-priority body located; nothing later can beat it.
            return


def extract_body(payload: Dict[str, Any]) -> str:
    """
    Extract the best-effort message body from a Gmail payload.

    Priority:

    1. `text/plain`
    2. `text/html` (stripped to text)
    3. Single-part body
    4. Empty string (if nothing usable found)

    Multipart payloads are searched recursively, so text parts nested
    inside container parts are found as well.

    Args:
        payload (Dict[str, Any]):
            Provider-native message payload dictionary.

    Returns:
        str:
            Extracted plain-text message body.

    Raises:
        MailIntakeParsingError:
            If the selected body data cannot be decoded.
    """
    if not payload:
        return ""

    # Multipart message
    if "parts" in payload:
        found: Dict[str, str] = {}
        _collect_text_parts(payload.get("parts"), found)
        for mime_type in ("text/plain", "text/html"):
            if mime_type in found:
                return found[mime_type]

    # Single-part message (also the fallback when no text part was found).
    # `or {}` also covers an explicit `"body": None`.
    body = payload.get("body") or {}
    data = body.get("data")
    if data:
        return _decode_base64(data)

    return ""
|