""" Message body extraction utilities for Mail Intake. This module contains helper functions for extracting a best-effort plain-text body from provider-native message payloads. The logic is intentionally tolerant of malformed or partial data and prefers human-readable text over fidelity to original formatting. """ import base64 from typing import Dict, Any, Optional from bs4 import BeautifulSoup from mail_intake.exceptions import MailIntakeParsingError def _decode_base64(data: str) -> str: """ Decode Gmail URL-safe base64 payload into UTF-8 text. Gmail message bodies are encoded using URL-safe base64, which may omit padding and use non-standard characters. Args: data: URL-safe base64-encoded string. Returns: Decoded UTF-8 text with replacement for invalid characters. Raises: MailIntakeParsingError: If decoding fails. """ try: padded = data.replace("-", "+").replace("_", "/") decoded = base64.b64decode(padded) return decoded.decode("utf-8", errors="replace") except Exception as exc: raise MailIntakeParsingError("Failed to decode message body") from exc def _extract_from_part(part: Dict[str, Any]) -> Optional[str]: """ Extract text content from a single MIME part. Supports: - text/plain - text/html (converted to plain text) Args: part: MIME part dictionary from a provider payload. Returns: Extracted plain-text content, or None if unsupported or empty. """ mime_type = part.get("mimeType") body = part.get("body", {}) data = body.get("data") if not data: return None text = _decode_base64(data) if mime_type == "text/plain": return text if mime_type == "text/html": # soup = BeautifulSoup(text, "lxml") soup = BeautifulSoup(text, "html.parser") return soup.get_text(separator="\n", strip=True) return None def extract_body(payload: Dict[str, Any]) -> str: """ Extract the best-effort message body from a Gmail payload. Priority: 1. text/plain 2. text/html (stripped to text) 3. Single-part body 4. empty string (if nothing usable found) Args: payload: Provider-native message payload dictionary. Returns: Extracted plain-text message body. """ if not payload: return "" # Multipart message if "parts" in payload: text_plain = None text_html = None for part in payload.get("parts", []): content = _extract_from_part(part) if not content: continue if part.get("mimeType") == "text/plain" and text_plain is None: text_plain = content elif part.get("mimeType") == "text/html" and text_html is None: text_html = content if text_plain: return text_plain if text_html: return text_html # Single-part message body = payload.get("body", {}) data = body.get("data") if data: return _decode_base64(data) return ""