lib init

2026-01-03 05:21:55 +05:30
parent 278f0a3d40
commit 412a9c7bec
22 changed files with 950 additions and 0 deletions
--- a/mail_intake/parsers/init.py
+++ b/mail_intake/parsers/init.py
--- a/mail_intake/parsers/body.py
+++ b/mail_intake/parsers/body.py
@@ -0,0 +1,83 @@
+import base64
+from typing import Dict, Any, Optional
+
+from bs4 import BeautifulSoup
+
+from mail_intake.exceptions import MailIntakeParsingError
+
+
+def _decode_base64(data: str) -> str:
+    """
+    Decode a Gmail URL-safe base64 payload into UTF-8 text.
+
+    URL-safe base64 is often transmitted without its trailing ``=`` padding,
+    so the input is padded to a multiple of four characters before decoding.
+    Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
+    raising.
+
+    Args:
+        data: URL-safe base64 string (``-``/``_`` alphabet), padded or not.
+
+    Returns:
+        The decoded text.
+
+    Raises:
+        MailIntakeParsingError: If the payload cannot be base64-decoded.
+    """
+    try:
+        # b64decode rejects input whose length is not a multiple of four.
+        padded = data + "=" * (-len(data) % 4)
+        decoded = base64.urlsafe_b64decode(padded)
+    except (ValueError, TypeError) as exc:
+        # binascii.Error is a ValueError; TypeError covers non-string input.
+        raise MailIntakeParsingError("Failed to decode message body") from exc
+    return decoded.decode("utf-8", errors="replace")
+
+
+def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
+    """
+    Extract text content from a single MIME part.
+
+    Only ``text/plain`` and ``text/html`` parts are decoded; a part with any
+    other MIME type is skipped without touching its payload.
+
+    Args:
+        part: Gmail-style part dict; reads ``mimeType`` and ``body.data``.
+
+    Returns:
+        The decoded text (for HTML: tags removed, text fragments joined by
+        newlines), or None if the part has no data or is not a text type.
+
+    Raises:
+        MailIntakeParsingError: If a text part's data cannot be decoded.
+    """
+    mime_type = part.get("mimeType")
+    # Check the type first so non-text parts are never decoded needlessly
+    # and a malformed attachment cannot fail the whole message.
+    if mime_type not in ("text/plain", "text/html"):
+        return None
+
+    # "body" may be present but null; treat that the same as missing.
+    data = (part.get("body") or {}).get("data")
+    if not data:
+        return None
+
+    text = _decode_base64(data)
+    if mime_type == "text/plain":
+        return text
+
+    soup = BeautifulSoup(text, "html.parser")
+    return soup.get_text(separator="\n", strip=True)
+
+
+def extract_body(payload: Dict[str, Any]) -> str:
+    """
+    Extract the best-effort message body from a Gmail payload.
+
+    Parts are searched depth-first in document order, including parts nested
+    inside multipart containers (e.g. multipart/alternative inside
+    multipart/mixed).
+
+    Priority:
+    1. text/plain
+    2. text/html (stripped to text)
+    3. the payload's own body data, decoded as-is (unknown MIME type)
+    4. empty string (if nothing usable found)
+
+    Args:
+        payload: Gmail-style payload dict; reads ``parts``, ``mimeType`` and
+            ``body.data``. May be empty or None.
+
+    Returns:
+        The extracted body text, or "" if nothing usable is found.
+
+    Raises:
+        MailIntakeParsingError: If a selected part's data cannot be decoded.
+    """
+    if not payload:
+        return ""
+
+    first_html = None
+    # Explicit stack instead of recursion; children are pushed reversed so
+    # they are popped in their original order.
+    pending = list(reversed(payload.get("parts") or []))
+    while pending:
+        part = pending.pop()
+        pending.extend(reversed(part.get("parts") or []))
+
+        mime_type = part.get("mimeType")
+        if mime_type == "text/plain":
+            content = _extract_from_part(part)
+            if content:
+                # Highest priority: no need to look any further.
+                return content
+        elif mime_type == "text/html" and first_html is None:
+            # Keep only the first non-empty HTML part as the fallback.
+            first_html = _extract_from_part(part) or None
+
+    if first_html:
+        return first_html
+
+    # Single-part message, or a multipart one with no usable text part.
+    if payload.get("mimeType") in ("text/plain", "text/html"):
+        # Same handling as for nested parts, so HTML is stripped to text.
+        return _extract_from_part(payload) or ""
+
+    # Unknown or missing MIME type: fall back to the raw decoded body.
+    data = (payload.get("body") or {}).get("data")
+    if data:
+        return _decode_base64(data)
+
+    return ""
--- a/mail_intake/parsers/headers.py
+++ b/mail_intake/parsers/headers.py
@@ -0,0 +1,58 @@
+from typing import Dict, List, Tuple, Optional
+
+
+def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
+    """
+    Build a lowercase-keyed dict from a list of Gmail-style header entries.
+
+    Each entry is a mapping with "name" and "value" keys, so
+
+        [{"name": "Subject", "value": "Re: Interview Update"}]
+
+    becomes
+
+        {"subject": "Re: Interview Update"}
+
+    Entries with an empty or missing name, or with no value, are skipped.
+    Values are stripped of surrounding whitespace. When a header name
+    occurs more than once, the last occurrence wins. A None or empty input
+    yields an empty dict.
+    """
+    entries = raw_headers if raw_headers else []
+    pairs = ((entry.get("name"), entry.get("value")) for entry in entries)
+    return {
+        key.lower(): text.strip()
+        for key, text in pairs
+        if key and text is not None
+    }
+
+
+def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
+    """
+    Extract sender email and optional display name from headers.
+
+    Recognized forms of the From header:
+        Name <email@domain>
+        "Name" <email@domain>
+        email@domain
+
+    Args:
+        headers: Header dict keyed by lowercase name; reads "from".
+
+    Returns:
+        (email, name). ``email`` is "" when there is no From header;
+        ``name`` is None when no display name can be determined.
+    """
+    from_header = headers.get("from")
+    if not from_header:
+        return "", None
+
+    if "<" in from_header and ">" in from_header:
+        name_part, _, remainder = from_header.partition("<")
+        # Keep only what precedes the closing bracket so trailing whitespace
+        # or a comment after it ("<a@b> (work)") never leaks into the address.
+        email = remainder.partition(">")[0].strip()
+        name = name_part.strip().strip('"') or None
+        return email, name
+
+    return from_header.strip(), None
--- a/mail_intake/parsers/subject.py
+++ b/mail_intake/parsers/subject.py
@@ -0,0 +1,33 @@
+import re
+
+
+# One leading reply/forward marker ("Re:", "FW :", "fwd:") plus the
+# whitespace that follows it.
+_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
+
+
+def normalize_subject(subject: str) -> str:
+    """
+    Reduce an email subject to a form suitable for thread-level comparison.
+
+    Steps:
+    - Remove every leading reply/forward prefix (Re:, Fwd:, FW:), however
+      many are stacked
+    - Collapse each run of whitespace into a single space
+    - Leave letter casing untouched
+
+    An empty or None subject yields "". Nothing beyond the steps above is
+    altered.
+    """
+    if not subject:
+        return ""
+
+    text = subject.strip()
+
+    # Peel prefixes one at a time until none is left (e.g., Re: Fwd: Re:)
+    match = _PREFIX_RE.match(text)
+    while match is not None:
+        text = text[match.end():].strip()
+        match = _PREFIX_RE.match(text)
+
+    # split() with no arguments drops all whitespace runs
+    words = text.split()
+    return " ".join(words)