This commit is contained in:
2026-01-03 05:21:55 +05:30
parent 278f0a3d40
commit 412a9c7bec
22 changed files with 950 additions and 0 deletions

View File

View File

@@ -0,0 +1,83 @@
import base64
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from mail_intake.exceptions import MailIntakeParsingError
def _decode_base64(data: str) -> str:
    """
    Decode Gmail URL-safe base64 payload into UTF-8 text.

    The input uses the URL-safe alphabet ("-" and "_" instead of "+" and
    "/"). Trailing "=" padding is optional in that encoding, so it is
    restored here before decoding.

    Args:
        data: URL-safe base64 text, with or without trailing padding.

    Returns:
        The decoded text. Byte sequences that are not valid UTF-8 are
        replaced with U+FFFD rather than raising.

    Raises:
        MailIntakeParsingError: If the input cannot be base64-decoded.
    """
    try:
        # Pad up to the next multiple of four; a no-op when the length is
        # already aligned.
        padded = data + "=" * (-len(data) % 4)
        decoded = base64.urlsafe_b64decode(padded)
    except (ValueError, TypeError) as exc:
        # binascii.Error is a ValueError subclass; TypeError covers
        # non-string input such as None.
        raise MailIntakeParsingError("Failed to decode message body") from exc
    return decoded.decode("utf-8", errors="replace")
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
    """
    Extract text content from a single MIME part.

    Only ``text/plain`` and ``text/html`` parts are decoded. Any other
    MIME type is skipped before decoding, so a non-text part never
    triggers a decode error for content that would be discarded anyway.

    Args:
        part: A message part mapping with optional ``mimeType`` and
            ``body`` keys; the encoded content is read from
            ``body["data"]``.

    Returns:
        The decoded text for ``text/plain``, the tag-stripped text for
        ``text/html``, or None when the part has no data or is not a
        supported text type.

    Raises:
        MailIntakeParsingError: Propagated from ``_decode_base64`` when a
            text part's data cannot be decoded.
    """
    mime_type = part.get("mimeType")
    if mime_type not in ("text/plain", "text/html"):
        return None

    # "body" may be absent or explicitly None; treat both as empty.
    data = (part.get("body") or {}).get("data")
    if not data:
        return None

    text = _decode_base64(data)
    if mime_type == "text/html":
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text(separator="\n", strip=True)
    return text
def extract_body(payload: Dict[str, Any]) -> str:
    """
    Extract the best-effort message body from a Gmail payload.

    Priority:
    1. text/plain
    2. text/html (stripped to text)
    3. empty string (if nothing usable found)

    Multipart payloads are searched depth-first in document order, so
    text parts nested inside container parts (for example an alternative
    section inside a mixed message) are found as well.

    Args:
        payload: The message payload mapping. It may carry a ``parts``
            list (multipart) and/or a ``body`` mapping with ``data``.

    Returns:
        The first non-empty text/plain content, otherwise the first
        non-empty text/html content stripped to text, otherwise the
        decoded top-level body, otherwise "".

    Raises:
        MailIntakeParsingError: Propagated from the decoding helpers when
            an inspected part cannot be decoded.
    """
    if not payload:
        return ""

    text_plain = None
    text_html = None

    # Explicit stack, reversed on push so parts pop in document order.
    pending = list(reversed(payload.get("parts") or []))
    while pending:
        part = pending.pop()
        if "parts" in part:
            # Container part: descend into its children.
            pending.extend(reversed(part.get("parts") or []))
            continue

        content = _extract_from_part(part)
        if not content:
            continue

        mime_type = part.get("mimeType")
        if mime_type == "text/plain":
            # Highest priority; nothing later can beat it.
            text_plain = content
            break
        if mime_type == "text/html" and text_html is None:
            text_html = content

    if text_plain:
        return text_plain
    if text_html:
        return text_html

    # Single-part message (or multipart with no usable text part).
    data = (payload.get("body") or {}).get("data")
    if not data:
        return ""
    if payload.get("mimeType") in ("text/plain", "text/html"):
        # Reuse the part extractor so HTML is stripped to text here too.
        return _extract_from_part(payload) or ""
    return _decode_base64(data)

View File

@@ -0,0 +1,58 @@
from typing import Dict, List, Tuple, Optional
def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
    """
    Build a lowercase-keyed mapping from a list of Gmail-style headers.

    Each input entry is a dict with "name" and "value" keys, e.g.
        {"name": "Subject", "value": "Re: Interview Update"}
    which becomes
        {"subject": "Re: Interview Update"}

    Entries with a missing or empty name, or with a value of None, are
    skipped. Values are stripped of surrounding whitespace. When the same
    name occurs more than once (case-insensitively), the last value wins.
    A None or empty input yields an empty dict.
    """
    pairs = (
        (entry.get("name"), entry.get("value"))
        for entry in (raw_headers or [])
    )
    return {
        key.lower(): text.strip()
        for key, text in pairs
        if key and text is not None
    }
def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
    """
    Extract sender email and optional display name from headers.

    Args:
        headers: Header mapping; the sender is read from the "from" key.

    Returns:
        (email, name)
        If name cannot be determined, name will be None. If there is no
        "from" header (or it is empty), ("", None) is returned.
    """
    from_header = headers.get("from")
    if not from_header:
        return "", None

    # Common forms:
    #   Name <email@domain>
    #   email@domain
    if "<" in from_header and ">" in from_header:
        name_part, email_part = from_header.split("<", 1)
        # Cut at the first ">" rather than stripping trailing ">" only, so
        # text after the address (e.g. a trailing "(comment)") is not
        # mistaken for part of the email.
        email = email_part.split(">", 1)[0].strip()
        name = name_part.strip().strip('"') or None
        return email, name

    return from_header.strip(), None

View File

@@ -0,0 +1,33 @@
import re
# Matches one leading reply/forward marker ("Re:", "Fw:", "Fwd:") in any
# letter case, together with the whitespace around its colon.
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to a form suitable for thread-level comparison.

    Steps:
    - Drop every leading reply/forward prefix (Re:, Fwd:, FW:), including
      stacked ones such as "Re: Fwd: Re:"
    - Collapse each run of whitespace to a single space and trim the ends
    - Leave letter casing untouched

    A falsy subject (None or "") yields "". The function is deliberately
    conservative: only the prefixes listed above are recognised.
    """
    if not subject:
        return ""

    remainder = subject.strip()

    # Peel prefixes off the front one at a time until none is left.
    prefix = _PREFIX_RE.match(remainder)
    while prefix:
        remainder = remainder[prefix.end():].strip()
        prefix = _PREFIX_RE.match(remainder)

    # split() with no argument breaks on any whitespace run.
    return " ".join(remainder.split())