lib init
This commit is contained in:
0
mail_intake/parsers/__init__.py
Normal file
0
mail_intake/parsers/__init__.py
Normal file
83
mail_intake/parsers/body.py
Normal file
83
mail_intake/parsers/body.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import base64
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from mail_intake.exceptions import MailIntakeParsingError
|
||||
|
||||
|
||||
def _decode_base64(data: str) -> str:
|
||||
"""
|
||||
Decode Gmail URL-safe base64 payload into UTF-8 text.
|
||||
"""
|
||||
try:
|
||||
padded = data.replace("-", "+").replace("_", "/")
|
||||
decoded = base64.b64decode(padded)
|
||||
return decoded.decode("utf-8", errors="replace")
|
||||
except Exception as exc:
|
||||
raise MailIntakeParsingError("Failed to decode message body") from exc
|
||||
|
||||
|
||||
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
|
||||
"""
|
||||
Extract text content from a single MIME part.
|
||||
"""
|
||||
mime_type = part.get("mimeType")
|
||||
body = part.get("body", {})
|
||||
data = body.get("data")
|
||||
|
||||
if not data:
|
||||
return None
|
||||
|
||||
text = _decode_base64(data)
|
||||
|
||||
if mime_type == "text/plain":
|
||||
return text
|
||||
|
||||
if mime_type == "text/html":
|
||||
# soup = BeautifulSoup(text, "lxml")
|
||||
soup = BeautifulSoup(text, "html.parser")
|
||||
return soup.get_text(separator="\n", strip=True)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _iter_parts(payload: Dict[str, Any]):
    """
    Yield every part dict nested below *payload*, depth-first, in order.

    Each part is yielded before its own children, so top-level parts are
    still visited in their original order.
    """
    for part in payload.get("parts") or []:
        yield part
        yield from _iter_parts(part)


def extract_body(payload: Dict[str, Any]) -> str:
    """
    Extract the best-effort message body from a Gmail payload.

    Parts are searched at every nesting level (e.g. a multipart/alternative
    wrapped in a multipart/mixed), not only directly under the payload.

    Priority:
    1. text/plain
    2. text/html (stripped to text)
    3. empty string (if nothing usable found)

    Args:
        payload: The message payload dict; may be empty or None.

    Returns:
        The body text, or "" when nothing usable is found.

    Raises:
        MailIntakeParsingError: If a selected text part cannot be decoded.
    """
    if not payload:
        return ""

    # Multipart message: the first non-empty text/plain part wins outright;
    # the first non-empty text/html part is kept as a fallback.
    html_fallback = None
    for part in _iter_parts(payload):
        mime_type = part.get("mimeType")
        if mime_type not in ("text/plain", "text/html"):
            continue  # containers and attachments carry no body text

        content = _extract_from_part(part)
        if not content:
            continue

        if mime_type == "text/plain":
            return content
        if html_fallback is None:
            html_fallback = content

    if html_fallback:
        return html_fallback

    # Single-part message (or a multipart one without any usable text part).
    if payload.get("mimeType") == "text/html":
        # Keep the documented contract: HTML is returned stripped to text.
        return _extract_from_part(payload) or ""

    data = (payload.get("body") or {}).get("data")
    if data:
        return _decode_base64(data)

    return ""
|
||||
58
mail_intake/parsers/headers.py
Normal file
58
mail_intake/parsers/headers.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
|
||||
def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
    """
    Build a lowercase-keyed dict from a list of Gmail-style header entries.

    Each entry is a mapping with "name" and "value" keys, for example
    {"name": "Subject", "value": "Re: Interview Update"}, which becomes
    {"subject": "Re: Interview Update"} in the result.

    Entries with an empty/missing name or a missing (None) value are
    skipped. Values are stripped of surrounding whitespace. When a header
    name occurs more than once, the last occurrence wins. A falsy input
    (None or an empty list) yields an empty dict.
    """
    return {
        entry["name"].lower(): entry["value"].strip()
        for entry in (raw_headers or [])
        if entry.get("name") and entry.get("value") is not None
    }
|
||||
|
||||
|
||||
def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
    """
    Extract sender email and optional display name from headers.

    Recognised forms of the "from" header:
        Name <email@domain>
        email@domain

    Args:
        headers: Normalized headers keyed by lowercase name.

    Returns:
        (email, name). The name is None when it cannot be determined, and
        the email is "" when there is no "from" header at all.
    """
    from_header = headers.get("from")
    if not from_header:
        return "", None

    # Take exactly the text between the first "<" and the next ">" so that
    # anything after the closing bracket (whitespace, a trailing comment)
    # cannot leak into the address.
    start = from_header.find("<")
    end = from_header.find(">", start + 1) if start != -1 else -1
    if end != -1:
        email = from_header[start + 1:end].strip()
        # Drop optional surrounding quotes from the display name.
        name = from_header[:start].strip().strip('"').strip() or None
        return email, name

    return from_header.strip(), None
|
||||
33
mail_intake/parsers/subject.py
Normal file
33
mail_intake/parsers/subject.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import re
|
||||
|
||||
|
||||
# Matches one leading reply/forward marker ("Re:", "Fw:", "Fwd:", any case),
# including optional whitespace around the colon.
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to a form suitable for comparing threads.

    Steps:
    - Remove every leading reply/forward marker (Re:, Fwd:, FW:), however
      many are stacked (e.g. "Re: Fwd: Re: topic").
    - Collapse each run of whitespace into a single space.
    - Leave letter casing untouched.

    Deliberately conservative: nothing else is rewritten. A falsy subject
    (None or "") yields "".
    """
    if not subject:
        return ""

    text = subject.strip()

    # Peel off one marker per pass until the subject no longer starts with one.
    marker = _PREFIX_RE.match(text)
    while marker:
        text = text[marker.end():].strip()
        marker = _PREFIX_RE.match(text)

    # split() with no arguments drops all whitespace runs.
    words = text.split()
    return " ".join(words)
|
||||
Reference in New Issue
Block a user