Files
mail-intake/mail_intake/parsers/subject.py

63 lines
1.6 KiB
Python

"""
# Summary
Subject line normalization utilities for Mail Intake.
This module provides helper functions for normalizing email subject lines
to enable reliable thread-level comparison and grouping.
Normalization is intentionally conservative to avoid altering semantic
meaning while removing common reply and forward prefixes.
"""
import re
_PREFIX_RE = re.compile(
    r"^(re|fw|fwd)\s*:\s*",
    flags=re.IGNORECASE,
)
"""
Compiled pattern for one leading reply/forward marker.

Matches `re`, `fw`, or `fwd` in any letter case at the very start of the
string, followed by a colon, with optional whitespace on either side of the
colon. A single application recognises only one marker.
"""


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to the form used for thread-level matching.

    Args:
        subject (str):
            Subject header value as received.

    Returns:
        str:
            The subject with leading reply/forward markers removed and
            whitespace runs collapsed. An empty string is returned when
            `subject` is falsy.

    Notes:
        **Responsibilities:**

        - Removes leading `Re:`, `Fw:`, and `Fwd:` markers in any letter case.
        - Keeps removing markers until none remains at the front, so stacked
          chains such as `Re: Fwd: Re:` are dropped entirely.
        - Replaces each run of whitespace with a single space and trims
          both ends.
        - Leaves the letter case of the remaining text untouched.

        **Guarantees:**

        - A marker that follows other subject text is left in place; only
          markers at the front are stripped.
    """
    if not subject:
        return ""

    remainder = subject.strip()

    # Peel off one marker per iteration; the loop stops as soon as the text
    # no longer begins with a recognised marker.
    marker = _PREFIX_RE.match(remainder)
    while marker is not None:
        remainder = remainder[marker.end():].strip()
        marker = _PREFIX_RE.match(remainder)

    # split() with no arguments drops leading/trailing whitespace and treats
    # any whitespace run as a single separator.
    return " ".join(remainder.split())