"""
Subject line normalization utilities for Mail Intake.

The helpers in this module reduce an email subject line to a canonical form
so that messages belonging to the same thread can be compared and grouped
reliably.

Normalization is deliberately conservative: common reply and forward
prefixes are removed and whitespace is tidied, but nothing that could change
the meaning of the subject is touched.
"""

import re

_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
"""Pattern for one leading reply/forward prefix (``Re:``, ``FW:``, ``Fwd:``)."""


def normalize_subject(subject: str) -> str:
    """
    Return a canonical form of an email subject for thread-level comparison.

    Steps applied, in order:
        - Surrounding whitespace is trimmed.
        - Leading ``Re:``, ``Fwd:``, and ``FW:`` prefixes are removed
          (matched case-insensitively, optional whitespace around the colon).
        - Removal repeats until no prefix remains, so stacked prefixes such
          as ``Re: Fwd: Re:`` are fully stripped.
        - Every run of whitespace is collapsed to a single space.
        - Letter casing of the remaining text is left untouched.

    Only prefixes at the very start of the subject are removed; the same
    text appearing later in the subject is preserved.

    Args:
        subject:
            Raw subject line from a message header. Any falsy value
            (for example an empty string) yields ``""``.

    Returns:
        Normalized subject string suitable for thread grouping.
    """
    if not subject:
        return ""

    remainder = subject.strip()

    # Peel off one leading prefix per iteration. The pattern is anchored at
    # the start and always consumes at least "xx:", so each pass shortens
    # the string and the loop terminates once no prefix is left.
    prefix = _PREFIX_RE.match(remainder)
    while prefix is not None:
        remainder = remainder[prefix.end():].strip()
        prefix = _PREFIX_RE.match(remainder)

    # split() with no arguments breaks on any whitespace run and drops
    # empties, so re-joining with a single space collapses the whitespace.
    return " ".join(remainder.split())