# mail-intake/mail_intake/parsers/subject.py

"""
Subject line normalization utilities for Mail Intake.

---

## Summary

This module provides helper functions for normalizing email subject lines
to enable reliable thread-level comparison and grouping.

Normalization is intentionally conservative to avoid altering semantic
meaning while removing common reply and forward prefixes.
"""

import re


_PREFIX_RE = re.compile(
    r"^(re|fw|fwd)\s*:\s*",
    flags=re.IGNORECASE,
)
"""
Compiled pattern for a single leading reply/forward marker.

Matches ``re``, ``fw`` or ``fwd`` (in any letter case) at the very start of
the string, followed by optional whitespace, a colon, and any whitespace
after the colon.
"""


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to a canonical form for thread matching.

    Args:
        subject (str):
            Subject text as it appears in the message header. Any falsy
            value (for example an empty string) yields an empty result.

    Returns:
        str:
            The subject with leading reply/forward markers removed and
            every run of whitespace collapsed to a single space.

    Notes:
        **Responsibilities:**

            - Removes leading markers matched by ``_PREFIX_RE`` (``Re:``, ``Fw:``, ``Fwd:`` in any letter case)
            - Keeps removing markers until none is left at the front, so stacked chains such as ``Re: Fwd: Re:`` are fully stripped
            - Trims surrounding whitespace and collapses internal whitespace runs
            - Leaves the letter case of the remaining text untouched

        **Guarantees:**

            - Only markers at the very start of the subject are removed; text that merely contains ``Re:`` further in is left as is
    """
    if not subject:
        return ""

    remaining = subject.strip()

    # Peel off one leading marker per pass; trimming after each removal
    # keeps the next marker (if any) flush with the start of the string.
    marker = _PREFIX_RE.match(remaining)
    while marker is not None:
        remaining = remaining[marker.end():].strip()
        marker = _PREFIX_RE.match(remaining)

    # split() with no argument breaks on any whitespace run, so re-joining
    # with single spaces collapses tabs, newlines and repeated blanks.
    return " ".join(remaining.split())