64 lines
1.6 KiB
Python
64 lines
1.6 KiB
Python
"""
|
|
Subject line normalization utilities for Mail Intake.
|
|
|
|
---
|
|
|
|
## Summary
|
|
|
|
This module provides helper functions for normalizing email subject lines
|
|
to enable reliable thread-level comparison and grouping.
|
|
|
|
Normalization is intentionally conservative to avoid altering semantic
|
|
meaning while removing common reply and forward prefixes.
|
|
"""
|
|
|
|
import re
|
|
|
|
|
|
_PREFIX_RE = re.compile(
    r"^(re|fw|fwd)\s*:\s*",
    re.IGNORECASE,
)
"""
Compiled pattern for one leading reply/forward marker (``re``, ``fw`` or
``fwd`` in any letter case) followed by a colon, with optional whitespace
on either side of the colon.
"""


def normalize_subject(subject: str) -> str:
    """
    Reduce an email subject to a form usable for thread-level comparison.

    Args:
        subject (str):
            Subject line as taken from a message header.

    Returns:
        str:
            The subject with leading reply/forward markers removed and
            whitespace collapsed. An empty string is returned when the
            input is empty.

    Notes:
        **Responsibilities:**

        - Removes leading ``Re:``, ``Fw:`` and ``Fwd:`` markers, matched case-insensitively
        - Keeps removing markers until none is left, so stacked prefixes such as ``Re: Fwd: Re:`` are fully stripped
        - Collapses every run of whitespace into a single space
        - Leaves the casing of the remaining text untouched

        **Guarantees:**

        - Only markers at the very start of the subject are removed; the rest of the text is changed solely by whitespace collapsing
    """
    if not subject:
        return ""

    remainder = subject.strip()

    # The pattern is anchored at the start and always consumes at least the
    # marker and its colon, so each pass removes exactly one prefix and the
    # loop ends as soon as the text no longer begins with one.
    found = _PREFIX_RE.match(remainder)
    while found is not None:
        remainder = remainder[found.end():].strip()
        found = _PREFIX_RE.match(remainder)

    # Splitting on whitespace and re-joining squeezes internal runs of
    # spaces, tabs and newlines down to single spaces.
    return " ".join(remainder.split())
|