Compare commits

..

10 Commits

Author SHA1 Message Date
677ead8ef5 fixed tests to use correct imports
Some checks failed
continuous-integration/drone/tag Build is failing
2026-01-09 17:44:42 +05:30
4cf5110684 fixed imports in mail_intake 2026-01-09 17:44:31 +05:30
77dabf8df8 added .drone.yml 2026-01-09 17:41:53 +05:30
b14ffe9e44 docs(mail_intake): add MkDocs navigation and API reference document structure
- docs(mkdocs.yml): define navigation structure for mail_intake modules
- docs(mkdocs.yml): configure mkdocstrings for Python API rendering
- docs(docs): add documentation root and mail_intake index page
- docs(docs): add adapters documentation pages with mkdocstrings directives
- docs(docs): add auth documentation pages with mkdocstrings directives
- docs(docs): add ingestion documentation pages with mkdocstrings directives
- docs(docs): add models documentation pages with mkdocstrings directives
- docs(docs): add parsers documentation pages with mkdocstrings directives
- docs(docs): add config and exceptions documentation pages
2026-01-09 17:41:10 +05:30
f22af90e98 docs(mail_intake): add comprehensive docstrings across ingestion, adapters, auth, and parsing layers
- docs(mail_intake/__init__.py): document module-based public API and usage patterns
- docs(mail_intake/ingestion/reader.py): document high-level ingestion orchestration
- docs(mail_intake/adapters/base.py): document adapter contract for mail providers
- docs(mail_intake/adapters/gmail.py): document Gmail adapter implementation and constraints
- docs(mail_intake/auth/base.py): document authentication provider contract
- docs(mail_intake/auth/google.py): document Google OAuth authentication provider
- docs(mail_intake/models/message.py): document canonical email message model
- docs(mail_intake/models/thread.py): document canonical email thread model
- docs(mail_intake/parsers/body.py): document message body extraction logic
- docs(mail_intake/parsers/headers.py): document message header normalization utilities
- docs(mail_intake/parsers/subject.py): document subject normalization utilities
- docs(mail_intake/config.py): document global configuration model
- docs(mail_intake/exceptions.py): document library exception hierarchy
2026-01-09 17:40:25 +05:30
dbfef295b8 added google packages 2026-01-09 17:40:14 +05:30
505950eafa also generate index.md for __init__.py files 2026-01-09 17:07:32 +05:30
3a550ab576 manage_docs cli 2026-01-09 17:03:59 +05:30
44d36575c6 added mkdocs packages 2026-01-09 17:03:43 +05:30
b18b717c52 added mkdocs packages 2026-01-09 16:42:35 +05:30
42 changed files with 1170 additions and 80 deletions

129
.drone.yml Normal file
View File

@@ -0,0 +1,129 @@
---
# Release pipeline, triggered by tag events.
#   1. check-version: stop early if this version is already on the index.
#   2. build-package: build the distributions with `python -m build`.
#   3. upload-to-private-pypi: push dist/* to the private index with twine.
#
# Shell variables are deliberately written as $VAR, never with braces,
# because Drone applies its own brace-style substitution to this file
# before the shell ever sees it.
kind: pipeline
type: docker
name: build-and-publish-pypi

platform:
  os: linux
  arch: arm64

workspace:
  path: /drone/src

steps:
  - name: check-version
    image: curlimages/curl:latest
    environment:
      PIP_REPO_URL:
        from_secret: PIP_REPO_URL
      PIP_USERNAME:
        from_secret: PIP_USERNAME
      PIP_PASSWORD:
        from_secret: PIP_PASSWORD
    commands:
      # Take the first `name = "..."` / `version = "..."` line of
      # pyproject.toml and keep the text between the first pair of quotes.
      - PACKAGE_NAME=$(grep -E '^name\s*=' pyproject.toml | head -1 | cut -d'"' -f2)
      - VERSION=$(grep -E '^version\s*=' pyproject.toml | head -1 | cut -d'"' -f2)
      # Fail loudly when either value is empty: an empty pattern would
      # otherwise make the existence check below meaningless.
      - |
        if [ -z "$PACKAGE_NAME" ] || [ -z "$VERSION" ]; then
          echo "❌ Could not read name/version from pyproject.toml"
          exit 1
        fi
      - echo "🔍 Checking if $PACKAGE_NAME==$VERSION exists on $PIP_REPO_URL ..."
      # Match the version as a fixed string bounded the way it appears in
      # distribution filenames ("-<version>.tar.gz" for the sdist,
      # "-<version>-" for wheels). A bare regex match on the version would
      # also hit longer versions that merely contain it (1.0 vs 1.0.1).
      # If curl fails (e.g. the package is not on the index yet) grep sees
      # no input and the version is treated as new.
      - |
        if curl -fsSL -u "$PIP_USERNAME:$PIP_PASSWORD" "$PIP_REPO_URL/simple/$PACKAGE_NAME/" \
            | grep -qF -e "-$VERSION.tar.gz" -e "-$VERSION-"; then
          echo "✅ $PACKAGE_NAME==$VERSION already exists — skipping build."
          # NOTE(review): exit code 78 is relied on to make the runner stop
          # the pipeline early without marking it failed — confirm against
          # the Drone runner version in use.
          exit 78
        else
          echo "🆕 New version detected: $PACKAGE_NAME==$VERSION"
        fi

  - name: build-package
    image: python:3.13-slim
    commands:
      - pip install --upgrade pip build
      - echo "📦 Building Python package..."
      - python -m build
      - ls -l dist

  - name: upload-to-private-pypi
    image: python:3.13-slim
    environment:
      PIP_REPO_URL:
        from_secret: PIP_REPO_URL
      # twine reads its credentials from these variables, which keeps the
      # password out of the process argument list.
      TWINE_USERNAME:
        from_secret: PIP_USERNAME
      TWINE_PASSWORD:
        from_secret: PIP_PASSWORD
    commands:
      - pip install --upgrade twine
      - echo "🚀 Uploading to private PyPI at $PIP_REPO_URL ..."
      # --non-interactive: fail instead of prompting if credentials are missing.
      - twine upload --non-interactive --repository-url "$PIP_REPO_URL" dist/*

trigger:
  event:
    - tag
---
# Backfill pipeline, triggered by custom events.
# Walks every git tag in version order and, for each tag whose version is
# not yet on the private index, checks the tag out, builds it and uploads it.
#
# Shell variables are deliberately written as $VAR, never with braces,
# because Drone applies its own brace-style substitution to this file
# before the shell ever sees it.
kind: pipeline
type: docker
name: backfill-pypi-from-tags

platform:
  os: linux
  arch: arm64

workspace:
  path: /drone/src

steps:
  - name: fetch-tags
    image: alpine/git
    commands:
      - git fetch --tags --force

  - name: build-and-upload-missing
    image: python:3.13-slim
    environment:
      PIP_REPO_URL:
        from_secret: PIP_REPO_URL
      PIP_USERNAME:
        from_secret: PIP_USERNAME
      PIP_PASSWORD:
        from_secret: PIP_PASSWORD
    commands:
      - apt-get update
      - apt-get install -y git curl ca-certificates
      - pip install --upgrade pip build twine
      - |
        set -e

        # First `name = "..."` line only, matching the release pipeline;
        # without `head -1` any later `name =` line would be appended.
        PACKAGE_NAME=$(grep -E '^name\s*=' pyproject.toml | head -1 | cut -d'"' -f2)
        if [ -z "$PACKAGE_NAME" ]; then
          echo "❌ Could not read package name from pyproject.toml"
          exit 1
        fi
        echo "📦 Package: $PACKAGE_NAME"

        # twine reads its credentials from the environment, which keeps the
        # password out of the process argument list.
        export TWINE_USERNAME="$PIP_USERNAME" TWINE_PASSWORD="$PIP_PASSWORD"

        for TAG in $(git tag --sort=version:refname); do
          # NOTE(review): assumes tag names are bare versions identical to
          # the version built from that tag (no "v" prefix) — confirm.
          VERSION="$TAG"
          echo "🔁 Version: $VERSION"

          # Fixed-string match bounded the way the version appears in
          # distribution filenames, so 1.0 does not match 1.0.1. A failed
          # curl yields no input and the version is treated as missing.
          if curl -fsSL -u "$PIP_USERNAME:$PIP_PASSWORD" \
              "$PIP_REPO_URL/simple/$PACKAGE_NAME/" \
              | grep -qF -e "-$VERSION.tar.gz" -e "-$VERSION-"; then
            echo "⏭️ Exists, skipping"
            continue
          fi

          git checkout --force "$TAG"

          echo "🏗️ Building $VERSION"
          # Remove artifacts from the previous tag so only this build is uploaded.
          rm -rf dist
          python -m build

          echo "⬆️ Uploading $VERSION"
          twine upload --non-interactive --repository-url "$PIP_REPO_URL" dist/*
        done

trigger:
  event:
    - custom

View File

@@ -0,0 +1,3 @@
# Base
::: mail_intake.adapters.base

View File

@@ -0,0 +1,3 @@
# Gmail
::: mail_intake.adapters.gmail

View File

@@ -0,0 +1,3 @@
# Adapters
::: mail_intake.adapters

View File

@@ -0,0 +1,3 @@
# Base
::: mail_intake.auth.base

View File

@@ -0,0 +1,3 @@
# Google
::: mail_intake.auth.google

View File

@@ -0,0 +1,3 @@
# Auth
::: mail_intake.auth

View File

@@ -0,0 +1,3 @@
# Config
::: mail_intake.config

View File

@@ -0,0 +1,3 @@
# Exceptions
::: mail_intake.exceptions

View File

@@ -0,0 +1,3 @@
# Mail Intake
::: mail_intake

View File

@@ -0,0 +1,3 @@
# Ingestion
::: mail_intake.ingestion

View File

@@ -0,0 +1,3 @@
# Reader
::: mail_intake.ingestion.reader

View File

@@ -0,0 +1,3 @@
# Models
::: mail_intake.models

View File

@@ -0,0 +1,3 @@
# Message
::: mail_intake.models.message

View File

@@ -0,0 +1,3 @@
# Thread
::: mail_intake.models.thread

View File

@@ -0,0 +1,3 @@
# Body
::: mail_intake.parsers.body

View File

@@ -0,0 +1,3 @@
# Headers
::: mail_intake.parsers.headers

View File

@@ -0,0 +1,3 @@
# Parsers
::: mail_intake.parsers

View File

@@ -0,0 +1,3 @@
# Subject
::: mail_intake.parsers.subject

View File

@@ -0,0 +1,126 @@
"""
Mail Intake — provider-agnostic, read-only email ingestion framework.
Mail Intake is a **contract-first library** designed to ingest, parse, and
normalize email data from external providers (such as Gmail) into clean,
provider-agnostic domain models.
The library is intentionally structured around clear layers, each exposed
as a first-class module at the package root:
- adapters: provider-specific access (e.g. Gmail)
- auth: authentication providers and credential management
- parsers: extraction and normalization of message content
- ingestion: orchestration and high-level ingestion workflows
- models: canonical, provider-agnostic data representations
- config: explicit global configuration
- exceptions: library-defined error hierarchy
The package root acts as a **namespace**, not a facade. Consumers are
expected to import functionality explicitly from the appropriate module.
----------------------------------------------------------------------
Installation
----------------------------------------------------------------------
Install using pip:
pip install mail-intake
Or with Poetry:
poetry add mail-intake
Mail Intake is pure Python and has no runtime dependencies beyond those
required by the selected provider (for example, Google APIs for Gmail).
----------------------------------------------------------------------
Basic Usage
----------------------------------------------------------------------
Minimal Gmail ingestion example:
from mail_intake.ingestion import MailIntakeReader
from mail_intake.adapters import MailIntakeGmailAdapter
from mail_intake.auth import MailIntakeGoogleAuth
auth = MailIntakeGoogleAuth(
credentials_path="credentials.json",
token_path="token.pickle",
scopes=["https://www.googleapis.com/auth/gmail.readonly"],
)
adapter = MailIntakeGmailAdapter(auth_provider=auth)
reader = MailIntakeReader(adapter)
for message in reader.iter_messages("from:recruiter@example.com"):
print(message.subject, message.from_email)
Iterating over threads:
for thread in reader.iter_threads("subject:Interview"):
print(thread.normalized_subject, len(thread.messages))
----------------------------------------------------------------------
Extensibility Model
----------------------------------------------------------------------
Mail Intake is designed to be extensible via **public contracts** exposed
through its modules:
- Users MAY implement their own mail adapters by subclassing
`adapters.MailIntakeAdapter`
- Users MAY implement their own authentication providers by subclassing
`auth.MailIntakeAuthProvider`
Users SHOULD NOT subclass built-in adapter implementations. Built-in
adapters (such as Gmail) are reference implementations and may change
internally without notice.
----------------------------------------------------------------------
Public API Surface
----------------------------------------------------------------------
The supported public API consists of the following top-level modules:
- mail_intake.ingestion
- mail_intake.adapters
- mail_intake.auth
- mail_intake.parsers
- mail_intake.models
- mail_intake.config
- mail_intake.exceptions
Classes and functions should be imported explicitly from these modules.
No individual symbols are re-exported at the package root.
----------------------------------------------------------------------
Design Guarantees
----------------------------------------------------------------------
- Read-only access: no mutation of provider state
- Provider-agnostic domain models
- Explicit configuration and dependency injection
- No implicit global state or environment reads
- Deterministic, testable behavior
Mail Intake favors correctness, clarity, and explicitness over convenience
shortcuts.
"""
# Import every layer so each is reachable as an attribute of the package
# root (e.g. ``mail_intake.parsers``). The module docstring lists ``parsers``
# as part of the public API surface, so it is imported and exported here
# alongside the other layers.
from . import ingestion
from . import adapters
from . import auth
from . import parsers
from . import models
from . import config
from . import exceptions

# Only submodules are exported; no individual symbols are re-exported at
# the package root.
__all__ = [
    "ingestion",
    "adapters",
    "auth",
    "parsers",
    "models",
    "config",
    "exceptions",
]

View File

@@ -0,0 +1,28 @@
"""
Mail provider adapter implementations for Mail Intake.
This package contains **adapter-layer implementations** responsible for
interfacing with external mail providers and exposing a normalized,
provider-agnostic contract to the rest of the system.
Adapters in this package:
- Implement the `MailIntakeAdapter` interface
- Encapsulate all provider-specific APIs and semantics
- Perform read-only access to mail data
- Return provider-native payloads without interpretation
Provider-specific logic **must not leak** outside of adapter implementations.
All parsing, normalization, and transformation must be handled by downstream
components.
Public adapters exported from this package are considered the supported
integration surface for mail providers.
"""
from .base import MailIntakeAdapter
from .gmail import MailIntakeGmailAdapter
__all__ = [
"MailIntakeAdapter",
"MailIntakeGmailAdapter",
]

View File

@@ -1,3 +1,14 @@
"""
Mail provider adapter contracts for Mail Intake.
This module defines the **provider-agnostic adapter interface** used for
read-only mail ingestion.
Adapters encapsulate all provider-specific access logic and expose a
minimal, normalized contract to the rest of the system. No provider-specific
types or semantics should leak beyond implementations of this interface.
"""
from abc import ABC, abstractmethod
from typing import Iterator, Dict, Any
@@ -6,43 +17,60 @@ class MailIntakeAdapter(ABC):
"""
Base adapter interface for mail providers.
This interface defines the minimal contract required for
read-only mail ingestion. No provider-specific concepts
should leak beyond implementations of this class.
This interface defines the minimal contract required to:
- Discover messages matching a query
- Retrieve full message payloads
- Retrieve full thread payloads
Adapters are intentionally read-only and must not mutate provider state.
"""
@abstractmethod
def iter_message_refs(self, query: str) -> Iterator[Dict[str, str]]:
"""
Iterate over lightweight message references.
Iterate over lightweight message references matching a query.
Must yield dictionaries containing at least:
- message_id
- thread_id
Implementations must yield dictionaries containing at least:
- ``message_id``: Provider-specific message identifier
- ``thread_id``: Provider-specific thread identifier
Args:
query: Provider-specific query string used to filter messages.
Yields:
Dictionaries containing message and thread identifiers.
Example yield:
{
"message_id": "...",
"thread_id": "..."
}
{
"message_id": "...",
"thread_id": "..."
}
"""
raise NotImplementedError
@abstractmethod
def fetch_message(self, message_id: str) -> Dict[str, Any]:
"""
Fetch a full raw message by message_id.
Fetch a full raw message by message identifier.
Returns the provider-native message payload
(e.g., Gmail message JSON).
Args:
message_id: Provider-specific message identifier.
Returns:
Provider-native message payload
(e.g., Gmail message JSON structure).
"""
raise NotImplementedError
@abstractmethod
def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
"""
Fetch a full raw thread by thread_id.
Fetch a full raw thread by thread identifier.
Returns the provider-native thread payload.
Args:
thread_id: Provider-specific thread identifier.
Returns:
Provider-native thread payload.
"""
raise NotImplementedError

View File

@@ -1,3 +1,17 @@
"""
Gmail adapter implementation for Mail Intake.
This module provides a **Gmail-specific implementation** of the
`MailIntakeAdapter` contract.
It is the only place in the codebase where:
- `googleapiclient` is imported
- Gmail REST API semantics are known
- Low-level `.execute()` calls are made
All Gmail-specific behavior must be strictly contained within this module.
"""
from typing import Iterator, Dict, Any
from googleapiclient.discovery import build
@@ -12,12 +26,19 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
"""
Gmail read-only adapter.
This adapter implements the `MailIntakeAdapter` interface using the
Gmail REST API. It translates the generic mail intake contract into
Gmail-specific API calls.
This class is the ONLY place where:
- googleapiclient is imported
- Gmail REST semantics are known
- .execute() is called
It must remain thin and dumb by design.
Design constraints:
- Must remain thin and imperative
- Must not perform parsing or interpretation
- Must not expose Gmail-specific types beyond this class
"""
def __init__(
@@ -25,12 +46,29 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
auth_provider: MailIntakeAuthProvider,
user_id: str = "me",
):
"""
Initialize the Gmail adapter.
Args:
auth_provider: Authentication provider capable of supplying
valid Gmail API credentials.
user_id: Gmail user identifier. Defaults to `"me"`.
"""
self._auth_provider = auth_provider
self._user_id = user_id
self._service = None
@property
def service(self):
"""
Lazily initialize and return the Gmail API service client.
Returns:
Initialized Gmail API service instance.
Raises:
MailIntakeAdapterError: If the Gmail service cannot be initialized.
"""
if self._service is None:
try:
creds = self._auth_provider.get_credentials()
@@ -45,11 +83,16 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
"""
Iterate over message references matching the query.
Args:
query: Gmail search query string.
Yields:
{
"message_id": "...",
"thread_id": "..."
}
Dictionaries containing:
- ``message_id``: Gmail message ID
- ``thread_id``: Gmail thread ID
Raises:
MailIntakeAdapterError: If the Gmail API returns an error.
"""
try:
request = (
@@ -79,6 +122,18 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
) from exc
def fetch_message(self, message_id: str) -> Dict[str, Any]:
"""
Fetch a full Gmail message by message ID.
Args:
message_id: Gmail message identifier.
Returns:
Provider-native Gmail message payload.
Raises:
MailIntakeAdapterError: If the Gmail API returns an error.
"""
try:
return (
self.service.users()
@@ -92,6 +147,18 @@ class MailIntakeGmailAdapter(MailIntakeAdapter):
) from exc
def fetch_thread(self, thread_id: str) -> Dict[str, Any]:
"""
Fetch a full Gmail thread by thread ID.
Args:
thread_id: Gmail thread identifier.
Returns:
Provider-native Gmail thread payload.
Raises:
MailIntakeAdapterError: If the Gmail API returns an error.
"""
try:
return (
self.service.users()

View File

@@ -0,0 +1,26 @@
"""
Authentication provider implementations for Mail Intake.
This package defines the **authentication layer** used by mail adapters
to obtain provider-specific credentials.
It exposes:
- A stable, provider-agnostic authentication contract
- Concrete authentication providers for supported platforms
Authentication providers:
- Are responsible for credential acquisition and lifecycle management
- Are intentionally decoupled from adapter logic
- May be extended by users to support additional providers
Consumers should depend on the abstract interface and use concrete
implementations only where explicitly required.
"""
from .base import MailIntakeAuthProvider
from .google import MailIntakeGoogleAuth
__all__ = [
"MailIntakeAuthProvider",
"MailIntakeGoogleAuth",
]

View File

@@ -1,3 +1,14 @@
"""
Authentication provider contracts for Mail Intake.
This module defines the **authentication abstraction layer** used by mail
adapters to obtain provider-specific credentials.
Authentication concerns are intentionally decoupled from adapter logic.
Adapters depend only on this interface and must not be aware of how
credentials are acquired, refreshed, or stored.
"""
from abc import ABC, abstractmethod
@@ -7,6 +18,17 @@ class MailIntakeAuthProvider(ABC):
Mail adapters depend on this interface, not on concrete
OAuth or credential implementations.
Authentication providers encapsulate all logic required to acquire
valid credentials for a mail provider.
Implementations may involve:
- OAuth flows
- Service account credentials
- Token refresh logic
- Secure credential storage
Adapters must treat the returned credentials as opaque and provider-specific.
"""
@abstractmethod
@@ -16,5 +38,13 @@ class MailIntakeAuthProvider(ABC):
This method is synchronous by design and must either
return valid credentials or raise MailIntakeAuthError.
Returns:
Provider-specific credentials object suitable for use by
the corresponding mail adapter.
Raises:
MailIntakeAuthError: If valid credentials cannot be acquired by the
implementation.
"""
raise NotImplementedError

View File

@@ -1,3 +1,17 @@
"""
Google authentication provider implementation for Mail Intake.
This module provides a **Google OAuth-based authentication provider**
used primarily for Gmail access.
It encapsulates all Google-specific authentication concerns, including:
- Credential loading and persistence
- Token refresh handling
- Interactive OAuth flow initiation
No Google authentication details should leak outside this module.
"""
import os
import pickle
from typing import Sequence
@@ -14,12 +28,16 @@ class MailIntakeGoogleAuth(MailIntakeAuthProvider):
"""
Google OAuth provider for Gmail access.
Responsibilities:
- Load cached credentials from disk
- Refresh expired tokens when possible
- Trigger interactive login only when strictly required
This provider implements the `MailIntakeAuthProvider` interface using
Google's OAuth 2.0 flow and credential management libraries.
This class is synchronous and intentionally state-light.
Responsibilities:
- Load cached credentials from disk when available
- Refresh expired credentials when possible
- Initiate an interactive OAuth flow only when required
- Persist refreshed or newly obtained credentials
This class is synchronous by design and maintains a minimal internal state.
"""
def __init__(
@@ -28,11 +46,36 @@ class MailIntakeGoogleAuth(MailIntakeAuthProvider):
token_path: str,
scopes: Sequence[str],
):
"""
Initialize the Google authentication provider.
Args:
credentials_path: Path to the Google OAuth client secrets file.
token_path: Path where OAuth tokens will be cached.
scopes: OAuth scopes required for access.
"""
self.credentials_path = credentials_path
self.token_path = token_path
self.scopes = list(scopes)
def get_credentials(self):
"""
Retrieve valid Google OAuth credentials.
This method attempts to:
1. Load cached credentials from disk
2. Refresh expired credentials when possible
3. Perform an interactive OAuth login as a fallback
4. Persist valid credentials for future use
Returns:
Google OAuth credentials object suitable for use with
Google API clients.
Raises:
MailIntakeAuthError: If credentials cannot be loaded, refreshed,
or obtained via interactive authentication.
"""
creds = None
# Attempt to load cached credentials

View File

@@ -1,3 +1,14 @@
"""
Global configuration models for Mail Intake.
This module defines the **top-level configuration object** used to control
mail ingestion behavior across adapters, authentication providers, and
ingestion workflows.
Configuration is intentionally explicit, immutable, and free of implicit
environment reads to ensure predictability and testability.
"""
from dataclasses import dataclass
from typing import Optional
@@ -9,12 +20,26 @@ class MailIntakeConfig:
This configuration is intentionally explicit and immutable.
No implicit environment reads or global state.
Design principles:
- Immutable once constructed
- Explicit configuration over implicit defaults
- No direct environment or filesystem access
This model is safe to pass across layers and suitable for serialization.
"""
provider: str = "gmail"
user_id: str = "me"
readonly: bool = True
"""Identifier of the mail provider to use (e.g., ``"gmail"``)."""
user_id: str = "me"
"""Provider-specific user identifier. Defaults to the authenticated user."""
readonly: bool = True
"""Whether ingestion should operate in read-only mode."""
# Provider-specific paths (optional at this layer)
credentials_path: Optional[str] = None
"""Optional path to provider credentials configuration."""
token_path: Optional[str] = None
"""Optional path to persisted authentication tokens."""

View File

@@ -1,19 +1,49 @@
"""
Exception hierarchy for Mail Intake.
This module defines the **canonical exception types** used throughout the
Mail Intake library.
All library-raised errors derive from `MailIntakeError`. Consumers are
encouraged to catch this base type (or specific subclasses) rather than
provider-specific or third-party exceptions.
"""
class MailIntakeError(Exception):
"""
Base exception for all mail-intake errors.
Base exception for all Mail Intake errors.
Users of the library should catch this type (or subclasses)
instead of provider-specific or third-party exceptions.
This is the root of the Mail Intake exception hierarchy.
All errors raised by the library must derive from this class.
Consumers should generally catch this type when handling
library-level failures.
"""
class MailIntakeAuthError(MailIntakeError):
"""Authentication and credential-related failures."""
"""
Authentication and credential-related failures.
Raised when authentication providers are unable to acquire,
refresh, or persist valid credentials.
"""
class MailIntakeAdapterError(MailIntakeError):
"""Errors raised by mail provider adapters."""
"""
Errors raised by mail provider adapters.
Raised when a provider adapter encounters API errors,
transport failures, or invalid provider responses.
"""
class MailIntakeParsingError(MailIntakeError):
"""Errors encountered while parsing message content."""
"""
Errors encountered while parsing message content.
Raised when raw provider payloads cannot be interpreted
or normalized into internal domain models.
"""

View File

@@ -0,0 +1,24 @@
"""
Mail ingestion orchestration for Mail Intake.
This package contains **high-level ingestion components** responsible for
coordinating mail retrieval, parsing, normalization, and model construction.
It represents the **top of the ingestion pipeline** and is intended to be the
primary interaction surface for library consumers.
Components in this package:
- Are provider-agnostic
- Depend only on adapter and parser contracts
- Contain no provider-specific API logic
- Expose read-only ingestion workflows
Consumers are expected to construct a mail adapter and pass it to the
ingestion layer to begin processing messages and threads.
"""
from .reader import MailIntakeReader
__all__ = [
"MailIntakeReader",
]

View File

@@ -1,3 +1,18 @@
"""
High-level mail ingestion orchestration for Mail Intake.
This module provides the primary, provider-agnostic entry point for
reading and processing mail data.
It coordinates:
- Mail adapter access
- Message and thread iteration
- Header and body parsing
- Normalization and model construction
No provider-specific logic or API semantics are permitted in this layer.
"""
from datetime import datetime
from typing import Iterator, Dict, Any
@@ -14,22 +29,43 @@ class MailIntakeReader:
"""
High-level read-only ingestion interface.
This is the primary entry point users should interact with.
It orchestrates:
- adapter calls
- parsing
- normalization
- model construction
This class is the **primary entry point** for consumers of the Mail
Intake library.
No provider-specific logic exists here.
It orchestrates the full ingestion pipeline:
- Querying the adapter for message references
- Fetching raw provider messages
- Parsing and normalizing message data
- Constructing domain models
This class is intentionally:
- Provider-agnostic
- Stateless beyond iteration scope
- Read-only
"""
def __init__(self, adapter: MailIntakeAdapter):
"""
Initialize the mail reader.
Args:
adapter: Mail adapter implementation used to retrieve raw
messages and threads from a mail provider.
"""
self._adapter = adapter
def iter_messages(self, query: str) -> Iterator[MailIntakeMessage]:
"""
Iterate over parsed messages matching a provider query.
Args:
query: Provider-specific query string used to filter messages.
Yields:
Fully parsed and normalized `MailIntakeMessage` instances.
Raises:
MailIntakeParsingError: If a message cannot be parsed.
"""
for ref in self._adapter.iter_message_refs(query):
raw = self._adapter.fetch_message(ref["message_id"])
@@ -39,7 +75,17 @@ class MailIntakeReader:
"""
Iterate over threads constructed from messages matching a query.
Messages are grouped by thread_id and yielded as complete threads.
Messages are grouped by `thread_id` and yielded as complete thread
objects containing all associated messages.
Args:
query: Provider-specific query string used to filter messages.
Returns:
An iterator of `MailIntakeThread` instances.
Raises:
MailIntakeParsingError: If a message cannot be parsed.
"""
threads: Dict[str, MailIntakeThread] = {}
@@ -61,7 +107,17 @@ class MailIntakeReader:
def _parse_message(self, raw_message: Dict[str, Any]) -> MailIntakeMessage:
"""
Parse a raw provider message into a MailIntakeMessage.
Parse a raw provider message into a `MailIntakeMessage`.
Args:
raw_message: Provider-native message payload.
Returns:
A fully populated `MailIntakeMessage` instance.
Raises:
MailIntakeParsingError: If the message payload is missing required
fields or cannot be parsed.
"""
try:
message_id = raw_message["id"]

View File

@@ -0,0 +1,22 @@
"""
Domain models for Mail Intake.
This package defines the **canonical, provider-agnostic data models**
used throughout the Mail Intake ingestion pipeline.
Models in this package:
- Represent fully parsed and normalized mail data
- Are safe to persist, serialize, and index
- Contain no provider-specific payloads or API semantics
- Serve as stable inputs for downstream processing and analysis
These models form the core internal data contract of the library.
"""
from .message import MailIntakeMessage
from .thread import MailIntakeThread
__all__ = [
"MailIntakeMessage",
"MailIntakeThread",
]

View File

@@ -1,3 +1,14 @@
"""
Message domain models for Mail Intake.
This module defines the **canonical, provider-agnostic representation**
of an individual email message as used internally by the Mail Intake
ingestion pipeline.
Models in this module are safe to persist and must not contain any
provider-specific fields or semantics.
"""
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Dict
@@ -8,19 +19,37 @@ class MailIntakeMessage:
"""
Canonical internal representation of a single email message.
This model is provider-agnostic and safe to persist.
No Gmail-specific fields should appear here.
This model represents a fully parsed and normalized email message.
It is intentionally provider-agnostic and suitable for persistence,
indexing, and downstream processing.
No provider-specific payloads or API semantics should appear in this
model; message and thread identifiers are stored only as opaque strings.
"""
message_id: str
"""Provider-specific message identifier."""
thread_id: str
"""Provider-specific thread identifier to which this message belongs."""
timestamp: datetime
"""Message timestamp as a timezone-naive UTC datetime."""
from_email: str
"""Sender email address."""
from_name: Optional[str]
"""Optional human-readable sender name."""
subject: str
"""Raw subject line of the message."""
body_text: str
"""Extracted plain-text body content of the message."""
snippet: str
"""Short provider-supplied preview snippet of the message."""
raw_headers: Dict[str, str]
"""Normalized mapping of message headers (header name → value)."""

View File

@@ -1,3 +1,13 @@
"""
Thread domain models for Mail Intake.
This module defines the **canonical, provider-agnostic representation**
of an email thread as used internally by the Mail Intake ingestion pipeline.
Threads group related messages and serve as the primary unit of reasoning
for higher-level correspondence workflows.
"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Set
@@ -10,21 +20,40 @@ class MailIntakeThread:
"""
Canonical internal representation of an email thread.
Threads are the primary unit of reasoning for correspondence
workflows (job applications, interviews, follow-ups, etc.).
A thread groups multiple related messages under a single subject
and participant set. It is designed to support reasoning over
conversational context such as job applications, interviews,
follow-ups, and ongoing discussions.
This model is provider-agnostic and safe to persist.
"""
thread_id: str
"""Provider-specific thread identifier."""
normalized_subject: str
"""Normalized subject line used to group related messages."""
participants: Set[str] = field(default_factory=set)
"""Set of unique participant email addresses observed in the thread."""
messages: List[MailIntakeMessage] = field(default_factory=list)
"""Ordered list of messages belonging to this thread."""
last_activity_at: datetime | None = None
"""Timestamp of the most recent message in the thread."""
def add_message(self, message: MailIntakeMessage) -> None:
"""
Add a message to the thread and update derived fields.
This method:
- Appends the message to the thread
- Tracks unique participants
- Updates the last activity timestamp
Args:
message: Parsed mail message to add to the thread.
"""
self.messages.append(message)

View File

@@ -0,0 +1,30 @@
"""
Message parsing utilities for Mail Intake.
This package contains **provider-aware but adapter-agnostic parsing helpers**
used to extract and normalize structured information from raw mail payloads.
Parsers in this package are responsible for:
- Interpreting provider-native message structures
- Extracting meaningful fields such as headers, body text, and subjects
- Normalizing data into consistent internal representations
This package does not:
- Perform network or IO operations
- Contain provider API logic
- Construct domain models directly
Parsing functions are designed to be composable and are orchestrated by the
ingestion layer.
"""
from .body import extract_body
from .headers import parse_headers, extract_sender
from .subject import normalize_subject
# Names re-exported from the submodules imported above; ``__all__`` limits
# what ``from mail_intake.parsers import *`` exposes to exactly these helpers.
__all__ = [
    "extract_body",
    "parse_headers",
    "extract_sender",
    "normalize_subject",
]

View File

@@ -1,3 +1,13 @@
"""
Message body extraction utilities for Mail Intake.
This module contains helper functions for extracting a best-effort
plain-text body from provider-native message payloads.
The logic is intentionally tolerant of malformed or partial data and
prefers human-readable text over fidelity to original formatting.
"""
import base64
from typing import Dict, Any, Optional
@@ -9,6 +19,18 @@ from mail_intake.exceptions import MailIntakeParsingError
def _decode_base64(data: str) -> str:
"""
Decode Gmail URL-safe base64 payload into UTF-8 text.
Gmail message bodies are encoded using URL-safe base64, which may
omit padding and uses ``-`` and ``_`` in place of ``+`` and ``/``.
Args:
data: URL-safe base64-encoded string.
Returns:
Decoded UTF-8 text with replacement for invalid characters.
Raises:
MailIntakeParsingError: If decoding fails.
"""
try:
padded = data.replace("-", "+").replace("_", "/")
@@ -21,6 +43,16 @@ def _decode_base64(data: str) -> str:
def _extract_from_part(part: Dict[str, Any]) -> Optional[str]:
"""
Extract text content from a single MIME part.
Supports:
- text/plain
- text/html (converted to plain text)
Args:
part: MIME part dictionary from a provider payload.
Returns:
Extracted plain-text content, or None if unsupported or empty.
"""
mime_type = part.get("mimeType")
body = part.get("body", {})
@@ -49,7 +81,14 @@ def extract_body(payload: Dict[str, Any]) -> str:
Priority:
1. text/plain
2. text/html (stripped to text)
3. empty string (if nothing usable found)
3. Single-part body
4. empty string (if nothing usable found)
Args:
payload: Provider-native message payload dictionary.
Returns:
Extracted plain-text message body.
"""
if not payload:
return ""

View File

@@ -1,3 +1,13 @@
"""
Message header parsing utilities for Mail Intake.
This module provides helper functions for normalizing and extracting
useful information from provider-native message headers.
The functions here are intentionally simple and tolerant of malformed
or incomplete header data.
"""
from typing import Dict, List, Tuple, Optional
@@ -5,19 +15,29 @@ def parse_headers(raw_headers: List[Dict[str, str]]) -> Dict[str, str]:
"""
Convert a list of Gmail-style headers into a normalized dict.
Input:
[
{"name": "From", "value": "John Doe <john@example.com>"},
{"name": "Subject", "value": "Re: Interview Update"},
...
]
Provider payloads (such as Gmail) typically represent headers as a list
of name/value mappings. This function normalizes them into a plain
dictionary keyed by lowercase header names, so callers must look headers
up using lowercase keys (for example ``headers.get("from")``).
Output:
{
"from": "...",
"subject": "...",
...
}
Args:
raw_headers: List of header dictionaries, each containing
``name`` and ``value`` keys.
Returns:
Dictionary mapping lowercase header names to stripped values.
Example:
Input:
[
{"name": "From", "value": "John Doe <john@example.com>"},
{"name": "Subject", "value": "Re: Interview Update"},
]
Output:
{
"from": "John Doe <john@example.com>",
"subject": "Re: Interview Update",
}
"""
headers: Dict[str, str] = {}
@@ -37,18 +57,27 @@ def extract_sender(headers: Dict[str, str]) -> Tuple[str, Optional[str]]:
"""
Extract sender email and optional display name from headers.
Returns:
(email, name)
This function parses the ``From`` header and attempts to extract:
- Sender email address
- Optional human-readable display name
If name cannot be determined, name will be None.
Args:
headers: Normalized header dictionary as returned by
:func:`parse_headers`.
Returns:
A tuple ``(email, name)`` where:
- ``email`` is the sender email address
- ``name`` is the display name, or ``None`` if unavailable
Examples:
``"John Doe <john@example.com>"`` → ``("john@example.com", "John Doe")``
``"john@example.com"`` → ``("john@example.com", None)``
"""
from_header = headers.get("from")
if not from_header:
return "", None
# Common forms:
# Name <email@domain>
# email@domain
if "<" in from_header and ">" in from_header:
name_part, email_part = from_header.split("<", 1)
email = email_part.rstrip(">").strip()

View File

@@ -1,7 +1,18 @@
"""
Subject line normalization utilities for Mail Intake.
This module provides helper functions for normalizing email subject lines
to enable reliable thread-level comparison and grouping.
Normalization is intentionally conservative to avoid altering semantic
meaning while removing common reply and forward prefixes.
"""
import re
_PREFIX_RE = re.compile(r"^(re|fw|fwd)\s*:\s*", re.IGNORECASE)
"""Regular expression matching common reply/forward subject prefixes."""
def normalize_subject(subject: str) -> str:
@@ -9,11 +20,19 @@ def normalize_subject(subject: str) -> str:
Normalize an email subject for thread-level comparison.
Operations:
- Strip common prefixes (Re:, Fwd:, FW:)
- Collapse whitespace
- Preserve original casing (no lowercasing)
- Strips common prefixes such as ``Re:``, ``Fwd:``, and ``FW:``
- Repeats prefix stripping to handle stacked prefixes
- Collapses excessive whitespace
- Preserves original casing (no lowercasing)
This function is intentionally conservative.
This function is intentionally conservative and avoids aggressive
transformations that could alter the semantic meaning of the subject.
Args:
subject: Raw subject line from a message header.
Returns:
Normalized subject string suitable for thread grouping.
"""
if not subject:
return ""

159
manage_docs.py Normal file
View File

@@ -0,0 +1,159 @@
"""
MkDocs documentation management CLI.
This script provides a proper CLI interface to:
- Generate MkDocs Markdown files with mkdocstrings directives
- Build the documentation site
- Serve the documentation site locally
All operations are performed by calling MkDocs as a Python library
(no shell command invocation).
Requirements:
- mkdocs
- mkdocs-material
- mkdocstrings[python]
Usage:
python manage_docs.py generate
python manage_docs.py build
python manage_docs.py serve
Optional flags (global options; pass them before the subcommand):
--docs-dir PATH Path to docs directory (default: ./docs)
--package-root NAME Root Python package name (default: mail_intake)
"""
from __future__ import annotations
import argparse
from pathlib import Path
from mkdocs.commands import build as mkdocs_build
from mkdocs.commands import serve as mkdocs_serve
from mkdocs.config import load_config
# Directory containing this script; used as the project root for all paths.
PROJECT_ROOT = Path(__file__).resolve().parent
# Default directory the generated Markdown pages are written to.
DEFAULT_DOCS_DIR = PROJECT_ROOT / "docs"
# Default top-level package whose modules get documentation pages.
DEFAULT_PACKAGE_ROOT = "mail_intake"
# MkDocs configuration file, expected next to this script.
MKDOCS_YML = PROJECT_ROOT / "mkdocs.yml"
def generate_docs_from_nav(
    project_root: Path,
    docs_root: Path,
    package_root: str,
) -> None:
    """
    Create and populate MkDocs Markdown files with mkdocstrings directives.

    Despite its name, this function does not read the ``nav`` section of
    ``mkdocs.yml``; it walks the Python package on disk and mirrors its
    layout under the docs directory.

    This function:
    - Walks the Python package structure (every ``*.py`` file, recursively)
    - Mirrors it under the docs directory
    - Creates missing directories and .md files
    - Creates index.md for packages (__init__.py)
    - Overwrites each page with a title heading followed by
      ``::: package.module``

    Examples:
        mail_intake/__init__.py          -> docs/mail_intake/index.md
        mail_intake/config.py            -> docs/mail_intake/config.md
        mail_intake/adapters/__init__.py -> docs/mail_intake/adapters/index.md
        mail_intake/adapters/base.py     -> docs/mail_intake/adapters/base.md

    Args:
        project_root: Directory that contains the package directory.
        docs_root: Directory the Markdown tree is written into.
        package_root: Name of the package directory under ``project_root``.

    Raises:
        FileNotFoundError: If ``project_root / package_root`` is not an
            existing directory.
    """
    package_dir = project_root / package_root
    # is_dir() rather than exists(): a stray regular file with the package's
    # name would pass exists() and then silently generate nothing.
    if not package_dir.is_dir():
        raise FileNotFoundError(f"Package not found: {package_dir}")

    docs_root.mkdir(parents=True, exist_ok=True)

    # Sorted so pages are always (re)written in the same order.
    for py_file in sorted(package_dir.rglob("*.py")):
        rel = py_file.relative_to(project_root)

        if py_file.name == "__init__.py":
            # Package → index.md
            module_path = ".".join(rel.parent.parts)
            md_path = docs_root / rel.parent / "index.md"
            title = rel.parent.name.replace("_", " ").title()
        else:
            # Regular module → <module>.md
            module_path = ".".join(rel.with_suffix("").parts)
            md_path = docs_root / rel.with_suffix(".md")
            title = md_path.stem.replace("_", " ").title()

        md_path.parent.mkdir(parents=True, exist_ok=True)

        # The blank line keeps the heading and the mkdocstrings directive as
        # two separate Markdown blocks.
        content = f"# {title}\n\n::: {module_path}\n"
        md_path.write_text(content, encoding="utf-8")
def load_mkdocs_config():
    """
    Load the MkDocs configuration from ``mkdocs.yml`` at the project root.

    Returns:
        The configuration object returned by ``mkdocs.config.load_config``.

    Raises:
        FileNotFoundError: If ``mkdocs.yml`` does not exist at the project root.
    """
    if MKDOCS_YML.exists():
        return load_config(str(MKDOCS_YML))

    raise FileNotFoundError("mkdocs.yml not found at project root")
def cmd_generate(args: argparse.Namespace) -> None:
    """
    Handle the ``generate`` subcommand.

    Args:
        args: Parsed command-line arguments; ``docs_dir`` and
            ``package_root`` are read from it.
    """
    docs_dir = args.docs_dir
    package_name = args.package_root
    generate_docs_from_nav(PROJECT_ROOT, docs_dir, package_name)
def cmd_build(_: argparse.Namespace) -> None:
    """
    Handle the ``build`` subcommand: load ``mkdocs.yml`` and build the site.

    Args:
        _: Parsed command-line arguments (unused).

    Raises:
        FileNotFoundError: If ``mkdocs.yml`` does not exist at the project root.
    """
    mkdocs_build.build(load_mkdocs_config())
def cmd_serve(_: argparse.Namespace) -> None:
    """
    Handle the ``serve`` subcommand: run the MkDocs development server.

    ``mkdocs.commands.serve.serve`` takes the *path* of the configuration
    file as its first parameter (``config_file``) and loads the configuration
    itself, so it must be given the path rather than an already-loaded
    config object.

    Args:
        _: Parsed command-line arguments (unused).

    Raises:
        FileNotFoundError: If ``mkdocs.yml`` does not exist at the project root.
    """
    # Fail early with the same error the other commands raise.
    if not MKDOCS_YML.exists():
        raise FileNotFoundError("mkdocs.yml not found at project root")

    mkdocs_serve.serve(config_file=str(MKDOCS_YML))
def main() -> None:
    """
    Parse the command line and dispatch to the selected subcommand handler.

    ``--docs-dir`` and ``--package-root`` are defined on the top-level
    parser, so they must appear before the subcommand name. A subcommand
    (``generate``, ``build`` or ``serve``) is required.
    """
    parser = argparse.ArgumentParser(
        prog="manage_docs.py",
        description="Manage MkDocs documentation for the project",
    )
    parser.add_argument(
        "--docs-dir",
        type=Path,
        default=DEFAULT_DOCS_DIR,
        help="Path to the docs directory",
    )
    parser.add_argument(
        "--package-root",
        default=DEFAULT_PACKAGE_ROOT,
        help="Root Python package name",
    )

    # (name, help text, handler) for each subcommand, in registration order.
    commands = (
        ("generate", "Generate Markdown files with mkdocstrings directives", cmd_generate),
        ("build", "Build the MkDocs site", cmd_build),
        ("serve", "Serve the MkDocs site locally", cmd_serve),
    )

    subparsers = parser.add_subparsers(dest="command", required=True)
    for name, help_text, handler in commands:
        subparsers.add_parser(name, help=help_text).set_defaults(func=handler)

    namespace = parser.parse_args()
    namespace.func(namespace)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

61
mkdocs.yml Normal file
View File

@@ -0,0 +1,61 @@
site_name: Aetoskia Mail Intake
site_description: Provider-agnostic email ingestion, parsing, and thread modelling library
theme:
name: material
palette:
- scheme: slate
primary: deep purple
accent: cyan
font:
text: Inter
code: JetBrains Mono
features:
- navigation.tabs
- navigation.expand
- navigation.top
- navigation.instant
- content.code.copy
- content.code.annotate
plugins:
- search
- mkdocstrings:
handlers:
python:
paths: ["."]
options:
docstring_style: google
show_source: false
show_signature_annotations: true
separate_signature: true
merge_init_into_class: true
inherited_members: true
annotations_path: brief
show_root_heading: true
group_by_category: true
nav:
- Home: mail_intake/index.md
- Adapters:
- Base Adapter: mail_intake/adapters/base.md
- Gmail Adapter: mail_intake/adapters/gmail.md
- Auth:
- Base Auth: mail_intake/auth/base.md
- Google Auth: mail_intake/auth/google.md
- Mail Reader: mail_intake/ingestion/reader.md
- Models:
- Message: mail_intake/models/message.md
- Thread: mail_intake/models/thread.md
- Parsers:
- Body: mail_intake/parsers/body.md
- Headers: mail_intake/parsers/headers.md
- Subject: mail_intake/parsers/subject.md
- Config: mail_intake/config.md
- Exceptions: mail_intake/exceptions.md

View File

@@ -1,10 +1,17 @@
beautifulsoup4==4.12.0
google-api-python-client==2.187.0
google-auth-oauthlib==1.2.3
types-beautifulsoup4
# Test Packages
pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov==4.1.0
types-beautifulsoup4
# Optional, useful locally
ipython
# Doc Packages
mkdocs==1.6.1
mkdocs-material==9.6.23
neoteroi-mkdocs==1.1.3
pymdown-extensions==10.16.1
mkdocstrings==1.0.0
mkdocstrings-python==2.0.1

View File

@@ -1,7 +1,7 @@
from datetime import datetime, timedelta
from mail_intake.models.message import MailIntakeMessage
from mail_intake.models.thread import MailIntakeThread
from mail_intake.models import MailIntakeMessage
from mail_intake.models import MailIntakeThread
def test_message_is_immutable():

View File

@@ -1,8 +1,8 @@
import base64
from mail_intake.parsers.subject import normalize_subject
from mail_intake.parsers.headers import parse_headers, extract_sender
from mail_intake.parsers.body import extract_body
from mail_intake.parsers import normalize_subject
from mail_intake.parsers import parse_headers, extract_sender
from mail_intake.parsers import extract_body
def _b64(text: str) -> str: