commit 3860abf1f05a7da2c50644acb3009a1984645cf5 Author: Jonathan Rosenbaum Date: Sat Jan 17 13:46:20 2026 -0500 First commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..f8a7e58 --- /dev/null +++ b/README.md @@ -0,0 +1,152 @@ +# HyperKitty Search‑and‑Replace Management Command + +This Django management command performs **search and replace operations** on +HyperKitty email bodies and text‑based attachments. It is designed for cases +where sensitive information must be scrubbed from archived mailing‑list data. + +The command supports simulation mode, selective processing, and deduplication of +both emails and attachments. + +--- + +## Features + +- Replace sensitive strings in: + - Email bodies + - Text‑based attachments (`text/plain`, `text/html`, `application/xhtml+xml`) +- Simulation mode (`--simulate`) to preview changes without saving +- Deduplication of: + - Emails (by `message_id`) + - Attachments (by SHA‑1 content hash) +- Flexible processing: + - `--only-emails` + - `--only-attachments` +- Reads replacements from a simple text file using `shlex` parsing + +--- + +## Installation + +Place the command file in: + +``` +/usr/lib/python3.10/site-packages/hyperkitty/management/commands/ +``` + +(or the equivalent path for your Python/Django installation) + +The filename should match the command name, for example: + +``` +sanitize_hyperkitty.py +``` + +Django will automatically detect it as a management command. + +--- + +## Usage + +Run the command from your Django project directory: + +```bash +./manage.py sanitize_hyperkitty \ + --list mylist@example.com \ + --replacements-file replacements.txt +``` + +### Common Options + +| Option | Description | +|--------|-------------| +| `--list` | **Required.** Mailing list name (e.g. 
`team@lists.example.org`) | +| `--replacements-file` | **Required.** Path to a file containing replacement rules | +| `--simulate` | Show changes without saving them | +| `--only-emails` | Process only email bodies | +| `--only-attachments` | Process only attachments | + +--- + +## Replacements File Format + +The replacements file uses **shlex parsing**, allowing quoted strings. + +Each line must contain **exactly two values**: + +``` +old_value new_value +``` + +### Examples + +``` +password "********" +"secret token" "[REDACTED]" +john@example.com jane@example.com +``` + +Lines beginning with `#` are ignored. + +--- + +## How It Works + +### 1. Load Replacements +The command reads the replacements file and builds a dictionary of `old → new` +pairs. Malformed lines are skipped with warnings. + +### 2. Fetch and Deduplicate Emails +Emails are filtered by mailing list name and deduplicated by `message_id`. + +### 3. Process Email Bodies +If enabled, each email body is scanned and replacements are applied. + +### 4. Process Attachments +Attachments are: + +- Deduplicated by SHA‑1 hash +- Checked for text‑based MIME types +- Decoded using the attachment’s encoding +- Updated and saved if modified + +### 5. Simulation Mode +If `--simulate` is used: + +- Changes are printed to stdout +- No data is saved + +### 6. Rebuild Search Index + +After real modifications, rebuild the HyperKitty search index: + +```bash +./manage.py rebuild_index +``` + +--- + +## Example + +```bash +./manage.py sanitize_hyperkitty \ + --list devteam@lists.example.org \ + --replacements-file scrub.txt \ + --simulate +``` + +This will scan all messages, show what would change, and leave the database untouched. + +--- + +## Notes + +- `--only-emails` and `--only-attachments` cannot be used together. +- Attachments without a MIME type attempt fallback detection based on filename. +- Non‑text attachments are skipped automatically. 
# ---------------------------------------------------------------------------
# README.md (tail of the section that precedes this file in the commit)
#
# ---
#
# ## License
#
# This script is intended for administrative use within Django/HyperKitty
# environments under GNU General Public License v3.0.
# ---------------------------------------------------------------------------
# sanitize_mail.py (new file in this commit)
# ---------------------------------------------------------------------------
import hashlib
import shlex
import sys

from django.core.management.base import BaseCommand
from hyperkitty.models import Email, Attachment

# put command in /usr/lib/python3.10/site-packages/hyperkitty/management/commands

TEXT_MIMETYPES = {
    "text/plain",
    "text/html",
    "application/xhtml+xml",
}

# Filename suffixes treated as text when an attachment carries no content type.
TEXT_SUFFIXES = (".htm", ".html", ".txt", ".xhtml")


class Command(BaseCommand):
    """Search and replace strings in the email bodies and text attachments of one list.

    Replacement rules are read from a file (see ``load_replacements``) and
    applied, in file order, to every email of the mailing list given with
    ``--list``.  With ``--simulate`` the matches are only reported.
    """

    help = "Search and replace sensitive data in HyperKitty emails and attachments."

    def add_arguments(self, parser):
        """Register the command-line options on *parser*."""
        parser.add_argument(
            "--list",
            required=True,
            help="Mailing list name, e.g. bikeboard@lists.bikelover.org",
        )
        parser.add_argument(
            "--simulate",
            action="store_true",
            help="Show what would be changed without saving.",
        )
        parser.add_argument(
            "--replacements-file",
            required=True,
            help="Path to a text file containing replacements, one per line.",
        )
        parser.add_argument(
            "--only-emails",
            action="store_true",
            help="Process only email bodies, not attachments.",
        )
        parser.add_argument(
            "--only-attachments",
            action="store_true",
            help="Process only attachments, not email bodies.",
        )

    def load_replacements(self, filepath):
        """Parse *filepath* into an ordered ``{old: new}`` mapping.

        Each non-blank line that does not start with ``#`` is split with
        ``shlex.split`` and must yield exactly two values.  Lines that cannot
        be parsed, have a different number of values, or have an empty search
        string are skipped with a warning.  A later rule for the same ``old``
        value overrides an earlier one.

        Raises:
            OSError: if the file cannot be opened.
        """
        replacements = {}

        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()

                if not line or line.startswith("#"):
                    continue

                try:
                    parts = shlex.split(line)
                except ValueError:
                    self.stdout.write(self.style.WARNING(f"Skipping malformed line: {line}"))
                    continue

                if len(parts) != 2:
                    self.stdout.write(self.style.WARNING(f"Skipping invalid line: {line}"))
                    continue

                old, new = parts
                if not old:
                    # str.replace("", new) inserts `new` between every
                    # character, which would corrupt every message touched.
                    self.stdout.write(
                        self.style.WARNING(f"Skipping line with empty search string: {line}")
                    )
                    continue

                replacements[old] = new

        return replacements

    @staticmethod
    def _apply_replacements(text, replacements):
        """Apply every rule to *text* in order.

        Returns ``(new_text, applied)`` where *applied* lists the
        ``(old, new)`` pairs whose ``old`` value was found.
        """
        applied = []
        for old, new in replacements.items():
            if old in text:
                applied.append((old, new))
                text = text.replace(old, new)
        return text, applied

    def _process_body(self, msg, replacements, simulate):
        """Sanitize the body of *msg*; return True when the text changed."""
        original = msg.content
        if not original:
            return False

        updated, applied = self._apply_replacements(original, replacements)

        if simulate:
            for old, new in applied:
                self.stdout.write(f" Change in email body:")
                self.stdout.write(f" - {old}")
                self.stdout.write(f" + {new}")

        if updated == original:
            return False

        self.stdout.write(f"[Email] {msg.subject}")
        if not simulate:
            msg.content = updated
            msg.save()
        return True

    def _process_attachments(self, msg, replacements, simulate):
        """Sanitize the text attachments of *msg*; return True when any changed.

        Every attachment row is rewritten, including rows whose content is
        identical to an earlier one: skipping duplicates would leave the
        sensitive text in the archive.  The SHA-1 content hash is used only
        to avoid printing the same simulate details twice.
        """
        changed = False
        seen_digests = set()
        seen_changes = set()

        for att in Attachment.objects.filter(email=msg):
            filename = getattr(att, "name", None)
            # Drop parameters such as "; charset=utf-8" before matching.
            mime = (getattr(att, "content_type", None) or "").split(";", 1)[0].strip().lower()

            if not mime:
                if filename and filename.lower().endswith(TEXT_SUFFIXES):
                    mime = "text/html"
                else:
                    continue

            if mime not in TEXT_MIMETYPES:
                continue

            raw = att.content
            if raw is None:
                # NOTE(review): presumably the content is stored outside the
                # database row in this case; confirm against the HyperKitty
                # version in use.
                self.stdout.write(
                    self.style.WARNING(f"Skipping attachment {filename}: no inline content.")
                )
                continue

            charset = att.encoding or "utf-8"
            if isinstance(raw, str):
                content = raw
            else:
                try:
                    # bytes() also accepts memoryview/bytearray, which some
                    # database drivers return for binary columns.
                    # errors="ignore" silently drops undecodable bytes.
                    content = bytes(raw).decode(charset, errors="ignore")
                except LookupError:
                    self.stdout.write(
                        self.style.WARNING(
                            f"Skipping attachment {filename}: unknown encoding {charset}."
                        )
                    )
                    continue

            digest = hashlib.sha1(content.encode("utf-8", errors="ignore")).hexdigest()
            first_copy = digest not in seen_digests
            seen_digests.add(digest)

            updated, applied = self._apply_replacements(content, replacements)

            if simulate and first_copy:
                for old, new in applied:
                    key = (filename, old, new)
                    if key in seen_changes:
                        continue
                    seen_changes.add(key)
                    self.stdout.write(f" Change in attachment {filename}:")
                    self.stdout.write(f" - {old}")
                    self.stdout.write(f" + {new}")

            if updated == content:
                continue

            changed = True
            self.stdout.write(f"[Attachment] {filename} in {msg.subject}")

            if not simulate:
                encoded = updated.encode(charset)
                att.content = encoded
                att.size = len(encoded)
                att.save()

        return changed

    def handle(self, *args, **options):
        """Run the search and replace over the selected mailing list.

        Reads the ``list``, ``simulate``, ``replacements_file``,
        ``only_emails`` and ``only_attachments`` options.  Validation problems
        are reported on stdout and end the run without touching any data.
        """
        mailing_list = options["list"]
        simulate = options["simulate"]
        replacements_file = options["replacements_file"]
        only_emails = options["only_emails"]
        only_attachments = options["only_attachments"]

        # Validate flags
        if only_emails and only_attachments:
            self.stdout.write(self.style.ERROR("Cannot use --only-emails and --only-attachments together."))
            return

        replacements = self.load_replacements(replacements_file)

        if not replacements:
            self.stdout.write(self.style.ERROR("No valid replacements found."))
            return

        self.stdout.write(f"Loaded {len(replacements)} replacements.")
        emails = Email.objects.filter(mailinglist__name=mailing_list)
        self.stdout.write(f"Scanning {emails.count()} messages…")

        # Deduplicate emails by message_id: only the first row seen for a
        # given message_id is processed.
        # NOTE(review): if two rows can really share a message_id within one
        # list, the later rows are left unsanitized; confirm against the
        # Email model constraints.
        seen_ids = set()

        for msg in emails:
            if msg.message_id in seen_ids:
                continue
            seen_ids.add(msg.message_id)

            changed = False

            # --- Process email body ---
            if not only_attachments:
                if self._process_body(msg, replacements, simulate):
                    changed = True

            # --- Process attachments ---
            if not only_emails:
                if self._process_attachments(msg, replacements, simulate):
                    changed = True

            if changed and simulate:
                self.stdout.write(" (simulate mode: no changes saved)")

        self.stdout.write("\nDone.")
        if not simulate:
            self.stdout.write("Run `./manage.py rebuild_index` to refresh search index.")