First commit
This commit is contained in:
commit
3860abf1f0
152
README.md
Normal file
152
README.md
Normal file
@ -0,0 +1,152 @@
|
||||
# HyperKitty Search‑and‑Replace Management Command
|
||||
|
||||
This Django management command performs **search and replace operations** on
|
||||
HyperKitty email bodies and text‑based attachments. It is designed for cases
|
||||
where sensitive information must be scrubbed from archived mailing‑list data.
|
||||
|
||||
The command supports simulation mode, selective processing, and deduplication of
|
||||
both emails and attachments.
|
||||
|
||||
---
|
||||
|
||||
## Features
|
||||
|
||||
- Replace sensitive strings in:
|
||||
- Email bodies
|
||||
- Text‑based attachments (`text/plain`, `text/html`, `application/xhtml+xml`)
|
||||
- Simulation mode (`--simulate`) to preview changes without saving
|
||||
- Deduplication of:
|
||||
- Emails (by `message_id`)
|
||||
- Attachments (by SHA‑1 content hash)
|
||||
- Flexible processing:
|
||||
- `--only-emails`
|
||||
- `--only-attachments`
|
||||
- Reads replacements from a simple text file using `shlex` parsing
|
||||
|
||||
---
|
||||
|
||||
## Installation
|
||||
|
||||
Place the command file in:
|
||||
|
||||
```
|
||||
/usr/lib/python3.10/site-packages/hyperkitty/management/commands/
|
||||
```
|
||||
|
||||
(or the equivalent path for your Python/Django installation)
|
||||
|
||||
The filename should match the command name, for example:
|
||||
|
||||
```
|
||||
sanitize_hyperkitty.py
|
||||
```
|
||||
|
||||
Django will automatically detect it as a management command.
|
||||
|
||||
---
|
||||
|
||||
## Usage
|
||||
|
||||
Run the command from your Django project directory:
|
||||
|
||||
```bash
|
||||
./manage.py sanitize_hyperkitty \
|
||||
--list mylist@example.com \
|
||||
--replacements-file replacements.txt
|
||||
```
|
||||
|
||||
### Common Options
|
||||
|
||||
| Option | Description |
|
||||
|--------|-------------|
|
||||
| `--list` | **Required.** Mailing list name (e.g. `team@lists.example.org`) |
|
||||
| `--replacements-file` | **Required.** Path to a file containing replacement rules |
|
||||
| `--simulate` | Show changes without saving them |
|
||||
| `--only-emails` | Process only email bodies |
|
||||
| `--only-attachments` | Process only attachments |
|
||||
|
||||
---
|
||||
|
||||
## Replacements File Format
|
||||
|
||||
The replacements file uses **shlex parsing**, allowing quoted strings.
|
||||
|
||||
Each line must contain **exactly two values**:
|
||||
|
||||
```
|
||||
old_value new_value
|
||||
```
|
||||
|
||||
### Examples
|
||||
|
||||
```
|
||||
password "********"
|
||||
"secret token" "[REDACTED]"
|
||||
john@example.com jane@example.com
|
||||
```
|
||||
|
||||
Blank lines and lines beginning with `#` are ignored.
|
||||
|
||||
---
|
||||
|
||||
## How It Works
|
||||
|
||||
### 1. Load Replacements
|
||||
The command reads the replacements file and builds a dictionary of `old → new`
|
||||
pairs. Malformed lines are skipped with warnings.
|
||||
|
||||
### 2. Fetch and Deduplicate Emails
|
||||
Emails are filtered by mailing list name and deduplicated by `message_id`.
|
||||
|
||||
### 3. Process Email Bodies
|
||||
If enabled, each email body is scanned and replacements are applied.
|
||||
|
||||
### 4. Process Attachments
|
||||
Attachments are:
|
||||
|
||||
- Deduplicated by SHA‑1 hash
|
||||
- Checked for text‑based MIME types
|
||||
- Decoded using the attachment’s encoding (UTF‑8 if none is set)
|
||||
- Updated and saved if modified
|
||||
|
||||
### 5. Simulation Mode
|
||||
If `--simulate` is used:
|
||||
|
||||
- Changes are printed to stdout
|
||||
- No data is saved
|
||||
|
||||
### 6. Rebuild Search Index
|
||||
|
||||
After real modifications, rebuild the HyperKitty search index:
|
||||
|
||||
```bash
|
||||
./manage.py rebuild_index
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Example
|
||||
|
||||
```bash
|
||||
./manage.py sanitize_hyperkitty \
|
||||
--list devteam@lists.example.org \
|
||||
--replacements-file scrub.txt \
|
||||
--simulate
|
||||
```
|
||||
|
||||
This will scan all messages, show what would change, and leave the database untouched.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- `--only-emails` and `--only-attachments` cannot be used together.
|
||||
- For attachments without a MIME type, the command falls back to detection based on the filename extension.
|
||||
- Non‑text attachments are skipped automatically.
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
This script is intended for administrative use within Django/HyperKitty
environments and is licensed under the GNU General Public License v3.0.
|
||||
193
sanitize_mail.py
Normal file
193
sanitize_mail.py
Normal file
@ -0,0 +1,193 @@
|
||||
import sys
|
||||
import shlex
|
||||
from django.core.management.base import BaseCommand
|
||||
from hyperkitty.models import Email, Attachment
|
||||
|
||||
# put command in /usr/lib/python3.10/site-packages/hyperkitty/management/commands
|
||||
|
||||
# Attachment content types that are treated as text and therefore eligible
# for search-and-replace; every other content type is skipped.
TEXT_MIMETYPES = {"text/plain", "text/html", "application/xhtml+xml"}
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Search and replace sensitive strings in HyperKitty archives.

    For every email of the mailing list given with ``--list`` the command
    applies the ``old -> new`` pairs read from ``--replacements-file`` to the
    email body and to each text attachment, saving the rows that changed.
    With ``--simulate`` the changes are only printed.
    """

    help = "Search and replace sensitive data in HyperKitty emails and attachments."

    # Filename suffixes treated as text when an attachment has no content type.
    _TEXT_SUFFIXES = (".htm", ".html", ".txt", ".xhtml")

    def add_arguments(self, parser):
        """Register the command-line options on ``parser``."""
        parser.add_argument(
            "--list",
            required=True,
            help="Mailing list name, e.g. bikeboard@lists.bikelover.org",
        )
        parser.add_argument(
            "--simulate",
            action="store_true",
            help="Show what would be changed without saving.",
        )
        parser.add_argument(
            "--replacements-file",
            required=True,
            help="Path to a text file containing replacements, one per line.",
        )
        parser.add_argument(
            "--only-emails",
            action="store_true",
            help="Process only email bodies, not attachments.",
        )
        parser.add_argument(
            "--only-attachments",
            action="store_true",
            help="Process only attachments, not email bodies.",
        )

    def load_replacements(self, filepath):
        """Read the replacements file and return a ``{old: new}`` dict.

        Each non-blank line that does not start with ``#`` is split with
        ``shlex.split`` and must yield exactly two values. Lines that cannot
        be parsed, have the wrong number of values, or have an empty search
        string are skipped with a warning. Later lines override earlier ones
        that use the same search string.

        Args:
            filepath: Path of the UTF-8 encoded replacements file.

        Returns:
            Dict mapping each search string to its replacement, in file order.

        Raises:
            OSError: If the file cannot be opened.
        """
        replacements = {}

        with open(filepath, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue

                try:
                    parts = shlex.split(line)
                except ValueError:
                    self.stdout.write(self.style.WARNING(f"Skipping malformed line: {line}"))
                    continue

                if len(parts) != 2:
                    self.stdout.write(self.style.WARNING(f"Skipping invalid line: {line}"))
                    continue

                old, new = parts
                if not old:
                    # str.replace("", new) inserts `new` between every
                    # character, which would destroy the archived text.
                    self.stdout.write(
                        self.style.WARNING(f"Skipping line with empty search string: {line}")
                    )
                    continue

                replacements[old] = new

        return replacements

    def handle(self, *args, **options):
        """Run the search-and-replace over one mailing list.

        Reads ``list``, ``simulate``, ``replacements_file``, ``only_emails``
        and ``only_attachments`` from ``options``. Emails sharing a
        ``message_id`` are processed once (the first one returned wins).
        """
        mailing_list = options["list"]
        simulate = options["simulate"]
        only_emails = options["only_emails"]
        only_attachments = options["only_attachments"]

        # The two "only" flags exclude each other's target, so together they
        # would leave nothing to process.
        if only_emails and only_attachments:
            self.stdout.write(self.style.ERROR("Cannot use --only-emails and --only-attachments together."))
            return

        replacements = self.load_replacements(options["replacements_file"])
        if not replacements:
            self.stdout.write(self.style.ERROR("No valid replacements found."))
            return

        self.stdout.write(f"Loaded {len(replacements)} replacements.")
        emails = Email.objects.filter(mailinglist__name=mailing_list)
        self.stdout.write(f"Scanning {emails.count()} messages…")

        seen_message_ids = set()
        for msg in emails:
            if msg.message_id in seen_message_ids:
                continue
            seen_message_ids.add(msg.message_id)

            changed = False
            if not only_attachments:
                changed = self._process_body(msg, replacements, simulate)
            if not only_emails:
                # Call first, then OR: the attachments must be processed even
                # when the body already changed.
                changed = self._process_attachments(msg, replacements, simulate) or changed

            if changed and simulate:
                self.stdout.write(" (simulate mode: no changes saved)")

        self.stdout.write("\nDone.")
        if not simulate:
            self.stdout.write("Run `./manage.py rebuild_index` to refresh search index.")

    @staticmethod
    def _substitute(text, replacements):
        """Apply every replacement to ``text`` in order; return ``(text, hits)``.

        ``hits`` lists the ``(old, new)`` pairs whose search string was present
        at the moment the pair was applied.
        """
        hits = []
        for old, new in replacements.items():
            if old in text:
                hits.append((old, new))
                text = text.replace(old, new)
        return text, hits

    def _report_change(self, where, old, new):
        """Print one simulated replacement for ``where`` (body or attachment)."""
        self.stdout.write(f" Change in {where}:")
        self.stdout.write(f" - {old}")
        self.stdout.write(f" + {new}")

    def _is_text_attachment(self, mime, filename):
        """Return True when the attachment should be treated as text.

        Uses the content type when there is one; otherwise falls back to the
        filename suffix.
        """
        if mime:
            return mime in TEXT_MIMETYPES
        return bool(filename) and filename.lower().endswith(self._TEXT_SUFFIXES)

    def _process_body(self, msg, replacements, simulate):
        """Sanitize the body of ``msg``; return True if it changed."""
        original = msg.content
        if not original:
            return False

        updated, hits = self._substitute(original, replacements)
        if simulate:
            for old, new in hits:
                self._report_change("email body", old, new)

        if updated == original:
            return False

        self.stdout.write(f"[Email] {msg.subject}")
        if not simulate:
            msg.content = updated
            msg.save()
        return True

    def _process_attachments(self, msg, replacements, simulate):
        """Sanitize the text attachments of ``msg``; return True if any changed.

        Attachments with identical content (same SHA-1) are reported only
        once, but every copy is rewritten: each copy is a separate row, and
        skipping it would leave the sensitive text in the archive.
        """
        # Function-scope imports keep this method independent of the
        # module-level import block.
        import codecs
        import hashlib

        changed = False
        seen_digests = set()   # content hashes already reported for this email
        seen_changes = set()   # (filename, old, new) triples already printed

        for att in Attachment.objects.filter(email=msg):
            mime = getattr(att, "content_type", None)
            filename = getattr(att, "name", None)
            if not self._is_text_attachment(mime, filename):
                continue

            raw = att.content
            if raw is None:
                # NOTE(review): presumably the content is stored outside the
                # database in this case - confirm and sanitize it separately.
                self.stdout.write(
                    self.style.WARNING(f"Skipping attachment without stored content: {filename}")
                )
                continue

            codec = att.encoding or "utf-8"
            try:
                codecs.lookup(codec)
            except LookupError:
                # Unknown charset label: fall back instead of aborting the run.
                codec = "utf-8"

            if isinstance(raw, str):
                original = raw
                payload = raw.encode("utf-8", errors="ignore")
            else:
                # bytes() also accepts bytearray/memoryview, which some
                # database backends may return for binary columns
                # (NOTE(review): confirm for your backend).
                payload = bytes(raw)
                # NOTE: errors="ignore" drops undecodable bytes, so a saved
                # attachment loses them; kept from the original behavior.
                original = payload.decode(codec, errors="ignore")

            digest = hashlib.sha1(payload).hexdigest()
            first_copy = digest not in seen_digests
            seen_digests.add(digest)
            if simulate and not first_copy:
                # Nothing is saved in simulate mode and the first copy has
                # already been reported.
                continue

            updated, hits = self._substitute(original, replacements)

            if simulate:
                for old, new in hits:
                    key = (filename, old, new)
                    if key in seen_changes:
                        continue
                    seen_changes.add(key)
                    self._report_change(f"attachment {filename}", old, new)

            if updated == original:
                continue

            changed = True
            if first_copy:
                self.stdout.write(f"[Attachment] {filename} in {msg.subject}")
            if simulate:
                continue

            try:
                encoded = updated.encode(codec)
            except UnicodeEncodeError as exc:
                # A replacement contains characters the attachment's charset
                # cannot represent; report it rather than abort mid-run.
                self.stdout.write(
                    self.style.ERROR(f"Cannot encode attachment {filename} as {codec}: {exc}")
                )
                continue

            att.content = encoded
            att.size = len(encoded)
            att.save()

        return changed
|
||||
Loading…
x
Reference in New Issue
Block a user