Source code for minimost.clean

"""
minimost.clean
==============

Maintenance utilities for purging old uploads and messages.

There are two kinds of limit — **age-based** retention and **size-based** caps:

* :func:`delete_files_older_than` removes file attachments from ``uploads/``
  once they pass a per-type age threshold.
* :func:`delete_messages_older_than` hard-deletes message rows from every
  ``*.db`` file in the users directory once they pass an age threshold.
* :func:`delete_files_over_size` deletes the oldest files in ``uploads/`` until
  the directory's total size is back under a cap.
* :func:`delete_messages_over_size` deletes the oldest messages from
  ``messages.db`` until the database is back under a cap.

All are called automatically by a background daemon thread started in
:func:`minimost.create_app` — no cron job or external scheduler is required.
The thread runs 5 minutes after startup and repeats every 24 hours.  Settings
are read from ``settings.json`` on each run:

* ``"image_retention_days"`` — image file attachments (default: 30 days).
* ``"file_retention_days"`` — all other file attachments (default: 30 days).
* ``"message_retention_days"`` — messages in the message database (default: 770
  days).
* ``"max_upload_dir_size_mb"`` — total size cap for the ``uploads/`` directory;
  oldest files are deleted when exceeded (``0`` or absent disables the cap).
* ``"max_message_db_size_mb"`` — size cap for the shared message database;
  oldest messages are deleted when exceeded (``0`` or absent disables the cap).

This module can also be invoked directly for ad-hoc cleanup:

.. code-block:: bash

    python3 src/minimost/clean.py
"""

import sqlite3
from pathlib import Path
import time

_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp"}


def _maybe_delete_file(
    path: Path, image_cutoff: float, file_cutoff: float, dry_run: bool
) -> None:
    """Delete *path* if its modification time predates the applicable cutoff.

    The cutoff is *image_cutoff* when the file's lower-cased suffix is listed
    in ``_IMAGE_EXTENSIONS`` and *file_cutoff* otherwise.  Both cutoffs are
    compared directly against ``st_mtime``.

    :param path: File to examine.
    :param image_cutoff: Cutoff applied to files with an image suffix.
    :param file_cutoff: Cutoff applied to every other file.
    :param dry_run: If ``True``, report the file instead of deleting it.
    """
    try:
        modified_at = path.stat().st_mtime
    except OSError:
        # The file cannot be inspected (e.g. it vanished); nothing to do.
        return

    if path.suffix.lower() in _IMAGE_EXTENSIONS:
        threshold = image_cutoff
    else:
        threshold = file_cutoff

    if modified_at >= threshold:
        return  # not older than the cutoff

    if dry_run:
        print(f"[DRY RUN] Would delete: {path}")
    else:
        try:
            path.unlink()
            print(f"Deleted: {path}")
        except FileNotFoundError:
            pass  # already removed by another process


def delete_files_older_than(
    directory: str,
    image_days: int,
    file_days: int,
    dry_run: bool = False,
):
    """Purge files from *directory* once they outlive their retention period.

    Two retention periods apply: image files (jpg, jpeg, png, gif, webp) are
    removed when older than *image_days*, every other file when older than
    *file_days*.  Only regular files directly inside *directory* are
    considered; the per-file decision is delegated to
    :func:`_maybe_delete_file`.

    :param directory: Path to the directory to clean.
    :type directory: str
    :param image_days: Retention period in days for image files.
    :type image_days: int
    :param file_days: Retention period in days for non-image files.
    :type file_days: int
    :param dry_run: If ``True``, only print what would be deleted without
        removing any files. Defaults to ``False``.
    :type dry_run: bool
    :raises ValueError: If *directory* does not exist or is not a directory.
    """
    # Both cutoffs share one "now" so the two retention windows are measured
    # from the same instant.
    started = time.time()
    image_cutoff = started - (image_days * 86400)
    file_cutoff = started - (file_days * 86400)

    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"{directory} is not a valid directory")

    for candidate in root.iterdir():
        if not candidate.is_file():
            continue  # subdirectories and other non-files are left alone
        _maybe_delete_file(candidate, image_cutoff, file_cutoff, dry_run)
def delete_files_over_size(
    directory: str,
    max_size_mb: float,
    dry_run: bool = False,
) -> None:
    """Trim *directory* to a size cap by deleting its oldest files first.

    The sizes of all regular files directly inside *directory* are summed and
    compared with *max_size_mb*.  When the sum exceeds the cap, files are
    removed in order of ascending modification time until the running total is
    at or below the cap.  This limit is independent of the age-based retention
    in :func:`delete_files_older_than`.

    Entries that are not regular files (e.g. subdirectories) are neither
    counted nor deleted.  A cap of ``0`` (or any non-positive value) disables
    the check entirely.

    :param directory: Path to the directory to bound.
    :type directory: str
    :param max_size_mb: Maximum combined size in mebibytes. Non-positive
        disables the check.
    :type max_size_mb: float
    :param dry_run: If ``True``, only print what would be deleted without
        removing any files. Defaults to ``False``.
    :type dry_run: bool
    :raises ValueError: If *directory* does not exist or is not a directory.
    """
    if not max_size_mb or max_size_mb <= 0:
        return

    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"{directory} is not a valid directory")

    limit = int(max_size_mb * 1024 * 1024)

    # Record (mtime, size, path) per regular file.  A file may be removed by
    # someone else between listing and stat(); such files are simply skipped.
    snapshot = []
    for candidate in root.iterdir():
        if candidate.is_file():
            try:
                info = candidate.stat()
            except OSError:
                continue
            snapshot.append((info.st_mtime, info.st_size, candidate))

    remaining = sum(nbytes for _stamp, nbytes, _candidate in snapshot)
    if remaining <= limit:
        return

    # Ascending mtime: the newest uploads are the last candidates for removal.
    for _stamp, nbytes, victim in sorted(snapshot, key=lambda item: item[0]):
        if remaining <= limit:
            break
        if dry_run:
            print(f"[DRY RUN] Would delete (size cap): {victim}")
        else:
            try:
                victim.unlink()
            except FileNotFoundError:
                pass  # already removed by another worker; still counts as freed
            else:
                print(f"Deleted (size cap): {victim}")
        remaining -= nbytes
def delete_messages_older_than(users_dir: str, days: int, dry_run: bool = False):
    """Hard-delete messages older than *days* from every database in a directory.

    Iterates every ``*.db`` file in *users_dir* (in sorted order) and hands
    each one to :func:`_clean_user_db` together with the cutoff timestamp.
    Each database is processed independently: a failure on one file is
    reported on stdout and the run continues with the next file, so a single
    corrupted or locked database does not abort the whole cleanup.

    :param users_dir: Path to the directory containing the ``.db`` files.
    :type users_dir: str
    :param days: Messages older than this many days are deleted.
    :type days: int
    :param dry_run: If ``True``, print what would be deleted without making
        any changes. Defaults to ``False``.
    :type dry_run: bool
    :raises ValueError: If *users_dir* does not exist or is not a directory.
    """
    cutoff = time.time() - (days * 86400)
    dirpath = Path(users_dir)
    if not dirpath.is_dir():
        raise ValueError(f"{users_dir} is not a valid directory")
    for db_file in sorted(dirpath.glob("*.db")):
        try:
            _clean_user_db(db_file, cutoff, dry_run)
        except Exception as exc:
            # Best effort: one bad DB must not stop the rest.  Report it
            # instead of swallowing it silently, otherwise a database that
            # fails on every run would never be noticed.
            print(f"Skipped {db_file.name}: {type(exc).__name__}: {exc}")
def _live_size_bytes(conn) -> int:
    """Return the size in bytes of the *live* (non-free) pages of a database.

    The value is ``(page_count - freelist_count) × page_size``.  ``page_count``
    includes pages sitting on the freelist after deletes; subtracting them
    gives the amount of data a compacted file would hold.  Because the figure
    comes from SQLite's own counters rather than from the file system, it does
    not depend on the size of the main file or of a WAL file on disk.

    :param conn: Open :mod:`sqlite3` connection to the database to measure.
    :returns: Size of the in-use pages, in bytes.
    """
    queries = (
        "PRAGMA page_count",
        "PRAGMA freelist_count",
        "PRAGMA page_size",
    )
    total_pages, free_pages, bytes_per_page = (
        conn.execute(sql).fetchone()[0] for sql in queries
    )
    return (total_pages - free_pages) * bytes_per_page
def delete_messages_over_size(
    db_path: str,
    max_size_mb: float,
    dry_run: bool = False,
    batch: int = 1000,
) -> None:
    """Delete the oldest messages until the message database fits a size cap.

    Size is measured as the live (non-free-page) data size — see
    :func:`_live_size_bytes` — so free pages left behind by earlier deletes
    never trigger a deletion on their own.  When that size exceeds
    *max_size_mb*, the oldest messages (lowest ``ts``, ties broken by ``id``)
    are deleted in batches of at most *batch* rows until the database is back
    under the cap, the table is empty, or a batch fails to reduce the live
    size.  Afterwards the freed pages are reclaimed with one ``VACUUM`` and
    the WAL is checkpointed so the on-disk file shrinks to match.  ``VACUUM``
    runs at most once per call, and only when something was actually pruned.

    A size cap of ``0`` (or any non-positive value) disables the check.  A
    missing database file, or one without a ``messages`` table, is ignored.
    The database is opened to measure it whenever the file exists; it is only
    modified when it is over the cap and *dry_run* is false.

    :param db_path: Path to the shared ``messages.db`` file.
    :type db_path: str
    :param max_size_mb: Maximum allowed size in mebibytes. Non-positive
        disables the cap.
    :type max_size_mb: float
    :param dry_run: If ``True``, only report what would be deleted.
    :type dry_run: bool
    :param batch: Maximum number of oldest messages to delete per cycle.
        Must be at least 1.
    :type batch: int
    :raises ValueError: If *batch* is less than 1.
    """
    if not max_size_mb or max_size_mb <= 0:
        return
    if batch < 1:
        # SQLite treats a negative LIMIT as "no limit", which would wipe the
        # whole table in one pass; reject nonsensical batch sizes up front.
        raise ValueError(f"batch must be a positive integer, got {batch!r}")
    path = Path(db_path)
    if not path.is_file():
        return
    max_bytes = int(max_size_mb * 1024 * 1024)
    conn = sqlite3.connect(str(path))
    try:
        if not _has_table(conn, "messages"):
            return
        size = _live_size_bytes(conn)
        if size <= max_bytes:
            return
        if dry_run:
            print(
                f"[DRY RUN] {path.name} holds {size / 1048576:.1f} MiB of "
                f"messages, over the {max_size_mb} MiB cap; would delete the "
                f"oldest"
            )
            return
        # Only switch journal mode once we know we are going to write, so the
        # under-cap and dry-run paths leave the database untouched.
        conn.execute("PRAGMA journal_mode=WAL")
        has_reactions = _has_table(conn, "reactions")
        deleted_total = 0
        while size > max_bytes:
            # Select the victims inside the statement instead of binding one
            # parameter per id: a large batch would otherwise exceed SQLite's
            # limit on bound parameters (999 by default before SQLite 3.32).
            cur = conn.execute(
                "DELETE FROM messages WHERE id IN ("
                "SELECT id FROM messages ORDER BY ts ASC, id ASC LIMIT ?)",
                (batch,),
            )
            removed = cur.rowcount
            # Reactions reference messages by id; drop any now-orphaned rows.
            # (The FTS index self-cleans via its delete trigger.)
            if removed > 0 and has_reactions:
                conn.execute(
                    "DELETE FROM reactions WHERE message_id NOT IN "
                    "(SELECT id FROM messages)"
                )
            # Always commit so no transaction is left open before VACUUM.
            conn.commit()
            if removed <= 0:
                break  # table is empty but the rest of the file exceeds the cap
            deleted_total += removed
            new_size = _live_size_bytes(conn)
            if new_size >= size:
                # A batch freed no whole page (e.g. many tiny rows). Stop
                # rather than keep deleting without getting closer to the cap.
                break
            size = new_size
        if deleted_total:
            # Reclaim the freed pages and collapse the WAL into the main file
            # so the on-disk size reflects the deletions immediately.  This
            # runs at most once per call and only after a prune.
            conn.execute("VACUUM")
            conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
            print(
                f"Deleted {deleted_total} oldest messages from {path.name} "
                f"to stay under {max_size_mb} MiB"
            )
    finally:
        conn.close()
def _has_table(conn, name: str) -> bool:
    """Return ``True`` if the database behind *conn* has a table called *name*."""
    row = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (name,)
    ).fetchone()
    return row is not None


def _clean_user_db(db_file: Path, cutoff: float, dry_run: bool) -> None:
    """Delete messages with ``ts`` older than *cutoff* from one database file.

    A database without a ``messages`` table is left untouched.  In dry-run
    mode the number of matching rows is reported and nothing is written.
    Otherwise the matching rows are deleted, reactions that no longer
    reference an existing message are removed, and — the first time the
    database is seen with ``auto_vacuum`` disabled — ``auto_vacuum`` is
    switched to ``FULL`` and the file is compacted with ``VACUUM``.

    :param db_file: Path of the SQLite database to clean.
    :param cutoff: Timestamp threshold; rows with ``ts < cutoff`` are deleted.
    :param dry_run: If ``True``, only report what would be deleted.
    """
    # try/finally guarantees the connection is closed even when a query raises
    # (e.g. a locked DB or a VACUUM error). The caller handles the exception,
    # so without this the connection would leak and surface as a
    # ResourceWarning.
    conn = sqlite3.connect(str(db_file))
    try:
        if not _has_table(conn, "messages"):
            return  # not a message database; nothing to purge

        if dry_run:
            # Read-only path: count, report, and leave the file untouched.
            count = conn.execute(
                "SELECT COUNT(*) FROM messages WHERE ts < ?", (cutoff,)
            ).fetchone()[0]
            if count > 0:
                print(f"[DRY RUN] Would delete {count} messages from {db_file.name}")
            return

        conn.execute("PRAGMA journal_mode=WAL")
        cur = conn.execute("DELETE FROM messages WHERE ts < ?", (cutoff,))
        if cur.rowcount > 0:
            print(f"Deleted {cur.rowcount} messages from {db_file.name}")
        # Reactions reference messages by id; drop any now-orphaned rows so
        # they don't accumulate. (The FTS index self-cleans via its trigger.)
        if _has_table(conn, "reactions"):
            conn.execute(
                "DELETE FROM reactions WHERE message_id NOT IN (SELECT id FROM messages)"
            )
        conn.commit()
        # Changing auto_vacuum on an existing database only takes effect after
        # a VACUUM, so the two are issued together, once per database.
        if conn.execute("PRAGMA auto_vacuum").fetchone()[0] == 0:
            conn.execute("PRAGMA auto_vacuum = FULL")
            conn.execute("VACUUM")
    finally:
        conn.close()


if __name__ == "__main__":
    # Ad-hoc cleanup with fixed limits when the module is run as a script.
    delete_files_older_than(directory="uploads", image_days=30, file_days=30)
    delete_files_over_size(directory="uploads", max_size_mb=2048)
    delete_messages_older_than(users_dir="users", days=770)
    delete_messages_over_size(db_path="users/messages.db", max_size_mb=1024)