6 changes: 6 additions & 0 deletions compose/local/django/Dockerfile
@@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
libpq-dev \
# Translations dependencies
gettext \
# healthcheck dependencies
procps \
# cleaning up unused files
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
@@ -74,6 +76,10 @@ COPY ./compose/local/django/celery/flower/start /start-flower
RUN sed -i 's/\r$//g' /start-flower
RUN chmod +x /start-flower

# Copy celery scripts directory for healthcheck
COPY ./compose/local/django/celery /celery
RUN chmod +x /celery/healthcheck.sh


# copy application code to WORKDIR
COPY . ${APP_HOME}
31 changes: 31 additions & 0 deletions compose/local/django/celery/healthcheck.sh
@@ -0,0 +1,31 @@
#!/bin/bash
#
# Celery Worker Healthcheck Script
#
# This script checks if the Celery worker process is running and responsive.
# It uses two checks:
# 1. Process check - is celery worker process running?
# 2. Redis connectivity - can we connect to the broker?
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.

set -e

# Check 1: Is the celery worker process running?
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
    echo "ERROR: Celery worker process not found" >&2
    exit 1
fi

# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
Comment on lines +21 to +28

⚠️ Potential issue | 🔴 Critical

Fix Redis broker URL parsing.

Line 24 uses ${CELERY_BROKER_URL:-redis} directly as the hostname argument to redis-cli, but CELERY_BROKER_URL is typically a full URL like redis://redis:6379/0, not just a hostname. This will cause the healthcheck to fail when the environment variable is set.

Apply this diff to properly extract the hostname from the broker URL:

 # Check 2: Can we connect to Redis (the broker)?
 # Use redis-cli if available, otherwise skip
 if command -v redis-cli > /dev/null 2>&1; then
-    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
+    REDIS_HOST=$(echo "${CELERY_BROKER_URL:-redis://redis:6379}" | sed -E 's|^redis://([^:/@]+).*|\1|')
+    if ! timeout 5 redis-cli -h "${REDIS_HOST}" ping > /dev/null 2>&1; then
         echo "ERROR: Cannot connect to Redis broker" >&2
         exit 1
     fi
 fi

This also adds a 5-second timeout to prevent the healthcheck from hanging indefinitely if Redis is unresponsive.

🤖 Prompt for AI Agents
In compose/local/django/celery/healthcheck.sh around lines 21 to 28, the script
currently passes CELERY_BROKER_URL (a full URL) directly to redis-cli -h which
fails; instead parse CELERY_BROKER_URL to extract host and port (strip scheme
like redis://, remove any userinfo before @, cut off any path/query after first
/, then split host[:port] into host and port), default to host "redis" and port
"6379" if missing, and call redis-cli with both -h host -p port and a 5-second
timeout (redis-cli -h <host> -p <port> -t 5 ping) so the healthcheck correctly
connects and won’t hang.
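
For illustration, a rough shell sketch of the host/port parsing the prompt describes, using only parameter expansion plus the coreutils timeout wrapper (variable names and defaults here are illustrative, not code from this PR):

    # Sketch: derive host and port from a redis:// URL, falling back to redis:6379
    BROKER_URL="${CELERY_BROKER_URL:-redis://redis:6379/0}"
    HOSTPORT="${BROKER_URL#redis://}"   # drop the scheme
    HOSTPORT="${HOSTPORT#*@}"           # drop an optional user:password@ prefix
    HOSTPORT="${HOSTPORT%%/*}"          # drop the /db suffix
    REDIS_HOST="${HOSTPORT%%:*}"
    REDIS_PORT="${HOSTPORT##*:}"
    [ "${REDIS_PORT}" = "${REDIS_HOST}" ] && REDIS_PORT=6379   # URL had no explicit port
    if ! timeout 5 redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi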


# All checks passed
exit 0
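
Assuming the compose service is named celeryworker (the service name is not shown in these hunks), the script can also be exercised by hand once the stack is up:

    # Run the healthcheck inside the worker container and show its exit code
    docker compose exec celeryworker /celery/healthcheck.sh; echo "exit=$?"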
23 changes: 22 additions & 1 deletion compose/local/django/celery/worker/start
@@ -3,5 +3,26 @@
set -o errexit
set -o nounset

# Local development with auto-reload and optional debugging
#
# CELERY_DEBUG=1 - Enable debugpy for remote debugging on port 5678
# CELERY_NO_RELOAD=1 - Disable watchfiles auto-reload
#
# Worker protections (same as production):
# --max-tasks-per-child=50 - Restart after 50 tasks (prevents memory leaks)
# --max-memory-per-child=4000000 - Restart if memory exceeds 4GB

exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO'
# Check if debugging is enabled
if [ "${CELERY_DEBUG:-0}" = "1" ]; then
echo "Starting Celery worker with debugpy on port 5678..."
exec python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
fi

# Check if auto-reload should be disabled
if [ "${CELERY_NO_RELOAD:-0}" = "1" ]; then
echo "Starting Celery worker without auto-reload..."
exec celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
else
echo "Starting Celery worker with watchfiles auto-reload..."
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000'
fi
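
A quick way to exercise these toggles during local development might look like the following (assumes the service is named celeryworker and the default compose file is used; shown only as a sketch):

    # One-off worker with debugpy listening; --service-ports publishes 5678 from the service definition
    docker compose run --rm --service-ports -e CELERY_DEBUG=1 celeryworker

    # One-off worker without watchfiles auto-reload
    docker compose run --rm -e CELERY_NO_RELOAD=1 celeryworker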
7 changes: 7 additions & 0 deletions compose/production/django/Dockerfile
@@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
libpq-dev \
# Translations dependencies
gettext \
# healthcheck dependencies
procps \
# cleaning up unused files
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
@@ -81,6 +83,11 @@ RUN sed -i 's/\r$//g' /start-flower
RUN chmod +x /start-flower


# Copy celery scripts directory for healthcheck
COPY --chown=django:django ./compose/production/django/celery /celery
RUN chmod +x /celery/healthcheck.sh


# copy application code to WORKDIR
COPY --chown=django:django . ${APP_HOME}

31 changes: 31 additions & 0 deletions compose/production/django/celery/healthcheck.sh
@@ -0,0 +1,31 @@
#!/bin/bash
#
# Celery Worker Healthcheck Script (Production)
#
# This script checks if the Celery worker process is running and responsive.
# It uses two checks:
# 1. Process check - is celery worker process running?
# 2. Redis connectivity - can we connect to the broker?
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.

set -e

# Check 1: Is the celery worker process running?
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
    echo "ERROR: Celery worker process not found" >&2
    exit 1
fi

# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
Comment on lines +21 to +28

⚠️ Potential issue | 🔴 Critical

Fix Redis broker URL parsing.

Line 24 uses ${CELERY_BROKER_URL:-redis} directly as the hostname argument to redis-cli, but CELERY_BROKER_URL is typically a full URL like redis://redis:6379/0, not just a hostname. This will cause the healthcheck to fail when the environment variable is set.

Apply this diff to properly extract the hostname from the broker URL:

 # Check 2: Can we connect to Redis (the broker)?
 # Use redis-cli if available, otherwise skip
 if command -v redis-cli > /dev/null 2>&1; then
-    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
+    REDIS_HOST=$(echo "${CELERY_BROKER_URL:-redis://redis:6379}" | sed -E 's|^redis://([^:/@]+).*|\1|')
+    if ! timeout 5 redis-cli -h "${REDIS_HOST}" ping > /dev/null 2>&1; then
         echo "ERROR: Cannot connect to Redis broker" >&2
         exit 1
     fi
 fi

This also adds a 5-second timeout to prevent the healthcheck from hanging indefinitely if Redis is unresponsive.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    REDIS_HOST=$(echo "${CELERY_BROKER_URL:-redis://redis:6379}" | sed -E 's|^redis://([^:/@]+).*|\1|')
    if ! timeout 5 redis-cli -h "${REDIS_HOST}" ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
🤖 Prompt for AI Agents
In compose/production/django/celery/healthcheck.sh around lines 21 to 28, the
script currently passes ${CELERY_BROKER_URL:-redis} directly to redis-cli which
fails when CELERY_BROKER_URL is a full URL (e.g. redis://redis:6379/0); update
the script to parse CELERY_BROKER_URL to extract host (and optionally port)
using shell string manipulation or a simple URL parse (fall back to "redis" host
if unset), then call redis-cli with -h <host> and -p <port> as appropriate and
include a connection timeout (e.g. --connect-timeout 5 or use redis-cli -t 5) so
the healthcheck fails fast on unresponsive Redis.
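
An alternative sketch that sidesteps URL parsing entirely by letting redis-cli consume the URI (assumes the redis-cli shipped in the image supports the -u flag; illustrative only, not part of this PR):

    # Sketch: ping via the broker URL itself instead of extracting host/port
    if ! timeout 5 redis-cli -u "${CELERY_BROKER_URL:-redis://redis:6379/0}" ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi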


# All checks passed
exit 0
21 changes: 20 additions & 1 deletion compose/production/django/celery/worker/start
@@ -4,4 +4,23 @@ set -o errexit
set -o pipefail
set -o nounset

exec newrelic-admin run-program celery -A config.celery_app worker -l INFO
# Celery worker with built-in protections against stuck/leaking workers:
#
# --max-tasks-per-child=50
# Restart worker process after 50 tasks to prevent memory leaks
# Conservative value since ML tasks can be memory-intensive
#
# --max-memory-per-child=4000000
# Restart worker if memory exceeds 4GB (4,000,000 KB)
# Prevents runaway memory consumption from large images/models
#
# These options work in conjunction with the Docker healthcheck:
# - Healthcheck detects STUCK workers (not responding to ping)
# - These options prevent RESOURCE LEAKS (memory/task buildup)
# - Autoheal restarts UNHEALTHY containers
# - restart:always brings containers back after any exit

exec newrelic-admin run-program celery -A config.celery_app worker \
    -l INFO \
    --max-tasks-per-child=50 \
    --max-memory-per-child=4000000
35 changes: 35 additions & 0 deletions docker-compose.production.yml
@@ -29,12 +29,28 @@ services:
    ports: []
    command: /start-celeryworker
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
      interval: 30s # Check every 30 seconds
      timeout: 15s # Healthcheck must complete within 15s (ping timeout is 10s + overhead)
      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
      start_period: 60s # Grace period during container startup
    labels:
      - "autoheal=true" # Enable autoheal to restart this container when unhealthy

  celerybeat:
    <<: *django
    ports: []
    command: /start-celerybeat
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "pgrep -f 'celery.*beat' > /dev/null || exit 1"]
      interval: 60s # Beat is less critical, check every minute
      timeout: 10s
      retries: 3
      start_period: 30s
    labels:
      - "autoheal=true"

  flower:
    <<: *django
@@ -44,6 +60,25 @@
    restart: always
    volumes:
      - ./data/flower/:/data/
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
    labels:
      - "autoheal=true"

  autoheal:
    image: willfarrell/autoheal:latest
    container_name: ami_production_autoheal
    restart: always
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
      - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock

  awscli:
    build:
19 changes: 19 additions & 0 deletions docker-compose.worker.yml
@@ -25,3 +25,22 @@ services:
    ports: []
    command: /start-celeryworker
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
      interval: 30s # Check every 30 seconds
      timeout: 15s # Healthcheck must complete within 15s (ping timeout is 10s + overhead)
      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
      start_period: 60s # Grace period during container startup
    labels:
      - "autoheal=true" # Enable autoheal to restart this container when unhealthy

  autoheal:
    image: willfarrell/autoheal:latest
    container_name: ami_worker_autoheal
    restart: always
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
      - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
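
To verify the healthcheck is actually being evaluated once the stack is running, standard Docker commands work (the container name below is a placeholder):

    # Current health status of the worker container
    docker inspect --format '{{.State.Health.Status}}' <celeryworker_container>

    # Containers currently marked unhealthy (candidates for an autoheal restart)
    docker ps --filter health=unhealthy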
14 changes: 12 additions & 2 deletions docker-compose.yml
@@ -90,11 +90,21 @@ services:
    <<: *django
    image: ami_local_celeryworker
    scale: 1
    # For remote debugging with debugpy, should get overridden for production
    # For remote debugging with debugpy, set CELERY_DEBUG=1 in environment
    # To disable watchfiles auto-reload, set CELERY_NO_RELOAD=1
    # Also make sure to install debugpy in your requirements/local.txt
    ports:
      - "5678:5678"
    command: python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO
    # environment:
    #   - CELERY_DEBUG=1
    #   - CELERY_NO_RELOAD=1
    command: /start-celeryworker
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
      interval: 30s # Check every 30 seconds
      timeout: 15s # Healthcheck must complete within 15s (ping timeout is 10s + overhead)
      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
      start_period: 60s # Grace period during container startup

  celerybeat:
    <<: *django