diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile index 0e778f82b..d0460f246 100644 --- a/compose/local/django/Dockerfile +++ b/compose/local/django/Dockerfile @@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ libpq-dev \ # Translations dependencies gettext \ + # healthcheck dependencies + procps \ # cleaning up unused files && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \ && rm -rf /var/lib/apt/lists/* @@ -74,6 +76,10 @@ COPY ./compose/local/django/celery/flower/start /start-flower RUN sed -i 's/\r$//g' /start-flower RUN chmod +x /start-flower +# Copy celery scripts directory for healthcheck (CRLF stripped like the other scripts) +COPY ./compose/local/django/celery /celery +RUN sed -i 's/\r$//g' /celery/healthcheck.sh && chmod +x /celery/healthcheck.sh + # copy application code to WORKDIR COPY . ${APP_HOME} diff --git a/compose/local/django/celery/healthcheck.sh b/compose/local/django/celery/healthcheck.sh new file mode 100755 index 000000000..3e065a471 --- /dev/null +++ b/compose/local/django/celery/healthcheck.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Celery Worker Healthcheck Script +# +# This script checks that a Celery worker process exists and that the broker is reachable. +# It uses two checks: +# 1. Process check - is celery worker process running? +# 2. Redis connectivity - can we connect to the broker? (skipped when redis-cli is not installed) +# +# When used with the autoheal container, unhealthy workers will be +# automatically restarted. + +set -e + +# Check 1: Is the celery worker process running? +if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then + echo "ERROR: Celery worker process not found" >&2 + exit 1 +fi + +# Check 2: Can we connect to Redis (the broker)? +# Use redis-cli if available, otherwise skip +if command -v redis-cli > /dev/null 2>&1; then + if ! 
redis-cli -u "${CELERY_BROKER_URL:-redis://redis:6379}" ping > /dev/null 2>&1; then + echo "ERROR: Cannot connect to Redis broker" >&2 + exit 1 + fi +fi + +# All checks passed +exit 0 diff --git a/compose/local/django/celery/worker/start b/compose/local/django/celery/worker/start index 183a80159..f3255b474 100644 --- a/compose/local/django/celery/worker/start +++ b/compose/local/django/celery/worker/start @@ -3,5 +3,26 @@ set -o errexit set -o nounset +# Local development with auto-reload and optional debugging +# +# CELERY_DEBUG=1 - Enable debugpy for remote debugging on port 5678 +# CELERY_NO_RELOAD=1 - Disable watchfiles auto-reload +# +# Worker protections (same as production): +# --max-tasks-per-child=50 - Restart after 50 tasks (prevents memory leaks) +# --max-memory-per-child=4000000 - Restart if memory exceeds 4GB (4,000,000 KB) -exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO' +# Check if debugging is enabled +if [ "${CELERY_DEBUG:-0}" = "1" ]; then + echo "Starting Celery worker with debugpy on port 5678..." + exec python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000 +fi + +# Check if auto-reload should be disabled +if [ "${CELERY_NO_RELOAD:-0}" = "1" ]; then + echo "Starting Celery worker without auto-reload..." + exec celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000 +else + echo "Starting Celery worker with watchfiles auto-reload..." 
+ exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000' +fi diff --git a/compose/production/django/Dockerfile b/compose/production/django/Dockerfile index fd6b80ec1..920efeb2a 100644 --- a/compose/production/django/Dockerfile +++ b/compose/production/django/Dockerfile @@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ libpq-dev \ # Translations dependencies gettext \ + # healthcheck dependencies + procps \ # cleaning up unused files && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \ && rm -rf /var/lib/apt/lists/* @@ -81,6 +83,11 @@ RUN sed -i 's/\r$//g' /start-flower RUN chmod +x /start-flower +# Copy celery scripts directory for healthcheck (CRLF stripped like the other scripts) +COPY --chown=django:django ./compose/production/django/celery /celery +RUN sed -i 's/\r$//g' /celery/healthcheck.sh && chmod +x /celery/healthcheck.sh + + # copy application code to WORKDIR COPY --chown=django:django . ${APP_HOME} diff --git a/compose/production/django/celery/healthcheck.sh b/compose/production/django/celery/healthcheck.sh new file mode 100755 index 000000000..ac10abf6d --- /dev/null +++ b/compose/production/django/celery/healthcheck.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Celery Worker Healthcheck Script (Production) +# +# This script checks that a Celery worker process exists and that the broker is reachable. +# It uses two checks: +# 1. Process check - is celery worker process running? +# 2. Redis connectivity - can we connect to the broker? (skipped when redis-cli is not installed) +# +# When used with the autoheal container, unhealthy workers will be +# automatically restarted. + +set -e + +# Check 1: Is the celery worker process running? +if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then + echo "ERROR: Celery worker process not found" >&2 + exit 1 +fi + +# Check 2: Can we connect to Redis (the broker)? +# Use redis-cli if available, otherwise skip +if command -v redis-cli > /dev/null 2>&1; then + if ! 
redis-cli -u "${CELERY_BROKER_URL:-redis://redis:6379}" ping > /dev/null 2>&1; then + echo "ERROR: Cannot connect to Redis broker" >&2 + exit 1 + fi +fi + +# All checks passed +exit 0 diff --git a/compose/production/django/celery/worker/start b/compose/production/django/celery/worker/start index 9d41926e7..53ec0eb42 100644 --- a/compose/production/django/celery/worker/start +++ b/compose/production/django/celery/worker/start @@ -4,4 +4,23 @@ set -o errexit set -o pipefail set -o nounset -exec newrelic-admin run-program celery -A config.celery_app worker -l INFO +# Celery worker with built-in protections against stuck/leaking workers: +# +# --max-tasks-per-child=50 +# Restart worker process after 50 tasks to prevent memory leaks +# Conservative value since ML tasks can be memory-intensive +# +# --max-memory-per-child=4000000 +# Restart worker if memory exceeds 4GB (4,000,000 KB) +# Prevents runaway memory consumption from large images/models +# +# These options work in conjunction with the Docker healthcheck: +# - Healthcheck detects a MISSING worker process (and an unreachable broker when redis-cli is installed) +# - These options prevent RESOURCE LEAKS (memory/task buildup) +# - Autoheal restarts UNHEALTHY containers +# - restart:always brings containers back after any exit + +exec newrelic-admin run-program celery -A config.celery_app worker \ + -l INFO \ + --max-tasks-per-child=50 \ + --max-memory-per-child=4000000 diff --git a/docker-compose.production.yml b/docker-compose.production.yml index 099464277..207ebd571 100644 --- a/docker-compose.production.yml +++ b/docker-compose.production.yml @@ -29,12 +29,28 @@ services: ports: [] command: /start-celeryworker restart: always + healthcheck: + test: ["CMD-SHELL", "/celery/healthcheck.sh"] + interval: 30s # Check every 30 seconds + timeout: 15s # The healthcheck script must finish within 15s + retries: 3 # Mark unhealthy after 3 consecutive failures (90s total) + start_period: 60s # Grace period during container startup + labels: + - 
"autoheal=true" # Enable autoheal to restart this container when unhealthy celerybeat: <<: *django ports: [] command: /start-celerybeat restart: always + healthcheck: + test: ["CMD-SHELL", "pgrep -f '[c]elery.*beat' > /dev/null || exit 1"] + interval: 60s # Beat is less critical, check every minute + timeout: 10s + retries: 3 + start_period: 30s + labels: + - "autoheal=true" flower: <<: *django @@ -44,6 +60,25 @@ services: restart: always volumes: - ./data/flower/:/data/ + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 30s + labels: + - "autoheal=true" + + autoheal: + image: willfarrell/autoheal:latest + container_name: ami_production_autoheal + restart: always + environment: + - AUTOHEAL_CONTAINER_LABEL=autoheal + - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds + - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start + volumes: + - /var/run/docker.sock:/var/run/docker.sock awscli: build: diff --git a/docker-compose.worker.yml b/docker-compose.worker.yml index fb98c28f7..241854a87 100644 --- a/docker-compose.worker.yml +++ b/docker-compose.worker.yml @@ -25,3 +25,22 @@ services: ports: [] command: /start-celeryworker restart: always + healthcheck: + test: ["CMD-SHELL", "/celery/healthcheck.sh"] + interval: 30s # Check every 30 seconds + timeout: 15s # The healthcheck script must finish within 15s + retries: 3 # Mark unhealthy after 3 consecutive failures (90s total) + start_period: 60s # Grace period during container startup + labels: + - "autoheal=true" # Enable autoheal to restart this container when unhealthy + + autoheal: + image: willfarrell/autoheal:latest + container_name: ami_worker_autoheal + restart: always + environment: + - AUTOHEAL_CONTAINER_LABEL=autoheal + - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds + - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start 
+ volumes: + - /var/run/docker.sock:/var/run/docker.sock diff --git a/docker-compose.yml b/docker-compose.yml index ff9d125f0..80b50bc61 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -90,11 +90,21 @@ services: <<: *django image: ami_local_celeryworker scale: 1 - # For remote debugging with debugpy, should get overridden for production + # For remote debugging with debugpy, set CELERY_DEBUG=1 in environment + # To disable watchfiles auto-reload, set CELERY_NO_RELOAD=1 # Also make sure to install debugpy in your requirements/local.txt ports: - "5678:5678" - command: python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO + # environment: + # - CELERY_DEBUG=1 + # - CELERY_NO_RELOAD=1 + command: /start-celeryworker + healthcheck: + test: ["CMD-SHELL", "/celery/healthcheck.sh"] + interval: 30s # Check every 30 seconds + timeout: 15s # The healthcheck script must finish within 15s + retries: 3 # Mark unhealthy after 3 consecutive failures (90s total) + start_period: 60s # Grace period during container startup celerybeat: <<: *django