6 changes: 6 additions & 0 deletions compose/local/django/Dockerfile
@@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
libpq-dev \
# Translations dependencies
gettext \
# healthcheck dependencies
procps \
# cleaning up unused files
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
@@ -74,6 +76,10 @@ COPY ./compose/local/django/celery/flower/start /start-flower
RUN sed -i 's/\r$//g' /start-flower
RUN chmod +x /start-flower

# Copy celery scripts directory for healthcheck
COPY ./compose/local/django/celery /celery
RUN chmod +x /celery/healthcheck.sh


# copy application code to WORKDIR
COPY . ${APP_HOME}
31 changes: 31 additions & 0 deletions compose/local/django/celery/healthcheck.sh
@@ -0,0 +1,31 @@
#!/bin/bash
#
# Celery Worker Healthcheck Script
#
# This script checks if the Celery worker process is running and responsive.
# It uses two checks:
# 1. Process check - is celery worker process running?
# 2. Redis connectivity - can we connect to the broker?
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.

set -e

# Check 1: Is the celery worker process running?
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
    echo "ERROR: Celery worker process not found" >&2
    exit 1
fi

# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
Comment on lines +21 to +28

⚠️ Potential issue | 🔴 Critical

Fix Redis broker URL parsing.

Line 24 uses ${CELERY_BROKER_URL:-redis} directly as the hostname argument to redis-cli, but CELERY_BROKER_URL is typically a full URL like redis://redis:6379/0, not just a hostname. This will cause the healthcheck to fail when the environment variable is set.

Apply this diff to properly extract the hostname from the broker URL:

 # Check 2: Can we connect to Redis (the broker)?
 # Use redis-cli if available, otherwise skip
 if command -v redis-cli > /dev/null 2>&1; then
-    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
+    REDIS_HOST=$(echo "${CELERY_BROKER_URL:-redis://redis:6379}" | sed -E 's|^redis://([^:/@]+).*|\1|')
+    if ! timeout 5 redis-cli -h "${REDIS_HOST}" ping > /dev/null 2>&1; then
         echo "ERROR: Cannot connect to Redis broker" >&2
         exit 1
     fi
 fi

This also adds a 5-second timeout to prevent the healthcheck from hanging indefinitely if Redis is unresponsive.

🤖 Prompt for AI Agents
In compose/local/django/celery/healthcheck.sh around lines 21 to 28, the script
currently passes CELERY_BROKER_URL (a full URL) directly to redis-cli -h which
fails; instead parse CELERY_BROKER_URL to extract host and port (strip scheme
like redis://, remove any userinfo before @, cut off any path/query after first
/, then split host[:port] into host and port), default to host "redis" and port
"6379" if missing, and call redis-cli with both -h host -p port and a 5-second
timeout (redis-cli -h <host> -p <port> -t 5 ping) so the healthcheck correctly
connects and won’t hang.
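
For illustration, a rough shell sketch of the host/port parsing the prompt describes, using only parameter expansion plus the coreutils timeout wrapper (variable names and defaults here are illustrative, not code from this PR):

    # Sketch: derive host and port from a redis:// URL, falling back to redis:6379
    BROKER_URL="${CELERY_BROKER_URL:-redis://redis:6379/0}"
    HOSTPORT="${BROKER_URL#redis://}"   # drop the scheme
    HOSTPORT="${HOSTPORT#*@}"           # drop an optional user:password@ prefix
    HOSTPORT="${HOSTPORT%%/*}"          # drop the /db suffix
    REDIS_HOST="${HOSTPORT%%:*}"
    REDIS_PORT="${HOSTPORT##*:}"
    [ "${REDIS_PORT}" = "${REDIS_HOST}" ] && REDIS_PORT=6379   # URL had no explicit port
    if ! timeout 5 redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi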


# All checks passed
exit 0
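
Assuming the compose service is named celeryworker (the service name is not shown in these hunks), the script can also be exercised by hand once the stack is up:

    # Run the healthcheck inside the worker container and show its exit code
    docker compose exec celeryworker /celery/healthcheck.sh; echo "exit=$?"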
23 changes: 22 additions & 1 deletion compose/local/django/celery/worker/start
@@ -3,5 +3,26 @@
set -o errexit
set -o nounset

# Local development with auto-reload and optional debugging
#
# CELERY_DEBUG=1 - Enable debugpy for remote debugging on port 5678
# CELERY_NO_RELOAD=1 - Disable watchfiles auto-reload
#
# Worker protections (same as production):
# --max-tasks-per-child=50 - Restart after 50 tasks (prevents memory leaks)
# --max-memory-per-child=4000000 - Restart if memory exceeds 4GB

exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO'
# Check if debugging is enabled
if [ "${CELERY_DEBUG:-0}" = "1" ]; then
echo "Starting Celery worker with debugpy on port 5678..."
exec python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
fi

# Check if auto-reload should be disabled
if [ "${CELERY_NO_RELOAD:-0}" = "1" ]; then
echo "Starting Celery worker without auto-reload..."
exec celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
else
echo "Starting Celery worker with watchfiles auto-reload..."
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000'
fi
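
A quick way to exercise these toggles during local development might look like the following (assumes the service is named celeryworker and the default compose file is used; shown only as a sketch):

    # One-off worker with debugpy listening; --service-ports publishes 5678 from the service definition
    docker compose run --rm --service-ports -e CELERY_DEBUG=1 celeryworker

    # One-off worker without watchfiles auto-reload
    docker compose run --rm -e CELERY_NO_RELOAD=1 celeryworker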
7 changes: 7 additions & 0 deletions compose/production/django/Dockerfile
@@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
libpq-dev \
# Translations dependencies
gettext \
# healthcheck dependencies
procps \
# cleaning up unused files
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
@@ -81,6 +83,11 @@ RUN sed -i 's/\r$//g' /start-flower
RUN chmod +x /start-flower


# Copy celery scripts directory for healthcheck
COPY --chown=django:django ./compose/production/django/celery /celery
RUN chmod +x /celery/healthcheck.sh


# copy application code to WORKDIR
COPY --chown=django:django . ${APP_HOME}

31 changes: 31 additions & 0 deletions compose/production/django/celery/healthcheck.sh
@@ -0,0 +1,31 @@
#!/bin/bash
#
# Celery Worker Healthcheck Script (Production)
#
# This script checks if the Celery worker process is running and responsive.
# It uses two checks:
# 1. Process check - is celery worker process running?
# 2. Redis connectivity - can we connect to the broker?
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.

set -e

# Check 1: Is the celery worker process running?
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
    echo "ERROR: Celery worker process not found" >&2
    exit 1
fi

# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
Comment on lines +21 to +28

⚠️ Potential issue | 🔴 Critical

Fix Redis broker URL parsing.

Line 24 uses ${CELERY_BROKER_URL:-redis} directly as the hostname argument to redis-cli, but CELERY_BROKER_URL is typically a full URL like redis://redis:6379/0, not just a hostname. This will cause the healthcheck to fail when the environment variable is set.

Apply this diff to properly extract the hostname from the broker URL:

 # Check 2: Can we connect to Redis (the broker)?
 # Use redis-cli if available, otherwise skip
 if command -v redis-cli > /dev/null 2>&1; then
-    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
+    REDIS_HOST=$(echo "${CELERY_BROKER_URL:-redis://redis:6379}" | sed -E 's|^redis://([^:/@]+).*|\1|')
+    if ! timeout 5 redis-cli -h "${REDIS_HOST}" ping > /dev/null 2>&1; then
         echo "ERROR: Cannot connect to Redis broker" >&2
         exit 1
     fi
 fi

This also adds a 5-second timeout to prevent the healthcheck from hanging indefinitely if Redis is unresponsive.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    if ! redis-cli -h ${CELERY_BROKER_URL:-redis} ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip
if command -v redis-cli > /dev/null 2>&1; then
    REDIS_HOST=$(echo "${CELERY_BROKER_URL:-redis://redis:6379}" | sed -E 's|^redis://([^:/@]+).*|\1|')
    if ! timeout 5 redis-cli -h "${REDIS_HOST}" ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi
fi
🤖 Prompt for AI Agents
In compose/production/django/celery/healthcheck.sh around lines 21 to 28, the
script currently passes ${CELERY_BROKER_URL:-redis} directly to redis-cli which
fails when CELERY_BROKER_URL is a full URL (e.g. redis://redis:6379/0); update
the script to parse CELERY_BROKER_URL to extract host (and optionally port)
using shell string manipulation or a simple URL parse (fall back to "redis" host
if unset), then call redis-cli with -h <host> and -p <port> as appropriate and
include a connection timeout (e.g. --connect-timeout 5 or use redis-cli -t 5) so
the healthcheck fails fast on unresponsive Redis.
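
An alternative sketch that sidesteps URL parsing entirely by letting redis-cli consume the URI (assumes the redis-cli shipped in the image supports the -u flag; illustrative only, not part of this PR):

    # Sketch: ping via the broker URL itself instead of extracting host/port
    if ! timeout 5 redis-cli -u "${CELERY_BROKER_URL:-redis://redis:6379/0}" ping > /dev/null 2>&1; then
        echo "ERROR: Cannot connect to Redis broker" >&2
        exit 1
    fi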


# All checks passed
exit 0
21 changes: 20 additions & 1 deletion compose/production/django/celery/worker/start
@@ -4,4 +4,23 @@ set -o errexit
set -o pipefail
set -o nounset

exec newrelic-admin run-program celery -A config.celery_app worker -l INFO
# Celery worker with built-in protections against stuck/leaking workers:
#
# --max-tasks-per-child=50
# Restart worker process after 50 tasks to prevent memory leaks
# Conservative value since ML tasks can be memory-intensive
#
# --max-memory-per-child=4000000
# Restart worker if memory exceeds 4GB (4,000,000 KB)
# Prevents runaway memory consumption from large images/models
#
# These options work in conjunction with the Docker healthcheck:
# - Healthcheck detects STUCK workers (not responding to ping)
# - These options prevent RESOURCE LEAKS (memory/task buildup)
# - Autoheal restarts UNHEALTHY containers
# - restart:always brings containers back after any exit

exec newrelic-admin run-program celery -A config.celery_app worker \
    -l INFO \
    --max-tasks-per-child=50 \
    --max-memory-per-child=4000000
35 changes: 35 additions & 0 deletions docker-compose.production.yml
@@ -29,12 +29,28 @@ services:
    ports: []
    command: /start-celeryworker
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
      interval: 30s # Check every 30 seconds
      timeout: 15s # Healthcheck must complete within 15s (ping timeout is 10s + overhead)
      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
      start_period: 60s # Grace period during container startup
    labels:
      - "autoheal=true" # Enable autoheal to restart this container when unhealthy

  celerybeat:
    <<: *django
    ports: []
    command: /start-celerybeat
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "pgrep -f 'celery.*beat' > /dev/null || exit 1"]
      interval: 60s # Beat is less critical, check every minute
      timeout: 10s
      retries: 3
      start_period: 30s
    labels:
      - "autoheal=true"

  flower:
    <<: *django
@@ -44,6 +60,25 @@
    restart: always
    volumes:
      - ./data/flower/:/data/
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
    labels:
      - "autoheal=true"

  autoheal:
    image: willfarrell/autoheal:latest
    container_name: ami_production_autoheal
    restart: always
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
      - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock

  awscli:
    build:
19 changes: 19 additions & 0 deletions docker-compose.worker.yml
@@ -25,3 +25,22 @@ services:
    ports: []
    command: /start-celeryworker
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
      interval: 30s # Check every 30 seconds
      timeout: 15s # Healthcheck must complete within 15s (ping timeout is 10s + overhead)
      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
      start_period: 60s # Grace period during container startup
    labels:
      - "autoheal=true" # Enable autoheal to restart this container when unhealthy

  autoheal:
    image: willfarrell/autoheal:latest
    container_name: ami_worker_autoheal
    restart: always
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
      - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
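
To verify the healthcheck is actually being evaluated once the stack is running, standard Docker commands work (the container name below is a placeholder):

    # Current health status of the worker container
    docker inspect --format '{{.State.Health.Status}}' <celeryworker_container>

    # Containers currently marked unhealthy (candidates for an autoheal restart)
    docker ps --filter health=unhealthy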
14 changes: 12 additions & 2 deletions docker-compose.yml
@@ -90,11 +90,21 @@ services:
    <<: *django
    image: ami_local_celeryworker
    scale: 1
    # For remote debugging with debugpy, should get overridden for production
    # For remote debugging with debugpy, set CELERY_DEBUG=1 in environment
    # To disable watchfiles auto-reload, set CELERY_NO_RELOAD=1
    # Also make sure to install debugpy in your requirements/local.txt
    ports:
      - "5678:5678"
    command: python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO
    # environment:
    #   - CELERY_DEBUG=1
    #   - CELERY_NO_RELOAD=1
    command: /start-celeryworker
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
      interval: 30s # Check every 30 seconds
      timeout: 15s # Healthcheck must complete within 15s (ping timeout is 10s + overhead)
      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
      start_period: 60s # Grace period during container startup

  celerybeat:
    <<: *django