From d5396944097f6854afdb32f6b480206dcd5b4f0f Mon Sep 17 00:00:00 2001
From: Braden McDorman <bmcdorman@gmail.com>
Date: Thu, 9 Oct 2025 07:46:12 -0700
Subject: [PATCH 1/2] Add Prometheus metrics and monitoring for Redis

- Add prom-client dependency for metrics
- Create /metrics endpoint exposing Prometheus metrics
- Track Redis connection status (database_redis_connection_status)
- Track Redis operation success/failure counts by operation type
- Update RedisCache to report connection events and operation metrics
- Add comprehensive monitoring documentation

Metrics exposed:
- database_redis_connection_status: 1=connected, 0=disconnected
- database_redis_operation_success_total: successful operations by type
- database_redis_operation_failures_total: failed operations by type
- database_http_requests_total: registered, but not yet incremented by
  any code in this patch

Note: This PR is independent and can be deployed alongside PR #10 for
graceful Redis failure handling, or standalone for monitoring only.
---
 REDIS_MONITORING.md | 87 +++++++++++++++++++++++++++++++++++++++++++++
 package.json        |  3 +-
 src/RedisCache.ts   | 63 ++++++++++++++++++++++++++------
 src/index.ts        |  7 ++++
 src/metrics.ts      | 39 ++++++++++++++++++++
 5 files changed, 188 insertions(+), 11 deletions(-)
 create mode 100644 REDIS_MONITORING.md
 create mode 100644 src/metrics.ts

diff --git a/REDIS_MONITORING.md b/REDIS_MONITORING.md
new file mode 100644
index 0000000..d45a854
--- /dev/null
+++ b/REDIS_MONITORING.md
@@ -0,0 +1,87 @@
+# Redis Monitoring and Alerting
+
+This document describes the Redis monitoring and alerting setup for the database service.
+
+## Overview
+
+The database service now includes comprehensive monitoring for Redis connection status and operation health. When Redis goes down, the application continues to function without caching, and alerts are triggered to notify operators.
+
+## Metrics
+
+The following Prometheus metrics are exposed at `/metrics`:
+
+### `database_redis_connection_status`
+- **Type**: Gauge
+- **Values**: `1` (connected) or `0` (disconnected)
+- **Description**: Current Redis connection status
+
+### `database_redis_operation_success_total`
+- **Type**: Counter
+- **Labels**: `operation` (get, set, remove)
+- **Description**: Total number of successful Redis operations
+
+### `database_redis_operation_failures_total`
+- **Type**: Counter
+- **Labels**: `operation` (get, set, remove)
+- **Description**: Total number of failed Redis operations
+
+## Alerts
+
+### DatabaseRedisDown
+- **Severity**: Critical
+- **Condition**: Redis connection is down for more than 1 minute
+- **Description**: The database service has lost connection to Redis. Cache is unavailable but the service continues to operate.
+- **Action**: Check Redis pod status, network connectivity, and Redis logs.
+
+### DatabaseRedisOperationFailures
+- **Severity**: Warning
+- **Condition**: Redis operations failing at rate > 0.1/second for 2 minutes
+- **Description**: Redis operations are experiencing failures
+- **Action**: Check Redis health, network latency, and error logs.
+
+### DatabaseRedisHighFailureRate
+- **Severity**: Critical
+- **Condition**: Redis operations failing at rate > 1/second for 1 minute
+- **Description**: Critical failure rate - service is degraded
+- **Action**: Investigate immediately. Check Redis status, restart if necessary.
+
+## Grafana Dashboard
+
+A dedicated Grafana dashboard "Database Redis Monitoring" provides:
+
+1. **Redis Connection Status** - Real-time connection state
+2. **Operation Success Rate** - Rate of successful operations by type
+3. **Operation Failure Rate** - Rate of failed operations by type
+4. **Success Rate %** - Overall success percentage
+5. **Connection History** - Timeline of connection up/down events
+
+Import the dashboard from: `Simulator/grafana-dashboards/database-redis-monitoring.json`
+
+## Deployment
+
+The monitoring stack is deployed automatically with the database Helm chart:
+
+- **ServiceMonitor**: Scrapes `/metrics` endpoint every 30 seconds
+- **PrometheusRule**: Defines alert rules
+- **Service**: Labeled for Prometheus discovery
+
+## Testing Alerting
+
+To test the alerting system:
+
+1. Deploy to staging environment
+2. Stop the Redis pod: `kubectl delete pod -l app=redis`
+3. Verify metrics show `database_redis_connection_status = 0`
+4. Wait 1 minute for `DatabaseRedisDown` alert to fire
+5. Check Alertmanager UI for active alerts
+6. Restart Redis and verify recovery
+
+## Configuration
+
+Alert routing and notification channels are configured in Alertmanager. Ensure the following labels are routed appropriately:
+
+- `severity: critical` → PagerDuty / immediate notifications
+- `severity: warning` → Slack / email notifications
+- `component: database`
+- `service: redis`
+
diff --git a/package.json b/package.json
index fbed264..e9be562 100644
--- a/package.json
+++ b/package.json
@@ -16,7 +16,8 @@
     "@google-cloud/storage": "^6.9.3",
     "fastify": "^4.9.2",
     "firebase-admin": "^11.2.0",
-    "ioredis": "^5.2.3"
+    "ioredis": "^5.2.3",
+    "prom-client": "^14.2.0"
   },
   "devDependencies": {
     "@types/argparse": "^2.0.10",
diff --git a/src/RedisCache.ts b/src/RedisCache.ts
index fb112ae..c3438db 100644
--- a/src/RedisCache.ts
+++ b/src/RedisCache.ts
@@ -2,6 +2,7 @@ import Cache from './Cache';
 
 import Redis, { RedisOptions } from 'ioredis';
 import Selector from './model/Selector';
+import { redisConnectionGauge, redisFailureCounter, redisSuccessCounter } from './metrics';
 
 class RedisCache implements Cache {
   private static DEFAULT_TTL = 60 * 60 * 24 * 7;
@@ -10,6 +11,31 @@ class RedisCache implements Cache {
 
   constructor(options: RedisOptions) {
     this.redis_ = new Redis(options);
+    // Mirror the client's lifecycle events into the connection-status gauge (1 = up, 0 = down).
+    const markConnected = (): void => redisConnectionGauge.set(1);
+    const markDisconnected = (): void => redisConnectionGauge.set(0);
+
+    this.redis_
+      .on('connect', () => {
+        console.log('Redis connected');
+        markConnected();
+      })
+      .on('ready', () => {
+        console.log('Redis ready');
+        markConnected();
+      })
+      .on('error', (err) => {
+        console.error('Redis error:', err.message);
+        markDisconnected();
+      })
+      .on('close', () => {
+        console.warn('Redis connection closed');
+        markDisconnected();
+      })
+      .on('end', () => {
+        console.warn('Redis connection ended');
+        markDisconnected();
+      });
   }
 
   private static key_ = ({ collection, id }: Selector): string => {
@@ -17,23 +43,40 @@ class RedisCache implements Cache {
   };
 
   async get(selector: Selector): Promise<object | null> {
-    const data = await this.redis_.get(RedisCache.key_(selector));
-    if (!data) return null;
-
-    return JSON.parse(data);
+    let data: string | null; // raw cached text; a falsy value means nothing is cached
+    try {
+      data = await this.redis_.get(RedisCache.key_(selector));
+      redisSuccessCounter.inc({ operation: 'get' }); // the Redis round trip itself succeeded
+    } catch (err) {
+      redisFailureCounter.inc({ operation: 'get' });
+      throw err; // rethrow so callers still observe the Redis error
+    }
+    return data ? JSON.parse(data) : null; // parse outside the try so a bad payload is not double-counted
   }
 
   async set(selector: Selector, value: object | null): Promise<void> {
-    if (!value) {
-      await this.redis_.del(RedisCache.key_(selector));
-      return;
+    try {
+      const key = RedisCache.key_(selector);
+      if (value) {
+        await this.redis_.setex(key, RedisCache.DEFAULT_TTL, JSON.stringify(value));
+      } else {
+        await this.redis_.del(key); // a null value evicts the entry instead of storing it
+      }
+      redisSuccessCounter.inc({ operation: 'set' }); // eviction is also reported under 'set'
+    } catch (err) {
+      redisFailureCounter.inc({ operation: 'set' });
+      throw err;
     }
-
-    await this.redis_.setex(RedisCache.key_(selector), RedisCache.DEFAULT_TTL, JSON.stringify(value));
   }
 
   async remove(selector: Selector): Promise<void> {
-    await this.redis_.del(RedisCache.key_(selector));
+    try {
+      await this.redis_.del(RedisCache.key_(selector)); // delete the cache key derived from the selector
+      redisSuccessCounter.inc({ operation: 'remove' }); // reached only when the delete did not throw
+    } catch (err) {
+      redisFailureCounter.inc({ operation: 'remove' }); // record the failure under the 'remove' label
+      throw err; // rethrow so the caller still observes the error
+    }
   }
 }
 
diff --git a/src/index.ts b/src/index.ts
index 6c72b51..788e10c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -12,6 +12,7 @@ import authorize, { AuthorizeResult } from './authorize';
 import { CHALLENGE_COMPLETION_COLLECTION, USER_COLLECTION } from './model/constants';
 
 import bigStore from './big-store';
+import { register as metricsRegister } from './metrics';
 
 const UNAUTHORIZED_RESULT = { message: 'Unauthorized' };
 const NOT_FOUND_RESULT = { message: 'Not Found' };
@@ -47,6 +48,12 @@ app.get('/', async (request, reply) => {
   reply.send({ database: 'alive' });
 });
 
+// Prometheus scrape endpoint: serves the metrics held by the registry exported from ./metrics.
+// The handler makes no authenticate() call, unlike the '/:collection/:id' route below.
+app.get('/metrics', async (_request, reply) => {
+  reply.header('Content-Type', metricsRegister.contentType).send(await metricsRegister.metrics());
+});
+
 app.get('/:collection/:id', async (request, reply) => {
   const token = await authenticate(request);
 
diff --git a/src/metrics.ts b/src/metrics.ts
new file mode 100644
index 0000000..8ce81e2
--- /dev/null
+++ b/src/metrics.ts
@@ -0,0 +1,39 @@
+import { Registry, Gauge, Counter } from 'prom-client';
+
+// Custom registry; each metric declared in this module registers itself here.
+export const register = new Registry();
+const registers = [register];
+
+// Gauge reporting the Redis link state: 1 when connected, 0 when disconnected.
+export const redisConnectionGauge = new Gauge({
+  registers,
+  name: 'database_redis_connection_status',
+  help: 'Redis connection status (1 = connected, 0 = disconnected)',
+});
+
+// Counter of Redis operations that failed, split by the `operation` label.
+export const redisFailureCounter = new Counter({
+  registers,
+  labelNames: ['operation'], // 'get', 'set', 'remove'
+  name: 'database_redis_operation_failures_total',
+  help: 'Total number of failed Redis operations',
+});
+
+// Counter of Redis operations that succeeded, split by the `operation` label.
+export const redisSuccessCounter = new Counter({
+  registers,
+  labelNames: ['operation'],
+  name: 'database_redis_operation_success_total',
+  help: 'Total number of successful Redis operations',
+});
+
+// Counter of HTTP requests, labelled by method, route and status code.
+export const httpRequestCounter = new Counter({
+  registers,
+  labelNames: ['method', 'route', 'status_code'],
+  name: 'database_http_requests_total',
+  help: 'Total number of HTTP requests',
+});
+
+redisConnectionGauge.set(0); // start as "disconnected" until a Redis client event says otherwise
+

From e0f0c9284bb24273d9ab21b5405c805c9c3542cb Mon Sep 17 00:00:00 2001
From: Braden McDorman <bmcdorman@gmail.com>
Date: Thu, 9 Oct 2025 07:47:18 -0700
Subject: [PATCH 2/2] Remove REDIS_MONITORING.md documentation file

---
 REDIS_MONITORING.md | 87 ---------------------------------------------
 1 file changed, 87 deletions(-)
 delete mode 100644 REDIS_MONITORING.md

diff --git a/REDIS_MONITORING.md b/REDIS_MONITORING.md
deleted file mode 100644
index d45a854..0000000
--- a/REDIS_MONITORING.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Redis Monitoring and Alerting
-
-This document describes the Redis monitoring and alerting setup for the database service.
-
-## Overview
-
-The database service now includes comprehensive monitoring for Redis connection status and operation health. When Redis goes down, the application continues to function without caching, and alerts are triggered to notify operators.
-
-## Metrics
-
-The following Prometheus metrics are exposed at `/metrics`:
-
-### `database_redis_connection_status`
-- **Type**: Gauge
-- **Values**: `1` (connected) or `0` (disconnected)
-- **Description**: Current Redis connection status
-
-### `database_redis_operation_success_total`
-- **Type**: Counter
-- **Labels**: `operation` (get, set, remove)
-- **Description**: Total number of successful Redis operations
-
-### `database_redis_operation_failures_total`
-- **Type**: Counter
-- **Labels**: `operation` (get, set, remove)
-- **Description**: Total number of failed Redis operations
-
-## Alerts
-
-### DatabaseRedisDown
-- **Severity**: Critical
-- **Condition**: Redis connection is down for more than 1 minute
-- **Description**: The database service has lost connection to Redis. Cache is unavailable but the service continues to operate.
-- **Action**: Check Redis pod status, network connectivity, and Redis logs.
-
-### DatabaseRedisOperationFailures
-- **Severity**: Warning
-- **Condition**: Redis operations failing at rate > 0.1/second for 2 minutes
-- **Description**: Redis operations are experiencing failures
-- **Action**: Check Redis health, network latency, and error logs.
-
-### DatabaseRedisHighFailureRate
-- **Severity**: Critical
-- **Condition**: Redis operations failing at rate > 1/second for 1 minute
-- **Description**: Critical failure rate - service is degraded
-- **Action**: Investigate immediately. Check Redis status, restart if necessary.
-
-## Grafana Dashboard
-
-A dedicated Grafana dashboard "Database Redis Monitoring" provides:
-
-1. **Redis Connection Status** - Real-time connection state
-2. **Operation Success Rate** - Rate of successful operations by type
-3. **Operation Failure Rate** - Rate of failed operations by type
-4. **Success Rate %** - Overall success percentage
-5. **Connection History** - Timeline of connection up/down events
-
-Import the dashboard from: `Simulator/grafana-dashboards/database-redis-monitoring.json`
-
-## Deployment
-
-The monitoring stack is deployed automatically with the database Helm chart:
-
-- **ServiceMonitor**: Scrapes `/metrics` endpoint every 30 seconds
-- **PrometheusRule**: Defines alert rules
-- **Service**: Labeled for Prometheus discovery
-
-## Testing Alerting
-
-To test the alerting system:
-
-1. Deploy to staging environment
-2. Stop the Redis pod: `kubectl delete pod -l app=redis`
-3. Verify metrics show `database_redis_connection_status = 0`
-4. Wait 1 minute for `DatabaseRedisDown` alert to fire
-5. Check Alertmanager UI for active alerts
-6. Restart Redis and verify recovery
-
-## Configuration
-
-Alert routing and notification channels are configured in Alertmanager. Ensure the following labels are routed appropriately:
-
-- `severity: critical` → PagerDuty / immediate notifications
-- `severity: warning` → Slack / email notifications
-- `component: database`
-- `service: redis`
-