From 101f4cfcbe759d195e75ec1f2e92cc2bc979ef25 Mon Sep 17 00:00:00 2001
From: Morgan Wowk <morgan.wowk@shopify.com>
Date: Mon, 2 Feb 2026 12:33:39 -0800
Subject: [PATCH] feat: Add orchestrator error tracking by type

Tracks orchestrator errors categorized by type for targeted troubleshooting.

Metrics added:
- orchestrator_errors_total: Counter tracking errors by error_type (system_error/launch_error/missing_outputs)

This enables identifying the root cause of orchestrator failures:
- system_error: Unexpected errors during execution processing
- launch_error: Container launch failures
- missing_outputs: Executions failed due to missing output artifacts

These metrics complement queue processing error metrics to provide complete visibility into orchestrator health.
---
 .../instrumentation/metrics.py                | 34 +++++++++++++++++++
 cloud_pipelines_backend/orchestrator_sql.py   |  8 +++++
 2 files changed, 42 insertions(+)

diff --git a/cloud_pipelines_backend/instrumentation/metrics.py b/cloud_pipelines_backend/instrumentation/metrics.py
index 3719c72..d50860d 100644
--- a/cloud_pipelines_backend/instrumentation/metrics.py
+++ b/cloud_pipelines_backend/instrumentation/metrics.py
@@ -414,6 +414,40 @@ def track_database_query_duration(operation: str, duration_seconds: float):
         histogram.record(duration_seconds, {"operation": operation.lower()})
 
 
+# Orchestrator Error Tracking
+_orchestrator_errors_counter = None
+
+
+def get_orchestrator_errors_counter():
+    """Get or create orchestrator errors counter."""
+    global _orchestrator_errors_counter
+
+    meter = get_meter()
+    if meter is None:
+        return None
+
+    if _orchestrator_errors_counter is None:
+        _orchestrator_errors_counter = meter.create_counter(
+            name="orchestrator_errors_total",
+            description="Total number of orchestrator errors by type",
+            unit="1",
+        )
+
+    return _orchestrator_errors_counter
+
+
+def track_orchestrator_error(error_type: str):
+    """
+    Track orchestrator error by type.
+
+    Args:
+        error_type: Type of error (system_error/launch_error/missing_outputs)
+    """
+    counter = get_orchestrator_errors_counter()
+    if counter:
+        counter.add(1, {"error_type": error_type})
+
+
 class HTTPMetricsMiddleware(BaseHTTPMiddleware):
     """
     Middleware to track HTTP request metrics.
diff --git a/cloud_pipelines_backend/orchestrator_sql.py b/cloud_pipelines_backend/orchestrator_sql.py
index 843ba69..a77688e 100644
--- a/cloud_pipelines_backend/orchestrator_sql.py
+++ b/cloud_pipelines_backend/orchestrator_sql.py
@@ -111,6 +111,8 @@ def internal_process_queued_executions_queue(self, session: orm.Session):
                     _logger.exception("Error processing queued execution")
                     # Track processing error
                     metrics.track_queue_processing_error(queue_type="queued")
+                    # Track system error
+                    metrics.track_orchestrator_error(error_type="system_error")
                     session.rollback()
                     queued_execution.container_execution_status = (
                         bts.ContainerExecutionStatus.SYSTEM_ERROR
@@ -189,6 +191,8 @@ def internal_process_running_executions_queue(self, session: orm.Session):
                     _logger.exception("Error processing running container execution")
                     # Track processing error
                     metrics.track_queue_processing_error(queue_type="running")
+                    # Track system error
+                    metrics.track_orchestrator_error(error_type="system_error")
                     session.rollback()
                     running_container_execution.status = (
                         bts.ContainerExecutionStatus.SYSTEM_ERROR
@@ -566,6 +570,8 @@ def generate_execution_log_uri(
                 launcher_class_name=launcher_class_name,
                 success=False,
             )
+            # Track launch error
+            metrics.track_orchestrator_error(error_type="launch_error")
             session.rollback()
             with session.begin():
                 # Logs whole exception
@@ -818,6 +824,8 @@ def _maybe_preload_value(
             if missing_output_names:
                 # Marking the container execution as FAILED (even though the program itself has completed successfully)
                 container_execution.status = bts.ContainerExecutionStatus.FAILED
+                # Track missing outputs error
+                metrics.track_orchestrator_error(error_type="missing_outputs")
                 # Track container execution duration
                 if container_execution.started_at and container_execution.ended_at:
                     duration = (