From 101f4cfcbe759d195e75ec1f2e92cc2bc979ef25 Mon Sep 17 00:00:00 2001 From: Morgan Wowk Date: Mon, 2 Feb 2026 12:33:39 -0800 Subject: [PATCH] feat: Add orchestrator error tracking by type Tracks orchestrator errors categorized by type for targeted troubleshooting. Metrics added: - orchestrator_errors_total: Counter tracking errors by error_type (system_error/launch_error/missing_outputs) This enables identifying the root cause of orchestrator failures: - system_error: Unexpected errors during execution processing - launch_error: Container launch failures - missing_outputs: Executions failed due to missing output artifacts These metrics complement queue processing error metrics to provide complete visibility into orchestrator health. --- .../instrumentation/metrics.py | 34 +++++++++++++++++++ cloud_pipelines_backend/orchestrator_sql.py | 8 +++++ 2 files changed, 42 insertions(+) diff --git a/cloud_pipelines_backend/instrumentation/metrics.py b/cloud_pipelines_backend/instrumentation/metrics.py index 3719c72..d50860d 100644 --- a/cloud_pipelines_backend/instrumentation/metrics.py +++ b/cloud_pipelines_backend/instrumentation/metrics.py @@ -414,6 +414,40 @@ def track_database_query_duration(operation: str, duration_seconds: float): histogram.record(duration_seconds, {"operation": operation.lower()}) +# Orchestrator Error Tracking +_orchestrator_errors_counter = None + + +def get_orchestrator_errors_counter(): + """Get or create orchestrator errors counter.""" + global _orchestrator_errors_counter + + meter = get_meter() + if meter is None: + return None + + if _orchestrator_errors_counter is None: + _orchestrator_errors_counter = meter.create_counter( + name="orchestrator_errors_total", + description="Total number of orchestrator errors by type", + unit="1", + ) + + return _orchestrator_errors_counter + + +def track_orchestrator_error(error_type: str): + """ + Track orchestrator error by type. + + Args: + error_type: Type of error (system_error/launch_error/missing_outputs) + """ + counter = get_orchestrator_errors_counter() + if counter: + counter.add(1, {"error_type": error_type}) + + class HTTPMetricsMiddleware(BaseHTTPMiddleware): """ Middleware to track HTTP request metrics. diff --git a/cloud_pipelines_backend/orchestrator_sql.py b/cloud_pipelines_backend/orchestrator_sql.py index 843ba69..a77688e 100644 --- a/cloud_pipelines_backend/orchestrator_sql.py +++ b/cloud_pipelines_backend/orchestrator_sql.py @@ -111,6 +111,8 @@ def internal_process_queued_executions_queue(self, session: orm.Session): _logger.exception("Error processing queued execution") # Track processing error metrics.track_queue_processing_error(queue_type="queued") + # Track system error + metrics.track_orchestrator_error(error_type="system_error") session.rollback() queued_execution.container_execution_status = ( bts.ContainerExecutionStatus.SYSTEM_ERROR @@ -189,6 +191,8 @@ def internal_process_running_executions_queue(self, session: orm.Session): _logger.exception("Error processing running container execution") # Track processing error metrics.track_queue_processing_error(queue_type="running") + # Track system error + metrics.track_orchestrator_error(error_type="system_error") session.rollback() running_container_execution.status = ( bts.ContainerExecutionStatus.SYSTEM_ERROR @@ -566,6 +570,8 @@ def generate_execution_log_uri( launcher_class_name=launcher_class_name, success=False, ) + # Track launch error + metrics.track_orchestrator_error(error_type="launch_error") session.rollback() with session.begin(): # Logs whole exception @@ -818,6 +824,8 @@ def _maybe_preload_value( if missing_output_names: # Marking the container execution as FAILED (even though the program itself has completed successfully) container_execution.status = bts.ContainerExecutionStatus.FAILED + # Track missing outputs error + metrics.track_orchestrator_error(error_type="missing_outputs") # Track container execution duration if container_execution.started_at and container_execution.ended_at: duration = (