From b296a79f0801a28f8c7153b68bae9e0564b224ce Mon Sep 17 00:00:00 2001
From: Anipik <anirudh.agnihotry@uipath.com>
Date: Tue, 13 Jan 2026 15:31:35 -0800
Subject: [PATCH 1/2] feat: add live tracking for evaluation spans

---
 src/uipath/_cli/_evals/_runtime.py            |  81 ++++-
 .../eval/test_live_tracking_span_processor.py | 292 ++++++++++++++++++
 2 files changed, 371 insertions(+), 2 deletions(-)
 create mode 100644 tests/cli/eval/test_live_tracking_span_processor.py

diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index b9fa8e9b7..9e130b9a5 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -18,8 +18,11 @@
 
 import coverage
 from opentelemetry import context as context_api
-from opentelemetry.sdk.trace import ReadableSpan, Span
-from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
+from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
+from opentelemetry.sdk.trace.export import (
+    SpanExporter,
+    SpanExportResult,
+)
 from opentelemetry.trace import Status, StatusCode
 from pydantic import BaseModel
 from uipath.core.tracing import UiPathTraceManager
@@ -47,6 +50,7 @@
 from uipath._cli._evals.mocks.input_mocker import (
     generate_llm_input,
 )
+from uipath.tracing import LlmOpsHttpExporter, SpanStatus
 
 from ..._events._event_bus import EventBus
 from ..._events._events import (
@@ -155,6 +159,73 @@ def on_start(
                 self.collector.add_span(span, exec_id)
 
 
+class LiveTrackingSpanProcessor(SpanProcessor):
+    """Span processor that pushes live span updates via ``upsert_span``.
+
+    - On span start: upsert with RUNNING status.
+    - On span end: upsert again with no status override.
+
+    Only evaluation-related spans are sent (see ``_is_eval_span``). Upsert
+    exceptions are logged at debug level and swallowed, so a failed live
+    update never propagates to the code being traced.
+    """
+
+    # Span types treated as evaluation-related; built once, not per span.
+    _EVAL_SPAN_TYPES = frozenset(
+        {"eval", "evaluator", "evaluation", "eval_set_run", "evalOutput"}
+    )
+
+    def __init__(self, exporter: LlmOpsHttpExporter):
+        """Store the exporter whose ``upsert_span`` receives the updates."""
+        self.exporter = exporter
+        self.span_status = SpanStatus
+
+    def on_start(
+        self, span: Span, parent_context: context_api.Context | None = None
+    ) -> None:
+        """Upsert an evaluation-related span with RUNNING status."""
+        if not self._is_eval_span(span):
+            return
+        try:
+            self.exporter.upsert_span(span, status_override=self.span_status.RUNNING)
+        except Exception as e:
+            # Best effort: a failed live update must not break the traced code.
+            logger.debug("Failed to upsert span on start: %s", e)
+
+    def on_end(self, span: ReadableSpan) -> None:
+        """Upsert an evaluation-related span once it has ended."""
+        if not self._is_eval_span(span):
+            return
+        try:
+            self.exporter.upsert_span(span)
+        except Exception as e:
+            # Best effort: a failed live update must not break the traced code.
+            logger.debug("Failed to upsert span on end: %s", e)
+
+    def _is_eval_span(self, span: Span | ReadableSpan) -> bool:
+        """Return True if the span should be live-tracked.
+
+        A span qualifies when its ``span_type`` attribute is one of the
+        evaluation span types, or when it carries an ``execution.id``
+        attribute. Spans with missing or empty attributes never qualify.
+        """
+        attributes = span.attributes
+        if not attributes:
+            return False
+        return (
+            attributes.get("span_type") in self._EVAL_SPAN_TYPES
+            or "execution.id" in attributes
+        )
+
+    def shutdown(self) -> None:
+        """Shut down the processor; there is nothing to release."""
+        pass
+
+    def force_flush(self, timeout_millis: int = 30000) -> bool:
+        """Return True immediately; spans are sent as they arrive."""
+        return True
+
+
 class ExecutionLogsExporter:
     """Custom exporter that stores multiple execution log handlers."""
 
@@ -217,6 +288,12 @@ def __init__(
         self.trace_manager.tracer_span_processors.append(span_processor)
         self.trace_manager.tracer_provider.add_span_processor(span_processor)
 
+        # Live tracking processor for real-time span updates
+        live_tracking_exporter = LlmOpsHttpExporter()
+        live_tracking_processor = LiveTrackingSpanProcessor(live_tracking_exporter)
+        self.trace_manager.tracer_span_processors.append(live_tracking_processor)
+        self.trace_manager.tracer_provider.add_span_processor(live_tracking_processor)
+
         self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter()
         self.execution_id = str(uuid.uuid4())
         self.coverage = coverage.Coverage(branch=True)
diff --git a/tests/cli/eval/test_live_tracking_span_processor.py b/tests/cli/eval/test_live_tracking_span_processor.py
new file mode 100644
index 000000000..3adf3e658
--- /dev/null
+++ b/tests/cli/eval/test_live_tracking_span_processor.py
@@ -0,0 +1,292 @@
+"""Tests for LiveTrackingSpanProcessor in _runtime.py."""
+
+from typing import Any
+from unittest.mock import Mock
+
+import pytest
+from opentelemetry import context as context_api
+from opentelemetry.sdk.trace import ReadableSpan, Span
+
+from uipath._cli._evals._runtime import LiveTrackingSpanProcessor
+from uipath.tracing import SpanStatus
+
+
+class TestLiveTrackingSpanProcessor:
+    """Test suite for LiveTrackingSpanProcessor."""
+
+    @pytest.fixture
+    def mock_exporter(self):
+        """Create a mock exporter exposing a mock upsert_span."""
+        exporter = Mock()
+        exporter.upsert_span = Mock()
+        return exporter
+
+    @pytest.fixture
+    def processor(self, mock_exporter):
+        """Create a LiveTrackingSpanProcessor with mock exporter."""
+        return LiveTrackingSpanProcessor(mock_exporter)
+
+    def create_mock_span(self, attributes: dict[str, Any] | None = None):
+        """Create a mock Span; attributes are stored as given (None stays None)."""
+        span = Mock(spec=Span)
+        span.attributes = attributes
+        return span
+
+    def create_mock_readable_span(self, attributes: dict[str, Any] | None = None):
+        """Create a mock ReadableSpan; attributes are stored as given."""
+        span = Mock(spec=ReadableSpan)
+        span.attributes = attributes
+        return span
+
+    def test_init(self, mock_exporter):
+        """Test processor initialization."""
+        processor = LiveTrackingSpanProcessor(mock_exporter)
+
+        assert processor.exporter == mock_exporter
+        assert processor.span_status == SpanStatus
+
+    def test_on_start_with_eval_span_type(self, processor, mock_exporter):
+        """Test on_start upserts a span with eval span type."""
+        span = self.create_mock_span({"span_type": "eval"})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_called_once_with(
+            span, status_override=SpanStatus.RUNNING
+        )
+
+    def test_on_start_with_evaluator_span_type(self, processor, mock_exporter):
+        """Test on_start upserts a span with evaluator span type."""
+        span = self.create_mock_span({"span_type": "evaluator"})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_called_once_with(
+            span, status_override=SpanStatus.RUNNING
+        )
+
+    def test_on_start_with_evaluation_span_type(self, processor, mock_exporter):
+        """Test on_start upserts a span with evaluation span type."""
+        span = self.create_mock_span({"span_type": "evaluation"})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_called_once_with(
+            span, status_override=SpanStatus.RUNNING
+        )
+
+    def test_on_start_with_eval_set_run_span_type(self, processor, mock_exporter):
+        """Test on_start upserts a span with eval_set_run span type."""
+        span = self.create_mock_span({"span_type": "eval_set_run"})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_called_once_with(
+            span, status_override=SpanStatus.RUNNING
+        )
+
+    def test_on_start_with_eval_output_span_type(self, processor, mock_exporter):
+        """Test on_start upserts a span with evalOutput span type."""
+        span = self.create_mock_span({"span_type": "evalOutput"})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_called_once_with(
+            span, status_override=SpanStatus.RUNNING
+        )
+
+    def test_on_start_with_execution_id(self, processor, mock_exporter):
+        """Test on_start upserts a span that carries execution.id."""
+        span = self.create_mock_span({"execution.id": "test-exec-id"})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_called_once_with(
+            span, status_override=SpanStatus.RUNNING
+        )
+
+    def test_on_start_with_non_eval_span(self, processor, mock_exporter):
+        """Test on_start does NOT upsert non-eval spans."""
+        span = self.create_mock_span({"span_type": "agent"})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_not_called()
+
+    def test_on_start_with_no_attributes(self, processor, mock_exporter):
+        """Test on_start does NOT upsert when span attributes are None."""
+        span = self.create_mock_span(None)
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_not_called()
+
+    def test_on_start_with_empty_attributes(self, processor, mock_exporter):
+        """Test on_start does NOT upsert when span attributes are empty."""
+        span = self.create_mock_span({})
+
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_not_called()
+
+    def test_on_start_exception_handling(self, processor, mock_exporter):
+        """Test on_start handles exceptions gracefully."""
+        span = self.create_mock_span({"span_type": "eval"})
+        mock_exporter.upsert_span.side_effect = Exception("Network error")
+
+        # Should not raise exception
+        processor.on_start(span, None)
+
+        mock_exporter.upsert_span.assert_called_once()
+
+    def test_on_end_with_eval_span_type(self, processor, mock_exporter):
+        """Test on_end upserts a span with eval span type."""
+        span = self.create_mock_readable_span({"span_type": "eval"})
+
+        processor.on_end(span)
+
+        mock_exporter.upsert_span.assert_called_once_with(span)
+
+    def test_on_end_with_evaluator_span_type(self, processor, mock_exporter):
+        """Test on_end upserts a span with evaluator span type."""
+        span = self.create_mock_readable_span({"span_type": "evaluator"})
+
+        processor.on_end(span)
+
+        mock_exporter.upsert_span.assert_called_once_with(span)
+
+    def test_on_end_with_evaluation_span_type(self, processor, mock_exporter):
+        """Test on_end upserts a span with evaluation span type."""
+        span = self.create_mock_readable_span({"span_type": "evaluation"})
+
+        processor.on_end(span)
+
+        mock_exporter.upsert_span.assert_called_once_with(span)
+
+    def test_on_end_with_execution_id(self, processor, mock_exporter):
+        """Test on_end upserts a span that carries execution.id."""
+        span = self.create_mock_readable_span({"execution.id": "test-exec-id"})
+
+        processor.on_end(span)
+
+        mock_exporter.upsert_span.assert_called_once_with(span)
+
+    def test_on_end_with_non_eval_span(self, processor, mock_exporter):
+        """Test on_end does NOT upsert non-eval spans."""
+        span = self.create_mock_readable_span({"span_type": "agent"})
+
+        processor.on_end(span)
+
+        mock_exporter.upsert_span.assert_not_called()
+
+    def test_on_end_with_no_attributes(self, processor, mock_exporter):
+        """Test on_end does NOT upsert when span attributes are None."""
+        span = self.create_mock_readable_span(None)
+
+        processor.on_end(span)
+
+        mock_exporter.upsert_span.assert_not_called()
+
+    def test_on_end_exception_handling(self, processor, mock_exporter):
+        """Test on_end handles exceptions gracefully."""
+        span = self.create_mock_readable_span({"span_type": "eval"})
+        mock_exporter.upsert_span.side_effect = Exception("Network error")
+
+        # Should not raise exception
+        processor.on_end(span)
+
+        mock_exporter.upsert_span.assert_called_once()
+
+    def test_is_eval_span_with_eval_type(self, processor):
+        """Test _is_eval_span returns True for eval span type."""
+        span = self.create_mock_span({"span_type": "eval"})
+        assert processor._is_eval_span(span) is True
+
+    def test_is_eval_span_with_evaluator_type(self, processor):
+        """Test _is_eval_span returns True for evaluator span type."""
+        span = self.create_mock_span({"span_type": "evaluator"})
+        assert processor._is_eval_span(span) is True
+
+    def test_is_eval_span_with_evaluation_type(self, processor):
+        """Test _is_eval_span returns True for evaluation span type."""
+        span = self.create_mock_span({"span_type": "evaluation"})
+        assert processor._is_eval_span(span) is True
+
+    def test_is_eval_span_with_eval_set_run_type(self, processor):
+        """Test _is_eval_span returns True for eval_set_run span type."""
+        span = self.create_mock_span({"span_type": "eval_set_run"})
+        assert processor._is_eval_span(span) is True
+
+    def test_is_eval_span_with_eval_output_type(self, processor):
+        """Test _is_eval_span returns True for evalOutput span type."""
+        span = self.create_mock_span({"span_type": "evalOutput"})
+        assert processor._is_eval_span(span) is True
+
+    def test_is_eval_span_with_execution_id(self, processor):
+        """Test _is_eval_span returns True for span with execution.id."""
+        span = self.create_mock_span({"execution.id": "test-id"})
+        assert processor._is_eval_span(span) is True
+
+    def test_is_eval_span_with_both_criteria(self, processor):
+        """Test _is_eval_span returns True when both criteria match."""
+        span = self.create_mock_span(
+            {"span_type": "evaluation", "execution.id": "test-id"}
+        )
+        assert processor._is_eval_span(span) is True
+
+    def test_is_eval_span_with_non_eval_type(self, processor):
+        """Test _is_eval_span returns False for non-eval span type."""
+        span = self.create_mock_span({"span_type": "agent"})
+        assert processor._is_eval_span(span) is False
+
+    def test_is_eval_span_with_no_attributes(self, processor):
+        """Test _is_eval_span returns False when span attributes are None."""
+        span = self.create_mock_span(None)
+        assert processor._is_eval_span(span) is False
+
+    def test_is_eval_span_with_empty_attributes(self, processor):
+        """Test _is_eval_span returns False when span has empty attributes."""
+        span = self.create_mock_span({})
+        assert processor._is_eval_span(span) is False
+
+    def test_shutdown(self, processor):
+        """Test shutdown method."""
+        # Should not raise exception
+        processor.shutdown()
+
+    def test_force_flush(self, processor):
+        """Test force_flush method."""
+        result = processor.force_flush()
+        assert result is True
+
+    def test_force_flush_with_timeout(self, processor):
+        """Test force_flush with custom timeout."""
+        result = processor.force_flush(timeout_millis=5000)
+        assert result is True
+
+    def test_on_start_with_parent_context(self, processor, mock_exporter):
+        """Test on_start upserts when a parent context is supplied."""
+        span = self.create_mock_span({"span_type": "eval"})
+        parent_context = Mock(spec=context_api.Context)
+
+        processor.on_start(span, parent_context)
+
+        mock_exporter.upsert_span.assert_called_once_with(
+            span, status_override=SpanStatus.RUNNING
+        )
+
+    def test_processor_handles_all_eval_span_types(self, processor):
+        """Test that all eval span types are properly detected."""
+        eval_span_types = [
+            "eval",
+            "evaluator",
+            "evaluation",
+            "eval_set_run",
+            "evalOutput",
+        ]
+
+        for span_type in eval_span_types:
+            span = self.create_mock_span({"span_type": span_type})
+            assert processor._is_eval_span(span) is True, (
+                f"Failed for span_type: {span_type}"
+            )

From 61f77021346ec2ea807c29014ef4e052dbdc270b Mon Sep 17 00:00:00 2001
From: Anipik <anirudh.agnihotry@uipath.com>
Date: Wed, 14 Jan 2026 10:17:15 -0800
Subject: [PATCH 2/2] chore: bump version to 2.4.20

---
 pyproject.toml | 2 +-
 uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7b3035833..3da530fc3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.4.19"
+version = "2.4.20"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/uv.lock b/uv.lock
index f3155edd9..81f311af3 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2486,7 +2486,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.4.19"
+version = "2.4.20"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },