2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "uipath"
version = "2.4.14"
version = "2.4.15"
description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
1 change: 1 addition & 0 deletions samples/calculator/evaluations/eval-sets/legacy.json
@@ -5,6 +5,7 @@
  "batchSize": 10,
  "evaluatorRefs": [
    "equality",
+    "equality-with-target-key",
    "llm-as-a-judge",
    "json-similarity",
    "trajectory"
@@ -0,0 +1,11 @@
+{
+  "fileName": "equality-with-target-key.json",
+  "id": "equality-with-target-key",
+  "name": "Legacy Equality Evaluator With Target Key",
+  "description": "An evaluator that judges the agent based on expected output under \"result\" key.",
+  "category": 0,
+  "type": 1,
+  "targetOutputKey": "result",
+  "createdAt": "2025-06-26T17:45:39.651Z",
+  "updatedAt": "2025-06-26T17:45:39.651Z"
+}
21 changes: 19 additions & 2 deletions src/uipath/eval/evaluators/legacy_exact_match_evaluator.py
@@ -40,7 +40,24 @@ async def evaluate(
        Returns:
            EvaluationResult: Boolean result indicating exact match (True/False)
        """
+        actual_output = agent_execution.agent_output
+        expected_output = evaluation_criteria.expected_output

+        if self.target_output_key and self.target_output_key != "*":
+            if isinstance(actual_output, dict) and isinstance(expected_output, dict):
+                if not (
+                    self.target_output_key in actual_output
+                    and self.target_output_key in expected_output
+                ):
+                    # Key missing on one side: assume the test should pass and compare empty outputs.
+                    expected_output = actual_output = {}
+                else:
+                    if self.target_output_key in actual_output:
+                        actual_output = actual_output[self.target_output_key]
+                    if self.target_output_key in expected_output:
+                        expected_output = expected_output[self.target_output_key]

        return BooleanEvaluationResult(
-            score=self._canonical_json(agent_execution.agent_output)
-            == self._canonical_json(evaluation_criteria.expected_output)
+            score=self._canonical_json(actual_output)
+            == self._canonical_json(expected_output)
        )
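The change above narrows the exact-match comparison to a single key of the agent output when `targetOutputKey` is set, and treats a missing key on either side as a pass. The following is a rough standalone sketch of that behavior, not the SDK implementation: `select_for_comparison` and `exact_match` are hypothetical helper names, and `json.dumps(..., sort_keys=True)` stands in for the evaluator's `_canonical_json`.

import json
from typing import Any


def select_for_comparison(
    actual: Any, expected: Any, target_key: str | None
) -> tuple[Any, Any]:
    """Hypothetical helper mirroring the diff: narrow both outputs to target_key when possible."""
    if target_key and target_key != "*":
        if isinstance(actual, dict) and isinstance(expected, dict):
            if target_key in actual and target_key in expected:
                return actual[target_key], expected[target_key]
            # Key missing on one side: compare empty outputs so the check passes vacuously.
            return {}, {}
    return actual, expected


def exact_match(actual: Any, expected: Any, target_key: str | None = "result") -> bool:
    a, e = select_for_comparison(actual, expected, target_key)
    # Stand-in for the evaluator's canonical-JSON comparison.
    return json.dumps(a, sort_keys=True) == json.dumps(e, sort_keys=True)


if __name__ == "__main__":
    print(exact_match({"result": 8, "steps": 3}, {"result": 8}))  # True: only "result" is compared
    print(exact_match({"result": 9}, {"result": 8}))              # False: values under "result" differ
    print(exact_match({"other": 1}, {"result": 8}))               # True: key missing on one side

With an evaluator configured like the sample above ("targetOutputKey": "result"), extra keys in the agent output no longer break exact-match scoring.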
3 changes: 2 additions & 1 deletion testcases/calculator-evals/run.sh
@@ -8,6 +8,7 @@ echo "Authenticating with UiPath..."
uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL"

echo "Running evaluations with custom evaluator..."
-uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/default.json --no-report
+uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/legacy.json --no-report --output-file legacy.json
+uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/default.json --no-report --output-file default.json

echo "Test completed successfully!"
159 changes: 85 additions & 74 deletions testcases/calculator-evals/src/assert.py
@@ -12,8 +12,92 @@
def main() -> None:
    """Main assertion logic."""
    # Check if output file exists
-    output_file = "__uipath/output.json"
+    for output_file in ["default.json", "legacy.json"]:
+        assert os.path.isfile(output_file), (
+            f"Evaluation output file '{output_file}' not found"
+        )
+        print(f"✓ Found evaluation output file: {output_file}")

+        # Load evaluation results
+        with open(output_file, "r", encoding="utf-8") as f:
+            output_data = json.load(f)

+        print("✓ Loaded evaluation output")

+        # Extract output data
+        output = output_data

+        # Validate structure
+        assert "evaluationSetResults" in output, "Missing 'evaluationSetResults' in output"

+        evaluation_results = output["evaluationSetResults"]
+        assert len(evaluation_results) > 0, "No evaluation results found"

+        print(f"✓ Found {len(evaluation_results)} evaluation result(s)")

+        # Validate each evaluation result
+        passed_count = 0
+        failed_count = 0
+        skipped_count = 0
+        has_positive_scores = False

+        for eval_result in evaluation_results:
+            eval_name = eval_result.get("evaluationName", "Unknown")
+            print(f"\n→ Validating: {eval_name}")

+            try:
+                # Validate evaluation results are present
+                eval_run_results = eval_result.get("evaluationRunResults", [])
+                if len(eval_run_results) == 0:
+                    print(f" ⊘ Skipping '{eval_name}' (no evaluation run results)")
+                    skipped_count += 1
+                    continue

+                # Check that evaluations have scores > 0
+                all_passed = True
+                min_score = 100
+                for eval_run in eval_run_results:
+                    evaluator_name = eval_run.get("evaluatorName", "Unknown")
+                    result = eval_run.get("result", {})
+                    score = result.get("score", 0)
+                    min_score = min(min_score, score)

+                    # Check if score is greater than 0
+                    if score > 0:
+                        has_positive_scores = True
+                        print(f" ✓ {evaluator_name}: score={score:.1f}")
+                    else:
+                        print(f" ✗ {evaluator_name}: score={score:.1f} (must be > 0)")
+                        all_passed = False

+                if all_passed and min_score > 0:
+                    print(
+                        f" ✓ All evaluators passed for '{eval_name}' (min score: {min_score:.1f})"
+                    )
+                    passed_count += 1
+                else:
+                    print(f" ✗ Some evaluators failed for '{eval_name}'")
+                    failed_count += 1

+            except Exception as e:
+                print(f" ✗ Error validating '{eval_name}': {e}")
+                failed_count += 1

+        # Final summary
+        print(f"\n{'=' * 60}")
+        print("Summary:")
+        print(f" Total evaluations: {passed_count + failed_count + skipped_count}")
+        print(f" ✓ Passed: {passed_count}")
+        print(f" ✗ Failed: {failed_count}")
+        print(f" ⊘ Skipped: {skipped_count}")
+        print(f"{'=' * 60}")

+        assert failed_count == 0, "Some assertions failed"
+        assert has_positive_scores, "No evaluation scores greater than 0 were found"

+        print("\n✅ All assertions passed!")

+    output_file = "__uipath/output.json"
    assert os.path.isfile(output_file), (
        f"Evaluation output file '{output_file}' not found"
    )
Expand All @@ -30,79 +114,6 @@ def main() -> None:
assert status == "successful", f"Evaluation run failed with status: {status}"
print("✓ Evaluation run status: successful")

# Extract output data
output = output_data.get("output", {})

# Validate structure
assert "evaluationSetResults" in output, "Missing 'evaluationSetResults' in output"

evaluation_results = output["evaluationSetResults"]
assert len(evaluation_results) > 0, "No evaluation results found"

print(f"✓ Found {len(evaluation_results)} evaluation result(s)")

# Validate each evaluation result
passed_count = 0
failed_count = 0
skipped_count = 0
has_positive_scores = False

for eval_result in evaluation_results:
eval_name = eval_result.get("evaluationName", "Unknown")
print(f"\n→ Validating: {eval_name}")

try:
# Validate evaluation results are present
eval_run_results = eval_result.get("evaluationRunResults", [])
if len(eval_run_results) == 0:
print(f" ⊘ Skipping '{eval_name}' (no evaluation run results)")
skipped_count += 1
continue

# Check that evaluations have scores > 0
all_passed = True
min_score = 100
for eval_run in eval_run_results:
evaluator_name = eval_run.get("evaluatorName", "Unknown")
result = eval_run.get("result", {})
score = result.get("score", 0)
min_score = min(min_score, score)

# Check if score is greater than 0
if score > 0:
has_positive_scores = True
print(f" ✓ {evaluator_name}: score={score:.1f}")
else:
print(f" ✗ {evaluator_name}: score={score:.1f} (must be > 0)")
all_passed = False

if all_passed and min_score > 0:
print(
f" ✓ All evaluators passed for '{eval_name}' (min score: {min_score:.1f})"
)
passed_count += 1
else:
print(f" ✗ Some evaluators failed for '{eval_name}'")
failed_count += 1

except Exception as e:
print(f" ✗ Error validating '{eval_name}': {e}")
failed_count += 1

# Final summary
print(f"\n{'=' * 60}")
print("Summary:")
print(f" Total evaluations: {passed_count + failed_count + skipped_count}")
print(f" ✓ Passed: {passed_count}")
print(f" ✗ Failed: {failed_count}")
print(f" ⊘ Skipped: {skipped_count}")
print(f"{'=' * 60}")

assert failed_count == 0, "Some assertions failed"
assert has_positive_scores, "No evaluation scores greater than 0 were found"

print("\n✅ All assertions passed!")


if __name__ == "__main__":
main()