From f8f1c9df40f56815dab30b6eb19c9ab95832b1b3 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 20 Jan 2026 13:05:53 +0530 Subject: [PATCH 1/6] fix score format --- backend/app/crud/evaluations/core.py | 26 ++++++++-- backend/app/crud/evaluations/processing.py | 26 +++++----- .../app/services/evaluations/evaluation.py | 50 +++++++++++++++---- 3 files changed, 73 insertions(+), 29 deletions(-) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index b2b118df..59dec8e2 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -207,7 +207,7 @@ def get_or_fetch_score( This function implements a cache-on-first-request pattern: - If score already has 'traces' key, return it - - Otherwise, fetch from Langfuse, update score column, and return + - Otherwise, fetch from Langfuse, merge with existing summary_scores, and return - If force_refetch is True, always fetch fresh data from Langfuse Args: @@ -224,8 +224,8 @@ def get_or_fetch_score( Exception: If Langfuse API calls fail """ # Check if score already exists with traces - has_score = eval_run.score is not None and "traces" in eval_run.score - if not force_refetch and has_score: + has_traces = eval_run.score is not None and "traces" in eval_run.score + if not force_refetch and has_traces: logger.info( f"[get_or_fetch_score] Returning existing score | evaluation_id={eval_run.id}" ) @@ -237,13 +237,31 @@ def get_or_fetch_score( f"run={eval_run.run_name} | force_refetch={force_refetch}" ) + # Get existing summary_scores if any (e.g., cosine_similarity from cron job) + existing_summary_scores = [] + if eval_run.score and "summary_scores" in eval_run.score: + existing_summary_scores = eval_run.score.get("summary_scores", []) + # Fetch from Langfuse - score = fetch_trace_scores_from_langfuse( + langfuse_score = fetch_trace_scores_from_langfuse( langfuse=langfuse, dataset_name=eval_run.dataset_name, run_name=eval_run.run_name, ) + # Merge summary_scores: existing scores + new scores from Langfuse + existing_scores_map = {s["name"]: s for s in existing_summary_scores} + for langfuse_summary in langfuse_score.get("summary_scores", []): + existing_scores_map[langfuse_summary["name"]] = langfuse_summary + + merged_summary_scores = list(existing_scores_map.values()) + + # Build final score with merged summary_scores and traces + score: dict[str, Any] = { + "summary_scores": merged_summary_scores, + "traces": langfuse_score.get("traces", []), + } + # Update score column using existing helper update_evaluation_run(session=session, eval_run=eval_run, score=score) diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index fbc2d231..076ac9f3 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -382,21 +382,19 @@ async def process_completed_embedding_batch( # Step 4: Calculate similarity scores similarity_stats = calculate_average_similarity(embedding_pairs=embedding_pairs) - # Step 5: Update evaluation_run with scores - if eval_run.score is None: - eval_run.score = {} - - eval_run.score["cosine_similarity"] = { - "avg": similarity_stats["cosine_similarity_avg"], - "std": similarity_stats["cosine_similarity_std"], - "total_pairs": similarity_stats["total_pairs"], - } - - # Optionally store per-item scores if not too large - if len(similarity_stats.get("per_item_scores", [])) <= 100: - eval_run.score["cosine_similarity"]["per_item_scores"] = similarity_stats[ - "per_item_scores" + # 
Step 5: Update evaluation_run with scores in summary_scores format + # This format is consistent with what Langfuse returns when fetching traces + eval_run.score = { + "summary_scores": [ + { + "name": "cosine_similarity", + "avg": round(float(similarity_stats["cosine_similarity_avg"]), 2), + "std": round(float(similarity_stats["cosine_similarity_std"]), 2), + "total_pairs": similarity_stats["total_pairs"], + "data_type": "NUMERIC", + } ] + } # Step 6: Update Langfuse traces with cosine similarity scores logger.info( diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index bf0d4dd0..4c1a5de7 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -263,19 +263,27 @@ def get_evaluation_with_scores( if not eval_run: return None, None - if not get_trace_info: - return eval_run, None - # Only fetch trace info for completed evaluations if eval_run.status != "completed": - return eval_run, ( - f"Trace info is only available for completed evaluations. " - f"Current status: {eval_run.status}" - ) + if get_trace_info: + return eval_run, ( + f"Trace info is only available for completed evaluations. " + f"Current status: {eval_run.status}" + ) + return eval_run, None + + # Check if we already have cached summary_scores + has_summary_scores = ( + eval_run.score is not None and "summary_scores" in eval_run.score + ) - # Check if we already have cached scores - has_cached_score = eval_run.score is not None and "traces" in eval_run.score - if not resync_score and has_cached_score: + # If not requesting trace info, return existing score (with summary_scores) + if not get_trace_info: + return eval_run, None + + # Check if we already have cached traces + has_cached_traces = eval_run.score is not None and "traces" in eval_run.score + if not resync_score and has_cached_traces: return eval_run, None langfuse = get_langfuse_client( @@ -288,9 +296,12 @@ def get_evaluation_with_scores( dataset_name = eval_run.dataset_name run_name = eval_run.run_name eval_run_id = eval_run.id + existing_summary_scores = ( + eval_run.score.get("summary_scores", []) if has_summary_scores else [] + ) try: - score = fetch_trace_scores_from_langfuse( + langfuse_score = fetch_trace_scores_from_langfuse( langfuse=langfuse, dataset_name=dataset_name, run_name=run_name, @@ -309,6 +320,23 @@ def get_evaluation_with_scores( ) return eval_run, f"Failed to fetch trace info from Langfuse: {str(e)}" + # Merge summary_scores: existing scores + new scores from Langfuse + # Create a map of existing scores by name + existing_scores_map = {s["name"]: s for s in existing_summary_scores} + langfuse_summary_scores = langfuse_score.get("summary_scores", []) + + # Merge: Langfuse scores take precedence (more up-to-date) + for langfuse_summary in langfuse_summary_scores: + existing_scores_map[langfuse_summary["name"]] = langfuse_summary + + merged_summary_scores = list(existing_scores_map.values()) + + # Build final score with merged summary_scores and traces + score = { + "summary_scores": merged_summary_scores, + "traces": langfuse_score.get("traces", []), + } + eval_run = save_score( eval_run_id=eval_run_id, organization_id=organization_id, From 7c9ca371292a31c387c95bfdffc834fad0aed969 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 20 Jan 2026 13:10:51 +0530 Subject: [PATCH 2/6] cleanup documentation --- backend/app/api/routes/evaluations/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/backend/app/api/routes/evaluations/__init__.py b/backend/app/api/routes/evaluations/__init__.py index 3f7fe120..1c7034a7 100644 --- a/backend/app/api/routes/evaluations/__init__.py +++ b/backend/app/api/routes/evaluations/__init__.py @@ -4,7 +4,7 @@ from app.api.routes.evaluations import dataset, evaluation -router = APIRouter(prefix="/evaluations", tags=["evaluation"]) +router = APIRouter(prefix="/evaluations", tags=["Evaluation"]) # Include dataset routes under /evaluations/datasets router.include_router(dataset.router, prefix="/datasets") From 60512d232c0c14371e80ae9c47d74ccc81321642 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 20 Jan 2026 13:20:07 +0530 Subject: [PATCH 3/6] cleanup router --- backend/app/api/main.py | 5 +++-- backend/app/api/routes/evaluations/__init__.py | 13 ------------- backend/app/api/routes/evaluations/dataset.py | 2 +- backend/app/api/routes/evaluations/evaluation.py | 2 +- 4 files changed, 5 insertions(+), 17 deletions(-) delete mode 100644 backend/app/api/routes/evaluations/__init__.py diff --git a/backend/app/api/main.py b/backend/app/api/main.py index 47cea3b1..bcd64eb5 100644 --- a/backend/app/api/main.py +++ b/backend/app/api/main.py @@ -20,11 +20,11 @@ onboarding, credentials, cron, - evaluations, fine_tuning, model_evaluation, collection_job, ) +from app.api.routes.evaluations import dataset as evaluation_dataset, evaluation from app.core.config import settings api_router = APIRouter() @@ -37,7 +37,8 @@ api_router.include_router(cron.router) api_router.include_router(documents.router) api_router.include_router(doc_transformation_job.router) -api_router.include_router(evaluations.router) +api_router.include_router(evaluation_dataset.router) +api_router.include_router(evaluation.router) api_router.include_router(llm.router) api_router.include_router(login.router) api_router.include_router(onboarding.router) diff --git a/backend/app/api/routes/evaluations/__init__.py b/backend/app/api/routes/evaluations/__init__.py deleted file mode 100644 index 1c7034a7..00000000 --- a/backend/app/api/routes/evaluations/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Evaluation API routes.""" - -from fastapi import APIRouter - -from app.api.routes.evaluations import dataset, evaluation - -router = APIRouter(prefix="/evaluations", tags=["Evaluation"]) - -# Include dataset routes under /evaluations/datasets -router.include_router(dataset.router, prefix="/datasets") - -# Include evaluation routes directly under /evaluations -router.include_router(evaluation.router) diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py index 25ecacff..d66ff71c 100644 --- a/backend/app/api/routes/evaluations/dataset.py +++ b/backend/app/api/routes/evaluations/dataset.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) -router = APIRouter() +router = APIRouter(prefix="/evaluations/datasets", tags=["Evaluation"]) def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse: diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index b51a5948..d40a88a1 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -router = APIRouter() +router = APIRouter(prefix="/evaluations", tags=["Evaluation"]) @router.post( From 2b6861799346bd338be094672043cfa63cf21d17 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 21 Jan 2026 10:59:15 +0530 Subject: 
[PATCH 4/6] updated testcase --- backend/app/tests/crud/evaluations/test_processing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/app/tests/crud/evaluations/test_processing.py b/backend/app/tests/crud/evaluations/test_processing.py index 95a91bbd..a449671b 100644 --- a/backend/app/tests/crud/evaluations/test_processing.py +++ b/backend/app/tests/crud/evaluations/test_processing.py @@ -527,8 +527,13 @@ async def test_process_completed_embedding_batch_success( db.refresh(result) assert result.status == "completed" assert result.score is not None - assert "cosine_similarity" in result.score - assert result.score["cosine_similarity"]["avg"] == 0.95 + assert "summary_scores" in result.score + summary_scores = result.score["summary_scores"] + cosine_score = next( + (s for s in summary_scores if s["name"] == "cosine_similarity"), None + ) + assert cosine_score is not None + assert cosine_score["avg"] == 0.95 mock_update_traces.assert_called_once() @pytest.mark.asyncio From 8655b2d3f544ad629ec6d1444dcc1d42f981c6eb Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 21 Jan 2026 11:02:52 +0530 Subject: [PATCH 5/6] updated testcase --- backend/app/tests/crud/evaluations/test_processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/app/tests/crud/evaluations/test_processing.py b/backend/app/tests/crud/evaluations/test_processing.py index a449671b..afb0ac0e 100644 --- a/backend/app/tests/crud/evaluations/test_processing.py +++ b/backend/app/tests/crud/evaluations/test_processing.py @@ -534,7 +534,6 @@ async def test_process_completed_embedding_batch_success( ) assert cosine_score is not None assert cosine_score["avg"] == 0.95 - mock_update_traces.assert_called_once() @pytest.mark.asyncio @patch("app.crud.evaluations.processing.download_batch_results") From dd80a36cba53485e53c7eab8640c0c3644447d60 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 22 Jan 2026 09:44:12 +0530 Subject: [PATCH 6/6] added types --- backend/app/crud/evaluations/__init__.py | 8 ++++ backend/app/crud/evaluations/core.py | 8 ++-- backend/app/crud/evaluations/langfuse.py | 12 ++--- backend/app/crud/evaluations/score.py | 56 ++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 backend/app/crud/evaluations/score.py diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index bb095413..e667dcbb 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -35,3 +35,11 @@ process_completed_embedding_batch, process_completed_evaluation, ) +from app.crud.evaluations.score import ( + CategoricalSummaryScore, + EvaluationScore, + NumericSummaryScore, + SummaryScore, + TraceData, + TraceScore, +) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 59dec8e2..33b6777f 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -1,11 +1,11 @@ import logging -from typing import Any from langfuse import Langfuse from sqlmodel import Session, select from app.core.util import now from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse +from app.crud.evaluations.score import EvaluationScore from app.models import EvaluationRun logger = logging.getLogger(__name__) @@ -201,7 +201,7 @@ def get_or_fetch_score( eval_run: EvaluationRun, langfuse: Langfuse, force_refetch: bool = False, -) -> dict[str, Any]: +) -> EvaluationScore: """ Get cached score with trace info or fetch 
from Langfuse and update. @@ -257,7 +257,7 @@ def get_or_fetch_score( merged_summary_scores = list(existing_scores_map.values()) # Build final score with merged summary_scores and traces - score: dict[str, Any] = { + score: EvaluationScore = { "summary_scores": merged_summary_scores, "traces": langfuse_score.get("traces", []), } @@ -278,7 +278,7 @@ def save_score( eval_run_id: int, organization_id: int, project_id: int, - score: dict[str, Any], + score: EvaluationScore, ) -> EvaluationRun | None: """ Save score to evaluation run with its own session. diff --git a/backend/app/crud/evaluations/langfuse.py b/backend/app/crud/evaluations/langfuse.py index 01a1104a..eced1e7b 100644 --- a/backend/app/crud/evaluations/langfuse.py +++ b/backend/app/crud/evaluations/langfuse.py @@ -15,6 +15,8 @@ import numpy as np from langfuse import Langfuse +from app.crud.evaluations.score import EvaluationScore, TraceData, TraceScore + logger = logging.getLogger(__name__) @@ -319,7 +321,7 @@ def fetch_trace_scores_from_langfuse( langfuse: Langfuse, dataset_name: str, run_name: str, -) -> dict[str, Any]: +) -> EvaluationScore: """ Fetch trace scores from Langfuse for an evaluation run. @@ -402,14 +404,14 @@ def fetch_trace_scores_from_langfuse( ) # 3. Fetch trace details with scores for each trace - traces = [] + traces: list[TraceData] = [] # Track score aggregations by name: {name: {"data_type": str, "values": list}} score_aggregations: dict[str, dict[str, Any]] = {} for trace_id in trace_ids: try: trace = langfuse.api.trace.get(trace_id) - trace_data: dict[str, Any] = { + trace_data: TraceData = { "trace_id": trace_id, "question": "", "llm_answer": "", @@ -453,7 +455,7 @@ def fetch_trace_scores_from_langfuse( ): score_value = round(float(score_value), 2) - score_entry: dict[str, Any] = { + score_entry: TraceScore = { "name": score_name, "value": score_value, "data_type": data_type, @@ -534,7 +536,7 @@ def fetch_trace_scores_from_langfuse( } ) - result: dict[str, Any] = { + result: EvaluationScore = { "summary_scores": summary_scores, "traces": traces, } diff --git a/backend/app/crud/evaluations/score.py b/backend/app/crud/evaluations/score.py new file mode 100644 index 00000000..0371ef8f --- /dev/null +++ b/backend/app/crud/evaluations/score.py @@ -0,0 +1,56 @@ +""" +Type definitions for evaluation scores. + +This module contains TypedDict definitions for type-safe score data +used throughout the evaluation system. +""" + +from typing import NotRequired, TypedDict + + +class TraceScore(TypedDict): + """A score attached to a trace.""" + + name: str + value: float | str + data_type: str + comment: NotRequired[str] + + +class TraceData(TypedDict): + """Data for a single trace including Q&A and scores.""" + + trace_id: str + question: str + llm_answer: str + ground_truth_answer: str + scores: list[TraceScore] + + +class NumericSummaryScore(TypedDict): + """Summary statistics for a numeric score across all traces.""" + + name: str + avg: float + std: float + total_pairs: int + data_type: str + + +class CategoricalSummaryScore(TypedDict): + """Summary statistics for a categorical score across all traces.""" + + name: str + distribution: dict[str, int] + total_pairs: int + data_type: str + + +SummaryScore = NumericSummaryScore | CategoricalSummaryScore + + +class EvaluationScore(TypedDict): + """Complete evaluation score data with traces and summary statistics.""" + + summary_scores: list[SummaryScore] + traces: list[TraceData]
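
Note on the score format introduced above: the merge added in patches 1 and 6 keeps any summary score already stored on the run (for example the cosine_similarity entry written by the embedding batch cron job in processing.py) and lets entries fetched from Langfuse override same-named ones. Below is a minimal standalone sketch of that behaviour; the merge_summary_scores helper, the answer_correctness score name, and the sample values are illustrative only — the real code inlines this logic in get_or_fetch_score and get_evaluation_with_scores and types the payload with the TypedDicts from score.py.

    from typing import TypedDict


    class SummaryScore(TypedDict):
        """Shape of one NUMERIC summary score entry stored in EvaluationRun.score."""

        name: str
        avg: float
        std: float
        total_pairs: int
        data_type: str


    def merge_summary_scores(
        existing: list[SummaryScore], from_langfuse: list[SummaryScore]
    ) -> list[SummaryScore]:
        """Langfuse entries override same-named existing entries; the rest are kept."""
        merged = {score["name"]: score for score in existing}
        for score in from_langfuse:
            merged[score["name"]] = score
        return list(merged.values())


    if __name__ == "__main__":
        # cosine_similarity written earlier by the embedding batch cron job ...
        existing: list[SummaryScore] = [
            {
                "name": "cosine_similarity",
                "avg": 0.95,
                "std": 0.02,
                "total_pairs": 40,
                "data_type": "NUMERIC",
            }
        ]
        # ... plus whatever summary scores Langfuse returns for the run
        # (the score name and values below are made up for the demo).
        from_langfuse: list[SummaryScore] = [
            {
                "name": "answer_correctness",
                "avg": 0.81,
                "std": 0.11,
                "total_pairs": 40,
                "data_type": "NUMERIC",
            }
        ]
        score = {
            "summary_scores": merge_summary_scores(existing, from_langfuse),
            "traces": [],  # per-trace details fetched from Langfuse would go here
        }
        # Both entries survive the merge; a re-fetched cosine_similarity would win.
        print(score)

This is also why the cron-job path in processing.py now writes cosine_similarity in the same summary_scores shape that fetch_trace_scores_from_langfuse returns: the two sources can then be merged by name without special-casing either one.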