60 changes: 60 additions & 0 deletions backend/app/alembic/versions/041_add_config_in_evals_run_table.py
@@ -0,0 +1,60 @@
"""add config in evals run table
Revision ID: 041
Revises: 040
Create Date: 2025-12-15 14:03:22.082746
"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "041"
down_revision = "040"
branch_labels = None
depends_on = None


def upgrade():

⚠️ Potential issue | 🟡 Minor

Add return type hints to migration functions.

Both upgrade() and downgrade() functions are missing return type hints.

As per coding guidelines, all functions should have type hints.

📝 Proposed fix
-def upgrade():
+def upgrade() -> None:
-def downgrade():
+def downgrade() -> None:

Also applies to: 45-45

🤖 Prompt for AI Agents
In @backend/app/alembic/versions/041_add_config_in_evals_run_table.py at line
20, The migration functions upgrade() and downgrade() lack return type hints;
update both function definitions (upgrade and downgrade) to include explicit
return types (e.g., change "def upgrade():" and "def downgrade():" to "def
upgrade() -> None:" and "def downgrade() -> None:") so they conform to the
project's typing guidelines.

    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_id",
            sa.Uuid(),
            nullable=True,
            comment="Reference to the stored config used",
        ),
    )
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_version",
            sa.Integer(),
            nullable=True,
            comment="Version of the config used",
        ),
    )
    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
    op.drop_column("evaluation_run", "config")
Comment on lines +22 to +41

⚠️ Potential issue | 🔴 Critical

Critical: Data loss and foreign key constraint naming issues.

This migration has two critical problems:

  1. Data loss: Line 41 drops the config column without migrating existing data to the new config_id/config_version columns. Any existing evaluation runs will lose their configuration data permanently.

  2. Foreign key constraint naming: Line 40 creates a foreign key with None as the constraint name, so the name is auto-generated by the database. However, the downgrade function (Line 57) also passes None to op.drop_constraint, which cannot identify the auto-generated constraint and will fail.

Required actions:

  1. Add a data migration step before dropping the config column (a sketch is included after the proposed fixes below). You'll need to:

    • Parse each existing config JSONB object
    • Look up or create corresponding config records with appropriate versions
    • Update config_id and config_version for each evaluation_run
    • Or, if data migration isn't feasible, add a comment explaining why data loss is acceptable
  2. Specify an explicit constraint name instead of None:

🔧 Proposed fix for FK constraint naming
-    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+    op.create_foreign_key(
+        "fk_evaluation_run_config_id", 
+        "evaluation_run", 
+        "config", 
+        ["config_id"], 
+        ["id"]
+    )

And update the downgrade:

-    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+    op.drop_constraint("fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey")

Committable suggestion skipped: line range outside the PR's diff.
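
If the data-migration route is taken, a minimal sketch of what it could look like inside upgrade(), after the new columns are added and before the drop_column call, is shown below. The config table's column names (config_blob, version) and the bare INSERT are assumptions for illustration; the real table very likely requires additional fields (name, project/organization references, timestamps), so treat this as a starting point rather than a drop-in fix.

    # Hypothetical data-migration step for upgrade(); the "config" table
    # column names used here are assumptions.
    import json

    from sqlalchemy import text

    bind = op.get_bind()
    rows = bind.execute(
        text("SELECT id, config FROM evaluation_run WHERE config IS NOT NULL")
    ).fetchall()
    for run_id, config_blob in rows:
        # Assumption: a config record can be created from the raw blob alone;
        # the real table likely needs more columns (name, project_id, ...).
        created = bind.execute(
            text(
                "INSERT INTO config (config_blob, version) "
                "VALUES (CAST(:blob AS JSONB), 1) RETURNING id, version"
            ),
            {"blob": json.dumps(config_blob)},
        ).one()
        bind.execute(
            text(
                "UPDATE evaluation_run "
                "SET config_id = :cid, config_version = :cver WHERE id = :rid"
            ),
            {"cid": created.id, "cver": created.version, "rid": run_id},
        )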

    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            autoincrement=False,
            nullable=False,
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
Comment on lines +47 to +56

⚠️ Potential issue | 🔴 Critical

Critical: Downgrade will fail with existing data.

The downgrade re-adds the config column with nullable=False (Line 53). If the evaluation_run table contains any records when downgrading, this operation will fail because PostgreSQL cannot add a non-nullable column to a table with existing rows without specifying a default value.

Either:

  1. Make the column nullable during downgrade: nullable=True
  2. Provide a server default value (a sketch of this option follows the proposed fix below)
  3. Add a data migration to populate the column before setting it non-nullable
🔧 Proposed fix (Option 1: Make nullable)
     op.add_column(
         "evaluation_run",
         sa.Column(
             "config",
             postgresql.JSONB(astext_type=sa.Text()),
             autoincrement=False,
-            nullable=False,
+            nullable=True,
             comment="Evaluation configuration (model, instructions, etc.)",
         ),
     )
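
For completeness, a sketch of Option 2 (server default) is shown below; the empty-JSON default is an assumed placeholder rather than a meaningful configuration, so Option 1 or a data migration is likely the better choice.

    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            autoincrement=False,
            nullable=False,
            # Assumed placeholder so existing rows satisfy NOT NULL.
            server_default=sa.text("'{}'::jsonb"),
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
    # Optionally drop the default afterwards so new rows must supply a value.
    op.alter_column("evaluation_run", "config", server_default=None)
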
🤖 Prompt for AI Agents
In @backend/app/alembic/versions/041_add_config_in_evals_run_table.py around
lines 47 - 56, The downgrade currently re-adds the "config" column on the
"evaluation_run" table using op.add_column with sa.Column(..., nullable=False)
which will fail if rows exist; update that op.add_column call in the downgrade
to use nullable=True (or alternatively add a server_default or a prior data
migration to populate values before setting non-nullable), ensuring the column
is created nullable during downgrade to avoid PostgreSQL errors.

    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
    op.drop_column("evaluation_run", "config_version")
    op.drop_column("evaluation_run", "config_id")
    # ### end Alembic commands ###
110 changes: 39 additions & 71 deletions backend/app/api/routes/evaluation.py
@@ -3,12 +3,21 @@
import logging
import re
from pathlib import Path

from fastapi import APIRouter, Body, File, Form, HTTPException, Query, UploadFile
from uuid import UUID

from fastapi import (
APIRouter,
Body,
File,
Form,
HTTPException,
Query,
UploadFile,
)

from app.api.deps import AuthContextDep, SessionDep
from app.core.cloud import get_cloud_storage
from app.crud.assistants import get_assistant_by_id
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations import (
create_evaluation_dataset,
create_evaluation_run,
@@ -27,13 +36,17 @@
DatasetUploadResponse,
EvaluationRunPublic,
)
from app.models.llm.request import LLMCallConfig
from app.services.llm.jobs import resolve_config_blob
from app.services.llm.providers import LLMProvider
from app.utils import (
APIResponse,
get_langfuse_client,
get_openai_client,
load_description,
)


logger = logging.getLogger(__name__)

# File upload security constants
@@ -430,20 +443,9 @@ def evaluate(
experiment_name: str = Body(
..., description="Name for this evaluation experiment/run"
),
config: dict = Body(default_factory=dict, description="Evaluation configuration"),
assistant_id: str
| None = Body(
None, description="Optional assistant ID to fetch configuration from"
),
config_id: UUID = Body(..., description="Stored config ID"),
config_version: int = Body(..., ge=1, description="Stored config version"),
) -> APIResponse[EvaluationRunPublic]:
logger.info(
f"[evaluate] Starting evaluation | experiment_name={experiment_name} | "
f"dataset_id={dataset_id} | "
f"org_id={auth_context.organization.id} | "
f"assistant_id={assistant_id} | "
f"config_keys={list(config.keys())}"
)

# Step 1: Fetch dataset from database
dataset = get_dataset_by_id(
session=_session,
@@ -459,12 +461,6 @@ def evaluate(
f"organization/project",
)

logger.info(
f"[evaluate] Found dataset | id={dataset.id} | name={dataset.name} | "
f"object_store_url={'present' if dataset.object_store_url else 'None'} | "
f"langfuse_id={dataset.langfuse_dataset_id}"
)

dataset_name = dataset.name

# Get API clients
@@ -487,63 +483,35 @@
"Please ensure Langfuse credentials were configured when the dataset was created.",
)

# Handle assistant_id if provided
if assistant_id:
# Fetch assistant details from database
assistant = get_assistant_by_id(
session=_session,
assistant_id=assistant_id,
project_id=auth_context.project.id,
)

if not assistant:
raise HTTPException(
status_code=404, detail=f"Assistant {assistant_id} not found"
)
config_version_crud = ConfigVersionCrud(
session=_session, config_id=config_id, project_id=auth_context.project.id
)

logger.info(
f"[evaluate] Found assistant in DB | id={assistant.id} | "
f"model={assistant.model} | instructions="
f"{assistant.instructions[:50] if assistant.instructions else 'None'}..."
config, error = resolve_config_blob(
config_crud=config_version_crud,
config=LLMCallConfig(id=config_id, version=config_version),
)
if error:
raise HTTPException(
status_code=400,
detail=f"Failed to resolve config from stored config: {error}",
)

# Build config from assistant (use provided config values to override
# if present)
config = {
"model": config.get("model", assistant.model),
"instructions": config.get("instructions", assistant.instructions),
"temperature": config.get("temperature", assistant.temperature),
}

# Add tools if vector stores are available
vector_store_ids = config.get(
"vector_store_ids", assistant.vector_store_ids or []
elif config.completion.provider != LLMProvider.OPENAI:
raise HTTPException(
status_code=422,
detail="Only 'openai' provider is supported for evaluation configs",
)
if vector_store_ids and len(vector_store_ids) > 0:
config["tools"] = [
{
"type": "file_search",
"vector_store_ids": vector_store_ids,
}
]

logger.info("[evaluate] Using config from assistant")
else:
logger.info("[evaluate] Using provided config directly")
# Validate that config has minimum required fields
if not config.get("model"):
raise HTTPException(
status_code=400,
detail="Config must include 'model' when assistant_id is not provided",
)

# Create EvaluationRun record
logger.info("[evaluate] Successfully resolved config from config management")

# Create EvaluationRun record with config references
eval_run = create_evaluation_run(
session=_session,
run_name=experiment_name,
dataset_name=dataset_name,
dataset_id=dataset_id,
config=config,
config_id=config_id,
config_version=config_version,
organization_id=auth_context.organization.id,
project_id=auth_context.project.id,
)
@@ -555,7 +523,7 @@ def evaluate(
openai_client=openai_client,
session=_session,
eval_run=eval_run,
config=config,
config=config.completion.params,
)

logger.info(
2 changes: 2 additions & 0 deletions backend/app/crud/evaluations/__init__.py
@@ -5,6 +5,7 @@
create_evaluation_run,
get_evaluation_run_by_id,
list_evaluation_runs,
resolve_model_from_config,
)
from app.crud.evaluations.cron import (
process_all_pending_evaluations,
@@ -39,6 +40,7 @@
"create_evaluation_run",
"get_evaluation_run_by_id",
"list_evaluation_runs",
"resolve_model_from_config",
# Cron
"process_all_pending_evaluations",
"process_all_pending_evaluations_sync",
62 changes: 58 additions & 4 deletions backend/app/crud/evaluations/core.py
@@ -1,12 +1,16 @@
import logging
from typing import Any
from uuid import UUID

from langfuse import Langfuse
from sqlmodel import Session, select

from app.core.util import now
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
from app.models import EvaluationRun
from app.models.llm.request import LLMCallConfig
from app.services.llm.jobs import resolve_config_blob

logger = logging.getLogger(__name__)

@@ -16,7 +20,8 @@ def create_evaluation_run(
run_name: str,
dataset_name: str,
dataset_id: int,
config: dict,
config_id: UUID,
config_version: int,
organization_id: int,
project_id: int,
) -> EvaluationRun:
@@ -28,7 +33,8 @@
run_name: Name of the evaluation run/experiment
dataset_name: Name of the dataset being used
dataset_id: ID of the dataset
config: Configuration dict for the evaluation
config_id: UUID of the stored config
config_version: Version number of the config
organization_id: Organization ID
project_id: Project ID

@@ -39,7 +45,8 @@
run_name=run_name,
dataset_name=dataset_name,
dataset_id=dataset_id,
config=config,
config_id=config_id,
config_version=config_version,
status="pending",
organization_id=organization_id,
project_id=project_id,
@@ -56,7 +63,10 @@
logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True)
raise

logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}")
logger.info(
f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
f"config_id={config_id}, config_version={config_version}"
)

return eval_run

@@ -293,3 +303,47 @@ def save_score(
f"traces={len(score.get('traces', []))}"
)
return eval_run


def resolve_model_from_config(
session: Session,
eval_run: EvaluationRun,
) -> str:
"""
Resolve the model name from the evaluation run's config.

Args:
session: Database session
eval_run: EvaluationRun instance

Returns:
Model name from config

Raises:
ValueError: If config is missing, invalid, or has no model
"""
if not eval_run.config_id or not eval_run.config_version:
raise ValueError(
f"Evaluation run {eval_run.id} has no config reference "
f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
)

config_version_crud = ConfigVersionCrud(
session=session,
config_id=eval_run.config_id,
project_id=eval_run.project_id,
)

config, error = resolve_config_blob(
config_crud=config_version_crud,
config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
)

if error or config is None:
raise ValueError(
f"Config resolution failed for evaluation {eval_run.id} "
f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
)

model = config.completion.params.get("model")
return model
14 changes: 1 addition & 13 deletions backend/app/crud/evaluations/embeddings.py
@@ -364,19 +364,7 @@ def start_embedding_batch(
logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")

# Get embedding model from config (default: text-embedding-3-large)
embedding_model = eval_run.config.get(
"embedding_model", "text-embedding-3-large"
)

# Validate and fallback to default if invalid
try:
validate_embedding_model(embedding_model)
except ValueError as e:
logger.warning(
f"Invalid embedding model '{embedding_model}' in config: {e}. "
f"Falling back to text-embedding-3-large"
)
embedding_model = "text-embedding-3-large"
embedding_model = "text-embedding-3-large"

# Step 1: Build embedding JSONL with trace_ids
jsonl_data = build_embedding_jsonl(