Skip to content
24 changes: 17 additions & 7 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,8 @@ def initialize_model_from_run(run_id: int, *, strict_version: bool = True) -> An
run = get_run(run_id)
# TODO(eddiebergman): I imagine this is None if it's not published,
# might need to raise an explicit error for that
assert run.setup_id is not None
if run.setup_id is None:
raise ValueError(f"Run {run_id} has no associated setup_id. Cannot initialize model.")
return initialize_model(setup_id=run.setup_id, strict_version=strict_version)


Expand Down Expand Up @@ -416,7 +417,8 @@ def initialize_model_from_trace(
run = get_run(run_id)
# TODO(eddiebergman): I imagine this is None if it's not published,
# might need to raise an explicit error for that
assert run.flow_id is not None
if run.flow_id is None:
raise ValueError(f"Run {run_id} has no associated flow_id. Cannot initialize model.")

flow = get_flow(run.flow_id)
run_trace = get_run_trace(run_id)
Expand Down Expand Up @@ -576,8 +578,10 @@ def _calculate_local_measure( # type: ignore
_user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y)

if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
assert test_y is not None
assert proba_y is not None
if test_y is None:
raise ValueError("test_y cannot be None for classification tasks.")
if proba_y is None:
raise ValueError("proba_y cannot be None for classification tasks.")

for i, tst_idx in enumerate(test_indices):
if task.class_labels is not None:
Expand Down Expand Up @@ -622,7 +626,8 @@ def _calculate_local_measure( # type: ignore
)

elif isinstance(task, OpenMLRegressionTask):
assert test_y is not None
if test_y is None:
raise ValueError("test_y cannot be None for regression tasks.")
for i, _ in enumerate(test_indices):
truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
arff_line = format_prediction(
Expand Down Expand Up @@ -743,7 +748,8 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913

if isinstance(task, OpenMLSupervisedTask):
x, y = task.get_X_and_y()
assert isinstance(y, (pd.Series, pd.DataFrame))
if not isinstance(y, (pd.Series, pd.DataFrame)):
raise TypeError(f"y must be a pandas Series or DataFrame, got {type(y).__name__}")
train_x = x.iloc[train_indices]
train_y = y.iloc[train_indices]
test_x = x.iloc[test_indices]
Expand Down Expand Up @@ -1213,7 +1219,11 @@ def __list_runs(api_call: str) -> pd.DataFrame:
f'"http://openml.org/openml": {runs_dict}',
)

assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"])
if not isinstance(runs_dict["oml:runs"]["oml:run"], list):
raise TypeError(
f"Expected runs_dict['oml:runs']['oml:run'] to be a list, "
f"got {type(runs_dict['oml:runs']).__name__}"
)

runs = {
int(r["oml:run_id"]): {
Expand Down
119 changes: 49 additions & 70 deletions openml/runs/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,48 @@ def to_filesystem(
if self.trace is not None:
self.trace._to_filesystem(directory)

def _get_arff_attributes_for_task(self, task: OpenMLTask) -> list[tuple[str, Any]]:
    """Build the ARFF attribute declarations for a task's prediction file.

    The column layout depends only on the concrete type of ``task``:

    * classification / learning curve:
      ``repeat, fold, sample, row_id, prediction, correct, confidence.<label>...``
    * regression: ``repeat, fold, row_id, prediction, truth``
    * clustering: ``repeat, fold, row_id, cluster``

    Parameters
    ----------
    task : OpenMLTask
        The task whose predictions are being described.

    Returns
    -------
    list[tuple[str, Any]]
        ``(name, type)`` pairs in column order. The type is either the
        string ``"NUMERIC"`` or, for the nominal ``prediction`` and
        ``correct`` columns, the task's list of class labels.

    Raises
    ------
    ValueError
        If a classification or learning curve task has no class labels.
    NotImplementedError
        If ``task`` is none of the task types handled here.
    """
    if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)):
        labels = task.class_labels
        if labels is None:
            raise ValueError("The task has no class labels")

        # One confidence column per class label, in label order.
        confidence_columns = [("confidence." + label, "NUMERIC") for label in labels]
        return [
            ("repeat", "NUMERIC"),
            ("fold", "NUMERIC"),
            ("sample", "NUMERIC"),
            ("row_id", "NUMERIC"),
            ("prediction", labels),
            ("correct", labels),
            *confidence_columns,
        ]

    # Every remaining supported task type shares these leading columns
    # (there is no "sample" column outside the classification family).
    leading_columns = [
        ("repeat", "NUMERIC"),
        ("fold", "NUMERIC"),
        ("row_id", "NUMERIC"),
    ]

    if isinstance(task, OpenMLRegressionTask):
        return leading_columns + [("prediction", "NUMERIC"), ("truth", "NUMERIC")]

    if isinstance(task, OpenMLClusteringTask):
        return leading_columns + [("cluster", "NUMERIC")]

    raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.")

def _generate_arff_dict(self) -> OrderedDict[str, Any]:
"""Generates the arff dictionary for uploading predictions to the
server.
Expand All @@ -406,7 +448,8 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
if self.data_content is None:
raise ValueError("Run has not been executed.")
if self.flow is None:
assert self.flow_id is not None, "Run has no associated flow id!"
if self.flow_id is None:
raise ValueError("Run has no associated flow id!")
self.flow = get_flow(self.flow_id)

if self.description_text is None:
Expand All @@ -417,74 +460,7 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
arff_dict["data"] = self.data_content
arff_dict["description"] = self.description_text
arff_dict["relation"] = f"openml_task_{task.task_id}_predictions"

if isinstance(task, OpenMLLearningCurveTask):
class_labels = task.class_labels
instance_specifications = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("sample", "NUMERIC"),
("row_id", "NUMERIC"),
]

arff_dict["attributes"] = instance_specifications
if class_labels is not None:
arff_dict["attributes"] = (
arff_dict["attributes"]
+ [("prediction", class_labels), ("correct", class_labels)]
+ [
("confidence." + class_labels[i], "NUMERIC")
for i in range(len(class_labels))
]
)
else:
raise ValueError("The task has no class labels")

elif isinstance(task, OpenMLClassificationTask):
class_labels = task.class_labels
instance_specifications = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("sample", "NUMERIC"), # Legacy
("row_id", "NUMERIC"),
]

arff_dict["attributes"] = instance_specifications
if class_labels is not None:
prediction_confidences = [
("confidence." + class_labels[i], "NUMERIC") for i in range(len(class_labels))
]
prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
arff_dict["attributes"] = (
arff_dict["attributes"] + prediction_and_true + prediction_confidences
)
else:
raise ValueError("The task has no class labels")

elif isinstance(task, OpenMLRegressionTask):
arff_dict["attributes"] = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("row_id", "NUMERIC"),
("prediction", "NUMERIC"),
("truth", "NUMERIC"),
]

elif isinstance(task, OpenMLClusteringTask):
arff_dict["attributes"] = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("row_id", "NUMERIC"),
("cluster", "NUMERIC"),
]

else:
raise NotImplementedError(
f"Task type '{task.task_type}' is not yet supported. "
f"Supported task types: Classification, Regression, Clustering, Learning Curve. "
f"Task ID: {task.task_id}. "
f"Please check the OpenML documentation for supported task types."
)
arff_dict["attributes"] = self._get_arff_attributes_for_task(task)

return arff_dict

Expand Down Expand Up @@ -641,7 +617,10 @@ def _get_file_elements(self) -> dict:

if self.parameter_settings is None:
if self.flow is None:
assert self.flow_id is not None # for mypy
if self.flow_id is None:
raise ValueError(
"Run has no associated flow_id and cannot obtain parameter values."
)
self.flow = openml.flows.get_flow(self.flow_id)
self.parameter_settings = self.flow.extension.obtain_parameter_values(
self.flow,
Expand Down
15 changes: 12 additions & 3 deletions openml/runs/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def get_parameters(self) -> dict[str, Any]:
for param, value in self.setup_string.items()
}

assert self.parameters is not None
if self.parameters is None:
raise ValueError("Parameters must be set before calling get_parameters().")
return {param[len(PREFIX) :]: value for param, value in self.parameters.items()}


Expand Down Expand Up @@ -490,13 +491,21 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace:
for iteration in trace:
key = (iteration.repeat, iteration.fold, iteration.iteration)

assert iteration.parameters is not None
if iteration.parameters is None:
raise ValueError(
f"Iteration parameters cannot be None for repeat {iteration.repeat}, "
f"fold {iteration.fold}, iteration {iteration.iteration}"
)
param_keys = iteration.parameters.keys()

if previous_iteration is not None:
trace_itr = merged_trace[previous_iteration]

assert trace_itr.parameters is not None
if trace_itr.parameters is None:
raise ValueError(
f"Trace iteration parameters cannot be None "
f"for iteration {previous_iteration}"
)
trace_itr_keys = trace_itr.parameters.keys()

if list(param_keys) != list(trace_itr_keys):
Expand Down