From a38a0b46abb1fc0828ce5417ff7da134fe9f2734 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Mon, 5 Jan 2026 12:37:36 -0500 Subject: [PATCH 1/2] model changes for prov --- fairscape_models/activity.py | 10 +++++++--- fairscape_models/annotation.py | 21 +++++++++++++++++--- fairscape_models/computation.py | 22 +++++++++++++++++++-- fairscape_models/dataset.py | 30 +++++++++++++++++++++++++++-- fairscape_models/digital_object.py | 11 ++++++++--- fairscape_models/experiment.py | 28 +++++++++++++++++++++++---- fairscape_models/fairscape_base.py | 1 + fairscape_models/mlmodel.py | 31 +++++++++++++++++++++++++++++- fairscape_models/rocrate.py | 23 ++++++++++++++++++++++ fairscape_models/software.py | 19 ++++++++++++++++-- 10 files changed, 176 insertions(+), 20 deletions(-) diff --git a/fairscape_models/activity.py b/fairscape_models/activity.py index 0a3e402..2363adb 100644 --- a/fairscape_models/activity.py +++ b/fairscape_models/activity.py @@ -1,5 +1,5 @@ -from pydantic import BaseModel, Field, ConfigDict -from typing import Optional, List +from pydantic import BaseModel, Field, ConfigDict, model_validator +from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue @@ -13,4 +13,8 @@ class Activity(BaseModel): generated: Optional[List[IdentifierValue]] = Field(default=[]) isPartOf: Optional[List[IdentifierValue]] = Field(default=[]) - model_config = ConfigDict(extra="allow") + # PROV-O fields (auto-populated) + used: Optional[List[Union[str, IdentifierValue]]] = Field(default=[], alias="prov:used") + wasAssociatedWith: Optional[List[Union[str, IdentifierValue]]] = Field(default=[], alias="prov:wasAssociatedWith") + + model_config = ConfigDict(extra="allow", populate_by_name=True) diff --git a/fairscape_models/annotation.py b/fairscape_models/annotation.py index 773bf37..ec6aa51 100644 --- a/fairscape_models/annotation.py +++ b/fairscape_models/annotation.py @@ -1,5 +1,5 @@ -from pydantic import Field, ConfigDict -from typing import Optional, List +from pydantic import Field, ConfigDict, model_validator +from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue, ANNOTATION_TYPE from fairscape_models.activity import Activity @@ -7,6 +7,21 @@ class Annotation(Activity): metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Annotation", alias="@type") additionalType: Optional[str] = Field(default=ANNOTATION_TYPE) - createdBy: str + createdBy: Union[str, IdentifierValue] dateCreated: str usedDataset: Optional[List[IdentifierValue]] = Field(default=[]) + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields from EVI fields""" + # Map usedDataset to prov:used (preserving their types) + if self.usedDataset: + self.used = self.usedDataset + else: + self.used = [] + + # Map createdBy to prov:wasAssociatedWith (preserve type: str or IdentifierValue) + if self.createdBy: + self.wasAssociatedWith = [self.createdBy] + + return self diff --git a/fairscape_models/computation.py b/fairscape_models/computation.py index 50332ad..6d98179 100644 --- a/fairscape_models/computation.py +++ b/fairscape_models/computation.py @@ -1,4 +1,4 @@ -from pydantic import Field, ConfigDict +from pydantic import Field, ConfigDict, model_validator from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue, COMPUTATION_TYPE @@ -7,10 +7,28 @@ class Computation(Activity): metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Computation", alias="@type") additionalType: Optional[str] = Field(default=COMPUTATION_TYPE) - runBy: str + runBy: Union[str, IdentifierValue] dateCreated: str additionalDocumentation: Optional[str] = Field(default=None) command: Optional[Union[List[str], str]] = Field(default=None) usedSoftware: Optional[List[IdentifierValue]] = Field(default=[]) usedMLModel: Optional[List[IdentifierValue]] = Field(default=[]) usedDataset: Optional[List[IdentifierValue]] = Field(default=[]) + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields from EVI fields""" + # Aggregate all inputs into prov:used + used_items = [] + if self.usedSoftware: + used_items.extend(self.usedSoftware) + if self.usedMLModel: + used_items.extend(self.usedMLModel) + if self.usedDataset: + used_items.extend(self.usedDataset) + self.used = used_items + + if self.runBy: + self.wasAssociatedWith = [self.runBy] + + return self diff --git a/fairscape_models/dataset.py b/fairscape_models/dataset.py index 69c2d3f..50c7b14 100644 --- a/fairscape_models/dataset.py +++ b/fairscape_models/dataset.py @@ -1,4 +1,4 @@ -from pydantic import Field, ConfigDict, AliasChoices +from pydantic import Field, ConfigDict, AliasChoices, model_validator from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue, DATASET_TYPE @@ -16,4 +16,30 @@ class Dataset(DigitalObject): default=None ) generatedBy: Optional[Union[IdentifierValue, List[IdentifierValue]]] = Field(default=[]) - derivedFrom: Optional[List[IdentifierValue]] = Field(default=[]) \ No newline at end of file + derivedFrom: Optional[List[IdentifierValue]] = Field(default=[]) + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields from EVI fields""" + # Map generatedBy → prov:wasGeneratedBy + if self.generatedBy: + if isinstance(self.generatedBy, list): + self.wasGeneratedBy = self.generatedBy + else: + self.wasGeneratedBy = [self.generatedBy] + else: + self.wasGeneratedBy = [] + + # Map derivedFrom → prov:wasDerivedFrom + self.wasDerivedFrom = self.derivedFrom or [] + + # Map author + if self.author: + if isinstance(self.author, str): + self.wasAttributedTo = [IdentifierValue(**{"@id": self.author})] + elif isinstance(self.author, list): + self.wasAttributedTo = [IdentifierValue(**{"@id": a}) for a in self.author] + else: + self.wasAttributedTo = [] + + return self \ No newline at end of file diff --git a/fairscape_models/digital_object.py b/fairscape_models/digital_object.py index 755c0fd..7fb61bd 100644 --- a/fairscape_models/digital_object.py +++ b/fairscape_models/digital_object.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, Field, ConfigDict, model_validator from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue @@ -8,7 +8,7 @@ class DigitalObject(BaseModel): guid: str = Field(alias="@id") name: str metadataType: Optional[str] = Field(default=None, alias="@type") - author: Union[str, List[str]] + author: Union[str, IdentifierValue, List[Union[str, IdentifierValue]]] description: str = Field(min_length=10) version: str = Field(default="0.1.0") associatedPublication: Optional[Union[str, List[str]]] = Field(default=None) @@ -17,4 +17,9 @@ class DigitalObject(BaseModel): isPartOf: Optional[List[IdentifierValue]] = Field(default=[]) usedByComputation: Optional[List[IdentifierValue]] = Field(default=[]) - model_config = ConfigDict(extra="allow") + # PROV-O fields (auto-populated) + wasGeneratedBy: Optional[List[Union[str, IdentifierValue]]] = Field(default=[], alias="prov:wasGeneratedBy") + wasDerivedFrom: Optional[List[Union[str, IdentifierValue]]] = Field(default=[], alias="prov:wasDerivedFrom") + wasAttributedTo: Optional[List[Union[str, IdentifierValue]]] = Field(default=[], alias="prov:wasAttributedTo") + + model_config = ConfigDict(extra="allow", populate_by_name=True) diff --git a/fairscape_models/experiment.py b/fairscape_models/experiment.py index e5e4046..600aee4 100644 --- a/fairscape_models/experiment.py +++ b/fairscape_models/experiment.py @@ -1,15 +1,35 @@ -from pydantic import Field, ConfigDict -from typing import Optional, List +from pydantic import Field, ConfigDict, model_validator +from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue from fairscape_models.activity import Activity class Experiment(Activity): metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Experiment", alias="@type") experimentType: str - runBy: str + runBy: Union[str, IdentifierValue] datePerformed: str protocol: Optional[str] = Field(default=None) usedInstrument: Optional[List[IdentifierValue]] = Field(default=[]) usedSample: Optional[List[IdentifierValue]] = Field(default=[]) usedTreatment: Optional[List[IdentifierValue]] = Field(default=[]) - usedStain: Optional[List[IdentifierValue]] = Field(default=[]) \ No newline at end of file + usedStain: Optional[List[IdentifierValue]] = Field(default=[]) + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields from EVI fields""" + # Aggregate all inputs into prov:used + used_items = [] + if self.usedInstrument: + used_items.extend(self.usedInstrument) + if self.usedSample: + used_items.extend(self.usedSample) + if self.usedTreatment: + used_items.extend(self.usedTreatment) + if self.usedStain: + used_items.extend(self.usedStain) + self.used = used_items + + if self.runBy: + self.wasAssociatedWith = [self.runBy] + + return self \ No newline at end of file diff --git a/fairscape_models/fairscape_base.py b/fairscape_models/fairscape_base.py index 132bdc2..69236f9 100644 --- a/fairscape_models/fairscape_base.py +++ b/fairscape_models/fairscape_base.py @@ -32,6 +32,7 @@ "@vocab": "https://schema.org/", "evi": "https://w3id.org/EVI#", "rai": "http://mlcommons.org/croissant/RAI/", + "prov": "http://www.w3.org/ns/prov#", # TODO fully specify default context "usedSoftware": { diff --git a/fairscape_models/mlmodel.py b/fairscape_models/mlmodel.py index 7ff56bd..47dee56 100644 --- a/fairscape_models/mlmodel.py +++ b/fairscape_models/mlmodel.py @@ -1,4 +1,4 @@ -from pydantic import Field, ConfigDict +from pydantic import Field, ConfigDict, model_validator from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue, MLMODEL_TYPE @@ -12,3 +12,32 @@ class MLModel(DigitalObject): modelTask: Optional[str] = Field(default=None) modelArchitecture: Optional[str] = Field(default=None) trainedOn: Optional[List[IdentifierValue]] = Field(default=[]) + generatedBy: Optional[Union[IdentifierValue, List[IdentifierValue]]] = Field(default=[]) + derivedFrom: Optional[List[IdentifierValue]] = Field(default=[]) + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields from EVI fields""" + + # Map generatedBy → prov:wasGeneratedBy + if self.generatedBy: + if isinstance(self.generatedBy, list): + self.wasGeneratedBy = self.generatedBy + else: + self.wasGeneratedBy = [self.generatedBy] + else: + self.wasGeneratedBy = [] + + # Map derivedFrom → prov:wasDerivedFrom + self.wasDerivedFrom = self.derivedFrom or [] + + # Map author → prov:wasAttributedTo + if self.author: + if isinstance(self.author, str): + self.wasAttributedTo = [IdentifierValue(**{"@id": self.author})] + elif isinstance(self.author, list): + self.wasAttributedTo = [IdentifierValue(**{"@id": a}) for a in self.author] + else: + self.wasAttributedTo = [] + + return self diff --git a/fairscape_models/rocrate.py b/fairscape_models/rocrate.py index 1ce1010..d4826ee 100644 --- a/fairscape_models/rocrate.py +++ b/fairscape_models/rocrate.py @@ -245,14 +245,25 @@ def cleanIdentifierUnion(identifier_union): cleanIdentifierUnion(elem.generatedBy) + # Clean PROV fields + cleanIdentifierList(elem.wasGeneratedBy) + cleanIdentifierList(elem.wasDerivedFrom) + cleanIdentifierList(elem.wasAttributedTo) + if isinstance(elem, Software): cleanIdentifierList(elem.usedByComputation) + # Clean PROV fields + cleanIdentifierList(elem.wasAttributedTo) + if isinstance(elem, MLModel): cleanIdentifierList(elem.usedByComputation) cleanIdentifierList(elem.trainedOn) + # Clean PROV fields + cleanIdentifierList(elem.wasAttributedTo) + if isinstance(elem, Computation): cleanIdentifierList(elem.usedDataset) @@ -263,12 +274,20 @@ def cleanIdentifierUnion(identifier_union): cleanIdentifierList(elem.usedMLModel) + # Clean PROV fields + cleanIdentifierList(elem.used) + cleanIdentifierList(elem.wasAssociatedWith) + if isinstance(elem, Annotation): cleanIdentifierList(elem.usedDataset) cleanIdentifierList(elem.generated) + # Clean PROV fields + cleanIdentifierList(elem.used) + cleanIdentifierList(elem.wasAssociatedWith) + if isinstance(elem, Experiment): cleanIdentifierList(elem.usedInstrument) @@ -281,6 +300,10 @@ def cleanIdentifierUnion(identifier_union): cleanIdentifierList(elem.generated) + # Clean PROV fields + cleanIdentifierList(elem.used) + cleanIdentifierList(elem.wasAssociatedWith) + def getCrateMetadata(self)-> ROCrateMetadataElem: """ Filter the Metadata Graph for the Metadata Element Describing the Toplevel ROCrate diff --git a/fairscape_models/software.py b/fairscape_models/software.py index 2622d03..2531b42 100644 --- a/fairscape_models/software.py +++ b/fairscape_models/software.py @@ -1,4 +1,4 @@ -from pydantic import Field, ConfigDict +from pydantic import Field, ConfigDict, model_validator from typing import Optional, List from fairscape_models.fairscape_base import IdentifierValue, SOFTWARE_TYPE @@ -7,5 +7,20 @@ class Software(DigitalObject): metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Software", alias="@type") additionalType: Optional[str] = Field(default=SOFTWARE_TYPE) - dateModified: Optional[str] + dateModified: Optional[str] = None fileFormat: str = Field(title="fileFormat", alias="format") + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields from EVI fields""" + + # Map author → prov:wasAttributedTo + if self.author: + if isinstance(self.author, list): + self.wasAttributedTo = self.author + else: + self.wasAttributedTo = [self.author] + else: + self.wasAttributedTo = [] + + return self From 60a5d8943aa5173fba9ba9788a1907b899ddc48d Mon Sep 17 00:00:00 2001 From: jniestroy Date: Mon, 5 Jan 2026 12:38:07 -0500 Subject: [PATCH 2/2] tests --- tests/test_annotation.py | 22 ++++++++ tests/test_computation.py | 25 ++++++++- tests/test_dataset.py | 53 ++++++++++++++++++- tests/test_experiment.py | 18 +++++-- tests/test_mlmodel.py | 107 ++++++++++++++++++++++++++++++++++++++ tests/test_software.py | 36 ++++++++++++- 6 files changed, 254 insertions(+), 7 deletions(-) create mode 100644 tests/test_mlmodel.py diff --git a/tests/test_annotation.py b/tests/test_annotation.py index e32ba94..5807887 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -1,6 +1,7 @@ import pytest from pydantic import ValidationError from fairscape_models.annotation import Annotation +from fairscape_models.fairscape_base import IdentifierValue def test_annotation_instantiation(annotation_minimal_data): """Test successful instantiation of an Annotation model.""" @@ -8,8 +9,29 @@ def test_annotation_instantiation(annotation_minimal_data): assert annotation.guid == annotation_minimal_data["@id"] assert annotation.description == annotation_minimal_data["description"] + # Test PROV field auto-population + assert annotation.used == [] # No usedDataset provided + assert len(annotation.wasAssociatedWith) == 1 + assert annotation.wasAssociatedWith[0] == annotation_minimal_data["createdBy"] + def test_annotation_short_description(annotation_minimal_data): """Test that a short description raises a ValidationError.""" annotation_minimal_data["description"] = "too short" with pytest.raises(ValidationError): Annotation.model_validate(annotation_minimal_data) + +def test_annotation_with_datasets(annotation_minimal_data): + """Test PROV field population with usedDataset.""" + annotation_minimal_data["usedDataset"] = [ + {"@id": "ark:59852/dataset-1"}, + {"@id": "ark:59852/dataset-2"} + ] + + annotation = Annotation.model_validate(annotation_minimal_data) + + # Test PROV:used is populated from usedDataset + assert len(annotation.used) == 2 + assert all(isinstance(item, IdentifierValue) for item in annotation.used) + used_ids = [item.guid for item in annotation.used] + assert "ark:59852/dataset-1" in used_ids + assert "ark:59852/dataset-2" in used_ids diff --git a/tests/test_computation.py b/tests/test_computation.py index cf8537f..9efbd95 100644 --- a/tests/test_computation.py +++ b/tests/test_computation.py @@ -1,6 +1,7 @@ import pytest from pydantic import ValidationError from fairscape_models.computation import Computation +from fairscape_models.fairscape_base import IdentifierValue def test_computation_instantiation(computation_minimal_data): """Test successful instantiation of a Computation model.""" @@ -8,8 +9,30 @@ def test_computation_instantiation(computation_minimal_data): assert computation.guid == computation_minimal_data["@id"] assert computation.description == computation_minimal_data["description"] + # Test PROV field auto-population + assert computation.used == [] # No inputs provided + assert len(computation.wasAssociatedWith) == 1 + assert computation.wasAssociatedWith[0] == computation_minimal_data["runBy"] + def test_computation_short_description(computation_minimal_data): """Test that a short description raises a ValidationError.""" computation_minimal_data["description"] = "too short" with pytest.raises(ValidationError): - Computation.model_validate(computation_minimal_data) \ No newline at end of file + Computation.model_validate(computation_minimal_data) + +def test_computation_with_inputs(computation_minimal_data): + """Test PROV field population with usedSoftware, usedDataset, usedMLModel.""" + computation_minimal_data["usedSoftware"] = [{"@id": "ark:59852/software-1"}] + computation_minimal_data["usedDataset"] = [{"@id": "ark:59852/dataset-1"}] + computation_minimal_data["usedMLModel"] = [{"@id": "ark:59852/model-1"}] + + computation = Computation.model_validate(computation_minimal_data) + + # Test PROV:used aggregates all inputs + assert len(computation.used) == 3 + assert all(isinstance(item, IdentifierValue) for item in computation.used) + used_ids = [item.guid for item in computation.used] + assert "ark:59852/software-1" in used_ids + assert "ark:59852/dataset-1" in used_ids + assert "ark:59852/model-1" in used_ids + diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 1938f32..7bcf147 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,6 +1,7 @@ import pytest from pydantic import ValidationError from fairscape_models.dataset import Dataset +from fairscape_models.fairscape_base import IdentifierValue def test_dataset_instantiation(dataset_minimal_data): """Test successful instantiation of a Dataset model.""" @@ -9,6 +10,13 @@ def test_dataset_instantiation(dataset_minimal_data): assert dataset.name == dataset_minimal_data["name"] assert dataset.fileFormat == dataset_minimal_data["format"] + # Test PROV field auto-population + assert dataset.wasGeneratedBy == [] # No generatedBy provided + assert dataset.wasDerivedFrom == [] # No derivedFrom provided + assert len(dataset.wasAttributedTo) == 1 + assert isinstance(dataset.wasAttributedTo[0], IdentifierValue) + assert dataset.wasAttributedTo[0].guid == dataset_minimal_data["author"] + def test_dataset_missing_required_field(dataset_minimal_data): """Test that a ValidationError is raised for a missing required field.""" del dataset_minimal_data["name"] @@ -36,7 +44,48 @@ def test_dataset_custom_validator(dataset_minimal_data): # Use the 'schema' alias, which is what the model expects for validation dataset_minimal_data_v3 = {**dataset_minimal_data, "schema": schema_id} dataset3 = Dataset.model_validate(dataset_minimal_data_v3) - + # Assert that the dataSchema attribute is correctly populated assert dataset3.dataSchema is not None - assert dataset3.dataSchema.guid == schema_id["@id"] \ No newline at end of file + assert dataset3.dataSchema.guid == schema_id["@id"] + +def test_dataset_with_provenance(dataset_minimal_data): + """Test PROV field population with generatedBy and derivedFrom.""" + dataset_minimal_data["generatedBy"] = [{"@id": "ark:59852/computation-1"}] + dataset_minimal_data["derivedFrom"] = [{"@id": "ark:59852/dataset-source"}] + dataset_minimal_data["author"] = ["Author 1", "Author 2"] # Test list of authors + + dataset = Dataset.model_validate(dataset_minimal_data) + + # Test PROV fields + assert len(dataset.wasGeneratedBy) == 1 + assert isinstance(dataset.wasGeneratedBy[0], IdentifierValue) + assert dataset.wasGeneratedBy[0].guid == "ark:59852/computation-1" + + assert len(dataset.wasDerivedFrom) == 1 + assert isinstance(dataset.wasDerivedFrom[0], IdentifierValue) + assert dataset.wasDerivedFrom[0].guid == "ark:59852/dataset-source" + + assert len(dataset.wasAttributedTo) == 2 + assert all(isinstance(item, IdentifierValue) for item in dataset.wasAttributedTo) + author_ids = [item.guid for item in dataset.wasAttributedTo] + assert "Author 1" in author_ids + assert "Author 2" in author_ids + +def test_dataset_edge_case_empty_author(): + """Test PROV field population when author is falsy (defensive code path).""" + # Test with empty list for author (valid but falsy) + dataset_data = { + "@id": "ark:59852/test-dataset", + "name": "Test Dataset", + "author": [], + "datePublished": "2023-11-09", + "description": "This is a test dataset with sufficient description.", + "keywords": ["test", "dataset"], + "format": "text/csv" + } + + dataset = Dataset.model_validate(dataset_data) + + # Should hit the else clause and set wasAttributedTo to empty list + assert dataset.wasAttributedTo == [] \ No newline at end of file diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 058c5e2..308f92f 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -10,6 +10,11 @@ def test_experiment_instantiation(experiment_minimal_data): assert experiment.name == experiment_minimal_data["name"] assert experiment.experimentType == experiment_minimal_data["experimentType"] + # Test PROV field auto-population + assert experiment.used == [] # No used items provided + assert len(experiment.wasAssociatedWith) == 1 + assert experiment.wasAssociatedWith[0] == experiment_minimal_data["runBy"] + def test_experiment_missing_required_field(experiment_minimal_data): """Test that a ValidationError is raised for a missing required field.""" del experiment_minimal_data["runBy"] @@ -26,13 +31,20 @@ def test_experiment_with_used_items(experiment_minimal_data): """Test instantiation with various 'used' lists.""" instrument_id = {"@id": "ark:59852/inst-1"} sample_id = {"@id": "ark:59852/sample-1"} - + experiment_minimal_data["usedInstrument"] = [instrument_id] experiment_minimal_data["usedSample"] = [sample_id] - + experiment = Experiment.model_validate(experiment_minimal_data) assert isinstance(experiment.usedInstrument[0], IdentifierValue) assert experiment.usedInstrument[0].guid == instrument_id["@id"] assert isinstance(experiment.usedSample[0], IdentifierValue) - assert experiment.usedSample[0].guid == sample_id["@id"] \ No newline at end of file + assert experiment.usedSample[0].guid == sample_id["@id"] + + # Test PROV:used aggregates all inputs + assert len(experiment.used) == 2 + assert all(isinstance(item, IdentifierValue) for item in experiment.used) + used_ids = [item.guid for item in experiment.used] + assert "ark:59852/inst-1" in used_ids + assert "ark:59852/sample-1" in used_ids \ No newline at end of file diff --git a/tests/test_mlmodel.py b/tests/test_mlmodel.py new file mode 100644 index 0000000..c0e1677 --- /dev/null +++ b/tests/test_mlmodel.py @@ -0,0 +1,107 @@ +import pytest +from pydantic import ValidationError +from fairscape_models.mlmodel import MLModel +from fairscape_models.fairscape_base import IdentifierValue + +@pytest.fixture +def mlmodel_minimal_data(): + """Minimal data for a valid MLModel.""" + return { + "@id": "ark:59852/test-mlmodel", + "name": "Test ML Model", + "author": "Test ML Author", + "description": "This is a test ML model with sufficient description.", + "format": "application/x-pickle" + } + +def test_mlmodel_instantiation(mlmodel_minimal_data): + """Test successful instantiation of an MLModel model.""" + mlmodel = MLModel.model_validate(mlmodel_minimal_data) + assert mlmodel.guid == mlmodel_minimal_data["@id"] + assert mlmodel.name == mlmodel_minimal_data["name"] + + # Test PROV field auto-population + assert len(mlmodel.wasAttributedTo) == 1 + assert isinstance(mlmodel.wasAttributedTo[0], IdentifierValue) + assert mlmodel.wasAttributedTo[0].guid == mlmodel_minimal_data["author"] + +def test_mlmodel_missing_required_field(mlmodel_minimal_data): + """Test ValidationError for missing a required field.""" + del mlmodel_minimal_data["author"] + with pytest.raises(ValidationError): + MLModel.model_validate(mlmodel_minimal_data) + +def test_mlmodel_short_description(mlmodel_minimal_data): + """Test that a short description raises a ValidationError.""" + mlmodel_minimal_data["description"] = "too short" + with pytest.raises(ValidationError): + MLModel.model_validate(mlmodel_minimal_data) + +def test_mlmodel_with_multiple_authors(mlmodel_minimal_data): + """Test PROV field population with multiple authors.""" + mlmodel_minimal_data["author"] = ["ML Author 1", "ML Author 2"] + + mlmodel = MLModel.model_validate(mlmodel_minimal_data) + + # Test PROV:wasAttributedTo handles list of authors + assert len(mlmodel.wasAttributedTo) == 2 + assert all(isinstance(item, IdentifierValue) for item in mlmodel.wasAttributedTo) + author_ids = [item.guid for item in mlmodel.wasAttributedTo] + assert "ML Author 1" in author_ids + assert "ML Author 2" in author_ids + +def test_mlmodel_with_generated_by_single(mlmodel_minimal_data): + """Test PROV field population with single generatedBy.""" + mlmodel_minimal_data["generatedBy"] = {"@id": "ark:59852/computation-1"} + + mlmodel = MLModel.model_validate(mlmodel_minimal_data) + + # Test PROV:wasGeneratedBy with single value + assert len(mlmodel.wasGeneratedBy) == 1 + assert isinstance(mlmodel.wasGeneratedBy[0], IdentifierValue) + assert mlmodel.wasGeneratedBy[0].guid == "ark:59852/computation-1" + +def test_mlmodel_with_generated_by_list(mlmodel_minimal_data): + """Test PROV field population with list of generatedBy.""" + mlmodel_minimal_data["generatedBy"] = [ + {"@id": "ark:59852/computation-1"}, + {"@id": "ark:59852/computation-2"} + ] + + mlmodel = MLModel.model_validate(mlmodel_minimal_data) + + # Test PROV:wasGeneratedBy with list + assert len(mlmodel.wasGeneratedBy) == 2 + assert all(isinstance(item, IdentifierValue) for item in mlmodel.wasGeneratedBy) + generated_ids = [item.guid for item in mlmodel.wasGeneratedBy] + assert "ark:59852/computation-1" in generated_ids + assert "ark:59852/computation-2" in generated_ids + +def test_mlmodel_with_derived_from(mlmodel_minimal_data): + """Test PROV field population with derivedFrom.""" + mlmodel_minimal_data["derivedFrom"] = [ + {"@id": "ark:59852/model-source"} + ] + + mlmodel = MLModel.model_validate(mlmodel_minimal_data) + + # Test PROV:wasDerivedFrom + assert len(mlmodel.wasDerivedFrom) == 1 + assert isinstance(mlmodel.wasDerivedFrom[0], IdentifierValue) + assert mlmodel.wasDerivedFrom[0].guid == "ark:59852/model-source" + +def test_mlmodel_edge_case_empty_author(): + """Test PROV field population when author is falsy (defensive code path).""" + # Test with empty list for author (valid but falsy) + mlmodel_data = { + "@id": "ark:59852/test-mlmodel", + "name": "Test Model", + "author": [], + "description": "This is a test ML model with sufficient description.", + "format": "application/x-pickle" + } + + mlmodel = MLModel.model_validate(mlmodel_data) + + # Should hit the else clause and set wasAttributedTo to empty list + assert mlmodel.wasAttributedTo == [] diff --git a/tests/test_software.py b/tests/test_software.py index 603c2c5..bfa5133 100644 --- a/tests/test_software.py +++ b/tests/test_software.py @@ -1,6 +1,7 @@ import pytest from pydantic import ValidationError from fairscape_models.software import Software +from fairscape_models.fairscape_base import IdentifierValue def test_software_instantiation(software_minimal_data): """Test successful instantiation of a Software model.""" @@ -8,6 +9,10 @@ def test_software_instantiation(software_minimal_data): assert software.guid == software_minimal_data["@id"] assert software.name == software_minimal_data["name"] + # Test PROV field auto-population + assert len(software.wasAttributedTo) == 1 + assert software.wasAttributedTo[0] == software_minimal_data["author"] + def test_software_missing_required_field(software_minimal_data): """Test ValidationError for missing a required field.""" del software_minimal_data["author"] @@ -18,4 +23,33 @@ def test_software_short_description(software_minimal_data): """Test that a short description raises a ValidationError.""" software_minimal_data["description"] = "too short" with pytest.raises(ValidationError): - Software.model_validate(software_minimal_data) \ No newline at end of file + Software.model_validate(software_minimal_data) + +def test_software_with_multiple_authors(software_minimal_data): + """Test PROV field population with multiple authors.""" + software_minimal_data["author"] = ["Author 1", "Author 2", "Author 3"] + + software = Software.model_validate(software_minimal_data) + + # Test PROV:wasAttributedTo handles list of authors + assert len(software.wasAttributedTo) == 3 + assert "Author 1" in software.wasAttributedTo + assert "Author 2" in software.wasAttributedTo + assert "Author 3" in software.wasAttributedTo + +def test_software_edge_case_empty_author(): + """Test PROV field population when author is falsy (defensive code path).""" + # Test with empty list for author (valid but falsy) + software_data = { + "@id": "ark:59852/test-software", + "name": "Test Software", + "author": [], + "dateModified": "2023-11-09", + "description": "This is a test software with a good description.", + "format": "application/x-python" + } + + software = Software.model_validate(software_data) + + # Should hit the else clause and set wasAttributedTo to empty list + assert software.wasAttributedTo == [] \ No newline at end of file