diff --git a/src/fairscape_cli/commands/build_commands.py b/src/fairscape_cli/commands/build_commands.py index 6f5a236..dd3eee2 100644 --- a/src/fairscape_cli/commands/build_commands.py +++ b/src/fairscape_cli/commands/build_commands.py @@ -19,7 +19,8 @@ from fairscape_cli.models import ( GenerateROCrate, LinkSubcrates, - collect_subcrate_metadata + collect_subcrate_metadata, + collect_subcrate_aggregated_metrics ) from fairscape_models.rocrate import ROCrateV1_2 @@ -182,6 +183,9 @@ def build_release( if keyword not in combined_keywords: combined_keywords.append(keyword) + # Collect aggregated metrics for AI-Ready scoring + aggregated_metrics = collect_subcrate_aggregated_metrics(release_directory) + parent_params = { "guid": guid, "name": name, @@ -285,6 +289,17 @@ def build_release( click.echo(f"ERROR: {e}") ctx.exit(1) + # Add aggregated metrics as individual properties (following evi: prefix pattern) + parent_params["evi:datasetCount"] = aggregated_metrics.dataset_count + parent_params["evi:computationCount"] = aggregated_metrics.computation_count + parent_params["evi:softwareCount"] = aggregated_metrics.software_count + parent_params["evi:schemaCount"] = aggregated_metrics.schema_count + parent_params["evi:totalContentSizeBytes"] = aggregated_metrics.total_content_size_bytes + parent_params["evi:entitiesWithSummaryStats"] = aggregated_metrics.entities_with_summary_stats + parent_params["evi:entitiesWithChecksums"] = aggregated_metrics.entities_with_checksums + parent_params["evi:totalEntities"] = aggregated_metrics.total_entities + parent_params["evi:formats"] = sorted(list(aggregated_metrics.formats)) + try: click.echo("\n=== Creating release RO-Crate ===") parent_crate_root_dict = GenerateROCrate(**parent_params) diff --git a/src/fairscape_cli/commands/import_commands.py b/src/fairscape_cli/commands/import_commands.py index 1eadad4..587d635 100644 --- a/src/fairscape_cli/commands/import_commands.py +++ b/src/fairscape_cli/commands/import_commands.py @@ 
-325,7 +325,7 @@ def import_figshare( @import_group.command('dataverse') @click.argument('dataset-doi', type=str) @click.option('--server-url', default='https://dataverse.harvard.edu', show_default=True, help='Dataverse server URL.') -@click.option('--token', required=False, type=str, help='Dataverse API token (optional, for restricted datasets).') +@click.option('--token', required=False, type=str, help='Dataverse API token.') @generic_importer_options @click.pass_context def import_dataverse( diff --git a/src/fairscape_cli/commands/rocrate_commands.py b/src/fairscape_cli/commands/rocrate_commands.py index a4f418b..bb8f31a 100644 --- a/src/fairscape_cli/commands/rocrate_commands.py +++ b/src/fairscape_cli/commands/rocrate_commands.py @@ -956,7 +956,7 @@ def registerModel( "keywords": list(keywords), "modelType": model_type, "framework": framework, - "modelFormat": model_format, + "format": model_format, "trainingDataset": list(training_dataset), "generatedBy": generated_by, "filepath": filepath, @@ -1099,7 +1099,7 @@ def registerHuggingFaceModel( "keywords": list(keywords) if keywords else hf_metadata.get('keywords', []), "modelType": model_type or hf_metadata.get('model_type'), "framework": framework or hf_metadata.get('framework'), - "modelFormat": model_format or hf_metadata.get('model_format'), + "format": model_format or hf_metadata.get('model_format'), "trainingDataset": list(training_dataset) if training_dataset else hf_metadata.get('training_datasets', []), "filepath": hf_metadata.get('download_url'), "url": hf_metadata.get('landing_page_url'), diff --git a/src/fairscape_cli/data_fetcher/generic_data/research_data.py b/src/fairscape_cli/data_fetcher/generic_data/research_data.py index 49d5018..fc29888 100644 --- a/src/fairscape_cli/data_fetcher/generic_data/research_data.py +++ b/src/fairscape_cli/data_fetcher/generic_data/research_data.py @@ -80,8 +80,7 @@ def to_rocrate(self, output_dir: str, **kwargs) -> str: "version": file_info.get("version", "1.0"), 
"associatedPublication": self.doi or None, "additionalDocumentation": None, - "format": file_format, - "schema": "", + "format": file_format, "derivedFrom": [], "usedBy": [], "generatedBy": [], diff --git a/src/fairscape_cli/models/__init__.py b/src/fairscape_cli/models/__init__.py index 3c87a28..6e92907 100644 --- a/src/fairscape_cli/models/__init__.py +++ b/src/fairscape_cli/models/__init__.py @@ -8,14 +8,16 @@ from fairscape_cli.models.computation import Computation, GenerateComputation from fairscape_cli.models.rocrate import ( - ROCrate, + ROCrate, GenerateROCrate, - ReadROCrateMetadata, - AppendCrate, + ReadROCrateMetadata, + AppendCrate, CopyToROCrate, UpdateCrate, LinkSubcrates, - collect_subcrate_metadata + collect_subcrate_metadata, + collect_subcrate_aggregated_metrics, + AggregatedMetrics ) from fairscape_cli.models.bagit import BagIt from fairscape_cli.models.pep import PEPtoROCrateMapper @@ -39,5 +41,7 @@ 'BagIt', 'PEPtoROCrateMapper', 'LinkSubcrates', - 'collect_subcrate_metadata' + 'collect_subcrate_metadata', + 'collect_subcrate_aggregated_metrics', + 'AggregatedMetrics' ] diff --git a/src/fairscape_cli/models/biochem_entity.py b/src/fairscape_cli/models/biochem_entity.py index 29737ba..46f21cd 100644 --- a/src/fairscape_cli/models/biochem_entity.py +++ b/src/fairscape_cli/models/biochem_entity.py @@ -40,7 +40,6 @@ def GenerateBioChemEntity( entityMetadata = { "@id": guid, "name": name, - "@type": "https://schema.org/BioChemEntity", "description": description } diff --git a/src/fairscape_cli/models/computation.py b/src/fairscape_cli/models/computation.py index 6bbcb1f..2d4791e 100644 --- a/src/fairscape_cli/models/computation.py +++ b/src/fairscape_cli/models/computation.py @@ -34,7 +34,6 @@ def GenerateComputation( computationMetadata = { "@id": guid, "name": name, - "@type": "https://w3id.org/EVI#Computation" } for key, value in kwargs.items(): diff --git a/src/fairscape_cli/models/dataset.py b/src/fairscape_cli/models/dataset.py index 
c26711b..6bd352d 100644 --- a/src/fairscape_cli/models/dataset.py +++ b/src/fairscape_cli/models/dataset.py @@ -41,8 +41,7 @@ def GenerateDataset( datasetMetadata = { "@id": guid, - "name": name, - "@type": "https://w3id.org/EVI#Dataset" + "name": name } content_url = None diff --git a/src/fairscape_cli/models/experiment.py b/src/fairscape_cli/models/experiment.py index 1a9dc1e..6883949 100644 --- a/src/fairscape_cli/models/experiment.py +++ b/src/fairscape_cli/models/experiment.py @@ -34,8 +34,7 @@ def GenerateExperiment( experimentMetadata = { "@id": guid, - "name": name, - "@type": "https://w3id.org/EVI#Experiment" + "name": name } for key, value in kwargs.items(): diff --git a/src/fairscape_cli/models/instrument.py b/src/fairscape_cli/models/instrument.py index 259b3b7..14b3b8b 100644 --- a/src/fairscape_cli/models/instrument.py +++ b/src/fairscape_cli/models/instrument.py @@ -40,7 +40,6 @@ def GenerateInstrument( instrumentMetadata = { "@id": guid, "name": name, - "@type": "https://w3id.org/EVI#Instrument" } if filepath and cratePath: diff --git a/src/fairscape_cli/models/rocrate.py b/src/fairscape_cli/models/rocrate.py index 1289256..d5dea96 100644 --- a/src/fairscape_cli/models/rocrate.py +++ b/src/fairscape_cli/models/rocrate.py @@ -2,12 +2,13 @@ import shutil import json from datetime import datetime -from typing import Optional, Union, List, Literal, Dict, Any +from typing import Optional, Union, List, Literal, Dict, Any, Set +from dataclasses import dataclass, field from pydantic import BaseModel, Field, ConfigDict, model_validator import uuid import mongomock -from fairscape_cli.config import NAAN, DEFAULT_CONTEXT +from fairscape_models import DEFAULT_CONTEXT, DEFAULT_ARK_NAAN as NAAN from fairscape_cli.models.software import Software from fairscape_cli.models.dataset import Dataset from fairscape_cli.models.computation import Computation @@ -530,6 +531,7 @@ def find_and_process_subcrates(directory: pathlib.Path, base_path: pathlib.Path) print("No valid 
@dataclass
class AggregatedMetrics:
    """
    Aggregated metrics from all sub-crates for AI-Ready scoring.

    This class accumulates entity counts, statistics, checksums, formats,
    and schema references from all sub-crates in a release to enable
    efficient AI-Ready score calculation without recursive file reads.
    """

    # Entity counts (for provenance scoring)
    dataset_count: int = 0
    computation_count: int = 0
    software_count: int = 0
    schema_count: int = 0

    # Statistics (for characterization scoring)
    total_content_size_bytes: int = 0
    entities_with_summary_stats: int = 0

    # Verification (for pre-model explainability)
    entities_with_checksums: int = 0
    total_entities: int = 0

    # Computability
    formats: Set[str] = field(default_factory=set)

    # Standards
    schemas: List[Dict[str, str]] = field(default_factory=list)

    def add_entity(self, entity: Dict[str, Any]) -> None:
        """
        Fold a single ``@graph`` entity into the running totals.

        The entity type (see ``_get_entity_type``) decides which counter is
        incremented. Size, summary-statistics, checksum and format
        information is collected for every entity regardless of its type.

        Args:
            entity: Entity dictionary from an RO-Crate @graph
        """
        entity_type = _get_entity_type(entity)

        # Substring matching so that both full IRIs and prefixed names
        # (e.g. "https://w3id.org/EVI#Dataset", "EVI:Dataset") are recognised.
        if "Dataset" in entity_type:
            self.dataset_count += 1
            self.total_entities += 1
        elif "Computation" in entity_type or "Experiment" in entity_type:
            self.computation_count += 1
            self.total_entities += 1
        elif "Software" in entity_type:
            self.software_count += 1
            self.total_entities += 1
        elif "Schema" in entity_type:
            # Schemas are tracked separately and do not count towards
            # total_entities.
            self.schema_count += 1
            schema_id = entity.get("@id")
            if schema_id:
                self.schemas.append({"@id": schema_id})

        # The helper returns 0 for anything it cannot interpret, so adding
        # the result unconditionally is safe.
        self.total_content_size_bytes += _extract_content_size_bytes(
            entity.get("contentSize")
        )

        if entity.get("hasSummaryStatistics"):
            self.entities_with_summary_stats += 1

        if _extract_checksum(entity):
            self.entities_with_checksums += 1

        format_val = entity.get("format") or entity.get("encodingFormat")
        if isinstance(format_val, str):
            self.formats.add(format_val)
        elif isinstance(format_val, list):
            # Ignore non-string members instead of failing on them.
            self.formats.update(
                fmt for fmt in format_val if isinstance(fmt, str) and fmt
            )


# Multipliers for the unit suffixes accepted in ``contentSize`` values, keyed
# by the upper-cased suffix. Decimal units use the same float factors as
# before; binary (IEC) units use powers of 1024.
_SIZE_UNIT_MULTIPLIERS = {
    "": 1,
    "B": 1,
    "BYTE": 1,
    "BYTES": 1,
    "KB": 1e3,
    "MB": 1e6,
    "GB": 1e9,
    "TB": 1e12,
    "PB": 1e15,
    "KIB": 1024,
    "MIB": 1024 ** 2,
    "GIB": 1024 ** 3,
    "TIB": 1024 ** 4,
    "PIB": 1024 ** 5,
}

# Checksum algorithms looked up on an entity, in priority order. Each entry is
# (prefix used in the returned string, entity keys that may hold the digest).
_CHECKSUM_FIELDS = (
    ("md5", ("md5", "MD5")),
    ("sha256", ("sha256", "SHA256")),
)


def _extract_content_size_bytes(size_str: Union[str, int, float, None]) -> int:
    """
    Extract content size in bytes from a ``contentSize`` value.

    Args:
        size_str: Size such as "125.5 GB", "1.2 TB", "1 KiB", "100 B",
            a bare number string ("2048"), or a plain int/float byte count.

    Returns:
        Size in bytes as a non-negative integer, or 0 if the value is
        missing, negative, non-finite, or cannot be parsed.
    """
    # bool is a subclass of int; True/False is never a meaningful size.
    if isinstance(size_str, bool):
        return 0

    if isinstance(size_str, int):
        return max(size_str, 0)

    if isinstance(size_str, float):
        number, multiplier = size_str, 1
    elif isinstance(size_str, str):
        text = size_str.strip()

        # Split a trailing alphabetic unit suffix off the numeric part.
        split_at = len(text)
        while split_at and text[split_at - 1].isalpha():
            split_at -= 1

        multiplier = _SIZE_UNIT_MULTIPLIERS.get(text[split_at:].upper())
        if multiplier is None:
            # Unknown unit (this also rejects "inf" / "nan" spellings).
            return 0
        number = text[:split_at]
    else:
        return 0

    try:
        # OverflowError covers infinite values (e.g. "1e400"), ValueError
        # covers NaN and unparsable text.
        size = int(float(number) * multiplier)
    except (ValueError, OverflowError):
        return 0
    return max(size, 0)


def _extract_checksum(entity: Dict[str, Any]) -> Optional[str]:
    """
    Extract checksum from an entity.

    The md5 fields are consulted before the sha256 fields. Only non-empty
    string values are accepted; anything else is ignored rather than
    raising.

    Args:
        entity: Entity dictionary from RO-Crate @graph

    Returns:
        Checksum string prefixed with its algorithm (e.g. "md5:abc123...")
        or None when no usable checksum is present
    """
    for prefix, keys in _CHECKSUM_FIELDS:
        tag = f"{prefix}:"
        for key in keys:
            value = entity.get(key)
            if not isinstance(value, str):
                continue
            digest = value.strip()
            if digest:
                # Do not double-prefix values that already carry the tag.
                return digest if digest.startswith(tag) else tag + digest
    return None


def _get_entity_type(entity: Dict[str, Any]) -> str:
    """
    Get type from entity's @type or metadataType field.

    Args:
        entity: Entity dictionary from RO-Crate @graph

    Returns:
        Type string (last item if the field is a list), or empty string
        when no string type is available
    """
    type_val = entity.get("@type") or entity.get("metadataType")
    if isinstance(type_val, str):
        return type_val
    if isinstance(type_val, list) and type_val and isinstance(type_val[-1], str):
        return type_val[-1]
    # Always hand back a str so callers can use substring checks safely.
    return ""


def collect_subcrate_aggregated_metrics(
    parent_crate_path: Union[str, pathlib.Path]
) -> AggregatedMetrics:
    """
    Collect aggregated metrics from all subcrates for AI-Ready scoring.

    Every ``ro-crate-metadata.json`` found at any depth below the immediate
    sub-directories of ``parent_crate_path`` is read, and each entity of its
    ``@graph`` is folded into a single AggregatedMetrics object. A metadata
    file located directly in ``parent_crate_path`` is not read. Directories
    and files are visited in sorted order so the result is deterministic.

    Args:
        parent_crate_path: Path to the release directory containing sub-crates

    Returns:
        AggregatedMetrics object with all roll-up properties
    """
    parent_crate_path = pathlib.Path(parent_crate_path)
    metrics = AggregatedMetrics()
    seen_files: Set[pathlib.Path] = set()

    sub_directories = sorted(
        item for item in parent_crate_path.iterdir() if item.is_dir()
    )

    for directory in sub_directories:
        for metadata_path in sorted(directory.glob('**/ro-crate-metadata.json')):
            if not metadata_path.is_file() or metadata_path in seen_files:
                continue
            seen_files.add(metadata_path)

            subcrate_metadata = ReadROCrateMetadata(metadata_path)
            graph = subcrate_metadata.get('@graph', []) or []

            for entity in graph:
                # Convert pydantic models to plain dictionaries.
                if hasattr(entity, 'model_dump'):
                    entity = entity.model_dump(by_alias=True)

                # Skip anything that is not a JSON object.
                if not isinstance(entity, dict):
                    continue

                # The metadata descriptor describes the file itself, not
                # crate content.
                if entity.get('@id') == 'ro-crate-metadata.json':
                    continue

                metrics.add_entity(entity)

    return metrics