17 changes: 16 additions & 1 deletion src/fairscape_cli/commands/build_commands.py
@@ -19,7 +19,8 @@
from fairscape_cli.models import (
GenerateROCrate,
LinkSubcrates,
collect_subcrate_metadata
collect_subcrate_metadata,
collect_subcrate_aggregated_metrics
)

from fairscape_models.rocrate import ROCrateV1_2
@@ -182,6 +183,9 @@ def build_release(
if keyword not in combined_keywords:
combined_keywords.append(keyword)

# Collect aggregated metrics for AI-Ready scoring
aggregated_metrics = collect_subcrate_aggregated_metrics(release_directory)

parent_params = {
"guid": guid,
"name": name,
@@ -285,6 +289,17 @@ def build_release(
click.echo(f"ERROR: {e}")
ctx.exit(1)

# Add aggregated metrics as individual properties (following the evi: prefix pattern)
parent_params["evi:datasetCount"] = aggregated_metrics.dataset_count
parent_params["evi:computationCount"] = aggregated_metrics.computation_count
parent_params["evi:softwareCount"] = aggregated_metrics.software_count
parent_params["evi:schemaCount"] = aggregated_metrics.schema_count
parent_params["evi:totalContentSizeBytes"] = aggregated_metrics.total_content_size_bytes
parent_params["evi:entitiesWithSummaryStats"] = aggregated_metrics.entities_with_summary_stats
parent_params["evi:entitiesWithChecksums"] = aggregated_metrics.entities_with_checksums
parent_params["evi:totalEntities"] = aggregated_metrics.total_entities
parent_params["evi:formats"] = sorted(list(aggregated_metrics.formats))

try:
click.echo("\n=== Creating release RO-Crate ===")
parent_crate_root_dict = GenerateROCrate(**parent_params)
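For illustration only, not part of this diff: with the evi:-prefixed roll-up pattern above, the release-level parent_params might end up looking roughly like the sketch below. The ARK identifier and all counts are hypothetical.

# Hypothetical sketch of parent_params after the aggregated metrics are attached;
# identifiers and numbers are illustrative, not taken from a real release.
parent_params = {
    "guid": "ark:99999/example-release",
    "name": "Example Release",
    # ... other release metadata collected earlier in build_release ...
    "evi:datasetCount": 42,
    "evi:computationCount": 7,
    "evi:softwareCount": 5,
    "evi:schemaCount": 3,
    "evi:totalContentSizeBytes": 125_500_000_000,  # roughly "125.5 GB"
    "evi:entitiesWithSummaryStats": 18,
    "evi:entitiesWithChecksums": 40,
    "evi:totalEntities": 54,
    "evi:formats": ["application/json", "text/csv"],
}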
2 changes: 1 addition & 1 deletion src/fairscape_cli/commands/import_commands.py
@@ -325,7 +325,7 @@ def import_figshare(
@import_group.command('dataverse')
@click.argument('dataset-doi', type=str)
@click.option('--server-url', default='https://dataverse.harvard.edu', show_default=True, help='Dataverse server URL.')
@click.option('--token', required=False, type=str, help='Dataverse API token (optional, for restricted datasets).')
@click.option('--token', required=False, type=str, help='Dataverse API token.')
@generic_importer_options
@click.pass_context
def import_dataverse(
4 changes: 2 additions & 2 deletions src/fairscape_cli/commands/rocrate_commands.py
@@ -956,7 +956,7 @@ def registerModel(
"keywords": list(keywords),
"modelType": model_type,
"framework": framework,
"modelFormat": model_format,
"format": model_format,
"trainingDataset": list(training_dataset),
"generatedBy": generated_by,
"filepath": filepath,
@@ -1099,7 +1099,7 @@ def registerHuggingFaceModel(
"keywords": list(keywords) if keywords else hf_metadata.get('keywords', []),
"modelType": model_type or hf_metadata.get('model_type'),
"framework": framework or hf_metadata.get('framework'),
"modelFormat": model_format or hf_metadata.get('model_format'),
"format": model_format or hf_metadata.get('model_format'),
"trainingDataset": list(training_dataset) if training_dataset else hf_metadata.get('training_datasets', []),
"filepath": hf_metadata.get('download_url'),
"url": hf_metadata.get('landing_page_url'),
3 changes: 1 addition & 2 deletions src/fairscape_cli/data_fetcher/generic_data/research_data.py
@@ -80,8 +80,7 @@ def to_rocrate(self, output_dir: str, **kwargs) -> str:
"version": file_info.get("version", "1.0"),
"associatedPublication": self.doi or None,
"additionalDocumentation": None,
"format": file_format,
"schema": "",
"format": file_format,
"derivedFrom": [],
"usedBy": [],
"generatedBy": [],
14 changes: 9 additions & 5 deletions src/fairscape_cli/models/__init__.py
@@ -8,14 +8,16 @@

from fairscape_cli.models.computation import Computation, GenerateComputation
from fairscape_cli.models.rocrate import (
ROCrate,
ROCrate,
GenerateROCrate,
ReadROCrateMetadata,
AppendCrate,
ReadROCrateMetadata,
AppendCrate,
CopyToROCrate,
UpdateCrate,
LinkSubcrates,
collect_subcrate_metadata
collect_subcrate_metadata,
collect_subcrate_aggregated_metrics,
AggregatedMetrics
)
from fairscape_cli.models.bagit import BagIt
from fairscape_cli.models.pep import PEPtoROCrateMapper
@@ -39,5 +41,7 @@
'BagIt',
'PEPtoROCrateMapper',
'LinkSubcrates',
'collect_subcrate_metadata'
'collect_subcrate_metadata',
'collect_subcrate_aggregated_metrics',
'AggregatedMetrics'
]
1 change: 0 additions & 1 deletion src/fairscape_cli/models/biochem_entity.py
@@ -40,7 +40,6 @@ def GenerateBioChemEntity(
entityMetadata = {
"@id": guid,
"name": name,
"@type": "https://schema.org/BioChemEntity",
"description": description
}

1 change: 0 additions & 1 deletion src/fairscape_cli/models/computation.py
@@ -34,7 +34,6 @@ def GenerateComputation(
computationMetadata = {
"@id": guid,
"name": name,
"@type": "https://w3id.org/EVI#Computation"
}

for key, value in kwargs.items():
3 changes: 1 addition & 2 deletions src/fairscape_cli/models/dataset.py
@@ -41,8 +41,7 @@ def GenerateDataset(

datasetMetadata = {
"@id": guid,
"name": name,
"@type": "https://w3id.org/EVI#Dataset"
"name": name
}

content_url = None
3 changes: 1 addition & 2 deletions src/fairscape_cli/models/experiment.py
@@ -34,8 +34,7 @@ def GenerateExperiment(

experimentMetadata = {
"@id": guid,
"name": name,
"@type": "https://w3id.org/EVI#Experiment"
"name": name
}

for key, value in kwargs.items():
1 change: 0 additions & 1 deletion src/fairscape_cli/models/instrument.py
@@ -40,7 +40,6 @@ def GenerateInstrument(
instrumentMetadata = {
"@id": guid,
"name": name,
"@type": "https://w3id.org/EVI#Instrument"
}

if filepath and cratePath:
205 changes: 202 additions & 3 deletions src/fairscape_cli/models/rocrate.py
@@ -2,12 +2,13 @@
import shutil
import json
from datetime import datetime
from typing import Optional, Union, List, Literal, Dict, Any
from typing import Optional, Union, List, Literal, Dict, Any, Set
from dataclasses import dataclass, field
from pydantic import BaseModel, Field, ConfigDict, model_validator
import uuid
import mongomock

from fairscape_cli.config import NAAN, DEFAULT_CONTEXT
from fairscape_models import DEFAULT_CONTEXT, DEFAULT_ARK_NAAN as NAAN
from fairscape_cli.models.software import Software
from fairscape_cli.models.dataset import Dataset
from fairscape_cli.models.computation import Computation
@@ -530,6 +531,7 @@ def find_and_process_subcrates(directory: pathlib.Path, base_path: pathlib.Path)
print("No valid sub-crates found to link.")

return linked_sub_crate_ids

def collect_subcrate_metadata(parent_crate_path: pathlib.Path) -> dict:
"""
Collects author and keyword metadata from all subcrates in the parent crate.
@@ -584,7 +586,204 @@ def process_directory(directory)
'authors': sorted(list(authors)),
'keywords': sorted(list(keywords))
}



@dataclass
class AggregatedMetrics:
"""
Aggregated metrics from all sub-crates for AI-Ready scoring.

This class accumulates entity counts, statistics, checksums, formats,
and schema references from all sub-crates in a release to enable
efficient AI-Ready score calculation without recursive file reads.
"""

# Entity counts (for provenance scoring)
dataset_count: int = 0
computation_count: int = 0
software_count: int = 0
schema_count: int = 0

# Statistics (for characterization scoring)
total_content_size_bytes: int = 0
entities_with_summary_stats: int = 0

# Verification (for pre-model explainability)
entities_with_checksums: int = 0
total_entities: int = 0

# Computability
formats: Set[str] = field(default_factory=set)

# Standards
schemas: List[Dict[str, str]] = field(default_factory=list)


def _extract_content_size_bytes(size_str: str) -> int:
"""
Extract content size in bytes from a size string.

Args:
size_str: Size string like "125.5 GB" or "1.2 TB"

Returns:
Size in bytes as integer, or 0 if parsing fails
"""
if not size_str or not isinstance(size_str, str):
return 0

try:
size_str = size_str.strip().upper()
if "TB" in size_str:
return int(float(size_str.replace("TB", "").strip()) * 1e12)
elif "GB" in size_str:
return int(float(size_str.replace("GB", "").strip()) * 1e9)
elif "MB" in size_str:
return int(float(size_str.replace("MB", "").strip()) * 1e6)
elif "KB" in size_str:
return int(float(size_str.replace("KB", "").strip()) * 1e3)
else:
# Assume bytes if no unit
return int(float(size_str))
except (ValueError, AttributeError):
return 0


def _extract_checksum(entity: Dict[str, Any]) -> Optional[str]:
"""
Extract checksum from an entity.

Args:
entity: Entity dictionary from RO-Crate @graph

Returns:
Checksum string (e.g., "md5:abc123...") or None
"""
# Check common checksum fields
md5 = entity.get("md5") or entity.get("MD5")
if md5:
if md5.startswith("md5:"):
return md5
else:
return f"md5:{md5}"

sha256 = entity.get("sha256") or entity.get("SHA256")
if sha256:
if sha256.startswith("sha256:"):
return sha256
else:
return f"sha256:{sha256}"

return None


def _get_entity_type(entity: Dict[str, Any]) -> str:
"""
Get type from entity's @type or metadataType field.

Args:
entity: Entity dictionary from RO-Crate @graph

Returns:
Type string (last item if list), or empty string
"""
type_val = entity.get("@type") or entity.get("metadataType") or []
if isinstance(type_val, str):
return type_val
elif isinstance(type_val, list) and type_val:
return type_val[-1]
return ""


def collect_subcrate_aggregated_metrics(
parent_crate_path: pathlib.Path
) -> AggregatedMetrics:
"""
Collect aggregated metrics from all subcrates for AI-Ready scoring.

This function traverses all sub-crates in a release directory and
aggregates entity counts, statistics, checksums, formats, and schemas.
These aggregated metrics are added to the release-level RO-Crate to
enable efficient AI-Ready score calculation without requiring recursive
file system reads during scoring.

Args:
parent_crate_path: Path to the release directory containing sub-crates

Returns:
AggregatedMetrics object with all roll-up properties
"""
parent_crate_path = pathlib.Path(parent_crate_path)
metrics = AggregatedMetrics()
processed_files = set()

def process_directory(directory: pathlib.Path):
"""Recursively process directories to find and aggregate subcrate metadata."""
for path in directory.glob('**/ro-crate-metadata.json'):
if path.is_file() and str(path) not in processed_files:
processed_files.add(str(path))

subcrate_metadata = ReadROCrateMetadata(path)
graph = subcrate_metadata.get('@graph', [])

for entity in graph:
# Convert pydantic to dict
if hasattr(entity, 'model_dump'):
entity = entity.model_dump(by_alias=True)

if entity.get('@id') == 'ro-crate-metadata.json':
continue

entity_type = _get_entity_type(entity)

if "Dataset" in entity_type:
metrics.dataset_count += 1
metrics.total_entities += 1

elif "Computation" in entity_type or "Experiment" in entity_type:
metrics.computation_count += 1
metrics.total_entities += 1

elif "Software" in entity_type:
metrics.software_count += 1
metrics.total_entities += 1

elif "Schema" in entity_type:
metrics.schema_count += 1
schema_id = entity.get('@id')
if schema_id:
metrics.schemas.append({"@id": schema_id})

content_size = entity.get("contentSize")
if content_size:
size_bytes = _extract_content_size_bytes(content_size)
if size_bytes > 0:
metrics.total_content_size_bytes += size_bytes

if entity.get("hasSummaryStatistics"):
metrics.entities_with_summary_stats += 1

checksum = _extract_checksum(entity)
if checksum:
metrics.entities_with_checksums += 1

format_val = entity.get("format") or entity.get("encodingFormat")
if format_val:
if isinstance(format_val, str):
metrics.formats.add(format_val)
elif isinstance(format_val, list):
for fmt in format_val:
if isinstance(fmt, str):
metrics.formats.add(fmt)


for dir_item in parent_crate_path.iterdir():
if dir_item.is_dir():
process_directory(dir_item)

return metrics


################################
#
# Mongomock update tests
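A minimal usage sketch, not part of this diff (the release path is hypothetical): the collector walks every sub-crate's ro-crate-metadata.json under the release directory and returns one AggregatedMetrics object, which build_release then flattens into the evi:-prefixed properties shown earlier.

import pathlib
from fairscape_cli.models import collect_subcrate_aggregated_metrics

# Hypothetical release directory containing one subdirectory per sub-crate.
release_dir = pathlib.Path("./example_release")
metrics = collect_subcrate_aggregated_metrics(release_dir)

print(metrics.dataset_count, metrics.computation_count, metrics.total_entities)
print(sorted(metrics.formats))  # formats is a set; sort it before serializing

# Note: contentSize strings are normalized with decimal multipliers, so an
# entity reporting "1.2 TB" contributes int(1.2 * 1e12) bytes to
# total_content_size_bytes.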
3 changes: 1 addition & 2 deletions src/fairscape_cli/models/sample.py
@@ -39,8 +39,7 @@ def GenerateSample(

sampleMetadata = {
"@id": guid,
"name": name,
"@type": "https://w3id.org/EVI#Sample"
"name": name
}

if filepath and cratePath:
3 changes: 1 addition & 2 deletions src/fairscape_cli/models/software.py
@@ -40,8 +40,7 @@ def GenerateSoftware(

softwareMetadata = {
"@id": guid,
"name" : name,
"@type": "https://w3id.org/EVI#Software"
"name" : name
}

content_url = None