diff --git a/fairscape_models/conversion/mapping/AIReady.py b/fairscape_models/conversion/mapping/AIReady.py
index b23c7e0..aea3a9b 100644
--- a/fairscape_models/conversion/mapping/AIReady.py
+++ b/fairscape_models/conversion/mapping/AIReady.py
@@ -181,6 +181,31 @@ def _score_characterization(characterization: CharacterizationScore, root_data:
             has_content=True,
             details=str(bias)[:200] + ("..." if len(str(bias)) > 200 else "")
         )
+
+    missing_data_doc = root_data.get("rai:dataCollectionMissingData", "")
+    if missing_data_doc and str(missing_data_doc).strip():
+        characterization.data_quality = SubCriterionScore(
+            has_content=True,
+            details=str(missing_data_doc)[:200] + ("..." if len(str(missing_data_doc)) > 200 else "")
+        )
+
+    schema_count = root_data.get("evi:schemaCount")
+
+    if schema_count is None:
+        # Fall back to counting in metadata_graph (for backwards compatibility)
+        schema_count = 0
+
+        for entity in metadata_graph:
+            entity_type = _get_type(entity)
+
+            if "schema" in entity_type:
+                schema_count += 1
+
+    if schema_count > 0:
+        characterization.standards = SubCriterionScore(
+            has_content=True,
+            details=f"{schema_count} schema(s) documented"
+        )
 
     # Check for aggregated metrics first
     total_size_bytes = root_data.get("evi:totalContentSizeBytes")
@@ -409,6 +434,12 @@ def _score_computability(computability: ComputabilityScore, root_data: Dict[str,
             has_content=True,
             details=f"Formats: {', '.join(fmt_list)}{suffix}"
         )
+
+    if root_data.get("publisher"):
+        computability.computationally_accessible = SubCriterionScore(
+            has_content=True,
+            details=f"Publisher: {root_data.get('publisher')}"
+        )
 
 def _build_ai_ready_score(value: Any, *, converter_instance) -> AIReadyScore:
     """Builder function for use with ROCToTargetConverter."""
diff --git a/fairscape_models/conversion/mapping/FairscapeDatasheet.py b/fairscape_models/conversion/mapping/FairscapeDatasheet.py
index ff2568b..0465815 100644
--- a/fairscape_models/conversion/mapping/FairscapeDatasheet.py
+++ b/fairscape_models/conversion/mapping/FairscapeDatasheet.py
@@ -112,9 +112,27 @@ def _extract_id(value: Any) -> Optional[str]:
 USECASES_MAPPING = {
     "intended_use": {"source_key": "rai:dataUseCases"},
     "limitations": {"source_key": "rai:dataLimitations"},
-    "prohibited_uses": {"source_key": "additionalProperty", "parser": from_additional_property("Prohibited Uses")},
+    "prohibited_uses": {"source_key": "additionalProperty", "parser": from_additional_property("Prohibited Uses")},
     "potential_sources_of_bias": {"source_key": "rai:dataBiases"},
-    "maintenance_plan": {"source_key": "rai:dataMaintenancePlan"},
+    "maintenance_plan": {"source_key": "rai:dataReleaseMaintenancePlan"},
+
+    # Additional RAI fields
+    "data_collection": {"source_key": "rai:dataCollection"},
+    "data_collection_type": {"source_key": "rai:dataCollectionType", "parser": _list_to_str},
+    "data_collection_missing_data": {"source_key": "rai:dataCollectionMissingData"},
+    "data_collection_raw_data": {"source_key": "rai:dataCollectionRawData"},
+    "data_collection_timeframe": {"source_key": "rai:dataCollectionTimeframe", "parser": _list_to_str},
+    "data_imputation_protocol": {"source_key": "rai:dataImputationProtocol"},
+    "data_manipulation_protocol": {"source_key": "rai:dataManipulationProtocol"},
+    "data_preprocessing_protocol": {"source_key": "rai:dataPreprocessingProtocol", "parser": _list_to_str},
+    "data_annotation_protocol": {"source_key": "rai:dataAnnotationProtocol"},
+    "data_annotation_platform": {"source_key": "rai:dataAnnotationPlatform", "parser": _list_to_str},
+    "data_annotation_analysis": {"source_key": "rai:dataAnnotationAnalysis", "parser": _list_to_str},
+    "personal_sensitive_information": {"source_key": "rai:personalSensitiveInformation", "parser": _list_to_str},
+    "data_social_impact": {"source_key": "rai:dataSocialImpact"},
+    "annotations_per_item": {"source_key": "rai:annotationsPerItem"},
+    "annotator_demographics": {"source_key": "rai:annotatorDemographics", "parser": _list_to_str},
+    "machine_annotation_tools": {"source_key": "rai:machineAnnotationTools", "parser": _list_to_str},
 }
 
 USECASES_MAPPING_CONFIGURATION = {
diff --git a/fairscape_models/conversion/mapping/d4d.py b/fairscape_models/conversion/mapping/d4d.py
index 05233eb..0deb19b 100644
--- a/fairscape_models/conversion/mapping/d4d.py
+++ b/fairscape_models/conversion/mapping/d4d.py
@@ -1,50 +1,47 @@
+"""
+ROCrate to D4D conversion mappings and utility functions.
+
+This module provides the mapping configurations and parser functions needed
+to convert ROCrate format data to D4D (Datasheets for Datasets) format.
+"""
+
+from typing import Dict, Any, Optional, List
 from datetime import datetime
 import re
-from typing import List, Dict, Any, Optional
-from pathlib import Path
-import json
-
-from fairscape_models.conversion.models.d4d import DatasetCollection, Dataset, FormatEnum
-from fairscape_models.rocrate import ROCrateV1_2
-
-def parse_authors_from_ro_crate(authors: Any) -> List[str]:
-    if not authors: return []
-    if isinstance(authors, str):
-        return [name.strip() for name in authors.replace(';', ',').split(',') if name.strip()]
-    elif isinstance(authors, list):
-        return [str(item) for item in authors]
-    return []
-
-def parse_funders_from_ro_crate(funders: Any) -> List[str]:
-    if not funders: return []
-    if isinstance(funders, str):
-        return [part.strip() for part in re.split(r'\.\s*|[;,]', funders) if part.strip()]
-    elif isinstance(funders, list):
-        return [str(item) for item in funders]
-    return []
-
-def parse_keywords_simple(keywords: Any) -> List[str]:
-    if not keywords: return []
+
+
+# ============================================================================
+# Parser Functions - Type Conversions
+# ============================================================================
+
+def _parse_iso_to_datetime(dt: Any) -> Optional[datetime]:
+    """Convert ISO format strings to datetime objects."""
+    if dt is None:
+        return None
+    if isinstance(dt, datetime):
+        return dt
+    if isinstance(dt, str):
+        # Try various formats
+        for fmt in ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%m/%d/%Y"]:
+            try:
+                return datetime.strptime(dt.split(".")[0], fmt)
+            except ValueError:
+                continue
+    return None
+
+def _parse_keywords_to_list(keywords: Any) -> Optional[List[str]]:
+    """Convert keywords to list of strings."""
+    if not keywords:
+        return None
     if isinstance(keywords, str):
         return [kw.strip() for kw in re.split(r'[;,]', keywords) if kw.strip()]
     elif isinstance(keywords, list):
-        return [str(item) for item in keywords]
-    return []
-
-def parse_related_publications(value_from_lookup: Any) -> List[str]:
-    if not value_from_lookup: return []
-    pubs = []
-    items_to_process = value_from_lookup if isinstance(value_from_lookup, list) else [value_from_lookup]
-
-    for pub in items_to_process:
-        if isinstance(pub, dict):
-            citation = pub.get("citation") or pub.get("name") or pub.get("@id")
-            if citation: pubs.append(str(citation))
-        elif isinstance(pub, str) and pub.strip():
-            pubs.append(pub.strip())
-    return pubs
-
-def parse_file_size_to_bytes(size_value: Any) -> Optional[int]:
+        return [str(item) for item in keywords if item]
+    return None
+
+
+def _parse_size_to_bytes(size_value: Any) -> Optional[int]:
+    """Convert human-readable size strings to bytes."""
     if size_value is None:
         return None
     if isinstance(size_value, int):
@@ -53,7 +50,7 @@ def parse_file_size_to_bytes(size_value: Any) -> Optional[int]:
         size_str = size_value.strip().lower()
         if size_str.isdigit():
             return int(size_str)
-
+
     units = {
         'b': 1, 'byte': 1, 'bytes': 1,
         'kb': 1024, 'kilobyte': 1024, 'kilobytes': 1024,
@@ -61,7 +58,7 @@ def parse_file_size_to_bytes(size_value: Any) -> Optional[int]:
         'gb': 1024**3, 'gigabyte': 1024**3, 'gigabytes': 1024**3,
         'tb': 1024**4, 'terabyte': 1024**4, 'terabytes': 1024**4
     }
-
+
     for unit, multiplier in units.items():
         if size_str.endswith(unit):
            try:
@@ -71,176 +68,116 @@ def parse_file_size_to_bytes(size_value: Any) -> Optional[int]:
                 continue
     return None
 
-def from_additional_property(name: str, default: Optional[str] = None):
-    def _parser(prop_list: Any) -> Optional[str]:
-        if isinstance(prop_list, list):
-            for p in prop_list:
-                if isinstance(p, dict) and p.get("name") == name:
-                    val = p.get("value")
-                    return str(val) if val is not None else default
-        return default
-    return _parser
-
-def _build_datasets_from_subcrates(*, converter_instance, source_entity_model) -> List[Dataset]:
-    """Build Dataset objects from sub-crates, with sub-crate properties taking precedence."""
-    datasets = []
-    root_dict = source_entity_model.model_dump(by_alias=True)
-
-    release_properties = {
-        "version": root_dict.get("version"),
-        "license": root_dict.get("license"),
-        "keywords": parse_keywords_simple(root_dict.get("keywords")),
-        "created_on": root_dict.get("datePublished"),
-        "issued": root_dict.get("datePublished"),
-        "publisher": root_dict.get("publisher"),
-        "doi": root_dict.get("identifier"),
-        "creators": parse_authors_from_ro_crate(root_dict.get("author")),
-        "funders": parse_funders_from_ro_crate(root_dict.get("funder")),
-        "purposes": root_dict.get("rai:dataUseCases"),
-        "tasks": root_dict.get("rai:dataLimitations"),
-        "ethical_reviews": root_dict.get("ethicalReview"),
-        "discouraged_uses": from_additional_property("Prohibited Uses")(root_dict.get("additionalProperty")),
-        "updates": root_dict.get("rai:dataReleaseMaintenancePlan"),
-        "sensitive_info": root_dict.get("rai:personalSensitiveInformation"),
-    }
-
-    subcrate_entities = [
-        e for e in converter_instance.source_crate.metadataGraph
-        if hasattr(e, 'ro-crate-metadata') and e.guid != "ro-crate-metadata.json"
-    ]
-
-    for subcrate_entity in subcrate_entities:
-        subcrate_dict = subcrate_entity.model_dump(by_alias=True)
-        metadata_path = getattr(subcrate_entity, 'ro-crate-metadata', None)
-
-        dataset_args = {
-            "id": subcrate_dict.get("@id", f"subcrate-{len(datasets)}"),
-            "title": subcrate_dict.get("name", "Unnamed Subcrate"),
-            "description": subcrate_dict.get("description"),
-            "download_url": subcrate_dict.get("contentUrl"),
-            "path": subcrate_dict.get("contentUrl"),
-            "bytes": parse_file_size_to_bytes(subcrate_dict.get("contentSize")),
-            "md5": subcrate_dict.get("md5"),
-            **release_properties
-        }
-
-        if metadata_path:
-            try:
-                with Path(metadata_path).open("r") as f:
-                    sub_crate_dict = json.load(f)
-
-                sub_graph = sub_crate_dict.get("@graph", [])
-
-                subcrate_root = None
-                for idx, entity in enumerate(sub_graph):
-                    if entity.get("@id") == "ro-crate-metadata.json":
-                        about_ref = entity.get("about", {})
-                        root_id = about_ref.get("@id") if isinstance(about_ref, dict) else about_ref
-                        if root_id:
-                            for e in sub_graph:
-                                if e.get("@id") == root_id:
-                                    subcrate_root = e
-                                    break
-                        break
-
-                if subcrate_root:
-                    if subcrate_root.get("version"):
-                        dataset_args["version"] = subcrate_root.get("version")
-                    if subcrate_root.get("license"):
-                        dataset_args["license"] = subcrate_root.get("license")
-                    if subcrate_root.get("keywords"):
-                        dataset_args["keywords"] = parse_keywords_simple(subcrate_root.get("keywords"))
-                    if subcrate_root.get("datePublished"):
-                        date_str = subcrate_root.get("datePublished")
-                        parsed_date = datetime.strptime(date_str, "%m/%d/%Y" if "/" in date_str else "%Y-%m-%d")
-                        dataset_args["created_on"] = parsed_date
-                        dataset_args["issued"] = parsed_date
-                    if subcrate_root.get("publisher"):
-                        dataset_args["publisher"] = subcrate_root.get("publisher")
-                    if subcrate_root.get("identifier"):
-                        dataset_args["doi"] = subcrate_root.get("identifier")
-                    if subcrate_root.get("author"):
-                        dataset_args["creators"] = parse_authors_from_ro_crate(subcrate_root.get("author"))
-                    if subcrate_root.get("funder"):
-                        dataset_args["funders"] = parse_funders_from_ro_crate(subcrate_root.get("funder"))
-                    if subcrate_root.get("rai:dataUseCases"):
-                        dataset_args["purposes"] = subcrate_root.get("rai:dataUseCases")
-                    if subcrate_root.get("rai:dataLimitations"):
-                        dataset_args["tasks"] = subcrate_root.get("rai:dataLimitations")
-                    if subcrate_root.get("ethicalReview"):
-                        dataset_args["ethical_reviews"] = subcrate_root.get("ethicalReview")
-                    if subcrate_root.get("rai:dataReleaseMaintenancePlan"):
-                        dataset_args["updates"] = subcrate_root.get("rai:dataReleaseMaintenancePlan")
-                    if subcrate_root.get("rai:personalSensitiveInformation"):
-                        dataset_args["sensitive_info"] = subcrate_root.get("rai:personalSensitiveInformation")
-
-                    addl_props = subcrate_root.get("additionalProperty")
-                    if addl_props:
-                        prohibited_uses = from_additional_property("Prohibited Uses")(addl_props)
-                        if prohibited_uses:
-                            dataset_args["discouraged_uses"] = prohibited_uses
-
-                sub_rocrate = ROCrateV1_2.model_validate(sub_crate_dict)
-
-                formats = set()
-                for entity in sub_rocrate.metadataGraph:
-                    entity_dict = entity.model_dump(by_alias=True)
-                    entity_type = entity_dict.get("@type") or []
-                    if isinstance(entity_type, str):
-                        entity_type = [entity_type]
-
-                    if any(t in str(entity_type).lower() for t in ["dataset", "file"]):
-                        fmt = entity_dict.get("fileFormat") or entity_dict.get("format")
-                        if fmt:
-                            formats.add(str(fmt))
-
-                if formats:
-                    for fmt in formats:
-                        try:
-                            dataset_args["format"] = FormatEnum(fmt)
-                            break
-                        except:
-                            pass
-            except Exception as e:
-                print(f"Could not parse subcrate metadata: {e}")
-
-        dataset_args = {k: v for k, v in dataset_args.items() if v is not None}
-
-        try:
-            datasets.append(Dataset(**dataset_args))
-        except Exception as e:
-            print(f"Error creating Dataset: {e}")
-            print(f"Args: {dataset_args}")
-
-    return datasets
-
-D4D_DATASET_COLLECTION_MAPPING = {
+
+def _string_to_list(value: Any) -> Optional[List[str]]:
+    """Convert a string to a single-item list, or return list as-is."""
+    if value is None:
+        return None
+    if isinstance(value, list):
+        return value
+    if isinstance(value, str):
+        return [value]
+    return [str(value)]
+
+
+# ============================================================================
+# Builder Functions - Complex Field Extraction
+# ============================================================================
+
+
+
+# ============================================================================
+# Mapping Configuration
+# ============================================================================
+
+ROCRATE_TO_D4D_MAPPING = {
+
+    # named thing
     "id": {"source_key": "@id"},
+    "name": {"source_key": "name"},
     "title": {"source_key": "name"},
     "description": {"source_key": "description"},
-    "version": {"source_key": "version"},
-    "license": {"source_key": "license"},
-    "keywords": {"source_key": "keywords", "parser": parse_keywords_simple},
-    "created_on": {"source_key": "datePublished"},
-    "issued": {"source_key": "datePublished"},
-    "publisher": {"source_key": "publisher"},
+
+    # information
+    "compression": {"source_key": "evi:formats"},
+    "conforms_to": {"fixed_value": "D4D Schema"},
+    "created_by": {"source_key": "author"},
+    "created_on": {"source_key": "dateCreated", "parser": _parse_iso_to_datetime},
     "doi": {"source_key": "identifier"},
     "download_url": {"source_key": "contentUrl"},
-    "resources": {"builder_func": _build_datasets_from_subcrates},
-}
+    "keywords": {"source_key": "keywords"},
+    "language": {"source_key": "language"},
+    "last_updated_on": {"source_key": "dateModified", "parser": _parse_iso_to_datetime},
+    "license": {"source_key": "license"},
+    "page": {"source_key": "url"},
+    "publisher": {"source_key": "publisher"},
+    "version": {"source_key": "version"},
+    "was_derived_from": {"source_key": "generatedBy"},
+
+
+
-MAPPING_CONFIGURATION = {
-    "entity_map": {
-        ("ROCrateMetadataElem", "ROOT"): {
-            "target_class": DatasetCollection,
-            "mapping_def": D4D_DATASET_COLLECTION_MAPPING
-        },
-
-        ("Dataset", "COMPONENT"): None,
-        ("Schema", "COMPONENT"): None,
-        ("Software", "COMPONENT"): None,
-        ("Computation", "COMPONENT"): None,
-    },
-
-    "assembly_instructions": []
-}
\ No newline at end of file
+
+    # Dataset
+
+    "bytes": {"source_key": "contentSize", "parser": _parse_size_to_bytes},
+    "encoding": {"source_key": "evi:formats"},
+    "format": {"source_key": "evi:formats"},
+    "hash": {"source_key": "md5"},
+    "md5": {"source_key": "md5"},
+    "sha256": {"source_key": "sha256"},
+
+    # media_type, path, external_resources, resources
+
+    "purposes": {"source_key": "rai:dataUseCases"},
+    "tasks": {"source_key": "rai:dataUseCases"},
+    # addressing_gaps
+    "creators": {"source_key": "author"},
+    "funders": {"source_key": "funder"},
+
+    # subsets, instances, anomalies
+
+    "known_biases": {"source_key": "rai:dataBiases"},
+    "known_limitations": {"source_key": "rai:dataLimitations"},
+
+    # confidential_elements, content_warnings, subpopulations
+    "sensitive_elements": {"source_key": "rai:personalSensitiveInformation"},
+    "acquisition_methods": {"source_key": "rai:dataCollection"},
+    "collection_mechanisms": {"source_key": "rai:dataCollection"},
+
+    # sampling_strategies, data_collectors
+
+    "collection_timeframes": {"source_key": "rai:dataCollectionTimeframe"},
+    "missing_data_documentation": {"source_key": "rai:dataCollectionMissingData"},
+    "raw_data_sources": {"source_key": "rai:dataCollectionRawData"},
+    "ethical_reviews": {"source_key": "ethicalReview"},
+
+    # data_protection_impacts
+    "human_subject_research": {"source_key": "humanSubject"},
+
+    # informed_consent, participant_privacy, participant_compensation, vulnerable_populations
+
+    "preprocessing_strategies": {"source_key": "rai:dataPreprocessingProtocol"},
+
+    # cleaning_strategies
+
+    "labeling_strategies": {"source_key": "rai:dataAnnotationProtocol"},
+    "raw_sources": {"source_key": "rai:dataCollectionRawData"},
+    "imputation_protocols": {"source_key": "rai:dataImputationProtocol"},
+    "annotation_analyses": {"source_key": "rai:dataAnnotationAnalysis"},
+    "machine_annotation_tools": {"source_key": "rai:machineAnnotationTools"},
+
+    # existing_uses, use_repository, other_tasks
+
+    "future_use_impacts": {"source_key": "rai:dataSocialImpact"},
+    "discouraged_uses": {"source_key": "prohibitedUses"},
+    "intended_uses": {"source_key": "rai:dataUseCases"},
+    "prohibited_uses": {"source_key": "prohibitedUses"},
+    "distribution_formats": {"source_key": "evi:formats"},
+    "license_and_use_terms": {"source_key": "license"},
+
+    # ip_restrictions, regional_restrictions, maintainers, errata, version_access, extension_mechanism, variables, is_deidentified, is_tabular
+
+    "citation": {"source_key": "citation"},
+
+}
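A toy driver makes the three entry kinds in `ROCRATE_TO_D4D_MAPPING` concrete: plain `source_key`, `source_key` with a `parser`, and `fixed_value`. The driver and the sample crate values are illustrative only; the real evaluation happens in the converter, not here:

```python
# Illustrative evaluation of ROCRATE_TO_D4D_MAPPING entries; sample values invented.
from fairscape_models.conversion.mapping.d4d import ROCRATE_TO_D4D_MAPPING

root = {
    "@id": "ark:59852/example-crate",
    "name": "Example Crate",
    "contentSize": "1.5 GB",
    "dateCreated": "2024-05-01T12:30:00",
}

d4d_kwargs = {}
for field, rule in ROCRATE_TO_D4D_MAPPING.items():
    if "fixed_value" in rule:                 # e.g. conforms_to
        d4d_kwargs[field] = rule["fixed_value"]
        continue
    raw = root.get(rule["source_key"])
    if raw is None:
        continue
    parser = rule.get("parser")
    d4d_kwargs[field] = parser(raw) if parser else raw

# d4d_kwargs -> {"id": "ark:59852/example-crate", "name": "Example Crate",
#                "title": "Example Crate", "conforms_to": "D4D Schema",
#                "bytes": 1610612736 (assuming the size parser as sketched above),
#                "created_on": datetime(2024, 5, 1, 12, 30), ...}
```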
_flatten_to_string(source_dict["maintainers"]) - if extracted: - parts.append(f"Maintainers: {extracted}") - if source_dict.get("updates"): - extracted = _flatten_to_string(source_dict["updates"]) - if extracted: - parts.append(f"Updates: {extracted}") - if source_dict.get("retention_limit"): - extracted = _flatten_to_string(source_dict["retention_limit"]) - if extracted: - parts.append(f"Retention: {extracted}") - return " | ".join(parts) if parts else None - - -def _combine_collection_info(source_dict: Dict[str, Any]) -> Optional[str]: - """Combine data collection information.""" +def _extract_missing_data(source_dict: Dict[str, Any]) -> Optional[str]: + """Extract missing data information from instances and documentation.""" items = [] - if source_dict.get("acquisition_methods"): - extracted = _flatten_to_string(source_dict["acquisition_methods"]) + # Extract from missing_data_documentation at dataset level + if source_dict.get("missing_data_documentation"): + extracted = _flatten_to_string(source_dict["missing_data_documentation"]) if extracted: items.append(extracted) + # Extract missing_information from instances if source_dict.get("instances"): instances = source_dict["instances"] if isinstance(instances, list): - items.append(f"{len(instances)} instances") - else: - items.append("Instances documented") + for inst in instances: + if isinstance(inst, dict) and inst.get("missing_information"): + extracted = _flatten_to_string(inst["missing_information"]) + if extracted: + items.append(extracted) return " ".join(items) if items else None -def _combine_collection_mechanisms(source_dict: Dict[str, Any]) -> List[str]: - """Extract collection mechanisms as a list.""" +def _extract_annotations_per_item(source_dict: Dict[str, Any]) -> Optional[str]: + """Extract annotations_per_item from labeling strategies.""" items = [] - if source_dict.get("collection_mechanisms"): - extracted = _flatten_to_list(source_dict["collection_mechanisms"]) - if extracted: - items.extend(extracted) - return items if items else None - - -def _combine_sensitive_info(source_dict: Dict[str, Any]) -> List[str]: - """Combine sensitive information fields.""" + if source_dict.get("labeling_strategies"): + strategies = source_dict["labeling_strategies"] + if isinstance(strategies, list): + for strategy in strategies: + if isinstance(strategy, dict) and strategy.get("annotations_per_item"): + items.append(str(strategy.get("annotations_per_item"))) + elif isinstance(strategies, dict) and strategies.get("annotations_per_item"): + items.append(str(strategies.get("annotations_per_item"))) + return ", ".join(items) if items else None + +def _extract_annotations_platform(source_dict: Dict[str, Any]) -> Optional[str]: + """Extract annotations_platform from labeling strategies.""" items = [] - for key in ["confidential_elements", "sensitive_elements"]: - if source_dict.get(key): - extracted = _flatten_to_list(source_dict[key]) - if extracted: - items.extend(extracted) - if source_dict.get("is_deidentified"): - deident = _flatten_to_string(source_dict["is_deidentified"]) - if deident: - items.append(f"Deidentified: {deident}") - return items if items else None - - -def _combine_social_impact(source_dict: Dict[str, Any]) -> Optional[str]: - """Combine social impact-related fields.""" - items = [] - for key in ["future_use_impacts", "data_protection_impacts"]: - if source_dict.get(key): - extracted = _flatten_to_string(source_dict[key]) - if extracted: - items.append(extracted) - return " ".join(items) if items else None + if 
source_dict.get("labeling_strategies"): + strategies = source_dict["labeling_strategies"] + if isinstance(strategies, list): + for strategy in strategies: + if isinstance(strategy, dict) and strategy.get("data_annotation_platform"): + items.append(str(strategy.get("data_annotation_platform"))) + elif isinstance(strategies, dict) and strategies.get("data_annotation_platform"): + items.append(str(strategies.get("data_annotation_platform"))) + return ", ".join(items) if items else None + + +def _extract_confidentiality_level(source_dict: Dict[str, Any]) -> Optional[str]: + """Extract confidentiality level from regulatory restrictions.""" + if source_dict.get("regulatory_restrictions"): + restrictions = source_dict["regulatory_restrictions"] + if isinstance(restrictions, dict): + level = restrictions.get("confidentiality_level") + if level: + return _format_enum_value(level) + elif isinstance(restrictions, list): + for r in restrictions: + if isinstance(r, dict) and r.get("confidentiality_level"): + return _format_enum_value(r.get("confidentiality_level")) + return None + + +def _extract_governance_committee(source_dict: Dict[str, Any]) -> Optional[str]: + """Extract governance committee contact from regulatory restrictions.""" + if source_dict.get("regulatory_restrictions"): + restrictions = source_dict["regulatory_restrictions"] + if isinstance(restrictions, dict): + return restrictions.get("governance_committee_contact") + elif isinstance(restrictions, list): + for r in restrictions: + if isinstance(r, dict) and r.get("governance_committee_contact"): + return r.get("governance_committee_contact") + return None + + +def _extract_principal_investigator(source_dict: Dict[str, Any]) -> Optional[str]: + """Extract principal investigators from creators where principal_investigator=True.""" + pi_names = [] + if source_dict.get("creators"): + creators = source_dict["creators"] + if isinstance(creators, list): + for creator in creators: + if isinstance(creator, dict): + if creator.get("principal_investigator"): + person = creator.get("person") + if isinstance(person, dict): + name = person.get("name") or person.get("id") + if name: + pi_names.append(str(name)) + elif isinstance(person, str): + pi_names.append(person) + elif creator.get("name"): + pi_names.append(str(creator.get("name"))) + elif creator.get("id"): + pi_names.append(str(creator.get("id"))) + return ", ".join(pi_names) if pi_names else None + + +def _extract_contact_email(source_dict: Dict[str, Any]) -> Optional[str]: + """Extract contact email from ethical reviews contact_person.""" + if source_dict.get("ethical_reviews"): + reviews = source_dict["ethical_reviews"] + if isinstance(reviews, list): + for review in reviews: + if isinstance(review, dict): + contact = review.get("contact_person") + if isinstance(contact, dict) and contact.get("email"): + return contact.get("email") + elif isinstance(contact, str): + # It's just an ID/string reference + return contact + elif isinstance(reviews, dict): + contact = reviews.get("contact_person") + if isinstance(contact, dict) and contact.get("email"): + return contact.get("email") + elif isinstance(contact, str): + return contact + return None # ============================================================================ @@ -270,23 +289,44 @@ def _combine_social_impact(source_dict: Dict[str, Any]) -> Optional[str]: "conditionsOfAccess": {"builder_func": _combine_license_terms}, "conformsTo": {"source_key": "conforms_to"}, - # RAI (Responsible AI) properties - "rai:dataLimitations": 
{"builder_func": _combine_limitations}, - "rai:dataBiases": {"builder_func": _combine_biases}, - "rai:dataUseCases": {"builder_func": _combine_use_cases}, - "rai:dataReleaseMaintenancePlan": {"builder_func": _combine_maintenance}, - "rai:dataCollection": {"builder_func": _combine_collection_info}, - "rai:dataCollectionType": {"builder_func": _combine_collection_mechanisms}, - "rai:dataCollectionRawData": {"source_key": "raw_sources", "parser": _flatten_to_string}, - "rai:dataManipulationProtocol": {"source_key": "cleaning_strategies", "parser": _flatten_to_string}, - "rai:dataPreprocessingProtocol": {"source_key": "preprocessing_strategies", "parser": _flatten_to_string}, + # RAI (Responsible AI) properties - Data Lifecycle + "rai:dataLimitations": {"source_key": "known_limitations", "parser": _flatten_to_string}, + "rai:dataCollection": {"source_key": "collection_mechanisms", "parser": _flatten_to_string}, + "rai:dataCollectionType": {"source_key": "collection_mechanisms", "parser": _flatten_to_string}, + "rai:dataCollectionMissingData": {"builder_func": _extract_missing_data}, + "rai:dataCollectionRawData": {"source_key": "raw_data_sources", "parser": _flatten_to_string}, + "rai:dataCollectionTimeframe": {"source_key": "collection_timeframes", "parser": _flatten_to_string}, + "rai:dataPreprocessingProtocol": {"source_key": "preprocessing_strategies", "parser": _flatten_to_list}, + + # RAI - Data Labeling "rai:dataAnnotationProtocol": {"source_key": "labeling_strategies", "parser": _flatten_to_string}, - "rai:personalSensitiveInformation": {"builder_func": _combine_sensitive_info}, + "rai:dataAnnotationPlatform": {"builder_func": _extract_annotations_platform}, + "rai:dataAnnotationProtocol": {"source_key": "annotation_analysis", "parser": _flatten_to_string}, + "rai:annotationsPerItem": {"builder_func": _extract_annotations_per_item}, + "rai:machineAnnotationTools": {"source_key": "machine_annotation_tools", "parser": _flatten_to_string}, + + # RAI - Safety & Fairness + "rai:dataBiases": {"source_key": "known_biases", "parser": _flatten_to_string}, "rai:dataSocialImpact": {"builder_func": _combine_social_impact}, + "rai:personalSensitiveInformation": {"source_key": "sensitive_elements", "parser": _flatten_to_string}, + "rai:dataUseCases": {"source_key": "intended_uses", "parser": _flatten_to_string}, + + # RAI - Compliance & Governance + "rai:dataManipulationProtocol": {"source_key": "cleaning_strategies", "parser": _flatten_to_string}, + "rai:dataImputationProtocol": {"source_key": "imputation_method", "parser": _flatten_to_string}, + "rai:dataReleaseMaintenancePlan": {"source_key": "update_plan", "parser": _flatten_to_string}, # Additional metadata "funder": {"source_key": "funders", "parser": _flatten_to_string}, "ethicalReview": {"source_key": "ethical_reviews", "parser": _flatten_to_string}, + "citation": {"source_key": "citation"}, + "principalInvestigator": {"builder_func": _extract_principal_investigator}, + "contactEmail": {"builder_func": _extract_contact_email}, + "confidentialityLevel": {"builder_func": _extract_confidentiality_level}, + "humanSubject": {"source_key": "human_subject_research", "parser": _flatten_to_string}, + "governanceCommittee": {"builder_func": _extract_governance_committee}, + "prohibitedUses": {"source_key": "discouraged_uses", "parser": _flatten_to_string}, + "evi:formats": {"source_key": "distribution_formats", "parser": _flatten_to_string}, } @@ -311,32 +351,46 @@ def _combine_social_impact(source_dict: Dict[str, Any]) -> Optional[str]: "url": 
{"source_key": "page"}, "contentUrl": {"source_key": "download_url"}, "encodingFormat": {"source_key": "encoding", "parser": _format_enum_value}, - "fileFormat": {"source_key": "format", "parser": _format_enum_value}, "contentSize": {"source_key": "bytes", "parser": _parse_bytes_to_size_string}, - - # Checksums - "md5": {"source_key": "md5"}, - "sha256": {"source_key": "sha256"}, - - # Access and conformance "conditionsOfAccess": {"builder_func": _combine_license_terms}, "conformsTo": {"source_key": "conforms_to"}, - # RAI (Responsible AI) properties - "rai:dataLimitations": {"builder_func": _combine_limitations}, - "rai:dataBiases": {"builder_func": _combine_biases}, - "rai:dataUseCases": {"builder_func": _combine_use_cases}, - "rai:dataReleaseMaintenancePlan": {"builder_func": _combine_maintenance}, - "rai:dataCollection": {"builder_func": _combine_collection_info}, - "rai:dataCollectionType": {"builder_func": _combine_collection_mechanisms}, + # RAI (Responsible AI) properties - Data Lifecycle + "rai:dataLimitations": {"source_key": "known_limitations", "parser": _flatten_to_string}, + "rai:dataCollection": {"source_key": "collection_mechanisms", "parser": _flatten_to_string}, + "rai:dataCollectionType": {"source_key": "collection_mechanisms", "parser": _flatten_to_string}, + "rai:dataCollectionMissingData": {"builder_func": _extract_missing_data}, "rai:dataCollectionRawData": {"source_key": "raw_sources", "parser": _flatten_to_string}, - "rai:dataManipulationProtocol": {"source_key": "cleaning_strategies", "parser": _flatten_to_string}, - "rai:dataPreprocessingProtocol": {"source_key": "preprocessing_strategies", "parser": _flatten_to_string}, + "rai:dataCollectionTimeframe": {"source_key": "collection_timeframes", "parser": _flatten_to_string}, + "rai:dataPreprocessingProtocol": {"source_key": "preprocessing_strategies", "parser": _flatten_to_list}, + + # RAI - Data Labeling "rai:dataAnnotationProtocol": {"source_key": "labeling_strategies", "parser": _flatten_to_string}, - "rai:personalSensitiveInformation": {"builder_func": _combine_sensitive_info}, + "rai:dataAnnotationPlatform": {"builder_func": _extract_annotations_platform}, + "rai:dataAnnotationProtocol": {"source_key": "annotation_analysis", "parser": _flatten_to_string}, + "rai:annotationsPerItem": {"builder_func": _extract_annotations_per_item}, + "rai:machineAnnotationTools": {"source_key": "machine_annotation_tools", "parser": _flatten_to_string}, + + # RAI - Safety & Fairness + "rai:dataBiases": {"source_key": "known_biases", "parser": _flatten_to_string}, "rai:dataSocialImpact": {"builder_func": _combine_social_impact}, + "rai:personalSensitiveInformation": {"source_key": "sensitive_elements", "parser": _flatten_to_string}, + "rai:dataUseCases": {"source_key": "intended_uses", "parser": _flatten_to_string}, + + # RAI - Compliance & Governance + "rai:dataManipulationProtocol": {"source_key": "cleaning_strategies", "parser": _flatten_to_string}, + "rai:dataImputationProtocol": {"source_key": "imputation_method", "parser": _flatten_to_string}, + "rai:dataReleaseMaintenancePlan": {"source_key": "update_plan", "parser": _flatten_to_string}, # Additional metadata "funder": {"source_key": "funders", "parser": _flatten_to_string}, "ethicalReview": {"source_key": "ethical_reviews", "parser": _flatten_to_string}, -} + "citation": {"source_key": "citation"}, + "principalInvestigator": {"builder_func": _extract_principal_investigator}, + "contactEmail": {"builder_func": _extract_contact_email}, + "confidentialityLevel": 
{"builder_func": _extract_confidentiality_level}, + "humanSubject": {"source_key": "human_subject_research", "parser": _flatten_to_string}, + "governanceCommittee": {"builder_func": _extract_governance_committee}, + "prohibitedUses": {"source_key": "discouraged_uses", "parser": _flatten_to_string}, + "evi:formats": {"source_key": "distribution_formats", "parser": _flatten_to_string}, +} \ No newline at end of file diff --git a/fairscape_models/conversion/models/AIReady.py b/fairscape_models/conversion/models/AIReady.py index b1f1d5d..867ba10 100644 --- a/fairscape_models/conversion/models/AIReady.py +++ b/fairscape_models/conversion/models/AIReady.py @@ -46,7 +46,7 @@ class CharacterizationScore(BaseModel): has_content=False, details="No statistical characterization available" )) standards: SubCriterionScore = Field(default_factory=lambda: SubCriterionScore( - has_content=True, details="This dataset adheres to the RO-Crate 1.2 and Croissant RAI 1.0 community standards." + has_content=False, details="No schemas provided for datasets." )) potential_sources_of_bias: SubCriterionScore = Field(default_factory=lambda: SubCriterionScore( has_content=False, details="No bias description provided" @@ -99,7 +99,7 @@ class ComputabilityScore(BaseModel): has_content=False, details="No format information available" )) computationally_accessible: SubCriterionScore = Field(default_factory=lambda: SubCriterionScore( - has_content=True, details="Data is hosted in public repositories (e.g., NCBI, MassIVE, Dataverse) that support programmatic access." + has_content=False, details="No publisher provided." )) portable: SubCriterionScore = Field(default_factory=lambda: SubCriterionScore( has_content=True, details="The dataset is packaged as a self-contained RO-Crate, a standard designed for portability across systems." 
diff --git a/fairscape_models/conversion/models/FairscapeDatasheet.py b/fairscape_models/conversion/models/FairscapeDatasheet.py
index 4da6410..93feba0 100644
--- a/fairscape_models/conversion/models/FairscapeDatasheet.py
+++ b/fairscape_models/conversion/models/FairscapeDatasheet.py
@@ -74,6 +74,56 @@ class UseCasesSection(BaseModel):
         default=None, description="Versioning, maintainers, and deprecation policies"
     )
+    # Additional RAI fields
+    data_collection: Optional[str] = Field(
+        default=None, description="Description of data collection methodology"
+    )
+    data_collection_type: Optional[str] = Field(
+        default=None, description="Type of data collection"
+    )
+    data_collection_missing_data: Optional[str] = Field(
+        default=None, description="Description of missing data in collection"
+    )
+    data_collection_raw_data: Optional[str] = Field(
+        default=None, description="Description of raw data from collection"
+    )
+    data_collection_timeframe: Optional[str] = Field(
+        default=None, description="Timeframe of data collection"
+    )
+    data_imputation_protocol: Optional[str] = Field(
+        default=None, description="Protocol used for data imputation"
+    )
+    data_manipulation_protocol: Optional[str] = Field(
+        default=None, description="Protocol used for data manipulation"
+    )
+    data_preprocessing_protocol: Optional[str] = Field(
+        default=None, description="Protocol used for data preprocessing"
+    )
+    data_annotation_protocol: Optional[str] = Field(
+        default=None, description="Protocol used for data annotation"
+    )
+    data_annotation_platform: Optional[str] = Field(
+        default=None, description="Platform used for data annotation"
+    )
+    data_annotation_analysis: Optional[str] = Field(
+        default=None, description="Analysis of data annotations"
+    )
+    personal_sensitive_information: Optional[str] = Field(
+        default=None, description="Description of personal/sensitive information"
+    )
+    data_social_impact: Optional[str] = Field(
+        default=None, description="Social impact of the data"
+    )
+    annotations_per_item: Optional[str] = Field(
+        default=None, description="Number of annotations per item"
+    )
+    annotator_demographics: Optional[str] = Field(
+        default=None, description="Demographics of annotators"
+    )
+    machine_annotation_tools: Optional[str] = Field(
+        default=None, description="Machine tools used for annotation"
+    )
+
     model_config = ConfigDict(extra="allow")
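Since every new field is `Optional` with a `None` default and the model keeps `extra="allow"`, previously serialized datasheets deserialize unchanged. A quick usage sketch (field values invented):

```python
# Usage sketch for the extended section model; import path taken from this diff.
from fairscape_models.conversion.models.FairscapeDatasheet import UseCasesSection

section = UseCasesSection(
    data_collection="Pooled from three hospital cohorts",
    data_collection_timeframe="2019-2023",
    annotations_per_item="3",
)
print(section.model_dump(exclude_none=True))
# {'data_collection': 'Pooled from three hospital cohorts',
#  'data_collection_timeframe': '2019-2023', 'annotations_per_item': '3'}
```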