diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py index 3ea9280..e6693f5 100644 --- a/src/dcd_mapping/version.py +++ b/src/dcd_mapping/version.py @@ -1,3 +1,3 @@ """Provide dcd mapping version""" -dcd_mapping_version = "2025.3.1" +dcd_mapping_version = "2026.1.0" diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index db81d57..3f3e400 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from itertools import cycle +import requests from Bio.Seq import Seq from bioutils.accessions import infer_namespace from cool_seq_tool.schemas import AnnotationLayer, Strand @@ -22,6 +23,7 @@ from dcd_mapping.exceptions import ( MissingSequenceIdError, + ResourceAcquisitionError, UnsupportedReferenceSequenceNameSpaceError, UnsupportedReferenceSequencePrefixError, ) @@ -47,6 +49,8 @@ _logger = logging.getLogger(__name__) +CLINGEN_API_URL = "https://reg.genome.network/allele" + def _hgvs_variant_is_valid(hgvs_string: str) -> bool: return not hgvs_string.endswith((".=", ")", "X")) @@ -85,6 +89,29 @@ def is_intronic_variant(variant: Variant) -> bool: return False +def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None: + """Fetch the genomic HGVS string from ClinGen. + + :param hgvs: The HGVS string to fetch + :return: The genomic HGVS string on GRCh38, or None if not found + """ + response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) + try: + response.raise_for_status() + except requests.HTTPError as e: + msg = f"Received HTTPError from {CLINGEN_API_URL} for HGVS {hgvs}" + _logger.error(msg) + raise ResourceAcquisitionError(msg) from e + if response.status_code == 200: + data = response.json() + for allele in data.get("genomicAlleles", []): + if allele.get("referenceGenome") == "GRCh38": + for coordinates in allele.get("hgvs", []): + if coordinates.startswith("NC_"): + return coordinates + return None + + def _create_pre_mapped_hgvs_strings( raw_description: str, layer: AnnotationLayer, @@ -155,6 +182,7 @@ def _create_post_mapped_hgvs_strings( layer: AnnotationLayer, tx: TxSelectResult | None = None, alignment: AlignmentResult | None = None, + accession_id: str | None = None, ) -> list[str]: """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings. @@ -166,13 +194,14 @@ def _create_post_mapped_hgvs_strings( :param layer: An enum denoting the targeted annotation layer of these HGVS strings :param tx: A TxSelectResult object defining the transcript we are mapping to (or None) :param alignment: An AlignmentResult object defining the alignment we are mapping to (or None) + :param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants. :return: A list of HGVS strings relative to the `tx` or `alignment` """ if layer is AnnotationLayer.PROTEIN and tx is None: msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)." raise ValueError(msg) - if layer is AnnotationLayer.GENOMIC and alignment is None: - msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)." + if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None: + msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)." raise ValueError(msg) raw_variants = _parse_raw_variant_str(raw_description) @@ -196,12 +225,22 @@ def _create_post_mapped_hgvs_strings( variant = _adjust_protein_variant_to_ref(variant, tx) hgvs_strings.append(tx.np + ":" + str(variant)) elif layer is AnnotationLayer.GENOMIC: - assert alignment # noqa: S101. mypy help - - variant = _adjust_genomic_variant_to_ref(variant, alignment) - hgvs_strings.append( - get_chromosome_identifier(alignment.chrom) + ":" + str(variant) - ) + if accession_id: + pre_mapped_hgvs = accession_id + ":" + str(variant) + # use ClinGen to align accession-based variants to genomic reference + genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs) + if genomic_hgvs: + hgvs_strings.append(genomic_hgvs) + else: + msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}" + raise ValueError(msg) + else: + assert alignment # noqa: S101. mypy help + + variant = _adjust_genomic_variant_to_ref(variant, alignment) + hgvs_strings.append( + get_chromosome_identifier(alignment.chrom) + ":" + str(variant) + ) else: msg = ( f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" @@ -539,7 +578,7 @@ def _map_genomic( post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( row.hgvs_nt, AnnotationLayer.GENOMIC, - alignment=align_result, + accession_id=sequence_id, ) post_mapped_genomic = _construct_vrs_allele( post_mapped_hgvs_strings,