From 26cba7227ab03b11e01ee32002474aa75d8aa2f9 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 14 Jan 2026 07:11:32 -0800 Subject: [PATCH 1/2] Use ClinGen to map genomic accession-based variants to genome This change reduces likelihood of errors when mapping accession-based transcript variants to genomic positions. Keep alignment process for now so that we can report the transcript and gene for these variants. Eventually, just the transcript id could be used to access this information. --- src/dcd_mapping/vrs_map.py | 57 ++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index db81d57..3f3e400 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from itertools import cycle +import requests from Bio.Seq import Seq from bioutils.accessions import infer_namespace from cool_seq_tool.schemas import AnnotationLayer, Strand @@ -22,6 +23,7 @@ from dcd_mapping.exceptions import ( MissingSequenceIdError, + ResourceAcquisitionError, UnsupportedReferenceSequenceNameSpaceError, UnsupportedReferenceSequencePrefixError, ) @@ -47,6 +49,8 @@ _logger = logging.getLogger(__name__) +CLINGEN_API_URL = "https://reg.genome.network/allele" + def _hgvs_variant_is_valid(hgvs_string: str) -> bool: return not hgvs_string.endswith((".=", ")", "X")) @@ -85,6 +89,29 @@ def is_intronic_variant(variant: Variant) -> bool: return False +def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None: + """Fetch the genomic HGVS string from ClinGen. + + :param hgvs: The HGVS string to fetch + :return: The genomic HGVS string on GRCh38, or None if not found + """ + response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) + try: + response.raise_for_status() + except requests.HTTPError as e: + msg = f"Received HTTPError from {CLINGEN_API_URL} for HGVS {hgvs}" + _logger.error(msg) + raise ResourceAcquisitionError(msg) from e + if response.status_code == 200: + data = response.json() + for allele in data.get("genomicAlleles", []): + if allele.get("referenceGenome") == "GRCh38": + for coordinates in allele.get("hgvs", []): + if coordinates.startswith("NC_"): + return coordinates + return None + + def _create_pre_mapped_hgvs_strings( raw_description: str, layer: AnnotationLayer, @@ -155,6 +182,7 @@ def _create_post_mapped_hgvs_strings( layer: AnnotationLayer, tx: TxSelectResult | None = None, alignment: AlignmentResult | None = None, + accession_id: str | None = None, ) -> list[str]: """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings. @@ -166,13 +194,14 @@ def _create_post_mapped_hgvs_strings( :param layer: An enum denoting the targeted annotation layer of these HGVS strings :param tx: A TxSelectResult object defining the transcript we are mapping to (or None) :param alignment: An AlignmentResult object defining the alignment we are mapping to (or None) + :param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants. :return: A list of HGVS strings relative to the `tx` or `alignment` """ if layer is AnnotationLayer.PROTEIN and tx is None: msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)." raise ValueError(msg) - if layer is AnnotationLayer.GENOMIC and alignment is None: - msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)." + if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None: + msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)." raise ValueError(msg) raw_variants = _parse_raw_variant_str(raw_description) @@ -196,12 +225,22 @@ def _create_post_mapped_hgvs_strings( variant = _adjust_protein_variant_to_ref(variant, tx) hgvs_strings.append(tx.np + ":" + str(variant)) elif layer is AnnotationLayer.GENOMIC: - assert alignment # noqa: S101. mypy help - - variant = _adjust_genomic_variant_to_ref(variant, alignment) - hgvs_strings.append( - get_chromosome_identifier(alignment.chrom) + ":" + str(variant) - ) + if accession_id: + pre_mapped_hgvs = accession_id + ":" + str(variant) + # use ClinGen to align accession-based variants to genomic reference + genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs) + if genomic_hgvs: + hgvs_strings.append(genomic_hgvs) + else: + msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}" + raise ValueError(msg) + else: + assert alignment # noqa: S101. mypy help + + variant = _adjust_genomic_variant_to_ref(variant, alignment) + hgvs_strings.append( + get_chromosome_identifier(alignment.chrom) + ":" + str(variant) + ) else: msg = ( f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" @@ -539,7 +578,7 @@ def _map_genomic( post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( row.hgvs_nt, AnnotationLayer.GENOMIC, - alignment=align_result, + accession_id=sequence_id, ) post_mapped_genomic = _construct_vrs_allele( post_mapped_hgvs_strings, From 318316c9199c4a8a996456d5e34347f0a41a0166 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 14 Jan 2026 07:19:51 -0800 Subject: [PATCH 2/2] Bump version --- src/dcd_mapping/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py index 3ea9280..e6693f5 100644 --- a/src/dcd_mapping/version.py +++ b/src/dcd_mapping/version.py @@ -1,3 +1,3 @@ """Provide dcd mapping version""" -dcd_mapping_version = "2025.3.1" +dcd_mapping_version = "2026.1.0"