Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/dcd_mapping/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Provide dcd mapping version"""

dcd_mapping_version = "2025.3.1"
dcd_mapping_version = "2026.1.0"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can bump this just prior to being released rather than as part of any individual changes. It's pedantic, but it's not really a new version until we are ready to release it.

57 changes: 48 additions & 9 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import Iterable
from itertools import cycle

import requests
from Bio.Seq import Seq
from bioutils.accessions import infer_namespace
from cool_seq_tool.schemas import AnnotationLayer, Strand
Expand All @@ -22,6 +23,7 @@

from dcd_mapping.exceptions import (
MissingSequenceIdError,
ResourceAcquisitionError,
UnsupportedReferenceSequenceNameSpaceError,
UnsupportedReferenceSequencePrefixError,
)
Expand All @@ -47,6 +49,8 @@

_logger = logging.getLogger(__name__)

CLINGEN_API_URL = "https://reg.genome.network/allele"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's set this as an environment variable.

This should be a separate issue, but this did make me think we should standardize the way we setup envvars for ClinGen (and our other services, generally)



def _hgvs_variant_is_valid(hgvs_string: str) -> bool:
return not hgvs_string.endswith((".=", ")", "X"))
Expand Down Expand Up @@ -85,6 +89,29 @@ def is_intronic_variant(variant: Variant) -> bool:
return False


def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None:
"""Fetch the genomic HGVS string from ClinGen.

:param hgvs: The HGVS string to fetch
:return: The genomic HGVS string on GRCh38, or None if not found
"""
response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30)
try:
response.raise_for_status()
except requests.HTTPError as e:
msg = f"Received HTTPError from {CLINGEN_API_URL} for HGVS {hgvs}"
_logger.error(msg)
raise ResourceAcquisitionError(msg) from e
Comment on lines +99 to +104
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to back off certain response errors with some retry logic here so we can do our best to fill everything in completely and avoid missing data from transient errors.

if response.status_code == 200:
data = response.json()
for allele in data.get("genomicAlleles", []):
if allele.get("referenceGenome") == "GRCh38":
for coordinates in allele.get("hgvs", []):
if coordinates.startswith("NC_"):
return coordinates
return None


def _create_pre_mapped_hgvs_strings(
raw_description: str,
layer: AnnotationLayer,
Expand Down Expand Up @@ -155,6 +182,7 @@ def _create_post_mapped_hgvs_strings(
layer: AnnotationLayer,
tx: TxSelectResult | None = None,
alignment: AlignmentResult | None = None,
accession_id: str | None = None,
) -> list[str]:
"""Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings.

Expand All @@ -166,13 +194,14 @@ def _create_post_mapped_hgvs_strings(
:param layer: An enum denoting the targeted annotation layer of these HGVS strings
:param tx: A TxSelectResult object defining the transcript we are mapping to (or None)
:param alignment: An AlignmentResult object defining the alignment we are mapping to (or None)
:param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants.
:return: A list of HGVS strings relative to the `tx` or `alignment`
"""
if layer is AnnotationLayer.PROTEIN and tx is None:
msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)."
raise ValueError(msg)
if layer is AnnotationLayer.GENOMIC and alignment is None:
msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)."
if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None:
msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)."
raise ValueError(msg)

raw_variants = _parse_raw_variant_str(raw_description)
Expand All @@ -196,12 +225,22 @@ def _create_post_mapped_hgvs_strings(
variant = _adjust_protein_variant_to_ref(variant, tx)
hgvs_strings.append(tx.np + ":" + str(variant))
elif layer is AnnotationLayer.GENOMIC:
assert alignment # noqa: S101. mypy help

variant = _adjust_genomic_variant_to_ref(variant, alignment)
hgvs_strings.append(
get_chromosome_identifier(alignment.chrom) + ":" + str(variant)
)
if accession_id:
pre_mapped_hgvs = accession_id + ":" + str(variant)
# use ClinGen to align accession-based variants to genomic reference
genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs)
if genomic_hgvs:
hgvs_strings.append(genomic_hgvs)
else:
msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}"
raise ValueError(msg)
else:
assert alignment # noqa: S101. mypy help

variant = _adjust_genomic_variant_to_ref(variant, alignment)
hgvs_strings.append(
get_chromosome_identifier(alignment.chrom) + ":" + str(variant)
)
else:
msg = (
f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}"
Expand Down Expand Up @@ -539,7 +578,7 @@ def _map_genomic(
post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings(
row.hgvs_nt,
AnnotationLayer.GENOMIC,
alignment=align_result,
accession_id=sequence_id,
)
post_mapped_genomic = _construct_vrs_allele(
post_mapped_hgvs_strings,
Expand Down