-
Notifications
You must be signed in to change notification settings - Fork 0
Fix endogenous genomic mapping #68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: mavedb-dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| """Provide dcd mapping version""" | ||
|
|
||
| dcd_mapping_version = "2025.3.1" | ||
| dcd_mapping_version = "2026.1.0" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |
| from collections.abc import Iterable | ||
| from itertools import cycle | ||
|
|
||
| import requests | ||
| from Bio.Seq import Seq | ||
| from bioutils.accessions import infer_namespace | ||
| from cool_seq_tool.schemas import AnnotationLayer, Strand | ||
|
|
@@ -22,6 +23,7 @@ | |
|
|
||
| from dcd_mapping.exceptions import ( | ||
| MissingSequenceIdError, | ||
| ResourceAcquisitionError, | ||
| UnsupportedReferenceSequenceNameSpaceError, | ||
| UnsupportedReferenceSequencePrefixError, | ||
| ) | ||
|
|
@@ -47,6 +49,8 @@ | |
|
|
||
| _logger = logging.getLogger(__name__) | ||
|
|
||
| CLINGEN_API_URL = "https://reg.genome.network/allele" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's set this as an environment variable. This should be a separate issue, but this did make me think we should standardize the way we setup envvars for ClinGen (and our other services, generally) |
||
|
|
||
|
|
||
| def _hgvs_variant_is_valid(hgvs_string: str) -> bool: | ||
| return not hgvs_string.endswith((".=", ")", "X")) | ||
|
|
@@ -85,6 +89,29 @@ def is_intronic_variant(variant: Variant) -> bool: | |
| return False | ||
|
|
||
|
|
||
| def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None: | ||
| """Fetch the genomic HGVS string from ClinGen. | ||
|
|
||
| :param hgvs: The HGVS string to fetch | ||
| :return: The genomic HGVS string on GRCh38, or None if not found | ||
| """ | ||
| response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) | ||
| try: | ||
| response.raise_for_status() | ||
| except requests.HTTPError as e: | ||
| msg = f"Received HTTPError from {CLINGEN_API_URL} for HGVS {hgvs}" | ||
| _logger.error(msg) | ||
| raise ResourceAcquisitionError(msg) from e | ||
|
Comment on lines
+99
to
+104
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might want to back off certain response errors with some retry logic here so we can do our best to fill everything in completely and avoid missing data from transient errors. |
||
| if response.status_code == 200: | ||
| data = response.json() | ||
| for allele in data.get("genomicAlleles", []): | ||
| if allele.get("referenceGenome") == "GRCh38": | ||
| for coordinates in allele.get("hgvs", []): | ||
| if coordinates.startswith("NC_"): | ||
| return coordinates | ||
| return None | ||
|
|
||
|
|
||
| def _create_pre_mapped_hgvs_strings( | ||
| raw_description: str, | ||
| layer: AnnotationLayer, | ||
|
|
@@ -155,6 +182,7 @@ def _create_post_mapped_hgvs_strings( | |
| layer: AnnotationLayer, | ||
| tx: TxSelectResult | None = None, | ||
| alignment: AlignmentResult | None = None, | ||
| accession_id: str | None = None, | ||
| ) -> list[str]: | ||
| """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings. | ||
|
|
||
|
|
@@ -166,13 +194,14 @@ def _create_post_mapped_hgvs_strings( | |
| :param layer: An enum denoting the targeted annotation layer of these HGVS strings | ||
| :param tx: A TxSelectResult object defining the transcript we are mapping to (or None) | ||
| :param alignment: An AlignmentResult object defining the alignment we are mapping to (or None) | ||
| :param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants. | ||
| :return: A list of HGVS strings relative to the `tx` or `alignment` | ||
| """ | ||
| if layer is AnnotationLayer.PROTEIN and tx is None: | ||
| msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)." | ||
| raise ValueError(msg) | ||
| if layer is AnnotationLayer.GENOMIC and alignment is None: | ||
| msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)." | ||
| if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None: | ||
| msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)." | ||
| raise ValueError(msg) | ||
|
|
||
| raw_variants = _parse_raw_variant_str(raw_description) | ||
|
|
@@ -196,12 +225,22 @@ def _create_post_mapped_hgvs_strings( | |
| variant = _adjust_protein_variant_to_ref(variant, tx) | ||
| hgvs_strings.append(tx.np + ":" + str(variant)) | ||
| elif layer is AnnotationLayer.GENOMIC: | ||
| assert alignment # noqa: S101. mypy help | ||
|
|
||
| variant = _adjust_genomic_variant_to_ref(variant, alignment) | ||
| hgvs_strings.append( | ||
| get_chromosome_identifier(alignment.chrom) + ":" + str(variant) | ||
| ) | ||
| if accession_id: | ||
| pre_mapped_hgvs = accession_id + ":" + str(variant) | ||
| # use ClinGen to align accession-based variants to genomic reference | ||
| genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs) | ||
| if genomic_hgvs: | ||
| hgvs_strings.append(genomic_hgvs) | ||
| else: | ||
| msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}" | ||
| raise ValueError(msg) | ||
| else: | ||
| assert alignment # noqa: S101. mypy help | ||
|
|
||
| variant = _adjust_genomic_variant_to_ref(variant, alignment) | ||
| hgvs_strings.append( | ||
| get_chromosome_identifier(alignment.chrom) + ":" + str(variant) | ||
| ) | ||
| else: | ||
| msg = ( | ||
| f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" | ||
|
|
@@ -539,7 +578,7 @@ def _map_genomic( | |
| post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( | ||
| row.hgvs_nt, | ||
| AnnotationLayer.GENOMIC, | ||
| alignment=align_result, | ||
| accession_id=sequence_id, | ||
| ) | ||
| post_mapped_genomic = _construct_vrs_allele( | ||
| post_mapped_hgvs_strings, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can bump this just prior to being released rather than as part of any individual changes. It's pedantic, but it's not really a new version until we are ready to release it.