diff --git a/docs/release-notes/3932.fix.md b/docs/release-notes/3932.fix.md new file mode 100644 index 0000000000..46478ff5b2 --- /dev/null +++ b/docs/release-notes/3932.fix.md @@ -0,0 +1 @@ +Allow {func}`scanpy.read_10x_mtx` to read numeric gene IDs {smaller}`P Angerer` diff --git a/src/scanpy/readwrite.py b/src/scanpy/readwrite.py index cdf9f82f5d..6e0b27269d 100644 --- a/src/scanpy/readwrite.py +++ b/src/scanpy/readwrite.py @@ -628,21 +628,21 @@ def _read_10x_mtx( sep="\t", ) if var_names == "gene_symbols": - var_names_idx = pd.Index(genes[1].values) + var_names_idx = pd.Index(genes[1].array) if make_unique: var_names_idx = anndata.utils.make_index_unique(var_names_idx) - adata.var_names = var_names_idx - adata.var["gene_ids"] = genes[0].values + adata.var_names = var_names_idx.astype("str") + adata.var["gene_ids"] = genes[0].array elif var_names == "gene_ids": - adata.var_names = genes[0].values - adata.var["gene_symbols"] = genes[1].values + adata.var_names = genes[0].array.astype("str") + adata.var["gene_symbols"] = genes[1].array else: msg = "`var_names` needs to be 'gene_symbols' or 'gene_ids'" raise ValueError(msg) if not is_legacy: - adata.var["feature_types"] = genes[2].values + adata.var["feature_types"] = genes[2].array barcodes = pd.read_csv(path / f"{prefix}barcodes.tsv{suffix}", header=None) - adata.obs_names = barcodes[0].values + adata.obs_names = barcodes[0].array return adata diff --git a/tests/_data/10x_data/int-ids/barcodes.tsv b/tests/_data/10x_data/int-ids/barcodes.tsv new file mode 100644 index 0000000000..8ffe6e9030 --- /dev/null +++ b/tests/_data/10x_data/int-ids/barcodes.tsv @@ -0,0 +1,5 @@ +AAACCCAAGATTAGCA-1 +AAACCCACACAATGAA-1 +AAACCCATCGGACCAC-1 +AAACCCATCTCGTTTA-1 +AAACGAAAGCAAGCCA-1 diff --git a/tests/_data/10x_data/int-ids/features.tsv b/tests/_data/10x_data/int-ids/features.tsv new file mode 100644 index 0000000000..f86c266c50 --- /dev/null +++ b/tests/_data/10x_data/int-ids/features.tsv @@ -0,0 +1,6 @@ +574405 Pwwp4b Gene Expression +574404 Pwwp4a Gene Expression +574403 Insyn2b Gene Expression +574402 Gpr17 Gene Expression +100862323 Btbd35f22 Gene Expression +100862329 Btbd35f21 Gene Expression \ No newline at end of file diff --git a/tests/_data/10x_data/int-ids/matrix.mtx b/tests/_data/10x_data/int-ids/matrix.mtx new file mode 100644 index 0000000000..f6d2c4995f --- /dev/null +++ b/tests/_data/10x_data/int-ids/matrix.mtx @@ -0,0 +1,6 @@ +%%MatrixMarket matrix coordinate real general +% +6 5 3 +4 3 6 +5 3 1 +6 3 1 diff --git a/tests/test_read_10x.py b/tests/test_read_10x.py index 1f62924992..811896a347 100644 --- a/tests/test_read_10x.py +++ b/tests/test_read_10x.py @@ -2,14 +2,20 @@ import shutil from pathlib import Path +from typing import TYPE_CHECKING from unittest.mock import patch import h5py import numpy as np +import pandas as pd import pytest import scanpy as sc +if TYPE_CHECKING: + from typing import Literal + + ROOT = Path(__file__).parent ROOT = ROOT / "_data" / "10x_data" VISIUM_ROOT = Path(__file__).parent / "_data" / "visium_data" @@ -28,15 +34,19 @@ def assert_anndata_equal(a1, a2): pytest.param( ROOT / "1.2.0" / "filtered_gene_bc_matrices" / "hg19_chr21", ROOT / "1.2.0" / "filtered_gene_bc_matrices_h5.h5", + id="1.2.0", ), pytest.param( ROOT / "3.0.0" / "filtered_feature_bc_matrix", ROOT / "3.0.0" / "filtered_feature_bc_matrix.h5", + id="3.0.0", ), ], ) -@pytest.mark.parametrize("prefix", [None, "prefix_"]) -def test_read_10x(tmp_path, mtx_path, h5_path, prefix): +@pytest.mark.parametrize("prefix", [None, "prefix_"], ids=["no_prefix", "prefix"]) +def test_read_10x( + tmp_path: Path, mtx_path: Path, h5_path: Path, prefix: str | None +) -> None: if prefix is not None: # Build files named "prefix_XXX.xxx" in a temporary directory. mtx_path_orig = mtx_path @@ -66,6 +76,27 @@ def test_read_10x(tmp_path, mtx_path, h5_path, prefix): assert_anndata_equal(sc.read_h5ad(from_mtx_pth), sc.read_h5ad(from_h5_pth)) +@pytest.mark.parametrize( + ("genes", "col_dtypes"), + [ + pytest.param("symbols", dict(gene_ids="int64"), id="symbols"), + pytest.param("ids", dict(gene_symbols="str"), id="ids"), + ], +) +def test_read_10x_mtx_int( + genes: Literal["symbols", "ids"], col_dtypes: dict[str, str] +) -> None: + str_dt = "str" if pd.options.future.infer_string else "object" + col_dtypes = {k: str_dt if v == "str" else v for k, v in col_dtypes.items()} + + adata = sc.read_10x_mtx( + ROOT / "int-ids", var_names=f"gene_{genes}", compressed=False + ) + + assert adata.var.index.dtype == str_dt + assert dict(adata.var.dtypes) == dict(feature_types=str_dt, **col_dtypes) + + def test_read_10x_h5_v1(): spec_genome_v1 = sc.read_10x_h5( ROOT / "1.2.0" / "filtered_gene_bc_matrices_h5.h5",