Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/3932.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Allow {func}`scanpy.read_10x_mtx` to read numeric gene IDs {smaller}`P Angerer`
14 changes: 7 additions & 7 deletions src/scanpy/readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,21 +628,21 @@ def _read_10x_mtx(
sep="\t",
)
if var_names == "gene_symbols":
var_names_idx = pd.Index(genes[1].values)
var_names_idx = pd.Index(genes[1].array)
if make_unique:
var_names_idx = anndata.utils.make_index_unique(var_names_idx)
adata.var_names = var_names_idx
adata.var["gene_ids"] = genes[0].values
adata.var_names = var_names_idx.astype("str")
adata.var["gene_ids"] = genes[0].array
elif var_names == "gene_ids":
adata.var_names = genes[0].values
adata.var["gene_symbols"] = genes[1].values
adata.var_names = genes[0].array.astype("str")
adata.var["gene_symbols"] = genes[1].array
else:
msg = "`var_names` needs to be 'gene_symbols' or 'gene_ids'"
raise ValueError(msg)
if not is_legacy:
adata.var["feature_types"] = genes[2].values
adata.var["feature_types"] = genes[2].array
barcodes = pd.read_csv(path / f"{prefix}barcodes.tsv{suffix}", header=None)
adata.obs_names = barcodes[0].values
adata.obs_names = barcodes[0].array
return adata


Expand Down
5 changes: 5 additions & 0 deletions tests/_data/10x_data/int-ids/barcodes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
AAACCCAAGATTAGCA-1
AAACCCACACAATGAA-1
AAACCCATCGGACCAC-1
AAACCCATCTCGTTTA-1
AAACGAAAGCAAGCCA-1
6 changes: 6 additions & 0 deletions tests/_data/10x_data/int-ids/features.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
574405 Pwwp4b Gene Expression
574404 Pwwp4a Gene Expression
574403 Insyn2b Gene Expression
574402 Gpr17 Gene Expression
100862323 Btbd35f22 Gene Expression
100862329 Btbd35f21 Gene Expression
6 changes: 6 additions & 0 deletions tests/_data/10x_data/int-ids/matrix.mtx
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
%%MatrixMarket matrix coordinate real general
%
6 5 3
4 3 6
5 3 1
6 3 1
35 changes: 33 additions & 2 deletions tests/test_read_10x.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@

import shutil
from pathlib import Path
from typing import TYPE_CHECKING
from unittest.mock import patch

import h5py
import numpy as np
import pandas as pd
import pytest

import scanpy as sc

if TYPE_CHECKING:
from typing import Literal


ROOT = Path(__file__).parent
ROOT = ROOT / "_data" / "10x_data"
VISIUM_ROOT = Path(__file__).parent / "_data" / "visium_data"
Expand All @@ -28,15 +34,19 @@ def assert_anndata_equal(a1, a2):
pytest.param(
ROOT / "1.2.0" / "filtered_gene_bc_matrices" / "hg19_chr21",
ROOT / "1.2.0" / "filtered_gene_bc_matrices_h5.h5",
id="1.2.0",
),
pytest.param(
ROOT / "3.0.0" / "filtered_feature_bc_matrix",
ROOT / "3.0.0" / "filtered_feature_bc_matrix.h5",
id="3.0.0",
),
],
)
@pytest.mark.parametrize("prefix", [None, "prefix_"])
def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
@pytest.mark.parametrize("prefix", [None, "prefix_"], ids=["no_prefix", "prefix"])
def test_read_10x(
tmp_path: Path, mtx_path: Path, h5_path: Path, prefix: str | None
) -> None:
if prefix is not None:
# Build files named "prefix_XXX.xxx" in a temporary directory.
mtx_path_orig = mtx_path
Expand Down Expand Up @@ -66,6 +76,27 @@ def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
assert_anndata_equal(sc.read_h5ad(from_mtx_pth), sc.read_h5ad(from_h5_pth))


@pytest.mark.parametrize(
    ("genes", "col_dtypes"),
    [
        pytest.param("symbols", {"gene_ids": "int64"}, id="symbols"),
        pytest.param("ids", {"gene_symbols": "str"}, id="ids"),
    ],
)
def test_read_10x_mtx_int(
    genes: Literal["symbols", "ids"], col_dtypes: dict[str, str]
) -> None:
    """Read the ``int-ids`` fixture and check the dtypes of ``adata.var``.

    The index of ``adata.var`` must have the string dtype, and its columns
    must match ``col_dtypes`` plus a string-typed ``feature_types`` column.
    A ``"str"`` entry in ``col_dtypes`` is a placeholder for whichever
    string dtype name applies under the current pandas options.
    """
    # The dtype name used for string data depends on pandas'
    # ``future.infer_string`` option.
    string_dtype = "object"
    if pd.options.future.infer_string:
        string_dtype = "str"

    # Build the full expected column -> dtype mapping, resolving the
    # "str" placeholder to the concrete string dtype name.
    expected_dtypes = {"feature_types": string_dtype}
    for column, dtype_name in col_dtypes.items():
        expected_dtypes[column] = (
            string_dtype if dtype_name == "str" else dtype_name
        )

    adata = sc.read_10x_mtx(
        ROOT / "int-ids", var_names=f"gene_{genes}", compressed=False
    )

    assert adata.var.index.dtype == string_dtype
    assert dict(adata.var.dtypes) == expected_dtypes


def test_read_10x_h5_v1():
spec_genome_v1 = sc.read_10x_h5(
ROOT / "1.2.0" / "filtered_gene_bc_matrices_h5.h5",
Expand Down
Loading