diff --git a/docs/release-notes/3616.feature.md b/docs/release-notes/3616.feature.md new file mode 100644 index 0000000000..78502887a1 --- /dev/null +++ b/docs/release-notes/3616.feature.md @@ -0,0 +1 @@ +Add modularity scoring via {func}`scanpy.metrics.modularity` with support for directed/undirected graphs {smaller}`A. Karesh` diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py index 9cdf82bdf2..03de5baf44 100644 --- a/src/scanpy/metrics/__init__.py +++ b/src/scanpy/metrics/__init__.py @@ -3,7 +3,7 @@ from __future__ import annotations from ._gearys_c import gearys_c -from ._metrics import confusion_matrix +from ._metrics import confusion_matrix, modularity from ._morans_i import morans_i -__all__ = ["confusion_matrix", "gearys_c", "morans_i"] +__all__ = ["confusion_matrix", "gearys_c", "modularity", "morans_i"] diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index c24cfe9fd1..5b24187f45 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -2,16 +2,23 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, overload import numpy as np import pandas as pd +from anndata import AnnData from natsort import natsorted from pandas.api.types import CategoricalDtype +from scipy.sparse import coo_matrix + +from .._compat import SpBase +from .._utils import NeighborsView if TYPE_CHECKING: from collections.abc import Sequence + from numpy.typing import ArrayLike + def confusion_matrix( orig: pd.Series | np.ndarray | Sequence, @@ -60,7 +67,7 @@ def confusion_matrix( orig, new = pd.Series(orig), pd.Series(new) assert len(orig) == len(new) - unique_labels = pd.unique(np.concatenate((orig.values, new.values))) + unique_labels = pd.unique(np.concatenate((orig.to_numpy(), new.to_numpy()))) # Compute mtx = _confusion_matrix(orig, new, labels=unique_labels) @@ -89,3 +96,122 @@ def confusion_matrix( df = df.loc[np.array(orig_idx), np.array(new_idx)] return df + + +@overload +def modularity( + connectivities: ArrayLike | SpBase, + /, + labels: pd.Series | ArrayLike, + *, + is_directed: bool, +) -> float: ... + + +@overload +def modularity( + adata: AnnData, + /, + labels: str | pd.Series | ArrayLike = "leiden", + *, + neighbors_key: str | None = None, + is_directed: bool | None = None, +) -> float: ... + + +def modularity( + adata_or_connectivities: AnnData | ArrayLike | SpBase, + /, + labels: str | pd.Series | ArrayLike = "leiden", + *, + neighbors_key: str | None = None, + is_directed: bool | None = None, +) -> float: + """Compute the modularity of a graph given its connectivities and labels. + + Parameters + ---------- + adata_or_connectivities + The AnnData object containing the data or a weighted adjacency matrix representing the graph. + Can be a dense NumPy array or a sparse CSR matrix. + labels + Cluster labels for each node in the graph. + When `AnnData` is provided, this can be the key in `adata.obs` that contains the clustering labels and defaults to `"leiden"`. + neighbors_key + When `AnnData` is provided, the key in `adata.obsp` that contains the connectivities. + is_directed + Whether the graph is directed or undirected. If True, the graph is treated as directed; otherwise, it is treated as undirected. + + Returns + ------- + The modularity of the graph based on the provided clustering. + """ + if isinstance(adata_or_connectivities, AnnData): + return modularity_adata( + adata_or_connectivities, + labels=labels, + neighbors_key=neighbors_key, + is_directed=is_directed, + ) + if isinstance(labels, str): + msg = "`labels` must be provided as array when passing a connectivities array" + raise TypeError(msg) + if is_directed is None: + msg = "`is_directed` must be provided when passing a connectivities array" + raise TypeError(msg) + return modularity_array( + adata_or_connectivities, labels=labels, is_directed=is_directed + ) + + +def modularity_adata( + adata: AnnData, + /, + *, + labels: str | pd.Series | ArrayLike, + neighbors_key: str | None, + is_directed: bool | None, +) -> float: + labels = adata.obs[labels] if isinstance(labels, str) else labels + nv = NeighborsView(adata, neighbors_key) + connectivities = nv["connectivities"] + + if is_directed is None and (is_directed := nv["params"].get("is_directed")) is None: + msg = "`adata` has no `'is_directed'` in `adata.uns[neighbors_key]['params']`, need to specify `is_directed`" + raise ValueError(msg) + + return modularity(connectivities, labels, is_directed=is_directed) + + +def modularity_array( + connectivities: ArrayLike | SpBase, + /, + *, + labels: pd.Series | ArrayLike, + is_directed: bool, +) -> float: + try: + # try to import igraph in case the user wants to calculate modularity + # not in the main module to avoid import errors + import igraph as ig + except ImportError as e: + msg = "igraph is require for computing modularity" + raise ImportError(msg) from e + if isinstance(connectivities, SpBase): + # check if the connectivities is a sparse matrix + coo = coo_matrix(connectivities) + edges = list(zip(coo.row, coo.col, strict=True)) + # converting to the coo format to extract the edges and weights + # storing only non-zero elements and their indices + weights = coo.data.tolist() + graph = ig.Graph(edges=edges, directed=is_directed) + graph.es["weight"] = weights + else: + # if the graph is dense, creates it directly using igraph's adjacency matrix + dense_array = np.asarray(connectivities) + igraph_mode = ig.ADJ_DIRECTED if is_directed else ig.ADJ_UNDIRECTED + graph = ig.Graph.Weighted_Adjacency(dense_array.tolist(), mode=igraph_mode) + # cluster labels to integer codes required by igraph + labels = pd.Categorical(np.asarray(labels)).codes + + return graph.modularity(labels) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 3882cced68..482370f5e3 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -19,15 +19,17 @@ from .._utils import NeighborsView, _doc_params, get_literal_vals from . import _connectivity from ._common import ( + _get_indices_distances_from_dense_matrix, _get_indices_distances_from_sparse_matrix, _get_sparse_matrix_from_indices_distances, ) +from ._connectivity import umap from ._doc import doc_n_pcs, doc_use_rep from ._types import _KnownTransformer, _Method if TYPE_CHECKING: from collections.abc import Callable, Mapping, MutableMapping - from typing import Any, Literal, NotRequired, TypeAlias + from typing import Any, Literal, NotRequired, TypeAlias, Unpack from anndata import AnnData from igraph import Graph @@ -56,6 +58,13 @@ class KwdsForTransformer(TypedDict): random_state: _LegacyRandom +class NeighborsDict(TypedDict): # noqa: D101 + connectivities_key: str + distances_key: str + params: NeighborsParams + rp_forest: NotRequired[RPForestDict] + + class NeighborsParams(TypedDict): # noqa: D101 n_neighbors: int method: _Method @@ -64,6 +73,7 @@ class NeighborsParams(TypedDict): # noqa: D101 metric_kwds: NotRequired[Mapping[str, Any]] use_rep: NotRequired[str] n_pcs: NotRequired[int] + is_directed: NotRequired[bool] @_doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep) @@ -72,6 +82,7 @@ def neighbors( # noqa: PLR0913 n_neighbors: int = 15, n_pcs: int | None = None, *, + distances: np.ndarray | SpBase | None = None, use_rep: str | None = None, knn: bool = True, method: _Method = "umap", @@ -80,6 +91,7 @@ def neighbors( # noqa: PLR0913 metric_kwds: Mapping[str, Any] = MappingProxyType({}), random_state: _LegacyRandom = 0, key_added: str | None = None, + is_directed: bool = False, copy: bool = False, ) -> AnnData | None: """Compute the nearest neighbors distance matrix and a neighborhood graph of observations :cite:p:`McInnes2018`. @@ -136,6 +148,7 @@ def neighbors( # noqa: PLR0913 Use :func:`rapids_singlecell.pp.neighbors` instead. metric A known metric’s name or a callable that returns a distance. + If `distances` is given, this parameter is simply stored in `.uns` (see below). *ignored if ``transformer`` is an instance.* metric_kwds @@ -151,8 +164,10 @@ def neighbors( # noqa: PLR0913 distances and connectivities are stored in `.obsp['distances']` and `.obsp['connectivities']` respectively. If specified, the neighbors data is added to .uns[key_added], - distances are stored in `.obsp[key_added+'_distances']` and - connectivities in `.obsp[key_added+'_connectivities']`. + distances are stored in `.obsp[f'{key_added}_distances']` and + connectivities in `.obsp[f'{key_added}_connectivities']`. + is_directed + If `True`, the connectivity matrix is expected to be a directed graph. copy Return a copy instead of writing to adata. @@ -187,6 +202,20 @@ def neighbors( # noqa: PLR0913 :doc:`/how-to/knn-transformers` """ + if distances is not None: + if callable(metric): + msg = "`metric` must be a string if `distances` is given." + raise TypeError(msg) + # if a precomputed distance matrix is provided, skip the PCA and distance computation + return _neighbors_from_distance( + adata, + distances, + n_neighbors=n_neighbors, + metric=metric, + method=method, + key_added=key_added, + is_directed=is_directed, + ) start = logg.info("computing neighbors") adata = adata.copy() if copy else adata if adata.is_view: # we shouldn't need this here... @@ -204,51 +233,105 @@ def neighbors( # noqa: PLR0913 random_state=random_state, ) - if key_added is None: - key_added = "neighbors" - conns_key = "connectivities" - dists_key = "distances" - else: - conns_key = f"{key_added}_connectivities" - dists_key = f"{key_added}_distances" - - adata.uns[key_added] = {} - - neighbors_dict = adata.uns[key_added] - - neighbors_dict["connectivities_key"] = conns_key - neighbors_dict["distances_key"] = dists_key - - neighbors_dict["params"] = NeighborsParams( + key_added, neighbors_dict = _get_metadata( + key_added, n_neighbors=neighbors.n_neighbors, method=method, random_state=random_state, metric=metric, + is_directed=is_directed, + **({} if not metric_kwds else dict(metric_kwds=metric_kwds)), + **({} if use_rep is None else dict(use_rep=use_rep)), + **({} if n_pcs is None else dict(n_pcs=n_pcs)), ) - if metric_kwds: - neighbors_dict["params"]["metric_kwds"] = metric_kwds - if use_rep is not None: - neighbors_dict["params"]["use_rep"] = use_rep - if n_pcs is not None: - neighbors_dict["params"]["n_pcs"] = n_pcs - - adata.obsp[dists_key] = neighbors.distances - adata.obsp[conns_key] = neighbors.connectivities if neighbors.rp_forest is not None: neighbors_dict["rp_forest"] = neighbors.rp_forest + + adata.uns[key_added] = neighbors_dict + adata.obsp[neighbors_dict["distances_key"]] = neighbors.distances + adata.obsp[neighbors_dict["connectivities_key"]] = neighbors.connectivities + logg.info( " finished", time=start, deep=( f"added to `.uns[{key_added!r}]`\n" - f" `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n" - f" `.obsp[{conns_key!r}]`, weighted adjacency matrix" + f" `.obsp[{neighbors_dict['distances_key']!r}]`, distances for each pair of neighbors\n" + f" `.obsp[{neighbors_dict['connectivities_key']!r}]`, weighted adjacency matrix" ), ) return adata if copy else None +def _neighbors_from_distance( + adata: AnnData, + distances: np.ndarray | SpBase, + *, + n_neighbors: int, + metric: _Metric, + method: _Method, + key_added: str | None, + is_directed: bool, +) -> AnnData: + if isinstance(distances, SpBase): + distances = sparse.csr_matrix(distances) # noqa: TID251 + distances.setdiag(0) + distances.eliminate_zeros() + else: + distances = np.asarray(distances) + np.fill_diagonal(distances, 0) + + if method == "umap": + if isinstance(distances, CSRBase): + knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix( + distances, n_neighbors + ) + else: + knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( + distances, n_neighbors + ) + connectivities = umap( + knn_indices, knn_distances, n_obs=adata.n_obs, n_neighbors=n_neighbors + ) + elif method == "gauss": + distances = sparse.csr_matrix(distances) # noqa: TID251 + connectivities = _connectivity.gauss(distances, n_neighbors, knn=True) + else: + msg = f"Method {method} not implemented." + raise NotImplementedError(msg) + + key_added, neighbors_dict = _get_metadata( + key_added, + n_neighbors=n_neighbors, + method=method, + random_state=0, + metric=metric, + is_directed=is_directed, + ) + adata.uns[key_added] = neighbors_dict + adata.obsp[neighbors_dict["distances_key"]] = distances + adata.obsp[neighbors_dict["connectivities_key"]] = connectivities + return adata + + +def _get_metadata( + key_added: str | None, + **params: Unpack[NeighborsParams], +) -> tuple[str, NeighborsDict]: + if key_added is None: + return "neighbors", NeighborsDict( + connectivities_key="connectivities", + distances_key="distances", + params=params, + ) + return key_added, NeighborsDict( + connectivities_key=f"{key_added}_connectivities", + distances_key=f"{key_added}_distances", + params=params, + ) + + class FlatTree(NamedTuple): # noqa: D101 hyperplanes: None offsets: None @@ -356,7 +439,7 @@ class Neighbors: n_dcs Number of diffusion components to use. neighbors_key - Where to look in `.uns` and `.obsp` for neighbors data + Where to look in `.uns` and `.obsp` for neighbors data. """ diff --git a/tests/test_metrics.py b/tests/test_metrics.py index e4b5bed022..4ff32b7ccf 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -12,7 +12,9 @@ from scipy import sparse import scanpy as sc +from scanpy.metrics import modularity from testing.scanpy._helpers.data import pbmc68k_reduced +from testing.scanpy._pytest.marks import needs from testing.scanpy._pytest.params import ARRAY_TYPES if TYPE_CHECKING: @@ -200,3 +202,93 @@ def test_confusion_matrix_api(): pd.testing.assert_frame_equal( expected, sc.metrics.confusion_matrix(data["a"], "b", data) ) + + +@pytest.mark.parametrize("is_directed", [False, True], ids=["undirected", "directed"]) +@pytest.mark.parametrize("use_sparse", [False, True], ids=["dense", "sparse"]) +@needs.igraph +def test_modularity_sample_structure(*, use_sparse: bool, is_directed: bool) -> None: + """Sample graph with clear community structure (dense & sparse, directed & undirected).""" + # 4 node adjacency matrix with two separate 2-node communities + mat = np.array([ + [1, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 1, 1], + [0, 0, 1, 1], + ]) + labels = ["A", "A", "B", "B"] + adj = sparse.csr_matrix(mat) if use_sparse else mat # noqa: TID251 + + score = modularity(adj, labels, is_directed=is_directed) + + # Modularity should be between 0 and 1 + assert 0 <= score <= 1 + + +@needs.igraph +def test_modularity_single_community() -> None: + """Edge case when all nodes belong to the same community/cluster.""" + # fully connected graph sample + adj = np.ones((4, 4)) - np.eye(4) + labels = ["A", "A", "A", "A"] + + score = modularity(adj, labels, is_directed=False) + + # modularity should be 0 + assert score == pytest.approx(0.0, rel=1e-6) + + +@needs.igraph +def test_modularity_invalid_labels() -> None: + """Invalad input, labels length does not match adjacency matrix size.""" + import igraph as ig + + adj = np.eye(4) + labels = ["A", "A", "B"] + + with pytest.raises(ig.InternalError, match=r"Membership vector size differs"): + modularity(adj, labels, is_directed=False) + + +@needs.igraph +@needs.leidenalg +def test_modularity_adata() -> None: + """Test domain of modularity score.""" + adata = sc.datasets.pbmc3k() + sc.pp.pca(adata) + sc.pp.neighbors(adata) + sc.tl.leiden(adata) + + score = modularity(adata, labels="louvain") + + assert 0 <= score <= 1 + + +@needs.igraph +def test_modularity_order() -> None: + """Modularity should be the same no matter the order of the labels.""" + adj = np.array([ + [1, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 1, 1], + [0, 0, 1, 1], + ]) + labels1 = ["A", "A", "B", "B"] + labels2 = ["B", "B", "A", "A"] + + score_1 = modularity(adj, labels1, is_directed=False) + score_2 = modularity(adj, labels2, is_directed=False) + + assert score_1 == score_2 + + +@needs.igraph +def test_modularity_disconnected_graph() -> None: + """Modularity on disconnected graph like edge-case behavior in some algorithms.""" + adj = np.zeros((4, 4)) + labels = ["A", "B", "C", "D"] + + score = modularity(adj, labels, is_directed=False) + + # Modularity should be undefined for disconnected graphs + assert np.isnan(score) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 0389615fde..a9bcd1b6c3 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -12,6 +12,7 @@ import scanpy as sc from scanpy import Neighbors from scanpy._compat import CSBase, pkg_version +from testing.scanpy._helpers.data import pbmc68k_reduced if TYPE_CHECKING: from typing import Literal @@ -276,3 +277,23 @@ def test_restore_n_neighbors(neigh, conv): ad.uns["neighbors"] = dict(connectivities=conv(neigh.connectivities)) neigh_restored = Neighbors(ad) assert neigh_restored.n_neighbors == 1 + + +def test_neighbors_distance_equivalence() -> None: + adata = pbmc68k_reduced() + adata_d = adata.copy() + + sc.pp.neighbors(adata) + # reusing the same distances + sc.pp.neighbors(adata_d, distances=adata.obsp["distances"]) + + np.testing.assert_allclose( + adata.obsp["connectivities"].toarray(), + adata_d.obsp["connectivities"].toarray(), + rtol=1e-5, + ) + np.testing.assert_allclose( + adata.obsp["distances"].toarray(), + adata_d.obsp["distances"].toarray(), + rtol=1e-5, + )