From 8b87d97d32ebfbbae733256638ac985f03d7cecb Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sat, 3 Jan 2026 16:44:29 -0800 Subject: [PATCH 1/4] classes extend BiocObject --- CHANGELOG.md | 3 +- setup.cfg | 1 + src/biostrings/dnastring.py | 89 +++++++++------------------------- src/biostrings/dnastringset.py | 86 +++++++------------------------- 4 files changed, 42 insertions(+), 137 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad81a98..4499881 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ # Changelog -## Version 0.0.1 +## Version 0.0.1 - 0.0.2 - Initial implementation, added the DNAString and DNAStringSet classes. +- Classes extend BiocObject from biocutils. diff --git a/setup.cfg b/setup.cfg index bdb4e11..89a8e40 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = importlib-metadata; python_version<"3.8" iranges numpy + biocutils>=0.3.3 [options.packages.find] diff --git a/src/biostrings/dnastring.py b/src/biostrings/dnastring.py index 20ffe3e..2e3e5a3 100644 --- a/src/biostrings/dnastring.py +++ b/src/biostrings/dnastring.py @@ -1,8 +1,12 @@ +from __future__ import annotations + import re from copy import deepcopy -from typing import Optional, Union +from typing import Any, Dict, Optional, Union from warnings import warn +import biocutils as ut + from .utils import _sanitize_metadata # From R's DNA_ALPHABET @@ -22,14 +26,19 @@ __license__ = "MIT" -class DNAString: +class DNAString(ut.BiocObject): """A string container for a DNA sequence, similar to Bioconductor's DNAString. This class stores the sequence internally as bytes, enforcing the DNA alphabet. """ - def __init__(self, sequence: Union[str, bytes], metadata: Optional[dict] = None, validate: bool = True): + def __init__( + self, + sequence: Union[str, bytes], + metadata: Optional[Union[Dict[str, Any], ut.NamedList]] = None, + _validate: bool = True, + ): """Create a DNAString. 
Args: @@ -39,9 +48,11 @@ def __init__(self, sequence: Union[str, bytes], metadata: Optional[dict] = None, metadata: Additional metadata. If None, defaults to an empty dictionary. - validate: + _validate: Whether to validate the arguments, internal use only. """ + super().__init__(metadata=metadata, _validate=_validate) + if isinstance(sequence, str): self._data = sequence.upper().encode("ascii") elif isinstance(sequence, bytes): @@ -51,7 +62,7 @@ def __init__(self, sequence: Union[str, bytes], metadata: Optional[dict] = None, self._metadata = _sanitize_metadata(metadata) - if validate: + if _validate: if not _DNA_VALIDATOR.match(self._data.decode("ascii")): raise ValueError("Input string contains non-DNA characters.") @@ -59,13 +70,7 @@ def __init__(self, sequence: Union[str, bytes], metadata: Optional[dict] = None, #### Copying #### ################# - def _define_output(self, in_place): - if in_place: - return self - else: - return self.__copy__() - - def __copy__(self) -> "DNAString": + def __copy__(self) -> DNAString: """Shallow copy of the object. Returns: @@ -74,10 +79,10 @@ def __copy__(self) -> "DNAString": return type(self)( sequence=str(self), metadata=self._metadata, - validate=False, + _validate=False, ) - def __deepcopy__(self, memo) -> "DNAString": + def __deepcopy__(self, memo) -> DNAString: """Deep copy of the object. Args: @@ -89,63 +94,13 @@ def __deepcopy__(self, memo) -> "DNAString": return type(self)( sequence=deepcopy(str(self), memo), metadata=deepcopy(self._metadata, memo), - validate=False, + _validate=False, ) ######################## #### Getter/setters #### ######################## - def get_metadata(self) -> dict: - """Get additional metadata. - - Returns: - Dictionary containing additional metadata. - """ - return self._metadata - - def set_metadata(self, metadata: Optional[dict], in_place: bool = False) -> "DNAString": - """Set or replace metadata. - - Args: - metadata: - Additional metadata. 
- - in_place: - Whether to modify the object in place. - - Returns: - If ``in_place = False``, a new ``DNAString`` is returned with the - modified metadata. Otherwise, the current object is directly - modified and a reference to it is returned. - """ - output = self._define_output(in_place) - output._metadata = _sanitize_metadata(metadata) - return output - - @property - def metadata(self) -> dict: - """Get additional metadata. - - Returns: - Dictionary containing additional metadata. - """ - return self.get_metadata() - - @metadata.setter - def metadata(self, metadata: Optional[dict]): - """Set or replace metadata (in-place operation). - - Args: - metadata: - Additional metadata. - """ - warn( - "Setting property 'metadata'is an in-place operation, use 'set_metadata' instead", - UserWarning, - ) - self.set_metadata(metadata, in_place=True) - def get_sequence(self) -> str: """Get the sequence. @@ -191,7 +146,7 @@ def __eq__(self, other) -> bool: #### Getitem/setitem #### ######################### - def __getitem__(self, key: Union[int, slice]) -> "DNAString": + def __getitem__(self, key: Union[int, slice]) -> DNAString: """Extract a subsequence (slicing). Args: @@ -214,7 +169,7 @@ def __getitem__(self, key: Union[int, slice]) -> "DNAString": #### methods #### ################# - def reverse_complement(self) -> "DNAString": + def reverse_complement(self) -> DNAString: """Compute the reverse complement of the sequence. 
Returns: diff --git a/src/biostrings/dnastringset.py b/src/biostrings/dnastringset.py index 9a9fa9d..2c666e6 100644 --- a/src/biostrings/dnastringset.py +++ b/src/biostrings/dnastringset.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from copy import deepcopy -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from warnings import warn import biocutils as ut @@ -23,7 +25,7 @@ __license__ = "MIT" -class DNAStringSet: +class DNAStringSet(ut.BiocObject): """A collection of DNA sequences, similar to Bioconductor's DNAStringSet. This class follows the "pool and ranges" model for high memory @@ -36,10 +38,10 @@ def __init__( self, sequences: Optional[List[str]] = None, names: Optional[Union[List[str], ut.Names]] = None, - _pool: bytes = None, - _ranges: IRanges = None, - metadata: Optional[dict] = None, - validate: bool = True, + _pool: Optional[bytes] = None, + _ranges: Optional[IRanges] = None, + metadata: Optional[Union[Dict[str, Any], ut.NamedList]] = None, + _validate: bool = True, ): """Create a DNAStringSet. @@ -63,12 +65,14 @@ def __init__( validate: Whether to validate the arguments, internal use only. """ + super().__init__(metadata=metadata, _validate=_validate) + if _pool is not None and _ranges is not None: self._pool = _pool self._ranges = _ranges elif sequences is not None and len(sequences) > 0: - if validate: + if _validate: for i, seq_str in enumerate(sequences): if not _DNA_VALIDATOR.match(seq_str): raise ValueError(f"Sequence at index {i} contains non-DNA characters.") @@ -104,13 +108,7 @@ def __init__( #### Copying #### ################# - def _define_output(self, in_place): - if in_place: - return self - else: - return self.__copy__() - - def __copy__(self) -> "DNAStringSet": + def __copy__(self) -> DNAStringSet: """Shallow copy of the object. 
Returns: @@ -120,10 +118,10 @@ def __copy__(self) -> "DNAStringSet": _pool=self._pool, _ranges=self._ranges, metadata=self._metadata, - validate=False, + _validate=False, ) - def __deepcopy__(self, memo) -> "DNAStringSet": + def __deepcopy__(self, memo) -> DNAStringSet: """Deep copy of the object. Args: @@ -136,63 +134,13 @@ def __deepcopy__(self, memo) -> "DNAStringSet": _pool=deepcopy(self._pool, memo), _ranges=deepcopy(self._ranges, memo), metadata=deepcopy(self._metadata, memo), - validate=False, + _validate=False, ) ######################## #### Getter/setters #### ######################## - def get_metadata(self) -> dict: - """Get additional metadata. - - Returns: - Dictionary containing additional metadata. - """ - return self._metadata - - def set_metadata(self, metadata: Optional[dict], in_place: bool = False) -> "DNAStringSet": - """Set or replace metadata. - - Args: - metadata: - Additional metadata. - - in_place: - Whether to modify the object in place. - - Returns: - If ``in_place = False``, a new ``DNAStringSet`` is returned with the - modified metadata. Otherwise, the current object is directly - modified and a reference to it is returned. - """ - output = self._define_output(in_place) - output._metadata = _sanitize_metadata(metadata) - return output - - @property - def metadata(self) -> dict: - """Get additional metadata. - - Returns: - Dictionary containing additional metadata. - """ - return self.get_metadata() - - @metadata.setter - def metadata(self, metadata: Optional[dict]): - """Set or replace metadata (in-place operation). - - Args: - metadata: - Additional metadata. - """ - warn( - "Setting property 'metadata'is an in-place operation, use 'set_metadata' instead", - UserWarning, - ) - self.set_metadata(metadata, in_place=True) - def get_names(self) -> Optional[ut.Names]: """Get range names. 
@@ -202,7 +150,7 @@ def get_names(self) -> Optional[ut.Names]: """ return self._ranges.get_names() - def set_names(self, names: Optional[List[str]], in_place: bool = False) -> "DNAStringSet": + def set_names(self, names: Optional[List[str]], in_place: bool = False) -> DNAStringSet: """ Args: names: @@ -303,7 +251,7 @@ def width(self) -> np.ndarray: """Alias to :py:meth:`~.get_width`.""" return self.get_width() - def __getitem__(self, key: Union[int, slice, List[int], np.ndarray]) -> Union[DNAString, "DNAStringSet"]: + def __getitem__(self, key: Union[int, slice, List[int], np.ndarray]) -> Union[DNAString, DNAStringSet]: """Extract one or more sequences. Args: From fbd2a52eefbb4725b294232129675f2db0d0f724 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 4 Jan 2026 00:44:46 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/requirements.txt | 4 ++-- lib/src/stringsetpool.cpp | 8 ++++---- src/biostrings/__init__.py | 2 +- src/biostrings/dnastring.py | 1 - tests/conftest.py | 10 +++++----- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index a1b9d2b..c20cf60 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,9 +1,9 @@ +furo +myst-nb # Requirements file for ReadTheDocs, check .readthedocs.yml. # To build the module reference correctly, make sure every external package # under `install_requires` in `setup.cfg` is also listed here! 
# sphinx_rtd_theme myst-parser[linkify] sphinx>=3.2.1 -myst-nb -furo sphinx-autodoc-typehints diff --git a/lib/src/stringsetpool.cpp b/lib/src/stringsetpool.cpp index 72da9be..ccb203e 100644 --- a/lib/src/stringsetpool.cpp +++ b/lib/src/stringsetpool.cpp @@ -19,7 +19,7 @@ py::tuple create_dnastringset_pool(py::list py_seqs) { int32_t* starts_ptr = np_starts.mutable_data(); int32_t* widths_ptr = np_widths.mutable_data(); - + std::stringstream pool_stream; int32_t current_start = 0; const std::string valid_chars = "ACGTRYSWKMBDHVN-"; @@ -38,11 +38,11 @@ py::tuple create_dnastringset_pool(py::list py_seqs) { ); } } - + pool_stream.write(s.c_str(), current_width); current_start += current_width; } - + py::bytes pool = py::bytes(pool_stream.str()); return py::make_tuple(pool, np_starts, np_widths); } @@ -54,4 +54,4 @@ void init_stringsetpool(pybind11::module &m) { &create_dnastringset_pool, "Efficiently create the pool and ranges for a DnaStringset from a list of strings." ); -} \ No newline at end of file +} diff --git a/src/biostrings/__init__.py b/src/biostrings/__init__.py index cf4eb23..531197d 100644 --- a/src/biostrings/__init__.py +++ b/src/biostrings/__init__.py @@ -17,4 +17,4 @@ from .dnastring import DNAString -from .dnastringset import DNAStringSet \ No newline at end of file +from .dnastringset import DNAStringSet diff --git a/src/biostrings/dnastring.py b/src/biostrings/dnastring.py index 2e3e5a3..c9a86a8 100644 --- a/src/biostrings/dnastring.py +++ b/src/biostrings/dnastring.py @@ -3,7 +3,6 @@ import re from copy import deepcopy from typing import Any, Dict, Optional, Union -from warnings import warn import biocutils as ut diff --git a/tests/conftest.py b/tests/conftest.py index 152eaee..bbdaf93 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,10 @@ """ - Dummy conftest.py for biostrings. +Dummy conftest.py for biostrings. - If you don't know what this is for, just leave it empty. 
- Read more about conftest.py under: - - https://docs.pytest.org/en/stable/fixture.html - - https://docs.pytest.org/en/stable/writing_plugins.html +If you don't know what this is for, just leave it empty. +Read more about conftest.py under: +- https://docs.pytest.org/en/stable/fixture.html +- https://docs.pytest.org/en/stable/writing_plugins.html """ # import pytest From 6928e4a9292145d5144da784bff0b058a5a5536a Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sat, 3 Jan 2026 16:50:53 -0800 Subject: [PATCH 3/4] fix tests --- tests/test_dnastringset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_dnastringset.py b/tests/test_dnastringset.py index 7a8b0da..071877d 100644 --- a/tests/test_dnastringset.py +++ b/tests/test_dnastringset.py @@ -27,7 +27,7 @@ def sample_set(sample_seqs): def test_construction(sample_set, sample_seqs): assert len(sample_set) == 5 - assert sample_set.names == sample_seqs["names"] + assert list(sample_set.names) == sample_seqs["names"] assert np.array_equal(sample_set.width(), [4, 7, 0, 13, 8]) @@ -49,7 +49,7 @@ def test_construction_empty(): def test_names_setter(sample_set): new_names = ["a", "b", "c", "d", "e"] sample_set.names = new_names - assert sample_set.names == new_names + assert list(sample_set.names) == new_names with pytest.raises(Exception): sample_set.names = ["a", "b"] @@ -72,7 +72,7 @@ def test_getitem_slice_view(sample_set): subset = sample_set[1:4] assert isinstance(subset, DNAStringSet) assert len(subset) == 3 - assert subset.names == ["seq2", "empty", "iupac"] + assert list(subset.names) == ["seq2", "empty", "iupac"] assert np.array_equal(subset.width(), [7, 0, 13]) # Test it's a view (shares the pool) @@ -85,7 +85,7 @@ def test_getitem_list_view(sample_set): assert isinstance(subset, DNAStringSet) assert len(subset) == 3 - assert subset.names == ["seq1", "iupac", "seq5"] + assert list(subset.names) == ["seq1", "iupac", "seq5"] assert np.array_equal(subset.width(), [4, 13, 8]) 
# Test it's a view From a5dd95cce48e7d38d20832039b1835804c60545f Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sat, 3 Jan 2026 16:55:00 -0800 Subject: [PATCH 4/4] remove 3.9 --- .github/workflows/run-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 3091205..79e7d96 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -28,7 +28,7 @@ jobs: test: strategy: matrix: - python: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] + python: ["3.10", "3.11", "3.12", "3.13", "3.14"] platform: - ubuntu-latest - macos-latest