From 52fa993e70f869cdf80a38eb061f785c6c657067 Mon Sep 17 00:00:00 2001 From: Austin Noto-Moniz Date: Wed, 7 Jan 2026 14:27:14 -0500 Subject: [PATCH] Drop support for CSVDataSource --- src/citrine/informatics/data_sources.py | 61 +------------------------ tests/informatics/test_data_source.py | 26 +---------- 2 files changed, 3 insertions(+), 84 deletions(-) diff --git a/src/citrine/informatics/data_sources.py b/src/citrine/informatics/data_sources.py index 77bed62d6..5b209100f 100644 --- a/src/citrine/informatics/data_sources.py +++ b/src/citrine/informatics/data_sources.py @@ -1,19 +1,15 @@ """Tools for working with Descriptors.""" from abc import abstractmethod -from typing import Type, List, Mapping, Optional, Union +from typing import Type, List, Union from uuid import UUID -from warnings import warn from citrine._serialization import properties from citrine._serialization.polymorphic_serializable import PolymorphicSerializable from citrine._serialization.serializable import Serializable -from citrine.informatics.descriptors import Descriptor -from citrine.resources.file_link import FileLink from citrine.resources.gemtables import GemTable __all__ = [ 'DataSource', - 'CSVDataSource', 'GemTableDataSource', 'ExperimentDataSourceRef', 'SnapshotDataSource', @@ -36,7 +32,7 @@ def __eq__(self, other): @classmethod def _subclass_list(self) -> List[Type[Serializable]]: - return [CSVDataSource, GemTableDataSource, ExperimentDataSourceRef, SnapshotDataSource] + return [GemTableDataSource, ExperimentDataSourceRef, SnapshotDataSource] @classmethod def get_type(cls, data) -> Type[Serializable]: @@ -72,59 +68,6 @@ def to_data_source_id(self) -> str: """Generate the data_source_id for this DataSource.""" -class CSVDataSource(Serializable['CSVDataSource'], DataSource): - """A data source based on a CSV file stored on the data platform. - - Parameters - ---------- - file_link: FileLink - link to the CSV file to read the data from - column_definitions: Mapping[str, Descriptor] - Map the column headers to the descriptors that will be used to interpret the cell contents - identifiers: Optional[List[str]] - List of one or more column headers whose values uniquely identify a row. These may overlap - with ``column_definitions`` if a column should be used as data and as an identifier, - but this is not necessary. Identifiers must be unique within a dataset. No two rows can - contain the same value. - - """ - - typ = properties.String('type', default='csv_data_source', deserializable=False) - file_link = properties.Object(FileLink, "file_link") - column_definitions = properties.Mapping( - properties.String, properties.Object(Descriptor), "column_definitions") - identifiers = properties.Optional(properties.List(properties.String), "identifiers") - - _data_source_type = "csv" - - def __init__(self, - *, - file_link: FileLink, - column_definitions: Mapping[str, Descriptor], - identifiers: Optional[List[str]] = None): - warn("CSVDataSource is deprecated as of 3.28.0 and will be removed in 4.0.0. Please use " - "another type of data source, such as GemTableDataSource.", - category=DeprecationWarning) - self.file_link = file_link - self.column_definitions = column_definitions - self.identifiers = identifiers - - @classmethod - def _data_source_id_builder(cls, *args) -> DataSource: - # TODO Figure out how to populate the column definitions - warn("A CSVDataSource was derived from a data_source_id " - "but is missing its column_definitions and identities", - UserWarning) - return CSVDataSource( - file_link=FileLink(url=args[0], filename=args[1]), - column_definitions={} - ) - - def to_data_source_id(self) -> str: - """Generate the data_source_id for this DataSource.""" - return f"{self._data_source_type}::{self.file_link.url}::{self.file_link.filename}" - - class GemTableDataSource(Serializable['GemTableDataSource'], DataSource): """A data source based on a GEM Table hosted on the data platform. diff --git a/tests/informatics/test_data_source.py b/tests/informatics/test_data_source.py index b1b4e2a06..7ca003371 100644 --- a/tests/informatics/test_data_source.py +++ b/tests/informatics/test_data_source.py @@ -4,7 +4,7 @@ import pytest from citrine.informatics.data_sources import ( - DataSource, CSVDataSource, ExperimentDataSourceRef, GemTableDataSource, SnapshotDataSource + DataSource, ExperimentDataSourceRef, GemTableDataSource, SnapshotDataSource ) from citrine.informatics.descriptors import RealDescriptor from citrine.resources.file_link import FileLink @@ -54,27 +54,3 @@ def test_from_gem_table(): def test_invalid_data_source_id(): with pytest.raises(ValueError): DataSource.from_data_source_id(f"Undefined::{uuid.uuid4()}") - - -def test_deser_from_parent_deprecated(): - with pytest.deprecated_call(): - data_source = CSVDataSource(file_link=FileLink("foo.spam", "http://example.com"), - column_definitions={"spam": RealDescriptor("eggs", lower_bound=0, upper_bound=1.0, units="")}, - identifiers=["identifier"]) - - # Serialize and deserialize the descriptors, making sure they are round-trip serializable - data = data_source.dump() - data_source_deserialized = DataSource.build(data) - assert data_source == data_source_deserialized - -def test_data_source_id_deprecated(): - with pytest.deprecated_call(): - data_source = CSVDataSource(file_link=FileLink("foo.spam", "http://example.com"), - column_definitions={"spam": RealDescriptor("eggs", lower_bound=0, upper_bound=1.0, units="")}, - identifiers=["identifier"]) - - # TODO: There's no obvious way to recover the column_definitions & identifiers from the ID - with pytest.deprecated_call(): - with pytest.warns(UserWarning): - transformed = DataSource.from_data_source_id(data_source.to_data_source_id()) - assert transformed.file_link == data_source.file_link