Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 2 additions & 59 deletions src/citrine/informatics/data_sources.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
"""Tools for working with Descriptors."""
from abc import abstractmethod
from typing import Type, List, Mapping, Optional, Union
from typing import Type, List, Union
from uuid import UUID
from warnings import warn

from citrine._serialization import properties
from citrine._serialization.polymorphic_serializable import PolymorphicSerializable
from citrine._serialization.serializable import Serializable
from citrine.informatics.descriptors import Descriptor
from citrine.resources.file_link import FileLink
from citrine.resources.gemtables import GemTable

__all__ = [
'DataSource',
'CSVDataSource',
'GemTableDataSource',
'ExperimentDataSourceRef',
'SnapshotDataSource',
Expand All @@ -36,7 +32,7 @@ def __eq__(self, other):

@classmethod
def _subclass_list(self) -> List[Type[Serializable]]:
return [CSVDataSource, GemTableDataSource, ExperimentDataSourceRef, SnapshotDataSource]
return [GemTableDataSource, ExperimentDataSourceRef, SnapshotDataSource]

@classmethod
def get_type(cls, data) -> Type[Serializable]:
Expand Down Expand Up @@ -72,59 +68,6 @@ def to_data_source_id(self) -> str:
"""Generate the data_source_id for this DataSource."""


class CSVDataSource(Serializable['CSVDataSource'], DataSource):
    """A data source based on a CSV file stored on the data platform.

    Constructing this class emits a ``DeprecationWarning``: it is deprecated as of 3.28.0
    and will be removed in 4.0.0.

    Parameters
    ----------
    file_link: FileLink
        link to the CSV file to read the data from
    column_definitions: Mapping[str, Descriptor]
        Map the column headers to the descriptors that will be used to interpret the cell contents
    identifiers: Optional[List[str]]
        List of one or more column headers whose values uniquely identify a row. These may overlap
        with ``column_definitions`` if a column should be used as data and as an identifier,
        but this is not necessary. Identifiers must be unique within a dataset. No two rows can
        contain the same value.

    """

    typ = properties.String('type', default='csv_data_source', deserializable=False)
    file_link = properties.Object(FileLink, "file_link")
    column_definitions = properties.Mapping(
        properties.String, properties.Object(Descriptor), "column_definitions")
    identifiers = properties.Optional(properties.List(properties.String), "identifiers")

    # Leading component of the string produced by ``to_data_source_id``.
    _data_source_type = "csv"

    def __init__(self,
                 *,
                 file_link: FileLink,
                 column_definitions: Mapping[str, Descriptor],
                 identifiers: Optional[List[str]] = None):
        # stacklevel=2 attributes the warning to the code that constructs the object
        # rather than to this __init__, so the report points at the deprecated usage.
        warn("CSVDataSource is deprecated as of 3.28.0 and will be removed in 4.0.0. Please use "
             "another type of data source, such as GemTableDataSource.",
             category=DeprecationWarning,
             stacklevel=2)
        self.file_link = file_link
        self.column_definitions = column_definitions
        self.identifiers = identifiers

    @classmethod
    def _data_source_id_builder(cls, *args) -> DataSource:
        """Build a CSVDataSource from the components of a data_source_id.

        ``args[0]`` is used as the file link's url and ``args[1]`` as its filename.
        The result has empty ``column_definitions`` and no ``identifiers``; a
        ``UserWarning`` is emitted to flag that.
        """
        # TODO Figure out how to populate the column definitions
        warn("A CSVDataSource was derived from a data_source_id "
             "but is missing its column_definitions and identifiers",
             UserWarning)
        return CSVDataSource(
            file_link=FileLink(url=args[0], filename=args[1]),
            column_definitions={}
        )

    def to_data_source_id(self) -> str:
        """Generate the data_source_id for this DataSource.

        The id has the form ``csv::<file_link.url>::<file_link.filename>``.
        """
        return f"{self._data_source_type}::{self.file_link.url}::{self.file_link.filename}"


class GemTableDataSource(Serializable['GemTableDataSource'], DataSource):
"""A data source based on a GEM Table hosted on the data platform.

Expand Down
26 changes: 1 addition & 25 deletions tests/informatics/test_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from citrine.informatics.data_sources import (
DataSource, CSVDataSource, ExperimentDataSourceRef, GemTableDataSource, SnapshotDataSource
DataSource, ExperimentDataSourceRef, GemTableDataSource, SnapshotDataSource
)
from citrine.informatics.descriptors import RealDescriptor
from citrine.resources.file_link import FileLink
Expand Down Expand Up @@ -54,27 +54,3 @@ def test_from_gem_table():
def test_invalid_data_source_id():
    """A data_source_id with the type prefix ``Undefined`` must raise ValueError."""
    unknown_id = f"Undefined::{uuid.uuid4()}"
    with pytest.raises(ValueError):
        DataSource.from_data_source_id(unknown_id)


def test_deser_from_parent_deprecated():
    """Check that a CSVDataSource round-trips through dump() and DataSource.build()."""
    # Constructing a CSVDataSource is expected to emit a deprecation warning.
    # NOTE(review): the round-trip statements are assumed to sit inside this
    # deprecated_call block -- confirm the intended nesting.
    with pytest.deprecated_call():
        data_source = CSVDataSource(file_link=FileLink("foo.spam", "http://example.com"),
                                    column_definitions={"spam": RealDescriptor("eggs", lower_bound=0, upper_bound=1.0, units="")},
                                    identifiers=["identifier"])

        # Serialize and deserialize the descriptors, making sure they are round-trip serializable
        data = data_source.dump()
        data_source_deserialized = DataSource.build(data)
        assert data_source == data_source_deserialized

def test_data_source_id_deprecated():
    """Check that a CSVDataSource's file_link survives a trip through its data_source_id."""
    # Constructing a CSVDataSource is expected to emit a deprecation warning.
    with pytest.deprecated_call():
        data_source = CSVDataSource(file_link=FileLink("foo.spam", "http://example.com"),
                                    column_definitions={"spam": RealDescriptor("eggs", lower_bound=0, upper_bound=1.0, units="")},
                                    identifiers=["identifier"])

    # TODO: There's no obvious way to recover the column_definitions & identifiers from the ID
    # Rebuilding from the id is expected to emit both a deprecation warning and a UserWarning.
    with pytest.deprecated_call():
        with pytest.warns(UserWarning):
            transformed = DataSource.from_data_source_id(data_source.to_data_source_id())
    # Only the file link is compared; see the TODO above.
    assert transformed.file_link == data_source.file_link
Loading