diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 09cf8f0..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/datatorch/api/api.py b/datatorch/api/api.py
index 883b9fb..f46256f 100644
--- a/datatorch/api/api.py
+++ b/datatorch/api/api.py
@@ -163,11 +163,55 @@ def upload_to_default_filesource(
         )
         print(r.text + " " + endpoint)
 
+    def upload_to_filesource(
+        self,
+        project: Project,
+        file: IO,
+        storageId: str = None,
+        storageFolderName=None,
+        dataset: Dataset = None,
+        **kwargs,
+    ):
+        """
+        Uploads a file to the provided `storageId` if available;
+        otherwise, retrieves the default storage ID (DataTorch Storage) from the project.
+        """
+        # Retrieve the default storageId if not explicitly provided
+        if storageId is None:
+            storageId = project.storage_link_default().id
+
+        storageFolderName = "" if storageFolderName is None else storageFolderName
+        datasetId = "" if dataset is None else dataset.id
+        importFiles = "false" if dataset is None else "true"
+
+        # Construct the upload endpoint
+        endpoint = f"{self.api_url}/file/v1/upload/{storageId}?path={storageFolderName}&import={importFiles}&datasetId={datasetId}"
+
+        # Determine MIME type (python-magic if available, stdlib fallback)
+        if magic:
+            tell = file.tell()
+            mimetype = magic.from_buffer(file.read(1024), mime=True)
+            file.seek(tell)
+        else:
+            mimetype = mimetypes.guess_type(file.name)[0]
+
+        # Make the POST request
+        r = requests.post(
+            endpoint,
+            files={"file": (os.path.basename(file.name), file, mimetype)},
+            headers={self.token_header: self._api_token},
+            stream=True,
+        )
+
+        # Raise an error for failed requests
+        r.raise_for_status()
+
     def glob_upload_folder(
         self,
         project: Project,
         uploadingFromGlob: str,
         storageFolderName: str,
+        storageId: str = None,
         folderSplit=1000,
         dataset: Dataset = None,
         recursive=False,
@@ -192,9 +236,10 @@ def glob_upload_folder(
                 folderIndex += 1
                 uploadFolderName = storageFolderName + "_" + str(folderIndex)
             file = open(file, "rb")
-            self.upload_to_default_filesource(
+            self.upload_to_filesource(
                 project=project,
                 file=file,
+                storageId=storageId,
                 storageFolderName=uploadFolderName,
                 dataset=dataset,
             )
diff --git a/datatorch/cli/groups.py b/datatorch/cli/groups.py
index 71ecd96..3d816dd 100644
--- a/datatorch/cli/groups.py
+++ b/datatorch/cli/groups.py
@@ -12,6 +12,7 @@ from .pipeline import pipeline
 from .action import action
 from .import_cmds import import_cmd
+from .upload import upload
 
 
 @click.group()
@@ -29,3 +30,5 @@ def main():
 main.add_command(agent)
 main.add_command(action)
 main.add_command(import_cmd)
+
+main.add_command(upload)
diff --git a/datatorch/cli/upload/__init__.py b/datatorch/cli/upload/__init__.py
new file mode 100644
index 0000000..4c4accb
--- /dev/null
+++ b/datatorch/cli/upload/__init__.py
@@ -0,0 +1,10 @@
+import click
+from .folder import folder
+
+
+@click.group(help="Commands for managing uploads.")
+def upload():
+    pass
+
+
+upload.add_command(folder)
diff --git a/datatorch/cli/upload/folder.py b/datatorch/cli/upload/folder.py
new file mode 100644
index 0000000..579e115
--- /dev/null
+++ b/datatorch/cli/upload/folder.py
@@ -0,0 +1,131 @@
+import os
+import click
+from datatorch.core.settings import UserSettings
+from datatorch.api.api import ApiClient
+from datatorch.api.entity.project import Project
+from ..spinner import Spinner
+
+
+@click.command("folder")
+@click.argument("folder_path", type=click.Path(exists=True, file_okay=False))
+@click.argument("project_id", type=str)
+def folder(folder_path, project_id):
+    """Bulk upload files to a specified project."""
+
+    # Get the list of files to upload
+    files = [f for f in os.listdir(folder_path)
+             if os.path.isfile(os.path.join(folder_path, f))]
+    total_files = len(files)
+
+    if total_files == 0:
+        click.echo("No files found in the specified folder.")
+        return
+
+    # Load user settings
+    user_settings = UserSettings()
+    api_key = user_settings.api_key
+    api_url = user_settings.api_url
+
+    if not api_key or not api_url:
+        click.echo("You are not logged in. "
+                   "Please log in using the `login` command.")
+        return
+
+    # Initialize the API client
+    client = ApiClient(api_url=api_url, api_key=api_key)
+
+    # Validate the endpoint
+    if not client.validate_endpoint():
+        click.echo("Error: Invalid API endpoint.")
+        return
+    click.echo("Valid API endpoint verified.")
+
+    # Retrieve the project by ID
+    try:
+        project = client.project(project_id)
+        click.echo(f"Retrieved project: {project.name}")
+    except Exception as e:
+        click.echo(f"Error: Unable to retrieve "
+                   f"project with ID '{project_id}'. {e}")
+        return
+
+    # Display available datasets and prompt user selection
+    try:
+        datasets = project.datasets()
+        if datasets:
+            click.echo("\nAvailable Datasets:")
+            for idx, dataset in enumerate(datasets, start=1):
+                click.echo(f"{idx}. {dataset.name} (ID: {dataset.id})")
+
+            # Prompt user to select a dataset
+            choice = click.prompt(
+                "Enter the number of the dataset",
+                type=int,
+                default=1,
+            )
+            if 1 <= choice <= len(datasets):
+                selected_dataset = datasets[choice - 1]
+                click.echo(f"Selected Dataset: {selected_dataset.name} "
+                           f"(ID: {selected_dataset.id})")
+            else:
+                click.echo(f"Invalid choice. Please select a number "
+                           f"between 1 and {len(datasets)}.")
+                return
+        else:
+            # No datasets found; ask whether to continue with a global upload
+            selected_dataset = None
+            continue_upload = click.confirm(
+                "No datasets found for this project. "
+                "Do you want to continue with a global upload?",
+                default=False,
+            )
+            if not continue_upload:
+                click.echo("Ending...")
+                return
+    except Exception as e:
+        click.echo(f"Error retrieving datasets: {e}")
+        return
+
+    # Display available storage links and prompt user selection
+    try:
+        storage_links = project.storage_links()
+        if not storage_links:
+            click.echo("No storage available for this project.")
+            return
+
+        click.echo("\nAvailable Storages:")
+        for idx, storage_link in enumerate(storage_links):
+            click.echo(f"{idx + 1}. {storage_link.name} "
+                       f"(ID: {storage_link.id})")
+
+        # Prompt user to select a storage link
+        choice = click.prompt(
+            "Enter the number of the storage to use",
+            type=int,
+            default=1,
+        )
+        if 1 <= choice <= len(storage_links):
+            selected_storage_link = storage_links[choice - 1]
+        else:
+            click.echo(f"Invalid choice. Please select a number "
+                       f"between 1 and {len(storage_links)}.")
+            return
+
+        click.echo(f"Selected Storage: {selected_storage_link.name} "
+                   f"(ID: {selected_storage_link.id})")
+    except Exception as e:
+        click.echo(f"Error retrieving storage: {e}")
+        return
+
+    # Initialize the spinner
+    spinner = Spinner(f"Uploading files (0/{total_files})")
+
+    # Upload files to the selected storage and dataset using their IDs
+    try:
+        for idx, file_name in enumerate(files, start=1):
+            file_path = os.path.join(folder_path, file_name)
+            spinner.set_text(f"Uploading file ({idx}/{total_files})")
+            with open(file_path, "rb") as file:
+                client.upload_to_filesource(
+                    project=project,
+                    file=file,
+                    storageId=selected_storage_link.id,
+                    storageFolderName=None,
+                    dataset=selected_dataset,
+                )
+        spinner.done(f"Uploaded all {total_files} files successfully!")
+    except Exception as e:
+        spinner.done(f"Error during upload: {e}")
+        return
diff --git a/examples/upload_files.py b/examples/upload_files.py
new file mode 100644
index 0000000..f8858d1
--- /dev/null
+++ b/examples/upload_files.py
@@ -0,0 +1,28 @@
+import os
+import datatorch as dt
+
+api = dt.api.ApiClient('your-api-key')
+proj = api.project('user-name/project-name')
+dset = proj.dataset('data-set-name')
+
+folder_to_upload = 'uploadme'
+upload_to_storage_id = 'your-storage-id'
+
+# Get all the file names in the folder
+files = [f for f in os.listdir(folder_to_upload)
+         if os.path.isfile(os.path.join(folder_to_upload, f))]
+
+# Upload files to the selected storage and dataset using their IDs
+try:
+    for file_name in files:
+        file_path = os.path.join(folder_to_upload, file_name)
+        with open(file_path, "rb") as file:
+            api.upload_to_filesource(
+                project=proj,
+                file=file,
+                storageId=upload_to_storage_id,
+                storageFolderName=None,
+                dataset=dset,
+            )
+except Exception as e:
+    print(f"Error uploading: {e}")
diff --git a/examples/uploadme/1copy.jpg b/examples/uploadme/1copy.jpg
new file mode 100755
index 0000000..362b0a8
Binary files /dev/null and b/examples/uploadme/1copy.jpg differ
diff --git a/examples/uploadme/2copy.jpg b/examples/uploadme/2copy.jpg
new file mode 100755
index 0000000..7b5acfb
Binary files /dev/null and b/examples/uploadme/2copy.jpg differ
diff --git a/examples/uploadme/3copy.jpg b/examples/uploadme/3copy.jpg
new file mode 100755
index 0000000..e79011f
Binary files /dev/null and b/examples/uploadme/3copy.jpg differ
diff --git a/setup.py b/setup.py
index a41131e..a20eec8 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,8 @@
 from setuptools import setup, find_packages
 import sys
 
-# Ensure the Python version is 3.13 or higher
-assert sys.version_info >= (3, 13, 0), "DataTorch requires Python 3.13+"
+# Ensure the Python version is 3.12 or higher
+assert sys.version_info >= (3, 12, 0), "DataTorch requires Python 3.12+"
 
 with open("README.md", "r", encoding="utf-8") as fp:
     long_description = fp.read()
@@ -33,7 +33,7 @@ setup(
     name="datatorch",
-    version="0.4.8.4",
+    version="0.4.8.5",
     description="A CLI and library for interacting with DataTorch.",
     author="DataTorch",
     author_email="support@datatorch.io",
@@ -45,7 +45,7 @@
     long_description=long_description,
     long_description_content_type="text/markdown",
     install_requires=requirements,
-    python_requires=">=3.13",
+    python_requires=">=3.12",
     license="MIT license",
     zip_safe=False,
     include_package_data=True,
@@ -55,7 +55,7 @@
         "Framework :: Pytest",
         "Intended Audience :: Developers",
         "Natural Language :: English",
-        "Programming Language :: Python :: 3.13",
+        "Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries :: Python Modules", ],