-
Notifications
You must be signed in to change notification settings - Fork 4
Add functions to Python helper library for working with datasets #94
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
f195325
initial commit
monwen c7bb32a
adding xdg initialization
monwen ef3f9d6
adding command line to add db into xdg dir
monwen eac32b1
Adding DatasetCatalog class for loading sqlite.db into python env. An…
monwen 497d55f
change name dataset_loader to dataset_viewer. add dependency packages…
monwen 67c1509
-list-datasets
monwen 48bf9f3
clean up unnecessary functionality
monwen 2546413
clean up
monwen 3deadf7
add read only dataset loader
monwen 7f4f4d8
delete outdated test file
monwen d3606db
adding sqlite3 import
monwen 87b9095
Merge branch 'main' into python_dataset_plugin
nightlark c7956e8
format code
nightlark b45a63d
format pyproject
nightlark a13b045
add missing os import and fix appdata folder location for Windows
nightlark File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,133 @@ | ||
| import platform | ||
| import os | ||
| from pathlib import Path | ||
| from dataclasses import dataclass | ||
| from datetime import datetime, timezone | ||
| from typing import Dict, List, Any, Optional | ||
| import tomlkit | ||
| import sqlite3 | ||
|
|
||
|
|
||
@dataclass
class DatasetMeta:
    """Dataset metadata matching Rust Dataset struct"""

    # One record per installed dataset; fields are populated from the
    # [datasets.<name>] tables in dataset_info.toml by DatasetCatalog.
    version: int  # dataset version number (coerced with int() on load)
    format: str  # storage-format label from the TOML (presumably "sqlite" — TODO confirm)
    timestamp: datetime  # parsed from an ISO-8601 string; "Z" suffix normalized to "+00:00"
    categories: List[str]  # category tags used for get_available_datasets() filtering
    filepath: Path  # on-disk location of the dataset database file
|
|
||
|
|
||
class DatasetCatalog:
    """Class for managing SQLite databases via dataset_info.toml"""

    def __init__(self, app_name: Optional[str] = "dapper", file_path: Optional[str] = None):
        """Create a catalog and eagerly load dataset metadata.

        Args:
            app_name: application name used to locate the platform data dir.
            file_path: optional explicit location of dataset_info.toml; may be
                the file itself or a directory containing it. When omitted the
                current directory and the app data directory are searched.
        """
        self.app_name = app_name
        self.dataset_metas: Dict[str, DatasetMeta] = {}

        self._load_from_dataset_info_toml(file_path)

    def _load_from_dataset_info_toml(self, file_path: Optional[str] = None):
        """Load installed datasets from dataset_info.toml.

        A missing file is not an error (the catalog just starts empty); any
        other failure is reported and swallowed so construction never raises.
        """
        try:
            toml_path = self._find_dataset_info_toml(file_path)
            with open(toml_path, "r") as f:
                config = tomlkit.load(f)

            datasets_dict = config.get("datasets", {})
            for name, dataset_data in datasets_dict.items():
                self.dataset_metas[name] = DatasetMeta(
                    version=int(dataset_data["version"]),
                    format=dataset_data["format"],
                    # Normalize a trailing "Z" to "+00:00": fromisoformat()
                    # only accepts "Z" starting with Python 3.11.
                    timestamp=datetime.fromisoformat(
                        dataset_data["timestamp"].replace("Z", "+00:00")
                    ),
                    categories=dataset_data["categories"],
                    filepath=Path(dataset_data["filepath"]),
                )

            print(f"Loaded {len(self.dataset_metas)} datasets from dataset_info.toml")

        except FileNotFoundError:
            print("No dataset_info.toml found - starting with empty catalog")
        except Exception as e:
            # Best-effort load: a malformed file must not break construction.
            print(f"Error loading dataset_info.toml: {e}")

    def _find_dataset_info_toml(self, file_path: Optional[str] = None) -> Path:
        """Resolve the location of dataset_info.toml.

        Search order: explicit *file_path* (file or directory), then the
        current working directory, then the platform app data directory.

        Raises:
            FileNotFoundError: if no dataset_info.toml can be located.
        """
        if file_path:
            # If directory provided, append filename
            path = Path(file_path)
            if path.is_dir():
                candidate = path / "dataset_info.toml"
                if candidate.exists():
                    return candidate
            # If file provided directly
            elif path.is_file():
                return path
            raise FileNotFoundError(f"Could not find dataset_info.toml at {file_path}")

        # Default: look in current directory first, then app data
        current_dir = Path(".") / "dataset_info.toml"
        if current_dir.exists():
            return current_dir

        # Fallback to app data directory
        app_dir = Path(self.get_app_data_dir(self.app_name))
        candidate = app_dir / "dataset_info.toml"
        if candidate.exists():
            return candidate

        raise FileNotFoundError("Could not find dataset_info.toml")

    @staticmethod
    def get_app_data_dir(app_name: Optional[str] = "dapper") -> str:
        """Get the platform-specific application data directory"""

        system = platform.system()

        if system == "Linux":
            # Honor the XDG base-directory spec when XDG_DATA_HOME is set.
            xdg_data_home = os.environ.get("XDG_DATA_HOME")
            if xdg_data_home:
                return os.path.join(xdg_data_home, app_name)
            else:
                return os.path.join(os.path.expanduser("~"), ".local", "share", app_name)

        elif system == "Darwin":
            return os.path.join(os.path.expanduser("~"), "Library", "Application Support", app_name)

        elif system == "Windows":
            appdata = os.environ.get("LOCALAPPDATA")
            if appdata:
                return os.path.join(appdata, app_name, "data")
            else:
                # LOCALAPPDATA unset: fall back to the conventional location.
                return os.path.join(os.path.expanduser("~"), "AppData", "Local", app_name, "data")

        else:
            # Unknown platform: use a hidden dot-directory in $HOME.
            return os.path.join(os.path.expanduser("~"), f".{app_name}")

    def get_available_datasets(self, category: Optional[str] = None) -> List[str]:
        """Return list of dataset names, optionally filtered by category"""
        if not category:
            return list(self.dataset_metas.keys())
        return [name for name, meta in self.dataset_metas.items() if category in meta.categories]

    def get_dataset_path(self, dataset_name: str) -> Optional[Path]:
        """Get path to dataset file for loading/querying, or None if unknown"""
        if dataset_name in self.dataset_metas:
            return self.dataset_metas[dataset_name].filepath
        return None

    def get_dataset_info(self, dataset_name: str) -> Optional[DatasetMeta]:
        """Get full metadata for a dataset"""
        return self.dataset_metas.get(dataset_name)

    def load_dataset(self, dataset_name: str) -> sqlite3.Connection:
        """Load/open a dataset database for READ-ONLY querying.

        Raises:
            FileNotFoundError: if the dataset is unknown or its file is missing.
        """
        db_path = self.get_dataset_path(dataset_name)
        if not db_path or not db_path.exists():
            raise FileNotFoundError(f"Dataset '{dataset_name}' not found")

        # Open in read-only mode. Build the URI via as_uri() so Windows
        # backslashes and special characters (?, #, spaces) in the path are
        # percent-encoded instead of corrupting the URI query string.
        uri = db_path.resolve().as_uri() + "?mode=ro"
        return sqlite3.connect(uri, uri=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.