diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index e9de2c3b9..9fa68c523 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -32,6 +32,9 @@ jobs:
           python -m pip install --upgrade pip
           pip install -e ".[test]"

+      - name: Run typo checks
+        uses: crate-ci/typos@v1.40.0
+
       - name: Run coding checks
         run: |
           make lint
diff --git a/Makefile b/Makefile
index ef8207c55..5bf39b8a8 100644
--- a/Makefile
+++ b/Makefile
@@ -8,3 +8,6 @@ format:
 lint:
 	PYTHONPATH=`pwd` python3 -m black vectordb_bench --check
 	PYTHONPATH=`pwd` python3 -m ruff check vectordb_bench
+
+typos:
+	typos README.md install.py vectordb_bench tests
diff --git a/tests/test_chroma.py b/tests/test_chroma.py
index 2b1b0596e..29cb4956f 100644
--- a/tests/test_chroma.py
+++ b/tests/test_chroma.py
@@ -20,7 +20,7 @@

-dict = {} #Assumes chroma is acception connections on localhost:8000
+dict = {} # Assumes chroma is accepting connections on localhost:8000
 dict['name'] = "chroma"
 dict['host'] = "localhost"
 dict['port'] = 8000
@@ -102,4 +102,4 @@ def test_insert_and_search(self):
                 break

         assert isFilter, f"Filter not working, id_list: {id_list}"
-        
\ No newline at end of file
+        
diff --git a/typos.toml b/typos.toml
new file mode 100644
index 000000000..c4067b2c7
--- /dev/null
+++ b/typos.toml
@@ -0,0 +1,5 @@
+# Respect these words as correct spelling.
+[default.extend-words]
+"rabit" = "rabit"
+"typ" = "typ"
+"infom" = "infom"
diff --git a/vectordb_bench/backend/clients/alisql/alisql.py b/vectordb_bench/backend/clients/alisql/alisql.py
index 76179c2ed..549ff3320 100644
--- a/vectordb_bench/backend/clients/alisql/alisql.py
+++ b/vectordb_bench/backend/clients/alisql/alisql.py
@@ -91,7 +91,7 @@ def _create_db_table(self, dim: int):

     @contextmanager
     def init(self):
-        """create and destory connections to database.
+        """create and destroy connections to database.

         Examples:
             >>> with self.init():
diff --git a/vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py b/vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py
index 138e10bba..3aba33328 100644
--- a/vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py
+++ b/vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py
@@ -310,7 +310,7 @@ def search_embedding(
         return [one_res["id"] for one_res in res["result"]]

     def need_normalize_cosine(self) -> bool:
-        """Wheather this database need to normalize dataset to support COSINE"""
+        """Whether this database needs to normalize the dataset to support COSINE"""
         if self.case_config.metric_type == MetricType.COSINE:
             log.info("cosine dataset need normalize.")
             return True
diff --git a/vectordb_bench/backend/clients/api.py b/vectordb_bench/backend/clients/api.py
index 605e85ac0..b67ee1bff 100644
--- a/vectordb_bench/backend/clients/api.py
+++ b/vectordb_bench/backend/clients/api.py
@@ -97,7 +97,7 @@ def not_empty_field(cls, v: any, field: any):


 class DBCaseConfig(ABC):
-    """Case specific vector database configs, usually uesed for index params like HNSW"""
+    """Case-specific vector database configs, usually used for index params like HNSW"""

     @abstractmethod
     def index_param(self) -> dict:
@@ -176,7 +176,7 @@ def __init__(
     @abstractmethod
     @contextmanager
     def init(self) -> None:
-        """create and destory connections to database.
+        """create and destroy connections to database.
         Why contextmanager:

             In multiprocessing search tasks, vectordbbench might init
@@ -192,7 +192,7 @@ def init(self) -> None:
         raise NotImplementedError

     def need_normalize_cosine(self) -> bool:
-        """Wheather this database need to normalize dataset to support COSINE"""
+        """Whether this database needs to normalize the dataset to support COSINE"""
         return False

     @abstractmethod
diff --git a/vectordb_bench/backend/clients/chroma/chroma.py b/vectordb_bench/backend/clients/chroma/chroma.py
index 7f2cd2f1c..47beb72e7 100644
--- a/vectordb_bench/backend/clients/chroma/chroma.py
+++ b/vectordb_bench/backend/clients/chroma/chroma.py
@@ -40,7 +40,7 @@ def __init__(

     @contextmanager
     def init(self) -> None:
-        """create and destory connections to database.
+        """create and destroy connections to database.

         Examples:
             >>> with self.init():
diff --git a/vectordb_bench/backend/clients/clickhouse/clickhouse.py b/vectordb_bench/backend/clients/clickhouse/clickhouse.py
index de09895a8..860144344 100644
--- a/vectordb_bench/backend/clients/clickhouse/clickhouse.py
+++ b/vectordb_bench/backend/clients/clickhouse/clickhouse.py
@@ -91,7 +91,7 @@ def _drop_table(self):
             log.warning(f"Failed to drop table {self.db_config['database']}.{self.table_name}: {e}")
             raise e from None

-    def _perfomance_tuning(self):
+    def _performance_tuning(self):
         self.conn.command("SET materialize_skip_indexes_on_insert = 1")

     def _create_index(self):
@@ -120,7 +120,7 @@ def _create_index(self):
             """
             self.conn.command(cmd=query)
         else:
-            log.warning("HNSW is only avaliable method in clickhouse now")
+            log.warning("HNSW is the only available method in ClickHouse now")
         except Exception as e:
             log.warning(f"Failed to create Clickhouse vector index on table: {self.table_name} error: {e}")
             raise e from None
diff --git a/vectordb_bench/backend/clients/doris/doris.py b/vectordb_bench/backend/clients/doris/doris.py
index 82b3a12da..4aebe2b72 100644
--- a/vectordb_bench/backend/clients/doris/doris.py
+++ b/vectordb_bench/backend/clients/doris/doris.py
@@ -234,7 +234,7 @@ def optimize(self, data_size: int | None = None) -> None:
         log.info("Optimization completed using doris-vector-search library")

     def need_normalize_cosine(self) -> bool:
-        """Wheather this database need to normalize dataset to support COSINE"""
+        """Whether this database needs to normalize the dataset to support COSINE"""
         if self.case_config.metric_type == MetricType.COSINE:
             log.info("cosine dataset need normalize.")
             return True
diff --git a/vectordb_bench/backend/clients/mariadb/mariadb.py b/vectordb_bench/backend/clients/mariadb/mariadb.py
index 5ccddfe7a..e1698162a 100644
--- a/vectordb_bench/backend/clients/mariadb/mariadb.py
+++ b/vectordb_bench/backend/clients/mariadb/mariadb.py
@@ -89,7 +89,7 @@ def _create_db_table(self, dim: int):

     @contextmanager
     def init(self):
-        """create and destory connections to database.
+        """create and destroy connections to database.

         Examples:
             >>> with self.init():
diff --git a/vectordb_bench/backend/clients/memorydb/memorydb.py b/vectordb_bench/backend/clients/memorydb/memorydb.py
index 7e7a8650b..d7bea957b 100644
--- a/vectordb_bench/backend/clients/memorydb/memorydb.py
+++ b/vectordb_bench/backend/clients/memorydb/memorydb.py
@@ -148,7 +148,7 @@ def get_client(self, **kwargs):

     @contextmanager
     def init(self) -> Generator[None, None, None]:
-        """create and destory connections to database.
+        """create and destroy connections to database.
         Examples:
             >>> with self.init():
diff --git a/vectordb_bench/backend/clients/milvus/milvus.py b/vectordb_bench/backend/clients/milvus/milvus.py
index b177af332..aca6c45fe 100644
--- a/vectordb_bench/backend/clients/milvus/milvus.py
+++ b/vectordb_bench/backend/clients/milvus/milvus.py
@@ -135,7 +135,7 @@ def init(self):
         self.col: Collection | None = None
         connections.connect(**self.db_config, timeout=60)

-        # Grab the existing colection with connections
+        # Grab the existing collection with connections
         self.col = Collection(self.collection_name)
         yield

@@ -174,7 +174,7 @@ def wait_index():
         try:
             self.col.compact()
             self.col.wait_for_compaction_completed()
-            log.info("compactation completed. waiting for the rest of index buliding.")
+            log.info("compaction completed. waiting for the rest of index building.")
         except Exception as e:
             log.warning(f"{self.name} compact error: {e}")
             if hasattr(e, "code"):
@@ -192,7 +192,7 @@ def optimize(self, data_size: int | None = None):
         self._optimize()

     def need_normalize_cosine(self) -> bool:
-        """Wheather this database need to normalize dataset to support COSINE"""
+        """Whether this database needs to normalize the dataset to support COSINE"""
         if self.case_config.is_gpu_index:
             log.info("current gpu_index only supports IP / L2, cosine dataset need normalize.")
             return True
diff --git a/vectordb_bench/backend/clients/pinecone/pinecone.py b/vectordb_bench/backend/clients/pinecone/pinecone.py
index 9c2b38888..e7ceffc47 100644
--- a/vectordb_bench/backend/clients/pinecone/pinecone.py
+++ b/vectordb_bench/backend/clients/pinecone/pinecone.py
@@ -76,7 +76,7 @@ def insert_embeddings(
         try:
             for batch_start_offset in range(0, len(embeddings), self.batch_size):
                 batch_end_offset = min(batch_start_offset + self.batch_size, len(embeddings))
-                insert_datas = []
+                insert_batch = []
                 for i in range(batch_start_offset, batch_end_offset):
                     metadata_dict = {self._scalar_id_field: metadata[i]}
                     if self.with_scalar_labels:
@@ -86,8 +86,8 @@ def insert_embeddings(
                         embeddings[i],
                         metadata_dict,
                     )
-                    insert_datas.append(insert_data)
-                self.index.upsert(insert_datas)
+                    insert_batch.append(insert_data)
+                self.index.upsert(insert_batch)
                 insert_count += batch_end_offset - batch_start_offset
         except Exception as e:
             return insert_count, e
diff --git a/vectordb_bench/backend/clients/redis/redis.py b/vectordb_bench/backend/clients/redis/redis.py
index ef0aad9aa..dc778a0d9 100644
--- a/vectordb_bench/backend/clients/redis/redis.py
+++ b/vectordb_bench/backend/clients/redis/redis.py
@@ -76,7 +76,7 @@ def make_index(self, vector_dimensions: int, conn: redis.Redis):

     @contextmanager
     def init(self) -> None:
-        """create and destory connections to database.
+        """create and destroy connections to database.
         Examples:
             >>> with self.init():
diff --git a/vectordb_bench/backend/clients/s3_vectors/s3_vectors.py b/vectordb_bench/backend/clients/s3_vectors/s3_vectors.py
index b05134f6b..09995aafb 100644
--- a/vectordb_bench/backend/clients/s3_vectors/s3_vectors.py
+++ b/vectordb_bench/backend/clients/s3_vectors/s3_vectors.py
@@ -95,7 +95,7 @@ def optimize(self, **kwargs):
         return

     def need_normalize_cosine(self) -> bool:
-        """Wheather this database need to normalize dataset to support COSINE"""
+        """Whether this database needs to normalize the dataset to support COSINE"""
         return False

     def insert_embeddings(
diff --git a/vectordb_bench/backend/clients/vespa/vespa.py b/vectordb_bench/backend/clients/vespa/vespa.py
index 5288bc04c..dedd9b42e 100644
--- a/vectordb_bench/backend/clients/vespa/vespa.py
+++ b/vectordb_bench/backend/clients/vespa/vespa.py
@@ -40,7 +40,7 @@ def __init__(

     @contextmanager
     def init(self) -> Generator[None, None, None]:
-        """create and destory connections to database.
+        """create and destroy connections to database.

         Why contextmanager:
             In multiprocessing search tasks, vectordbbench might init
@@ -58,7 +58,7 @@ def init(self) -> Generator[None, None, None]:
         self.client = None

     def need_normalize_cosine(self) -> bool:
-        """Wheather this database need to normalize dataset to support COSINE"""
+        """Whether this database needs to normalize the dataset to support COSINE"""
         return False

     def insert_embeddings(
diff --git a/vectordb_bench/backend/runner/mp_runner.py b/vectordb_bench/backend/runner/mp_runner.py
index 9133e407a..21e31d897 100644
--- a/vectordb_bench/backend/runner/mp_runner.py
+++ b/vectordb_bench/backend/runner/mp_runner.py
@@ -32,7 +32,7 @@ class MultiProcessingSearchRunner:
     Args:
         k(int): search topk, default to 100
         concurrency(Iterable): concurrencies, default [1, 5, 10, 15, 20, 25, 30, 35]
-        duration(int): duration for each concurency, default to 30s
+        duration(int): duration for each concurrency, default to 30s
     """

     def __init__(
diff --git a/vectordb_bench/backend/runner/read_write_runner.py b/vectordb_bench/backend/runner/read_write_runner.py
index d3d1df2fa..717e0ee52 100644
--- a/vectordb_bench/backend/runner/read_write_runner.py
+++ b/vectordb_bench/backend/runner/read_write_runner.py
@@ -88,7 +88,7 @@ def __init__(

     @time_it
     def run_optimize(self):
-        """Optimize needs to run in differenct process for pymilvus schema recursion problem"""
+        """Optimize needs to run in a different process for the pymilvus schema recursion problem"""
         with self.db.init():
             log.info("Search after write - Optimize start")
             self.db.optimize(data_size=self.data_volume)
@@ -104,7 +104,7 @@ def run_search(self, perc: int):
             f"p99={p99_latency}, p95={p95_latency}, dur={ssearch_dur:.4f}",
         )
         log.info(
-            f"Search after wirte - Conc search start, dur for each conc={self.read_dur_after_write}",
+            f"Search after write - Conc search start, dur for each conc={self.read_dur_after_write}",
         )
         result = self.run_by_dur(self.read_dur_after_write)
         max_qps = result[0]
@@ -114,7 +114,7 @@ def run_search(self, perc: int):
         conc_latency_p99_list = result[4]
         conc_latency_p95_list = result[5]
         conc_latency_avg_list = result[6]
-        log.info(f"Search after wirte - Conc search finished, max_qps={max_qps}")
+        log.info(f"Search after write - Conc search finished, max_qps={max_qps}")

         return [
             (
diff --git a/vectordb_bench/backend/utils.py b/vectordb_bench/backend/utils.py
index 86c4faf5e..85dabe811 100644
--- a/vectordb_bench/backend/utils.py
+++ b/vectordb_bench/backend/utils.py
@@ -11,7 +11,7 @@ def numerize(n: int) -> str:
         >>> numerize(1_000_000_000)
         '1B'
     """
-    sufix2upbound = {
+    suffix2upbound = {
         "EMPTY": 1e3,
         "K": 1e6,
         "M": 1e9,
@@ -19,19 +19,19 @@ def numerize(n: int) -> str:
         "END": float("inf"),
     }

-    display_n, sufix = n, ""
-    for s, base in sufix2upbound.items():
-        # number >= 1000B will alway have sufix 'B'
+    display_n, suffix = n, ""
+    for s, base in suffix2upbound.items():
+        # number >= 1000B will always have suffix 'B'
         if s == "END":
             display_n = int(n / 1e9)
-            sufix = "B"
+            suffix = "B"
             break
         if n < base:
-            sufix = "" if s == "EMPTY" else s
+            suffix = "" if s == "EMPTY" else s
             display_n = int(n / (base / 1e3))
             break
-    return f"{display_n}{sufix}"
+    return f"{display_n}{suffix}"


 def time_it(func: any):
diff --git a/vectordb_bench/custom/custom_case.json b/vectordb_bench/custom/custom_case.json
index 12ca6597b..697c41791 100644
--- a/vectordb_bench/custom/custom_case.json
+++ b/vectordb_bench/custom/custom_case.json
@@ -1,6 +1,6 @@
 [
     {
-        "name": "My Dataset (Performace Case)",
+        "name": "My Dataset (Performance Case)",
         "description": "this is a customized dataset.",
         "load_timeout": 36000,
         "optimize_timeout": 36000,
diff --git a/vectordb_bench/frontend/components/check_results/filters.py b/vectordb_bench/frontend/components/check_results/filters.py
index 6016c0040..c92ec1690 100644
--- a/vectordb_bench/frontend/components/check_results/filters.py
+++ b/vectordb_bench/frontend/components/check_results/filters.py
@@ -98,7 +98,7 @@ def getShowDbsAndCases(st, result: list[CaseResult], filter_type: FilterOp) -> t
         "Case Filter",
         datasetWithSizeTypes,
         col=1,
-        optionLables=[v.value for v in datasetWithSizeTypes],
+        optionLabels=[v.value for v in datasetWithSizeTypes],
     )
     datasets = [dataset_with_size_type.get_manager() for dataset_with_size_type in showDatasetWithSizeTypes]
     showCaseNames = list(set([case.name for case in allCases if case.dataset in datasets]))
@@ -106,7 +106,7 @@ def getShowDbsAndCases(st, result: list[CaseResult], filter_type: FilterOp) -> t
     return showDBNames, showCaseNames


-def filterView(container, header, options, col, optionLables=None):
+def filterView(container, header, options, col, optionLabels=None):
     selectAllState = f"{header}-select-all-state"
     if selectAllState not in st.session_state:
         st.session_state[selectAllState] = True
@@ -137,14 +137,14 @@ def filterView(container, header, options, col, optionLables=None):
         col,
         gap="small",
     )
-    if optionLables is None:
-        optionLables = options
-    isActive = {option: st.session_state[selectAllState] for option in optionLables}
-    for i, option in enumerate(optionLables):
+    if optionLabels is None:
+        optionLabels = options
+    isActive = {option: st.session_state[selectAllState] for option in optionLabels}
+    for i, option in enumerate(optionLabels):
         isActive[option] = columns[i % col].checkbox(
-            optionLables[i],
+            optionLabels[i],
             value=isActive[option],
-            key=f"{optionLables[i]}-{st.session_state[countKeyState]}",
+            key=f"{optionLabels[i]}-{st.session_state[countKeyState]}",
         )

-    return [options[i] for i, option in enumerate(optionLables) if isActive[option]]
+    return [options[i] for i, option in enumerate(optionLabels) if isActive[option]]
diff --git a/vectordb_bench/frontend/components/check_results/nav.py b/vectordb_bench/frontend/components/check_results/nav.py
index ba4fa99c7..f1bf08d84 100644
--- a/vectordb_bench/frontend/components/check_results/nav.py
+++ b/vectordb_bench/frontend/components/check_results/nav.py
@@ -9,11 +9,11 @@ def NavToRunTest(st):
     switch_page("run test")


-def NavToQuriesPerDollar(st):
+def NavToQueriesPerDollar(st):
st.subheader("Compare qps with price.") - navClick = st.button("QP$ (Quries per Dollar) >") + navClick = st.button("QP$ (Queries per Dollar) >") if navClick: - switch_page("quries_per_dollar") + switch_page("queries_per_dollar") def NavToResults(st, key="nav-to-results"): @@ -27,7 +27,7 @@ def NavToPages(st): {"name": "Run Test", "link": "run_test"}, {"name": "Results", "link": "results"}, {"name": "Qps & Recall", "link": "qps_recall"}, - {"name": "Quries Per Dollar", "link": "quries_per_dollar"}, + {"name": "Queries Per Dollar", "link": "queries_per_dollar"}, {"name": "Concurrent", "link": "concurrent"}, {"name": "Label Filter", "link": "label_filter"}, {"name": "Int Filter", "link": "int_filter"}, diff --git a/vectordb_bench/frontend/components/custom/displayCustomCase.py b/vectordb_bench/frontend/components/custom/displayCustomCase.py index 1aa03f96b..e6ac81887 100644 --- a/vectordb_bench/frontend/components/custom/displayCustomCase.py +++ b/vectordb_bench/frontend/components/custom/displayCustomCase.py @@ -7,7 +7,7 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key): customCase.dataset_config.name = columns[0].text_input( "Name", key=f"{key}_name", value=customCase.dataset_config.name ) - customCase.name = f"{customCase.dataset_config.name} (Performace Case)" + customCase.name = f"{customCase.dataset_config.name} (Performance Case)" customCase.dataset_config.dir = columns[1].text_input( "Folder Path", key=f"{key}_dir", value=customCase.dataset_config.dir ) @@ -59,7 +59,7 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key): default_label_percentages = ",".join(map(str, customCase.dataset_config.with_label_percentages)) label_percentage_input = columns[1].text_input( "label percentages", - key=f"{key}_label_percantages", + key=f"{key}_label_percentages", value=default_label_percentages, ) try: diff --git a/vectordb_bench/frontend/components/custom/getCustomConfig.py b/vectordb_bench/frontend/components/custom/getCustomConfig.py index a1ddfb737..c3eb11f9a 100644 --- a/vectordb_bench/frontend/components/custom/getCustomConfig.py +++ b/vectordb_bench/frontend/components/custom/getCustomConfig.py @@ -27,7 +27,7 @@ class CustomDatasetConfig(BaseModel): class CustomCaseConfig(BaseModel): - name: str = "custom_dataset (Performace Case)" + name: str = "custom_dataset (Performance Case)" description: str = "" load_timeout: int = 36000 optimize_timeout: int = 36000 diff --git a/vectordb_bench/frontend/components/run_test/caseSelector.py b/vectordb_bench/frontend/components/run_test/caseSelector.py index 2e104ce54..f4d7d37a3 100644 --- a/vectordb_bench/frontend/components/run_test/caseSelector.py +++ b/vectordb_bench/frontend/components/run_test/caseSelector.py @@ -6,7 +6,7 @@ UICaseItem, UICaseItemCluster, get_case_config_inputs, - get_custom_case_cluter, + get_custom_case_cluster, get_custom_streaming_case_cluster, ) from vectordb_bench.frontend.config.styles import ( @@ -19,7 +19,7 @@ from vectordb_bench.models import CaseConfig -def caseSelector(st, activedDbList: list[DB]): +def caseSelector(st, activeDbList: list[DB]): st.markdown( "
", unsafe_allow_html=True, @@ -30,32 +30,32 @@ def caseSelector(st, activedDbList: list[DB]): unsafe_allow_html=True, ) - activedCaseList: list[CaseConfig] = [] + activeCaseList: list[CaseConfig] = [] dbToCaseClusterConfigs = defaultdict(lambda: defaultdict(dict)) dbToCaseConfigs = defaultdict(lambda: defaultdict(dict)) - caseClusters = UI_CASE_CLUSTERS + [get_custom_case_cluter(), get_custom_streaming_case_cluster()] + caseClusters = UI_CASE_CLUSTERS + [get_custom_case_cluster(), get_custom_streaming_case_cluster()] for caseCluster in caseClusters: - activedCaseList += caseClusterExpander(st, caseCluster, dbToCaseClusterConfigs, activedDbList) + activeCaseList += caseClusterExpander(st, caseCluster, dbToCaseClusterConfigs, activeDbList) for db in dbToCaseClusterConfigs: for uiCaseItem in dbToCaseClusterConfigs[db]: for case in uiCaseItem.get_cases(): dbToCaseConfigs[db][case] = dbToCaseClusterConfigs[db][uiCaseItem] - return activedCaseList, dbToCaseConfigs + return activeCaseList, dbToCaseConfigs -def caseClusterExpander(st, caseCluster: UICaseItemCluster, dbToCaseClusterConfigs, activedDbList: list[DB]): +def caseClusterExpander(st, caseCluster: UICaseItemCluster, dbToCaseClusterConfigs, activeDbList: list[DB]): expander = st.expander(caseCluster.label, False) - activedCases: list[CaseConfig] = [] + activeCases: list[CaseConfig] = [] for uiCaseItem in caseCluster.uiCaseItems: if uiCaseItem.isLine: addHorizontalLine(expander) else: - activedCases += caseItemCheckbox(expander, dbToCaseClusterConfigs, uiCaseItem, activedDbList) - return activedCases + activeCases += caseItemCheckbox(expander, dbToCaseClusterConfigs, uiCaseItem, activeDbList) + return activeCases -def caseItemCheckbox(st, dbToCaseClusterConfigs, uiCaseItem: UICaseItem, activedDbList: list[DB]): +def caseItemCheckbox(st, dbToCaseClusterConfigs, uiCaseItem: UICaseItem, activeDbList: list[DB]): selected = st.checkbox(uiCaseItem.label) st.markdown( f"