3 changes: 3 additions & 0 deletions .github/workflows/pull_request.yml
@@ -32,6 +32,9 @@ jobs:
python -m pip install --upgrade pip
pip install -e ".[test]"

+- name: Run typo checks
+  uses: crate-ci/typos@v1.40.0
+
- name: Run coding checks
run: |
make lint
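Pinning the action to a tagged release (`crate-ci/typos@v1.40.0`) keeps CI reproducible rather than tracking the action's latest behavior, and since workflow steps run in order, spelling failures surface before the black/ruff checks below.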
3 changes: 3 additions & 0 deletions Makefile
@@ -8,3 +8,6 @@ format:
lint:
PYTHONPATH=`pwd` python3 -m black vectordb_bench --check
PYTHONPATH=`pwd` python3 -m ruff check vectordb_bench
+
+typos:
+	typos README.md install.py vectordb_bench tests
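The new `typos` target gives contributors the same check locally, scoped to README.md, install.py, and the `vectordb_bench` and `tests` trees. It assumes the `typos` binary is on PATH (installable, for example, with `cargo install typos-cli`); both the target and the CI step pick up the repository's typos.toml automatically.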
4 changes: 2 additions & 2 deletions tests/test_chroma.py
@@ -20,7 +20,7 @@



-dict = {} #Assumes chroma is acception connections on localhost:8000
+dict = {} # Assumes chroma is accepting connections on localhost:8000
dict['name'] = "chroma"
dict['host'] = "localhost"
dict['port'] = 8000
@@ -102,4 +102,4 @@ def test_insert_and_search(self):
break
assert isFilter, f"Filter not working, id_list: {id_list}"



5 changes: 5 additions & 0 deletions typos.toml
@@ -0,0 +1,5 @@
+# Respect these words as correct spelling.
+[default.extend-words]
+"rabit" = "rabit"
+"typ" = "typ"
+"infom" = "infom"
2 changes: 1 addition & 1 deletion vectordb_bench/backend/clients/alisql/alisql.py
@@ -91,7 +91,7 @@ def _create_db_table(self, dim: int):

@contextmanager
def init(self):
"""create and destory connections to database.
"""create and destroy connections to database.

Examples:
>>> with self.init():
@@ -310,7 +310,7 @@ def search_embedding(
return [one_res["id"] for one_res in res["result"]]

def need_normalize_cosine(self) -> bool:
"""Wheather this database need to normalize dataset to support COSINE"""
"""Whether this database need to normalize dataset to support COSINE"""
if self.case_config.metric_type == MetricType.COSINE:
log.info("cosine dataset need normalize.")
return True
6 changes: 3 additions & 3 deletions vectordb_bench/backend/clients/api.py
@@ -97,7 +97,7 @@ def not_empty_field(cls, v: any, field: any):


class DBCaseConfig(ABC):
"""Case specific vector database configs, usually uesed for index params like HNSW"""
"""Case specific vector database configs, usually used for index params like HNSW"""

@abstractmethod
def index_param(self) -> dict:
@@ -176,7 +176,7 @@ def __init__(
@abstractmethod
@contextmanager
def init(self) -> None:
"""create and destory connections to database.
"""create and destroy connections to database.
Why contextmanager:

In multiprocessing search tasks, vectordbbench might init
@@ -192,7 +192,7 @@ def init(self) -> None:
raise NotImplementedError

def need_normalize_cosine(self) -> bool:
"""Wheather this database need to normalize dataset to support COSINE"""
"""Whether this database need to normalize dataset to support COSINE"""
return False

@abstractmethod
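Several clients in this diff share the `need_normalize_cosine` docstring fixed here. The hook's purpose: backends that only support IP or L2 ask the benchmark to L2-normalize the dataset, because cosine similarity on raw vectors equals inner product on unit vectors. Below is a minimal sketch of that normalization, assuming NumPy arrays; `normalize_rows` is an illustrative name, not a helper from this codebase:

```python
import numpy as np

def normalize_rows(embeddings: np.ndarray) -> np.ndarray:
    """L2-normalize each row so that cosine(a, b) == dot(a_hat, b_hat)."""
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave zero vectors untouched instead of dividing by zero
    return embeddings / norms
```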
2 changes: 1 addition & 1 deletion vectordb_bench/backend/clients/chroma/chroma.py
@@ -40,7 +40,7 @@ def __init__(

@contextmanager
def init(self) -> None:
"""create and destory connections to database.
"""create and destroy connections to database.

Examples:
>>> with self.init():
4 changes: 2 additions & 2 deletions vectordb_bench/backend/clients/clickhouse/clickhouse.py
@@ -91,7 +91,7 @@ def _drop_table(self):
log.warning(f"Failed to drop table {self.db_config['database']}.{self.table_name}: {e}")
raise e from None

-def _perfomance_tuning(self):
+def _performance_tuning(self):
self.conn.command("SET materialize_skip_indexes_on_insert = 1")

def _create_index(self):
@@ -120,7 +120,7 @@ def _create_index(self):
"""
self.conn.command(cmd=query)
else:
log.warning("HNSW is only avaliable method in clickhouse now")
log.warning("HNSW is only available method in clickhouse now")
except Exception as e:
log.warning(f"Failed to create Clickhouse vector index on table: {self.table_name} error: {e}")
raise e from None
2 changes: 1 addition & 1 deletion vectordb_bench/backend/clients/doris/doris.py
@@ -234,7 +234,7 @@ def optimize(self, data_size: int | None = None) -> None:
log.info("Optimization completed using doris-vector-search library")

def need_normalize_cosine(self) -> bool:
"""Wheather this database need to normalize dataset to support COSINE"""
"""Whether this database need to normalize dataset to support COSINE"""
if self.case_config.metric_type == MetricType.COSINE:
log.info("cosine dataset need normalize.")
return True
2 changes: 1 addition & 1 deletion vectordb_bench/backend/clients/mariadb/mariadb.py
@@ -89,7 +89,7 @@ def _create_db_table(self, dim: int):

@contextmanager
def init(self):
"""create and destory connections to database.
"""create and destroy connections to database.

Examples:
>>> with self.init():
2 changes: 1 addition & 1 deletion vectordb_bench/backend/clients/memorydb/memorydb.py
@@ -148,7 +148,7 @@ def get_client(self, **kwargs):

@contextmanager
def init(self) -> Generator[None, None, None]:
"""create and destory connections to database.
"""create and destroy connections to database.

Examples:
>>> with self.init():
6 changes: 3 additions & 3 deletions vectordb_bench/backend/clients/milvus/milvus.py
@@ -135,7 +135,7 @@ def init(self):
self.col: Collection | None = None

connections.connect(**self.db_config, timeout=60)
-# Grab the existing colection with connections
+# Grab the existing collection with connections
self.col = Collection(self.collection_name)

yield
@@ -174,7 +174,7 @@ def wait_index():
try:
self.col.compact()
self.col.wait_for_compaction_completed()
log.info("compactation completed. waiting for the rest of index buliding.")
log.info("compactation completed. waiting for the rest of index building.")
except Exception as e:
log.warning(f"{self.name} compact error: {e}")
if hasattr(e, "code"):
@@ -192,7 +192,7 @@ def optimize(self, data_size: int | None = None):
self._optimize()

def need_normalize_cosine(self) -> bool:
"""Wheather this database need to normalize dataset to support COSINE"""
"""Whether this database need to normalize dataset to support COSINE"""
if self.case_config.is_gpu_index:
log.info("current gpu_index only supports IP / L2, cosine dataset need normalize.")
return True
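One nit survives the fix: `compactation` on the same log line is left as-is, presumably because typos only flags words in its dictionary of known misspellings (`buliding` is there; `compactation` apparently is not).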
6 changes: 3 additions & 3 deletions vectordb_bench/backend/clients/pinecone/pinecone.py
@@ -76,7 +76,7 @@ def insert_embeddings(
try:
for batch_start_offset in range(0, len(embeddings), self.batch_size):
batch_end_offset = min(batch_start_offset + self.batch_size, len(embeddings))
-insert_datas = []
+insert_batch = []
for i in range(batch_start_offset, batch_end_offset):
metadata_dict = {self._scalar_id_field: metadata[i]}
if self.with_scalar_labels:
@@ -86,8 +86,8 @@
embeddings[i],
metadata_dict,
)
-insert_datas.append(insert_data)
-self.index.upsert(insert_datas)
+insert_batch.append(insert_data)
+self.index.upsert(insert_batch)
insert_count += batch_end_offset - batch_start_offset
except Exception as e:
return insert_count, e
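The rename from `insert_datas` to `insert_batch` also reads better against the Pinecone client's upsert contract, which accepts a list of `(id, values, metadata)` tuples per call; each batch here is exactly one such list.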
2 changes: 1 addition & 1 deletion vectordb_bench/backend/clients/redis/redis.py
@@ -76,7 +76,7 @@ def make_index(self, vector_dimensions: int, conn: redis.Redis):

@contextmanager
def init(self) -> None:
"""create and destory connections to database.
"""create and destroy connections to database.

Examples:
>>> with self.init():
2 changes: 1 addition & 1 deletion vectordb_bench/backend/clients/s3_vectors/s3_vectors.py
@@ -95,7 +95,7 @@ def optimize(self, **kwargs):
return

def need_normalize_cosine(self) -> bool:
"""Wheather this database need to normalize dataset to support COSINE"""
"""Whether this database need to normalize dataset to support COSINE"""
return False

def insert_embeddings(
4 changes: 2 additions & 2 deletions vectordb_bench/backend/clients/vespa/vespa.py
@@ -40,7 +40,7 @@ def __init__(

@contextmanager
def init(self) -> Generator[None, None, None]:
"""create and destory connections to database.
"""create and destroy connections to database.
Why contextmanager:

In multiprocessing search tasks, vectordbbench might init
@@ -58,7 +58,7 @@ def init(self) -> Generator[None, None, None]:
self.client = None

def need_normalize_cosine(self) -> bool:
"""Wheather this database need to normalize dataset to support COSINE"""
"""Whether this database need to normalize dataset to support COSINE"""
return False

def insert_embeddings(
2 changes: 1 addition & 1 deletion vectordb_bench/backend/runner/mp_runner.py
@@ -32,7 +32,7 @@ class MultiProcessingSearchRunner:
Args:
k(int): search topk, default to 100
concurrency(Iterable): concurrencies, default [1, 5, 10, 15, 20, 25, 30, 35]
-duration(int): duration for each concurency, default to 30s
+duration(int): duration for each concurrency, default to 30s
"""

def __init__(
6 changes: 3 additions & 3 deletions vectordb_bench/backend/runner/read_write_runner.py
@@ -88,7 +88,7 @@ def __init__(

@time_it
def run_optimize(self):
"""Optimize needs to run in differenct process for pymilvus schema recursion problem"""
"""Optimize needs to run in different process for pymilvus schema recursion problem"""
with self.db.init():
log.info("Search after write - Optimize start")
self.db.optimize(data_size=self.data_volume)
@@ -104,7 +104,7 @@ def run_search(self, perc: int):
f"p99={p99_latency}, p95={p95_latency}, dur={ssearch_dur:.4f}",
)
log.info(
f"Search after wirte - Conc search start, dur for each conc={self.read_dur_after_write}",
f"Search after write - Conc search start, dur for each conc={self.read_dur_after_write}",
)
result = self.run_by_dur(self.read_dur_after_write)
max_qps = result[0]
@@ -114,7 +114,7 @@
conc_latency_p99_list = result[4]
conc_latency_p95_list = result[5]
conc_latency_avg_list = result[6]
log.info(f"Search after wirte - Conc search finished, max_qps={max_qps}")
log.info(f"Search after write - Conc search finished, max_qps={max_qps}")

return [
(
14 changes: 7 additions & 7 deletions vectordb_bench/backend/utils.py
@@ -11,27 +11,27 @@ def numerize(n: int) -> str:
>>> numerize(1_000_000_000)
'1B'
"""
-sufix2upbound = {
+suffix2upbound = {
"EMPTY": 1e3,
"K": 1e6,
"M": 1e9,
"B": 1e12,
"END": float("inf"),
}

-display_n, sufix = n, ""
-for s, base in sufix2upbound.items():
-    # number >= 1000B will alway have sufix 'B'
+display_n, suffix = n, ""
+for s, base in suffix2upbound.items():
+    # number >= 1000B will always have suffix 'B'
if s == "END":
display_n = int(n / 1e9)
sufix = "B"
suffix = "B"
break

if n < base:
sufix = "" if s == "EMPTY" else s
suffix = "" if s == "EMPTY" else s
display_n = int(n / (base / 1e3))
break
return f"{display_n}{sufix}"
return f"{display_n}{suffix}"


def time_it(func: any):
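The `sufix` → `suffix` rename is mechanical, but it touches every branch of `numerize`, so a few doctest-style checks are a cheap guard that behavior is unchanged. A sketch, assuming the package is importable; the sample values are illustrative:

```python
from vectordb_bench.backend.utils import numerize

assert numerize(999) == "999"                   # below 1K: no suffix
assert numerize(1_000) == "1K"
assert numerize(2_500_000) == "2M"              # int() truncates toward the base unit
assert numerize(1_000_000_000) == "1B"
assert numerize(5_000_000_000_000) == "5000B"   # >= 1000B always keeps suffix 'B'
```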
2 changes: 1 addition & 1 deletion vectordb_bench/custom/custom_case.json
@@ -1,6 +1,6 @@
[
{
"name": "My Dataset (Performace Case)",
"name": "My Dataset (Performance Case)",
"description": "this is a customized dataset.",
"load_timeout": 36000,
"optimize_timeout": 36000,
18 changes: 9 additions & 9 deletions vectordb_bench/frontend/components/check_results/filters.py
@@ -98,15 +98,15 @@ def getShowDbsAndCases(st, result: list[CaseResult], filter_type: FilterOp) -> t
"Case Filter",
datasetWithSizeTypes,
col=1,
-optionLables=[v.value for v in datasetWithSizeTypes],
+optionLabels=[v.value for v in datasetWithSizeTypes],
)
datasets = [dataset_with_size_type.get_manager() for dataset_with_size_type in showDatasetWithSizeTypes]
showCaseNames = list(set([case.name for case in allCases if case.dataset in datasets]))

return showDBNames, showCaseNames


-def filterView(container, header, options, col, optionLables=None):
+def filterView(container, header, options, col, optionLabels=None):
selectAllState = f"{header}-select-all-state"
if selectAllState not in st.session_state:
st.session_state[selectAllState] = True
@@ -137,14 +137,14 @@ def filterView(container, header, options, col, optionLables=None):
col,
gap="small",
)
-if optionLables is None:
-    optionLables = options
-isActive = {option: st.session_state[selectAllState] for option in optionLables}
-for i, option in enumerate(optionLables):
+if optionLabels is None:
+    optionLabels = options
+isActive = {option: st.session_state[selectAllState] for option in optionLabels}
+for i, option in enumerate(optionLabels):
isActive[option] = columns[i % col].checkbox(
-optionLables[i],
+optionLabels[i],
value=isActive[option],
key=f"{optionLables[i]}-{st.session_state[countKeyState]}",
key=f"{optionLabels[i]}-{st.session_state[countKeyState]}",
)

-return [options[i] for i, option in enumerate(optionLables) if isActive[option]]
+return [options[i] for i, option in enumerate(optionLabels) if isActive[option]]
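The rename also clarifies the closing line's contract: the comprehension returns entries of `options` by position while testing the checkbox state under the corresponding label, so `options` and `optionLabels` must stay index-aligned.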
8 changes: 4 additions & 4 deletions vectordb_bench/frontend/components/check_results/nav.py
@@ -9,11 +9,11 @@ def NavToRunTest(st):
switch_page("run test")


-def NavToQuriesPerDollar(st):
+def NavToQueriesPerDollar(st):
st.subheader("Compare qps with price.")
navClick = st.button("QP$ (Quries per Dollar) &nbsp;&nbsp;>")
navClick = st.button("QP$ (Queries per Dollar) &nbsp;&nbsp;>")
if navClick:
switch_page("quries_per_dollar")
switch_page("queries_per_dollar")


def NavToResults(st, key="nav-to-results"):
@@ -27,7 +27,7 @@ def NavToPages(st):
{"name": "Run Test", "link": "run_test"},
{"name": "Results", "link": "results"},
{"name": "Qps & Recall", "link": "qps_recall"},
{"name": "Quries Per Dollar", "link": "quries_per_dollar"},
{"name": "Queries Per Dollar", "link": "queries_per_dollar"},
{"name": "Concurrent", "link": "concurrent"},
{"name": "Label Filter", "link": "label_filter"},
{"name": "Int Filter", "link": "int_filter"},
@@ -7,7 +7,7 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
customCase.dataset_config.name = columns[0].text_input(
"Name", key=f"{key}_name", value=customCase.dataset_config.name
)
customCase.name = f"{customCase.dataset_config.name} (Performace Case)"
customCase.name = f"{customCase.dataset_config.name} (Performance Case)"
customCase.dataset_config.dir = columns[1].text_input(
"Folder Path", key=f"{key}_dir", value=customCase.dataset_config.dir
)
@@ -59,7 +59,7 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
default_label_percentages = ",".join(map(str, customCase.dataset_config.with_label_percentages))
label_percentage_input = columns[1].text_input(
"label percentages",
key=f"{key}_label_percantages",
key=f"{key}_label_percentages",
value=default_label_percentages,
)
try:
@@ -27,7 +27,7 @@ class CustomDatasetConfig(BaseModel):


class CustomCaseConfig(BaseModel):
name: str = "custom_dataset (Performace Case)"
name: str = "custom_dataset (Performance Case)"
description: str = ""
load_timeout: int = 36000
optimize_timeout: int = 36000
Expand Down