test: Add a new range search test for all indexes and align some index params (#32724)

related issue: #32653

1. align some default index params
2. add new range search tests for all indexes and float vectors

---------

Signed-off-by: yanliang567 <[email protected]>
yanliang567 committed Apr 30, 2024
1 parent c70c21e commit 5bb672d
Showing 11 changed files with 393 additions and 468 deletions.
3 changes: 2 additions & 1 deletion tests/python_client/base/client_base.py
@@ -242,7 +242,8 @@ def init_collection_general(self, prefix="test", insert_data=False, nb=ct.defaul
expected: return collection and raw data, insert ids
"""
log.info("Test case of search interface: initialize before test case")
-self._connect()
+if not self.connection_wrap.has_connection(alias=DefaultConfig.DEFAULT_USING)[0]:
+    self._connect()
collection_name = cf.gen_unique_str(prefix)
if name is not None:
collection_name = name
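The new guard only reconnects when no live connection is registered under the default alias (the test wrapper's has_connection returns a tuple, hence the [0]). A minimal sketch of the same pattern against bare pymilvus, with an assumed alias and address:

from pymilvus import connections

DEFAULT_ALIAS = "default"  # stands in for DefaultConfig.DEFAULT_USING

def ensure_connection(host="localhost", port="19530"):
    # pymilvus' connections.has_connection() returns a plain bool, so
    # dialing is skipped when the alias is already registered.
    if not connections.has_connection(DEFAULT_ALIAS):
        connections.connect(alias=DEFAULT_ALIAS, host=host, port=port)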
37 changes: 24 additions & 13 deletions tests/python_client/common/common_func.py
@@ -359,9 +359,9 @@ def gen_collection_schema_all_datatype(description=ct.default_desc,
else:
multiple_dim_array.insert(0, dim)
for i in range(len(multiple_dim_array)):
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.vector_data_type_all[i%3]}",
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
dim=multiple_dim_array[i],
vector_data_type=ct.vector_data_type_all[i%3]))
vector_data_type=ct.all_float_vector_types[i%3]))

schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
primary_field=primary_field, auto_id=auto_id,
@@ -485,8 +485,8 @@ def gen_default_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, start=0, wi

def gen_general_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=True,
random_primary_key=False, multiple_dim_array=[], multiple_vector_field_name=[],
vector_data_type="FLOAT_VECTOR", auto_id = False,
primary_field = ct.default_int64_field_name):
vector_data_type="FLOAT_VECTOR", auto_id=False,
primary_field=ct.default_int64_field_name):
insert_list = []
if not random_primary_key:
int_values = pd.Series(data=[i for i in range(start, start + nb)])
@@ -496,14 +496,15 @@ def gen_general_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
string_values = pd.Series(data=[str(i) for i in range(start, start + nb)], dtype="string")
json_values = [{"number": i, "float": i*1.0} for i in range(start, start + nb)]
float_vec_values = gen_vectors(nb, dim, vector_data_type=vector_data_type)
-insert_list = [int_values, float_values, string_values, json_values, float_vec_values]
+insert_list = [int_values, float_values, string_values]

+if with_json is True:
+    insert_list.append(json_values)
+insert_list.append(float_vec_values)

-if with_json is False:
-    index = insert_list.index(json_values)
-    del insert_list[index]
if auto_id is True:
    if primary_field == ct.default_int64_field_name:
-        index = insert_list.index(int_values)
+        index = 0
    elif primary_field == ct.default_string_field_name:
        index = 2
    del insert_list[index]
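The rewrite swaps list.index() lookups for fixed positions: json and vector columns are appended after the three scalar columns, so an int64 primary key always sits at position 0 and a varchar primary key at position 2. A runnable toy illustration (column values are placeholders):

# Toy stand-ins for the generated columns; only the positions matter.
int_values, float_values, string_values = [1, 2], [0.1, 0.2], ["a", "b"]
json_values = [{"number": 1}, {"number": 2}]
float_vec_values = [[0.0] * 4, [1.0] * 4]
with_json, auto_id = True, True

insert_list = [int_values, float_values, string_values]  # pk candidates at 0 and 2
if with_json is True:
    insert_list.append(json_values)
insert_list.append(float_vec_values)
if auto_id is True:
    del insert_list[0]  # auto_id collections must not supply the int64 pk column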
@@ -699,7 +700,7 @@ def gen_dataframe_all_data_type(nb=ct.default_nb, dim=ct.default_dim, start=0, w
df[ct.default_float_vec_field_name] = float_vec_values
else:
for i in range(len(multiple_dim_array)):
-df[multiple_vector_field_name[i]] = gen_vectors(nb, multiple_dim_array[i], ct.vector_data_type_all[i%3])
+df[multiple_vector_field_name[i]] = gen_vectors(nb, multiple_dim_array[i], ct.all_float_vector_types[i%3])

if with_json is False:
df.drop(ct.default_json_field_name, axis=1, inplace=True)
@@ -737,7 +738,7 @@ def gen_general_list_all_data_type(nb=ct.default_nb, dim=ct.default_dim, start=0
insert_list.append(float_vec_values)
else:
for i in range(len(multiple_dim_array)):
-insert_list.append(gen_vectors(nb, multiple_dim_array[i], ct.vector_data_type_all[i%3]))
+insert_list.append(gen_vectors(nb, multiple_dim_array[i], ct.all_float_vector_types[i%3]))

if with_json is False:
# index = insert_list.index(json_values)
@@ -782,7 +783,7 @@ def gen_default_rows_data_all_data_type(nb=ct.default_nb, dim=ct.default_dim, st
else:
for i in range(len(multiple_dim_array)):
dict[multiple_vector_field_name[i]] = gen_vectors(nb, multiple_dim_array[i],
-                                                  ct.vector_data_type_all[i])[0]
+                                                  ct.all_float_vector_types[i])[0]
if len(multiple_dim_array) != 0:
with open(ct.rows_all_data_type_file_path + f'_{partition_id}' + f'_dim{dim}.txt', 'wb') as json_file:
pickle.dump(array, json_file)
@@ -1233,7 +1234,7 @@ def gen_simple_index():
elif ct.all_index_types[i] in ct.sparse_support:
continue
dic = {"index_type": ct.all_index_types[i], "metric_type": "L2"}
dic.update({"params": ct.default_index_params[i]})
dic.update({"params": ct.default_all_indexes_params[i]})
index_params.append(dic)
return index_params
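gen_simple_index pairs each supported entry of ct.all_index_types with the matching entry of the renamed ct.default_all_indexes_params list. A short sketch of consuming its output (import path assumed as in this test suite):

from common import common_func as cf

for simple_index in cf.gen_simple_index():
    # Each dict looks like, e.g.:
    # {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
    print(simple_index["index_type"], simple_index["params"])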

@@ -1671,6 +1672,16 @@ def index_to_dict(index):
}


+def get_index_params_params(index_type):
+    """get default params of index params by index type"""
+    return ct.default_all_indexes_params[ct.all_index_types.index(index_type)]
+
+
+def get_search_params_params(index_type):
+    """get default params of search params by index type"""
+    return ct.default_all_search_params_params[ct.all_index_types.index(index_type)]


def assert_json_contains(expr, list_data):
opposite = False
if expr.startswith("not"):
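The two new helpers resolve defaults positionally, so one index-type string yields build-time and search-time params that stay in sync with ct.all_index_types. A usage sketch; the expected values mirror the lists added to common_type.py below:

from common import common_func as cf

index_params = cf.get_index_params_params("HNSW")    # {"M": 32, "efConstruction": 360}
search_params = cf.get_search_params_params("HNSW")  # {"ef": 100}

# Assemble a full index spec from the aligned defaults.
index = {"index_type": "HNSW", "metric_type": "L2", "params": index_params}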
57 changes: 34 additions & 23 deletions tests/python_client/common/common_type.py
@@ -14,14 +14,6 @@
default_limit = 10
default_batch_size = 1000
max_limit = 16384
default_search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
default_search_ip_params = {"metric_type": "IP", "params": {"nprobe": 10}}
default_search_binary_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}}
default_index = {"index_type": "IVF_SQ8", "metric_type": "COSINE", "params": {"nlist": 64}}
default_binary_index = {"index_type": "BIN_IVF_FLAT", "params": {"nlist": 128}, "metric_type": "JACCARD"}
default_diskann_index = {"index_type": "DISKANN", "metric_type": "COSINE", "params": {}}
default_diskann_search_params = {"metric_type": "COSINE", "params": {"search_list": 30}}
default_sparse_search_params = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}}
max_top_k = 16384
max_partition_num = 4096
max_role_num = 10
@@ -52,7 +44,7 @@
float_type = "FLOAT_VECTOR"
float16_type = "FLOAT16_VECTOR"
bfloat16_type = "BFLOAT16_VECTOR"
-vector_data_type_all = [float_type, float16_type, bfloat16_type]
+all_float_vector_types = [float_type, float16_type, bfloat16_type]
default_sparse_vec_field_name = "sparse_vector"
default_partition_name = "_default"
default_resource_group_name = '__default_resource_group'
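When a schema carries several float vector fields, the renamed list is cycled with i % 3 to pick a type per field (see gen_collection_schema_all_datatype above). A self-contained sketch of that naming scheme (the dims are illustrative):

all_float_vector_types = ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"]
dims = [128, 64, 32, 16]
for i, dim in enumerate(dims):
    # Field names cycle through the three float vector types.
    print(f"multiple_vector_{all_float_vector_types[i % 3]}", dim)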
@@ -108,11 +100,6 @@
err_code = "err_code"
err_msg = "err_msg"
in_cluster_env = "IN_CLUSTER"

default_flat_index = {"index_type": "FLAT", "params": {}, "metric_type": "COSINE"}
default_bin_flat_index = {"index_type": "BIN_FLAT", "params": {}, "metric_type": "JACCARD"}
default_sparse_inverted_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP",
"params": {"drop_ratio_build": 0.2}}
default_count_output = "count(*)"

rows_all_data_type_file_path = "/tmp/rows_all_data_type"
@@ -250,26 +237,50 @@
]

""" Specially defined list """
all_index_types = ["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ", "HNSW", "SCANN", "DISKANN", "BIN_FLAT", "BIN_IVF_FLAT",
"SPARSE_INVERTED_INDEX", "SPARSE_WAND", "GPU_IVF_FLAT", "GPU_IVF_PQ"]
L0_index_types = ["IVF_SQ8", "HNSW", "DISKANN"]
all_index_types = ["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ",
"HNSW", "SCANN", "DISKANN",
"BIN_FLAT", "BIN_IVF_FLAT",
"SPARSE_INVERTED_INDEX", "SPARSE_WAND",
"GPU_IVF_FLAT", "GPU_IVF_PQ"]

+default_all_indexes_params = [{}, {"nlist": 128}, {"nlist": 128}, {"nlist": 128, "m": 16, "nbits": 8},
+                              {"M": 32, "efConstruction": 360}, {"nlist": 128}, {},
+                              {}, {"nlist": 64},
+                              {"drop_ratio_build": 0.2}, {"drop_ratio_build": 0.2},
+                              {"nlist": 64}, {"nlist": 64, "m": 16, "nbits": 8}]

default_index_params = [{"nlist": 128}, {"nlist": 128}, {"nlist": 128}, {"nlist": 128, "m": 16, "nbits": 8},
{"M": 48, "efConstruction": 500}, {"nlist": 128}, {}, {"nlist": 128}, {"nlist": 128},
{"drop_ratio_build": 0.2}, {"drop_ratio_build": 0.2},
{"nlist": 64}, {"nlist": 64, "m": 16, "nbits": 8}]
+default_all_search_params_params = [{}, {"nprobe": 32}, {"nprobe": 32}, {"nprobe": 32},
+                                    {"ef": 100}, {"nprobe": 32, "reorder_k": 100}, {"search_list": 30},
+                                    {}, {"nprobe": 32},
+                                    {"drop_ratio_search": "0.2"}, {"drop_ratio_search": "0.2"},
+                                    {}, {}]

Handler_type = ["GRPC", "HTTP"]
binary_support = ["BIN_FLAT", "BIN_IVF_FLAT"]
delete_support = ["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ"]
ivf = ["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ"]
skip_pq = ["IVF_PQ"]
sparse_support = ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"]
default_L0_metric = "COSINE"
float_metrics = ["L2", "IP", "COSINE"]
binary_metrics = ["JACCARD", "HAMMING", "SUBSTRUCTURE", "SUPERSTRUCTURE"]
structure_metrics = ["SUBSTRUCTURE", "SUPERSTRUCTURE"]
all_scalar_data_types = ['int8', 'int16', 'int32', 'int64', 'float', 'double', 'bool', 'varchar']


default_flat_index = {"index_type": "FLAT", "params": {}, "metric_type": default_L0_metric}
default_bin_flat_index = {"index_type": "BIN_FLAT", "params": {}, "metric_type": "JACCARD"}
default_sparse_inverted_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP",
"params": {"drop_ratio_build": 0.2}}

default_search_params = {"params": default_all_search_params_params[2]}
default_search_ip_params = {"metric_type": "IP", "params": default_all_search_params_params[2]}
default_search_binary_params = {"metric_type": "JACCARD", "params": {"nprobe": 32}}
default_index = {"index_type": "IVF_SQ8", "metric_type": default_L0_metric, "params": default_all_indexes_params[2]}
default_binary_index = {"index_type": "BIN_IVF_FLAT", "metric_type": "JACCARD", "params": default_all_indexes_params[8]}
default_diskann_index = {"index_type": "DISKANN", "metric_type": default_L0_metric, "params": {}}
default_diskann_search_params = {"params": {"search_list": 30}}
default_sparse_search_params = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}}


class CheckTasks:
""" The name of the method used to check the result """
check_nothing = "check_nothing"
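The realigned defaults above feed the commit's new range search tests. A minimal, hedged sketch of a pymilvus range search such a test would issue (collection name, dim, and distance bounds are illustrative; for L2 the filter keeps hits with range_filter <= distance < radius):

from pymilvus import Collection, connections

connections.connect(host="localhost", port="19530")  # assumes a local Milvus
coll = Collection("range_search_demo")  # assumes an existing, indexed collection
coll.load()

# Range search constrains results to a distance band instead of plain top-k.
params = {"metric_type": "L2",
          "params": {"nprobe": 32, "radius": 1000.0, "range_filter": 0.0}}
res = coll.search(data=[[0.0] * 128], anns_field="float_vector",
                  param=params, limit=10)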
188 changes: 94 additions & 94 deletions tests/python_client/load/test_workload.py
@@ -1,94 +1,94 @@
-import datetime
-import pytest
-
-from base.client_base import TestcaseBase
-from common import common_func as cf
-from common import common_type as ct
-from common.common_type import CaseLabel
-from utils.util_log import test_log as log
-from pymilvus import utility
-
-
-rounds = 100
-per_nb = 100000
-default_field_name = ct.default_float_vec_field_name
-default_index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}
-
-
-class TestLoad(TestcaseBase):
-    """ Test case of end to end"""
-    @pytest.mark.tags(CaseLabel.L3)
-    def test_load_default(self):
-        name = 'load_test_collection_1'
-        name2 = 'load_test_collection_2'
-        # create
-        # collection_w = self.init_collection_wrap(name=name)
-        # collection_w2 = self.init_collection_wrap(name=name2)
-        # assert collection_w.name == name
-
-        for i in range(50):
-            name = f"load_collection2_{i}"
-            self.init_collection_wrap(name=name)
-        log.debug(f"total collections: {len(utility.list_collections())}")
-
-        # # insert
-        # data = cf.gen_default_list_data(per_nb)
-        # log.debug(f"data len: {len(data[0])}")
-        # for i in range(rounds):
-        # t0 = datetime.datetime.now()
-        # ins_res, res = collection_w.insert(data, timeout=180)
-        # tt = datetime.datetime.now() - t0
-        # log.debug(f"round{i} insert: {len(ins_res.primary_keys)} entities in {tt}s")
-        # assert res # and per_nb == len(ins_res.primary_keys)
-        #
-        # t0 = datetime.datetime.now()
-        # ins_res2, res = collection_w2.insert(data, timeout=180)
-        # tt = datetime.datetime.now() - t0
-        # log.debug(f"round{i} insert2: {len(ins_res2.primary_keys)} entities in {tt}s")
-        # assert res
-        #
-        # # flush
-        # t0 = datetime.datetime.now()
-        # log.debug(f"current collection num_entities: {collection_w.num_entities}")
-        # tt = datetime.datetime.now() - t0
-        # log.debug(f"round{i} flush in {tt}")
-        #
-        # t0 = datetime.datetime.now()
-        # log.debug(f"current collection2 num_entities: {collection_w2.num_entities}")
-        # tt = datetime.datetime.now() - t0
-        # log.debug(f"round{i} flush2 in {tt}")
-
-        # index, res = collection_w.create_index(default_field_name, default_index_params, timeout=60)
-        # assert res
-
-        # # search
-        # collection_w.load()
-        # search_vectors = cf.gen_vectors(1, ct.default_dim)
-        # t0 = datetime.datetime.now()
-        # res_1, _ = collection_w.search(data=search_vectors,
-        # anns_field=ct.default_float_vec_field_name,
-        # param={"nprobe": 16}, limit=1)
-        # tt = datetime.datetime.now() - t0
-        # log.debug(f"assert search: {tt}")
-        # assert len(res_1) == 1
-        # # collection_w.release()
-        #
-        # # index
-        # collection_w.insert(cf.gen_default_dataframe_data(nb=5000))
-        # assert collection_w.num_entities == len(data[0]) + 5000
-        # _index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}
-        # t0 = datetime.datetime.now()
-        # index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
-        # index_params=_index_params,
-        # name=cf.gen_unique_str())
-        # tt = datetime.datetime.now() - t0
-        # log.debug(f"assert index: {tt}")
-        # assert len(collection_w.indexes) == 1
-        #
-        # # query
-        # term_expr = f'{ct.default_int64_field_name} in [3001,4001,4999,2999]'
-        # t0 = datetime.datetime.now()
-        # res, _ = collection_w.query(term_expr)
-        # tt = datetime.datetime.now() - t0
-        # log.debug(f"assert query: {tt}")
-        # assert len(res) == 4
+# import datetime
+# import pytest
+#
+# from base.client_base import TestcaseBase
+# from common import common_func as cf
+# from common import common_type as ct
+# from common.common_type import CaseLabel
+# from utils.util_log import test_log as log
+# from pymilvus import utility
+#
+#
+# rounds = 100
+# per_nb = 100000
+# default_field_name = ct.default_float_vec_field_name
+# default_index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}
+#
+#
+# class TestLoad(TestcaseBase):
+#     """ Test case of end to end"""
+#     @pytest.mark.tags(CaseLabel.L3)
+#     def test_load_default(self):
+#         name = 'load_test_collection_1'
+#         name2 = 'load_test_collection_2'
+#         # create
+#         # collection_w = self.init_collection_wrap(name=name)
+#         # collection_w2 = self.init_collection_wrap(name=name2)
+#         # assert collection_w.name == name
+#
+#         for i in range(50):
+#             name = f"load_collection2_{i}"
+#             self.init_collection_wrap(name=name)
+#         log.debug(f"total collections: {len(utility.list_collections())}")
+#
+#         # # insert
+#         # data = cf.gen_default_list_data(per_nb)
+#         # log.debug(f"data len: {len(data[0])}")
+#         # for i in range(rounds):
+#         # t0 = datetime.datetime.now()
+#         # ins_res, res = collection_w.insert(data, timeout=180)
+#         # tt = datetime.datetime.now() - t0
+#         # log.debug(f"round{i} insert: {len(ins_res.primary_keys)} entities in {tt}s")
+#         # assert res # and per_nb == len(ins_res.primary_keys)
+#         #
+#         # t0 = datetime.datetime.now()
+#         # ins_res2, res = collection_w2.insert(data, timeout=180)
+#         # tt = datetime.datetime.now() - t0
+#         # log.debug(f"round{i} insert2: {len(ins_res2.primary_keys)} entities in {tt}s")
+#         # assert res
+#         #
+#         # # flush
+#         # t0 = datetime.datetime.now()
+#         # log.debug(f"current collection num_entities: {collection_w.num_entities}")
+#         # tt = datetime.datetime.now() - t0
+#         # log.debug(f"round{i} flush in {tt}")
+#         #
+#         # t0 = datetime.datetime.now()
+#         # log.debug(f"current collection2 num_entities: {collection_w2.num_entities}")
+#         # tt = datetime.datetime.now() - t0
+#         # log.debug(f"round{i} flush2 in {tt}")
+#
+#         # index, res = collection_w.create_index(default_field_name, default_all_indexes_params, timeout=60)
+#         # assert res
+#
+#         # # search
+#         # collection_w.load()
+#         # search_vectors = cf.gen_vectors(1, ct.default_dim)
+#         # t0 = datetime.datetime.now()
+#         # res_1, _ = collection_w.search(data=search_vectors,
+#         # anns_field=ct.default_float_vec_field_name,
+#         # param={"nprobe": 16}, limit=1)
+#         # tt = datetime.datetime.now() - t0
+#         # log.debug(f"assert search: {tt}")
+#         # assert len(res_1) == 1
+#         # # collection_w.release()
+#         #
+#         # # index
+#         # collection_w.insert(cf.gen_default_dataframe_data(nb=5000))
+#         # assert collection_w.num_entities == len(data[0]) + 5000
+#         # _index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}
+#         # t0 = datetime.datetime.now()
+#         # index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
+#         # index_params=_index_params,
+#         # name=cf.gen_unique_str())
+#         # tt = datetime.datetime.now() - t0
+#         # log.debug(f"assert index: {tt}")
+#         # assert len(collection_w.indexes) == 1
+#         #
+#         # # query
+#         # term_expr = f'{ct.default_int64_field_name} in [3001,4001,4999,2999]'
+#         # t0 = datetime.datetime.now()
+#         # res, _ = collection_w.query(term_expr)
+#         # tt = datetime.datetime.now() - t0
+#         # log.debug(f"assert query: {tt}")
+#         # assert len(res) == 4
