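"""Utility helpers for milvus_benchmark runners: source-data paths and loading,
vector generation, metric/type mapping, and recall computation."""
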
import os
import logging
import numpy as np
import sklearn.preprocessing
import h5py
import random
from itertools import product
from pymilvus import DataType
from milvus_benchmark import config

logger = logging.getLogger("milvus_benchmark.runners.utils")

DELETE_INTERVAL_TIME = 2
VECTORS_PER_FILE = 1000000
SIFT_VECTORS_PER_FILE = 100000
BINARY_VECTORS_PER_FILE = 2000000
MAX_NQ = 10001
FILE_PREFIX = "binary_"
WARM_TOP_K = 1
WARM_NQ = 1
DEFAULT_DIM = 512
DEFAULT_METRIC_TYPE = "L2"
RANDOM_SRC_DATA_DIR = config.RAW_DATA_DIR + 'random/'
SIFT_SRC_DATA_DIR = config.RAW_DATA_DIR + 'sift1b/'
DEEP_SRC_DATA_DIR = config.RAW_DATA_DIR + 'deep1b/'
JACCARD_SRC_DATA_DIR = config.RAW_DATA_DIR + 'jaccard/'
HAMMING_SRC_DATA_DIR = config.RAW_DATA_DIR + 'hamming/'
STRUCTURE_SRC_DATA_DIR = config.RAW_DATA_DIR + 'structure/'
BINARY_SRC_DATA_DIR = config.RAW_DATA_DIR + 'binary/'
SIFT_SRC_GROUNDTRUTH_DATA_DIR = SIFT_SRC_DATA_DIR + 'gnd'
DEFAULT_F_FIELD_NAME = 'float_vector'
DEFAULT_B_FIELD_NAME = 'binary_vector'
DEFAULT_INT_FIELD_NAME = 'int64'
DEFAULT_FLOAT_FIELD_NAME = 'float'
DEFAULT_DOUBLE_FIELD_NAME = "double"
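
# sift1b ground-truth index files, keyed by collection size (number of base vectors).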
GROUNDTRUTH_MAP = {
"1000000": "idx_1M.ivecs",
"2000000": "idx_2M.ivecs",
"5000000": "idx_5M.ivecs",
"10000000": "idx_10M.ivecs",
"20000000": "idx_20M.ivecs",
"50000000": "idx_50M.ivecs",
"100000000": "idx_100M.ivecs",
"200000000": "idx_200M.ivecs",
"500000000": "idx_500M.ivecs",
"1000000000": "idx_1000M.ivecs",
}
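
# Canonical Milvus metric names, keyed by the lowercase aliases used in benchmark configs.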
METRIC_MAP = {
"l2": "L2",
"ip": "IP",
"jaccard": "JACCARD",
"hamming": "HAMMING",
"sub": "SUBSTRUCTURE",
"super": "SUPERSTRUCTURE"
}


def get_len_vectors_per_file(data_type, dimension):
    """Return how many vectors each source data file holds for the given dataset."""
    if data_type == "random":
        if dimension == 512:
            vectors_per_file = VECTORS_PER_FILE
        elif dimension == 4096:
            vectors_per_file = 100000
        elif dimension == 16384:
            vectors_per_file = 10000
        else:
            raise Exception("dimension: %d not supported for random data" % dimension)
elif data_type == "sift":
vectors_per_file = SIFT_VECTORS_PER_FILE
elif data_type in ["binary"]:
vectors_per_file = BINARY_VECTORS_PER_FILE
elif data_type == "local":
vectors_per_file = SIFT_VECTORS_PER_FILE
else:
raise Exception("data_type: %s not supported" % data_type)
return vectors_per_file


def get_vectors_from_binary(nq, dimension, data_type):
    """Load the first nq query vectors for the given dataset type."""
    # Only the first query file is used, so nq must not exceed MAX_NQ.
    if nq > MAX_NQ:
        raise Exception("nq: %d is larger than MAX_NQ: %d" % (nq, MAX_NQ))
if data_type == "local":
return generate_vectors(nq, dimension)
elif data_type == "random":
file_name = RANDOM_SRC_DATA_DIR + 'query_%d.npy' % dimension
elif data_type == "sift":
file_name = SIFT_SRC_DATA_DIR + 'query.npy'
elif data_type == "deep":
file_name = DEEP_SRC_DATA_DIR + 'query.npy'
    elif data_type == "binary":
        file_name = BINARY_SRC_DATA_DIR + 'query.npy'
    else:
        raise Exception("data_type: %s not supported" % data_type)
data = np.load(file_name)
vectors = data[0:nq].tolist()
return vectors


def generate_vectors(nb, dim):
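    """Generate nb random float vectors of dimension dim, values in [0, 1)."""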
return [[random.random() for _ in range(dim)] for _ in range(nb)]


def generate_values(data_type, vectors, ids):
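    """Pick column values for a field: ids for int fields, ids cast to float
    for float fields, the vectors for vector fields; None for unhandled types."""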
values = None
if data_type in [DataType.INT32, DataType.INT64]:
values = ids
elif data_type in [DataType.FLOAT, DataType.DOUBLE]:
values = [(i + 0.0) for i in ids]
elif data_type in [DataType.FLOAT_VECTOR, DataType.BINARY_VECTOR]:
values = vectors
return values


def generate_entities(info, vectors, ids=None):
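    """Build the insert payload: one {name, type, values} dict per field in the schema info."""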
entities = []
for field in info["fields"]:
field_type = field["type"]
entities.append(
{"name": field["name"], "type": field_type, "values": generate_values(field_type, vectors, ids)})
return entities


def metric_type_trans(metric_type):
    """Translate a lowercase metric alias to the canonical Milvus metric name."""
    if metric_type in METRIC_MAP:
        return METRIC_MAP[metric_type]
    else:
        raise Exception("metric_type: %s not in METRIC_MAP" % metric_type)


def get_dataset(hdf5_file_path):
    """Open an HDF5 dataset file read-only."""
    if not os.path.exists(hdf5_file_path):
        raise Exception("%s does not exist" % hdf5_file_path)
    dataset = h5py.File(hdf5_file_path, "r")
    return dataset


def get_default_field_name(data_type=DataType.FLOAT_VECTOR):
    """Return the default schema field name for the given DataType."""
    if data_type == DataType.FLOAT_VECTOR:
        field_name = DEFAULT_F_FIELD_NAME
    elif data_type == DataType.BINARY_VECTOR:
        field_name = DEFAULT_B_FIELD_NAME
    elif data_type == DataType.INT64:
        field_name = DEFAULT_INT_FIELD_NAME
    elif data_type == DataType.FLOAT:
        field_name = DEFAULT_FLOAT_FIELD_NAME
    elif data_type == DataType.DOUBLE:
        field_name = DEFAULT_DOUBLE_FIELD_NAME
    else:
        logger.error(data_type)
        raise Exception("Not supported data type")
    return field_name


def get_vector_type(data_type):
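    """Map a dataset type name to the vector DataType it stores."""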
vector_type = ''
if data_type in ["random", "sift", "deep", "glove", "local"]:
vector_type = DataType.FLOAT_VECTOR
elif data_type in ["binary"]:
vector_type = DataType.BINARY_VECTOR
else:
raise Exception("Data type: %s not defined" % data_type)
return vector_type


def get_vector_type_from_metric(metric_type):
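    """Choose a vector DataType for a metric: hamming/jaccard imply
    BINARY_VECTOR, anything else FLOAT_VECTOR."""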
vector_type = ''
if metric_type in ["hamming", "jaccard"]:
vector_type = DataType.BINARY_VECTOR
else:
vector_type = DataType.FLOAT_VECTOR
return vector_type


def normalize(metric_type, X):
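    """Preprocess raw vectors for a metric type: l2-normalize for ip, cast to
    float32 for l2, pack bits into bytes for the binary metrics."""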
if metric_type == "ip":
logger.info("Set normalize for metric_type: %s" % metric_type)
X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
X = X.astype(np.float32)
elif metric_type == "l2":
X = X.astype(np.float32)
    elif metric_type in ["jaccard", "hamming", "sub", "super"]:
        # Binary metrics expect packed bytes, 8 bits per byte.
        X = [np.packbits(item, axis=-1).tobytes() for item in X]
return X


def generate_combinations(args):
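    """Expand list/dict args into the cross product of all parameter values.

    For example, {"nlist": [1024, 2048], "m": 16} yields
    [{"nlist": 1024, "m": 16}, {"nlist": 2048, "m": 16}].
    """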
if isinstance(args, list):
args = [el if isinstance(el, list) else [el] for el in args]
return [list(x) for x in product(*args)]
elif isinstance(args, dict):
flat = []
for k, v in args.items():
if isinstance(v, list):
flat.append([(k, el) for el in v])
else:
flat.append([(k, v)])
return [dict(x) for x in product(*flat)]
else:
raise TypeError("No args handling exists for %s" % type(args).__name__)


def gen_file_name(idx, dimension, data_type):
    """Return the path of the idx-th source data file for the given dataset."""
    s = "%05d" % idx
    fname = FILE_PREFIX + str(dimension) + "d_" + s + ".npy"
    if data_type == "random":
        fname = RANDOM_SRC_DATA_DIR + fname
    elif data_type == "sift":
        fname = SIFT_SRC_DATA_DIR + fname
    elif data_type == "deep":
        fname = DEEP_SRC_DATA_DIR + fname
    elif data_type == "jaccard":
        fname = JACCARD_SRC_DATA_DIR + fname
    elif data_type == "hamming":
        fname = HAMMING_SRC_DATA_DIR + fname
    elif data_type in ["sub", "super"]:
        fname = STRUCTURE_SRC_DATA_DIR + fname
    else:
        raise Exception("data_type: %s not supported" % data_type)
    return fname


def get_recall_value(true_ids, result_ids):
    """Compute average recall: the mean intersection ratio between the
    ground-truth neighbors and the ids returned for each query.

    true_ids: neighbors taken from the dataset
    result_ids: ids returned by the query
    """
    sum_ratio = 0.0
    for index, item in enumerate(result_ids):
        # Intersect the returned ids with the ground-truth ids for this query.
        tmp = set(true_ids[index]).intersection(set(item))
        sum_ratio += len(tmp) / len(item)
    # Average over all queries, rounded to three decimal places.
    return round(sum_ratio / len(result_ids), 3)


def get_ground_truth_ids(collection_size):
    """Load the sift1b ground-truth neighbor ids for the given collection size."""
    fname = GROUNDTRUTH_MAP[str(collection_size)]
    fname = SIFT_SRC_GROUNDTRUTH_DATA_DIR + "/" + fname
    # .ivecs layout: each row is one int32 count d followed by d int32 ids.
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    true_ids = a.reshape(-1, d + 1)[:, 1:].copy()
    return true_ids
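

if __name__ == "__main__":
    # A minimal, hypothetical smoke test (not part of the benchmark runners):
    # exercises the pure-Python helpers above with toy inputs.
    queries = generate_vectors(2, 8)
    assert len(queries) == 2 and len(queries[0]) == 8
    combos = generate_combinations({"index_type": ["IVF_FLAT", "HNSW"], "nlist": 1024})
    print(combos)
    # Recall is 1.0 when the returned ids match the ground truth exactly.
    print(get_recall_value([[1, 2, 3]], [[1, 2, 3]]))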