mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-01 00:15:30 +08:00
issue: https://github.com/milvus-io/milvus/issues/27704 Add inverted index for some data types in Milvus. This index type can save a lot of memory compared to loading all data into RAM and speed up the term query and range query. Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL` and `VARCHAR`. Not supported: `ARRAY` and `JSON`. Note: - The inverted index for `VARCHAR` is not yet designed to serve full-text search. We will treat every row as a whole keyword instead of tokenizing it into multiple terms. - The inverted index doesn't support retrieval well, so if you create an inverted index for a field, operations that depend on the raw data will fall back to chunk storage, which will bring some performance loss. Examples are comparisons between two columns and retrieval of output fields. The inverted index is very easy to use. Take the collection below as an example: ```python fields = [ FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100), FieldSchema(name="int8", dtype=DataType.INT8), FieldSchema(name="int16", dtype=DataType.INT16), FieldSchema(name="int32", dtype=DataType.INT32), FieldSchema(name="int64", dtype=DataType.INT64), FieldSchema(name="float", dtype=DataType.FLOAT), FieldSchema(name="double", dtype=DataType.DOUBLE), FieldSchema(name="bool", dtype=DataType.BOOL), FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000), FieldSchema(name="random", dtype=DataType.DOUBLE), FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim), ] schema = CollectionSchema(fields) collection = Collection("demo", schema) ``` Then we can simply create an inverted index for each field via: ```python index_type = "INVERTED" collection.create_index("int8", {"index_type": index_type}) collection.create_index("int16", {"index_type": index_type}) collection.create_index("int32", {"index_type": index_type}) collection.create_index("int64", {"index_type": index_type}) collection.create_index("float", 
{"index_type": index_type}) collection.create_index("double", {"index_type": index_type}) collection.create_index("bool", {"index_type": index_type}) collection.create_index("varchar", {"index_type": index_type}) ``` Then, term queries and range queries on the field can be sped up automatically by the inverted index: ```python result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"]) result = collection.query(expr='int64 < 5', output_fields=["pk"]) result = collection.query(expr='int64 > 2997', output_fields=["pk"]) result = collection.query(expr='1 < int64 < 5', output_fields=["pk"]) ``` --------- Signed-off-by: longjiquan <jiquan.long@zilliz.com>
331 lines
8.0 KiB
Go
331 lines
8.0 KiB
Go
package indexcgowrapper
|
|
|
|
import (
|
|
"math/rand"
|
|
"os"
|
|
"strconv"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/internal/proto/indexpb"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"github.com/milvus-io/milvus/pkg/common"
|
|
"github.com/milvus-io/milvus/pkg/util/funcutil"
|
|
"github.com/milvus-io/milvus/pkg/util/metric"
|
|
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
|
)
|
|
|
|
func TestMain(m *testing.M) {
|
|
paramtable.Init()
|
|
exitCode := m.Run()
|
|
os.Exit(exitCode)
|
|
}
|
|
|
|
type indexTestCase struct {
|
|
dtype schemapb.DataType
|
|
typeParams map[string]string
|
|
indexParams map[string]string
|
|
}
|
|
|
|
// generateBoolArray returns numRows pseudo-random booleans; each element is
// true when the drawn rand.Int() value is even.
func generateBoolArray(numRows int) []bool {
	values := make([]bool, numRows)
	for idx := range values {
		values[idx] = rand.Int()%2 == 0
	}
	return values
}
|
|
|
|
// generateInt8Array returns numRows pseudo-random int8 values, each obtained
// by truncating a rand.Int() draw to 8 bits.
func generateInt8Array(numRows int) []int8 {
	values := make([]int8, numRows)
	for idx := range values {
		values[idx] = int8(rand.Int())
	}
	return values
}
|
|
|
|
// generateInt16Array returns numRows pseudo-random int16 values, each obtained
// by truncating a rand.Int() draw to 16 bits.
func generateInt16Array(numRows int) []int16 {
	values := make([]int16, numRows)
	for idx := range values {
		values[idx] = int16(rand.Int())
	}
	return values
}
|
|
|
|
// generateInt32Array returns numRows pseudo-random int32 values, each obtained
// by truncating a rand.Int() draw to 32 bits.
func generateInt32Array(numRows int) []int32 {
	values := make([]int32, numRows)
	for idx := range values {
		values[idx] = int32(rand.Int())
	}
	return values
}
|
|
|
|
// generateInt64Array returns numRows pseudo-random int64 values converted from
// rand.Int() draws, so every element is non-negative.
func generateInt64Array(numRows int) []int64 {
	values := make([]int64, numRows)
	for idx := range values {
		values[idx] = int64(rand.Int())
	}
	return values
}
|
|
|
|
// generateFloat32Array returns numRows pseudo-random float32 values drawn with
// rand.Float32(), i.e. in the half-open interval [0.0, 1.0).
func generateFloat32Array(numRows int) []float32 {
	values := make([]float32, numRows)
	for idx := range values {
		values[idx] = rand.Float32()
	}
	return values
}
|
|
|
|
// generateFloat64Array returns numRows pseudo-random float64 values drawn with
// rand.Float64(), i.e. in the half-open interval [0.0, 1.0).
func generateFloat64Array(numRows int) []float64 {
	values := make([]float64, numRows)
	for idx := range values {
		values[idx] = rand.Float64()
	}
	return values
}
|
|
|
|
func generateStringArray(numRows int) []string {
|
|
ret := make([]string, 0, numRows)
|
|
for i := 0; i < numRows; i++ {
|
|
ret = append(ret, funcutil.GenRandomStr())
|
|
}
|
|
return ret
|
|
}
|
|
|
|
// generateFloatVectors returns a flat slice of numRows*dim pseudo-random
// float32 values drawn with rand.Float32().
func generateFloatVectors(numRows, dim int) []float32 {
	flat := make([]float32, numRows*dim)
	for idx := range flat {
		flat[idx] = rand.Float32()
	}
	return flat
}
|
|
|
|
// generateBinaryVectors returns (numRows*dim)/8 pseudo-random bytes, using
// integer division for the byte count. It panics if rand.Read reports an
// error.
func generateBinaryVectors(numRows, dim int) []byte {
	buf := make([]byte, (numRows*dim)/8)
	if _, err := rand.Read(buf); err != nil {
		panic(err)
	}
	return buf
}
|
|
|
|
func genFieldData(dtype schemapb.DataType, numRows, dim int) storage.FieldData {
|
|
switch dtype {
|
|
case schemapb.DataType_Bool:
|
|
return &storage.BoolFieldData{
|
|
Data: generateBoolArray(numRows),
|
|
}
|
|
case schemapb.DataType_Int8:
|
|
return &storage.Int8FieldData{
|
|
Data: generateInt8Array(numRows),
|
|
}
|
|
case schemapb.DataType_Int16:
|
|
return &storage.Int16FieldData{
|
|
Data: generateInt16Array(numRows),
|
|
}
|
|
case schemapb.DataType_Int32:
|
|
return &storage.Int32FieldData{
|
|
Data: generateInt32Array(numRows),
|
|
}
|
|
case schemapb.DataType_Int64:
|
|
return &storage.Int64FieldData{
|
|
Data: generateInt64Array(numRows),
|
|
}
|
|
case schemapb.DataType_Float:
|
|
return &storage.FloatFieldData{
|
|
Data: generateFloat32Array(numRows),
|
|
}
|
|
case schemapb.DataType_Double:
|
|
return &storage.DoubleFieldData{
|
|
Data: generateFloat64Array(numRows),
|
|
}
|
|
case schemapb.DataType_String:
|
|
return &storage.StringFieldData{
|
|
Data: generateStringArray(numRows),
|
|
}
|
|
case schemapb.DataType_VarChar:
|
|
return &storage.StringFieldData{
|
|
Data: generateStringArray(numRows),
|
|
}
|
|
case schemapb.DataType_BinaryVector:
|
|
return &storage.BinaryVectorFieldData{
|
|
Dim: dim,
|
|
Data: generateBinaryVectors(numRows, dim),
|
|
}
|
|
case schemapb.DataType_FloatVector:
|
|
return &storage.FloatVectorFieldData{
|
|
Data: generateFloatVectors(numRows, dim),
|
|
Dim: dim,
|
|
}
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func genScalarIndexCases(dtype schemapb.DataType) []indexTestCase {
|
|
return []indexTestCase{
|
|
{
|
|
dtype: dtype,
|
|
typeParams: nil,
|
|
indexParams: map[string]string{
|
|
common.IndexTypeKey: "sort",
|
|
},
|
|
},
|
|
{
|
|
dtype: dtype,
|
|
typeParams: nil,
|
|
indexParams: map[string]string{
|
|
common.IndexTypeKey: "flat",
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func genStringIndexCases(dtype schemapb.DataType) []indexTestCase {
|
|
return []indexTestCase{
|
|
{
|
|
dtype: dtype,
|
|
typeParams: nil,
|
|
indexParams: map[string]string{
|
|
common.IndexTypeKey: "sort",
|
|
},
|
|
},
|
|
{
|
|
dtype: dtype,
|
|
typeParams: nil,
|
|
indexParams: map[string]string{
|
|
common.IndexTypeKey: "marisa-trie",
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func genFloatVecIndexCases(dtype schemapb.DataType) []indexTestCase {
|
|
return []indexTestCase{
|
|
{
|
|
dtype: dtype,
|
|
typeParams: nil,
|
|
indexParams: map[string]string{
|
|
common.IndexTypeKey: IndexFaissIVFPQ,
|
|
common.MetricTypeKey: metric.L2,
|
|
common.DimKey: strconv.Itoa(dim),
|
|
"nlist": strconv.Itoa(nlist),
|
|
"m": strconv.Itoa(m),
|
|
"nbits": strconv.Itoa(nbits),
|
|
},
|
|
},
|
|
{
|
|
dtype: dtype,
|
|
typeParams: nil,
|
|
indexParams: map[string]string{
|
|
common.IndexTypeKey: IndexFaissIVFFlat,
|
|
common.MetricTypeKey: metric.L2,
|
|
common.DimKey: strconv.Itoa(dim),
|
|
"nlist": strconv.Itoa(nlist),
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func genBinaryVecIndexCases(dtype schemapb.DataType) []indexTestCase {
|
|
return []indexTestCase{
|
|
{
|
|
dtype: dtype,
|
|
typeParams: nil,
|
|
indexParams: map[string]string{
|
|
common.IndexTypeKey: IndexFaissBinIVFFlat,
|
|
common.MetricTypeKey: metric.JACCARD,
|
|
common.DimKey: strconv.Itoa(dim),
|
|
"nlist": strconv.Itoa(nlist),
|
|
"nbits": strconv.Itoa(nbits),
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func genTypedIndexCase(dtype schemapb.DataType) []indexTestCase {
|
|
switch dtype {
|
|
case schemapb.DataType_Bool:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_Int8:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_Int16:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_Int32:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_Int64:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_Float:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_Double:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_String:
|
|
return genScalarIndexCases(dtype)
|
|
case schemapb.DataType_VarChar:
|
|
return genStringIndexCases(dtype)
|
|
case schemapb.DataType_BinaryVector:
|
|
return genBinaryVecIndexCases(dtype)
|
|
case schemapb.DataType_FloatVector:
|
|
return genFloatVecIndexCases(dtype)
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func genIndexCase() []indexTestCase {
|
|
dtypes := []schemapb.DataType{
|
|
schemapb.DataType_Bool,
|
|
schemapb.DataType_Int8,
|
|
schemapb.DataType_Int16,
|
|
schemapb.DataType_Int32,
|
|
schemapb.DataType_Int64,
|
|
schemapb.DataType_Float,
|
|
schemapb.DataType_Double,
|
|
schemapb.DataType_String,
|
|
schemapb.DataType_VarChar,
|
|
schemapb.DataType_BinaryVector,
|
|
schemapb.DataType_FloatVector,
|
|
}
|
|
var ret []indexTestCase
|
|
for _, dtype := range dtypes {
|
|
ret = append(ret, genTypedIndexCase(dtype)...)
|
|
}
|
|
return ret
|
|
}
|
|
|
|
func genStorageConfig() *indexpb.StorageConfig {
|
|
params := paramtable.Get()
|
|
|
|
return &indexpb.StorageConfig{
|
|
Address: params.MinioCfg.Address.GetValue(),
|
|
AccessKeyID: params.MinioCfg.AccessKeyID.GetValue(),
|
|
SecretAccessKey: params.MinioCfg.SecretAccessKey.GetValue(),
|
|
BucketName: params.MinioCfg.BucketName.GetValue(),
|
|
RootPath: params.MinioCfg.RootPath.GetValue(),
|
|
IAMEndpoint: params.MinioCfg.IAMEndpoint.GetValue(),
|
|
UseSSL: params.MinioCfg.UseSSL.GetAsBool(),
|
|
UseIAM: params.MinioCfg.UseIAM.GetAsBool(),
|
|
}
|
|
}
|
|
|
|
func TestCgoIndex(t *testing.T) {
|
|
for _, testCase := range genIndexCase() {
|
|
index, err := NewCgoIndex(testCase.dtype, testCase.typeParams, testCase.indexParams)
|
|
assert.NoError(t, err, testCase)
|
|
|
|
dataset := GenDataset(genFieldData(testCase.dtype, nb, dim))
|
|
assert.NoError(t, index.Build(dataset), testCase)
|
|
|
|
blobs, err := index.Serialize()
|
|
assert.NoError(t, err, testCase)
|
|
|
|
copyIndex, err := NewCgoIndex(testCase.dtype, testCase.typeParams, testCase.indexParams)
|
|
assert.NoError(t, err, testCase)
|
|
|
|
assert.NoError(t, copyIndex.Load(blobs), testCase)
|
|
}
|
|
}
|