mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
enhance: enable STL_SORT to support VARCHAR (#44401)
issue: https://github.com/milvus-io/milvus/issues/44399 This PR implements STL_SORT for the VARCHAR data type in both RAM and MMAP modes. The general idea is that we deduplicate field values and maintain a posting list for each unique value. The serialization format of the index is: ``` [unique_count][string_offsets][string_data][post_list_offsets][post_list_data][magic_code] string_offsets: array of offsets into string_data section string_data: str_len1, str1, str_len2, str2, ... post_list_offsets: array of offsets into post_list_data section post_list_data: post_list_len1, row_id1, row_id2, ..., post_list_len2, row_id1, row_id2, ... ``` --------- Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
parent
cfeb095ad7
commit
6077178553
@ -33,6 +33,7 @@
|
||||
|
||||
#include "index/VectorDiskIndex.h"
|
||||
#include "index/ScalarIndexSort.h"
|
||||
#include "index/StringIndexSort.h"
|
||||
#include "index/StringIndexMarisa.h"
|
||||
#include "index/BoolIndex.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
@ -90,7 +91,7 @@ IndexFactory::CreatePrimitiveScalarIndex<std::string>(
|
||||
return std::make_unique<HybridScalarIndex<std::string>>(
|
||||
create_index_info.tantivy_index_version, file_manager_context);
|
||||
}
|
||||
return CreateStringIndexMarisa(file_manager_context);
|
||||
return CreateStringIndexSort(file_manager_context);
|
||||
#else
|
||||
ThrowInfo(Unsupported, "unsupported platform");
|
||||
#endif
|
||||
|
||||
@ -54,7 +54,28 @@ StringIndexMarisa::StringIndexMarisa(
|
||||
|
||||
int64_t
|
||||
StringIndexMarisa::Size() {
|
||||
return trie_.size();
|
||||
return total_size_;
|
||||
}
|
||||
|
||||
int64_t
|
||||
StringIndexMarisa::CalculateTotalSize() const {
|
||||
int64_t size = 0;
|
||||
|
||||
// Size of the trie structure
|
||||
// marisa trie uses io_size() to get the serialized size
|
||||
// which approximates the memory usage
|
||||
size += trie_.io_size();
|
||||
|
||||
// Size of str_ids_ vector (main data structure)
|
||||
size += str_ids_.size() * sizeof(int64_t);
|
||||
|
||||
// Size of str_ids_to_offsets_ map data
|
||||
for (const auto& [key, vec] : str_ids_to_offsets_) {
|
||||
size += sizeof(size_t); // key
|
||||
size += vec.size() * sizeof(size_t); // vector data
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
bool
|
||||
@ -113,6 +134,7 @@ StringIndexMarisa::BuildWithFieldData(
|
||||
fill_offsets();
|
||||
|
||||
built_ = true;
|
||||
total_size_ = CalculateTotalSize();
|
||||
}
|
||||
|
||||
void
|
||||
@ -138,6 +160,7 @@ StringIndexMarisa::Build(size_t n,
|
||||
fill_offsets();
|
||||
|
||||
built_ = true;
|
||||
total_size_ = CalculateTotalSize();
|
||||
}
|
||||
|
||||
BinarySet
|
||||
@ -222,6 +245,8 @@ StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set,
|
||||
memcpy(str_ids_.data(), str_ids->data.get(), str_ids_len);
|
||||
|
||||
fill_offsets();
|
||||
built_ = true;
|
||||
total_size_ = CalculateTotalSize();
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@ -127,6 +127,9 @@ class StringIndexMarisa : public StringIndex {
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
int64_t
|
||||
CalculateTotalSize() const;
|
||||
|
||||
private:
|
||||
Config config_;
|
||||
marisa::Trie trie_;
|
||||
@ -134,6 +137,7 @@ class StringIndexMarisa : public StringIndex {
|
||||
std::map<size_t, std::vector<size_t>> str_ids_to_offsets_;
|
||||
bool built_ = false;
|
||||
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
|
||||
int64_t total_size_ = 0; // Cached total size to avoid runtime calculation
|
||||
};
|
||||
|
||||
using StringIndexMarisaPtr = std::unique_ptr<StringIndexMarisa>;
|
||||
|
||||
1130
internal/core/src/index/StringIndexSort.cpp
Normal file
1130
internal/core/src/index/StringIndexSort.cpp
Normal file
File diff suppressed because it is too large
Load Diff
431
internal/core/src/index/StringIndexSort.h
Normal file
431
internal/core/src/index/StringIndexSort.h
Normal file
@ -0,0 +1,431 @@
|
||||
// Licensed to the LF AI & Data foundation under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <cstring>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
#include <folly/small_vector.h>
|
||||
|
||||
#include "index/StringIndex.h"
|
||||
#include "storage/MemFileManagerImpl.h"
|
||||
#include "storage/DiskFileManagerImpl.h"
|
||||
#include "storage/FileWriter.h"
|
||||
#include "common/File.h"
|
||||
|
||||
namespace milvus::index {
|
||||
|
||||
// Forward declaration
|
||||
class StringIndexSortImpl;
|
||||
|
||||
// Main StringIndexSort class using pImpl pattern
|
||||
class StringIndexSort : public StringIndex {
|
||||
public:
|
||||
static constexpr uint32_t SERIALIZATION_VERSION = 1;
|
||||
static constexpr uint64_t MAGIC_CODE =
|
||||
0x5354524E47534F52; // "STRNGSOR" in hex
|
||||
|
||||
explicit StringIndexSort(
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
storage::FileManagerContext());
|
||||
|
||||
virtual ~StringIndexSort();
|
||||
|
||||
int64_t
|
||||
Count() override;
|
||||
|
||||
ScalarIndexType
|
||||
GetIndexType() const override {
|
||||
return ScalarIndexType::STLSORT;
|
||||
}
|
||||
|
||||
const bool
|
||||
HasRawData() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
Build(size_t n,
|
||||
const std::string* values,
|
||||
const bool* valid_data = nullptr) override;
|
||||
|
||||
void
|
||||
Build(const Config& config = {}) override;
|
||||
|
||||
void
|
||||
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
|
||||
|
||||
// See detailed format in StringIndexSortMemoryImpl::SerializeToBinary
|
||||
BinarySet
|
||||
Serialize(const Config& config) override;
|
||||
|
||||
IndexStatsPtr
|
||||
Upload(const Config& config = {}) override;
|
||||
|
||||
void
|
||||
Load(const BinarySet& index_binary, const Config& config = {}) override;
|
||||
|
||||
void
|
||||
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) override;
|
||||
|
||||
void
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
// Query methods - delegated to impl
|
||||
const TargetBitmap
|
||||
In(size_t n, const std::string* values) override;
|
||||
|
||||
const TargetBitmap
|
||||
NotIn(size_t n, const std::string* values) override;
|
||||
|
||||
const TargetBitmap
|
||||
IsNull() override;
|
||||
|
||||
TargetBitmap
|
||||
IsNotNull() override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(std::string value, OpType op) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(std::string lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
std::string upper_bound_value,
|
||||
bool ub_inclusive) override;
|
||||
|
||||
const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix) override;
|
||||
|
||||
std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset) const override;
|
||||
|
||||
int64_t
|
||||
Size() override;
|
||||
|
||||
protected:
|
||||
int64_t
|
||||
CalculateTotalSize() const;
|
||||
|
||||
// Common fields
|
||||
int64_t field_id_ = 0;
|
||||
bool is_built_ = false;
|
||||
Config config_;
|
||||
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
|
||||
size_t total_num_rows_{0};
|
||||
TargetBitmap valid_bitset_;
|
||||
std::vector<int32_t> idx_to_offsets_;
|
||||
std::chrono::time_point<std::chrono::system_clock> index_build_begin_;
|
||||
|
||||
int64_t total_size_{0};
|
||||
std::unique_ptr<StringIndexSortImpl> impl_;
|
||||
};
|
||||
|
||||
// Abstract interface for implementations
|
||||
class StringIndexSortImpl {
|
||||
public:
|
||||
virtual ~StringIndexSortImpl() = default;
|
||||
|
||||
virtual void
|
||||
LoadFromBinary(const BinarySet& binary_set,
|
||||
size_t total_num_rows,
|
||||
TargetBitmap& valid_bitset,
|
||||
std::vector<int32_t>& idx_to_offsets) = 0;
|
||||
|
||||
struct ParsedData {
|
||||
uint32_t unique_count;
|
||||
const uint32_t* string_offsets;
|
||||
const uint8_t* string_data_start;
|
||||
const uint32_t* post_list_offsets;
|
||||
const uint8_t* post_list_data_start;
|
||||
};
|
||||
|
||||
static ParsedData
|
||||
ParseBinaryData(const uint8_t* data, size_t data_size);
|
||||
|
||||
virtual const TargetBitmap
|
||||
In(size_t n, const std::string* values, size_t total_num_rows) = 0;
|
||||
|
||||
virtual const TargetBitmap
|
||||
NotIn(size_t n,
|
||||
const std::string* values,
|
||||
size_t total_num_rows,
|
||||
const TargetBitmap& valid_bitset) = 0;
|
||||
|
||||
virtual const TargetBitmap
|
||||
IsNull(size_t total_num_rows, const TargetBitmap& valid_bitset) = 0;
|
||||
|
||||
virtual TargetBitmap
|
||||
IsNotNull(const TargetBitmap& valid_bitset) = 0;
|
||||
|
||||
virtual const TargetBitmap
|
||||
Range(std::string value, OpType op, size_t total_num_rows) = 0;
|
||||
|
||||
virtual const TargetBitmap
|
||||
Range(std::string lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
std::string upper_bound_value,
|
||||
bool ub_inclusive,
|
||||
size_t total_num_rows) = 0;
|
||||
|
||||
virtual const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix, size_t total_num_rows) = 0;
|
||||
|
||||
virtual std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset,
|
||||
size_t total_num_rows,
|
||||
const TargetBitmap& valid_bitset,
|
||||
const std::vector<int32_t>& idx_to_offsets) const = 0;
|
||||
|
||||
virtual int64_t
|
||||
Size() = 0;
|
||||
};
|
||||
|
||||
class StringIndexSortMemoryImpl : public StringIndexSortImpl {
|
||||
public:
|
||||
using PostingList = folly::small_vector<uint32_t, 4>;
|
||||
|
||||
void
|
||||
BuildFromRawData(size_t n,
|
||||
const std::string* values,
|
||||
const bool* valid_data,
|
||||
TargetBitmap& valid_bitset,
|
||||
std::vector<int32_t>& idx_to_offsets);
|
||||
|
||||
void
|
||||
BuildFromFieldData(const std::vector<FieldDataPtr>& field_datas,
|
||||
size_t total_num_rows,
|
||||
TargetBitmap& valid_bitset,
|
||||
std::vector<int32_t>& idx_to_offsets);
|
||||
|
||||
// Serialize to binary format
|
||||
// The binary format is : [unique_count][string_offsets][string_data][post_list_offsets][post_list_data][magic_code]
|
||||
// string_offsets: array of offsets into string_data section
|
||||
// string_data: str_len1, str1, str_len2, str2, ...
|
||||
// post_list_offsets: array of offsets into post_list_data section
|
||||
// post_list_data: post_list_len1, row_id1, row_id2, ..., post_list_len2, row_id1, row_id2, ...
|
||||
void
|
||||
SerializeToBinary(uint8_t* ptr, size_t& offset) const;
|
||||
|
||||
size_t
|
||||
GetSerializedSize() const;
|
||||
|
||||
void
|
||||
LoadFromBinary(const BinarySet& binary_set,
|
||||
size_t total_num_rows,
|
||||
TargetBitmap& valid_bitset,
|
||||
std::vector<int32_t>& idx_to_offsets) override;
|
||||
|
||||
const TargetBitmap
|
||||
In(size_t n, const std::string* values, size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
NotIn(size_t n,
|
||||
const std::string* values,
|
||||
size_t total_num_rows,
|
||||
const TargetBitmap& valid_bitset) override;
|
||||
|
||||
const TargetBitmap
|
||||
IsNull(size_t total_num_rows, const TargetBitmap& valid_bitset) override;
|
||||
|
||||
TargetBitmap
|
||||
IsNotNull(const TargetBitmap& valid_bitset) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(std::string value, OpType op, size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(std::string lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
std::string upper_bound_value,
|
||||
bool ub_inclusive,
|
||||
size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;
|
||||
|
||||
std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset,
|
||||
size_t total_num_rows,
|
||||
const TargetBitmap& valid_bitset,
|
||||
const std::vector<int32_t>& idx_to_offsets) const override;
|
||||
|
||||
int64_t
|
||||
Size() override;
|
||||
|
||||
private:
|
||||
// Helper method for binary search
|
||||
size_t
|
||||
FindValueIndex(const std::string& value) const;
|
||||
|
||||
void
|
||||
BuildFromMap(std::map<std::string, PostingList>&& unique_map,
|
||||
size_t total_num_rows,
|
||||
std::vector<int32_t>& idx_to_offsets);
|
||||
|
||||
// Keep unique_values_ and posting_lists_ separated for cache efficiency
|
||||
// Sorted unique values
|
||||
std::vector<std::string> unique_values_;
|
||||
// Corresponding posting lists
|
||||
std::vector<PostingList> posting_lists_;
|
||||
};
|
||||
|
||||
class StringIndexSortMmapImpl : public StringIndexSortImpl {
|
||||
public:
|
||||
~StringIndexSortMmapImpl();
|
||||
|
||||
// Helper struct to access separated string and posting list data
|
||||
struct MmapEntry {
|
||||
const char* str_data_ptr; // Pointer to string data
|
||||
const uint32_t* post_list_data_ptr; // Pointer to posting list data
|
||||
uint32_t str_len; // String length
|
||||
uint32_t post_list_len; // Posting list length
|
||||
|
||||
MmapEntry() = default;
|
||||
|
||||
MmapEntry(const uint8_t* str_ptr, const uint8_t* post_list_ptr) {
|
||||
// Read string length and data pointer
|
||||
str_len = *reinterpret_cast<const uint32_t*>(str_ptr);
|
||||
str_data_ptr =
|
||||
reinterpret_cast<const char*>(str_ptr + sizeof(uint32_t));
|
||||
|
||||
// Read posting list length and data pointer
|
||||
post_list_len = *reinterpret_cast<const uint32_t*>(post_list_ptr);
|
||||
post_list_data_ptr = reinterpret_cast<const uint32_t*>(
|
||||
post_list_ptr + sizeof(uint32_t));
|
||||
}
|
||||
|
||||
std::string_view
|
||||
get_string_view() const {
|
||||
return std::string_view(str_data_ptr, str_len);
|
||||
}
|
||||
|
||||
size_t
|
||||
get_posting_list_len() const {
|
||||
return post_list_len;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
get_row_id(size_t idx) const {
|
||||
return post_list_data_ptr[idx];
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void
|
||||
for_each_row_id(Func func) const {
|
||||
for (uint32_t i = 0; i < post_list_len; ++i) {
|
||||
func(post_list_data_ptr[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void
|
||||
LoadFromBinary(const BinarySet& binary_set,
|
||||
size_t total_num_rows,
|
||||
TargetBitmap& valid_bitset,
|
||||
std::vector<int32_t>& idx_to_offsets) override;
|
||||
|
||||
void
|
||||
SetMmapFilePath(const std::string& filepath) {
|
||||
mmap_filepath_ = filepath;
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
In(size_t n, const std::string* values, size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
NotIn(size_t n,
|
||||
const std::string* values,
|
||||
size_t total_num_rows,
|
||||
const TargetBitmap& valid_bitset) override;
|
||||
|
||||
const TargetBitmap
|
||||
IsNull(size_t total_num_rows, const TargetBitmap& valid_bitset) override;
|
||||
|
||||
TargetBitmap
|
||||
IsNotNull(const TargetBitmap& valid_bitset) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(std::string value, OpType op, size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(std::string lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
std::string upper_bound_value,
|
||||
bool ub_inclusive,
|
||||
size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;
|
||||
|
||||
std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset,
|
||||
size_t total_num_rows,
|
||||
const TargetBitmap& valid_bitset,
|
||||
const std::vector<int32_t>& idx_to_offsets) const override;
|
||||
|
||||
int64_t
|
||||
Size() override;
|
||||
|
||||
private:
|
||||
// Binary search for a value
|
||||
size_t
|
||||
FindValueIndex(const std::string& value) const;
|
||||
|
||||
// Binary search helpers
|
||||
size_t
|
||||
LowerBound(const std::string_view& value) const;
|
||||
|
||||
size_t
|
||||
UpperBound(const std::string_view& value) const;
|
||||
|
||||
MmapEntry
|
||||
GetEntry(size_t idx) const {
|
||||
const uint8_t* str_ptr = string_data_start_ + string_offsets_[idx];
|
||||
const uint8_t* post_list_ptr =
|
||||
post_list_data_start_ + post_list_offsets_[idx];
|
||||
return MmapEntry(str_ptr, post_list_ptr);
|
||||
}
|
||||
|
||||
private:
|
||||
char* mmap_data_ = nullptr;
|
||||
size_t mmap_size_ = 0;
|
||||
std::string mmap_filepath_;
|
||||
size_t unique_count_ = 0;
|
||||
|
||||
// Pointers to different sections in mmap'd data
|
||||
const uint32_t* string_offsets_ = nullptr;
|
||||
const uint8_t* string_data_start_ = nullptr;
|
||||
const uint32_t* post_list_offsets_ = nullptr;
|
||||
const uint8_t* post_list_data_start_ = nullptr;
|
||||
};
|
||||
|
||||
using StringIndexSortPtr = std::unique_ptr<StringIndexSort>;
|
||||
|
||||
// Factory helper: constructs a StringIndexSort from `file_manager_context`
// and returns it wrapped in a StringIndexSortPtr.
inline StringIndexSortPtr
CreateStringIndexSort(const storage::FileManagerContext& file_manager_context =
                          storage::FileManagerContext()) {
    StringIndexSortPtr index =
        std::make_unique<StringIndexSort>(file_manager_context);
    return index;
}
|
||||
|
||||
} // namespace milvus::index
|
||||
607
internal/core/src/index/StringIndexSortTest.cpp
Normal file
607
internal/core/src/index/StringIndexSortTest.cpp
Normal file
@ -0,0 +1,607 @@
|
||||
#include <gtest/gtest.h>
#include <boost/filesystem.hpp>

#include <cstdio>
#include <memory>
#include <string>
#include <vector>

#include "index/StringIndexSort.h"
#include "index/IndexFactory.h"
#include "test_utils/indexbuilder_test_utils.h"
|
||||
|
||||
constexpr int64_t nb = 100;
|
||||
|
||||
namespace milvus {
|
||||
namespace index {
|
||||
class StringIndexBaseTest : public ::testing::Test {
|
||||
protected:
|
||||
void
|
||||
SetUp() override {
|
||||
strs = GenStrArr(nb);
|
||||
*str_arr.mutable_data() = {strs.begin(), strs.end()};
|
||||
}
|
||||
|
||||
protected:
|
||||
std::vector<std::string> strs;
|
||||
schemapb::StringArray str_arr;
|
||||
};
|
||||
|
||||
// Fixture used by the StringIndexSort tests; it adds nothing on top of
// StringIndexBaseTest.
class StringIndexSortTest : public StringIndexBaseTest {};
|
||||
|
||||
// The factory must hand back a non-null index for a default context.
TEST_F(StringIndexSortTest, ConstructorMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    ASSERT_NE(index, nullptr);
}
|
||||
|
||||
// Construction only takes a FileManagerContext, so there is no mmap setting
// to pass at this point (elsewhere in this file the mmap path is supplied via
// the Load config). The previously unused Config local has been dropped.
TEST_F(StringIndexSortTest, ConstructorMmap) {
    auto index = milvus::index::CreateStringIndexSort({});
    ASSERT_NE(index, nullptr);
}
|
||||
|
||||
// After building from the fixture strings, Count() must report every row.
TEST_F(StringIndexSortTest, BuildMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());
    ASSERT_EQ(index->Count(), nb);
}
|
||||
|
||||
// Builds in memory, then reloads the serialized index with an mmap file path
// in the Load config (same pattern as MmapLoadAfterSerialize) and checks the
// row count. The earlier version filled a Config that was never passed on.
TEST_F(StringIndexSortTest, BuildMmap) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());
    ASSERT_EQ(index->Count(), nb);

    Config build_config;
    auto binary_set = index->Serialize(build_config);

    const std::string mmap_path = "/tmp/test_string_index_sort_build_mmap.idx";
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);
    ASSERT_EQ(mmap_index->Count(), nb);

    // Clean up the temp file
    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// In() over the in-memory index: full set first, then a small subset.
TEST_F(StringIndexSortTest, InMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(nb, strs.data());

    // Querying with every indexed string must select every row.
    auto bitset = index->In(strs.size(), strs.data());
    ASSERT_EQ(bitset.size(), strs.size());
    ASSERT_EQ(bitset.count(), strs.size());

    // Querying with three of the strings must select exactly their rows.
    // NOTE(review): the count of 3 assumes these values occur once each in
    // the GenStrArr output - confirm.
    std::vector<std::string> subset = {strs[0], strs[10], strs[20]};
    auto subset_bitset = index->In(subset.size(), subset.data());
    ASSERT_EQ(subset_bitset.size(), strs.size());
    ASSERT_EQ(subset_bitset.count(), 3);
    ASSERT_TRUE(subset_bitset[0]);
    ASSERT_TRUE(subset_bitset[10]);
    ASSERT_TRUE(subset_bitset[20]);
}
|
||||
|
||||
// In() on an index reloaded with an mmap file path in the Load config (same
// pattern as MmapLoadAfterSerialize). The earlier version filled a Config
// that was never passed on, so it repeated the in-memory test.
TEST_F(StringIndexSortTest, InMmap) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(nb, strs.data());

    Config build_config;
    auto binary_set = index->Serialize(build_config);

    const std::string mmap_path = "/tmp/test_string_index_sort_in_mmap.idx";
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    auto bitset = mmap_index->In(strs.size(), strs.data());
    ASSERT_EQ(bitset.size(), strs.size());
    ASSERT_EQ(bitset.count(), strs.size());

    // Clean up the temp file
    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// NotIn() over the in-memory index.
TEST_F(StringIndexSortTest, NotInMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(nb, strs.data());

    // Excluding every indexed string leaves no row selected.
    auto bitset = index->NotIn(strs.size(), strs.data());
    ASSERT_EQ(bitset.size(), strs.size());
    ASSERT_EQ(bitset.count(), 0);

    // Excluding strings that are not in the index selects every row.
    std::vector<std::string> non_existing = {"non_existing_1",
                                             "non_existing_2"};
    auto non_existing_bitset =
        index->NotIn(non_existing.size(), non_existing.data());
    ASSERT_EQ(non_existing_bitset.size(), strs.size());
    ASSERT_EQ(non_existing_bitset.count(), strs.size());
}
|
||||
|
||||
// Single-bound Range() with each comparison operator, on the letters a..j.
TEST_F(StringIndexSortTest, RangeMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    // Sorted single-letter input keeps the expected counts easy to verify.
    std::vector<std::string> sorted_strs = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j"};
    index->Build(sorted_strs.size(), sorted_strs.data());

    // Test LessThan
    auto bitset = index->Range("d", OpType::LessThan);
    ASSERT_EQ(bitset.count(), 3);  // a, b, c

    // Test LessEqual
    auto bitset2 = index->Range("d", OpType::LessEqual);
    ASSERT_EQ(bitset2.count(), 4);  // a, b, c, d

    // Test GreaterThan
    auto bitset3 = index->Range("g", OpType::GreaterThan);
    ASSERT_EQ(bitset3.count(), 3);  // h, i, j

    // Test GreaterEqual
    auto bitset4 = index->Range("g", OpType::GreaterEqual);
    ASSERT_EQ(bitset4.count(), 4);  // g, h, i, j
}
|
||||
|
||||
// Two-bound Range() with inclusive, exclusive and mixed ends, on a..j.
TEST_F(StringIndexSortTest, RangeBetweenMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    std::vector<std::string> sorted_strs = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j"};
    index->Build(sorted_strs.size(), sorted_strs.data());

    // Test inclusive range
    auto bitset = index->Range("c", true, "g", true);
    ASSERT_EQ(bitset.count(), 5);  // c, d, e, f, g

    // Test exclusive range
    auto bitset2 = index->Range("c", false, "g", false);
    ASSERT_EQ(bitset2.count(), 3);  // d, e, f

    // Test mixed
    auto bitset3 = index->Range("c", true, "g", false);
    ASSERT_EQ(bitset3.count(), 4);  // c, d, e, f
}
|
||||
|
||||
// PrefixMatch() over the in-memory index, including a prefix with no match.
TEST_F(StringIndexSortTest, PrefixMatchMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    std::vector<std::string> test_strs = {
        "apple", "application", "apply", "banana", "band", "cat"};
    index->Build(test_strs.size(), test_strs.data());

    auto bitset = index->PrefixMatch("app");
    ASSERT_EQ(bitset.count(), 3);  // apple, application, apply

    auto bitset2 = index->PrefixMatch("ban");
    ASSERT_EQ(bitset2.count(), 2);  // banana, band

    auto bitset3 = index->PrefixMatch("cat");
    ASSERT_EQ(bitset3.count(), 1);  // cat

    auto bitset4 = index->PrefixMatch("dog");
    ASSERT_EQ(bitset4.count(), 0);  // none
}
|
||||
|
||||
// PrefixMatch() on an index reloaded with an mmap file path in the Load
// config (same pattern as MmapLoadAfterSerialize). The earlier version filled
// a Config that was never passed on, so it repeated the in-memory test.
TEST_F(StringIndexSortTest, PrefixMatchMmap) {
    auto index = milvus::index::CreateStringIndexSort({});

    std::vector<std::string> test_strs = {
        "apple", "application", "apply", "banana", "band", "cat"};
    index->Build(test_strs.size(), test_strs.data());

    Config build_config;
    auto binary_set = index->Serialize(build_config);

    const std::string mmap_path =
        "/tmp/test_string_index_sort_prefix_mmap.idx";
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    auto bitset = mmap_index->PrefixMatch("app");
    ASSERT_EQ(bitset.count(), 3);  // apple, application, apply

    // Clean up the temp file
    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// Reverse_Lookup() must return the original string for each row and an
// empty optional for an offset beyond the data.
TEST_F(StringIndexSortTest, ReverseLookupMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());

    for (size_t i = 0; i < strs.size(); ++i) {
        auto result = index->Reverse_Lookup(i);
        ASSERT_TRUE(result.has_value());
        ASSERT_EQ(result.value(), strs[i]);
    }

    // Test invalid offset
    auto result = index->Reverse_Lookup(strs.size() + 100);
    ASSERT_FALSE(result.has_value());
}
|
||||
|
||||
// Reverse_Lookup() on an index reloaded with an mmap file path in the Load
// config (same pattern as MmapLoadAfterSerialize). The earlier version filled
// a Config that was never passed on, so it repeated the in-memory test.
TEST_F(StringIndexSortTest, ReverseLookupMmap) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());

    Config build_config;
    auto binary_set = index->Serialize(build_config);

    const std::string mmap_path =
        "/tmp/test_string_index_sort_reverse_lookup_mmap.idx";
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    for (size_t i = 0; i < strs.size(); ++i) {
        auto result = mmap_index->Reverse_Lookup(i);
        ASSERT_TRUE(result.has_value());
        ASSERT_EQ(result.value(), strs[i]);
    }

    // Clean up the temp file
    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// Round-trips an index through Serialize()/Load() and checks that the row
// count, In() and Reverse_Lookup() still agree with the input strings.
TEST_F(StringIndexSortTest, SerializeDeserializeMemory) {
    Config config;
    auto source_index = milvus::index::CreateStringIndexSort({});
    source_index->Build(strs.size(), strs.data());

    // Serialize, then load the result into a brand-new index.
    auto serialized = source_index->Serialize(config);
    auto restored = milvus::index::CreateStringIndexSort({});
    restored->Load(serialized);

    // Row count must match the input.
    ASSERT_EQ(restored->Count(), strs.size());

    // Every input string must still be found.
    auto hits = restored->In(strs.size(), strs.data());
    ASSERT_EQ(hits.count(), strs.size());

    // Each row must map back to its original string.
    size_t row = 0;
    for (const auto& expected : strs) {
        auto looked_up = restored->Reverse_Lookup(row);
        ASSERT_TRUE(looked_up.has_value());
        ASSERT_EQ(looked_up.value(), expected);
        ++row;
    }
}
|
||||
|
||||
// Serializes an in-memory index and reloads it with an mmap file path in the
// Load config (same pattern as MmapLoadAfterSerialize). The earlier version
// put the path under a literal key in the Serialize config and called Load
// without any config, so the reload matched the in-memory test.
TEST_F(StringIndexSortTest, SerializeDeserializeMmap) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());

    // Serialize
    Config build_config;
    auto binary_set = index->Serialize(build_config);

    // Create new index and load it with the mmap path configured
    const std::string mmap_path =
        "/tmp/test_string_index_sort_serde_mmap.idx";
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;

    auto new_index = milvus::index::CreateStringIndexSort({});
    new_index->Load(binary_set, mmap_config);

    // Verify data integrity
    ASSERT_EQ(new_index->Count(), strs.size());

    auto bitset = new_index->In(strs.size(), strs.data());
    ASSERT_EQ(bitset.count(), strs.size());

    // Clean up the temp file
    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// Builds with every odd row marked invalid and checks that IsNull() and
// IsNotNull() split the rows evenly and never agree on a row.
TEST_F(StringIndexSortTest, NullHandlingMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    auto valid = std::make_unique<bool[]>(nb);
    for (int64_t i = 0; i < nb; ++i) {
        valid[i] = (i % 2 == 0);  // Half are valid
    }

    index->Build(nb, strs.data(), valid.get());

    // Test IsNull
    auto null_bitset = index->IsNull();
    ASSERT_EQ(null_bitset.count(), nb / 2);

    // Test IsNotNull
    auto not_null_bitset = index->IsNotNull();
    ASSERT_EQ(not_null_bitset.count(), nb / 2);

    // Verify they are complementary
    for (int64_t i = 0; i < nb; ++i) {
        ASSERT_NE(null_bitset[i], not_null_bitset[i]);
    }
}
|
||||
|
||||
// Null handling on an index reloaded with an mmap file path in the Load
// config (same pattern as MmapLoadAfterSerialize). The earlier version filled
// a Config that was never passed on, so it repeated the in-memory test.
// NOTE(review): this relies on validity information surviving the
// Serialize/Load round trip - confirm against the loader.
TEST_F(StringIndexSortTest, NullHandlingMmap) {
    auto index = milvus::index::CreateStringIndexSort({});

    auto valid = std::make_unique<bool[]>(nb);
    for (int64_t i = 0; i < nb; ++i) {
        valid[i] = (i % 2 == 0);
    }

    index->Build(nb, strs.data(), valid.get());

    Config build_config;
    auto binary_set = index->Serialize(build_config);

    const std::string mmap_path =
        "/tmp/test_string_index_sort_null_mmap.idx";
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    auto null_bitset = mmap_index->IsNull();
    ASSERT_EQ(null_bitset.count(), nb / 2);

    auto not_null_bitset = mmap_index->IsNotNull();
    ASSERT_EQ(not_null_bitset.count(), nb / 2);

    // Clean up the temp file
    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// End-to-end check of an index that is built in memory, serialized, and then
// loaded with an mmap file path in the Load config.
TEST_F(StringIndexSortTest, MmapLoadAfterSerialize) {
    // Phase 1: build in memory (the input contains duplicates) and serialize.
    std::vector<std::string> test_strs = {"apple",
                                          "banana",
                                          "cherry",
                                          "date",
                                          "elderberry",
                                          "fig",
                                          "grape",
                                          "honeydew",
                                          "kiwi",
                                          "lemon",
                                          "apple",
                                          "banana",
                                          "apple"};
    Config build_config;
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(test_strs.size(), test_strs.data());
    auto binary_set = index->Serialize(build_config);

    // Phase 2: load the serialized data with the mmap path configured.
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = "/tmp/test_string_index_sort_mmap.idx";
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    // Phase 3: exercise the loaded index.
    // Count
    ASSERT_EQ(mmap_index->Count(), test_strs.size());

    // In: apple appears 3 times, grape once, lemon once
    std::vector<std::string> search_vals = {"apple", "grape", "lemon"};
    auto in_hits = mmap_index->In(search_vals.size(), search_vals.data());
    ASSERT_EQ(in_hits.count(), 5);
    ASSERT_TRUE(in_hits[0]);   // apple
    ASSERT_TRUE(in_hits[6]);   // grape
    ASSERT_TRUE(in_hits[9]);   // lemon
    ASSERT_TRUE(in_hits[10]);  // apple (duplicate)
    ASSERT_TRUE(in_hits[12]);  // apple (duplicate)

    // NotIn: none of the excluded values is indexed, so every row is kept.
    std::vector<std::string> not_in_vals = {"orange", "pear"};
    auto not_in_hits =
        mmap_index->NotIn(not_in_vals.size(), not_in_vals.data());
    ASSERT_EQ(not_in_hits.count(), test_strs.size());

    // Single-bound range:
    // cherry, date, elderberry, fig, grape, honeydew, kiwi, lemon
    auto ge_hits = mmap_index->Range("cherry", milvus::OpType::GreaterEqual);
    ASSERT_EQ(ge_hits.count(), 8);

    // Two-bound range: banana(2), cherry, date, elderberry, fig, grape
    auto between_hits = mmap_index->Range("banana", true, "grape", true);
    ASSERT_EQ(between_hits.count(), 7);

    // PrefixMatch, on a second index built, serialized and mmap-loaded the
    // same way.
    std::vector<std::string> prefix_test_strs = {
        "app", "apple", "application", "banana", "band"};
    auto prefix_index = milvus::index::CreateStringIndexSort({});
    prefix_index->Build(prefix_test_strs.size(), prefix_test_strs.data());
    auto prefix_binary = prefix_index->Serialize(build_config);

    Config prefix_mmap_config;
    prefix_mmap_config[MMAP_FILE_PATH] = "/tmp/test_prefix_mmap.idx";
    auto prefix_mmap_index = milvus::index::CreateStringIndexSort({});
    prefix_mmap_index->Load(prefix_binary, prefix_mmap_config);

    auto prefix_hits = prefix_mmap_index->PrefixMatch("app");
    ASSERT_EQ(prefix_hits.count(), 3);  // app, apple, application

    // Reverse_Lookup: every row maps back to its original string.
    size_t row = 0;
    for (const auto& expected : test_strs) {
        auto looked_up = mmap_index->Reverse_Lookup(row);
        ASSERT_TRUE(looked_up.has_value());
        ASSERT_EQ(looked_up.value(), expected);
        ++row;
    }

    // Remove the temp files created by the two loads.
    std::remove("/tmp/test_string_index_sort_mmap.idx");
    std::remove("/tmp/test_prefix_mmap.idx");
}
|
||||
|
||||
// Builds an index in memory, serializes it, restores it through
// LoadWithoutAssemble() with an mmap file path configured, and checks that
// In and Range queries on the restored index return the expected row counts.
TEST_F(StringIndexSortTest, LoadWithoutAssembleMmap) {
    // Deletes the temp file on every exit path. A fatal ASSERT_* returns from
    // the test body, so a trailing std::remove() would be skipped on failure.
    struct TempFileGuard {
        const char* path;
        ~TempFileGuard() {
            std::remove(path);
        }
    };
    const char* const mmap_path = "/tmp/test_load_without_assemble.idx";
    // Declared before the indexes so it is destroyed after them.
    TempFileGuard cleanup{mmap_path};

    // Build and serialize index
    Config config;
    auto index = milvus::index::CreateStringIndexSort({});

    std::vector<std::string> test_strs = {
        "zebra", "apple", "monkey", "dog", "cat"};
    index->Build(test_strs.size(), test_strs.data());

    auto binary_set = index->Serialize(config);

    // Load without assemble using mmap
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->LoadWithoutAssemble(binary_set, mmap_config);

    // Every built value must be found again in the loaded index
    auto bitset = mmap_index->In(test_strs.size(), test_strs.data());
    ASSERT_EQ(bitset.count(), test_strs.size());

    // Range queries must work on the loaded index as well
    auto range_bitset = mmap_index->Range("dog", milvus::OpType::LessEqual);
    ASSERT_EQ(range_bitset.count(), 3);  // apple, cat, dog
}
|
||||
} // namespace index
|
||||
} // namespace milvus
|
||||
|
||||
// Exercises In / NotIn / Range / PrefixMatch on a StringIndexSort, first on an
// index built in memory and then on one loaded with an mmap file path.
TEST(StringIndexSortStandaloneTest, StringIndexSortBuildAndSearch) {
    // Test data; all values are unique, so each value maps to one row offset
    std::vector<std::string> test_data = {"apple",
                                          "banana",
                                          "cherry",
                                          "date",
                                          "elderberry",
                                          "fig",
                                          "grape",
                                          "honeydew",
                                          "kiwi",
                                          "lemon"};
    auto n = test_data.size();

    // Test Memory mode
    {
        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data());

        // Test In operation
        std::vector<std::string> search_vals = {"apple", "grape", "lemon"};
        auto bitset = index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 3);
        ASSERT_TRUE(bitset[0]);  // apple
        ASSERT_TRUE(bitset[6]);  // grape
        ASSERT_TRUE(bitset[9]);  // lemon

        // Test Range operation
        auto range_bitset =
            index->Range("cherry", milvus::OpType::GreaterEqual);
        ASSERT_EQ(
            range_bitset.count(),
            8);  // cherry, date, elderberry, fig, grape, honeydew, kiwi, lemon

        // Test PrefixMatch
        std::vector<std::string> test_data_prefix = {
            "app", "apple", "application", "banana", "band"};
        auto prefix_index = milvus::index::CreateStringIndexSort({});
        prefix_index->Build(test_data_prefix.size(), test_data_prefix.data());
        auto prefix_bitset = prefix_index->PrefixMatch("app");
        ASSERT_EQ(prefix_bitset.count(), 3);  // app, apple, application
    }

    // Test Mmap mode
    {
        // Deletes the temp file on every exit path, including the early
        // return taken by a failing ASSERT_*.
        struct TempFileGuard {
            const char* path;
            ~TempFileGuard() {
                std::remove(path);
            }
        };
        const char* const mmap_path = "/tmp/milvus_scalar_test";
        // Declared before the indexes so it is destroyed after them.
        TempFileGuard cleanup{mmap_path};

        // Build in memory and serialize; the mmap path only applies on load.
        milvus::Config build_config;
        auto built_index = milvus::index::CreateStringIndexSort({});
        built_index->Build(n, test_data.data());
        auto binary_set = built_index->Serialize(build_config);

        // Load with the mmap file path so the queries below run against the
        // loaded index rather than the freshly built in-memory one.
        milvus::Config mmap_config;
        mmap_config["mmap_file_path"] = mmap_path;
        auto index = milvus::index::CreateStringIndexSort({});
        index->Load(binary_set, mmap_config);

        // Test In operation
        std::vector<std::string> search_vals = {"banana", "fig"};
        auto bitset = index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 2);
        ASSERT_TRUE(bitset[1]);  // banana
        ASSERT_TRUE(bitset[5]);  // fig

        // Test NotIn operation
        auto not_bitset = index->NotIn(search_vals.size(), search_vals.data());
        ASSERT_EQ(not_bitset.count(), n - 2);
        ASSERT_FALSE(not_bitset[1]);  // banana should not be in NotIn result
        ASSERT_FALSE(not_bitset[5]);  // fig should not be in NotIn result
    }
}
|
||||
|
||||
// Checks null handling (IsNull / IsNotNull / In) on a StringIndexSort built
// with a validity array, both in memory and after loading with an mmap path.
TEST(StringIndexSortStandaloneTest, StringIndexSortWithNulls) {
    std::vector<std::string> test_data = {
        "alpha", "beta", "gamma", "delta", "epsilon"};

    // Rows 1 ("beta") and 3 ("delta") are marked invalid, i.e. null.
    std::unique_ptr<bool[]> valid_data(new bool[test_data.size()]);
    valid_data[0] = true;
    valid_data[1] = false;
    valid_data[2] = true;
    valid_data[3] = false;
    valid_data[4] = true;
    auto n = test_data.size();

    // Memory mode with nulls
    {
        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data(), valid_data.get());

        // Test IsNull
        auto null_bitset = index->IsNull();
        ASSERT_EQ(null_bitset.count(), 2);
        ASSERT_TRUE(null_bitset[1]);  // beta is null
        ASSERT_TRUE(null_bitset[3]);  // delta is null

        // Test IsNotNull
        auto not_null_bitset = index->IsNotNull();
        ASSERT_EQ(not_null_bitset.count(), 3);
        ASSERT_TRUE(not_null_bitset[0]);  // alpha is not null
        ASSERT_TRUE(not_null_bitset[2]);  // gamma is not null
        ASSERT_TRUE(not_null_bitset[4]);  // epsilon is not null

        // Test In with nulls
        std::vector<std::string> search_vals = {"alpha", "beta", "gamma"};
        auto bitset = index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 2);  // Only alpha and gamma (beta is null)
        ASSERT_TRUE(bitset[0]);        // alpha
        ASSERT_FALSE(bitset[1]);       // beta is null
        ASSERT_TRUE(bitset[2]);        // gamma
    }

    // Mmap mode with nulls
    {
        // Deletes the temp file on every exit path, including the early
        // return taken by a failing ASSERT_*.
        struct TempFileGuard {
            const char* path;
            ~TempFileGuard() {
                std::remove(path);
            }
        };
        const char* const mmap_path = "/tmp/milvus_scalar_test";
        // Declared before the indexes so it is destroyed after them.
        TempFileGuard cleanup{mmap_path};

        // Build in memory and serialize; the mmap path only applies on load.
        milvus::Config build_config;
        auto built_index = milvus::index::CreateStringIndexSort({});
        built_index->Build(n, test_data.data(), valid_data.get());
        auto binary_set = built_index->Serialize(build_config);

        // Load with the mmap file path so null handling is checked on the
        // loaded index rather than on the in-memory one again.
        milvus::Config mmap_config;
        mmap_config["mmap_file_path"] = mmap_path;
        auto index = milvus::index::CreateStringIndexSort({});
        index->Load(binary_set, mmap_config);

        auto null_bitset = index->IsNull();
        ASSERT_EQ(null_bitset.count(), 2);

        auto not_null_bitset = index->IsNotNull();
        ASSERT_EQ(not_null_bitset.count(), 3);
    }
}
|
||||
|
||||
// Round-trips a StringIndexSort through Serialize()/Load() and verifies the
// restored index, once loaded in memory and once loaded with an mmap path.
TEST(StringIndexSortStandaloneTest, StringIndexSortSerialization) {
    // 100 unique values: "str_0" .. "str_99"
    std::vector<std::string> test_data;
    for (int i = 0; i < 100; ++i) {
        test_data.push_back("str_" + std::to_string(i));
    }
    auto n = test_data.size();

    // Test Memory mode serialization
    {
        milvus::Config config;
        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data());

        // Serialize
        auto binary_set = index->Serialize(config);

        // Create new index and deserialize
        auto new_index = milvus::index::CreateStringIndexSort({});
        new_index->Load(binary_set);

        // Verify the data
        ASSERT_EQ(new_index->Count(), n);

        // Test search on deserialized index
        std::vector<std::string> search_vals = {"str_10", "str_50", "str_90"};
        auto bitset = new_index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 3);

        // Test reverse lookup
        for (size_t i = 0; i < n; ++i) {
            auto result = new_index->Reverse_Lookup(i);
            ASSERT_TRUE(result.has_value());
            ASSERT_EQ(result.value(), test_data[i]);
        }
    }

    // Test Mmap mode serialization
    {
        // Deletes the temp file on every exit path, including the early
        // return taken by a failing ASSERT_*.
        struct TempFileGuard {
            const char* path;
            ~TempFileGuard() {
                std::remove(path);
            }
        };
        const char* const mmap_path = "/tmp/milvus_scalar_test";
        // Declared before the indexes so it is destroyed after them.
        TempFileGuard cleanup{mmap_path};

        milvus::Config build_config;
        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data());

        // Serialize
        auto binary_set = index->Serialize(build_config);

        // Create new index and deserialize with the mmap file path; loading
        // without this config would restore the index in memory instead.
        milvus::Config mmap_config;
        mmap_config["mmap_file_path"] = mmap_path;
        auto new_index = milvus::index::CreateStringIndexSort({});
        new_index->Load(binary_set, mmap_config);

        // Verify the data
        ASSERT_EQ(new_index->Count(), n);

        // Test range query on deserialized index
        auto bitset = new_index->Range("str_20", true, "str_30", true);
        // Ordering is lexicographical, so the inclusive range holds
        // str_20 .. str_29 (10 rows) plus str_3 and str_30.
        ASSERT_EQ(bitset.count(), 12);
    }
}
|
||||
@ -1,6 +1,8 @@
|
||||
package indexparamcheck
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
|
||||
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
||||
@ -17,8 +19,8 @@ func (c *STLSORTChecker) CheckTrain(dataType schemapb.DataType, elementType sche
|
||||
}
|
||||
|
||||
func (c *STLSORTChecker) CheckValidDataType(indexType IndexType, field *schemapb.FieldSchema) error {
|
||||
if !typeutil.IsArithmetic(field.GetDataType()) {
|
||||
return errors.New("STL_SORT are only supported on numeric field")
|
||||
if !typeutil.IsArithmetic(field.GetDataType()) && !typeutil.IsStringType(field.GetDataType()) {
|
||||
return errors.New(fmt.Sprintf("STL_SORT are only supported on numeric or varchar field, got %s", field.GetDataType()))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -15,8 +15,8 @@ func Test_STLSORTIndexChecker(t *testing.T) {
|
||||
|
||||
assert.NoError(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_Int64}))
|
||||
assert.NoError(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_Float}))
|
||||
assert.NoError(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_VarChar}))
|
||||
|
||||
assert.Error(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_VarChar}))
|
||||
assert.Error(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_Bool}))
|
||||
assert.Error(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_JSON}))
|
||||
}
|
||||
|
||||
@ -245,10 +245,9 @@ func TestIndexAddedField(t *testing.T) {
|
||||
createIndex: index.NewInvertedIndex,
|
||||
},
|
||||
{
|
||||
name: "SortedIndex",
|
||||
indexType: "STL_SORT",
|
||||
createIndex: index.NewSortedIndex,
|
||||
expectedError: "STL_SORT are only supported on numeric field",
|
||||
name: "SortedIndex",
|
||||
indexType: "STL_SORT",
|
||||
createIndex: index.NewSortedIndex,
|
||||
},
|
||||
{
|
||||
name: "TrieIndex",
|
||||
|
||||
@ -468,10 +468,10 @@ func TestCreateSortedScalarIndex(t *testing.T) {
|
||||
idx := index.NewSortedIndex()
|
||||
for _, field := range schema.Fields {
|
||||
if hp.SupportScalarIndexFieldType(field.DataType) {
|
||||
if field.DataType == entity.FieldTypeVarChar || field.DataType == entity.FieldTypeBool ||
|
||||
if field.DataType == entity.FieldTypeBool ||
|
||||
field.DataType == entity.FieldTypeJSON || field.DataType == entity.FieldTypeArray {
|
||||
_, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, idx))
|
||||
common.CheckErr(t, err, false, "STL_SORT are only supported on numeric field")
|
||||
require.ErrorContains(t, err, "STL_SORT are only supported on numeric or varchar field")
|
||||
} else {
|
||||
idxTask, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, idx))
|
||||
common.CheckErr(t, err, true)
|
||||
@ -623,12 +623,12 @@ func TestCreateIndexJsonField(t *testing.T) {
|
||||
errMsg string
|
||||
}
|
||||
inxError := []scalarIndexError{
|
||||
{index.NewSortedIndex(), "STL_SORT are only supported on numeric field"},
|
||||
{index.NewSortedIndex(), "STL_SORT are only supported on numeric or varchar field"},
|
||||
{index.NewTrieIndex(), "TRIE are only supported on varchar field"},
|
||||
}
|
||||
for _, idxErr := range inxError {
|
||||
_, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, common.DefaultJSONFieldName, idxErr.idx).WithIndexName("json_index"))
|
||||
common.CheckErr(t, err, false, idxErr.errMsg)
|
||||
require.ErrorContains(t, err, idxErr.errMsg)
|
||||
}
|
||||
}
|
||||
|
||||
@ -649,7 +649,7 @@ func TestCreateUnsupportedIndexArrayField(t *testing.T) {
|
||||
errMsg string
|
||||
}
|
||||
inxError := []scalarIndexError{
|
||||
{index.NewSortedIndex(), "STL_SORT are only supported on numeric field"},
|
||||
{index.NewSortedIndex(), "STL_SORT are only supported on numeric or varchar field"},
|
||||
{index.NewTrieIndex(), "TRIE are only supported on varchar field"},
|
||||
}
|
||||
|
||||
@ -660,11 +660,11 @@ func TestCreateUnsupportedIndexArrayField(t *testing.T) {
|
||||
if field.DataType == entity.FieldTypeArray {
|
||||
// create vector index
|
||||
_, err1 := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, vectorIdx).WithIndexName("vector_index"))
|
||||
common.CheckErr(t, err1, false, "index SCANN only supports vector data type")
|
||||
require.ErrorContains(t, err1, "index SCANN only supports vector data type")
|
||||
|
||||
// create scalar index
|
||||
_, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, idxErr.idx))
|
||||
common.CheckErr(t, err, false, idxErr.errMsg)
|
||||
require.ErrorContains(t, err, idxErr.errMsg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -849,13 +849,16 @@ class TestMilvusClientJsonPathIndexInvalid(TestMilvusClientV2Base):
|
||||
# 3. create index
|
||||
if not_supported_varchar_scalar_index == "TRIE":
|
||||
supported_field_type = "varchar"
|
||||
got_json_suffix = ""
|
||||
if not_supported_varchar_scalar_index == "STL_SORT":
|
||||
supported_field_type = "numeric"
|
||||
supported_field_type = "numeric or varchar"
|
||||
got_json_suffix = ", got JSON"
|
||||
if not_supported_varchar_scalar_index == "BITMAP":
|
||||
supported_field_type = "bool, int, string and array"
|
||||
not_supported_varchar_scalar_index = "bitmap index"
|
||||
got_json_suffix = ""
|
||||
error = {ct.err_code: 1100, ct.err_msg: f"{not_supported_varchar_scalar_index} are only supported on "
|
||||
f"{supported_field_type} field: invalid parameter[expected=valid "
|
||||
f"{supported_field_type} field{got_json_suffix}: invalid parameter[expected=valid "
|
||||
f"index params][actual=invalid index params]"}
|
||||
self.create_index(client, collection_name, index_params,
|
||||
check_task=CheckTasks.err_res, check_items=error)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user