enhance: enable STL_SORT to support VARCHAR (#44401)

issue: https://github.com/milvus-io/milvus/issues/44399

This PR implements STL_SORT for VARCHAR data type for both RAM and MMAP
mode.
The general idea is that we deduplicate field values and maintain a
posting list for each unique value.

The serialization format of the index is:
```
[unique_count][string_offsets][string_data][post_list_offsets][post_list_data][magic_code]
string_offsets: array of offsets into string_data section
string_data: str_len1, str1, str_len2, str2, ...
post_list_offsets: array of offsets into post_list_data section
post_list_data: post_list_len1, row_id1, row_id2, ..., post_list_len2, row_id1, row_id2, ...
```

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
Spade A 2025-10-23 11:00:05 +08:00 committed by GitHub
parent cfeb095ad7
commit 6077178553
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 2220 additions and 18 deletions

View File

@ -33,6 +33,7 @@
#include "index/VectorDiskIndex.h"
#include "index/ScalarIndexSort.h"
#include "index/StringIndexSort.h"
#include "index/StringIndexMarisa.h"
#include "index/BoolIndex.h"
#include "index/InvertedIndexTantivy.h"
@ -90,7 +91,7 @@ IndexFactory::CreatePrimitiveScalarIndex<std::string>(
return std::make_unique<HybridScalarIndex<std::string>>(
create_index_info.tantivy_index_version, file_manager_context);
}
return CreateStringIndexMarisa(file_manager_context);
return CreateStringIndexSort(file_manager_context);
#else
ThrowInfo(Unsupported, "unsupported platform");
#endif

View File

@ -54,7 +54,28 @@ StringIndexMarisa::StringIndexMarisa(
int64_t
StringIndexMarisa::Size() {
return trie_.size();
return total_size_;
}
// Estimates the footprint of this index in bytes by summing three parts:
// the trie, the str_ids_ vector and the str_ids_to_offsets_ map payload.
// Container bookkeeping (map nodes, vector headers) is not included.
int64_t
StringIndexMarisa::CalculateTotalSize() const {
    // marisa reports the serialized size through io_size(); it is used here
    // as an approximation of the trie's memory usage.
    int64_t total_bytes = trie_.io_size();

    // One int64_t-sized slot is counted per entry of str_ids_.
    total_bytes += str_ids_.size() * sizeof(int64_t);

    // Each map entry contributes its key plus every offset stored under it.
    for (auto it = str_ids_to_offsets_.begin(); it != str_ids_to_offsets_.end();
         ++it) {
        total_bytes += sizeof(size_t);
        total_bytes += it->second.size() * sizeof(size_t);
    }
    return total_bytes;
}
bool
@ -113,6 +134,7 @@ StringIndexMarisa::BuildWithFieldData(
fill_offsets();
built_ = true;
total_size_ = CalculateTotalSize();
}
void
@ -138,6 +160,7 @@ StringIndexMarisa::Build(size_t n,
fill_offsets();
built_ = true;
total_size_ = CalculateTotalSize();
}
BinarySet
@ -222,6 +245,8 @@ StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set,
memcpy(str_ids_.data(), str_ids->data.get(), str_ids_len);
fill_offsets();
built_ = true;
total_size_ = CalculateTotalSize();
}
void

View File

@ -127,6 +127,9 @@ class StringIndexMarisa : public StringIndex {
LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) override;
int64_t
CalculateTotalSize() const;
private:
Config config_;
marisa::Trie trie_;
@ -134,6 +137,7 @@ class StringIndexMarisa : public StringIndex {
std::map<size_t, std::vector<size_t>> str_ids_to_offsets_;
bool built_ = false;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
int64_t total_size_ = 0; // Cached total size to avoid runtime calculation
};
using StringIndexMarisaPtr = std::unique_ptr<StringIndexMarisa>;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,431 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>
#include <string>
#include <map>
#include <cstring>
#include <sys/mman.h>
#include <unistd.h>
#include <folly/small_vector.h>
#include "index/StringIndex.h"
#include "storage/MemFileManagerImpl.h"
#include "storage/DiskFileManagerImpl.h"
#include "storage/FileWriter.h"
#include "common/File.h"
namespace milvus::index {
// Forward declaration
class StringIndexSortImpl;
// Main StringIndexSort class using pImpl pattern
// Sorted scalar index over string values (reports ScalarIndexType::STLSORT).
// All non-inline members are defined out of line; the storage/query work is
// held behind impl_, a StringIndexSortImpl owned by this object.
class StringIndexSort : public StringIndex {
public:
// NOTE(review): presumably the version of the serialized layout; it is not
// referenced anywhere else in this header - confirm against the .cpp.
static constexpr uint32_t SERIALIZATION_VERSION = 1;
// The eight bytes of this constant are the ASCII codes of S,T,R,N,G,S,O,R.
// The serialized layout documented on
// StringIndexSortMemoryImpl::SerializeToBinary ends with a [magic_code] field.
static constexpr uint64_t MAGIC_CODE =
0x5354524E47534F52; // "STRNGSOR" in hex
// Constructs an index; with no argument a default-constructed
// FileManagerContext is used.
explicit StringIndexSort(
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
virtual ~StringIndexSort();
int64_t
Count() override;
// Always identifies this index as STLSORT.
ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::STLSORT;
}
// Always true: values can be read back (see Reverse_Lookup).
const bool
HasRawData() const override {
return true;
}
// Builds from n strings; valid_data is optional and, when given, is
// presumably a per-row validity (non-null) flag array of length n.
void
Build(size_t n,
const std::string* values,
const bool* valid_data = nullptr) override;
void
Build(const Config& config = {}) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
// See detailed format in StringIndexSortMemoryImpl::SerializeToBinary
BinarySet
Serialize(const Config& config) override;
IndexStatsPtr
Upload(const Config& config = {}) override;
void
Load(const BinarySet& index_binary, const Config& config = {}) override;
void
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) override;
void
LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) override;
// Query methods - delegated to impl
const TargetBitmap
In(size_t n, const std::string* values) override;
const TargetBitmap
NotIn(size_t n, const std::string* values) override;
const TargetBitmap
IsNull() override;
TargetBitmap
IsNotNull() override;
// Single-bound comparison against value; op selects the comparison.
const TargetBitmap
Range(std::string value, OpType op) override;
// Two-bound range; each *_inclusive flag controls whether that bound
// itself is part of the result.
const TargetBitmap
Range(std::string lower_bound_value,
bool lb_inclusive,
std::string upper_bound_value,
bool ub_inclusive) override;
const TargetBitmap
PrefixMatch(const std::string_view prefix) override;
// Returns the string stored at row `offset`, or std::nullopt.
// NOTE(review): the exact conditions for nullopt live in the .cpp - confirm.
std::optional<std::string>
Reverse_Lookup(size_t offset) const override;
int64_t
Size() override;
protected:
int64_t
CalculateTotalSize() const;
// Common fields
int64_t field_id_ = 0;
bool is_built_ = false;
Config config_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
size_t total_num_rows_{0};
// valid_bitset_ and idx_to_offsets_ are the objects handed by reference to
// the StringIndexSortImpl build/load/lookup methods declared below.
TargetBitmap valid_bitset_;
std::vector<int32_t> idx_to_offsets_;
std::chrono::time_point<std::chrono::system_clock> index_build_begin_;
// NOTE(review): presumably the cached result of CalculateTotalSize() - confirm.
int64_t total_size_{0};
// Concrete implementation behind this facade.
std::unique_ptr<StringIndexSortImpl> impl_;
};
// Abstract interface for implementations
// Interface implemented by the concrete storage back-ends declared in this
// header (StringIndexSortMemoryImpl and StringIndexSortMmapImpl).
// Row count, validity bitmap and idx_to_offsets are not stored here; callers
// pass them into each method.
class StringIndexSortImpl {
public:
virtual ~StringIndexSortImpl() = default;
// Populates the implementation from a serialized BinarySet. valid_bitset and
// idx_to_offsets are non-const references, so implementations may fill them.
virtual void
LoadFromBinary(const BinarySet& binary_set,
size_t total_num_rows,
TargetBitmap& valid_bitset,
std::vector<int32_t>& idx_to_offsets) = 0;
// Result of ParseBinaryData. The field names mirror the sections of the
// serialized layout ([unique_count][string_offsets][string_data]
// [post_list_offsets][post_list_data]); the pointers are non-owning.
// NOTE(review): members have no default initializers.
struct ParsedData {
uint32_t unique_count;
const uint32_t* string_offsets;
const uint8_t* string_data_start;
const uint32_t* post_list_offsets;
const uint8_t* post_list_data_start;
};
// Parses a serialized buffer of data_size bytes into section pointers.
// NOTE(review): validation behaviour (e.g. magic code check) is defined in
// the .cpp and not visible here.
static ParsedData
ParseBinaryData(const uint8_t* data, size_t data_size);
virtual const TargetBitmap
In(size_t n, const std::string* values, size_t total_num_rows) = 0;
virtual const TargetBitmap
NotIn(size_t n,
const std::string* values,
size_t total_num_rows,
const TargetBitmap& valid_bitset) = 0;
virtual const TargetBitmap
IsNull(size_t total_num_rows, const TargetBitmap& valid_bitset) = 0;
virtual TargetBitmap
IsNotNull(const TargetBitmap& valid_bitset) = 0;
// Single-bound comparison; op selects the comparison operator.
virtual const TargetBitmap
Range(std::string value, OpType op, size_t total_num_rows) = 0;
// Two-bound range with independently inclusive/exclusive bounds.
virtual const TargetBitmap
Range(std::string lower_bound_value,
bool lb_inclusive,
std::string upper_bound_value,
bool ub_inclusive,
size_t total_num_rows) = 0;
virtual const TargetBitmap
PrefixMatch(const std::string_view prefix, size_t total_num_rows) = 0;
// Looks up the string for row `offset`; returns std::nullopt when there is
// no value to report.
virtual std::optional<std::string>
Reverse_Lookup(size_t offset,
size_t total_num_rows,
const TargetBitmap& valid_bitset,
const std::vector<int32_t>& idx_to_offsets) const = 0;
virtual int64_t
Size() = 0;
};
// In-memory implementation: keeps the sorted unique strings and one posting
// list per unique string in two parallel vectors.
class StringIndexSortMemoryImpl : public StringIndexSortImpl {
public:
// Row ids for one unique value; folly::small_vector keeps up to 4 elements
// inline before spilling to the heap.
using PostingList = folly::small_vector<uint32_t, 4>;
// Builds from n raw strings. valid_data may be null (the public
// StringIndexSort::Build defaults it to nullptr).
void
BuildFromRawData(size_t n,
const std::string* values,
const bool* valid_data,
TargetBitmap& valid_bitset,
std::vector<int32_t>& idx_to_offsets);
void
BuildFromFieldData(const std::vector<FieldDataPtr>& field_datas,
size_t total_num_rows,
TargetBitmap& valid_bitset,
std::vector<int32_t>& idx_to_offsets);
// Serialize to binary format
// The binary format is : [unique_count][string_offsets][string_data][post_list_offsets][post_list_data][magic_code]
// string_offsets: array of offsets into string_data section
// string_data: str_len1, str1, str_len2, str2, ...
// post_list_offsets: array of offsets into post_list_data section
// post_list_data: post_list_len1, row_id1, row_id2, ..., post_list_len2, row_id1, row_id2, ...
// `offset` is taken by non-const reference.
// NOTE(review): presumably it is the write cursor into ptr and is advanced
// by the number of bytes written - confirm in the .cpp.
void
SerializeToBinary(uint8_t* ptr, size_t& offset) const;
// NOTE(review): presumably the byte count SerializeToBinary will write.
size_t
GetSerializedSize() const;
void
LoadFromBinary(const BinarySet& binary_set,
size_t total_num_rows,
TargetBitmap& valid_bitset,
std::vector<int32_t>& idx_to_offsets) override;
const TargetBitmap
In(size_t n, const std::string* values, size_t total_num_rows) override;
const TargetBitmap
NotIn(size_t n,
const std::string* values,
size_t total_num_rows,
const TargetBitmap& valid_bitset) override;
const TargetBitmap
IsNull(size_t total_num_rows, const TargetBitmap& valid_bitset) override;
TargetBitmap
IsNotNull(const TargetBitmap& valid_bitset) override;
const TargetBitmap
Range(std::string value, OpType op, size_t total_num_rows) override;
const TargetBitmap
Range(std::string lower_bound_value,
bool lb_inclusive,
std::string upper_bound_value,
bool ub_inclusive,
size_t total_num_rows) override;
const TargetBitmap
PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;
std::optional<std::string>
Reverse_Lookup(size_t offset,
size_t total_num_rows,
const TargetBitmap& valid_bitset,
const std::vector<int32_t>& idx_to_offsets) const override;
int64_t
Size() override;
private:
// Helper method for binary search
// NOTE(review): the "not found" return value is defined in the .cpp.
size_t
FindValueIndex(const std::string& value) const;
// Consumes a value -> posting-list map (std::map iterates keys in sorted
// order) to populate the members below.
void
BuildFromMap(std::map<std::string, PostingList>&& unique_map,
size_t total_num_rows,
std::vector<int32_t>& idx_to_offsets);
// Keep unique_values_ and posting_lists_ separated for cache efficiency
// Sorted unique values
std::vector<std::string> unique_values_;
// Corresponding posting lists
std::vector<PostingList> posting_lists_;
};
// Implementation that answers queries directly from a serialized buffer
// (layout: [unique_count][string_offsets][string_data][post_list_offsets]
// [post_list_data][magic_code]) instead of materialising std::string objects.
//
// NOTE(review): the class holds a raw mapping pointer and has a user-declared
// destructor but still has implicit copy operations; if the destructor
// releases mmap_data_, copying an instance would release it twice. Consider
// deleting the copy operations once all call sites are confirmed.
class StringIndexSortMmapImpl : public StringIndexSortImpl {
 public:
    ~StringIndexSortMmapImpl();

    // Lightweight, non-owning view of one unique value and its posting list
    // inside the serialized buffer.
    struct MmapEntry {
        // Reads a uint32_t from an address with no alignment guarantee.
        // Length prefixes and row ids follow variable-length string bytes in
        // the serialized layout, so they can sit at any byte offset;
        // dereferencing a uint32_t* there is undefined behaviour, memcpy is
        // not.
        static uint32_t
        load_u32(const uint8_t* src) {
            uint32_t value = 0;
            std::memcpy(&value, src, sizeof(value));
            return value;
        }

        // Members are zero/null-initialised so a default-constructed entry
        // is an empty entry rather than indeterminate.
        const char* str_data_ptr = nullptr;  // Pointer to string data
        // Pointer to posting list data. May be unaligned: always read
        // through get_row_id()/for_each_row_id(), never dereference it.
        const uint32_t* post_list_data_ptr = nullptr;
        uint32_t str_len = 0;        // String length
        uint32_t post_list_len = 0;  // Posting list length

        MmapEntry() = default;

        // str_ptr points at a [uint32 length][bytes] record in string_data;
        // post_list_ptr points at a [uint32 length][uint32 row ids...] record
        // in post_list_data.
        MmapEntry(const uint8_t* str_ptr, const uint8_t* post_list_ptr) {
            // Read string length and data pointer
            str_len = load_u32(str_ptr);
            str_data_ptr =
                reinterpret_cast<const char*>(str_ptr + sizeof(uint32_t));
            // Read posting list length and data pointer
            post_list_len = load_u32(post_list_ptr);
            post_list_data_ptr = reinterpret_cast<const uint32_t*>(
                post_list_ptr + sizeof(uint32_t));
        }

        // View over the string bytes; valid only while the underlying
        // buffer stays alive.
        std::string_view
        get_string_view() const {
            return std::string_view(str_data_ptr, str_len);
        }

        size_t
        get_posting_list_len() const {
            return post_list_len;
        }

        // Returns the idx-th row id. idx must be < get_posting_list_len();
        // no bounds check is performed.
        uint32_t
        get_row_id(size_t idx) const {
            const auto* base =
                reinterpret_cast<const uint8_t*>(post_list_data_ptr);
            return load_u32(base + idx * sizeof(uint32_t));
        }

        // Invokes func(row_id) for every row id in posting-list order.
        template <typename Func>
        void
        for_each_row_id(Func func) const {
            const auto* cursor =
                reinterpret_cast<const uint8_t*>(post_list_data_ptr);
            for (uint32_t i = 0; i < post_list_len; ++i) {
                func(load_u32(cursor));
                cursor += sizeof(uint32_t);
            }
        }
    };

    void
    LoadFromBinary(const BinarySet& binary_set,
                   size_t total_num_rows,
                   TargetBitmap& valid_bitset,
                   std::vector<int32_t>& idx_to_offsets) override;

    // Records the file path to use for the mapping; only stores the string.
    void
    SetMmapFilePath(const std::string& filepath) {
        mmap_filepath_ = filepath;
    }

    const TargetBitmap
    In(size_t n, const std::string* values, size_t total_num_rows) override;

    const TargetBitmap
    NotIn(size_t n,
          const std::string* values,
          size_t total_num_rows,
          const TargetBitmap& valid_bitset) override;

    const TargetBitmap
    IsNull(size_t total_num_rows, const TargetBitmap& valid_bitset) override;

    TargetBitmap
    IsNotNull(const TargetBitmap& valid_bitset) override;

    const TargetBitmap
    Range(std::string value, OpType op, size_t total_num_rows) override;

    const TargetBitmap
    Range(std::string lower_bound_value,
          bool lb_inclusive,
          std::string upper_bound_value,
          bool ub_inclusive,
          size_t total_num_rows) override;

    const TargetBitmap
    PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;

    std::optional<std::string>
    Reverse_Lookup(size_t offset,
                   size_t total_num_rows,
                   const TargetBitmap& valid_bitset,
                   const std::vector<int32_t>& idx_to_offsets) const override;

    int64_t
    Size() override;

 private:
    // Binary search for a value
    size_t
    FindValueIndex(const std::string& value) const;

    // Binary search helpers
    size_t
    LowerBound(const std::string_view& value) const;

    size_t
    UpperBound(const std::string_view& value) const;

    // Builds the view for the idx-th unique value. idx must be below
    // unique_count_; no bounds check is performed. The offset tables are read
    // with load_u32 because post_list_offsets_ follows the variable-length
    // string section and therefore has no alignment guarantee.
    MmapEntry
    GetEntry(size_t idx) const {
        const uint32_t str_offset = MmapEntry::load_u32(
            reinterpret_cast<const uint8_t*>(string_offsets_) +
            idx * sizeof(uint32_t));
        const uint32_t post_list_offset = MmapEntry::load_u32(
            reinterpret_cast<const uint8_t*>(post_list_offsets_) +
            idx * sizeof(uint32_t));
        const uint8_t* str_ptr = string_data_start_ + str_offset;
        const uint8_t* post_list_ptr = post_list_data_start_ + post_list_offset;
        return MmapEntry(str_ptr, post_list_ptr);
    }

 private:
    char* mmap_data_ = nullptr;
    size_t mmap_size_ = 0;
    std::string mmap_filepath_;
    size_t unique_count_ = 0;

    // Pointers to different sections in mmap'd data
    const uint32_t* string_offsets_ = nullptr;
    const uint8_t* string_data_start_ = nullptr;
    const uint32_t* post_list_offsets_ = nullptr;
    const uint8_t* post_list_data_start_ = nullptr;
};
using StringIndexSortPtr = std::unique_ptr<StringIndexSort>;
// Factory helper: heap-allocates a StringIndexSort for the given file manager
// context (a default-constructed context when omitted) and hands ownership
// to the caller.
inline StringIndexSortPtr
CreateStringIndexSort(const storage::FileManagerContext& file_manager_context =
                          storage::FileManagerContext()) {
    StringIndexSortPtr created =
        std::make_unique<StringIndexSort>(file_manager_context);
    return created;
}
} // namespace milvus::index

View File

@ -0,0 +1,607 @@
#include <gtest/gtest.h>
#include <boost/filesystem.hpp>
#include "index/StringIndexSort.h"
#include "index/IndexFactory.h"
#include "test_utils/indexbuilder_test_utils.h"
constexpr int64_t nb = 100;
namespace milvus {
namespace index {
// Shared fixture: before every test it generates `nb` strings with GenStrArr
// and mirrors them into a schemapb::StringArray.
class StringIndexBaseTest : public ::testing::Test {
protected:
void
SetUp() override {
// GenStrArr is not defined in this file (presumably provided by the
// included test utilities header).
strs = GenStrArr(nb);
// Replace the protobuf array's data field with a copy of strs.
*str_arr.mutable_data() = {strs.begin(), strs.end()};
}
protected:
// Raw string values used as index input by the tests.
std::vector<std::string> strs;
// Protobuf copy of strs; none of the tests visible in this file read it.
schemapb::StringArray str_arr;
};
// Fixture name used by the TEST_F cases; adds nothing to the base fixture.
class StringIndexSortTest : public StringIndexBaseTest {};
// The factory must return a non-null index when given a default file manager
// context. (The previously declared, never-used Config local was removed.)
TEST_F(StringIndexSortTest, ConstructorMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    ASSERT_NE(index, nullptr);
}
// CreateStringIndexSort only accepts a FileManagerContext, so there is no
// mmap-specific construction path: mmap is selected later by the Config
// passed to Load (see MmapLoadAfterSerialize). The Config that used to be
// filled in here was never handed to the index and has been removed; the
// test is kept so the constructor is still covered under this name.
TEST_F(StringIndexSortTest, ConstructorMmap) {
    auto index = milvus::index::CreateStringIndexSort({});
    ASSERT_NE(index, nullptr);
}
// Building from the fixture strings must report one row per input string.
// (Unused Config local removed.)
TEST_F(StringIndexSortTest, BuildMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());
    ASSERT_EQ(index->Count(), nb);
}
// Previously this test filled a Config that was never passed to the index,
// so it only re-ran the in-memory build. It now serializes the built index
// and reloads it with MMAP_FILE_PATH set (the same route
// MmapLoadAfterSerialize uses) so Count() is checked on the mmap-loaded index.
TEST_F(StringIndexSortTest, BuildMmap) {
    const std::string mmap_path = "/tmp/milvus_test_build_mmap.idx";

    auto built_index = milvus::index::CreateStringIndexSort({});
    built_index->Build(strs.size(), strs.data());
    Config build_config;
    auto binary_set = built_index->Serialize(build_config);

    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    ASSERT_EQ(mmap_index->Count(), nb);

    // Clean up the backing file.
    std::remove(mmap_path.c_str());
}
// In() over every indexed string must select every row; In() over a subset
// must select the rows holding those strings. (Unused Config local removed.)
TEST_F(StringIndexSortTest, InMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(nb, strs.data());

    // Test with all strings
    auto bitset = index->In(strs.size(), strs.data());
    ASSERT_EQ(bitset.size(), strs.size());
    ASSERT_EQ(bitset.count(), strs.size());

    // Test with subset
    // NOTE(review): the exact count of 3 assumes GenStrArr produces no
    // duplicates of these three values - confirm.
    std::vector<std::string> subset = {strs[0], strs[10], strs[20]};
    auto subset_bitset = index->In(subset.size(), subset.data());
    ASSERT_EQ(subset_bitset.size(), strs.size());
    ASSERT_EQ(subset_bitset.count(), 3);
    ASSERT_TRUE(subset_bitset[0]);
    ASSERT_TRUE(subset_bitset[10]);
    ASSERT_TRUE(subset_bitset[20]);
}
// The Config built here used to be discarded, so In() was only exercised on
// the in-memory index. The index is now serialized and reloaded with
// MMAP_FILE_PATH set so the query runs against the mmap-loaded index.
TEST_F(StringIndexSortTest, InMmap) {
    const std::string mmap_path = "/tmp/milvus_test_in_mmap.idx";

    auto built_index = milvus::index::CreateStringIndexSort({});
    built_index->Build(nb, strs.data());
    Config build_config;
    auto binary_set = built_index->Serialize(build_config);

    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    auto bitset = mmap_index->In(strs.size(), strs.data());
    ASSERT_EQ(bitset.size(), strs.size());
    ASSERT_EQ(bitset.count(), strs.size());

    // Clean up the backing file.
    std::remove(mmap_path.c_str());
}
// NotIn() over every indexed string must select nothing; NotIn() over values
// that are absent from the index must select every row.
// (Unused Config local removed.)
TEST_F(StringIndexSortTest, NotInMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(nb, strs.data());

    auto bitset = index->NotIn(strs.size(), strs.data());
    ASSERT_EQ(bitset.size(), strs.size());
    ASSERT_EQ(bitset.count(), 0);

    // Test with non-existing strings
    std::vector<std::string> non_existing = {"non_existing_1",
                                             "non_existing_2"};
    auto non_existing_bitset =
        index->NotIn(non_existing.size(), non_existing.data());
    ASSERT_EQ(non_existing_bitset.size(), strs.size());
    ASSERT_EQ(non_existing_bitset.count(), strs.size());
}
// Single-bound range queries over ten distinct one-letter strings.
// (Unused Config local removed.)
TEST_F(StringIndexSortTest, RangeMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    // Build with sorted strings for predictable range tests
    std::vector<std::string> sorted_strs = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j"};
    index->Build(sorted_strs.size(), sorted_strs.data());

    // Test LessThan
    auto bitset = index->Range("d", OpType::LessThan);
    ASSERT_EQ(bitset.count(), 3);  // a, b, c

    // Test LessEqual
    auto bitset2 = index->Range("d", OpType::LessEqual);
    ASSERT_EQ(bitset2.count(), 4);  // a, b, c, d

    // Test GreaterThan
    auto bitset3 = index->Range("g", OpType::GreaterThan);
    ASSERT_EQ(bitset3.count(), 3);  // h, i, j

    // Test GreaterEqual
    auto bitset4 = index->Range("g", OpType::GreaterEqual);
    ASSERT_EQ(bitset4.count(), 4);  // g, h, i, j
}
// Two-bound range queries covering inclusive, exclusive and mixed bounds.
// (Unused Config local removed.)
TEST_F(StringIndexSortTest, RangeBetweenMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    std::vector<std::string> sorted_strs = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j"};
    index->Build(sorted_strs.size(), sorted_strs.data());

    // Test inclusive range
    auto bitset = index->Range("c", true, "g", true);
    ASSERT_EQ(bitset.count(), 5);  // c, d, e, f, g

    // Test exclusive range
    auto bitset2 = index->Range("c", false, "g", false);
    ASSERT_EQ(bitset2.count(), 3);  // d, e, f

    // Test mixed
    auto bitset3 = index->Range("c", true, "g", false);
    ASSERT_EQ(bitset3.count(), 4);  // c, d, e, f
}
// Prefix queries: several matches, a full-string match, and no match.
// (Unused Config local removed.)
TEST_F(StringIndexSortTest, PrefixMatchMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    std::vector<std::string> test_strs = {
        "apple", "application", "apply", "banana", "band", "cat"};
    index->Build(test_strs.size(), test_strs.data());

    auto bitset = index->PrefixMatch("app");
    ASSERT_EQ(bitset.count(), 3);  // apple, application, apply

    auto bitset2 = index->PrefixMatch("ban");
    ASSERT_EQ(bitset2.count(), 2);  // banana, band

    auto bitset3 = index->PrefixMatch("cat");
    ASSERT_EQ(bitset3.count(), 1);  // cat

    auto bitset4 = index->PrefixMatch("dog");
    ASSERT_EQ(bitset4.count(), 0);  // none
}
// The Config built here used to be discarded, so PrefixMatch was only run on
// the in-memory index. The index is now serialized and reloaded with
// MMAP_FILE_PATH set so the query runs against the mmap-loaded index.
TEST_F(StringIndexSortTest, PrefixMatchMmap) {
    const std::string mmap_path = "/tmp/milvus_test_prefix_match_mmap.idx";

    std::vector<std::string> test_strs = {
        "apple", "application", "apply", "banana", "band", "cat"};
    auto built_index = milvus::index::CreateStringIndexSort({});
    built_index->Build(test_strs.size(), test_strs.data());
    Config build_config;
    auto binary_set = built_index->Serialize(build_config);

    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    auto bitset = mmap_index->PrefixMatch("app");
    ASSERT_EQ(bitset.count(), 3);  // apple, application, apply

    // Clean up the backing file.
    std::remove(mmap_path.c_str());
}
// Reverse_Lookup must return the original string for every row and no value
// for an offset past the end. (Unused Config local removed.)
TEST_F(StringIndexSortTest, ReverseLookupMemory) {
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());

    for (size_t i = 0; i < strs.size(); ++i) {
        auto result = index->Reverse_Lookup(i);
        ASSERT_TRUE(result.has_value());
        ASSERT_EQ(result.value(), strs[i]);
    }

    // Test invalid offset
    auto result = index->Reverse_Lookup(strs.size() + 100);
    ASSERT_FALSE(result.has_value());
}
// The Config built here used to be discarded, so Reverse_Lookup was only run
// on the in-memory index. The index is now serialized and reloaded with
// MMAP_FILE_PATH set so the lookups run against the mmap-loaded index.
TEST_F(StringIndexSortTest, ReverseLookupMmap) {
    const std::string mmap_path = "/tmp/milvus_test_reverse_lookup_mmap.idx";

    auto built_index = milvus::index::CreateStringIndexSort({});
    built_index->Build(strs.size(), strs.data());
    Config build_config;
    auto binary_set = built_index->Serialize(build_config);

    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    for (size_t i = 0; i < strs.size(); ++i) {
        auto result = mmap_index->Reverse_Lookup(i);
        ASSERT_TRUE(result.has_value());
        ASSERT_EQ(result.value(), strs[i]);
    }

    // Clean up the backing file.
    std::remove(mmap_path.c_str());
}
// Round trip: build, serialize, load into a fresh index, then check that the
// row count, In() and Reverse_Lookup() all agree with the original strings.
TEST_F(StringIndexSortTest, SerializeDeserializeMemory) {
    Config serialize_config;

    auto source_index = milvus::index::CreateStringIndexSort({});
    source_index->Build(strs.size(), strs.data());
    auto serialized = source_index->Serialize(serialize_config);

    auto restored_index = milvus::index::CreateStringIndexSort({});
    restored_index->Load(serialized);

    // Row count and membership survive the round trip.
    ASSERT_EQ(restored_index->Count(), strs.size());
    auto membership = restored_index->In(strs.size(), strs.data());
    ASSERT_EQ(membership.count(), strs.size());

    // Every row maps back to the string it was built from.
    size_t row = 0;
    while (row < strs.size()) {
        auto looked_up = restored_index->Reverse_Lookup(row);
        ASSERT_TRUE(looked_up.has_value());
        ASSERT_EQ(looked_up.value(), strs[row]);
        ++row;
    }
}
// Round trip through the mmap load path. The original passed its mmap Config
// to Serialize and then called Load without any Config, so the reloaded
// index was not given an mmap path. The mmap Config is now passed to Load,
// keyed by MMAP_FILE_PATH as in MmapLoadAfterSerialize.
TEST_F(StringIndexSortTest, SerializeDeserializeMmap) {
    const std::string mmap_path = "/tmp/milvus_test_serde_mmap.idx";

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(strs.size(), strs.data());

    // Serialize
    Config build_config;
    auto binary_set = index->Serialize(build_config);

    // Create new index and load it through mmap
    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;
    auto new_index = milvus::index::CreateStringIndexSort({});
    new_index->Load(binary_set, mmap_config);

    // Verify data integrity
    ASSERT_EQ(new_index->Count(), strs.size());
    auto bitset = new_index->In(strs.size(), strs.data());
    ASSERT_EQ(bitset.count(), strs.size());

    // Clean up the backing file.
    std::remove(mmap_path.c_str());
}
// With every odd row marked invalid, IsNull and IsNotNull must each select
// half of the rows and be exact complements of each other.
// Changes: unused Config removed, validity array allocated with
// std::make_unique, loop counters use nb's own type (int64_t) to avoid
// signed/unsigned comparisons.
TEST_F(StringIndexSortTest, NullHandlingMemory) {
    auto index = milvus::index::CreateStringIndexSort({});

    auto valid = std::make_unique<bool[]>(nb);
    for (int64_t i = 0; i < nb; ++i) {
        valid[i] = (i % 2 == 0);  // Half are valid
    }

    index->Build(nb, strs.data(), valid.get());

    // Test IsNull
    auto null_bitset = index->IsNull();
    ASSERT_EQ(null_bitset.count(), nb / 2);

    // Test IsNotNull
    auto not_null_bitset = index->IsNotNull();
    ASSERT_EQ(not_null_bitset.count(), nb / 2);

    // Verify they are complementary
    for (int64_t i = 0; i < nb; ++i) {
        ASSERT_NE(null_bitset[i], not_null_bitset[i]);
    }
}
// The Config built here used to be discarded, so null handling was only
// checked on the in-memory index. The index is now serialized and reloaded
// with MMAP_FILE_PATH set, which also verifies that row validity survives
// the serialize -> mmap-load round trip.
// NOTE(review): relies on Serialize persisting the validity bitmap - confirm.
TEST_F(StringIndexSortTest, NullHandlingMmap) {
    const std::string mmap_path = "/tmp/milvus_test_null_handling_mmap.idx";

    auto valid = std::make_unique<bool[]>(nb);
    for (int64_t i = 0; i < nb; ++i) {
        valid[i] = (i % 2 == 0);
    }

    auto built_index = milvus::index::CreateStringIndexSort({});
    built_index->Build(nb, strs.data(), valid.get());
    Config build_config;
    auto binary_set = built_index->Serialize(build_config);

    Config mmap_config;
    mmap_config[MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    auto null_bitset = mmap_index->IsNull();
    ASSERT_EQ(null_bitset.count(), nb / 2);

    auto not_null_bitset = mmap_index->IsNotNull();
    ASSERT_EQ(not_null_bitset.count(), nb / 2);

    // Clean up the backing file.
    std::remove(mmap_path.c_str());
}
// End-to-end check of the mmap load path: an index built in memory is
// serialized, reloaded with MMAP_FILE_PATH set, and every query type is then
// run against the reloaded index.
TEST_F(StringIndexSortTest, MmapLoadAfterSerialize) {
    // Thirteen rows, ten distinct values: "apple" x3 and "banana" x2.
    std::vector<std::string> fruit_rows = {"apple",
                                           "banana",
                                           "cherry",
                                           "date",
                                           "elderberry",
                                           "fig",
                                           "grape",
                                           "honeydew",
                                           "kiwi",
                                           "lemon",
                                           "apple",
                                           "banana",
                                           "apple"};

    // Build in memory and serialize.
    Config plain_config;
    auto memory_index = milvus::index::CreateStringIndexSort({});
    memory_index->Build(fruit_rows.size(), fruit_rows.data());
    auto serialized = memory_index->Serialize(plain_config);

    // Reload through the mmap path.
    Config load_config;
    load_config[MMAP_FILE_PATH] = "/tmp/test_string_index_sort_mmap.idx";
    auto mapped_index = milvus::index::CreateStringIndexSort({});
    mapped_index->Load(serialized, load_config);

    // Count
    ASSERT_EQ(mapped_index->Count(), fruit_rows.size());

    // In: apple (3 rows) + grape (1) + lemon (1)
    std::vector<std::string> wanted = {"apple", "grape", "lemon"};
    auto in_hits = mapped_index->In(wanted.size(), wanted.data());
    ASSERT_EQ(in_hits.count(), 5);
    ASSERT_TRUE(in_hits[0]);   // apple
    ASSERT_TRUE(in_hits[6]);   // grape
    ASSERT_TRUE(in_hits[9]);   // lemon
    ASSERT_TRUE(in_hits[10]);  // apple (duplicate)
    ASSERT_TRUE(in_hits[12]);  // apple (duplicate)

    // NotIn with values that are absent selects every row.
    std::vector<std::string> absent = {"orange", "pear"};
    auto not_in_hits = mapped_index->NotIn(absent.size(), absent.data());
    ASSERT_EQ(not_in_hits.count(), fruit_rows.size());

    // Single-bound range:
    // cherry, date, elderberry, fig, grape, honeydew, kiwi, lemon
    auto ge_cherry = mapped_index->Range("cherry", milvus::OpType::GreaterEqual);
    ASSERT_EQ(ge_cherry.count(), 8);

    // Two-bound range: banana(2), cherry, date, elderberry, fig, grape
    auto banana_to_grape = mapped_index->Range("banana", true, "grape", true);
    ASSERT_EQ(banana_to_grape.count(), 7);

    // PrefixMatch uses its own small index, also reloaded through mmap.
    std::vector<std::string> prefix_rows = {
        "app", "apple", "application", "banana", "band"};
    auto prefix_memory_index = milvus::index::CreateStringIndexSort({});
    prefix_memory_index->Build(prefix_rows.size(), prefix_rows.data());
    auto prefix_serialized = prefix_memory_index->Serialize(plain_config);

    Config prefix_load_config;
    prefix_load_config[MMAP_FILE_PATH] = "/tmp/test_prefix_mmap.idx";
    auto prefix_mapped_index = milvus::index::CreateStringIndexSort({});
    prefix_mapped_index->Load(prefix_serialized, prefix_load_config);

    auto prefix_hits = prefix_mapped_index->PrefixMatch("app");
    ASSERT_EQ(prefix_hits.count(), 3);  // app, apple, application

    // Reverse_Lookup returns the original string for every row.
    size_t row = 0;
    while (row < fruit_rows.size()) {
        auto looked_up = mapped_index->Reverse_Lookup(row);
        ASSERT_TRUE(looked_up.has_value());
        ASSERT_EQ(looked_up.value(), fruit_rows[row]);
        ++row;
    }

    // Clean up temp files
    std::remove("/tmp/test_string_index_sort_mmap.idx");
    std::remove("/tmp/test_prefix_mmap.idx");
}
// Same idea as MmapLoadAfterSerialize, but the serialized index is reloaded
// with LoadWithoutAssemble instead of Load.
TEST_F(StringIndexSortTest, LoadWithoutAssembleMmap) {
    std::vector<std::string> animals = {
        "zebra", "apple", "monkey", "dog", "cat"};

    // Build in memory and serialize.
    Config plain_config;
    auto memory_index = milvus::index::CreateStringIndexSort({});
    memory_index->Build(animals.size(), animals.data());
    auto serialized = memory_index->Serialize(plain_config);

    // Reload via LoadWithoutAssemble with an mmap file path configured.
    Config load_config;
    load_config[MMAP_FILE_PATH] = "/tmp/test_load_without_assemble.idx";
    auto mapped_index = milvus::index::CreateStringIndexSort({});
    mapped_index->LoadWithoutAssemble(serialized, load_config);

    // Every built value must be found.
    auto in_hits = mapped_index->In(animals.size(), animals.data());
    ASSERT_EQ(in_hits.count(), animals.size());

    // Range still works on the reloaded index: apple, cat, dog
    auto le_dog = mapped_index->Range("dog", milvus::OpType::LessEqual);
    ASSERT_EQ(le_dog.count(), 3);

    // Clean up
    std::remove("/tmp/test_load_without_assemble.idx");
}
} // namespace index
} // namespace milvus
// Build-and-query smoke test in two modes.
// The "Mmap mode" section used to fill a Config that was never handed to the
// index, so it silently repeated the in-memory path. It now serializes the
// index and reloads it with MMAP_FILE_PATH set before querying.
// Unused Config locals in the memory section were removed.
TEST(StringIndexSortStandaloneTest, StringIndexSortBuildAndSearch) {
    // Test data
    std::vector<std::string> test_data = {"apple",
                                          "banana",
                                          "cherry",
                                          "date",
                                          "elderberry",
                                          "fig",
                                          "grape",
                                          "honeydew",
                                          "kiwi",
                                          "lemon"};
    auto n = test_data.size();

    // Test Memory mode
    {
        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data());

        // Test In operation
        std::vector<std::string> search_vals = {"apple", "grape", "lemon"};
        auto bitset = index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 3);
        ASSERT_TRUE(bitset[0]);  // apple
        ASSERT_TRUE(bitset[6]);  // grape
        ASSERT_TRUE(bitset[9]);  // lemon

        // Test Range operation
        auto range_bitset =
            index->Range("cherry", milvus::OpType::GreaterEqual);
        ASSERT_EQ(
            range_bitset.count(),
            8);  // cherry, date, elderberry, fig, grape, honeydew, kiwi, lemon

        // Test PrefixMatch
        std::vector<std::string> test_data_prefix = {
            "app", "apple", "application", "banana", "band"};
        auto prefix_index = milvus::index::CreateStringIndexSort({});
        prefix_index->Build(test_data_prefix.size(), test_data_prefix.data());
        auto prefix_bitset = prefix_index->PrefixMatch("app");
        ASSERT_EQ(prefix_bitset.count(), 3);  // app, apple, application
    }

    // Test Mmap mode
    {
        // This test lives outside the milvus namespaces; the directives make
        // MMAP_FILE_PATH resolvable the same way it is inside
        // namespace milvus::index (where the TEST_F cases use it unqualified).
        using namespace milvus;         // NOLINT(build/namespaces)
        using namespace milvus::index;  // NOLINT(build/namespaces)

        const std::string mmap_path = "/tmp/milvus_scalar_test_build_search.idx";

        auto built_index = milvus::index::CreateStringIndexSort({});
        built_index->Build(n, test_data.data());
        milvus::Config build_config;
        auto binary_set = built_index->Serialize(build_config);

        milvus::Config mmap_config;
        mmap_config[MMAP_FILE_PATH] = mmap_path;
        auto index = milvus::index::CreateStringIndexSort({});
        index->Load(binary_set, mmap_config);

        // Test In operation
        std::vector<std::string> search_vals = {"banana", "fig"};
        auto bitset = index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 2);
        ASSERT_TRUE(bitset[1]);  // banana
        ASSERT_TRUE(bitset[5]);  // fig

        // Test NotIn operation
        auto not_bitset = index->NotIn(search_vals.size(), search_vals.data());
        ASSERT_EQ(not_bitset.count(), n - 2);
        ASSERT_FALSE(not_bitset[1]);  // banana should not be in NotIn result
        ASSERT_FALSE(not_bitset[5]);  // fig should not be in NotIn result

        // Clean up the backing file.
        std::remove(mmap_path.c_str());
    }
}
// Null handling in two modes. Rows 1 ("beta") and 3 ("delta") are invalid.
// The "Mmap mode" section used to fill a Config that was never handed to the
// index; it now serializes the index and reloads it with MMAP_FILE_PATH set.
// The validity array is allocated with std::make_unique instead of raw new[].
// NOTE(review): the mmap section relies on Serialize persisting the validity
// bitmap - confirm.
TEST(StringIndexSortStandaloneTest, StringIndexSortWithNulls) {
    std::vector<std::string> test_data = {
        "alpha", "beta", "gamma", "delta", "epsilon"};
    auto valid_data = std::make_unique<bool[]>(test_data.size());
    valid_data[0] = true;
    valid_data[1] = false;
    valid_data[2] = true;
    valid_data[3] = false;
    valid_data[4] = true;
    auto n = test_data.size();

    // Memory mode with nulls
    {
        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data(), valid_data.get());

        // Test IsNull
        auto null_bitset = index->IsNull();
        ASSERT_EQ(null_bitset.count(), 2);
        ASSERT_TRUE(null_bitset[1]);  // beta is null
        ASSERT_TRUE(null_bitset[3]);  // delta is null

        // Test IsNotNull
        auto not_null_bitset = index->IsNotNull();
        ASSERT_EQ(not_null_bitset.count(), 3);
        ASSERT_TRUE(not_null_bitset[0]);  // alpha is not null
        ASSERT_TRUE(not_null_bitset[2]);  // gamma is not null
        ASSERT_TRUE(not_null_bitset[4]);  // epsilon is not null

        // Test In with nulls
        std::vector<std::string> search_vals = {"alpha", "beta", "gamma"};
        auto bitset = index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 2);  // Only alpha and gamma (beta is null)
        ASSERT_TRUE(bitset[0]);        // alpha
        ASSERT_FALSE(bitset[1]);       // beta is null
        ASSERT_TRUE(bitset[2]);        // gamma
    }

    // Mmap mode with nulls
    {
        // This test lives outside the milvus namespaces; the directives make
        // MMAP_FILE_PATH resolvable the same way it is inside
        // namespace milvus::index (where the TEST_F cases use it unqualified).
        using namespace milvus;         // NOLINT(build/namespaces)
        using namespace milvus::index;  // NOLINT(build/namespaces)

        const std::string mmap_path = "/tmp/milvus_scalar_test_with_nulls.idx";

        auto built_index = milvus::index::CreateStringIndexSort({});
        built_index->Build(n, test_data.data(), valid_data.get());
        milvus::Config build_config;
        auto binary_set = built_index->Serialize(build_config);

        milvus::Config mmap_config;
        mmap_config[MMAP_FILE_PATH] = mmap_path;
        auto index = milvus::index::CreateStringIndexSort({});
        index->Load(binary_set, mmap_config);

        auto null_bitset = index->IsNull();
        ASSERT_EQ(null_bitset.count(), 2);

        auto not_null_bitset = index->IsNotNull();
        ASSERT_EQ(not_null_bitset.count(), 3);

        // Clean up the backing file.
        std::remove(mmap_path.c_str());
    }
}
// Serialization round trips over 100 distinct strings "str_0" .. "str_99".
// Changes to the mmap section:
//  * the mmap Config was passed to Serialize and then Load was called without
//    any Config; it is now passed to Load, keyed by MMAP_FILE_PATH;
//  * the range assertion only checked "> 0" although the expected result is
//    fully determined; it now asserts the exact count.
TEST(StringIndexSortStandaloneTest, StringIndexSortSerialization) {
    std::vector<std::string> test_data;
    for (int i = 0; i < 100; ++i) {
        test_data.push_back("str_" + std::to_string(i));
    }

    auto n = test_data.size();

    // Test Memory mode serialization
    {
        milvus::Config config;
        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data());

        // Serialize
        auto binary_set = index->Serialize(config);

        // Create new index and deserialize
        auto new_index = milvus::index::CreateStringIndexSort({});
        new_index->Load(binary_set);

        // Verify the data
        ASSERT_EQ(new_index->Count(), n);

        // Test search on deserialized index
        std::vector<std::string> search_vals = {"str_10", "str_50", "str_90"};
        auto bitset = new_index->In(search_vals.size(), search_vals.data());
        ASSERT_EQ(bitset.count(), 3);

        // Test reverse lookup
        for (size_t i = 0; i < n; ++i) {
            auto result = new_index->Reverse_Lookup(i);
            ASSERT_TRUE(result.has_value());
            ASSERT_EQ(result.value(), test_data[i]);
        }
    }

    // Test Mmap mode serialization
    {
        // This test lives outside the milvus namespaces; the directives make
        // MMAP_FILE_PATH resolvable the same way it is inside
        // namespace milvus::index (where the TEST_F cases use it unqualified).
        using namespace milvus;         // NOLINT(build/namespaces)
        using namespace milvus::index;  // NOLINT(build/namespaces)

        const std::string mmap_path =
            "/tmp/milvus_scalar_test_serialization.idx";

        auto index = milvus::index::CreateStringIndexSort({});
        index->Build(n, test_data.data());

        // Serialize
        milvus::Config build_config;
        auto binary_set = index->Serialize(build_config);

        // Create new index and deserialize through the mmap path
        milvus::Config mmap_config;
        mmap_config[MMAP_FILE_PATH] = mmap_path;
        auto new_index = milvus::index::CreateStringIndexSort({});
        new_index->Load(binary_set, mmap_config);

        // Verify the data
        ASSERT_EQ(new_index->Count(), n);

        // Test range query on deserialized index.
        // Lexicographic order between "str_20" and "str_30" inclusive is:
        // str_20, str_21, ..., str_29 (10 values), str_3, str_30.
        // "str_2" sorts before "str_20" and is excluded, so 12 rows match.
        auto bitset = new_index->Range("str_20", true, "str_30", true);
        ASSERT_EQ(bitset.count(), 12);

        // Clean up the backing file.
        std::remove(mmap_path.c_str());
    }
}

View File

@ -1,6 +1,8 @@
package indexparamcheck
import (
"fmt"
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
@ -17,8 +19,8 @@ func (c *STLSORTChecker) CheckTrain(dataType schemapb.DataType, elementType sche
}
// CheckValidDataType validates the data type of the field an STL_SORT index is
// requested on. It returns nil when the type is accepted and an error naming
// the supported field kinds otherwise.
//
// NOTE(review): this hunk shows two variants of the guard with the diff
// markers stripped: one that accepts only arithmetic types ("numeric field"
// message), and one that also accepts string types and appends the offending
// data type to the message. Presumably the first pair of lines is the removed
// version and the second pair the added one; confirm against the commit.
func (c *STLSORTChecker) CheckValidDataType(indexType IndexType, field *schemapb.FieldSchema) error {
if !typeutil.IsArithmetic(field.GetDataType()) {
return errors.New("STL_SORT are only supported on numeric field")
if !typeutil.IsArithmetic(field.GetDataType()) && !typeutil.IsStringType(field.GetDataType()) {
return errors.New(fmt.Sprintf("STL_SORT are only supported on numeric or varchar field, got %s", field.GetDataType()))
}
return nil
}

View File

@ -15,8 +15,8 @@ func Test_STLSORTIndexChecker(t *testing.T) {
assert.NoError(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_Int64}))
assert.NoError(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_Float}))
assert.NoError(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_VarChar}))
assert.Error(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_VarChar}))
assert.Error(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_Bool}))
assert.Error(t, c.CheckValidDataType(IndexSTLSORT, &schemapb.FieldSchema{DataType: schemapb.DataType_JSON}))
}

View File

@ -245,10 +245,9 @@ func TestIndexAddedField(t *testing.T) {
createIndex: index.NewInvertedIndex,
},
{
name: "SortedIndex",
indexType: "STL_SORT",
createIndex: index.NewSortedIndex,
expectedError: "STL_SORT are only supported on numeric field",
name: "SortedIndex",
indexType: "STL_SORT",
createIndex: index.NewSortedIndex,
},
{
name: "TrieIndex",

View File

@ -468,10 +468,10 @@ func TestCreateSortedScalarIndex(t *testing.T) {
idx := index.NewSortedIndex()
for _, field := range schema.Fields {
if hp.SupportScalarIndexFieldType(field.DataType) {
if field.DataType == entity.FieldTypeVarChar || field.DataType == entity.FieldTypeBool ||
if field.DataType == entity.FieldTypeBool ||
field.DataType == entity.FieldTypeJSON || field.DataType == entity.FieldTypeArray {
_, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, idx))
common.CheckErr(t, err, false, "STL_SORT are only supported on numeric field")
require.ErrorContains(t, err, "STL_SORT are only supported on numeric or varchar field")
} else {
idxTask, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, idx))
common.CheckErr(t, err, true)
@ -623,12 +623,12 @@ func TestCreateIndexJsonField(t *testing.T) {
errMsg string
}
inxError := []scalarIndexError{
{index.NewSortedIndex(), "STL_SORT are only supported on numeric field"},
{index.NewSortedIndex(), "STL_SORT are only supported on numeric or varchar field"},
{index.NewTrieIndex(), "TRIE are only supported on varchar field"},
}
for _, idxErr := range inxError {
_, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, common.DefaultJSONFieldName, idxErr.idx).WithIndexName("json_index"))
common.CheckErr(t, err, false, idxErr.errMsg)
require.ErrorContains(t, err, idxErr.errMsg)
}
}
@ -649,7 +649,7 @@ func TestCreateUnsupportedIndexArrayField(t *testing.T) {
errMsg string
}
inxError := []scalarIndexError{
{index.NewSortedIndex(), "STL_SORT are only supported on numeric field"},
{index.NewSortedIndex(), "STL_SORT are only supported on numeric or varchar field"},
{index.NewTrieIndex(), "TRIE are only supported on varchar field"},
}
@ -660,11 +660,11 @@ func TestCreateUnsupportedIndexArrayField(t *testing.T) {
if field.DataType == entity.FieldTypeArray {
// create vector index
_, err1 := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, vectorIdx).WithIndexName("vector_index"))
common.CheckErr(t, err1, false, "index SCANN only supports vector data type")
require.ErrorContains(t, err1, "index SCANN only supports vector data type")
// create scalar index
_, err := mc.CreateIndex(ctx, client.NewCreateIndexOption(schema.CollectionName, field.Name, idxErr.idx))
common.CheckErr(t, err, false, idxErr.errMsg)
require.ErrorContains(t, err, idxErr.errMsg)
}
}
}

View File

@ -849,13 +849,16 @@ class TestMilvusClientJsonPathIndexInvalid(TestMilvusClientV2Base):
# 3. create index
if not_supported_varchar_scalar_index == "TRIE":
supported_field_type = "varchar"
got_json_suffix = ""
if not_supported_varchar_scalar_index == "STL_SORT":
supported_field_type = "numeric"
supported_field_type = "numeric or varchar"
got_json_suffix = ", got JSON"
if not_supported_varchar_scalar_index == "BITMAP":
supported_field_type = "bool, int, string and array"
not_supported_varchar_scalar_index = "bitmap index"
got_json_suffix = ""
error = {ct.err_code: 1100, ct.err_msg: f"{not_supported_varchar_scalar_index} are only supported on "
f"{supported_field_type} field: invalid parameter[expected=valid "
f"{supported_field_type} field{got_json_suffix}: invalid parameter[expected=valid "
f"index params][actual=invalid index params]"}
self.create_index(client, collection_name, index_params,
check_task=CheckTasks.err_res, check_items=error)