fix: make sure inverted index has only one segment (#32858)

issue: #32717

---------

Signed-off-by: longjiquan <jiquan.long@zilliz.com>
This commit is contained in:
Jiquan Long 2024-05-08 21:25:30 +08:00 committed by GitHub
parent 5037497929
commit 035a508722
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 188 additions and 20 deletions

View File

@ -29,6 +29,55 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "anstream"
version = "0.6.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
[[package]]
name = "anstyle-parse"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
dependencies = [
"anstyle",
"windows-sys 0.52.0",
]
[[package]] [[package]]
name = "arc-swap" name = "arc-swap"
version = "1.7.1" version = "1.7.1"
@ -167,6 +216,12 @@ dependencies = [
"os_str_bytes", "os_str_bytes",
] ]
[[package]]
name = "colorchoice"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
[[package]] [[package]]
name = "crc32fast" name = "crc32fast"
version = "1.4.0" version = "1.4.0"
@ -238,6 +293,29 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
[[package]]
name = "env_filter"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea"
dependencies = [
"log",
"regex",
]
[[package]]
name = "env_logger"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9"
dependencies = [
"anstream",
"anstyle",
"env_filter",
"humantime",
"log",
]
[[package]] [[package]]
name = "errno" name = "errno"
version = "0.3.8" version = "0.3.8"
@ -432,6 +510,12 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]] [[package]]
name = "indexmap" name = "indexmap"
version = "1.9.3" version = "1.9.3"
@ -454,6 +538,12 @@ dependencies = [
"web-sys", "web-sys",
] ]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.11.0" version = "0.11.0"
@ -978,8 +1068,10 @@ name = "tantivy-binding"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"cbindgen", "cbindgen",
"env_logger",
"futures", "futures",
"libc", "libc",
"log",
"scopeguard", "scopeguard",
"tantivy", "tantivy",
"zstd-sys", "zstd-sys",
@ -1243,6 +1335,12 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba"
[[package]]
name = "utf8parse"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]] [[package]]
name = "uuid" name = "uuid"
version = "1.8.0" version = "1.8.0"

View File

@ -11,6 +11,8 @@ futures = "0.3.21"
libc = "0.2" libc = "0.2"
scopeguard = "1.2" scopeguard = "1.2"
zstd-sys = "=2.0.9" zstd-sys = "=2.0.9"
env_logger = "0.11.3"
log = "0.4.21"
[build-dependencies] [build-dependencies]
cbindgen = "0.26.0" cbindgen = "0.26.0"

View File

@ -6,6 +6,7 @@ use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery};
use tantivy::schema::{Field, IndexRecordOption}; use tantivy::schema::{Field, IndexRecordOption};
use tantivy::{Index, IndexReader, ReloadPolicy, Term}; use tantivy::{Index, IndexReader, ReloadPolicy, Term};
use crate::log::init_log;
use crate::util::make_bounds; use crate::util::make_bounds;
use crate::vec_collector::VecCollector; use crate::vec_collector::VecCollector;
@ -18,6 +19,8 @@ pub struct IndexReaderWrapper {
impl IndexReaderWrapper { impl IndexReaderWrapper {
pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper { pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper {
init_log();
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)

View File

@ -212,10 +212,7 @@ pub extern "C" fn tantivy_prefix_query_keyword(
} }
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_regex_query( pub extern "C" fn tantivy_regex_query(ptr: *mut c_void, pattern: *const c_char) -> RustArray {
ptr: *mut c_void,
pattern: *const c_char,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper; let real = ptr as *mut IndexReaderWrapper;
unsafe { unsafe {
let c_str = CStr::from_ptr(pattern); let c_str = CStr::from_ptr(pattern);

View File

@ -1,21 +1,24 @@
use futures::executor::block_on; use futures::executor::block_on;
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED}; use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED};
use tantivy::{doc, tokenizer, Index, IndexWriter}; use tantivy::{doc, tokenizer, Index, IndexWriter, SingleSegmentIndexWriter};
use crate::data_type::TantivyDataType; use crate::data_type::TantivyDataType;
use crate::index_writer;
use crate::log::init_log;
pub struct IndexWriterWrapper { pub struct IndexWriterWrapper {
pub field_name: String, pub field_name: String,
pub field: Field, pub field: Field,
pub data_type: TantivyDataType, pub data_type: TantivyDataType,
pub path: String, pub path: String,
pub index: Index, pub index_writer: SingleSegmentIndexWriter,
pub index_writer: IndexWriter,
} }
impl IndexWriterWrapper { impl IndexWriterWrapper {
pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper { pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper {
init_log();
let field: Field; let field: Field;
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let mut use_raw_tokenizer = false; let mut use_raw_tokenizer = false;
@ -45,60 +48,59 @@ impl IndexWriterWrapper {
.tokenizers() .tokenizers()
.register("raw_tokenizer", tokenizer::RawTokenizer::default()); .register("raw_tokenizer", tokenizer::RawTokenizer::default());
} }
let index_writer = index.writer_with_num_threads(1, 15_000_000).unwrap(); let index_writer = SingleSegmentIndexWriter::new(index, 15 * 1024 * 1024).unwrap();
IndexWriterWrapper { IndexWriterWrapper {
field_name, field_name,
field, field,
data_type, data_type,
path, path,
index,
index_writer, index_writer,
} }
} }
pub fn add_i8(&self, data: i8) { pub fn add_i8(&mut self, data: i8) {
self.add_i64(data.into()) self.add_i64(data.into())
} }
pub fn add_i16(&self, data: i16) { pub fn add_i16(&mut self, data: i16) {
self.add_i64(data.into()) self.add_i64(data.into())
} }
pub fn add_i32(&self, data: i32) { pub fn add_i32(&mut self, data: i32) {
self.add_i64(data.into()) self.add_i64(data.into())
} }
pub fn add_i64(&self, data: i64) { pub fn add_i64(&mut self, data: i64) {
self.index_writer self.index_writer
.add_document(doc!(self.field => data)) .add_document(doc!(self.field => data))
.unwrap(); .unwrap();
} }
pub fn add_f32(&self, data: f32) { pub fn add_f32(&mut self, data: f32) {
self.add_f64(data.into()) self.add_f64(data.into())
} }
pub fn add_f64(&self, data: f64) { pub fn add_f64(&mut self, data: f64) {
self.index_writer self.index_writer
.add_document(doc!(self.field => data)) .add_document(doc!(self.field => data))
.unwrap(); .unwrap();
} }
pub fn add_bool(&self, data: bool) { pub fn add_bool(&mut self, data: bool) {
self.index_writer self.index_writer
.add_document(doc!(self.field => data)) .add_document(doc!(self.field => data))
.unwrap(); .unwrap();
} }
pub fn add_keyword(&self, data: &str) { pub fn add_keyword(&mut self, data: &str) {
self.index_writer self.index_writer
.add_document(doc!(self.field => data)) .add_document(doc!(self.field => data))
.unwrap(); .unwrap();
} }
pub fn finish(mut self) { pub fn finish(mut self) {
self.index_writer.commit().unwrap(); self.index_writer
block_on(self.index_writer.garbage_collect_files()).unwrap(); .finalize()
self.index_writer.wait_merging_threads().unwrap(); .expect("failed to build inverted index");
} }
} }

View File

@ -6,6 +6,7 @@ mod index_reader_c;
mod index_writer; mod index_writer;
mod index_writer_c; mod index_writer_c;
mod linkedlist_collector; mod linkedlist_collector;
mod log;
mod util; mod util;
mod util_c; mod util_c;
mod vec_collector; mod vec_collector;

View File

@ -0,0 +1,10 @@
use env_logger::Env;
use std::sync::Once;
/// Installs the `env_logger` backend the first time it is called.
///
/// The log filter is read from the `MY_LOG_LEVEL` environment variable and
/// falls back to `info` when that variable is not set. The `Once` guard makes
/// every later call a no-op, so callers may invoke this unconditionally.
pub(crate) fn init_log() {
    static INIT: Once = Once::new();
    INIT.call_once(|| {
        let env = Env::default().filter_or("MY_LOG_LEVEL", "info");
        // `init_from_env` panics when a global logger has already been
        // registered. A panic inside `call_once` would also poison `INIT`,
        // turning every subsequent `init_log()` into a panic as well. The
        // fallible variant reports that situation as an `Err` instead; it is
        // ignored on purpose because an already-installed logger is still a
        // working logger.
        let _ = env_logger::try_init_from_env(env);
    });
}

View File

@ -1,3 +1,4 @@
use log::warn;
use tantivy::{ use tantivy::{
collector::{Collector, SegmentCollector}, collector::{Collector, SegmentCollector},
DocId, DocId,
@ -26,6 +27,10 @@ impl Collector for VecCollector {
if segment_fruits.len() == 1 { if segment_fruits.len() == 1 {
Ok(segment_fruits.into_iter().next().unwrap()) Ok(segment_fruits.into_iter().next().unwrap())
} else { } else {
warn!(
"inverted index should have only one segment, but got {} segments",
segment_fruits.len()
);
let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum(); let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum();
let mut result = Vec::with_capacity(len); let mut result = Vec::with_capacity(len);
for docs in segment_fruits { for docs in segment_fruits {

View File

@ -2,6 +2,9 @@
#include <cassert> #include <cassert>
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <iostream> #include <iostream>
#include <random>
#include <set>
#include <map>
#include "tantivy-binding.h" #include "tantivy-binding.h"
#include "tantivy-wrapper.h" #include "tantivy-wrapper.h"
@ -152,8 +155,55 @@ run<std::string>() {
} }
} }
// Regression test for issue #32717.
//
// Builds an int16 inverted index over one million random values in [1, 1000],
// then runs term queries for a sample of terms. For each sampled term it
// checks that
//   * the number of hits equals the number of rows holding that value, and
//   * every returned offset points at a row holding that value.
// The values come from std::random_device, so the data differs between runs;
// the assertions hold for any seed.
void
test_32717() {
    using T = int16_t;

    auto path = "/tmp/inverted-index/test-binding/";
    // Start from an empty directory so earlier runs cannot leak state.
    boost::filesystem::remove_all(path);
    boost::filesystem::create_directories(path);

    if (tantivy_index_exist(path)) {
        auto w = TantivyIndexWrapper(path);
        auto cnt = w.count();
        std::cout << "index already exist, open it, count: " << cnt
                  << std::endl;
        return;
    }

    auto w = TantivyIndexWrapper("test_field_name", guess_data_type<T>(), path);

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> dis(1, 1000);

    const size_t l = 1000000;
    std::vector<T> arr;
    arr.reserve(l);
    // Reference inverted index: value -> set of row offsets holding it.
    std::map<T, std::set<int>> inverted;
    for (size_t i = 0; i < l; i++) {
        auto n = static_cast<T>(dis(gen));
        arr.push_back(n);
        // operator[] default-constructs the set on first use.
        inverted[n].insert(static_cast<int>(i));
    }

    w.add_data(arr.data(), l);
    w.finish();

    assert(w.count() == l);

    for (T term = 1; term < 1000; term += 10) {
        auto hits = w.term_query(term);
        // Completeness: no matching row may be missing from the result.
        assert(hits.array_.len == inverted[term].size());
        // Soundness: every returned offset must hold the queried value.
        for (size_t i = 0; i < hits.array_.len; i++) {
            assert(arr[hits.array_.array[i]] == term);
        }
    }
}
int int
main(int argc, char* argv[]) { main(int argc, char* argv[]) {
test_32717();
run<int8_t>(); run<int8_t>();
run<int16_t>(); run<int16_t>();
run<int32_t>(); run<int32_t>();