From 67ab5be15afb70ffd2785bcb95cabfac9959fdfa Mon Sep 17 00:00:00 2001 From: Jiquan Long Date: Thu, 11 Jan 2024 11:12:49 +0800 Subject: [PATCH] enhance: optimize search performance of inverted index (#29794) issue: #29793 Use `DocSetCollector` instead of `TopDocsCollector`, which will avoid scoring and sorting. --------- Signed-off-by: longjiquan --- .../core/thirdparty/tantivy/CMakeLists.txt | 7 ++ internal/core/thirdparty/tantivy/bench.cpp | 65 +++++++++++++++++++ .../tantivy/tantivy-binding/build.rs | 1 - .../tantivy-binding/include/tantivy-binding.h | 48 +++++++------- .../tantivy/tantivy-binding/src/array.rs | 3 +- .../tantivy/tantivy-binding/src/data_type.rs | 12 ++-- .../tantivy-binding/src/hashset_collector.rs | 59 +++++++++++++++++ .../tantivy-binding/src/index_reader.rs | 43 +++++------- .../tantivy-binding/src/index_reader_c.rs | 10 ++- .../tantivy-binding/src/index_writer.rs | 1 - .../tantivy-binding/src/index_writer_c.rs | 8 +-- .../tantivy/tantivy-binding/src/lib.rs | 13 ++-- .../src/linkedlist_collector.rs | 61 +++++++++++++++++ .../tantivy/tantivy-binding/src/util.rs | 8 +-- .../tantivy/tantivy-binding/src/util_c.rs | 6 +- .../tantivy-binding/src/vec_collector.rs | 55 ++++++++++++++++ .../core/thirdparty/tantivy/time_recorder.h | 65 +++++++++++++++++++ 17 files changed, 386 insertions(+), 79 deletions(-) create mode 100644 internal/core/thirdparty/tantivy/bench.cpp create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/hashset_collector.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/linkedlist_collector.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs create mode 100644 internal/core/thirdparty/tantivy/time_recorder.h diff --git a/internal/core/thirdparty/tantivy/CMakeLists.txt b/internal/core/thirdparty/tantivy/CMakeLists.txt index 1ba1fdd1e5..f4d9289228 100644 --- a/internal/core/thirdparty/tantivy/CMakeLists.txt +++ b/internal/core/thirdparty/tantivy/CMakeLists.txt @@ -64,3 +64,10 @@ target_link_libraries(test_tantivy boost_filesystem dl ) + +add_executable(bench_tantivy bench.cpp) +target_link_libraries(bench_tantivy + tantivy_binding + boost_filesystem + dl + ) diff --git a/internal/core/thirdparty/tantivy/bench.cpp b/internal/core/thirdparty/tantivy/bench.cpp new file mode 100644 index 0000000000..8b8defd403 --- /dev/null +++ b/internal/core/thirdparty/tantivy/bench.cpp @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include + +#include "tantivy-binding.h" +#include "tantivy-wrapper.h" +#include "time_recorder.h" + +using namespace milvus::tantivy; + +void +build_index(size_t n = 1000000) { + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + + auto w = + TantivyIndexWrapper("test_field_name", TantivyDataType::Keyword, path); + + std::vector arr; + arr.reserve(n); + + std::default_random_engine er(42); + int64_t sample = 10000; + for (size_t i = 0; i < n; i++) { + auto x = er() % sample; + arr.push_back(std::to_string(x)); + } + + w.add_data(arr.data(), arr.size()); + + w.finish(); + assert(w.count() == n); +} + +void +search(size_t repeat = 10) { + TimeRecorder tr("bench-tantivy-search"); + + auto path = "/tmp/inverted-index/test-binding/"; + assert(tantivy_index_exist(path)); + tr.RecordSection("check if index exist"); + + auto w = TantivyIndexWrapper(path); + auto cnt = w.count(); + tr.RecordSection("count num_entities"); + std::cout << "index already exist, open it, count: " << cnt << std::endl; + + for (size_t i = 0; i < repeat; i++) { + w.lower_bound_range_query(std::to_string(45), false); + tr.RecordSection("query"); + } + + tr.ElapseFromBegin("done"); +} + +int +main(int argc, char* argv[]) { + build_index(1000000); + search(10); + + return 0; +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/build.rs b/internal/core/thirdparty/tantivy/tantivy-binding/build.rs index f47ea86fd3..9d583e0a0c 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/build.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/build.rs @@ -1,6 +1,5 @@ use std::{env, path::PathBuf}; - fn main() { let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); let package_name = env::var("CARGO_PKG_NAME").unwrap(); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index 0cabaf6571..9c921155fb 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -21,30 +21,6 @@ struct RustArray { extern "C" { -void *tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path); - -void tantivy_free_index_writer(void *ptr); - -void tantivy_finish_index(void *ptr); - -void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len); - -void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len); - -void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len); - -void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len); - -void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len); - -void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len); - -void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len); - -void tantivy_index_add_keyword(void *ptr, const char *s); - -bool tantivy_index_exist(const char *path); - void free_rust_array(RustArray array); void *tantivy_load_index(const char *path); @@ -97,4 +73,28 @@ RustArray tantivy_range_query_keyword(void *ptr, RustArray tantivy_prefix_query_keyword(void *ptr, const char *prefix); +void *tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path); + +void tantivy_free_index_writer(void *ptr); + +void tantivy_finish_index(void *ptr); + +void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len); + +void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len); + +void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len); + +void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len); + +void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len); + +void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len); + +void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len); + +void tantivy_index_add_keyword(void *ptr, const char *s); + +bool tantivy_index_exist(const char *path); + } // extern "C" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs index 842ff84a28..9d71ffa315 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs @@ -1,6 +1,5 @@ use libc::size_t; - #[repr(C)] pub struct RustArray { array: *mut u32, @@ -23,7 +22,7 @@ impl RustArray { #[no_mangle] pub extern "C" fn free_rust_array(array: RustArray) { - let RustArray { array, len , cap} = array; + let RustArray { array, len, cap } = array; unsafe { Vec::from_raw_parts(array, len, cap); } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs index 8242331418..9b646f648f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs @@ -1,9 +1,9 @@ #[repr(u8)] pub enum TantivyDataType { - // Text, - Keyword, - // U64, - I64, - F64, - Bool, + // Text, + Keyword, + // U64, + I64, + F64, + Bool, } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/hashset_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/hashset_collector.rs new file mode 100644 index 0000000000..07002e446c --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/hashset_collector.rs @@ -0,0 +1,59 @@ +use std::collections::HashSet; + +use tantivy::{ + collector::{Collector, SegmentCollector}, + DocId, +}; + +pub struct HashSetCollector; + +impl Collector for HashSetCollector { + type Fruit = HashSet; + + type Child = HashSetChildCollector; + + fn for_segment( + &self, + _segment_local_id: tantivy::SegmentOrdinal, + _segment: &tantivy::SegmentReader, + ) -> tantivy::Result { + Ok(HashSetChildCollector { + docs: HashSet::new(), + }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, segment_fruits: Vec>) -> tantivy::Result> { + if segment_fruits.len() == 1 { + Ok(segment_fruits.into_iter().next().unwrap()) + } else { + let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum(); + let mut result = HashSet::with_capacity(len); + for docs in segment_fruits { + for doc in docs { + result.insert(doc); + } + } + Ok(result) + } + } +} + +pub struct HashSetChildCollector { + docs: HashSet, +} + +impl SegmentCollector for HashSetChildCollector { + type Fruit = HashSet; + + fn collect(&mut self, doc: DocId, _score: tantivy::Score) { + self.docs.insert(doc); + } + + fn harvest(self) -> Self::Fruit { + self.docs + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs index 805431d41a..6e877dd5d6 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs @@ -1,22 +1,19 @@ use std::ops::Bound; use std::str::FromStr; - - -use tantivy::collector::TopDocs; use tantivy::directory::MmapDirectory; -use tantivy::query::{Query, RangeQuery, TermQuery, RegexQuery}; +use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery}; use tantivy::schema::{Field, IndexRecordOption}; use tantivy::{Index, IndexReader, ReloadPolicy, Term}; - use crate::util::make_bounds; +use crate::vec_collector::VecCollector; pub struct IndexReaderWrapper { pub field_name: String, pub field: Field, pub reader: IndexReader, - pub cnt: u32, + pub cnt: u32, } impl IndexReaderWrapper { @@ -26,20 +23,18 @@ impl IndexReaderWrapper { .reload_policy(ReloadPolicy::Manual) .try_into() .unwrap(); - let metas = index - .searchable_segment_metas() - .unwrap(); + let metas = index.searchable_segment_metas().unwrap(); let mut sum: u32 = 0; for meta in metas { sum += meta.max_doc(); } - reader.reload().unwrap(); + reader.reload().unwrap(); IndexReaderWrapper { - field_name: field_name.to_string(), - field, - reader, - cnt: sum, - } + field_name: field_name.to_string(), + field, + reader, + cnt: sum, + } } pub fn load(path: &str) -> IndexReaderWrapper { @@ -52,18 +47,15 @@ impl IndexReaderWrapper { } pub fn count(&self) -> u32 { - self.cnt + self.cnt } fn search(&self, q: &dyn Query) -> Vec { let searcher = self.reader.searcher(); - let cnt = self.cnt; - let hits = searcher - .search(q, &TopDocs::with_limit(cnt as usize)) - .unwrap(); - let mut ret = Vec::new(); - for (_, address) in hits { - ret.push(address.doc_id); + let hits = searcher.search(q, &VecCollector).unwrap(); + let mut ret = Vec::with_capacity(hits.len()); + for address in hits { + ret.push(address); } ret } @@ -193,10 +185,7 @@ impl IndexReaderWrapper { self.search(&q) } - pub fn prefix_query_keyword( - &self, - prefix: &str, - ) -> Vec { + pub fn prefix_query_keyword(&self, prefix: &str) -> Vec { let pattern = format!("{}(.|\n)*", prefix); let q = RegexQuery::from_pattern(&pattern, self.field).unwrap(); self.search(&q) diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs index bdacbf66b7..93afd7daf1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs @@ -1,7 +1,10 @@ use std::ffi::{c_char, c_void, CStr}; use crate::{ - index_reader::IndexReaderWrapper, util_c::tantivy_index_exist, util::{create_binding, free_binding}, array::RustArray, + array::RustArray, + index_reader::IndexReaderWrapper, + util::{create_binding, free_binding}, + util_c::tantivy_index_exist, }; #[no_mangle] @@ -196,7 +199,10 @@ pub extern "C" fn tantivy_range_query_keyword( } #[no_mangle] -pub extern "C" fn tantivy_prefix_query_keyword(ptr: *mut c_void, prefix: *const c_char) -> RustArray { +pub extern "C" fn tantivy_prefix_query_keyword( + ptr: *mut c_void, + prefix: *const c_char, +) -> RustArray { let real = ptr as *mut IndexReaderWrapper; unsafe { let c_str = CStr::from_ptr(prefix); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs index 4f8ed8e9df..96a466ad51 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs @@ -4,7 +4,6 @@ use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextO use tantivy::{doc, tokenizer, Index, IndexWriter}; use crate::data_type::TantivyDataType; -use crate::index_reader::IndexReaderWrapper; pub struct IndexWriterWrapper { pub field_name: String, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs index 482011d305..c882278115 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs @@ -2,7 +2,9 @@ use core::slice; use std::ffi::{c_char, c_void, CStr}; use crate::{ - data_type::TantivyDataType, index_writer::IndexWriterWrapper, util::{create_binding, free_binding}, + data_type::TantivyDataType, + index_writer::IndexWriterWrapper, + util::{create_binding, free_binding}, }; #[no_mangle] @@ -31,9 +33,7 @@ pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) { #[no_mangle] pub extern "C" fn tantivy_finish_index(ptr: *mut c_void) { let real = ptr as *mut IndexWriterWrapper; - unsafe { - Box::from_raw(real).finish() - } + unsafe { Box::from_raw(real).finish() } } // -------------------------build-------------------- diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs index 0327a3ec83..c9ae7235e6 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs @@ -1,11 +1,14 @@ -mod data_type; -mod index_writer; -mod index_writer_c; -mod util; -mod util_c; mod array; +mod data_type; +mod hashset_collector; mod index_reader; mod index_reader_c; +mod index_writer; +mod index_writer_c; +mod linkedlist_collector; +mod util; +mod util_c; +mod vec_collector; pub fn add(left: usize, right: usize) -> usize { left + right diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/linkedlist_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/linkedlist_collector.rs new file mode 100644 index 0000000000..5200f7102c --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/linkedlist_collector.rs @@ -0,0 +1,61 @@ +use std::collections::LinkedList; + +use tantivy::{ + collector::{Collector, SegmentCollector}, + DocId, +}; + +pub struct LinkedListCollector; + +impl Collector for LinkedListCollector { + type Fruit = LinkedList; + + type Child = LinkedListChildCollector; + + fn for_segment( + &self, + _segment_local_id: tantivy::SegmentOrdinal, + _segment: &tantivy::SegmentReader, + ) -> tantivy::Result { + Ok(LinkedListChildCollector { + docs: LinkedList::new(), + }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits( + &self, + segment_fruits: Vec>, + ) -> tantivy::Result> { + if segment_fruits.len() == 1 { + Ok(segment_fruits.into_iter().next().unwrap()) + } else { + let mut result = LinkedList::new(); + for docs in segment_fruits { + for doc in docs { + result.push_front(doc); + } + } + Ok(result) + } + } +} + +pub struct LinkedListChildCollector { + docs: LinkedList, +} + +impl SegmentCollector for LinkedListChildCollector { + type Fruit = LinkedList; + + fn collect(&mut self, doc: DocId, _score: tantivy::Score) { + self.docs.push_front(doc); + } + + fn harvest(self) -> Self::Fruit { + self.docs + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs index 4abc4482e1..1f1c1655c1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs @@ -1,5 +1,5 @@ +use std::ffi::c_void; use std::ops::Bound; -use std::ffi::{c_void}; use tantivy::{directory::MmapDirectory, Index}; @@ -17,9 +17,9 @@ pub fn make_bounds(bound: T, inclusive: bool) -> Bound { } pub fn create_binding(wrapper: T) -> *mut c_void { - let bp = Box::new(wrapper); - let p_heap : *mut T = Box::into_raw(bp); - p_heap as *mut c_void + let bp = Box::new(wrapper); + let p_heap: *mut T = Box::into_raw(bp); + p_heap as *mut c_void } pub fn free_binding(ptr: *mut c_void) { diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/util_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/util_c.rs index f39756379f..cc35e0c97b 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/util_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/util_c.rs @@ -1,9 +1,9 @@ use std::ffi::{c_char, CStr}; -use crate::{util::index_exist}; +use crate::util::index_exist; #[no_mangle] pub extern "C" fn tantivy_index_exist(path: *const c_char) -> bool { - let path_str = unsafe { CStr::from_ptr(path) }; - index_exist(path_str.to_str().unwrap()) + let path_str = unsafe { CStr::from_ptr(path) }; + index_exist(path_str.to_str().unwrap()) } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs new file mode 100644 index 0000000000..56261d77ab --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs @@ -0,0 +1,55 @@ +use tantivy::{ + collector::{Collector, SegmentCollector}, + DocId, +}; + +pub struct VecCollector; + +impl Collector for VecCollector { + type Fruit = Vec; + + type Child = VecChildCollector; + + fn for_segment( + &self, + _segment_local_id: tantivy::SegmentOrdinal, + _segment: &tantivy::SegmentReader, + ) -> tantivy::Result { + Ok(VecChildCollector { docs: Vec::new() }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, segment_fruits: Vec>) -> tantivy::Result> { + if segment_fruits.len() == 1 { + Ok(segment_fruits.into_iter().next().unwrap()) + } else { + let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum(); + let mut result = Vec::with_capacity(len); + for docs in segment_fruits { + for doc in docs { + result.push(doc); + } + } + Ok(result) + } + } +} + +pub struct VecChildCollector { + docs: Vec, +} + +impl SegmentCollector for VecChildCollector { + type Fruit = Vec; + + fn collect(&mut self, doc: DocId, _score: tantivy::Score) { + self.docs.push(doc); + } + + fn harvest(self) -> Self::Fruit { + self.docs + } +} diff --git a/internal/core/thirdparty/tantivy/time_recorder.h b/internal/core/thirdparty/tantivy/time_recorder.h new file mode 100644 index 0000000000..c2a8d7b82b --- /dev/null +++ b/internal/core/thirdparty/tantivy/time_recorder.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include +#include + +class TimeRecorder { + using stdclock = std::chrono::high_resolution_clock; + + public: + // trace = 0, debug = 1, info = 2, warn = 3, error = 4, critical = 5 + explicit TimeRecorder(std::string hdr, int64_t log_level = 0) + : header_(std::move(hdr)), log_level_(log_level) { + start_ = last_ = stdclock::now(); + } + virtual ~TimeRecorder() = default; + + double + RecordSection(const std::string& msg) { + stdclock::time_point curr = stdclock::now(); + double span = + (std::chrono::duration(curr - last_)).count(); + last_ = curr; + + PrintTimeRecord(msg, span); + return span; + } + + double + ElapseFromBegin(const std::string& msg) { + stdclock::time_point curr = stdclock::now(); + double span = + (std::chrono::duration(curr - start_)).count(); + + PrintTimeRecord(msg, span); + return span; + } + + static std::string + GetTimeSpanStr(double span) { + std::string str_ms = std::to_string(span * 0.001) + " ms"; + return str_ms; + } + + private: + void + PrintTimeRecord(const std::string& msg, double span) { + std::string str_log; + if (!header_.empty()) { + str_log += header_ + ": "; + } + str_log += msg; + str_log += " ("; + str_log += TimeRecorder::GetTimeSpanStr(span); + str_log += ")"; + + std::cout << str_log << std::endl; + } + + private: + std::string header_; + stdclock::time_point start_; + stdclock::time_point last_; + int64_t log_level_; +};