diff --git a/Makefile b/Makefile
index f56e6d07e3..f94f22672c 100644
--- a/Makefile
+++ b/Makefile
@@ -145,6 +145,10 @@ cppcheck:
 	@#(env bash ${PWD}/scripts/core_build.sh -l)
 	@(env bash ${PWD}/scripts/check_cpp_fmt.sh)
 
+rustcheck:
+	@echo "Running cargo format"
+	@env bash ${PWD}/scripts/run_cargo_format.sh ${PWD}/internal/core/thirdparty/tantivy/tantivy-binding/
+
 fmt:
 ifdef GO_DIFF_FILES
@@ -201,7 +205,7 @@ static-check: getdeps
 	@echo "Start check go_client e2e package"
 	@source $(PWD)/scripts/setenv.sh && cd tests/go_client && GO111MODULE=on GOFLAGS=-buildvcs=false $(INSTALL_PATH)/golangci-lint run --build-tags L0,L1,L2,test --timeout=30m --config $(PWD)/tests/go_client/.golangci.yml
 
-verifiers: build-cpp getdeps cppcheck fmt static-check
+verifiers: build-cpp getdeps cppcheck rustcheck fmt static-check
 
 # Build various components locally.
 binlog:
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs
index 429708af7f..5581de88d2 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs
@@ -1,8 +1,8 @@
 use tantivy::tokenizer::*;
 
-use super::tokenizers::*;
-use super::filter::*;
 use super::filter::stop_words;
+use super::filter::*;
+use super::tokenizers::*;
 
 // default build-in analyzer
 pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
@@ -37,4 +37,4 @@ pub fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
     }
 
     builder.build()
-}
\ No newline at end of file
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs
index 470102921f..54ad1636e8 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs
@@ -1,6 +1,6 @@
 mod filter;
-mod remove_punct_filter;
 mod regex_filter;
+mod remove_punct_filter;
 pub(crate) mod stop_words;
 mod util;
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs
index 737bf82d9e..acf98119dd 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs
@@ -1,9 +1,9 @@
 use core::{option::Option::Some, result::Result::Ok};
 use jieba_rs;
-use std::io::BufReader;
 use lazy_static::lazy_static;
 use serde_json as json;
 use std::borrow::Cow;
+use std::io::BufReader;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
 use crate::error::{Result, TantivyBindingError};
@@ -52,7 +52,9 @@ impl TokenStream for JiebaTokenStream {
     }
 }
 
-fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, Option<String>)> {
+fn get_jieba_dict(
+    params: &json::Map<String, json::Value>,
+) -> Result<(Vec<String>, Option<String>)> {
     match params.get("dict") {
         Some(value) => {
             if !value.is_array() {
@@ -77,15 +79,13 @@ fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<Strin
-        _ => {
-            Ok((vec![], Some("_default_".to_string())))
-        }
+        _ => Ok((vec![], Some("_default_".to_string()))),
     }
 }
@@ -138,21 +138,23 @@ impl<'a> JiebaTokenizer<'a> {
     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
         let (dict, system_dict) = get_jieba_dict(params)?;
-        let mut tokenizer = system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| {
-            match name.as_str() {
+        let mut tokenizer =
+            system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() {
                 "_default_" => Ok(jieba_rs::Jieba::new()),
                 "_extend_default_" => {
                     let mut buf = BufReader::new(EXTEND_DEFAULT_DICT.as_bytes());
-                    jieba_rs::Jieba::with_dict(&mut buf).map_err(|e|
-                        TantivyBindingError::InternalError(format!("failed to load extend default system dict: {}", e))
-                    )
-                },
+                    jieba_rs::Jieba::with_dict(&mut buf).map_err(|e| {
+                        TantivyBindingError::InternalError(format!(
+                            "failed to load extend default system dict: {}",
+                            e
+                        ))
+                    })
+                }
                 _ => Err(TantivyBindingError::InternalError(format!(
                     "invalid system dict name: {}",
                     name
-                )))
-            }
-        })?;
+                ))),
+            })?;
 
         for word in dict {
             tokenizer.add_word(word.as_str(), None, None);
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs
index af1c8e9478..6d99f9ca04 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs
@@ -5,7 +5,7 @@ use std::collections::HashMap;
 use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer, Tokenizer};
 use whatlang::detect;
 
-pub trait Identifier : Sync + Send{
+pub trait Identifier: Sync + Send {
     fn detect(&self, text: &str) -> String;
     fn box_clone(&self) -> Box<dyn Identifier>;
 }
@@ -246,7 +246,6 @@ impl<'a> LangIdentTokenizer<'a> {
     }
 }
 
-
 impl Tokenizer for LangIdentTokenizer<'static> {
     type TokenStream<'a> = BoxTokenStream<'a>;
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs
index ae78b86f12..739fdc015d 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs
@@ -1,5 +1,5 @@
 pub const ENGLISH: &[&str] = &[
-    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
-    "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
-    "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
+    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
+    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
+    "they", "this", "to", "was", "will", "with",
 ];
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs
index 3a3a48960d..12bcb35979 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs
@@ -10,17 +10,17 @@ use crate::{
 };
 
 #[repr(C)]
-pub struct TantivyToken{
+pub struct TantivyToken {
     pub token: *const c_char,
     pub start_offset: i64,
-    pub end_offset:i64,
-    pub position:i64,
-    pub position_length:i64,
+    pub end_offset: i64,
+    pub position: i64,
+    pub position_length: i64,
 }
 
-impl TantivyToken{
-    pub fn from_token(token: &Token) -> Self{
-        TantivyToken{
+impl TantivyToken {
+    pub fn from_token(token: &Token) -> Self {
+        TantivyToken {
             token: create_string(&token.text),
             start_offset: token.offset_from as i64,
             end_offset: token.offset_to as i64,
@@ -30,7 +30,6 @@ impl TantivyToken{
     }
 }
 
-
 // Note: the tokenizer and text must be released after the token_stream.
 #[no_mangle]
 pub extern "C" fn tantivy_create_token_stream(
@@ -62,7 +61,9 @@ pub extern "C" fn tantivy_token_stream_get_token(token_stream: *mut c_void) -> *
 }
 
 #[no_mangle]
-pub extern "C" fn tantivy_token_stream_get_detailed_token(token_stream: *mut c_void) -> TantivyToken {
+pub extern "C" fn tantivy_token_stream_get_detailed_token(
+    token_stream: *mut c_void,
+) -> TantivyToken {
     let real = token_stream as *mut BoxTokenStream<'_>;
-    TantivyToken::from_token(unsafe { (*real).token()})
+    TantivyToken::from_token(unsafe { (*real).token() })
 }
diff --git a/scripts/run_cargo_format.sh b/scripts/run_cargo_format.sh
new file mode 100644
index 0000000000..49b1831229
--- /dev/null
+++ b/scripts/run_cargo_format.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Format the Rust crate in the directory passed as the first argument.
+(
+    cd "$1" || exit 1
+    cargo fmt
+)
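
With the patch applied, the new check can be exercised locally as sketched below. This is a usage sketch, not part of the patch; it assumes the commands run from the repo root of a Milvus checkout with a Rust toolchain (including rustfmt) installed.

    # Format the tantivy-binding crate via the new Makefile target.
    make rustcheck

    # Or invoke the helper script directly with the crate directory.
    bash scripts/run_cargo_format.sh internal/core/thirdparty/tantivy/tantivy-binding/

    # rustcheck now also runs as part of the aggregate verifier chain.
    make verifiers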