From e92cc55ec1ce3e41de79838d18ed99a988bd1416 Mon Sep 17 00:00:00 2001 From: xgc Date: Fri, 29 Nov 2024 16:29:16 +0800 Subject: [PATCH] =?UTF-8?q?fix=EF=BC=9A=E4=BF=AE=E5=A4=8D2.2.0=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E4=BD=BF=E7=94=A8=E5=86=85=E7=BD=AE=E5=88=86=E6=9E=90?= =?UTF-8?q?=E4=BB=AA=E7=9A=84=E9=97=AE=E9=A2=98=EF=BC=8C=E5=B9=B6=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E6=B3=A8=E8=A7=A3=E7=9A=84=E4=BD=BF=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Sparse-BM25文本搜索.md | 42 +++++++++++---- .../plus/annotation/AnalyzerParams.java | 9 ++-- .../{BuiltInFilter.java => Filter.java} | 8 ++- .../milvus/plus/annotation/MilvusField.java | 4 +- .../milvus/plus/model/AnalyzerType.java | 8 +++ .../milvus/plus/model/TokenizerType.java | 8 --- .../milvus/plus/util/AnalyzerParamsUtils.java | 52 +++++++++++++------ milvus-plus-parent/pom.xml | 2 +- milvus-spring-demo/pom.xml | 2 +- 9 files changed, 88 insertions(+), 47 deletions(-) rename milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/{BuiltInFilter.java => Filter.java} (69%) create mode 100644 milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/AnalyzerType.java delete mode 100644 milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/TokenizerType.java diff --git a/Sparse-BM25文本搜索.md b/Sparse-BM25文本搜索.md index 6eb2a46..612a903 100644 --- a/Sparse-BM25文本搜索.md +++ b/Sparse-BM25文本搜索.md @@ -34,35 +34,55 @@ public class TextEntity { dataType = DataType.VarChar, enableAnalyzer = true, analyzerParams = @AnalyzerParams( - builtInFilters = { - @BuiltInFilter - }, - customFilters = { - @CustomFilter(type = "length", max = 40), - @CustomFilter(type = "stop", stopWords = {"of", "to"}) - } + type= AnalyzerType.CHINESE ) +// analyzerParams = @AnalyzerParams( +// tokenizer= "standard", +// filter=@Filter( +// builtInFilters={ +// BuiltInFilterType.lowercase +// }, +// customFilters = { +// @CustomFilter( +// type = "length", +// max = 40 +// ), +// @CustomFilter( +// type = "stop", +// stopWords = {"of","to"} +// ) +// } +// ) +// ) ) private String text; } ``` - 非专业人员不要设置 analyzerParams,只需设置 enableAnalyzer = true即可。 +## 分析仪的使用 + +- 使用type指定内置分析仪 +- 自定义分析仪,需配置分词器和过滤器 + +### 默认分析仪(Type) + +- **默认分析仪**:可以根据 AnalyzerType 接口中的常量来设置具体的分词器类型 + ### 分词器(Tokenizer) - **默认分词器**:`standard` 分词器,基于语法规则将文本拆分为离散的单词单元。 -- **注解属性**:`tokenizer`,其默认值为 `TokenizerType.standard`。 +- **注解属性**:在 `AnalyzerParams` 注解中使用 `tokenizer` 属性来配置分词器。如果未指定,则默认为空字符串 ### 过滤器(Filter) - **默认过滤器**:`lowercase` 过滤器,将所有标记转换为小写,以支持不区分大小写的搜索。 -- **注解属性**:`builtInFilters` 和 `customFilters`,分别用于配置内置过滤器和自定义过滤器。 +- **注解属性**:在 `Filter` 注解中使用 `builtInFilters` 和 `customFilters` 属性来配置内置过滤器和自定义过滤器。`builtInFilters` 属性接受 `BuiltInFilterType` 枚举值的数组,而 `customFilters` 属性接受 `CustomFilter` 注解的数组。 ### 自定义停用词(StopWords) - **可选参数**:`stop_words`,用于指定要从分词结果中排除的停用词列表。 -- **注解属性**:`customFilters` 中的 `stopWords` 属性,允许定义自定义停用词。 +- **注解属性**:在 `CustomFilter` 注解中使用 `stopWords` 属性来定义自定义停用词。 diff --git a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/AnalyzerParams.java b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/AnalyzerParams.java index 3350411..a2213b2 100644 --- a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/AnalyzerParams.java +++ b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/AnalyzerParams.java @@ -1,7 +1,5 @@ package org.dromara.milvus.plus.annotation; -import org.dromara.milvus.plus.model.TokenizerType; - import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; @@ -13,9 +11,8 @@ import java.lang.annotation.Target; @Target(ElementType.ANNOTATION_TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface AnalyzerParams { - - TokenizerType tokenizer() default TokenizerType.standard; // 分词器配置 - BuiltInFilter[] builtInFilters() default {}; //内置过滤器 - CustomFilter[] customFilters() default {}; //自定义过滤器 + String type() default ""; // 分析器类型 + String tokenizer() default ""; // 自定义分词器配置 + Filter filter() default @Filter; //过滤器 } \ No newline at end of file diff --git a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/BuiltInFilter.java b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/Filter.java similarity index 69% rename from milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/BuiltInFilter.java rename to milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/Filter.java index cdb2f60..7f7b87c 100644 --- a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/BuiltInFilter.java +++ b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/Filter.java @@ -12,6 +12,10 @@ import java.lang.annotation.Target; */ @Target(ElementType.ANNOTATION_TYPE) @Retention(RetentionPolicy.RUNTIME) -public @interface BuiltInFilter { - BuiltInFilterType name() default BuiltInFilterType.lowercase; +public @interface Filter { + + BuiltInFilterType[] builtInFilters() default {};; //内置过滤器 + + CustomFilter[] customFilters() default {}; //自定义过滤器 + } \ No newline at end of file diff --git a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/MilvusField.java b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/MilvusField.java index ca9dfb4..1d94553 100644 --- a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/MilvusField.java +++ b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/annotation/MilvusField.java @@ -1,6 +1,7 @@ package org.dromara.milvus.plus.annotation; import io.milvus.v2.common.DataType; +import org.dromara.milvus.plus.model.AnalyzerType; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; @@ -82,6 +83,7 @@ public @interface MilvusField { /** * 分析器参数。 */ - AnalyzerParams analyzerParams() default @AnalyzerParams; + AnalyzerParams analyzerParams() default @AnalyzerParams(type =AnalyzerType.STANDARD); + } \ No newline at end of file diff --git a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/AnalyzerType.java b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/AnalyzerType.java new file mode 100644 index 0000000..47c46ce --- /dev/null +++ b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/AnalyzerType.java @@ -0,0 +1,8 @@ +package org.dromara.milvus.plus.model; + +public interface AnalyzerType { + String STANDARD="standard"; + String ENGLISH="english"; + String CHINESE="chinese"; + +} diff --git a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/TokenizerType.java b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/TokenizerType.java deleted file mode 100644 index 61f2e40..0000000 --- a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/model/TokenizerType.java +++ /dev/null @@ -1,8 +0,0 @@ -package org.dromara.milvus.plus.model; - -/** - * 枚举表示内置的分词器类型。 - */ -public enum TokenizerType { - standard, whitespace, english, chinese -} \ No newline at end of file diff --git a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/util/AnalyzerParamsUtils.java b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/util/AnalyzerParamsUtils.java index 7d8bc08..d7752bb 100644 --- a/milvus-plus-core/src/main/java/org/dromara/milvus/plus/util/AnalyzerParamsUtils.java +++ b/milvus-plus-core/src/main/java/org/dromara/milvus/plus/util/AnalyzerParamsUtils.java @@ -1,9 +1,12 @@ package org.dromara.milvus.plus.util; import com.google.common.collect.Lists; +import org.apache.commons.lang3.StringUtils; import org.dromara.milvus.plus.annotation.AnalyzerParams; -import org.dromara.milvus.plus.annotation.BuiltInFilter; import org.dromara.milvus.plus.annotation.CustomFilter; +import org.dromara.milvus.plus.annotation.Filter; +import org.dromara.milvus.plus.model.BuiltInFilterType; +import org.springframework.util.CollectionUtils; import java.util.ArrayList; import java.util.HashMap; @@ -15,31 +18,46 @@ public class AnalyzerParamsUtils { public static Map convertToMap(AnalyzerParams analyzerParams) { Map paramsMap = new HashMap<>(); if (analyzerParams != null) { - // 设置分词器 - paramsMap.put("tokenizer", analyzerParams.tokenizer().name().toLowerCase()); - // 处理内置过滤器 - List builtInFiltersList = new ArrayList<>(); - for (BuiltInFilter builtInFilter : analyzerParams.builtInFilters()) { - builtInFiltersList.add(builtInFilter.name().name()); + String type = analyzerParams.type(); + if(StringUtils.isNotEmpty(type)){ + //使用默认分析器 + paramsMap.put("type", type); } - // 处理自定义过滤器 + String tokenizer = analyzerParams.tokenizer(); + if(StringUtils.isNotEmpty(tokenizer)){ + // 设置分词器 + paramsMap.put("tokenizer",tokenizer); + } + Filter filter = analyzerParams.filter(); + List builtInFiltersList = new ArrayList<>(); List> customFiltersList = new ArrayList<>(); - for (CustomFilter customFilter : analyzerParams.customFilters()) { - Map filterMap = new HashMap<>(); - filterMap.put("type", customFilter.type()); - if (customFilter.max() > 0) { - filterMap.put("max", customFilter.max()); + if(filter!=null){ + CustomFilter[] customFilters = filter.customFilters(); + BuiltInFilterType[] builtInFilterTypes = filter.builtInFilters(); + // 处理内置过滤器 + for (BuiltInFilterType builtInFilterType : builtInFilterTypes) { + builtInFiltersList.add(builtInFilterType.name()); } - if (customFilter.stopWords().length > 0) { - filterMap.put("stopWords", new ArrayList<>(Lists.newArrayList(customFilter.stopWords()))); + //处理自定义过滤器 + for (CustomFilter customFilter : customFilters) { + Map filterMap = new HashMap<>(); + filterMap.put("type", customFilter.type()); + if (customFilter.max() > 0) { + filterMap.put("max", customFilter.max()); + } + if (customFilter.stopWords().length > 0) { + filterMap.put("stop_words", new ArrayList<>(Lists.newArrayList(customFilter.stopWords()))); + } + customFiltersList.add(filterMap); } - customFiltersList.add(filterMap); } // 合并过滤器列表 List filters = new ArrayList<>(); filters.addAll(builtInFiltersList); filters.addAll(customFiltersList); - paramsMap.put("filter", filters); + if(!CollectionUtils.isEmpty(filters)){ + paramsMap.put("filter", filters); + } } return paramsMap; } diff --git a/milvus-plus-parent/pom.xml b/milvus-plus-parent/pom.xml index 0f9eb63..3b10618 100644 --- a/milvus-plus-parent/pom.xml +++ b/milvus-plus-parent/pom.xml @@ -30,7 +30,7 @@ - 2.2.0 + 2.2.1 ${java.version} ${java.version} 3.11.0 diff --git a/milvus-spring-demo/pom.xml b/milvus-spring-demo/pom.xml index 202ee1f..1804145 100644 --- a/milvus-spring-demo/pom.xml +++ b/milvus-spring-demo/pom.xml @@ -21,7 +21,7 @@ org.dromara.milvus-plus milvus-plus-boot-starter - 2.2.0 + 2.2.1