feat: add enable_tokenizer params to VarChar field #36480

Merged
merged 2 commits on Oct 10, 2024
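
For context while reading the diff: this PR lets a VarChar field opt into tokenization through an `enable_tokenizer` type param; text match (`enable_match`) now additionally requires it, and tokenizer behavior is configured via `tokenizer_params`. A minimal sketch of a field schema exercising the new parameters, using the protobuf types that appear in the Go tests below (the field name and ID are illustrative, and the surrounding collection setup is assumed):

```go
package example

import (
	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
)

// Sketch of a VarChar field that enables tokenization and text match.
// The parameter keys and values mirror the tests in this PR.
func tokenizedVarCharField() *schemapb.FieldSchema {
	return &schemapb.FieldSchema{
		Name:     "text",
		FieldID:  101,
		DataType: schemapb.DataType_VarChar,
		TypeParams: []*commonpb.KeyValuePair{
			{Key: "max_length", Value: "65536"},
			{Key: "enable_tokenizer", Value: "true"},
			{Key: "enable_match", Value: "true"},
			{Key: "tokenizer_params", Value: `{"tokenizer": "jieba"}`},
		},
	}
}
```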
40 changes: 27 additions & 13 deletions internal/core/src/common/FieldMeta.cpp
@@ -20,7 +20,7 @@
namespace milvus {
TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("analyzer_params");
auto iter = params.find("tokenizer_params");

Check warning on line 23 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L23

Added line #L23 was not covered by tests
if (iter == params.end()) {
return {};
}
@@ -47,9 +47,20 @@
return string_info_->enable_match;
}

bool
FieldMeta::enable_tokenizer() const {
if (!IsStringDataType(type_)) {
return false;
}
if (!string_info_.has_value()) {
return false;
}
return string_info_->enable_tokenizer;

Check warning on line 58 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L58

Added line #L58 was not covered by tests
}

TokenizerParams
FieldMeta::get_tokenizer_params() const {
Assert(enable_match());
Assert(enable_tokenizer());
auto params = string_info_->params;
return ParseTokenizerParams(params);
}
@@ -91,29 +102,32 @@
auto type_map = RepeatedKeyValToMap(schema_proto.type_params());
AssertInfo(type_map.count(MAX_LENGTH), "max_length not found");
auto max_len = boost::lexical_cast<int64_t>(type_map.at(MAX_LENGTH));
bool enable_match = false;
if (type_map.count("enable_match")) {
auto param_str = type_map.at("enable_match");

auto get_bool_value = [&](const std::string& key) -> bool {
if (!type_map.count(key)) {
return false;
}
auto param_str = type_map.at(key);
std::transform(param_str.begin(),
param_str.end(),
param_str.begin(),
::tolower);
std::istringstream ss(param_str);
bool b;
ss >> std::boolalpha >> b;
return b;
};

auto bool_cast = [](const std::string& arg) -> bool {
std::istringstream ss(arg);
bool b;
ss >> std::boolalpha >> b;
return b;
};
bool enable_tokenizer = get_bool_value("enable_tokenizer");
bool enable_match = get_bool_value("enable_match");

enable_match = bool_cast(param_str);
}
return FieldMeta{name,
field_id,
data_type,
max_len,
nullable,
enable_match,
enable_tokenizer,
type_map};
}
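
The new get_bool_value lambda above treats a missing key as false and lowercases the value before parsing it with std::boolalpha, so "True" and "TRUE" are also accepted. A rough Go equivalent for readers more at home on the Go side of the codebase (a hypothetical helper, not part of this PR):

```go
package example

import "strings"

// getBoolParam mirrors the C++ get_bool_value lambda: a missing key
// defaults to false, and values are compared case-insensitively, so
// anything other than some casing of "true" parses as false.
func getBoolParam(typeParams map[string]string, key string) bool {
	v, ok := typeParams[key]
	if !ok {
		return false
	}
	return strings.ToLower(v) == "true"
}
```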

8 changes: 7 additions & 1 deletion internal/core/src/common/FieldMeta.h
@@ -64,11 +64,13 @@ class FieldMeta {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
std::map<std::string, std::string>& params)
: name_(name),
id_(id),
type_(type),
string_info_(StringInfo{max_length, enable_match, std::move(params)}),
string_info_(StringInfo{
max_length, enable_match, enable_tokenizer, std::move(params)}),
nullable_(nullable) {
Assert(IsStringDataType(type_));
}
@@ -122,6 +124,9 @@ class FieldMeta {
bool
enable_match() const;

bool
enable_tokenizer() const;

TokenizerParams
get_tokenizer_params() const;

@@ -198,6 +203,7 @@ class FieldMeta {
struct StringInfo {
int64_t max_length;
bool enable_match;
bool enable_tokenizer;
std::map<std::string, std::string> params;
};
FieldName name_;
11 changes: 9 additions & 2 deletions internal/core/src/common/Schema.h
@@ -121,9 +121,16 @@ class Schema {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
std::map<std::string, std::string>& params) {
auto field_meta = FieldMeta(
name, id, data_type, max_length, nullable, enable_match, params);
auto field_meta = FieldMeta(name,
id,
data_type,
max_length,
nullable,
enable_match,
enable_tokenizer,
params);
this->AddField(std::move(field_meta));
}

@@ -33,7 +33,7 @@ pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextA
}
},
None => {
info!("no tokenizer is specific, use default tokenizer");
info!("no tokenizer is specified, using default tokenizer");
Some(default_tokenizer())
}
}
2 changes: 1 addition & 1 deletion internal/core/unittest/test_c_tokenizer.cpp
@@ -31,7 +31,7 @@ TEST(ValidateTextSchema, JieBa) {
milvus::proto::schema::FieldSchema schema;
{
auto kv = schema.add_type_params();
kv->set_key("analyzer_params");
kv->set_key("tokenizer_params");
kv->set_value(R"({"tokenizer": "jieba"})");
}

11 changes: 7 additions & 4 deletions internal/core/unittest/test_text_match.cpp
@@ -40,6 +40,7 @@ GenTestSchema(std::map<std::string, std::string> params = {}) {
65536,
false,
true,
true,
params);
schema->AddField(std::move(f));
}
@@ -76,14 +77,14 @@ TEST(ParseJson, Naive) {
}
}

TEST(ParseTokenizerParams, NoAnalyzerParams) {
TEST(ParseTokenizerParams, NoTokenizerParams) {
TypeParams params{{"k", "v"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(0, p.size());
}

TEST(ParseTokenizerParams, Default) {
TypeParams params{{"analyzer_params", R"({"tokenizer": "default"})"}};
TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(1, p.size());
auto iter = p.find("tokenizer");
@@ -251,7 +252,8 @@ TEST(TextMatch, SealedNaive) {
TEST(TextMatch, GrowingJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
@@ -327,7 +329,8 @@ TEST(TextMatch, GrowingJieBa) {
TEST(TextMatch, SealedJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
3 changes: 3 additions & 0 deletions internal/datacoord/job_manager_test.go
@@ -56,6 +56,9 @@ func (s *jobManagerSuite) TestJobManager_triggerStatsTaskLoop() {
{
Key: "enable_match", Value: "true",
},
{
Key: "enable_tokenizer", Value: "true",
},
},
},
},
@@ -21,6 +21,7 @@ import (

"github.com/stretchr/testify/assert"

"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/util/flowgraph"
@@ -44,6 +45,12 @@ func TestEmbeddingNode_BM25_Operator(t *testing.T) {
Name: "text",
FieldID: 101,
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
}, {
Name: "sparse",
FieldID: 102,
56 changes: 50 additions & 6 deletions internal/metastore/kv/rootcoord/kv_catalog_test.go
@@ -1243,9 +1243,31 @@ func TestCatalog_CreateCollection(t *testing.T) {
Partitions: []*model.Partition{
{PartitionName: "test"},
},
Fields: []*model.Field{{Name: "text", DataType: schemapb.DataType_VarChar}, {Name: "sparse", DataType: schemapb.DataType_SparseFloatVector}},
Functions: []*model.Function{{Name: "test", Type: schemapb.FunctionType_BM25, InputFieldNames: []string{"text"}, OutputFieldNames: []string{"sparse"}}},
State: pb.CollectionState_CollectionCreating,
Fields: []*model.Field{
{
Name: "text",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
},
{
Name: "sparse",
DataType: schemapb.DataType_SparseFloatVector,
},
},
Functions: []*model.Function{
{
Name: "test",
Type: schemapb.FunctionType_BM25,
InputFieldNames: []string{"text"},
OutputFieldNames: []string{"sparse"},
},
},
State: pb.CollectionState_CollectionCreating,
}
err := kc.CreateCollection(ctx, coll, 100)
assert.NoError(t, err)
@@ -1325,9 +1347,31 @@ func TestCatalog_DropCollection(t *testing.T) {
Partitions: []*model.Partition{
{PartitionName: "test"},
},
Fields: []*model.Field{{Name: "text", DataType: schemapb.DataType_VarChar}, {Name: "sparse", DataType: schemapb.DataType_SparseFloatVector}},
Functions: []*model.Function{{Name: "test", Type: schemapb.FunctionType_BM25, InputFieldNames: []string{"text"}, OutputFieldNames: []string{"sparse"}}},
State: pb.CollectionState_CollectionDropping,
Fields: []*model.Field{
{
Name: "text",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
},
{
Name: "sparse",
DataType: schemapb.DataType_SparseFloatVector,
},
},
Functions: []*model.Function{
{
Name: "test",
Type: schemapb.FunctionType_BM25,
InputFieldNames: []string{"text"},
OutputFieldNames: []string{"sparse"},
},
},
State: pb.CollectionState_CollectionDropping,
}
err := kc.DropCollection(ctx, coll, 100)
assert.NoError(t, err)
4 changes: 4 additions & 0 deletions internal/proxy/task_test.go
@@ -3116,6 +3116,10 @@ func TestCreateCollectionTaskWithPartitionKey(t *testing.T) {
Key: "max_length",
Value: strconv.Itoa(testMaxVarCharLength),
},
{
Key: "enable_tokenizer",
Value: "true",
},
},
}
floatVecField := &schemapb.FieldSchema{
6 changes: 5 additions & 1 deletion internal/proxy/util.go
@@ -697,6 +697,10 @@
return fmt.Errorf("only one VARCHAR input field is allowed for a BM25 Function, got %d field with type %s",
len(fields), fields[0].DataType.String())
}
h := typeutil.CreateFieldSchemaHelper(fields[0])
if !h.EnableTokenizer() {
return fmt.Errorf("BM25 input field must set enable_tokenizer to true")

Check warning on line 702 in internal/proxy/util.go

View check run for this annotation

Codecov / codecov/patch

internal/proxy/util.go#L702

Added line #L702 was not covered by tests
}

default:
return fmt.Errorf("check input field with unknown function type")
@@ -739,7 +743,7 @@
return fmt.Errorf("bm25_avgdl must large than zero but now %f", avgdl)
}

case "analyzer_params":
case "tokenizer_params":

Check warning on line 746 in internal/proxy/util.go

View check run for this annotation

Codecov / codecov/patch

internal/proxy/util.go#L746

Added line #L746 was not covered by tests
// TODO ADD tokenizer check
default:
return fmt.Errorf("invalid function params, key: %s, value:%s", kv.GetKey(), kv.GetValue())
4 changes: 4 additions & 0 deletions internal/util/ctokenizer/text_schema_validator.go
@@ -23,6 +23,10 @@ func ValidateTextSchema(fieldSchema *schemapb.FieldSchema) error {
return nil
}

if !h.EnableTokenizer() {
return fmt.Errorf("field %s is set to enable match but not enable tokenizer", fieldSchema.Name)
}

bs, err := proto.Marshal(fieldSchema)
if err != nil {
return fmt.Errorf("failed to marshal field schema: %w", err)