Skip to content

Commit

Permalink
add an enable_tokenizer param to varchar field: must be set to true s…
Browse files Browse the repository at this point in the history
…o that a varchar field can enable_match or be used as input of a BM25 function

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
  • Loading branch information
zhengbuqian committed Sep 30, 2024
1 parent a6545b2 commit 7a55818
Show file tree
Hide file tree
Showing 14 changed files with 185 additions and 83 deletions.
40 changes: 27 additions & 13 deletions internal/core/src/common/FieldMeta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
namespace milvus {
TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("analyzer_params");
auto iter = params.find("tokenizer_params");

Check warning on line 23 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L23

Added line #L23 was not covered by tests
if (iter == params.end()) {
return {};
}
Expand All @@ -47,9 +47,20 @@ FieldMeta::enable_match() const {
return string_info_->enable_match;
}

bool
FieldMeta::enable_tokenizer() const {
if (!IsStringDataType(type_)) {

Check warning on line 52 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L51-L52

Added lines #L51 - L52 were not covered by tests
return false;
}
if (!string_info_.has_value()) {

Check warning on line 55 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L55

Added line #L55 was not covered by tests
return false;
}
return string_info_->enable_tokenizer;

Check warning on line 58 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L58

Added line #L58 was not covered by tests
}

TokenizerParams
FieldMeta::get_tokenizer_params() const {
Assert(enable_match());
Assert(enable_tokenizer());

Check warning on line 63 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L63

Added line #L63 was not covered by tests
auto params = string_info_->params;
return ParseTokenizerParams(params);
}
Expand Down Expand Up @@ -91,29 +102,32 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
auto type_map = RepeatedKeyValToMap(schema_proto.type_params());
AssertInfo(type_map.count(MAX_LENGTH), "max_length not found");
auto max_len = boost::lexical_cast<int64_t>(type_map.at(MAX_LENGTH));
bool enable_match = false;
if (type_map.count("enable_match")) {
auto param_str = type_map.at("enable_match");

auto get_bool_value = [&](const std::string& key) -> bool {
if (!type_map.count(key)) {
return false;

Check warning on line 108 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L106-L108

Added lines #L106 - L108 were not covered by tests
}
auto param_str = type_map.at(key);

Check warning on line 110 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L110

Added line #L110 was not covered by tests
std::transform(param_str.begin(),
param_str.end(),
param_str.begin(),
::tolower);
std::istringstream ss(param_str);
bool b;
ss >> std::boolalpha >> b;
return b;
};

Check warning on line 119 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L115-L119

Added lines #L115 - L119 were not covered by tests

auto bool_cast = [](const std::string& arg) -> bool {
std::istringstream ss(arg);
bool b;
ss >> std::boolalpha >> b;
return b;
};
bool enable_tokenizer = get_bool_value("enable_tokenizer");
bool enable_match = get_bool_value("enable_match");

Check warning on line 122 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L121-L122

Added lines #L121 - L122 were not covered by tests

enable_match = bool_cast(param_str);
}
return FieldMeta{name,
field_id,
data_type,
max_len,
nullable,
enable_match,
enable_tokenizer,
type_map};
}

Expand Down
8 changes: 7 additions & 1 deletion internal/core/src/common/FieldMeta.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,13 @@ class FieldMeta {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
std::map<std::string, std::string>& params)
: name_(name),
id_(id),
type_(type),
string_info_(StringInfo{max_length, enable_match, std::move(params)}),
string_info_(StringInfo{
max_length, enable_match, enable_tokenizer, std::move(params)}),
nullable_(nullable) {
Assert(IsStringDataType(type_));
}
Expand Down Expand Up @@ -122,6 +124,9 @@ class FieldMeta {
bool
enable_match() const;

bool
enable_tokenizer() const;

TokenizerParams
get_tokenizer_params() const;

Expand Down Expand Up @@ -198,6 +203,7 @@ class FieldMeta {
struct StringInfo {
int64_t max_length;
bool enable_match;
bool enable_tokenizer;
std::map<std::string, std::string> params;
};
FieldName name_;
Expand Down
11 changes: 9 additions & 2 deletions internal/core/src/common/Schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,16 @@ class Schema {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
std::map<std::string, std::string>& params) {
auto field_meta = FieldMeta(
name, id, data_type, max_length, nullable, enable_match, params);
auto field_meta = FieldMeta(name,
id,
data_type,
max_length,
nullable,
enable_match,
enable_tokenizer,
params);
this->AddField(std::move(field_meta));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextA
}
},
None => {
info!("no tokenizer is specific, use default tokenizer");
info!("no tokenizer is specified, using default tokenizer");
Some(default_tokenizer())
}
}
Expand Down
2 changes: 1 addition & 1 deletion internal/core/unittest/test_c_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ TEST(ValidateTextSchema, JieBa) {
milvus::proto::schema::FieldSchema schema;
{
auto kv = schema.add_type_params();
kv->set_key("analyzer_params");
kv->set_key("tokenizer_params");
kv->set_value(R"({"tokenizer": "jieba"})");
}

Expand Down
11 changes: 7 additions & 4 deletions internal/core/unittest/test_text_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ GenTestSchema(std::map<std::string, std::string> params = {}) {
65536,
false,
true,
true,
params);
schema->AddField(std::move(f));
}
Expand Down Expand Up @@ -76,14 +77,14 @@ TEST(ParseJson, Naive) {
}
}

TEST(ParseTokenizerParams, NoAnalyzerParams) {
TEST(ParseTokenizerParams, NoTokenizerParams) {
TypeParams params{{"k", "v"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(0, p.size());
}

TEST(ParseTokenizerParams, Default) {
TypeParams params{{"analyzer_params", R"({"tokenizer": "default"})"}};
TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(1, p.size());
auto iter = p.find("tokenizer");
Expand Down Expand Up @@ -251,7 +252,8 @@ TEST(TextMatch, SealedNaive) {
TEST(TextMatch, GrowingJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
Expand Down Expand Up @@ -327,7 +329,8 @@ TEST(TextMatch, GrowingJieBa) {
TEST(TextMatch, SealedJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
Expand Down
3 changes: 3 additions & 0 deletions internal/datacoord/job_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ func (s *jobManagerSuite) TestJobManager_triggerStatsTaskLoop() {
{
Key: "enable_match", Value: "true",
},
{
Key: "enable_tokenizer", Value: "true",
},
},
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (

"github.com/stretchr/testify/assert"

"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/util/flowgraph"
Expand All @@ -44,6 +45,12 @@ func TestEmbeddingNode_BM25_Operator(t *testing.T) {
Name: "text",
FieldID: 101,
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
}, {
Name: "sparse",
FieldID: 102,
Expand Down
56 changes: 50 additions & 6 deletions internal/metastore/kv/rootcoord/kv_catalog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1243,9 +1243,31 @@ func TestCatalog_CreateCollection(t *testing.T) {
Partitions: []*model.Partition{
{PartitionName: "test"},
},
Fields: []*model.Field{{Name: "text", DataType: schemapb.DataType_VarChar}, {Name: "sparse", DataType: schemapb.DataType_SparseFloatVector}},
Functions: []*model.Function{{Name: "test", Type: schemapb.FunctionType_BM25, InputFieldNames: []string{"text"}, OutputFieldNames: []string{"sparse"}}},
State: pb.CollectionState_CollectionCreating,
Fields: []*model.Field{
{
Name: "text",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
},
{
Name: "sparse",
DataType: schemapb.DataType_SparseFloatVector,
},
},
Functions: []*model.Function{
{
Name: "test",
Type: schemapb.FunctionType_BM25,
InputFieldNames: []string{"text"},
OutputFieldNames: []string{"sparse"},
},
},
State: pb.CollectionState_CollectionCreating,
}
err := kc.CreateCollection(ctx, coll, 100)
assert.NoError(t, err)
Expand Down Expand Up @@ -1325,9 +1347,31 @@ func TestCatalog_DropCollection(t *testing.T) {
Partitions: []*model.Partition{
{PartitionName: "test"},
},
Fields: []*model.Field{{Name: "text", DataType: schemapb.DataType_VarChar}, {Name: "sparse", DataType: schemapb.DataType_SparseFloatVector}},
Functions: []*model.Function{{Name: "test", Type: schemapb.FunctionType_BM25, InputFieldNames: []string{"text"}, OutputFieldNames: []string{"sparse"}}},
State: pb.CollectionState_CollectionDropping,
Fields: []*model.Field{
{
Name: "text",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
},
{
Name: "sparse",
DataType: schemapb.DataType_SparseFloatVector,
},
},
Functions: []*model.Function{
{
Name: "test",
Type: schemapb.FunctionType_BM25,
InputFieldNames: []string{"text"},
OutputFieldNames: []string{"sparse"},
},
},
State: pb.CollectionState_CollectionDropping,
}
err := kc.DropCollection(ctx, coll, 100)
assert.NoError(t, err)
Expand Down
4 changes: 4 additions & 0 deletions internal/proxy/task_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3116,6 +3116,10 @@ func TestCreateCollectionTaskWithPartitionKey(t *testing.T) {
Key: "max_length",
Value: strconv.Itoa(testMaxVarCharLength),
},
{
Key: "enable_tokenizer",
Value: "true",
},
},
}
floatVecField := &schemapb.FieldSchema{
Expand Down
6 changes: 5 additions & 1 deletion internal/proxy/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,10 @@ func checkFunctionInputField(function *schemapb.FunctionSchema, fields []*schema
return fmt.Errorf("only one VARCHAR input field is allowed for a BM25 Function, got %d field with type %s",
len(fields), fields[0].DataType.String())
}
h := typeutil.CreateFieldSchemaHelper(fields[0])
if !h.EnableTokenizer() {
return fmt.Errorf("BM25 input field must set enable_tokenizer to true")

Check warning on line 702 in internal/proxy/util.go

View check run for this annotation

Codecov / codecov/patch

internal/proxy/util.go#L702

Added line #L702 was not covered by tests
}

default:
return fmt.Errorf("check input field with unknown function type")
Expand Down Expand Up @@ -739,7 +743,7 @@ func checkFunctionParams(function *schemapb.FunctionSchema) error {
return fmt.Errorf("bm25_avgdl must large than zero but now %f", avgdl)
}

case "analyzer_params":
case "tokenizer_params":

Check warning on line 746 in internal/proxy/util.go

View check run for this annotation

Codecov / codecov/patch

internal/proxy/util.go#L746

Added line #L746 was not covered by tests
// TODO ADD tokenizer check
default:
return fmt.Errorf("invalid function params, key: %s, value:%s", kv.GetKey(), kv.GetValue())
Expand Down
4 changes: 4 additions & 0 deletions internal/util/ctokenizer/text_schema_validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ func ValidateTextSchema(fieldSchema *schemapb.FieldSchema) error {
return nil
}

if !h.EnableTokenizer() {
return fmt.Errorf("field %s is set to enable match but not enable tokenizer", fieldSchema.Name)
}

bs, err := proto.Marshal(fieldSchema)
if err != nil {
return fmt.Errorf("failed to marshal field schema: %w", err)
Expand Down
Loading

0 comments on commit 7a55818

Please # to comment.