feat: add enable_tokenizer params to VarChar field #36480

Merged
merged 2 commits on Oct 10, 2024
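
For context while reading the diff: this PR lets a VarChar field opt into tokenization through an `enable_tokenizer` type param; text match (`enable_match`) now additionally requires it, and tokenizer behavior is configured via `tokenizer_params`. A minimal sketch of a field schema exercising the new parameters, using the protobuf types that appear in the Go tests below (the field name and ID are illustrative, and the surrounding collection setup is assumed):

```go
package example

import (
	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
)

// Sketch of a VarChar field that enables tokenization and text match.
// The parameter keys and values mirror the tests in this PR.
func tokenizedVarCharField() *schemapb.FieldSchema {
	return &schemapb.FieldSchema{
		Name:     "text",
		FieldID:  101,
		DataType: schemapb.DataType_VarChar,
		TypeParams: []*commonpb.KeyValuePair{
			{Key: "max_length", Value: "65536"},
			{Key: "enable_tokenizer", Value: "true"},
			{Key: "enable_match", Value: "true"},
			{Key: "tokenizer_params", Value: `{"tokenizer": "jieba"}`},
		},
	}
}
```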
40 changes: 27 additions & 13 deletions internal/core/src/common/FieldMeta.cpp
@@ -20,7 +20,7 @@
namespace milvus {
TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("analyzer_params");
auto iter = params.find("tokenizer_params");

Check warning on line 23 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L23

Added line #L23 was not covered by tests
if (iter == params.end()) {
return {};
}
@@ -47,9 +47,20 @@
return string_info_->enable_match;
}

bool
FieldMeta::enable_tokenizer() const {
if (!IsStringDataType(type_)) {
return false;
}
if (!string_info_.has_value()) {
return false;
}
return string_info_->enable_tokenizer;

Check warning on line 58 in internal/core/src/common/FieldMeta.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/FieldMeta.cpp#L58

Added line #L58 was not covered by tests
}

TokenizerParams
FieldMeta::get_tokenizer_params() const {
Assert(enable_match());
Assert(enable_tokenizer());
auto params = string_info_->params;
return ParseTokenizerParams(params);
}
@@ -91,29 +102,32 @@
auto type_map = RepeatedKeyValToMap(schema_proto.type_params());
AssertInfo(type_map.count(MAX_LENGTH), "max_length not found");
auto max_len = boost::lexical_cast<int64_t>(type_map.at(MAX_LENGTH));
bool enable_match = false;
if (type_map.count("enable_match")) {
auto param_str = type_map.at("enable_match");

auto get_bool_value = [&](const std::string& key) -> bool {
if (!type_map.count(key)) {
return false;
}
auto param_str = type_map.at(key);
std::transform(param_str.begin(),
param_str.end(),
param_str.begin(),
::tolower);
std::istringstream ss(param_str);
bool b;
ss >> std::boolalpha >> b;
return b;
};

auto bool_cast = [](const std::string& arg) -> bool {
std::istringstream ss(arg);
bool b;
ss >> std::boolalpha >> b;
return b;
};
bool enable_tokenizer = get_bool_value("enable_tokenizer");
bool enable_match = get_bool_value("enable_match");

enable_match = bool_cast(param_str);
}
return FieldMeta{name,
field_id,
data_type,
max_len,
nullable,
enable_match,
enable_tokenizer,
type_map};
}
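
The new get_bool_value lambda above treats a missing key as false and lowercases the value before parsing it with std::boolalpha, so "True" and "TRUE" are also accepted. A rough Go equivalent for readers more at home on the Go side of the codebase (a hypothetical helper, not part of this PR):

```go
package example

import "strings"

// getBoolParam mirrors the C++ get_bool_value lambda: a missing key
// defaults to false, and values are compared case-insensitively, so
// anything other than some casing of "true" parses as false.
func getBoolParam(typeParams map[string]string, key string) bool {
	v, ok := typeParams[key]
	if !ok {
		return false
	}
	return strings.ToLower(v) == "true"
}
```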

8 changes: 7 additions & 1 deletion internal/core/src/common/FieldMeta.h
@@ -64,11 +64,13 @@ class FieldMeta {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
std::map<std::string, std::string>& params)
: name_(name),
id_(id),
type_(type),
string_info_(StringInfo{max_length, enable_match, std::move(params)}),
string_info_(StringInfo{
max_length, enable_match, enable_tokenizer, std::move(params)}),
nullable_(nullable) {
Assert(IsStringDataType(type_));
}
@@ -122,6 +124,9 @@ class FieldMeta {
bool
enable_match() const;

bool
enable_tokenizer() const;

TokenizerParams
get_tokenizer_params() const;

@@ -198,6 +203,7 @@ class FieldMeta {
struct StringInfo {
int64_t max_length;
bool enable_match;
bool enable_tokenizer;
std::map<std::string, std::string> params;
};
FieldName name_;
11 changes: 9 additions & 2 deletions internal/core/src/common/Schema.h
@@ -121,9 +121,16 @@ class Schema {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
std::map<std::string, std::string>& params) {
auto field_meta = FieldMeta(
name, id, data_type, max_length, nullable, enable_match, params);
auto field_meta = FieldMeta(name,
id,
data_type,
max_length,
nullable,
enable_match,
enable_tokenizer,
params);
this->AddField(std::move(field_meta));
}

@@ -33,7 +33,7 @@ pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextA
}
},
None => {
info!("no tokenizer is specific, use default tokenizer");
info!("no tokenizer is specified, using default tokenizer");
Some(default_tokenizer())
}
}
2 changes: 1 addition & 1 deletion internal/core/unittest/test_c_tokenizer.cpp
@@ -31,7 +31,7 @@ TEST(ValidateTextSchema, JieBa) {
milvus::proto::schema::FieldSchema schema;
{
auto kv = schema.add_type_params();
kv->set_key("analyzer_params");
kv->set_key("tokenizer_params");
kv->set_value(R"({"tokenizer": "jieba"})");
}

11 changes: 7 additions & 4 deletions internal/core/unittest/test_text_match.cpp
@@ -40,6 +40,7 @@ GenTestSchema(std::map<std::string, std::string> params = {}) {
65536,
false,
true,
true,
params);
schema->AddField(std::move(f));
}
@@ -76,14 +77,14 @@ TEST(ParseJson, Naive) {
}
}

TEST(ParseTokenizerParams, NoAnalyzerParams) {
TEST(ParseTokenizerParams, NoTokenizerParams) {
TypeParams params{{"k", "v"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(0, p.size());
}

TEST(ParseTokenizerParams, Default) {
TypeParams params{{"analyzer_params", R"({"tokenizer": "default"})"}};
TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(1, p.size());
auto iter = p.find("tokenizer");
@@ -251,7 +252,8 @@ TEST(TextMatch, SealedNaive) {
TEST(TextMatch, GrowingJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
@@ -327,7 +329,8 @@ TEST(TextMatch, GrowingJieBa) {
TEST(TextMatch, SealedJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
3 changes: 3 additions & 0 deletions internal/datacoord/job_manager_test.go
@@ -56,6 +56,9 @@ func (s *jobManagerSuite) TestJobManager_triggerStatsTaskLoop() {
{
Key: "enable_match", Value: "true",
},
{
Key: "enable_tokenizer", Value: "true",
},
},
},
},
@@ -21,6 +21,7 @@ import (

"github.com/stretchr/testify/assert"

"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/util/flowgraph"
@@ -44,6 +45,12 @@ func TestEmbeddingNode_BM25_Operator(t *testing.T) {
Name: "text",
FieldID: 101,
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
}, {
Name: "sparse",
FieldID: 102,
56 changes: 50 additions & 6 deletions internal/metastore/kv/rootcoord/kv_catalog_test.go
@@ -1243,9 +1243,31 @@ func TestCatalog_CreateCollection(t *testing.T) {
Partitions: []*model.Partition{
{PartitionName: "test"},
},
Fields: []*model.Field{{Name: "text", DataType: schemapb.DataType_VarChar}, {Name: "sparse", DataType: schemapb.DataType_SparseFloatVector}},
Functions: []*model.Function{{Name: "test", Type: schemapb.FunctionType_BM25, InputFieldNames: []string{"text"}, OutputFieldNames: []string{"sparse"}}},
State: pb.CollectionState_CollectionCreating,
Fields: []*model.Field{
{
Name: "text",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
},
{
Name: "sparse",
DataType: schemapb.DataType_SparseFloatVector,
},
},
Functions: []*model.Function{
{
Name: "test",
Type: schemapb.FunctionType_BM25,
InputFieldNames: []string{"text"},
OutputFieldNames: []string{"sparse"},
},
},
State: pb.CollectionState_CollectionCreating,
}
err := kc.CreateCollection(ctx, coll, 100)
assert.NoError(t, err)
@@ -1325,9 +1347,31 @@ func TestCatalog_DropCollection(t *testing.T) {
Partitions: []*model.Partition{
{PartitionName: "test"},
},
Fields: []*model.Field{{Name: "text", DataType: schemapb.DataType_VarChar}, {Name: "sparse", DataType: schemapb.DataType_SparseFloatVector}},
Functions: []*model.Function{{Name: "test", Type: schemapb.FunctionType_BM25, InputFieldNames: []string{"text"}, OutputFieldNames: []string{"sparse"}}},
State: pb.CollectionState_CollectionDropping,
Fields: []*model.Field{
{
Name: "text",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Value: "true",
},
},
},
{
Name: "sparse",
DataType: schemapb.DataType_SparseFloatVector,
},
},
Functions: []*model.Function{
{
Name: "test",
Type: schemapb.FunctionType_BM25,
InputFieldNames: []string{"text"},
OutputFieldNames: []string{"sparse"},
},
},
State: pb.CollectionState_CollectionDropping,
}
err := kc.DropCollection(ctx, coll, 100)
assert.NoError(t, err)
4 changes: 4 additions & 0 deletions internal/proxy/task_test.go
@@ -3116,6 +3116,10 @@ func TestCreateCollectionTaskWithPartitionKey(t *testing.T) {
Key: "max_length",
Value: strconv.Itoa(testMaxVarCharLength),
},
{
Key: "enable_tokenizer",
Value: "true",
},
},
}
floatVecField := &schemapb.FieldSchema{
6 changes: 5 additions & 1 deletion internal/proxy/util.go
@@ -697,6 +697,10 @@
return fmt.Errorf("only one VARCHAR input field is allowed for a BM25 Function, got %d field with type %s",
len(fields), fields[0].DataType.String())
}
h := typeutil.CreateFieldSchemaHelper(fields[0])
if !h.EnableTokenizer() {
return fmt.Errorf("BM25 input field must set enable_tokenizer to true")

Check warning on line 702 in internal/proxy/util.go

View check run for this annotation

Codecov / codecov/patch

internal/proxy/util.go#L702

Added line #L702 was not covered by tests
}

default:
return fmt.Errorf("check input field with unknown function type")
@@ -739,7 +743,7 @@
return fmt.Errorf("bm25_avgdl must large than zero but now %f", avgdl)
}

case "analyzer_params":
case "tokenizer_params":

Check warning on line 746 in internal/proxy/util.go

View check run for this annotation

Codecov / codecov/patch

internal/proxy/util.go#L746

Added line #L746 was not covered by tests
// TODO ADD tokenizer check
default:
return fmt.Errorf("invalid function params, key: %s, value:%s", kv.GetKey(), kv.GetValue())
4 changes: 4 additions & 0 deletions internal/util/ctokenizer/text_schema_validator.go
@@ -23,6 +23,10 @@ func ValidateTextSchema(fieldSchema *schemapb.FieldSchema) error {
return nil
}

if !h.EnableTokenizer() {
return fmt.Errorf("field %s is set to enable match but not enable tokenizer", fieldSchema.Name)
}

bs, err := proto.Marshal(fieldSchema)
if err != nil {
return fmt.Errorf("failed to marshal field schema: %w", err)