From d4dfb9c4a67e24afa0465735a84a381d0833211d Mon Sep 17 00:00:00 2001 From: Protobuf Team Bot Date: Wed, 10 Jan 2024 13:33:51 -0800 Subject: [PATCH] Add kUpb_DecodeOption_AlwaysValidateUtf8 decode option, to force UTF-8 validation of proto2 strings. PiperOrigin-RevId: 597341799 --- upb/message/BUILD | 21 ++++++- upb/message/utf8_test.cc | 96 ++++++++++++++++++++++++++++++ upb/message/utf8_test.proto | 2 +- upb/message/utf8_test_proto2.proto | 26 ++++++++ upb/wire/decode.c | 11 ++++ upb/wire/decode.h | 10 ++++ 6 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 upb/message/utf8_test_proto2.proto diff --git a/upb/message/BUILD b/upb/message/BUILD index 3212444200a7b..04647a3ebea0c 100644 --- a/upb/message/BUILD +++ b/upb/message/BUILD @@ -304,7 +304,12 @@ proto_library( name = "utf8_test_proto", testonly = 1, srcs = ["utf8_test.proto"], - deps = ["//src/google/protobuf:test_messages_proto3_proto"], +) + +proto_library( + name = "utf8_test_proto2_proto", + testonly = 1, + srcs = ["utf8_test_proto2.proto"], ) upb_minitable_proto_library( @@ -313,16 +318,30 @@ upb_minitable_proto_library( deps = [":utf8_test_proto"], ) +upb_minitable_proto_library( + name = "utf8_test_proto2_upb_minitable_proto", + testonly = 1, + deps = [":utf8_test_proto2_proto"], +) + upb_c_proto_library( name = "utf8_test_upb_proto", testonly = 1, deps = [":utf8_test_proto"], ) +upb_c_proto_library( + name = "utf8_test_proto2_upb_proto", + testonly = 1, + deps = [":utf8_test_proto2_proto"], +) + cc_test( name = "utf8_test", srcs = ["utf8_test.cc"], deps = [ + ":utf8_test_proto2_upb_minitable_proto", + ":utf8_test_proto2_upb_proto", ":utf8_test_upb_minitable_proto", ":utf8_test_upb_proto", "//upb:base", diff --git a/upb/message/utf8_test.cc b/upb/message/utf8_test.cc index b900a954b3a92..cd7dd1737135c 100644 --- a/upb/message/utf8_test.cc +++ b/upb/message/utf8_test.cc @@ -14,6 +14,8 @@ #include "upb/mem/arena.hpp" #include "upb/message/utf8_test.upb.h" #include "upb/message/utf8_test.upb_minitable.h" +#include "upb/message/utf8_test_proto2.upb.h" +#include "upb/message/utf8_test_proto2.upb_minitable.h" #include "upb/wire/decode.h" namespace { @@ -72,6 +74,100 @@ TEST(Utf8Test, RepeatedProto3FieldValidates) { ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status); } +TEST(Utf8Test, Proto2BytesValidates) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8Proto2Bytes* msg = + upb_test_TestUtf8Proto2Bytes_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2Bytes_msg_init, nullptr, 0, + arena.ptr()); + + // Parse succeeds, because proto2 bytes fields don't validate UTF-8. + ASSERT_EQ(kUpb_DecodeStatus_Ok, status); +} + +TEST(Utf8Test, Proto2RepeatedBytesValidates) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8RepeatedProto2Bytes* msg = + upb_test_TestUtf8RepeatedProto2Bytes_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8RepeatedProto2Bytes_msg_init, nullptr, + 0, arena.ptr()); + + // Parse succeeds, because proto2 bytes fields don't validate UTF-8. + ASSERT_EQ(kUpb_DecodeStatus_Ok, status); +} + +TEST(Utf8Test, Proto2StringValidates) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8Proto2String* msg = + upb_test_TestUtf8Proto2String_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0, + arena.ptr()); + + // Parse succeeds, because proto2 string fields don't validate UTF-8. + ASSERT_EQ(kUpb_DecodeStatus_Ok, status); +} + +TEST(Utf8Test, Proto2FieldFailsValidation) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8Proto2String* msg = + upb_test_TestUtf8Proto2String_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0, + arena.ptr()); + + // Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to + // force validation of proto2 string fields. + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2String_msg_init, nullptr, + kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr()); + ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status); +} + +TEST(Utf8Test, Proto2RepeatedFieldFailsValidation) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8RepeatedProto2String* msg = + upb_test_TestUtf8RepeatedProto2String_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8RepeatedProto2String_msg_init, + nullptr, 0, arena.ptr()); + + // Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to + // force validation of proto2 string fields. + status = + upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8RepeatedProto2String_msg_init, nullptr, + kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr()); + ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status); +} + // begin:google_only // TEST(Utf8Test, Proto3MixedFieldValidates) { // upb::Arena arena; diff --git a/upb/message/utf8_test.proto b/upb/message/utf8_test.proto index 4070ac7d19684..515e3bf7d3af1 100644 --- a/upb/message/utf8_test.proto +++ b/upb/message/utf8_test.proto @@ -35,7 +35,7 @@ message TestUtf8Proto3StringEnforceUtf8False { } message TestUtf8RepeatedProto3StringEnforceUtf8False { - optional string data = 1; + repeated string data = 1; } message TestUtf8Proto3StringEnforceUtf8FalseMixed { diff --git a/upb/message/utf8_test_proto2.proto b/upb/message/utf8_test_proto2.proto new file mode 100644 index 0000000000000..6adc1055cd743 --- /dev/null +++ b/upb/message/utf8_test_proto2.proto @@ -0,0 +1,26 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2023 Google LLC. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +syntax = "proto2"; + +package upb_test; + +message TestUtf8Proto2Bytes { + optional bytes data = 1; +} + +message TestUtf8RepeatedProto2Bytes { + optional bytes data = 1; +} + +message TestUtf8Proto2String { + optional string data = 1; +} + +message TestUtf8RepeatedProto2String { + repeated string data = 1; +} diff --git a/upb/wire/decode.c b/upb/wire/decode.c index ecb01277e0ae2..0d21c37d15e15 100644 --- a/upb/wire/decode.c +++ b/upb/wire/decode.c @@ -1034,6 +1034,15 @@ static void _upb_Decoder_CheckUnlinked(upb_Decoder* d, const upb_MiniTable* mt, *op = kUpb_DecodeOp_UnknownField; } +UPB_FORCEINLINE +static void _upb_Decoder_MaybeVerifyUtf8(upb_Decoder* d, + const upb_MiniTableField* field, + int* op) { + if ((field->UPB_ONLYBITS(mode) & kUpb_LabelFlags_IsAlternate) && + UPB_UNLIKELY(d->options & kUpb_DecodeOption_AlwaysValidateUtf8)) + *op = kUpb_DecodeOp_String; +} + static int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt, const upb_MiniTableField* field) { enum { kRepeatedBase = 19 }; @@ -1090,6 +1099,8 @@ static int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt, if (op == kUpb_DecodeOp_SubMessage) { _upb_Decoder_CheckUnlinked(d, mt, field, &op); + } else if (op == kUpb_DecodeOp_Bytes) { + _upb_Decoder_MaybeVerifyUtf8(d, field, &op); } return op; diff --git a/upb/wire/decode.h b/upb/wire/decode.h index 7f2eafddca8f7..68bf63f356696 100644 --- a/upb/wire/decode.h +++ b/upb/wire/decode.h @@ -83,6 +83,16 @@ enum { * be created by the parser or the message-copying logic in message/copy.h. */ kUpb_DecodeOption_ExperimentalAllowUnlinked = 4, + + /* EXPERIMENTAL: + * + * If set, decoding will enforce UTF-8 validation for string fields, even for + * proto2 or fields with `features.utf8_validation = NONE`. Normally, only + * proto3 string fields will be validated for UTF-8. Decoding will return + * kUpb_DecodeStatus_BadUtf8 for non-UTF-8 strings, which is the same behavior + * as non-UTF-8 proto3 string fields. + */ + kUpb_DecodeOption_AlwaysValidateUtf8 = 8, }; UPB_INLINE uint32_t upb_DecodeOptions_MaxDepth(uint16_t depth) {