Skip to content

Commit d4dfb9c

Browse files
Add kUpb_DecodeOption_AlwaysValidateUtf8 decode option, to force UTF-8 validation of proto2 strings.
PiperOrigin-RevId: 597341799
1 parent 686cfc6 commit d4dfb9c

File tree

6 files changed

+164
-2
lines changed

6 files changed

+164
-2
lines changed

upb/message/BUILD

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,12 @@ proto_library(
304304
name = "utf8_test_proto",
305305
testonly = 1,
306306
srcs = ["utf8_test.proto"],
307-
deps = ["//src/google/protobuf:test_messages_proto3_proto"],
307+
)
308+
309+
# Test-only proto_library for the proto2 messages in utf8_test_proto2.proto.
proto_library(
310+
name = "utf8_test_proto2_proto",
311+
testonly = 1,
312+
srcs = ["utf8_test_proto2.proto"],
308313
)
309314

310315
upb_minitable_proto_library(
@@ -313,16 +318,30 @@ upb_minitable_proto_library(
313318
deps = [":utf8_test_proto"],
314319
)
315320

321+
# Test-only upb minitable target built from :utf8_test_proto2_proto.
upb_minitable_proto_library(
322+
name = "utf8_test_proto2_upb_minitable_proto",
323+
testonly = 1,
324+
deps = [":utf8_test_proto2_proto"],
325+
)
326+
316327
upb_c_proto_library(
317328
name = "utf8_test_upb_proto",
318329
testonly = 1,
319330
deps = [":utf8_test_proto"],
320331
)
321332

333+
# Test-only upb_c_proto_library target built from :utf8_test_proto2_proto.
upb_c_proto_library(
334+
name = "utf8_test_proto2_upb_proto",
335+
testonly = 1,
336+
deps = [":utf8_test_proto2_proto"],
337+
)
338+
322339
cc_test(
323340
name = "utf8_test",
324341
srcs = ["utf8_test.cc"],
325342
deps = [
343+
":utf8_test_proto2_upb_minitable_proto",
344+
":utf8_test_proto2_upb_proto",
326345
":utf8_test_upb_minitable_proto",
327346
":utf8_test_upb_proto",
328347
"//upb:base",

upb/message/utf8_test.cc

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
#include "upb/mem/arena.hpp"
1515
#include "upb/message/utf8_test.upb.h"
1616
#include "upb/message/utf8_test.upb_minitable.h"
17+
#include "upb/message/utf8_test_proto2.upb.h"
18+
#include "upb/message/utf8_test_proto2.upb_minitable.h"
1719
#include "upb/wire/decode.h"
1820

1921
namespace {
@@ -72,6 +74,100 @@ TEST(Utf8Test, RepeatedProto3FieldValidates) {
7274
ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status);
7375
}
7476

77+
// NOTE: despite the "Validates" suffix, this test checks that decoding the
// payload from GetBadUtf8Payload() into a proto2 `bytes` field with default
// options (0) returns kUpb_DecodeStatus_Ok, i.e. no UTF-8 check is applied.
TEST(Utf8Test, Proto2BytesValidates) {
78+
upb::Arena arena;
79+
size_t size;
80+
char* data = GetBadUtf8Payload(arena.ptr(), &size);
81+
82+
upb_test_TestUtf8Proto2Bytes* msg =
83+
upb_test_TestUtf8Proto2Bytes_new(arena.ptr());
84+
85+
upb_DecodeStatus status;
86+
status = upb_Decode(data, size, UPB_UPCAST(msg),
87+
&upb_0test__TestUtf8Proto2Bytes_msg_init, nullptr, 0,
88+
arena.ptr());
89+
90+
// Parse succeeds, because proto2 bytes fields don't validate UTF-8.
91+
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
92+
}
93+
94+
// NOTE: despite the "Validates" suffix, this test checks that decoding the
// payload from GetBadUtf8Payload() into TestUtf8RepeatedProto2Bytes with
// default options (0) returns kUpb_DecodeStatus_Ok, i.e. no UTF-8 check is
// applied to its bytes field.
TEST(Utf8Test, Proto2RepeatedBytesValidates) {
95+
upb::Arena arena;
96+
size_t size;
97+
char* data = GetBadUtf8Payload(arena.ptr(), &size);
98+
99+
upb_test_TestUtf8RepeatedProto2Bytes* msg =
100+
upb_test_TestUtf8RepeatedProto2Bytes_new(arena.ptr());
101+
102+
upb_DecodeStatus status;
103+
status = upb_Decode(data, size, UPB_UPCAST(msg),
104+
&upb_0test__TestUtf8RepeatedProto2Bytes_msg_init, nullptr,
105+
0, arena.ptr());
106+
107+
// Parse succeeds, because proto2 bytes fields don't validate UTF-8.
108+
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
109+
}
110+
111+
// NOTE: despite the "Validates" suffix, this test checks that decoding the
// payload from GetBadUtf8Payload() into a proto2 `string` field with default
// options (0) returns kUpb_DecodeStatus_Ok, i.e. proto2 strings are not
// UTF-8 validated unless a decode option requests it.
TEST(Utf8Test, Proto2StringValidates) {
112+
upb::Arena arena;
113+
size_t size;
114+
char* data = GetBadUtf8Payload(arena.ptr(), &size);
115+
116+
upb_test_TestUtf8Proto2String* msg =
117+
upb_test_TestUtf8Proto2String_new(arena.ptr());
118+
119+
upb_DecodeStatus status;
120+
status = upb_Decode(data, size, UPB_UPCAST(msg),
121+
&upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0,
122+
arena.ptr());
123+
124+
// Parse succeeds, because proto2 string fields don't validate UTF-8.
125+
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
126+
}
127+
128+
// Checks that kUpb_DecodeOption_AlwaysValidateUtf8 forces UTF-8 validation of
// a proto2 string field: the same payload decodes cleanly with default options
// and is rejected with kUpb_DecodeStatus_BadUtf8 once the option is passed.
TEST(Utf8Test, Proto2FieldFailsValidation) {
129+
upb::Arena arena;
130+
size_t size;
131+
char* data = GetBadUtf8Payload(arena.ptr(), &size);
132+
133+
upb_test_TestUtf8Proto2String* msg =
134+
upb_test_TestUtf8Proto2String_new(arena.ptr());
135+
136+
upb_DecodeStatus status;
137+
status = upb_Decode(data, size, UPB_UPCAST(msg),
138+
&upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0,
139+
arena.ptr());
// Baseline: with default options proto2 string fields are not validated, so
// the parse succeeds. Checking it here keeps the result from being silently
// overwritten by the second decode below.
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
140+
141+
// Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to
142+
// force validation of proto2 string fields.
143+
status = upb_Decode(data, size, UPB_UPCAST(msg),
144+
&upb_0test__TestUtf8Proto2String_msg_init, nullptr,
145+
kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr());
146+
ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status);
147+
}
148+
149+
// Checks that kUpb_DecodeOption_AlwaysValidateUtf8 forces UTF-8 validation of
// a repeated proto2 string field: the same payload decodes cleanly with
// default options and is rejected with kUpb_DecodeStatus_BadUtf8 once the
// option is passed.
TEST(Utf8Test, Proto2RepeatedFieldFailsValidation) {
150+
upb::Arena arena;
151+
size_t size;
152+
char* data = GetBadUtf8Payload(arena.ptr(), &size);
153+
154+
upb_test_TestUtf8RepeatedProto2String* msg =
155+
upb_test_TestUtf8RepeatedProto2String_new(arena.ptr());
156+
157+
upb_DecodeStatus status;
158+
status = upb_Decode(data, size, UPB_UPCAST(msg),
159+
&upb_0test__TestUtf8RepeatedProto2String_msg_init,
160+
nullptr, 0, arena.ptr());
// Baseline: with default options proto2 string fields are not validated, so
// the parse succeeds. Checking it here keeps the result from being silently
// overwritten by the second decode below.
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
161+
162+
// Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to
163+
// force validation of proto2 string fields.
164+
status =
165+
upb_Decode(data, size, UPB_UPCAST(msg),
166+
&upb_0test__TestUtf8RepeatedProto2String_msg_init, nullptr,
167+
kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr());
168+
ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status);
169+
}
170+
75171
// begin:google_only
76172
// TEST(Utf8Test, Proto3MixedFieldValidates) {
77173
// upb::Arena arena;

upb/message/utf8_test.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ message TestUtf8Proto3StringEnforceUtf8False {
3535
}
3636

3737
message TestUtf8RepeatedProto3StringEnforceUtf8False {
38-
optional string data = 1;
38+
repeated string data = 1;
3939
}
4040

4141
message TestUtf8Proto3StringEnforceUtf8FalseMixed {

upb/message/utf8_test_proto2.proto

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Protocol Buffers - Google's data interchange format
2+
// Copyright 2023 Google LLC. All rights reserved.
3+
//
4+
// Use of this source code is governed by a BSD-style
5+
// license that can be found in the LICENSE file or at
6+
// https://developers.google.com/open-source/licenses/bsd
7+
8+
syntax = "proto2";
9+
10+
package upb_test;
11+
12+
// Proto2 message with a singular `bytes` field at field number 1.
message TestUtf8Proto2Bytes {
13+
optional bytes data = 1;
14+
}
15+
16+
// Proto2 message with a repeated `bytes` field at field number 1. The field
// must be `repeated` (not `optional`) so that this message actually covers the
// repeated-field decode path, matching TestUtf8RepeatedProto2String.
message TestUtf8RepeatedProto2Bytes {
17+
repeated bytes data = 1;
18+
}
19+
20+
// Proto2 message with a singular `string` field at field number 1.
message TestUtf8Proto2String {
21+
optional string data = 1;
22+
}
23+
24+
// Proto2 message with a repeated `string` field at field number 1.
message TestUtf8RepeatedProto2String {
25+
repeated string data = 1;
26+
}

upb/wire/decode.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,15 @@ static void _upb_Decoder_CheckUnlinked(upb_Decoder* d, const upb_MiniTable* mt,
10341034
*op = kUpb_DecodeOp_UnknownField;
10351035
}
10361036

1037+
// Rewrites `*op` to kUpb_DecodeOp_String when `field` carries the
// kUpb_LabelFlags_IsAlternate mode flag and the decoder options include
// kUpb_DecodeOption_AlwaysValidateUtf8; otherwise `*op` is left untouched.
// The visible caller (_upb_Decoder_GetDelimitedOp) only invokes this when the
// selected op is kUpb_DecodeOp_Bytes.
// NOTE(review): IsAlternate presumably marks a string field that would
// otherwise be decoded as bytes without a UTF-8 check -- confirm.
UPB_FORCEINLINE
1038+
static void _upb_Decoder_MaybeVerifyUtf8(upb_Decoder* d,
1039+
const upb_MiniTableField* field,
1040+
int* op) {
1041+
if ((field->UPB_ONLYBITS(mode) & kUpb_LabelFlags_IsAlternate) &&
1042+
UPB_UNLIKELY(d->options & kUpb_DecodeOption_AlwaysValidateUtf8))
1043+
*op = kUpb_DecodeOp_String;
1044+
}
1045+
10371046
static int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt,
10381047
const upb_MiniTableField* field) {
10391048
enum { kRepeatedBase = 19 };
@@ -1090,6 +1099,8 @@ static int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt,
10901099

10911100
if (op == kUpb_DecodeOp_SubMessage) {
10921101
_upb_Decoder_CheckUnlinked(d, mt, field, &op);
1102+
} else if (op == kUpb_DecodeOp_Bytes) {
1103+
_upb_Decoder_MaybeVerifyUtf8(d, field, &op);
10931104
}
10941105

10951106
return op;

upb/wire/decode.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,16 @@ enum {
8383
* be created by the parser or the message-copying logic in message/copy.h.
8484
*/
8585
kUpb_DecodeOption_ExperimentalAllowUnlinked = 4,
86+
87+
/* EXPERIMENTAL:
88+
*
89+
* If set, decoding will enforce UTF-8 validation for string fields, even for
90+
* proto2 fields or fields with `features.utf8_validation = NONE`. Normally, only
91+
* proto3 string fields will be validated for UTF-8. Decoding will return
92+
* kUpb_DecodeStatus_BadUtf8 for non-UTF-8 strings, which is the same behavior
93+
* as non-UTF-8 proto3 string fields.
94+
*/
95+
kUpb_DecodeOption_AlwaysValidateUtf8 = 8,
8696
};
8797

8898
UPB_INLINE uint32_t upb_DecodeOptions_MaxDepth(uint16_t depth) {

0 commit comments

Comments
 (0)