Skip to content

Commit 35ba5cf

Browse files
protobuf-github-botJasonLunn
authored andcommitted
Fix UTF-8 Validation of string extensions in C++
Also adds a regression test PiperOrigin-RevId: 862026607
1 parent 08355bd commit 35ba5cf

File tree

8 files changed

+84
-11
lines changed

8 files changed

+84
-11
lines changed

conformance/failure_list_cpp.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ Recommended.*.JsonInput.FieldMaskInvalidCharacter
3535
Required.*.JsonInput.SingleValueForRepeatedFieldInt32 # Should have failed to parse, but didn't.
3636
Required.*.JsonInput.SingleValueForRepeatedFieldMessage # Should have failed to parse, but didn't.
3737
Required.*.ProtobufInput.BadTag_FieldNumberSlightlyTooHigh # Should have failed to parse, but didn't.
38-
Recommended.Editions.ProtobufInput.RejectInvalidUtf8.String.Extension # Should have failed to parse, but didn't.
3938
# TODO: Uncomment once conformance tests can express failures that are not expected to be fixed.
4039
# Recommended.Editions_Proto2.ProtobufInput.RejectInvalidUtf8.String.MapKey # Should have failed to parse, but didn't.
4140
# Recommended.Editions_Proto2.ProtobufInput.RejectInvalidUtf8.String.MapValue # Should have failed to parse, but didn't.

conformance/failure_list_rust_cc.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
Required.*.ProtobufInput.BadTag_FieldNumberSlightlyTooHigh # Should have failed to parse, but didn't.
2-
Recommended.Editions.ProtobufInput.RejectInvalidUtf8.String.Extension # Should have failed to parse, but didn't.
32
# TODO: Uncomment once conformance tests can express failures that are not expected to be fixed.
43
# Recommended.Editions_Proto2.ProtobufInput.RejectInvalidUtf8.String.MapKey # Should have failed to parse, but didn't.
54
# Recommended.Editions_Proto2.ProtobufInput.RejectInvalidUtf8.String.MapValue. # Should have failed to parse, but didn't.

src/google/protobuf/BUILD.bazel

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,6 +1090,7 @@ filegroup(
10901090
"unittest_redaction.proto",
10911091
"unittest_retention.proto",
10921092
"unittest_string_type.proto",
1093+
"unittest_utf8_string_extensions.proto",
10931094
"unittest_well_known_types.proto",
10941095
],
10951096
visibility = ["//:__subpackages__"],
@@ -1182,6 +1183,7 @@ protobuf_test_proto_library(
11821183
"unittest_retention.proto",
11831184
"unittest_string_type.proto",
11841185
"unittest_string_view.proto",
1186+
"unittest_utf8_string_extensions.proto",
11851187
"unittest_well_known_types.proto",
11861188
],
11871189
option_deps = [
@@ -1858,13 +1860,15 @@ cc_test(
18581860
":port",
18591861
":protobuf",
18601862
":protobuf_lite",
1863+
":test_textproto",
18611864
":test_util",
18621865
":test_util2",
18631866
"//src/google/protobuf/io",
18641867
"//src/google/protobuf/stubs",
18651868
"//src/google/protobuf/testing",
18661869
"//src/google/protobuf/testing:file",
18671870
"//src/google/protobuf/util:differencer",
1871+
"//third_party/utf8_range:utf8_validity",
18681872
"@abseil-cpp//absl/algorithm:container",
18691873
"@abseil-cpp//absl/log:absl_check",
18701874
"@abseil-cpp//absl/strings",

src/google/protobuf/compiler/cpp/extension.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,9 @@ void ExtensionGenerator::GenerateRegistration(io::Printer* p,
237237
DescriptorTableName(descriptor_->containing_type()->file(), options_)},
238238
{"extendee_index", find_index(descriptor_->containing_type())},
239239
{"preregister", priority == kInitPriority101},
240+
{"is_utf8", descriptor_->requires_utf8_validation()
241+
? "/*is_utf8=*/true"
242+
: "/*is_utf8=*/false"},
240243
}});
241244
switch (descriptor_->cpp_type()) {
242245
case FieldDescriptor::CPPTYPE_ENUM:
@@ -331,7 +334,7 @@ void ExtensionGenerator::GenerateRegistration(io::Printer* p,
331334
R"cc(
332335
::_pbi::ExtensionSet::RegisterExtension(
333336
&$extendee$::default_instance(), $number$, $field_type$,
334-
$repeated$, $packed$),
337+
$repeated$, $packed$, $is_utf8$),
335338
)cc");
336339
}
337340

src/google/protobuf/extension_set.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,11 @@ bool GeneratedExtensionFinder::Find(int number, ExtensionInfo* output) {
135135

136136
void ExtensionSet::RegisterExtension(const MessageLite* extendee, int number,
137137
FieldType type, bool is_repeated,
138-
bool is_packed) {
138+
bool is_packed, bool is_utf8) {
139139
ABSL_CHECK_NE(type, WireFormatLite::TYPE_ENUM);
140140
ABSL_CHECK_NE(type, WireFormatLite::TYPE_MESSAGE);
141141
ABSL_CHECK_NE(type, WireFormatLite::TYPE_GROUP);
142-
ExtensionInfo info(extendee, number, type, is_repeated, is_packed);
142+
ExtensionInfo info(extendee, number, type, is_repeated, is_packed, is_utf8);
143143
Register(info);
144144
}
145145

@@ -148,7 +148,8 @@ void ExtensionSet::RegisterEnumExtension(const MessageLite* extendee,
148148
bool is_repeated, bool is_packed,
149149
const uint32_t* validation_data) {
150150
ABSL_CHECK_EQ(type, WireFormatLite::TYPE_ENUM);
151-
ExtensionInfo info(extendee, number, type, is_repeated, is_packed);
151+
ExtensionInfo info(extendee, number, type, is_repeated, is_packed,
152+
/*is_utf8=*/false);
152153
info.enum_validity_check.func = nullptr;
153154
info.enum_validity_check.arg = validation_data;
154155
Register(info);

src/google/protobuf/extension_set.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,17 @@ enum class LazyAnnotation : int8_t {
122122

123123
// Information about a registered extension.
124124
struct ExtensionInfo {
125-
constexpr ExtensionInfo() : enum_validity_check() {}
125+
constexpr ExtensionInfo()
126+
: is_packed(false), is_utf8(false), enum_validity_check() {}
126127
constexpr ExtensionInfo(const MessageLite* extendee, int param_number,
127-
FieldType type_param, bool isrepeated, bool ispacked)
128+
FieldType type_param, bool isrepeated, bool ispacked,
129+
bool is_utf8)
128130
: message(extendee),
129131
number(param_number),
130132
type(type_param),
131133
is_repeated(isrepeated),
132134
is_packed(ispacked),
135+
is_utf8(is_utf8),
133136
enum_validity_check() {}
134137
constexpr ExtensionInfo(const MessageLite* extendee, int param_number,
135138
FieldType type_param, bool isrepeated, bool ispacked,
@@ -140,6 +143,7 @@ struct ExtensionInfo {
140143
type(type_param),
141144
is_repeated(isrepeated),
142145
is_packed(ispacked),
146+
is_utf8(false),
143147
is_lazy(islazy),
144148
enum_validity_check(),
145149
lazy_eager_verify_func(verify_func)
@@ -151,7 +155,8 @@ struct ExtensionInfo {
151155

152156
FieldType type = 0;
153157
bool is_repeated = false;
154-
bool is_packed = false;
158+
bool is_packed : 1;
159+
bool is_utf8 : 1; // validate UTF8 if true
155160
LazyAnnotation is_lazy = LazyAnnotation::kUndefined;
156161

157162
struct EnumValidityCheck {
@@ -275,7 +280,7 @@ class PROTOBUF_EXPORT ExtensionSet {
275280
// methods do.
276281
static void RegisterExtension(const MessageLite* extendee, int number,
277282
FieldType type, bool is_repeated,
278-
bool is_packed);
283+
bool is_packed, bool is_utf8 = false);
279284
static void RegisterEnumExtension(const MessageLite* extendee, int number,
280285
FieldType type, bool is_repeated,
281286
bool is_packed,

src/google/protobuf/extension_set_inl.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212
#include <string>
1313
#include <utility>
1414

15+
#include "absl/base/optimization.h"
1516
#include "google/protobuf/extension_set.h"
1617
#include "google/protobuf/metadata_lite.h"
1718
#include "google/protobuf/parse_context.h"
1819
#include "google/protobuf/wire_format_lite.h"
20+
#include "utf8_validity.h"
1921

2022
namespace google {
2123
namespace protobuf {
@@ -150,7 +152,15 @@ const char* ExtensionSet::ParseFieldWithExtensionInfo(
150152
info.descriptor);
151153
int size = ReadSize(&ptr);
152154
GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
153-
return ctx->ReadString(ptr, size, value);
155+
if (info.is_utf8) {
156+
ptr = ctx->ReadString(ptr, size, value);
157+
if ABSL_PREDICT_FALSE (!utf8_range::IsStructurallyValid(*value)) {
158+
return nullptr;
159+
}
160+
return ptr;
161+
} else {
162+
return ctx->ReadString(ptr, size, value);
163+
}
154164
}
155165

156166
case WireFormatLite::TYPE_GROUP: {

src/google/protobuf/extension_set_unittest.cc

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,12 @@
4444
#include "google/protobuf/unittest_mset.pb.h"
4545
#include "google/protobuf/unittest_mset_wire_format.pb.h"
4646
#include "google/protobuf/unittest_proto3_extensions.pb.h"
47+
#include "google/protobuf/unittest_utf8_string_extensions.pb.h"
4748
#include "google/protobuf/wire_format.h"
4849
#include "google/protobuf/wire_format_lite.h"
50+
#include "utf8_validity.h"
4951

52+
#include "google/protobuf/test_textproto.h"
5053

5154
// Must be included last.
5255
#include "google/protobuf/port_def.inc"
@@ -1975,6 +1978,55 @@ INSTANTIATE_TEST_SUITE_P(
19751978
return name;
19761979
});
19771980

1981+
TEST(ExtensionSet, StringWithInvalidUTF8FailsToParse) {
1982+
// Sanity check that the extension field is marked as requiring UTF-8
1983+
// validation. `requires_utf8_validation` can only be true for string fields
1984+
// where feature.utf8_validation = VERIFY.
1985+
const google::protobuf::DescriptorPool* pool = google::protobuf::DescriptorPool::generated_pool();
1986+
ASSERT_NE(pool, nullptr);
1987+
const google::protobuf::FieldDescriptor* string_ext_fd = pool->FindExtensionByName(
1988+
"proto2_unittest.optional_utf8_string_extension");
1989+
ASSERT_NE(string_ext_fd, nullptr);
1990+
ASSERT_TRUE(string_ext_fd->requires_utf8_validation());
1991+
std::string invalid_utf8 = "\xFF";
1992+
ASSERT_FALSE(utf8_range::IsStructurallyValid(invalid_utf8));
1993+
1994+
proto2_unittest::TestUtf8ValidationOfExtensions test_message;
1995+
// It is reasonable to debate that UTF-8 validation should be checked in the
1996+
// setter, but it is not currently done because the setter doesn't have a way
1997+
// to report errors.
1998+
test_message.SetExtension(proto2_unittest::optional_utf8_string_extension,
1999+
invalid_utf8);
2000+
std::string data;
2001+
ASSERT_TRUE(test_message.SerializeToString(&data));
2002+
proto2_unittest::TestUtf8ValidationOfExtensions parsed_message;
2003+
ASSERT_FALSE(parsed_message.ParseFromString(data));
2004+
}
2005+
2006+
TEST(ExtensionSet, BytesWithInvalidUTF8Succeeds) {
2007+
// Sanity check that the extension field is not marked as requiring UTF-8
2008+
// validation. `requires_utf8_validation` can only be true for string fields.
2009+
const google::protobuf::DescriptorPool* pool = google::protobuf::DescriptorPool::generated_pool();
2010+
ASSERT_NE(pool, nullptr);
2011+
const google::protobuf::FieldDescriptor* string_ext_fd =
2012+
pool->FindExtensionByName("proto2_unittest.optional_bytes_extension");
2013+
ASSERT_NE(string_ext_fd, nullptr);
2014+
ASSERT_FALSE(string_ext_fd->requires_utf8_validation());
2015+
std::string invalid_utf8 = "\xFF";
2016+
ASSERT_FALSE(utf8_range::IsStructurallyValid(invalid_utf8));
2017+
2018+
proto2_unittest::TestAllExtensions test_message;
2019+
test_message.SetExtension(proto2_unittest::optional_bytes_extension,
2020+
invalid_utf8);
2021+
std::string data;
2022+
ASSERT_TRUE(test_message.SerializeToString(&data));
2023+
proto2_unittest::TestAllExtensions parsed_message;
2024+
EXPECT_TRUE(parsed_message.ParseFromString(data));
2025+
EXPECT_THAT(parsed_message, google::protobuf::EqualsProto(R"pb(
2026+
[proto2_unittest.optional_bytes_extension]: "\xFF"
2027+
)pb"));
2028+
}
2029+
19782030
} // namespace
19792031
} // namespace internal
19802032
} // namespace protobuf

0 commit comments

Comments
 (0)