Skip to content

Commit fe9a53c

Browse files
committed
[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)
This is a warning now, but will be an error in the next version. PiperOrigin-RevId: 653236128
1 parent e3e4344 commit fe9a53c

File tree

7 files changed

+219
-21
lines changed

7 files changed

+219
-21
lines changed

ruby/Rakefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ test_protos = %w[
3535
tests/test_import_proto2.proto
3636
tests/test_ruby_package.proto
3737
tests/test_ruby_package_proto2.proto
38+
tests/utf8.proto
3839
]
3940

4041
# These are omitted for now because we don't support proto2.

ruby/ext/google/protobuf_c/convert.c

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,41 @@ static int32_t Convert_ToEnum(VALUE value, const char* name,
104104
rb_raise(rb_eRangeError, "Unknown symbol value for enum field '%s'.", name);
105105
}
106106

107+
VALUE Convert_CheckStringUtf8(VALUE str) {
108+
VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
109+
110+
if (rb_obj_encoding(str) == utf8) {
111+
// Note: Just because a string is marked as having UTF-8 encoding does
112+
// not mean that it is *valid* UTF-8. We have to check separately
113+
// whether it is valid.
114+
if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
115+
// TODO: For now
116+
// we only warn for this case. We will remove the warning and throw an
117+
// exception below in the 30.x release
118+
119+
rb_warn(
120+
"String is invalid UTF-8. This will be an error in a future "
121+
"version.");
122+
// VALUE exc = rb_const_get_at(
123+
// rb_cEncoding, rb_intern("InvalidByteSequenceError"));
124+
// rb_raise(exc, "String is invalid UTF-8");
125+
}
126+
} else {
127+
// Note: this will not duplicate underlying string data unless
128+
// necessary.
129+
//
130+
// This will throw an exception if the conversion cannot be performed:
131+
// - Encoding::UndefinedConversionError if certain characters cannot be
132+
// converted to UTF-8.
133+
// - Encoding::InvalidByteSequenceError if certain characters were invalid
134+
// in the source encoding.
135+
str = rb_str_encode(str, utf8, 0, Qnil);
136+
PBRUBY_ASSERT(rb_enc_str_coderange(str) != ENC_CODERANGE_BROKEN);
137+
}
138+
139+
return str;
140+
}
141+
107142
upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
108143
TypeInfo type_info, upb_Arena* arena) {
109144
upb_MessageValue ret;
@@ -137,8 +172,7 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
137172
}
138173
break;
139174
}
140-
case kUpb_CType_String: {
141-
VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
175+
case kUpb_CType_String:
142176
if (rb_obj_class(value) == rb_cSymbol) {
143177
value = rb_funcall(value, rb_intern("to_s"), 0);
144178
} else if (!rb_obj_is_kind_of(value, rb_cString)) {
@@ -147,19 +181,9 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
147181
rb_class2name(CLASS_OF(value)));
148182
}
149183

150-
if (rb_obj_encoding(value) != utf8) {
151-
// Note: this will not duplicate underlying string data unless
152-
// necessary.
153-
value = rb_str_encode(value, utf8, 0, Qnil);
154-
155-
if (rb_enc_str_coderange(value) == ENC_CODERANGE_BROKEN) {
156-
rb_raise(rb_eEncodingError, "String is invalid UTF-8");
157-
}
158-
}
159-
184+
value = Convert_CheckStringUtf8(value);
160185
ret.str_val = Convert_StringData(value, arena);
161186
break;
162-
}
163187
case kUpb_CType_Bytes: {
164188
VALUE bytes = rb_enc_from_encoding(rb_ascii8bit_encoding());
165189
if (rb_obj_class(value) != rb_cString) {

ruby/lib/google/protobuf/ffi/internal/convert.rb

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,18 @@ def convert_ruby_to_upb(value, arena, c_type, msg_or_enum_def)
3333
return_value[:bool_val] = value
3434
when :string
3535
raise TypeError.new "Invalid argument for string field '#{name}' (given #{value.class})." unless value.is_a?(String) or value.is_a?(Symbol)
36-
begin
36+
value = value.to_s if value.is_a?(Symbol)
37+
if value.encoding == Encoding::UTF_8
38+
unless value.valid_encoding?
39+
# TODO:
40+
# For now we only warn for this case. We will remove the
41+
# warning and throw an exception below in the 30.x release
42+
warn "String is invalid UTF-8. This will be an error in a future version."
43+
# raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
44+
end
45+
string_value = value
46+
else
3747
string_value = value.to_s.encode("UTF-8")
38-
rescue Encoding::UndefinedConversionError
39-
# TODO - why not include the field name here?
40-
raise Encoding::UndefinedConversionError.new "String is invalid UTF-8"
4148
end
4249
return_value[:str_val][:size] = string_value.bytesize
4350
return_value[:str_val][:data] = Google::Protobuf::FFI.arena_malloc(arena, string_value.bytesize)

ruby/src/main/java/com/google/protobuf/jruby/Utils.java

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import org.jcodings.specific.ASCIIEncoding;
4141
import org.jcodings.specific.UTF8Encoding;
4242
import org.jruby.*;
43+
import org.jruby.common.RubyWarnings;
4344
import org.jruby.exceptions.RaiseException;
4445
import org.jruby.ext.bigdecimal.RubyBigDecimal;
4546
import org.jruby.runtime.Block;
@@ -389,11 +390,21 @@ private static IRubyObject validateAndEncodeString(
389390
if (!(value instanceof RubyString))
390391
throw createInvalidTypeError(context, fieldType, fieldName, value);
391392

393+
RubyString string = (RubyString) value;
394+
if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
395+
if (string.isCodeRangeBroken()) {
396+
// TODO: For now we only warn for
397+
// this case. We will remove the warning and throw an exception in the 30.x release
398+
context
399+
.runtime
400+
.getWarnings()
401+
.warn("String is invalid UTF-8. This will be an error in a future version.");
402+
}
403+
}
404+
392405
value =
393-
((RubyString) value)
394-
.encode(
395-
context,
396-
context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
406+
string.encode(
407+
context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
397408
value.setFrozen(true);
398409
return value;
399410
}

ruby/tests/BUILD.bazel

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,16 @@ ruby_test(
142142
],
143143
)
144144

145+
ruby_test(
146+
name = "utf8",
147+
srcs = ["utf8.rb"],
148+
deps = [
149+
":test_ruby_protos",
150+
"//ruby:protobuf",
151+
"@protobuf_bundle//:test-unit",
152+
],
153+
)
154+
145155
ruby_test(
146156
name = "well_known_types_test",
147157
srcs = ["well_known_types_test.rb"],

ruby/tests/utf8.proto

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
syntax = "proto2";
2+
3+
package utf8_test_protos;
4+
5+
message TestUtf8 {
6+
optional string optional_string = 1;
7+
repeated string repeated_string = 2;
8+
map<string, string> map_string_string = 3;
9+
}

ruby/tests/utf8.rb

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/ruby
2+
3+
require 'google/protobuf'
4+
require 'utf8_pb'
5+
require 'test/unit'
6+
7+
module CaptureWarnings
8+
@@warnings = nil
9+
10+
module_function
11+
12+
def warn(message, category: nil, **kwargs)
13+
if @@warnings
14+
@@warnings << message
15+
else
16+
super
17+
end
18+
end
19+
20+
def capture
21+
@@warnings = []
22+
yield
23+
@@warnings
24+
ensure
25+
@@warnings = nil
26+
end
27+
end
28+
29+
Warning.extend CaptureWarnings
30+
31+
module Utf8Test
32+
def test_scalar
33+
msg = Utf8TestProtos::TestUtf8.new
34+
assert_bad_utf8 { msg.optional_string = bad_utf8_string() }
35+
end
36+
37+
def test_repeated
38+
msg = Utf8TestProtos::TestUtf8.new
39+
assert_bad_utf8 { msg.repeated_string << bad_utf8_string() }
40+
end
41+
42+
def test_map_key
43+
msg = Utf8TestProtos::TestUtf8.new
44+
assert_bad_utf8 { msg.map_string_string[bad_utf8_string()] = "abc" }
45+
end
46+
47+
def test_map_value
48+
msg = Utf8TestProtos::TestUtf8.new
49+
assert_bad_utf8 { msg.map_string_string["abc"] = bad_utf8_string() }
50+
end
51+
end
52+
53+
# Tests the case of string objects that are marked UTF-8, but contain invalid
54+
# UTF-8.
55+
#
56+
# For now these only warn, but in the next major version they will throw an
57+
# exception.
58+
class MarkedUtf8Test < Test::Unit::TestCase
59+
def assert_bad_utf8(&block)
60+
warnings = CaptureWarnings.capture(&block)
61+
assert_equal 1, warnings.length
62+
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
63+
end
64+
65+
def bad_utf8_string
66+
str = "\x80"
67+
assert_false str.valid_encoding?
68+
str
69+
end
70+
71+
include Utf8Test
72+
end
73+
74+
# This test doesn't work in JRuby because JRuby appears to have a bug where
75+
# the "valid" bit on a string's data is not invalidated properly when the
76+
# string is modified: https://github.com/jruby/jruby/issues/8316
77+
if !defined? JRUBY_VERSION
78+
# Tests the case of string objects that are marked UTF-8, and initially contain
79+
# valid UTF-8, but are later modified to be invalid UTF-8. This may put the
80+
# string into an state of "unknown" validity.
81+
#
82+
# For now these only warn, but in the next major version they will throw an
83+
# exception.
84+
class MarkedModifiedUtf8Test < Test::Unit::TestCase
85+
def assert_bad_utf8(&block)
86+
warnings = CaptureWarnings.capture(&block)
87+
assert_equal 1, warnings.length
88+
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
89+
end
90+
91+
def bad_utf8_string
92+
str = " "
93+
assert_true str.valid_encoding?
94+
str[0] = "\x80"
95+
str
96+
end
97+
98+
include Utf8Test
99+
end
100+
end
101+
102+
# Tests the case of string objects that are marked with a non-UTF-8 encoding,
103+
# but contain invalid UTF-8.
104+
#
105+
# This case will raise Encoding::UndefinedConversionError.
106+
class MarkedNonUtf8Test < Test::Unit::TestCase
107+
def assert_bad_utf8
108+
assert_raises(Encoding::UndefinedConversionError) { yield }
109+
end
110+
111+
def bad_utf8_string
112+
str = "\x80".force_encoding(Encoding::ASCII_8BIT)
113+
assert_true str.valid_encoding?
114+
str
115+
end
116+
117+
include Utf8Test
118+
end
119+
120+
# Tests the case of string objects that are marked with a non-UTF-8 encoding,
121+
# but are invalid even in their source encoding.
122+
#
123+
# This case will raise Encoding::InvalidByteSequenceError
124+
class MarkedNonUtf8Test < Test::Unit::TestCase
125+
def assert_bad_utf8(&block)
126+
assert_raises(Encoding::InvalidByteSequenceError, &block)
127+
end
128+
129+
def bad_utf8_string
130+
str = "\x80".force_encoding(Encoding::ASCII)
131+
assert_false str.valid_encoding?
132+
str
133+
end
134+
135+
include Utf8Test
136+
end

0 commit comments

Comments
 (0)