[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)

haberman · haberman · commit fe9a53cb5184 · 2024-07-17T17:30:14.000Z
This is a warning now, but will be an error in the next version.

PiperOrigin-RevId: 653236128
diff --git a/ruby/Rakefile b/ruby/Rakefile
@@ -35,6 +35,7 @@ test_protos = %w[
   tests/test_import_proto2.proto
   tests/test_ruby_package.proto
   tests/test_ruby_package_proto2.proto
+  tests/utf8.proto
 ]
 
 # These are omitted for now because we don't support proto2.
diff --git a/ruby/ext/google/protobuf_c/convert.c b/ruby/ext/google/protobuf_c/convert.c
@@ -104,6 +104,41 @@ static int32_t Convert_ToEnum(VALUE value, const char* name,
   rb_raise(rb_eRangeError, "Unknown symbol value for enum field '%s'.", name);
 }
 
+VALUE Convert_CheckStringUtf8(VALUE str) {
+  VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
+
+  if (rb_obj_encoding(str) == utf8) {
+    // Note: Just because a string is marked as having UTF-8 encoding does
+    // not mean that it is *valid* UTF-8.  We have to check separately
+    // whether it is valid.
+    if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
+      // TODO: For now
+      // we only warn for this case.  We will remove the warning and throw an
+      // exception below in the 30.x release
+
+      rb_warn(
+          "String is invalid UTF-8. This will be an error in a future "
+          "version.");
+      // VALUE exc = rb_const_get_at(
+      //     rb_cEncoding, rb_intern("InvalidByteSequenceError"));
+      // rb_raise(exc, "String is invalid UTF-8");
+    }
+  } else {
+    // Note: this will not duplicate underlying string data unless
+    // necessary.
+    //
+    // This will throw an exception if the conversion cannot be performed:
+    // - Encoding::UndefinedConversionError if certain characters cannot be
+    //   converted to UTF-8.
+    // - Encoding::InvalidByteSequenceError if certain characters were invalid
+    //   in the source encoding.
+    str = rb_str_encode(str, utf8, 0, Qnil);
+    PBRUBY_ASSERT(rb_enc_str_coderange(str) != ENC_CODERANGE_BROKEN);
+  }
+
+  return str;
+}
+
 upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
                                    TypeInfo type_info, upb_Arena* arena) {
   upb_MessageValue ret;
@@ -137,8 +172,7 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
       }
       break;
     }
-    case kUpb_CType_String: {
-      VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
+    case kUpb_CType_String:
       if (rb_obj_class(value) == rb_cSymbol) {
         value = rb_funcall(value, rb_intern("to_s"), 0);
       } else if (!rb_obj_is_kind_of(value, rb_cString)) {
@@ -147,19 +181,9 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
                  rb_class2name(CLASS_OF(value)));
       }
 
-      if (rb_obj_encoding(value) != utf8) {
-        // Note: this will not duplicate underlying string data unless
-        // necessary.
-        value = rb_str_encode(value, utf8, 0, Qnil);
-
-        if (rb_enc_str_coderange(value) == ENC_CODERANGE_BROKEN) {
-          rb_raise(rb_eEncodingError, "String is invalid UTF-8");
-        }
-      }
-
+      value = Convert_CheckStringUtf8(value);
       ret.str_val = Convert_StringData(value, arena);
       break;
-    }
     case kUpb_CType_Bytes: {
       VALUE bytes = rb_enc_from_encoding(rb_ascii8bit_encoding());
       if (rb_obj_class(value) != rb_cString) {
diff --git a/ruby/lib/google/protobuf/ffi/internal/convert.rb b/ruby/lib/google/protobuf/ffi/internal/convert.rb
@@ -33,11 +33,18 @@ def convert_ruby_to_upb(value, arena, c_type, msg_or_enum_def)
             return_value[:bool_val] = value
           when :string
             raise TypeError.new "Invalid argument for string field '#{name}' (given #{value.class})." unless value.is_a?(String) or value.is_a?(Symbol)
-            begin
+            value = value.to_s if value.is_a?(Symbol)
+            if value.encoding == Encoding::UTF_8
+              unless value.valid_encoding?
+                # TODO:
+                # For now we only warn for this case.  We will remove the
+                # warning and throw an exception below in the 30.x release
+                warn "String is invalid UTF-8. This will be an error in a future version."
+                # raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
+              end
+              string_value = value
+            else
               string_value = value.to_s.encode("UTF-8")
-            rescue Encoding::UndefinedConversionError
-              # TODO - why not include the field name here?
-              raise Encoding::UndefinedConversionError.new "String is invalid UTF-8"
             end
             return_value[:str_val][:size] = string_value.bytesize
             return_value[:str_val][:data] = Google::Protobuf::FFI.arena_malloc(arena, string_value.bytesize)
diff --git a/ruby/src/main/java/com/google/protobuf/jruby/Utils.java b/ruby/src/main/java/com/google/protobuf/jruby/Utils.java
@@ -40,6 +40,7 @@
 import org.jcodings.specific.ASCIIEncoding;
 import org.jcodings.specific.UTF8Encoding;
 import org.jruby.*;
+import org.jruby.common.RubyWarnings;
 import org.jruby.exceptions.RaiseException;
 import org.jruby.ext.bigdecimal.RubyBigDecimal;
 import org.jruby.runtime.Block;
@@ -389,11 +390,21 @@ private static IRubyObject validateAndEncodeString(
     if (!(value instanceof RubyString))
       throw createInvalidTypeError(context, fieldType, fieldName, value);
 
+    RubyString string = (RubyString) value;
+    if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
+      if (string.isCodeRangeBroken()) {
+        // TODO: For now we only warn for
+        // this case.  We will remove the warning and throw an exception in the 30.x release
+        context
+            .runtime
+            .getWarnings()
+            .warn("String is invalid UTF-8. This will be an error in a future version.");
+      }
+    }
+
     value =
-        ((RubyString) value)
-            .encode(
-                context,
-                context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
+        string.encode(
+            context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
     value.setFrozen(true);
     return value;
   }
diff --git a/ruby/tests/BUILD.bazel b/ruby/tests/BUILD.bazel
@@ -142,6 +142,16 @@ ruby_test(
     ],
 )
 
+ruby_test(
+    name = "utf8",
+    srcs = ["utf8.rb"],
+    deps = [
+        ":test_ruby_protos",
+        "//ruby:protobuf",
+        "@protobuf_bundle//:test-unit",
+    ],
+)
+
 ruby_test(
     name = "well_known_types_test",
     srcs = ["well_known_types_test.rb"],
diff --git a/ruby/tests/utf8.proto b/ruby/tests/utf8.proto
@@ -0,0 +1,9 @@
+syntax = "proto2";
+
+package utf8_test_protos;
+
+message TestUtf8 {
+  optional string optional_string = 1;
+  repeated string repeated_string = 2;
+  map<string, string> map_string_string = 3;
+}
diff --git a/ruby/tests/utf8.rb b/ruby/tests/utf8.rb
@@ -0,0 +1,136 @@
+#!/usr/bin/ruby
+
+require 'google/protobuf'
+require 'utf8_pb'
+require 'test/unit'
+
+module CaptureWarnings
+  @@warnings = nil
+
+  module_function
+
+  def warn(message, category: nil, **kwargs)
+    if @@warnings
+      @@warnings << message
+    else
+      super
+    end
+  end
+
+  def capture
+    @@warnings = []
+    yield
+    @@warnings
+  ensure
+    @@warnings = nil
+  end
+end
+
+Warning.extend CaptureWarnings
+
+module Utf8Test
+  def test_scalar
+    msg = Utf8TestProtos::TestUtf8.new
+    assert_bad_utf8 { msg.optional_string = bad_utf8_string() }
+  end
+
+  def test_repeated
+    msg = Utf8TestProtos::TestUtf8.new
+    assert_bad_utf8 { msg.repeated_string << bad_utf8_string() }
+  end
+
+  def test_map_key
+    msg = Utf8TestProtos::TestUtf8.new
+    assert_bad_utf8 { msg.map_string_string[bad_utf8_string()] = "abc" }
+  end
+
+  def test_map_value
+    msg = Utf8TestProtos::TestUtf8.new
+    assert_bad_utf8 { msg.map_string_string["abc"] = bad_utf8_string() }
+  end
+end
+
+# Tests the case of string objects that are marked UTF-8, but contain invalid
+# UTF-8.
+#
+# For now these only warn, but in the next major version they will throw an
+# exception.
+class MarkedUtf8Test < Test::Unit::TestCase
+  def assert_bad_utf8(&block)
+    warnings = CaptureWarnings.capture(&block)
+    assert_equal 1, warnings.length
+    assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
+  end
+
+  def bad_utf8_string
+    str = "\x80"
+    assert_false str.valid_encoding?
+    str
+  end
+
+  include Utf8Test
+end
+
+# This test doesn't work in JRuby because JRuby appears to have a bug where
+# the "valid" bit on a string's data is not invalidated properly when the
+# string is modified: https://github.com/jruby/jruby/issues/8316
+if !defined? JRUBY_VERSION
+  # Tests the case of string objects that are marked UTF-8, and initially contain
+  # valid UTF-8, but are later modified to be invalid UTF-8.  This may put the
+  # string into an state of "unknown" validity.
+  #
+  # For now these only warn, but in the next major version they will throw an
+  # exception.
+  class MarkedModifiedUtf8Test < Test::Unit::TestCase
+    def assert_bad_utf8(&block)
+      warnings = CaptureWarnings.capture(&block)
+      assert_equal 1, warnings.length
+      assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
+    end
+
+    def bad_utf8_string
+      str = " "
+      assert_true str.valid_encoding?
+      str[0] = "\x80"
+      str
+    end
+
+    include Utf8Test
+  end
+end
+
+# Tests the case of string objects that are marked with a non-UTF-8 encoding,
+# but contain invalid UTF-8.
+#
+# This case will raise Encoding::UndefinedConversionError.
+class MarkedNonUtf8Test < Test::Unit::TestCase
+  def assert_bad_utf8
+    assert_raises(Encoding::UndefinedConversionError) { yield }
+  end
+
+  def bad_utf8_string
+    str = "\x80".force_encoding(Encoding::ASCII_8BIT)
+    assert_true str.valid_encoding?
+    str
+  end
+
+  include Utf8Test
+end
+
+# Tests the case of string objects that are marked with a non-UTF-8 encoding,
+# but are invalid even in their source encoding.
+#
+# This case will raise Encoding::InvalidByteSequenceError
+class MarkedNonUtf8Test < Test::Unit::TestCase
+  def assert_bad_utf8(&block)
+    assert_raises(Encoding::InvalidByteSequenceError, &block)
+  end
+
+  def bad_utf8_string
+    str = "\x80".force_encoding(Encoding::ASCII)
+    assert_false str.valid_encoding?
+    str
+  end
+
+  include Utf8Test
+end

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@ test_protos = %w[`
`35`	`35`	`tests/test_import_proto2.proto`
`36`	`36`	`tests/test_ruby_package.proto`
`37`	`37`	`tests/test_ruby_package_proto2.proto`
	`38`	`+ tests/utf8.proto`
`38`	`39`	`]`
`39`	`40`
`40`	`41`	`# These are omitted for now because we don't support proto2.`