@@ -104,6 +104,41 @@ static int32_t Convert_ToEnum(VALUE value, const char* name,
104104 rb_raise (rb_eRangeError , "Unknown symbol value for enum field '%s'." , name );
105105}
106106
107+ VALUE Convert_CheckStringUtf8 (VALUE str ) {
108+ VALUE utf8 = rb_enc_from_encoding (rb_utf8_encoding ());
109+
110+ if (rb_obj_encoding (str ) == utf8 ) {
111+ // Note: Just because a string is marked as having UTF-8 encoding does
112+ // not mean that it is *valid* UTF-8. We have to check separately
113+ // whether it is valid.
114+ if (rb_enc_str_coderange (str ) == ENC_CODERANGE_BROKEN ) {
115+ // TODO: For now
116+ // we only warn for this case. We will remove the warning and throw an
117+ // exception below in the 30.x release
118+
119+ rb_warn (
120+ "String is invalid UTF-8. This will be an error in a future "
121+ "version." );
122+ // VALUE exc = rb_const_get_at(
123+ // rb_cEncoding, rb_intern("InvalidByteSequenceError"));
124+ // rb_raise(exc, "String is invalid UTF-8");
125+ }
126+ } else {
127+ // Note: this will not duplicate underlying string data unless
128+ // necessary.
129+ //
130+ // This will throw an exception if the conversion cannot be performed:
131+ // - Encoding::UndefinedConversionError if certain characters cannot be
132+ // converted to UTF-8.
133+ // - Encoding::InvalidByteSequenceError if certain characters were invalid
134+ // in the source encoding.
135+ str = rb_str_encode (str , utf8 , 0 , Qnil );
136+ PBRUBY_ASSERT (rb_enc_str_coderange (str ) != ENC_CODERANGE_BROKEN );
137+ }
138+
139+ return str ;
140+ }
141+
107142upb_MessageValue Convert_RubyToUpb (VALUE value , const char * name ,
108143 TypeInfo type_info , upb_Arena * arena ) {
109144 upb_MessageValue ret ;
@@ -137,8 +172,7 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
137172 }
138173 break ;
139174 }
140- case kUpb_CType_String : {
141- VALUE utf8 = rb_enc_from_encoding (rb_utf8_encoding ());
175+ case kUpb_CType_String :
142176 if (rb_obj_class (value ) == rb_cSymbol ) {
143177 value = rb_funcall (value , rb_intern ("to_s" ), 0 );
144178 } else if (!rb_obj_is_kind_of (value , rb_cString )) {
@@ -147,19 +181,9 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
147181 rb_class2name (CLASS_OF (value )));
148182 }
149183
150- if (rb_obj_encoding (value ) != utf8 ) {
151- // Note: this will not duplicate underlying string data unless
152- // necessary.
153- value = rb_str_encode (value , utf8 , 0 , Qnil );
154-
155- if (rb_enc_str_coderange (value ) == ENC_CODERANGE_BROKEN ) {
156- rb_raise (rb_eEncodingError , "String is invalid UTF-8" );
157- }
158- }
159-
184+ value = Convert_CheckStringUtf8 (value );
160185 ret .str_val = Convert_StringData (value , arena );
161186 break ;
162- }
163187 case kUpb_CType_Bytes : {
164188 VALUE bytes = rb_enc_from_encoding (rb_ascii8bit_encoding ());
165189 if (rb_obj_class (value ) != rb_cString ) {
0 commit comments