diff --git a/src/java_bytecode/java_bytecode_convert_class.cpp b/src/java_bytecode/java_bytecode_convert_class.cpp index 440b0ce32cf..f21b0bb1ce2 100644 --- a/src/java_bytecode/java_bytecode_convert_class.cpp +++ b/src/java_bytecode/java_bytecode_convert_class.cpp @@ -30,13 +30,15 @@ class java_bytecode_convert_classt:public messaget bool _disable_runtime_checks, size_t _max_array_length, lazy_methodst& _lazy_methods, - lazy_methods_modet _lazy_methods_mode): + lazy_methods_modet _lazy_methods_mode, + bool _string_refinement_enabled): messaget(_message_handler), symbol_table(_symbol_table), disable_runtime_checks(_disable_runtime_checks), max_array_length(_max_array_length), lazy_methods(_lazy_methods), - lazy_methods_mode(_lazy_methods_mode) + lazy_methods_mode(_lazy_methods_mode), + string_refinement_enabled(_string_refinement_enabled) { } @@ -46,6 +48,9 @@ class java_bytecode_convert_classt:public messaget if(parse_tree.loading_successful) convert(parse_tree.parsed_class); + else if(string_refinement_enabled && + parse_tree.parsed_class.name=="java.lang.String") + add_string_type(); else generate_class_stub(parse_tree.parsed_class.name); } @@ -59,6 +64,7 @@ class java_bytecode_convert_classt:public messaget const size_t max_array_length; lazy_methodst &lazy_methods; lazy_methods_modet lazy_methods_mode; + bool string_refinement_enabled; // conversion void convert(const classt &c); @@ -66,6 +72,7 @@ class java_bytecode_convert_classt:public messaget void generate_class_stub(const irep_idt &class_name); void add_array_types(); + void add_string_type(); }; /*******************************************************************\ @@ -360,7 +367,8 @@ bool java_bytecode_convert_class( bool disable_runtime_checks, size_t max_array_length, lazy_methodst &lazy_methods, - lazy_methods_modet lazy_methods_mode) + lazy_methods_modet lazy_methods_mode, + bool string_refinement_enabled) { java_bytecode_convert_classt java_bytecode_convert_class( symbol_table, @@ -368,7 +376,8 @@ bool java_bytecode_convert_class( disable_runtime_checks, max_array_length, lazy_methods, - lazy_methods_mode); + lazy_methods_mode, + string_refinement_enabled); try { @@ -392,3 +401,64 @@ bool java_bytecode_convert_class( return true; } + +/*******************************************************************\ + +Function: java_bytecode_convert_classt::add_string_type + + Purpose: Implements the java.lang.String type in the case that + we provide an internal implementation. + +\*******************************************************************/ + +void java_bytecode_convert_classt::add_string_type() +{ + class_typet string_type; + string_type.set_tag("java.lang.String"); + string_type.components().resize(3); + string_type.components()[0].set_name("@java.lang.Object"); + string_type.components()[0].set_pretty_name("@java.lang.Object"); + string_type.components()[0].type()=symbol_typet("java::java.lang.Object"); + string_type.components()[1].set_name("length"); + string_type.components()[1].set_pretty_name("length"); + string_type.components()[1].type()=java_int_type(); + string_type.components()[2].set_name("data"); + string_type.components()[2].set_pretty_name("data"); + // Use a pointer-to-unbounded-array instead of a pointer-to-char. + // Saves some casting in the string refinement algorithm but may + // be unnecessary. + string_type.components()[2].type()=pointer_typet( + array_typet(java_char_type(), infinity_exprt(java_int_type()))); + string_type.add_base(symbol_typet("java::java.lang.Object")); + + symbolt string_symbol; + string_symbol.name="java::java.lang.String"; + string_symbol.base_name="java.lang.String"; + string_symbol.type=string_type; + string_symbol.is_type=true; + + symbol_table.add(string_symbol); + + // Also add a stub of `String.equals` so that remove-virtual-functions + // generates a check for Object.equals vs. String.equals. + // No need to fill it in, as pass_preprocess will replace the calls again. + symbolt string_equals_symbol; + string_equals_symbol.name= + "java::java.lang.String.equals:(Ljava/lang/Object;)Z"; + string_equals_symbol.base_name="java.lang.String.equals"; + string_equals_symbol.pretty_name="java.lang.String.equals"; + string_equals_symbol.mode=ID_java; + + code_typet string_equals_type; + string_equals_type.return_type()=java_boolean_type(); + code_typet::parametert thisparam; + thisparam.set_this(); + thisparam.type()=pointer_typet(symbol_typet(string_symbol.name)); + code_typet::parametert otherparam; + otherparam.type()=pointer_typet(symbol_typet("java::java.lang.Object")); + string_equals_type.parameters().push_back(thisparam); + string_equals_type.parameters().push_back(otherparam); + string_equals_symbol.type=std::move(string_equals_type); + + symbol_table.add(string_equals_symbol); +} diff --git a/src/java_bytecode/java_bytecode_convert_class.h b/src/java_bytecode/java_bytecode_convert_class.h index 5cc1526a125..0d2be3d8202 100644 --- a/src/java_bytecode/java_bytecode_convert_class.h +++ b/src/java_bytecode/java_bytecode_convert_class.h @@ -22,6 +22,7 @@ bool java_bytecode_convert_class( bool disable_runtime_checks, size_t max_array_length, lazy_methodst &, - lazy_methods_modet); + lazy_methods_modet, + bool string_refinement_enabled); #endif // CPROVER_JAVA_BYTECODE_JAVA_BYTECODE_CONVERT_CLASS_H diff --git a/src/java_bytecode/java_bytecode_language.cpp b/src/java_bytecode/java_bytecode_language.cpp index e9d5a87186b..668960dbe17 100644 --- a/src/java_bytecode/java_bytecode_language.cpp +++ b/src/java_bytecode/java_bytecode_language.cpp @@ -43,6 +43,7 @@ void java_bytecode_languaget::get_language_options(const cmdlinet &cmd) { disable_runtime_checks=cmd.isset("disable-runtime-check"); assume_inputs_non_null=cmd.isset("java-assume-inputs-non-null"); + string_refinement_enabled=cmd.isset("string-refine"); if(cmd.isset("java-max-input-array-length")) max_nondet_array_length= std::stoi(cmd.get_value("java-max-input-array-length")); @@ -494,7 +495,8 @@ bool java_bytecode_languaget::typecheck( disable_runtime_checks, max_user_array_length, lazy_methods, - lazy_methods_mode)) + lazy_methods_mode, + string_refinement_enabled)) return true; } @@ -509,7 +511,7 @@ bool java_bytecode_languaget::typecheck( // now typecheck all if(java_bytecode_typecheck( - symbol_table, get_message_handler())) + symbol_table, get_message_handler(), string_refinement_enabled)) return true; return false; diff --git a/src/java_bytecode/java_bytecode_language.h b/src/java_bytecode/java_bytecode_language.h index ea177e0226a..643eacd5b7f 100644 --- a/src/java_bytecode/java_bytecode_language.h +++ b/src/java_bytecode/java_bytecode_language.h @@ -104,6 +104,7 @@ class java_bytecode_languaget:public languaget size_t max_user_array_length; // max size for user code created arrays lazy_methodst lazy_methods; lazy_methods_modet lazy_methods_mode; + bool string_refinement_enabled; }; languaget *new_java_bytecode_language(); diff --git a/src/java_bytecode/java_bytecode_typecheck.cpp b/src/java_bytecode/java_bytecode_typecheck.cpp index e72b80a885d..4f09e366452 100644 --- a/src/java_bytecode/java_bytecode_typecheck.cpp +++ b/src/java_bytecode/java_bytecode_typecheck.cpp @@ -126,10 +126,11 @@ Function: java_bytecode_typecheck bool java_bytecode_typecheck( symbol_tablet &symbol_table, - message_handlert &message_handler) + message_handlert &message_handler, + bool string_refinement_enabled) { java_bytecode_typecheckt java_bytecode_typecheck( - symbol_table, message_handler); + symbol_table, message_handler, string_refinement_enabled); return java_bytecode_typecheck.typecheck_main(); } diff --git a/src/java_bytecode/java_bytecode_typecheck.h b/src/java_bytecode/java_bytecode_typecheck.h index cceba54dee5..618b6fce44d 100644 --- a/src/java_bytecode/java_bytecode_typecheck.h +++ b/src/java_bytecode/java_bytecode_typecheck.h @@ -21,7 +21,8 @@ Author: Daniel Kroening, kroening@kroening.com bool java_bytecode_typecheck( symbol_tablet &symbol_table, - message_handlert &message_handler); + message_handlert &message_handler, + bool string_refinement_enabled); bool java_bytecode_typecheck( exprt &expr, @@ -33,10 +34,12 @@ class java_bytecode_typecheckt:public typecheckt public: java_bytecode_typecheckt( symbol_tablet &_symbol_table, - message_handlert &_message_handler): + message_handlert &_message_handler, + bool _string_refinement_enabled): typecheckt(_message_handler), symbol_table(_symbol_table), - ns(symbol_table) + ns(symbol_table), + string_refinement_enabled(_string_refinement_enabled) { } @@ -48,6 +51,7 @@ class java_bytecode_typecheckt:public typecheckt protected: symbol_tablet &symbol_table; const namespacet ns; + bool string_refinement_enabled; void typecheck_type_symbol(symbolt &); void typecheck_non_type_symbol(symbolt &); diff --git a/src/java_bytecode/java_bytecode_typecheck_expr.cpp b/src/java_bytecode/java_bytecode_typecheck_expr.cpp index c2ba2949d7d..11e488ff7fe 100644 --- a/src/java_bytecode/java_bytecode_typecheck_expr.cpp +++ b/src/java_bytecode/java_bytecode_typecheck_expr.cpp @@ -10,9 +10,14 @@ Author: Daniel Kroening, kroening@kroening.com #include #include +#include +#include + +#include #include "java_bytecode_typecheck.h" #include "java_pointer_casts.h" +#include "java_types.h" /*******************************************************************\ @@ -114,6 +119,27 @@ static std::string escape_non_alnum(const std::string &toescape) /*******************************************************************\ +Function: utf16_to_array + + Inputs: `in`: wide string to convert + + Outputs: Returns a Java char array containing the same wchars. + + Purpose: Convert UCS-2 or UTF-16 to an array expression. + +\*******************************************************************/ + +static array_exprt utf16_to_array(const std::wstring &in) +{ + const auto jchar=java_char_type(); + array_exprt ret(array_typet(jchar, infinity_exprt(java_int_type()))); + for(const auto c : in) + ret.copy_to_operands(from_integer(c, jchar)); + return ret; +} + +/*******************************************************************\ + Function: java_bytecode_typecheckt::typecheck_expr_java_string_literal Inputs: @@ -136,14 +162,14 @@ void java_bytecode_typecheckt::typecheck_expr_java_string_literal(exprt &expr) auto findit=symbol_table.symbols.find(escaped_symbol_name); if(findit!=symbol_table.symbols.end()) { - expr=findit->second.symbol_expr(); + expr=address_of_exprt(findit->second.symbol_expr()); return; } // Create a new symbol: symbolt new_symbol; new_symbol.name=escaped_symbol_name; - new_symbol.type=pointer_typet(string_type); + new_symbol.type=string_type; new_symbol.base_name="Literal"; new_symbol.pretty_name=value; new_symbol.mode=ID_java; @@ -151,13 +177,91 @@ void java_bytecode_typecheckt::typecheck_expr_java_string_literal(exprt &expr) new_symbol.is_lvalue=true; new_symbol.is_static_lifetime=true; // These are basically const global data. + // Regardless of string refinement setting, at least initialize + // the literal with @clsid = String and @lock = false: + symbol_typet jlo_symbol("java::java.lang.Object"); + const auto &jlo_struct=to_struct_type(ns.follow(jlo_symbol)); + struct_exprt jlo_init(jlo_symbol); + const auto &jls_struct=to_struct_type(ns.follow(string_type)); + + jlo_init.copy_to_operands( + constant_exprt( + "java::java.lang.String", + jlo_struct.components()[0].type())); + jlo_init.copy_to_operands( + from_integer( + 0, + jlo_struct.components()[1].type())); + + // If string refinement *is* around, populate the actual + // contents as well: + if(string_refinement_enabled) + { + struct_exprt literal_init(new_symbol.type); + literal_init.move_to_operands(jlo_init); + + // Initialize the string with a constant utf-16 array: + symbolt array_symbol; + array_symbol.name=escaped_symbol_name+"_constarray"; + array_symbol.type=array_typet( + java_char_type(), infinity_exprt(java_int_type())); + array_symbol.base_name="Literal_constarray"; + array_symbol.pretty_name=value; + array_symbol.mode=ID_java; + array_symbol.is_type=false; + array_symbol.is_lvalue=true; + // These are basically const global data: + array_symbol.is_static_lifetime=true; + array_symbol.is_state_var=true; + auto literal_array=utf16_to_array( + utf8_to_utf16_little_endian(id2string(value))); + array_symbol.value=literal_array; + + if(symbol_table.add(array_symbol)) + throw "failed to add constarray symbol to symbol table"; + + literal_init.copy_to_operands( + from_integer(literal_array.operands().size(), + jls_struct.components()[1].type())); + literal_init.copy_to_operands( + address_of_exprt(array_symbol.symbol_expr())); + + new_symbol.value=literal_init; + } + else if(jls_struct.components().size()>=1 && + jls_struct.components()[0].get_name()=="@java.lang.Object") + { + // Case where something defined java.lang.String, so it has + // a proper base class (always java.lang.Object in practical + // JDKs seen so far) + struct_exprt literal_init(new_symbol.type); + literal_init.move_to_operands(jlo_init); + for(const auto &comp : jls_struct.components()) + { + if(comp.get_name()=="@java.lang.Object") + continue; + // Other members of JDK's java.lang.String we don't understand + // without string-refinement. Just zero-init them; consider using + // test-gen-like nondet object trees instead. + literal_init.copy_to_operands( + zero_initializer(comp.type(), expr.source_location(), ns)); + } + new_symbol.value=literal_init; + } + else if(jls_struct.get_bool(ID_incomplete_class)) + { + // Case where java.lang.String was stubbed, and so directly defines + // @class_identifier and @lock: + new_symbol.value=jlo_init; + } + if(symbol_table.add(new_symbol)) { error() << "failed to add string literal symbol to symbol table" << eom; throw 0; } - expr=new_symbol.symbol_expr(); + expr=address_of_exprt(new_symbol.symbol_expr()); } /*******************************************************************\