diff --git a/src/java_bytecode/Makefile b/src/java_bytecode/Makefile index 1c0747903cf..677987cfc9b 100644 --- a/src/java_bytecode/Makefile +++ b/src/java_bytecode/Makefile @@ -1,4 +1,5 @@ SRC = bytecode_info.cpp \ + character_refine_preprocess.cpp \ ci_lazy_methods.cpp \ expr2java.cpp \ jar_file.cpp \ diff --git a/src/java_bytecode/character_refine_preprocess.cpp b/src/java_bytecode/character_refine_preprocess.cpp new file mode 100644 index 00000000000..d6dbfc38315 --- /dev/null +++ b/src/java_bytecode/character_refine_preprocess.cpp @@ -0,0 +1,2510 @@ +/*******************************************************************\ + +Module: Preprocess a goto-programs so that calls to the java Character + library are replaced by simple expressions. + +Author: Romain Brenguier + +Date: March 2017 + +\*******************************************************************/ + +#include +#include +#include "character_refine_preprocess.h" + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_char_function + + Inputs: + expr_function - A reference to a function on expressions + target - A position in a goto program + + Purpose: converts based on a function on expressions + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_char_function( + exprt (*expr_function)(const exprt &chr, const typet &type), + conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==1); + const exprt &arg=function_call.arguments()[0]; + const exprt &result=function_call.lhs(); + const typet &type=result.type(); + return code_assignt(result, expr_function(arg, type)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::in_interval_expr + + Inputs: + arg - Expression we want to bound + lower_bound - Integer lower bound + upper_bound - Integer upper bound + + Outputs: A Boolean expression + + Purpose: The returned expression is true when the first argument is in the + interval defined by the lower and upper bounds (included) + +\*******************************************************************/ + +exprt character_refine_preprocesst::in_interval_expr( + const exprt &chr, + const mp_integer &lower_bound, + const mp_integer &upper_bound) +{ + return and_exprt( + binary_relation_exprt(chr, ID_ge, from_integer(lower_bound, chr.type())), + binary_relation_exprt(chr, ID_le, from_integer(upper_bound, chr.type()))); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::in_list_expr + + Inputs: + chr - An expression of type character + list - A list of integer representing unicode characters + + Outputs: A Boolean expression + + Purpose: The returned expression is true when the given character + is equal to one of the element in the list + +\*******************************************************************/ + +exprt character_refine_preprocesst::in_list_expr( + const exprt &chr, const std::list &list) +{ + exprt::operandst ops; + for(const auto &i : list) + ops.push_back(equal_exprt(chr, from_integer(i, chr.type()))); + return disjunction(ops); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_char_count + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A integer expression of the given type + + Purpose: Determines the number of char values needed to represent + the specified character (Unicode code point). + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_char_count( + const exprt &chr, const typet &type) +{ + exprt u010000=from_integer(0x010000, type); + binary_relation_exprt small(chr, ID_lt, u010000); + return if_exprt(small, from_integer(1, type), from_integer(2, type)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_char_count + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.charCount:(I)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_char_count(conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_char_count, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_char_value + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Casts the given expression to the given type + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_char_value( + const exprt &chr, const typet &type) +{ + return typecast_exprt(chr, type); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_char_value + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.charValue:()C + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_char_value(conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_char_value, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_compare + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.compare:(CC)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_compare(conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==2); + const exprt &char1=function_call.arguments()[0]; + const exprt &char2=function_call.arguments()[1]; + const exprt &result=function_call.lhs(); + const typet &type=result.type(); + binary_relation_exprt smaller(char1, ID_lt, char2); + binary_relation_exprt greater(char1, ID_gt, char2); + if_exprt expr( + smaller, + from_integer(-1, type), + if_exprt(greater, from_integer(1, type), from_integer(0, type))); + + return code_assignt(result, expr); +} + + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_digit_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.digit:(CI)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_digit_char(conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==2); + const exprt &arg=function_call.arguments()[0]; + const exprt &radix=function_call.arguments()[1]; + const exprt &result=function_call.lhs(); + const typet &type=result.type(); + + // TODO: If the radix is not in the range MIN_RADIX <= radix <= MAX_RADIX or + // if the value of ch is not a valid digit in the specified radix, + // -1 is returned. + + // Case 1: The method isDigit is true of the character and the Unicode + // decimal digit value of the character (or its single-character + // decomposition) is less than the specified radix. + exprt invalid=from_integer(-1, arg.type()); + exprt c0=from_integer('0', arg.type()); + exprt latin_digit=in_interval_expr(arg, '0', '9'); + minus_exprt value1(arg, c0); + // TODO: this is only valid for latin digits + if_exprt case1( + binary_relation_exprt(value1, ID_lt, radix), value1, invalid); + + // Case 2: The character is one of the uppercase Latin letters 'A' + // through 'Z' and its code is less than radix + 'A' - 10, + // then ch - 'A' + 10 is returned. + exprt cA=from_integer('A', arg.type()); + exprt i10=from_integer(10, arg.type()); + exprt upper_case=in_interval_expr(arg, 'A', 'Z'); + plus_exprt value2(minus_exprt(arg, cA), i10); + if_exprt case2( + binary_relation_exprt(value2, ID_lt, radix), value2, invalid); + + // The character is one of the lowercase Latin letters 'a' through 'z' and + // its code is less than radix + 'a' - 10, then ch - 'a' + 10 is returned. + exprt ca=from_integer('a', arg.type()); + exprt lower_case=in_interval_expr(arg, 'a', 'z'); + plus_exprt value3(minus_exprt(arg, ca), i10); + if_exprt case3( + binary_relation_exprt(value3, ID_lt, radix), value3, invalid); + + + // The character is one of the fullwidth uppercase Latin letters A ('\uFF21') + // through Z ('\uFF3A') and its code is less than radix + '\uFF21' - 10. + // In this case, ch - '\uFF21' + 10 is returned. + exprt uFF21=from_integer(0xFF21, arg.type()); + exprt fullwidth_upper_case=in_interval_expr(arg, 0xFF21, 0xFF3A); + plus_exprt value4(minus_exprt(arg, uFF21), i10); + if_exprt case4( + binary_relation_exprt(value4, ID_lt, radix), value4, invalid); + + // The character is one of the fullwidth lowercase Latin letters a ('\uFF41') + // through z ('\uFF5A') and its code is less than radix + '\uFF41' - 10. + // In this case, ch - '\uFF41' + 10 is returned. + exprt uFF41=from_integer(0xFF41, arg.type()); + plus_exprt value5(minus_exprt(arg, uFF41), i10); + if_exprt case5( + binary_relation_exprt(value5, ID_lt, radix), value5, invalid); + + if_exprt fullwidth_cases(fullwidth_upper_case, case4, case5); + if_exprt expr( + latin_digit, + case1, + if_exprt(upper_case, case2, if_exprt(lower_case, case3, fullwidth_cases))); + typecast_exprt tc_expr(expr, type); + + return code_assignt(result, tc_expr); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_char_is_digit_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.digit:(II)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_digit_int(conversion_input &target) +{ + return convert_digit_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_for_digit + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.forDigit:(II)I + + TODO: For now the radix argument is ignored + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_for_digit(conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==2); + const exprt &digit=function_call.arguments()[0]; + const exprt &result=function_call.lhs(); + const typet &type=result.type(); + + exprt d10=from_integer(10, type); + binary_relation_exprt small(digit, ID_le, d10); + plus_exprt value1(digit, from_integer('0', type)); + minus_exprt value2(plus_exprt(digit, from_integer('a', digit.type())), d10); + return code_assignt(result, if_exprt(small, value1, value2)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_get_directionality_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.getDirectionality:(C)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_get_directionality_char( + conversion_input &target) +{ + // TODO: This is unimplemented for now as it requires analyzing + // the UnicodeData file to find characters directionality. + return target; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_char_is_digit_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.getDirectionality:(I)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_get_directionality_int( + conversion_input &target) +{ + return convert_get_directionality_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_get_numeric_value_char + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.getNumericValue:(C)I + + TODO: For now this is only for ASCII characters + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_get_numeric_value_char( + conversion_input &target) +{ + return convert_digit_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_get_numeric_value_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.getNumericValue:(C)I + + TODO: For now this is only for ASCII characters + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_get_numeric_value_int( + conversion_input &target) +{ + return convert_digit_int(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_get_type_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.getType:(C)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_get_type_char( + conversion_input &target) +{ + // TODO: This is unimplemented for now as it requires analyzing + // the UnicodeData file to categorize characters. + return target; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_get_type_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.getType:(I)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_get_type_int( + conversion_input &target) +{ + return convert_get_type_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_hash_code + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.hashCode:()I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_hash_code(conversion_input &target) +{ + return convert_char_value(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_high_surrogate + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Returns the leading surrogate (a high surrogate code unit) + of the surrogate pair representing the specified + supplementary character (Unicode code point) in the UTF-16 + encoding. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_high_surrogate( + const exprt &chr, const typet &type) +{ + exprt u10000=from_integer(0x010000, type); + exprt uD800=from_integer(0xD800, type); + exprt u400=from_integer(0x0400, type); + + plus_exprt high_surrogate(uD800, div_exprt(minus_exprt(chr, u10000), u400)); + return high_surrogate; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_high_surrogate + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.highSurrogate:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_high_surrogate( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_high_surrogate, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_ascii_lower_case + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the specified character is an ASCII lowercase + character. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_ascii_lower_case( + const exprt &chr, const typet &type) +{ + return in_interval_expr(chr, 'a', 'z'); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_ascii_upper_case + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the specified character is an ASCII uppercase + character. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_ascii_upper_case( + const exprt &chr, const typet &type) +{ + return in_interval_expr(chr, 'A', 'Z'); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_letter + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Determines if the specified character is a letter. + + TODO: for now this is only for ASCII characters, the + following unicode categories are not yet considered: + TITLECASE_LETTER MODIFIER_LETTER OTHER_LETTER LETTER_NUMBER + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_letter( + const exprt &chr, const typet &type) +{ + return or_exprt( + expr_of_is_ascii_upper_case(chr, type), + expr_of_is_ascii_lower_case(chr, type)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_alphabetic + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Determines if the specified character (Unicode code point) + is alphabetic. + + TODO: for now this is only for ASCII characters, the + following unicode categorise are not yet considered: + TITLECASE_LETTER MODIFIER_LETTER OTHER_LETTER LETTER_NUMBER + and contributory property Other_Alphabetic as defined by the + Unicode Standard. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_alphabetic( + const exprt &chr, const typet &type) +{ + return expr_of_is_letter(chr, type); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_alphabetic + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isAlphabetic:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_alphabetic( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_alphabetic, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_bmp_code_point + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines whether the specified character (Unicode code + point) is in the Basic Multilingual Plane (BMP). Such code + points can be represented using a single char. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_bmp_code_point( + const exprt &chr, const typet &type) +{ + binary_relation_exprt is_bmp(chr, ID_le, from_integer(0xFFFF, chr.type())); + return is_bmp; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_bmp_code_point + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isBmpCodePoint:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_bmp_code_point( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_bmp_code_point, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_for_is_defined + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Determines if a character is defined in Unicode. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_defined( + const exprt &chr, const typet &type) +{ + // The following intervals are undefined in unicode, according to + // the Unicode Character Database: http://www.unicode.org/Public/UCD/latest/ + exprt::operandst intervals; + intervals.push_back(in_interval_expr(chr, 0x0750, 0x077F)); + intervals.push_back(in_interval_expr(chr, 0x07C0, 0x08FF)); + intervals.push_back(in_interval_expr(chr, 0x1380, 0x139F)); + intervals.push_back(in_interval_expr(chr, 0x18B0, 0x18FF)); + intervals.push_back(in_interval_expr(chr, 0x1980, 0x19DF)); + intervals.push_back(in_interval_expr(chr, 0x1A00, 0x1CFF)); + intervals.push_back(in_interval_expr(chr, 0x1D80, 0x1DFF)); + intervals.push_back(in_interval_expr(chr, 0x2C00, 0x2E7F)); + intervals.push_back(in_interval_expr(chr, 0x2FE0, 0x2FEF)); + intervals.push_back(in_interval_expr(chr, 0x31C0, 0x31EF)); + intervals.push_back(in_interval_expr(chr, 0x9FB0, 0x9FFF)); + intervals.push_back(in_interval_expr(chr, 0xA4D0, 0xABFF)); + intervals.push_back(in_interval_expr(chr, 0xD7B0, 0xD7FF)); + intervals.push_back(in_interval_expr(chr, 0xFE10, 0xFE1F)); + intervals.push_back(in_interval_expr(chr, 0x10140, 0x102FF)); + intervals.push_back(in_interval_expr(chr, 0x104B0, 0x107FF)); + intervals.push_back(in_interval_expr(chr, 0x10840, 0x1CFFF)); + intervals.push_back(in_interval_expr(chr, 0x1D200, 0x1D2FF)); + intervals.push_back(in_interval_expr(chr, 0x1D360, 0x1D3FF)); + intervals.push_back( + binary_relation_exprt(chr, ID_ge, from_integer(0x1D800, chr.type()))); + + return not_exprt(disjunction(intervals)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_defined_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isDefined:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_defined_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_defined, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_char_is_digit_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isDefined:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_defined_int( + conversion_input &target) +{ + return convert_is_defined_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_digit + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Determines if the specified character is a digit. + A character is a digit if its general category type, + provided by Character.getType(ch), is DECIMAL_DIGIT_NUMBER. + + TODO: for now we only support these ranges of digits: + '\u0030' through '\u0039', ISO-LATIN-1 digits ('0' through '9') + '\u0660' through '\u0669', Arabic-Indic digits + '\u06F0' through '\u06F9', Extended Arabic-Indic digits + '\u0966' through '\u096F', Devanagari digits + '\uFF10' through '\uFF19', Fullwidth digits + Many other character ranges contain digits as well. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_digit( + const exprt &chr, const typet &type) +{ + exprt latin_digit=in_interval_expr(chr, '0', '9'); + exprt arabic_indic_digit=in_interval_expr(chr, 0x660, 0x669); + exprt extended_digit=in_interval_expr(chr, 0x6F0, 0x6F9); + exprt devanagari_digit=in_interval_expr(chr, 0x966, 0x96F); + exprt fullwidth_digit=in_interval_expr(chr, 0xFF10, 0xFF19); + or_exprt digit( + or_exprt(latin_digit, or_exprt(arabic_indic_digit, extended_digit)), + or_exprt(devanagari_digit, fullwidth_digit)); + return digit; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_digit_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isDigit:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_digit_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_digit, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_digit_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.digit:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_digit_int( + conversion_input &target) +{ + return convert_is_digit_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_high_surrogate + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the given char value is a Unicode + high-surrogate code unit (also known as leading-surrogate + code unit). + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_high_surrogate( + const exprt &chr, const typet &type) +{ + return in_interval_expr(chr, 0xD800, 0xDBFF); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_high_surrogate + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isHighSurrogate:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_high_surrogate( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_high_surrogate, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_identifier_ignorable + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the character is one of ignorable in a Java + identifier, that is, it is in one of these ranges: + '\u0000' through '\u0008' + '\u000E' through '\u001B' + '\u007F' through '\u009F' + + TODO: For now, we ignore the FORMAT general category value + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_identifier_ignorable( + const exprt &chr, const typet &type) +{ + or_exprt ignorable( + in_interval_expr(chr, 0x0000, 0x0008), + or_exprt( + in_interval_expr(chr, 0x000E, 0x001B), + in_interval_expr(chr, 0x007F, 0x009F))); + return ignorable; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_identifier_ignorable_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method + Character.isIdentifierIgnorable:(C)Z + + TODO: For now, we ignore the FORMAT general category value + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_identifier_ignorable_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_identifier_ignorable, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_identifier_ignorable_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method + Character.isIdentifierIgnorable:(I)Z + + TODO: For now, we ignore the FORMAT general category value + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_identifier_ignorable_int( + conversion_input &target) +{ + return convert_is_identifier_ignorable_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_ideographic + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isIdeographic:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_ideographic( + conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==1); + const exprt &arg=function_call.arguments()[0]; + const exprt &result=function_call.lhs(); + exprt is_ideograph=in_interval_expr(arg, 0x4E00, 0x9FFF); + return code_assignt(result, is_ideograph); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_ISO_control_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isISOControl:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_ISO_control_char( + conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==1); + const exprt &arg=function_call.arguments()[0]; + const exprt &result=function_call.lhs(); + or_exprt iso( + in_interval_expr(arg, 0x00, 0x1F), in_interval_expr(arg, 0x7F, 0x9F)); + return code_assignt(result, iso); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_ISO_control_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isISOControl:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_ISO_control_int( + conversion_input &target) +{ + return convert_is_ISO_control_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_java_identifier_part_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isJavaIdentifierPart:(C)Z + + TODO: For now we do not allow currency symbol, connecting punctuation, + combining mark, non-spacing mark + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_java_identifier_part_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_unicode_identifier_part, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_java_identifier_part_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method isJavaIdentifierPart:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_java_identifier_part_int( + conversion_input &target) +{ + return convert_is_unicode_identifier_part_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_java_identifier_start_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method isJavaIdentifierStart:(C)Z + + TODO: For now we only allow letters and letter numbers. + The java specification for this function is not precise on the + other characters. + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_java_identifier_start_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_unicode_identifier_start, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_char_is_digit_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method isJavaIdentifierStart:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_java_identifier_start_int( + conversion_input &target) +{ + return convert_is_java_identifier_start_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_java_letter + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isJavaLetter:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_java_letter( + conversion_input &target) +{ + return convert_is_java_identifier_start_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_java_letter_or_digit + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method isJavaLetterOrDigit:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_java_letter_or_digit( + conversion_input &target) +{ + return convert_is_java_identifier_part_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_letter_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isLetter:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_letter_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_letter, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_letter_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isLetter:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_letter_int( + conversion_input &target) +{ + return convert_is_letter_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_letter_or_digit + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Determines if the specified character is a letter or digit. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_letter_or_digit( + const exprt &chr, const typet &type) +{ + return or_exprt(expr_of_is_letter(chr, type), expr_of_is_digit(chr, type)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_letter_or_digit_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isLetterOrDigit:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_letter_or_digit_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_digit, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_letter_or_digit_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isLetterOrDigit:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_letter_or_digit_int( + conversion_input &target) +{ + return convert_is_letter_or_digit_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_lower_case_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isLowerCase:(C)Z + + TODO: For now we only consider ASCII characters + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_lower_case_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_ascii_lower_case, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_lower_case_int() + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isLowerCase:(I)Z + + TODO: For now we only consider ASCII characters + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_lower_case_int( + conversion_input &target) +{ + return convert_is_lower_case_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_low_surrogate + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isLowSurrogate:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_low_surrogate( + conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==1); + const exprt &arg=function_call.arguments()[0]; + const exprt &result=function_call.lhs(); + exprt is_low_surrogate=in_interval_expr(arg, 0xDC00, 0xDFFF); + return code_assignt(result, is_low_surrogate); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_mirrored + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Determines whether the character is mirrored according to + the Unicode specification. + + TODO: For now only ASCII characters are considered + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_mirrored( + const exprt &chr, const typet &type) +{ + return in_list_expr(chr, {0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D}); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_mirrored_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isMirrored:(C)Z + + TODO: For now only ASCII characters are considered + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_mirrored_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_mirrored, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_mirrored_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isMirrored:(I)Z + + TODO: For now only ASCII characters are considered + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_mirrored_int( + conversion_input &target) +{ + return convert_is_mirrored_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_space + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isSpace:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_space(conversion_input &target) +{ + return convert_is_whitespace_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_space_char + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the specified character is white space + according to Unicode (SPACE_SEPARATOR, LINE_SEPARATOR, or + PARAGRAPH_SEPARATOR) + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_space_char( + const exprt &chr, const typet &type) +{ + std::list space_characters= + {0x20, 0x00A0, 0x1680, 0x202F, 0x205F, 0x3000, 0x2028, 0x2029}; + exprt condition0=in_list_expr(chr, space_characters); + exprt condition1=in_interval_expr(chr, 0x2000, 0x200A); + return or_exprt(condition0, condition1); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_space_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isSpaceChar:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_space_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_space_char, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_space_char_int() + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isSpaceChar:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_space_char_int( + conversion_input &target) +{ + return convert_is_space_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_supplementary_code_point + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines whether the specified character (Unicode code + point) is in the supplementary character range. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_supplementary_code_point( + const exprt &chr, const typet &type) +{ + return binary_relation_exprt(chr, ID_gt, from_integer(0xFFFF, chr.type())); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_supplementary_code_point + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method + Character.isSupplementaryCodePoint:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_supplementary_code_point( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_supplementary_code_point, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_surrogate + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the given char value is a Unicode surrogate + code unit. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_surrogate( + const exprt &chr, const typet &type) +{ + return in_interval_expr(chr, 0xD800, 0xDFFF); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_surrogate + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isSurrogate:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_surrogate( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_surrogate, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_surrogate_pair + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isSurrogatePair:(CC)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_surrogate_pair( + conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==2); + const exprt &arg0=function_call.arguments()[0]; + const exprt &arg1=function_call.arguments()[1]; + const exprt &result=function_call.lhs(); + exprt is_low_surrogate=in_interval_expr(arg1, 0xDC00, 0xDFFF); + exprt is_high_surrogate=in_interval_expr(arg0, 0xD800, 0xDBFF); + return code_assignt(result, and_exprt(is_high_surrogate, is_low_surrogate)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_title_case + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the specified character is a titlecase character. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_title_case( + const exprt &chr, const typet &type) +{ + std::listtitle_case_chars= + {0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x1FBC, 0x1FCC, 0x1FFC}; + exprt::operandst conditions; + conditions.push_back(in_list_expr(chr, title_case_chars)); + conditions.push_back(in_interval_expr(chr, 0x1F88, 0x1F8F)); + conditions.push_back(in_interval_expr(chr, 0x1F98, 0x1F9F)); + conditions.push_back(in_interval_expr(chr, 0x1FA8, 0x1FAF)); + return disjunction(conditions); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_title_case_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isTitleCase:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_title_case_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_title_case, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_title_case_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isTitleCase:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_title_case_int( + conversion_input &target) +{ + return convert_is_title_case_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_letter_number + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the specified character is in the LETTER_NUMBER + category of Unicode + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_letter_number( + const exprt &chr, const typet &type) +{ + // The following set of characters is the general category "Nl" in the + // Unicode specification. + exprt cond0=in_interval_expr(chr, 0x16EE, 0x16F0); + exprt cond1=in_interval_expr(chr, 0x2160, 0x2188); + exprt cond2=in_interval_expr(chr, 0x3021, 0x3029); + exprt cond3=in_interval_expr(chr, 0x3038, 0x303A); + exprt cond4=in_interval_expr(chr, 0xA6E6, 0xA6EF); + exprt cond5=in_interval_expr(chr, 0x10140, 0x10174); + exprt cond6=in_interval_expr(chr, 0x103D1, 0x103D5); + exprt cond7=in_interval_expr(chr, 0x12400, 0x1246E); + exprt cond8=in_list_expr(chr, {0x3007, 0x10341, 0x1034A}); + return or_exprt( + or_exprt(or_exprt(cond0, cond1), or_exprt(cond2, cond3)), + or_exprt(or_exprt(cond4, cond5), or_exprt(cond6, or_exprt(cond7, cond8)))); +} + + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_unicode_identifier_part + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the character may be part of a Unicode identifier. + + TODO: For now we do not allow connecting punctuation, combining mark, + non-spacing mark + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_unicode_identifier_part( + const exprt &chr, const typet &type) +{ + exprt::operandst conditions; + conditions.push_back(expr_of_is_unicode_identifier_start(chr, type)); + conditions.push_back(expr_of_is_digit(chr, type)); + conditions.push_back(expr_of_is_identifier_ignorable(chr, type)); + return disjunction(conditions); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_unicode_identifier_part_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method + Character.isUnicodeIdentifierPart:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_unicode_identifier_part_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_unicode_identifier_part, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_unicode_identifier_part_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method + Character.isUnicodeIdentifierPart:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_unicode_identifier_part_int( + conversion_input &target) +{ + return convert_is_unicode_identifier_part_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_unicode_identifier_start + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the specified character is permissible as the + first character in a Unicode identifier. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_unicode_identifier_start( + const exprt &chr, const typet &type) +{ + return or_exprt( + expr_of_is_letter(chr, type), expr_of_is_letter_number(chr, type)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_unicode_identifier_start_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method + Character.isUnicodeIdentifierStart:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_unicode_identifier_start_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_unicode_identifier_start, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_unicode_identifier_start_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method + Character.isUnicodeIdentifierStart:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_unicode_identifier_start_int( + conversion_input &target) +{ + return convert_is_unicode_identifier_start_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_upper_case_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isUpperCase:(C)Z + + TODO: For now we only consider ASCII characters + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_upper_case_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_ascii_upper_case, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_upper_case_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isUpperCase:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_upper_case_int( + conversion_input &target) +{ + return convert_is_upper_case_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_valid_code_point + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines whether the specified code point is a valid + Unicode code point value. + That is, in the range of integers from 0 to 0x10FFFF + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_valid_code_point( + const exprt &chr, const typet &type) +{ + return binary_relation_exprt(chr, ID_le, from_integer(0x10FFFF, chr.type())); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_valid_code_point + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isValidCodePoint:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_valid_code_point( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_valid_code_point, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_is_whitespace + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A Boolean expression + + Purpose: Determines if the specified character is white space according + to Java. It is the case when it one of the following: + * a Unicode space character (SPACE_SEPARATOR, LINE_SEPARATOR, or + PARAGRAPH_SEPARATOR) but is not also a non-breaking space + ('\u00A0', '\u2007', '\u202F'). + * it is one of these: U+0009 U+000A U+000B U+000C U+000D + U+001C U+001D U+001E U+001F + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_is_whitespace( + const exprt &chr, const typet &type) +{ + exprt::operandst conditions; + std::list space_characters= + {0x20, 0x1680, 0x205F, 0x3000, 0x2028, 0x2029}; + conditions.push_back(in_list_expr(chr, space_characters)); + conditions.push_back(in_interval_expr(chr, 0x2000, 0x2006)); + conditions.push_back(in_interval_expr(chr, 0x2008, 0x200A)); + conditions.push_back(in_interval_expr(chr, 0x09, 0x0D)); + conditions.push_back(in_interval_expr(chr, 0x1C, 0x1F)); + return disjunction(conditions); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_whitespace_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isWhitespace:(C)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_whitespace_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_is_whitespace, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_is_whitespace_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.isWhitespace:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_is_whitespace_int( + conversion_input &target) +{ + return convert_is_whitespace_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_low_surrogate + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A integer expression of the given type + + Purpose: Returns the trailing surrogate (a low surrogate code unit) + of the surrogate pair representing the specified + supplementary character (Unicode code point) in the UTF-16 + encoding. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_low_surrogate( + const exprt &chr, const typet &type) +{ + exprt uDC00=from_integer(0xDC00, type); + exprt u0400=from_integer(0x0400, type); + return plus_exprt(uDC00, mod_exprt(chr, u0400)); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_low_surrogate + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.lowSurrogate:(I)Z + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_low_surrogate( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_low_surrogate, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_reverse_bytes + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A character expression of the given type + + Purpose: Returns the value obtained by reversing the order of the + bytes in the specified char value. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_reverse_bytes( + const exprt &chr, const typet &type) +{ + shl_exprt first_byte(chr, from_integer(8, type)); + lshr_exprt second_byte(chr, from_integer(8, type)); + return plus_exprt(first_byte, second_byte); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_reverse_bytes + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.reverseBytes:(C)C + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_reverse_bytes( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_reverse_bytes, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_to_chars + + Inputs: + expr - An expression of type character + type - A type for the output + + Outputs: A character array expression of the given type + + Purpose: Converts the specified character (Unicode code point) to + its UTF-16 representation stored in a char array. + If the specified code point is a BMP (Basic Multilingual + Plane or Plane 0) value, the resulting char array has the + same value as codePoint. + If the specified code point is a supplementary code point, + the resulting char array has the corresponding surrogate pair. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_to_chars( + const exprt &chr, const typet &type) +{ + array_typet array_type=to_array_type(type); + const typet &char_type=array_type.subtype(); + array_exprt case1(array_type); + array_exprt case2(array_type); + exprt low_surrogate=expr_of_low_surrogate(chr, char_type); + case1.copy_to_operands(low_surrogate); + case2.move_to_operands(low_surrogate); + exprt high_surrogate=expr_of_high_surrogate(chr, char_type); + case2.move_to_operands(high_surrogate); + return if_exprt(expr_of_is_bmp_code_point(chr, type), case1, case2); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_chars + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toChars:(I)[C + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_chars(conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_to_chars, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_code_point + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toCodePoint:(CC)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_code_point( + conversion_input &target) +{ + const code_function_callt &function_call=target; + assert(function_call.arguments().size()==2); + const exprt &arg0=function_call.arguments()[0]; + const exprt &arg1=function_call.arguments()[1]; + const exprt &result=function_call.lhs(); + const typet &type=result.type(); + + // These operations implement the decoding of a unicode symbol encoded + // in UTF16 for the supplementary planes (above U+10000). + // The low ten bits of the first character give the bits 10 to 19 of + // code point and the low ten bits of the second give the bits 0 to 9, + // then 0x10000 is added to the result. For more explenations see: + // https://en.wikipedia.org/wiki/UTF-16 + + exprt u010000=from_integer(0x010000, type); + exprt mask10bit=from_integer(0x03FF, type); + shl_exprt m1(bitand_exprt(arg0, mask10bit), from_integer(10, type)); + bitand_exprt m2(arg1, mask10bit); + bitor_exprt pair_value(u010000, bitor_exprt(m1, m2)); + return code_assignt(result, pair_value); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_to_lower_case + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Converts the character argument to lowercase. + + TODO: For now we only consider ASCII characters but ultimately + we should use case mapping information from the + UnicodeData file + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_to_lower_case( + const exprt &chr, const typet &type) +{ + minus_exprt transformed( + plus_exprt(chr, from_integer('a', type)), from_integer('A', type)); + + if_exprt res(expr_of_is_ascii_upper_case(chr, type), transformed, chr); + return res; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_lower_case_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toLowerCase:(C)C + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_lower_case_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_to_lower_case, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_lower_case_int() + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toLowerCase:(I)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_lower_case_int( + conversion_input &target) +{ + return convert_to_lower_case_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_to_title_case + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Converts the character argument to titlecase. + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_to_title_case( + const exprt &chr, const typet &type) +{ + std::list increment_list={0x01C4, 0x01C7, 0x01CA, 0x01F1}; + std::list decrement_list={0x01C6, 0x01C9, 0x01CC, 0x01F3}; + exprt plus_8_interval1=in_interval_expr(chr, 0x1F80, 0x1F87); + exprt plus_8_interval2=in_interval_expr(chr, 0x1F90, 0x1F97); + exprt plus_8_interval3=in_interval_expr(chr, 0x1FA0, 0x1FA7); + std::list plus_9_list={0x1FB3, 0x1FC3, 0x1FF3}; + minus_exprt minus_1(chr, from_integer(1, type)); + plus_exprt plus_1(chr, from_integer(1, type)); + plus_exprt plus_8(chr, from_integer(8, type)); + plus_exprt plus_9(chr, from_integer(9, type)); + or_exprt plus_8_set( + plus_8_interval1, or_exprt(plus_8_interval2, plus_8_interval3)); + + if_exprt res( + in_list_expr(chr, increment_list), + plus_1, + if_exprt( + in_list_expr(chr, decrement_list), + minus_1, + if_exprt( + plus_8_set, + plus_8, + if_exprt( + in_list_expr(chr, plus_9_list), + plus_9, + chr)))); + + return res; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_title_case_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toTitleCase:(C)C + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_title_case_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_to_title_case, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_title_case_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toTitleCase:(I)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_title_case_int( + conversion_input &target) +{ + return convert_to_title_case_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::expr_of_to_upper_case + + Inputs: + chr - An expression of type character + type - A type for the output + + Outputs: An expression of the given type + + Purpose: Converts the character argument to uppercase. + + TODO: For now we only consider ASCII characters but ultimately + we should use case mapping information from the + UnicodeData file + +\*******************************************************************/ + +exprt character_refine_preprocesst::expr_of_to_upper_case( + const exprt &chr, const typet &type) +{ + minus_exprt transformed( + plus_exprt(chr, from_integer('A', type)), from_integer('a', type)); + + if_exprt res(expr_of_is_ascii_lower_case(chr, type), transformed, chr); + return res; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_upper_case_char + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toUpperCase:(C)C + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_upper_case_char( + conversion_input &target) +{ + return convert_char_function( + &character_refine_preprocesst::expr_of_to_upper_case, target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::convert_to_upper_case_int + + Inputs: + target - a position in a goto program + + Purpose: Converts function call to an assignment of an expression + corresponding to the java method Character.toUpperCase:(I)I + +\*******************************************************************/ + +codet character_refine_preprocesst::convert_to_upper_case_int( + conversion_input &target) +{ + return convert_to_upper_case_char(target); +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::replace_character_call + + Inputs: + code - the code of a function call + + Outputs: code where character function call get replaced by + an simple instruction + + Purpose: replace function calls to functions of the Character by + an affectation if possible, returns the same code otherwise. + For this method to have an effect initialize_conversion_table + must have been called before. + +\*******************************************************************/ + +codet character_refine_preprocesst::replace_character_call( + const code_function_callt &code) const +{ + if(code.function().id()==ID_symbol) + { + const irep_idt &function_id= + to_symbol_expr(code.function()).get_identifier(); + auto it=conversion_table.find(function_id); + if(it!=conversion_table.end()) + return (it->second)(code); + } + + return code; +} + +/*******************************************************************\ + +Function: character_refine_preprocesst::initialize_conversion_table + + Purpose: fill maps with correspondance from java method names to + conversion functions + +\*******************************************************************/ + +void character_refine_preprocesst::initialize_conversion_table() +{ + // All methods are listed here in alphabetic order + // The ones that are not supported by this module (though they may be + // supported by the string solver) have no entry in the conversion + // table and are marked in this way: + // Not supported "java::java.lang.Character.()" + + conversion_table["java::java.lang.Character.charCount:(I)I"]= + &character_refine_preprocesst::convert_char_count; + conversion_table["java::java.lang.Character.charValue:()C"]= + &character_refine_preprocesst::convert_char_value; + + // Not supported "java::java.lang.Character.codePointAt:([CI)I + // Not supported "java::java.lang.Character.codePointAt:([CII)I" + // Not supported "java::java.lang.Character.codePointAt:" + // "(Ljava.lang.CharSequence;I)I" + // Not supported "java::java.lang.Character.codePointBefore:([CI)I" + // Not supported "java::java.lang.Character.codePointBefore:([CII)I" + // Not supported "java::java.lang.Character.codePointBefore:" + // "(Ljava.lang.CharSequence;I)I" + // Not supported "java::java.lang.Character.codePointCount:([CII)I" + // Not supported "java::java.lang.Character.codePointCount:" + // "(Ljava.lang.CharSequence;I)I" + // Not supported "java::java.lang.Character.compareTo:" + // "(Ljava.lang.Character;)I" + + conversion_table["java::java.lang.Character.compare:(CC)I"]= + &character_refine_preprocesst::convert_compare; + conversion_table["java::java.lang.Character.digit:(CI)I"]= + &character_refine_preprocesst::convert_digit_char; + conversion_table["java::java.lang.Character.digit:(II)I"]= + &character_refine_preprocesst::convert_digit_int; + + // Not supported "java::java.lang.Character.equals:(Ljava.lang.Object;)Z" + + conversion_table["java::java.lang.Character.forDigit:(II)C"]= + &character_refine_preprocesst::convert_for_digit; + conversion_table["java::java.lang.Character.getDirectionality:(C)B"]= + &character_refine_preprocesst::convert_get_directionality_char; + conversion_table["java::java.lang.Character.getDirectionality:(I)B"]= + &character_refine_preprocesst::convert_get_directionality_int; + + // Not supported "java::java.lang.Character.getName:(I)Ljava.lang.String;" + + conversion_table["java::java.lang.Character.getNumericValue:(C)I"]= + &character_refine_preprocesst::convert_get_numeric_value_char; + conversion_table["java::java.lang.Character.getNumericValue:(I)I"]= + &character_refine_preprocesst::convert_get_numeric_value_int; + conversion_table["java::java.lang.Character.getType:(C)I"]= + &character_refine_preprocesst::convert_get_type_char; + conversion_table["java::java.lang.Character.getType:(I)Z"]= + &character_refine_preprocesst::convert_get_type_int; + conversion_table["java::java.lang.Character.hashCode:()I"]= + &character_refine_preprocesst::convert_hash_code; + conversion_table["java::java.lang.Character.highSurrogate:(C)Z"]= + &character_refine_preprocesst::convert_high_surrogate; + conversion_table["java::java.lang.Character.isAlphabetic:(I)Z"]= + &character_refine_preprocesst::convert_is_alphabetic; + conversion_table["java::java.lang.Character.isBmpCodePoint:(I)Z"]= + &character_refine_preprocesst::convert_is_bmp_code_point; + conversion_table["java::java.lang.Character.isDefined:(C)Z"]= + &character_refine_preprocesst::convert_is_defined_char; + conversion_table["java::java.lang.Character.isDefined:(I)Z"]= + &character_refine_preprocesst::convert_is_defined_int; + conversion_table["java::java.lang.Character.isDigit:(C)Z"]= + &character_refine_preprocesst::convert_is_digit_char; + conversion_table["java::java.lang.Character.isDigit:(I)Z"]= + &character_refine_preprocesst::convert_is_digit_int; + conversion_table["java::java.lang.Character.isHighSurrogate:(C)Z"]= + &character_refine_preprocesst::convert_is_high_surrogate; + conversion_table["java::java.lang.Character.isIdentifierIgnorable:(C)Z"]= + &character_refine_preprocesst::convert_is_identifier_ignorable_char; + conversion_table["java::java.lang.Character.isIdentifierIgnorable:(I)Z"]= + &character_refine_preprocesst::convert_is_identifier_ignorable_int; + conversion_table["java::java.lang.Character.isIdeographic:(C)Z"]= + &character_refine_preprocesst::convert_is_ideographic; + conversion_table["java::java.lang.Character.isISOControl:(C)Z"]= + &character_refine_preprocesst::convert_is_ISO_control_char; + conversion_table["java::java.lang.Character.isISOControl:(I)Z"]= + &character_refine_preprocesst::convert_is_ISO_control_int; + conversion_table["java::java.lang.Character.isJavaIdentifierPart:(C)Z"]= + &character_refine_preprocesst::convert_is_java_identifier_part_char; + conversion_table["java::java.lang.Character.isJavaIdentifierPart:(I)Z"]= + &character_refine_preprocesst::convert_is_java_identifier_part_int; + conversion_table["java::java.lang.Character.isJavaIdentifierStart:(C)Z"]= + &character_refine_preprocesst::convert_is_java_identifier_start_char; + conversion_table["java::java.lang.Character.isJavaIdentifierStart:(I)Z"]= + &character_refine_preprocesst::convert_is_java_identifier_start_int; + conversion_table["java::java.lang.Character.isJavaLetter:(C)Z"]= + &character_refine_preprocesst::convert_is_java_letter; + conversion_table["java::java.lang.Character.isJavaLetterOrDigit:(C)Z"]= + &character_refine_preprocesst::convert_is_java_letter_or_digit; + conversion_table["java::java.lang.Character.isLetter:(C)Z"]= + &character_refine_preprocesst::convert_is_letter_char; + conversion_table["java::java.lang.Character.isLetter:(I)Z"]= + &character_refine_preprocesst::convert_is_letter_int; + conversion_table["java::java.lang.Character.isLetterOrDigit:(C)Z"]= + &character_refine_preprocesst::convert_is_letter_or_digit_char; + conversion_table["java::java.lang.Character.isLetterOrDigit:(I)Z"]= + &character_refine_preprocesst::convert_is_letter_or_digit_int; + conversion_table["java::java.lang.Character.isLowerCase:(C)Z"]= + &character_refine_preprocesst::convert_is_lower_case_char; + conversion_table["java::java.lang.Character.isLowerCase:(I)Z"]= + &character_refine_preprocesst::convert_is_lower_case_int; + conversion_table["java::java.lang.Character.isLowSurrogate:(I)Z"]= + &character_refine_preprocesst::convert_is_low_surrogate; + conversion_table["java::java.lang.Character.isMirrored:(C)Z"]= + &character_refine_preprocesst::convert_is_mirrored_char; + conversion_table["java::java.lang.Character.isMirrored:(I)Z"]= + &character_refine_preprocesst::convert_is_mirrored_int; + conversion_table["java::java.lang.Character.isSpace:(C)Z"]= + &character_refine_preprocesst::convert_is_space; + conversion_table["java::java.lang.Character.isSpaceChar:(C)Z"]= + &character_refine_preprocesst::convert_is_space_char; + conversion_table["java::java.lang.Character.isSpaceChar:(I)Z"]= + &character_refine_preprocesst::convert_is_space_char_int; + conversion_table["java::java.lang.Character.isSupplementaryCodePoint:(I)Z"]= + &character_refine_preprocesst::convert_is_supplementary_code_point; + conversion_table["java::java.lang.Character.isSurrogate:(C)Z"]= + &character_refine_preprocesst::convert_is_surrogate; + conversion_table["java::java.lang.Character.isSurrogatePair:(CC)Z"]= + &character_refine_preprocesst::convert_is_surrogate_pair; + conversion_table["java::java.lang.Character.isTitleCase:(C)Z"]= + &character_refine_preprocesst::convert_is_title_case_char; + conversion_table["java::java.lang.Character.isTitleCase:(I)Z"]= + &character_refine_preprocesst::convert_is_title_case_int; + conversion_table["java::java.lang.Character.isUnicodeIdentifierPart:(C)Z"]= + &character_refine_preprocesst::convert_is_unicode_identifier_part_char; + conversion_table["java::java.lang.Character.isUnicodeIdentifierPart:(I)Z"]= + &character_refine_preprocesst::convert_is_unicode_identifier_part_int; + conversion_table["java::java.lang.Character.isUnicodeIdentifierStart:(C)Z"]= + &character_refine_preprocesst::convert_is_unicode_identifier_start_char; + conversion_table["java::java.lang.Character.isUnicodeIdentifierStart:(I)Z"]= + &character_refine_preprocesst::convert_is_unicode_identifier_start_int; + conversion_table["java::java.lang.Character.isUpperCase:(C)Z"]= + &character_refine_preprocesst::convert_is_upper_case_char; + conversion_table["java::java.lang.Character.isUpperCase:(I)Z"]= + &character_refine_preprocesst::convert_is_upper_case_int; + conversion_table["java::java.lang.Character.isValidCodePoint:(I)Z"]= + &character_refine_preprocesst::convert_is_valid_code_point; + conversion_table["java::java.lang.Character.isWhitespace:(C)Z"]= + &character_refine_preprocesst::convert_is_whitespace_char; + conversion_table["java::java.lang.Character.isWhitespace:(I)Z"]= + &character_refine_preprocesst::convert_is_whitespace_int; + conversion_table["java::java.lang.Character.lowSurrogate:(I)Z"]= + &character_refine_preprocesst::convert_is_low_surrogate; + + // Not supported "java::java.lang.Character.offsetByCodePoints:([CIIII)I" + // Not supported "java::java.lang.Character.offsetByCodePoints:" + // "(Ljava.lang.CharacterSequence;II)I" + + conversion_table["java::java.lang.Character.reverseBytes:(C)C"]= + &character_refine_preprocesst::convert_reverse_bytes; + conversion_table["java::java.lang.Character.toChars:(I)[C"]= + &character_refine_preprocesst::convert_to_chars; + + // Not supported "java::java.lang.Character.toChars:(I[CI])I" + + conversion_table["java::java.lang.Character.toCodePoint:(CC)I"]= + &character_refine_preprocesst::convert_to_code_point; + conversion_table["java::java.lang.Character.toLowerCase:(C)C"]= + &character_refine_preprocesst::convert_to_lower_case_char; + conversion_table["java::java.lang.Character.toLowerCase:(I)I"]= + &character_refine_preprocesst::convert_to_lower_case_int; + + // Not supported "java::java.lang.Character.toString:()Ljava.lang.String;" + // Not supported "java::java.lang.Character.toString:(C)Ljava.lang.String;" + + conversion_table["java::java.lang.Character.toTitleCase:(C)C"]= + &character_refine_preprocesst::convert_to_title_case_char; + conversion_table["java::java.lang.Character.toTitleCase:(I)I"]= + &character_refine_preprocesst::convert_to_title_case_int; + conversion_table["java::java.lang.Character.toUpperCase:(C)C"]= + &character_refine_preprocesst::convert_to_upper_case_char; + conversion_table["java::java.lang.Character.toUpperCase:(I)I"]= + &character_refine_preprocesst::convert_to_upper_case_int; + + // Not supported "java::java.lang.Character.valueOf:(C)Ljava.lang.Character;" +} diff --git a/src/java_bytecode/character_refine_preprocess.h b/src/java_bytecode/character_refine_preprocess.h new file mode 100644 index 00000000000..25fc7ddeb67 --- /dev/null +++ b/src/java_bytecode/character_refine_preprocess.h @@ -0,0 +1,151 @@ +/*******************************************************************\ + +Module: Preprocess a goto-programs so that calls to the java Character + library are replaced by simple expressions. + For now support is limited to character in the ASCII range, + some methods may have incorrect specifications outside of this range. + +Author: Romain Brenguier + +Date: March 2017 + +\*******************************************************************/ + +#ifndef CPROVER_JAVA_BYTECODE_CHARACTER_REFINE_PREPROCESS_H +#define CPROVER_JAVA_BYTECODE_CHARACTER_REFINE_PREPROCESS_H + +#include +#include + +class character_refine_preprocesst:public messaget +{ +public: + void initialize_conversion_table(); + codet replace_character_call(const code_function_callt &call) const; + +private: + typedef const code_function_callt &conversion_input; + typedef codet (*conversion_functiont)(conversion_input &target); + // A table tells us what method to call for each java method signature + std::unordered_map + conversion_table; + + // Conversion functions + static exprt expr_of_char_count(const exprt &chr, const typet &type); + static codet convert_char_count(conversion_input &target); + static exprt expr_of_char_value(const exprt &chr, const typet &type); + static codet convert_char_value(conversion_input &target); + static codet convert_compare(conversion_input &target); + static codet convert_digit_char(conversion_input &target); + static codet convert_digit_int(conversion_input &target); + static codet convert_for_digit(conversion_input &target); + static codet convert_get_directionality_char(conversion_input &target); + static codet convert_get_directionality_int(conversion_input &target); + static codet convert_get_numeric_value_char(conversion_input &target); + static codet convert_get_numeric_value_int(conversion_input &target); + static codet convert_get_type_char(conversion_input &target); + static codet convert_get_type_int(conversion_input &target); + static codet convert_hash_code(conversion_input &target); + static exprt expr_of_high_surrogate(const exprt &chr, const typet &type); + static codet convert_high_surrogate(conversion_input &target); + static exprt expr_of_is_alphabetic(const exprt &chr, const typet &type); + static codet convert_is_alphabetic(conversion_input &target); + static exprt expr_of_is_bmp_code_point(const exprt &chr, const typet &type); + static codet convert_is_bmp_code_point(conversion_input &target); + static exprt expr_of_is_defined(const exprt &chr, const typet &type); + static codet convert_is_defined_char(conversion_input &target); + static codet convert_is_defined_int(conversion_input &target); + static exprt expr_of_is_digit(const exprt &chr, const typet &type); + static codet convert_is_digit_char(conversion_input &target); + static codet convert_is_digit_int(conversion_input &target); + static exprt expr_of_is_high_surrogate(const exprt &chr, const typet &type); + static codet convert_is_high_surrogate(conversion_input &target); + static exprt expr_of_is_identifier_ignorable( + const exprt &chr, const typet &type); + static codet convert_is_identifier_ignorable_char(conversion_input &target); + static codet convert_is_identifier_ignorable_int(conversion_input &target); + static codet convert_is_ideographic(conversion_input &target); + static codet convert_is_ISO_control_char(conversion_input &target); + static codet convert_is_ISO_control_int(conversion_input &target); + static codet convert_is_java_identifier_part_char(conversion_input &target); + static codet convert_is_java_identifier_part_int(conversion_input &target); + static codet convert_is_java_identifier_start_char(conversion_input &target); + static codet convert_is_java_identifier_start_int(conversion_input &target); + static codet convert_is_java_letter(conversion_input &target); + static codet convert_is_java_letter_or_digit(conversion_input &target); + static exprt expr_of_is_letter(const exprt &chr, const typet &type); + static codet convert_is_letter_char(conversion_input &target); + static codet convert_is_letter_int(conversion_input &target); + static exprt expr_of_is_letter_or_digit(const exprt &chr, const typet &type); + static codet convert_is_letter_or_digit_char(conversion_input &target); + static codet convert_is_letter_or_digit_int(conversion_input &target); + static exprt expr_of_is_ascii_lower_case(const exprt &chr, const typet &type); + static codet convert_is_lower_case_char(conversion_input &target); + static codet convert_is_lower_case_int(conversion_input &target); + static codet convert_is_low_surrogate(conversion_input &target); + static exprt expr_of_is_mirrored(const exprt &chr, const typet &type); + static codet convert_is_mirrored_char(conversion_input &target); + static codet convert_is_mirrored_int(conversion_input &target); + static codet convert_is_space(conversion_input &target); + static exprt expr_of_is_space_char(const exprt &chr, const typet &type); + static codet convert_is_space_char(conversion_input &target); + static codet convert_is_space_char_int(conversion_input &target); + static exprt expr_of_is_supplementary_code_point( + const exprt &chr, const typet &type); + static codet convert_is_supplementary_code_point(conversion_input &target); + static exprt expr_of_is_surrogate(const exprt &chr, const typet &type); + static codet convert_is_surrogate(conversion_input &target); + static codet convert_is_surrogate_pair(conversion_input &target); + static exprt expr_of_is_title_case(const exprt &chr, const typet &type); + static codet convert_is_title_case_char(conversion_input &target); + static codet convert_is_title_case_int(conversion_input &target); + static exprt expr_of_is_letter_number(const exprt &chr, const typet &type); + static exprt expr_of_is_unicode_identifier_part( + const exprt &chr, const typet &type); + static codet convert_is_unicode_identifier_part_char( + conversion_input &target); + static codet convert_is_unicode_identifier_part_int(conversion_input &target); + static exprt expr_of_is_unicode_identifier_start( + const exprt &chr, const typet &type); + static codet convert_is_unicode_identifier_start_char( + conversion_input &target); + static codet convert_is_unicode_identifier_start_int( + conversion_input &target); + static exprt expr_of_is_ascii_upper_case(const exprt &chr, const typet &type); + static codet convert_is_upper_case_char(conversion_input &target); + static codet convert_is_upper_case_int(conversion_input &target); + static exprt expr_of_is_valid_code_point(const exprt &chr, const typet &type); + static codet convert_is_valid_code_point(conversion_input &target); + static exprt expr_of_is_whitespace(const exprt &chr, const typet &type); + static codet convert_is_whitespace_char(conversion_input &target); + static codet convert_is_whitespace_int(conversion_input &target); + static exprt expr_of_low_surrogate(const exprt &chr, const typet &type); + static codet convert_low_surrogate(conversion_input &target); + static exprt expr_of_reverse_bytes(const exprt &chr, const typet &type); + static codet convert_reverse_bytes(conversion_input &target); + static exprt expr_of_to_chars(const exprt &chr, const typet &type); + static codet convert_to_chars(conversion_input &target); + static codet convert_to_code_point(conversion_input &target); + static exprt expr_of_to_lower_case(const exprt &chr, const typet &type); + static codet convert_to_lower_case_char(conversion_input &target); + static codet convert_to_lower_case_int(conversion_input &target); + static exprt expr_of_to_title_case(const exprt &chr, const typet &type); + static codet convert_to_title_case_char(conversion_input &target); + static codet convert_to_title_case_int(conversion_input &target); + static exprt expr_of_to_upper_case(const exprt &chr, const typet &type); + static codet convert_to_upper_case_char(conversion_input &target); + static codet convert_to_upper_case_int(conversion_input &target); + + // Helper functions + static codet convert_char_function( + exprt (*expr_function)(const exprt &chr, const typet &type), + conversion_input &target); + static exprt in_interval_expr( + const exprt &chr, + const mp_integer &lower_bound, + const mp_integer &upper_bound); + static exprt in_list_expr( + const exprt &chr, const std::list &list); +}; + +#endif // CPROVER_JAVA_BYTECODE_CHARACTER_REFINE_PREPROCESS_H diff --git a/src/java_bytecode/java_bytecode_convert_class.cpp b/src/java_bytecode/java_bytecode_convert_class.cpp index 53664271993..bb297ef3fe7 100644 --- a/src/java_bytecode/java_bytecode_convert_class.cpp +++ b/src/java_bytecode/java_bytecode_convert_class.cpp @@ -30,13 +30,15 @@ class java_bytecode_convert_classt:public messaget size_t _max_array_length, lazy_methodst& _lazy_methods, lazy_methods_modet _lazy_methods_mode, - bool _string_refinement_enabled): + bool _string_refinement_enabled, + const character_refine_preprocesst &_character_preprocess): messaget(_message_handler), symbol_table(_symbol_table), max_array_length(_max_array_length), lazy_methods(_lazy_methods), lazy_methods_mode(_lazy_methods_mode), - string_refinement_enabled(_string_refinement_enabled) + string_refinement_enabled(_string_refinement_enabled), + character_preprocess(_character_preprocess) { } @@ -62,6 +64,7 @@ class java_bytecode_convert_classt:public messaget lazy_methodst &lazy_methods; lazy_methods_modet lazy_methods_mode; bool string_refinement_enabled; + character_refine_preprocesst character_preprocess; // conversion void convert(const classt &c); @@ -166,7 +169,8 @@ void java_bytecode_convert_classt::convert(const classt &c) method, symbol_table, get_message_handler(), - max_array_length); + max_array_length, + character_preprocess); } else { @@ -365,7 +369,8 @@ bool java_bytecode_convert_class( size_t max_array_length, lazy_methodst &lazy_methods, lazy_methods_modet lazy_methods_mode, - bool string_refinement_enabled) + bool string_refinement_enabled, + const character_refine_preprocesst &character_preprocess) { java_bytecode_convert_classt java_bytecode_convert_class( symbol_table, @@ -373,7 +378,8 @@ bool java_bytecode_convert_class( max_array_length, lazy_methods, lazy_methods_mode, - string_refinement_enabled); + string_refinement_enabled, + character_preprocess); try { diff --git a/src/java_bytecode/java_bytecode_convert_class.h b/src/java_bytecode/java_bytecode_convert_class.h index 9ce5cc8c7cc..e8bc934c949 100644 --- a/src/java_bytecode/java_bytecode_convert_class.h +++ b/src/java_bytecode/java_bytecode_convert_class.h @@ -14,6 +14,7 @@ Author: Daniel Kroening, kroening@kroening.com #include "java_bytecode_parse_tree.h" #include "java_bytecode_language.h" +#include "character_refine_preprocess.h" bool java_bytecode_convert_class( const java_bytecode_parse_treet &parse_tree, @@ -22,6 +23,7 @@ bool java_bytecode_convert_class( size_t max_array_length, lazy_methodst &, lazy_methods_modet, - bool string_refinement_enabled); + bool string_refinement_enabled, + const character_refine_preprocesst &character_preprocess); #endif // CPROVER_JAVA_BYTECODE_JAVA_BYTECODE_CONVERT_CLASS_H diff --git a/src/java_bytecode/java_bytecode_convert_method.cpp b/src/java_bytecode/java_bytecode_convert_method.cpp index 43a2ded0af1..1551c9701a6 100644 --- a/src/java_bytecode/java_bytecode_convert_method.cpp +++ b/src/java_bytecode/java_bytecode_convert_method.cpp @@ -1282,7 +1282,10 @@ codet java_bytecode_convert_methodt::convert_instructions( } call.function().add_source_location()=loc; - c=call; + + // Replacing call if it is a function of the Character library, + // returning the same call otherwise + c=character_preprocess.replace_character_call(call); } else if(statement=="return") { @@ -2433,13 +2436,15 @@ void java_bytecode_convert_method( symbol_tablet &symbol_table, message_handlert &message_handler, size_t max_array_length, - safe_pointer lazy_methods) + safe_pointer lazy_methods, + const character_refine_preprocesst &character_refine) { java_bytecode_convert_methodt java_bytecode_convert_method( symbol_table, message_handler, max_array_length, - lazy_methods); + lazy_methods, + character_refine); java_bytecode_convert_method(class_symbol, method); } diff --git a/src/java_bytecode/java_bytecode_convert_method.h b/src/java_bytecode/java_bytecode_convert_method.h index db6fcc729e1..28e08b3ecbd 100644 --- a/src/java_bytecode/java_bytecode_convert_method.h +++ b/src/java_bytecode/java_bytecode_convert_method.h @@ -12,6 +12,7 @@ Author: Daniel Kroening, kroening@kroening.com #include #include #include +#include "character_refine_preprocess.h" #include "java_bytecode_parse_tree.h" #include "ci_lazy_methods.h" @@ -24,14 +25,16 @@ void java_bytecode_convert_method( symbol_tablet &symbol_table, message_handlert &message_handler, size_t max_array_length, - safe_pointer lazy_methods); + safe_pointer lazy_methods, + const character_refine_preprocesst &character_refine); inline void java_bytecode_convert_method( const symbolt &class_symbol, const java_bytecode_parse_treet::methodt &method, symbol_tablet &symbol_table, message_handlert &message_handler, - size_t max_array_length) + size_t max_array_length, + const character_refine_preprocesst &character_preprocess) { java_bytecode_convert_method( class_symbol, @@ -39,7 +42,8 @@ inline void java_bytecode_convert_method( symbol_table, message_handler, max_array_length, - safe_pointer::create_null()); + safe_pointer::create_null(), + character_preprocess); } void java_bytecode_convert_method_lazy( diff --git a/src/java_bytecode/java_bytecode_convert_method_class.h b/src/java_bytecode/java_bytecode_convert_method_class.h index f83ece86b12..31a29a8ffe3 100644 --- a/src/java_bytecode/java_bytecode_convert_method_class.h +++ b/src/java_bytecode/java_bytecode_convert_method_class.h @@ -32,11 +32,13 @@ class java_bytecode_convert_methodt:public messaget symbol_tablet &_symbol_table, message_handlert &_message_handler, size_t _max_array_length, - safe_pointer _lazy_methods): + safe_pointer _lazy_methods, + const character_refine_preprocesst &_character_preprocess): messaget(_message_handler), symbol_table(_symbol_table), max_array_length(_max_array_length), - lazy_methods(_lazy_methods) + lazy_methods(_lazy_methods), + character_preprocess(_character_preprocess) { } @@ -59,6 +61,7 @@ class java_bytecode_convert_methodt:public messaget irep_idt method_id; irep_idt current_method; typet method_return_type; + character_refine_preprocesst character_preprocess; public: struct holet diff --git a/src/java_bytecode/java_bytecode_language.cpp b/src/java_bytecode/java_bytecode_language.cpp index 96e0f78fd57..bda9bb70734 100644 --- a/src/java_bytecode/java_bytecode_language.cpp +++ b/src/java_bytecode/java_bytecode_language.cpp @@ -511,6 +511,9 @@ bool java_bytecode_languaget::typecheck( symbol_tablet &symbol_table, const std::string &module) { + if(string_refinement_enabled) + character_preprocess.initialize_conversion_table(); + // first convert all for(java_class_loadert::class_mapt::const_iterator c_it=java_class_loader.class_map.begin(); @@ -529,7 +532,8 @@ bool java_bytecode_languaget::typecheck( max_user_array_length, lazy_methods, lazy_methods_mode, - string_refinement_enabled)) + string_refinement_enabled, + character_preprocess)) return true; } @@ -664,7 +668,8 @@ bool java_bytecode_languaget::do_ci_lazy_method_conversion( symbol_table, get_message_handler(), max_user_array_length, - safe_pointer::create_non_null(&lazy_methods)); + safe_pointer::create_non_null(&lazy_methods), + character_preprocess); gather_virtual_callsites( symbol_table.lookup(mname).value, virtual_callsites); @@ -774,7 +779,8 @@ void java_bytecode_languaget::convert_lazy_method( *lazy_method_entry.second, symtab, get_message_handler(), - max_user_array_length); + max_user_array_length, + character_preprocess); } /*******************************************************************\ diff --git a/src/java_bytecode/java_bytecode_language.h b/src/java_bytecode/java_bytecode_language.h index 3c889322daf..afac2587546 100644 --- a/src/java_bytecode/java_bytecode_language.h +++ b/src/java_bytecode/java_bytecode_language.h @@ -13,6 +13,7 @@ Author: Daniel Kroening, kroening@kroening.com #include #include "java_class_loader.h" +#include "character_refine_preprocess.h" #define MAX_NONDET_ARRAY_LENGTH_DEFAULT 5 @@ -99,6 +100,7 @@ class java_bytecode_languaget:public languaget lazy_methodst lazy_methods; lazy_methods_modet lazy_methods_mode; bool string_refinement_enabled; + character_refine_preprocesst character_preprocess; std::string java_cp_include_files; };