From d4ef31a323710fb03c5cb460cb072e831e546fac Mon Sep 17 00:00:00 2001 From: Jahongir Qurbonov Date: Fri, 4 Jul 2025 13:39:20 +0500 Subject: [PATCH 1/9] Add str.lower() and str.upper() primitives --- mypyc/lib-rt/CPy.h | 2 + mypyc/lib-rt/str_ops.c | 119 +++++++++++++++++++++++++++++++ mypyc/primitives/str_ops.py | 18 +++++ mypyc/test-data/fixtures/ir.py | 3 +- mypyc/test-data/irbuild-str.test | 21 ++++++ mypyc/test-data/run-strings.test | 18 +++++ 6 files changed, 180 insertions(+), 1 deletion(-) diff --git a/mypyc/lib-rt/CPy.h b/mypyc/lib-rt/CPy.h index bdf3e0130a4c..3ec17999d512 100644 --- a/mypyc/lib-rt/CPy.h +++ b/mypyc/lib-rt/CPy.h @@ -756,6 +756,8 @@ PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors); Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start); Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end); CPyTagged CPyStr_Ord(PyObject *obj); +PyObject *CPyStr_Lower(PyObject *self); +PyObject *CPyStr_Upper(PyObject *self); // Bytes operations diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c index 210172c57497..3294589d82e0 100644 --- a/mypyc/lib-rt/str_ops.c +++ b/mypyc/lib-rt/str_ops.c @@ -546,3 +546,122 @@ CPyTagged CPyStr_Ord(PyObject *obj) { PyExc_TypeError, "ord() expected a character, but a string of length %zd found", s); return CPY_INT_TAG; } + +// Fast ASCII lower/upper tables +static const unsigned char ascii_lower_table[128] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95, + 96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127 
+}; + +static const unsigned char ascii_upper_table[128] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127 +}; + +// Helper for lower/upper: get the lower/upper code point for a character +static inline Py_UCS4 tolower_ucs4(Py_UCS4 ch) { + if (ch < 128) { + return ascii_lower_table[ch]; + } +#ifdef Py_UNICODE_TOLOWER + return Py_UNICODE_TOLOWER(ch); +#else + // fallback: no-op for non-ASCII if macro is unavailable + return ch; +#endif +} + +static inline Py_UCS4 toupper_ucs4(Py_UCS4 ch) { + if (ch < 128) { + return ascii_upper_table[ch]; + } +#ifdef Py_UNICODE_TOUPPER + return Py_UNICODE_TOUPPER(ch); +#else + // fallback: no-op for non-ASCII if macro is unavailable + return ch; +#endif +} + +// Implementation of s.lower() +PyObject *CPyStr_Lower(PyObject *self) { + if (PyUnicode_READY(self) == -1) + return NULL; + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + + // Fast path: check if already all lower + int unchanged = 1; + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (tolower_ucs4(ch) != ch) { + unchanged = 0; + break; + } + } + if (unchanged) { + return Py_NewRef(self); + } + + Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(self); + PyObject *res = PyUnicode_New(len, maxchar); + if (!res) + return NULL; + int res_kind = PyUnicode_KIND(res); + void *res_data = PyUnicode_DATA(res); + + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + Py_UCS4 lower = 
tolower_ucs4(ch); + PyUnicode_WRITE(res_kind, res_data, i, lower); + } + return res; +} + +// Implementation of s.upper() +PyObject *CPyStr_Upper(PyObject *self) { + if (PyUnicode_READY(self) == -1) + return NULL; + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + + int unchanged = 1; + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (toupper_ucs4(ch) != ch) { + unchanged = 0; + break; + } + } + if (unchanged) { + return Py_NewRef(self); + } + + Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(self); + PyObject *res = PyUnicode_New(len, maxchar); + if (!res) + return NULL; + int res_kind = PyUnicode_KIND(res); + void *res_data = PyUnicode_DATA(res); + + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + Py_UCS4 upper = toupper_ucs4(ch); + PyUnicode_WRITE(res_kind, res_data, i, upper); + } + return res; +} diff --git a/mypyc/primitives/str_ops.py b/mypyc/primitives/str_ops.py index 9d46da9c3514..b55ed5be3e53 100644 --- a/mypyc/primitives/str_ops.py +++ b/mypyc/primitives/str_ops.py @@ -428,3 +428,21 @@ c_function_name="CPyStr_Ord", error_kind=ERR_MAGIC, ) + +# str.lower() +method_op( + name="lower", + arg_types=[str_rprimitive], + return_type=str_rprimitive, + c_function_name="CPyStr_Lower", + error_kind=ERR_MAGIC, +) + +# str.upper() +method_op( + name="upper", + arg_types=[str_rprimitive], + return_type=str_rprimitive, + c_function_name="CPyStr_Upper", + error_kind=ERR_MAGIC, +) diff --git a/mypyc/test-data/fixtures/ir.py b/mypyc/test-data/fixtures/ir.py index 532cbbc06177..5b90ca00a51c 100644 --- a/mypyc/test-data/fixtures/ir.py +++ b/mypyc/test-data/fixtures/ir.py @@ -112,7 +112,6 @@ def lstrip(self, item: Optional[str] = None) -> str: pass def rstrip(self, item: Optional[str] = None) -> str: pass def join(self, x: Iterable[str]) -> str: pass def format(self, *args: Any, **kwargs: Any) -> str: ... 
- def upper(self) -> str: ... def startswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ... def endswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ... def replace(self, old: str, new: str, maxcount: int=...) -> str: ... @@ -122,6 +121,8 @@ def rpartition(self, sep: str, /) -> Tuple[str, str, str]: ... def removeprefix(self, prefix: str, /) -> str: ... def removesuffix(self, suffix: str, /) -> str: ... def islower(self) -> bool: ... + def lower(self) -> str: ... + def upper(self) -> str: ... class float: def __init__(self, x: object) -> None: pass diff --git a/mypyc/test-data/irbuild-str.test b/mypyc/test-data/irbuild-str.test index 2bf77a6cb556..bfcced6efffe 100644 --- a/mypyc/test-data/irbuild-str.test +++ b/mypyc/test-data/irbuild-str.test @@ -562,3 +562,24 @@ L0: r3 = box(native_int, r1) r4 = unbox(int, r3) return r4 + +[case testLower] +def do_lower(s: str) -> str: + return s.lower() +[out] +def do_lower(s): + s, r0 :: str +L0: + r0 = CPyStr_Lower(s) + return r0 + +[case testUpper] +def do_upper(s: str) -> str: + return s.upper() +[out] +def do_upper(s): + s, r0 :: str +L0: + r0 = CPyStr_Upper(s) + return r0 + diff --git a/mypyc/test-data/run-strings.test b/mypyc/test-data/run-strings.test index 074e56f9068a..3ce8fabd5687 100644 --- a/mypyc/test-data/run-strings.test +++ b/mypyc/test-data/run-strings.test @@ -906,3 +906,21 @@ def test_count_multi_start_end_emoji() -> None: assert string.count("😴😴😴", 0, 12) == 1, string.count("😴😴😴", 0, 12) assert string.count("🚀🚀🚀", 0, 12) == 2, string.count("🚀🚀🚀", 0, 12) assert string.count("ñññ", 0, 12) == 1, string.count("ñññ", 0, 12) + +[case testLower] +def test_str_lower() -> None: + assert "".lower() == "" + assert "ABC".lower() == "abc" + assert "abc".lower() == "abc" + assert "AbC123".lower() == "abc123" + assert "áÉÍ".lower() == "áéí" + assert "😴🚀".lower() == "😴🚀" + +[case testUpper] +def 
test_str_upper() -> None: + assert "".upper() == "" + assert "abc".upper() == "ABC" + assert "ABC".upper() == "ABC" + assert "AbC123".upper() == "ABC123" + assert "áéí".upper() == "ÁÉÍ" + assert "😴🚀".upper() == "😴🚀" From 124dceb91ebc407195821feda25d08b603fce555 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Jul 2025 08:45:01 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mypyc/test-data/irbuild-str.test | 1 - 1 file changed, 1 deletion(-) diff --git a/mypyc/test-data/irbuild-str.test b/mypyc/test-data/irbuild-str.test index bfcced6efffe..1bc4fa25fb37 100644 --- a/mypyc/test-data/irbuild-str.test +++ b/mypyc/test-data/irbuild-str.test @@ -582,4 +582,3 @@ def do_upper(s): L0: r0 = CPyStr_Upper(s) return r0 - From 8065f9cfeaded459094a2a42b284d549ffdc9c2f Mon Sep 17 00:00:00 2001 From: Jahongir Qurbonov Date: Sun, 6 Jul 2025 16:42:15 +0500 Subject: [PATCH 3/9] Refactor tolower_ucs4 and toupper_ucs4 functions by removing fallback logic for non-ASCII characters --- mypyc/lib-rt/str_ops.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c index 3294589d82e0..2bddb9c7447a 100644 --- a/mypyc/lib-rt/str_ops.c +++ b/mypyc/lib-rt/str_ops.c @@ -575,24 +575,14 @@ static inline Py_UCS4 tolower_ucs4(Py_UCS4 ch) { if (ch < 128) { return ascii_lower_table[ch]; } -#ifdef Py_UNICODE_TOLOWER return Py_UNICODE_TOLOWER(ch); -#else - // fallback: no-op for non-ASCII if macro is unavailable - return ch; -#endif } static inline Py_UCS4 toupper_ucs4(Py_UCS4 ch) { if (ch < 128) { return ascii_upper_table[ch]; } -#ifdef Py_UNICODE_TOUPPER return Py_UNICODE_TOUPPER(ch); -#else - // fallback: no-op for non-ASCII if macro is unavailable - return ch; -#endif } // Implementation of s.lower() From 1d40499c7a94e3556c791896644c56c374e010cd Mon Sep 17 00:00:00 2001 From: 
Jahongir Qurbonov Date: Sun, 6 Jul 2025 17:05:30 +0500 Subject: [PATCH 4/9] Optimize CPyStr_Lower and CPyStr_Upper for ASCII strings by removing fallback logic and using direct table lookups --- mypyc/lib-rt/str_ops.c | 67 +++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c index 2bddb9c7447a..cd26c25d5c05 100644 --- a/mypyc/lib-rt/str_ops.c +++ b/mypyc/lib-rt/str_ops.c @@ -570,20 +570,6 @@ static const unsigned char ascii_upper_table[128] = { 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127 }; -// Helper for lower/upper: get the lower/upper code point for a character -static inline Py_UCS4 tolower_ucs4(Py_UCS4 ch) { - if (ch < 128) { - return ascii_lower_table[ch]; - } - return Py_UNICODE_TOLOWER(ch); -} - -static inline Py_UCS4 toupper_ucs4(Py_UCS4 ch) { - if (ch < 128) { - return ascii_upper_table[ch]; - } - return Py_UNICODE_TOUPPER(ch); -} // Implementation of s.lower() PyObject *CPyStr_Lower(PyObject *self) { @@ -593,29 +579,26 @@ PyObject *CPyStr_Lower(PyObject *self) { int kind = PyUnicode_KIND(self); void *data = PyUnicode_DATA(self); - // Fast path: check if already all lower - int unchanged = 1; - for (Py_ssize_t i = 0; i < len; i++) { - Py_UCS4 ch = PyUnicode_READ(kind, data, i); - if (tolower_ucs4(ch) != ch) { - unchanged = 0; - break; - } - } - if (unchanged) { - return Py_NewRef(self); - } - Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(self); PyObject *res = PyUnicode_New(len, maxchar); - if (!res) + if (res == NULL) return NULL; int res_kind = PyUnicode_KIND(res); void *res_data = PyUnicode_DATA(res); + // Fast path for ASCII strings + if (PyUnicode_IS_ASCII(self)) { + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS1 ch = ((Py_UCS1 *)data)[i]; + Py_UCS1 lower = ascii_lower_table[ch]; + ((Py_UCS1 *)res_data)[i] = lower; + } + return res; + } + for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 
lower = tolower_ucs4(ch); + Py_UCS4 lower = Py_UNICODE_TOLOWER(ch); PyUnicode_WRITE(res_kind, res_data, i, lower); } return res; @@ -629,28 +612,26 @@ PyObject *CPyStr_Upper(PyObject *self) { int kind = PyUnicode_KIND(self); void *data = PyUnicode_DATA(self); - int unchanged = 1; - for (Py_ssize_t i = 0; i < len; i++) { - Py_UCS4 ch = PyUnicode_READ(kind, data, i); - if (toupper_ucs4(ch) != ch) { - unchanged = 0; - break; - } - } - if (unchanged) { - return Py_NewRef(self); - } - Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(self); PyObject *res = PyUnicode_New(len, maxchar); - if (!res) + if (res == NULL) return NULL; int res_kind = PyUnicode_KIND(res); void *res_data = PyUnicode_DATA(res); + // Fast path for ASCII strings + if (PyUnicode_IS_ASCII(self)) { + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS1 ch = ((Py_UCS1 *)data)[i]; + Py_UCS1 upper = ascii_upper_table[ch]; + ((Py_UCS1 *)res_data)[i] = upper; + } + return res; + } + for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 upper = toupper_ucs4(ch); + Py_UCS4 upper = Py_UNICODE_TOUPPER(ch); PyUnicode_WRITE(res_kind, res_data, i, upper); } return res; From 2750549c6aaf2d5b497b538e63f5059be4962bed Mon Sep 17 00:00:00 2001 From: Jahongir Qurbonov Date: Sun, 6 Jul 2025 17:56:33 +0500 Subject: [PATCH 5/9] Optimize CPyStr_Lower and CPyStr_Upper for ASCII strings by removing static lookup tables and using direct character conversion --- mypyc/lib-rt/str_ops.c | 82 ++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 48 deletions(-) diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c index cd26c25d5c05..644046d257b7 100644 --- a/mypyc/lib-rt/str_ops.c +++ b/mypyc/lib-rt/str_ops.c @@ -547,38 +547,28 @@ CPyTagged CPyStr_Ord(PyObject *obj) { return CPY_INT_TAG; } -// Fast ASCII lower/upper tables -static const unsigned char ascii_lower_table[128] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95, - 96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127 -}; - -static const unsigned char ascii_upper_table[128] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127 -}; - - -// Implementation of s.lower() PyObject *CPyStr_Lower(PyObject *self) { if (PyUnicode_READY(self) == -1) return NULL; + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + + // Fast path: ASCII only + if (PyUnicode_IS_ASCII(self)) { + PyObject *res = PyUnicode_New(len, 127); + if (res == NULL) + return NULL; + const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); + Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res); + for (Py_ssize_t i = 0; i < len; i++) { + res_data[i] = Py_TOLOWER((unsigned char) data[i]); + } + return res; + } + + // General Unicode path int kind = PyUnicode_KIND(self); void *data = PyUnicode_DATA(self); - Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(self); PyObject *res = PyUnicode_New(len, maxchar); if (res == NULL) @@ -586,16 +576,7 @@ PyObject *CPyStr_Lower(PyObject *self) { int res_kind = PyUnicode_KIND(res); void *res_data = PyUnicode_DATA(res); - // Fast path for ASCII strings - if (PyUnicode_IS_ASCII(self)) { - for (Py_ssize_t i = 0; i < len; i++) { - Py_UCS1 ch = 
((Py_UCS1 *)data)[i]; - Py_UCS1 lower = ascii_lower_table[ch]; - ((Py_UCS1 *)res_data)[i] = lower; - } - return res; - } - + // Unified loop for all Unicode kinds for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); Py_UCS4 lower = Py_UNICODE_TOLOWER(ch); @@ -604,14 +585,28 @@ PyObject *CPyStr_Lower(PyObject *self) { return res; } -// Implementation of s.upper() PyObject *CPyStr_Upper(PyObject *self) { if (PyUnicode_READY(self) == -1) return NULL; + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + + // Fast path: ASCII only + if (PyUnicode_IS_ASCII(self)) { + PyObject *res = PyUnicode_New(len, 127); + if (res == NULL) + return NULL; + const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); + Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res); + for (Py_ssize_t i = 0; i < len; i++) { + res_data[i] = Py_TOUPPER((unsigned char) data[i]); + } + return res; + } + + // General Unicode path int kind = PyUnicode_KIND(self); void *data = PyUnicode_DATA(self); - Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(self); PyObject *res = PyUnicode_New(len, maxchar); if (res == NULL) @@ -619,16 +614,7 @@ PyObject *CPyStr_Upper(PyObject *self) { int res_kind = PyUnicode_KIND(res); void *res_data = PyUnicode_DATA(res); - // Fast path for ASCII strings - if (PyUnicode_IS_ASCII(self)) { - for (Py_ssize_t i = 0; i < len; i++) { - Py_UCS1 ch = ((Py_UCS1 *)data)[i]; - Py_UCS1 upper = ascii_upper_table[ch]; - ((Py_UCS1 *)res_data)[i] = upper; - } - return res; - } - + // Unified loop for all Unicode kinds for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); Py_UCS4 upper = Py_UNICODE_TOUPPER(ch); From 62520efcff93f589f9aa74f56e40beb3b666f4ba Mon Sep 17 00:00:00 2001 From: Jahongir Qurbonov Date: Sun, 6 Jul 2025 17:59:00 +0500 Subject: [PATCH 6/9] Add test case for lower() method with special case for uppercase 'SS' --- mypyc/test-data/run-strings.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mypyc/test-data/run-strings.test 
b/mypyc/test-data/run-strings.test index 3ce8fabd5687..d68e80572d79 100644 --- a/mypyc/test-data/run-strings.test +++ b/mypyc/test-data/run-strings.test @@ -915,6 +915,8 @@ def test_str_lower() -> None: assert "AbC123".lower() == "abc123" assert "áÉÍ".lower() == "áéí" assert "😴🚀".lower() == "😴🚀" + # Special + assert "SS".lower() == "ss" [case testUpper] def test_str_upper() -> None: From cc2ed145cd76b12059f696b1d8a94ec49b244437 Mon Sep 17 00:00:00 2001 From: Jahongir Qurbonov Date: Sun, 6 Jul 2025 18:01:55 +0500 Subject: [PATCH 7/9] Refactor CPyStr_Lower and CPyStr_Upper to use consistent variable naming for character conversion --- mypyc/lib-rt/str_ops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c index 644046d257b7..df9d36c21c93 100644 --- a/mypyc/lib-rt/str_ops.c +++ b/mypyc/lib-rt/str_ops.c @@ -579,8 +579,8 @@ PyObject *CPyStr_Lower(PyObject *self) { // Unified loop for all Unicode kinds for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 lower = Py_UNICODE_TOLOWER(ch); - PyUnicode_WRITE(res_kind, res_data, i, lower); + Py_UCS4 rch = Py_UNICODE_TOLOWER(ch); + PyUnicode_WRITE(res_kind, res_data, i, rch); } return res; } @@ -617,8 +617,8 @@ PyObject *CPyStr_Upper(PyObject *self) { // Unified loop for all Unicode kinds for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 upper = Py_UNICODE_TOUPPER(ch); - PyUnicode_WRITE(res_kind, res_data, i, upper); + Py_UCS4 rch = Py_UNICODE_TOUPPER(ch); + PyUnicode_WRITE(res_kind, res_data, i, rch); } return res; } From 5cdca5a8b389d7b300818dfdb79c99b20f6bb80c Mon Sep 17 00:00:00 2001 From: Jahongir Qurbonov Date: Sun, 6 Jul 2025 19:15:40 +0500 Subject: [PATCH 8/9] Add test case for lower() method to handle Greek capital sigma --- mypyc/test-data/run-strings.test | 1 + 1 file changed, 1 insertion(+) diff --git a/mypyc/test-data/run-strings.test 
b/mypyc/test-data/run-strings.test index d68e80572d79..a9aabde87776 100644 --- a/mypyc/test-data/run-strings.test +++ b/mypyc/test-data/run-strings.test @@ -917,6 +917,7 @@ def test_str_lower() -> None: assert "😴🚀".lower() == "😴🚀" # Special assert "SS".lower() == "ss" + assert "Σ".lower() == "σ" # Greek capital sigma -> small sigma [case testUpper] def test_str_upper() -> None: From 7049d8b93eafb4435514a9183b0b55faee05ce1a Mon Sep 17 00:00:00 2001 From: Jahongir Qurbonov Date: Sun, 6 Jul 2025 19:38:54 +0500 Subject: [PATCH 9/9] Add commented-out test cases for lower() and upper() methods to handle special characters --- mypyc/test-data/run-strings.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mypyc/test-data/run-strings.test b/mypyc/test-data/run-strings.test index a9aabde87776..f9edd98b4200 100644 --- a/mypyc/test-data/run-strings.test +++ b/mypyc/test-data/run-strings.test @@ -918,6 +918,8 @@ def test_str_lower() -> None: # Special assert "SS".lower() == "ss" assert "Σ".lower() == "σ" # Greek capital sigma -> small sigma + #assert "İ".lower() == "i̇" # TODO: Latin capital letter I with dot above -> 'i' + combining dot + #assert len("İ".lower()) == 2 # TODO: Confirms length change [case testUpper] def test_str_upper() -> None: @@ -927,3 +929,7 @@ def test_str_upper() -> None: assert "AbC123".upper() == "ABC123" assert "áéí".upper() == "ÁÉÍ" assert "😴🚀".upper() == "😴🚀" + # Special + #assert "ß".upper() == "SS" # TODO: German sharp S -> double S + #assert "ﬃ".upper() == "FFI" # TODO: Ligature 'ffi' -> separate letters + #assert len("ﬃ".upper()) == 3 # TODO: Confirm length increases