diff --git a/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst new file mode 100644 index 00000000000000..a29ada98ac20e9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst @@ -0,0 +1 @@ +Fix undefined behaviour in :func:`struct.unpack`. diff --git a/Modules/_struct.c b/Modules/_struct.c index 9d66568a282662..09f52a981485dd 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -805,19 +805,38 @@ static const formatdef native_table[] = { /* Big-endian routines. *****************************************************/ +static PyObject * +bu_short(_structmodulestate *state, const char *p, const formatdef *f) +{ + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 2. */ + assert(f->size == 2); + Py_ssize_t i = 2; + const unsigned char *bytes = (const unsigned char *)p; + do { + x = (x<<8) | *bytes++; + } while (--i > 0); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000U) - 0x8000U; + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x); +} + static PyObject * bu_int(_structmodulestate *state, const char *p, const formatdef *f) { - long x = 0; - Py_ssize_t i = f->size; + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 4. */ + assert(f->size == 4); + Py_ssize_t i = 4; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG > f->size) - x |= -(x & (1L << ((8 * f->size) - 1))); - return PyLong_FromLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x80000000U) - 0x80000000U; + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -835,16 +854,19 @@ bu_uint(_structmodulestate *state, const char *p, const formatdef *f) static PyObject * bu_longlong(_structmodulestate *state, const char *p, const formatdef *f) { - long long x = 0; - Py_ssize_t i = f->size; + unsigned long long x = 0; + + /* This function is only ever used in the case f->size == 8. */ + assert(f->size == 8); + Py_ssize_t i = 8; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG_LONG > f->size) - x |= -(x & ((long long)1 << ((8 * f->size) - 1))); - return PyLong_FromLongLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; + return PyLong_FromLongLong( + x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x); } static PyObject * @@ -1009,7 +1031,7 @@ static formatdef bigendian_table[] = { {'c', 1, 0, nu_char, np_char}, {'s', 1, 0, NULL}, {'p', 1, 0, NULL}, - {'h', 2, 0, bu_int, bp_int}, + {'h', 2, 0, bu_short, bp_int}, {'H', 2, 0, bu_uint, bp_uint}, {'i', 4, 0, bu_int, bp_int}, {'I', 4, 0, bu_uint, bp_uint}, @@ -1026,19 +1048,38 @@ static formatdef bigendian_table[] = { /* Little-endian routines. *****************************************************/ +static PyObject * +lu_short(_structmodulestate *state, const char *p, const formatdef *f) +{ + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 2. */ + assert(f->size == 2); + Py_ssize_t i = 2; + const unsigned char *bytes = (const unsigned char *)p; + do { + x = (x<<8) | bytes[--i]; + } while (i > 0); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000U) - 0x8000U; + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x); +} + static PyObject * lu_int(_structmodulestate *state, const char *p, const formatdef *f) { - long x = 0; - Py_ssize_t i = f->size; + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 4. */ + assert(f->size == 4); + Py_ssize_t i = 4; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG > f->size) - x |= -(x & (1L << ((8 * f->size) - 1))); - return PyLong_FromLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x80000000U) - 0x80000000U; + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -1056,16 +1097,19 @@ lu_uint(_structmodulestate *state, const char *p, const formatdef *f) static PyObject * lu_longlong(_structmodulestate *state, const char *p, const formatdef *f) { - long long x = 0; - Py_ssize_t i = f->size; + unsigned long long x = 0; + + /* This function is only ever used in the case f->size == 8. */ + assert(f->size == 8); + Py_ssize_t i = 8; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG_LONG > f->size) - x |= -(x & ((long long)1 << ((8 * f->size) - 1))); - return PyLong_FromLongLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; + return PyLong_FromLongLong( + x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x); } static PyObject * @@ -1213,7 +1257,7 @@ static formatdef lilendian_table[] = { {'c', 1, 0, nu_char, np_char}, {'s', 1, 0, NULL}, {'p', 1, 0, NULL}, - {'h', 2, 0, lu_int, lp_int}, + {'h', 2, 0, lu_short, lp_int}, {'H', 2, 0, lu_uint, lp_uint}, {'i', 4, 0, lu_int, lp_int}, {'I', 4, 0, lu_uint, lp_uint},