diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5d89613bd3d4f..e71220102cbb4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -583,6 +583,7 @@ I/O - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) +- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index dd92b1bbfdba0..03c15d0ab07bb 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -983,6 +983,19 @@ def __init__(self) -> None: np.float64(struct.unpack(" None: # These missing values are the generic '.' in Stata, and are used # to replace nans - self.MISSING_VALUES = { + self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = { "b": 101, "h": 32741, "l": 2147483621, @@ -1808,11 +1821,18 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] - if fmt not in self.VALID_RANGE: - continue + if self._format_version <= 111: + if fmt not in self.OLD_VALID_RANGE: + continue - fmt = cast(str, fmt) # only strs in VALID_RANGE - nmin, nmax = self.VALID_RANGE[fmt] + fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE + nmin, nmax = self.OLD_VALID_RANGE[fmt] + else: + if fmt not in self.VALID_RANGE: + continue + + fmt = cast(str, fmt) # only strs in VALID_RANGE + nmin, nmax = self.VALID_RANGE[fmt] series = data.iloc[:, i] # appreciably faster to do this with ndarray instead of Series @@ -1827,7 +1847,12 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra umissing, umissing_loc = np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=object) for j, um in enumerate(umissing): - missing_value = StataMissingValue(um) + if self._format_version <= 111: + missing_value = StataMissingValue( + float(self.MISSING_VALUES[fmt]) + ) + else: + missing_value = StataMissingValue(um) loc = missing_loc[umissing_loc == j] replacement.iloc[loc] = missing_value diff --git a/pandas/tests/io/data/stata/stata1_108.dta b/pandas/tests/io/data/stata/stata1_108.dta new file mode 100644 index 0000000000000..6c948b4490589 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_108.dta differ diff --git a/pandas/tests/io/data/stata/stata1_110.dta b/pandas/tests/io/data/stata/stata1_110.dta new file mode 100644 index 0000000000000..c9e2ca72dbd4e Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_110.dta differ diff --git a/pandas/tests/io/data/stata/stata1_111.dta b/pandas/tests/io/data/stata/stata1_111.dta new file mode 100644 index 0000000000000..21370d3027458 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_111.dta differ diff --git a/pandas/tests/io/data/stata/stata1_113.dta b/pandas/tests/io/data/stata/stata1_113.dta new file mode 100644 index 0000000000000..6fcf55f0406e9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_113.dta differ diff --git a/pandas/tests/io/data/stata/stata1_115.dta b/pandas/tests/io/data/stata/stata1_115.dta new file mode 100644 index 0000000000000..2e5258da49c3c Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_115.dta differ diff --git a/pandas/tests/io/data/stata/stata1_118.dta b/pandas/tests/io/data/stata/stata1_118.dta new file mode 100644 index 0000000000000..26d7beccb745c Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_118.dta differ diff --git a/pandas/tests/io/data/stata/stata1_119.dta b/pandas/tests/io/data/stata/stata1_119.dta new file mode 100644 index 0000000000000..284daa78bf6db Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_119.dta differ diff --git a/pandas/tests/io/data/stata/stata8_108.dta b/pandas/tests/io/data/stata/stata8_108.dta new file mode 100644 index 0000000000000..962f7f4331fb3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_108.dta differ diff --git a/pandas/tests/io/data/stata/stata8_110.dta b/pandas/tests/io/data/stata/stata8_110.dta new file mode 100644 index 0000000000000..a7fe9a3b7e639 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_110.dta differ diff --git a/pandas/tests/io/data/stata/stata8_111.dta b/pandas/tests/io/data/stata/stata8_111.dta new file mode 100644 index 0000000000000..cb96ac0e0f5d3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_111.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_102.dta b/pandas/tests/io/data/stata/stata_int_validranges_102.dta new file mode 100644 index 0000000000000..69de2e2f7f91d Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_102.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_103.dta b/pandas/tests/io/data/stata/stata_int_validranges_103.dta new file mode 100644 index 0000000000000..71f03873808e2 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_103.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_104.dta b/pandas/tests/io/data/stata/stata_int_validranges_104.dta new file mode 100644 index 0000000000000..f6dff2a6b42d9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_104.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_105.dta b/pandas/tests/io/data/stata/stata_int_validranges_105.dta new file mode 100644 index 0000000000000..d0a7ad0f01d16 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_105.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_108.dta b/pandas/tests/io/data/stata/stata_int_validranges_108.dta new file mode 100644 index 0000000000000..47b715bce21ef Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_108.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_110.dta b/pandas/tests/io/data/stata/stata_int_validranges_110.dta new file mode 100644 index 0000000000000..2fe5dee018f4e Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_110.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_111.dta b/pandas/tests/io/data/stata/stata_int_validranges_111.dta new file mode 100644 index 0000000000000..07052d824f132 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_111.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_113.dta b/pandas/tests/io/data/stata/stata_int_validranges_113.dta new file mode 100644 index 0000000000000..4060c1c88ea12 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_113.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_114.dta b/pandas/tests/io/data/stata/stata_int_validranges_114.dta new file mode 100644 index 0000000000000..71c22366e9b1a Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_114.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_115.dta b/pandas/tests/io/data/stata/stata_int_validranges_115.dta new file mode 100644 index 0000000000000..80e1dc8670b38 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_115.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_117.dta b/pandas/tests/io/data/stata/stata_int_validranges_117.dta new file mode 100644 index 0000000000000..c220037941f4f Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_117.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_118.dta b/pandas/tests/io/data/stata/stata_int_validranges_118.dta new file mode 100644 index 0000000000000..4bbd823bff63e Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_118.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_119.dta b/pandas/tests/io/data/stata/stata_int_validranges_119.dta new file mode 100644 index 0000000000000..6bd9bbde1d22d Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_119.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6d6f222fc0660..fb7182fdefb32 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -120,9 +120,11 @@ def test_read_index_col_none(self, version, temp_file): expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"]) - def test_read_dta1(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + # Note this test starts at format version 108 as the missing code for double + # was different prior to this (see GH 58149) and would therefore fail + @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119]) + def test_read_dta1(self, version, datapath): + file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) # Pandas uses np.nan as missing value. @@ -136,6 +138,18 @@ def test_read_dta1(self, file, datapath): # the casting doesn't fail so need to match stata here expected["float_miss"] = expected["float_miss"].astype(np.float32) + # Column names too long for older Stata formats + if version <= 108: + expected = expected.rename( + columns={ + "float_miss": "f_miss", + "double_miss": "d_miss", + "byte_miss": "b_miss", + "int_miss": "i_miss", + "long_miss": "l_miss", + } + ) + tm.assert_frame_equal(parsed, expected) def test_read_dta2(self, datapath): @@ -920,6 +934,23 @@ def test_missing_value_conversion(self, file, datapath): ) tm.assert_frame_equal(parsed, expected) + # Note this test starts at format version 108 as the missing code for double + # was different prior to this (see GH 58149) and would therefore fail + @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"]) + def test_missing_value_conversion_compat(self, file, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in range(5)] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + ) + tm.assert_frame_equal(parsed, expected) + def test_big_dates(self, datapath, temp_file): yr = [1960, 2000, 9999, 100, 2262, 1677] mo = [1, 1, 12, 1, 4, 9] @@ -2035,6 +2066,52 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119]) + def test_read_data_int_validranges(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-127, 100], dtype=np.int8), + "int": np.array([-32767, 32740], dtype=np.int16), + "long": np.array([-2147483647, 2147483620], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_read_data_int_validranges_compat(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int8), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_read_data_int_validranges_compat_nobyte(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int16), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114]) def test_backward_compat(version, datapath):