diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8dabaeb6c7bfe..e30f6ae144648 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -355,6 +355,7 @@ I/O - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) +- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) Plotting diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3e9a61280f0b5..7bc6f267d094e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1719,10 +1719,19 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, vc = Series(categories).value_counts() repeats = list(vc.index[vc > 1]) repeats = '-' * 80 + '\n' + '\n'.join(repeats) - raise ValueError('Value labels for column {col} are not ' - 'unique. The repeated labels are:\n' - '{repeats}' - .format(col=col, repeats=repeats)) + # GH 25772 + msg = """ +Value labels for column {col} are not unique. These cannot be converted to +pandas categoricals. + +Either read the file with `convert_categoricals` set to False or use the +low level interface in `StataReader` to separately read the values and the +value_labels. + +The repeated labels are: +{repeats} +""" + raise ValueError(msg.format(col=col, repeats=repeats)) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 21cb3e597ca2a..52fead0166dc5 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1311,9 +1311,17 @@ def test_unsupported_datetype(self): original.to_stata(path) def test_repeated_column_labels(self): - # GH 13923 - msg = (r"Value labels for column ethnicsn are not unique\. The" - r" repeated labels are:\n-+\nwolof") + # GH 13923, 25772 + msg = """ +Value labels for column ethnicsn are not unique. These cannot be converted to +pandas categoricals. + +Either read the file with `convert_categoricals` set to False or use the +low level interface in `StataReader` to separately read the values and the +value_labels. + +The repeated labels are:\n-+\nwolof +""" with pytest.raises(ValueError, match=msg): read_stata(self.dta23, convert_categoricals=True)