From 479a82a7a1046435540cb85f1826471c4fad487d Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 3 Apr 2019 00:01:31 +0100 Subject: [PATCH] ENH: Improve explanation when erroring on dta files Improve the explanation when value labels are repeated in Stata dta files. Add suggested methods to work around the issue using the low level interface. closes #25772 --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/stata.py | 17 +++++++++++++---- pandas/tests/io/test_stata.py | 14 +++++++++++--- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8dabaeb6c7bfe..e30f6ae144648 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -355,6 +355,7 @@ I/O - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) +- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) Plotting diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3e9a61280f0b5..7bc6f267d094e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1719,10 +1719,19 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, vc = Series(categories).value_counts() repeats = list(vc.index[vc > 1]) repeats = '-' * 80 + '\n' + '\n'.join(repeats) - raise ValueError('Value labels for column {col} are not ' - 'unique. 
The repeated labels are:\n' - '{repeats}' - .format(col=col, repeats=repeats)) + # GH 25772 + msg = """ +Value labels for column {col} are not unique. These cannot be converted to +pandas categoricals. + +Either read the file with `convert_categoricals` set to False or use the +low level interface in `StataReader` to separately read the values and the +value_labels. + +The repeated labels are: +{repeats} +""" + raise ValueError(msg.format(col=col, repeats=repeats)) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 21cb3e597ca2a..52fead0166dc5 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1311,9 +1311,17 @@ def test_unsupported_datetype(self): original.to_stata(path) def test_repeated_column_labels(self): - # GH 13923 - msg = (r"Value labels for column ethnicsn are not unique\. The" - r" repeated labels are:\n-+\nwolof") + # GH 13923, 25772 + msg = """ +Value labels for column ethnicsn are not unique. These cannot be converted to +pandas categoricals. + +Either read the file with `convert_categoricals` set to False or use the +low level interface in `StataReader` to separately read the values and the +value_labels. + +The repeated labels are:\n-+\nwolof +""" with pytest.raises(ValueError, match=msg): read_stata(self.dta23, convert_categoricals=True)