1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -667,6 +667,7 @@ I/O
- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`)
- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`)
- Bug in :func:`read_csv` replacing existing column names when the mangled name of a duplicate column conflicted with a column already present in the header (:issue:`14704`)
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
- Bug in :func:`read_csv` where a parser passed via ``date_parser`` was still called even when ``parse_dates=False`` was set (:issue:`44366`)

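For context on the new whatsnew entry above (:issue:`14704`), a minimal reproduction of the behavior it describes — a sketch, assuming a pandas build that includes this fix; before the change, the mangled duplicate was also named "a.1" and silently overwrote the user-supplied "a.1" column:

    from io import StringIO

    import pandas as pd

    # Two columns literally named "a" plus an explicit "a.1".
    data = "a,a,a.1\n1,2,3"
    df = pd.read_csv(StringIO(data))
    # Expected with the fix: the duplicate "a" is mangled to "a.2" and the
    # user-supplied "a.1" column is left untouched.
    print(list(df.columns))  # ['a', 'a.2', 'a.1']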
49 changes: 30 additions & 19 deletions pandas/_libs/parsers.pyx
@@ -657,46 +657,56 @@ cdef class TextReader:
field_count = self.parser.line_fields[hr]
start = self.parser.line_start[hr]

counts = {}
unnamed_count = 0
unnamed_col_indices = []

for i in range(field_count):
word = self.parser.words[start + i]

name = PyUnicode_DecodeUTF8(word, strlen(word),
self.encoding_errors)

# We use this later when collecting placeholder names.
old_name = name

if name == '':
if self.has_mi_columns:
name = f'Unnamed: {i}_level_{level}'
else:
name = f'Unnamed: {i}'

unnamed_count += 1
unnamed_col_indices.append(i)

this_header.append(name)

count = counts.get(name, 0)
if not self.has_mi_columns and self.mangle_dupe_cols:
col_loop_order = [i for i in range(len(this_header))
if i not in unnamed_col_indices
] + unnamed_col_indices
counts = {}

for i in col_loop_order:
col = this_header[i]
old_col = col
cur_count = counts.get(col, 0)

if cur_count > 0:
while cur_count > 0:
counts[old_col] = cur_count + 1
col = f'{old_col}.{cur_count}'
if col in this_header:
cur_count += 1
else:
cur_count = counts.get(col, 0)

if not self.has_mi_columns and self.mangle_dupe_cols:
if count > 0:
while count > 0:
counts[name] = count + 1
name = f'{name}.{count}'
count = counts.get(name, 0)
if (
self.dtype is not None
and is_dict_like(self.dtype)
and self.dtype.get(old_name) is not None
and self.dtype.get(name) is None
and self.dtype.get(old_col) is not None
and self.dtype.get(col) is None
):
self.dtype.update({name: self.dtype.get(old_name)})

if old_name == '':
unnamed_cols.add(name)
self.dtype.update({col: self.dtype.get(old_col)})

this_header.append(name)
counts[name] = count + 1
this_header[i] = col
counts[col] = cur_count + 1

if self.has_mi_columns:

@@ -716,6 +726,7 @@

data_line = hr + 1
header.append(this_header)
unnamed_cols.update({this_header[i] for i in unnamed_col_indices})

if self.names is not None:
header = [self.names]
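To make the renaming scheme in the hunk above easier to follow, here is a simplified pure-Python sketch of the same logic (the function name and signature are illustrative, not part of pandas): real column names are visited before unnamed placeholders, and a mangled candidate such as "a.1" is skipped whenever it already appears in the header.

    def mangle_duplicates(header, unnamed_indices):
        # Illustrative sketch of the renaming loop above; not a pandas API.
        counts = {}
        # Visit user-supplied names first so explicit columns such as "a.1"
        # keep their spelling; "Unnamed: N" placeholders are mangled last.
        order = [i for i in range(len(header)) if i not in unnamed_indices]
        order += list(unnamed_indices)

        for i in order:
            col = old_col = header[i]
            cur_count = counts.get(col, 0)
            while cur_count > 0:
                counts[old_col] = cur_count + 1
                col = f"{old_col}.{cur_count}"
                # Skip candidates that already exist in the header so a
                # user-supplied "a.1" is never overwritten.
                if col in header:
                    cur_count += 1
                else:
                    cur_count = counts.get(col, 0)
            header[i] = col
            counts[col] = cur_count + 1
        return header

    # Example: mangle_duplicates(["a", "a", "a.1"], []) returns
    # ['a', 'a.2', 'a.1'], matching the updated expectations in
    # test_mangle_dupes.py further down.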
20 changes: 15 additions & 5 deletions pandas/io/parsers/python_parser.py
@@ -415,16 +415,26 @@ def _infer_columns(self):

if not have_mi_columns and self.mangle_dupe_cols:
counts: DefaultDict = defaultdict(int)

for i, col in enumerate(this_columns):
col_loop_order = [
i
for i in range(len(this_columns))
if i not in this_unnamed_cols
] + this_unnamed_cols

for i in col_loop_order:
col = this_columns[i]
old_col = col
cur_count = counts[col]

if cur_count > 0:
while cur_count > 0:
counts[col] = cur_count + 1
col = f"{col}.{cur_count}"
cur_count = counts[col]
counts[old_col] = cur_count + 1
col = f"{old_col}.{cur_count}"
if col in this_columns:
cur_count += 1
else:
cur_count = counts[col]

if (
self.dtype is not None
and is_dict_like(self.dtype)
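The ``self.dtype`` handling at the end of this hunk carries a per-column ``dtype`` keyed on the original duplicate name over to the mangled name as well. A hedged illustration of the intended effect (a sketch; the exact integer dtype may differ by platform):

    from io import StringIO

    import pandas as pd

    data = "a,a,a.1\n1,2,3"
    # dtype is keyed by the duplicated name "a"; the mangled duplicate
    # ("a.2" after this change) is expected to inherit float64 as well,
    # while the user-supplied "a.1" keeps the default integer dtype.
    df = pd.read_csv(StringIO(data), dtype={"a": "float64"})
    print(df.dtypes)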
37 changes: 33 additions & 4 deletions pandas/tests/io/parser/test_mangle_dupes.py
@@ -52,19 +52,19 @@ def test_basic_names_raise(all_parsers):
@pytest.mark.parametrize(
"data,expected",
[
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
(
"a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
DataFrame(
[[1, 2, 3, 4, 5, 6]],
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
),
),
(
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
DataFrame(
[[1, 2, 3, 4, 5, 6, 7]],
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
),
),
],
@@ -131,9 +131,38 @@ def test_mangled_unnamed_placeholders(all_parsers):
expected = DataFrame()

for j in range(i + 1):
expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
expected.insert(loc=0, column=col_name, value=[0, 1, 2])

expected[orig_key] = orig_value
df = parser.read_csv(StringIO(df.to_csv()))

tm.assert_frame_equal(df, expected)


@skip_pyarrow
def test_mangle_dupe_cols_already_exists(all_parsers):
# GH#14704
parser = all_parsers

data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4, 5, 6, 7]],
columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
)
tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
# GH#14704
parser = all_parsers

data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4]],
columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
)
tm.assert_frame_equal(result, expected)
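As a usage note, the first new test above corresponds to the following user-facing behavior (a sketch; the expected column order is taken directly from ``test_mangle_dupe_cols_already_exists``):

    from io import StringIO

    import pandas as pd

    data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
    df = pd.read_csv(StringIO(data))
    # Explicit names such as "a.1", "a.3" and "a.1.1" are preserved, and the
    # duplicates are mangled around them:
    print(list(df.columns))
    # ['a', 'a.2', 'a.1', 'a.4', 'a.3', 'a.1.2', 'a.1.1']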