Skip to content

Commit f64ccf0

Browse files
committed
BUG: support duplicate columns in DataFrame.from_records. close #2179
1 parent 81169f9 commit f64ccf0

File tree

3 files changed

+65
-32
lines changed

3 files changed

+65
-32
lines changed

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ pandas 0.10.0
4646
- Unstack/reshape algorithm rewrite to avoid high memory use in cases where
4747
the number of observed key-tuples is much smaller than the total possible
4848
number that could occur (#2278). Also improves performance in most cases.
49+
- Support duplicate columns in DataFrame.from_records (#2179)
4950

5051
**Bug fixes**
5152

pandas/core/frame.py

Lines changed: 57 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -896,11 +896,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
896896
"""
897897
# Make a copy of the input columns so we can modify it
898898
if columns is not None:
899-
columns = list(columns)
900-
901-
if len(algos.unique(columns)) < len(columns):
902-
raise ValueError('Non-unique columns not yet supported in '
903-
'from_records')
899+
columns = _ensure_index(columns)
904900

905901
if com.is_iterator(data):
906902
if nrows == 0:
@@ -932,48 +928,66 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
932928
else:
933929
data = values
934930

935-
if isinstance(data, (np.ndarray, DataFrame, dict)):
936-
keys, sdict = _rec_to_dict(data)
931+
if isinstance(data, dict):
937932
if columns is None:
938-
columns = keys
933+
columns = arr_columns = _ensure_index(sorted(data))
934+
arrays = [data[k] for k in columns]
939935
else:
940-
sdict = dict((k, v) for k, v in sdict.iteritems()
941-
if k in columns)
936+
arrays = []
937+
arr_columns = []
938+
for k, v in data.iteritems():
939+
if k in columns:
940+
arr_columns.append(k)
941+
arrays.append(v)
942+
943+
elif isinstance(data, (np.ndarray, DataFrame)):
944+
arrays, columns = _to_arrays(data, columns)
945+
if columns is not None:
946+
columns = _ensure_index(columns)
947+
arr_columns = columns
942948
else:
943-
arrays, columns = _to_arrays(data, columns,
944-
coerce_float=coerce_float)
945-
columns=list(columns) # _to_arrays returns index, but we might mutate
946-
sdict = dict(zip(columns, arrays))
949+
arrays, arr_columns = _to_arrays(data, columns,
950+
coerce_float=coerce_float)
951+
952+
arr_columns = _ensure_index(arr_columns)
953+
if columns is not None:
954+
columns = _ensure_index(columns)
955+
else:
956+
columns = arr_columns
947957

948958
if exclude is None:
949959
exclude = set()
950960
else:
951961
exclude = set(exclude)
952962

953-
for col in exclude:
954-
del sdict[col]
955-
columns.remove(col)
956-
957963
result_index = None
958964
if index is not None:
959965
if (isinstance(index, basestring) or
960966
not hasattr(index, "__iter__")):
961-
result_index = sdict.pop(index)
962-
result_index = Index(result_index, name=index)
963-
columns.remove(index)
967+
i = columns.get_loc(index)
968+
exclude.add(index)
969+
result_index = Index(arrays[i], name=index)
964970
else:
965971
try:
966-
arrays = []
967-
for field in index:
968-
arrays.append(sdict[field])
969-
for field in index:
970-
del sdict[field]
971-
columns.remove(field)
972-
result_index = MultiIndex.from_arrays(arrays, names=index)
972+
to_remove = [arr_columns.get_loc(field) for field in index]
973+
974+
result_index = MultiIndex.from_arrays(
975+
[arrays[i] for i in to_remove], names=index)
976+
977+
exclude.update(index)
973978
except Exception:
974979
result_index = index
975980

976-
return cls(sdict, index=result_index, columns=columns)
981+
if any(exclude):
982+
to_remove = [arr_columns.get_loc(col) for col in exclude]
983+
arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
984+
arr_columns = arr_columns.drop(exclude)
985+
columns = columns.drop(exclude)
986+
987+
mgr = _arrays_to_mgr(arrays, arr_columns, result_index,
988+
columns)
989+
990+
return DataFrame(mgr)
977991

978992
def to_records(self, index=True):
979993
"""
@@ -5217,9 +5231,18 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None):
52175231
"""
52185232
Return list of arrays, columns
52195233
"""
5234+
if isinstance(data, DataFrame):
5235+
if columns is not None:
5236+
arrays = [data.icol(i).values for i, col in enumerate(data.columns)
5237+
if col in columns]
5238+
else:
5239+
columns = data.columns
5240+
arrays = [data.icol(i).values for i in range(len(columns))]
5241+
5242+
return arrays, columns
52205243

52215244
if len(data) == 0:
5222-
return [], columns if columns is not None else []
5245+
return [], [] # columns if columns is not None else []
52235246
if isinstance(data[0], (list, tuple)):
52245247
return _list_to_arrays(data, columns, coerce_float=coerce_float,
52255248
dtype=dtype)
@@ -5231,6 +5254,10 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None):
52315254
return _list_of_series_to_arrays(data, columns,
52325255
coerce_float=coerce_float,
52335256
dtype=dtype)
5257+
elif isinstance(data, np.ndarray):
5258+
columns = list(data.dtype.names)
5259+
arrays = [data[k] for k in columns]
5260+
return arrays, columns
52345261
else:
52355262
# last ditch effort
52365263
data = map(tuple, data)

pandas/tests/test_frame.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2681,8 +2681,13 @@ def test_from_records_decimal(self):
26812681
self.assert_(np.isnan(df['a'].values[-1]))
26822682

26832683
def test_from_records_duplicates(self):
2684-
self.assertRaises(ValueError, DataFrame.from_records,
2685-
[(1,2,3), (4,5,6)], columns=['a','b','a'])
2684+
result = DataFrame.from_records([(1,2,3), (4,5,6)],
2685+
columns=['a','b','a'])
2686+
2687+
expected = DataFrame([(1,2,3), (4,5,6)],
2688+
columns=['a', 'b', 'a'])
2689+
2690+
assert_frame_equal(result, expected)
26862691

26872692
def test_from_records_set_index_name(self):
26882693
def create_dict(order_id):

0 commit comments

Comments
 (0)