Skip to content

Commit 03ea9b2

Browse files
FEAT-#2663: Add algebraic operator from_labels (#2665)
Resolves #2663. This operator is necessary for efficient `reset_index` operations. See this paper for more information on the operator: http://www.vldb.org/pvldb/vol13/p2033-petersohn.pdf. Co-authored-by: William Ma <[email protected]>. Signed-off-by: Devin Petersohn <[email protected]>.
1 parent f2a7271 commit 03ea9b2

File tree

2 files changed

+63
-10
lines changed

2 files changed

+63
-10
lines changed

modin/backends/pandas/query_compiler.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -533,16 +533,8 @@ def reset_index(self, **kwargs):
533533
if level is not None or self.has_multiindex():
534534
return self.default_to_pandas(pandas.DataFrame.reset_index, **kwargs)
535535
if not drop:
536-
new_column_name = (
537-
self.index.name
538-
if self.index.name is not None
539-
else "index"
540-
if "index" not in self.columns
541-
else "level_0"
542-
)
543-
new_self = self.insert(0, new_column_name, self.index)
544-
else:
545-
new_self = self.copy()
536+
return self.__constructor__(self._modin_frame.from_labels())
537+
new_self = self.copy()
546538
new_self.index = pandas.RangeIndex(len(new_self.index))
547539
return new_self
548540

modin/engines/base/frame/data.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,67 @@ def mask(
536536
row_numeric_idx=new_row_order, col_numeric_idx=new_col_order
537537
)
538538

539+
def from_labels(self) -> "BasePandasFrame":
    """Convert the row labels to a column of data, inserted at the first position.

    Every level of the row index becomes a column placed in front of the
    existing columns, and the result is labelled with a fresh ``RangeIndex``.
    Unnamed levels are given a name: ``"index"`` for a single-level index
    (``"level_0"`` if an ``"index"`` column already exists) and
    ``"level_<i>"`` for level ``i`` of a multi-level index.

    Returns
    -------
    BasePandasFrame
        A new BasePandasFrame.
    """
    new_row_labels = pandas.RangeIndex(len(self.index))
    index_names = self.index.names
    # Column labels are different for multilevel index.
    if len(index_names) > 1:
        label_names = [
            name if name is not None else "level_{}".format(level)
            for level, name in enumerate(index_names)
        ]
    else:
        label_name = index_names[0]
        if label_name is None:
            label_name = "index" if "index" not in self.columns else "level_0"
        label_names = [label_name]
    # `new_column_names` is used both for the external column metadata and
    # inside the partition function below, which is a lightweight way of
    # ensuring the two always match.
    new_column_names = pandas.Index(label_names)
    new_columns = new_column_names.append(self.columns)

    def from_labels_executor(df, **kwargs):
        # Rename the index levels on a shallow copy so that the partition's
        # own dataframe (and any Index object it shares) is left untouched.
        relabelled = df.copy(deep=False)
        relabelled.index = df.index.set_names(list(new_column_names))
        return relabelled.reset_index()

    # Only the first column partition receives the new label columns; the
    # remaining partitions are kept as they are.
    new_parts = self._frame_mgr_cls.apply_func_to_select_indices(
        0,
        self._partitions,
        from_labels_executor,
        [0],
        keep_remaining=True,
    )
    # The first column partition grows by one column per index level.
    new_column_widths = [
        len(index_names) + self._column_widths[0]
    ] + self._column_widths[1:]
    result = self.__constructor__(
        new_parts,
        new_row_labels,
        new_columns,
        row_lengths=self._row_lengths_cache,
        column_widths=new_column_widths,
    )
    # Propagate the new row labels to all dataframe partitions.
    result._apply_index_objs(0)
    return result
599+
539600
def reorder_labels(self, row_numeric_idx=None, col_numeric_idx=None):
540601
"""Reorder the column and or rows in this DataFrame.
541602

0 commit comments

Comments
 (0)