Skip to content

Commit 414a6fb

Browse files
authored
Make reset_index disallow the same name but allow it when drop=True. (#1455)
pandas' `DataFrame.reset_index()` raises an error if the index name is the same as one of the columns, but allows it when `drop=True`:

```py
>>> import pandas as pd
>>> import numpy as np
>>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=np.random.rand(3))
>>> pdf.index.name = "a"
>>> pdf.reset_index()
Traceback (most recent call last):
...
ValueError: cannot insert a, already exists
>>> pdf.reset_index(drop=True)
   a  b
0  1  4
1  2  5
2  3  6
```

whereas Koalas raises a different error in both cases:

```py
>>> ks.from_pandas(pdf).reset_index()
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: "Reference 'a' is ambiguous, could be: a, a.;"
>>> ks.from_pandas(pdf).reset_index(drop=True)
...
pyspark.sql.utils.AnalysisException: "Reference 'a' is ambiguous, could be: a, a.;"
```
1 parent b2a6679 commit 414a6fb

File tree

2 files changed

+22
-6
lines changed

2 files changed

+22
-6
lines changed

databricks/koalas/frame.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3598,6 +3598,13 @@ def rename(index):
35983598

35993599
index_map = OrderedDict(new_index_map_items)
36003600

3601+
if drop:
3602+
new_index_map = []
3603+
3604+
for _, name in new_index_map:
3605+
if name in self._internal.column_labels:
3606+
raise ValueError("cannot insert {}, already exists".format(name_like_string(name)))
3607+
36013608
new_data_scols = [
36023609
scol_for(self._sdf, column).alias(name_like_string(name))
36033610
for column, name in new_index_map
@@ -3616,15 +3623,9 @@ def rename(index):
36163623
new_data_scols + self._internal.data_spark_columns + list(HIDDEN_COLUMNS)
36173624
)
36183625

3619-
# Now, new internal Spark columns are named as same as index name.
3620-
new_index_map = [(column, name) for column, name in new_index_map]
3621-
36223626
sdf = _InternalFrame.attach_default_index(sdf)
36233627
index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})
36243628

3625-
if drop:
3626-
new_index_map = []
3627-
36283629
if self._internal.column_labels_level > 1:
36293630
column_depth = len(self._internal.column_labels[0])
36303631
if col_level >= column_depth:

databricks/koalas/tests/test_dataframe.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,21 @@ def test_iterrows(self):
170170
self.assert_eq(pdf_k, kdf_k)
171171
self.assert_eq(pdf_v, kdf_v)
172172

173+
def test_reset_index(self):
174+
pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=np.random.rand(3))
175+
kdf = ks.from_pandas(pdf)
176+
177+
self.assert_eq(kdf.reset_index(), pdf.reset_index())
178+
self.assert_eq(kdf.reset_index(drop=True), pdf.reset_index(drop=True))
179+
180+
pdf.index.name = "a"
181+
kdf.index.name = "a"
182+
183+
with self.assertRaisesRegex(ValueError, "cannot insert a, already exists"):
184+
kdf.reset_index()
185+
186+
self.assert_eq(kdf.reset_index(drop=True), pdf.reset_index(drop=True))
187+
173188
def test_reset_index_with_default_index_types(self):
174189
pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=np.random.rand(3))
175190
kdf = ks.from_pandas(pdf)

0 commit comments

Comments
 (0)