Skip to content

Commit 8803344

Browse files
authored
Implement DataFrame.insert (#1983)
ref #1929

Insert column into DataFrame at a specified location.

```
>>> kdf = ks.DataFrame([1, 2, 3])
>>> kdf.insert(0, 'x', 4)
>>> kdf.sort_index()
   x  0
0  4  1
1  4  2
2  4  3

>>> from databricks.koalas.config import set_option, reset_option
>>> set_option("compute.ops_on_diff_frames", True)

>>> kdf.insert(1, 'y', [5, 6, 7])
>>> kdf.sort_index()
   x  y  0
0  4  5  1
1  4  6  2
2  4  7  3

>>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
>>> kdf.sort_index()
   x  y   z  0
0  4  5   8  1
1  4  6   9  2
2  4  7  10  3

>>> reset_option("compute.ops_on_diff_frames")
```
1 parent c38c96f commit 8803344

File tree

5 files changed

+182
-1
lines changed

5 files changed

+182
-1
lines changed

databricks/koalas/frame.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
spark_type_to_pandas_dtype,
114114
DataFrameType,
115115
SeriesType,
116+
Scalar,
116117
)
117118
from databricks.koalas.plot import KoalasPlotAccessor
118119

@@ -3711,6 +3712,87 @@ def notnull(self) -> "DataFrame":
37113712

37123713
notna = notnull
37133714

3715+
def insert(
    self,
    loc: int,
    column,
    value: Union[Scalar, "Series", Iterable],
    allow_duplicates: bool = False,
) -> None:
    """
    Insert column into DataFrame at specified location.

    The DataFrame is modified in place. Raises a ValueError if `column` is
    already contained in the DataFrame. Duplicate column labels are not
    supported, so `allow_duplicates` must be left as False.

    Parameters
    ----------
    loc : int
        Insertion index. Must verify 0 <= loc <= len(columns).
    column : str, number, or hashable object
        Label of the inserted column.
    value : int, Series, or array-like
        Data assigned to the new column.
    allow_duplicates : bool, optional
        Must be False; any other value is rejected.

    Raises
    ------
    TypeError
        If `loc` is not an int.
    AssertionError
        If `loc` is outside ``0 <= loc <= len(columns)`` or
        `allow_duplicates` is not False.
    ValueError
        If `column` is not a valid label, if a tuple label does not have one
        entry per column level, or if `column` already exists.

    Examples
    --------
    >>> kdf = ks.DataFrame([1, 2, 3])
    >>> kdf.sort_index()
       0
    0  1
    1  2
    2  3
    >>> kdf.insert(0, 'x', 4)
    >>> kdf.sort_index()
       x  0
    0  4  1
    1  4  2
    2  4  3

    >>> from databricks.koalas.config import set_option, reset_option
    >>> set_option("compute.ops_on_diff_frames", True)

    >>> kdf.insert(1, 'y', [5, 6, 7])
    >>> kdf.sort_index()
       x  y  0
    0  4  5  1
    1  4  6  2
    2  4  7  3

    >>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
    >>> kdf.sort_index()
       x  y   z  0
    0  4  5   8  1
    1  4  6   9  2
    2  4  7  10  3

    >>> reset_option("compute.ops_on_diff_frames")
    """
    if not isinstance(loc, int):
        raise TypeError("loc must be int")

    # AssertionError stays the exception type for these two checks, but it is
    # raised explicitly so the validation is not stripped under `python -O`.
    if not 0 <= loc <= len(self.columns):
        raise AssertionError("loc must verify 0 <= loc <= len(columns)")
    if allow_duplicates is not False:
        raise AssertionError("allow_duplicates must be False")

    if not is_name_like_value(column):
        raise ValueError(
            '"column" should be a scalar value or tuple that contains scalar values'
        )

    if is_name_like_tuple(column):
        # NOTE(review): `.levels` presumably exists only when the columns are
        # a MultiIndex; a tuple label on flat columns likely fails here with
        # AttributeError -- confirm and handle separately if so.
        if len(column) != len(self.columns.levels):
            # To be consistent with pandas
            raise ValueError('"column" must have length equal to number of column levels.')

    if column in self.columns:
        # Wrap the label in a 1-tuple: a bare tuple label would otherwise be
        # unpacked by `%` as the argument list and raise TypeError instead of
        # the intended ValueError.
        raise ValueError("cannot insert %s, already exists" % (column,))

    # Work on a copy: append the new column at the end, then reorder the
    # columns so that the appended one ends up at position `loc`.
    kdf = self.copy()
    kdf[column] = value
    columns = kdf.columns[:-1].insert(loc, kdf.columns[-1])
    kdf = kdf[columns]
    # Swap the reordered frame's internals into `self` to make the change in place.
    self._update_internal_frame(kdf._internal)
3795+
37143796
# TODO: add freq and axis parameters
37153797
def shift(self, periods=1, fill_value=None) -> "DataFrame":
37163798
"""

databricks/koalas/missing/frame.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ class _MissingPandasLikeDataFrame(object):
5050
ewm = _unsupported_function("ewm")
5151
first = _unsupported_function("first")
5252
infer_objects = _unsupported_function("infer_objects")
53-
insert = _unsupported_function("insert")
5453
interpolate = _unsupported_function("interpolate")
5554
last = _unsupported_function("last")
5655
lookup = _unsupported_function("lookup")

databricks/koalas/tests/test_dataframe.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,63 @@ def test_dataframe(self):
8888
index_cols = pdf.columns[column_mask]
8989
self.assert_eq(kdf[index_cols], pdf[index_cols])
9090

91+
def test_insert(self):
    """Compare DataFrame.insert with pandas on flat and MultiIndex columns."""

    def check_matches_pandas(koalas_frame, pandas_frame):
        # Both frames are sorted by index first; values are compared approximately.
        self.assert_eq(koalas_frame.sort_index(), pandas_frame.sort_index(), almost=True)

    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)

    # Scalar values: an int, then a float.
    for position, label, scalar in [(1, "b", 10), (2, "c", 0.1)]:
        kdf.insert(position, label, scalar)
        pdf.insert(position, label, scalar)
        check_matches_pandas(kdf, pdf)

    # A Series derived from the frame itself.
    kdf.insert(3, "d", kdf.b + 1)
    pdf.insert(3, "d", pdf.b + 1)
    check_matches_pandas(kdf, pdf)

    kser = ks.Series([4, 5, 6])
    # A Series that does not come from `kdf` is rejected here
    # (presumably because operations on different frames are off by default).
    with self.assertRaises(ValueError):
        kdf.insert(0, "y", kser)
    with self.assertRaisesRegex(ValueError, "cannot insert b, already exists"):
        kdf.insert(1, "b", 10)
    with self.assertRaisesRegex(
        ValueError,
        '"column" should be a scalar value or tuple that contains scalar values',
    ):
        kdf.insert(0, list("abc"), kser)
    with self.assertRaises(ValueError):
        kdf.insert(0, "e", [7, 8, 9, 10])
    with self.assertRaises(ValueError):
        kdf.insert(0, "f", ks.Series([7, 8]))
    # Out-of-range `loc` and allow_duplicates=True both fail with AssertionError.
    with self.assertRaises(AssertionError):
        kdf.insert(100, "y", kser)
    with self.assertRaises(AssertionError):
        kdf.insert(1, "y", kser, allow_duplicates=True)

    #
    # DataFrame with MultiIndex as columns
    #
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)

    for position, label, scalar in [(1, "b", 10), (2, "c", 0.1)]:
        kdf.insert(position, label, scalar)
        pdf.insert(position, label, scalar)
        check_matches_pandas(kdf, pdf)

    kdf.insert(3, "d", kdf.b + 1)
    pdf.insert(3, "d", pdf.b + 1)
    check_matches_pandas(kdf, pdf)

    with self.assertRaisesRegex(ValueError, "cannot insert d, already exists"):
        kdf.insert(4, "d", 11)
    # A tuple label must have one entry per column level.
    with self.assertRaisesRegex(
        ValueError,
        '"column" must have length equal to number of column levels.',
    ):
        kdf.insert(4, ("e",), 11)
147+
91148
def test_inplace(self):
92149
pdf, kdf = self.df_pair
93150

databricks/koalas/tests/test_ops_on_diff_frames.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,48 @@ def test_combine_first(self):
477477
kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index()
478478
)
479479

480+
def test_insert(self):
    """Check DataFrame.insert with a Series built separately from the target frame.

    Each scenario performs the same insert on a Koalas frame and on its pandas
    counterpart, then compares the two after sorting by index.
    """
    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Index different from inserting Series'
    #
    pdf = pd.DataFrame([1, 2, 3], index=[10, 20, 30])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Multi-index columns
    #
    # The frame is built once here; the earlier two-level frame that was
    # created and overwritten before being used has been dropped.
    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)

    # A plain string label on three-level columns.
    kdf.insert(0, "a", kser)
    pdf.insert(0, "a", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    # A full tuple label with one entry per column level.
    kdf.insert(0, ("b", "c", ""), kser)
    pdf.insert(0, ("b", "c", ""), pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())
480522
def test_compare(self):
481523
if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
482524
pser1 = pd.Series(["b", "c", np.nan, "g", np.nan])

docs/source/reference/frame.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ Combining / joining / merging
227227
DataFrame.merge
228228
DataFrame.join
229229
DataFrame.update
230+
DataFrame.insert
230231

231232
Time series-related
232233
-------------------

0 commit comments

Comments
 (0)