Implement DataFrame.insert (#1983)

xinrong-meng · web-flow · commit 8803344d6202 · 2021-01-20T13:50:53.000-08:00
ref #1929

Insert column into DataFrame at a specified location.

```
>>> kdf = ks.DataFrame([1, 2, 3])
>>> kdf.insert(0, 'x', 4)
>>> kdf.sort_index()
   x  0
0  4  1
1  4  2
2  4  3

>>> from databricks.koalas.config import set_option, reset_option
>>> set_option("compute.ops_on_diff_frames", True)

>>> kdf.insert(1, 'y', [5, 6, 7])
>>> kdf.sort_index()
   x  y  0
0  4  5  1
1  4  6  2
2  4  7  3

>>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
>>> kdf.sort_index()
   x  y   z  0
0  4  5   8  1
1  4  6   9  2
2  4  7  10  3

>>> reset_option("compute.ops_on_diff_frames")
```
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -113,6 +113,7 @@
     spark_type_to_pandas_dtype,
     DataFrameType,
     SeriesType,
+    Scalar,
 )
 from databricks.koalas.plot import KoalasPlotAccessor
 
@@ -3711,6 +3712,87 @@ def notnull(self) -> "DataFrame":
 
     notna = notnull
 
+    def insert(
+        self,
+        loc: int,
+        column,
+        value: Union[Scalar, "Series", Iterable],
+        allow_duplicates: bool = False,
+    ) -> None:
+        """
+        Insert column into DataFrame at specified location.
+
+        The DataFrame is modified in place and nothing is returned.
+
+        Parameters
+        ----------
+        loc : int
+            Insertion index. Must verify 0 <= loc <= len(columns).
+        column : str, number, or hashable object
+            Label of the inserted column. A tuple label is only accepted when the
+            columns are a MultiIndex, and must then have one entry per column level.
+        value : int, Series, or array-like
+            Values of the inserted column.
+        allow_duplicates : bool, optional
+            Not supported yet; must be left as False.
+
+        Raises
+        ------
+        TypeError
+            If `loc` is not an int.
+        AssertionError
+            If `loc` is out of bounds, or if `allow_duplicates` is not False.
+        ValueError
+            If `column` is not a valid label, is a tuple whose length differs from
+            the number of column levels, or is already contained in the DataFrame.
+        NotImplementedError
+            If `column` is a tuple while the columns have a single level.
+
+        Examples
+        --------
+        >>> kdf = ks.DataFrame([1, 2, 3])
+        >>> kdf.sort_index()
+           0
+        0  1
+        1  2
+        2  3
+        >>> kdf.insert(0, 'x', 4)
+        >>> kdf.sort_index()
+           x  0
+        0  4  1
+        1  4  2
+        2  4  3
+
+        >>> from databricks.koalas.config import set_option, reset_option
+        >>> set_option("compute.ops_on_diff_frames", True)
+
+        >>> kdf.insert(1, 'y', [5, 6, 7])
+        >>> kdf.sort_index()
+           x  y  0
+        0  4  5  1
+        1  4  6  2
+        2  4  7  3
+
+        >>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
+        >>> kdf.sort_index()
+           x  y   z  0
+        0  4  5   8  1
+        1  4  6   9  2
+        2  4  7  10  3
+
+        >>> reset_option("compute.ops_on_diff_frames")
+        """
+        if not isinstance(loc, int):
+            raise TypeError("loc must be int")
+
+        current_columns = self.columns
+
+        # Kept as asserts: existing callers expect AssertionError for these two checks.
+        # NOTE: asserts are skipped entirely when Python runs with -O.
+        assert 0 <= loc <= len(current_columns)
+        assert allow_duplicates is False
+
+        if not is_name_like_value(column):
+            raise ValueError(
+                '"column" should be a scalar value or tuple that contains scalar values'
+            )
+
+        if is_name_like_tuple(column):
+            nlevels = current_columns.nlevels
+            if nlevels == 1:
+                # Single-level columns have no `levels` to compare against, so fail
+                # explicitly instead of crashing on the attribute lookup.
+                raise NotImplementedError(
+                    "Assigning column name as tuple is only supported for MultiIndex columns "
+                    "for now."
+                )
+            if len(column) != nlevels:
+                # To be consistent with pandas
+                raise ValueError('"column" must have length equal to number of column levels.')
+
+        if column in current_columns:
+            # str() keeps %-formatting from unpacking a tuple label into several arguments.
+            raise ValueError("cannot insert %s, already exists" % str(column))
+
+        # Append the new column to a copy, then move the last label to position `loc`.
+        kdf = self.copy()
+        kdf[column] = value
+        columns = kdf.columns[:-1].insert(loc, kdf.columns[-1])
+        kdf = kdf[columns]
+        self._update_internal_frame(kdf._internal)
+
     # TODO: add frep and axis parameter
     def shift(self, periods=1, fill_value=None) -> "DataFrame":
         """
diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -50,7 +50,6 @@ class _MissingPandasLikeDataFrame(object):
     ewm = _unsupported_function("ewm")
     first = _unsupported_function("first")
     infer_objects = _unsupported_function("infer_objects")
-    insert = _unsupported_function("insert")
     interpolate = _unsupported_function("interpolate")
     last = _unsupported_function("last")
     lookup = _unsupported_function("lookup")
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -88,6 +88,63 @@ def test_dataframe(self):
         index_cols = pdf.columns[column_mask]
         self.assert_eq(kdf[index_cols], pdf[index_cols])
 
+    def test_insert(self):
+        # Compares DataFrame.insert on a Koalas frame with the same calls on pandas,
+        # then checks the errors raised for invalid arguments.
+
+        def insert_and_compare(kdf, pdf):
+            # Two scalar inserts followed by a column derived from the inserted "b";
+            # both frames are mutated in place and compared after every step.
+            for loc, label, scalar in [(1, "b", 10), (2, "c", 0.1)]:
+                kdf.insert(loc, label, scalar)
+                pdf.insert(loc, label, scalar)
+                self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)
+            kdf.insert(3, "d", kdf.b + 1)
+            pdf.insert(3, "d", pdf.b + 1)
+            self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)
+
+        # Basic DataFrame
+        pdf = pd.DataFrame([1, 2, 3])
+        kdf = ks.from_pandas(pdf)
+        insert_and_compare(kdf, pdf)
+
+        kser = ks.Series([4, 5, 6])
+        with self.assertRaises(ValueError):
+            kdf.insert(0, "y", kser)
+        with self.assertRaisesRegex(ValueError, "cannot insert b, already exists"):
+            kdf.insert(1, "b", 10)
+        with self.assertRaisesRegex(
+            ValueError,
+            '"column" should be a scalar value or tuple that contains scalar values',
+        ):
+            kdf.insert(0, list("abc"), kser)
+        with self.assertRaises(ValueError):
+            kdf.insert(0, "e", [7, 8, 9, 10])
+        with self.assertRaises(ValueError):
+            kdf.insert(0, "f", ks.Series([7, 8]))
+        with self.assertRaises(AssertionError):
+            kdf.insert(100, "y", kser)
+        with self.assertRaises(AssertionError):
+            kdf.insert(1, "y", kser, allow_duplicates=True)
+
+        # DataFrame with MultiIndex as columns
+        pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
+        kdf = ks.from_pandas(pdf)
+        insert_and_compare(kdf, pdf)
+
+        with self.assertRaisesRegex(ValueError, "cannot insert d, already exists"):
+            kdf.insert(4, "d", 11)
+        with self.assertRaisesRegex(
+            ValueError,
+            '"column" must have length equal to number of column levels.',
+        ):
+            kdf.insert(4, ("e",), 11)
+
     def test_inplace(self):
         pdf, kdf = self.df_pair
 
diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py
@@ -477,6 +477,48 @@ def test_combine_first(self):
             kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index()
         )
 
+    def test_insert(self):
+        # Inserts a Series that belongs to a different frame and compares the
+        # result with the same pandas calls.
+
+        #
+        # Basic DataFrame
+        #
+        pdf = pd.DataFrame([1, 2, 3])
+        kdf = ks.from_pandas(pdf)
+
+        pser = pd.Series([4, 5, 6])
+        kser = ks.from_pandas(pser)
+        kdf.insert(1, "y", kser)
+        pdf.insert(1, "y", pser)
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
+        #
+        # DataFrame whose index differs from the index of the inserted Series
+        #
+        pdf = pd.DataFrame([1, 2, 3], index=[10, 20, 30])
+        kdf = ks.from_pandas(pdf)
+
+        pser = pd.Series([4, 5, 6])
+        kser = ks.from_pandas(pser)
+        kdf.insert(1, "y", kser)
+        pdf.insert(1, "y", pser)
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
+        #
+        # DataFrame with three-level MultiIndex columns
+        #
+        pser = pd.Series([4, 5, 6])
+        kser = ks.from_pandas(pser)
+        pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
+        kdf = ks.from_pandas(pdf)
+        # Scalar label first, then a full tuple label with one entry per level.
+        kdf.insert(0, "a", kser)
+        pdf.insert(0, "a", pser)
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+        kdf.insert(0, ("b", "c", ""), kser)
+        pdf.insert(0, ("b", "c", ""), pser)
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
     def test_compare(self):
         if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
             pser1 = pd.Series(["b", "c", np.nan, "g", np.nan])
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -227,6 +227,7 @@ Combining / joining / merging
    DataFrame.merge
    DataFrame.join
    DataFrame.update
+   DataFrame.insert
 
 Time series-related
 -------------------