Merge remote-tracking branch 'upstream/master' into merge_2_series

thoo · thoo · commit 630b2cbdf5f1 · 2019-09-23T20:34:02.000-04:00
* upstream/master:
  - Updated the koalas logo in readme.md
  - Adding koalas-logo without label
  - Adding Koalas logo to readme
  - Adding koalas logo
  - Clean pandas usage in frame.agg (databricks#821)
  - Implement Series.aggregate and agg (databricks#816)
  - Raise a more helpful error for duplicated columns in Join (databricks#820)
diff --git a/Koalas-logo-nolabel.png b/Koalas-logo-nolabel.png
diff --git a/Koalas-logo.png b/Koalas-logo.png
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-
+<img src="https://github.com/databricks/koalas/blob/master/Koalas-logo-nolabel.png" width="160"><br>
 
 # Koalas: pandas API on Apache Spark <!-- omit in toc -->
 
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -871,7 +871,6 @@ def applymap(self, func):
                                        column_index=[c._internal.column_index[0] for c in applied])
         return DataFrame(internal)
 
-    # TODO: Series support is not implemented yet.
     # TODO: not all arguments are implemented comparing to Pandas' for now.
     def aggregate(self, func: Union[List[str], Dict[str, List[str]]]):
         """Aggregate using one or more operations over the specified axis.
@@ -960,11 +959,8 @@ def aggregate(self, func: Union[List[str], Dict[str, List[str]]]):
         #     sum  12.0  NaN
         #
         # Aggregated output is usually pretty much small. So it is fine to directly use pandas API.
-        pdf = kdf.to_pandas().transpose().reset_index()
-        pdf = pdf.groupby(['level_1']).apply(
-            lambda gpdf: gpdf.drop('level_1', 1).set_index('level_0').transpose()
-        ).reset_index(level=1)
-        pdf = pdf.drop(columns='level_1')
+        pdf = kdf.to_pandas().stack()
+        pdf.index = pdf.index.droplevel()
         pdf.columns.names = [None]
         pdf.index.names = [None]
 
@@ -5436,6 +5432,14 @@ def join(self, right: 'DataFrame', on: Optional[Union[str, List[str]]] = None,
         2  K1  A1    B1
         3  K2  A2    B2
         """
+        if isinstance(right, ks.Series):
+            common = list(self.columns.intersection([right.name]))
+        else:
+            common = list(self.columns.intersection(right.columns))
+        if len(common) > 0 and not lsuffix and not rsuffix:
+            raise ValueError(
+                "columns overlap but no suffix specified: "
+                "{rename}".format(rename=common))
         if on:
             self = self.set_index(on)
             join_kdf = self.merge(right, left_index=True, right_index=True, how=how,
diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -48,8 +48,6 @@ class _MissingPandasLikeSeries(object):
     flags = unsupported_property('flags', deprecated=True)
 
     # Functions
-    agg = unsupported_function('agg')
-    aggregate = unsupported_function('aggregate')
     align = unsupported_function('align')
     argsort = unsupported_function('argsort')
     asfreq = unsupported_function('asfreq')
diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -2294,6 +2294,51 @@ def apply(self, func, args=(), **kwds):
         wrapped = ks.pandas_wraps(return_col=return_sig)(apply_each)
         return wrapped(self, *args, **kwds).rename(self.name)
 
+    # TODO: not all arguments are implemented comparing to Pandas' for now.
+    def aggregate(self, func: Union[str, List[str]]):
+        """Aggregate using one or more operations over the specified axis.
+
+        Parameters
+        ----------
+        func : str or a list of str
+            function name(s) as string apply to series.
+
+        Returns
+        -------
+        scalar, Series
+            The return can be:
+            - scalar : when Series.agg is called with single function
+            - Series : when Series.agg is called with several functions
+
+        Notes
+        -----
+        `agg` is an alias for `aggregate`. Use the alias.
+
+        See Also
+        --------
+        databricks.koalas.Series.apply
+        databricks.koalas.Series.transform
+
+        Examples
+        --------
+        >>> s = ks.Series([1, 2, 3, 4])
+        >>> s.agg('min')
+        1
+
+        >>> s.agg(['min', 'max'])
+        max    4
+        min    1
+        Name: 0, dtype: int64
+        """
+        if isinstance(func, list):
+            return self.to_frame().agg(func)[self.name]
+        elif isinstance(func, str):
+            return getattr(self, func)()
+        else:
+            raise ValueError("func must be a string or list of strings")
+
+    agg = aggregate
+
     def transpose(self, *args, **kwargs):
         """
         Return the transpose, which is by definition self.
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -1020,6 +1020,7 @@ def test_join(self):
                              'A': ['A0', 'A1', 'A2', 'A3']}, columns=['key', 'A'])
         kdf2 = ks.DataFrame({'key': ['K0', 'K1', 'K2'],
                              'B': ['B0', 'B1', 'B2']}, columns=['key', 'B'])
+        ks1 = ks.Series(['A1', 'A5'], index=[1, 2], name='A')
         join_pdf = pdf1.join(pdf2, lsuffix='_left', rsuffix='_right')
         join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
 
@@ -1028,6 +1029,11 @@ def test_join(self):
 
         self.assert_eq(join_pdf, join_kdf)
 
+        # join with duplicated columns in Series and DataFrame
+        with self.assertRaisesRegex(ValueError,
+                                    "columns overlap but no suffix specified"):
+            kdf1.join(ks1, how='outer')
+            kdf1.join(kdf2, how='outer')
         # check `on` parameter
         join_pdf = pdf1.join(pdf2.set_index('key'), on='key', lsuffix='_left', rsuffix='_right')
         join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -647,3 +647,14 @@ def test_astype(self):
         kser = koalas.Series(pser)
         with self.assertRaisesRegex(ValueError, 'Type int63 not understood'):
             kser.astype('int63')
+
+    def test_aggregate(self):
+        pser = pd.Series([10, 20, 15, 30, 45], name='x')
+        kser = koalas.Series(pser)
+        msg = 'func must be a string or list of strings'
+        with self.assertRaisesRegex(ValueError, msg):
+            kser.aggregate({'x': ['min', 'max']})
+        msg = ('If the given function is a list, it '
+               'should only contains function names as strings.')
+        with self.assertRaisesRegex(ValueError, msg):
+            kser.aggregate(['min', max])
diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
@@ -85,6 +85,8 @@ Function application, GroupBy & Window
    :toctree: api/
 
    Series.apply
+   Series.agg
+   Series.aggregate
    Series.map
    Series.groupby
    Series.pipe

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,4 @@` |
| `1` | | `-` |
| | `1` | `+<img src="https://github.com/databricks/koalas/blob/master/Koalas-logo-nolabel.png" width="160"><br>` |
| `2` | `2` | |
| `3` | `3` | `# Koalas: pandas API on Apache Spark <!-- omit in toc -->` |
| `4` | `4` | |