Skip to content

Commit 630b2cb

Browse files
committed
Merge remote-tracking branch 'upstream/master' into merge_2_series
* upstream/master:
  Updated the koalas logo in readme.md
  Adding koalas-logo without label
  Adding Koalas logo to readme
  Adding koalas logo
  Clean pandas usage in frame.agg (databricks#821)
  Implement Series.aggregate and agg (databricks#816)
  Raise a more helpful error for duplicated columns in Join (databricks#820)
2 parents 377847e + baffe5d commit 630b2cb

File tree

9 files changed

+75
-9
lines changed

9 files changed

+75
-9
lines changed

Koalas-logo-nolabel.png

44.7 KB
Loading

Koalas-logo.png

65.7 KB
Loading

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
<img src="https://github.com/databricks/koalas/blob/master/Koalas-logo-nolabel.png" width="160"><br>
22

33
# Koalas: pandas API on Apache Spark <!-- omit in toc -->
44

databricks/koalas/frame.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,6 @@ def applymap(self, func):
871871
column_index=[c._internal.column_index[0] for c in applied])
872872
return DataFrame(internal)
873873

874-
# TODO: Series support is not implemented yet.
875874
# TODO: not all arguments are implemented comparing to Pandas' for now.
876875
def aggregate(self, func: Union[List[str], Dict[str, List[str]]]):
877876
"""Aggregate using one or more operations over the specified axis.
@@ -960,11 +959,8 @@ def aggregate(self, func: Union[List[str], Dict[str, List[str]]]):
960959
# sum 12.0 NaN
961960
#
962961
# Aggregated output is usually pretty much small. So it is fine to directly use pandas API.
963-
pdf = kdf.to_pandas().transpose().reset_index()
964-
pdf = pdf.groupby(['level_1']).apply(
965-
lambda gpdf: gpdf.drop('level_1', 1).set_index('level_0').transpose()
966-
).reset_index(level=1)
967-
pdf = pdf.drop(columns='level_1')
962+
pdf = kdf.to_pandas().stack()
963+
pdf.index = pdf.index.droplevel()
968964
pdf.columns.names = [None]
969965
pdf.index.names = [None]
970966

@@ -5436,6 +5432,14 @@ def join(self, right: 'DataFrame', on: Optional[Union[str, List[str]]] = None,
54365432
2 K1 A1 B1
54375433
3 K2 A2 B2
54385434
"""
5435+
if isinstance(right, ks.Series):
5436+
common = list(self.columns.intersection([right.name]))
5437+
else:
5438+
common = list(self.columns.intersection(right.columns))
5439+
if len(common) > 0 and not lsuffix and not rsuffix:
5440+
raise ValueError(
5441+
"columns overlap but no suffix specified: "
5442+
"{rename}".format(rename=common))
54395443
if on:
54405444
self = self.set_index(on)
54415445
join_kdf = self.merge(right, left_index=True, right_index=True, how=how,

databricks/koalas/missing/series.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,6 @@ class _MissingPandasLikeSeries(object):
4848
flags = unsupported_property('flags', deprecated=True)
4949

5050
# Functions
51-
agg = unsupported_function('agg')
52-
aggregate = unsupported_function('aggregate')
5351
align = unsupported_function('align')
5452
argsort = unsupported_function('argsort')
5553
asfreq = unsupported_function('asfreq')

databricks/koalas/series.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2294,6 +2294,51 @@ def apply(self, func, args=(), **kwds):
22942294
wrapped = ks.pandas_wraps(return_col=return_sig)(apply_each)
22952295
return wrapped(self, *args, **kwds).rename(self.name)
22962296

2297+
# TODO: not all arguments are implemented comparing to Pandas' for now.
2298+
def aggregate(self, func: Union[str, List[str]]):
2299+
"""Aggregate using one or more operations over the specified axis.
2300+
2301+
Parameters
2302+
----------
2303+
func : str or a list of str
2304+
function name(s) as string apply to series.
2305+
2306+
Returns
2307+
-------
2308+
scalar, Series
2309+
The return can be:
2310+
- scalar : when Series.agg is called with single function
2311+
- Series : when Series.agg is called with several functions
2312+
2313+
Notes
2314+
-----
2315+
`agg` is an alias for `aggregate`. Use the alias.
2316+
2317+
See Also
2318+
--------
2319+
databricks.koalas.Series.apply
2320+
databricks.koalas.Series.transform
2321+
2322+
Examples
2323+
--------
2324+
>>> s = ks.Series([1, 2, 3, 4])
2325+
>>> s.agg('min')
2326+
1
2327+
2328+
>>> s.agg(['min', 'max'])
2329+
max 4
2330+
min 1
2331+
Name: 0, dtype: int64
2332+
"""
2333+
if isinstance(func, list):
2334+
return self.to_frame().agg(func)[self.name]
2335+
elif isinstance(func, str):
2336+
return getattr(self, func)()
2337+
else:
2338+
raise ValueError("func must be a string or list of strings")
2339+
2340+
agg = aggregate
2341+
22972342
def transpose(self, *args, **kwargs):
22982343
"""
22992344
Return the transpose, which is by definition self.

databricks/koalas/tests/test_dataframe.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,6 +1020,7 @@ def test_join(self):
10201020
'A': ['A0', 'A1', 'A2', 'A3']}, columns=['key', 'A'])
10211021
kdf2 = ks.DataFrame({'key': ['K0', 'K1', 'K2'],
10221022
'B': ['B0', 'B1', 'B2']}, columns=['key', 'B'])
1023+
ks1 = ks.Series(['A1', 'A5'], index=[1, 2], name='A')
10231024
join_pdf = pdf1.join(pdf2, lsuffix='_left', rsuffix='_right')
10241025
join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
10251026

@@ -1028,6 +1029,11 @@ def test_join(self):
10281029

10291030
self.assert_eq(join_pdf, join_kdf)
10301031

1032+
# join with duplicated columns in Series and DataFrame
1033+
with self.assertRaisesRegex(ValueError,
1034+
"columns overlap but no suffix specified"):
1035+
kdf1.join(ks1, how='outer')
1036+
kdf1.join(kdf2, how='outer')
10311037
# check `on` parameter
10321038
join_pdf = pdf1.join(pdf2.set_index('key'), on='key', lsuffix='_left', rsuffix='_right')
10331039
join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)

databricks/koalas/tests/test_series.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,3 +647,14 @@ def test_astype(self):
647647
kser = koalas.Series(pser)
648648
with self.assertRaisesRegex(ValueError, 'Type int63 not understood'):
649649
kser.astype('int63')
650+
651+
def test_aggregate(self):
652+
pser = pd.Series([10, 20, 15, 30, 45], name='x')
653+
kser = koalas.Series(pser)
654+
msg = 'func must be a string or list of strings'
655+
with self.assertRaisesRegex(ValueError, msg):
656+
kser.aggregate({'x': ['min', 'max']})
657+
msg = ('If the given function is a list, it '
658+
'should only contains function names as strings.')
659+
with self.assertRaisesRegex(ValueError, msg):
660+
kser.aggregate(['min', max])

docs/source/reference/series.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ Function application, GroupBy & Window
8585
:toctree: api/
8686

8787
Series.apply
88+
Series.agg
89+
Series.aggregate
8890
Series.map
8991
Series.groupby
9092
Series.pipe

0 commit comments

Comments
 (0)