
Commit ad958b8

Exclude Index columns for exposed Spark DataFrame and disallow Koalas DataFrame with no index
1 parent c87a849 commit ad958b8

14 files changed, +173 -138 lines changed

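The user-visible effect of this commit is easiest to see through the to_spark/to_koalas round trip. A minimal sketch of the new behavior, assuming databricks.koalas with this commit applied and a running Spark session (outputs taken from the doctest changes below):

    import databricks.koalas as ks

    kdf = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

    # Before this commit the internal index column leaked into the Spark schema:
    #   DataFrame[__index_level_0__: bigint, col1: bigint, col2: bigint]
    # Now index columns are excluded from the exposed Spark DataFrame:
    sdf = kdf.to_spark()
    print(sdf)  # DataFrame[col1: bigint, col2: bigint]

    # Converting back attaches a fresh default index instead of exposing
    # __index_level_0__ as data, so a Koalas DataFrame always has an index.
    print(sdf.to_koalas())
    #    col1  col2
    # 0     1     3
    # 1     2     4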

databricks/koalas/base.py

Lines changed: 0 additions & 9 deletions
@@ -304,9 +304,6 @@ def is_monotonic(self):
         >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic
         True
         """
-        if len(self._kdf._internal.index_columns) == 0:
-            raise ValueError("Index must be set.")
-
         col = self._scol
         window = Window.orderBy(self._kdf._internal.index_scols).rowsBetween(-1, -1)
         sdf = self._kdf._sdf.withColumn(
@@ -356,9 +353,6 @@ def is_monotonic_decreasing(self):
         >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic_decreasing
         True
         """
-        if len(self._kdf._internal.index_columns) == 0:
-            raise ValueError("Index must be set.")
-
         col = self._scol
         window = Window.orderBy(self._kdf._internal.index_scols).rowsBetween(-1, -1)
         sdf = self._kdf._sdf.withColumn(
@@ -705,9 +699,6 @@ def shift(self, periods=1, fill_value=None):
         >>> df.index.shift(periods=3, fill_value=0)
         Int64Index([0, 0, 0, 0, 1], dtype='int64')
         """
-        if len(self._internal.index_columns) == 0:
-            raise ValueError("Index must be set.")
-
         if not isinstance(periods, int):
             raise ValueError('periods should be an int; however, got [%s]' % type(periods))
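The removed guards are now dead code: since a Koalas DataFrame with no index is disallowed, index_columns can no longer be empty. A minimal sketch, assuming this commit is applied:

    import databricks.koalas as ks

    kdf = ks.DataFrame({'a': [1, 2, 3]})

    # No explicit set_index is needed; the default index always exists, so
    # this no longer raises ValueError("Index must be set.").
    print(kdf.index.is_monotonic)  # True: the default index is 0, 1, 2, ...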

databricks/koalas/frame.py

Lines changed: 40 additions & 30 deletions
@@ -1598,9 +1598,7 @@ def index(self):
         Index
         """
         from databricks.koalas.indexes import Index, MultiIndex
-        if len(self._internal.index_map) == 0:
-            return None
-        elif len(self._internal.index_map) == 1:
+        if len(self._internal.index_map) == 1:
             return Index(self)
         else:
             return MultiIndex(self)
@@ -1860,9 +1858,6 @@ class max type
         lion    mammal   80.5     run
         monkey  mammal    NaN    jump
         """
-        if len(self._internal.index_map) == 0:
-            raise NotImplementedError('Can\'t reset index because there is no index.')
-
         multi_index = len(self._internal.index_map) > 1
 
         def rename(index):
@@ -1877,7 +1872,15 @@ def rename(index):
         if level is None:
             new_index_map = [(column, name if name is not None else rename(i))
                              for i, (column, name) in enumerate(self._internal.index_map)]
-            index_map = []
+            new_data_columns = [
+                self._internal.scol_for(column).alias(name) for column, name in new_index_map]
+            sdf = self._sdf.select(new_data_columns + self._internal.data_columns)
+
+            # Now, the new internal Spark columns are named the same as the index names.
+            new_index_map = [(name, name) for column, name in new_index_map]
+
+            index_map = [('__index_level_0__', None)]
+            sdf = _InternalFrame.attach_default_index(sdf)
         else:
             if isinstance(level, (int, str)):
                 level = [level]
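_InternalFrame.attach_default_index itself is not shown in this diff; conceptually it generates a sequential __index_level_0__ column for a Spark DataFrame that has no index columns. A hypothetical simplification in plain PySpark, not the library's actual implementation:

    from pyspark.sql import Window, functions as F

    def attach_default_index_sketch(sdf):
        # Number rows 0, 1, 2, ... in an arbitrary but stable order and
        # expose the result as the internal index column.
        seq = F.row_number().over(
            Window.orderBy(F.monotonically_increasing_id())) - 1
        return sdf.select(seq.alias('__index_level_0__'), *sdf.columns)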
@@ -1915,10 +1918,13 @@ def rename(index):
                     index_name if index_name is not None else rename(index_name)))
                 index_map.remove(info)
 
+            sdf = self._sdf
+
         if drop:
             new_index_map = []
 
         internal = self._internal.copy(
+            sdf=sdf,
             data_columns=[column for column, _ in new_index_map] + self._internal.data_columns,
             index_map=index_map,
             column_index=None)
@@ -2382,13 +2388,13 @@ def to_koalas(self):
 
         >>> spark_df = df.to_spark()
         >>> spark_df
-        DataFrame[__index_level_0__: bigint, col1: bigint, col2: bigint]
+        DataFrame[col1: bigint, col2: bigint]
 
         >>> kdf = spark_df.to_koalas()
         >>> kdf
-           __index_level_0__  col1  col2
-        0                  0     1     3
-        1                  1     2     4
+           col1  col2
+        0     1     3
+        1     2     4
 
         Calling to_koalas on a Koalas DataFrame simply returns itself.
@@ -2493,8 +2499,8 @@ def to_table(self, name: str, format: Optional[str] = None, mode: str = 'error',
 
         >>> df.to_table('%s.my_table' % db, partition_cols='date')
         """
-        self._sdf.write.saveAsTable(name=name, format=format, mode=mode,
-                                    partitionBy=partition_cols, **options)
+        self.to_spark().write.saveAsTable(name=name, format=format, mode=mode,
+                                          partitionBy=partition_cols, **options)
 
     def to_delta(self, path: str, mode: str = 'error',
                  partition_cols: Union[str, List[str], None] = None, **options):
@@ -2604,8 +2610,8 @@ def to_parquet(self, path: str, mode: str = 'error',
         ...     mode = 'overwrite',
         ...     partition_cols=['date', 'country'])
         """
-        self._sdf.write.parquet(path=path, mode=mode, partitionBy=partition_cols,
-                                compression=compression)
+        self.to_spark().write.parquet(
+            path=path, mode=mode, partitionBy=partition_cols, compression=compression)
 
     def to_spark_io(self, path: Optional[str] = None, format: Optional[str] = None,
                     mode: str = 'error', partition_cols: Union[str, List[str], None] = None,
@@ -2657,13 +2663,16 @@ def to_spark_io(self, path: Optional[str] = None, format: Optional[str] = None,
 
         >>> df.to_spark_io(path='%s/to_spark_io/foo.json' % path, format='json')
         """
-        self._sdf.write.save(path=path, format=format, mode=mode, partitionBy=partition_cols,
-                             **options)
+        self.to_spark().write.save(
+            path=path, format=format, mode=mode, partitionBy=partition_cols, **options)
 
     def to_spark(self):
         """
         Return the current DataFrame as a Spark DataFrame.
 
+        .. note:: Index information is lost. So, if the index columns are not present in
+            the actual columns, they are lost.
+
         See Also
         --------
         DataFrame.to_koalas
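Because to_spark() now drops index information, callers who still need the index must materialize it first, for example with reset_index. A hedged usage sketch (the reset column name follows pandas conventions):

    import databricks.koalas as ks

    kdf = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

    kdf.to_spark()                # DataFrame[col1: bigint, col2: bigint]
    kdf.reset_index().to_spark()  # e.g. DataFrame[index: bigint, col1: bigint, col2: bigint]

The same applies to to_table, to_parquet, and to_spark_io above: they now write through to_spark(), so persisted data no longer includes the internal index column.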
@@ -3653,14 +3662,21 @@ def pivot_table(self, values=None, index=None, columns=None,
             sdf = sdf.fillna(fill_value)
 
         if index is not None:
-            return DataFrame(sdf).set_index(index)
+            data_columns = [column for column in sdf.columns if column not in index]
+            index_map = [(column, column) for column in index]
+            internal = _InternalFrame(sdf=sdf, data_columns=data_columns, index_map=index_map)
+            return DataFrame(internal)
         else:
             if isinstance(values, list):
                 index_values = values[-1]
             else:
                 index_values = values
 
-            return DataFrame(sdf.withColumn(columns, F.lit(index_values))).set_index(columns)
+            sdf = sdf.withColumn(columns, F.lit(index_values))
+            data_columns = [column for column in sdf.columns if column not in columns]
+            index_map = [(column, column) for column in columns]
+            internal = _InternalFrame(sdf=sdf, data_columns=data_columns, index_map=index_map)
+            return DataFrame(internal)
 
     def pivot(self, index=None, columns=None, values=None):
         """
@@ -4364,9 +4380,6 @@ def sort_index(self, axis: int = 0,
         a 1  2  1
         b 1  0  3
         """
-        if len(self._internal.index_map) == 0:
-            raise ValueError("Index should be set.")
-
         if axis != 0:
             raise ValueError("No other axes than 0 are supported at the moment")
         if kind is not None:
@@ -4959,12 +4972,12 @@ def join(self, right: 'DataFrame', on: Optional[Union[str, List[str]]] = None,
         original DataFrame’s index in the result.
 
         >>> join_kdf = kdf1.join(kdf2.set_index('key'), on='key')
-        >>> join_kdf.sort_values(by=join_kdf.columns)
+        >>> join_kdf.sort_index()
           key   A     B
-        0  K0  A0    B0
-        1  K1  A1    B1
-        2  K2  A2    B2
-        3  K3  A3  None
+        0  K3  A3  None
+        1  K0  A0    B0
+        2  K1  A1    B1
+        3  K2  A2    B2
         """
         if on:
             self = self.set_index(on)
@@ -5543,9 +5556,6 @@ def _cum(self, func, skipna: bool):
         elif func.__name__ == "cumprod":
             func = "cumprod"
 
-        if len(self._internal.index_columns) == 0:
-            raise ValueError("Index must be set.")
-
         applied = []
         for column in self.columns:
             applied.append(getattr(self[column], func)(skipna))
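With the no-index case gone, the cumulative operators can rely on the index unconditionally. A small sketch, assuming this commit:

    import databricks.koalas as ks

    kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

    # cumsum orders rows by the (default) index; the former
    # ValueError("Index must be set.") guard is no longer reachable.
    print(kdf.cumsum())
    #    a   b
    # 0  1   4
    # 1  3   9
    # 2  6  15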

databricks/koalas/groupby.py

Lines changed: 43 additions & 36 deletions
@@ -682,7 +682,6 @@ def cumsum(self):
         """
         return self._cum(F.sum)
 
-    # TODO: Series support is not implemented yet.
     def apply(self, func):
         """
         Apply function `func` group-wise and combine the results together.
@@ -797,7 +796,31 @@ def apply(self, func):
             return_schema = None  # schema will be inferred.
         else:
             return_schema = _infer_return_type(func).tpe
-        return self._apply(func, return_schema, retain_index=return_schema is None)
+
+        should_infer_schema = return_schema is None
+        input_groupnames = [s.name for s in self._groupkeys]
+
+        if should_infer_schema:
+            # Here we execute with the first 1000 to get the return type.
+            # If the records were less than 1000, it uses pandas API directly for a shortcut.
+            limit = 1000
+            pdf = self._kdf.head(limit + 1).to_pandas()
+            pdf = pdf.groupby(input_groupnames).apply(func)
+            kdf = DataFrame(pdf)
+            return_schema = kdf._sdf.schema
+            if len(pdf) <= limit:
+                return kdf
+
+        sdf = self._spark_group_map_apply(
+            func, return_schema, retain_index=should_infer_schema)
+
+        if should_infer_schema:
+            # If schema is inferred, we can restore indexes too.
+            internal = kdf._internal.copy(sdf=sdf)
+        else:
+            # Otherwise, it loses index.
+            internal = _InternalFrame(sdf=sdf)
+        return DataFrame(internal)
 
     # TODO: implement 'dropna' parameter
     def filter(self, func):
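The logic inlined above chooses between two paths: without a return-type annotation, the schema is inferred by running func on up to the first 1000 rows with pandas (so the index can be restored), while an explicit annotation goes straight to the Spark path and loses the index. A sketch of the inferred path, assuming this commit:

    import databricks.koalas as ks

    kdf = ks.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})

    # The lambda has no return-type annotation, so Koalas infers the result
    # schema from a pandas dry run; inputs this small short-circuit to the
    # pandas result, keeping a proper index.
    applied = kdf.groupby('A').apply(lambda pdf: pdf[['B']] + 1)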
@@ -843,24 +866,11 @@ def filter(self, func):
         def pandas_filter(pdf):
             return pdf.groupby(groupby_names).filter(func)
 
-        kdf = self._apply(pandas_filter, data_schema, retain_index=True)
-        return DataFrame(self._kdf._internal.copy(sdf=kdf._sdf))
-
-    def _apply(self, func, return_schema, retain_index):
-        should_infer_schema = return_schema is None
-        input_groupnames = [s.name for s in self._groupkeys]
-
-        if should_infer_schema:
-            # Here we execute with the first 1000 to get the return type.
-            # If the records were less than 1000, it uses pandas API directly for a shortcut.
-            limit = 1000
-            pdf = self._kdf.head(limit + 1).to_pandas()
-            pdf = pdf.groupby(input_groupnames).apply(func)
-            kdf = DataFrame(pdf)
-            return_schema = kdf._sdf.schema
-            if len(pdf) <= limit:
-                return kdf
+        sdf = self._spark_group_map_apply(
+            pandas_filter, data_schema, retain_index=True)
+        return DataFrame(self._kdf._internal.copy(sdf=sdf))
 
+    def _spark_group_map_apply(self, func, return_schema, retain_index):
         index_columns = self._kdf._internal.index_columns
         index_names = self._kdf._internal.index_names
         data_columns = self._kdf._internal.data_columns
@@ -934,14 +944,7 @@ def rename_output(pdf):
         input_groupkeys = [s._scol for s in self._groupkeys]
         sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
 
-        if should_infer_schema:
-            # If schema is inferred, we can restore indexes too.
-            internal = kdf._internal.copy(sdf=sdf)
-        else:
-            # Otherwise, it loses index.
-            internal = _InternalFrame(
-                sdf=sdf, data_columns=return_schema.fieldNames(), index_map=[])
-        return DataFrame(internal)
+        return sdf
 
     def rank(self, method='average', ascending=True):
         """
@@ -1007,7 +1010,6 @@ def rank(self, method='average', ascending=True):
         """
         return self._rank(method, ascending)
 
-    # TODO: Series support is not implemented yet.
     def transform(self, func):
         """
         Apply function column-by-column to the GroupBy object.
@@ -1117,7 +1119,9 @@ def pandas_transform(pdf):
             pdf = pdf.drop(columns=input_groupnames)
             return pdf.transform(func)
 
-        if return_sig is None:
+        should_infer_schema = return_sig is None
+
+        if should_infer_schema:
             # Here we execute with the first 1000 to get the return type.
             # If the records were less than 1000, it uses pandas API directly for a shortcut.
             limit = 1000
@@ -1128,16 +1132,22 @@ def pandas_transform(pdf):
             if len(pdf) <= limit:
                 return pdf
 
-            applied_kdf = self._apply(pandas_transform, return_schema, retain_index=True)
-            # kdf inferred from pdf holds a correct index.
-            return DataFrame(kdf._internal.copy(sdf=applied_kdf._sdf))
+            sdf = self._spark_group_map_apply(
+                pandas_transform, return_schema, retain_index=True)
+            # If schema is inferred, we can restore indexes too.
+            internal = kdf._internal.copy(sdf=sdf)
         else:
             return_type = _infer_return_type(func).tpe
             data_columns = self._kdf._internal.data_columns
             return_schema = StructType([
                 StructField(c, return_type) for c in data_columns if c not in input_groupnames])
 
-            return self._apply(pandas_transform, return_schema, retain_index=False)
+            sdf = self._spark_group_map_apply(
+                pandas_transform, return_schema, retain_index=False)
+            # Otherwise, it loses index.
+            internal = _InternalFrame(sdf=sdf)
+
+        return DataFrame(internal)
 
     def nunique(self, dropna=True):
         """
@@ -1362,9 +1372,6 @@ def _cum(self, func):
         elif func.__name__ == "cumprod":
             func = "cumprod"
 
-        if len(self._kdf._internal.index_columns) == 0:
-            raise ValueError("Index must be set.")
-
         applied = []
         kdf = self._kdf
         groupkey_columns = set(s.name for s in self._groupkeys)