2 changes: 1 addition & 1 deletion databricks/koalas/accessors.py
@@ -491,7 +491,7 @@ def transform_batch(self, func, *args, **kwargs):
0 3
1 5
2 7
- dtype: int32
+ dtype: int64

You can also omit the type hints so Koalas infers the return schema as below:

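The hunk above is a doctest-output fix only: with plain Python `int` now mapped to Spark's LongType, a function annotated `-> ks.Series[int]` yields an int64 result. A minimal sketch of the documented behavior, assuming a running Spark session (`plus_one` and the data are illustrative, not part of this diff; dtype repr only, since the full output depends on naming):

>>> import databricks.koalas as ks
>>> kdf = ks.DataFrame({"a": [1, 2, 3]})
>>> def plus_one(pdf) -> ks.Series[int]:  # plain ``int`` hint, not np.int32
...     return pdf.a + 1
>>> kdf.koalas.transform_batch(plus_one).dtype  # expected under this change
dtype('int64')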
8 changes: 4 additions & 4 deletions databricks/koalas/groupby.py
@@ -1019,7 +1019,7 @@ def apply(self, func, *args, **kwargs):
0 6
1 3
2 4
- Name: B, dtype: int32
+ Name: B, dtype: int64

>>> def plus_min(x):
... return x + x.min()
@@ -1036,7 +1036,7 @@ def apply(self, func, *args, **kwargs):
>>> df.B.groupby(df.A).apply(plus_length).sort_index()
0 1
1 2
- Name: B, dtype: int32
+ Name: B, dtype: int64

The extra arguments to the function can be passed as below.

@@ -1045,7 +1045,7 @@ def apply(self, func, *args, **kwargs):
>>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index()
0 51
1 52
- Name: B, dtype: int32
+ Name: B, dtype: int64
"""
from pandas.core.base import SelectionMixin

@@ -1993,7 +1993,7 @@ def transform(self, func, *args, **kwargs):
0 3
1 4
2 6
- Name: B, dtype: int32
+ Name: B, dtype: int64

>>> (df * -1).B.groupby(df.A).transform(abs)
0 1
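All four hunks in this file update doctest output only; the underlying change is that `infer_return_type` now resolves an `int` annotation to LongType rather than IntegerType. A hedged sketch using an explicitly annotated variant of the docstring's `plus_min` (dtype check only, assuming a Spark session):

>>> import databricks.koalas as ks
>>> df = ks.DataFrame({"A": [1, 1, 2], "B": [1, 2, 3]})
>>> def plus_min(x) -> ks.Series[int]:
...     return x + x.min()
>>> df.B.groupby(df.A).apply(plus_min).dtype  # expected under this change
dtype('int64')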
4 changes: 2 additions & 2 deletions databricks/koalas/series.py
@@ -3071,7 +3071,7 @@ def transform(self, func, axis=0, *args, **kwargs) -> Union["Series", DataFrame]
0 0.000000
1 1.000000
2 1.414214
- dtype: float32
+ dtype: float64

Even though the resulting instance must have the same length as the
input, it is possible to provide several input functions:
@@ -4793,7 +4793,7 @@ def repeat(self, repeats: Union[int, "Series"]) -> "Series":
)
else:
scol = F.explode(
- SF.array_repeat(self.spark.column, repeats.astype(int).spark.column)
+ SF.array_repeat(self.spark.column, repeats.astype("int32").spark.column)
).alias(name_like_string(self.name))
sdf = self._internal.spark_frame.select(self._internal.index_spark_columns + [scol])
internal = self._internal.copy(
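Unlike the docstring hunks, this one changes behavior: `repeats.astype(int)` now produces a LongType column, so the cast is pinned to "int32" (Spark IntegerType), presumably because `F.array_repeat` requires an integer count column. A minimal sketch of the distinction, assuming a Spark session (type reprs as printed by Spark 3.0-era PySpark):

>>> import databricks.koalas as ks
>>> repeats = ks.Series([2, 3])
>>> repeats.astype(int).spark.data_type  # LongType under this PR
LongType
>>> repeats.astype("int32").spark.data_type  # what the repeat count needs
IntegerType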
18 changes: 9 additions & 9 deletions databricks/koalas/strings.py
@@ -976,7 +976,7 @@ def count(self, pat, flags=0) -> "ks.Series":
3 2
4 2
5 0
- dtype: int32
+ dtype: int64
"""

def pandas_count(s) -> "ks.Series[int]":
@@ -1037,25 +1037,25 @@ def find(self, sub, start=0, end=None) -> "ks.Series":
0 0
1 2
2 1
- dtype: int32
+ dtype: int64

>>> s.str.find('a', start=2)
0 -1
1 2
2 3
- dtype: int32
+ dtype: int64

>>> s.str.find('a', end=1)
0 0
1 -1
2 -1
- dtype: int32
+ dtype: int64

>>> s.str.find('a', start=2, end=2)
0 -1
1 -1
2 -1
- dtype: int32
+ dtype: int64
"""

def pandas_find(s) -> "ks.Series[int]":
@@ -1614,25 +1614,25 @@ def rfind(self, sub, start=0, end=None) -> "ks.Series":
0 0
1 2
2 5
- dtype: int32
+ dtype: int64

>>> s.str.rfind('a', start=2)
0 -1
1 2
2 5
- dtype: int32
+ dtype: int64

>>> s.str.rfind('a', end=1)
0 0
1 -1
2 -1
- dtype: int32
+ dtype: int64

>>> s.str.rfind('a', start=2, end=2)
0 -1
1 -1
2 -1
- dtype: int32
+ dtype: int64
"""

def pandas_rfind(s) -> "ks.Series[int]":
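Every hunk in this file is a doctest dtype update traceable to the `ks.Series[int]` return annotations on the `pandas_count`, `pandas_find`, and `pandas_rfind` helpers. A quick hedged spot-check, assuming a Spark session (data is illustrative, not the docstring's):

>>> import databricks.koalas as ks
>>> s = ks.Series(["apple", "banana"])
>>> s.str.count("a").dtype  # expected under this change
dtype('int64')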
22 changes: 22 additions & 0 deletions databricks/koalas/tests/test_series.py
@@ -1196,7 +1196,23 @@ def test_astype(self):
pser = pd.Series([10, 20, 15, 30, 45], name="x")
kser = ks.Series(pser)

self.assert_eq(kser.astype(int), pser.astype(int))
+ self.assert_eq(kser.astype(np.int), pser.astype(np.int))
+ self.assert_eq(kser.astype(np.int8), pser.astype(np.int8))
+ self.assert_eq(kser.astype(np.int16), pser.astype(np.int16))
+ self.assert_eq(kser.astype(np.int32), pser.astype(np.int32))
+ self.assert_eq(kser.astype(np.int64), pser.astype(np.int64))
+ self.assert_eq(kser.astype("int"), pser.astype("int"))
+ self.assert_eq(kser.astype("int8"), pser.astype("int8"))
+ self.assert_eq(kser.astype("int16"), pser.astype("int16"))
+ self.assert_eq(kser.astype("int32"), pser.astype("int32"))
+ self.assert_eq(kser.astype("int64"), pser.astype("int64"))
+ self.assert_eq(kser.astype(np.float), pser.astype(np.float))
+ self.assert_eq(kser.astype(np.float32), pser.astype(np.float32))
+ self.assert_eq(kser.astype(np.float64), pser.astype(np.float64))
+ self.assert_eq(kser.astype("float"), pser.astype("float"))
+ self.assert_eq(kser.astype("float32"), pser.astype("float32"))
+ self.assert_eq(kser.astype("float64"), pser.astype("float64"))
self.assert_eq(kser.astype(bool), pser.astype(bool))

pser = pd.Series([10, 20, 15, 30, 45, None, np.nan], name="x")
@@ -1218,6 +1234,12 @@ def test_astype(self):

self.assert_eq(kser.astype(bool), pser.astype(bool))

+ pser = pd.Series(["2020-10-27"], name="x")
+ kser = ks.Series(pser)
+
+ self.assert_eq(kser.astype(np.datetime64), pser.astype(np.datetime64))
+ self.assert_eq(kser.astype("datetime64[ns]"), pser.astype("datetime64[ns]"))

with self.assertRaisesRegex(TypeError, "not understood"):
kser.astype("int63")

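The expanded `test_astype` pins down that builtins, numpy scalar types, and dtype strings all round-trip the way pandas does. A hedged REPL sketch of the equivalence the new assertions encode (assuming a Spark session):

>>> import pandas as pd
>>> import databricks.koalas as ks
>>> pser = pd.Series([10, 20, 15, 30, 45], name="x")
>>> kser = ks.Series(pser)
>>> kser.astype("int16").dtype == pser.astype("int16").dtype
True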
68 changes: 56 additions & 12 deletions databricks/koalas/tests/test_typedef.py
@@ -15,13 +15,27 @@
#
import sys
import unittest
+ import datetime

import pandas
import pandas as pd
import numpy as np
- from pyspark.sql.types import FloatType, IntegerType, LongType, StringType, StructField, StructType

- from databricks.koalas.typedef import infer_return_type
+ from pyspark.sql.types import (
+     BinaryType,
+     BooleanType,
+     FloatType,
+     IntegerType,
+     LongType,
+     StringType,
+     StructField,
+     StructType,
+     ByteType,
+     ShortType,
+     DoubleType,
+     TimestampType,
+ )

+ from databricks.koalas.typedef import infer_return_type, as_spark_type
from databricks import koalas as ks


@@ -34,40 +48,40 @@ def test_infer_schema_from_pandas_instances(self):
def func() -> pd.Series[int]:
pass

- self.assertEqual(infer_return_type(func).tpe, IntegerType())
+ self.assertEqual(infer_return_type(func).tpe, LongType())

def func() -> pd.Series[np.float]:
pass

- self.assertEqual(infer_return_type(func).tpe, FloatType())
+ self.assertEqual(infer_return_type(func).tpe, DoubleType())

def func() -> "pd.DataFrame[np.float, str]":
pass

- expected = StructType([StructField("c0", FloatType()), StructField("c1", StringType())])
+ expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
self.assertEqual(infer_return_type(func).tpe, expected)

def func() -> "pandas.DataFrame[np.float]":
pass

- expected = StructType([StructField("c0", FloatType())])
+ expected = StructType([StructField("c0", DoubleType())])
self.assertEqual(infer_return_type(func).tpe, expected)

def func() -> "pd.Series[int]":
pass

- self.assertEqual(infer_return_type(func).tpe, IntegerType())
+ self.assertEqual(infer_return_type(func).tpe, LongType())

def func() -> pd.DataFrame[np.float, str]:
pass

- expected = StructType([StructField("c0", FloatType()), StructField("c1", StringType())])
+ expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
self.assertEqual(infer_return_type(func).tpe, expected)

def func() -> pd.DataFrame[np.float]:
pass

- expected = StructType([StructField("c0", FloatType())])
+ expected = StructType([StructField("c0", DoubleType())])
self.assertEqual(infer_return_type(func).tpe, expected)

pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
@@ -93,13 +107,13 @@ def test_infer_schema_with_names_pandas_instances(self):
def func() -> 'pd.DataFrame["a" : np.float, "b":str]': # noqa: F821
pass

- expected = StructType([StructField("a", FloatType()), StructField("b", StringType())])
+ expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())])
self.assertEqual(infer_return_type(func).tpe, expected)

def func() -> "pd.DataFrame['a': np.float, 'b': int]": # noqa: F821
pass

- expected = StructType([StructField("a", FloatType()), StructField("b", IntegerType())])
+ expected = StructType([StructField("a", DoubleType()), StructField("b", LongType())])
self.assertEqual(infer_return_type(func).tpe, expected)

pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
@@ -191,3 +205,33 @@ def f() -> ks.DataFrame[pdf.dtypes]:  # type: ignore
infer_return_type(f)

self.assertRaisesRegex(TypeError, "object.*not understood", try_infer_return_type)

+ def test_as_spark_type(self):
+     type_mapper = {
+         # binary
+         bytes: BinaryType(),
+         # integer
+         np.int8: ByteType(),
+         np.int16: ShortType(),
+         np.int32: IntegerType(),
+         np.int64: LongType(),
+         np.int: LongType(),
+         int: LongType(),
+         # floating
+         np.float32: FloatType(),
+         np.float: DoubleType(),
+         np.float64: DoubleType(),
+         float: DoubleType(),
+         # string
+         np.str: StringType(),
+         str: StringType(),
+         # bool
+         np.bool: BooleanType(),
+         bool: BooleanType(),
+         # datetime
+         np.datetime64: TimestampType(),
+         datetime.datetime: TimestampType(),
+     }
+
+     for numpy_or_python_type, spark_type in type_mapper.items():
+         self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
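The new `test_as_spark_type` enumerates the numpy/Python-to-Spark mapping directly, so it doubles as documentation of the change. A hedged usage sketch of the function under test (type reprs as printed by Spark 3.0-era PySpark; newer releases print `LongType()` instead):

>>> import datetime
>>> from databricks.koalas.typedef import as_spark_type
>>> as_spark_type(int)
LongType
>>> as_spark_type(float)
DoubleType
>>> as_spark_type(datetime.datetime)
TimestampType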