databricks/koalas/accessors.py (+1, -1)

@@ -491,7 +491,7 @@ def transform_batch(self, func, *args, **kwargs):
 0    3
 1    5
 2    7
-dtype: int32
+dtype: int64

You can also omit the type hints so Koalas infers the return schema as below:

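The dtype in this doctest changes because Koalas' type mapping now sends plain Python `int` to Spark's LongType. A minimal sketch of the documented behavior (the frame and helper below are illustrative, not taken from the PR):

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})

def plus(pdf) -> ks.Series[int]:
    # The plain `int` annotation is now inferred as Spark LongType,
    # so the result dtype is int64, matching pandas.
    return pdf.a + pdf.b

kdf.koalas.transform_batch(plus)
```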
databricks/koalas/groupby.py (+4, -4)

@@ -1019,7 +1019,7 @@ def apply(self, func, *args, **kwargs):
 0    6
 1    3
 2    4
-Name: B, dtype: int32
+Name: B, dtype: int64

>>> def plus_min(x):
... return x + x.min()
@@ -1036,7 +1036,7 @@ def apply(self, func, *args, **kwargs):
>>> df.B.groupby(df.A).apply(plus_length).sort_index()
 0    1
 1    2
-Name: B, dtype: int32
+Name: B, dtype: int64

The extra arguments to the function can be passed as below.

@@ -1045,7 +1045,7 @@ def apply(self, func, *args, **kwargs):
>>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index()
 0    51
 1    52
-Name: B, dtype: int32
+Name: B, dtype: int64
"""
from pandas.core.base import SelectionMixin

@@ -1993,7 +1993,7 @@ def transform(self, func, *args, **kwargs):
 0    3
 1    4
 2    6
-Name: B, dtype: int32
+Name: B, dtype: int64

>>> (df * -1).B.groupby(df.A).transform(abs)
 0    1
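The groupby doctests flip for the same reason: the helpers in these docstrings are annotated with plain `int`. A hedged sketch mirroring the docstring's example (names illustrative):

```python
import databricks.koalas as ks

df = ks.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})

def plus_max(x) -> ks.Series[int]:
    # `int` now maps to LongType, so GroupBy.apply/transform return
    # int64 series, as pandas does.
    return x + x.max()

df.B.groupby(df.A).apply(plus_max).sort_index()
```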
databricks/koalas/series.py (+2, -2)

@@ -3071,7 +3071,7 @@ def transform(self, func, axis=0, *args, **kwargs) -> Union["Series", DataFrame]
 0    0.000000
 1    1.000000
 2    1.414214
-dtype: float32
+dtype: float64

Even though the resulting instance must have the same length as the
input, it is possible to provide several input functions:
@@ -4793,7 +4793,7 @@ def repeat(self, repeats: Union[int, "Series"]) -> "Series":
             )
         else:
             scol = F.explode(
-                SF.array_repeat(self.spark.column, repeats.astype(int).spark.column)
+                SF.array_repeat(self.spark.column, repeats.astype("int32").spark.column)
             ).alias(name_like_string(self.name))
             sdf = self._internal.spark_frame.select(self._internal.index_spark_columns + [scol])
             internal = self._internal.copy(
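The `repeat` hunk works in the opposite direction: since `astype(int)` now yields a LongType (int64) column, the count column handed to Spark's `array_repeat` is pinned to 32 bits explicitly, presumably because `array_repeat` expects an integer count rather than a bigint. A minimal usage sketch (inputs illustrative):

```python
import databricks.koalas as ks

s = ks.Series(["a", "b", "c"])
rep = ks.Series([1, 2, 3])

# Each element is repeated by the matching count; internally the counts
# are now cast with astype("int32") before reaching array_repeat.
s.repeat(rep)
```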
databricks/koalas/strings.py (+9, -9)

@@ -976,7 +976,7 @@ def count(self, pat, flags=0) -> "ks.Series":
 3    2
 4    2
 5    0
-dtype: int32
+dtype: int64
"""

def pandas_count(s) -> "ks.Series[int]":
@@ -1037,25 +1037,25 @@ def find(self, sub, start=0, end=None) -> "ks.Series":
 0    0
 1    2
 2    1
-dtype: int32
+dtype: int64

>>> s.str.find('a', start=2)
 0   -1
 1    2
 2    3
-dtype: int32
+dtype: int64

>>> s.str.find('a', end=1)
 0    0
 1   -1
 2   -1
-dtype: int32
+dtype: int64

>>> s.str.find('a', start=2, end=2)
 0   -1
 1   -1
 2   -1
-dtype: int32
+dtype: int64
"""

def pandas_find(s) -> "ks.Series[int]":
@@ -1614,25 +1614,25 @@ def rfind(self, sub, start=0, end=None) -> "ks.Series":
 0    0
 1    2
 2    5
-dtype: int32
+dtype: int64

>>> s.str.rfind('a', start=2)
 0   -1
 1    2
 2    5
-dtype: int32
+dtype: int64

>>> s.str.rfind('a', end=1)
 0    0
 1   -1
 2   -1
-dtype: int32
+dtype: int64

>>> s.str.rfind('a', start=2, end=2)
 0   -1
 1   -1
 2   -1
-dtype: int32
+dtype: int64
"""

def pandas_rfind(s) -> "ks.Series[int]":
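All of these string-method doctests shift at once because `count`, `find`, and `rfind` delegate to helpers annotated `ks.Series[int]` (visible as `pandas_count`, `pandas_find`, and `pandas_rfind` in the context lines). A quick illustration, assuming a Series like the docstring's:

```python
import databricks.koalas as ks

s = ks.Series(["apple", "banana", "cabbage"])

# The helper's `ks.Series[int]` annotation now infers LongType, so the
# result dtype is int64, consistent with pandas' str.count.
s.str.count("a")
```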
databricks/koalas/tests/test_series.py (+36, -0)

@@ -1196,8 +1196,37 @@ def test_astype(self):
        pser = pd.Series([10, 20, 15, 30, 45], name="x")
        kser = ks.Series(pser)

        self.assert_eq(kser.astype(int), pser.astype(int))
        self.assert_eq(kser.astype(np.int), pser.astype(np.int))
        self.assert_eq(kser.astype(np.int8), pser.astype(np.int8))
        self.assert_eq(kser.astype(np.int16), pser.astype(np.int16))
        self.assert_eq(kser.astype(np.int32), pser.astype(np.int32))
        self.assert_eq(kser.astype(np.int64), pser.astype(np.int64))
        self.assert_eq(kser.astype(np.byte), pser.astype(np.byte))
        self.assert_eq(kser.astype("int"), pser.astype("int"))
        self.assert_eq(kser.astype("int8"), pser.astype("int8"))
        self.assert_eq(kser.astype("int16"), pser.astype("int16"))
        self.assert_eq(kser.astype("int32"), pser.astype("int32"))
        self.assert_eq(kser.astype("int64"), pser.astype("int64"))
        self.assert_eq(kser.astype("b"), pser.astype("b"))
        self.assert_eq(kser.astype("byte"), pser.astype("byte"))
        self.assert_eq(kser.astype("i"), pser.astype("i"))
        self.assert_eq(kser.astype("long"), pser.astype("long"))
        self.assert_eq(kser.astype("short"), pser.astype("short"))
        self.assert_eq(kser.astype(np.float), pser.astype(np.float))
        self.assert_eq(kser.astype(np.float32), pser.astype(np.float32))
        self.assert_eq(kser.astype(np.float64), pser.astype(np.float64))
        self.assert_eq(kser.astype("float"), pser.astype("float"))
        self.assert_eq(kser.astype("float32"), pser.astype("float32"))
        self.assert_eq(kser.astype("float64"), pser.astype("float64"))
        self.assert_eq(kser.astype("double"), pser.astype("double"))
        self.assert_eq(kser.astype("f"), pser.astype("f"))
        self.assert_eq(kser.astype(bool), pser.astype(bool))
        self.assert_eq(kser.astype("bool"), pser.astype("bool"))
        self.assert_eq(kser.astype("?"), pser.astype("?"))
        self.assert_eq(kser.astype(np.unicode_), pser.astype(np.unicode_))
        self.assert_eq(kser.astype("str"), pser.astype("str"))
        self.assert_eq(kser.astype("U"), pser.astype("U"))

        pser = pd.Series([10, 20, 15, 30, 45, None, np.nan], name="x")
        kser = ks.Series(pser)
@@ -1218,6 +1247,13 @@ def test_astype(self):

        self.assert_eq(kser.astype(bool), pser.astype(bool))

pser = pd.Series(["2020-10-27"], name="x")
kser = ks.Series(pser)

self.assert_eq(kser.astype(np.datetime64), pser.astype(np.datetime64))
self.assert_eq(kser.astype("datetime64[ns]"), pser.astype("datetime64[ns]"))
self.assert_eq(kser.astype("M"), pser.astype("M"))

        with self.assertRaisesRegex(TypeError, "not understood"):
            kser.astype("int63")

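The one-letter strings exercised above are numpy dtype character codes rather than ad-hoc aliases, so `astype` must resolve them exactly as numpy does. A short reference sketch (standard numpy behavior, not Koalas-specific):

```python
import numpy as np

# dtype character codes used in the test:
np.dtype("?")  # bool
np.dtype("b")  # int8 (signed byte)
np.dtype("i")  # int32 (C int)
np.dtype("f")  # float32 (C float)
np.dtype("U")  # unicode string
np.dtype("M")  # datetime64
```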
databricks/koalas/tests/test_typedef.py (+61, -12)

@@ -15,13 +15,27 @@
#
 import sys
 import unittest
+import datetime

 import pandas
 import pandas as pd
 import numpy as np
-from pyspark.sql.types import FloatType, IntegerType, LongType, StringType, StructField, StructType
+from pyspark.sql.types import (
+    BinaryType,
+    BooleanType,
+    FloatType,
+    IntegerType,
+    LongType,
+    StringType,
+    StructField,
+    StructType,
+    ByteType,
+    ShortType,
+    DoubleType,
+    TimestampType,
+)

-from databricks.koalas.typedef import infer_return_type
+from databricks.koalas.typedef import infer_return_type, as_spark_type
 from databricks import koalas as ks


@@ -34,40 +48,40 @@ def test_infer_schema_from_pandas_instances(self):
        def func() -> pd.Series[int]:
            pass

-        self.assertEqual(infer_return_type(func).tpe, IntegerType())
+        self.assertEqual(infer_return_type(func).tpe, LongType())

        def func() -> pd.Series[np.float]:
            pass

-        self.assertEqual(infer_return_type(func).tpe, FloatType())
+        self.assertEqual(infer_return_type(func).tpe, DoubleType())

def func() -> "pd.DataFrame[np.float, str]":
pass

expected = StructType([StructField("c0", FloatType()), StructField("c1", StringType())])
expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
self.assertEqual(infer_return_type(func).tpe, expected)

def func() -> "pandas.DataFrame[np.float]":
pass

expected = StructType([StructField("c0", FloatType())])
expected = StructType([StructField("c0", DoubleType())])
self.assertEqual(infer_return_type(func).tpe, expected)

def func() -> "pd.Series[int]":
pass

self.assertEqual(infer_return_type(func).tpe, IntegerType())
self.assertEqual(infer_return_type(func).tpe, LongType())

        def func() -> pd.DataFrame[np.float, str]:
            pass

-        expected = StructType([StructField("c0", FloatType()), StructField("c1", StringType())])
+        expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

        def func() -> pd.DataFrame[np.float]:
            pass

-        expected = StructType([StructField("c0", FloatType())])
+        expected = StructType([StructField("c0", DoubleType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
@@ -93,13 +107,13 @@ def test_infer_schema_with_names_pandas_instances(self):
        def func() -> 'pd.DataFrame["a" : np.float, "b":str]':  # noqa: F821
            pass

-        expected = StructType([StructField("a", FloatType()), StructField("b", StringType())])
+        expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

def func() -> "pd.DataFrame['a': np.float, 'b': int]": # noqa: F821
pass

expected = StructType([StructField("a", FloatType()), StructField("b", IntegerType())])
expected = StructType([StructField("a", DoubleType()), StructField("b", LongType())])
self.assertEqual(infer_return_type(func).tpe, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
@@ -191,3 +205,38 @@ def f() -> ks.DataFrame[pdf.dtypes]:  # type: ignore
            infer_return_type(f)

        self.assertRaisesRegex(TypeError, "object.*not understood", try_infer_return_type)

    def test_as_spark_type(self):
        type_mapper = {
            # binary
            np.character: BinaryType(),
            np.bytes_: BinaryType(),
            np.string_: BinaryType(),
            bytes: BinaryType(),
            # integer
            np.int8: ByteType(),
            np.byte: ByteType(),
            np.int16: ShortType(),
            np.int32: IntegerType(),
            np.int64: LongType(),
            np.int: LongType(),
            int: LongType(),
            # floating
            np.float32: FloatType(),
            np.float: DoubleType(),
            np.float64: DoubleType(),
            float: DoubleType(),
            # string
            np.str: StringType(),
            np.unicode_: StringType(),
            str: StringType(),
            # bool
            np.bool: BooleanType(),
            bool: BooleanType(),
            # datetime
            np.datetime64: TimestampType(),
            datetime.datetime: TimestampType(),
        }

        for numpy_or_python_type, spark_type in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
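The mapping table above is the heart of the PR; a minimal check of the new behavior, grounded directly in the test:

```python
from pyspark.sql.types import DoubleType, LongType
from databricks.koalas.typedef import as_spark_type

# Plain Python int/float now map to the 64-bit Spark types, matching
# the dtypes pandas infers for Python scalars.
assert as_spark_type(int) == LongType()
assert as_spark_type(float) == DoubleType()
```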