Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions databricks/koalas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,10 @@ def __sub__(self, other):
)
if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, DateType):
warnings.warn(msg, UserWarning)
return column_op(F.datediff)(self, other)
return column_op(F.datediff)(self, other).astype("bigint")
elif isinstance(other, datetime.date) and not isinstance(other, datetime.datetime):
warnings.warn(msg, UserWarning)
return column_op(F.datediff)(self, F.lit(other))
return column_op(F.datediff)(self, F.lit(other)).astype("bigint")
else:
raise TypeError("date subtraction can only be applied to date series.")
return column_op(Column.__sub__)(self, other)
Expand Down Expand Up @@ -286,7 +286,7 @@ def __rsub__(self, other):
)
if isinstance(other, datetime.date) and not isinstance(other, datetime.datetime):
warnings.warn(msg, UserWarning)
return -column_op(F.datediff)(self, F.lit(other))
return -column_op(F.datediff)(self, F.lit(other)).astype("bigint")
else:
raise TypeError("date subtraction can only be applied to date series.")
return column_op(Column.__rsub__)(self, other)
Expand Down
12 changes: 10 additions & 2 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2093,10 +2093,18 @@ def transpose(self):

internal = self._internal.copy(
spark_frame=transposed_df,
index_map=OrderedDict((col, None) for col in internal_index_columns),
index_map=OrderedDict(
(col, name if name is None or isinstance(name, tuple) else (name,))
for col, name in zip(
internal_index_columns,
self._internal.column_label_names
if self._internal.column_label_names is not None
else ([None] * len(internal_index_columns)),
)
),
column_labels=[tuple(json.loads(col)["a"]) for col in new_data_columns],
data_spark_columns=[scol_for(transposed_df, col) for col in new_data_columns],
column_label_names=None,
column_label_names=self._internal.index_names,
)

return DataFrame(internal)
Expand Down
66 changes: 47 additions & 19 deletions databricks/koalas/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,27 +133,46 @@ def tearDownClass(cls):

def assertPandasEqual(self, left, right):
    """Assert that two pandas objects of the same kind are exactly equal.

    Supports ``pd.DataFrame``, ``pd.Series`` and ``pd.Index`` pairs (an
    index subclass such as ``pd.MultiIndex`` is handled by the ``pd.Index``
    branch). The comparison is delegated to the ``pandas.testing`` helpers
    with ``check_exact=True``.

    Parameters
    ----------
    left, right : pd.DataFrame, pd.Series or pd.Index
        The two objects to compare; both must be of the same kind.

    Raises
    ------
    AssertionError
        If the objects differ. The message is the pandas diagnostic followed
        by a dump of both operands and their dtypes.
    ValueError
        If the operands are not a supported pair of pandas objects.
    """
    if isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame):
        try:
            pd.testing.assert_frame_equal(
                left,
                right,
                # The index/column class checks are switched off when the
                # corresponding axis of ``left`` is empty.
                # NOTE(review): presumably because the class of an empty axis
                # is not comparable between the two sides - confirm.
                check_index_type=("equiv" if len(left.index) > 0 else False),
                check_column_type=("equiv" if len(left.columns) > 0 else False),
                check_exact=True,
            )
        except AssertionError as e:
            # Re-raise with both operands appended so the failure is readable
            # without re-running the test.
            msg = (
                str(e)
                + "\n\nLeft:\n%s\n%s" % (left, left.dtypes)
                + "\n\nRight:\n%s\n%s" % (right, right.dtypes)
            )
            raise AssertionError(msg) from e
    elif isinstance(left, pd.Series) and isinstance(right, pd.Series):
        try:
            pd.testing.assert_series_equal(
                left,
                right,
                check_index_type=("equiv" if len(left.index) > 0 else False),
                check_exact=True,
            )
        except AssertionError as e:
            msg = (
                str(e)
                + "\n\nLeft:\n%s\n%s" % (left, left.dtype)
                + "\n\nRight:\n%s\n%s" % (right, right.dtype)
            )
            raise AssertionError(msg) from e
    elif isinstance(left, pd.Index) and isinstance(right, pd.Index):
        try:
            pd.testing.assert_index_equal(left, right, check_exact=True)
        except AssertionError as e:
            msg = (
                str(e)
                + "\n\nLeft:\n%s\n%s" % (left, left.dtype)
                + "\n\nRight:\n%s\n%s" % (right, right.dtype)
            )
            raise AssertionError(msg) from e
    else:
        raise ValueError("Unexpected values: (%s, %s)" % (left, right))

Expand Down Expand Up @@ -190,6 +209,15 @@ def assertPandasAlmostEqual(self, left, right):
self.assertEqual(lnull, rnull, msg=msg)
for lval, rval in zip(left.dropna(), right.dropna()):
self.assertAlmostEqual(lval, rval, msg=msg)
elif isinstance(left, pd.MultiIndex) and isinstance(left, pd.MultiIndex):
msg = (
"MultiIndices are not almost equal: "
+ "\n\nLeft:\n%s\n%s" % (left, left.dtype)
+ "\n\nRight:\n%s\n%s" % (right, right.dtype)
)
self.assertEqual(len(left), len(right), msg=msg)
for lval, rval in zip(left, right):
self.assertAlmostEqual(lval, rval, msg=msg)
elif isinstance(left, pd.Index) and isinstance(left, pd.Index):
msg = (
"Indices are not almost equal: "
Expand Down
37 changes: 19 additions & 18 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ def mul10(x) -> int:

def test_dot_in_column_name(self):
self.assert_eq(
ks.DataFrame(ks.range(1)._internal.spark_frame.selectExpr("1 as `a.b`"))["a.b"],
ks.DataFrame(ks.range(1)._internal.spark_frame.selectExpr("1L as `a.b`"))["a.b"],
ks.Series([1], name="a.b"),
)

Expand Down Expand Up @@ -665,7 +665,7 @@ def _test_dropna(self, pdf, axis):
pdf2.dropna(inplace=True)
kdf2.dropna(inplace=True)
self.assert_eq(kdf2, pdf2)
self.assert_eq(kser, pser, almost=True)
self.assert_eq(kser, pser)

# multi-index
columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
Expand Down Expand Up @@ -805,7 +805,7 @@ def test_fillna(self):
pdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
kdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
self.assert_eq(kdf, pdf)
self.assert_eq(kser, pser, almost=True)
self.assert_eq(kser, pser)

s_nan = pd.Series([-1, -2, -5], index=["x", "y", "z"], dtype=int)
self.assert_eq(kdf.fillna(s_nan), pdf.fillna(s_nan))
Expand Down Expand Up @@ -942,7 +942,7 @@ def test_sort_values(self):
kserA = kdf.a
self.assert_eq(kdf.sort_values("b", inplace=True), pdf.sort_values("b", inplace=True))
self.assert_eq(kdf, pdf)
self.assert_eq(kserA, pserA, almost=True)
self.assert_eq(kserA, pserA)

columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
kdf.columns = columns
Expand Down Expand Up @@ -975,7 +975,7 @@ def test_sort_index(self):
kserA = kdf.A
self.assertEqual(kdf.sort_index(inplace=True), pdf.sort_index(inplace=True))
self.assert_eq(kdf, pdf)
self.assert_eq(kserA, pserA, almost=True)
self.assert_eq(kserA, pserA)

# Assert multi-indices
pdf = pd.DataFrame(
Expand Down Expand Up @@ -1759,7 +1759,7 @@ def get_data(left_columns=None, right_columns=None):
left_pdf.update(right_pdf)
left_kdf.update(right_kdf)
self.assert_eq(left_pdf.sort_values(by=["A", "B"]), left_kdf.sort_values(by=["A", "B"]))
self.assert_eq(kser.sort_index(), pser.sort_index(), almost=True)
self.assert_eq(kser.sort_index(), pser.sort_index())

left_kdf, left_pdf, right_kdf, right_pdf = get_data()
left_pdf.update(right_pdf, overwrite=False)
Expand Down Expand Up @@ -2063,7 +2063,7 @@ def test_stack(self):
)
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.stack().sort_index(), pdf.stack().sort_index(), almost=True)
self.assert_eq(kdf.stack().sort_index(), pdf.stack().sort_index())
self.assert_eq(kdf[[]].stack().sort_index(), pdf[[]].stack().sort_index(), almost=True)

def test_unstack(self):
Expand Down Expand Up @@ -3346,10 +3346,10 @@ def test_query(self):
kdf.query("('A', 'Z') > ('B', 'X')")

def test_take(self):
kdf = ks.DataFrame(
pdf = pd.DataFrame(
{"A": range(0, 50000), "B": range(100000, 0, -2), "C": range(100000, 50000, -1)}
)
pdf = kdf.to_pandas()
kdf = ks.from_pandas(pdf)

# axis=0 (default)
self.assert_eq(kdf.take([1, 2]).sort_index(), pdf.take([1, 2]).sort_index())
Expand Down Expand Up @@ -3422,6 +3422,7 @@ def test_take(self):
self.assert_eq(
kdf.take(range(-1, -3), axis=1).sort_index(),
pdf.take(range(-1, -3), axis=1).sort_index(),
almost=True,
)
self.assert_eq(
kdf.take([2, 1], axis=1).sort_index(), pdf.take([2, 1], axis=1).sort_index(),
Expand Down Expand Up @@ -3539,7 +3540,7 @@ def test_squeeze(self):
axises = [None, 0, 1, "rows", "index", "columns"]

# Multiple columns
pdf = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
pdf = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"], index=["x", "y"])
kdf = ks.from_pandas(pdf)
for axis in axises:
self.assert_eq(pdf.squeeze(axis), kdf.squeeze(axis))
Expand All @@ -3551,7 +3552,7 @@ def test_squeeze(self):
self.assert_eq(pdf.squeeze(axis), kdf.squeeze(axis))

# Single column with single value
pdf = pd.DataFrame([[1]], columns=["a"])
pdf = pd.DataFrame([[1]], columns=["a"], index=["x"])
kdf = ks.from_pandas(pdf)
for axis in axises:
self.assert_eq(pdf.squeeze(axis), kdf.squeeze(axis))
Expand Down Expand Up @@ -3864,15 +3865,15 @@ def test_iteritems(self):

def test_tail(self):
if LooseVersion(pyspark.__version__) >= LooseVersion("3.0"):
pdf = pd.DataFrame(range(1000))
pdf = pd.DataFrame({"x": range(1000)})
kdf = ks.from_pandas(pdf)

self.assert_eq(pdf.tail(), kdf.tail(), almost=True)
self.assert_eq(pdf.tail(10), kdf.tail(10), almost=True)
self.assert_eq(pdf.tail(-990), kdf.tail(-990), almost=True)
self.assert_eq(pdf.tail(0), kdf.tail(0), almost=True)
self.assert_eq(pdf.tail(-1001), kdf.tail(-1001), almost=True)
self.assert_eq(pdf.tail(1001), kdf.tail(1001), almost=True)
self.assert_eq(pdf.tail(), kdf.tail())
self.assert_eq(pdf.tail(10), kdf.tail(10))
self.assert_eq(pdf.tail(-990), kdf.tail(-990))
self.assert_eq(pdf.tail(0), kdf.tail(0))
self.assert_eq(pdf.tail(-1001), kdf.tail(-1001))
self.assert_eq(pdf.tail(1001), kdf.tail(1001))
with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
kdf.tail("10")

Expand Down
Loading