Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 48 additions & 18 deletions databricks/koalas/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,26 +133,47 @@ def tearDownClass(cls):

def assertPandasEqual(self, left, right):
if isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame):
msg = (
"DataFrames are not equal: "
+ "\n\nLeft:\n%s\n%s" % (left, left.dtypes)
+ "\n\nRight:\n%s\n%s" % (right, right.dtypes)
)
self.assertTrue(left.equals(right), msg=msg)
try:
pd.util.testing.assert_frame_equal(
left,
right,
check_index_type=("equiv" if len(left.index) > 0 else False),
check_column_type=("equiv" if len(left.columns) > 0 else False),
check_exact=True,
)
except AssertionError as e:
msg = (
str(e)
+ "\n\nLeft:\n%s\n%s" % (left, left.dtypes)
+ "\n\nRight:\n%s\n%s" % (right, right.dtypes)
)
raise AssertionError(msg) from e
elif isinstance(left, pd.Series) and isinstance(right, pd.Series):
msg = (
"Series are not equal: "
+ "\n\nLeft:\n%s\n%s" % (left, left.dtype)
+ "\n\nRight:\n%s\n%s" % (right, right.dtype)
)
self.assertTrue((left == right).all(), msg=msg)
try:
pd.util.testing.assert_series_equal(
left,
right,
check_index_type=("equiv" if len(left.index) > 0 else False),
check_names=False,
check_exact=True,
)
except AssertionError as e:
msg = (
str(e)
+ "\n\nLeft:\n%s\n%s" % (left, left.dtype)
+ "\n\nRight:\n%s\n%s" % (right, right.dtype)
)
raise AssertionError(msg) from e
elif isinstance(left, pd.Index) and isinstance(right, pd.Index):
msg = (
"Indices are not equal: "
+ "\n\nLeft:\n%s\n%s" % (left, left.dtype)
+ "\n\nRight:\n%s\n%s" % (right, right.dtype)
)
self.assertTrue((left == right).all(), msg=msg)
try:
pd.util.testing.assert_index_equal(left, right, check_exact=True)
except AssertionError as e:
msg = (
str(e)
+ "\n\nLeft:\n%s\n%s" % (left, left.dtype)
+ "\n\nRight:\n%s\n%s" % (right, right.dtype)
)
raise AssertionError(msg) from e
else:
raise ValueError("Unexpected values: (%s, %s)" % (left, right))

Expand Down Expand Up @@ -188,6 +209,15 @@ def assertPandasAlmostEqual(self, left, right):
self.assertEqual(lnull, rnull, msg=msg)
for lval, rval in zip(left.dropna(), right.dropna()):
self.assertAlmostEqual(lval, rval, msg=msg)
elif isinstance(left, pd.MultiIndex) and isinstance(left, pd.MultiIndex):
msg = (
"MultiIndices are not almost equal: "
+ "\n\nLeft:\n%s\n%s" % (left, left.dtype)
+ "\n\nRight:\n%s\n%s" % (right, right.dtype)
)
self.assertEqual(len(left), len(right), msg=msg)
for lval, rval in zip(left, right):
self.assertAlmostEqual(lval, rval, msg=msg)
elif isinstance(left, pd.Index) and isinstance(left, pd.Index):
msg = (
"Indices are not almost equal: "
Expand Down
33 changes: 17 additions & 16 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ def mul10(x) -> int:

def test_dot_in_column_name(self):
self.assert_eq(
ks.DataFrame(ks.range(1)._internal.spark_frame.selectExpr("1 as `a.b`"))["a.b"],
ks.DataFrame(ks.range(1)._internal.spark_frame.selectExpr("1L as `a.b`"))["a.b"],
ks.Series([1]),
)

Expand Down Expand Up @@ -668,7 +668,7 @@ def _test_dropna(self, pdf, axis):
pdf2.dropna(inplace=True)
kdf2.dropna(inplace=True)
self.assert_eq(kdf2, pdf2)
self.assert_eq(kser, pser, almost=True)
self.assert_eq(kser, pser)

# multi-index
columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
Expand Down Expand Up @@ -808,7 +808,7 @@ def test_fillna(self):
pdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
kdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
self.assert_eq(kdf, pdf)
self.assert_eq(kser, pser, almost=True)
self.assert_eq(kser, pser)

s_nan = pd.Series([-1, -2, -5], index=["x", "y", "z"], dtype=int)
self.assert_eq(kdf.fillna(s_nan), pdf.fillna(s_nan))
Expand Down Expand Up @@ -950,7 +950,7 @@ def test_sort_values(self):
kserA = kdf.a
self.assert_eq(kdf.sort_values("b", inplace=True), pdf.sort_values("b", inplace=True))
self.assert_eq(kdf, pdf)
self.assert_eq(kserA, pserA, almost=True)
self.assert_eq(kserA, pserA)

columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
kdf.columns = columns
Expand Down Expand Up @@ -983,7 +983,7 @@ def test_sort_index(self):
kserA = kdf.A
self.assertEqual(kdf.sort_index(inplace=True), pdf.sort_index(inplace=True))
self.assert_eq(kdf, pdf)
self.assert_eq(kserA, pserA, almost=True)
self.assert_eq(kserA, pserA)

# Assert multi-indices
pdf = pd.DataFrame(
Expand Down Expand Up @@ -1767,7 +1767,7 @@ def get_data(left_columns=None, right_columns=None):
left_pdf.update(right_pdf)
left_kdf.update(right_kdf)
self.assert_eq(left_pdf.sort_values(by=["A", "B"]), left_kdf.sort_values(by=["A", "B"]))
self.assert_eq(kser.sort_index(), pser.sort_index(), almost=True)
self.assert_eq(kser.sort_index(), pser.sort_index())

left_kdf, left_pdf, right_kdf, right_pdf = get_data()
left_pdf.update(right_pdf, overwrite=False)
Expand Down Expand Up @@ -2071,7 +2071,7 @@ def test_stack(self):
)
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.stack().sort_index(), pdf.stack().sort_index(), almost=True)
self.assert_eq(kdf.stack().sort_index(), pdf.stack().sort_index())
self.assert_eq(kdf[[]].stack().sort_index(), pdf[[]].stack().sort_index(), almost=True)

def test_unstack(self):
Expand Down Expand Up @@ -3356,10 +3356,10 @@ def test_query(self):
kdf.query("('A', 'Z') > ('B', 'X')")

def test_take(self):
kdf = ks.DataFrame(
pdf = pd.DataFrame(
{"A": range(0, 50000), "B": range(100000, 0, -2), "C": range(100000, 50000, -1)}
)
pdf = kdf.to_pandas()
kdf = ks.from_pandas(pdf)

# axis=0 (default)
self.assert_eq(kdf.take([1, 2]).sort_index(), pdf.take([1, 2]).sort_index())
Expand Down Expand Up @@ -3432,6 +3432,7 @@ def test_take(self):
self.assert_eq(
kdf.take(range(-1, -3), axis=1).sort_index(),
pdf.take(range(-1, -3), axis=1).sort_index(),
almost=True,
)
self.assert_eq(
kdf.take([2, 1], axis=1).sort_index(), pdf.take([2, 1], axis=1).sort_index(),
Expand Down Expand Up @@ -3874,15 +3875,15 @@ def test_iteritems(self):

def test_tail(self):
if LooseVersion(pyspark.__version__) >= LooseVersion("3.0"):
pdf = pd.DataFrame(range(1000))
pdf = pd.DataFrame({"x": range(1000)})
kdf = ks.from_pandas(pdf)

self.assert_eq(pdf.tail(), kdf.tail(), almost=True)
self.assert_eq(pdf.tail(10), kdf.tail(10), almost=True)
self.assert_eq(pdf.tail(-990), kdf.tail(-990), almost=True)
self.assert_eq(pdf.tail(0), kdf.tail(0), almost=True)
self.assert_eq(pdf.tail(-1001), kdf.tail(-1001), almost=True)
self.assert_eq(pdf.tail(1001), kdf.tail(1001), almost=True)
self.assert_eq(pdf.tail(), kdf.tail())
self.assert_eq(pdf.tail(10), kdf.tail(10))
self.assert_eq(pdf.tail(-990), kdf.tail(-990))
self.assert_eq(pdf.tail(0), kdf.tail(0))
self.assert_eq(pdf.tail(-1001), kdf.tail(-1001))
self.assert_eq(pdf.tail(1001), kdf.tail(1001))
with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
kdf.tail("10")

Expand Down
Loading