From 0c70d7c3faecc3844a53e6ccfa25ca9d8da57bfc Mon Sep 17 00:00:00 2001 From: Maria Rubtsova Date: Wed, 16 Feb 2022 23:02:14 +0300 Subject: [PATCH 1/5] add tests --- protocol/tests/conftest.py | 7 ++ protocol/tests/test_protocol.py | 142 ++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 protocol/tests/conftest.py create mode 100644 protocol/tests/test_protocol.py diff --git a/protocol/tests/conftest.py b/protocol/tests/conftest.py new file mode 100644 index 00000000..452df49e --- /dev/null +++ b/protocol/tests/conftest.py @@ -0,0 +1,7 @@ +import pytest +import pandas as pd + + +@pytest.fixture +def constructor_frame(data): + return pd.DataFrame(data) diff --git a/protocol/tests/test_protocol.py b/protocol/tests/test_protocol.py new file mode 100644 index 00000000..2235d55f --- /dev/null +++ b/protocol/tests/test_protocol.py @@ -0,0 +1,142 @@ +import pytest +import numpy as np +from conftest import * + + +@pytest.mark.parametrize("test_data", + [ + ({'a': [np.array([1, 2, 3]), np.array([4, 5, 6])], + 'b': [np.array([1.5, 2.0, 3.2]), np.array([4.1, 5.7, 6.9])]}, + np.object_, None), + ({'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, np.float64, None), + ({'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]}, np.int64, np.float64) + ], + ids=["array_data", "float_data", "int_data"]) +def test_only_one_data(test_data): + data, dtype, new_dtype = test_data + columns = list(data.keys()) + df = constructor_frame(data) + df2 = df.__dataframe__() + new_dtype = dtype if new_dtype is None else new_dtype + assert df.columns.values.tolist() == columns + val = len(df[columns[0]])-1 + column_size = df.size + for column in columns: + assert df[column].tolist() == df[column].tolist() + assert df[column].dtype.type is dtype + assert df2.get_column_by_name(column).null_count == 0 + assert df2.get_column_by_name(column).size == column_size + assert df2.get_column_by_name(column).offset == 0 + assert not df2["x"].is_masked + n = np.random.randint(0, val) + (df[column])[n] = None + assert df[column].dtype.type is new_dtype + assert df2.get_column_by_name(column).null_count == 1 + + +def test_float_int(): + df = constructor_frame({'a': [1, 2, 3], 'b': [3, 4, 5], + 'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11]}) + df2 = df.__dataframe__() + columns = ['a', 'b', 'c', 'd'] + assert df.columns.values.tolist() == columns + for column in columns: + assert df[column].tolist() == df[column].tolist() + if column is 'c': + assert df[column].dtype.type is np.float64 + else: + assert df[column].dtype.type is np.int64 + + assert df2.get_column_by_name(column).null_count == 0 + assert df2.get_column_by_name(column).size == 3 + assert df2.get_column_by_name(column).offset == 0 + + n = np.random.randint(0, 2) + (df[column])[n] = None + assert df[column].dtype.type is np.float64 + assert df2.get_column_by_name(column).null_count == 1 + + +def test_mixed_intfloatbool(): + df = constructor_frame({"x": np.array([True, True, False]), + "y": np.array([1, 2, 0]), + "z": np.array([9.2, 10.5, 11.8])}) + df2 = df.__dataframe__() + columns = ['x', 'y', 'z'] + assert df.columns.values.tolist() == columns + for column in columns: + assert df[column].tolist() == df[column].tolist() + assert df2.get_column_by_name(column).null_count == 0 + assert df2.get_column_by_name(column).size == 3 + assert df2.get_column_by_name(column).offset == 0 + + assert df["x"].dtype.type is np.bool_ + assert df["y"].dtype.type is np.int32 + assert df["z"].dtype.type is np.float64 + + assert df2.get_column_by_name("x")._allow_copy == True + + for column in columns: + n = np.random.randint(0, 2) + (df[column])[n] = None + if column is "x": + assert df[column].dtype.type is np.object_ + else: + assert df[column].dtype.type is np.float64 + assert df2.get_column_by_name(column).null_count == 1 + + +def test_string_dtype(): + df = constructor_frame({"A": ["a", "b", "cdef", "", "g"]}) + df2 = df.__dataframe__() + columns = ['A'] + assert df.columns.values.tolist() == columns + for column in columns: + assert df[column].tolist() == df[column].tolist() + assert df[column].dtype.type is np.object_ + assert df2.get_column_by_name(column).null_count == 0 + + +def test_categorical(): + df = constructor_frame({"year": [2012, 2013, 2015, 2019], "weekday": [0, 1, 4, 6]}) + df = df.categorize("year", min_value=2012, max_value=2019) + df = df.categorize("weekday", labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]) + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name("year") + assert col.describe_categorical == (False, True, {0: 2012, 1: 2013, 2: 2014, 3: 2015, 4: 2016, 5: 2017, 6: 2018, 7: 2019}) + assert col.describe_null == (0, None) + col2 = df.__dataframe__().get_column_by_name("weekday") + assert col2.describe_categorical == (False, True, {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}) + assert col2.describe_null == (0, None) + + +def test_dataframe(): + df = constructor_frame({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}) + df2 = df.__dataframe__() + assert df2._allow_copy == True + assert df2.num_columns() == 3 + assert df2.num_rows() == 3 + assert df2.num_chunks() == 1 + assert df2.column_names() == ["x", "y", "z"] + assert df2.select_columns((0, 2))._df[:, 0].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 0].tolist() + assert df2.select_columns((0, 2))._df[:, 1].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 1].tolist() + + +def test_chunks(): + df = constructor_frame({"x": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}) + df2 = df.__dataframe__() + chunk_iter = iter(df2.get_chunks(3)) + chunk = next(chunk_iter) + assert chunk.num_rows() == 4 + chunk = next(chunk_iter) + assert chunk.num_rows() == 4 + chunk = next(chunk_iter) + assert chunk.num_rows() == 2 + with pytest.raises(StopIteration): + chunk = next(chunk_iter) + + +def test_get_chunks(): + df = constructor_frame({"x": [1]}) + df2 = df.__dataframe__() + assert df2.get_chunks() == 1 From 46830f4c6d1e390e34c0d150b7ddad950220af64 Mon Sep 17 00:00:00 2001 From: Maria Rubtsova Date: Fri, 25 Feb 2022 01:27:12 +0300 Subject: [PATCH 2/5] change tests --- protocol/tests/test_protocol.py | 287 ++++++++++++++++---------------- 1 file changed, 147 insertions(+), 140 deletions(-) diff --git a/protocol/tests/test_protocol.py b/protocol/tests/test_protocol.py index 2235d55f..683bfa75 100644 --- a/protocol/tests/test_protocol.py +++ b/protocol/tests/test_protocol.py @@ -1,142 +1,149 @@ import pytest -import numpy as np -from conftest import * - - -@pytest.mark.parametrize("test_data", - [ - ({'a': [np.array([1, 2, 3]), np.array([4, 5, 6])], - 'b': [np.array([1.5, 2.0, 3.2]), np.array([4.1, 5.7, 6.9])]}, - np.object_, None), - ({'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, np.float64, None), - ({'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]}, np.int64, np.float64) - ], - ids=["array_data", "float_data", "int_data"]) -def test_only_one_data(test_data): - data, dtype, new_dtype = test_data - columns = list(data.keys()) - df = constructor_frame(data) - df2 = df.__dataframe__() - new_dtype = dtype if new_dtype is None else new_dtype - assert df.columns.values.tolist() == columns - val = len(df[columns[0]])-1 - column_size = df.size +import math +import ctypes + + +@pytest.mark.parametrize( + "test_data", + [ + {"a": ["foo", "bar"], "b": ["baz", "qux"]}, + {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, + {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, + ], + ids=["str_data", "float_data", "int_data"], +) +def test_only_one_dtype(test_data, df_from_dict): + columns = list(test_data.keys()) + df = df_from_dict(test_data) + dfX = df.__dataframe__() + + column_size = len(test_data[columns[0]]) for column in columns: - assert df[column].tolist() == df[column].tolist() - assert df[column].dtype.type is dtype - assert df2.get_column_by_name(column).null_count == 0 - assert df2.get_column_by_name(column).size == column_size - assert df2.get_column_by_name(column).offset == 0 - assert not df2["x"].is_masked - n = np.random.randint(0, val) - (df[column])[n] = None - assert df[column].dtype.type is new_dtype - assert df2.get_column_by_name(column).null_count == 1 - - -def test_float_int(): - df = constructor_frame({'a': [1, 2, 3], 'b': [3, 4, 5], - 'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11]}) - df2 = df.__dataframe__() - columns = ['a', 'b', 'c', 'd'] - assert df.columns.values.tolist() == columns - for column in columns: - assert df[column].tolist() == df[column].tolist() - if column is 'c': - assert df[column].dtype.type is np.float64 - else: - assert df[column].dtype.type is np.int64 - - assert df2.get_column_by_name(column).null_count == 0 - assert df2.get_column_by_name(column).size == 3 - assert df2.get_column_by_name(column).offset == 0 - - n = np.random.randint(0, 2) - (df[column])[n] = None - assert df[column].dtype.type is np.float64 - assert df2.get_column_by_name(column).null_count == 1 - - -def test_mixed_intfloatbool(): - df = constructor_frame({"x": np.array([True, True, False]), - "y": np.array([1, 2, 0]), - "z": np.array([9.2, 10.5, 11.8])}) - df2 = df.__dataframe__() - columns = ['x', 'y', 'z'] - assert df.columns.values.tolist() == columns - for column in columns: - assert df[column].tolist() == df[column].tolist() - assert df2.get_column_by_name(column).null_count == 0 - assert df2.get_column_by_name(column).size == 3 - assert df2.get_column_by_name(column).offset == 0 - - assert df["x"].dtype.type is np.bool_ - assert df["y"].dtype.type is np.int32 - assert df["z"].dtype.type is np.float64 - - assert df2.get_column_by_name("x")._allow_copy == True - - for column in columns: - n = np.random.randint(0, 2) - (df[column])[n] = None - if column is "x": - assert df[column].dtype.type is np.object_ - else: - assert df[column].dtype.type is np.float64 - assert df2.get_column_by_name(column).null_count == 1 - - -def test_string_dtype(): - df = constructor_frame({"A": ["a", "b", "cdef", "", "g"]}) - df2 = df.__dataframe__() - columns = ['A'] - assert df.columns.values.tolist() == columns - for column in columns: - assert df[column].tolist() == df[column].tolist() - assert df[column].dtype.type is np.object_ - assert df2.get_column_by_name(column).null_count == 0 - - -def test_categorical(): - df = constructor_frame({"year": [2012, 2013, 2015, 2019], "weekday": [0, 1, 4, 6]}) - df = df.categorize("year", min_value=2012, max_value=2019) - df = df.categorize("weekday", labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]) - # Some detailed testing for correctness of dtype and null handling: - col = df.__dataframe__().get_column_by_name("year") - assert col.describe_categorical == (False, True, {0: 2012, 1: 2013, 2: 2014, 3: 2015, 4: 2016, 5: 2017, 6: 2018, 7: 2019}) - assert col.describe_null == (0, None) - col2 = df.__dataframe__().get_column_by_name("weekday") - assert col2.describe_categorical == (False, True, {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}) - assert col2.describe_null == (0, None) - - -def test_dataframe(): - df = constructor_frame({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}) - df2 = df.__dataframe__() - assert df2._allow_copy == True - assert df2.num_columns() == 3 - assert df2.num_rows() == 3 - assert df2.num_chunks() == 1 - assert df2.column_names() == ["x", "y", "z"] - assert df2.select_columns((0, 2))._df[:, 0].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 0].tolist() - assert df2.select_columns((0, 2))._df[:, 1].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 1].tolist() - - -def test_chunks(): - df = constructor_frame({"x": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}) - df2 = df.__dataframe__() - chunk_iter = iter(df2.get_chunks(3)) - chunk = next(chunk_iter) - assert chunk.num_rows() == 4 - chunk = next(chunk_iter) - assert chunk.num_rows() == 4 - chunk = next(chunk_iter) - assert chunk.num_rows() == 2 - with pytest.raises(StopIteration): - chunk = next(chunk_iter) - - -def test_get_chunks(): - df = constructor_frame({"x": [1]}) - df2 = df.__dataframe__() - assert df2.get_chunks() == 1 + assert dfX.get_column_by_name(column).null_count == 0 + assert dfX.get_column_by_name(column).size == column_size + assert dfX.get_column_by_name(column).offset == 0 + + +def test_float_int(df_from_dict): + df = df_from_dict( + { + "a": [1, 2, 3], + "b": [3, 4, 5], + "c": [1.5, 2.5, 3.5], + "d": [9, 10, 11], + "e": [True, False, True], + "f": ["a", "", "c"], + } + ) + dfX = df.__dataframe__() + columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} + + for column, kind in columns.items(): + colX = dfX.get_column_by_name(column) + assert colX.null_count == 0 + assert colX.size == 3 + assert colX.offset == 0 + + assert colX.dtype[0] == kind + + assert dfX.get_column_by_name("c").dtype[1] == 64 + + +def test_na_float(df_from_dict): + df = df_from_dict({"a": [1.0, math.nan, 2.0]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name("a") + assert colX.null_count == 1 + + +def test_noncategorical(df_from_dict): + df = df_from_dict({"a": [1, 2, 3]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name("a") + with pytest.raises(TypeError): + colX.describe_categorical + + +def test_categorical(df_from_dict): + df = df_from_dict( + {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, + is_categorical=True, + ) + + colX = df.__dataframe__().get_column_by_name("weekday") + is_ordered, is_dictionary, _ = colX.describe_categorical + assert isinstance(is_ordered, bool) + assert isinstance(is_dictionary, bool) + + +def test_dataframe(df_from_dict): + df = df_from_dict( + {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]} + ) + dfX = df.__dataframe__() + + assert dfX.num_columns() == 3 + assert dfX.num_rows() == 3 + assert dfX.num_chunks() == 1 + assert dfX.column_names() == ["x", "y", "z"] + assert ( + dfX.select_columns((0, 2)).column_names() + == dfX.select_columns_by_name(("x", "z")).column_names() + ) + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_df_get_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.num_rows() for chunk in chunks) == size + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_column_get_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_column(0).get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.size for chunk in chunks) == size + + +def test_get_columns(df_from_dict): + df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]}) + dfX = df.__dataframe__() + for colX in dfX.get_columns(): + assert colX.size == 2 + assert colX.num_chunks() == 1 + assert dfX.get_column(0).dtype[0] == 0 + assert dfX.get_column(1).dtype[0] == 2 + + +def test_buffer(df_from_dict): + arr = [0, 1, -1] + df = df_from_dict({"a": arr}) + dfX = df.__dataframe__() + colX = dfX.get_column(0) + bufX = colX.get_buffers() + + dataBuf, dataDtype = bufX["data"] + + assert dataBuf.bufsize > 0 + assert dataBuf.ptr != 0 + device, _ = dataBuf.__dlpack_device__ + + assert dataDtype[0] == 0 + + if device == 1: # CPU-only as we're going to directly read memory here + bitwidth = dataDtype[1] + ctype = { + 8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64, + }[bitwidth] + + for idx, truth in enumerate(arr): + val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value + assert val == truth, f"Buffer at index {idx} mismatch" From 87231e0ca1e168a7829315edd5a0544d89839788 Mon Sep 17 00:00:00 2001 From: Maria Rubtsova Date: Sat, 26 Feb 2022 00:36:34 +0300 Subject: [PATCH 3/5] change conftest --- protocol/tests/conftest.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/protocol/tests/conftest.py b/protocol/tests/conftest.py index 452df49e..36fca8c2 100644 --- a/protocol/tests/conftest.py +++ b/protocol/tests/conftest.py @@ -1,7 +1,20 @@ import pytest import pandas as pd +from pandas.api.exchange.implementation import _from_dataframe -@pytest.fixture -def constructor_frame(data): - return pd.DataFrame(data) +@pytest.fixture(scope="package") +def df_from_dict(): + def maker(dct, is_categorical=False): + df = pd.DataFrame(dct) + return df.astype("category") if is_categorical else df + + return maker + + +@pytest.fixture(scope="package") +def df_from_xchg(): + def maker(xchg): + return _from_dataframe(xchg) + + return maker \ No newline at end of file From 9c3f3526134ca3ce28ac144eb5321f4f5614054e Mon Sep 17 00:00:00 2001 From: Maria Rubtsova Date: Mon, 28 Feb 2022 17:57:50 +0300 Subject: [PATCH 4/5] change tests & conftest --- protocol/tests/conftest.py | 9 --------- protocol/tests/test_protocol.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/protocol/tests/conftest.py b/protocol/tests/conftest.py index 36fca8c2..a77839b2 100644 --- a/protocol/tests/conftest.py +++ b/protocol/tests/conftest.py @@ -1,6 +1,5 @@ import pytest import pandas as pd -from pandas.api.exchange.implementation import _from_dataframe @pytest.fixture(scope="package") @@ -10,11 +9,3 @@ def maker(dct, is_categorical=False): return df.astype("category") if is_categorical else df return maker - - -@pytest.fixture(scope="package") -def df_from_xchg(): - def maker(xchg): - return _from_dataframe(xchg) - - return maker \ No newline at end of file diff --git a/protocol/tests/test_protocol.py b/protocol/tests/test_protocol.py index 683bfa75..ea700c15 100644 --- a/protocol/tests/test_protocol.py +++ b/protocol/tests/test_protocol.py @@ -36,7 +36,7 @@ def test_float_int(df_from_dict): } ) dfX = df.__dataframe__() - columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} + columns = [INT, INT, FLOAT, INT, BOOL, STRING] for column, kind in columns.items(): colX = dfX.get_column_by_name(column) @@ -46,8 +46,6 @@ def test_float_int(df_from_dict): assert colX.dtype[0] == kind - assert dfX.get_column_by_name("c").dtype[1] == 64 - def test_na_float(df_from_dict): df = df_from_dict({"a": [1.0, math.nan, 2.0]}) @@ -85,10 +83,10 @@ def test_dataframe(df_from_dict): assert dfX.num_columns() == 3 assert dfX.num_rows() == 3 assert dfX.num_chunks() == 1 - assert dfX.column_names() == ["x", "y", "z"] + assert list(dfX.column_names()) == ["x", "y", "z"] assert ( - dfX.select_columns((0, 2)).column_names() - == dfX.select_columns_by_name(("x", "z")).column_names() + list(dfX.select_columns((0, 2)).column_names()) + == list(dfX.select_columns_by_name(("x", "z")).column_names()) ) @@ -116,8 +114,8 @@ def test_get_columns(df_from_dict): for colX in dfX.get_columns(): assert colX.size == 2 assert colX.num_chunks() == 1 - assert dfX.get_column(0).dtype[0] == 0 - assert dfX.get_column(1).dtype[0] == 2 + assert dfX.get_column(0).dtype[0] == INT + assert dfX.get_column(1).dtype[0] == FLOAT def test_buffer(df_from_dict): From 90a659860747b5c61e8e130ecd453c9dd8a09782 Mon Sep 17 00:00:00 2001 From: Maria Rubtsova Date: Tue, 1 Mar 2022 19:57:06 +0300 Subject: [PATCH 5/5] change tests --- protocol/tests/test_protocol.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protocol/tests/test_protocol.py b/protocol/tests/test_protocol.py index ea700c15..53f6c171 100644 --- a/protocol/tests/test_protocol.py +++ b/protocol/tests/test_protocol.py @@ -36,7 +36,7 @@ def test_float_int(df_from_dict): } ) dfX = df.__dataframe__() - columns = [INT, INT, FLOAT, INT, BOOL, STRING] + columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} for column, kind in columns.items(): colX = dfX.get_column_by_name(column) @@ -114,8 +114,8 @@ def test_get_columns(df_from_dict): for colX in dfX.get_columns(): assert colX.size == 2 assert colX.num_chunks() == 1 - assert dfX.get_column(0).dtype[0] == INT - assert dfX.get_column(1).dtype[0] == FLOAT + assert dfX.get_column(0).dtype[0] == 0 + assert dfX.get_column(1).dtype[0] == 2 def test_buffer(df_from_dict):