Skip to content

Commit aab6137

Browse files
authored
First pass at adding testing for pylibcudf (#15300)
This PR adds tests of the `pylibcudf.copying` module along with establishing the infrastructure and best practices for writing pylibcudf tests going forward (and adding associated documentation). Resolves #15133 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Ashwin Srinath (https://github.com/shwina) - Jake Awe (https://github.com/AyodeAwe) - https://github.com/brandon-b-miller URL: #15300
1 parent 7c69e66 commit aab6137

File tree

21 files changed

+1254
-54
lines changed

21 files changed

+1254
-54
lines changed

ci/test_python_cudf.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ EXITCODE=0
1414
trap "EXITCODE=1" ERR
1515
set +e
1616

17+
rapids-logger "pytest pylibcudf"
18+
pushd python/cudf/cudf/pylibcudf_tests
19+
python -m pytest \
20+
--cache-clear \
21+
--dist=worksteal \
22+
.
23+
popd
24+
1725
rapids-logger "pytest cudf"
1826
./ci/run_cudf_pytests.sh \
1927
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \

ci/test_wheel_cudf.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@ if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then
1818
rapids-logger "Run smoke tests for cudf"
1919
python ./ci/wheel_smoke_test_cudf.py
2020
else
21+
rapids-logger "pytest pylibcudf"
22+
pushd python/cudf/cudf/pylibcudf_tests
23+
python -m pytest \
24+
--cache-clear \
25+
--dist=worksteal \
26+
.
27+
popd
28+
2129
rapids-logger "pytest cudf"
2230
pushd python/cudf/cudf/tests
2331
python -m pytest \

cpp/include/cudf/copying.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,8 @@ std::unique_ptr<column> empty_like(scalar const& input);
253253
* If the `mask_alloc` allocates a validity mask that mask is also uninitialized
254254
* and the validity bits and the null count should be set by the caller.
255255
*
256+
* @throws cudf::data_type_error if input type is not of fixed width.
257+
*
256258
* @param input Immutable view of input column to emulate
257259
* @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN
258260
* @param mr Device memory resource used to allocate the returned column's device memory
@@ -360,6 +362,7 @@ void copy_range_in_place(column_view const& source,
360362
*
361363
* @throws std::out_of_range for any invalid range.
362364
* @throws cudf::data_type_error if @p target and @p source have different types.
365+
* @throws cudf::data_type_error if the data type is not fixed width, string, or dictionary
363366
*
364367
* @param source The column to copy from inside the range
365368
* @param target The column to copy from outside the range

cpp/src/copying/copy.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
2+
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -122,7 +122,8 @@ std::unique_ptr<column> allocate_like(column_view const& input,
122122
rmm::mr::device_memory_resource* mr)
123123
{
124124
CUDF_FUNC_RANGE();
125-
CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column");
125+
CUDF_EXPECTS(
126+
is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error);
126127
mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable());
127128

128129
return std::make_unique<column>(input.type(),

cpp/src/copying/copy_range.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ struct out_of_place_copy_range_dispatch {
119119
std::enable_if_t<not cudf::is_rep_layout_compatible<T>(), std::unique_ptr<cudf::column>>
120120
operator()(Args...)
121121
{
122-
CUDF_FAIL("Unsupported type for out of place copy.");
122+
CUDF_FAIL("Unsupported type for out of place copy.", cudf::data_type_error);
123123
}
124124
};
125125

cpp/src/copying/scatter.cu

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,9 @@ struct column_scalar_scatterer_impl<string_view, MapIterator> {
144144
rmm::cuda_stream_view stream,
145145
rmm::mr::device_memory_resource* mr) const
146146
{
147-
CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match");
147+
CUDF_EXPECTS(source.get().type() == target.type(),
148+
"scalar and column types must match",
149+
cudf::data_type_error);
148150

149151
auto const scalar_impl = static_cast<string_scalar const*>(&source.get());
150152
auto const source_view = string_view(scalar_impl->data(), scalar_impl->size());
@@ -166,6 +168,9 @@ struct column_scalar_scatterer_impl<list_view, MapIterator> {
166168
rmm::cuda_stream_view stream,
167169
rmm::mr::device_memory_resource* mr) const
168170
{
171+
CUDF_EXPECTS(source.get().type() == target.type(),
172+
"scalar and column types must match",
173+
cudf::data_type_error);
169174
auto result =
170175
lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr);
171176

@@ -249,6 +254,10 @@ struct column_scalar_scatterer_impl<struct_view, MapIterator> {
249254
rmm::cuda_stream_view stream,
250255
rmm::mr::device_memory_resource* mr) const
251256
{
257+
CUDF_EXPECTS(source.get().type() == target.type(),
258+
"scalar and column types must match",
259+
cudf::data_type_error);
260+
252261
// For each field of `source`, copy construct a scalar from the field
253262
// and dispatch to the corresponding scalar scatterer
254263

docs/cudf/source/developer_guide/pylibcudf.md

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,72 @@ There are a couple of notable points from the snippet above:
9696
- The object returned from libcudf is immediately converted to a pylibcudf type.
9797
- `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter. `OutOfBoundsPolicy` is an alias for this type in pylibcudf that matches our Python naming conventions (CapsCase instead of snake\_case).
9898

99+
## Testing
100+
101+
When writing pylibcudf tests, it is important to remember that all the APIs should be tested in the C++ layer in libcudf already.
102+
The primary purpose of pylibcudf tests is to ensure the correctness of the _bindings_; the correctness of the underlying implementation should generally be validated in libcudf.
103+
If pylibcudf tests uncover a libcudf bug, a suitable libcudf test should be added to cover this case rather than relying solely on pylibcudf testing.
104+
105+
pylibcudf's ``conftest.py`` contains some standard parametrized dtype fixture lists that may in turn be used to parametrize other fixtures.
106+
Fixtures allocating data should leverage these dtype lists wherever possible to simplify testing across the matrix of important types.
107+
Where appropriate, new fixture lists may be added.
108+
109+
To run tests as efficiently as possible, the test suite should make generous use of fixtures.
110+
The simplest general structure to follow is for pyarrow array/table/scalar fixtures to be parametrized by one of the dtype lists.
111+
Then, a corresponding pylibcudf fixture may be created using a simple `from_arrow` call.
112+
This approach ensures consistent global coverage across types for various tests.
113+
114+
In general, pylibcudf tests should prefer validating against a corresponding pyarrow implementation rather than hardcoding data.
115+
This approach is more resilient to changes to input data, particularly given the fixture strategy outlined above.
116+
Standard tools for comparing between pylibcudf and pyarrow types are provided in the utils module.
117+
118+
Here is an example demonstrating the above points:
119+
120+
```python
121+
import pyarrow as pa
122+
import pyarrow.compute as pc
123+
import pytest
124+
from cudf._lib import pylibcudf as plc
125+
from utils import assert_column_eq
126+
127+
# The pa_dtype fixture is defined in conftest.py.
128+
@pytest.fixture(scope="module")
129+
def pa_column(pa_dtype):
130+
return pa.array([1, 2, 3])
131+
132+
133+
@pytest.fixture(scope="module")
134+
def column(pa_column):
135+
return plc.interop.from_arrow(pa_column)
136+
137+
138+
def test_foo(pa_column, column):
139+
index = 1
140+
result = plc.foo(column)
141+
expected = pa.foo(pa_column)
142+
143+
assert_column_eq(result, expected)
144+
```
145+
146+
Some guidelines on what should be tested:
147+
- Tests SHOULD comprehensively cover the API, including all possible combinations of arguments required to ensure good test coverage.
148+
- pylibcudf SHOULD NOT attempt to stress test large data sizes, and SHOULD instead defer to libcudf tests.
149+
- Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead.
150+
- Nullable data should always be tested.
151+
- Expected exceptions should be tested. Tests should be written with the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated.
152+
- Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). If so, this exception will also need to be mapped to a suitable Python exception in [`exception_handler.pxd`](https://github.com/rapidsai/cudf/blob/branch-24.04/python/cudf/cudf/_lib/exception_handler.pxd).
153+
154+
Some guidelines on how best to use pytest:
155+
- By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance.
156+
- Where necessary, mutable fixtures should be named as such (e.g. `mutable_col`) and be of function scope. If possible, they can be implemented as simply making a copy of a corresponding module-scope immutable fixture to avoid duplicating the generation logic.
157+
158+
Tests should be organized corresponding to pylibcudf modules, i.e. one test module for each pylibcudf module.
159+
160+
The following sections of the cuDF Python testing guide also generally apply to pylibcudf unless superseded by any statements above:
161+
- [](#test_parametrization)
162+
- [](#xfailing_tests)
163+
- [](#testing_warnings)
164+
99165
## Miscellaneous Notes
100166

101167
### Cython Scoped Enums

docs/cudf/source/developer_guide/testing.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ Typically, exception cases require specific assertions or other special logic, s
5555
The main exception to this rule is tests based on comparison to pandas.
5656
Such tests may test exceptional cases alongside more typical cases since the logic is generally identical.
5757

58+
(test_parametrization)=
59+
5860
### Parametrization: custom fixtures and `pytest.mark.parametrize`
5961

6062
When it comes to parametrizing tests written with `pytest`,
@@ -140,6 +142,8 @@ def test_odds():
140142

141143
Other approaches are also possible, and the best solution should be discussed on a case-by-case basis during PR review.
142144

145+
(xfailing_tests)=
146+
143147
### Tests with expected failures (`xfail`s)
144148

145149
In some circumstances it makes sense to mark a test as _expected_ to
@@ -218,6 +222,8 @@ This way, when the bug is fixed, the test suite will fail at this
218222
point (and we will remember to update the test).
219223

220224

225+
(testing_warnings)=
226+
221227
### Testing code that throws warnings
222228

223229
Some code may be expected to throw warnings.

python/cudf/cudf/_lib/cpp/copying.pxd

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
1+
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
22

33
from libc.stdint cimport int32_t, int64_t, uint8_t
44
from libcpp cimport bool
@@ -33,19 +33,19 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
3333
const column_view& input,
3434
size_type offset,
3535
const scalar& fill_values
36-
) except +
36+
) except +cudf_exception_handler
3737

3838
cdef unique_ptr[table] scatter (
3939
const table_view& source_table,
4040
const column_view& scatter_map,
4141
const table_view& target_table,
42-
) except +
42+
) except +cudf_exception_handler
4343

4444
cdef unique_ptr[table] scatter (
4545
const vector[reference_wrapper[constscalar]]& source_scalars,
4646
const column_view& indices,
4747
const table_view& target,
48-
) except +
48+
) except +cudf_exception_handler
4949

5050
cpdef enum class mask_allocation_policy(int32_t):
5151
NEVER
@@ -54,99 +54,99 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
5454

5555
cdef unique_ptr[column] empty_like (
5656
const column_view& input_column
57-
) except +
57+
) except +cudf_exception_handler
5858

5959
cdef unique_ptr[column] allocate_like (
6060
const column_view& input_column,
6161
mask_allocation_policy policy
62-
) except +
62+
) except +cudf_exception_handler
6363

6464
cdef unique_ptr[column] allocate_like (
6565
const column_view& input_column,
6666
size_type size,
6767
mask_allocation_policy policy
68-
) except +
68+
) except +cudf_exception_handler
6969

7070
cdef unique_ptr[table] empty_like (
7171
const table_view& input_table
72-
) except +
72+
) except +cudf_exception_handler
7373

7474
cdef void copy_range_in_place (
7575
const column_view& input_column,
7676
mutable_column_view& target_column,
7777
size_type input_begin,
7878
size_type input_end,
7979
size_type target_begin
80-
) except +
80+
) except +cudf_exception_handler
8181

8282
cdef unique_ptr[column] copy_range (
8383
const column_view& input_column,
8484
const column_view& target_column,
8585
size_type input_begin,
8686
size_type input_end,
8787
size_type target_begin
88-
) except +
88+
) except +cudf_exception_handler
8989

9090
cdef vector[column_view] slice (
9191
const column_view& input_column,
9292
vector[size_type] indices
93-
) except +
93+
) except +cudf_exception_handler
9494

9595
cdef vector[table_view] slice (
9696
const table_view& input_table,
9797
vector[size_type] indices
98-
) except +
98+
) except +cudf_exception_handler
9999

100100
cdef vector[column_view] split (
101101
const column_view& input_column,
102102
vector[size_type] splits
103-
) except +
103+
) except +cudf_exception_handler
104104

105105
cdef vector[table_view] split (
106106
const table_view& input_table,
107107
vector[size_type] splits
108-
) except +
108+
) except +cudf_exception_handler
109109

110110
cdef unique_ptr[column] copy_if_else (
111111
const column_view& lhs,
112112
const column_view& rhs,
113113
const column_view& boolean_mask
114-
) except +
114+
) except +cudf_exception_handler
115115

116116
cdef unique_ptr[column] copy_if_else (
117117
const scalar& lhs,
118118
const column_view& rhs,
119119
const column_view& boolean_mask
120-
) except +
120+
) except +cudf_exception_handler
121121

122122
cdef unique_ptr[column] copy_if_else (
123123
const column_view& lhs,
124124
const scalar& rhs,
125125
const column_view boolean_mask
126-
) except +
126+
) except +cudf_exception_handler
127127

128128
cdef unique_ptr[column] copy_if_else (
129129
const scalar& lhs,
130130
const scalar& rhs,
131131
const column_view boolean_mask
132-
) except +
132+
) except +cudf_exception_handler
133133

134134
cdef unique_ptr[table] boolean_mask_scatter (
135135
const table_view& input,
136136
const table_view& target,
137137
const column_view& boolean_mask
138-
) except +
138+
) except +cudf_exception_handler
139139

140140
cdef unique_ptr[table] boolean_mask_scatter (
141141
const vector[reference_wrapper[constscalar]]& input,
142142
const table_view& target,
143143
const column_view& boolean_mask
144-
) except +
144+
) except +cudf_exception_handler
145145

146146
cdef unique_ptr[scalar] get_element (
147147
const column_view& input,
148148
size_type index
149-
) except +
149+
) except +cudf_exception_handler
150150

151151
cpdef enum class sample_with_replacement(bool):
152152
FALSE

python/cudf/cudf/_lib/pylibcudf/column.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ cdef class Column:
4343
cpdef gpumemoryview data(self)
4444
cpdef gpumemoryview null_mask(self)
4545
cpdef list children(self)
46+
cpdef Column copy(self)
4647

4748
cpdef ListColumnView list_view(self)
4849

0 commit comments

Comments
 (0)