rapidsai · davidwendt · Jul 17, 2025 · Jul 17, 2025 · Jul 17, 2025 · Jul 17, 2025
@@ -82,7 +82,7 @@ jobs:
       arch: "amd64"
       branch: ${{ inputs.branch }}
       build_type: ${{ inputs.build_type || 'branch' }}
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       date: ${{ inputs.date }}
       node_type: "gpu-l4-latest-1"
       script: "ci/build_docs.sh"

@@ -197,7 +197,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: "ci/test_java.sh"
   conda-notebook-tests:
     needs: [conda-python-build, changed-files]
@@ -208,7 +208,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: "ci/test_notebooks.sh"
   docs-build:
     needs: conda-python-build
@@ -218,7 +218,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: "ci/build_docs.sh"
   wheel-build-libcudf:
     needs: checks
@@ -378,7 +378,7 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: ci/test_narwhals.sh
   spark-rapids-jni:
     needs: changed-files

@@ -49,7 +49,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: "ci/test_cpp_memcheck.sh"
   cpp-linters:
     secrets: inherit
@@ -90,7 +90,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
@@ -102,7 +102,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
@@ -170,5 +170,5 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:25.10-latest"
       script: ci/test_narwhals.sh
@@ -114,7 +114,7 @@ function buildAll {
 }
 
 function buildLibCudfJniInDocker {
-    local cudaVersion="11.8.0"
+    local cudaVersion="12.9.1"
     local imageName="cudf-build:${cudaVersion}-devel-rocky8"
     local CMAKE_GENERATOR="${CMAKE_GENERATOR:-Ninja}"
     local workspaceDir="/rapids"

@@ -63,7 +63,7 @@ DEPENDENCIES=(
 )
 for DEP in "${DEPENDENCIES[@]}"; do
   for FILE in dependencies.yaml conda/environments/*.yaml python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml; do
-    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}\(\[.*\]\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
   for FILE in python/*/pyproject.toml; do
     sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
@@ -83,6 +83,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_
 for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do  # every workflow file, both extensions
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"  # on shared-workflows lines, replace everything from "@" onward with the next branch ref
   sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"  # retarget dask-cuda.git branch refs
+  sed_runner "s/:[0-9]*\\.[0-9]*-/:${NEXT_SHORT_TAG}-/g" "${FILE}"  # NOTE(review): rewrites every ":<digits>.<digits>-" in the file (presumably versioned image tags such as ":25.10-latest"); both digit runs may be empty, so confirm nothing else in the workflows matches
 done
 sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh
 sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_cudf_polars_polars_tests.sh

@@ -54,7 +54,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.16.0,<0.17.0a0
+- numba-cuda>=0.14.0,<0.15.0a0
 - numba>=0.59.1,<0.62.0a0
 - numpy>=1.23,<3.0a0
 - numpydoc
@@ -69,6 +69,7 @@ dependencies:
 - pre-commit
 - pyarrow>=14.0.0,<20.0.0a0
 - pydata-sphinx-theme>=0.15.4
+- pynvjitlink>=0.0.0a0
 - pynvml>=12.0.0,<13.0.0a0
 - pytest
 - pytest-benchmark

@@ -55,7 +55,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.16.0,<0.17.0a0
+- numba-cuda>=0.14.0,<0.15.0a0
 - numba>=0.59.1,<0.62.0a0
 - numpy>=1.23,<3.0a0
 - numpydoc
@@ -70,6 +70,7 @@ dependencies:
 - pre-commit
 - pyarrow>=14.0.0,<20.0.0a0
 - pydata-sphinx-theme>=0.15.4
+- pynvjitlink>=0.0.0a0
 - pynvml>=12.0.0,<13.0.0a0
 - pytest
 - pytest-benchmark

@@ -69,7 +69,7 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.4.0dev0
     - cupy >=12.0.0
-    - numba-cuda >=0.16.0,<0.17.0a0
+    - numba-cuda >=0.14.0,<0.15.0a0
     - numba >=0.59.1,<0.62.0a0
     - numpy >=1.23,<3.0a0
     - pyarrow>=14.0.0,<20.0.0a0
@@ -78,7 +78,13 @@ requirements:
     - ${{ pin_compatible("rmm", upper_bound="x.x") }}
     - fsspec >=0.6.0
     - cuda-cudart
+    # Needed by Numba for CUDA support
+    - cuda-nvcc-impl
+    # TODO: Add nvjitlink here
+    # xref: https://github.com/rapidsai/cudf/issues/12822
+    - cuda-nvrtc
     - cuda-python >=12.6.2,<13.0a0
+    - pynvjitlink
     - if: linux and x86_64
       then:
         - libcufile

@@ -692,6 +692,7 @@ add_library(
   src/sort/stable_segmented_sort.cu
   src/sort/stable_sort_column.cu
   src/sort/stable_sort.cu
+  src/sort/top_k.cu
   src/stream_compaction/apply_boolean_mask.cu
   src/stream_compaction/distinct.cu
   src/stream_compaction/distinct_count.cu
@@ -756,6 +757,7 @@ add_library(
   src/strings/slice.cu
   src/strings/split/partition.cu
   src/strings/split/split.cu
+  src/strings/split/split_part.cu
   src/strings/split/split_re.cu
   src/strings/split/split_record.cu
   src/strings/strings_column_factories.cu

@@ -206,7 +206,7 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains_scalar.cpp search/contains_table
 # * sort benchmark --------------------------------------------------------------------------------
 ConfigureNVBench(
   SORT_NVBENCH sort/rank.cpp sort/rank_lists.cpp sort/rank_structs.cpp sort/segmented_sort.cpp
-  sort/sort.cpp sort/sort_lists.cpp sort/sort_strings.cpp sort/sort_structs.cpp
+  sort/sort.cpp sort/sort_lists.cpp sort/sort_strings.cpp sort/sort_structs.cpp sort/top_k.cpp
 )
 
 # ##################################################################################################

@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+/**
+ * @brief Benchmarks cudf::top_k and cudf::top_k_order on a randomly generated column
+ *
+ * The following axes are read from `state`:
+ *  - "ordered": non-zero runs cudf::top_k_order, zero runs cudf::top_k
+ *  - "num_rows": number of rows in the generated input column
+ *  - "k": number of results requested
+ *  - "nulls": null probability given to the data profile
+ *
+ * @tparam DataType Element type of the generated input column
+ * @param state nvbench state that supplies the axis values and records the measurements
+ */
+template <typename DataType>
+static void bench_top_k(nvbench::state& state, nvbench::type_list<DataType>)
+{
+  auto const ordered   = static_cast<bool>(state.get_int64("ordered"));
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const k         = static_cast<cudf::size_type>(state.get_int64("k"));
+  auto const nulls     = state.get_float64("nulls");
+  auto const data_type = cudf::type_to_id<DataType>();
+
+  data_profile const profile =
+    data_profile_builder().cardinality(0).null_probability(nulls).distribution(
+      data_type, distribution_id::UNIFORM, 100, 10'000);
+  auto input = create_random_column(data_type, row_count{num_rows}, profile);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.add_global_memory_reads<nvbench::int8_t>(input->alloc_size());
+  // Report the output size of the API that is actually measured: top_k_order returns
+  // k indices (accounted as 32-bit elements, as before) while top_k returns k values
+  // of DataType, which is not 4 bytes wide for every benchmarked type
+  // (e.g. cudf::timestamp_s).
+  if (ordered) {
+    state.add_global_memory_writes<nvbench::int32_t>(k);
+  } else {
+    state.add_global_memory_writes<DataType>(k);
+  }
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    if (ordered) {
+      cudf::top_k_order(input->view(), k);
+    } else {
+      cudf::top_k(input->view(), k);
+    }
+  });
+}
+
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "time_s", "time_s");  // gives cudf::timestamp_s the "time_s" label for the type axis
+
+using Types = nvbench::type_list<int32_t, float, cudf::timestamp_s>;  // element types bench_top_k is instantiated with
+
+NVBENCH_BENCH_TYPES(bench_top_k, NVBENCH_TYPE_AXES(Types))
+  .set_name("top_k")
+  .add_float64_axis("nulls", {0, 0.1})  // null probability handed to the data profile in bench_top_k
+  .add_int64_axis("num_rows", {262144, 2097152, 16777216, 67108864})  // rows in the generated input column
+  .add_int64_axis("k", {100, 1000})  // number of top values/indices requested
+  .add_int64_axis("ordered", {0, 1});  // 0 runs cudf::top_k, 1 runs cudf::top_k_order
@@ -53,9 +53,15 @@ static void bench_split(nvbench::state& state)
   } else if (stype == "record") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::split_record(input, target); });
-  } else {
+  } else if (stype == "record_ws") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::split_record(input); });
+  } else if (stype == "part") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::split_part(input, target); });
+  } else {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::split_part(input); });
   }
 }
 
@@ -64,4 +70,4 @@ NVBENCH_BENCH(bench_split)
   .add_int64_axis("min_width", {0})
   .add_int64_axis("max_width", {32, 64, 128, 256})
   .add_int64_axis("num_rows", {32768, 262144, 2097152})
-  .add_string_axis("type", {"split", "split_ws", "record", "record_ws"});
+  .add_string_axis("type", {"split", "split_ws", "record", "record_ws", "part", "part_ws"});
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -344,5 +344,51 @@ std::unique_ptr<table> stable_segmented_sort_by_key(
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Computes the top k values of a column
+ *
+ * The result holds the same values as sorting `col` by `sort_order` and slicing off
+ * the first k elements. However, the returned column is not guaranteed to be sorted.
+ *
+ * @throw std::invalid_argument if k is greater than the number of rows in the column
+ *
+ * @param col Column to compute top k
+ * @param k Number of values to return
+ * @param sort_order The desired sort order for the top k values.
+ *                   Default is high to low (`order::DESCENDING`).
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return A column with the top k values of the input column.
+ */
+std::unique_ptr<column> top_k(
+  column_view const& col,
+  size_type k,
+  order sort_order                  = order::DESCENDING,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Computes the indices of the top k values of a column
+ *
+ * The returned indices identify the top k elements of `col`, but they are not
+ * guaranteed to list those elements in sorted order.
+ *
+ * @throw std::invalid_argument if k is greater than the number of rows in the column
+ *
+ * @param col Column to compute top k
+ * @param k Number of values to return
+ * @param sort_order The desired sort order for the top k values.
+ *                   Default is high to low (`order::DESCENDING`).
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Indices of the top k values of the input column
+ */
+std::unique_ptr<column> top_k_order(
+  column_view const& col,
+  size_type k,
+  order sort_order                  = order::DESCENDING,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -241,6 +241,29 @@ std::unique_ptr<column> rsplit_record(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns a column of strings by splitting each input string using the
+ * specified delimiter and returning the string at the specified index
+ *
+ * Any null rows in the input return corresponding null output rows.
+ * A null row is also returned if the number of tokens computed by splitting
+ * the string for that row is less than the `index`.
+ *
+ * NOTE(review): `index` is 0-based, so a row that splits into exactly `index`
+ * tokens presumably has no token at that position either -- confirm whether the
+ * condition above should read "less than or equal to".
+ *
+ * @param input Strings instance for this operation
+ * @param delimiter UTF-8 encoded string indicating the split points in each string;
+ *        Default of empty string indicates split on whitespace
+ * @param index The 0-based index of the string to return from the split;
+ *        Default is 0
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column of strings
+ */
+std::unique_ptr<column> split_part(
+  strings_column_view const& input,
+  string_scalar const& delimiter    = string_scalar(""),
+  size_type index                   = 0,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace CUDF_EXPORT cudf
@@ -473,8 +473,8 @@ struct mask_tform {
 
 }  // anonymous namespace
 
-uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_span<PageInfo const> pages,
-                                       rmm::cuda_stream_view stream)
+uint32_t get_aggregated_decode_kernel_mask(cudf::detail::hostdevice_span<PageInfo const> pages,
+                                           rmm::cuda_stream_view stream)
 {
   // determine which kernels to invoke
   auto mask_iter = thrust::make_transform_iterator(pages.device_begin(), mask_tform{});

@@ -726,8 +726,8 @@ void build_string_dictionary_index(ColumnChunkDesc* chunks,
  * @param[in] stream CUDA stream to use
  * @return Bitwise OR of all page `kernel_mask` values
  */
-uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_span<PageInfo const> pages,
-                                       rmm::cuda_stream_view stream);
+uint32_t get_aggregated_decode_kernel_mask(cudf::detail::hostdevice_span<PageInfo const> pages,
+                                           rmm::cuda_stream_view stream);
 
 /**
  * @brief Compute page output size information.