Add a parquet reader utility to update output null masks (#19370)

mhaseeb123 · web-flow · commit 52e7b82243fd · 2025-07-18T08:52:12.000Z
Contributes to PR #19308. This PR adds a parquet reader utility that updates the null masks of the output buffers in order to nullify the rows corresponding to pruned pages. There are no independent tests in this PR; instead, the utility is tested in #19308. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: #19370
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
@@ -473,8 +473,8 @@ struct mask_tform {
 
 }  // anonymous namespace
 
-uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_span<PageInfo const> pages,
-                                       rmm::cuda_stream_view stream)
+uint32_t get_aggregated_decode_kernel_mask(cudf::detail::hostdevice_span<PageInfo const> pages,
+                                           rmm::cuda_stream_view stream)
 {
   // determine which kernels to invoke
   auto mask_iter = thrust::make_transform_iterator(pages.device_begin(), mask_tform{});
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -726,8 +726,8 @@ void build_string_dictionary_index(ColumnChunkDesc* chunks,
  * @param[in] stream CUDA stream to use
  * @return Bitwise OR of all page `kernel_mask` values
  */
-uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_span<PageInfo const> pages,
-                                       rmm::cuda_stream_view stream);
+uint32_t get_aggregated_decode_kernel_mask(cudf::detail::hostdevice_span<PageInfo const> pages,
+                                           rmm::cuda_stream_view stream);
 
 /**
  * @brief Compute page output size information.
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
@@ -70,7 +70,7 @@ void reader_impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_
     });
 
   // figure out which kernels to run
-  auto const kernel_mask = GetAggregatedDecodeKernelMask(subpass.pages, _stream);
+  auto const kernel_mask = get_aggregated_decode_kernel_mask(subpass.pages, _stream);
 
   // Check to see if there are any string columns present. If so, then we need to get size info
   // for each string page. This size info will be used to pre-allocate memory for the column,
@@ -445,6 +445,9 @@ void reader_impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_
   page_nesting.device_to_host_async(_stream);
   page_nesting_decode.device_to_host_async(_stream);
 
+  // Invalidate output buffer nullmasks at row indices spanned by pruned pages
+  update_output_nullmasks_for_pruned_pages(host_page_mask);
+
   // Copy over initial string offsets from device
   auto h_initial_str_offsets = cudf::detail::make_host_vector_async(initial_str_offsets, _stream);
 
@@ -862,6 +865,79 @@ bool reader_impl::has_next()
   return has_more_work() or is_first_output_chunk();
 }
 
+void reader_impl::update_output_nullmasks_for_pruned_pages(cudf::host_span<bool const> page_mask)
+{
+  auto const& subpass    = _pass_itm_data->subpass;
+  auto const& pages      = subpass->pages;
+  auto const& chunks     = _pass_itm_data->chunks;
+  auto const num_columns = _input_columns.size();
+
+  CUDF_EXPECTS(pages.size() == page_mask.size(), "Page mask size mismatch");
+
+  // Nothing to invalidate if there are no pages or if no page has been pruned (all entries true)
+  if (page_mask.empty() or std::all_of(page_mask.begin(), page_mask.end(), std::identity{})) {
+    return;
+  }
+
+  auto page_and_mask_begin =
+    thrust::make_zip_iterator(thrust::make_tuple(pages.host_begin(), page_mask.begin()));
+
+  auto null_masks = std::vector<bitmask_type*>{};
+  auto begin_bits = std::vector<cudf::size_type>{};
+  auto end_bits   = std::vector<cudf::size_type>{};
+
+  std::for_each(
+    page_and_mask_begin, page_and_mask_begin + pages.size(), [&](auto const& page_and_mask_pair) {
+      // Skip pages whose mask entry is true, i.e. pages that were not pruned
+      if (thrust::get<1>(page_and_mask_pair)) { return; }
+
+      auto const& page     = thrust::get<0>(page_and_mask_pair);
+      auto const chunk_idx = page.chunk_idx;
+      auto const start_row = chunks[chunk_idx].start_row + page.chunk_row;
+      auto const end_row   = start_row + page.num_rows;
+      auto& input_col      = _input_columns[chunk_idx % num_columns];
+      auto max_depth       = input_col.nesting_depth();
+      auto* cols           = &_output_buffers;
+
+      for (size_t l_idx = 0; l_idx < max_depth; l_idx++) {
+        auto& out_buf = (*cols)[input_col.nesting[l_idx]];
+        cols          = &out_buf.children;
+        // Skip buffers flagged as having a list parent; their nullmasks are left untouched here
+        if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { continue; }
+        // Record this buffer's nullmask together with the page's [start_row, end_row) bit range
+        null_masks.emplace_back(out_buf.null_mask());
+        begin_bits.emplace_back(start_row);
+        end_bits.emplace_back(end_row);
+
+        // Count every row of the pruned page as null in this buffer
+        out_buf.null_count() += page.num_rows;
+      }
+    });
+
+  // Minimum number of recorded nullmasks for which the bulk update path is taken
+  constexpr auto min_nullmasks_for_bulk_update = 32;
+
+  // Invalidate all recorded bit ranges with one bulk call when there are enough nullmasks
+  if (null_masks.size() >= min_nullmasks_for_bulk_update) {
+    auto valids = cudf::detail::make_host_vector<bool>(null_masks.size(), _stream);
+    std::fill(valids.begin(), valids.end(), false);
+    cudf::set_null_masks_safe(null_masks, begin_bits, end_bits, valids, _stream);
+  }
+  // Otherwise, invalidate each recorded bit range with an individual call
+  else {
+    auto nullmask_iter = thrust::make_zip_iterator(
+      thrust::make_tuple(null_masks.begin(), begin_bits.begin(), end_bits.begin()));
+    std::for_each(
+      nullmask_iter, nullmask_iter + null_masks.size(), [&](auto const& nullmask_tuple) {
+        cudf::set_null_mask(thrust::get<0>(nullmask_tuple),
+                            thrust::get<1>(nullmask_tuple),
+                            thrust::get<2>(nullmask_tuple),
+                            false,
+                            _stream);
+      });
+  }
+}
+
 namespace {
 parquet_column_schema walk_schema(aggregate_reader_metadata const* mt, int idx)
 {
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
@@ -302,6 +302,13 @@ class reader_impl {
    */
   void decode_page_data(read_mode mode, size_t skip_rows, size_t num_rows);
 
+  /**
+   * @brief Invalidate output buffer nullmask for rows spanned by the pruned pages
+   *
+   * @param page_mask Per-page boolean mask: true if the page is decoded, false if it is pruned
+   */
+  void update_output_nullmasks_for_pruned_pages(cudf::host_span<bool const> page_mask);
+
   /**
    * @brief Creates file-wide parquet chunk information.
    *

Original file line number	Diff line number	Diff line change
`@@ -473,8 +473,8 @@ struct mask_tform {`
`473`	`473`
`474`	`474`	`} // anonymous namespace`
`475`	`475`
`476`		`-uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_span<PageInfo const> pages,`
`477`		`- rmm::cuda_stream_view stream)`
	`476`	`+uint32_t get_aggregated_decode_kernel_mask(cudf::detail::hostdevice_span<PageInfo const> pages,`
	`477`	`+ rmm::cuda_stream_view stream)`
`478`	`478`	`{`
`479`	`479`	`// determine which kernels to invoke`
`480`	`480`	`auto mask_iter = thrust::make_transform_iterator(pages.device_begin(), mask_tform{});`