cuda: fix windowing test on the CUDA backend

Rohan Yadav · Rohan Yadav · commit 432c25d04cde · 2021-03-16T16:01:31.000-07:00
Fixes #422. This commit ensures that the allocation-clearing logic (zero-filling the buffer when an Allocate has `clear` set) is applied in the CUDA backend as well. The windowing test caught the omission because TACO was automatically parallelizing the loop onto the GPU.
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
@@ -1096,6 +1096,20 @@ void CodeGen_CUDA::visit(const Allocate* op) {
   op->num_elements.accept(this);
   parentPrecedence = TOP;
   stream << "));" << endl;
+  // If the allocation requests clearing, emit a cudaMemset that zero-fills it.
+  if (op->clear) {
+    doIndent();
+    stream << "gpuErrchk(cudaMemset(";
+    op->var.accept(this);
+    stream << variable_name;
+    stream << ", 0, ";
+    stream << "sizeof(" << elementType << ")";
+    stream << " * ";
+    parentPrecedence = MUL;
+    op->num_elements.accept(this);
+    parentPrecedence = TOP;
+    stream << "));" << endl;
+  }
 
   if(op->is_realloc) {
     doIndent();