Cuda and Omp matrix conversion of Csr and Hybrid and Omp count_nonzeros of Ell #310

Merged · 11 commits · Jun 4, 2019

2 changes: 1 addition & 1 deletion cuda/CMakeLists.txt
@@ -34,7 +34,7 @@ target_sources(ginkgo_cuda
matrix/csr_kernels.cu
matrix/dense_kernels.cu
matrix/ell_kernels.cu
matrix/hybrid_kernels.cpp
matrix/hybrid_kernels.cu
matrix/sellp_kernels.cu
preconditioner/jacobi_advanced_apply_kernel.cu
preconditioner/jacobi_generate_kernel.cu
111 changes: 82 additions & 29 deletions cuda/components/atomic.cuh
@@ -39,11 +39,71 @@ namespace kernels {
namespace cuda {


namespace detail {


template <typename ValueType, typename = void>
struct atomic_helper {
__forceinline__ __device__ static void atomic_add(ValueType *, ValueType)
{
GKO_ASSERT(false);
// TODO: add proper implementation of generic atomic add
}
};


template <typename ResultType, typename ValueType>
__forceinline__ __device__ ResultType reinterpret(ValueType val)
{
static_assert(sizeof(ValueType) == sizeof(ResultType),
"The type to reinterpret to must be of the same size as the "
"original type.");
return reinterpret_cast<ResultType &>(val);
}


#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE) \
template <typename ValueType> \
struct atomic_helper<ValueType, \
gko::xstd::enable_if_t<(sizeof(ValueType) == \
sizeof(CONVERTER_TYPE))>> { \
__forceinline__ __device__ static void atomic_add( \
ValueType *__restrict__ addr, ValueType val) \
{ \
CONVERTER_TYPE *address_as_ull = \
reinterpret_cast<CONVERTER_TYPE *>(addr); \
CONVERTER_TYPE old = *address_as_ull; \
CONVERTER_TYPE assumed; \
do { \
assumed = old; \
old = atomicCAS(address_as_ull, assumed, \
reinterpret<CONVERTER_TYPE>( \
val + reinterpret<ValueType>(assumed))); \
} while (assumed != old); \
} \
};

// Support 64-bit ATOMIC_ADD
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
// Support 32-bit ATOMIC_ADD
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);


#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10100))
// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
#endif

#undef GKO_BIND_ATOMIC_HELPER_STRUCTURE


} // namespace detail


template <typename T>
__forceinline__ __device__ void atomic_add(T *, T)
__forceinline__ __device__ void atomic_add(T *__restrict__ addr, T val)
{
GKO_ASSERT(false);
// TODO: add proper implementation of generic atomic add
detail::atomic_helper<T>::atomic_add(addr, val);
}


@@ -60,36 +120,29 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int);
GKO_BIND_ATOMIC_ADD(float);


#if (defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))


// the function is copied from CUDA 9.2 documentation
__forceinline__ __device__ void atomic_add(double *__restrict__ addr,
double val)
{
unsigned long long int *address_as_ull = (unsigned long long int *)addr;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(
address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));

// Note: uses integer comparison to avoid hang in case of NaN (since NaN
// != NaN)
} while (assumed != old);
}


#else


#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
// CUDA 8.0 starts supporting 64-bit double atomicAdd on devices of compute
// capability 6.x and higher
GKO_BIND_ATOMIC_ADD(double);
#endif


#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
// CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices
// of compute capability 7.x and higher.
GKO_BIND_ATOMIC_ADD(__half);
#endif

#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
// CUDA 10.0 starts supporting 32-bit __half2 floating-point atomicAdd on
// devices of compute capability 6.x and higher. note: The atomicity of the
// __half2 add operation is guaranteed separately for each of the two __half
// elements; the entire __half2 is not guaranteed to be atomic as a single
// 32-bit access.
GKO_BIND_ATOMIC_ADD(__half2);
#endif

#undef GKO_BIND_ATOMIC_ADD

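For reference, the compare-and-swap loop that atomic_helper builds on can be exercised outside of Ginkgo with a short standalone CUDA program. The sketch below mirrors the 64-bit case covered by GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int): the double is reinterpreted as an unsigned long long int so that atomicCAS can retry the addition until no other thread has modified the location. The names emulated_atomic_add and test_kernel are illustrative and not part of this PR.

// Standalone sketch of the CAS-loop technique used by atomic_helper above.
#include <cstdio>

__device__ void emulated_atomic_add(double *addr, double val)
{
    auto *address_as_ull = reinterpret_cast<unsigned long long int *>(addr);
    unsigned long long int old = *address_as_ull;
    unsigned long long int assumed;
    do {
        assumed = old;
        // try to swap in (assumed + val); retry if another thread changed
        // the value in the meantime
        old = atomicCAS(
            address_as_ull, assumed,
            __double_as_longlong(val + __longlong_as_double(assumed)));
        // integer comparison avoids an endless loop when the value is NaN
    } while (assumed != old);
}

__global__ void test_kernel(double *sum)
{
    emulated_atomic_add(sum, 1.0);
}

int main()
{
    double *sum;
    cudaMallocManaged(&sum, sizeof(double));
    *sum = 0.0;
    test_kernel<<<4, 256>>>(sum);
    cudaDeviceSynchronize();
    std::printf("sum = %.0f (expected 1024)\n", *sum);
    cudaFree(sum);
    return 0;
}

Compiled with nvcc, the program prints 1024, one contribution from each of the 4 x 256 threads.
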
104 changes: 104 additions & 0 deletions cuda/components/format_conversion.cuh
@@ -0,0 +1,104 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2019, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#ifndef GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_
#define GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_


#include <ginkgo/core/base/std_extensions.hpp>


#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/thread_ids.cuh"


namespace gko {
namespace kernels {
namespace cuda {
namespace ell {
namespace kernel {


template <typename ValueType, typename IndexType>
__global__ void count_nnz_per_row(size_type num_rows, size_type max_nnz_per_row,
size_type stride,
const ValueType *__restrict__ values,
IndexType *__restrict__ result);


} // namespace kernel
} // namespace ell


namespace coo {
namespace kernel {


template <typename IndexType>
__global__ void convert_row_idxs_to_ptrs(const IndexType *__restrict__ idxs,
size_type num_nonzeros,
IndexType *__restrict__ ptrs,
size_type length);


} // namespace kernel


namespace host_kernel {


template <size_type subwarp_size = cuda_config::warp_size>
__host__ size_type calculate_nwarps(std::shared_ptr<const CudaExecutor> exec,
const size_type nnz)
{
size_type warps_per_sm = exec->get_num_cores_per_sm() / subwarp_size;
size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm;
size_type multiple = 8;
if (nnz >= 2000000) {
multiple = 128;
} else if (nnz >= 200000) {
multiple = 32;
}
return std::min(
multiple * nwarps_in_cuda,
static_cast<size_type>(ceildiv(nnz, cuda_config::warp_size)));
}


} // namespace host_kernel
} // namespace coo
} // namespace cuda
} // namespace kernels
} // namespace gko


#endif // GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_
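
The coo::kernel::convert_row_idxs_to_ptrs declaration above maps sorted COO row indices to CSR-style row pointers: row_ptrs[r] becomes the offset of the first nonzero of row r, and row_ptrs[num_rows] equals the number of nonzeros. The standalone sketch below illustrates that mapping with a per-row binary search; it is not the kernel body used in this PR, and the names row_idxs_to_ptrs, d_idxs and d_ptrs are illustrative only.

// Standalone sketch of the COO row-index to row-pointer conversion.
#include <cstdio>

__global__ void row_idxs_to_ptrs(const int *row_idxs, int nnz, int *row_ptrs,
                                 int num_rows)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row > num_rows) {
        return;
    }
    // binary search for the first entry whose row index is >= row
    int lo = 0;
    int hi = nnz;
    while (lo < hi) {
        const int mid = (lo + hi) / 2;
        if (row_idxs[mid] < row) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    row_ptrs[row] = lo;
}

int main()
{
    // row indices of a 4-row matrix with 6 nonzeros (row 2 is empty)
    const int h_idxs[] = {0, 0, 1, 3, 3, 3};
    int *d_idxs;
    int *d_ptrs;
    cudaMalloc(&d_idxs, sizeof(h_idxs));
    cudaMalloc(&d_ptrs, 5 * sizeof(int));
    cudaMemcpy(d_idxs, h_idxs, sizeof(h_idxs), cudaMemcpyHostToDevice);
    row_idxs_to_ptrs<<<1, 32>>>(d_idxs, 6, d_ptrs, 4);
    int h_ptrs[5];
    cudaMemcpy(h_ptrs, d_ptrs, sizeof(h_ptrs), cudaMemcpyDeviceToHost);
    for (int r = 0; r <= 4; ++r) {
        std::printf("row_ptrs[%d] = %d\n", r, h_ptrs[r]);
    }
    cudaFree(d_idxs);
    cudaFree(d_ptrs);
    return 0;
}

For the input {0, 0, 1, 3, 3, 3} the result is {0, 2, 3, 3, 6}; the repeated 3 reflects the empty row 2.
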
68 changes: 68 additions & 0 deletions cuda/components/reduction.cuh
@@ -34,18 +34,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GKO_CUDA_COMPONENTS_REDUCTION_CUH_


#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/base/std_extensions.hpp>


#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/thread_ids.cuh"
#include "cuda/components/uninitialized_array.hpp"


namespace gko {
namespace kernels {
namespace cuda {


constexpr int default_block_size = 512;


/**
* @internal
*
@@ -172,6 +179,67 @@ __device__ void reduce_array(size_type size,
}


/**
* @internal
*
* Computes a reduction using the add operation (+) on an array
* `source` of any size. Has to be called a second time on `result` to reduce
* an array larger than `default_block_size`.
*/
template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void reduce_add_array(
size_type size, const ValueType *__restrict__ source,
ValueType *__restrict__ result)
{
__shared__ UninitializedArray<ValueType, default_block_size> block_sum;
reduce_array(size, source, static_cast<ValueType *>(block_sum),
[](const ValueType &x, const ValueType &y) { return x + y; });

if (threadIdx.x == 0) {
result[blockIdx.x] = block_sum[0];
}
}


/**
* Computes a reduction using the add operation (+).
*
* @param exec Executor associated with the array
* @param size size of the array
* @param source the pointer to the array
*
* @return the reduction result
*/
template <typename ValueType>
__host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
size_type size, const ValueType *source)
{
auto block_results_val = source;
size_type grid_dim = size;
if (size > default_block_size) {
const auto n = ceildiv(size, default_block_size);
grid_dim = (n <= default_block_size) ? n : default_block_size;

auto block_results = Array<ValueType>(exec, grid_dim);

reduce_add_array<<<grid_dim, default_block_size>>>(
size, as_cuda_type(source), as_cuda_type(block_results.get_data()));

block_results_val = block_results.get_const_data();
}

auto d_result = Array<ValueType>(exec, 1);

reduce_add_array<<<1, default_block_size>>>(
grid_dim, as_cuda_type(block_results_val),
as_cuda_type(d_result.get_data()));
ValueType answer = zero<ValueType>();
exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
&answer);
return answer;
}


} // namespace cuda
} // namespace kernels
} // namespace gko
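
The reduce_add_array pair above implements a two-pass sum: the first launch leaves one partial result per block, and a second single-block launch folds those partials into a single value. The self-contained sketch below shows the same pattern with a plain shared-memory tree reduction in place of Ginkgo's reduce_array helper; the names reduce_add, partials and block_size are illustrative assumptions.

// Standalone sketch of the two-pass add-reduction pattern.
#include <cstdio>

constexpr int block_size = 512;

__global__ void reduce_add(int size, const double *source, double *result)
{
    __shared__ double partial[block_size];
    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + tid;
    // grid-stride accumulation so any input size is handled
    double sum = 0.0;
    for (int i = gid; i < size; i += gridDim.x * blockDim.x) {
        sum += source[i];
    }
    partial[tid] = sum;
    __syncthreads();
    // tree reduction within the block
    for (int stride = block_size / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            partial[tid] += partial[tid + stride];
        }
        __syncthreads();
    }
    if (tid == 0) {
        result[blockIdx.x] = partial[0];
    }
}

int main()
{
    const int n = 1 << 20;
    double *src;
    double *partials;
    double *out;
    cudaMallocManaged(&src, n * sizeof(double));
    cudaMallocManaged(&partials, block_size * sizeof(double));
    cudaMallocManaged(&out, sizeof(double));
    for (int i = 0; i < n; ++i) {
        src[i] = 1.0;
    }
    // first pass: at most block_size partial results
    reduce_add<<<block_size, block_size>>>(n, src, partials);
    // second pass: a single block reduces the partials
    reduce_add<<<1, block_size>>>(block_size, partials, out);
    cudaDeviceSynchronize();
    std::printf("sum = %.0f (expected %d)\n", *out, n);
    cudaFree(src);
    cudaFree(partials);
    cudaFree(out);
    return 0;
}

With 2^20 elements of value 1.0 this prints 1048576; the grid-stride loop in the first pass is what lets a bounded number of blocks handle an arbitrary input size, matching the grid_dim capping in the host reduce_add_array above.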