Cuda and Omp matrix conversion of Csr and Hybrid and Omp count_nonzeros of Ell #310

Merged · 11 commits · Jun 4, 2019

2 changes: 1 addition & 1 deletion cuda/CMakeLists.txt
@@ -34,7 +34,7 @@ target_sources(ginkgo_cuda
matrix/csr_kernels.cu
matrix/dense_kernels.cu
matrix/ell_kernels.cu
matrix/hybrid_kernels.cpp
matrix/hybrid_kernels.cu
matrix/sellp_kernels.cu
preconditioner/jacobi_advanced_apply_kernel.cu
preconditioner/jacobi_generate_kernel.cu
111 changes: 82 additions & 29 deletions cuda/components/atomic.cuh
@@ -39,11 +39,71 @@ namespace kernels {
namespace cuda {


namespace detail {


template <typename ValueType, typename = void>
struct atomic_helper {
__forceinline__ __device__ static void atomic_add(ValueType *, ValueType)
{
GKO_ASSERT(false);
// TODO: add proper implementation of generic atomic add
}
};


template <typename ResultType, typename ValueType>
__forceinline__ __device__ ResultType reinterpret(ValueType val)
{
static_assert(sizeof(ValueType) == sizeof(ResultType),
"The type to reinterpret to must be of the same size as the "
"original type.");
return reinterpret_cast<ResultType &>(val);
}


#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE) \
template <typename ValueType> \
struct atomic_helper<ValueType, \
gko::xstd::enable_if_t<(sizeof(ValueType) == \
sizeof(CONVERTER_TYPE))>> { \
__forceinline__ __device__ static void atomic_add( \
ValueType *__restrict__ addr, ValueType val) \
{ \
CONVERTER_TYPE *address_as_ull = \
reinterpret_cast<CONVERTER_TYPE *>(addr); \
CONVERTER_TYPE old = *address_as_ull; \
CONVERTER_TYPE assumed; \
do { \
assumed = old; \
old = atomicCAS(address_as_ull, assumed, \
reinterpret<CONVERTER_TYPE>( \
val + reinterpret<ValueType>(assumed))); \
} while (assumed != old); \
} \
};

// Support 64-bit ATOMIC_ADD
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
// Support 32-bit ATOMIC_ADD
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);


#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10100))
// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
#endif

#undef GKO_BIND_ATOMIC_HELPER_STRUCTURE


} // namespace detail


template <typename T>
__forceinline__ __device__ void atomic_add(T *, T)
__forceinline__ __device__ void atomic_add(T *__restrict__ addr, T val)
{
GKO_ASSERT(false);
// TODO: add proper implementation of generic atomic add
detail::atomic_helper<T>::atomic_add(addr, val);
}


@@ -60,36 +120,29 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int);
GKO_BIND_ATOMIC_ADD(float);


#if (defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))


// the function is copied from CUDA 9.2 documentation
__forceinline__ __device__ void atomic_add(double *__restrict__ addr,
double val)
{
unsigned long long int *address_as_ull = (unsigned long long int *)addr;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(
address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));

// Note: uses integer comparison to avoid hang in case of NaN (since NaN
// != NaN)
} while (assumed != old);
}


#else


#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
// CUDA 8.0 starts supporting 64-bit double atomicAdd on devices of compute
// capability 6.x and higher
GKO_BIND_ATOMIC_ADD(double);
#endif


#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
// CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices
// of compute capability 7.x and higher.
GKO_BIND_ATOMIC_ADD(__half);
#endif

#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
// CUDA 10.0 starts supporting 32-bit __half2 floating-point atomicAdd on
// devices of compute capability 6.x and higher. note: The atomicity of the
// __half2 add operation is guaranteed separately for each of the two __half
// elements; the entire __half2 is not guaranteed to be atomic as a single
// 32-bit access.
GKO_BIND_ATOMIC_ADD(__half2);
#endif

#undef GKO_BIND_ATOMIC_ADD

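For reference, the compare-and-swap loop that atomic_helper builds on can be exercised outside of Ginkgo with a short standalone CUDA program. The sketch below mirrors the 64-bit case covered by GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int): the double is reinterpreted as an unsigned long long int so that atomicCAS can retry the addition until no other thread has modified the location. The names emulated_atomic_add and test_kernel are illustrative and not part of this PR.

// Standalone sketch of the CAS-loop technique used by atomic_helper above.
#include <cstdio>

__device__ void emulated_atomic_add(double *addr, double val)
{
    auto *address_as_ull = reinterpret_cast<unsigned long long int *>(addr);
    unsigned long long int old = *address_as_ull;
    unsigned long long int assumed;
    do {
        assumed = old;
        // try to swap in (assumed + val); retry if another thread changed
        // the value in the meantime
        old = atomicCAS(
            address_as_ull, assumed,
            __double_as_longlong(val + __longlong_as_double(assumed)));
        // integer comparison avoids an endless loop when the value is NaN
    } while (assumed != old);
}

__global__ void test_kernel(double *sum)
{
    emulated_atomic_add(sum, 1.0);
}

int main()
{
    double *sum;
    cudaMallocManaged(&sum, sizeof(double));
    *sum = 0.0;
    test_kernel<<<4, 256>>>(sum);
    cudaDeviceSynchronize();
    std::printf("sum = %.0f (expected 1024)\n", *sum);
    cudaFree(sum);
    return 0;
}

Compiled with nvcc, the program prints 1024, one contribution from each of the 4 x 256 threads.
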
104 changes: 104 additions & 0 deletions cuda/components/format_conversion.cuh
@@ -0,0 +1,104 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2019, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#ifndef GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_
#define GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_


#include <ginkgo/core/base/std_extensions.hpp>


#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/thread_ids.cuh"


namespace gko {
namespace kernels {
namespace cuda {
namespace ell {
namespace kernel {


template <typename ValueType, typename IndexType>
__global__ void count_nnz_per_row(size_type num_rows, size_type max_nnz_per_row,
size_type stride,
const ValueType *__restrict__ values,
IndexType *__restrict__ result);


} // namespace kernel
} // namespace ell


namespace coo {
namespace kernel {


template <typename IndexType>
__global__ void convert_row_idxs_to_ptrs(const IndexType *__restrict__ idxs,
size_type num_nonzeros,
IndexType *__restrict__ ptrs,
size_type length);


} // namespace kernel


namespace host_kernel {


template <size_type subwarp_size = cuda_config::warp_size>
__host__ size_type calculate_nwarps(std::shared_ptr<const CudaExecutor> exec,
const size_type nnz)
{
size_type warps_per_sm = exec->get_num_cores_per_sm() / subwarp_size;
size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm;
size_type multiple = 8;
if (nnz >= 2000000) {
multiple = 128;
} else if (nnz >= 200000) {
multiple = 32;
}
return std::min(
multiple * nwarps_in_cuda,
static_cast<size_type>(ceildiv(nnz, cuda_config::warp_size)));
}


} // namespace host_kernel
} // namespace coo
} // namespace cuda
} // namespace kernels
} // namespace gko


#endif // GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_
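
The coo::kernel::convert_row_idxs_to_ptrs declaration above maps sorted COO row indices to CSR-style row pointers: row_ptrs[r] becomes the offset of the first nonzero of row r, and row_ptrs[num_rows] equals the number of nonzeros. The standalone sketch below illustrates that mapping with a per-row binary search; it is not the kernel body used in this PR, and the names row_idxs_to_ptrs, d_idxs and d_ptrs are illustrative only.

// Standalone sketch of the COO row-index to row-pointer conversion.
#include <cstdio>

__global__ void row_idxs_to_ptrs(const int *row_idxs, int nnz, int *row_ptrs,
                                 int num_rows)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row > num_rows) {
        return;
    }
    // binary search for the first entry whose row index is >= row
    int lo = 0;
    int hi = nnz;
    while (lo < hi) {
        const int mid = (lo + hi) / 2;
        if (row_idxs[mid] < row) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    row_ptrs[row] = lo;
}

int main()
{
    // row indices of a 4-row matrix with 6 nonzeros (row 2 is empty)
    const int h_idxs[] = {0, 0, 1, 3, 3, 3};
    int *d_idxs;
    int *d_ptrs;
    cudaMalloc(&d_idxs, sizeof(h_idxs));
    cudaMalloc(&d_ptrs, 5 * sizeof(int));
    cudaMemcpy(d_idxs, h_idxs, sizeof(h_idxs), cudaMemcpyHostToDevice);
    row_idxs_to_ptrs<<<1, 32>>>(d_idxs, 6, d_ptrs, 4);
    int h_ptrs[5];
    cudaMemcpy(h_ptrs, d_ptrs, sizeof(h_ptrs), cudaMemcpyDeviceToHost);
    for (int r = 0; r <= 4; ++r) {
        std::printf("row_ptrs[%d] = %d\n", r, h_ptrs[r]);
    }
    cudaFree(d_idxs);
    cudaFree(d_ptrs);
    return 0;
}

For the input {0, 0, 1, 3, 3, 3} the result is {0, 2, 3, 3, 6}; the repeated 3 reflects the empty row 2.
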
68 changes: 68 additions & 0 deletions cuda/components/reduction.cuh
@@ -34,18 +34,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GKO_CUDA_COMPONENTS_REDUCTION_CUH_


#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/base/std_extensions.hpp>


#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/thread_ids.cuh"
#include "cuda/components/uninitialized_array.hpp"


namespace gko {
namespace kernels {
namespace cuda {


constexpr int default_block_size = 512;


/**
* @internal
*
@@ -172,6 +179,67 @@ __device__ void reduce_array(size_type size,
}


/**
* @internal
*
* Computes a reduction using the add operation (+) on an array
* `source` of any size. Has to be called a second time on `result` to reduce
* an array larger than `default_block_size`.
*/
template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void reduce_add_array(
size_type size, const ValueType *__restrict__ source,
ValueType *__restrict__ result)
{
__shared__ UninitializedArray<ValueType, default_block_size> block_sum;
reduce_array(size, source, static_cast<ValueType *>(block_sum),
[](const ValueType &x, const ValueType &y) { return x + y; });

if (threadIdx.x == 0) {
result[blockIdx.x] = block_sum[0];
}
}


/**
* Computes a reduction using the add operation (+).
*
* @param exec Executor associated with the array
* @param size size of the array
* @param source the pointer to the array
*
* @return the reduction result
*/
template <typename ValueType>
__host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
size_type size, const ValueType *source)
{
auto block_results_val = source;
size_type grid_dim = size;
if (size > default_block_size) {
const auto n = ceildiv(size, default_block_size);
grid_dim = (n <= default_block_size) ? n : default_block_size;

auto block_results = Array<ValueType>(exec, grid_dim);

reduce_add_array<<<grid_dim, default_block_size>>>(
size, as_cuda_type(source), as_cuda_type(block_results.get_data()));

block_results_val = block_results.get_const_data();
}

auto d_result = Array<ValueType>(exec, 1);

reduce_add_array<<<1, default_block_size>>>(
grid_dim, as_cuda_type(block_results_val),
as_cuda_type(d_result.get_data()));
ValueType answer = zero<ValueType>();
exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
&answer);
return answer;
}


} // namespace cuda
} // namespace kernels
} // namespace gko
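
The reduce_add_array pair above implements a two-pass sum: the first launch leaves one partial result per block, and a second single-block launch folds those partials into a single value. The self-contained sketch below shows the same pattern with a plain shared-memory tree reduction in place of Ginkgo's reduce_array helper; the names reduce_add, partials and block_size are illustrative assumptions.

// Standalone sketch of the two-pass add-reduction pattern.
#include <cstdio>

constexpr int block_size = 512;

__global__ void reduce_add(int size, const double *source, double *result)
{
    __shared__ double partial[block_size];
    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + tid;
    // grid-stride accumulation so any input size is handled
    double sum = 0.0;
    for (int i = gid; i < size; i += gridDim.x * blockDim.x) {
        sum += source[i];
    }
    partial[tid] = sum;
    __syncthreads();
    // tree reduction within the block
    for (int stride = block_size / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            partial[tid] += partial[tid + stride];
        }
        __syncthreads();
    }
    if (tid == 0) {
        result[blockIdx.x] = partial[0];
    }
}

int main()
{
    const int n = 1 << 20;
    double *src;
    double *partials;
    double *out;
    cudaMallocManaged(&src, n * sizeof(double));
    cudaMallocManaged(&partials, block_size * sizeof(double));
    cudaMallocManaged(&out, sizeof(double));
    for (int i = 0; i < n; ++i) {
        src[i] = 1.0;
    }
    // first pass: at most block_size partial results
    reduce_add<<<block_size, block_size>>>(n, src, partials);
    // second pass: a single block reduces the partials
    reduce_add<<<1, block_size>>>(block_size, partials, out);
    cudaDeviceSynchronize();
    std::printf("sum = %.0f (expected %d)\n", *out, n);
    cudaFree(src);
    cudaFree(partials);
    cudaFree(out);
    return 0;
}

With 2^20 elements of value 1.0 this prints 1048576; the grid-stride loop in the first pass is what lets a bounded number of blocks handle an arbitrary input size, matching the grid_dim capping in the host reduce_add_array above.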