Skip to content

Fix CUDA GoodFeaturesToTrackDetector not threadsafe bug #2868

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 27 additions & 31 deletions modules/cudaimgproc/src/cuda/gftt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,37 +52,33 @@ namespace cv { namespace cuda { namespace device
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);

__device__ int g_counter = 0;

template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, cudaTextureObject_t eigTex, int *g_counter)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;

if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D(eigTex, j, i);
float val = tex2D<float>(eigTex, j, i);

if (val > threshold)
{
float maxVal = val;

maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i - 1), maxVal);

maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i), maxVal);

maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i + 1), maxVal);

if (val == maxVal)
{
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(g_counter, 1);

if (ind < max_count)
corners[ind] = make_float2(j, i);
Expand All @@ -91,22 +87,20 @@ namespace cv { namespace cuda { namespace device
}
}

int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
int findCorners_gpu(const cudaTextureObject_t &eigTex, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
int* counter_ptr;
cudaSafeCall( cudaMalloc(&counter_ptr, sizeof(int)) );

cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

bindTexture(&eigTex, eig);

dim3 block(16, 16);
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

if (mask.data)
findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, rows, cols, eigTex, counter_ptr);
else
findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, rows, cols, eigTex, counter_ptr);

cudaSafeCall( cudaGetLastError() );

Expand All @@ -122,25 +116,27 @@ namespace cv { namespace cuda { namespace device
class EigGreater
{
public:
EigGreater(const cudaTextureObject_t &eigTex_) : eigTex(eigTex_)
{
}
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
return tex2D<float>(eigTex, a.x, a.y) > tex2D<float>(eigTex, b.x, b.y);
}
};

cudaTextureObject_t eigTex;
};

void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
void sortCorners_gpu(const cudaTextureObject_t &eigTex, float2* corners, int count, cudaStream_t stream)
{
bindTexture(&eigTex, eig);

thrust::device_ptr<float2> ptr(corners);
#if THRUST_VERSION >= 100802
if (stream)
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater());
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(eigTex));
else
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater());
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(eigTex));
#else
thrust::sort(ptr, ptr + count, EigGreater());
thrust::sort(ptr, ptr + count, EigGreater(eigTex));
#endif
}
} // namespace optical_flow
Expand Down
22 changes: 18 additions & 4 deletions modules/cudaimgproc/src/gftt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
{
namespace gfft
{
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
int findCorners_gpu(const cudaTextureObject_t &eigTex_, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
void sortCorners_gpu(const cudaTextureObject_t &eigTex_, float2* corners, int count, cudaStream_t stream);
}
}}}

Expand Down Expand Up @@ -112,15 +112,29 @@ namespace
cudaStream_t stream_ = StreamAccessor::getStream(stream);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);

int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);
//create texture object for findCorners_gpu and sortCorners_gpu
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
texDesc.filterMode = cudaFilterModePoint;
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;

cudaTextureObject_t eigTex_;
PtrStepSzf eig = eig_;
cv::cuda::device::createTextureObjectPitch2D<float>(&eigTex_, eig, texDesc);

int total = findCorners_gpu(eigTex_, eig_.rows, eig_.cols, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);


if (total == 0)
{
_corners.release();
return;
}

sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
sortCorners_gpu(eigTex_, tmpCorners_.ptr<float2>(), total, stream_);

if (minDistance_ < 1)
{
Expand Down