From 73154555ef81d1de0d97216618dbdbe738916e76 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Fri, 15 Oct 2021 14:55:03 +0300 Subject: [PATCH 01/10] Naive implementation of uniqueness ratio option for cuda::StereoBM. --- modules/cudastereo/src/cuda/stereobm.cu | 164 ++++++++++++++++-------- modules/cudastereo/src/stereobm.cpp | 11 +- modules/cudastereo/test/test_stereo.cpp | 22 ++++ 3 files changed, 135 insertions(+), 62 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 5be19e99ff8..04a2a3bf6eb 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -98,40 +98,6 @@ namespace cv { namespace cuda { namespace device return col_ssd[0] + cache + cache2; } - template - __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X) - { - unsigned int ssd[N_DISPARITIES]; - - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) - ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); - - int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7]))); - - int bestIdx = 0; - for (int i = 0; i < N_DISPARITIES; i++) - { - if (mssd == ssd[i]) - bestIdx = i; - } - - return make_uint2(mssd, bestIdx); - } - template __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd) { @@ -235,9 +201,11 @@ namespace cv { namespace cuda { namespace device } template - __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp) + __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp, int uniquenessRatio) { extern __shared__ unsigned int col_ssd_cache[]; + uint line_ssds[256*ROWSperTHREAD]; + volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x; volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS) @@ -259,6 +227,8 @@ namespace cv { namespace cuda { namespace device int y_tex; int x_tex = X - RADIUS; + float uniqueness_thresh = 1.0 + uniquenessRatio / 100.0f; + if (x_tex >= cwidth) return; @@ -272,17 +242,44 @@ namespace cv { namespace cuda { namespace device if (x_tex + BLOCK_W < cwidth) InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra); - __syncthreads(); //before MinSSD function + __syncthreads(); //before CalcSSDVector function if (Y < cheight - RADIUS) { - uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X); + uint* batch_ssds = line_ssds + d - STEREO_MIND; + + //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) + batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + + int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); + + int bestIdx = 0; + for (int i = 0; i < N_DISPARITIES; i++) + { + if (mssd == batch_ssds[i]) + bestIdx = i; + } // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously - // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all. + // computed "minSSD" value is not used at all. // - // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads - // must also call "MinSSD" to avoid deadlock. (#13850) + // However, since the "CalcSSDVector" function has "__syncthreads" call in its body, those threads + // must also call "CalcSSDVector" to avoid deadlock. (#13850) // // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads" // could be an option, but the shared memory access pattern does not allow this option, @@ -290,10 +287,10 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { - if (minSSD.x < minSSDImage[0]) + if (mssd < minSSDImage[0]) { - disparImage[0] = (unsigned char)(d + minSSD.y); - minSSDImage[0] = minSSD.x; + disparImage[0] = (unsigned char)(d + bestIdx); + minSSDImage[0] = mssd; } } } @@ -313,17 +310,44 @@ namespace cv { namespace cuda { namespace device y_tex += 1; - __syncthreads(); //before MinSSD function + __syncthreads(); if (row < cheight - RADIUS - Y) { - uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X); + uint* batch_ssds = line_ssds + row * 256 + d - STEREO_MIND; + + //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) + batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); + __syncthreads(); + + int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); + + int bestIdx = 0; + for (int i = 0; i < N_DISPARITIES; i++) + { + if (mssd == batch_ssds[i]) + bestIdx = i; + } // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously - // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all. + // computed "minSSD" value, which is the result of "CalcSSDVector" function call, is not used at all. // - // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads - // must also call "MinSSD" to avoid deadlock. (#13850) + // However, since the "CalcSSDVector" function has "__syncthreads" call in its body, those threads + // must also call "CalcSSDVector" to avoid deadlock. (#13850) // // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads" // could be an option, but the shared memory access pattern does not allow this option, @@ -332,10 +356,10 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { int idx = row * cminSSD_step; - if (minSSD.x < minSSDImage[idx]) + if (mssd < minSSDImage[idx]) { - disparImage[disp.step * row] = (unsigned char)(d + minSSD.y); - minSSDImage[idx] = minSSD.x; + disparImage[disp.step * row] = (unsigned char)(d + bestIdx); + minSSDImage[idx] = mssd; } } } @@ -344,10 +368,34 @@ namespace cv { namespace cuda { namespace device __syncthreads(); // before initializing shared memory at the beginning of next loop } // for d loop - } + if (uniquenessRatio > 0) + { + for (int row = 0; row < end_row; row++) + { + uint suboptimal_ssd = minSSDImage[row * cminSSD_step]; + uint suboptimal_d = disparImage[disp.step * row]; + float thresh = (1.0 + uniquenessRatio / 100.0f) * suboptimal_ssd; + + int d = 0; + uint * batch_ssds = line_ssds + row*256; + for (; d < maxdisp - STEREO_MIND; d++) + { + if( (d < suboptimal_d-1 || d > suboptimal_d+1) && (batch_ssds[d] <= thresh) ) + { + break; + } + } + if( d < maxdisp ) + { + disparImage[disp.step * row] = 0; + } + } + } + } - template void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream) + template void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, + int maxdisp, int uniquenessRatio, cudaStream_t & stream) { dim3 grid(1,1,1); dim3 threads(BLOCK_W, 1, 1); @@ -358,14 +406,15 @@ namespace cv { namespace cuda { namespace device //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int); - stereoKernel<<>>(left.data, right.data, left.step, disp, maxdisp); + stereoKernel<<>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }; - typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream); + typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, + int maxdisp, int uniquenessRatio, cudaStream_t & stream); const static kernel_caller_t callers[] = { @@ -380,7 +429,8 @@ namespace cv { namespace cuda { namespace device }; const int calles_num = sizeof(callers)/sizeof(callers[0]); - void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz& minSSD_buf, cudaStream_t& stream) + void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, + int winsz, int uniquenessRatio, const PtrStepSz& minSSD_buf, cudaStream_t& stream) { int winsz2 = winsz >> 1; @@ -400,7 +450,7 @@ namespace cv { namespace cuda { namespace device size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) ); - callers[winsz2](left, right, disp, maxdisp, stream); + callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, stream); } __device__ inline int clamp(int x, int a, int b) diff --git a/modules/cudastereo/src/stereobm.cpp b/modules/cudastereo/src/stereobm.cpp index 47ef8e3e1ca..dab326981ba 100644 --- a/modules/cudastereo/src/stereobm.cpp +++ b/modules/cudastereo/src/stereobm.cpp @@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device { namespace stereobm { - void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz& minSSD_buf, cudaStream_t & stream); + void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, int uniquenessRatio, const PtrStepSz& minSSD_buf, cudaStream_t & stream); void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream); void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream); void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream); @@ -102,8 +102,8 @@ namespace int getTextureThreshold() const { return static_cast(avergeTexThreshold_); } void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast(textureThreshold); } - int getUniquenessRatio() const { return 0; } - void setUniquenessRatio(int /*uniquenessRatio*/) {} + int getUniquenessRatio() const { return uniquenessRatio_; } + void setUniquenessRatio(int uniquenessRatio) { uniquenessRatio_ = uniquenessRatio; } int getSmallerBlockSize() const { return 0; } void setSmallerBlockSize(int /*blockSize*/){} @@ -121,12 +121,13 @@ namespace int preFilterCap_; float avergeTexThreshold_; int preFilterSize_; + int uniquenessRatio_; GpuMat minSSD_, leBuf_, riBuf_; }; StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize) - : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9) + : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9), uniquenessRatio_(0) { } @@ -183,7 +184,7 @@ namespace ri_for_bm = riBuf_; } - stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream); + stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, uniquenessRatio_, minSSD_, stream); if (avergeTexThreshold_ > 0) postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream); diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp index b58b79e56e6..bd052e6447e 100644 --- a/modules/cudastereo/test/test_stereo.cpp +++ b/modules/cudastereo/test/test_stereo.cpp @@ -122,6 +122,28 @@ CUDA_TEST_P(StereoBM, PrefilterNormRegression) EXPECT_MAT_NEAR(disp_gold, disp, 0.0); } +CUDA_TEST_P(StereoBM, Uniqueness_Regression) +{ + cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE); + cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE); + cv::Mat disp_gold = readImage("stereobm/aloe-disp-uniqueness15.png", cv::IMREAD_GRAYSCALE); + + ASSERT_FALSE(left_image.empty()); + ASSERT_FALSE(right_image.empty()); + ASSERT_FALSE(disp_gold.empty()); + + cv::Ptr bm = cv::cuda::createStereoBM(128, 19); + cv::cuda::GpuMat disp; + + bm->setUniquenessRatio(15); + bm->compute(loadMat(left_image), loadMat(right_image), disp); + + cv::Mat disp_cpu; + disp.download(disp_cpu); + + EXPECT_MAT_NEAR(disp_gold, disp, 0.0); +} + INSTANTIATE_TEST_CASE_P(CUDA_Stereo, StereoBM, ALL_DEVICES); ////////////////////////////////////////////////////////////////////////// From 966c94fc4f1cc6371741dcf1c95371dff45ae003 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Mon, 18 Oct 2021 16:30:03 +0300 Subject: [PATCH 02/10] Optimized memory consumption in cuda::stereoBM with uniqueness check. --- modules/cudastereo/src/cuda/stereobm.cu | 144 +++++++++++++++++++----- modules/cudastereo/test/test_stereo.cpp | 1 + 2 files changed, 116 insertions(+), 29 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 04a2a3bf6eb..ce7a85fe11e 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -204,7 +204,12 @@ namespace cv { namespace cuda { namespace device __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp, int uniquenessRatio) { extern __shared__ unsigned int col_ssd_cache[]; - uint line_ssds[256*ROWSperTHREAD]; + uint line_ssds[3 + N_DISPARITIES]; // 1 - local minima + 2 - tail of previous batch for accurate uniquenessRatio check + uint line_disps[3 + N_DISPARITIES]; + uint* batch_ssds = line_ssds + 3; + + uint line_ssd_tails[3*ROWSperTHREAD]; + uchar uniqueness_approved[ROWSperTHREAD]; volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x; volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS) @@ -217,6 +222,7 @@ namespace cv { namespace cuda { namespace device unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; unsigned char* disparImage = disp.data + X + Y * disp.step; + float thresh_scale; //if (X < cwidth) //{ // unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step; @@ -227,11 +233,25 @@ namespace cv { namespace cuda { namespace device int y_tex; int x_tex = X - RADIUS; - float uniqueness_thresh = 1.0 + uniquenessRatio / 100.0f; - if (x_tex >= cwidth) return; + + if (uniquenessRatio > 0) + { + batch_ssds[6] = 0xFFFFFFFF; + batch_ssds[7] = 0xFFFFFFFF; + thresh_scale = (1.0 + uniquenessRatio / 100.0f); + for(int i = 0; i < ROWSperTHREAD; i++) + { + uniqueness_approved[i] = 1; + } + for(int i = 0; i < 3*ROWSperTHREAD; i++) + { + line_ssd_tails[i] = 0xFFFFFFFF; + } + } + for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP) { y_tex = Y - RADIUS; @@ -246,8 +266,6 @@ namespace cv { namespace cuda { namespace device if (Y < cheight - RADIUS) { - uint* batch_ssds = line_ssds + d - STEREO_MIND; - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); __syncthreads(); @@ -266,7 +284,8 @@ namespace cv { namespace cuda { namespace device batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); __syncthreads(); - int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); + int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), + ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); int bestIdx = 0; for (int i = 0; i < N_DISPARITIES; i++) @@ -278,8 +297,8 @@ namespace cv { namespace cuda { namespace device // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously // computed "minSSD" value is not used at all. // - // However, since the "CalcSSDVector" function has "__syncthreads" call in its body, those threads - // must also call "CalcSSDVector" to avoid deadlock. (#13850) + // However, since the batch_ssds computation has "__syncthreads" call in its body, those threads + // must also call "CalcSSD" to avoid deadlock. (#13850) // // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads" // could be an option, but the shared memory access pattern does not allow this option, @@ -287,6 +306,47 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { + if (uniquenessRatio > 0) + { + line_ssds[0] = minSSDImage[0]; + line_ssds[1] = line_ssd_tails[3*0 + 1]; + line_ssds[2] = line_ssd_tails[3*0 + 2]; + + for(int i = 0; i < N_DISPARITIES; i++) + { + line_disps[i+3] = d + i; + } + line_disps[0] = disparImage[0]; + line_disps[1] = d - 2; + line_disps[2] = d - 1; + + float thresh = thresh_scale * ::min(line_ssds[0], mssd); + int dtest = disparImage[0]; + + if(mssd < line_ssds[0]) + { + uniqueness_approved[0] = 1; + dtest = d + bestIdx; + } + + if(uniqueness_approved[0]) + { + for (int ld = 0; ld < N_DISPARITIES + 3; ld++) + { + if (((line_disps[ld] < dtest-1) || (line_disps[ld] > dtest+1)) && (line_ssds[ld] <= thresh)) + { + printf("[%d, %d, %d] Dropped uniqueness at %d, %d\n", blockIdx.x, blockIdx.y, threadIdx.x, d, ld); + uniqueness_approved[0] = 0; + break; + } + } + } + + line_ssd_tails[3*0 + 0] = ::min(line_ssds[0], mssd); + line_ssd_tails[3*0 + 1] = batch_ssds[6]; + line_ssd_tails[3*0 + 2] = batch_ssds[7]; + } + if (mssd < minSSDImage[0]) { disparImage[0] = (unsigned char)(d + bestIdx); @@ -314,8 +374,6 @@ namespace cv { namespace cuda { namespace device if (row < cheight - RADIUS - Y) { - uint* batch_ssds = line_ssds + row * 256 + d - STEREO_MIND; - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); __syncthreads(); @@ -334,7 +392,8 @@ namespace cv { namespace cuda { namespace device batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); __syncthreads(); - int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); + int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), + ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); int bestIdx = 0; for (int i = 0; i < N_DISPARITIES; i++) @@ -355,6 +414,48 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { + if (uniquenessRatio > 0) + { + // restore + line_ssds[0] = minSSDImage[row * cminSSD_step]; + line_ssds[1] = line_ssd_tails[3*row + 1]; + line_ssds[2] = line_ssd_tails[3*row + 2]; + + + for(int i = 0; i < N_DISPARITIES; i++) + { + line_disps[i+3] = d + i; + } + line_disps[0] = disparImage[disp.step * row]; + line_disps[1] = d - 2; + line_disps[2] = d - 1; + + float thresh = thresh_scale * ::min(line_ssds[0], mssd); + int dtest = disparImage[disp.step * row]; + + if(mssd < line_ssds[0]) + { + uniqueness_approved[row] = 1; + dtest = d + bestIdx; + } + + if(uniqueness_approved[row]) + { + for (int ld = 0; ld < N_DISPARITIES + 3; ld++) + { + if (((line_disps[ld] < dtest-1) || (line_disps[ld] > dtest+1)) && (line_ssds[ld] <= thresh)) + { + uniqueness_approved[row] = 0; + break; + } + } + } + + line_ssd_tails[3*row + 0] = ::min(line_ssds[0], mssd); + line_ssd_tails[3*row + 1] = batch_ssds[6]; + line_ssd_tails[3*row + 2] = batch_ssds[7]; + } + int idx = row * cminSSD_step; if (mssd < minSSDImage[idx]) { @@ -371,25 +472,10 @@ namespace cv { namespace cuda { namespace device if (uniquenessRatio > 0) { - for (int row = 0; row < end_row; row++) + for (int row = 0; row < ROWSperTHREAD; row++) { - uint suboptimal_ssd = minSSDImage[row * cminSSD_step]; - uint suboptimal_d = disparImage[disp.step * row]; - float thresh = (1.0 + uniquenessRatio / 100.0f) * suboptimal_ssd; - - int d = 0; - uint * batch_ssds = line_ssds + row*256; - for (; d < maxdisp - STEREO_MIND; d++) - { - if( (d < suboptimal_d-1 || d > suboptimal_d+1) && (batch_ssds[d] <= thresh) ) - { - break; - } - } - if( d < maxdisp ) - { - disparImage[disp.step * row] = 0; - } + // drop disparity for pixel where uniqueness requirement was not satisfied (zero value) + disparImage[disp.step * row] = disparImage[disp.step * row] * uniqueness_approved[row]; } } } diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp index bd052e6447e..14f7d97f964 100644 --- a/modules/cudastereo/test/test_stereo.cpp +++ b/modules/cudastereo/test/test_stereo.cpp @@ -140,6 +140,7 @@ CUDA_TEST_P(StereoBM, Uniqueness_Regression) cv::Mat disp_cpu; disp.download(disp_cpu); + cv::imwrite("disp_inq15.png", disp_cpu); EXPECT_MAT_NEAR(disp_gold, disp, 0.0); } From 1ab12be4fcdf7f49af92726c373ff0a4bd754ede Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Mon, 18 Oct 2021 17:16:14 +0300 Subject: [PATCH 03/10] Got rid of line_disps array. --- modules/cudastereo/src/cuda/stereobm.cu | 36 +++++++++---------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index ce7a85fe11e..c7affe5a5d1 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -205,7 +205,6 @@ namespace cv { namespace cuda { namespace device { extern __shared__ unsigned int col_ssd_cache[]; uint line_ssds[3 + N_DISPARITIES]; // 1 - local minima + 2 - tail of previous batch for accurate uniquenessRatio check - uint line_disps[3 + N_DISPARITIES]; uint* batch_ssds = line_ssds + 3; uint line_ssd_tails[3*ROWSperTHREAD]; @@ -312,14 +311,6 @@ namespace cv { namespace cuda { namespace device line_ssds[1] = line_ssd_tails[3*0 + 1]; line_ssds[2] = line_ssd_tails[3*0 + 2]; - for(int i = 0; i < N_DISPARITIES; i++) - { - line_disps[i+3] = d + i; - } - line_disps[0] = disparImage[0]; - line_disps[1] = d - 2; - line_disps[2] = d - 1; - float thresh = thresh_scale * ::min(line_ssds[0], mssd); int dtest = disparImage[0]; @@ -331,15 +322,18 @@ namespace cv { namespace cuda { namespace device if(uniqueness_approved[0]) { - for (int ld = 0; ld < N_DISPARITIES + 3; ld++) + for (int ld = 1; ld < N_DISPARITIES + 3; ld++) { - if (((line_disps[ld] < dtest-1) || (line_disps[ld] > dtest+1)) && (line_ssds[ld] <= thresh)) + if ((d+ld-3 < dtest-1 || d+ld-3 > dtest+1) && (line_ssds[ld] <= thresh)) { - printf("[%d, %d, %d] Dropped uniqueness at %d, %d\n", blockIdx.x, blockIdx.y, threadIdx.x, d, ld); uniqueness_approved[0] = 0; break; } } + if ((disparImage[0] < dtest-1 || disparImage[0] > dtest+1) && (line_ssds[0] <= thresh)) + { + uniqueness_approved[0] = 0; + } } line_ssd_tails[3*0 + 0] = ::min(line_ssds[0], mssd); @@ -416,20 +410,10 @@ namespace cv { namespace cuda { namespace device { if (uniquenessRatio > 0) { - // restore line_ssds[0] = minSSDImage[row * cminSSD_step]; line_ssds[1] = line_ssd_tails[3*row + 1]; line_ssds[2] = line_ssd_tails[3*row + 2]; - - for(int i = 0; i < N_DISPARITIES; i++) - { - line_disps[i+3] = d + i; - } - line_disps[0] = disparImage[disp.step * row]; - line_disps[1] = d - 2; - line_disps[2] = d - 1; - float thresh = thresh_scale * ::min(line_ssds[0], mssd); int dtest = disparImage[disp.step * row]; @@ -441,14 +425,18 @@ namespace cv { namespace cuda { namespace device if(uniqueness_approved[row]) { - for (int ld = 0; ld < N_DISPARITIES + 3; ld++) + for (int ld = 1; ld < N_DISPARITIES + 3; ld++) { - if (((line_disps[ld] < dtest-1) || (line_disps[ld] > dtest+1)) && (line_ssds[ld] <= thresh)) + if (((d+ld-3 < dtest-1) || (d+ld-3 > dtest+1)) && (line_ssds[ld] <= thresh)) { uniqueness_approved[row] = 0; break; } } + if ((disparImage[disp.step * row] < dtest-1 || disparImage[disp.step * row] > dtest+1) && (line_ssds[0] <= thresh)) + { + uniqueness_approved[row] = 0; + } } line_ssd_tails[3*row + 0] = ::min(line_ssds[0], mssd); From ea23bb9aba90eaeedca75c8054ec4e314755974d Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Mon, 18 Oct 2021 17:23:24 +0300 Subject: [PATCH 04/10] Reduced line_ssds. --- modules/cudastereo/src/cuda/stereobm.cu | 50 ++++++++++++------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index c7affe5a5d1..16663cf53ed 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -204,8 +204,8 @@ namespace cv { namespace cuda { namespace device __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp, int uniquenessRatio) { extern __shared__ unsigned int col_ssd_cache[]; - uint line_ssds[3 + N_DISPARITIES]; // 1 - local minima + 2 - tail of previous batch for accurate uniquenessRatio check - uint* batch_ssds = line_ssds + 3; + uint line_ssds[2 + N_DISPARITIES]; // +2 - tail of previous batch for accurate uniquenessRatio check + uint* batch_ssds = line_ssds + 2; uint line_ssd_tails[3*ROWSperTHREAD]; uchar uniqueness_approved[ROWSperTHREAD]; @@ -307,36 +307,35 @@ namespace cv { namespace cuda { namespace device { if (uniquenessRatio > 0) { - line_ssds[0] = minSSDImage[0]; - line_ssds[1] = line_ssd_tails[3*0 + 1]; - line_ssds[2] = line_ssd_tails[3*0 + 2]; + line_ssds[0] = line_ssd_tails[3*0 + 1]; + line_ssds[1] = line_ssd_tails[3*0 + 2]; - float thresh = thresh_scale * ::min(line_ssds[0], mssd); + float thresh = thresh_scale * ::min(minSSDImage[0], mssd); int dtest = disparImage[0]; - if(mssd < line_ssds[0]) + if(mssd < minSSDImage[0]) { uniqueness_approved[0] = 1; dtest = d + bestIdx; + if ((disparImage[0] < dtest-1 || disparImage[0] > dtest+1) && (minSSDImage[0] <= thresh)) + { + uniqueness_approved[0] = 0; + } } if(uniqueness_approved[0]) { - for (int ld = 1; ld < N_DISPARITIES + 3; ld++) + for (int ld = 0; ld < N_DISPARITIES + 2; ld++) { - if ((d+ld-3 < dtest-1 || d+ld-3 > dtest+1) && (line_ssds[ld] <= thresh)) + if ((d+ld-2 < dtest-1 || d+ld-2 > dtest+1) && (line_ssds[ld] <= thresh)) { uniqueness_approved[0] = 0; break; } } - if ((disparImage[0] < dtest-1 || disparImage[0] > dtest+1) && (line_ssds[0] <= thresh)) - { - uniqueness_approved[0] = 0; - } } - line_ssd_tails[3*0 + 0] = ::min(line_ssds[0], mssd); + line_ssd_tails[3*0 + 0] = ::min(minSSDImage[0], mssd); line_ssd_tails[3*0 + 1] = batch_ssds[6]; line_ssd_tails[3*0 + 2] = batch_ssds[7]; } @@ -410,36 +409,35 @@ namespace cv { namespace cuda { namespace device { if (uniquenessRatio > 0) { - line_ssds[0] = minSSDImage[row * cminSSD_step]; - line_ssds[1] = line_ssd_tails[3*row + 1]; - line_ssds[2] = line_ssd_tails[3*row + 2]; + line_ssds[0] = line_ssd_tails[3*row + 1]; + line_ssds[1] = line_ssd_tails[3*row + 2]; - float thresh = thresh_scale * ::min(line_ssds[0], mssd); + float thresh = thresh_scale * ::min(minSSDImage[row * cminSSD_step], mssd); int dtest = disparImage[disp.step * row]; - if(mssd < line_ssds[0]) + if(mssd < minSSDImage[row * cminSSD_step]) { uniqueness_approved[row] = 1; dtest = d + bestIdx; + if ((disparImage[disp.step * row] < dtest-1 || disparImage[disp.step * row] > dtest+1) && (minSSDImage[row * cminSSD_step] <= thresh)) + { + uniqueness_approved[row] = 0; + } } if(uniqueness_approved[row]) { - for (int ld = 1; ld < N_DISPARITIES + 3; ld++) + for (int ld = 0; ld < N_DISPARITIES + 2; ld++) { - if (((d+ld-3 < dtest-1) || (d+ld-3 > dtest+1)) && (line_ssds[ld] <= thresh)) + if (((d+ld-2 < dtest-1) || (d+ld-2 > dtest+1)) && (line_ssds[ld] <= thresh)) { uniqueness_approved[row] = 0; break; } } - if ((disparImage[disp.step * row] < dtest-1 || disparImage[disp.step * row] > dtest+1) && (line_ssds[0] <= thresh)) - { - uniqueness_approved[row] = 0; - } } - line_ssd_tails[3*row + 0] = ::min(line_ssds[0], mssd); + line_ssd_tails[3*row + 0] = ::min(minSSDImage[row * cminSSD_step], mssd); line_ssd_tails[3*row + 1] = batch_ssds[6]; line_ssd_tails[3*row + 2] = batch_ssds[7]; } From d5b7f0d7aedab1448760c7407853138803d45ebf Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 19 Oct 2021 15:02:37 +0300 Subject: [PATCH 05/10] Apply streams for all parts of cuda::stereoBM::compute. --- modules/cudastereo/src/cuda/stereobm.cu | 32 +++++++++++++------------ modules/cudastereo/test/test_stereo.cpp | 20 ++++++++++++++++ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 16663cf53ed..533369be690 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -407,19 +407,21 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { + int ssd_idx = row * cminSSD_step; + int disp_idx = row * disp.step; if (uniquenessRatio > 0) { line_ssds[0] = line_ssd_tails[3*row + 1]; line_ssds[1] = line_ssd_tails[3*row + 2]; - float thresh = thresh_scale * ::min(minSSDImage[row * cminSSD_step], mssd); - int dtest = disparImage[disp.step * row]; + float thresh = thresh_scale * ::min(minSSDImage[ssd_idx], mssd); + int dtest = disparImage[disp_idx]; - if(mssd < minSSDImage[row * cminSSD_step]) + if(mssd < minSSDImage[ssd_idx]) { uniqueness_approved[row] = 1; dtest = d + bestIdx; - if ((disparImage[disp.step * row] < dtest-1 || disparImage[disp.step * row] > dtest+1) && (minSSDImage[row * cminSSD_step] <= thresh)) + if ((disparImage[disp_idx] < dtest-1 || disparImage[disp_idx] > dtest+1) && (minSSDImage[ssd_idx] <= thresh)) { uniqueness_approved[row] = 0; } @@ -437,16 +439,16 @@ namespace cv { namespace cuda { namespace device } } - line_ssd_tails[3*row + 0] = ::min(minSSDImage[row * cminSSD_step], mssd); + line_ssd_tails[3*row + 0] = ::min(minSSDImage[ssd_idx], mssd); line_ssd_tails[3*row + 1] = batch_ssds[6]; line_ssd_tails[3*row + 2] = batch_ssds[7]; } - int idx = row * cminSSD_step; - if (mssd < minSSDImage[idx]) + + if (mssd < minSSDImage[ssd_idx]) { - disparImage[disp.step * row] = (unsigned char)(d + bestIdx); - minSSDImage[idx] = mssd; + disparImage[disp_idx] = (unsigned char)(d + bestIdx); + minSSDImage[ssd_idx] = mssd; } } } @@ -512,15 +514,15 @@ namespace cv { namespace cuda { namespace device //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) ); //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) ); - cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) ); - cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) ); + cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) ); + cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) ); - cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) ); - cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) ); - cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) ); + cudaSafeCall( cudaMemcpyToSymbolAsync( cwidth, &left.cols, sizeof(left.cols) , 0, cudaMemcpyHostToDevice, stream) ); + cudaSafeCall( cudaMemcpyToSymbolAsync( cheight, &left.rows, sizeof(left.rows), 0, cudaMemcpyHostToDevice, stream ) ); + cudaSafeCall( cudaMemcpyToSymbolAsync( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) , 0, cudaMemcpyHostToDevice, stream ) ); size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); - cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) ); + cudaSafeCall( cudaMemcpyToSymbolAsync( cminSSD_step, &minssd_step, sizeof(minssd_step) , 0, cudaMemcpyHostToDevice, stream ) ); callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, stream); } diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp index 14f7d97f964..acfd846a042 100644 --- a/modules/cudastereo/test/test_stereo.cpp +++ b/modules/cudastereo/test/test_stereo.cpp @@ -122,6 +122,26 @@ CUDA_TEST_P(StereoBM, PrefilterNormRegression) EXPECT_MAT_NEAR(disp_gold, disp, 0.0); } +CUDA_TEST_P(StereoBM, Streams) +{ + cv::cuda::Stream stream; + cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE); + cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE); + cv::Mat disp_gold = readImage("stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE); + + ASSERT_FALSE(left_image.empty()); + ASSERT_FALSE(right_image.empty()); + ASSERT_FALSE(disp_gold.empty()); + + cv::Ptr bm = cv::cuda::createStereoBM(128, 19); + cv::cuda::GpuMat disp; + + bm->compute(loadMat(left_image), loadMat(right_image), disp, stream); + stream.waitForCompletion(); + + EXPECT_MAT_NEAR(disp_gold, disp, 0.0); +} + CUDA_TEST_P(StereoBM, Uniqueness_Regression) { cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE); From df5b1b24a70fa4f496c920ad611489d58a3e8d98 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Mon, 25 Oct 2021 12:32:30 +0300 Subject: [PATCH 06/10] Added perf test for CUDA stereoBM with uniqueness check. --- modules/cudastereo/perf/perf_stereo.cpp | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/modules/cudastereo/perf/perf_stereo.cpp b/modules/cudastereo/perf/perf_stereo.cpp index 2b999d9d120..d691ebc26c1 100644 --- a/modules/cudastereo/perf/perf_stereo.cpp +++ b/modules/cudastereo/perf/perf_stereo.cpp @@ -87,6 +87,45 @@ PERF_TEST_P(ImagePair, StereoBM, } } +PERF_TEST_P(ImagePair, StereoBMwithUniqueness, + Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png"))) +{ + declare.time(300.0); + + const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(imgLeft.empty()); + + const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(imgRight.empty()); + + const int ndisp = 256; + + if (PERF_RUN_CUDA()) + { + cv::Ptr d_bm = cv::cuda::createStereoBM(ndisp); + d_bm->setUniquenessRatio(10); + + const cv::cuda::GpuMat d_imgLeft(imgLeft); + const cv::cuda::GpuMat d_imgRight(imgRight); + cv::cuda::GpuMat dst; + + TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Ptr bm = cv::StereoBM::create(ndisp); + bm->setUniquenessRatio(10); + + cv::Mat dst; + + TEST_CYCLE() bm->compute(imgLeft, imgRight, dst); + + CPU_SANITY_CHECK(dst); + } +} + ////////////////////////////////////////////////////////////////////// // StereoBeliefPropagation From 3d75319eae4b844b6e843c73aebfd1d1588e53af Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Mon, 25 Oct 2021 12:25:44 +0300 Subject: [PATCH 07/10] Optimized global memory transactions. --- modules/cudastereo/src/cuda/stereobm.cu | 101 +++++++++++++----------- 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 533369be690..c5e6917551e 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -43,6 +43,7 @@ #if !defined CUDA_DISABLER #include "opencv2/core/cuda/common.hpp" +#include namespace cv { namespace cuda { namespace device { @@ -209,25 +210,18 @@ namespace cv { namespace cuda { namespace device uint line_ssd_tails[3*ROWSperTHREAD]; uchar uniqueness_approved[ROWSperTHREAD]; + uchar local_disparity[ROWSperTHREAD] = {0}; volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x; - volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS) + volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; - //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD) - int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS); - //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS) - #define Y (blockIdx.y * ROWSperTHREAD + RADIUS) - //int Y = blockIdx.y * ROWSperTHREAD + RADIUS; + const int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS); + const int Y = (blockIdx.y * ROWSperTHREAD + RADIUS); unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; unsigned char* disparImage = disp.data + X + Y * disp.step; float thresh_scale; - //if (X < cwidth) - //{ - // unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step; - // for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step ) - // *ptr = 0xFFFFFFFF; - //} + int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS); int y_tex; int x_tex = X - RADIUS; @@ -235,20 +229,20 @@ namespace cv { namespace cuda { namespace device if (x_tex >= cwidth) return; + for(int i = 0; i < 3*ROWSperTHREAD; i++) + { + line_ssd_tails[i] = UINT_MAX; + } if (uniquenessRatio > 0) { - batch_ssds[6] = 0xFFFFFFFF; - batch_ssds[7] = 0xFFFFFFFF; + batch_ssds[6] = UINT_MAX; + batch_ssds[7] = UINT_MAX; thresh_scale = (1.0 + uniquenessRatio / 100.0f); for(int i = 0; i < ROWSperTHREAD; i++) { uniqueness_approved[i] = 1; } - for(int i = 0; i < 3*ROWSperTHREAD; i++) - { - line_ssd_tails[i] = 0xFFFFFFFF; - } } for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP) @@ -261,7 +255,7 @@ namespace cv { namespace cuda { namespace device if (x_tex + BLOCK_W < cwidth) InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra); - __syncthreads(); //before CalcSSDVector function + __syncthreads(); //before CalcSSD function if (Y < cheight - RADIUS) { @@ -281,9 +275,8 @@ namespace cv { namespace cuda { namespace device batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); __syncthreads(); batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), + uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); int bestIdx = 0; @@ -305,19 +298,22 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { + unsigned int last_opt = line_ssd_tails[3*0 + 0]; + unsigned int opt = ::min(last_opt, mssd); + if (uniquenessRatio > 0) { line_ssds[0] = line_ssd_tails[3*0 + 1]; line_ssds[1] = line_ssd_tails[3*0 + 2]; - float thresh = thresh_scale * ::min(minSSDImage[0], mssd); - int dtest = disparImage[0]; + float thresh = thresh_scale * opt; + int dtest = local_disparity[0]; - if(mssd < minSSDImage[0]) + if(mssd < last_opt) { uniqueness_approved[0] = 1; dtest = d + bestIdx; - if ((disparImage[0] < dtest-1 || disparImage[0] > dtest+1) && (minSSDImage[0] <= thresh)) + if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh)) { uniqueness_approved[0] = 0; } @@ -325,9 +321,11 @@ namespace cv { namespace cuda { namespace device if(uniqueness_approved[0]) { - for (int ld = 0; ld < N_DISPARITIES + 2; ld++) + // the trial to decompose the code on 2 loops without ld vs dtest makes + // uniqueness check dramatically slow. at least on gf 1080 + for (int ld = d-2; ld < d + N_DISPARITIES; ld++) { - if ((d+ld-2 < dtest-1 || d+ld-2 > dtest+1) && (line_ssds[ld] <= thresh)) + if ((ld < dtest-1 || ld > dtest+1) && (line_ssds[ld-d+2] <= thresh)) { uniqueness_approved[0] = 0; break; @@ -335,15 +333,15 @@ namespace cv { namespace cuda { namespace device } } - line_ssd_tails[3*0 + 0] = ::min(minSSDImage[0], mssd); + line_ssd_tails[3*0 + 1] = batch_ssds[6]; line_ssd_tails[3*0 + 2] = batch_ssds[7]; } - if (mssd < minSSDImage[0]) + line_ssd_tails[3*0 + 0] = opt; + if (mssd < last_opt) { - disparImage[0] = (unsigned char)(d + bestIdx); - minSSDImage[0] = mssd; + local_disparity[0] = (unsigned char)(d + bestIdx); } } } @@ -383,9 +381,8 @@ namespace cv { namespace cuda { namespace device batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); __syncthreads(); batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); - __syncthreads(); - int mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), + uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); int bestIdx = 0; @@ -407,21 +404,21 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { - int ssd_idx = row * cminSSD_step; - int disp_idx = row * disp.step; + unsigned int last_opt = line_ssd_tails[3*row + 0]; + unsigned int opt = ::min(last_opt, mssd); if (uniquenessRatio > 0) { line_ssds[0] = line_ssd_tails[3*row + 1]; line_ssds[1] = line_ssd_tails[3*row + 2]; - float thresh = thresh_scale * ::min(minSSDImage[ssd_idx], mssd); - int dtest = disparImage[disp_idx]; + float thresh = thresh_scale * opt; + int dtest = local_disparity[row]; - if(mssd < minSSDImage[ssd_idx]) + if(mssd < last_opt) { uniqueness_approved[row] = 1; dtest = d + bestIdx; - if ((disparImage[disp_idx] < dtest-1 || disparImage[disp_idx] > dtest+1) && (minSSDImage[ssd_idx] <= thresh)) + if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh)) { uniqueness_approved[row] = 0; } @@ -439,16 +436,15 @@ namespace cv { namespace cuda { namespace device } } - line_ssd_tails[3*row + 0] = ::min(minSSDImage[ssd_idx], mssd); line_ssd_tails[3*row + 1] = batch_ssds[6]; line_ssd_tails[3*row + 2] = batch_ssds[7]; } + line_ssd_tails[3*row + 0] = opt; - if (mssd < minSSDImage[ssd_idx]) + if (mssd < last_opt) { - disparImage[disp_idx] = (unsigned char)(d + bestIdx); - minSSDImage[ssd_idx] = mssd; + local_disparity[row] = (unsigned char)(d + bestIdx); } } } @@ -458,12 +454,24 @@ namespace cv { namespace cuda { namespace device } // for d loop + for (int row = 0; row < end_row; row++) + { + minSSDImage[row * cminSSD_step] = line_ssd_tails[3*row + 0]; + } + if (uniquenessRatio > 0) { - for (int row = 0; row < ROWSperTHREAD; row++) + for (int row = 0; row < end_row; row++) { // drop disparity for pixel where uniqueness requirement was not satisfied (zero value) - disparImage[disp.step * row] = disparImage[disp.step * row] * uniqueness_approved[row]; + disparImage[disp.step * row] = local_disparity[row] * uniqueness_approved[row]; + } + } + else + { + for (int row = 0; row < end_row; row++) + { + disparImage[disp.step * row] = local_disparity[row]; } } } @@ -514,9 +522,6 @@ namespace cv { namespace cuda { namespace device //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) ); //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) ); - cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) ); - cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) ); - cudaSafeCall( cudaMemcpyToSymbolAsync( cwidth, &left.cols, sizeof(left.cols) , 0, cudaMemcpyHostToDevice, stream) ); cudaSafeCall( cudaMemcpyToSymbolAsync( cheight, &left.rows, sizeof(left.rows), 0, cudaMemcpyHostToDevice, stream ) ); cudaSafeCall( cudaMemcpyToSymbolAsync( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) , 0, cudaMemcpyHostToDevice, stream ) ); From 5d8a3a1aa85d07d8e0bcb95a36db0d14a26a1437 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 26 Oct 2021 16:45:26 +0300 Subject: [PATCH 08/10] Restored sync data transfers as they use stack variables. --- modules/cudastereo/src/cuda/stereobm.cu | 11 +++++++---- modules/cudastereo/test/test_stereo.cpp | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index c5e6917551e..c247dc4a220 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -522,12 +522,15 @@ namespace cv { namespace cuda { namespace device //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) ); //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) ); - cudaSafeCall( cudaMemcpyToSymbolAsync( cwidth, &left.cols, sizeof(left.cols) , 0, cudaMemcpyHostToDevice, stream) ); - cudaSafeCall( cudaMemcpyToSymbolAsync( cheight, &left.rows, sizeof(left.rows), 0, cudaMemcpyHostToDevice, stream ) ); - cudaSafeCall( cudaMemcpyToSymbolAsync( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) , 0, cudaMemcpyHostToDevice, stream ) ); + cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) ); + cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) ); + + cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) ); + cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) ); + cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) ); size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); - cudaSafeCall( cudaMemcpyToSymbolAsync( cminSSD_step, &minssd_step, sizeof(minssd_step) , 0, cudaMemcpyHostToDevice, stream ) ); + cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) ); callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, stream); } diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp index acfd846a042..80344a3ae15 100644 --- a/modules/cudastereo/test/test_stereo.cpp +++ b/modules/cudastereo/test/test_stereo.cpp @@ -41,6 +41,7 @@ //M*/ #include "test_precomp.hpp" +#include #ifdef HAVE_CUDA From 5d55c312ad707709a6f45118dc80588673e3661d Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 28 Oct 2021 17:18:35 +0300 Subject: [PATCH 09/10] Do not use constant memory in stereoBM to exclude stream races. --- modules/cudastereo/src/cuda/stereobm.cu | 64 ++++++++++++------------- modules/cudastereo/test/test_stereo.cpp | 1 - 2 files changed, 30 insertions(+), 35 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index c247dc4a220..115e4f4bb00 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -61,18 +61,13 @@ namespace cv { namespace cuda { namespace device #define STEREO_MIND 0 // The minimum d range to check #define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing - __constant__ unsigned int* cminSSDImage; - __constant__ size_t cminSSD_step; - __constant__ int cwidth; - __constant__ int cheight; - __device__ __forceinline__ int SQ(int a) { return a * a; } template - __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X) + __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth) { unsigned int cache = 0; unsigned int cache2 = 0; @@ -202,7 +197,8 @@ namespace cv { namespace cuda { namespace device } template - __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp, int uniquenessRatio) + __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp, + int uniquenessRatio, unsigned int* cminSSDImage, size_t cminSSD_step, int cwidth, int cheight) { extern __shared__ unsigned int col_ssd_cache[]; uint line_ssds[2 + N_DISPARITIES]; // +2 - tail of previous batch for accurate uniquenessRatio check @@ -210,7 +206,7 @@ namespace cv { namespace cuda { namespace device uint line_ssd_tails[3*ROWSperTHREAD]; uchar uniqueness_approved[ROWSperTHREAD]; - uchar local_disparity[ROWSperTHREAD] = {0}; + uchar local_disparity[ROWSperTHREAD]; volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x; volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; @@ -229,6 +225,9 @@ namespace cv { namespace cuda { namespace device if (x_tex >= cwidth) return; + for(int i = 0; i < ROWSperTHREAD; i++) + local_disparity[i] = 0; + for(int i = 0; i < 3*ROWSperTHREAD; i++) { line_ssd_tails[i] = UINT_MAX; @@ -260,21 +259,21 @@ namespace cv { namespace cuda { namespace device if (Y < cheight - RADIUS) { //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) - batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth); uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); @@ -366,21 +365,21 @@ namespace cv { namespace cuda { namespace device if (row < cheight - RADIUS - Y) { //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) - batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); + batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth); uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); @@ -477,7 +476,8 @@ namespace cv { namespace cuda { namespace device } template void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, - int maxdisp, int uniquenessRatio, cudaStream_t & stream) + int maxdisp, int uniquenessRatio, unsigned int* missd_buffer, + size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream) { dim3 grid(1,1,1); dim3 threads(BLOCK_W, 1, 1); @@ -488,7 +488,8 @@ namespace cv { namespace cuda { namespace device //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int); - stereoKernel<<>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio); + stereoKernel<<>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio, + missd_buffer, minssd_step, cwidth, cheight); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -496,7 +497,8 @@ namespace cv { namespace cuda { namespace device }; typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, - int maxdisp, int uniquenessRatio, cudaStream_t & stream); + int maxdisp, int uniquenessRatio, unsigned int* missd_buffer, + size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream); const static kernel_caller_t callers[] = { @@ -525,14 +527,8 @@ namespace cv { namespace cuda { namespace device cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) ); cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) ); - cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) ); - cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) ); - cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) ); - size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); - cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) ); - - callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, stream); + callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream); } __device__ inline int clamp(int x, int a, int b) diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp index 80344a3ae15..acfd846a042 100644 --- a/modules/cudastereo/test/test_stereo.cpp +++ b/modules/cudastereo/test/test_stereo.cpp @@ -41,7 +41,6 @@ //M*/ #include "test_precomp.hpp" -#include #ifdef HAVE_CUDA From 24384b40a56d96b5eb44e3b0f5087f5f4ca389f3 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 14 Dec 2021 08:22:25 +0300 Subject: [PATCH 10/10] Code review fixes. --- modules/cudastereo/src/cuda/stereobm.cu | 128 ++++++++++-------------- 1 file changed, 53 insertions(+), 75 deletions(-) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 115e4f4bb00..1616f984f67 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -94,6 +94,38 @@ namespace cv { namespace cuda { namespace device return col_ssd[0] + cache + cache2; } + template + __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int* ssd) + { + //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) + ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth); + __syncthreads(); + ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth); + __syncthreads(); + ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth); + __syncthreads(); + ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth); + __syncthreads(); + ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth); + __syncthreads(); + ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth); + __syncthreads(); + ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth); + __syncthreads(); + ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth); + + int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7]))); + + int bestIdx = 0; + for (int i = 0; i < N_DISPARITIES; i++) + { + if (mssd == ssd[i]) + bestIdx = i; + } + + return make_uint2(mssd, bestIdx); + } + template __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd) { @@ -254,42 +286,17 @@ namespace cv { namespace cuda { namespace device if (x_tex + BLOCK_W < cwidth) InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra); - __syncthreads(); //before CalcSSD function + __syncthreads(); //before MinSSD function if (Y < cheight - RADIUS) { - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) - batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth); - - uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), - ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); - - int bestIdx = 0; - for (int i = 0; i < N_DISPARITIES; i++) - { - if (mssd == batch_ssds[i]) - bestIdx = i; - } + uint2 batch_opt = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds); // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously - // computed "minSSD" value is not used at all. + // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all. // - // However, since the batch_ssds computation has "__syncthreads" call in its body, those threads - // must also call "CalcSSD" to avoid deadlock. (#13850) + // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads + // must also call "MinSSD" to avoid deadlock. (#13850) // // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads" // could be an option, but the shared memory access pattern does not allow this option, @@ -298,7 +305,7 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { unsigned int last_opt = line_ssd_tails[3*0 + 0]; - unsigned int opt = ::min(last_opt, mssd); + unsigned int opt = ::min(last_opt, batch_opt.x); if (uniquenessRatio > 0) { @@ -308,10 +315,10 @@ namespace cv { namespace cuda { namespace device float thresh = thresh_scale * opt; int dtest = local_disparity[0]; - if(mssd < last_opt) + if(batch_opt.x < last_opt) { uniqueness_approved[0] = 1; - dtest = d + bestIdx; + dtest = d + batch_opt.y; if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh)) { uniqueness_approved[0] = 0; @@ -338,9 +345,9 @@ namespace cv { namespace cuda { namespace device } line_ssd_tails[3*0 + 0] = opt; - if (mssd < last_opt) + if (batch_opt.x < last_opt) { - local_disparity[0] = (unsigned char)(d + bestIdx); + local_disparity[0] = (unsigned char)(d + batch_opt.y); } } } @@ -364,38 +371,12 @@ namespace cv { namespace cuda { namespace device if (row < cheight - RADIUS - Y) { - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) - batch_ssds[0] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[1] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[2] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[3] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[4] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[5] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[6] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth); - __syncthreads(); - batch_ssds[7] = CalcSSD(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth); - - uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])), - ::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7]))); - - int bestIdx = 0; - for (int i = 0; i < N_DISPARITIES; i++) - { - if (mssd == batch_ssds[i]) - bestIdx = i; - } - + uint2 batch_opt = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds); // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously - // computed "minSSD" value, which is the result of "CalcSSDVector" function call, is not used at all. + // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all. // - // However, since the "CalcSSDVector" function has "__syncthreads" call in its body, those threads - // must also call "CalcSSDVector" to avoid deadlock. (#13850) + // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads + // must also call "MinSSD" to avoid deadlock. (#13850) // // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads" // could be an option, but the shared memory access pattern does not allow this option, @@ -404,7 +385,7 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { unsigned int last_opt = line_ssd_tails[3*row + 0]; - unsigned int opt = ::min(last_opt, mssd); + unsigned int opt = ::min(last_opt, batch_opt.x); if (uniquenessRatio > 0) { line_ssds[0] = line_ssd_tails[3*row + 1]; @@ -413,10 +394,10 @@ namespace cv { namespace cuda { namespace device float thresh = thresh_scale * opt; int dtest = local_disparity[row]; - if(mssd < last_opt) + if(batch_opt.x < last_opt) { uniqueness_approved[row] = 1; - dtest = d + bestIdx; + dtest = d + batch_opt.y; if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh)) { uniqueness_approved[row] = 0; @@ -441,9 +422,9 @@ namespace cv { namespace cuda { namespace device line_ssd_tails[3*row + 0] = opt; - if (mssd < last_opt) + if (batch_opt.x < last_opt) { - local_disparity[row] = (unsigned char)(d + bestIdx); + local_disparity[row] = (unsigned char)(d + batch_opt.y); } } } @@ -521,11 +502,8 @@ namespace cv { namespace cuda { namespace device if (winsz2 == 0 || winsz2 >= calles_num) CV_Error(cv::Error::StsBadArg, "Unsupported window size"); - //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) ); - //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) ); - - cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) ); - cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) ); + cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) ); + cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) ); size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream);