diff --git a/modules/cudastereo/perf/perf_stereo.cpp b/modules/cudastereo/perf/perf_stereo.cpp index 2b999d9d120..d691ebc26c1 100644 --- a/modules/cudastereo/perf/perf_stereo.cpp +++ b/modules/cudastereo/perf/perf_stereo.cpp @@ -87,6 +87,45 @@ PERF_TEST_P(ImagePair, StereoBM, } } +PERF_TEST_P(ImagePair, StereoBMwithUniqueness, + Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png"))) +{ + declare.time(300.0); + + const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(imgLeft.empty()); + + const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(imgRight.empty()); + + const int ndisp = 256; + + if (PERF_RUN_CUDA()) + { + cv::Ptr d_bm = cv::cuda::createStereoBM(ndisp); + d_bm->setUniquenessRatio(10); + + const cv::cuda::GpuMat d_imgLeft(imgLeft); + const cv::cuda::GpuMat d_imgRight(imgRight); + cv::cuda::GpuMat dst; + + TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Ptr bm = cv::StereoBM::create(ndisp); + bm->setUniquenessRatio(10); + + cv::Mat dst; + + TEST_CYCLE() bm->compute(imgLeft, imgRight, dst); + + CPU_SANITY_CHECK(dst); + } +} + ////////////////////////////////////////////////////////////////////// // StereoBeliefPropagation diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 5be19e99ff8..1616f984f67 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -43,6 +43,7 @@ #if !defined CUDA_DISABLER #include "opencv2/core/cuda/common.hpp" +#include namespace cv { namespace cuda { namespace device { @@ -60,18 +61,13 @@ namespace cv { namespace cuda { namespace device #define STEREO_MIND 0 // The minimum d range to check #define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing - __constant__ unsigned int* cminSSDImage; - __constant__ size_t cminSSD_step; - __constant__ int cwidth; - __constant__ int cheight; - __device__ __forceinline__ int SQ(int a) { return a * a; } template - __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X) + __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth) { unsigned int cache = 0; unsigned int cache2 = 0; @@ -99,26 +95,24 @@ namespace cv { namespace cuda { namespace device } template - __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X) + __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int* ssd) { - unsigned int ssd[N_DISPARITIES]; - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) - ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X); + ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X); + ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X); + ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X); + ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X); + ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X); + ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X); + ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth); __syncthreads(); - ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X); + ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth); int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7]))); @@ -235,26 +229,27 @@ namespace cv { namespace cuda { namespace device } template - __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp) + __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp, + int uniquenessRatio, unsigned int* cminSSDImage, size_t cminSSD_step, int cwidth, int cheight) { extern __shared__ unsigned int col_ssd_cache[]; + uint line_ssds[2 + N_DISPARITIES]; // +2 - tail of previous batch for accurate uniquenessRatio check + uint* batch_ssds = line_ssds + 2; + + uint line_ssd_tails[3*ROWSperTHREAD]; + uchar uniqueness_approved[ROWSperTHREAD]; + uchar local_disparity[ROWSperTHREAD]; + volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x; - volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS) + volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; - //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD) - int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS); - //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS) - #define Y (blockIdx.y * ROWSperTHREAD + RADIUS) - //int Y = blockIdx.y * ROWSperTHREAD + RADIUS; + const int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS); + const int Y = (blockIdx.y * ROWSperTHREAD + RADIUS); unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; unsigned char* disparImage = disp.data + X + Y * disp.step; - //if (X < cwidth) - //{ - // unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step; - // for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step ) - // *ptr = 0xFFFFFFFF; - //} + float thresh_scale; + int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS); int y_tex; int x_tex = X - RADIUS; @@ -262,6 +257,25 @@ namespace cv { namespace cuda { namespace device if (x_tex >= cwidth) return; + for(int i = 0; i < ROWSperTHREAD; i++) + local_disparity[i] = 0; + + for(int i = 0; i < 3*ROWSperTHREAD; i++) + { + line_ssd_tails[i] = UINT_MAX; + } + + if (uniquenessRatio > 0) + { + batch_ssds[6] = UINT_MAX; + batch_ssds[7] = UINT_MAX; + thresh_scale = (1.0 + uniquenessRatio / 100.0f); + for(int i = 0; i < ROWSperTHREAD; i++) + { + uniqueness_approved[i] = 1; + } + } + for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP) { y_tex = Y - RADIUS; @@ -276,10 +290,10 @@ namespace cv { namespace cuda { namespace device if (Y < cheight - RADIUS) { - uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X); + uint2 batch_opt = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds); // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously - // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all. + // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all. // // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads // must also call "MinSSD" to avoid deadlock. (#13850) @@ -290,10 +304,50 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { - if (minSSD.x < minSSDImage[0]) + unsigned int last_opt = line_ssd_tails[3*0 + 0]; + unsigned int opt = ::min(last_opt, batch_opt.x); + + if (uniquenessRatio > 0) + { + line_ssds[0] = line_ssd_tails[3*0 + 1]; + line_ssds[1] = line_ssd_tails[3*0 + 2]; + + float thresh = thresh_scale * opt; + int dtest = local_disparity[0]; + + if(batch_opt.x < last_opt) + { + uniqueness_approved[0] = 1; + dtest = d + batch_opt.y; + if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh)) + { + uniqueness_approved[0] = 0; + } + } + + if(uniqueness_approved[0]) + { + // the trial to decompose the code on 2 loops without ld vs dtest makes + // uniqueness check dramatically slow. at least on gf 1080 + for (int ld = d-2; ld < d + N_DISPARITIES; ld++) + { + if ((ld < dtest-1 || ld > dtest+1) && (line_ssds[ld-d+2] <= thresh)) + { + uniqueness_approved[0] = 0; + break; + } + } + } + + + line_ssd_tails[3*0 + 1] = batch_ssds[6]; + line_ssd_tails[3*0 + 2] = batch_ssds[7]; + } + + line_ssd_tails[3*0 + 0] = opt; + if (batch_opt.x < last_opt) { - disparImage[0] = (unsigned char)(d + minSSD.y); - minSSDImage[0] = minSSD.x; + local_disparity[0] = (unsigned char)(d + batch_opt.y); } } } @@ -313,14 +367,13 @@ namespace cv { namespace cuda { namespace device y_tex += 1; - __syncthreads(); //before MinSSD function + __syncthreads(); if (row < cheight - RADIUS - Y) { - uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X); - + uint2 batch_opt = MinSSD(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds); // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously - // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all. + // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all. // // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads // must also call "MinSSD" to avoid deadlock. (#13850) @@ -331,11 +384,47 @@ namespace cv { namespace cuda { namespace device if (X < cwidth - RADIUS) { - int idx = row * cminSSD_step; - if (minSSD.x < minSSDImage[idx]) + unsigned int last_opt = line_ssd_tails[3*row + 0]; + unsigned int opt = ::min(last_opt, batch_opt.x); + if (uniquenessRatio > 0) { - disparImage[disp.step * row] = (unsigned char)(d + minSSD.y); - minSSDImage[idx] = minSSD.x; + line_ssds[0] = line_ssd_tails[3*row + 1]; + line_ssds[1] = line_ssd_tails[3*row + 2]; + + float thresh = thresh_scale * opt; + int dtest = local_disparity[row]; + + if(batch_opt.x < last_opt) + { + uniqueness_approved[row] = 1; + dtest = d + batch_opt.y; + if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh)) + { + uniqueness_approved[row] = 0; + } + } + + if(uniqueness_approved[row]) + { + for (int ld = 0; ld < N_DISPARITIES + 2; ld++) + { + if (((d+ld-2 < dtest-1) || (d+ld-2 > dtest+1)) && (line_ssds[ld] <= thresh)) + { + uniqueness_approved[row] = 0; + break; + } + } + } + + line_ssd_tails[3*row + 1] = batch_ssds[6]; + line_ssd_tails[3*row + 2] = batch_ssds[7]; + } + + line_ssd_tails[3*row + 0] = opt; + + if (batch_opt.x < last_opt) + { + local_disparity[row] = (unsigned char)(d + batch_opt.y); } } } @@ -344,10 +433,32 @@ namespace cv { namespace cuda { namespace device __syncthreads(); // before initializing shared memory at the beginning of next loop } // for d loop - } + for (int row = 0; row < end_row; row++) + { + minSSDImage[row * cminSSD_step] = line_ssd_tails[3*row + 0]; + } - template void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream) + if (uniquenessRatio > 0) + { + for (int row = 0; row < end_row; row++) + { + // drop disparity for pixel where uniqueness requirement was not satisfied (zero value) + disparImage[disp.step * row] = local_disparity[row] * uniqueness_approved[row]; + } + } + else + { + for (int row = 0; row < end_row; row++) + { + disparImage[disp.step * row] = local_disparity[row]; + } + } + } + + template void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, + int maxdisp, int uniquenessRatio, unsigned int* missd_buffer, + size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream) { dim3 grid(1,1,1); dim3 threads(BLOCK_W, 1, 1); @@ -358,14 +469,17 @@ namespace cv { namespace cuda { namespace device //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int); - stereoKernel<<>>(left.data, right.data, left.step, disp, maxdisp); + stereoKernel<<>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio, + missd_buffer, minssd_step, cwidth, cheight); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }; - typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream); + typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, + int maxdisp, int uniquenessRatio, unsigned int* missd_buffer, + size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream); const static kernel_caller_t callers[] = { @@ -380,27 +494,19 @@ namespace cv { namespace cuda { namespace device }; const int calles_num = sizeof(callers)/sizeof(callers[0]); - void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz& minSSD_buf, cudaStream_t& stream) + void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, + int winsz, int uniquenessRatio, const PtrStepSz& minSSD_buf, cudaStream_t& stream) { int winsz2 = winsz >> 1; if (winsz2 == 0 || winsz2 >= calles_num) CV_Error(cv::Error::StsBadArg, "Unsupported window size"); - //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) ); - //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) ); - - cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) ); - cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) ); - - cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) ); - cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) ); - cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) ); + cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) ); + cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) ); size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); - cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) ); - - callers[winsz2](left, right, disp, maxdisp, stream); + callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream); } __device__ inline int clamp(int x, int a, int b) diff --git a/modules/cudastereo/src/stereobm.cpp b/modules/cudastereo/src/stereobm.cpp index 47ef8e3e1ca..dab326981ba 100644 --- a/modules/cudastereo/src/stereobm.cpp +++ b/modules/cudastereo/src/stereobm.cpp @@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device { namespace stereobm { - void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz& minSSD_buf, cudaStream_t & stream); + void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, int uniquenessRatio, const PtrStepSz& minSSD_buf, cudaStream_t & stream); void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream); void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream); void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream); @@ -102,8 +102,8 @@ namespace int getTextureThreshold() const { return static_cast(avergeTexThreshold_); } void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast(textureThreshold); } - int getUniquenessRatio() const { return 0; } - void setUniquenessRatio(int /*uniquenessRatio*/) {} + int getUniquenessRatio() const { return uniquenessRatio_; } + void setUniquenessRatio(int uniquenessRatio) { uniquenessRatio_ = uniquenessRatio; } int getSmallerBlockSize() const { return 0; } void setSmallerBlockSize(int /*blockSize*/){} @@ -121,12 +121,13 @@ namespace int preFilterCap_; float avergeTexThreshold_; int preFilterSize_; + int uniquenessRatio_; GpuMat minSSD_, leBuf_, riBuf_; }; StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize) - : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9) + : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9), uniquenessRatio_(0) { } @@ -183,7 +184,7 @@ namespace ri_for_bm = riBuf_; } - stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream); + stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, uniquenessRatio_, minSSD_, stream); if (avergeTexThreshold_ > 0) postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream); diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp index b58b79e56e6..acfd846a042 100644 --- a/modules/cudastereo/test/test_stereo.cpp +++ b/modules/cudastereo/test/test_stereo.cpp @@ -122,6 +122,49 @@ CUDA_TEST_P(StereoBM, PrefilterNormRegression) EXPECT_MAT_NEAR(disp_gold, disp, 0.0); } +CUDA_TEST_P(StereoBM, Streams) +{ + cv::cuda::Stream stream; + cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE); + cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE); + cv::Mat disp_gold = readImage("stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE); + + ASSERT_FALSE(left_image.empty()); + ASSERT_FALSE(right_image.empty()); + ASSERT_FALSE(disp_gold.empty()); + + cv::Ptr bm = cv::cuda::createStereoBM(128, 19); + cv::cuda::GpuMat disp; + + bm->compute(loadMat(left_image), loadMat(right_image), disp, stream); + stream.waitForCompletion(); + + EXPECT_MAT_NEAR(disp_gold, disp, 0.0); +} + +CUDA_TEST_P(StereoBM, Uniqueness_Regression) +{ + cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE); + cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE); + cv::Mat disp_gold = readImage("stereobm/aloe-disp-uniqueness15.png", cv::IMREAD_GRAYSCALE); + + ASSERT_FALSE(left_image.empty()); + ASSERT_FALSE(right_image.empty()); + ASSERT_FALSE(disp_gold.empty()); + + cv::Ptr bm = cv::cuda::createStereoBM(128, 19); + cv::cuda::GpuMat disp; + + bm->setUniquenessRatio(15); + bm->compute(loadMat(left_image), loadMat(right_image), disp); + + cv::Mat disp_cpu; + disp.download(disp_cpu); + cv::imwrite("disp_inq15.png", disp_cpu); + + EXPECT_MAT_NEAR(disp_gold, disp, 0.0); +} + INSTANTIATE_TEST_CASE_P(CUDA_Stereo, StereoBM, ALL_DEVICES); //////////////////////////////////////////////////////////////////////////