diff --git a/modules/cudastereo/perf/perf_stereo.cpp b/modules/cudastereo/perf/perf_stereo.cpp
index 2b999d9d120..d691ebc26c1 100644
--- a/modules/cudastereo/perf/perf_stereo.cpp
+++ b/modules/cudastereo/perf/perf_stereo.cpp
@@ -87,6 +87,45 @@ PERF_TEST_P(ImagePair, StereoBM,
     }
 }
 
+PERF_TEST_P(ImagePair, StereoBMwithUniqueness,
+            Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
+{
+    declare.time(300.0);
+
+    const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgLeft.empty());
+
+    const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgRight.empty());
+
+    const int ndisp = 256;
+
+    if (PERF_RUN_CUDA())
+    {
+        cv::Ptr<cv::StereoBM> d_bm = cv::cuda::createStereoBM(ndisp);
+        d_bm->setUniquenessRatio(10);
+
+        const cv::cuda::GpuMat d_imgLeft(imgLeft);
+        const cv::cuda::GpuMat d_imgRight(imgRight);
+        cv::cuda::GpuMat dst;
+
+        TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst);
+
+        CUDA_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Ptr<cv::StereoBM> bm = cv::StereoBM::create(ndisp);
+        bm->setUniquenessRatio(10);
+
+        cv::Mat dst;
+
+        TEST_CYCLE() bm->compute(imgLeft, imgRight, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // StereoBeliefPropagation
 
diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu
index 5be19e99ff8..1616f984f67 100644
--- a/modules/cudastereo/src/cuda/stereobm.cu
+++ b/modules/cudastereo/src/cuda/stereobm.cu
@@ -43,6 +43,7 @@
 #if !defined CUDA_DISABLER
 
 #include "opencv2/core/cuda/common.hpp"
+#include <limits.h>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -60,18 +61,13 @@ namespace cv { namespace cuda { namespace device
         #define STEREO_MIND 0                    // The minimum d range to check
         #define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing
 
-        __constant__ unsigned int* cminSSDImage;
-        __constant__ size_t cminSSD_step;
-        __constant__ int cwidth;
-        __constant__ int cheight;
-
         __device__ __forceinline__ int SQ(int a)
         {
             return a * a;
         }
 
         template<int RADIUS>
-        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
+        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth)
         {
             unsigned int cache = 0;
             unsigned int cache2 = 0;
@@ -99,26 +95,24 @@ namespace cv { namespace cuda { namespace device
         }
 
         template<int RADIUS>
-        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
+        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int* ssd)
         {
-            unsigned int ssd[N_DISPARITIES];
-
             //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
             __syncthreads();
-            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
             __syncthreads();
-            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
             __syncthreads();
-            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
             __syncthreads();
-            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
             __syncthreads();
-            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
             __syncthreads();
-            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
             __syncthreads();
-            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);
 
             int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
 
@@ -235,26 +229,27 @@ namespace cv { namespace cuda { namespace device
         }
 
         template<int RADIUS>
-        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
+        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp,
+                                     int uniquenessRatio, unsigned int* cminSSDImage, size_t cminSSD_step, int cwidth, int cheight)
         {
             extern __shared__ unsigned int col_ssd_cache[];
+            uint line_ssds[2 + N_DISPARITIES]; // +2 - tail of previous batch for accurate uniquenessRatio check
+            uint* batch_ssds = line_ssds + 2;
+
+            uint line_ssd_tails[3*ROWSperTHREAD];
+            uchar uniqueness_approved[ROWSperTHREAD];
+            uchar local_disparity[ROWSperTHREAD];
+
             volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
-            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  //#define N_DIRTY_PIXELS (2 * RADIUS)
+            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;
 
-            //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
-            int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
-            //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
-            #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
-            //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
+            const int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
+            const int Y = (blockIdx.y * ROWSperTHREAD + RADIUS);
 
             unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
             unsigned char* disparImage = disp.data + X + Y * disp.step;
-            //if (X < cwidth)
-            //{
-            //    unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
-            //    for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
-            //        *ptr = 0xFFFFFFFF;
-            //}
+            float thresh_scale;
+
             int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
             int y_tex;
             int x_tex = X - RADIUS;
@@ -262,6 +257,25 @@ namespace cv { namespace cuda { namespace device
             if (x_tex >= cwidth)
                 return;
 
+            for(int i = 0; i < ROWSperTHREAD; i++)
+                local_disparity[i] = 0;
+
+            for(int i = 0; i < 3*ROWSperTHREAD; i++)
+            {
+                line_ssd_tails[i] = UINT_MAX;
+            }
+
+            if (uniquenessRatio > 0)
+            {
+                batch_ssds[6] = UINT_MAX;
+                batch_ssds[7] = UINT_MAX;
+                thresh_scale = (1.0 + uniquenessRatio / 100.0f);
+                for(int i = 0; i < ROWSperTHREAD; i++)
+                {
+                    uniqueness_approved[i] = 1;
+                }
+            }
+
             for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
             {
                 y_tex = Y - RADIUS;
@@ -276,10 +290,10 @@ namespace cv { namespace cuda { namespace device
 
                 if (Y < cheight - RADIUS)
                 {
-                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
+                    uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
 
                     // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
-                    // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                    // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
                     //
                     // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
                     // must also call "MinSSD" to avoid deadlock. (#13850)
@@ -290,10 +304,50 @@ namespace cv { namespace cuda { namespace device
 
                     if (X < cwidth - RADIUS)
                     {
-                        if (minSSD.x < minSSDImage[0])
+                        unsigned int last_opt = line_ssd_tails[3*0 + 0];
+                        unsigned int opt = ::min(last_opt, batch_opt.x);
+
+                        if (uniquenessRatio > 0)
+                        {
+                            line_ssds[0] = line_ssd_tails[3*0 + 1];
+                            line_ssds[1] = line_ssd_tails[3*0 + 2];
+
+                            float thresh = thresh_scale * opt;
+                            int dtest = local_disparity[0];
+
+                            if(batch_opt.x < last_opt)
+                            {
+                                uniqueness_approved[0] = 1;
+                                dtest = d + batch_opt.y;
+                                if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh))
+                                {
+                                    uniqueness_approved[0] = 0;
+                                }
+                            }
+
+                            if(uniqueness_approved[0])
+                            {
+                                // the trial to decompose the code on 2 loops without ld vs dtest makes
+                                // uniqueness check dramatically slow. at least on gf 1080
+                                for (int ld = d-2; ld < d + N_DISPARITIES; ld++)
+                                {
+                                    if ((ld < dtest-1 || ld > dtest+1) && (line_ssds[ld-d+2] <= thresh))
+                                    {
+                                        uniqueness_approved[0] = 0;
+                                        break;
+                                    }
+                                }
+                            }
+
+
+                            line_ssd_tails[3*0 + 1] = batch_ssds[6];
+                            line_ssd_tails[3*0 + 2] = batch_ssds[7];
+                        }
+
+                        line_ssd_tails[3*0 + 0] = opt;
+                        if (batch_opt.x < last_opt)
                         {
-                            disparImage[0] = (unsigned char)(d + minSSD.y);
-                            minSSDImage[0] = minSSD.x;
+                            local_disparity[0] = (unsigned char)(d + batch_opt.y);
                         }
                     }
                 }
@@ -313,14 +367,13 @@ namespace cv { namespace cuda { namespace device
 
                     y_tex += 1;
 
-                    __syncthreads(); //before MinSSD function
+                    __syncthreads();
 
                     if (row < cheight - RADIUS - Y)
                     {
-                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
-
+                        uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
                         // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
-                        // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                        // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
                         //
                         // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
                         // must also call "MinSSD" to avoid deadlock. (#13850)
@@ -331,11 +384,47 @@ namespace cv { namespace cuda { namespace device
 
                         if (X < cwidth - RADIUS)
                         {
-                            int idx = row * cminSSD_step;
-                            if (minSSD.x < minSSDImage[idx])
+                            unsigned int last_opt = line_ssd_tails[3*row + 0];
+                            unsigned int opt = ::min(last_opt, batch_opt.x);
+                            if (uniquenessRatio > 0)
                             {
-                                disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
-                                minSSDImage[idx] = minSSD.x;
+                                line_ssds[0] = line_ssd_tails[3*row + 1];
+                                line_ssds[1] = line_ssd_tails[3*row + 2];
+
+                                float thresh = thresh_scale * opt;
+                                int dtest = local_disparity[row];
+
+                                if(batch_opt.x < last_opt)
+                                {
+                                    uniqueness_approved[row] = 1;
+                                    dtest = d + batch_opt.y;
+                                    if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh))
+                                    {
+                                        uniqueness_approved[row] = 0;
+                                    }
+                                }
+
+                                if(uniqueness_approved[row])
+                                {
+                                    for (int ld = 0; ld < N_DISPARITIES + 2; ld++)
+                                    {
+                                        if (((d+ld-2 < dtest-1) || (d+ld-2 > dtest+1)) && (line_ssds[ld] <= thresh))
+                                        {
+                                            uniqueness_approved[row] = 0;
+                                            break;
+                                        }
+                                    }
+                                }
+
+                                line_ssd_tails[3*row + 1] = batch_ssds[6];
+                                line_ssd_tails[3*row + 2] = batch_ssds[7];
+                            }
+
+                            line_ssd_tails[3*row + 0] = opt;
+
+                            if (batch_opt.x < last_opt)
+                            {
+                                local_disparity[row] = (unsigned char)(d + batch_opt.y);
                             }
                         }
                     }
@@ -344,10 +433,32 @@ namespace cv { namespace cuda { namespace device
                 __syncthreads(); // before initializing shared memory at the beginning of next loop
 
             } // for d loop
-        }
 
+            for (int row = 0; row < end_row; row++)
+            {
+                minSSDImage[row * cminSSD_step] = line_ssd_tails[3*row + 0];
+            }
 
-        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
+            if (uniquenessRatio > 0)
+            {
+                for (int row = 0; row < end_row; row++)
+                {
+                    // drop disparity for pixel where uniqueness requirement was not satisfied (zero value)
+                    disparImage[disp.step * row] = local_disparity[row] * uniqueness_approved[row];
+                }
+            }
+            else
+            {
+                for (int row = 0; row < end_row; row++)
+                {
+                    disparImage[disp.step * row] = local_disparity[row];
+                }
+            }
+        }
+
+        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
+                                                int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
+                                                size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream)
         {
             dim3 grid(1,1,1);
             dim3 threads(BLOCK_W, 1, 1);
@@ -358,14 +469,17 @@ namespace cv { namespace cuda { namespace device
             //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
             size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
 
-            stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
+            stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio,
+                                                                       missd_buffer, minssd_step, cwidth, cheight);
             cudaSafeCall( cudaGetLastError() );
 
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         };
 
-        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
+        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
+                                        int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
+                                        size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream);
 
         const static kernel_caller_t callers[] =
         {
@@ -380,27 +494,19 @@ namespace cv { namespace cuda { namespace device
         };
         const int calles_num = sizeof(callers)/sizeof(callers[0]);
 
-        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
+        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp,
+                           int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
         {
             int winsz2 = winsz >> 1;
 
             if (winsz2 == 0 || winsz2 >= calles_num)
                 CV_Error(cv::Error::StsBadArg, "Unsupported window size");
 
-            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
-            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
-
-            cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
-            cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
-
-            cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
+            cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) );
+            cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) );
 
             size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step,  &minssd_step, sizeof(minssd_step) ) );
-
-            callers[winsz2](left, right, disp, maxdisp, stream);
+            callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream);
         }
 
         __device__ inline int clamp(int x, int a, int b)
diff --git a/modules/cudastereo/src/stereobm.cpp b/modules/cudastereo/src/stereobm.cpp
index 47ef8e3e1ca..dab326981ba 100644
--- a/modules/cudastereo/src/stereobm.cpp
+++ b/modules/cudastereo/src/stereobm.cpp
@@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device
 {
     namespace stereobm
     {
-        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
+        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
         void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
         void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream);
         void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
@@ -102,8 +102,8 @@ namespace
         int getTextureThreshold() const { return static_cast<int>(avergeTexThreshold_); }
         void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast<float>(textureThreshold); }
 
-        int getUniquenessRatio() const { return 0; }
-        void setUniquenessRatio(int /*uniquenessRatio*/) {}
+        int getUniquenessRatio() const { return uniquenessRatio_; }
+        void setUniquenessRatio(int uniquenessRatio) { uniquenessRatio_ = uniquenessRatio; }
 
         int getSmallerBlockSize() const { return 0; }
         void setSmallerBlockSize(int /*blockSize*/){}
@@ -121,12 +121,13 @@ namespace
         int preFilterCap_;
         float avergeTexThreshold_;
         int preFilterSize_;
+        int uniquenessRatio_;
 
         GpuMat minSSD_, leBuf_, riBuf_;
     };
 
     StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
-        : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9)
+        : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9), uniquenessRatio_(0)
     {
     }
 
@@ -183,7 +184,7 @@ namespace
             ri_for_bm = riBuf_;
         }
 
-        stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
+        stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, uniquenessRatio_, minSSD_, stream);
 
         if (avergeTexThreshold_ > 0)
             postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);
diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp
index b58b79e56e6..acfd846a042 100644
--- a/modules/cudastereo/test/test_stereo.cpp
+++ b/modules/cudastereo/test/test_stereo.cpp
@@ -122,6 +122,49 @@ CUDA_TEST_P(StereoBM, PrefilterNormRegression)
     EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
 }
 
+CUDA_TEST_P(StereoBM, Streams)
+{
+    cv::cuda::Stream stream;
+    cv::Mat left_image  = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat disp_gold   = readImage("stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);
+
+    ASSERT_FALSE(left_image.empty());
+    ASSERT_FALSE(right_image.empty());
+    ASSERT_FALSE(disp_gold.empty());
+
+    cv::Ptr<cv::cuda::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
+    cv::cuda::GpuMat disp;
+
+    bm->compute(loadMat(left_image), loadMat(right_image), disp, stream);
+    stream.waitForCompletion();
+
+    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
+}
+
+CUDA_TEST_P(StereoBM, Uniqueness_Regression)
+{
+    cv::Mat left_image  = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat disp_gold   = readImage("stereobm/aloe-disp-uniqueness15.png", cv::IMREAD_GRAYSCALE);
+
+    ASSERT_FALSE(left_image.empty());
+    ASSERT_FALSE(right_image.empty());
+    ASSERT_FALSE(disp_gold.empty());
+
+    cv::Ptr<cv::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
+    cv::cuda::GpuMat disp;
+
+    bm->setUniquenessRatio(15);
+    bm->compute(loadMat(left_image), loadMat(right_image), disp);
+
+    cv::Mat disp_cpu;
+    disp.download(disp_cpu);
+    cv::imwrite("disp_inq15.png", disp_cpu);
+
+    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Stereo, StereoBM, ALL_DEVICES);
 
 //////////////////////////////////////////////////////////////////////////