diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index 4aa6582dcf1..246d0dfd313 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -358,6 +358,31 @@ threshold types are not supported.
  */
 CV_EXPORTS_W double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
 
+/** @brief  Checks if array elements lie between two scalars.
+
+The function checks the range as follows:
+-   For every element of a single-channel input array:
+    \f[\texttt{dst} (I)= \texttt{lowerb}_0  \leq \texttt{src} (I)_0 \leq  \texttt{upperb}_0\f]
+-   For two-channel arrays:
+    \f[\texttt{dst} (I)= \texttt{lowerb}_0  \leq \texttt{src} (I)_0 \leq  \texttt{upperb}_0  \land \texttt{lowerb}_1  \leq \texttt{src} (I)_1 \leq  \texttt{upperb}_1\f]
+-   and so forth.
+
+That is, dst (I) is set to 255 (all 1-bits) if src (I) is within the
+specified 1D, 2D, 3D or 4D box and 0 otherwise.
+
+Note that unlike the CPU inRange, this does NOT accept an array for lowerb or
+upperb, only a cv::Scalar.
+
+@param src input array with 1 to 4 channels and depth CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F or CV_64F.
+@param lowerb inclusive lower boundary cv::Scalar.
+@param upperb inclusive upper boundary cv::Scalar.
+@param dst output array of the same size as src and CV_8UC1 type.
+@param stream Stream for the asynchronous version.
+
+@sa cv::inRange
+ */
+CV_EXPORTS_W void inRange(InputArray src, const Scalar& lowerb, const Scalar& upperb, OutputArray dst, Stream& stream = Stream::Null());
+
 /** @brief Computes magnitudes of complex matrix elements.
 
 @param xy Source complex matrix in the interleaved format ( CV_32FC2 ).
diff --git a/modules/cudaarithm/misc/python/test/test_cudaarithm.py b/modules/cudaarithm/misc/python/test/test_cudaarithm.py
index b068fae44bf..bbc9527a767 100644
--- a/modules/cudaarithm/misc/python/test/test_cudaarithm.py
+++ b/modules/cudaarithm/misc/python/test/test_cudaarithm.py
@@ -174,5 +174,24 @@ def test_convolution(self):
         self.assertTrue(np.allclose(cuMatDst.download(),
                     cv.filter2D(npMat,-1,kernel,anchor=(-1,-1))[iS[0]:iE[0]+1,iS[1]:iE[1]+1]))
 
+    def test_inrange(self):
+        # Random 3-channel float image; bounds are sorted per component so lo <= hi.
+        src = (np.random.random((128, 128, 3)) * 255).astype(np.float32)
+        first = np.random.random((4,)) * 255
+        second = np.random.random((4,)) * 255
+        lo = np.minimum(first, second).tolist()
+        hi = np.maximum(first, second).tolist()
+
+        gpuSrc = cv.cuda_GpuMat()
+        gpuSrc.upload(src)
+        # Variant that returns a newly created destination.
+        returned = cv.cuda.inRange(gpuSrc, lo, hi).download()
+        self.assertTrue((returned == cv.inRange(src, np.array(lo), np.array(hi))).all())
+
+        # Variant that writes into a caller-provided CV_8UC1 destination.
+        gpuDst = cv.cuda_GpuMat(gpuSrc.size(), cv.CV_8UC1)
+        cv.cuda.inRange(gpuSrc, lo, hi, gpuDst)
+        self.assertTrue((gpuDst.download() == cv.inRange(src, np.array(lo), np.array(hi))).all())
+
 if __name__ == '__main__':
-    NewOpenCVTests.bootstrap()
\ No newline at end of file
+    NewOpenCVTests.bootstrap()
diff --git a/modules/cudaarithm/perf/perf_element_operations.cpp b/modules/cudaarithm/perf/perf_element_operations.cpp
index 9aa2d4e4e0f..df2146fc147 100644
--- a/modules/cudaarithm/perf/perf_element_operations.cpp
+++ b/modules/cudaarithm/perf/perf_element_operations.cpp
@@ -1501,4 +1501,41 @@ PERF_TEST_P(Sz_Depth_Op, Threshold,
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// InRange
+
+PERF_TEST_P(Sz_Depth_Cn, InRange,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
+            Values(CV_8U, CV_16U, CV_32F, CV_64F),
+            CUDA_CHANNELS_1_3_4))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+
+    cv::Mat src(size, CV_MAKE_TYPE(depth, channels));
+    declare.in(src, WARMUP_RNG);
+    // Fixed bounds shared by both branches; only three components are spelled out
+    const cv::Scalar lowerb(10, 50, 100);
+    const cv::Scalar upperb(70, 85, 200);
+
+    if (PERF_RUN_CUDA())
+    {
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;
+        // CUDA implementation under test
+        TEST_CYCLE() cv::cuda::inRange(d_src, lowerb, upperb, dst);
+
+        CUDA_SANITY_CHECK(dst, 0);
+    }
+    else
+    {
+        cv::Mat dst;
+        // CPU cv::inRange run with the same inputs
+        TEST_CYCLE() cv::inRange(src, lowerb, upperb, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 }} // namespace
diff --git a/modules/cudaarithm/src/cuda/in_range.cu b/modules/cudaarithm/src/cuda/in_range.cu
new file mode 100644
index 00000000000..1902b49996b
--- /dev/null
+++ b/modules/cudaarithm/src/cuda/in_range.cu
@@ -0,0 +1,99 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/core/private.cuda.hpp"
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+using namespace cv::cudev;
+
+namespace {
+
+template <typename T, int cn>
+void inRangeImpl(const GpuMat& src,
+                 const Scalar& lowerb,
+                 const Scalar& upperb,
+                 GpuMat& dst,
+                 Stream& stream) {
+    typedef typename MakeVec<T, cn>::type src_vec_t;
+    // Per-pixel functor built once on the host from the two bound scalars
+    const InRangeFunc<T, cn> inRangeOp(lowerb, upperb);
+    gridTransformUnary(globPtr<src_vec_t>(src), globPtr<uchar>(dst), inRangeOp, stream);
+}
+
+}  // namespace
+
+void cv::cuda::inRange(InputArray _src,
+                       const Scalar& _lowerb,
+                       const Scalar& _upperb,
+                       OutputArray _dst,
+                       Stream& stream) {
+    const GpuMat src = getInputMat(_src, stream);
+
+    typedef void (*func_t)(const GpuMat& src,
+                           const Scalar& lowerb,
+                           const Scalar& upperb,
+                           GpuMat& dst,
+                           Stream& stream);
+
+    // Dispatch table indexed as funcs[channels - 1][depth]. CV_16F is not
+    // supported: the bounds are stored in a CUDA vector (e.g. int3), and
+    // there is no CUDA vector type for float16
+    static constexpr const int MAX_CHANNELS = 4;
+    static constexpr const int NUM_DEPTHS = CV_64F + 1;
+
+    static const std::array<std::array<func_t, NUM_DEPTHS>, MAX_CHANNELS>
+            funcs = {std::array<func_t, NUM_DEPTHS>{inRangeImpl<uchar, 1>,
+                                                    inRangeImpl<schar, 1>,
+                                                    inRangeImpl<ushort, 1>,
+                                                    inRangeImpl<short, 1>,
+                                                    inRangeImpl<int, 1>,
+                                                    inRangeImpl<float, 1>,
+                                                    inRangeImpl<double, 1>},
+                     std::array<func_t, NUM_DEPTHS>{inRangeImpl<uchar, 2>,
+                                                    inRangeImpl<schar, 2>,
+                                                    inRangeImpl<ushort, 2>,
+                                                    inRangeImpl<short, 2>,
+                                                    inRangeImpl<int, 2>,
+                                                    inRangeImpl<float, 2>,
+                                                    inRangeImpl<double, 2>},
+                     std::array<func_t, NUM_DEPTHS>{inRangeImpl<uchar, 3>,
+                                                    inRangeImpl<schar, 3>,
+                                                    inRangeImpl<ushort, 3>,
+                                                    inRangeImpl<short, 3>,
+                                                    inRangeImpl<int, 3>,
+                                                    inRangeImpl<float, 3>,
+                                                    inRangeImpl<double, 3>},
+                     std::array<func_t, NUM_DEPTHS>{inRangeImpl<uchar, 4>,
+                                                    inRangeImpl<schar, 4>,
+                                                    inRangeImpl<ushort, 4>,
+                                                    inRangeImpl<short, 4>,
+                                                    inRangeImpl<int, 4>,
+                                                    inRangeImpl<float, 4>,
+                                                    inRangeImpl<double, 4>}};
+
+    CV_CheckLE(src.channels(), MAX_CHANNELS, "Src must have <= 4 channels");
+    CV_CheckLE(src.depth(),
+               CV_64F,
+               "Src must have depth 8U, 8S, 16U, 16S, 32S, 32F, or 64F");
+    // The result is always a single-channel 8-bit mask of the source size
+    GpuMat dst = getOutputMat(_dst, src.size(), CV_8UC1, stream);
+
+    const func_t func = funcs.at(src.channels() - 1).at(src.depth());
+    func(src, _lowerb, _upperb, dst, stream);
+
+    syncOutput(dst, _dst, stream);
+}
+
+#endif
diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp
index f88119502d1..1ad3c17c40f 100644
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@@ -77,6 +77,8 @@ void cv::cuda::addWeighted(InputArray, double, InputArray, double, double, Outpu
 
 double cv::cuda::threshold(InputArray, OutputArray, double, double, int, Stream&) {throw_no_cuda(); return 0.0;}
 
+void cv::cuda::inRange(InputArray, const Scalar&, const Scalar&, OutputArray, Stream&) { throw_no_cuda(); }
+
 void cv::cuda::magnitude(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 void cv::cuda::magnitude(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 void cv::cuda::magnitudeSqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp
index 848ab5ce740..d2e314b10d9 100644
--- a/modules/cudaarithm/test/test_element_operations.cpp
+++ b/modules/cudaarithm/test/test_element_operations.cpp
@@ -2577,6 +2577,64 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Threshold, testing::Combine(
     ALL_THRESH_OPS,
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////////
+// InRange
+
+PARAM_TEST_CASE(InRange, cv::cuda::DeviceInfo, cv::Size, MatDepth, Channels, UseRoi)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    int channels;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        channels = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::cuda::setDevice(devInfo.deviceID());  // select the device this instance was parameterized with
+    }
+};
+
+CUDA_TEST_P(InRange, Accuracy)
+{
+    // Signed 8-bit sources top out at 127; every other depth can hold 255
+    int max_bound = 255;
+    if (depth == CV_8S)
+        max_bound = 127;
+
+    // Draw two random scalars and order them component by component, so that
+    // lowerb[c] <= upperb[c] holds everywhere
+    const cv::Scalar first = randomScalar(0, max_bound);
+    const cv::Scalar second = randomScalar(0, max_bound);
+    cv::Scalar lowerb, upperb;
+    for (int c = 0; c < 4; ++c) {
+        lowerb[c] = (second[c] < first[c]) ? second[c] : first[c];
+        upperb[c] = (first[c] < second[c]) ? second[c] : first[c];
+    }
+
+    // Same random source for both paths; the CUDA result has to match the CPU
+    // reference exactly (tolerance 0)
+    const cv::Mat src = randomMat(size, CV_MAKE_TYPE(depth, channels));
+    cv::cuda::GpuMat dst;
+    cv::cuda::inRange(loadMat(src, useRoi), lowerb, upperb, dst);
+    cv::Mat dst_gold;
+    cv::inRange(src, lowerb, upperb, dst_gold);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Arithm, InRange, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    ALL_DEPTH,
+    ALL_CHANNELS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Magnitude
 
diff --git a/modules/cudev/include/opencv2/cudev/functional/functional.hpp b/modules/cudev/include/opencv2/cudev/functional/functional.hpp
index f6569cf3d55..e4165358c28 100644
--- a/modules/cudev/include/opencv2/cudev/functional/functional.hpp
+++ b/modules/cudev/include/opencv2/cudev/functional/functional.hpp
@@ -786,6 +786,156 @@ __host__ __device__ ThreshToZeroInvFunc<T> thresh_to_zero_inv_func(T thresh)
     return f;
 }
 
+// InRange functors
+
+/** @brief  Functor that checks if a CUDA vector v is in the range between lowerb and upperb
+
+  Implemented as a recursive template: channel i is compared first, then the
+  check continues with channel i - 1 until the i = 0 base case is reached
+@tparam T      underlying floating point/integral type
+@tparam cn     total number of channels in the input arguments
+@tparam i      1-based number of the highest channel to check (1 = x, ..., 4 = w); lower channels are checked too
+@param  lowerb inclusive scalar lower bound, as a CUDA vector, e.g. a uchar3
+@param  upperb inclusive scalar upper bound, as a CUDA vector, e.g. a uchar3
+@param  v      scalar to check, as a CUDA vector, e.g. a uchar3
+ */
+template <typename T, int cn, int i>
+struct InRangeComparator {
+    __device__ bool operator()(const typename MakeVec<T, cn>::type& lowerb,
+                               const typename MakeVec<T, cn>::type& upperb,
+                               const typename MakeVec<T, cn>::type& v) const;
+};
+
+// Generate the partial specialization for channel i, which lives in member `field`
+#define OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COMPARATOR(i, field)          \
+    template <typename T, int cn>                                           \
+    struct InRangeComparator<T, cn, i> {                                    \
+        __device__ bool operator()(                                         \
+                const typename MakeVec<T, cn>::type& lowerb,                \
+                const typename MakeVec<T, cn>::type& upperb,                \
+                const typename MakeVec<T, cn>::type& v) const {             \
+            /* First failing channel decides: skip the remaining ones */    \
+            if (!(lowerb.field <= v.field && v.field <= upperb.field))      \
+                return false;                                               \
+            return InRangeComparator<T, cn, i - 1>{}(lowerb, upperb, v);    \
+        }                                                                   \
+    };
+
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COMPARATOR(1, x)
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COMPARATOR(2, y)
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COMPARATOR(3, z)
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COMPARATOR(4, w)
+
+#undef OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COMPARATOR
+
+// Base case i=0: no channels are left to check, so the recursion ends with true
+template <typename T, int cn>
+struct InRangeComparator<T, cn, 0> {
+    __device__ bool operator()(const typename MakeVec<T, cn>::type&,
+                               const typename MakeVec<T, cn>::type&,
+                               const typename MakeVec<T, cn>::type&) const {
+        return true;
+    }
+};
+
+// Single-channel case: MakeVec<T, 1>::type is e.g. uchar instead of uchar1, so the value is compared directly
+template <typename T>
+struct InRangeComparator<T, 1, 1> {
+    static constexpr const int cn = 1;
+
+    __device__ bool operator()(const typename MakeVec<T, cn>::type& lowerb,
+                               const typename MakeVec<T, cn>::type& upperb,
+                               const typename MakeVec<T, cn>::type& v) const {
+        return lowerb <= v && v <= upperb;
+    }
+};
+
+/** @brief  Functor that copies a cv::Scalar into a CUDA vector, e.g. a uchar3
+
+  Implemented as a recursive template. For integral T every component is
+  rounded with std::round and then saturated to the range of T.
+@tparam T   underlying floating point/integral type
+@tparam cn  total number of channels in the input arguments
+@tparam i   number of the channel to copy (will copy this channel and lower)
+@param  in  cv::Scalar to copy from
+@param  out CUDA vector to copy into, e.g. a uchar3
+ */
+template <typename T, int cn, int i>
+struct InRangeCopier {
+    void operator()(const Scalar& in,
+                    typename MakeVec<T, cn>::type& out) const;
+};
+
+// Specialize InRangeCopier for MakeVec<T, N>. saturate_cast keeps out-of-range bounds well defined.
+#define OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COPIER(i, field)           \
+    template <typename T, int cn>                                        \
+    struct InRangeCopier<T, cn, i> {                                     \
+        void operator()(const Scalar& in,                                \
+                        typename MakeVec<T, cn>::type& out) const {      \
+            const double in_rounded = (std::is_same<T, double>::value    \
+                                       || std::is_same<T, float>::value) \
+                                              ? in[i - 1]                \
+                                              : std::round(in[i - 1]);   \
+            out.field = cv::saturate_cast<T>(in_rounded);                \
+            InRangeCopier<T, cn, i - 1>{}(in, out);                      \
+        }                                                                \
+    };
+
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COPIER(4, w)
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COPIER(3, z)
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COPIER(2, y)
+OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COPIER(1, x)
+
+#undef OPENCV_CUDEV_FUNCTIONAL_MAKE_IN_RANGE_COPIER
+
+// Base case i=0: every channel has been copied, nothing left to do
+template <typename T, int cn>
+struct InRangeCopier<T, cn, 0> {
+    void operator()(const Scalar&, typename MakeVec<T, cn>::type&) const {
+        return;
+    }
+};
+
+// Specialize for MakeVec<T, 1>::type, which is e.g. uchar instead of uchar1
+template <typename T>
+struct InRangeCopier<T, 1, 1> {
+    void operator()(const Scalar& in, typename MakeVec<T, 1>::type& out) const {
+        const double in_rounded = (std::is_same<T, double>::value
+                                   || std::is_same<T, float>::value)
+                                          ? in[0]
+                                          : std::round(in[0]);
+        out = cv::saturate_cast<T>(in_rounded);
+    }
+};
+
+/** @brief  unary_function implementation of inRange
+
+  Intended to be used to create an Op for gridTransformUnary. The bounds are
+  converted from cv::Scalar once, on the host, when the functor is constructed.
+@tparam T      underlying floating point/integral type
+@tparam cn     total number of channels in the source image
+ */
+template <typename T, int cn>
+struct InRangeFunc : unary_function<typename MakeVec<T, cn>::type, uchar> {
+    typename MakeVec<T, cn>::type lowerb;
+    typename MakeVec<T, cn>::type upperb;
+
+    /** @brief  Builds an InRangeFunc with the given lower and upper bound scalars
+
+    @param  lowerb_scalar inclusive lower bound
+    @param  upperb_scalar inclusive upper bound
+     */
+    __host__ InRangeFunc(const Scalar& lowerb_scalar, const Scalar& upperb_scalar) {
+        InRangeCopier<T, cn, cn>{}(lowerb_scalar, lowerb);
+        InRangeCopier<T, cn, cn>{}(upperb_scalar, upperb);
+    }
+    /** @brief  Returns 255 if every channel of src is within its bounds, 0 otherwise */
+    __device__ uchar
+    operator()(const typename MakeVec<T, cn>::type& src) const {
+        return InRangeComparator<T, cn, cn>{}(lowerb, upperb, src) ? 255 : 0;
+    }
+};
+
 // Function Object Adaptors
 
 template <class Predicate> struct UnaryNegate : unary_function<typename Predicate::argument_type, typename Predicate::result_type>