diff --git a/modules/rgbd/samples/large_kinfu_demo.cpp b/modules/rgbd/samples/large_kinfu_demo.cpp
index 40d14ca8cae..e78c3511215 100644
--- a/modules/rgbd/samples/large_kinfu_demo.cpp
+++ b/modules/rgbd/samples/large_kinfu_demo.cpp
@@ -127,8 +127,7 @@ int main(int argc, char** argv)
     // These params can be different for each depth sensor
     ds->updateParams(*params);
 
-    // Disabled until there is no OpenCL accelerated HashTSDF is available
-    cv::setUseOptimized(false);
+    cv::setUseOptimized(true);
 
     if (!idle)
         largeKinfu = LargeKinfu::create(params);
diff --git a/modules/rgbd/src/hash_tsdf.cpp b/modules/rgbd/src/hash_tsdf.cpp
index afdeef37063..08338fe83ef 100644
--- a/modules/rgbd/src/hash_tsdf.cpp
+++ b/modules/rgbd/src/hash_tsdf.cpp
@@ -15,9 +15,10 @@
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/utils/trace.hpp"
 #include "utils.hpp"
+#include "opencl_kernels_rgbd.hpp"
 
 #define USE_INTERPOLATION_IN_GETNORMAL 1
-#define VOLUMES_SIZE 1024
+#define VOLUMES_SIZE 8192
 
 namespace cv
 {
@@ -35,13 +36,17 @@ HashTSDFVolume::HashTSDFVolume(float _voxelSize, cv::Matx44f _pose, float _rayca
     zFirstMemOrder(_zFirstMemOrder)
 {
     truncDist = std::max(_truncDist, 4.0f * voxelSize);
-}
 
-HashTSDFVolumeCPU::HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist,
-                                     int _maxWeight, float _truncateThreshold, int _volumeUnitRes, bool _zFirstMemOrder)
-    :HashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold, _volumeUnitRes,
-           _zFirstMemOrder)
-{
+    if (!(volumeUnitResolution & (volumeUnitResolution - 1)))
+    {
+        // vuRes is a power of 2, let's get this power
+        volumeUnitDegree = trailingZeros32(volumeUnitResolution);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg, "Volume unit resolution should be a power of 2");
+    }
+
     int xdim, ydim, zdim;
     if (zFirstMemOrder)
     {
@@ -56,21 +61,110 @@ HashTSDFVolumeCPU::HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, flo
         zdim = volumeUnitResolution * volumeUnitResolution;
     }
     volStrides = Vec4i(xdim, ydim, zdim);
+}
+
+//! Spatial hashing
+struct tsdf_hash
+{
+    size_t operator()(const Vec3i& x) const noexcept
+    {
+        size_t seed = 0;
+        constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
+        for (uint16_t i = 0; i < 3; i++)
+        {
+            seed ^= std::hash<int>()(x[i]) + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+};
+
+struct VolumeUnit
+{
+    cv::Vec3i coord;
+    int index;
+    cv::Matx44f pose;
+    int lastVisibleIndex = 0;
+    bool isActive;
+};
+
+typedef std::unordered_set<cv::Vec3i, tsdf_hash> VolumeUnitIndexSet;
+typedef std::unordered_map<cv::Vec3i, VolumeUnit, tsdf_hash> VolumeUnitIndexes;
+
+class HashTSDFVolumeCPU : public HashTSDFVolume
+{
+public:
+    // dimension in voxels, size in meters
+    HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
+        float _truncateThreshold, int _volumeUnitRes, bool zFirstMemOrder = true);
+
+    HashTSDFVolumeCPU(const VolumeParams& _volumeParams, bool zFirstMemOrder = true);
+
+    void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const kinfu::Intr& intrinsics,
+        const int frameId = 0) override;
+    void raycast(const Matx44f& cameraPose, const kinfu::Intr& intrinsics, const Size& frameSize, OutputArray points,
+        OutputArray normals) const override;
+
+    void fetchNormals(InputArray points, OutputArray _normals) const override;
+    void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
+
+    void reset() override;
+    size_t getTotalVolumeUnits() const override { return volumeUnits.size(); }
+    int getVisibleBlocks(int currFrameId, int frameThreshold) const override;
+
+    //! Return the voxel given the voxel index in the universal volume (1 unit = 1 voxel_length)
+    TsdfVoxel at(const Vec3i& volumeIdx) const;
+
+    //! Return the voxel given the point in volume coordinate system i.e., (metric scale 1 unit =
+    //! 1m)
+    virtual TsdfVoxel at(const cv::Point3f& point) const;
+    virtual TsdfVoxel _at(const cv::Vec3i& volumeIdx, int indx) const;
+
+    TsdfVoxel atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitIndexes::const_iterator it) const;
 
+
+    float interpolateVoxelPoint(const Point3f& point) const;
+    float interpolateVoxel(const cv::Point3f& point) const;
+    Point3f getNormalVoxel(const cv::Point3f& p) const;
+
+    //! Utility functions for coordinate transformations
+    Vec3i volumeToVolumeUnitIdx(const Point3f& point) const;
+    Point3f volumeUnitIdxToVolume(const Vec3i& volumeUnitIdx) const;
+
+    Point3f voxelCoordToVolume(const Vec3i& voxelIdx) const;
+    Vec3i volumeToVoxelCoord(const Point3f& point) const;
+
+public:
+    Vec6f frameParams;
+    Mat pixNorms;
+    VolumeUnitIndexes volumeUnits;
+    cv::Mat volUnitsData;
+    int lastVolIndex;
+};
+
+
+HashTSDFVolumeCPU::HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist,
+                                     int _maxWeight, float _truncateThreshold, int _volumeUnitRes, bool _zFirstMemOrder)
+    :HashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold, _volumeUnitRes,
+           _zFirstMemOrder)
+{
     reset();
 }
 
 HashTSDFVolumeCPU::HashTSDFVolumeCPU(const VolumeParams& _params, bool _zFirstMemOrder)
-    : HashTSDFVolume(_params.voxelSize, _params.pose.matrix, _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+    : HashTSDFVolumeCPU(_params.voxelSize, _params.pose.matrix, _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
            _params.depthTruncThreshold, _params.unitResolution, _zFirstMemOrder)
 {
 }
+
 // zero volume, leave rest params the same
 void HashTSDFVolumeCPU::reset()
 {
     CV_TRACE_FUNCTION();
     lastVolIndex = 0;
     volUnitsData = cv::Mat(VOLUMES_SIZE, volumeUnitResolution * volumeUnitResolution * volumeUnitResolution, rawType<TsdfVoxel>());
+    frameParams = Vec6f();
+    pixNorms = Mat();
+    volumeUnits = VolumeUnitIndexes();
 }
 
 void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics, const int frameId)
@@ -81,7 +175,7 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
     Depth depth = _depth.getMat();
 
     //! Compute volumes to be allocated
-    const int depthStride = int(log2(volumeUnitResolution));
+    const int depthStride = volumeUnitDegree;
     const float invDepthFactor = 1.f / depthFactor;
     const Intr::Reprojector reproj(intrinsics.makeReprojector());
     const Affine3f cam2vol(pose.inv() * Affine3f(cameraPose));
@@ -111,7 +205,7 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
                         for (int k = lower_bound[2]; k <= upper_bound[2]; k++)
                         {
                             const Vec3i tsdf_idx = Vec3i(i, j, k);
-                            if (!localAccessVolUnits.count(tsdf_idx))
+                            if (localAccessVolUnits.count(tsdf_idx) <= 0 && this->volumeUnits.count(tsdf_idx) <= 0)
                             {
                                 //! This volume unit will definitely be required for current integration
                                 localAccessVolUnits.emplace(tsdf_idx);
@@ -124,10 +218,9 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
         for (const auto& tsdf_idx : localAccessVolUnits)
         {
             //! If the insert into the global set passes
-            if (!this->volumeUnits.count(tsdf_idx))
+            if (!newIndices.count(tsdf_idx))
             {
                 // Volume allocation can be performed outside of the lock
-                this->volumeUnits.emplace(tsdf_idx, VolumeUnit());
                 newIndices.emplace(tsdf_idx);
             }
         }
@@ -138,12 +231,13 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
     //! Perform the allocation
     for (auto idx : newIndices)
     {
-        VolumeUnit& vu = volumeUnits[idx];
+        VolumeUnit& vu = this->volumeUnits.emplace(idx, VolumeUnit()).first->second;
+
         Matx44f subvolumePose = pose.translate(volumeUnitIdxToVolume(idx)).matrix;
 
         vu.pose = subvolumePose;
         vu.index = lastVolIndex; lastVolIndex++;
-        if (lastVolIndex > VolumeIndex(volUnitsData.size().height))
+        if (lastVolIndex > int(volUnitsData.size().height))
         {
             volUnitsData.resize((lastVolIndex - 1) * 2);
         }
@@ -175,14 +269,14 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
             Vec3i tsdf_idx = totalVolUnits[i];
             VolumeUnitIndexes::iterator it = volumeUnits.find(tsdf_idx);
             if (it == volumeUnits.end())
-                return;
+                continue;
 
             Point3f volumeUnitPos = volumeUnitIdxToVolume(it->first);
             Point3f volUnitInCamSpace = vol2cam * volumeUnitPos;
             if (volUnitInCamSpace.z < 0 || volUnitInCamSpace.z > truncateThreshold)
             {
                 it->second.isActive = false;
-                return;
+                continue;
             }
             Point2f cameraPoint = proj(volUnitInCamSpace);
             if (cameraPoint.x >= 0 && cameraPoint.y >= 0 && cameraPoint.x < depth.cols && cameraPoint.y < depth.rows)
@@ -250,17 +344,14 @@ cv::Vec3i HashTSDFVolumeCPU::volumeToVoxelCoord(const cv::Point3f& point) const
                      cvFloor(point.z * voxelSizeInv));
 }
 
-inline TsdfVoxel HashTSDFVolumeCPU::_at(const cv::Vec3i& volumeIdx, VolumeIndex indx) const
+inline TsdfVoxel HashTSDFVolumeCPU::_at(const cv::Vec3i& volumeIdx, int indx) const
 {
     //! Out of bounds
     if ((volumeIdx[0] >= volumeUnitResolution || volumeIdx[0] < 0) ||
         (volumeIdx[1] >= volumeUnitResolution || volumeIdx[1] < 0) ||
         (volumeIdx[2] >= volumeUnitResolution || volumeIdx[2] < 0))
     {
-        TsdfVoxel dummy;
-        dummy.tsdf = floatToTsdf(1.0f);
-        dummy.weight = 0;
-        return dummy;
+        return TsdfVoxel(floatToTsdf(1.f), 0);
     }
 
     const TsdfVoxel* volData = volUnitsData.ptr<TsdfVoxel>(indx);
@@ -270,25 +361,21 @@ inline TsdfVoxel HashTSDFVolumeCPU::_at(const cv::Vec3i& volumeIdx, VolumeIndex
 }
 
 inline TsdfVoxel HashTSDFVolumeCPU::at(const cv::Vec3i& volumeIdx) const
-
 {
-    Vec3i volumeUnitIdx = Vec3i(cvFloor(volumeIdx[0] / volumeUnitResolution),
-                                cvFloor(volumeIdx[1] / volumeUnitResolution),
-                                cvFloor(volumeIdx[2] / volumeUnitResolution));
+    Vec3i volumeUnitIdx = Vec3i(volumeIdx[0] >> volumeUnitDegree,
+                                volumeIdx[1] >> volumeUnitDegree,
+                                volumeIdx[2] >> volumeUnitDegree);
 
     VolumeUnitIndexes::const_iterator it = volumeUnits.find(volumeUnitIdx);
 
     if (it == volumeUnits.end())
     {
-        TsdfVoxel dummy;
-        dummy.tsdf = floatToTsdf(1.f);
-        dummy.weight = 0;
-        return dummy;
+        return TsdfVoxel(floatToTsdf(1.f), 0);
     }
 
-    cv::Vec3i volUnitLocalIdx = volumeIdx - cv::Vec3i(volumeUnitIdx[0] * volumeUnitResolution,
-                                                      volumeUnitIdx[1] * volumeUnitResolution,
-                                                      volumeUnitIdx[2] * volumeUnitResolution);
+    cv::Vec3i volUnitLocalIdx = volumeIdx - cv::Vec3i(volumeUnitIdx[0] << volumeUnitDegree,
+                                                      volumeUnitIdx[1] << volumeUnitDegree,
+                                                      volumeUnitIdx[2] << volumeUnitDegree);
 
     volUnitLocalIdx =
         cv::Vec3i(abs(volUnitLocalIdx[0]), abs(volUnitLocalIdx[1]), abs(volUnitLocalIdx[2]));
@@ -298,15 +385,12 @@ inline TsdfVoxel HashTSDFVolumeCPU::at(const cv::Vec3i& volumeIdx) const
 
 TsdfVoxel HashTSDFVolumeCPU::at(const Point3f& point) const
 {
-    cv::Vec3i volumeUnitIdx          = volumeToVolumeUnitIdx(point);
+    cv::Vec3i volumeUnitIdx = volumeToVolumeUnitIdx(point);
     VolumeUnitIndexes::const_iterator it = volumeUnits.find(volumeUnitIdx);
 
     if (it == volumeUnits.end())
     {
-        TsdfVoxel dummy;
-        dummy.tsdf = floatToTsdf(1.f);
-        dummy.weight = 0;
-        return dummy;
+        return TsdfVoxel(floatToTsdf(1.f), 0);
     }
 
     cv::Point3f volumeUnitPos = volumeUnitIdxToVolume(volumeUnitIdx);
@@ -316,32 +400,15 @@ TsdfVoxel HashTSDFVolumeCPU::at(const Point3f& point) const
     return _at(volUnitLocalIdx, it->second.index);
 }
 
-static inline Vec3i voxelToVolumeUnitIdx(const Vec3i& pt, const int vuRes)
-{
-    if (!(vuRes & (vuRes - 1)))
-    {
-        // vuRes is a power of 2, let's get this power
-        const int p2 = trailingZeros32(vuRes);
-        return Vec3i(pt[0] >> p2, pt[1] >> p2, pt[2] >> p2);
-    }
-    else
-    {
-        return Vec3i(cvFloor(float(pt[0]) / vuRes),
-            cvFloor(float(pt[1]) / vuRes),
-            cvFloor(float(pt[2]) / vuRes));
-    }
-}
-
 TsdfVoxel HashTSDFVolumeCPU::atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitIndexes::const_iterator it) const
 {
     if (it == volumeUnits.end())
     {
-        TsdfVoxel dummy;
-        dummy.tsdf = floatToTsdf(1.f);
-        dummy.weight = 0;
-        return dummy;
+        return TsdfVoxel(floatToTsdf(1.f), 0);
     }
-    Vec3i volUnitLocalIdx = point - volumeUnitIdx * volumeUnitResolution;
+    Vec3i volUnitLocalIdx = point - Vec3i(volumeUnitIdx[0] << volumeUnitDegree,
+                                          volumeUnitIdx[1] << volumeUnitDegree,
+                                          volumeUnitIdx[2] << volumeUnitDegree);
 
     // expanding at(), removing bounds check
     const TsdfVoxel* volData = volUnitsData.ptr<TsdfVoxel>(it->second.index);
@@ -349,8 +416,6 @@ TsdfVoxel HashTSDFVolumeCPU::atVolumeUnit(const Vec3i& point, const Vec3i& volum
     return volData[coordBase];
 }
 
-
-
 #if USE_INTRINSICS
 inline float interpolate(float tx, float ty, float tz, float vx[8])
 {
@@ -413,7 +478,7 @@ float HashTSDFVolumeCPU::interpolateVoxelPoint(const Point3f& point) const
     {
         Vec3i pt = iv + neighbourCoords[i];
 
-        Vec3i volumeUnitIdx = voxelToVolumeUnitIdx(pt, volumeUnitResolution);
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
         int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
         auto it = iterMap[dictIdx];
         if (!queried[dictIdx])
@@ -475,7 +540,7 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
     {
         Vec3i pt = iptVox + offsets[i];
 
-        Vec3i volumeUnitIdx = voxelToVolumeUnitIdx(pt, volumeUnitResolution);
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
 
         int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
         auto it = iterMap[dictIdx];
@@ -625,7 +690,6 @@ void HashTSDFVolumeCPU::raycast(const Matx44f& cameraPose, const kinfu::Intr& in
                     cv::Vec3i(std::numeric_limits<int>::min(), std::numeric_limits<int>::min(),
                         std::numeric_limits<int>::min());
 
-
                 float tprev = tcurr;
                 float prevTsdf = volume.truncDist;
                 Ptr<TSDFVolumeCPU> currVolumeUnit;
@@ -634,7 +698,6 @@ void HashTSDFVolumeCPU::raycast(const Matx44f& cameraPose, const kinfu::Intr& in
                     Point3f currRayPos = orig + tcurr * rayDirV;
                     cv::Vec3i currVolumeUnitIdx = volume.volumeToVolumeUnitIdx(currRayPos);
 
-
                     VolumeUnitIndexes::const_iterator it = volume.volumeUnits.find(currVolumeUnitIdx);
 
                     float currTsdf = prevTsdf;
@@ -650,7 +713,6 @@ void HashTSDFVolumeCPU::raycast(const Matx44f& cameraPose, const kinfu::Intr& in
                             volume.volumeUnitIdxToVolume(currVolumeUnitIdx);
                         volUnitLocalIdx = volume.volumeToVoxelCoord(currRayPos - currVolUnitPos);
 
-
                         //! TODO: Figure out voxel interpolation
                         TsdfVoxel currVoxel = _at(volUnitLocalIdx, it->second.index);
                         currTsdf = tsdfToFloat(currVoxel.tsdf);
@@ -708,17 +770,13 @@ void HashTSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _nor
         bool needNormals(_normals.needed());
         Mutex mutex;
 
-
         auto HashFetchPointsNormalsInvoker = [&](const Range& range)
         {
-
-
             std::vector<ptype> points, normals;
             for (int i = range.start; i < range.end; i++)
             {
                 cv::Vec3i tsdf_idx = totalVolUnits[i];
 
-
                 VolumeUnitIndexes::const_iterator it = volume.volumeUnits.find(tsdf_idx);
                 Point3f base_point = volume.volumeUnitIdxToVolume(tsdf_idx);
                 if (it != volume.volumeUnits.end())
@@ -753,8 +811,6 @@ void HashTSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _nor
 
         parallel_for_(fetchRange, HashFetchPointsNormalsInvoker, nstripes);
 
-
-
         std::vector<ptype> points, normals;
         for (size_t i = 0; i < pVecs.size(); i++)
         {
@@ -817,5 +873,920 @@ int HashTSDFVolumeCPU::getVisibleBlocks(int currFrameId, int frameThreshold) con
     return numVisibleBlocks;
 }
 
+
+///////// GPU implementation /////////
+
+#ifdef HAVE_OPENCL
+
+class HashTSDFVolumeGPU : public HashTSDFVolume
+{
+public:
+    HashTSDFVolumeGPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
+        float _truncateThreshold, int _volumeUnitRes, bool zFirstMemOrder = false);
+
+    HashTSDFVolumeGPU(const VolumeParams& _volumeParams, bool zFirstMemOrder = false);
+
+    void reset() override;
+
+    void integrateAllVolumeUnitsGPU(const UMat& depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics);
+
+    void allocateVolumeUnits(const UMat& depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics);
+
+    void markActive(const Matx44f& cameraPose, const Intr& intrinsics, const Size frameSz, const int frameId);
+
+    void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const kinfu::Intr& intrinsics,
+        const int frameId = 0) override;
+    void raycast(const Matx44f& cameraPose, const kinfu::Intr& intrinsics, const Size& frameSize, OutputArray points,
+        OutputArray normals) const override;
+
+
+    void fetchNormals(InputArray points, OutputArray _normals) const override;
+    void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
+
+    size_t getTotalVolumeUnits() const override { return size_t(hashTable.last); }
+    int getVisibleBlocks(int currFrameId, int frameThreshold) const override;
+
+
+
+    //! Return the voxel given the point in volume coordinate system i.e., (metric scale 1 unit =
+    //! 1m)
+    virtual TsdfVoxel new_at(const cv::Vec3i& volumeIdx, int indx) const;
+    TsdfVoxel new_atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, int indx) const;
+
+
+    float interpolateVoxelPoint(const Point3f& point) const;
+    float interpolateVoxel(const cv::Point3f& point) const;
+    Point3f getNormalVoxel(const cv::Point3f& p) const;
+
+    //! Utility functions for coordinate transformations
+    Vec3i volumeToVolumeUnitIdx(const Point3f& point) const;
+    Point3f volumeUnitIdxToVolume(const Vec3i& volumeUnitIdx) const;
+
+    Point3f voxelCoordToVolume(const Vec3i& voxelIdx) const;
+    Vec3i volumeToVoxelCoord(const Point3f& point) const;
+
+public:
+    Vec6f frameParams;
+    int bufferSizeDegree;
+
+    // per-volume-unit data
+    cv::UMat lastVisibleIndices;
+
+    cv::UMat isActiveFlags;
+
+    cv::UMat volUnitsData;
+    //TODO: remove it when there's no CPU parts
+    cv::Mat volUnitsDataCopy;
+
+    cv::UMat pixNorms;
+
+    //TODO: move indexes.volumes to GPU
+    CustomHashSet hashTable;
+};
+
+HashTSDFVolumeGPU::HashTSDFVolumeGPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
+    float _truncateThreshold, int _volumeUnitRes, bool _zFirstMemOrder)
+    :HashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold, _volumeUnitRes, _zFirstMemOrder)
+{
+    if (truncDist >= volumeUnitSize)
+    {
+        std::cout << "truncDist exceeds volume unit size, allocation may work incorrectly" << std::endl;
+    }
+
+    reset();
+}
+
+HashTSDFVolumeGPU::HashTSDFVolumeGPU(const VolumeParams & _params, bool _zFirstMemOrder)
+    : HashTSDFVolumeGPU(_params.voxelSize, _params.pose.matrix, _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+                     _params.depthTruncThreshold, _params.unitResolution, _zFirstMemOrder)
+{
+}
+
+// zero volume, leave rest params the same
+void HashTSDFVolumeGPU::reset()
+{
+    CV_TRACE_FUNCTION();
+
+    bufferSizeDegree = 15;
+    int buff_lvl = (int) (1 << bufferSizeDegree);
+
+    int volCubed = volumeUnitResolution * volumeUnitResolution * volumeUnitResolution;
+    volUnitsDataCopy = cv::Mat(buff_lvl, volCubed, rawType<TsdfVoxel>());
+
+    volUnitsData = cv::UMat(buff_lvl, volCubed, CV_8UC2);
+
+    lastVisibleIndices = cv::UMat(buff_lvl, 1, CV_32S);
+
+    isActiveFlags = cv::UMat(buff_lvl, 1, CV_8U);
+
+    hashTable = CustomHashSet();
+
+    frameParams = Vec6f();
+    pixNorms = UMat();
+}
+
+
+void HashTSDFVolumeGPU::integrateAllVolumeUnitsGPU(const UMat& depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics)
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(!depth.empty());
+
+    String errorStr;
+    String name = "integrateAllVolumeUnits";
+    ocl::ProgramSource source = ocl::rgbd::hash_tsdf_oclsrc;
+    String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if (k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    float dfac = 1.f / depthFactor;
+    Vec2f fxy(intrinsics.fx, intrinsics.fy), cxy(intrinsics.cx, intrinsics.cy);
+    Matx44f vol2camMatrix = (Affine3f(cameraPose).inv() * pose).matrix;
+    Matx44f camInvMatrix = Affine3f(cameraPose).inv().matrix;
+
+    UMat hashesGpu = Mat(hashTable.hashes, false).getUMat(ACCESS_READ);
+    UMat hashDataGpu = Mat(hashTable.data, false).getUMat(ACCESS_READ);
+
+    k.args(ocl::KernelArg::ReadOnly(depth),
+           ocl::KernelArg::PtrReadOnly(hashesGpu),
+           ocl::KernelArg::PtrReadOnly(hashDataGpu),
+           ocl::KernelArg::ReadWrite(volUnitsData),
+           ocl::KernelArg::ReadOnly(pixNorms),
+           ocl::KernelArg::ReadOnly(isActiveFlags),
+           vol2camMatrix,
+           camInvMatrix,
+           voxelSize,
+           volumeUnitResolution,
+           volStrides.val,
+           fxy.val,
+           cxy.val,
+           dfac,
+           truncDist,
+           int(maxWeight)
+    );
+
+    int resol = volumeUnitResolution;
+    size_t globalSize[3];
+    globalSize[0] = (size_t)resol;
+    globalSize[1] = (size_t)resol;
+    globalSize[2] = (size_t)hashTable.last; // num of volume units
+
+    if (!k.run(3, globalSize, NULL, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+
+void HashTSDFVolumeGPU::allocateVolumeUnits(const UMat& _depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics)
+{
+    constexpr int pixCapacity = 16;
+    typedef std::array<Vec3i, pixCapacity> LocalVolUnits;
+
+    Depth depth = _depth.getMat(ACCESS_READ);
+
+    //! Compute volumes to be allocated
+    const int depthStride = volumeUnitDegree;
+    const float invDepthFactor = 1.f / depthFactor;
+    const Intr::Reprojector reproj(intrinsics.makeReprojector());
+    const Affine3f cam2vol(pose.inv() * Affine3f(cameraPose));
+    const Point3f truncPt(truncDist, truncDist, truncDist);
+    Mutex mutex;
+
+    // for new indices
+    CustomHashSet thm;
+
+    auto fillLocalAcessVolUnits = [&](const Range& xrange, const Range& yrange, CustomHashSet& ghm)
+    {
+        for (int y = yrange.start; y < yrange.end; y += depthStride)
+        {
+            const depthType* depthRow = depth[y];
+            for (int x = xrange.start; x < xrange.end; x += depthStride)
+            {
+                depthType z = depthRow[x] * invDepthFactor;
+                if (z <= 0 || z > this->truncateThreshold)
+                    continue;
+                Point3f camPoint = reproj(Point3f((float)x, (float)y, z));
+                Point3f volPoint = cam2vol * camPoint;
+                //! Find accessed TSDF volume unit for valid 3D vertex
+                Vec3i lower_bound = this->volumeToVolumeUnitIdx(volPoint - truncPt);
+                Vec3i upper_bound = this->volumeToVolumeUnitIdx(volPoint + truncPt);
+
+                int pixLocalCounter = 0;
+                LocalVolUnits pixLocalVolUnits;
+                for (int i = lower_bound[0]; i <= upper_bound[0]; i++)
+                    for (int j = lower_bound[1]; j <= upper_bound[1]; j++)
+                        for (int k = lower_bound[2]; k <= upper_bound[2]; k++)
+                        {
+                            const Vec3i tsdf_idx = Vec3i(i, j, k);
+
+                            if (hashTable.find(tsdf_idx) < 0)
+                            {
+                                bool found = false;
+                                for (int c = 0; c < pixLocalCounter; c++)
+                                {
+                                    if (pixLocalVolUnits[c] == tsdf_idx)
+                                    {
+                                        found = true; break;
+                                    }
+                                }
+                                if (!found)
+                                {
+                                    pixLocalVolUnits[pixLocalCounter++] = tsdf_idx;
+                                    if (pixLocalCounter >= pixCapacity)
+                                    {
+                                        return;
+                                    }
+                                }
+                            }
+                        }
+
+                // lock localAccessVolUnits somehow
+                for (int i = 0; i < pixLocalCounter; i++)
+                {
+                    Vec3i idx = pixLocalVolUnits[i];
+                    if (!ghm.insert(idx))
+                    {
+                        //return;
+                    }
+                }
+                // unlock
+            }
+        }
+    };
+
+    Rect dim(0, 0, depth.cols, depth.rows);
+    Size gsz(32, 32);
+    Size gg(divUp(dim.width, gsz.width), divUp(dim.height, gsz.height));
+
+    bool needReallocation = false;
+    auto allocateLambda = [&](const Range& r)
+    {
+
+    for (int yg = r.start; yg < r.end; yg++)
+    {
+        for (int xg = 0; xg < gg.width; xg++)
+        {
+            Rect gr(xg * gsz.width, yg * gsz.height, (xg + 1) * gsz.width, (yg + 1) * gsz.height);
+            gr = gr & dim;
+            Range xr(gr.tl().x, gr.br().x), yr(gr.tl().y, gr.br().y);
+
+            CustomHashSet ghm;
+
+            fillLocalAcessVolUnits(xr, yr, ghm);
+
+            if (ghm.last)
+            {
+                std::lock_guard<std::recursive_mutex> al(mutex);
+
+                for (int i = 0; i < ghm.last; i++)
+                {
+                    Vec4i node = ghm.data[i];
+                    Vec3i idx(node[0], node[1], node[2]);
+
+                    //TODO: 1. add to separate hash map instead, then merge on GPU side
+
+                    int result = thm.insert(idx);
+                    if (!result)
+                    {
+                        needReallocation = true;
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    };
+
+    do
+    {
+        if (needReallocation)
+        {
+            thm.capacity *= 2;
+            thm.data.resize(thm.capacity);
+
+            needReallocation = false;
+        }
+
+        parallel_for_(Range(0, gg.height), allocateLambda);
+    } while (needReallocation);
+
+
+    auto pushToGlobal = [](const CustomHashSet _thm, CustomHashSet& _globalHashMap,
+                           bool& _needReallocation, Mutex& _mutex)
+    {
+        for (int i = 0; i < _thm.last; i++)
+        {
+                Vec4i node = _thm.data[i];
+                Vec3i idx(node[0], node[1], node[2]);
+
+                std::lock_guard<std::recursive_mutex> al(_mutex);
+
+                int result = _globalHashMap.insert(idx);
+                if (result == 0)
+                {
+                    _needReallocation = true;
+                    return;
+                }
+        }
+    };
+
+    needReallocation = false;
+    do
+    {
+        if (needReallocation)
+        {
+            hashTable.capacity *= 2;
+            hashTable.data.resize(hashTable.capacity);
+
+            needReallocation = false;
+        }
+
+        pushToGlobal(thm, hashTable, needReallocation, mutex);
+    } while (needReallocation);
+}
+
+
+void HashTSDFVolumeGPU::markActive(const Matx44f& cameraPose, const Intr& intrinsics, const Size frameSz, const int frameId)
+{
+    //! Mark volumes in the camera frustum as active
+    String errorStr;
+    String name = "markActive";
+    ocl::ProgramSource source = ocl::rgbd::hash_tsdf_oclsrc;
+    String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if (k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    const Affine3f vol2cam(Affine3f(cameraPose.inv()) * pose);
+    const Intr::Projector proj(intrinsics.makeProjector());
+    Vec2f fxy(proj.fx, proj.fy), cxy(proj.cx, proj.cy);
+
+    UMat hashDataGpu = Mat(hashTable.data, false).getUMat(ACCESS_READ);
+
+    k.args(
+        ocl::KernelArg::PtrReadOnly(hashDataGpu),
+        ocl::KernelArg::WriteOnly(isActiveFlags),
+        ocl::KernelArg::WriteOnly(lastVisibleIndices),
+        vol2cam.matrix,
+        fxy,
+        cxy,
+        frameSz,
+        volumeUnitSize,
+        hashTable.last,
+        truncateThreshold,
+        frameId
+    );
+
+    size_t globalSize[1] = { (size_t)hashTable.last };
+    if (!k.run(1, globalSize, nullptr, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+
+void HashTSDFVolumeGPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics, const int frameId)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    UMat depth = _depth.getUMat();
+
+    // Save length to fill new data in ranges
+    int sizeBefore = hashTable.last;
+    allocateVolumeUnits(depth, depthFactor, cameraPose, intrinsics);
+    int sizeAfter = hashTable.last;
+    //! Perform the allocation
+
+    // Grow buffers
+    int buff_lvl = (int)(1 << bufferSizeDegree);
+    if (sizeAfter >= buff_lvl)
+    {
+        bufferSizeDegree = (int)(log2(sizeAfter) + 1); // clz() would be better
+        int oldBuffSize = buff_lvl;
+        buff_lvl = (int)pow(2, bufferSizeDegree);
+
+        volUnitsDataCopy.resize(buff_lvl);
+
+        Range oldr(0, oldBuffSize);
+        int volCubed = volumeUnitResolution * volumeUnitResolution * volumeUnitResolution;
+        UMat newData(buff_lvl, volCubed, CV_8UC2);
+        volUnitsData.copyTo(newData.rowRange(oldr));
+        volUnitsData = newData;
+
+        UMat newLastVisibleIndices(buff_lvl, 1, CV_32S);
+        lastVisibleIndices.copyTo(newLastVisibleIndices.rowRange(oldr));
+        lastVisibleIndices = newLastVisibleIndices;
+
+        UMat newIsActiveFlags(buff_lvl, 1, CV_8U);
+        isActiveFlags.copyTo(newIsActiveFlags.rowRange(oldr));
+        isActiveFlags = newIsActiveFlags;
+    }
+
+    // Fill data for new volume units
+    Range r(sizeBefore, sizeAfter);
+    if (r.start < r.end)
+    {
+        lastVisibleIndices.rowRange(r) = frameId;
+        isActiveFlags.rowRange(r) = 1;
+
+        TsdfVoxel emptyVoxel(floatToTsdf(0.0f), 0);
+        volUnitsData.rowRange(r) = Vec2b((uchar)(emptyVoxel.tsdf), (uchar)(emptyVoxel.weight));
+    }
+
+    //! Mark volumes in the camera frustum as active
+    markActive(cameraPose, intrinsics, depth.size(), frameId);
+
+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+                     intrinsics.fx, intrinsics.fy,
+                     intrinsics.cx, intrinsics.cy);
+    if (!(frameParams == newParams))
+    {
+        frameParams = newParams;
+        pixNorms = preCalculationPixNormGPU(depth, intrinsics);
+    }
+
+    //! Integrate the correct volumeUnits
+    integrateAllVolumeUnitsGPU(depth, depthFactor, cameraPose, intrinsics);
+}
+
+
+//TODO: remove these functions when everything is done on GPU
+cv::Vec3i HashTSDFVolumeGPU::volumeToVolumeUnitIdx(const cv::Point3f& p) const
+{
+    return cv::Vec3i(cvFloor(p.x / volumeUnitSize), cvFloor(p.y / volumeUnitSize),
+        cvFloor(p.z / volumeUnitSize));
+}
+
+cv::Point3f HashTSDFVolumeGPU::volumeUnitIdxToVolume(const cv::Vec3i& volumeUnitIdx) const
+{
+    return cv::Point3f(volumeUnitIdx[0] * volumeUnitSize, volumeUnitIdx[1] * volumeUnitSize,
+        volumeUnitIdx[2] * volumeUnitSize);
+}
+
+cv::Point3f HashTSDFVolumeGPU::voxelCoordToVolume(const cv::Vec3i& voxelIdx) const
+{
+    return cv::Point3f(voxelIdx[0] * voxelSize, voxelIdx[1] * voxelSize, voxelIdx[2] * voxelSize);
+}
+
+cv::Vec3i HashTSDFVolumeGPU::volumeToVoxelCoord(const cv::Point3f& point) const
+{
+    return cv::Vec3i(cvFloor(point.x * voxelSizeInv), cvFloor(point.y * voxelSizeInv),
+        cvFloor(point.z * voxelSizeInv));
+}
+
+inline TsdfVoxel HashTSDFVolumeGPU::new_at(const cv::Vec3i& volumeIdx, int indx) const
+{
+    //! Out of bounds
+    if ((volumeIdx[0] >= volumeUnitResolution || volumeIdx[0] < 0) ||
+        (volumeIdx[1] >= volumeUnitResolution || volumeIdx[1] < 0) ||
+        (volumeIdx[2] >= volumeUnitResolution || volumeIdx[2] < 0))
+    {
+        return TsdfVoxel(floatToTsdf(1.0f), 0);
+    }
+
+    const TsdfVoxel* volData = volUnitsDataCopy.ptr<TsdfVoxel>(indx);
+    int coordBase =
+        volumeIdx[0] * volStrides[0] +
+        volumeIdx[1] * volStrides[1] +
+        volumeIdx[2] * volStrides[2];
+    return volData[coordBase];
+}
+
+TsdfVoxel HashTSDFVolumeGPU::new_atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, int indx) const
+{
+    if (indx < 0)
+    {
+        return TsdfVoxel(floatToTsdf(1.f), 0);
+    }
+    Vec3i volUnitLocalIdx = point - Vec3i(volumeUnitIdx[0] << volumeUnitDegree,
+                                          volumeUnitIdx[1] << volumeUnitDegree,
+                                          volumeUnitIdx[2] << volumeUnitDegree);
+
+    // expanding at(), removing bounds check
+    const TsdfVoxel* volData = volUnitsDataCopy.ptr<TsdfVoxel>(indx);
+    int coordBase = volUnitLocalIdx[0] * volStrides[0] +
+        volUnitLocalIdx[1] * volStrides[1] +
+        volUnitLocalIdx[2] * volStrides[2];
+    return volData[coordBase];
+}
+
+float HashTSDFVolumeGPU::interpolateVoxelPoint(const Point3f& point) const
+{
+    const Vec3i local_neighbourCoords[] = { {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
+                                            {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1} };
+
+    // A small hash table to reduce a number of find() calls
+    // -2 and lower means not queried yet
+    // -1 means not found
+    // 0+ means found
+    int iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = -2;
+    }
+
+    int ix = cvFloor(point.x);
+    int iy = cvFloor(point.y);
+    int iz = cvFloor(point.z);
+
+    float tx = point.x - ix;
+    float ty = point.y - iy;
+    float tz = point.z - iz;
+
+    Vec3i iv(ix, iy, iz);
+    float vx[8];
+    for (int i = 0; i < 8; i++)
+    {
+        Vec3i pt = iv + local_neighbourCoords[i];
+
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (it < -1)
+        {
+            it = hashTable.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+        }
+
+        vx[i] = new_atVolumeUnit(pt, volumeUnitIdx, it).tsdf;
+    }
+
+    return interpolate(tx, ty, tz, vx);
+}
+
+inline float HashTSDFVolumeGPU::interpolateVoxel(const cv::Point3f& point) const
+{
+    return interpolateVoxelPoint(point * voxelSizeInv);
+}
+
+Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
+{
+    Vec3f normal = Vec3f(0, 0, 0);
+
+    Point3f ptVox = point * voxelSizeInv;
+    Vec3i iptVox(cvFloor(ptVox.x), cvFloor(ptVox.y), cvFloor(ptVox.z));
+
+    // A small hash table to reduce a number of find() calls
+    // -2 and lower means not queried yet
+    // -1 means not found
+    // 0+ means found
+    int iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = -2;
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    const Vec3i offsets[] = { { 1,  0,  0}, {-1,  0,  0}, { 0,  1,  0}, // 0-3
+                              { 0, -1,  0}, { 0,  0,  1}, { 0,  0, -1}  // 4-7
+    };
+    const int nVals = 6;
+
+#else
+    const Vec3i offsets[] = { { 0,  0,  0}, { 0,  0,  1}, { 0,  1,  0}, { 0,  1,  1}, //  0-3
+                              { 1,  0,  0}, { 1,  0,  1}, { 1,  1,  0}, { 1,  1,  1}, //  4-7
+                              {-1,  0,  0}, {-1,  0,  1}, {-1,  1,  0}, {-1,  1,  1}, //  8-11
+                              { 2,  0,  0}, { 2,  0,  1}, { 2,  1,  0}, { 2,  1,  1}, // 12-15
+                              { 0, -1,  0}, { 0, -1,  1}, { 1, -1,  0}, { 1, -1,  1}, // 16-19
+                              { 0,  2,  0}, { 0,  2,  1}, { 1,  2,  0}, { 1,  2,  1}, // 20-23
+                              { 0,  0, -1}, { 0,  1, -1}, { 1,  0, -1}, { 1,  1, -1}, // 24-27
+                              { 0,  0,  2}, { 0,  1,  2}, { 1,  0,  2}, { 1,  1,  2}, // 28-31
+    };
+    const int nVals = 32;
+#endif
+
+    float vals[nVals];
+    for (int i = 0; i < nVals; i++)
+    {
+        Vec3i pt = iptVox + offsets[i];
+
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
+
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (it < -1)
+        {
+            it = hashTable.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+        }
+
+        vals[i] = tsdfToFloat(new_atVolumeUnit(pt, volumeUnitIdx, it).tsdf);
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    for (int c = 0; c < 3; c++)
+    {
+        normal[c] = vals[c * 2] - vals[c * 2 + 1];
+    }
+#else
+
+    float cxv[8], cyv[8], czv[8];
+
+    // How these numbers were obtained:
+    // 1. Take the basic interpolation sequence:
+    // 000, 001, 010, 011, 100, 101, 110, 111
+    // where each digit corresponds to shift by x, y, z axis respectively.
+    // 2. Add +1 for next or -1 for prev to each coordinate to corresponding axis
+    // 3. Search corresponding values in offsets
+    const int idxxp[8] = { 8,  9, 10, 11,  0,  1,  2,  3 };
+    const int idxxn[8] = { 4,  5,  6,  7, 12, 13, 14, 15 };
+    const int idxyp[8] = { 16, 17,  0,  1, 18, 19,  4,  5 };
+    const int idxyn[8] = { 2,  3, 20, 21,  6,  7, 22, 23 };
+    const int idxzp[8] = { 24,  0, 25,  2, 26,  4, 27,  6 };
+    const int idxzn[8] = { 1, 28,  3, 29,  5, 30,  7, 31 };
+
+#if !USE_INTRINSICS
+    for (int i = 0; i < 8; i++)
+    {
+        cxv[i] = vals[idxxn[i]] - vals[idxxp[i]];
+        cyv[i] = vals[idxyn[i]] - vals[idxyp[i]];
+        czv[i] = vals[idxzn[i]] - vals[idxzp[i]];
+    }
+#else
+
+# if CV_SIMD >= 32
+    v_float32x8 cxp = v_lut(vals, idxxp);
+    v_float32x8 cxn = v_lut(vals, idxxn);
+
+    v_float32x8 cyp = v_lut(vals, idxyp);
+    v_float32x8 cyn = v_lut(vals, idxyn);
+
+    v_float32x8 czp = v_lut(vals, idxzp);
+    v_float32x8 czn = v_lut(vals, idxzn);
+
+    v_float32x8 vcxv = cxn - cxp;
+    v_float32x8 vcyv = cyn - cyp;
+    v_float32x8 vczv = czn - czp;
+
+    v_store(cxv, vcxv);
+    v_store(cyv, vcyv);
+    v_store(czv, vczv);
+# else
+    v_float32x4 cxp0 = v_lut(vals, idxxp + 0); v_float32x4 cxp1 = v_lut(vals, idxxp + 4);
+    v_float32x4 cxn0 = v_lut(vals, idxxn + 0); v_float32x4 cxn1 = v_lut(vals, idxxn + 4);
+
+    v_float32x4 cyp0 = v_lut(vals, idxyp + 0); v_float32x4 cyp1 = v_lut(vals, idxyp + 4);
+    v_float32x4 cyn0 = v_lut(vals, idxyn + 0); v_float32x4 cyn1 = v_lut(vals, idxyn + 4);
+
+    v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
+    v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
+
+    v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
+    v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
+    v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
+
+    v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
+    v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);
+    v_store(czv + 0, czv0); v_store(czv + 4, czv1);
+#endif
+
+#endif
+
+    float tx = ptVox.x - iptVox[0];
+    float ty = ptVox.y - iptVox[1];
+    float tz = ptVox.z - iptVox[2];
+
+    normal[0] = interpolate(tx, ty, tz, cxv);
+    normal[1] = interpolate(tx, ty, tz, cyv);
+    normal[2] = interpolate(tx, ty, tz, czv);
+#endif
+    float nv = sqrt(normal[0] * normal[0] +
+                    normal[1] * normal[1] +
+                    normal[2] * normal[2]);
+    return nv < 0.0001f ? nan3 : normal / nv;
+}
+
+
+void HashTSDFVolumeGPU::raycast(const Matx44f& cameraPose, const kinfu::Intr& intrinsics, const Size& frameSize,
+                                OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(frameSize.area() > 0);
+
+    String errorStr;
+    String name = "raycast";
+    ocl::ProgramSource source = ocl::rgbd::hash_tsdf_oclsrc;
+    String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if (k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    _points.create(frameSize, CV_32FC4);
+    _normals.create(frameSize, CV_32FC4);
+
+    UMat points = _points.getUMat();
+    UMat normals = _normals.getUMat();
+
+    Intr::Reprojector r = intrinsics.makeReprojector();
+    Vec2f finv(r.fxinv, r.fyinv), cxy(r.cx, r.cy);
+
+    Vec4f boxMin, boxMax(volumeUnitSize - voxelSize,
+                         volumeUnitSize - voxelSize,
+                         volumeUnitSize - voxelSize);
+
+    float tstep = truncDist * raycastStepFactor;
+
+    const HashTSDFVolumeGPU& volume(*this);
+    const Affine3f cam2vol(volume.pose.inv() * Affine3f(cameraPose));
+    const Affine3f vol2cam(Affine3f(cameraPose.inv()) * volume.pose);
+
+    Matx44f cam2volRotGPU = cam2vol.matrix;
+    Matx44f vol2camRotGPU = vol2cam.matrix;
+
+    UMat volPoseGpu = Mat(pose.matrix).getUMat(ACCESS_READ);
+    UMat invPoseGpu = Mat(pose.inv().matrix).getUMat(ACCESS_READ);
+
+    UMat hashesGpu = Mat(hashTable.hashes, false).getUMat(ACCESS_READ);
+    UMat hashDataGpu = Mat(hashTable.data, false).getUMat(ACCESS_READ);
+
+    k.args(
+        ocl::KernelArg::PtrReadOnly(hashesGpu),
+        ocl::KernelArg::PtrReadOnly(hashDataGpu),
+        ocl::KernelArg::WriteOnlyNoSize(points),
+        ocl::KernelArg::WriteOnlyNoSize(normals),
+        frameSize,
+        ocl::KernelArg::ReadOnly(volUnitsData),
+        cam2volRotGPU,
+        vol2camRotGPU,
+        float(volume.truncateThreshold),
+        finv.val, cxy.val,
+        boxMin.val, boxMax.val,
+        tstep,
+        voxelSize,
+        voxelSizeInv,
+        volumeUnitSize,
+        volume.truncDist,
+        volumeUnitDegree,
+        volStrides
+    );
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)frameSize.width;
+    globalSize[1] = (size_t)frameSize.height;
+
+    if (!k.run(2, globalSize, NULL, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+void HashTSDFVolumeGPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if (_points.needed())
+    {
+        //TODO: remove it when it works w/o CPU code
+        volUnitsData.copyTo(volUnitsDataCopy);
+        //TODO: remove it when it works w/o CPU code
+        //TODO: enable it when it's on GPU
+        //UMat hashDataGpu(hashMap.capacity, 1, CV_32SC4);
+        //Mat(hashMap.data, false).copyTo(hashDataGpu);
+
+        std::vector<std::vector<ptype>> pVecs, nVecs;
+
+        Range _fetchRange(0, hashTable.last);
+
+        const int nstripes = -1;
+
+        const HashTSDFVolumeGPU& volume(*this);
+        bool needNormals(_normals.needed());
+        Mutex mutex;
+
+        auto _HashFetchPointsNormalsInvoker = [&](const Range& range)
+        {
+            std::vector<ptype> points, normals;
+            for (int row = range.start; row < range.end; row++)
+            {
+                cv::Vec4i idx4 = hashTable.data[row];
+                cv::Vec3i idx(idx4[0], idx4[1], idx4[2]);
+
+                Point3f base_point = volume.volumeUnitIdxToVolume(idx);
+
+                std::vector<ptype> localPoints;
+                std::vector<ptype> localNormals;
+                for (int x = 0; x < volume.volumeUnitResolution; x++)
+                    for (int y = 0; y < volume.volumeUnitResolution; y++)
+                        for (int z = 0; z < volume.volumeUnitResolution; z++)
+                        {
+                            cv::Vec3i voxelIdx(x, y, z);
+                            TsdfVoxel voxel = new_at(voxelIdx, row);
+
+                            if (voxel.tsdf != -128 && voxel.weight != 0)
+                            {
+                                Point3f point = base_point + volume.voxelCoordToVolume(voxelIdx);
+
+                                localPoints.push_back(toPtype(point));
+                                if (needNormals)
+                                {
+                                    Point3f normal = volume.getNormalVoxel(point);
+                                    localNormals.push_back(toPtype(normal));
+                                }
+                            }
+                        }
+
+                AutoLock al(mutex);
+                pVecs.push_back(localPoints);
+                nVecs.push_back(localNormals);
+            }
+        };
+
+        parallel_for_(_fetchRange, _HashFetchPointsNormalsInvoker, nstripes);
+
+        std::vector<ptype> points, normals;
+        for (size_t i = 0; i < pVecs.size(); i++)
+        {
+            points.insert(points.end(), pVecs[i].begin(), pVecs[i].end());
+            normals.insert(normals.end(), nVecs[i].begin(), nVecs[i].end());
+        }
+
+        _points.create((int)points.size(), 1, POINT_TYPE);
+        if (!points.empty())
+            Mat((int)points.size(), 1, POINT_TYPE, &points[0]).copyTo(_points.getMat());
+
+        if (_normals.needed())
+        {
+            _normals.create((int)normals.size(), 1, POINT_TYPE);
+            if (!normals.empty())
+                Mat((int)normals.size(), 1, POINT_TYPE, &normals[0]).copyTo(_normals.getMat());
+        }
+    }
+}
+
+void HashTSDFVolumeGPU::fetchNormals(InputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if (_normals.needed())
+    {
+        //TODO: remove it when it works w/o CPU code
+        volUnitsData.copyTo(volUnitsDataCopy);
+
+        Points points = _points.getMat();
+        CV_Assert(points.type() == POINT_TYPE);
+        _normals.createSameSize(_points, _points.type());
+        Normals normals = _normals.getMat();
+        const HashTSDFVolumeGPU& _volume = *this;
+        auto HashPushNormals             = [&](const ptype& point, const int* position) {
+            const HashTSDFVolumeGPU& volume(_volume);
+            Affine3f invPose(volume.pose.inv());
+            Point3f p = fromPtype(point);
+            Point3f n = nan3;
+            if (!isNaN(p))
+            {
+                Point3f voxelPoint = invPose * p;
+                n                  = volume.pose.rotation() * volume.getNormalVoxel(voxelPoint);
+            }
+            normals(position[0], position[1]) = toPtype(n);
+        };
+        points.forEach(HashPushNormals);
+    }
+
+}
+
+int HashTSDFVolumeGPU::getVisibleBlocks(int currFrameId, int frameThreshold) const
+{
+    Mat cpuIndices = lastVisibleIndices.getMat(ACCESS_READ);
+
+    int numVisibleBlocks = 0;
+    //! TODO: Iterate over map parallely?
+    for (int i = 0; i < hashTable.last; i++)
+    {
+        if (cpuIndices.at<int>(i) > (currFrameId - frameThreshold))
+            numVisibleBlocks++;
+    }
+    return numVisibleBlocks;
+}
+
+#endif
+
+//template<typename T>
+Ptr<HashTSDFVolume> makeHashTSDFVolume(const VolumeParams& _params)
+{
+#ifdef HAVE_OPENCL
+    if (ocl::useOpenCL())
+        return makePtr<HashTSDFVolumeGPU>(_params.voxelSize, _params.pose.matrix, _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+            _params.depthTruncThreshold, _params.unitResolution);
+#endif
+    return makePtr<HashTSDFVolumeCPU>(_params.voxelSize, _params.pose.matrix, _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+        _params.depthTruncThreshold, _params.unitResolution);
+}
+
+//template<typename T>
+Ptr<HashTSDFVolume> makeHashTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+    int _maxWeight, float truncateThreshold, int volumeUnitResolution)
+{
+#ifdef HAVE_OPENCL
+    if (ocl::useOpenCL())
+        return makePtr<HashTSDFVolumeGPU>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, truncateThreshold,
+            volumeUnitResolution);
+#endif
+    return makePtr<HashTSDFVolumeCPU>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, truncateThreshold,
+        volumeUnitResolution);
+}
+
 }  // namespace kinfu
 }  // namespace cv
diff --git a/modules/rgbd/src/hash_tsdf.hpp b/modules/rgbd/src/hash_tsdf.hpp
index 31bf026785b..34e73f30383 100644
--- a/modules/rgbd/src/hash_tsdf.hpp
+++ b/modules/rgbd/src/hash_tsdf.hpp
@@ -26,108 +26,26 @@ class HashTSDFVolume : public Volume
 
     virtual ~HashTSDFVolume() = default;
 
+    virtual int getVisibleBlocks(int currFrameId, int frameThreshold) const = 0;
+    virtual size_t getTotalVolumeUnits() const = 0;
+
    public:
     int maxWeight;
     float truncDist;
     float truncateThreshold;
     int volumeUnitResolution;
+    int volumeUnitDegree;
     float volumeUnitSize;
     bool zFirstMemOrder;
+    Vec4i volStrides;
 };
 
-//! Spatial hashing
-struct tsdf_hash
-{
-    size_t operator()(const Vec3i& x) const noexcept
-    {
-        size_t seed                     = 0;
-        constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
-        for (uint16_t i = 0; i < 3; i++)
-        {
-            seed ^= std::hash<int>()(x[i]) + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
-        }
-        return seed;
-    }
-};
-
-typedef unsigned int VolumeIndex;
-struct VolumeUnit
-{
-    cv::Vec3i coord;
-    VolumeIndex index;
-    cv::Matx44f pose;
-    int lastVisibleIndex = 0;
-    bool isActive;
-};
-
-typedef std::unordered_set<cv::Vec3i, tsdf_hash> VolumeUnitIndexSet;
-typedef std::unordered_map<cv::Vec3i, VolumeUnit, tsdf_hash> VolumeUnitIndexes;
-
-class HashTSDFVolumeCPU : public HashTSDFVolume
-{
-   public:
-    // dimension in voxels, size in meters
-    HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
-                      float _truncateThreshold, int _volumeUnitRes, bool zFirstMemOrder = true);
-
-    HashTSDFVolumeCPU(const VolumeParams& _volumeParams, bool zFirstMemOrder = true);
-
-    void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const kinfu::Intr& intrinsics,
-                    const int frameId = 0) override;
-    void raycast(const Matx44f& cameraPose, const kinfu::Intr& intrinsics, const Size& frameSize, OutputArray points,
-                  OutputArray normals) const override;
-
-    void fetchNormals(InputArray points, OutputArray _normals) const override;
-    void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
-
-    void reset() override;
-    size_t getTotalVolumeUnits() const { return volumeUnits.size(); }
-    int getVisibleBlocks(int currFrameId, int frameThreshold) const;
-
-    //! Return the voxel given the voxel index in the universal volume (1 unit = 1 voxel_length)
-    TsdfVoxel at(const Vec3i& volumeIdx) const;
-
-    //! Return the voxel given the point in volume coordinate system i.e., (metric scale 1 unit =
-    //! 1m)
-    virtual TsdfVoxel at(const cv::Point3f& point) const;
-    virtual TsdfVoxel _at(const cv::Vec3i& volumeIdx, VolumeIndex indx) const;
-
-    TsdfVoxel atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitIndexes::const_iterator it) const;
-
-
-    float interpolateVoxelPoint(const Point3f& point) const;
-    float interpolateVoxel(const cv::Point3f& point) const;
-    Point3f getNormalVoxel(const cv::Point3f& p) const;
-
-    //! Utility functions for coordinate transformations
-    Vec3i volumeToVolumeUnitIdx(const Point3f& point) const;
-    Point3f volumeUnitIdxToVolume(const Vec3i& volumeUnitIdx) const;
-
-    Point3f voxelCoordToVolume(const Vec3i& voxelIdx) const;
-    Vec3i volumeToVoxelCoord(const Point3f& point) const;
-
-   public:
-       Vec4i volStrides;
-       Vec6f frameParams;
-       Mat pixNorms;
-       VolumeUnitIndexes volumeUnits;
-       cv::Mat volUnitsData;
-       VolumeIndex lastVolIndex;
-};
-
-template<typename T>
-Ptr<HashTSDFVolume> makeHashTSDFVolume(const VolumeParams& _volumeParams)
-{
-    return makePtr<T>(_volumeParams);
-}
-
-template<typename T>
+//template<typename T>
+Ptr<HashTSDFVolume> makeHashTSDFVolume(const VolumeParams& _volumeParams);
+//template<typename T>
 Ptr<HashTSDFVolume> makeHashTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
-                                          int _maxWeight, float truncateThreshold, int volumeUnitResolution = 16)
-{
-    return makePtr<T>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, truncateThreshold,
-                      volumeUnitResolution);
-}
+    int _maxWeight, float truncateThreshold, int volumeUnitResolution = 16);
+
 }  // namespace kinfu
 }  // namespace cv
 #endif
diff --git a/modules/rgbd/src/large_kinfu.cpp b/modules/rgbd/src/large_kinfu.cpp
index dedeabb82db..16006713c53 100644
--- a/modules/rgbd/src/large_kinfu.cpp
+++ b/modules/rgbd/src/large_kinfu.cpp
@@ -316,21 +316,21 @@ template<typename MatType>
 void LargeKinfuImpl<MatType>::getCloud(OutputArray p, OutputArray n) const
 {
     auto currSubmap = submapMgr->getCurrentSubmap();
-    currSubmap->volume.fetchPointsNormals(p, n);
+    currSubmap->volume->fetchPointsNormals(p, n);
 }
 
 template<typename MatType>
 void LargeKinfuImpl<MatType>::getPoints(OutputArray points) const
 {
     auto currSubmap = submapMgr->getCurrentSubmap();
-    currSubmap->volume.fetchPointsNormals(points, noArray());
+    currSubmap->volume->fetchPointsNormals(points, noArray());
 }
 
 template<typename MatType>
 void LargeKinfuImpl<MatType>::getNormals(InputArray points, OutputArray normals) const
 {
     auto currSubmap = submapMgr->getCurrentSubmap();
-    currSubmap->volume.fetchNormals(points, normals);
+    currSubmap->volume->fetchNormals(points, normals);
 }
 
 // importing class
diff --git a/modules/rgbd/src/opencl/hash_tsdf.cl b/modules/rgbd/src/opencl/hash_tsdf.cl
new file mode 100644
index 00000000000..0e611c1d3dd
--- /dev/null
+++ b/modules/rgbd/src/opencl/hash_tsdf.cl
@@ -0,0 +1,653 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory
+
+#define USE_INTERPOLATION_IN_GETNORMAL 1
+#define HASH_DIVISOR 32768
+
+typedef char int8_t;
+typedef uint int32_t;
+
+typedef int8_t TsdfType;
+typedef uchar WeightType;
+
+struct TsdfVoxel
+{
+    TsdfType tsdf;
+    WeightType weight;
+};
+
+
+static inline TsdfType floatToTsdf(float num)
+{
+    int8_t res = (int8_t) ( (num * (-128)) );
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+static inline float tsdfToFloat(TsdfType num)
+{
+    return ( (float) num ) / (-128);
+}
+
+static uint calc_hash(int3 x)
+{
+    unsigned int seed = 0;
+    unsigned int GOLDEN_RATIO = 0x9e3779b9;
+    seed ^= x.s0 + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+    seed ^= x.s1 + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+    seed ^= x.s2 + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+    return seed;
+}
+
+
+//TODO: make hashDivisor a power of 2
+//TODO: put it to this .cl file as a constant
+static int custom_find(int3 idx, const int hashDivisor, __global const int* hashes,
+                    __global const int4* data)
+{
+    int hash = calc_hash(idx) % hashDivisor;
+    int place = hashes[hash];
+    // search a place
+    while (place >= 0)
+    {
+        if (all(data[place].s012 == idx))
+            break;
+        else
+            place = data[place].s3;
+    }
+
+    return place;
+}
+
+
+
+static void integrateVolumeUnit(
+                        int x, int y,
+                        __global const char * depthptr,
+                        int depth_step, int depth_offset,
+                        int depth_rows, int depth_cols,
+                        __global struct TsdfVoxel * volumeptr,
+                        const __global char * pixNormsPtr,
+                        int pixNormsStep, int pixNormsOffset,
+                        int pixNormsRows, int pixNormsCols,
+                        const float16 vol2camMatrix,
+                        const float voxelSize,
+                        const int4 volResolution4,
+                        const int4 volStrides4,
+                        const float2 fxy,
+                        const float2 cxy,
+                        const float dfac,
+                        const float truncDist,
+                        const int maxWeight
+                        )
+{
+    const int3 volResolution = volResolution4.xyz;
+
+    if(x >= volResolution.x || y >= volResolution.y)
+        return;
+
+    // coord-independent constants
+    const int3 volStrides = volStrides4.xyz;
+    const float2 limits = (float2)(depth_cols-1, depth_rows-1);
+
+    const float4 vol2cam0 = vol2camMatrix.s0123;
+    const float4 vol2cam1 = vol2camMatrix.s4567;
+    const float4 vol2cam2 = vol2camMatrix.s89ab;
+
+    const float truncDistInv = 1.f/truncDist;
+
+    // optimization of camSpace transformation (vector addition instead of matmul at each z)
+    float4 inPt = (float4)(x*voxelSize, y*voxelSize, 0, 1);
+    float3 basePt = (float3)(dot(vol2cam0, inPt),
+                             dot(vol2cam1, inPt),
+                             dot(vol2cam2, inPt));
+
+    float3 camSpacePt = basePt;
+
+    // zStep == vol2cam*(float3(x, y, 1)*voxelSize) - basePt;
+    float3 zStep = ((float3)(vol2cam0.z, vol2cam1.z, vol2cam2.z))*voxelSize;
+
+    int volYidx = x*volStrides.x + y*volStrides.y;
+
+    int startZ, endZ;
+    if(fabs(zStep.z) > 1e-5f)
+    {
+        int baseZ = convert_int(-basePt.z / zStep.z);
+        if(zStep.z > 0)
+        {
+            startZ = baseZ;
+            endZ = volResolution.z;
+        }
+        else
+        {
+            startZ = 0;
+            endZ = baseZ;
+        }
+    }
+    else
+    {
+        if(basePt.z > 0)
+        {
+            startZ = 0; endZ = volResolution.z;
+        }
+        else
+        {
+            // z loop shouldn't be performed
+            startZ = endZ = 0;
+        }
+    }
+
+    startZ = max(0, startZ);
+    endZ = min(volResolution.z, endZ);
+
+    for(int z = startZ; z < endZ; z++)
+    {
+        // optimization of the following:
+        //float3 camSpacePt = vol2cam * ((float3)(x, y, z)*voxelSize);
+        camSpacePt += zStep;
+
+        if(camSpacePt.z <= 0)
+            continue;
+
+        float3 camPixVec = camSpacePt / camSpacePt.z;
+        float2 projected = mad(camPixVec.xy, fxy, cxy); // mad(a,b,c) = a * b + c
+
+        float v;
+        // bilinearly interpolate depth at projected
+        if(all(projected >= 0) && all(projected < limits))
+        {
+            float2 ip = floor(projected);
+            int xi = ip.x, yi = ip.y;
+
+            __global const float* row0 = (__global const float*)(depthptr + depth_offset +
+                                                                 (yi+0)*depth_step);
+            __global const float* row1 = (__global const float*)(depthptr + depth_offset +
+                                                                 (yi+1)*depth_step);
+
+            float v00 = row0[xi+0];
+            float v01 = row0[xi+1];
+            float v10 = row1[xi+0];
+            float v11 = row1[xi+1];
+            float4 vv = (float4)(v00, v01, v10, v11);
+
+            // assume correct depth is positive
+            if(all(vv > 0))
+            {
+                float2 t = projected - ip;
+                float2 vf = mix(vv.xz, vv.yw, t.x);
+                v = mix(vf.s0, vf.s1, t.y);
+            }
+            else
+                continue;
+        }
+        else
+            continue;
+
+        if(v == 0)
+            continue;
+
+        int2 projInt = convert_int2(projected);
+        float pixNorm = *(__global const float*)(pixNormsPtr + pixNormsOffset + projInt.y*pixNormsStep + projInt.x*sizeof(float));
+        //float pixNorm = length(camPixVec);
+
+        // difference between distances of point and of surface to camera
+        float sdf = pixNorm*(v*dfac - camSpacePt.z);
+        // possible alternative is:
+        // float sdf = length(camSpacePt)*(v*dfac/camSpacePt.z - 1.0);
+        if(sdf >= -truncDist)
+        {
+            float tsdf = fmin(1.0f, sdf * truncDistInv);
+            int volIdx = volYidx + z*volStrides.z;
+
+            struct TsdfVoxel voxel = volumeptr[volIdx];
+            float value  = tsdfToFloat(voxel.tsdf);
+            int weight = voxel.weight;
+            // update TSDF
+            value = (value*weight + tsdf) / (weight + 1);
+            weight = min(weight + 1, maxWeight);
+
+            voxel.tsdf = floatToTsdf(value);
+            voxel.weight = weight;
+            volumeptr[volIdx] = voxel;
+        }
+    }
+
+}
+
+
+__kernel void integrateAllVolumeUnits(
+                        // depth
+                        __global const char * depthptr,
+                        int depth_step, int depth_offset,
+                        int depth_rows, int depth_cols,
+                        // hashMap
+                        __global const int* hashes,
+                        __global const int4* data,
+                        // volUnitsData
+                        __global struct TsdfVoxel * allVolumePtr,
+                        int table_step, int table_offset,
+                        int table_rows, int table_cols,
+                        // pixNorms
+                        const __global char * pixNormsPtr,
+                        int pixNormsStep, int pixNormsOffset,
+                        int pixNormsRows, int pixNormsCols,
+                        // isActiveFlags
+                        __global const uchar* isActiveFlagsPtr,
+                        int isActiveFlagsStep, int isActiveFlagsOffset,
+                        int isActiveFlagsRows, int isActiveFlagsCols,
+                        // cam matrices:
+                        const float16 vol2cam,
+                        const float16 camInv,
+                        // scalars:
+                        const float voxelSize,
+                        const int volUnitResolution,
+                        const int4 volStrides4,
+                        const float2 fxy,
+                        const float2 cxy,
+                        const float dfac,
+                        const float truncDist,
+                        const int maxWeight
+                        )
+{
+    const int hash_divisor = HASH_DIVISOR;
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    int row = get_global_id(2);
+    int3 idx = data[row].xyz;
+
+    const int4 volResolution4 = (int4)(volUnitResolution,
+                                       volUnitResolution,
+                                       volUnitResolution,
+                                       volUnitResolution);
+
+    int isActive = *(__global const uchar*)(isActiveFlagsPtr + isActiveFlagsOffset + row);
+
+    if (isActive)
+    {
+        int volCubed = volUnitResolution * volUnitResolution * volUnitResolution;
+        __global struct TsdfVoxel * volumeptr = (__global struct TsdfVoxel*)
+                                                (allVolumePtr + table_offset + row * volCubed);
+
+        // volUnit2cam = world2cam * volUnit2world =
+        // camPoseInv * volUnitPose = camPoseInv * (volPose + idx * volUnitSize) =
+        // camPoseInv * (volPose + idx * volUnitResolution * voxelSize) =
+        // camPoseInv * (volPose + mulIdx) = camPoseInv * volPose + camPoseInv * mulIdx =
+        // vol2cam + camPoseInv * mulIdx
+        float3 mulIdx = convert_float3(idx * volUnitResolution) * voxelSize;
+        float16 volUnit2cam = vol2cam;
+        volUnit2cam.s37b += (float3)(dot(mulIdx, camInv.s012),
+                                     dot(mulIdx, camInv.s456),
+                                     dot(mulIdx, camInv.s89a));
+
+        integrateVolumeUnit(
+            i, j,
+            depthptr,
+            depth_step, depth_offset,
+            depth_rows, depth_cols,
+            volumeptr,
+            pixNormsPtr,
+            pixNormsStep, pixNormsOffset,
+            pixNormsRows, pixNormsCols,
+            volUnit2cam,
+            voxelSize,
+            volResolution4,
+            volStrides4,
+            fxy,
+            cxy,
+            dfac,
+            truncDist,
+            maxWeight
+            );
+    }
+}
+
+
+static struct TsdfVoxel at(int3 volumeIdx, int row, int volumeUnitDegree,
+                           int3 volStrides, __global const struct TsdfVoxel * allVolumePtr, int table_offset)
+
+{
+    //! Out of bounds
+    if (any(volumeIdx >= (1 << volumeUnitDegree)) ||
+        any(volumeIdx < 0))
+    {
+        struct TsdfVoxel dummy;
+        dummy.tsdf = floatToTsdf(1.0f);
+        dummy.weight = 0;
+        return dummy;
+    }
+
+    int volCubed = 1 << (volumeUnitDegree*3);
+    __global struct TsdfVoxel * volData = (__global struct TsdfVoxel*)
+                                          (allVolumePtr + table_offset + row * volCubed);
+    int3 ismul = volumeIdx * volStrides;
+    int coordBase = ismul.x + ismul.y + ismul.z;
+    return volData[coordBase];
+}
+
+
+static struct TsdfVoxel atVolumeUnit(int3 volumeIdx, int3 volumeUnitIdx, int row,
+                                     int volumeUnitDegree, int3 volStrides,
+                                     __global const struct TsdfVoxel * allVolumePtr, int table_offset)
+
+{
+    //! Out of bounds
+    if (row < 0)
+    {
+        struct TsdfVoxel dummy;
+        dummy.tsdf = floatToTsdf(1.0f);
+        dummy.weight = 0;
+        return dummy;
+    }
+
+    int3 volUnitLocalIdx = volumeIdx - (volumeUnitIdx << volumeUnitDegree);
+    int volCubed = 1 << (volumeUnitDegree*3);
+    __global struct TsdfVoxel * volData = (__global struct TsdfVoxel*)
+                                          (allVolumePtr + table_offset + row * volCubed);
+    int3 ismul = volUnitLocalIdx * volStrides;
+    int coordBase = ismul.x + ismul.y + ismul.z;
+    return volData[coordBase];
+}
+
+inline float interpolate(float3 t, float8 vz)
+{
+    float4 vy = mix(vz.s0246, vz.s1357, t.z);
+    float2 vx = mix(vy.s02, vy.s13, t.y);
+    return mix(vx.s0, vx.s1, t.x);
+}
+
+inline float3 getNormalVoxel(float3 ptVox, __global const struct TsdfVoxel* allVolumePtr,
+                             int volumeUnitDegree,
+                             const int hash_divisor,
+                             __global const int* hashes,
+                             __global const int4* data,
+
+                             int3 volStrides, int table_offset)
+{
+    float3 normal = (float3) (0.0f, 0.0f, 0.0f);
+    float3 fip = floor(ptVox);
+    int3 iptVox = convert_int3(fip);
+
+    // A small hash table to reduce a number of findRow() calls
+    // -2 and lower means not queried yet
+    // -1 means not found
+    // 0+ means found
+    int iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = -2;
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    int4 offsets[] = { (int4)( 1,  0,  0, 0), (int4)(-1,  0,  0, 0), (int4)( 0,  1,  0, 0), // 0-3
+                       (int4)( 0, -1,  0, 0), (int4)( 0,  0,  1, 0), (int4)( 0,  0, -1, 0)  // 4-7
+    };
+    
+    const int nVals = 6;
+    float vals[6];
+#else
+    int4 offsets[]={(int4)( 0,  0,  0, 0), (int4)( 0,  0,  1, 0), (int4)( 0,  1,  0, 0), (int4)( 0,  1,  1, 0), //  0-3
+                    (int4)( 1,  0,  0, 0), (int4)( 1,  0,  1, 0), (int4)( 1,  1,  0, 0), (int4)( 1,  1,  1, 0), //  4-7
+                    (int4)(-1,  0,  0, 0), (int4)(-1,  0,  1, 0), (int4)(-1,  1,  0, 0), (int4)(-1,  1,  1, 0), //  8-11
+                    (int4)( 2,  0,  0, 0), (int4)( 2,  0,  1, 0), (int4)( 2,  1,  0, 0), (int4)( 2,  1,  1, 0), // 12-15
+                    (int4)( 0, -1,  0, 0), (int4)( 0, -1,  1, 0), (int4)( 1, -1,  0, 0), (int4)( 1, -1,  1, 0), // 16-19
+                    (int4)( 0,  2,  0, 0), (int4)( 0,  2,  1, 0), (int4)( 1,  2,  0, 0), (int4)( 1,  2,  1, 0), // 20-23
+                    (int4)( 0,  0, -1, 0), (int4)( 0,  1, -1, 0), (int4)( 1,  0, -1, 0), (int4)( 1,  1, -1, 0), // 24-27
+                    (int4)( 0,  0,  2, 0), (int4)( 0,  1,  2, 0), (int4)( 1,  0,  2, 0), (int4)( 1,  1,  2, 0), // 28-31
+    };
+    const int nVals = 32;
+    float vals[32];
+#endif
+
+    for (int i = 0; i < nVals; i++)
+    {
+        int3 pt = iptVox + offsets[i].s012;
+
+        // VoxelToVolumeUnitIdx()
+        int3 volumeUnitIdx = pt >> volumeUnitDegree;
+
+        int3 vand = (volumeUnitIdx & 1);
+        int dictIdx = vand.s0 + vand.s1 * 2 + vand.s2 * 4;
+
+        int it = iterMap[dictIdx];
+        if (it < -1)
+        {
+            it = custom_find(volumeUnitIdx, hash_divisor, hashes, data);
+            iterMap[dictIdx] = it;
+        }
+
+        struct TsdfVoxel tmp = atVolumeUnit(pt, volumeUnitIdx, it, volumeUnitDegree, volStrides, allVolumePtr, table_offset);
+        vals[i] = tsdfToFloat( tmp.tsdf );
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    float3 pv, nv;
+
+    pv = (float3)(vals[0*2  ], vals[1*2  ], vals[2*2  ]);
+    nv = (float3)(vals[0*2+1], vals[1*2+1], vals[2*2+1]);
+    normal = pv - nv;
+#else
+
+    float cxv[8], cyv[8], czv[8];
+
+    // How these numbers were obtained:
+    // 1. Take the basic interpolation sequence:
+    // 000, 001, 010, 011, 100, 101, 110, 111
+    // where each digit corresponds to shift by x, y, z axis respectively.
+    // 2. Add +1 for next or -1 for prev to each coordinate to corresponding axis
+    // 3. Search corresponding values in offsets
+    const int idxxn[8] = {  8,  9, 10, 11,  0,  1,  2,  3 };
+    const int idxxp[8] = {  4,  5,  6,  7, 12, 13, 14, 15 };
+    const int idxyn[8] = { 16, 17,  0,  1, 18, 19,  4,  5 };
+    const int idxyp[8] = {  2,  3, 20, 21,  6,  7, 22, 23 };
+    const int idxzn[8] = { 24,  0, 25,  2, 26,  4, 27,  6 };
+    const int idxzp[8] = {  1, 28,  3, 29,  5, 30,  7, 31 };
+
+    float vcxp[8], vcxn[8];
+    float vcyp[8], vcyn[8];
+    float vczp[8], vczn[8];
+
+    for (int i = 0; i < 8; i++)
+    {
+        vcxp[i] = vals[idxxp[i]]; vcxn[i] = vals[idxxn[i]];
+        vcyp[i] = vals[idxyp[i]]; vcyn[i] = vals[idxyn[i]];
+        vczp[i] = vals[idxzp[i]]; vczn[i] = vals[idxzn[i]];
+    }
+
+    float8 cxp = vload8(0, vcxp), cxn = vload8(0, vcxn);
+    float8 cyp = vload8(0, vcyp), cyn = vload8(0, vcyn);
+    float8 czp = vload8(0, vczp), czn = vload8(0, vczn);
+    float8 cx = cxp - cxn;
+    float8 cy = cyp - cyn;
+    float8 cz = czp - czn;
+
+    float3 tv = ptVox - fip;
+    normal.x = interpolate(tv, cx);
+    normal.y = interpolate(tv, cy);
+    normal.z = interpolate(tv, cz);
+#endif
+
+    float norm = sqrt(dot(normal, normal));
+    return norm < 0.0001f ? nan((uint)0) : normal / norm;
+}
+
+typedef float4 ptype;
+
+__kernel void raycast(
+                    __global const int* hashes,
+                    __global const int4* data,
+                    __global char * pointsptr,
+                      int points_step, int points_offset,
+                    __global char * normalsptr,
+                      int normals_step, int normals_offset,
+                    const int2 frameSize,
+                    __global const struct TsdfVoxel * allVolumePtr,
+                        int table_step, int table_offset,
+                        int table_rows, int table_cols,
+                    float16 cam2volRotGPU,
+                    float16 vol2camRotGPU,
+                    float truncateThreshold,
+                    const float2 fixy, const float2 cxy,
+                    const float4 boxDown4, const float4 boxUp4,
+                    const float tstep,
+                    const float voxelSize,
+                    const float voxelSizeInv,
+                    float volumeUnitSize,
+                    float truncDist,
+                    int volumeUnitDegree,
+                    int4 volStrides4
+                    )
+{
+    const int hash_divisor = HASH_DIVISOR;
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= frameSize.x || y >= frameSize.y)
+        return;
+
+    float3 point  = nan((uint)0);
+    float3 normal = nan((uint)0);
+
+    const float3 camRot0  = cam2volRotGPU.s012;
+    const float3 camRot1  = cam2volRotGPU.s456;
+    const float3 camRot2  = cam2volRotGPU.s89a;
+    const float3 camTrans = cam2volRotGPU.s37b;
+
+    const float3 volRot0  = vol2camRotGPU.s012;
+    const float3 volRot1  = vol2camRotGPU.s456;
+    const float3 volRot2  = vol2camRotGPU.s89a;
+    const float3 volTrans = vol2camRotGPU.s37b;
+
+    float3 planed = (float3)(((float2)(x, y) - cxy)*fixy, 1.f);
+    planed = (float3)(dot(planed, camRot0),
+                      dot(planed, camRot1),
+                      dot(planed, camRot2));
+
+    float3 orig = (float3) (camTrans.s0, camTrans.s1, camTrans.s2);
+    float3 dir = fast_normalize(planed);
+    float3 origScaled = orig * voxelSizeInv;
+    float3 dirScaled = dir * voxelSizeInv;
+
+    float tmin = 0;
+    float tmax = truncateThreshold;
+    float tcurr = tmin;
+    float tprev = tcurr;
+    float prevTsdf = truncDist;
+
+    int3 volStrides = volStrides4.xyz;
+
+    while (tcurr < tmax)
+    {
+        float3 currRayPosVox = origScaled + tcurr * dirScaled;
+
+        // VolumeToVolumeUnitIdx()
+        int3 currVoxel = convert_int3(floor(currRayPosVox));
+        int3 currVolumeUnitIdx = currVoxel >> volumeUnitDegree;
+
+        int row = custom_find(currVolumeUnitIdx, hash_divisor, hashes, data);
+
+        float currTsdf = prevTsdf;
+        int currWeight = 0;
+        float stepSize = 0.5 * volumeUnitSize;
+        int3 volUnitLocalIdx;
+
+        if (row >= 0)
+        {
+            volUnitLocalIdx = currVoxel - (currVolumeUnitIdx << volumeUnitDegree);
+            struct TsdfVoxel currVoxel = at(volUnitLocalIdx, row, volumeUnitDegree, volStrides, allVolumePtr, table_offset);
+
+            currTsdf = tsdfToFloat(currVoxel.tsdf);
+            currWeight = currVoxel.weight;
+            stepSize = tstep;
+        }
+
+        if (prevTsdf > 0.f && currTsdf <= 0.f && currWeight > 0)
+        {
+            float tInterp = (tcurr * prevTsdf - tprev * currTsdf) / (prevTsdf - currTsdf);
+            if ( !isnan(tInterp) && !isinf(tInterp) )
+            {
+                float3 pvox = origScaled + tInterp * dirScaled;
+                float3 nv = getNormalVoxel( pvox, allVolumePtr, volumeUnitDegree,
+                                            hash_divisor, hashes, data,
+                                            volStrides, table_offset);
+
+                if(!any(isnan(nv)))
+                {
+                    //convert pv and nv to camera space
+                    normal = (float3)(dot(nv, volRot0),
+                                      dot(nv, volRot1),
+                                      dot(nv, volRot2));
+                    // interpolation optimized a little
+                    float3 pv = pvox * voxelSize;
+                    point = (float3)(dot(pv, volRot0),
+                                     dot(pv, volRot1),
+                                     dot(pv, volRot2)) + volTrans;
+                }
+            }
+            break;
+        }
+        prevTsdf = currTsdf;
+        tprev = tcurr;
+        tcurr += stepSize;
+    }
+
+    __global float* pts = (__global float*)(pointsptr  +  points_offset + y*points_step   + x*sizeof(ptype));
+    __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step  + x*sizeof(ptype));
+    vstore4((float4)(point,  0), 0, pts);
+    vstore4((float4)(normal, 0), 0, nrm);
+}
+
+
+__kernel void markActive (
+        __global const int4* hashSetData,
+
+        __global char* isActiveFlagsPtr,
+        int isActiveFlagsStep, int isActiveFlagsOffset,
+        int isActiveFlagsRows, int isActiveFlagsCols,
+
+        __global char* lastVisibleIndicesPtr,
+        int lastVisibleIndicesStep, int lastVisibleIndicesOffset,
+        int lastVisibleIndicesRows, int lastVisibleIndicesCols,
+
+        const float16 vol2cam,
+        const float2 fxy,
+        const float2 cxy,
+        const int2 frameSz,
+        const float volumeUnitSize,
+        const int lastVolIndex,
+        const float truncateThreshold,
+        const int frameId
+        )
+{
+    const int hash_divisor = HASH_DIVISOR;
+    int row = get_global_id(0);
+
+    if (row < lastVolIndex)
+    {
+        int3 idx = hashSetData[row].xyz;
+
+        float3 volumeUnitPos = convert_float3(idx) * volumeUnitSize;
+
+        float3 volUnitInCamSpace = (float3) (dot(volumeUnitPos, vol2cam.s012),
+                                             dot(volumeUnitPos, vol2cam.s456),
+                                             dot(volumeUnitPos, vol2cam.s89a)) + vol2cam.s37b;
+
+        if (volUnitInCamSpace.z < 0 || volUnitInCamSpace.z > truncateThreshold)
+        {
+            *(isActiveFlagsPtr + isActiveFlagsOffset + row * isActiveFlagsStep) = 0;
+            return;
+        }
+
+        float2 cameraPoint;
+        float invz = 1.f / volUnitInCamSpace.z;
+        cameraPoint = fxy * volUnitInCamSpace.xy * invz + cxy;
+
+        if (all(cameraPoint >= 0) && all(cameraPoint < convert_float2(frameSz)))
+        {
+            *(__global int*)(lastVisibleIndicesPtr + lastVisibleIndicesOffset + row * lastVisibleIndicesStep) = frameId;
+            *(isActiveFlagsPtr + isActiveFlagsOffset + row * isActiveFlagsStep) = 1;
+        }
+    }
+}
diff --git a/modules/rgbd/src/opencl/tsdf.cl b/modules/rgbd/src/opencl/tsdf.cl
index b9186826236..3a13d3b8336 100644
--- a/modules/rgbd/src/opencl/tsdf.cl
+++ b/modules/rgbd/src/opencl/tsdf.cl
@@ -26,20 +26,6 @@ static inline float tsdfToFloat(TsdfType num)
     return ( (float) num ) / (-128);
 }
 
-__kernel void preCalculationPixNorm (__global float * pixNorms,
-                                     int pix_step, int pix_offset,
-                                     int pix_rows, int pix_cols,
-                                     const __global float * xx,
-                                     const __global float * yy,
-                                     int width, int height)
-{    
-    int i = get_global_id(0);
-    int j = get_global_id(1);
-    int idx = i*width + j;
-    if(i < height && j < width && idx < pix_cols)
-        pixNorms[idx] = sqrt(xx[j] * xx[j] + yy[i] * yy[i] + 1.0f);
-}
-
 __kernel void integrate(__global const char * depthptr,
                         int depth_step, int depth_offset,
                         int depth_rows, int depth_cols,
diff --git a/modules/rgbd/src/opencl/tsdf_functions.cl b/modules/rgbd/src/opencl/tsdf_functions.cl
new file mode 100644
index 00000000000..40efeca57c8
--- /dev/null
+++ b/modules/rgbd/src/opencl/tsdf_functions.cl
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory
+
+__kernel void preCalculationPixNorm (__global char * pixNormsPtr,
+                                     int pixNormsStep, int pixNormsOffset,
+                                     int pixNormsRows, int pixNormsCols,
+                                     const __global float * xx,
+                                     const __global float * yy)
+{
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if (i < pixNormsRows && j < pixNormsCols)
+    {
+        *(__global float*)(pixNormsPtr + pixNormsOffset + i*pixNormsStep + j*sizeof(float)) = sqrt(xx[j] * xx[j] + yy[i] * yy[i] + 1.0f);
+    }
+}
\ No newline at end of file
diff --git a/modules/rgbd/src/submap.hpp b/modules/rgbd/src/submap.hpp
index 65a2bed6229..b40023b6a56 100644
--- a/modules/rgbd/src/submap.hpp
+++ b/modules/rgbd/src/submap.hpp
@@ -42,7 +42,7 @@ class Submap
 
     Submap(int _id, const VolumeParams& volumeParams, const cv::Affine3f& _pose = cv::Affine3f::Identity(),
            int _startFrameId = 0)
-        : id(_id), pose(_pose), cameraPose(Affine3f::Identity()), startFrameId(_startFrameId), volume(volumeParams)
+        : id(_id), pose(_pose), cameraPose(Affine3f::Identity()), startFrameId(_startFrameId), volume(makeHashTSDFVolume(volumeParams))
     {
         std::cout << "Created volume\n";
     }
@@ -53,10 +53,10 @@ class Submap
                          OutputArray points, OutputArray normals);
     virtual void updatePyrPointsNormals(const int pyramidLevels);
 
-    virtual int getTotalAllocatedBlocks() const { return int(volume.getTotalVolumeUnits()); };
+    virtual int getTotalAllocatedBlocks() const { return int(volume->getTotalVolumeUnits()); };
     virtual int getVisibleBlocks(int currFrameId) const
     {
-        return volume.getVisibleBlocks(currFrameId, FRAME_VISIBILITY_THRESHOLD);
+        return volume->getVisibleBlocks(currFrameId, FRAME_VISIBILITY_THRESHOLD);
     }
 
     float calcVisibilityRatio(int currFrameId) const
@@ -91,7 +91,7 @@ class Submap
     //! TODO: Add support for GPU arrays (UMat)
     std::vector<MatType> pyrPoints;
     std::vector<MatType> pyrNormals;
-    HashTSDFVolumeCPU volume;
+    std::shared_ptr<HashTSDFVolume> volume;
 };
 
 template<typename MatType>
@@ -100,14 +100,14 @@ void Submap<MatType>::integrate(InputArray _depth, float depthFactor, const cv::
                                 const int currFrameId)
 {
     CV_Assert(currFrameId >= startFrameId);
-    volume.integrate(_depth, depthFactor, cameraPose.matrix, intrinsics, currFrameId);
+    volume->integrate(_depth, depthFactor, cameraPose.matrix, intrinsics, currFrameId);
 }
 
 template<typename MatType>
 void Submap<MatType>::raycast(const cv::Affine3f& _cameraPose, const cv::kinfu::Intr& intrinsics, cv::Size frameSize,
                               OutputArray points, OutputArray normals)
 {
-    volume.raycast(_cameraPose.matrix, intrinsics, frameSize, points, normals);
+    volume->raycast(_cameraPose.matrix, intrinsics, frameSize, points, normals);
 }
 
 template<typename MatType>
diff --git a/modules/rgbd/src/tsdf.cpp b/modules/rgbd/src/tsdf.cpp
index 412c2fe789e..0a0fcfacf05 100644
--- a/modules/rgbd/src/tsdf.cpp
+++ b/modules/rgbd/src/tsdf.cpp
@@ -102,10 +102,7 @@ TsdfVoxel TSDFVolumeCPU::at(const Vec3i& volumeIdx) const
         (volumeIdx[1] >= volResolution.y || volumeIdx[1] < 0) ||
         (volumeIdx[2] >= volResolution.z || volumeIdx[2] < 0))
     {
-        TsdfVoxel dummy;
-        dummy.tsdf   = floatToTsdf(1.0f);
-        dummy.weight = 0;
-        return dummy;
+        return TsdfVoxel(floatToTsdf(1.f), 0);
     }
 
     const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
@@ -836,47 +833,6 @@ void TSDFVolumeGPU::reset()
     volume.setTo(Scalar(0, 0));
 }
 
-static void preCalculationPixNormGPU(int depth_rows, int depth_cols, Vec2f fxy, Vec2f cxy, UMat& pixNorm)
-{
-    Mat x(1, depth_cols, CV_32F);
-    Mat y(1, depth_rows, CV_32F);
-    pixNorm.create(1, depth_rows * depth_cols, CV_32F);
-
-    for (int i = 0; i < depth_cols; i++)
-        x.at<float>(0, i) = (i - cxy[0]) / fxy[0];
-    for (int i = 0; i < depth_rows; i++)
-        y.at<float>(0, i) = (i - cxy[1]) / fxy[1];
-
-    cv::String errorStr;
-    cv::String name = "preCalculationPixNorm";
-    ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc;
-    cv::String options = "-cl-mad-enable";
-    ocl::Kernel kk;
-    kk.create(name.c_str(), source, options, &errorStr);
-
-
-    if (kk.empty())
-        throw std::runtime_error("Failed to create kernel: " + errorStr);
-
-    AccessFlag af = ACCESS_READ;
-    UMat xx = x.getUMat(af);
-    UMat yy = y.getUMat(af);
-
-    kk.args(ocl::KernelArg::ReadWrite(pixNorm),
-        ocl::KernelArg::PtrReadOnly(xx),
-        ocl::KernelArg::PtrReadOnly(yy),
-        depth_cols, depth_rows);
-
-    size_t globalSize[2];
-    globalSize[0] = depth_rows;
-    globalSize[1] = depth_cols;
-
-    if (!kk.run(2, globalSize, NULL, true))
-        throw std::runtime_error("Failed to run kernel");
-
-    return;
-}
-
 // use depth instead of distance (optimization)
 void TSDFVolumeGPU::integrate(InputArray _depth, float depthFactor,
                               const Matx44f& cameraPose, const Intr& intrinsics, const int frameId)
@@ -901,15 +857,13 @@ void TSDFVolumeGPU::integrate(InputArray _depth, float depthFactor,
     float dfac = 1.f/depthFactor;
     Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z);
     Vec2f fxy(intrinsics.fx, intrinsics.fy), cxy(intrinsics.cx, intrinsics.cy);
-    if (!(frameParams[0] == depth.rows && frameParams[1] == depth.cols &&
-        frameParams[2] == intrinsics.fx && frameParams[3] == intrinsics.fy &&
-        frameParams[4] == intrinsics.cx && frameParams[5] == intrinsics.cy))
+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+        intrinsics.fx, intrinsics.fy,
+        intrinsics.cx, intrinsics.cy);
+    if (!(frameParams == newParams))
     {
-        frameParams[0] = (float)depth.rows; frameParams[1] = (float)depth.cols;
-        frameParams[2] = intrinsics.fx;     frameParams[3] = intrinsics.fy;
-        frameParams[4] = intrinsics.cx;     frameParams[5] = intrinsics.cy;
-
-        preCalculationPixNormGPU(depth.rows, depth.cols, fxy, cxy, pixNorms);
+        frameParams = newParams;
+        pixNorms = preCalculationPixNormGPU(depth, intrinsics);
     }
 
     // TODO: optimization possible
diff --git a/modules/rgbd/src/tsdf.hpp b/modules/rgbd/src/tsdf.hpp
index 009c4a8466c..67361aa59d9 100644
--- a/modules/rgbd/src/tsdf.hpp
+++ b/modules/rgbd/src/tsdf.hpp
@@ -17,15 +17,15 @@ namespace cv
 {
 namespace kinfu
 {
-// TODO: Optimization possible:
-// * TsdfType can be FP16
-// * WeightType can be uint16
 
 typedef int8_t TsdfType;
 typedef uchar WeightType;
 
 struct TsdfVoxel
 {
+    TsdfVoxel(TsdfType _tsdf, WeightType _weight) :
+        tsdf(_tsdf), weight(_weight)
+    { }
     TsdfType tsdf;
     WeightType weight;
 };
@@ -107,8 +107,7 @@ class TSDFVolumeGPU : public TSDFVolume
     UMat pixNorms;
     // See zFirstMemOrder arg of parent class constructor
     // for the array layout info
-    // Array elem is CV_32FC2, read as (float, int)
-    // TODO: optimization possible to (fp16, int16), see Voxel definition
+    // Array elem is CV_8UC2, read as (int8, uint8)
     UMat volume;
 };
 #endif
diff --git a/modules/rgbd/src/tsdf_functions.cpp b/modules/rgbd/src/tsdf_functions.cpp
index ff40dce248d..b0e5276cba7 100644
--- a/modules/rgbd/src/tsdf_functions.cpp
+++ b/modules/rgbd/src/tsdf_functions.cpp
@@ -6,6 +6,7 @@
 
 #include "precomp.hpp"
 #include "tsdf_functions.hpp"
+#include "opencl_kernels_rgbd.hpp"
 
 namespace cv {
 
@@ -35,6 +36,51 @@ cv::Mat preCalculationPixNorm(Depth depth, const Intr& intrinsics)
     return pixNorm;
 }
 
+#ifdef HAVE_OPENCL
+cv::UMat preCalculationPixNormGPU(const UMat& depth, const Intr& intrinsics)
+{
+    int depth_cols = depth.cols;
+    int depth_rows = depth.rows;
+    Point2f fl(intrinsics.fx, intrinsics.fy);
+    Point2f pp(intrinsics.cx, intrinsics.cy);
+    Mat x(1, depth_cols, CV_32FC1);
+    Mat y(1, depth_rows, CV_32FC1);
+    UMat pixNorm(depth_rows, depth_cols, CV_32F);
+
+    for (int i = 0; i < depth_cols; i++)
+        x.at<float>(i) = (i - pp.x) / fl.x;
+    for (int i = 0; i < depth_rows; i++)
+        y.at<float>(i) = (i - pp.y) / fl.y;
+
+    cv::String errorStr;
+    cv::String name = "preCalculationPixNorm";
+    ocl::ProgramSource source = ocl::rgbd::tsdf_functions_oclsrc;
+    cv::String options = "-cl-mad-enable";
+    ocl::Kernel kk;
+    kk.create(name.c_str(), source, options, &errorStr);
+
+    if (kk.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    AccessFlag af = ACCESS_READ;
+    UMat xx = x.getUMat(af);
+    UMat yy = y.getUMat(af);
+
+    kk.args(ocl::KernelArg::WriteOnly(pixNorm),
+        ocl::KernelArg::PtrReadOnly(xx),
+        ocl::KernelArg::PtrReadOnly(yy));
+
+    size_t globalSize[2];
+    globalSize[0] = depth_rows;
+    globalSize[1] = depth_cols;
+
+    if (!kk.run(2, globalSize, NULL, true))
+        throw std::runtime_error("Failed to run kernel");
+
+    return pixNorm;
+}
+#endif
+
 const bool fixMissingData = false;
 depthType bilinearDepth(const Depth& m, cv::Point2f pt)
 {
@@ -254,7 +300,7 @@ void integrateVolumeUnit(
                     if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
                         continue;
                     float pixNorm = pixNorms.at<float>(_v, _u);
-                    // float pixNorm = sqrt(v_reduce_sum(camPixVec*camPixVec));
+                    //float pixNorm = sqrt(v_reduce_sum(camPixVec*camPixVec));
                     // difference between distances of point and of surface to camera
                     float sdf = pixNorm * (v * dfac - zCamSpace);
                     // possible alternative is:
diff --git a/modules/rgbd/src/tsdf_functions.hpp b/modules/rgbd/src/tsdf_functions.hpp
index 6d86595118f..09efecf1252 100644
--- a/modules/rgbd/src/tsdf_functions.hpp
+++ b/modules/rgbd/src/tsdf_functions.hpp
@@ -35,6 +35,8 @@ inline float tsdfToFloat(TsdfType num)
 }
 
 cv::Mat preCalculationPixNorm(Depth depth, const Intr& intrinsics);
+cv::UMat preCalculationPixNormGPU(const UMat& depth, const Intr& intrinsics);
+
 depthType bilinearDepth(const Depth& m, cv::Point2f pt);
 
 void integrateVolumeUnit(
@@ -43,6 +45,246 @@ void integrateVolumeUnit(
     InputArray _depth, float depthFactor, const cv::Matx44f& cameraPose,
     const cv::kinfu::Intr& intrinsics, InputArray _pixNorms, InputArray _volume);
 
+
+class CustomHashSet
+{
+public:
+    static const int hashDivisor = 32768;
+    static const int startCapacity = 2048;
+
+    std::vector<int> hashes;
+    // 0-3 for key, 4th for internal use
+    // don't keep keep value
+    std::vector<Vec4i> data;
+    int capacity;
+    int last;
+
+    CustomHashSet()
+    {
+        hashes.resize(hashDivisor);
+        for (int i = 0; i < hashDivisor; i++)
+            hashes[i] = -1;
+        capacity = startCapacity;
+
+        data.resize(capacity);
+        for (int i = 0; i < capacity; i++)
+            data[i] = { 0, 0, 0, -1 };
+
+        last = 0;
+    }
+
+    ~CustomHashSet() { }
+
+    inline size_t calc_hash(Vec3i x) const
+    {
+        uint32_t seed = 0;
+        constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
+        for (int i = 0; i < 3; i++)
+        {
+            seed ^= x[i] + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+
+    // should work on existing elements too
+    // 0 - need resize
+    // 1 - idx is inserted
+    // 2 - idx already exists
+    int insert(Vec3i idx)
+    {
+        if (last < capacity)
+        {
+            int hash = int(calc_hash(idx) % hashDivisor);
+            int place = hashes[hash];
+            if (place >= 0)
+            {
+                int oldPlace = place;
+                while (place >= 0)
+                {
+                    if (data[place][0] == idx[0] &&
+                        data[place][1] == idx[1] &&
+                        data[place][2] == idx[2])
+                        return 2;
+                    else
+                    {
+                        oldPlace = place;
+                        place = data[place][3];
+                        //std::cout << "place=" << place << std::endl;
+                    }
+                }
+
+                // found, create here
+                data[oldPlace][3] = last;
+            }
+            else
+            {
+                // insert at last
+                hashes[hash] = last;
+            }
+
+            data[last][0] = idx[0];
+            data[last][1] = idx[1];
+            data[last][2] = idx[2];
+            data[last][3] = -1;
+            last++;
+
+            return 1;
+        }
+        else
+            return 0;
+    }
+
+    int find(Vec3i idx) const
+    {
+        int hash = int(calc_hash(idx) % hashDivisor);
+        int place = hashes[hash];
+        // search a place
+        while (place >= 0)
+        {
+            if (data[place][0] == idx[0] &&
+                data[place][1] == idx[1] &&
+                data[place][2] == idx[2])
+                break;
+            else
+            {
+                place = data[place][3];
+            }
+        }
+
+        return place;
+    }
+};
+
+// TODO: remove this structure as soon as HashTSDFGPU data is completely on GPU;
+// until then CustomHashTable can be replaced by this one if needed
+
+const int NAN_ELEMENT = -2147483647;
+
+struct Volume_NODE
+{
+    Vec4i idx = Vec4i(NAN_ELEMENT);
+    int32_t row = -1;
+    int32_t nextVolumeRow = -1;
+    int32_t dummy = 0;
+    int32_t dummy2 = 0;
+};
+
+const int _hash_divisor = 32768;
+const int _list_size = 4;
+
+class VolumesTable
+{
+public:
+    const int hash_divisor = _hash_divisor;
+    const int list_size = _list_size;
+    const int32_t free_row = -1;
+    const int32_t free_isActive = 0;
+
+    const cv::Vec4i nan4 = cv::Vec4i(NAN_ELEMENT);
+
+    int bufferNums;
+    cv::Mat volumes;
+
+    VolumesTable() : bufferNums(1)
+    {
+        this->volumes = cv::Mat(hash_divisor * list_size, 1, rawType<Volume_NODE>());
+        for (int i = 0; i < volumes.size().height; i++)
+        {
+            Volume_NODE* v = volumes.ptr<Volume_NODE>(i);
+            v->idx = nan4;
+            v->row = -1;
+            v->nextVolumeRow = -1;
+        }
+    }
+    const VolumesTable& operator=(const VolumesTable& vt)
+    {
+        this->volumes = vt.volumes;
+        this->bufferNums = vt.bufferNums;
+        return *this;
+    }
+    ~VolumesTable() {};
+
+    bool insert(Vec3i idx, int row)
+    {
+        CV_Assert(row >= 0);
+
+        int bufferNum = 0;
+        int hash = int(calc_hash(idx) % hash_divisor);
+        int start = getPos(idx, bufferNum);
+        int i = start;
+
+        while (i >= 0)
+        {
+            Volume_NODE* v = volumes.ptr<Volume_NODE>(i);
+
+            if (v->idx[0] == NAN_ELEMENT)
+            {
+                Vec4i idx4(idx[0], idx[1], idx[2], 0);
+
+                bool extend = false;
+                if (i != start && i % list_size == 0)
+                {
+                    if (bufferNum >= bufferNums - 1)
+                    {
+                        extend = true;
+                        volumes.resize(hash_divisor * bufferNums);
+                        bufferNums++;
+                    }
+                    bufferNum++;
+                    v->nextVolumeRow = (bufferNum * hash_divisor + hash) * list_size;
+                }
+                else
+                {
+                    v->nextVolumeRow = i + 1;
+                }
+
+                v->idx = idx4;
+                v->row = row;
+
+                return extend;
+            }
+
+            i = v->nextVolumeRow;
+        }
+        return false;
+    }
+    int findRow(Vec3i idx) const
+    {
+        int bufferNum = 0;
+        int i = getPos(idx, bufferNum);
+
+        while (i >= 0)
+        {
+            const Volume_NODE* v = volumes.ptr<Volume_NODE>(i);
+
+            if (v->idx == Vec4i(idx[0], idx[1], idx[2], 0))
+                return v->row;
+            else
+                i = v->nextVolumeRow;
+        }
+
+        return -1;
+    }
+
+    inline int getPos(Vec3i idx, int bufferNum) const
+    {
+        int hash = int(calc_hash(idx) % hash_divisor);
+        return (bufferNum * hash_divisor + hash) * list_size;
+    }
+
+    inline size_t calc_hash(Vec3i x) const
+    {
+        uint32_t seed = 0;
+        constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
+        for (int i = 0; i < 3; i++)
+        {
+            seed ^= x[i] + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+};
+
+
 }  // namespace kinfu
 }  // namespace cv
 #endif
diff --git a/modules/rgbd/src/volume.cpp b/modules/rgbd/src/volume.cpp
index 8177213e8ab..88c46fe4b67 100644
--- a/modules/rgbd/src/volume.cpp
+++ b/modules/rgbd/src/volume.cpp
@@ -68,7 +68,7 @@ Ptr<Volume> makeVolume(const VolumeParams& _volumeParams)
     if(_volumeParams.type == VolumeType::TSDF)
         return kinfu::makeTSDFVolume(_volumeParams);
     else if(_volumeParams.type == VolumeType::HASHTSDF)
-        return kinfu::makeHashTSDFVolume<HashTSDFVolumeCPU>(_volumeParams);
+        return kinfu::makeHashTSDFVolume(_volumeParams);
     CV_Error(Error::StsBadArg, "Invalid VolumeType does not have parameters");
 }
 
@@ -79,13 +79,11 @@ Ptr<Volume> makeVolume(VolumeType _volumeType, float _voxelSize, Matx44f _pose,
     Point3i _presolution = _resolution;
     if (_volumeType == VolumeType::TSDF)
     {
-        return makeTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight,
-                              _presolution);
+        return makeTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _presolution);
     }
     else if (_volumeType == VolumeType::HASHTSDF)
     {
-        return makeHashTSDFVolume<kinfu::HashTSDFVolumeCPU>(
-            _voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold);
+        return makeHashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold);
     }
     CV_Error(Error::StsBadArg, "Invalid VolumeType does not have parameters");
 }
diff --git a/modules/rgbd/test/test_tsdf.cpp b/modules/rgbd/test/test_tsdf.cpp
index 99da9579697..5a9b23afb95 100644
--- a/modules/rgbd/test/test_tsdf.cpp
+++ b/modules/rgbd/test/test_tsdf.cpp
@@ -358,7 +358,7 @@ void normal_test(bool isHashTSDF, bool isRaycast, bool isFetchPointsNormals, boo
         points = _points.getMat(af);
         renderPointsNormals(points, normals, image, _params->lightPose);
         imshow("render", image);
-        waitKey(20000);
+        waitKey(2000);
     }
 
     if (isRaycast)
@@ -384,7 +384,7 @@ void normal_test(bool isHashTSDF, bool isRaycast, bool isFetchPointsNormals, boo
             points = _newPoints.getMat(af);
             renderPointsNormals(points, normals, image, _params->lightPose);
             imshow("render", image);
-            waitKey(20000);
+            waitKey(2000);
         }
 
     }
@@ -448,7 +448,7 @@ void valid_points_test(bool isHashTSDF)
         imshow("depth", depth * (1.f / _params->depthFactor / 4.f));
         renderPointsNormals(points, normals, image, _params->lightPose);
         imshow("render", image);
-        waitKey(20000);
+        waitKey(2000);
     }
 
     volume->raycast(poses[17].matrix, _params->intr, _params->frameSize, _newPoints, _newNormals);
@@ -463,10 +463,13 @@ void valid_points_test(bool isHashTSDF)
         imshow("depth", depth * (1.f / _params->depthFactor / 4.f));
         renderPointsNormals(points, normals, image, _params->lightPose);
         imshow("render", image);
-        waitKey(20000);
+        waitKey(2000);
     }
 
-    float percentValidity = float(profile) / float(anfas);
+    float percentValidity;
+    if (profile == 0)    percentValidity = -0.5;
+    else if (anfas == 0) percentValidity = 0;
+    else                 percentValidity = float(profile) / float(anfas);
     ASSERT_LT(0.5 - percentValidity, 0.3);
 }
 
@@ -510,4 +513,5 @@ TEST(HashTSDF, valid_points)
     valid_points_test(true);
 }
 
+
 }}  // namespace