|
45 | 45 | #include "opencv2/core/cuda/common.hpp"
|
46 | 46 | #include "opencv2/core/cuda/border_interpolate.hpp"
|
47 | 47 | #include "opencv2/core/cuda/limits.hpp"
|
| 48 | +#include "opencv2/core/cuda.hpp" |
48 | 49 |
|
49 | 50 | using namespace cv::cuda;
|
50 | 51 | using namespace cv::cuda::device;
|
@@ -101,11 +102,64 @@ namespace tvl1flow
|
101 | 102 | }
|
102 | 103 | }
|
103 | 104 |
|
| 105 | + struct SrcTex |
| 106 | + { |
| 107 | + virtual ~SrcTex() {} |
| 108 | + |
| 109 | + __device__ __forceinline__ virtual float I1(float x, float y) const = 0; |
| 110 | + __device__ __forceinline__ virtual float I1x(float x, float y) const = 0; |
| 111 | + __device__ __forceinline__ virtual float I1y(float x, float y) const = 0; |
| 112 | + }; |
| 113 | + |
104 | 114 | texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
|
105 | 115 | texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
|
106 | 116 | texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
|
| 117 | + struct SrcTexRef : SrcTex |
| 118 | + { |
| 119 | + __device__ __forceinline__ float I1(float x, float y) const override |
| 120 | + { |
| 121 | + return tex2D(tex_I1, x, y); |
| 122 | + } |
| 123 | + __device__ __forceinline__ float I1x(float x, float y) const override |
| 124 | + { |
| 125 | + return tex2D(tex_I1x, x, y); |
| 126 | + } |
| 127 | + __device__ __forceinline__ float I1y(float x, float y) const override |
| 128 | + { |
| 129 | + return tex2D(tex_I1y, x, y); |
| 130 | + } |
| 131 | + }; |
| 132 | + |
| 133 | + struct SrcTexObj : SrcTex |
| 134 | + { |
| 135 | + __host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_) |
| 136 | + : tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {} |
| 137 | + |
| 138 | + __device__ __forceinline__ float I1(float x, float y) const override |
| 139 | + { |
| 140 | + return tex2D<float>(tex_obj_I1, x, y); |
| 141 | + } |
| 142 | + __device__ __forceinline__ float I1x(float x, float y) const override |
| 143 | + { |
| 144 | + return tex2D<float>(tex_obj_I1x, x, y); |
| 145 | + } |
| 146 | + __device__ __forceinline__ float I1y(float x, float y) const override |
| 147 | + { |
| 148 | + return tex2D<float>(tex_obj_I1y, x, y); |
| 149 | + } |
107 | 150 |
|
108 |
| - __global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) |
| 151 | + cudaTextureObject_t tex_obj_I1; |
| 152 | + cudaTextureObject_t tex_obj_I1x; |
| 153 | + cudaTextureObject_t tex_obj_I1y; |
| 154 | + }; |
| 155 | + |
| 156 | + template < |
| 157 | + typename T, |
| 158 | + typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value> |
| 159 | + > |
| 160 | + __global__ void warpBackwardKernel( |
| 161 | + const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2, |
| 162 | + PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) |
109 | 163 | {
|
110 | 164 | const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
111 | 165 | const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
@@ -136,9 +190,9 @@ namespace tvl1flow
|
136 | 190 | {
|
137 | 191 | const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
|
138 | 192 |
|
139 |
| - sum += w * tex2D(tex_I1 , cx, cy); |
140 |
| - sumx += w * tex2D(tex_I1x, cx, cy); |
141 |
| - sumy += w * tex2D(tex_I1y, cx, cy); |
| 193 | + sum += w * src.I1(cx, cy); |
| 194 | + sumx += w * src.I1x(cx, cy); |
| 195 | + sumy += w * src.I1y(cx, cy); |
142 | 196 |
|
143 | 197 | wsum += w;
|
144 | 198 | }
|
@@ -173,15 +227,46 @@ namespace tvl1flow
|
173 | 227 | const dim3 block(32, 8);
|
174 | 228 | const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
|
175 | 229 |
|
176 |
| - bindTexture(&tex_I1 , I1); |
177 |
| - bindTexture(&tex_I1x, I1x); |
178 |
| - bindTexture(&tex_I1y, I1y); |
| 230 | + bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); |
179 | 231 |
|
180 |
| - warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho); |
181 |
| - cudaSafeCall( cudaGetLastError() ); |
| 232 | + if (cc30) |
| 233 | + { |
| 234 | + cudaTextureDesc texDesc; |
| 235 | + memset(&texDesc, 0, sizeof(texDesc)); |
| 236 | + texDesc.addressMode[0] = cudaAddressModeClamp; |
| 237 | + texDesc.addressMode[1] = cudaAddressModeClamp; |
| 238 | + texDesc.addressMode[2] = cudaAddressModeClamp; |
182 | 239 |
|
183 |
| - if (!stream) |
184 |
| - cudaSafeCall( cudaDeviceSynchronize() ); |
| 240 | + cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0; |
| 241 | + |
| 242 | + createTextureObjectPitch2D(&texObj_I1, I1, texDesc); |
| 243 | + createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc); |
| 244 | + createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc); |
| 245 | + |
| 246 | + warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho); |
| 247 | + cudaSafeCall(cudaGetLastError()); |
| 248 | + |
| 249 | + if (!stream) |
| 250 | + cudaSafeCall(cudaDeviceSynchronize()); |
| 251 | + else |
| 252 | + cudaSafeCall(cudaStreamSynchronize(stream)); |
| 253 | + |
| 254 | + cudaSafeCall(cudaDestroyTextureObject(texObj_I1)); |
| 255 | + cudaSafeCall(cudaDestroyTextureObject(texObj_I1x)); |
| 256 | + cudaSafeCall(cudaDestroyTextureObject(texObj_I1y)); |
| 257 | + } |
| 258 | + else |
| 259 | + { |
| 260 | + bindTexture(&tex_I1, I1); |
| 261 | + bindTexture(&tex_I1x, I1x); |
| 262 | + bindTexture(&tex_I1y, I1y); |
| 263 | + |
| 264 | + warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho); |
| 265 | + cudaSafeCall(cudaGetLastError()); |
| 266 | + |
| 267 | + if (!stream) |
| 268 | + cudaSafeCall(cudaDeviceSynchronize()); |
| 269 | + } |
185 | 270 | }
|
186 | 271 | }
|
187 | 272 |
|
|
0 commit comments