Skip to content

Commit 3ea4443

Browse files
committed
Merge pull request #2868 from nkwangyh:local
2 parents 7ede658 + d1099dc commit 3ea4443

File tree

2 files changed

+45
-35
lines changed

2 files changed

+45
-35
lines changed

modules/cudaimgproc/src/cuda/gftt.cu

+27-31
Original file line numberDiff line numberDiff line change
@@ -52,37 +52,33 @@ namespace cv { namespace cuda { namespace device
5252
{
5353
namespace gfft
5454
{
55-
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
56-
57-
__device__ int g_counter = 0;
58-
59-
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
55+
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, cudaTextureObject_t eigTex, int *g_counter)
6056
{
6157
const int j = blockIdx.x * blockDim.x + threadIdx.x;
6258
const int i = blockIdx.y * blockDim.y + threadIdx.y;
6359

6460
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
6561
{
66-
float val = tex2D(eigTex, j, i);
62+
float val = tex2D<float>(eigTex, j, i);
6763

6864
if (val > threshold)
6965
{
7066
float maxVal = val;
7167

72-
maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
73-
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
74-
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
68+
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i - 1), maxVal);
69+
maxVal = ::fmax(tex2D<float>(eigTex, j , i - 1), maxVal);
70+
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i - 1), maxVal);
7571

76-
maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
77-
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
72+
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i), maxVal);
73+
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i), maxVal);
7874

79-
maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
80-
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
81-
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
75+
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i + 1), maxVal);
76+
maxVal = ::fmax(tex2D<float>(eigTex, j , i + 1), maxVal);
77+
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i + 1), maxVal);
8278

8379
if (val == maxVal)
8480
{
85-
const int ind = ::atomicAdd(&g_counter, 1);
81+
const int ind = ::atomicAdd(g_counter, 1);
8682

8783
if (ind < max_count)
8884
corners[ind] = make_float2(j, i);
@@ -91,22 +87,20 @@ namespace cv { namespace cuda { namespace device
9187
}
9288
}
9389

94-
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
90+
int findCorners_gpu(const cudaTextureObject_t &eigTex, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
9591
{
96-
void* counter_ptr;
97-
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
92+
int* counter_ptr;
93+
cudaSafeCall( cudaMalloc(&counter_ptr, sizeof(int)) );
9894

9995
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
10096

101-
bindTexture(&eigTex, eig);
102-
10397
dim3 block(16, 16);
104-
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
98+
dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
10599

106100
if (mask.data)
107-
findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
101+
findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, rows, cols, eigTex, counter_ptr);
108102
else
109-
findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
103+
findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, rows, cols, eigTex, counter_ptr);
110104

111105
cudaSafeCall( cudaGetLastError() );
112106

@@ -122,25 +116,27 @@ namespace cv { namespace cuda { namespace device
122116
class EigGreater
123117
{
124118
public:
119+
EigGreater(const cudaTextureObject_t &eigTex_) : eigTex(eigTex_)
120+
{
121+
}
125122
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
126123
{
127-
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
124+
return tex2D<float>(eigTex, a.x, a.y) > tex2D<float>(eigTex, b.x, b.y);
128125
}
129-
};
130126

127+
cudaTextureObject_t eigTex;
128+
};
131129

132-
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
130+
void sortCorners_gpu(const cudaTextureObject_t &eigTex, float2* corners, int count, cudaStream_t stream)
133131
{
134-
bindTexture(&eigTex, eig);
135-
136132
thrust::device_ptr<float2> ptr(corners);
137133
#if THRUST_VERSION >= 100802
138134
if (stream)
139-
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater());
135+
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(eigTex));
140136
else
141-
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater());
137+
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(eigTex));
142138
#else
143-
thrust::sort(ptr, ptr + count, EigGreater());
139+
thrust::sort(ptr, ptr + count, EigGreater(eigTex));
144140
#endif
145141
}
146142
} // namespace optical_flow

modules/cudaimgproc/src/gftt.cpp

+18-4
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
5555
{
5656
namespace gfft
5757
{
58-
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
59-
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
58+
int findCorners_gpu(const cudaTextureObject_t &eigTex_, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
59+
void sortCorners_gpu(const cudaTextureObject_t &eigTex_, float2* corners, int count, cudaStream_t stream);
6060
}
6161
}}}
6262

@@ -112,15 +112,29 @@ namespace
112112
cudaStream_t stream_ = StreamAccessor::getStream(stream);
113113
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
114114

115-
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);
115+
//create texture object for findCorners_gpu and sortCorners_gpu
116+
cudaTextureDesc texDesc;
117+
memset(&texDesc, 0, sizeof(texDesc));
118+
texDesc.readMode = cudaReadModeElementType;
119+
texDesc.filterMode = cudaFilterModePoint;
120+
texDesc.addressMode[0] = cudaAddressModeClamp;
121+
texDesc.addressMode[1] = cudaAddressModeClamp;
122+
texDesc.addressMode[2] = cudaAddressModeClamp;
123+
124+
cudaTextureObject_t eigTex_;
125+
PtrStepSzf eig = eig_;
126+
cv::cuda::device::createTextureObjectPitch2D<float>(&eigTex_, eig, texDesc);
127+
128+
int total = findCorners_gpu(eigTex_, eig_.rows, eig_.cols, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);
129+
116130

117131
if (total == 0)
118132
{
119133
_corners.release();
120134
return;
121135
}
122136

123-
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
137+
sortCorners_gpu(eigTex_, tmpCorners_.ptr<float2>(), total, stream_);
124138

125139
if (minDistance_ < 1)
126140
{

0 commit comments

Comments
 (0)