Skip to content

Pass warping matrix as parameter instead of const memory #3091

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 1, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 54 additions & 42 deletions modules/cudawarping/src/cuda/warp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -53,55 +53,75 @@ namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
__constant__ float c_warpMat[3 * 3];

struct AffineTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
static const int rows = 2;
static __device__ __forceinline__ float2 calcCoord(const float warpMat[AffineTransform::rows * 3], int x, int y)
{
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
const float xcoo = warpMat[0] * x + warpMat[1] * y + warpMat[2];
const float ycoo = warpMat[3] * x + warpMat[4] * y + warpMat[5];

return make_float2(xcoo, ycoo);
}

struct Coefficients
{
Coefficients(const float* c_)
{
for(int i = 0; i < AffineTransform::rows * 3; i++)
c[i] = c_[i];
}
float c[AffineTransform::rows * 3];
};
};

struct PerspectiveTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
static const int rows = 3;
static __device__ __forceinline__ float2 calcCoord(const float warpMat[PerspectiveTransform::rows * 3], int x, int y)
{
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
const float coeff = 1.0f / (warpMat[6] * x + warpMat[7] * y + warpMat[8]);

const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
const float xcoo = coeff * (warpMat[0] * x + warpMat[1] * y + warpMat[2]);
const float ycoo = coeff * (warpMat[3] * x + warpMat[4] * y + warpMat[5]);

return make_float2(xcoo, ycoo);
}
struct Coefficients
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can re-use common Coefficients for both structs?

{
Coefficients(const float* c_)
{
for(int i = 0; i < PerspectiveTransform::rows * 3; i++)
c[i] = c_[i];
}

float c[PerspectiveTransform::rows * 3];
};
};

///////////////////////////////////////////////////////////////////
// Build Maps

template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap, const typename Transform::Coefficients warpMat)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;

if (x < xmap.cols && y < xmap.rows)
{
const float2 coord = Transform::calcCoord(x, y);
const float2 coord = Transform::calcCoord(warpMat.c, x, y);

xmap(y, x) = coord.x;
ymap(y, x) = coord.y;
}
}

template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, const float warpMat[Transform::rows * 3], cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));

buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap, warpMat);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
Expand All @@ -110,37 +130,33 @@ namespace cv { namespace cuda { namespace device

void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );

buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
buildWarpMaps_caller<AffineTransform>(xmap, ymap, coeffs, stream);
}

void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );

buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, coeffs, stream);
}

///////////////////////////////////////////////////////////////////
// Warp

template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst, const typename Transform::Coefficients warpMat)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;

if (x < dst.cols && y < dst.rows)
{
const float2 coord = Transform::calcCoord(x, y);
const float2 coord = Transform::calcCoord(warpMat.c, x, y);

dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
}
}

template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

Expand All @@ -151,14 +167,14 @@ namespace cv { namespace cuda { namespace device
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);

warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() );
}
};

template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], bool)
{
CV_UNUSED(xoff);
CV_UNUSED(yoff);
Expand All @@ -173,7 +189,7 @@ namespace cv { namespace cuda { namespace device
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);

warp<Transform><<<grid, block>>>(filter_src, dst);
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
Expand All @@ -195,7 +211,7 @@ namespace cv { namespace cuda { namespace device
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
Expand All @@ -205,14 +221,14 @@ namespace cv { namespace cuda { namespace device
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, const float warpMat[Transform::rows*3], bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
Expand All @@ -221,14 +237,14 @@ namespace cv { namespace cuda { namespace device
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
Expand Down Expand Up @@ -263,20 +279,20 @@ namespace cv { namespace cuda { namespace device

template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool cc20)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, warpMat, cc20);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, warpMat, stream, cc20);
}
};

template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
int borderMode, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool cc20)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool cc20);

static const func_t funcs[3][5] =
{
Expand Down Expand Up @@ -304,15 +320,13 @@ namespace cv { namespace cuda { namespace device
};

funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
static_cast< PtrStepSz<T> >(dst), borderValue, warpMat, stream, cc20);
}

template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );

warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, coeffs, stream, cc20);
}

template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
Expand Down Expand Up @@ -348,9 +362,7 @@ namespace cv { namespace cuda { namespace device
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );

warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, coeffs, stream, cc20);
}

template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
Expand Down