Skip to content

Commit

Permalink
Merge pull request #3744 from asmorkalov:as/variadic_tuple
Browse files Browse the repository at this point in the history
Added CUDA 12.4+ support #3744

Tries to fix #3690 for CUDA 12.4+
Related patch to main repo: opencv/opencv#25658

Changes:
- Added branches to support new variadic implementation of thrust::tuple
- Added branch with std::array instead of std::tuple in split-merge and grid operations. The new branch got rid of namespace clash: cv::cuda in OpenCV and ::cuda in CUDA standard library (injected by Thrust). Old tuple branches presumed for compatibility with old code and CUDA versions before 12.4.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
  • Loading branch information
asmorkalov committed May 30, 2024
1 parent d131e7a commit 1ed3dd2
Show file tree
Hide file tree
Showing 12 changed files with 537 additions and 49 deletions.
18 changes: 2 additions & 16 deletions modules/cudaarithm/src/cuda/polar_cart.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,23 +133,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu
GpuMat_<float> anglec(angle.reshape(1));

if (angleInDegrees)
{
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, true>())),
stream);
}
gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, true>(), stream);
else
{
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, false>())),
stream);
}
gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, false>(), stream);

syncOutput(mag, _mag, stream);
syncOutput(angle, _angle, stream);
Expand Down
9 changes: 6 additions & 3 deletions modules/cudaarithm/src/cuda/split_merge.cu
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ namespace
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),
const std::array<GlobPtrSz<T>, 2> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1])};
gridMerge(d_src,
globPtr<typename MakeVec<T, 2>::type>(dst),
stream);
}
Expand All @@ -77,7 +78,8 @@ namespace
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),
const std::array<GlobPtrSz<T>, 3> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])};
gridMerge(d_src,
globPtr<typename MakeVec<T, 3>::type>(dst),
stream);
}
Expand All @@ -87,7 +89,8 @@ namespace
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),
const std::array<GlobPtrSz<T>, 4 > d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])};
gridMerge(d_src,
globPtr<typename MakeVec<T, 4>::type>(dst),
stream);
}
Expand Down
55 changes: 46 additions & 9 deletions modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,17 @@ namespace block_reduce_detail
val = smem[tid];
}


// merge

template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op)
{
T reg = smem[tid + delta];
smem[tid] = val = op(val, reg);
}

#if (CUDART_VERSION < 12040)
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadToSmem(const tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
Expand All @@ -172,15 +183,6 @@ namespace block_reduce_detail
For<0, tuple_size<tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
}

// merge

template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op)
{
T reg = smem[tid + delta];
smem[tid] = val = op(val, reg);
}

template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
Expand Down Expand Up @@ -214,6 +216,41 @@ namespace block_reduce_detail
}
#endif

#else
template <typename... P, typename... R>
__device__ __forceinline__ void loadToSmem(const tuple<P...>& smem, const tuple<R...>& val, uint tid)
{
For<0, tuple_size<tuple<P...> >::value>::loadToSmem(smem, val, tid);
}

template <typename... P, typename... R>
__device__ __forceinline__ void loadFromSmem(const tuple<P...>& smem, const tuple<R...>& val, uint tid)
{
For<0, tuple_size<tuple<P...> >::value>::loadFromSmem(smem, val, tid);
}

template <typename... P, typename... R, class... Op>
__device__ __forceinline__ void merge(const tuple<P...>& smem, const tuple<R...>& val, uint tid, uint delta, const tuple<Op...>& op)
{
For<0, tuple_size<tuple<P...> >::value>::merge(smem, val, tid, delta, op);
}

// mergeShfl

template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, const Op& op)
{
T reg = shfl_down(val, delta, width);
val = op(val, reg);
}

template <typename... R, class... Op>
__device__ __forceinline__ void mergeShfl(const tuple<R...>& val, uint delta, uint width, const tuple<Op...>& op)
{
For<0, tuple_size<tuple<R...> >::value>::mergeShfl(val, delta, width, op);
}
#endif

// Generic

template <int N> struct Generic
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ namespace block_reduce_key_val_detail
data = smem[tid];
}

#if (CUDART_VERSION < 12040)
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadToSmem(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
Expand Down Expand Up @@ -241,6 +242,67 @@ namespace block_reduce_key_val_detail
{
For<0, tuple_size<tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
#else
template <typename... VP, typename... VR>
__device__ __forceinline__ void loadToSmem(const tuple<VP...>& smem, const tuple<VR...>& data, uint tid)
{
For<0, tuple_size<tuple<VP...> >::value>::loadToSmem(smem, data, tid);
}

template <typename... VP, typename... VR>
__device__ __forceinline__ void loadFromSmem(const tuple<VP...>& smem, const tuple<VR...>& data, uint tid)
{
For<0, tuple_size<tuple<VP...> >::value>::loadFromSmem(smem, data, tid);
}

// copyVals

template <typename V>
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta)
{
svals[tid] = val = svals[tid + delta];
}

template <typename... VP, typename... VR>
__device__ __forceinline__ void copyVals(const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, uint delta)
{
For<0, tuple_size<tuple<VP...> >::value>::copy(svals, val, tid, delta);
}

// merge

template <typename K, typename V, class Cmp>
__device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta)
{
K reg = skeys[tid + delta];

if (cmp(reg, key))
{
skeys[tid] = key = reg;
copyVals(svals, val, tid, delta);
}
}

template <typename K, typename... VP, typename... VR, class Cmp>
__device__ void merge(volatile K* skeys, K& key, const tuple<VP...>& svals, const tuple<VR...>& val, const Cmp& cmp, uint tid, uint delta)
{
K reg = skeys[tid + delta];

if (cmp(reg, key))
{
skeys[tid] = key = reg;
copyVals(svals, val, tid, delta);
}
}

template <typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
__device__ __forceinline__ void merge(const tuple<KP...>& skeys, const tuple<KR...>& key,
const tuple<VP...>& svals, const tuple<VR...>& val,
const tuple<Cmp...>& cmp, uint tid, uint delta)
{
For<0, tuple_size<tuple<VP...> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
#endif

// Generic

Expand Down
35 changes: 35 additions & 0 deletions modules/cudev/include/opencv2/cudev/block/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "../warp/reduce.hpp"
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"
#include <cuda_runtime_api.h>

namespace cv { namespace cudev {

Expand All @@ -65,6 +66,7 @@ __device__ __forceinline__ void blockReduce(volatile T* smem, T& val, uint tid,
block_reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
}

#if (CUDART_VERSION < 12040)
template <int N,
typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
Expand Down Expand Up @@ -126,6 +128,39 @@ __device__ __forceinline__ void blockReduceKeyVal(const tuple<KP0, KP1, KP2, KP3
>(skeys, key, svals, val, tid, cmp);
}

#else

template <int N, typename... P, typename... R, typename... Op>
__device__ __forceinline__ void blockReduce(const tuple<P...>& smem,
const tuple<R...>& val,
uint tid,
const tuple<Op...>& op)
{
block_reduce_detail::Dispatcher<N>::reductor::template reduce<const tuple<P...>&, const tuple<R...>&, const tuple<Op...>&>(smem, val, tid, op);
}

// blockReduceKeyVal

template <int N, typename K, typename V, class Cmp>
__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, uint tid, const Cmp& cmp)
{
block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
}

template <int N, typename K, typename... VP, typename... VR, class Cmp>
__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, const Cmp& cmp)
{
block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, const tuple<VP...>&, const tuple<VR...>&, const Cmp&>(skeys, key, svals, val, tid, cmp);
}

template <int N, typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
__device__ __forceinline__ void blockReduceKeyVal(const tuple<KP...>& skeys, const tuple<KR...>& key, const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, const tuple<Cmp...>& cmp)
{
block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce< const tuple<KP...>&, const tuple<KR...>&, const tuple<VP...>&, const tuple<VR...>&, const tuple<Cmp...>&>(skeys, key, svals, val, tid, cmp);
}

#endif

//! @}

}}
Expand Down
25 changes: 22 additions & 3 deletions modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,28 +157,47 @@ namespace grid_split_merge_detail
template <class Policy> struct MergeImpl<2, Policy>
{
template <class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC2<Policy>(get<0>(src), get<1>(src), dst, mask, rows, cols, stream);
}

template <class SrcPtrArray, typename DstType, class MaskPtr>
__host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC2<Policy>(src[0], src[1], dst, mask, rows, cols, stream);
}

};

template <class Policy> struct MergeImpl<3, Policy>
{
template <class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC3<Policy>(get<0>(src), get<1>(src), get<2>(src), dst, mask, rows, cols, stream);
}

template <class SrcPtrArray, typename DstType, class MaskPtr>
__host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC3<Policy>(src[0], src[1], src[2], dst, mask, rows, cols, stream);
}
};

template <class Policy> struct MergeImpl<4, Policy>
{
template <class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC4<Policy>(get<0>(src), get<1>(src), get<2>(src), get<3>(src), dst, mask, rows, cols, stream);
}

template <class SrcPtrArray, typename DstType, class MaskPtr>
__host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC4<Policy>(src[0], src[1], src[2], src[3], dst, mask, rows, cols, stream);
}
};

// split
Expand Down

0 comments on commit 1ed3dd2

Please sign in to comment.