CUDAUtilities/interface/launch.h

0001 #ifndef HeterogeneousCore_CUDAUtilities_launch_h
0002 #define HeterogeneousCore_CUDAUtilities_launch_h
0003
0004 #include <tuple>
0005
0006 #include <cuda_runtime.h>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0009
0010 /*
0011  * `cms::cuda::launch` and `cms::cuda::launch_cooperative` are wrappers around
0012  * the CUDA Runtime API calls to setup and call a CUDA kernel from the host.
0013  *
0014  * `kernel` should be a pointer to a __global__ void(...) function.
0015  * `config` describe the launch configuration: the grid size and block size, the
0016  *          dynamic shared memory size (default to 0) and the CUDA stream to use
0017  *          (default to 0, the default stream).
0018  * `args` are the arguments passed (by value) to the kernel.
0019  *
0020  *  Currently this is requires an extra copy to perform the necessary implicit
0021  *  conversions and ensure that the arguments match the kernel function signature;
0022  *  the extra copy could eventually be avoided for arguments that are already of
0023  *  the exact type.
0024  *
0025  *  Unlike the `kernel<<<...>>>(...)` syntax and the `cuda::launch(...)`
0026  *  implementation from the CUDA API Wrappers, `cms::cuda::launch(...)` and
0027  *  `cms::cuda::launch_cooperative` can be called from standard C++ host code.
0028  *
0029  *  Possible optimisations
0030  *
0031  *    - once C++17 is available in CUDA, replace the `pointer_setter` functor
0032  *      with a simpler function using fold expressions:
0033  *
0034  *  template<int N, class Tuple, std::size_t... Is>
0035  *  void pointer_setter(void* ptrs[N], Tuple const& t, std::index_sequence<Is...>)
0036  *  {
0037  *    ((ptrs[Is] = & std::get<Is>(t)), ...);
0038  *  }
0039  *
0040  *    - add a template specialisation to `launch` and `launch_cooperative` to
0041  *      avoid making a temporary copy of the parameters when they match the
0042  *      kernel signature.
0043  */
0044
0045 namespace cms {
0046   namespace cuda {
0047
0048     struct LaunchParameters {
0049       dim3 gridDim;
0050       dim3 blockDim;
0051       size_t sharedMem;
0052       cudaStream_t stream;
0053
0054       LaunchParameters(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr)
0055           : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {}
0056
0057       LaunchParameters(int gridDim, int blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr)
0058           : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {}
0059     };
0060
0061     namespace detail {
0062
0063       template <typename T>
0064       struct kernel_traits;
0065
0066       template <typename... Args>
0067       struct kernel_traits<void(Args...)> {
0068         static constexpr size_t arguments_size = sizeof...(Args);
0069
0070         using argument_type_tuple = std::tuple<Args...>;
0071
0072         template <size_t i>
0073         using argument_type = typename std::tuple_element<i, argument_type_tuple>::type;
0074       };
0075
0076       // fill an array with the pointers to the elements of a tuple
0077       template <int I>
0078       struct pointer_setter {
0079         template <typename Tuple>
0080         void operator()(void const* ptrs[], Tuple const& t) {
0081           pointer_setter<I - 1>()(ptrs, t);
0082           ptrs[I - 1] = &std::get<I - 1>(t);
0083         }
0084       };
0085
0086       template <>
0087       struct pointer_setter<0> {
0088         template <typename Tuple>
0089         void operator()(void const* ptrs[], Tuple const& t) {}
0090       };
0091
0092     }  // namespace detail
0093
0094     // wrappers for cudaLaunchKernel
0095
0096     inline void launch(void (*kernel)(), LaunchParameters config) {
0097       cudaCheck(cudaLaunchKernel(
0098           (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
0099     }
0100
0101     template <typename F, typename... Args>
0102 #if __cplusplus >= 201703L
0103     std::enable_if_t<std::is_invocable_r<void, F, Args&&...>::value>
0104 #else
0105     std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
0106 #endif
0107     launch(F* kernel, LaunchParameters config, Args&&... args) {
0108       using function_type = detail::kernel_traits<F>;
0109       typename function_type::argument_type_tuple args_copy(args...);
0110
0111       constexpr auto size = function_type::arguments_size;
0112       void const* pointers[size];
0113
0114       detail::pointer_setter<size>()(pointers, args_copy);
0115       cudaCheck(cudaLaunchKernel(
0116           (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
0117     }
0118
0119     // wrappers for cudaLaunchCooperativeKernel
0120
0121     inline void launch_cooperative(void (*kernel)(), LaunchParameters config) {
0122       cudaCheck(cudaLaunchCooperativeKernel(
0123           (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
0124     }
0125
0126     template <typename F, typename... Args>
0127 #if __cplusplus >= 201703L
0128     std::enable_if_t<std::is_invocable_r<void, F, Args&&...>::value>
0129 #else
0130     std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
0131 #endif
0132     launch_cooperative(F* kernel, LaunchParameters config, Args&&... args) {
0133       using function_type = detail::kernel_traits<F>;
0134       typename function_type::argument_type_tuple args_copy(args...);
0135
0136       constexpr auto size = function_type::arguments_size;
0137       void const* pointers[size];
0138
0139       detail::pointer_setter<size>()(pointers, args_copy);
0140       cudaCheck(cudaLaunchCooperativeKernel(
0141           (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
0142     }
0143
0144   }  // namespace cuda
0145 }  // namespace cms
0146
0147 #endif  // HeterogeneousCore_CUDAUtilities_launch_h