#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h

#include <cstdint>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#ifdef __CUDA_ARCH__

// warp-wide inclusive prefix scan over 32 consecutive elements, implemented with __shfl_up_sync;
// only the lanes set in mask may participate
template <typename T>
__device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) {
  auto x = ci[i];
  auto laneId = threadIdx.x & 0x1f;
  CMS_UNROLL_LOOP
  for (int offset = 1; offset < 32; offset <<= 1) {
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  co[i] = x;
}

// same as above, but in place
template <typename T>
__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) {
  auto x = c[i];
  auto laneId = threadIdx.x & 0x1f;
  CMS_UNROLL_LOOP
  for (int offset = 1; offset < 32; offset <<= 1) {
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  c[i] = x;
}

#endif
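
// Illustrative sketch (not part of this header; kernel and buffer names are
// hypothetical): a single 32-thread warp computes an inclusive scan of 32
// contiguous values with warpPrefixScan, using the full mask because every
// lane is active.
//
//   template <typename T>
//   __global__ void warpScan32(T const* in, T* out) {
//     warpPrefixScan(in, out, threadIdx.x, 0xffffffff);
//   }
//
// Launched as warpScan32<<<1, 32>>>(d_in, d_out), it leaves
// out[i] = in[0] + ... + in[i] for i in [0, 32).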

namespace cms {
  namespace cuda {

    // block-wide inclusive prefix scan, limited to 32 * 32 = 1024 elements;
    // ws must provide one workspace slot per warp (32 slots are always enough)
    template <typename VT, typename T>
    __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
                                                             VT* co,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      // each warp scans its own 32-element chunks and records the chunk totals in ws
      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(ci, co, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = co[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      // scan the per-chunk totals, then add them to every element of the following chunks
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        co[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      // host fallback: plain serial inclusive scan
      co[0] = ci[0];
      for (uint32_t i = 1; i < size; ++i)
        co[i] = ci[i] + co[i - 1];
#endif
    }
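
    // Illustrative sketch (hypothetical kernel, not part of this header): a
    // single-block scan of up to 1024 elements; the block size must be a
    // multiple of 32.
    //
    //   template <typename T>
    //   __global__ void singleBlockScan(T const* in, T* out, uint32_t size) {
    //     __shared__ T ws[32];
    //     cms::cuda::blockPrefixScan(in, out, size, ws);
    //   }
    //
    // e.g. singleBlockScan<<<1, 256>>>(d_in, d_out, n) with n <= 1024.
    // On the host the same overload can be called directly on plain buffers:
    // ws defaults to nullptr and the serial fallback runs.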

    // same algorithm as above, but in place
    template <typename T>
    __host__ __device__ __forceinline__ void blockPrefixScan(T* c,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(c, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = c[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        c[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      // host fallback: plain serial inclusive scan, in place
      for (uint32_t i = 1; i < size; ++i)
        c[i] += c[i - 1];
#endif
    }

#ifdef __CUDA_ARCH__
    // returns the amount of dynamic shared memory allocated at kernel launch,
    // read from the %dynamic_smem_size PTX special register
    __device__ __forceinline__ unsigned dynamic_smem_size() {
      unsigned ret;
      asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret));
      return ret;
    }
#endif
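
    // Illustrative note (hypothetical kernel name): for a launch such as
    //   someKernel<<<blocks, threads, nbytes>>>(...);
    // dynamic_smem_size() called inside someKernel returns nbytes, the number
    // of bytes of dynamic shared memory requested in the launch configuration.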

    // scan of up to blockDim.x * gridDim.x elements (with gridDim.x <= 1024):
    // each block scans its own chunk, then the last block to finish combines the
    // per-block partial sums; pc must point to a zero-initialised counter and the
    // kernel needs at least sizeof(T) * gridDim.x bytes of dynamic shared memory
    template <typename T>
    __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
      volatile T const* ci = ici;
      volatile T* co = ico;
      __shared__ T ws[32];
#ifdef __CUDA_ARCH__
      assert(sizeof(T) * gridDim.x <= dynamic_smem_size());
#endif
      assert(blockDim.x * gridDim.x >= size);

      // first each block scans its own chunk
      int off = blockDim.x * blockIdx.x;
      if (size - off > 0)
        blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws);

      // count the blocks that have finished
      __shared__ bool isLastBlockDone;
      if (0 == threadIdx.x) {
        __threadfence();
        auto value = atomicAdd(pc, 1);  // block counter
        isLastBlockDone = (value == (int(gridDim.x) - 1));
      }

      __syncthreads();

      // only the last block to finish continues: it combines the per-block results
      if (!isLastBlockDone)
        return;

      assert(int(gridDim.x) == *pc);

      // gather the last element (the chunk total) of each block into dynamic shared memory
      extern __shared__ T psum[];
      for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
        auto j = blockDim.x * i + blockDim.x - 1;
        psum[i] = (j < size) ? co[j] : T(0);
      }
      __syncthreads();
      // scan the per-block totals...
      blockPrefixScan(psum, psum, gridDim.x, ws);

      // ...and add them to every element of the following chunks
      for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) {
        co[i] += psum[k];
      }
    }
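
    // Illustrative host-side launch sketch (buffer and variable names are
    // hypothetical): the counter must be zeroed before the launch and the
    // dynamic shared memory must hold one T per block.
    //
    //   int32_t threads = 256;                            // multiple of 32
    //   int32_t blocks = (size + threads - 1) / threads;  // at most 1024
    //   cudaMemsetAsync(d_counter, 0, sizeof(int32_t), stream);
    //   cms::cuda::multiBlockPrefixScan<<<blocks, threads, sizeof(T) * blocks, stream>>>(
    //       d_in, d_out, size, d_counter);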
  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h