#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h

#include <cstdint>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#ifdef __CUDA_ARCH__

// Warp-wide inclusive prefix scan: the 32 lanes of a warp cooperatively turn
// 32 consecutive elements into their running sums using shuffle-up intrinsics.
template <typename T>
__device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) {
  // ci and co may be the same
  auto x = ci[i];
  auto laneId = threadIdx.x & 0x1f;
  CMS_UNROLL_LOOP
  for (int offset = 1; offset < 32; offset <<= 1) {
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  co[i] = x;
}

// In-place variant: the input array is overwritten with its prefix sums.
template <typename T>
__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) {
  auto x = c[i];
  auto laneId = threadIdx.x & 0x1f;
  CMS_UNROLL_LOOP
  for (int offset = 1; offset < 32; offset <<= 1) {
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  c[i] = x;
}

#endif
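// A minimal usage sketch of the warp scan above (illustrative only, not part of
// this header; the kernel name and buffers are assumptions). One full warp loads
// 32 values, scans them in place, and writes out the inclusive prefix sums:
//
//   __global__ void warpScanExample(int const* in, int* out) {
//     __shared__ int buf[32];
//     buf[threadIdx.x] = in[threadIdx.x];            // one element per lane
//     warpPrefixScan(buf, threadIdx.x, 0xffffffff);  // in-place variant, full mask
//     out[threadIdx.x] = buf[threadIdx.x];           // out[i] = in[0] + ... + in[i]
//   }
//
//   // launched with a single warp: warpScanExample<<<1, 32>>>(d_in, d_out);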

namespace cms {
  namespace cuda {

    // Block-wide inclusive prefix scan: groups of 32 consecutive elements are
    // scanned warp-wise, the per-group sums are scanned in ws (a workspace of at
    // least 32 elements, typically in shared memory), and the group offsets are
    // then added back. On the host this falls back to a serial scan and ws may
    // be omitted.
    // limited to 32*32 elements....
    template <typename VT, typename T>
    __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
                                                             VT* co,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(ci, co, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = co[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        co[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      co[0] = ci[0];
      for (uint32_t i = 1; i < size; ++i)
        co[i] = ci[i] + co[i - 1];
#endif
    }

    // same as above, but in place; may remove
    // limited to 32*32 elements....
    template <typename T>
    __host__ __device__ __forceinline__ void blockPrefixScan(T* c,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(c, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = c[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        c[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      for (uint32_t i = 1; i < size; ++i)
        c[i] += c[i - 1];
#endif
    }
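    // A minimal usage sketch (illustrative only, not part of this header; the kernel
    // name, element type and block size are assumptions): one block scans a buffer of
    // up to 1024 elements, with the per-warp sums kept in shared memory.
    //
    //   __global__ void blockScanExample(uint32_t const* in, uint32_t* out, uint32_t n) {
    //     __shared__ uint32_t ws[32];                  // workspace for the 32 per-warp sums
    //     cms::cuda::blockPrefixScan(in, out, n, ws);  // requires n <= 1024
    //   }
    //
    //   // host side, with a block size that is a multiple of 32:
    //   //   blockScanExample<<<1, 1024>>>(d_in, d_out, n);
    //
    // On the host (no __CUDA_ARCH__) the same function is a plain serial scan and the
    // workspace argument defaults to nullptr.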

#ifdef __CUDA_ARCH__
    // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087
    __device__ __forceinline__ unsigned dynamic_smem_size() {
      unsigned ret;
      asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret));
      return ret;
    }
#endif

    // Multi-block prefix scan: each block scans its own chunk, then the last block
    // to finish folds the per-block partial sums (kept in dynamic shared memory,
    // one T per block) into the final result. pc must point to a zero-initialised
    // block counter in global memory.
    // in principle not limited....
    template <typename T>
    __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
      volatile T const* ci = ici;
      volatile T* co = ico;
      __shared__ T ws[32];
#ifdef __CUDA_ARCH__
      assert(sizeof(T) * gridDim.x <= dynamic_smem_size());  // size of psum below
#endif
      assert(blockDim.x * gridDim.x >= size);
      // first each block does a scan
      int off = blockDim.x * blockIdx.x;
      if (size - off > 0)
        blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws);

      // count blocks that finished
      __shared__ bool isLastBlockDone;
      if (0 == threadIdx.x) {
        __threadfence();
        auto value = atomicAdd(pc, 1);  // block counter
        isLastBlockDone = (value == (int(gridDim.x) - 1));
      }

      __syncthreads();

      if (!isLastBlockDone)
        return;

      assert(int(gridDim.x) == *pc);

      // good: every block has done its work and we are left in the last block

      // let's get the partial sums from each block
      extern __shared__ T psum[];
      for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
        auto j = blockDim.x * i + blockDim.x - 1;
        psum[i] = (j < size) ? co[j] : T(0);
      }
      __syncthreads();
      blockPrefixScan(psum, psum, gridDim.x, ws);

      // only this (last) block is still around, so it alone adds the per-block
      // offsets to all elements beyond the first block
      for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) {
        co[i] += psum[k];
      }
    }
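    // A possible host-side launch (a sketch, not part of this header; buffer and
    // variable names are illustrative). The dynamic shared memory must hold one T
    // per block and the counter must start at zero, as asserted above. Since the
    // per-block sums are themselves scanned with blockPrefixScan, the grid is in
    // practice limited to 1024 blocks.
    //
    //   // d_in, d_out: device buffers of "size" floats
    //   int32_t size = 1 << 20;
    //   int nThreads = 1024;
    //   int nBlocks = (size + nThreads - 1) / nThreads;
    //   int32_t* d_pc;                                  // block counter in global memory
    //   cudaMalloc(&d_pc, sizeof(int32_t));
    //   cudaMemset(d_pc, 0, sizeof(int32_t));           // counter must start at zero
    //   multiBlockPrefixScan<<<nBlocks, nThreads, sizeof(float) * nBlocks>>>(d_in, d_out, size, d_pc);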
  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h