CUDAUtilities/interface/OneToManyAssoc.h

0001 #ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
0002 #define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
0003
0004 #include <algorithm>
0005 #ifndef __CUDA_ARCH__
0006 #include <atomic>
0007 #endif  // __CUDA_ARCH__
0008 #include <cstddef>
0009 #include <cstdint>
0010 #include <type_traits>
0011
0012 #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
0013 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0014 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
0017 #include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h"
0018
0019 namespace cms {
0020   namespace cuda {
0021
0022     template <typename Assoc>
0023     struct OneToManyAssocView {
0024       using Counter = typename Assoc::Counter;
0025       using index_type = typename Assoc::index_type;
0026
0027       Assoc *assoc = nullptr;
0028       Counter *offStorage = nullptr;
0029       index_type *contentStorage = nullptr;
0030       int32_t offSize = -1;
0031       int32_t contentSize = -1;
0032     };
0033
0034     // this MUST BE DONE in a single block (or in two kernels!)
0035     template <typename Assoc>
0036     __global__ void zeroAndInit(OneToManyAssocView<Assoc> view) {
0037       auto h = view.assoc;
0038       assert(1 == gridDim.x);
0039       assert(0 == blockIdx.x);
0040
0041       int first = threadIdx.x;
0042
0043       if (0 == first) {
0044         h->psws = 0;
0045         h->initStorage(view);
0046       }
0047       __syncthreads();
0048       for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) {
0049         h->off[i] = 0;
0050       }
0051     }
0052
0053     template <typename Assoc>
0054     inline __attribute__((always_inline)) void launchZero(Assoc *h,
0055                                                           cudaStream_t stream
0056 #ifndef __CUDACC__
0057                                                           = cudaStreamDefault
0058 #endif
0059     ) {
0060       typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
0061       launchZero(view, stream);
0062     }
0063     template <typename Assoc>
0064     inline __attribute__((always_inline)) void launchZero(OneToManyAssocView<Assoc> view,
0065                                                           cudaStream_t stream
0066 #ifndef __CUDACC__
0067                                                           = cudaStreamDefault
0068 #endif
0069     ) {
0070
0071       if constexpr (Assoc::ctCapacity() < 0) {
0072         assert(view.contentStorage);
0073         assert(view.contentSize > 0);
0074       }
0075       if constexpr (Assoc::ctNOnes() < 0) {
0076         assert(view.offStorage);
0077         assert(view.offSize > 0);
0078       }
0079 #ifdef __CUDACC__
0080       auto nthreads = 1024;
0081       auto nblocks = 1;  // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels);
0082       zeroAndInit<<<nblocks, nthreads, 0, stream>>>(view);
0083       cudaCheck(cudaGetLastError());
0084 #else
0085       auto h = view.assoc;
0086       assert(h);
0087       h->initStorage(view);
0088       h->zero();
0089       h->psws = 0;
0090 #endif
0091     }
0092
0093     template <typename Assoc>
0094     inline __attribute__((always_inline)) void launchFinalize(Assoc *h,
0095                                                               cudaStream_t stream
0096 #ifndef __CUDACC__
0097                                                               = cudaStreamDefault
0098 #endif
0099     ) {
0100       typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
0101       launchFinalize(view, stream);
0102     }
0103
0104     template <typename Assoc>
0105     inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView<Assoc> view,
0106                                                               cudaStream_t stream
0107 #ifndef __CUDACC__
0108                                                               = cudaStreamDefault
0109 #endif
0110     ) {
0111       auto h = view.assoc;
0112       assert(h);
0113 #ifdef __CUDACC__
0114       using Counter = typename Assoc::Counter;
0115       Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off));
0116       auto nOnes = Assoc::ctNOnes();
0117       if constexpr (Assoc::ctNOnes() < 0) {
0118         assert(view.offStorage);
0119         assert(view.offSize > 0);
0120         nOnes = view.offSize;
0121         poff = view.offStorage;
0122       }
0123       assert(nOnes > 0);
0124       int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws));
0125       auto nthreads = 1024;
0126       auto nblocks = (nOnes + nthreads - 1) / nthreads;
0127       multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(poff, poff, nOnes, ppsws);
0128       cudaCheck(cudaGetLastError());
0129 #else
0130       h->finalize();
0131 #endif
0132     }
0133
0134     template <typename Assoc>
0135     __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
0136       assoc->bulkFinalizeFill(*apc);
0137     }
0138
0139     template <typename I,    // type stored in the container (usually an index in a vector of the input values)
0140               int32_t ONES,  // number of "Ones"  +1. If -1 is initialized at runtime using external storage
0141               int32_t SIZE   // max number of element. If -1 is initialized at runtime using external storage
0142               >
0143     class OneToManyAssoc {
0144     public:
0145       using View = OneToManyAssocView<OneToManyAssoc<I, ONES, SIZE>>;
0146       using Counter = uint32_t;
0147
0148       using CountersOnly = OneToManyAssoc<I, ONES, 0>;
0149
0150       using index_type = I;
0151
0152       static constexpr uint32_t ilog2(uint32_t v) {
0153         constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
0154         constexpr uint32_t s[] = {1, 2, 4, 8, 16};
0155
0156         uint32_t r = 0;  // result of log2(v) will go here
0157         for (auto i = 4; i >= 0; i--)
0158           if (v & b[i]) {
0159             v >>= s[i];
0160             r |= s[i];
0161           }
0162         return r;
0163       }
0164
0165       static constexpr int32_t ctNOnes() { return ONES; }
0166       constexpr auto totOnes() const { return off.capacity(); }
0167       constexpr auto nOnes() const { return totOnes() - 1; }
0168       static constexpr int32_t ctCapacity() { return SIZE; }
0169       constexpr auto capacity() const { return content.capacity(); }
0170
0171       __host__ __device__ void initStorage(View view) {
0172         assert(view.assoc == this);
0173         if constexpr (ctCapacity() < 0) {
0174           assert(view.contentStorage);
0175           assert(view.contentSize > 0);
0176           content.init(view.contentStorage, view.contentSize);
0177         }
0178         if constexpr (ctNOnes() < 0) {
0179           assert(view.offStorage);
0180           assert(view.offSize > 0);
0181           off.init(view.offStorage, view.offSize);
0182         }
0183       }
0184
0185       __host__ __device__ void zero() {
0186         for (int32_t i = 0; i < totOnes(); ++i) {
0187           off[i] = 0;
0188         }
0189       }
0190
0191       __host__ __device__ __forceinline__ void add(CountersOnly const &co) {
0192         for (int32_t i = 0; i < totOnes(); ++i) {
0193 #ifdef __CUDA_ARCH__
0194           atomicAdd(off.data() + i, co.off[i]);
0195 #else
0196           std::atomic_ref<Counter> a{off[i]};
0197           a += co.off[i];
0198 #endif
0199         }
0200       }
0201
0202       static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
0203 #ifdef __CUDA_ARCH__
0204         return atomicAdd(&x, 1);
0205 #else
0206         std::atomic_ref<Counter> a{x};
0207         return a++;
0208 #endif
0209       }
0210
0211       static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
0212 #ifdef __CUDA_ARCH__
0213         return atomicSub(&x, 1);
0214 #else
0215         std::atomic_ref<Counter> a{x};
0216         return a--;
0217 #endif
0218       }
0219
0220       __host__ __device__ __forceinline__ void count(int32_t b) {
0221         assert(b < nOnes());
0222         atomicIncrement(off[b]);
0223       }
0224
0225       __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) {
0226         assert(b < nOnes());
0227         auto w = atomicDecrement(off[b]);
0228         assert(w > 0);
0229         content[w - 1] = j;
0230       }
0231
0232       __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
0233         auto c = apc.add(n);
0234         if (int(c.m) >= nOnes())
0235           return -int32_t(c.m);
0236         off[c.m] = c.n;
0237         for (uint32_t j = 0; j < n; ++j)
0238           content[c.n + j] = v[j];
0239         return c.m;
0240       }
0241
0242       __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
0243         off[apc.get().m] = apc.get().n;
0244       }
0245
0246       __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
0247         int m = apc.get().m;
0248         auto n = apc.get().n;
0249         if (m >= nOnes()) {  // overflow!
0250           off[nOnes()] = uint32_t(off[nOnes() - 1]);
0251           return;
0252         }
0253         auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
0254         for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) {
0255           off[i] = n;
0256         }
0257       }
0258
0259       __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
0260         assert(off[totOnes() - 1] == 0);
0261         blockPrefixScan(off.data(), totOnes(), ws);
0262         assert(off[totOnes() - 1] == off[totOnes() - 2]);
0263       }
0264
0265       constexpr auto size() const { return uint32_t(off[totOnes() - 1]); }
0266       constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }
0267
0268       constexpr index_type const *begin() const { return content.data(); }
0269       constexpr index_type const *end() const { return begin() + size(); }
0270
0271       constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; }
0272       constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; }
0273
0274       FlexiStorage<Counter, ONES> off;
0275       int32_t psws;  // prefix-scan working space
0276       FlexiStorage<index_type, SIZE> content;
0277     };
0278
0279   }  // namespace cuda
0280 }  // namespace cms
0281
0282 #endif  // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h