#ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
#define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h

#include <algorithm>
#ifndef __CUDA_ARCH__
#include <atomic>
#endif
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h"

namespace cms {
  namespace cuda {

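    // View bundling an association with (optional) external storage for its
    // offsets and content. The negative default sizes mark the compile-time
    // (FlexiStorage) case, where no external storage is needed.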
    template <typename Assoc>
    struct OneToManyAssocView {
      using Counter = typename Assoc::Counter;
      using index_type = typename Assoc::index_type;

      Assoc *assoc = nullptr;
      Counter *offStorage = nullptr;
      index_type *contentStorage = nullptr;
      int32_t offSize = -1;
      int32_t contentSize = -1;
    };

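    // Single-block kernel: thread 0 attaches the external storage (if any),
    // then all threads cooperatively zero the counters.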
    template <typename Assoc>
    __global__ void zeroAndInit(OneToManyAssocView<Assoc> view) {
      auto h = view.assoc;
      assert(1 == gridDim.x);
      assert(0 == blockIdx.x);

      int first = threadIdx.x;

      if (0 == first) {
        h->psws = 0;
        h->initStorage(view);
      }
      __syncthreads();
      for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) {
        h->off[i] = 0;
      }
    }

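    // Zero the association: on the device when compiled with nvcc, otherwise
    // on the host. The stream argument is defaulted only in host-only
    // (non-nvcc) builds, where it is unused.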
    template <typename Assoc>
    inline __attribute__((always_inline)) void launchZero(Assoc *h,
                                                          cudaStream_t stream
#ifndef __CUDACC__
                                                          = cudaStreamDefault
#endif
    ) {
      typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
      launchZero(view, stream);
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchZero(OneToManyAssocView<Assoc> view,
                                                          cudaStream_t stream
#ifndef __CUDACC__
                                                          = cudaStreamDefault
#endif
    ) {
      if constexpr (Assoc::ctCapacity() < 0) {
        assert(view.contentStorage);
        assert(view.contentSize > 0);
      }
      if constexpr (Assoc::ctNOnes() < 0) {
        assert(view.offStorage);
        assert(view.offSize > 0);
      }
#ifdef __CUDACC__
      auto nthreads = 1024;
      auto nblocks = 1;
      zeroAndInit<<<nblocks, nthreads, 0, stream>>>(view);
      cudaCheck(cudaGetLastError());
#else
      auto h = view.assoc;
      assert(h);
      h->initStorage(view);
      h->zero();
      h->psws = 0;
#endif
    }

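    // Replace the per-bin counts with cumulative offsets (prefix scan): on
    // the device when compiled with nvcc, otherwise on the host.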
    template <typename Assoc>
    inline __attribute__((always_inline)) void launchFinalize(Assoc *h,
                                                              cudaStream_t stream
#ifndef __CUDACC__
                                                              = cudaStreamDefault
#endif
    ) {
      typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
      launchFinalize(view, stream);
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView<Assoc> view,
                                                              cudaStream_t stream
#ifndef __CUDACC__
                                                              = cudaStreamDefault
#endif
    ) {
      auto h = view.assoc;
      assert(h);
#ifdef __CUDACC__
      using Counter = typename Assoc::Counter;
      Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off));
      auto nOnes = Assoc::ctNOnes();
      if constexpr (Assoc::ctNOnes() < 0) {
        assert(view.offStorage);
        assert(view.offSize > 0);
        nOnes = view.offSize;
        poff = view.offStorage;
      }
      assert(nOnes > 0);
      int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws));
      auto nthreads = 1024;
      auto nblocks = (nOnes + nthreads - 1) / nthreads;
      multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(poff, poff, nOnes, ppsws);
      cudaCheck(cudaGetLastError());
#else
      h->finalize();
#endif
    }

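    // Kernel wrapper around bulkFinalizeFill() for the bulk-fill path.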
    template <typename Assoc>
    __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
      assoc->bulkFinalizeFill(*apc);
    }

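    // Associates "ones" (bins) with "manys" (indices stored in content).
    // I is the stored index type; ONES is the number of offset slots
    // (bins + 1, the extra slot holding the total size); SIZE is the content
    // capacity. A negative ONES or SIZE selects runtime sizing through
    // external storage supplied via the View.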
    template <typename I,
              int32_t ONES,
              int32_t SIZE>
    class OneToManyAssoc {
    public:
      using View = OneToManyAssocView<OneToManyAssoc<I, ONES, SIZE>>;
      using Counter = uint32_t;

      using CountersOnly = OneToManyAssoc<I, ONES, 0>;

      using index_type = I;

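      // Branchless integer log2 (position of the highest set bit), computed
      // with the classic mask-and-shift bit trick.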
      static constexpr uint32_t ilog2(uint32_t v) {
        constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
        constexpr uint32_t s[] = {1, 2, 4, 8, 16};

        uint32_t r = 0;
        for (auto i = 4; i >= 0; i--)
          if (v & b[i]) {
            v >>= s[i];
            r |= s[i];
          }
        return r;
      }

      static constexpr int32_t ctNOnes() { return ONES; }
      constexpr auto totOnes() const { return off.capacity(); }
      constexpr auto nOnes() const { return totOnes() - 1; }
      static constexpr int32_t ctCapacity() { return SIZE; }
      constexpr auto capacity() const { return content.capacity(); }

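      // Attach the external storage carried by the view; only the
      // runtime-sized (negative ONES/SIZE) variants need it.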
      __host__ __device__ void initStorage(View view) {
        assert(view.assoc == this);
        if constexpr (ctCapacity() < 0) {
          assert(view.contentStorage);
          assert(view.contentSize > 0);
          content.init(view.contentStorage, view.contentSize);
        }
        if constexpr (ctNOnes() < 0) {
          assert(view.offStorage);
          assert(view.offSize > 0);
          off.init(view.offStorage, view.offSize);
        }
      }

      __host__ __device__ void zero() {
        for (int32_t i = 0; i < totOnes(); ++i) {
          off[i] = 0;
        }
      }

      __host__ __device__ __forceinline__ void add(CountersOnly const &co) {
        for (int32_t i = 0; i < totOnes(); ++i) {
#ifdef __CUDA_ARCH__
          atomicAdd(off.data() + i, co.off[i]);
#else
          auto &a = (std::atomic<Counter> &)(off[i]);
          a += co.off[i];
#endif
        }
      }

      static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
#ifdef __CUDA_ARCH__
        return atomicAdd(&x, 1);
#else
        auto &a = (std::atomic<Counter> &)(x);
        return a++;
#endif
      }

      static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
#ifdef __CUDA_ARCH__
        return atomicSub(&x, 1);
#else
        auto &a = (std::atomic<Counter> &)(x);
        return a--;
#endif
      }

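      // Two-pass filling protocol: call count(b) for every (bin, index) pair,
      // run finalize()/launchFinalize() to turn the counts into cumulative
      // offsets, then call fill(b, j) for the same pairs. fill() decrements
      // the offset before writing, so after the second pass off[b] once again
      // points at the first element of bin b.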
      __host__ __device__ __forceinline__ void count(int32_t b) {
        assert(b < nOnes());
        atomicIncrement(off[b]);
      }

      __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) {
        assert(b < nOnes());
        auto w = atomicDecrement(off[b]);
        assert(w > 0);
        content[w - 1] = j;
      }

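      // One-pass bulk alternative: bulkFill() appends a whole block of indices
      // under the next free bin, using the AtomicPairCounter to allocate both
      // the bin and the content range; a negative return value signals bin
      // overflow.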
      __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
        auto c = apc.add(n);
        if (int(c.m) >= nOnes())
          return -int32_t(c.m);
        off[c.m] = c.n;
        for (uint32_t j = 0; j < n; ++j)
          content[c.n + j] = v[j];
        return c.m;
      }

      __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
        off[apc.get().m] = apc.get().n;
      }

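      // Grid-strided variant of bulkFinalize(): pads all trailing unused
      // offsets with the total size; on bin overflow it only closes the last
      // valid bin.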
      __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
        int m = apc.get().m;
        auto n = apc.get().n;
        if (m >= nOnes()) {
          off[nOnes()] = uint32_t(off[nOnes() - 1]);
          return;
        }
        auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
        for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) {
          off[i] = n;
        }
      }

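      // In-place inclusive prefix scan turning counts into cumulative
      // offsets; ws is the optional workspace used by the device
      // implementation of blockPrefixScan.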
      __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
        assert(off[totOnes() - 1] == 0);
        blockPrefixScan(off.data(), totOnes(), ws);
        assert(off[totOnes() - 1] == off[totOnes() - 2]);
      }

      constexpr auto size() const { return uint32_t(off[totOnes() - 1]); }
      constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }

      constexpr index_type const *begin() const { return content.data(); }
      constexpr index_type const *end() const { return begin() + size(); }

      constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; }
      constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; }

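      // off holds one cumulative offset per bin plus a final slot with the
      // total size; psws is scratch space for the multi-block prefix scan.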
      FlexiStorage<Counter, ONES> off;
      int32_t psws;
      FlexiStorage<index_type, SIZE> content;
    };
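
    // Minimal usage sketch (illustrative only; the compile-time sizes and the
    // filling loops are assumptions, not part of this header):
    //
    //   using Assoc = cms::cuda::OneToManyAssoc<uint16_t, 1024 + 1, 8192>;
    //   Assoc assoc;
    //   launchZero(&assoc, stream);
    //   // pass 1: assoc.count(b) for every (bin b, index j) pair
    //   launchFinalize(&assoc, stream);
    //   // pass 2: assoc.fill(b, j) for the same pairs
    //   // afterwards assoc.begin(b) .. assoc.end(b) spans the indices
    //   // filed under bin b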

  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h