AlpakaInterface/interface/OneToManyAssoc.h

0001 #ifndef HeterogeneousCore_AlpakaInterface_interface_OneToManyAssoc_h
0002 #define HeterogeneousCore_AlpakaInterface_interface_OneToManyAssoc_h
0003
0004 #include <algorithm>
0005 #include <cstddef>
0006 #include <cstdint>
0007 #include <type_traits>
0008
0009 #include <alpaka/alpaka.hpp>
0010
0011 #include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0015
0016 namespace cms::alpakatools {
0017
0018   template <typename I,    // type stored in the container (usually an index in a vector of the input values)
0019             int32_t ONES,  // number of "Ones"  +1. If -1 is initialized at runtime using external storage
0020             int32_t SIZE   // max number of element. If -1 is initialized at runtime using external storage
0021             >
0022   class OneToManyAssocBase {
0023   public:
0024     using Counter = uint32_t;
0025
0026     using CountersOnly = OneToManyAssocBase<I, ONES, 0>;
0027
0028     using index_type = I;
0029
0030     struct View {
0031       OneToManyAssocBase *assoc = nullptr;
0032       Counter *offStorage = nullptr;
0033       index_type *contentStorage = nullptr;
0034       int32_t offSize = -1;
0035       int32_t contentSize = -1;
0036     };
0037
0038     static constexpr int32_t ctNOnes() { return ONES; }
0039     constexpr auto totOnes() const { return off.capacity(); }
0040     constexpr auto nOnes() const { return totOnes() - 1; }
0041     static constexpr int32_t ctCapacity() { return SIZE; }
0042     constexpr auto capacity() const { return content.capacity(); }
0043
0044     ALPAKA_FN_HOST_ACC void initStorage(View view) {
0045       ALPAKA_ASSERT_ACC(view.assoc == this);
0046       if constexpr (ctCapacity() < 0) {
0047         ALPAKA_ASSERT_ACC(view.contentStorage);
0048         ALPAKA_ASSERT_ACC(view.contentSize > 0);
0049         content.init(view.contentStorage, view.contentSize);
0050       }
0051       if constexpr (ctNOnes() < 0) {
0052         ALPAKA_ASSERT_ACC(view.offStorage);
0053         ALPAKA_ASSERT_ACC(view.offSize > 0);
0054         off.init(view.offStorage, view.offSize);
0055       }
0056     }
0057
0058     ALPAKA_FN_HOST_ACC void zero() {
0059       for (int32_t i = 0; i < totOnes(); ++i) {
0060         off[i] = 0;
0061       }
0062     }
0063
0064     template <typename TAcc>
0065     ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const TAcc &acc, CountersOnly const &co) {
0066       for (uint32_t i = 0; static_cast<int>(i) < totOnes(); ++i) {
0067         alpaka::atomicAdd(acc, off.data() + i, co.off[i], alpaka::hierarchy::Blocks{});
0068       }
0069     }
0070
0071     template <typename TAcc>
0072     ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicIncrement(const TAcc &acc, Counter &x) {
0073       return alpaka::atomicAdd(acc, &x, 1u, alpaka::hierarchy::Blocks{});
0074     }
0075
0076     template <typename TAcc>
0077     ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicDecrement(const TAcc &acc, Counter &x) {
0078       return alpaka::atomicSub(acc, &x, 1u, alpaka::hierarchy::Blocks{});
0079     }
0080
0081     template <typename TAcc>
0082     ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, I b) {
0083       ALPAKA_ASSERT_ACC(b < static_cast<uint32_t>(nOnes()));
0084       atomicIncrement(acc, off[b]);
0085     }
0086
0087     template <typename TAcc>
0088     ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, I b, index_type j) {
0089       ALPAKA_ASSERT_ACC(b < static_cast<uint32_t>(nOnes()));
0090       auto w = atomicDecrement(acc, off[b]);
0091       ALPAKA_ASSERT_ACC(w > 0);
0092       content[w - 1] = j;
0093     }
0094
0095     // this MUST BE DONE in a single block (or in two kernels!)
0096     struct zeroAndInit {
0097       template <typename TAcc>
0098       ALPAKA_FN_ACC void operator()(const TAcc &acc, View view) const {
0099         ALPAKA_ASSERT_ACC((1 == alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0]));
0100         ALPAKA_ASSERT_ACC((0 == alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]));
0101         auto h = view.assoc;
0102         if (cms::alpakatools::once_per_block(acc)) {
0103           h->psws = 0;
0104           h->initStorage(view);
0105         }
0106         alpaka::syncBlockThreads(acc);
0107         for (int i : cms::alpakatools::independent_group_elements(acc, h->totOnes())) {
0108           h->off[i] = 0;
0109         }
0110       }
0111     };
0112
0113     template <typename TAcc, typename TQueue>
0114     ALPAKA_FN_INLINE static void launchZero(OneToManyAssocBase *h, TQueue &queue) {
0115       View view = {h, nullptr, nullptr, -1, -1};
0116       launchZero<TAcc>(view, queue);
0117     }
0118
0119     template <typename TAcc, typename TQueue>
0120     ALPAKA_FN_INLINE static void launchZero(View view, TQueue &queue) {
0121       if constexpr (ctCapacity() < 0) {
0122         ALPAKA_ASSERT_ACC(view.contentStorage);
0123         ALPAKA_ASSERT_ACC(view.contentSize > 0);
0124       }
0125       if constexpr (ctNOnes() < 0) {
0126         ALPAKA_ASSERT_ACC(view.offStorage);
0127         ALPAKA_ASSERT_ACC(view.offSize > 0);
0128       }
0129       if constexpr (!requires_single_thread_per_block_v<TAcc>) {
0130         auto nthreads = 1024;
0131         auto nblocks = 1;  // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels);
0132         auto workDiv = cms::alpakatools::make_workdiv<TAcc>(nblocks, nthreads);
0133         alpaka::exec<TAcc>(queue, workDiv, zeroAndInit{}, view);
0134       } else {
0135         auto h = view.assoc;
0136         ALPAKA_ASSERT_ACC(h);
0137         h->initStorage(view);
0138         h->zero();
0139         h->psws = 0;
0140       }
0141     }
0142
0143     constexpr auto size() const { return uint32_t(off[totOnes() - 1]); }
0144     constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }
0145
0146     constexpr index_type const *begin() const { return content.data(); }
0147     constexpr index_type const *end() const { return begin() + size(); }
0148
0149     constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; }
0150     constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; }
0151
0152     FlexiStorage<Counter, ONES> off;
0153     FlexiStorage<index_type, SIZE> content;
0154     int32_t psws;  // prefix-scan working space
0155   };
0156
0157   template <typename I,    // type stored in the container (usually an index in a vector of the input values)
0158             int32_t ONES,  // number of "Ones"  +1. If -1 is initialized at runtime using external storage
0159             int32_t SIZE   // max number of element. If -1 is initialized at runtime using external storage
0160             >
0161   class OneToManyAssocSequential : public OneToManyAssocBase<I, ONES, SIZE> {
0162   public:
0163     using index_type = typename OneToManyAssocBase<I, ONES, SIZE>::index_type;
0164
0165     template <typename TAcc>
0166     ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int32_t
0167     bulkFill(const TAcc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) {
0168       auto c = apc.inc_add(acc, n);
0169       if (int(c.first) >= this->nOnes())
0170         return -int32_t(c.first);
0171       this->off[c.first] = c.second;
0172       for (uint32_t j = 0; j < n; ++j)
0173         this->content[c.second + j] = v[j];
0174       return c.first;
0175     }
0176
0177     ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalize(AtomicPairCounter const &apc) {
0178       this->off[apc.get().first] = apc.get().second;
0179     }
0180
0181     template <typename TAcc>
0182     ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(TAcc &acc, AtomicPairCounter const &apc) {
0183       int f = apc.get().first;
0184       auto s = apc.get().second;
0185       if (f >= this->nOnes()) {  // overflow!
0186         this->off[this->nOnes()] = uint32_t(this->off[this->nOnes() - 1]);
0187         return;
0188       }
0189       auto first = f + alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
0190       for (int i = first; i < this->totOnes(); i += alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0]) {
0191         this->off[i] = s;
0192       }
0193     }
0194
0195     struct finalizeBulk {
0196       template <typename TAcc>
0197       ALPAKA_FN_ACC void operator()(const TAcc &acc,
0198                                     AtomicPairCounter const *apc,
0199                                     OneToManyAssocSequential *__restrict__ assoc) const {
0200         assoc->bulkFinalizeFill(acc, *apc);
0201       }
0202     };
0203   };
0204
0205   template <typename I,    // type stored in the container (usually an index in a vector of the input values)
0206             int32_t ONES,  // number of "Ones"  +1. If -1 is initialized at runtime using external storage
0207             int32_t SIZE   // max number of element. If -1 is initialized at runtime using external storage
0208             >
0209   class OneToManyAssocRandomAccess : public OneToManyAssocBase<I, ONES, SIZE> {
0210   public:
0211     using Counter = typename OneToManyAssocBase<I, ONES, SIZE>::Counter;
0212     using View = typename OneToManyAssocBase<I, ONES, SIZE>::View;
0213
0214     template <typename TAcc>
0215     ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize(TAcc &acc, Counter *ws = nullptr) {
0216       ALPAKA_ASSERT_ACC(this->off[this->totOnes() - 1] == 0);
0217       blockPrefixScan(acc, this->off.data(), this->totOnes(), ws);
0218       ALPAKA_ASSERT_ACC(this->off[this->totOnes() - 1] == this->off[this->totOnes() - 2]);
0219     }
0220
0221     ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize() {
0222       // Single thread finalize.
0223       for (uint32_t i = 1; static_cast<int>(i) < this->totOnes(); ++i)
0224         this->off[i] += this->off[i - 1];
0225     }
0226
0227     template <typename TAcc, typename TQueue>
0228     ALPAKA_FN_INLINE static void launchFinalize(OneToManyAssocRandomAccess *h, TQueue &queue) {
0229       View view = {h, nullptr, nullptr, -1, -1};
0230       launchFinalize<TAcc>(view, queue);
0231     }
0232
0233     template <typename TAcc, typename TQueue>
0234     ALPAKA_FN_INLINE static void launchFinalize(View view, TQueue &queue) {
0235       // View stores a base pointer, we need to upcast back...
0236       auto h = static_cast<OneToManyAssocRandomAccess *>(view.assoc);
0237       ALPAKA_ASSERT_ACC(h);
0238       if constexpr (!requires_single_thread_per_block_v<TAcc>) {
0239         Counter *poff = (Counter *)((char *)(h) + offsetof(OneToManyAssocRandomAccess, off));
0240         auto nOnes = OneToManyAssocRandomAccess::ctNOnes();
0241         if constexpr (OneToManyAssocRandomAccess::ctNOnes() < 0) {
0242           ALPAKA_ASSERT_ACC(view.offStorage);
0243           ALPAKA_ASSERT_ACC(view.offSize > 0);
0244           nOnes = view.offSize;
0245           poff = view.offStorage;
0246         }
0247         ALPAKA_ASSERT_ACC(nOnes > 0);
0248         int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(OneToManyAssocRandomAccess, psws));
0249         auto nthreads = 1024;
0250         auto nblocks = (nOnes + nthreads - 1) / nthreads;
0251         auto workDiv = cms::alpakatools::make_workdiv<TAcc>(nblocks, nthreads);
0252         alpaka::exec<TAcc>(queue,
0253                            workDiv,
0254                            multiBlockPrefixScan<Counter>(),
0255                            poff,
0256                            poff,
0257                            nOnes,
0258                            nblocks,
0259                            ppsws,
0260                            alpaka::getPreferredWarpSize(alpaka::getDev(queue)));
0261       } else {
0262         h->finalize();
0263       }
0264     }
0265   };
0266
0267 }  // namespace cms::alpakatools
0268
0269 #endif  // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h