// Project CMSSW, displayed by LXR (file indexing completed on 2025-01-22 07:34:11).

0001 #include <algorithm>
0002 #include <array>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <memory>
0007 #include <random>
0008 
0009 #include <alpaka/alpaka.hpp>
0010 
0011 #include "FWCore/Utilities/interface/stringize.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h"
0016 
// Test dimensions: up to 64000 distinct elements, 8000 tracks, and at most
// four associations per track.
constexpr uint32_t MaxElem = 64000;
constexpr uint32_t MaxTk = 8000;
constexpr uint32_t MaxAssocs = 4 * MaxTk;

using namespace cms::alpakatools;
using namespace ALPAKA_ACCELERATOR_NAMESPACE;

// Element -> track associations under test, filled via the random-access
// (count/finalize/fill) and the sequential (bulkFill) strategies.
using AssocRandomAccess = OneToManyAssocRandomAccess<uint16_t, MaxElem, MaxAssocs>;
using AssocSequential = OneToManyAssocSequential<uint16_t, MaxElem, MaxAssocs>;
// Variant with only 128 "ones" — presumably small enough to trigger the
// overflow branch checked in verifyBulk (TODO confirm against OneToManyAssoc).
using SmallAssoc = OneToManyAssocSequential<uint16_t, 128, MaxAssocs>;
// Per-track multiplicity accumulator with 8 bins.
using Multiplicity = OneToManyAssocRandomAccess<uint16_t, 8, MaxTk>;
// A "track": four element indices (values >= MaxElem mark unused slots).
using TK = std::array<uint16_t, 4>;
0029 
0030 namespace {
0031   template <typename T>
0032   ALPAKA_FN_HOST_ACC typename std::make_signed<T>::type toSigned(T v) {
0033     return static_cast<typename std::make_signed<T>::type>(v);
0034   }
0035 }  // namespace
0036 
// Kernel: accumulate multiplicities in a block-shared counter first, then
// merge each block's partial counts into the global accumulator.
struct countMultiLocal {
  ALPAKA_FN_ACC void operator()(Acc1D const& acc,
                                TK const* __restrict__ tk,
                                Multiplicity* __restrict__ assoc,
                                uint32_t n) const {
    for (auto i : uniform_elements(acc, n)) {
      // Block-shared counters; __COUNTER__ provides a unique tag for the shared variable.
      auto& local = alpaka::declareSharedVar<Multiplicity::CountersOnly, __COUNTER__>(acc);
      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
      const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0);
      // Only the first thread in the block zeroes the shared counters.
      if (oncePerSharedMemoryAccess) {
        local.zero();
      }
      alpaka::syncBlockThreads(acc);
      // Each element contributes a multiplicity of 2 + i % 4 (same pattern as countMulti).
      local.count(acc, 2 + i % 4);
      alpaka::syncBlockThreads(acc);
      // First thread merges the block-local counts into the global accumulator.
      if (oncePerSharedMemoryAccess) {
        assoc->add(acc, local);
      }
    }
  }
};
0058 
0059 struct countMulti {
0060   ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0061                                 TK const* __restrict__ tk,
0062                                 Multiplicity* __restrict__ assoc,
0063                                 uint32_t n) const {
0064     for (auto i : uniform_elements(acc, n)) {
0065       assoc->count(acc, 2 + i % 4);
0066     }
0067   }
0068 };
0069 
0070 struct verifyMulti {
0071   ALPAKA_FN_ACC void operator()(Acc1D const& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const {
0072     for ([[maybe_unused]] auto i : uniform_elements(acc, Multiplicity{}.totOnes())) {
0073       ALPAKA_ASSERT_ACC(m1->off[i] == m2->off[i]);
0074     }
0075   }
0076 };
0077 
0078 struct count {
0079   ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0080                                 TK const* __restrict__ tk,
0081                                 AssocRandomAccess* __restrict__ assoc,
0082                                 uint32_t n) const {
0083     for (auto i : uniform_elements(acc, 4 * n)) {
0084       auto k = i / 4;
0085       auto j = i - 4 * k;
0086       ALPAKA_ASSERT_ACC(j < 4);
0087       if (k >= n) {
0088         return;
0089       }
0090       if (tk[k][j] < MaxElem) {
0091         assoc->count(acc, tk[k][j]);
0092       }
0093     }
0094   }
0095 };
0096 
0097 struct fill {
0098   ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0099                                 TK const* __restrict__ tk,
0100                                 AssocRandomAccess* __restrict__ assoc,
0101                                 uint32_t n) const {
0102     for (auto i : uniform_elements(acc, 4 * n)) {
0103       auto k = i / 4;
0104       auto j = i - 4 * k;
0105       ALPAKA_ASSERT_ACC(j < 4);
0106       if (k >= n) {
0107         return;
0108       }
0109       if (tk[k][j] < MaxElem) {
0110         assoc->fill(acc, tk[k][j], k);
0111       }
0112     }
0113   }
0114 };
0115 
0116 struct verify {
0117   template <typename Assoc>
0118   ALPAKA_FN_ACC void operator()(Acc1D const& acc, Assoc* __restrict__ assoc) const {
0119     ALPAKA_ASSERT_ACC(assoc->size() < Assoc{}.capacity());
0120   }
0121 };
0122 
0123 struct fillBulk {
0124   template <typename Assoc>
0125   ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0126                                 AtomicPairCounter* apc,
0127                                 TK const* __restrict__ tk,
0128                                 Assoc* __restrict__ assoc,
0129                                 uint32_t n) const {
0130     for (auto k : uniform_elements(acc, n)) {
0131       auto m = tk[k][3] < MaxElem ? 4 : 3;
0132       assoc->bulkFill(acc, *apc, &tk[k][0], m);
0133     }
0134   }
0135 };
0136 
0137 struct verifyBulk {
0138   template <typename Assoc>
0139   ALPAKA_FN_ACC void operator()(Acc1D const& acc, Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) const {
0140     if (::toSigned(apc->get().first) >= Assoc::ctNOnes()) {
0141       printf("Overflow %d %d\n", apc->get().first, Assoc::ctNOnes());
0142     }
0143     ALPAKA_ASSERT_ACC(toSigned(assoc->size()) < Assoc::ctCapacity());
0144   }
0145 };
0146 
0147 int main() {
0148   // get the list of devices on the current platform
0149   auto const& devices = cms::alpakatools::devices<Platform>();
0150   if (devices.empty()) {
0151     std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0152       "the test will be skipped.\n";
0153     exit(EXIT_FAILURE);
0154   }
0155 
0156   // run the test on each device
0157   for (auto const& device : devices) {
0158     Queue queue(device);
0159 
0160     std::cout << "OneToManyAssocRandomAccess " << sizeof(AssocRandomAccess) << " Ones=" << AssocRandomAccess{}.totOnes()
0161               << " Capacity=" << AssocRandomAccess{}.capacity() << std::endl;
0162     std::cout << "OneToManyAssocSequential " << sizeof(AssocSequential) << " Ones=" << AssocSequential{}.totOnes()
0163               << " Capacity=" << AssocSequential{}.capacity() << std::endl;
0164     std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << " Ones=" << SmallAssoc{}.totOnes()
0165               << " Capacity=" << SmallAssoc{}.capacity() << std::endl;
0166 
0167     std::mt19937 eng;
0168     std::geometric_distribution<int> rdm(0.8);
0169 
0170     constexpr uint32_t N = 4000;
0171 
0172     auto tr = make_host_buffer<std::array<uint16_t, 4>[]>(queue, N);
0173     // fill with "index" to element
0174     long long ave = 0;
0175     int imax = 0;
0176     auto n = 0U;
0177     auto z = 0U;
0178     auto nz = 0U;
0179     for (auto i = 0U; i < 4U; ++i) {
0180       auto j = 0U;
0181       while (j < N && n < MaxElem) {
0182         if (z == 11) {
0183           ++n;
0184           z = 0;
0185           ++nz;
0186           continue;
0187         }  // a bit of not assoc
0188         auto x = rdm(eng);
0189         auto k = std::min(j + x + 1, N);
0190         if (i == 3 && z == 3) {  // some triplets time to time
0191           for (; j < k; ++j)
0192             tr[j][i] = MaxElem + 1;
0193         } else {
0194           ave += x + 1;
0195           imax = std::max(imax, x);
0196           for (; j < k; ++j)
0197             tr[j][i] = n;
0198           ++n;
0199         }
0200         ++z;
0201       }
0202       ALPAKA_ASSERT_ACC(n <= MaxElem);
0203       ALPAKA_ASSERT_ACC(j <= N);
0204     }
0205     std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
0206 
0207     auto v_d = make_device_buffer<std::array<uint16_t, 4>[]>(queue, N);
0208     alpaka::memcpy(queue, v_d, tr);
0209 
0210     auto ara_d = make_device_buffer<AssocRandomAccess>(queue);
0211     alpaka::memset(queue, ara_d, 0);
0212 
0213     const auto threadsPerBlockOrElementsPerThread = 256u;
0214     const auto blocksPerGrid4N = divide_up_by(4 * N, threadsPerBlockOrElementsPerThread);
0215     const auto workDiv4N = make_workdiv<Acc1D>(blocksPerGrid4N, threadsPerBlockOrElementsPerThread);
0216 
0217     AssocRandomAccess::template launchZero<Acc1D>(ara_d.data(), queue);
0218 
0219     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, count(), v_d.data(), ara_d.data(), N));
0220 
0221     AssocRandomAccess::template launchFinalize<Acc1D>(ara_d.data(), queue);
0222 
0223     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verify(), ara_d.data()));
0224 
0225     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, fill(), v_d.data(), ara_d.data(), N));
0226 
0227     auto ara_h = make_host_buffer<AssocRandomAccess>(queue);
0228     alpaka::memcpy(queue, ara_h, ara_d);
0229     alpaka::wait(queue);
0230 
0231     std::cout << ara_h->size() << std::endl;
0232     imax = 0;
0233     ave = 0;
0234     z = 0;
0235     for (auto i = 0U; i < n; ++i) {
0236       auto x = ara_h->size(i);
0237       if (x == 0) {
0238         z++;
0239         continue;
0240       }
0241       ave += x;
0242       imax = std::max(imax, int(x));
0243     }
0244     ALPAKA_ASSERT_ACC(0 == ara_h->size(n));
0245     std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl;
0246 
0247     // now the inverse map (actually this is the direct....)
0248     auto dc_d = make_device_buffer<AtomicPairCounter>(queue);
0249     alpaka::memset(queue, dc_d, 0);
0250 
0251     const auto blocksPerGrid = divide_up_by(N, threadsPerBlockOrElementsPerThread);
0252     const auto workDiv = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0253 
0254     auto as_d = make_device_buffer<AssocSequential>(queue);
0255     alpaka::enqueue(queue,
0256                     alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), as_d.data(), N));
0257 
0258     alpaka::enqueue(
0259         queue, alpaka::createTaskKernel<Acc1D>(workDiv, AssocSequential::finalizeBulk(), dc_d.data(), as_d.data()));
0260 
0261     alpaka::enqueue(queue,
0262                     alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), as_d.data(), dc_d.data()));
0263 
0264     auto as_h = make_host_buffer<AssocSequential>(queue);
0265     alpaka::memcpy(queue, as_h, as_d);
0266 
0267     auto dc_h = make_host_buffer<AtomicPairCounter>(queue);
0268     alpaka::memcpy(queue, dc_h, dc_d);
0269     alpaka::wait(queue);
0270 
0271     alpaka::memset(queue, dc_d, 0);
0272     auto sa_d = make_device_buffer<SmallAssoc>(queue);
0273     alpaka::memset(queue, sa_d, 0);
0274 
0275     alpaka::enqueue(queue,
0276                     alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), sa_d.data(), N));
0277 
0278     alpaka::enqueue(queue,
0279                     alpaka::createTaskKernel<Acc1D>(workDiv, SmallAssoc::finalizeBulk(), dc_d.data(), sa_d.data()));
0280 
0281     alpaka::enqueue(queue,
0282                     alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), sa_d.data(), dc_d.data()));
0283 
0284     std::cout << "final counter value " << dc_h->get().second << ' ' << dc_h->get().first << std::endl;
0285 
0286     std::cout << as_h->size() << std::endl;
0287     imax = 0;
0288     ave = 0;
0289     for (auto i = 0U; i < N; ++i) {
0290       auto x = as_h->size(i);
0291       if (!(x == 4 || x == 3)) {
0292         std::cout << "i=" << i << " x=" << x << std::endl;
0293       }
0294       ALPAKA_ASSERT_ACC(x == 4 || x == 3);
0295       ave += x;
0296       imax = std::max(imax, int(x));
0297     }
0298     ALPAKA_ASSERT_ACC(0 == as_h->size(N));
0299     std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl;
0300 
0301     // here verify use of block local counters
0302     auto m1_d = make_device_buffer<Multiplicity>(queue);
0303     alpaka::memset(queue, m1_d, 0);
0304     auto m2_d = make_device_buffer<Multiplicity>(queue);
0305     alpaka::memset(queue, m2_d, 0);
0306 
0307     Multiplicity::template launchZero<Acc1D>(m1_d.data(), queue);
0308     Multiplicity::template launchZero<Acc1D>(m2_d.data(), queue);
0309 
0310     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMulti(), v_d.data(), m1_d.data(), N));
0311 
0312     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMultiLocal(), v_d.data(), m2_d.data(), N));
0313 
0314     const auto blocksPerGridTotBins = 1u;
0315     const auto threadsPerBlockOrElementsPerThreadTotBins = Multiplicity::ctNOnes();
0316     const auto workDivTotBins = make_workdiv<Acc1D>(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins);
0317 
0318     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0319 
0320     Multiplicity::launchFinalize<Acc1D>(m1_d.data(), queue);
0321     Multiplicity::launchFinalize<Acc1D>(m2_d.data(), queue);
0322 
0323     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0324 
0325     alpaka::wait(queue);
0326 
0327     return 0;
0328   }
0329 }