Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:15:41

0001 #include <algorithm>
0002 #include <array>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <memory>
0007 #include <random>
0008 
0009 #include <alpaka/alpaka.hpp>
0010 
0011 #include "FWCore/Utilities/interface/stringize.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h"
0016 
// Test sizes: up to MaxElem distinct elements and MaxTk "tracks" of four
// element indices each, hence at most 4 * MaxTk associations.
constexpr uint32_t MaxElem = 64000;
constexpr uint32_t MaxTk = 8000;
constexpr uint32_t MaxAssocs = 4 * MaxTk;

using namespace cms::alpakatools;
using namespace ALPAKA_ACCELERATOR_NAMESPACE;

// The one-to-many association flavours under test: <index type, ones, capacity>.
using AssocRandomAccess = OneToManyAssocRandomAccess<uint16_t, MaxElem, MaxAssocs>;
using AssocSequential = OneToManyAssocSequential<uint16_t, MaxElem, MaxAssocs>;
// Deliberately small "ones" count, used to exercise the overflow report in verifyBulk.
using SmallAssoc = OneToManyAssocSequential<uint16_t, 128, MaxAssocs>;
// Histogram of per-index multiplicities (8 bins; kernels below count into bins 2..5).
using Multiplicity = OneToManyAssocRandomAccess<uint16_t, 8, MaxTk>;
// A "track": four element indices; values >= MaxElem mark an absent hit.
using TK = std::array<uint16_t, 4>;
0029 
0030 namespace {
0031   template <typename T>
0032   ALPAKA_FN_HOST_ACC typename std::make_signed<T>::type toSigned(T v) {
0033     return static_cast<typename std::make_signed<T>::type>(v);
0034   }
0035 }  // namespace
0036 
struct countMultiLocal {
  // Kernel: same per-bin counting as countMulti, but first accumulates into a
  // block-shared Multiplicity::CountersOnly and then merges the block-local
  // result into the global `assoc` with a single add() per block.
  template <typename TAcc>
  ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                TK const* __restrict__ tk,
                                Multiplicity* __restrict__ assoc,
                                uint32_t n) const {
    for (auto i : uniform_elements(acc, n)) {
      // Block-shared scratch counters; __COUNTER__ gives the shared variable
      // a unique tag per use site.
      auto& local = alpaka::declareSharedVar<Multiplicity::CountersOnly, __COUNTER__>(acc);
      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
      const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0);
      // Exactly one thread per block initializes the shared counters.
      if (oncePerSharedMemoryAccess) {
        local.zero();
      }
      alpaka::syncBlockThreads(acc);
      // Each element contributes one count to bin 2 + i % 4, matching countMulti.
      local.count(acc, 2 + i % 4);
      alpaka::syncBlockThreads(acc);
      // One thread per block folds the block-local counters into the global one.
      if (oncePerSharedMemoryAccess) {
        assoc->add(acc, local);
      }
    }
  }
};
0059 
0060 struct countMulti {
0061   template <typename TAcc>
0062   ALPAKA_FN_ACC void operator()(const TAcc& acc,
0063                                 TK const* __restrict__ tk,
0064                                 Multiplicity* __restrict__ assoc,
0065                                 uint32_t n) const {
0066     for (auto i : uniform_elements(acc, n)) {
0067       assoc->count(acc, 2 + i % 4);
0068     }
0069   }
0070 };
0071 
0072 struct verifyMulti {
0073   template <typename TAcc>
0074   ALPAKA_FN_ACC void operator()(const TAcc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const {
0075     for ([[maybe_unused]] auto i : uniform_elements(acc, Multiplicity{}.totOnes())) {
0076       ALPAKA_ASSERT_ACC(m1->off[i] == m2->off[i]);
0077     }
0078   }
0079 };
0080 
0081 struct count {
0082   template <typename TAcc>
0083   ALPAKA_FN_ACC void operator()(const TAcc& acc,
0084                                 TK const* __restrict__ tk,
0085                                 AssocRandomAccess* __restrict__ assoc,
0086                                 uint32_t n) const {
0087     for (auto i : uniform_elements(acc, 4 * n)) {
0088       auto k = i / 4;
0089       auto j = i - 4 * k;
0090       ALPAKA_ASSERT_ACC(j < 4);
0091       if (k >= n) {
0092         return;
0093       }
0094       if (tk[k][j] < MaxElem) {
0095         assoc->count(acc, tk[k][j]);
0096       }
0097     }
0098   }
0099 };
0100 
0101 struct fill {
0102   template <typename TAcc>
0103   ALPAKA_FN_ACC void operator()(const TAcc& acc,
0104                                 TK const* __restrict__ tk,
0105                                 AssocRandomAccess* __restrict__ assoc,
0106                                 uint32_t n) const {
0107     for (auto i : uniform_elements(acc, 4 * n)) {
0108       auto k = i / 4;
0109       auto j = i - 4 * k;
0110       ALPAKA_ASSERT_ACC(j < 4);
0111       if (k >= n) {
0112         return;
0113       }
0114       if (tk[k][j] < MaxElem) {
0115         assoc->fill(acc, tk[k][j], k);
0116       }
0117     }
0118   }
0119 };
0120 
0121 struct verify {
0122   template <typename TAcc, typename Assoc>
0123   ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc* __restrict__ assoc) const {
0124     ALPAKA_ASSERT_ACC(assoc->size() < Assoc{}.capacity());
0125   }
0126 };
0127 
0128 struct fillBulk {
0129   template <typename TAcc, typename Assoc>
0130   ALPAKA_FN_ACC void operator()(
0131       const TAcc& acc, AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const {
0132     for (auto k : uniform_elements(acc, n)) {
0133       auto m = tk[k][3] < MaxElem ? 4 : 3;
0134       assoc->bulkFill(acc, *apc, &tk[k][0], m);
0135     }
0136   }
0137 };
0138 
0139 struct verifyBulk {
0140   template <typename TAcc, typename Assoc>
0141   ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) const {
0142     if (::toSigned(apc->get().first) >= Assoc::ctNOnes()) {
0143       printf("Overflow %d %d\n", apc->get().first, Assoc::ctNOnes());
0144     }
0145     ALPAKA_ASSERT_ACC(toSigned(assoc->size()) < Assoc::ctCapacity());
0146   }
0147 };
0148 
0149 int main() {
0150   // get the list of devices on the current platform
0151   auto const& devices = cms::alpakatools::devices<Platform>();
0152   if (devices.empty()) {
0153     std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0154       "the test will be skipped.\n";
0155     exit(EXIT_FAILURE);
0156   }
0157 
0158   // run the test on each device
0159   for (auto const& device : devices) {
0160     Queue queue(device);
0161 
0162     std::cout << "OneToManyAssocRandomAccess " << sizeof(AssocRandomAccess) << " Ones=" << AssocRandomAccess{}.totOnes()
0163               << " Capacity=" << AssocRandomAccess{}.capacity() << std::endl;
0164     std::cout << "OneToManyAssocSequential " << sizeof(AssocSequential) << " Ones=" << AssocSequential{}.totOnes()
0165               << " Capacity=" << AssocSequential{}.capacity() << std::endl;
0166     std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << " Ones=" << SmallAssoc{}.totOnes()
0167               << " Capacity=" << SmallAssoc{}.capacity() << std::endl;
0168 
0169     std::mt19937 eng;
0170     std::geometric_distribution<int> rdm(0.8);
0171 
0172     constexpr uint32_t N = 4000;
0173 
0174     auto tr = make_host_buffer<std::array<uint16_t, 4>[]>(queue, N);
0175     // fill with "index" to element
0176     long long ave = 0;
0177     int imax = 0;
0178     auto n = 0U;
0179     auto z = 0U;
0180     auto nz = 0U;
0181     for (auto i = 0U; i < 4U; ++i) {
0182       auto j = 0U;
0183       while (j < N && n < MaxElem) {
0184         if (z == 11) {
0185           ++n;
0186           z = 0;
0187           ++nz;
0188           continue;
0189         }  // a bit of not assoc
0190         auto x = rdm(eng);
0191         auto k = std::min(j + x + 1, N);
0192         if (i == 3 && z == 3) {  // some triplets time to time
0193           for (; j < k; ++j)
0194             tr[j][i] = MaxElem + 1;
0195         } else {
0196           ave += x + 1;
0197           imax = std::max(imax, x);
0198           for (; j < k; ++j)
0199             tr[j][i] = n;
0200           ++n;
0201         }
0202         ++z;
0203       }
0204       ALPAKA_ASSERT_ACC(n <= MaxElem);
0205       ALPAKA_ASSERT_ACC(j <= N);
0206     }
0207     std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
0208 
0209     auto v_d = make_device_buffer<std::array<uint16_t, 4>[]>(queue, N);
0210     alpaka::memcpy(queue, v_d, tr);
0211 
0212     auto ara_d = make_device_buffer<AssocRandomAccess>(queue);
0213     alpaka::memset(queue, ara_d, 0);
0214 
0215     const auto threadsPerBlockOrElementsPerThread = 256u;
0216     const auto blocksPerGrid4N = divide_up_by(4 * N, threadsPerBlockOrElementsPerThread);
0217     const auto workDiv4N = make_workdiv<Acc1D>(blocksPerGrid4N, threadsPerBlockOrElementsPerThread);
0218 
0219     AssocRandomAccess::template launchZero<Acc1D>(ara_d.data(), queue);
0220 
0221     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, count(), v_d.data(), ara_d.data(), N));
0222 
0223     AssocRandomAccess::template launchFinalize<Acc1D>(ara_d.data(), queue);
0224 
0225     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verify(), ara_d.data()));
0226 
0227     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, fill(), v_d.data(), ara_d.data(), N));
0228 
0229     auto ara_h = make_host_buffer<AssocRandomAccess>(queue);
0230     alpaka::memcpy(queue, ara_h, ara_d);
0231     alpaka::wait(queue);
0232 
0233     std::cout << ara_h->size() << std::endl;
0234     imax = 0;
0235     ave = 0;
0236     z = 0;
0237     for (auto i = 0U; i < n; ++i) {
0238       auto x = ara_h->size(i);
0239       if (x == 0) {
0240         z++;
0241         continue;
0242       }
0243       ave += x;
0244       imax = std::max(imax, int(x));
0245     }
0246     ALPAKA_ASSERT_ACC(0 == ara_h->size(n));
0247     std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl;
0248 
0249     // now the inverse map (actually this is the direct....)
0250     auto dc_d = make_device_buffer<AtomicPairCounter>(queue);
0251     alpaka::memset(queue, dc_d, 0);
0252 
0253     const auto blocksPerGrid = divide_up_by(N, threadsPerBlockOrElementsPerThread);
0254     const auto workDiv = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0255 
0256     auto as_d = make_device_buffer<AssocSequential>(queue);
0257     alpaka::enqueue(queue,
0258                     alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), as_d.data(), N));
0259 
0260     alpaka::enqueue(
0261         queue, alpaka::createTaskKernel<Acc1D>(workDiv, AssocSequential::finalizeBulk(), dc_d.data(), as_d.data()));
0262 
0263     alpaka::enqueue(queue,
0264                     alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), as_d.data(), dc_d.data()));
0265 
0266     auto as_h = make_host_buffer<AssocSequential>(queue);
0267     alpaka::memcpy(queue, as_h, as_d);
0268 
0269     auto dc_h = make_host_buffer<AtomicPairCounter>(queue);
0270     alpaka::memcpy(queue, dc_h, dc_d);
0271     alpaka::wait(queue);
0272 
0273     alpaka::memset(queue, dc_d, 0);
0274     auto sa_d = make_device_buffer<SmallAssoc>(queue);
0275     alpaka::memset(queue, sa_d, 0);
0276 
0277     alpaka::enqueue(queue,
0278                     alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), sa_d.data(), N));
0279 
0280     alpaka::enqueue(queue,
0281                     alpaka::createTaskKernel<Acc1D>(workDiv, SmallAssoc::finalizeBulk(), dc_d.data(), sa_d.data()));
0282 
0283     alpaka::enqueue(queue,
0284                     alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), sa_d.data(), dc_d.data()));
0285 
0286     std::cout << "final counter value " << dc_h->get().second << ' ' << dc_h->get().first << std::endl;
0287 
0288     std::cout << as_h->size() << std::endl;
0289     imax = 0;
0290     ave = 0;
0291     for (auto i = 0U; i < N; ++i) {
0292       auto x = as_h->size(i);
0293       if (!(x == 4 || x == 3)) {
0294         std::cout << "i=" << i << " x=" << x << std::endl;
0295       }
0296       ALPAKA_ASSERT_ACC(x == 4 || x == 3);
0297       ave += x;
0298       imax = std::max(imax, int(x));
0299     }
0300     ALPAKA_ASSERT_ACC(0 == as_h->size(N));
0301     std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl;
0302 
0303     // here verify use of block local counters
0304     auto m1_d = make_device_buffer<Multiplicity>(queue);
0305     alpaka::memset(queue, m1_d, 0);
0306     auto m2_d = make_device_buffer<Multiplicity>(queue);
0307     alpaka::memset(queue, m2_d, 0);
0308 
0309     Multiplicity::template launchZero<Acc1D>(m1_d.data(), queue);
0310     Multiplicity::template launchZero<Acc1D>(m2_d.data(), queue);
0311 
0312     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMulti(), v_d.data(), m1_d.data(), N));
0313 
0314     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMultiLocal(), v_d.data(), m2_d.data(), N));
0315 
0316     const auto blocksPerGridTotBins = 1u;
0317     const auto threadsPerBlockOrElementsPerThreadTotBins = Multiplicity::ctNOnes();
0318     const auto workDivTotBins = make_workdiv<Acc1D>(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins);
0319 
0320     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0321 
0322     Multiplicity::launchFinalize<Acc1D>(m1_d.data(), queue);
0323     Multiplicity::launchFinalize<Acc1D>(m2_d.data(), queue);
0324 
0325     alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0326 
0327     alpaka::wait(queue);
0328 
0329     return 0;
0330   }
0331 }