// File indexing completed on 2024-04-06 12:15:41
0001 #include <algorithm>
0002 #include <array>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <memory>
0007 #include <random>
0008
0009 #include <alpaka/alpaka.hpp>
0010
0011 #include "FWCore/Utilities/interface/stringize.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h"
0016
// Maximum number of distinct elements (bins) in the large associators.
constexpr uint32_t MaxElem = 64000;
// Maximum number of tracks; each track holds up to 4 element indices.
constexpr uint32_t MaxTk = 8000;
// Upper bound on stored associations: every track can contribute 4 entries.
constexpr uint32_t MaxAssocs = 4 * MaxTk;

using namespace cms::alpakatools;
using namespace ALPAKA_ACCELERATOR_NAMESPACE;

// Element -> track associator with random-access (prefix-sum) offsets.
using AssocRandomAccess = OneToManyAssocRandomAccess<uint16_t, MaxElem, MaxAssocs>;
// Element -> track associator filled sequentially in bulk.
using AssocSequential = OneToManyAssocSequential<uint16_t, MaxElem, MaxAssocs>;
// Deliberately small (128-bin) associator used to exercise the overflow path.
using SmallAssoc = OneToManyAssocSequential<uint16_t, 128, MaxAssocs>;
// Histogram of per-track multiplicities (8 bins; values counted are 2..5).
using Multiplicity = OneToManyAssocRandomAccess<uint16_t, 8, MaxTk>;
// A "track": four candidate element indices (values >= MaxElem mark an unused slot).
using TK = std::array<uint16_t, 4>;
0029
0030 namespace {
0031 template <typename T>
0032 ALPAKA_FN_HOST_ACC typename std::make_signed<T>::type toSigned(T v) {
0033 return static_cast<typename std::make_signed<T>::type>(v);
0034 }
0035 }
0036
// Count per-track multiplicities into a block-shared copy of the counters,
// then merge each block's partial histogram into the global associator with
// a single add() per block (reducing global atomic traffic vs. countMulti).
struct countMultiLocal {
  template <typename TAcc>
  ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                TK const* __restrict__ tk,
                                Multiplicity* __restrict__ assoc,
                                uint32_t n) const {
    for (auto i : uniform_elements(acc, n)) {
      // Shared-memory counters, one instance per block.
      auto& local = alpaka::declareSharedVar<Multiplicity::CountersOnly, __COUNTER__>(acc);
      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
      const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0);
      // Only the first thread of the block zeroes the shared counters.
      if (oncePerSharedMemoryAccess) {
        local.zero();
      }
      // NOTE(review): syncBlockThreads inside the uniform_elements loop relies
      // on all threads of a block executing the same number of iterations --
      // confirm this holds for the work divisions used with this kernel.
      alpaka::syncBlockThreads(acc);
      // Same multiplicity value (2..5) as countMulti, counted block-locally.
      local.count(acc, 2 + i % 4);
      alpaka::syncBlockThreads(acc);
      // First thread folds the block-local histogram into the global one.
      if (oncePerSharedMemoryAccess) {
        assoc->add(acc, local);
      }
    }
  }
};
0059
0060 struct countMulti {
0061 template <typename TAcc>
0062 ALPAKA_FN_ACC void operator()(const TAcc& acc,
0063 TK const* __restrict__ tk,
0064 Multiplicity* __restrict__ assoc,
0065 uint32_t n) const {
0066 for (auto i : uniform_elements(acc, n)) {
0067 assoc->count(acc, 2 + i % 4);
0068 }
0069 }
0070 };
0071
0072 struct verifyMulti {
0073 template <typename TAcc>
0074 ALPAKA_FN_ACC void operator()(const TAcc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const {
0075 for ([[maybe_unused]] auto i : uniform_elements(acc, Multiplicity{}.totOnes())) {
0076 ALPAKA_ASSERT_ACC(m1->off[i] == m2->off[i]);
0077 }
0078 }
0079 };
0080
0081 struct count {
0082 template <typename TAcc>
0083 ALPAKA_FN_ACC void operator()(const TAcc& acc,
0084 TK const* __restrict__ tk,
0085 AssocRandomAccess* __restrict__ assoc,
0086 uint32_t n) const {
0087 for (auto i : uniform_elements(acc, 4 * n)) {
0088 auto k = i / 4;
0089 auto j = i - 4 * k;
0090 ALPAKA_ASSERT_ACC(j < 4);
0091 if (k >= n) {
0092 return;
0093 }
0094 if (tk[k][j] < MaxElem) {
0095 assoc->count(acc, tk[k][j]);
0096 }
0097 }
0098 }
0099 };
0100
0101 struct fill {
0102 template <typename TAcc>
0103 ALPAKA_FN_ACC void operator()(const TAcc& acc,
0104 TK const* __restrict__ tk,
0105 AssocRandomAccess* __restrict__ assoc,
0106 uint32_t n) const {
0107 for (auto i : uniform_elements(acc, 4 * n)) {
0108 auto k = i / 4;
0109 auto j = i - 4 * k;
0110 ALPAKA_ASSERT_ACC(j < 4);
0111 if (k >= n) {
0112 return;
0113 }
0114 if (tk[k][j] < MaxElem) {
0115 assoc->fill(acc, tk[k][j], k);
0116 }
0117 }
0118 }
0119 };
0120
0121 struct verify {
0122 template <typename TAcc, typename Assoc>
0123 ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc* __restrict__ assoc) const {
0124 ALPAKA_ASSERT_ACC(assoc->size() < Assoc{}.capacity());
0125 }
0126 };
0127
0128 struct fillBulk {
0129 template <typename TAcc, typename Assoc>
0130 ALPAKA_FN_ACC void operator()(
0131 const TAcc& acc, AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const {
0132 for (auto k : uniform_elements(acc, n)) {
0133 auto m = tk[k][3] < MaxElem ? 4 : 3;
0134 assoc->bulkFill(acc, *apc, &tk[k][0], m);
0135 }
0136 }
0137 };
0138
0139 struct verifyBulk {
0140 template <typename TAcc, typename Assoc>
0141 ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) const {
0142 if (::toSigned(apc->get().first) >= Assoc::ctNOnes()) {
0143 printf("Overflow %d %d\n", apc->get().first, Assoc::ctNOnes());
0144 }
0145 ALPAKA_ASSERT_ACC(toSigned(assoc->size()) < Assoc::ctCapacity());
0146 }
0147 };
0148
0149 int main() {
0150
0151 auto const& devices = cms::alpakatools::devices<Platform>();
0152 if (devices.empty()) {
0153 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0154 "the test will be skipped.\n";
0155 exit(EXIT_FAILURE);
0156 }
0157
0158
0159 for (auto const& device : devices) {
0160 Queue queue(device);
0161
0162 std::cout << "OneToManyAssocRandomAccess " << sizeof(AssocRandomAccess) << " Ones=" << AssocRandomAccess{}.totOnes()
0163 << " Capacity=" << AssocRandomAccess{}.capacity() << std::endl;
0164 std::cout << "OneToManyAssocSequential " << sizeof(AssocSequential) << " Ones=" << AssocSequential{}.totOnes()
0165 << " Capacity=" << AssocSequential{}.capacity() << std::endl;
0166 std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << " Ones=" << SmallAssoc{}.totOnes()
0167 << " Capacity=" << SmallAssoc{}.capacity() << std::endl;
0168
0169 std::mt19937 eng;
0170 std::geometric_distribution<int> rdm(0.8);
0171
0172 constexpr uint32_t N = 4000;
0173
0174 auto tr = make_host_buffer<std::array<uint16_t, 4>[]>(queue, N);
0175
0176 long long ave = 0;
0177 int imax = 0;
0178 auto n = 0U;
0179 auto z = 0U;
0180 auto nz = 0U;
0181 for (auto i = 0U; i < 4U; ++i) {
0182 auto j = 0U;
0183 while (j < N && n < MaxElem) {
0184 if (z == 11) {
0185 ++n;
0186 z = 0;
0187 ++nz;
0188 continue;
0189 }
0190 auto x = rdm(eng);
0191 auto k = std::min(j + x + 1, N);
0192 if (i == 3 && z == 3) {
0193 for (; j < k; ++j)
0194 tr[j][i] = MaxElem + 1;
0195 } else {
0196 ave += x + 1;
0197 imax = std::max(imax, x);
0198 for (; j < k; ++j)
0199 tr[j][i] = n;
0200 ++n;
0201 }
0202 ++z;
0203 }
0204 ALPAKA_ASSERT_ACC(n <= MaxElem);
0205 ALPAKA_ASSERT_ACC(j <= N);
0206 }
0207 std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
0208
0209 auto v_d = make_device_buffer<std::array<uint16_t, 4>[]>(queue, N);
0210 alpaka::memcpy(queue, v_d, tr);
0211
0212 auto ara_d = make_device_buffer<AssocRandomAccess>(queue);
0213 alpaka::memset(queue, ara_d, 0);
0214
0215 const auto threadsPerBlockOrElementsPerThread = 256u;
0216 const auto blocksPerGrid4N = divide_up_by(4 * N, threadsPerBlockOrElementsPerThread);
0217 const auto workDiv4N = make_workdiv<Acc1D>(blocksPerGrid4N, threadsPerBlockOrElementsPerThread);
0218
0219 AssocRandomAccess::template launchZero<Acc1D>(ara_d.data(), queue);
0220
0221 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, count(), v_d.data(), ara_d.data(), N));
0222
0223 AssocRandomAccess::template launchFinalize<Acc1D>(ara_d.data(), queue);
0224
0225 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verify(), ara_d.data()));
0226
0227 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, fill(), v_d.data(), ara_d.data(), N));
0228
0229 auto ara_h = make_host_buffer<AssocRandomAccess>(queue);
0230 alpaka::memcpy(queue, ara_h, ara_d);
0231 alpaka::wait(queue);
0232
0233 std::cout << ara_h->size() << std::endl;
0234 imax = 0;
0235 ave = 0;
0236 z = 0;
0237 for (auto i = 0U; i < n; ++i) {
0238 auto x = ara_h->size(i);
0239 if (x == 0) {
0240 z++;
0241 continue;
0242 }
0243 ave += x;
0244 imax = std::max(imax, int(x));
0245 }
0246 ALPAKA_ASSERT_ACC(0 == ara_h->size(n));
0247 std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl;
0248
0249
0250 auto dc_d = make_device_buffer<AtomicPairCounter>(queue);
0251 alpaka::memset(queue, dc_d, 0);
0252
0253 const auto blocksPerGrid = divide_up_by(N, threadsPerBlockOrElementsPerThread);
0254 const auto workDiv = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0255
0256 auto as_d = make_device_buffer<AssocSequential>(queue);
0257 alpaka::enqueue(queue,
0258 alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), as_d.data(), N));
0259
0260 alpaka::enqueue(
0261 queue, alpaka::createTaskKernel<Acc1D>(workDiv, AssocSequential::finalizeBulk(), dc_d.data(), as_d.data()));
0262
0263 alpaka::enqueue(queue,
0264 alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), as_d.data(), dc_d.data()));
0265
0266 auto as_h = make_host_buffer<AssocSequential>(queue);
0267 alpaka::memcpy(queue, as_h, as_d);
0268
0269 auto dc_h = make_host_buffer<AtomicPairCounter>(queue);
0270 alpaka::memcpy(queue, dc_h, dc_d);
0271 alpaka::wait(queue);
0272
0273 alpaka::memset(queue, dc_d, 0);
0274 auto sa_d = make_device_buffer<SmallAssoc>(queue);
0275 alpaka::memset(queue, sa_d, 0);
0276
0277 alpaka::enqueue(queue,
0278 alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), sa_d.data(), N));
0279
0280 alpaka::enqueue(queue,
0281 alpaka::createTaskKernel<Acc1D>(workDiv, SmallAssoc::finalizeBulk(), dc_d.data(), sa_d.data()));
0282
0283 alpaka::enqueue(queue,
0284 alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), sa_d.data(), dc_d.data()));
0285
0286 std::cout << "final counter value " << dc_h->get().second << ' ' << dc_h->get().first << std::endl;
0287
0288 std::cout << as_h->size() << std::endl;
0289 imax = 0;
0290 ave = 0;
0291 for (auto i = 0U; i < N; ++i) {
0292 auto x = as_h->size(i);
0293 if (!(x == 4 || x == 3)) {
0294 std::cout << "i=" << i << " x=" << x << std::endl;
0295 }
0296 ALPAKA_ASSERT_ACC(x == 4 || x == 3);
0297 ave += x;
0298 imax = std::max(imax, int(x));
0299 }
0300 ALPAKA_ASSERT_ACC(0 == as_h->size(N));
0301 std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl;
0302
0303
0304 auto m1_d = make_device_buffer<Multiplicity>(queue);
0305 alpaka::memset(queue, m1_d, 0);
0306 auto m2_d = make_device_buffer<Multiplicity>(queue);
0307 alpaka::memset(queue, m2_d, 0);
0308
0309 Multiplicity::template launchZero<Acc1D>(m1_d.data(), queue);
0310 Multiplicity::template launchZero<Acc1D>(m2_d.data(), queue);
0311
0312 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMulti(), v_d.data(), m1_d.data(), N));
0313
0314 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMultiLocal(), v_d.data(), m2_d.data(), N));
0315
0316 const auto blocksPerGridTotBins = 1u;
0317 const auto threadsPerBlockOrElementsPerThreadTotBins = Multiplicity::ctNOnes();
0318 const auto workDivTotBins = make_workdiv<Acc1D>(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins);
0319
0320 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0321
0322 Multiplicity::launchFinalize<Acc1D>(m1_d.data(), queue);
0323 Multiplicity::launchFinalize<Acc1D>(m2_d.data(), queue);
0324
0325 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0326
0327 alpaka::wait(queue);
0328
0329 return 0;
0330 }
0331 }