File indexing completed on 2025-01-22 07:34:11
0001 #include <algorithm>
0002 #include <array>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <memory>
0007 #include <random>
0008
0009 #include <alpaka/alpaka.hpp>
0010
0011 #include "FWCore/Utilities/interface/stringize.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h"
0016
0017 constexpr uint32_t MaxElem = 64000;
0018 constexpr uint32_t MaxTk = 8000;
0019 constexpr uint32_t MaxAssocs = 4 * MaxTk;
0020
0021 using namespace cms::alpakatools;
0022 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0023
0024 using AssocRandomAccess = OneToManyAssocRandomAccess<uint16_t, MaxElem, MaxAssocs>;
0025 using AssocSequential = OneToManyAssocSequential<uint16_t, MaxElem, MaxAssocs>;
0026 using SmallAssoc = OneToManyAssocSequential<uint16_t, 128, MaxAssocs>;
0027 using Multiplicity = OneToManyAssocRandomAccess<uint16_t, 8, MaxTk>;
0028 using TK = std::array<uint16_t, 4>;
0029
0030 namespace {
0031 template <typename T>
0032 ALPAKA_FN_HOST_ACC typename std::make_signed<T>::type toSigned(T v) {
0033 return static_cast<typename std::make_signed<T>::type>(v);
0034 }
0035 }
0036
0037 struct countMultiLocal {
0038 ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0039 TK const* __restrict__ tk,
0040 Multiplicity* __restrict__ assoc,
0041 uint32_t n) const {
0042 for (auto i : uniform_elements(acc, n)) {
0043 auto& local = alpaka::declareSharedVar<Multiplicity::CountersOnly, __COUNTER__>(acc);
0044 const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
0045 const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0);
0046 if (oncePerSharedMemoryAccess) {
0047 local.zero();
0048 }
0049 alpaka::syncBlockThreads(acc);
0050 local.count(acc, 2 + i % 4);
0051 alpaka::syncBlockThreads(acc);
0052 if (oncePerSharedMemoryAccess) {
0053 assoc->add(acc, local);
0054 }
0055 }
0056 }
0057 };
0058
0059 struct countMulti {
0060 ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0061 TK const* __restrict__ tk,
0062 Multiplicity* __restrict__ assoc,
0063 uint32_t n) const {
0064 for (auto i : uniform_elements(acc, n)) {
0065 assoc->count(acc, 2 + i % 4);
0066 }
0067 }
0068 };
0069
0070 struct verifyMulti {
0071 ALPAKA_FN_ACC void operator()(Acc1D const& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const {
0072 for ([[maybe_unused]] auto i : uniform_elements(acc, Multiplicity{}.totOnes())) {
0073 ALPAKA_ASSERT_ACC(m1->off[i] == m2->off[i]);
0074 }
0075 }
0076 };
0077
0078 struct count {
0079 ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0080 TK const* __restrict__ tk,
0081 AssocRandomAccess* __restrict__ assoc,
0082 uint32_t n) const {
0083 for (auto i : uniform_elements(acc, 4 * n)) {
0084 auto k = i / 4;
0085 auto j = i - 4 * k;
0086 ALPAKA_ASSERT_ACC(j < 4);
0087 if (k >= n) {
0088 return;
0089 }
0090 if (tk[k][j] < MaxElem) {
0091 assoc->count(acc, tk[k][j]);
0092 }
0093 }
0094 }
0095 };
0096
0097 struct fill {
0098 ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0099 TK const* __restrict__ tk,
0100 AssocRandomAccess* __restrict__ assoc,
0101 uint32_t n) const {
0102 for (auto i : uniform_elements(acc, 4 * n)) {
0103 auto k = i / 4;
0104 auto j = i - 4 * k;
0105 ALPAKA_ASSERT_ACC(j < 4);
0106 if (k >= n) {
0107 return;
0108 }
0109 if (tk[k][j] < MaxElem) {
0110 assoc->fill(acc, tk[k][j], k);
0111 }
0112 }
0113 }
0114 };
0115
0116 struct verify {
0117 template <typename Assoc>
0118 ALPAKA_FN_ACC void operator()(Acc1D const& acc, Assoc* __restrict__ assoc) const {
0119 ALPAKA_ASSERT_ACC(assoc->size() < Assoc{}.capacity());
0120 }
0121 };
0122
0123 struct fillBulk {
0124 template <typename Assoc>
0125 ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0126 AtomicPairCounter* apc,
0127 TK const* __restrict__ tk,
0128 Assoc* __restrict__ assoc,
0129 uint32_t n) const {
0130 for (auto k : uniform_elements(acc, n)) {
0131 auto m = tk[k][3] < MaxElem ? 4 : 3;
0132 assoc->bulkFill(acc, *apc, &tk[k][0], m);
0133 }
0134 }
0135 };
0136
0137 struct verifyBulk {
0138 template <typename Assoc>
0139 ALPAKA_FN_ACC void operator()(Acc1D const& acc, Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) const {
0140 if (::toSigned(apc->get().first) >= Assoc::ctNOnes()) {
0141 printf("Overflow %d %d\n", apc->get().first, Assoc::ctNOnes());
0142 }
0143 ALPAKA_ASSERT_ACC(toSigned(assoc->size()) < Assoc::ctCapacity());
0144 }
0145 };
0146
0147 int main() {
0148
0149 auto const& devices = cms::alpakatools::devices<Platform>();
0150 if (devices.empty()) {
0151 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0152 "the test will be skipped.\n";
0153 exit(EXIT_FAILURE);
0154 }
0155
0156
0157 for (auto const& device : devices) {
0158 Queue queue(device);
0159
0160 std::cout << "OneToManyAssocRandomAccess " << sizeof(AssocRandomAccess) << " Ones=" << AssocRandomAccess{}.totOnes()
0161 << " Capacity=" << AssocRandomAccess{}.capacity() << std::endl;
0162 std::cout << "OneToManyAssocSequential " << sizeof(AssocSequential) << " Ones=" << AssocSequential{}.totOnes()
0163 << " Capacity=" << AssocSequential{}.capacity() << std::endl;
0164 std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << " Ones=" << SmallAssoc{}.totOnes()
0165 << " Capacity=" << SmallAssoc{}.capacity() << std::endl;
0166
0167 std::mt19937 eng;
0168 std::geometric_distribution<int> rdm(0.8);
0169
0170 constexpr uint32_t N = 4000;
0171
0172 auto tr = make_host_buffer<std::array<uint16_t, 4>[]>(queue, N);
0173
0174 long long ave = 0;
0175 int imax = 0;
0176 auto n = 0U;
0177 auto z = 0U;
0178 auto nz = 0U;
0179 for (auto i = 0U; i < 4U; ++i) {
0180 auto j = 0U;
0181 while (j < N && n < MaxElem) {
0182 if (z == 11) {
0183 ++n;
0184 z = 0;
0185 ++nz;
0186 continue;
0187 }
0188 auto x = rdm(eng);
0189 auto k = std::min(j + x + 1, N);
0190 if (i == 3 && z == 3) {
0191 for (; j < k; ++j)
0192 tr[j][i] = MaxElem + 1;
0193 } else {
0194 ave += x + 1;
0195 imax = std::max(imax, x);
0196 for (; j < k; ++j)
0197 tr[j][i] = n;
0198 ++n;
0199 }
0200 ++z;
0201 }
0202 ALPAKA_ASSERT_ACC(n <= MaxElem);
0203 ALPAKA_ASSERT_ACC(j <= N);
0204 }
0205 std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
0206
0207 auto v_d = make_device_buffer<std::array<uint16_t, 4>[]>(queue, N);
0208 alpaka::memcpy(queue, v_d, tr);
0209
0210 auto ara_d = make_device_buffer<AssocRandomAccess>(queue);
0211 alpaka::memset(queue, ara_d, 0);
0212
0213 const auto threadsPerBlockOrElementsPerThread = 256u;
0214 const auto blocksPerGrid4N = divide_up_by(4 * N, threadsPerBlockOrElementsPerThread);
0215 const auto workDiv4N = make_workdiv<Acc1D>(blocksPerGrid4N, threadsPerBlockOrElementsPerThread);
0216
0217 AssocRandomAccess::template launchZero<Acc1D>(ara_d.data(), queue);
0218
0219 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, count(), v_d.data(), ara_d.data(), N));
0220
0221 AssocRandomAccess::template launchFinalize<Acc1D>(ara_d.data(), queue);
0222
0223 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verify(), ara_d.data()));
0224
0225 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, fill(), v_d.data(), ara_d.data(), N));
0226
0227 auto ara_h = make_host_buffer<AssocRandomAccess>(queue);
0228 alpaka::memcpy(queue, ara_h, ara_d);
0229 alpaka::wait(queue);
0230
0231 std::cout << ara_h->size() << std::endl;
0232 imax = 0;
0233 ave = 0;
0234 z = 0;
0235 for (auto i = 0U; i < n; ++i) {
0236 auto x = ara_h->size(i);
0237 if (x == 0) {
0238 z++;
0239 continue;
0240 }
0241 ave += x;
0242 imax = std::max(imax, int(x));
0243 }
0244 ALPAKA_ASSERT_ACC(0 == ara_h->size(n));
0245 std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl;
0246
0247
0248 auto dc_d = make_device_buffer<AtomicPairCounter>(queue);
0249 alpaka::memset(queue, dc_d, 0);
0250
0251 const auto blocksPerGrid = divide_up_by(N, threadsPerBlockOrElementsPerThread);
0252 const auto workDiv = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0253
0254 auto as_d = make_device_buffer<AssocSequential>(queue);
0255 alpaka::enqueue(queue,
0256 alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), as_d.data(), N));
0257
0258 alpaka::enqueue(
0259 queue, alpaka::createTaskKernel<Acc1D>(workDiv, AssocSequential::finalizeBulk(), dc_d.data(), as_d.data()));
0260
0261 alpaka::enqueue(queue,
0262 alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), as_d.data(), dc_d.data()));
0263
0264 auto as_h = make_host_buffer<AssocSequential>(queue);
0265 alpaka::memcpy(queue, as_h, as_d);
0266
0267 auto dc_h = make_host_buffer<AtomicPairCounter>(queue);
0268 alpaka::memcpy(queue, dc_h, dc_d);
0269 alpaka::wait(queue);
0270
0271 alpaka::memset(queue, dc_d, 0);
0272 auto sa_d = make_device_buffer<SmallAssoc>(queue);
0273 alpaka::memset(queue, sa_d, 0);
0274
0275 alpaka::enqueue(queue,
0276 alpaka::createTaskKernel<Acc1D>(workDiv, fillBulk(), dc_d.data(), v_d.data(), sa_d.data(), N));
0277
0278 alpaka::enqueue(queue,
0279 alpaka::createTaskKernel<Acc1D>(workDiv, SmallAssoc::finalizeBulk(), dc_d.data(), sa_d.data()));
0280
0281 alpaka::enqueue(queue,
0282 alpaka::createTaskKernel<Acc1D>(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), sa_d.data(), dc_d.data()));
0283
0284 std::cout << "final counter value " << dc_h->get().second << ' ' << dc_h->get().first << std::endl;
0285
0286 std::cout << as_h->size() << std::endl;
0287 imax = 0;
0288 ave = 0;
0289 for (auto i = 0U; i < N; ++i) {
0290 auto x = as_h->size(i);
0291 if (!(x == 4 || x == 3)) {
0292 std::cout << "i=" << i << " x=" << x << std::endl;
0293 }
0294 ALPAKA_ASSERT_ACC(x == 4 || x == 3);
0295 ave += x;
0296 imax = std::max(imax, int(x));
0297 }
0298 ALPAKA_ASSERT_ACC(0 == as_h->size(N));
0299 std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl;
0300
0301
0302 auto m1_d = make_device_buffer<Multiplicity>(queue);
0303 alpaka::memset(queue, m1_d, 0);
0304 auto m2_d = make_device_buffer<Multiplicity>(queue);
0305 alpaka::memset(queue, m2_d, 0);
0306
0307 Multiplicity::template launchZero<Acc1D>(m1_d.data(), queue);
0308 Multiplicity::template launchZero<Acc1D>(m2_d.data(), queue);
0309
0310 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMulti(), v_d.data(), m1_d.data(), N));
0311
0312 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv4N, countMultiLocal(), v_d.data(), m2_d.data(), N));
0313
0314 const auto blocksPerGridTotBins = 1u;
0315 const auto threadsPerBlockOrElementsPerThreadTotBins = Multiplicity::ctNOnes();
0316 const auto workDivTotBins = make_workdiv<Acc1D>(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins);
0317
0318 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0319
0320 Multiplicity::launchFinalize<Acc1D>(m1_d.data(), queue);
0321 Multiplicity::launchFinalize<Acc1D>(m2_d.data(), queue);
0322
0323 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
0324
0325 alpaka::wait(queue);
0326
0327 return 0;
0328 }
0329 }