File indexing completed on 2025-01-22 07:34:11
0001 #include <algorithm>
0002 #include <iostream>
0003 #include <limits>
0004 #include <random>
0005
0006 #include <alpaka/alpaka.hpp>
0007
0008 #include "FWCore/Utilities/interface/stringize.h"
0009 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0010 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
0013
0014 using namespace cms::alpakatools;
0015 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0016
0017 template <int NBINS, int S, int DELTA>
0018 struct mykernel {
0019 template <typename T>
0020 ALPAKA_FN_ACC void operator()(Acc1D const& acc, T const* __restrict__ v, uint32_t N) const {
0021 ALPAKA_ASSERT_ACC(v);
0022 ALPAKA_ASSERT_ACC(N == 12000);
0023
0024 if (once_per_block(acc)) {
0025 printf("start kernel for %d data\n", N);
0026 }
0027
0028 using Hist = HistoContainer<T, NBINS, 12000, S, uint16_t>;
0029
0030 auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
0031 auto& ws = alpaka::declareSharedVar<typename Hist::Counter[32], __COUNTER__>(acc);
0032
0033
0034 for (auto j : uniform_elements(acc, Hist::totbins())) {
0035 hist.off[j] = 0;
0036 }
0037 alpaka::syncBlockThreads(acc);
0038
0039
0040 for (auto j : uniform_elements(acc, Hist::totbins())) {
0041 hist.content[j] = 0;
0042 }
0043 alpaka::syncBlockThreads(acc);
0044
0045
0046 for (auto j : uniform_elements(acc, N)) {
0047 hist.count(acc, v[j]);
0048 }
0049 alpaka::syncBlockThreads(acc);
0050
0051 ALPAKA_ASSERT_ACC(0 == hist.size());
0052 alpaka::syncBlockThreads(acc);
0053
0054
0055 hist.finalize(acc, ws);
0056 alpaka::syncBlockThreads(acc);
0057
0058 ALPAKA_ASSERT_ACC(N == hist.size());
0059
0060
0061 for ([[maybe_unused]] auto j : uniform_elements(acc, Hist::nbins())) {
0062 ALPAKA_ASSERT_ACC(hist.off[j] <= hist.off[j + 1]);
0063 }
0064 alpaka::syncBlockThreads(acc);
0065
0066 for (auto j : uniform_elements(acc, 32)) {
0067 ws[j] = 0;
0068 }
0069 alpaka::syncBlockThreads(acc);
0070
0071
0072 for (auto j : uniform_elements(acc, N)) {
0073 hist.fill(acc, v[j], j);
0074 }
0075 alpaka::syncBlockThreads(acc);
0076
0077 ALPAKA_ASSERT_ACC(0 == hist.off[0]);
0078 ALPAKA_ASSERT_ACC(N == hist.size());
0079
0080
0081 #ifndef NDEBUG
0082 for (auto j : uniform_elements(acc, hist.size() - 1)) {
0083 auto p = hist.begin() + j;
0084 ALPAKA_ASSERT_ACC((*p) < N);
0085 [[maybe_unused]] auto k1 = Hist::bin(v[*p]);
0086 [[maybe_unused]] auto k2 = Hist::bin(v[*(p + 1)]);
0087 ALPAKA_ASSERT_ACC(k2 >= k1);
0088 }
0089 #endif
0090
0091
0092 for (auto i : uniform_elements(acc, hist.size())) {
0093 auto p = hist.begin() + i;
0094 auto j = *p;
0095 #ifndef NDEBUG
0096 auto b0 = Hist::bin(v[j]);
0097 #endif
0098 [[maybe_unused]] int tot = 0;
0099 auto ftest = [&](unsigned int k) {
0100 ALPAKA_ASSERT_ACC(k < N);
0101 ++tot;
0102 };
0103 forEachInWindow(hist, v[j], v[j], ftest);
0104 #ifndef NDEBUG
0105 [[maybe_unused]] int rtot = hist.size(b0);
0106 ALPAKA_ASSERT_ACC(tot == rtot);
0107 #endif
0108 tot = 0;
0109 auto vm = int(v[j]) - DELTA;
0110 auto vp = int(v[j]) + DELTA;
0111 constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits<T>::max();
0112 vm = std::max(vm, 0);
0113 vm = std::min(vm, vmax);
0114 vp = std::min(vp, vmax);
0115 vp = std::max(vp, 0);
0116 ALPAKA_ASSERT_ACC(vp >= vm);
0117 forEachInWindow(hist, vm, vp, ftest);
0118 #ifndef NDEBUG
0119 int bp = Hist::bin(vp);
0120 int bm = Hist::bin(vm);
0121 rtot = hist.end(bp) - hist.begin(bm);
0122 ALPAKA_ASSERT_ACC(tot == rtot);
0123 #endif
0124 }
0125 }
0126 };
0127
0128 template <typename T, int NBINS = 128, int S = 8 * sizeof(T), int DELTA = 1000>
0129 void go(const DevHost& host, const Device& device, Queue& queue) {
0130 std::mt19937 eng;
0131
0132 int rmin = std::numeric_limits<T>::min();
0133 int rmax = std::numeric_limits<T>::max();
0134 if (NBINS != 128) {
0135 rmin = 0;
0136 rmax = NBINS * 2 - 1;
0137 }
0138
0139 std::uniform_int_distribution<T> rgen(rmin, rmax);
0140 constexpr unsigned int N = 12000;
0141
0142 using Hist = HistoContainer<T, NBINS, N, S>;
0143 std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist{}.capacity() << ' '
0144 << (rmax - rmin) / Hist::nbins() << std::endl;
0145 std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl;
0146
0147 auto v = make_host_buffer<T[]>(queue, N);
0148 auto v_d = make_device_buffer<T[]>(queue, N);
0149
0150 for (int it = 0; it < 5; ++it) {
0151 for (long long j = 0; j < N; j++)
0152 v[j] = rgen(eng);
0153 if (it == 2)
0154 for (long long j = N / 2; j < N / 2 + N / 4; j++)
0155 v[j] = 4;
0156
0157 alpaka::memcpy(queue, v_d, v);
0158
0159 const auto threadsPerBlockOrElementsPerThread = 256u;
0160 const auto blocksPerGrid = 1u;
0161 const auto workDiv = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0162 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv, mykernel<NBINS, S, DELTA>(), v_d.data(), N));
0163 }
0164 alpaka::wait(queue);
0165 }
0166
0167 int main() {
0168 auto const& host = cms::alpakatools::host();
0169
0170
0171 auto const& devices = cms::alpakatools::devices<Platform>();
0172 if (devices.empty()) {
0173 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0174 "the test will be skipped.\n";
0175 exit(EXIT_FAILURE);
0176 }
0177
0178
0179 for (auto const& device : devices) {
0180 std::cout << "Test One Histo Container on " << alpaka::getName(device) << '\n';
0181
0182 auto queue = Queue(device);
0183
0184 go<int16_t>(host, device, queue);
0185 go<uint8_t, 128, 8, 4>(host, device, queue);
0186 go<uint16_t, 313 / 2, 9, 4>(host, device, queue);
0187 }
0188
0189 return 0;
0190 }