File indexing completed on 2024-04-06 12:15:41
0001 #include <algorithm>
0002 #include <iostream>
0003 #include <limits>
0004 #include <random>
0005
0006 #include <alpaka/alpaka.hpp>
0007
0008 #include "FWCore/Utilities/interface/stringize.h"
0009 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0010 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
0013
0014 using namespace cms::alpakatools;
0015 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0016
0017 template <int NBINS, int S, int DELTA>
0018 struct mykernel {
0019 template <typename TAcc, typename T>
0020 ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* __restrict__ v, uint32_t N) const {
0021 ALPAKA_ASSERT_ACC(v);
0022 ALPAKA_ASSERT_ACC(N == 12000);
0023
0024 const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
0025 if (threadIdxLocal == 0) {
0026 printf("start kernel for %d data\n", N);
0027 }
0028
0029 using Hist = HistoContainer<T, NBINS, 12000, S, uint16_t>;
0030
0031 auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
0032 auto& ws = alpaka::declareSharedVar<typename Hist::Counter[32], __COUNTER__>(acc);
0033
0034
0035 for (auto j : uniform_elements(acc, Hist::totbins())) {
0036 hist.off[j] = 0;
0037 }
0038 alpaka::syncBlockThreads(acc);
0039
0040
0041 for (auto j : uniform_elements(acc, Hist::totbins())) {
0042 hist.content[j] = 0;
0043 }
0044 alpaka::syncBlockThreads(acc);
0045
0046
0047 for (auto j : uniform_elements(acc, N)) {
0048 hist.count(acc, v[j]);
0049 }
0050 alpaka::syncBlockThreads(acc);
0051
0052 ALPAKA_ASSERT_ACC(0 == hist.size());
0053 alpaka::syncBlockThreads(acc);
0054
0055
0056 hist.finalize(acc, ws);
0057 alpaka::syncBlockThreads(acc);
0058
0059 ALPAKA_ASSERT_ACC(N == hist.size());
0060
0061
0062 for ([[maybe_unused]] auto j : uniform_elements(acc, Hist::nbins())) {
0063 ALPAKA_ASSERT_ACC(hist.off[j] <= hist.off[j + 1]);
0064 }
0065 alpaka::syncBlockThreads(acc);
0066
0067 for (auto j : uniform_elements(acc, 32)) {
0068 ws[j] = 0;
0069 }
0070 alpaka::syncBlockThreads(acc);
0071
0072
0073 for (auto j : uniform_elements(acc, N)) {
0074 hist.fill(acc, v[j], j);
0075 }
0076 alpaka::syncBlockThreads(acc);
0077
0078 ALPAKA_ASSERT_ACC(0 == hist.off[0]);
0079 ALPAKA_ASSERT_ACC(N == hist.size());
0080
0081
0082 #ifndef NDEBUG
0083 for (auto j : uniform_elements(acc, hist.size() - 1)) {
0084 auto p = hist.begin() + j;
0085 ALPAKA_ASSERT_ACC((*p) < N);
0086 [[maybe_unused]] auto k1 = Hist::bin(v[*p]);
0087 [[maybe_unused]] auto k2 = Hist::bin(v[*(p + 1)]);
0088 ALPAKA_ASSERT_ACC(k2 >= k1);
0089 }
0090 #endif
0091
0092
0093 for (auto i : uniform_elements(acc, hist.size())) {
0094 auto p = hist.begin() + i;
0095 auto j = *p;
0096 #ifndef NDEBUG
0097 auto b0 = Hist::bin(v[j]);
0098 #endif
0099 [[maybe_unused]] int tot = 0;
0100 auto ftest = [&](unsigned int k) {
0101 ALPAKA_ASSERT_ACC(k < N);
0102 ++tot;
0103 };
0104 forEachInWindow(hist, v[j], v[j], ftest);
0105 #ifndef NDEBUG
0106 [[maybe_unused]] int rtot = hist.size(b0);
0107 ALPAKA_ASSERT_ACC(tot == rtot);
0108 #endif
0109 tot = 0;
0110 auto vm = int(v[j]) - DELTA;
0111 auto vp = int(v[j]) + DELTA;
0112 constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits<T>::max();
0113 vm = std::max(vm, 0);
0114 vm = std::min(vm, vmax);
0115 vp = std::min(vp, vmax);
0116 vp = std::max(vp, 0);
0117 ALPAKA_ASSERT_ACC(vp >= vm);
0118 forEachInWindow(hist, vm, vp, ftest);
0119 #ifndef NDEBUG
0120 int bp = Hist::bin(vp);
0121 int bm = Hist::bin(vm);
0122 rtot = hist.end(bp) - hist.begin(bm);
0123 ALPAKA_ASSERT_ACC(tot == rtot);
0124 #endif
0125 }
0126 }
0127 };
0128
0129 template <typename T, int NBINS = 128, int S = 8 * sizeof(T), int DELTA = 1000>
0130 void go(const DevHost& host, const Device& device, Queue& queue) {
0131 std::mt19937 eng;
0132
0133 int rmin = std::numeric_limits<T>::min();
0134 int rmax = std::numeric_limits<T>::max();
0135 if (NBINS != 128) {
0136 rmin = 0;
0137 rmax = NBINS * 2 - 1;
0138 }
0139
0140 std::uniform_int_distribution<T> rgen(rmin, rmax);
0141 constexpr unsigned int N = 12000;
0142
0143 using Hist = HistoContainer<T, NBINS, N, S>;
0144 std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist{}.capacity() << ' '
0145 << (rmax - rmin) / Hist::nbins() << std::endl;
0146 std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl;
0147
0148 auto v = make_host_buffer<T[]>(queue, N);
0149 auto v_d = make_device_buffer<T[]>(queue, N);
0150
0151 for (int it = 0; it < 5; ++it) {
0152 for (long long j = 0; j < N; j++)
0153 v[j] = rgen(eng);
0154 if (it == 2)
0155 for (long long j = N / 2; j < N / 2 + N / 4; j++)
0156 v[j] = 4;
0157
0158 alpaka::memcpy(queue, v_d, v);
0159
0160 const auto threadsPerBlockOrElementsPerThread = 256u;
0161 const auto blocksPerGrid = 1u;
0162 const auto workDiv = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0163 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDiv, mykernel<NBINS, S, DELTA>(), v_d.data(), N));
0164 }
0165 alpaka::wait(queue);
0166 }
0167
0168 int main() {
0169 auto const& host = cms::alpakatools::host();
0170
0171
0172 auto const& devices = cms::alpakatools::devices<Platform>();
0173 if (devices.empty()) {
0174 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0175 "the test will be skipped.\n";
0176 exit(EXIT_FAILURE);
0177 }
0178
0179
0180 for (auto const& device : devices) {
0181 std::cout << "Test One Histo Container on " << alpaka::getName(device) << '\n';
0182
0183 auto queue = Queue(device);
0184
0185 go<int16_t>(host, device, queue);
0186 go<uint8_t, 128, 8, 4>(host, device, queue);
0187 go<uint16_t, 313 / 2, 9, 4>(host, device, queue);
0188 }
0189
0190 return 0;
0191 }