Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-22 07:34:11

0001 #include <algorithm>
0002 #include <cmath>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <random>
0007 
0008 #include <alpaka/alpaka.hpp>
0009 
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
0015 
0016 using namespace cms::alpakatools;
0017 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0018 
0019 // static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestPrefixScan) "]";
0020 
// Maps the element type T to the printf format string used to report a failed
// warp-level prefix-scan check. The placeholders are, in order: the scan size,
// the failing index i, the block dimension, and the two offending values
// c[i] and c[i-1].
template <typename T>
struct format_traits {
public:
  // Generic (integral) case: the two values are printed with %d.
  static constexpr const char* failed_msg = "failed(int) size=%d, i=%d, blockDimension=%d: c[i]=%d c[i-1]=%d\n";
};

template <>
struct format_traits<float> {
public:
  // Floating point case: the two values are printed with %f.
  // Fixed the unbalanced "failed(float" prefix so the message is consistent
  // with the integral variant above.
  static constexpr const char* failed_msg = "failed(float) size=%d, i=%d, blockDimension=%d: c[i]=%f c[i-1]=%f\n";
};
0032 
// Kernel checking the single-block prefix scan (blockPrefixScan) on a shared
// memory array of up to 1024 elements. The input is all ones, so after an
// inclusive scan c[i] must equal i + 1; both the two-buffer (c -> co) and the
// in-place (c) variants are run and checked against each other.
template <typename T>
struct testPrefixScan {
  ALPAKA_FN_ACC void operator()(Acc1D const& acc, unsigned int size) const {
    // Warp-level scratch space for blockPrefixScan; its size must match the
    // warp size of the target device.
    // alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value
#if defined(__CUDA_ARCH__)
    // CUDA always has a warp size of 32
    auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
#elif defined(__HIP_DEVICE_COMPILE__)
    // HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device
    auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
#else
    // CPU back-ends always have a warp size of 1
    auto& ws = alpaka::declareSharedVar<T[1], __COUNTER__>(acc);
#endif
    // c is the input (and the in-place scan result); co receives the
    // two-buffer scan result.
    auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
    auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);

    // Fill the input with ones: the inclusive scan of all ones is 1, 2, 3, ...
    for (auto i : uniform_elements(acc, size)) {
      c[i] = 1;
    };

    alpaka::syncBlockThreads(acc);

    // Scan c into co, then scan c in place.
    blockPrefixScan(acc, c, co, size, ws);
    blockPrefixScan(acc, c, size, ws);

    ALPAKA_ASSERT_ACC(1 == c[0]);
    ALPAKA_ASSERT_ACC(1 == co[0]);

    // TODO: not needed? Not in multi kernel version, not in CUDA version
    alpaka::syncBlockThreads(acc);

    // Check every element past the first: consecutive difference of 1, the
    // expected value i + 1, and agreement between the two scan variants.
    // A diagnostic is printed before the assertions fire, with a format
    // string matching the element type.
    for (auto i : uniform_elements(acc, size)) {
      if (0 == i)
        continue;
      if constexpr (!std::is_floating_point_v<T>) {
        if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
          printf("c[%d]=%d, co[%d]=%d\n", i, c[i], i, co[i]);
      } else {
        if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
          printf("c[%d]=%f, co[%d]=%f\n", i, c[i], i, co[i]);
      }
      ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
      ALPAKA_ASSERT_ACC(c[i] == i + 1);
      ALPAKA_ASSERT_ACC(c[i] == co[i]);
    }
  }
};
0081 
0082 /*
0083  * NB: GPU-only, so do not care about elements here.
0084  */
// Kernel checking the warp-level prefix scan (warpPrefixScan) within a single
// warp. `size` must not exceed the device warp size. The input is all ones,
// so after an inclusive scan c[i] must equal i + 1; both the two-buffer
// (c -> co) and the in-place (c) variants are run and checked.
template <typename T>
struct testWarpPrefixScan {
  ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t size) const {
    if constexpr (not requires_single_thread_per_block_v<Acc1D>) {
      ALPAKA_ASSERT_ACC(size <= static_cast<uint32_t>(alpaka::warp::getSize(acc)));
      auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
      auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);

      uint32_t const blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
      uint32_t const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
      // One element per thread, filled with ones.
      auto i = blockThreadIdx;
      c[i] = 1;
      alpaka::syncBlockThreads(acc);
      // a compile-time constant would be faster, but this is more portable
      auto laneId = blockThreadIdx % alpaka::warp::getSize(acc);

      // Scan c into co, then scan c in place.
      warpPrefixScan(acc, laneId, c, co, i);
      warpPrefixScan(acc, laneId, c, i);

      alpaka::syncBlockThreads(acc);

      ALPAKA_ASSERT_ACC(1 == c[0]);
      ALPAKA_ASSERT_ACC(1 == co[0]);
      if (i != 0) {
        // Print a type-appropriate diagnostic before the assertions fire.
        if (c[i] != c[i - 1] + 1)
          printf(format_traits<T>::failed_msg, size, i, blockDimension, c[i], c[i - 1]);
        ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
        ALPAKA_ASSERT_ACC(c[i] == static_cast<T>(i + 1));
        ALPAKA_ASSERT_ACC(c[i] == co[i]);
      }
    } else {
      // This branch is compiled only for single-thread-per-block (serial CPU)
      // back-ends, on which this kernel must never be launched.
      ALPAKA_ASSERT_ACC(false);
    }
  }
};
0121 
0122 struct init {
0123   ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t* v, uint32_t val, uint32_t n) const {
0124     for (auto index : uniform_elements(acc, n)) {
0125       v[index] = val;
0126 
0127       if (index == 0)
0128         printf("init\n");
0129     }
0130   }
0131 };
0132 
0133 struct verify {
0134   ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t const* v, uint32_t n) const {
0135     for (auto index : uniform_elements(acc, n)) {
0136       ALPAKA_ASSERT_ACC(v[index] == index + 1);
0137 
0138       if (index == 0)
0139         printf("verify\n");
0140     }
0141   }
0142 };
0143 
// Entry point: runs the warp-level, single-block and multi-block prefix scan
// tests on every available device of the configured alpaka back-end.
int main() {
  // get the list of devices on the current platform
  auto const& devices = cms::alpakatools::devices<Platform>();

  if (devices.empty()) {
    // NOTE(review): the message announces a skip, but the process exits with
    // a failure status — confirm this is the intended policy when the
    // back-end has no devices.
    std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
      "the test will be skipped.\n";
    exit(EXIT_FAILURE);
  }

  for (auto const& device : devices) {
    std::cout << "Test prefix scan on " << alpaka::getName(device) << '\n';
    auto queue = Queue(device);
    const auto warpSize = alpaka::getPreferredWarpSize(device);
    // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY)
    if constexpr (!requires_single_thread_per_block_v<Acc1D>) {
      std::cout << "warp level" << std::endl;

      // A single block of warpSize threads, so the scan stays within one warp.
      const auto threadsPerBlockOrElementsPerThread = warpSize;
      const auto blocksPerGrid = 1;
      const auto workDivWarp = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);

      // The guards skip scan sizes larger than the device warp; 12 and 5 are
      // not powers of two — presumably deliberate, to exercise partially
      // filled warps (TODO confirm).
      if (warpSize >= 64)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 64));
      if (warpSize >= 32)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
      if (warpSize >= 16)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 12));
      if (warpSize >= 8)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
    }

    // PORTABLE BLOCK PREFIXSCAN
    std::cout << "block level" << std::endl;

    // Running kernel with 1 block, and bs threads per block or elements per thread.
    // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version.
    for (int bs = warpSize; bs <= 1024; bs += warpSize) {
      const auto blocksPerGrid2 = 1;
      const auto workDivSingleBlock = make_workdiv<Acc1D>(blocksPerGrid2, bs);

      std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << bs
                << std::endl;

      // Problem size: every size from 1 to 1024, with both an integral and a
      // floating point element type.
      for (int j = 1; j <= 1024; ++j) {
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<uint16_t>(), j));
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<float>(), j));
      }
    }

    // PORTABLE MULTI-BLOCK PREFIXSCAN
    // Problem sizes 2000, 20000, 200000.
    uint32_t num_items = 200;
    for (int ksize = 1; ksize < 4; ++ksize) {
      std::cout << "multiblock" << std::endl;
      num_items *= 10;

      // Device buffers: input, scan output, and a counter shared across
      // blocks — presumably used by multiBlockPrefixScan to coordinate the
      // final pass (see prefixScan.h).
      auto input_d = make_device_buffer<uint32_t[]>(queue, num_items);
      auto output1_d = make_device_buffer<uint32_t[]>(queue, num_items);
      auto blockCounter_d = make_device_buffer<int32_t>(queue);

      const auto nThreadsInit = 256;  // NB: 1024 would be better
      const auto nBlocksInit = divide_up_by(num_items, nThreadsInit);
      const auto workDivMultiBlockInit = make_workdiv<Acc1D>(nBlocksInit, nThreadsInit);

      // Fill the input with ones and zero the cross-block counter.
      alpaka::enqueue(queue,
                      alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit, init(), input_d.data(), 1, num_items));
      alpaka::memset(queue, blockCounter_d, 0);

      const auto nThreads = 1024;
      const auto nBlocks = divide_up_by(num_items, nThreads);
      const auto workDivMultiBlock = make_workdiv<Acc1D>(nBlocks, nThreads);

      std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl;
      alpaka::enqueue(queue,
                      alpaka::createTaskKernel<Acc1D>(workDivMultiBlock,
                                                      multiBlockPrefixScan<uint32_t>(),
                                                      input_d.data(),
                                                      output1_d.data(),
                                                      num_items,
                                                      nBlocks,
                                                      blockCounter_d.data(),
                                                      warpSize));
      // The inclusive scan of all ones must yield output1_d[i] == i + 1.
      alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivMultiBlock, verify(), output1_d.data(), num_items));

      alpaka::wait(queue);  // input_d and output1_d end of scope
    }  // ksize
  }

  return 0;
}