Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-22 07:34:11

0001 #include <algorithm>
0002 #include <cmath>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <random>
0007 
0008 #include <alpaka/alpaka.hpp>
0009 
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
0015 
0016 using namespace cms::alpakatools;
0017 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0018 
0019 // static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestPrefixScan) "]";
0020 
// Maps the element type T to the printf format string used to report a failed
// warp-level prefix-scan check. The placeholders are, in order: the scan size,
// the failing index i, the block dimension, and the two offending values
// c[i] and c[i-1].
template <typename T>
struct format_traits {
public:
  // Generic (integral) case: the two values are printed with %d.
  static constexpr const char* failed_msg = "failed(int) size=%d, i=%d, blockDimension=%d: c[i]=%d c[i-1]=%d\n";
};

template <>
struct format_traits<float> {
public:
  // Floating point case: the two values are printed with %f.
  // Fixed the unbalanced "failed(float" prefix so the message is consistent
  // with the integral variant above.
  static constexpr const char* failed_msg = "failed(float) size=%d, i=%d, blockDimension=%d: c[i]=%f c[i-1]=%f\n";
};
0032 
// Kernel checking the single-block prefix scan (blockPrefixScan) on a shared
// memory array of up to 1024 elements. The input is all ones, so after an
// inclusive scan c[i] must equal i + 1; both the two-buffer (c -> co) and the
// in-place (c) variants are run and checked against each other.
template <typename T>
struct testPrefixScan {
  ALPAKA_FN_ACC void operator()(Acc1D const& acc, unsigned int size) const {
    // Warp-level scratch space for blockPrefixScan; its size must match the
    // warp size of the target device.
    // alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value
#if defined(__CUDA_ARCH__)
    // CUDA always has a warp size of 32
    auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
#elif defined(__HIP_DEVICE_COMPILE__)
    // HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device
    auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
#else
    // CPU back-ends always have a warp size of 1
    auto& ws = alpaka::declareSharedVar<T[1], __COUNTER__>(acc);
#endif
    // c is the input (and the in-place scan result); co receives the
    // two-buffer scan result.
    auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
    auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);

    // Fill the input with ones: the inclusive scan of all ones is 1, 2, 3, ...
    for (auto i : uniform_elements(acc, size)) {
      c[i] = 1;
    };

    alpaka::syncBlockThreads(acc);

    // Scan c into co, then scan c in place.
    blockPrefixScan(acc, c, co, size, ws);
    blockPrefixScan(acc, c, size, ws);

    ALPAKA_ASSERT_ACC(1 == c[0]);
    ALPAKA_ASSERT_ACC(1 == co[0]);

    // TODO: not needed? Not in multi kernel version, not in CUDA version
    alpaka::syncBlockThreads(acc);

    // Check every element past the first: consecutive difference of 1, the
    // expected value i + 1, and agreement between the two scan variants.
    // A diagnostic is printed before the assertions fire, with a format
    // string matching the element type.
    for (auto i : uniform_elements(acc, size)) {
      if (0 == i)
        continue;
      if constexpr (!std::is_floating_point_v<T>) {
        if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
          printf("c[%d]=%d, co[%d]=%d\n", i, c[i], i, co[i]);
      } else {
        if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
          printf("c[%d]=%f, co[%d]=%f\n", i, c[i], i, co[i]);
      }
      ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
      ALPAKA_ASSERT_ACC(c[i] == i + 1);
      ALPAKA_ASSERT_ACC(c[i] == co[i]);
    }
  }
};
0081 
0082 /*
0083  * NB: GPU-only, so do not care about elements here.
0084  */
// Kernel checking the warp-level prefix scan (warpPrefixScan) within a single
// warp. `size` must not exceed the device warp size. The input is all ones,
// so after an inclusive scan c[i] must equal i + 1; both the two-buffer
// (c -> co) and the in-place (c) variants are run and checked.
template <typename T>
struct testWarpPrefixScan {
  ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t size) const {
    if constexpr (not requires_single_thread_per_block_v<Acc1D>) {
      ALPAKA_ASSERT_ACC(size <= static_cast<uint32_t>(alpaka::warp::getSize(acc)));
      auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
      auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);

      uint32_t const blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
      uint32_t const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
      // One element per thread, filled with ones.
      auto i = blockThreadIdx;
      c[i] = 1;
      alpaka::syncBlockThreads(acc);
      // a compile-time constant would be faster, but this is more portable
      auto laneId = blockThreadIdx % alpaka::warp::getSize(acc);

      // Scan c into co, then scan c in place.
      warpPrefixScan(acc, laneId, c, co, i);
      warpPrefixScan(acc, laneId, c, i);

      alpaka::syncBlockThreads(acc);

      ALPAKA_ASSERT_ACC(1 == c[0]);
      ALPAKA_ASSERT_ACC(1 == co[0]);
      if (i != 0) {
        // Print a type-appropriate diagnostic before the assertions fire.
        if (c[i] != c[i - 1] + 1)
          printf(format_traits<T>::failed_msg, size, i, blockDimension, c[i], c[i - 1]);
        ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
        ALPAKA_ASSERT_ACC(c[i] == static_cast<T>(i + 1));
        ALPAKA_ASSERT_ACC(c[i] == co[i]);
      }
    } else {
      // This branch is compiled only for single-thread-per-block (serial CPU)
      // back-ends, on which this kernel must never be launched.
      ALPAKA_ASSERT_ACC(false);
    }
  }
};
0121 
0122 struct init {
0123   ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t* v, uint32_t val, uint32_t n) const {
0124     for (auto index : uniform_elements(acc, n)) {
0125       v[index] = val;
0126 
0127       if (index == 0)
0128         printf("init\n");
0129     }
0130   }
0131 };
0132 
0133 struct verify {
0134   ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t const* v, uint32_t n) const {
0135     for (auto index : uniform_elements(acc, n)) {
0136       ALPAKA_ASSERT_ACC(v[index] == index + 1);
0137 
0138       if (index == 0)
0139         printf("verify\n");
0140     }
0141   }
0142 };
0143 
// Entry point: runs the warp-level, single-block and multi-block prefix scan
// tests on every available device of the configured alpaka back-end.
int main() {
  // get the list of devices on the current platform
  auto const& devices = cms::alpakatools::devices<Platform>();

  if (devices.empty()) {
    // NOTE(review): the message announces a skip, but the process exits with
    // a failure status — confirm this is the intended policy when the
    // back-end has no devices.
    std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
      "the test will be skipped.\n";
    exit(EXIT_FAILURE);
  }

  for (auto const& device : devices) {
    std::cout << "Test prefix scan on " << alpaka::getName(device) << '\n';
    auto queue = Queue(device);
    const auto warpSize = alpaka::getPreferredWarpSize(device);
    // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY)
    if constexpr (!requires_single_thread_per_block_v<Acc1D>) {
      std::cout << "warp level" << std::endl;

      // A single block of warpSize threads, so the scan stays within one warp.
      const auto threadsPerBlockOrElementsPerThread = warpSize;
      const auto blocksPerGrid = 1;
      const auto workDivWarp = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);

      // The guards skip scan sizes larger than the device warp; 12 and 5 are
      // not powers of two — presumably deliberate, to exercise partially
      // filled warps (TODO confirm).
      if (warpSize >= 64)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 64));
      if (warpSize >= 32)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
      if (warpSize >= 16)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 12));
      if (warpSize >= 8)
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
    }

    // PORTABLE BLOCK PREFIXSCAN
    std::cout << "block level" << std::endl;

    // Running kernel with 1 block, and bs threads per block or elements per thread.
    // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version.
    for (int bs = warpSize; bs <= 1024; bs += warpSize) {
      const auto blocksPerGrid2 = 1;
      const auto workDivSingleBlock = make_workdiv<Acc1D>(blocksPerGrid2, bs);

      std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << bs
                << std::endl;

      // Problem size: every size from 1 to 1024, with both an integral and a
      // floating point element type.
      for (int j = 1; j <= 1024; ++j) {
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<uint16_t>(), j));
        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<float>(), j));
      }
    }

    // PORTABLE MULTI-BLOCK PREFIXSCAN
    // Problem sizes 2000, 20000, 200000.
    uint32_t num_items = 200;
    for (int ksize = 1; ksize < 4; ++ksize) {
      std::cout << "multiblock" << std::endl;
      num_items *= 10;

      // Device buffers: input, scan output, and a counter shared across
      // blocks — presumably used by multiBlockPrefixScan to coordinate the
      // final pass (see prefixScan.h).
      auto input_d = make_device_buffer<uint32_t[]>(queue, num_items);
      auto output1_d = make_device_buffer<uint32_t[]>(queue, num_items);
      auto blockCounter_d = make_device_buffer<int32_t>(queue);

      const auto nThreadsInit = 256;  // NB: 1024 would be better
      const auto nBlocksInit = divide_up_by(num_items, nThreadsInit);
      const auto workDivMultiBlockInit = make_workdiv<Acc1D>(nBlocksInit, nThreadsInit);

      // Fill the input with ones and zero the cross-block counter.
      alpaka::enqueue(queue,
                      alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit, init(), input_d.data(), 1, num_items));
      alpaka::memset(queue, blockCounter_d, 0);

      const auto nThreads = 1024;
      const auto nBlocks = divide_up_by(num_items, nThreads);
      const auto workDivMultiBlock = make_workdiv<Acc1D>(nBlocks, nThreads);

      std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl;
      alpaka::enqueue(queue,
                      alpaka::createTaskKernel<Acc1D>(workDivMultiBlock,
                                                      multiBlockPrefixScan<uint32_t>(),
                                                      input_d.data(),
                                                      output1_d.data(),
                                                      num_items,
                                                      nBlocks,
                                                      blockCounter_d.data(),
                                                      warpSize));
      // The inclusive scan of all ones must yield output1_d[i] == i + 1.
      alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivMultiBlock, verify(), output1_d.data(), num_items));

      alpaka::wait(queue);  // input_d and output1_d end of scope
    }  // ksize
  }

  return 0;
}