File indexing completed on 2025-01-22 07:34:11
0001 #include <algorithm>
0002 #include <cmath>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <random>
0007
0008 #include <alpaka/alpaka.hpp>
0009
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
0015
0016 using namespace cms::alpakatools;
0017 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0018
0019
0020
0021 template <typename T>
0022 struct format_traits {
0023 public:
0024 static const constexpr char* failed_msg = "failed(int) size=%d, i=%d, blockDimension=%d: c[i]=%d c[i-1]=%d\n";
0025 };
0026
0027 template <>
0028 struct format_traits<float> {
0029 public:
0030 static const constexpr char* failed_msg = "failed(float size=%d, i=%d, blockDimension=%d: c[i]=%f c[i-1]=%f\n";
0031 };
0032
0033 template <typename T>
0034 struct testPrefixScan {
0035 ALPAKA_FN_ACC void operator()(Acc1D const& acc, unsigned int size) const {
0036
0037 #if defined(__CUDA_ARCH__)
0038
0039 auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
0040 #elif defined(__HIP_DEVICE_COMPILE__)
0041
0042 auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
0043 #else
0044
0045 auto& ws = alpaka::declareSharedVar<T[1], __COUNTER__>(acc);
0046 #endif
0047 auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0048 auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0049
0050 for (auto i : uniform_elements(acc, size)) {
0051 c[i] = 1;
0052 };
0053
0054 alpaka::syncBlockThreads(acc);
0055
0056 blockPrefixScan(acc, c, co, size, ws);
0057 blockPrefixScan(acc, c, size, ws);
0058
0059 ALPAKA_ASSERT_ACC(1 == c[0]);
0060 ALPAKA_ASSERT_ACC(1 == co[0]);
0061
0062
0063 alpaka::syncBlockThreads(acc);
0064
0065 for (auto i : uniform_elements(acc, size)) {
0066 if (0 == i)
0067 continue;
0068 if constexpr (!std::is_floating_point_v<T>) {
0069 if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
0070 printf("c[%d]=%d, co[%d]=%d\n", i, c[i], i, co[i]);
0071 } else {
0072 if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
0073 printf("c[%d]=%f, co[%d]=%f\n", i, c[i], i, co[i]);
0074 }
0075 ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
0076 ALPAKA_ASSERT_ACC(c[i] == i + 1);
0077 ALPAKA_ASSERT_ACC(c[i] == co[i]);
0078 }
0079 }
0080 };
0081
0082
0083
0084
0085 template <typename T>
0086 struct testWarpPrefixScan {
0087 ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t size) const {
0088 if constexpr (not requires_single_thread_per_block_v<Acc1D>) {
0089 ALPAKA_ASSERT_ACC(size <= static_cast<uint32_t>(alpaka::warp::getSize(acc)));
0090 auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0091 auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0092
0093 uint32_t const blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0094 uint32_t const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
0095 auto i = blockThreadIdx;
0096 c[i] = 1;
0097 alpaka::syncBlockThreads(acc);
0098
0099 auto laneId = blockThreadIdx % alpaka::warp::getSize(acc);
0100
0101 warpPrefixScan(acc, laneId, c, co, i);
0102 warpPrefixScan(acc, laneId, c, i);
0103
0104 alpaka::syncBlockThreads(acc);
0105
0106 ALPAKA_ASSERT_ACC(1 == c[0]);
0107 ALPAKA_ASSERT_ACC(1 == co[0]);
0108 if (i != 0) {
0109 if (c[i] != c[i - 1] + 1)
0110 printf(format_traits<T>::failed_msg, size, i, blockDimension, c[i], c[i - 1]);
0111 ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
0112 ALPAKA_ASSERT_ACC(c[i] == static_cast<T>(i + 1));
0113 ALPAKA_ASSERT_ACC(c[i] == co[i]);
0114 }
0115 } else {
0116
0117 ALPAKA_ASSERT_ACC(false);
0118 }
0119 }
0120 };
0121
0122 struct init {
0123 ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t* v, uint32_t val, uint32_t n) const {
0124 for (auto index : uniform_elements(acc, n)) {
0125 v[index] = val;
0126
0127 if (index == 0)
0128 printf("init\n");
0129 }
0130 }
0131 };
0132
0133 struct verify {
0134 ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint32_t const* v, uint32_t n) const {
0135 for (auto index : uniform_elements(acc, n)) {
0136 ALPAKA_ASSERT_ACC(v[index] == index + 1);
0137
0138 if (index == 0)
0139 printf("verify\n");
0140 }
0141 }
0142 };
0143
0144 int main() {
0145
0146 auto const& devices = cms::alpakatools::devices<Platform>();
0147
0148 if (devices.empty()) {
0149 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0150 "the test will be skipped.\n";
0151 exit(EXIT_FAILURE);
0152 }
0153
0154 for (auto const& device : devices) {
0155 std::cout << "Test prefix scan on " << alpaka::getName(device) << '\n';
0156 auto queue = Queue(device);
0157 const auto warpSize = alpaka::getPreferredWarpSize(device);
0158
0159 if constexpr (!requires_single_thread_per_block_v<Acc1D>) {
0160 std::cout << "warp level" << std::endl;
0161
0162 const auto threadsPerBlockOrElementsPerThread = warpSize;
0163 const auto blocksPerGrid = 1;
0164 const auto workDivWarp = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0165
0166 if (warpSize >= 64)
0167 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 64));
0168 if (warpSize >= 32)
0169 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
0170 if (warpSize >= 16)
0171 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 12));
0172 if (warpSize >= 8)
0173 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
0174 }
0175
0176
0177 std::cout << "block level" << std::endl;
0178
0179
0180
0181 for (int bs = warpSize; bs <= 1024; bs += warpSize) {
0182 const auto blocksPerGrid2 = 1;
0183 const auto workDivSingleBlock = make_workdiv<Acc1D>(blocksPerGrid2, bs);
0184
0185 std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << bs
0186 << std::endl;
0187
0188
0189 for (int j = 1; j <= 1024; ++j) {
0190 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<uint16_t>(), j));
0191 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<float>(), j));
0192 }
0193 }
0194
0195
0196 uint32_t num_items = 200;
0197 for (int ksize = 1; ksize < 4; ++ksize) {
0198 std::cout << "multiblock" << std::endl;
0199 num_items *= 10;
0200
0201 auto input_d = make_device_buffer<uint32_t[]>(queue, num_items);
0202 auto output1_d = make_device_buffer<uint32_t[]>(queue, num_items);
0203 auto blockCounter_d = make_device_buffer<int32_t>(queue);
0204
0205 const auto nThreadsInit = 256;
0206 const auto nBlocksInit = divide_up_by(num_items, nThreadsInit);
0207 const auto workDivMultiBlockInit = make_workdiv<Acc1D>(nBlocksInit, nThreadsInit);
0208
0209 alpaka::enqueue(queue,
0210 alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit, init(), input_d.data(), 1, num_items));
0211 alpaka::memset(queue, blockCounter_d, 0);
0212
0213 const auto nThreads = 1024;
0214 const auto nBlocks = divide_up_by(num_items, nThreads);
0215 const auto workDivMultiBlock = make_workdiv<Acc1D>(nBlocks, nThreads);
0216
0217 std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl;
0218 alpaka::enqueue(queue,
0219 alpaka::createTaskKernel<Acc1D>(workDivMultiBlock,
0220 multiBlockPrefixScan<uint32_t>(),
0221 input_d.data(),
0222 output1_d.data(),
0223 num_items,
0224 nBlocks,
0225 blockCounter_d.data(),
0226 warpSize));
0227 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivMultiBlock, verify(), output1_d.data(), num_items));
0228
0229 alpaka::wait(queue);
0230 }
0231 }
0232
0233 return 0;
0234 }