File indexing completed on 2024-04-06 12:15:41
0001 #include <algorithm>
0002 #include <cmath>
0003 #include <cstdlib>
0004 #include <iostream>
0005 #include <limits>
0006 #include <random>
0007
0008 #include <alpaka/alpaka.hpp>
0009
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
0015
0016 using namespace cms::alpakatools;
0017 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0018
0019
0020
0021 template <typename T>
0022 struct format_traits {
0023 public:
0024 static const constexpr char* failed_msg = "failed(int) size=%d, i=%d, blockDimension=%d: c[i]=%d c[i-1]=%d\n";
0025 };
0026
0027 template <>
0028 struct format_traits<float> {
0029 public:
0030 static const constexpr char* failed_msg = "failed(float size=%d, i=%d, blockDimension=%d: c[i]=%f c[i-1]=%f\n";
0031 };
0032
0033 template <typename T>
0034 struct testPrefixScan {
0035 template <typename TAcc>
0036 ALPAKA_FN_ACC void operator()(const TAcc& acc, unsigned int size) const {
0037 auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
0038 auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0039 auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0040
0041 for (auto i : uniform_elements(acc, size)) {
0042 c[i] = 1;
0043 };
0044
0045 alpaka::syncBlockThreads(acc);
0046
0047 blockPrefixScan(acc, c, co, size, ws);
0048 blockPrefixScan(acc, c, size, ws);
0049
0050 ALPAKA_ASSERT_ACC(1 == c[0]);
0051 ALPAKA_ASSERT_ACC(1 == co[0]);
0052
0053
0054 alpaka::syncBlockThreads(acc);
0055
0056 for (auto i : uniform_elements(acc, size)) {
0057 if (0 == i)
0058 continue;
0059 if constexpr (!std::is_floating_point_v<T>) {
0060 if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
0061 printf("c[%d]=%d, co[%d]=%d\n", i, c[i], i, co[i]);
0062 } else {
0063 if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i])))
0064 printf("c[%d]=%f, co[%d]=%f\n", i, c[i], i, co[i]);
0065 }
0066 ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
0067 ALPAKA_ASSERT_ACC(c[i] == i + 1);
0068 ALPAKA_ASSERT_ACC(c[i] == co[i]);
0069 }
0070 }
0071 };
0072
0073
0074
0075
0076 template <typename T>
0077 struct testWarpPrefixScan {
0078 template <typename TAcc>
0079 ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t size) const {
0080 if constexpr (!requires_single_thread_per_block_v<TAcc>) {
0081 ALPAKA_ASSERT_ACC(size <= 32);
0082 auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0083 auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
0084
0085 uint32_t const blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0086 uint32_t const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
0087 auto i = blockThreadIdx;
0088 c[i] = 1;
0089 alpaka::syncBlockThreads(acc);
0090 auto laneId = blockThreadIdx & 0x1f;
0091
0092 warpPrefixScan(acc, laneId, c, co, i);
0093 warpPrefixScan(acc, laneId, c, i);
0094
0095 alpaka::syncBlockThreads(acc);
0096
0097 ALPAKA_ASSERT_ACC(1 == c[0]);
0098 ALPAKA_ASSERT_ACC(1 == co[0]);
0099 if (i != 0) {
0100 if (c[i] != c[i - 1] + 1)
0101 printf(format_traits<T>::failed_msg, size, i, blockDimension, c[i], c[i - 1]);
0102 ALPAKA_ASSERT_ACC(c[i] == c[i - 1] + 1);
0103 ALPAKA_ASSERT_ACC(c[i] == static_cast<T>(i + 1));
0104 ALPAKA_ASSERT_ACC(c[i] == co[i]);
0105 }
0106 } else {
0107
0108 ALPAKA_ASSERT_ACC(false);
0109 }
0110 }
0111 };
0112
0113 struct init {
0114 template <typename TAcc>
0115 ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t* v, uint32_t val, uint32_t n) const {
0116 for (auto index : uniform_elements(acc, n)) {
0117 v[index] = val;
0118
0119 if (index == 0)
0120 printf("init\n");
0121 }
0122 }
0123 };
0124
0125 struct verify {
0126 template <typename TAcc>
0127 ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t const* v, uint32_t n) const {
0128 for (auto index : uniform_elements(acc, n)) {
0129 ALPAKA_ASSERT_ACC(v[index] == index + 1);
0130
0131 if (index == 0)
0132 printf("verify\n");
0133 }
0134 }
0135 };
0136
0137 int main() {
0138
0139 auto const& devices = cms::alpakatools::devices<Platform>();
0140
0141 if (devices.empty()) {
0142 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0143 "the test will be skipped.\n";
0144 exit(EXIT_FAILURE);
0145 }
0146
0147 for (auto const& device : devices) {
0148 std::cout << "Test prefix scan on " << alpaka::getName(device) << '\n';
0149 auto queue = Queue(device);
0150 const auto warpSize = alpaka::getPreferredWarpSize(device);
0151
0152 if constexpr (!requires_single_thread_per_block_v<Acc1D>) {
0153 std::cout << "warp level" << std::endl;
0154
0155 const auto threadsPerBlockOrElementsPerThread = 32;
0156 const auto blocksPerGrid = 1;
0157 const auto workDivWarp = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
0158
0159 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
0160 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 16));
0161 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
0162 }
0163
0164
0165 std::cout << "block level" << std::endl;
0166
0167
0168
0169 for (int bs = 32; bs <= 1024; bs += 32) {
0170 const auto blocksPerGrid2 = 1;
0171 const auto workDivSingleBlock = make_workdiv<Acc1D>(blocksPerGrid2, bs);
0172
0173 std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << bs
0174 << std::endl;
0175
0176
0177 for (int j = 1; j <= 1024; ++j) {
0178 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<uint16_t>(), j));
0179 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivSingleBlock, testPrefixScan<float>(), j));
0180 }
0181 }
0182
0183
0184 uint32_t num_items = 200;
0185 for (int ksize = 1; ksize < 4; ++ksize) {
0186 std::cout << "multiblock" << std::endl;
0187 num_items *= 10;
0188
0189 auto input_d = make_device_buffer<uint32_t[]>(queue, num_items);
0190 auto output1_d = make_device_buffer<uint32_t[]>(queue, num_items);
0191 auto blockCounter_d = make_device_buffer<int32_t>(queue);
0192
0193 const auto nThreadsInit = 256;
0194 const auto nBlocksInit = divide_up_by(num_items, nThreadsInit);
0195 const auto workDivMultiBlockInit = make_workdiv<Acc1D>(nBlocksInit, nThreadsInit);
0196
0197 alpaka::enqueue(queue,
0198 alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit, init(), input_d.data(), 1, num_items));
0199 alpaka::memset(queue, blockCounter_d, 0);
0200
0201 const auto nThreads = 1024;
0202 const auto nBlocks = divide_up_by(num_items, nThreads);
0203 const auto workDivMultiBlock = make_workdiv<Acc1D>(nBlocks, nThreads);
0204
0205 std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl;
0206 alpaka::enqueue(queue,
0207 alpaka::createTaskKernel<Acc1D>(workDivMultiBlock,
0208 multiBlockPrefixScan<uint32_t>(),
0209 input_d.data(),
0210 output1_d.data(),
0211 num_items,
0212 nBlocks,
0213 blockCounter_d.data(),
0214 warpSize));
0215 alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivMultiBlock, verify(), output1_d.data(), num_items));
0216
0217 alpaka::wait(queue);
0218 }
0219 }
0220
0221 return 0;
0222 }