File indexing completed on 2025-01-22 07:34:11
0001 #include <cstdio>
0002 #include <random>
0003
0004 #include <alpaka/alpaka.hpp>
0005
0006 #define CATCH_CONFIG_MAIN
0007 #include <catch.hpp>
0008
0009 #include "FWCore/Utilities/interface/stringize.h"
0010 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0013
0014
0015 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0016
0017 struct VectorAddKernel {
0018 template <typename T>
0019 ALPAKA_FN_ACC void operator()(
0020 Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0021 for (auto index : cms::alpakatools::uniform_elements(acc, size)) {
0022 out[index] = in1[index] + in2[index];
0023 }
0024 }
0025 };
0026
0027 struct VectorAddKernelSkip {
0028 template <typename T>
0029 ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0030 T const* __restrict__ in1,
0031 T const* __restrict__ in2,
0032 T* __restrict__ out,
0033 size_t first,
0034 size_t size) const {
0035 for (auto index : cms::alpakatools::uniform_elements(acc, first, size)) {
0036 out[index] = in1[index] + in2[index];
0037 }
0038 }
0039 };
0040
0041 struct VectorAddKernel1D {
0042 template <typename T>
0043 ALPAKA_FN_ACC void operator()(
0044 Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec1D size) const {
0045 for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) {
0046 auto index = ndindex[0];
0047 out[index] = in1[index] + in2[index];
0048 }
0049 }
0050 };
0051
0052 struct VectorAddKernel2D {
0053 template <typename T>
0054 ALPAKA_FN_ACC void operator()(
0055 Acc2D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const {
0056 for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) {
0057 auto index = ndindex[0] * size[1] + ndindex[1];
0058 out[index] = in1[index] + in2[index];
0059 }
0060 }
0061 };
0062
0063 struct VectorAddKernel3D {
0064 template <typename T>
0065 ALPAKA_FN_ACC void operator()(
0066 Acc3D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const {
0067 for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) {
0068 auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2];
0069 out[index] = in1[index] + in2[index];
0070 }
0071 }
0072 };
0073
0074
0075
0076
0077
0078 struct VectorAddBlockKernel {
0079 template <typename T>
0080 ALPAKA_FN_ACC void operator()(
0081 Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0082
0083 auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0084
0085 T* buffer = alpaka::getDynSharedMem<T>(acc);
0086
0087
0088 for (auto block : cms::alpakatools::uniform_groups(acc, size)) {
0089
0090 if (cms::alpakatools::once_per_block(acc)) {
0091
0092 for (Idx local = 0; local < blockSize; ++local)
0093 buffer[local] = 0.;
0094 }
0095
0096 alpaka::syncBlockThreads(acc);
0097
0098 for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) {
0099 buffer[index.local] = in1[index.global];
0100 }
0101
0102 alpaka::syncBlockThreads(acc);
0103
0104 for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) {
0105 buffer[index.local] += in2[index.global];
0106 }
0107
0108 alpaka::syncBlockThreads(acc);
0109
0110 for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) {
0111 out[index.global] = buffer[index.local];
0112 }
0113 }
0114 }
0115 };
0116
0117
0118
0119
0120
0121 struct VectorAddKernelSerial {
0122 template <typename T>
0123 ALPAKA_FN_ACC void operator()(
0124 Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0125
0126 if (cms::alpakatools::once_per_grid(acc)) {
0127 for (Idx index = 0; index < size; ++index) {
0128 out[index] += in1[index];
0129 out[index] += in2[index];
0130 }
0131 }
0132 }
0133 };
0134
0135
0136
0137
0138
0139 struct VectorAddKernelBlockSerial {
0140 template <typename T>
0141 ALPAKA_FN_ACC void operator()(
0142 Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0143
0144 auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0145
0146 for (auto block : cms::alpakatools::uniform_groups(acc, size)) {
0147
0148 const auto first = blockSize * block;
0149 const auto range = std::min<size_t>(first + blockSize, size);
0150 if (cms::alpakatools::once_per_block(acc)) {
0151 for (Idx index = first; index < range; ++index) {
0152 out[index] += in1[index];
0153 out[index] += in2[index];
0154 }
0155 }
0156 }
0157 }
0158 };
0159
0160 namespace alpaka::trait {
0161
0162
0163 template <>
0164 struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, Acc1D> {
0165
0166 template <typename T>
0167 ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& ,
0168 Vec1D threads,
0169 Vec1D elements,
0170 T const* __restrict__ ,
0171 T const* __restrict__ ,
0172 T* __restrict__ ,
0173 size_t size) {
0174 return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
0175 }
0176 };
0177 }
0178
0179
0180 template <typename TKernel>
0181 void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
0182
0183 std::random_device rd{};
0184 std::default_random_engine rand{rd()};
0185 std::normal_distribution<float> dist{0., 1.};
0186
0187
0188 constexpr float epsilon = 0.000001;
0189
0190
0191 const size_t size = problem_size;
0192
0193
0194 auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0195 auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0196 auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0197
0198
0199 for (size_t i = 0; i < size; ++i) {
0200 in1_h[i] = dist(rand);
0201 in2_h[i] = dist(rand);
0202 out_h[i] = 0.;
0203 }
0204
0205
0206 for (auto const& device : cms::alpakatools::devices<Platform>()) {
0207 std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
0208 << grid_size << " blocks of " << block_size << " elements\n";
0209 auto queue = Queue(device);
0210
0211
0212 auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0213 auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0214 auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0215
0216
0217 alpaka::memcpy(queue, in1_d, in1_h);
0218 alpaka::memcpy(queue, in2_d, in2_h);
0219
0220
0221 alpaka::memset(queue, out_d, 0.);
0222
0223
0224 auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
0225 alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);
0226
0227
0228 alpaka::memcpy(queue, out_h, out_d);
0229
0230
0231 alpaka::wait(queue);
0232
0233
0234 for (size_t i = 0; i < size; ++i) {
0235 float sum = in1_h[i] + in2_h[i];
0236 REQUIRE(out_h[i] < sum + epsilon);
0237 REQUIRE(out_h[i] > sum - epsilon);
0238 }
0239 }
0240 }
0241
0242
0243 template <typename TKernel>
0244 void testVectorAddKernelSkip(std::size_t skip_elements,
0245 std::size_t problem_size,
0246 std::size_t grid_size,
0247 std::size_t block_size,
0248 TKernel kernel) {
0249
0250 std::random_device rd{};
0251 std::default_random_engine rand{rd()};
0252 std::normal_distribution<float> dist{0., 1.};
0253
0254
0255 constexpr float epsilon = 0.000001;
0256
0257
0258 const size_t size = problem_size;
0259
0260
0261 auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0262 auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0263 auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0264
0265
0266 for (size_t i = 0; i < size; ++i) {
0267 in1_h[i] = dist(rand);
0268 in2_h[i] = dist(rand);
0269 out_h[i] = 0.;
0270 }
0271
0272
0273 for (auto const& device : cms::alpakatools::devices<Platform>()) {
0274 std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " skipping " << skip_elements << " over "
0275 << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
0276 auto queue = Queue(device);
0277
0278
0279 auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0280 auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0281 auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0282
0283
0284 alpaka::memcpy(queue, in1_d, in1_h);
0285 alpaka::memcpy(queue, in2_d, in2_h);
0286
0287
0288 alpaka::memset(queue, out_d, 0.);
0289
0290
0291 auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
0292 alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), skip_elements, size);
0293
0294
0295 alpaka::memcpy(queue, out_h, out_d);
0296
0297
0298 alpaka::wait(queue);
0299
0300
0301 for (size_t i = 0; i < skip_elements; ++i) {
0302 REQUIRE(out_h[i] == 0);
0303 }
0304 for (size_t i = skip_elements; i < size; ++i) {
0305 float sum = in1_h[i] + in2_h[i];
0306 REQUIRE(out_h[i] < sum + epsilon);
0307 REQUIRE(out_h[i] > sum - epsilon);
0308 }
0309 }
0310 }
0311
0312
0313 template <typename TDim, typename TKernel>
0314 void testVectorAddKernelND(Vec<TDim> problem_size, Vec<TDim> grid_size, Vec<TDim> block_size, TKernel kernel) {
0315
0316 std::random_device rd{};
0317 std::default_random_engine rand{rd()};
0318 std::normal_distribution<float> dist{0., 1.};
0319
0320
0321 constexpr float epsilon = 0.000001;
0322
0323
0324 const size_t size = problem_size.prod();
0325
0326
0327 auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0328 auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0329 auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0330
0331
0332 for (size_t i = 0; i < size; ++i) {
0333 in1_h[i] = dist(rand);
0334 in2_h[i] = dist(rand);
0335 out_h[i] = 0.;
0336 }
0337
0338
0339 for (auto const& device : cms::alpakatools::devices<Platform>()) {
0340 std::cout << "Test " << TDim::value << "D vector addition on " << alpaka::getName(device) << " over "
0341 << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
0342 auto queue = Queue(device);
0343
0344
0345 auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0346 auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0347 auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0348
0349
0350 alpaka::memcpy(queue, in1_d, in1_h);
0351 alpaka::memcpy(queue, in2_d, in2_h);
0352
0353
0354 alpaka::memset(queue, out_d, 0.);
0355
0356
0357 using AccND = Acc<TDim>;
0358 auto div = cms::alpakatools::make_workdiv<AccND>(grid_size, block_size);
0359 alpaka::exec<AccND>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);
0360
0361
0362 alpaka::memcpy(queue, out_h, out_d);
0363
0364
0365 alpaka::wait(queue);
0366
0367
0368 for (size_t i = 0; i < size; ++i) {
0369 float sum = in1_h[i] + in2_h[i];
0370 REQUIRE(out_h[i] < sum + epsilon);
0371 REQUIRE(out_h[i] > sum - epsilon);
0372 }
0373 }
0374 }
0375
0376 TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
0377 "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
0378 SECTION("Alpaka N-dimensional kernels") {
0379
0380 auto const& devices = cms::alpakatools::devices<Platform>();
0381 if (devices.empty()) {
0382 FAIL("No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0383 "the test will be skipped.");
0384 }
0385
0386
0387
0388 std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
0389 testVectorAddKernel(10000, 32, 32, VectorAddKernel{});
0390
0391
0392
0393 std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
0394 testVectorAddKernel(100, 1, 1024, VectorAddKernel{});
0395
0396
0397
0398 std::cout << "Test 1D vector addition with small block size\n";
0399 testVectorAddKernelND<Dim1D>({10000}, {32}, {32}, VectorAddKernel1D{});
0400
0401
0402
0403 std::cout << "Test 1D vector addition with large block size\n";
0404 testVectorAddKernelND<Dim1D>({100}, {1}, {1024}, VectorAddKernel1D{});
0405
0406
0407
0408 std::cout << "Test 2D vector addition with small block size\n";
0409 testVectorAddKernelND<Dim2D>({400, 250}, {4, 4}, {16, 16}, VectorAddKernel2D{});
0410
0411
0412
0413 std::cout << "Test 2D vector addition with large block size\n";
0414 testVectorAddKernelND<Dim2D>({20, 20}, {1, 1}, {32, 32}, VectorAddKernel2D{});
0415
0416
0417
0418 std::cout << "Test 3D vector addition with small block size\n";
0419 testVectorAddKernelND<Dim3D>({50, 125, 16}, {5, 5, 1}, {4, 4, 4}, VectorAddKernel3D{});
0420
0421
0422
0423 std::cout << "Test 3D vector addition with large block size\n";
0424 testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
0425
0426
0427
0428 std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
0429 testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});
0430
0431
0432
0433 std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
0434 testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
0435
0436
0437
0438 std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
0439 testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});
0440
0441
0442
0443 std::cout << "Test 1D vector single-threaded seria addition with large block size, using scalar dimensions\n";
0444 testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});
0445
0446
0447
0448 std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
0449 testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});
0450
0451
0452
0453 std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
0454 testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});
0455
0456
0457
0458 std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
0459 testVectorAddKernelSkip(20, 10000, 32, 32, VectorAddKernelSkip{});
0460
0461
0462
0463 std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
0464 testVectorAddKernelSkip(20, 100, 1, 1024, VectorAddKernelSkip{});
0465 }
0466 }