File indexing completed on 2023-11-16 03:43:47
0001 #include <cstdio>
0002 #include <random>
0003
0004 #include <alpaka/alpaka.hpp>
0005
0006 #define CATCH_CONFIG_MAIN
0007 #include <catch.hpp>
0008
0009 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0010 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0012
0013
0014 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0015
0016 struct VectorAddKernel {
0017 template <typename TAcc, typename T>
0018 ALPAKA_FN_ACC void operator()(
0019 TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0020 for (auto index : cms::alpakatools::elements_with_stride(acc, size)) {
0021 out[index] = in1[index] + in2[index];
0022 }
0023 }
0024 };
0025
0026 struct VectorAddKernel1D {
0027 template <typename TAcc, typename T>
0028 ALPAKA_FN_ACC void operator()(
0029 TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec1D size) const {
0030 for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
0031 auto index = ndindex[0];
0032 out[index] = in1[index] + in2[index];
0033 }
0034 }
0035 };
0036
0037 struct VectorAddKernel2D {
0038 template <typename TAcc, typename T>
0039 ALPAKA_FN_ACC void operator()(
0040 TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const {
0041 for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
0042 auto index = ndindex[0] * size[1] + ndindex[1];
0043 out[index] = in1[index] + in2[index];
0044 }
0045 }
0046 };
0047
0048 struct VectorAddKernel3D {
0049 template <typename TAcc, typename T>
0050 ALPAKA_FN_ACC void operator()(
0051 TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const {
0052 for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
0053 auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2];
0054 out[index] = in1[index] + in2[index];
0055 }
0056 }
0057 };
0058
0059
0060
0061
0062
0063 struct VectorAddBlockKernel {
0064 template <typename TAcc, typename T>
0065 ALPAKA_FN_ACC void operator()(
0066 TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0067
0068 auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0069
0070 T* buffer = alpaka::getDynSharedMem<T>(acc);
0071
0072
0073 for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
0074
0075 if (cms::alpakatools::once_per_block(acc)) {
0076
0077 for (Idx local = 0; local < blockSize; ++local)
0078 buffer[local] = 0.;
0079 }
0080
0081 alpaka::syncBlockThreads(acc);
0082
0083 for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
0084 buffer[index.local] = in1[index.global];
0085 }
0086
0087 alpaka::syncBlockThreads(acc);
0088
0089 for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
0090 buffer[index.local] += in2[index.global];
0091 }
0092
0093 alpaka::syncBlockThreads(acc);
0094
0095 for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
0096 out[index.global] = buffer[index.local];
0097 }
0098 }
0099 }
0100 };
0101
0102
0103
0104
0105
0106 struct VectorAddKernelSerial {
0107 template <typename TAcc, typename T>
0108 ALPAKA_FN_ACC void operator()(
0109 TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0110
0111 if (cms::alpakatools::once_per_grid(acc)) {
0112 for (Idx index = 0; index < size; ++index) {
0113 out[index] += in1[index];
0114 out[index] += in2[index];
0115 }
0116 }
0117 }
0118 };
0119
0120
0121
0122
0123
0124 struct VectorAddKernelBlockSerial {
0125 template <typename TAcc, typename T>
0126 ALPAKA_FN_ACC void operator()(
0127 TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
0128
0129 auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0130
0131 for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
0132
0133 const auto first = blockSize * block;
0134 const auto range = std::min<size_t>(first + blockSize, size);
0135 if (cms::alpakatools::once_per_block(acc)) {
0136 for (Idx index = first; index < range; ++index) {
0137 out[index] += in1[index];
0138 out[index] += in2[index];
0139 }
0140 }
0141 }
0142 }
0143 };
0144
0145 namespace alpaka::trait {
0146
0147
0148 template <typename TAcc>
0149 struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc> {
0150
0151 template <typename T>
0152 ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& ,
0153 Vec1D threads,
0154 Vec1D elements,
0155 T const* __restrict__ ,
0156 T const* __restrict__ ,
0157 T* __restrict__ ,
0158 size_t size) {
0159 return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
0160 }
0161 };
0162 }
0163
0164
0165 template <typename TKernel>
0166 void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
0167
0168 std::random_device rd{};
0169 std::default_random_engine rand{rd()};
0170 std::normal_distribution<float> dist{0., 1.};
0171
0172
0173 constexpr float epsilon = 0.000001;
0174
0175
0176 const size_t size = problem_size;
0177
0178
0179 auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0180 auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0181 auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0182
0183
0184 for (size_t i = 0; i < size; ++i) {
0185 in1_h[i] = dist(rand);
0186 in2_h[i] = dist(rand);
0187 out_h[i] = 0.;
0188 }
0189
0190
0191 for (auto const& device : cms::alpakatools::devices<Platform>()) {
0192 std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
0193 << grid_size << " blocks of " << block_size << " elements\n";
0194 auto queue = Queue(device);
0195
0196
0197 auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0198 auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0199 auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0200
0201
0202 alpaka::memcpy(queue, in1_d, in1_h);
0203 alpaka::memcpy(queue, in2_d, in2_h);
0204
0205
0206 alpaka::memset(queue, out_d, 0.);
0207
0208
0209 auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
0210 alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);
0211
0212
0213 alpaka::memcpy(queue, out_h, out_d);
0214
0215
0216 alpaka::wait(queue);
0217
0218
0219 for (size_t i = 0; i < size; ++i) {
0220 float sum = in1_h[i] + in2_h[i];
0221 REQUIRE(out_h[i] < sum + epsilon);
0222 REQUIRE(out_h[i] > sum - epsilon);
0223 }
0224 }
0225 }
0226
0227
0228 template <typename TDim, typename TKernel>
0229 void testVectorAddKernelND(Vec<TDim> problem_size, Vec<TDim> grid_size, Vec<TDim> block_size, TKernel kernel) {
0230
0231 std::random_device rd{};
0232 std::default_random_engine rand{rd()};
0233 std::normal_distribution<float> dist{0., 1.};
0234
0235
0236 constexpr float epsilon = 0.000001;
0237
0238
0239 const size_t size = problem_size.prod();
0240
0241
0242 auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0243 auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0244 auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0245
0246
0247 for (size_t i = 0; i < size; ++i) {
0248 in1_h[i] = dist(rand);
0249 in2_h[i] = dist(rand);
0250 out_h[i] = 0.;
0251 }
0252
0253
0254 for (auto const& device : cms::alpakatools::devices<Platform>()) {
0255 std::cout << "Test " << TDim::value << "D vector addition on " << alpaka::getName(device) << " over "
0256 << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
0257 auto queue = Queue(device);
0258
0259
0260 auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0261 auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0262 auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0263
0264
0265 alpaka::memcpy(queue, in1_d, in1_h);
0266 alpaka::memcpy(queue, in2_d, in2_h);
0267
0268
0269 alpaka::memset(queue, out_d, 0.);
0270
0271
0272 using AccND = Acc<TDim>;
0273 auto div = cms::alpakatools::make_workdiv<AccND>(grid_size, block_size);
0274 alpaka::exec<AccND>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);
0275
0276
0277 alpaka::memcpy(queue, out_h, out_d);
0278
0279
0280 alpaka::wait(queue);
0281
0282
0283 for (size_t i = 0; i < size; ++i) {
0284 float sum = in1_h[i] + in2_h[i];
0285 REQUIRE(out_h[i] < sum + epsilon);
0286 REQUIRE(out_h[i] > sum - epsilon);
0287 }
0288 }
0289 }
0290
0291 TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
0292 "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
0293 SECTION("Alpaka N-dimensional kernels") {
0294
0295 auto const& devices = cms::alpakatools::devices<Platform>();
0296 if (devices.empty()) {
0297 INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE));
0298 REQUIRE(not devices.empty());
0299 }
0300
0301
0302
0303 std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
0304 testVectorAddKernel(10000, 32, 32, VectorAddKernel{});
0305
0306
0307
0308 std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
0309 testVectorAddKernel(100, 1, 1024, VectorAddKernel{});
0310
0311
0312
0313 std::cout << "Test 1D vector addition with small block size\n";
0314 testVectorAddKernelND<Dim1D>({10000}, {32}, {32}, VectorAddKernel1D{});
0315
0316
0317
0318 std::cout << "Test 1D vector addition with large block size\n";
0319 testVectorAddKernelND<Dim1D>({100}, {1}, {1024}, VectorAddKernel1D{});
0320
0321
0322
0323 std::cout << "Test 2D vector addition with small block size\n";
0324 testVectorAddKernelND<Dim2D>({400, 250}, {4, 4}, {16, 16}, VectorAddKernel2D{});
0325
0326
0327
0328 std::cout << "Test 2D vector addition with large block size\n";
0329 testVectorAddKernelND<Dim2D>({20, 20}, {1, 1}, {32, 32}, VectorAddKernel2D{});
0330
0331
0332
0333 std::cout << "Test 3D vector addition with small block size\n";
0334 testVectorAddKernelND<Dim3D>({50, 125, 16}, {5, 5, 1}, {4, 4, 4}, VectorAddKernel3D{});
0335
0336
0337
0338 std::cout << "Test 3D vector addition with large block size\n";
0339 testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
0340
0341
0342
0343 std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
0344 testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});
0345
0346
0347
0348 std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
0349 testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
0350
0351
0352
0353 std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
0354 testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});
0355
0356
0357
0358 std::cout << "Test 1D vector single-threaded seria addition with large block size, using scalar dimensions\n";
0359 testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});
0360
0361
0362
0363 std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
0364 testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});
0365
0366
0367
0368 std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
0369 testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});
0370 }
0371 }