#include <algorithm>
#include <cstdio>
#include <iostream>
#include <random>

#include <alpaka/alpaka.hpp>

#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include "FWCore/Utilities/interface/stringize.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

// each test binary is built for a single Alpaka backend
using namespace ALPAKA_ACCELERATOR_NAMESPACE;

struct VectorAddKernel {
  template <typename T>
  ALPAKA_FN_ACC void operator()(
      Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    for (auto index : cms::alpakatools::uniform_elements(acc, size)) {
      out[index] = in1[index] + in2[index];
    }
  }
};

struct VectorAddKernelSkip {
  template <typename T>
  ALPAKA_FN_ACC void operator()(Acc1D const& acc,
                                T const* __restrict__ in1,
                                T const* __restrict__ in2,
                                T* __restrict__ out,
                                size_t first,
                                size_t size) const {
    for (auto index : cms::alpakatools::uniform_elements(acc, first, size)) {
      out[index] = in1[index] + in2[index];
    }
  }
};

struct VectorAddKernel1D {
  template <typename T>
  ALPAKA_FN_ACC void operator()(
      Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec1D size) const {
    for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) {
      auto index = ndindex[0];
      out[index] = in1[index] + in2[index];
    }
  }
};

struct VectorAddKernel2D {
  template <typename T>
  ALPAKA_FN_ACC void operator()(
      Acc2D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const {
    for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) {
      auto index = ndindex[0] * size[1] + ndindex[1];
      out[index] = in1[index] + in2[index];
    }
  }
};

struct VectorAddKernel3D {
  template <typename T>
  ALPAKA_FN_ACC void operator()(
      Acc3D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const {
    for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) {
      auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2];
      out[index] = in1[index] + in2[index];
    }
  }
};

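// A minimal sketch (hypothetical helper, not used by the tests) of the row-major linearisation
// applied by VectorAddKernel2D and VectorAddKernel3D above: dimension 0 varies slowest, the last
// dimension fastest.
constexpr size_t linear_index_3d(size_t i0, size_t i1, size_t i2, size_t n1, size_t n2) {
  // same formula as in VectorAddKernel3D: index = (i0 * n1 + i1) * n2 + i2
  return (i0 * n1 + i1) * n2 + i2;
}
// e.g. in a {50, 125, 16} problem, the element at {1, 2, 3} maps to (1 * 125 + 2) * 16 + 3 = 2035
static_assert(linear_index_3d(1, 2, 3, 125, 16) == 2035, "row-major linearisation");
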
/* This is not an efficient approach; it is only a test of using dynamic shared memory,
 * split block and element loops, and block-level synchronisation
 */

struct VectorAddBlockKernel {
  template <typename T>
  ALPAKA_FN_ACC void operator()(
      Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    // block size
    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
    // get the dynamic shared memory buffer
    T* buffer = alpaka::getDynSharedMem<T>(acc);
    // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
    // the inner loop is needed for backends that use more than one element per thread
    for (auto block : cms::alpakatools::uniform_groups(acc, size)) {
      // only one thread per block: initialise the shared memory
      if (cms::alpakatools::once_per_block(acc)) {
        // not really necessary, just to show how to use "once_per_block"
        for (Idx local = 0; local < blockSize; ++local)
          buffer[local] = 0.;
      }
      // synchronise all threads in the block
      alpaka::syncBlockThreads(acc);
      // read the first set of data into shared memory
      for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) {
        buffer[index.local] = in1[index.global];
      }
      // synchronise all threads in the block
      alpaka::syncBlockThreads(acc);
      // add the second set of data into shared memory
      for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) {
        buffer[index.local] += in2[index.global];
      }
      // synchronise all threads in the block
      alpaka::syncBlockThreads(acc);
      // store the results into global memory
      for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) {
        out[index.global] = buffer[index.local];
      }
    }
  }
};

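// A minimal sketch of the index mapping assumed by the kernel above: for a "logical block" b of
// blockSize elements, uniform_group_elements is expected to yield pairs where index.local is the
// position within the block (and within the shared memory buffer) and index.global is the
// corresponding position in the problem space, clipped at the problem size. The numbers below are
// only a worked example of that arithmetic, using the 32-element blocks from the tests below.
// e.g. with blockSize = 32, block 2, local element 5 corresponds to global element 2 * 32 + 5 = 69
static_assert(2 * 32 + 5 == 69, "global index = block * blockSize + local index");
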
/* Run all operations in a single thread.
 * Written in an inefficient way to test "once_per_grid".
 */

struct VectorAddKernelSerial {
  template <typename T>
  ALPAKA_FN_ACC void operator()(
      Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    // the operations are performed by a single thread
    if (cms::alpakatools::once_per_grid(acc)) {
      for (Idx index = 0; index < size; ++index) {
        out[index] += in1[index];
        out[index] += in2[index];
      }
    }
  }
};

/* Run all operations in one thread per block.
 * Written in an inefficient way to test "once_per_block".
 */

struct VectorAddKernelBlockSerial {
  template <typename T>
  ALPAKA_FN_ACC void operator()(
      Acc1D const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    // block size
    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
    // the loop is used to repeat the "block" as many times as needed to cover the whole problem space
    for (auto block : cms::alpakatools::uniform_groups(acc, size)) {
      // the operations are performed by a single thread in each "logical" block
      const auto first = blockSize * block;
      const auto range = std::min<size_t>(first + blockSize, size);
      if (cms::alpakatools::once_per_block(acc)) {
        for (Idx index = first; index < range; ++index) {
          out[index] += in1[index];
          out[index] += in2[index];
        }
      }
    }
  }
};

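// A worked example of the per-block range computed by VectorAddKernelBlockSerial above, assuming a
// hypothetical configuration with blockSize = 32 and size = 100: block 3 starts at
// first = 3 * 32 = 96 and stops at min(96 + 32, 100) = 100, so it handles indices 96..99.
static_assert(3 * 32 == 96, "first element of block 3");
static_assert(96 + 32 > 100, "the last block is clipped at the problem size");
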
namespace alpaka::trait {
  // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
  // block shared dynamic memory for the VectorAddBlockKernel kernel
  template <>
  struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, Acc1D> {
    // the size in bytes of the shared memory allocated for a block
    template <typename T>
    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& /* kernel */,
                                                                        Vec1D threads,
                                                                        Vec1D elements,
                                                                        T const* __restrict__ /* in1 */,
                                                                        T const* __restrict__ /* in2 */,
                                                                        T* __restrict__ /* out */,
                                                                        size_t size) {
      return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
    }
  };
}  // namespace alpaka::trait

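// A minimal sketch of the shared memory size requested by the trait above, assuming float data and,
// for example, the 32-element blocks used in the small-block-size tests below: whether the backend
// runs 32 threads with 1 element each or 1 thread with 32 elements, the product is the same.
static_assert(32 * 1 * sizeof(float) == 128, "32 threads x 1 element of float per block");
static_assert(1 * 32 * sizeof(float) == 128, "1 thread x 32 elements of float per block");
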
// test the 1-dimensional kernel on all devices
template <typename TKernel>
void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
  // random number generator with a gaussian distribution
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0., 1.};

  // tolerance
  constexpr float epsilon = 0.000001;

  // buffer size
  const size_t size = problem_size;

  // allocate input and output host buffers in pinned memory accessible by the Platform devices
  auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);

  // fill the input buffers with random data, and the output buffer with zeros
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
    out_h[i] = 0.;
  }

  // run the test on each device
  for (auto const& device : cms::alpakatools::devices<Platform>()) {
    std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
              << grid_size << " blocks of " << block_size << " elements\n";
    auto queue = Queue(device);

    // allocate input and output buffers on the device
    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

    // copy the input data to the device; the size is known from the buffer objects
    alpaka::memcpy(queue, in1_d, in1_h);
    alpaka::memcpy(queue, in2_d, in2_h);

    // fill the output buffer with zeros; the size is known from the buffer objects
    alpaka::memset(queue, out_d, 0.);

    // launch the 1-dimensional kernel with scalar size
    auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
    alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);

    // copy the results from the device to the host
    alpaka::memcpy(queue, out_h, out_d);

    // wait for all the operations to complete
    alpaka::wait(queue);

    // check the results
    for (size_t i = 0; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      REQUIRE(out_h[i] < sum + epsilon);
      REQUIRE(out_h[i] > sum - epsilon);
    }
  }
}

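// A hypothetical direct invocation of the helper above, shown only as a usage sketch; the TEST_CASE
// at the end of the file exercises the full set of configurations:
//   testVectorAddKernel(4096, 8, 64, VectorAddKernel{});  // 4096 values, 8 blocks of 64 elements
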
// test the 1-dimensional kernel on all devices, potentially skipping some elements
template <typename TKernel>
void testVectorAddKernelSkip(std::size_t skip_elements,
                             std::size_t problem_size,
                             std::size_t grid_size,
                             std::size_t block_size,
                             TKernel kernel) {
  // random number generator with a gaussian distribution
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0., 1.};

  // tolerance
  constexpr float epsilon = 0.000001;

  // buffer size
  const size_t size = problem_size;

  // allocate input and output host buffers in pinned memory accessible by the Platform devices
  auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);

  // fill the input buffers with random data, and the output buffer with zeros
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
    out_h[i] = 0.;
  }

  // run the test on each device
  for (auto const& device : cms::alpakatools::devices<Platform>()) {
    std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " skipping " << skip_elements << " over "
              << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
    auto queue = Queue(device);

    // allocate input and output buffers on the device
    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

    // copy the input data to the device; the size is known from the buffer objects
    alpaka::memcpy(queue, in1_d, in1_h);
    alpaka::memcpy(queue, in2_d, in2_h);

    // fill the output buffer with zeros; the size is known from the buffer objects
    alpaka::memset(queue, out_d, 0.);

    // launch the 1-dimensional kernel with scalar size
    auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
    alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), skip_elements, size);

    // copy the results from the device to the host
    alpaka::memcpy(queue, out_h, out_d);

    // wait for all the operations to complete
    alpaka::wait(queue);

    // check that the skipped elements were left untouched
    for (size_t i = 0; i < skip_elements; ++i) {
      REQUIRE(out_h[i] == 0);
    }
    // check the results for the remaining elements
    for (size_t i = skip_elements; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      REQUIRE(out_h[i] < sum + epsilon);
      REQUIRE(out_h[i] > sum - epsilon);
    }
  }
}

// test the N-dimensional kernels on all devices
template <typename TDim, typename TKernel>
void testVectorAddKernelND(Vec<TDim> problem_size, Vec<TDim> grid_size, Vec<TDim> block_size, TKernel kernel) {
  // random number generator with a gaussian distribution
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0., 1.};

  // tolerance
  constexpr float epsilon = 0.000001;

  // linearised buffer size
  const size_t size = problem_size.prod();

  // allocate input and output host buffers in pinned memory accessible by the Platform devices
  auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);

  // fill the input buffers with random data, and the output buffer with zeros
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
    out_h[i] = 0.;
  }

  // run the test on each device
  for (auto const& device : cms::alpakatools::devices<Platform>()) {
    std::cout << "Test " << TDim::value << "D vector addition on " << alpaka::getName(device) << " over "
              << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
    auto queue = Queue(device);

    // allocate input and output buffers on the device
    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

    // copy the input data to the device; the size is known from the buffer objects
    alpaka::memcpy(queue, in1_d, in1_h);
    alpaka::memcpy(queue, in2_d, in2_h);

    // fill the output buffer with zeros; the size is known from the buffer objects
    alpaka::memset(queue, out_d, 0.);

    // launch the N-dimensional kernel
    using AccND = Acc<TDim>;
    auto div = cms::alpakatools::make_workdiv<AccND>(grid_size, block_size);
    alpaka::exec<AccND>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);

    // copy the results from the device to the host
    alpaka::memcpy(queue, out_h, out_d);

    // wait for all the operations to complete
    alpaka::wait(queue);

    // check the results
    for (size_t i = 0; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      REQUIRE(out_h[i] < sum + epsilon);
      REQUIRE(out_h[i] > sum - epsilon);
    }
  }
}

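// A worked example of the linearised buffer size computed above via problem_size.prod(), using the
// configurations from the TEST_CASE below: the 2D problem {400, 250} and the 3D problem
// {50, 125, 16} both flatten to 100000 elements.
static_assert(400 * 250 == 100000, "2D problem size, linearised");
static_assert(50 * 125 * 16 == 100000, "3D problem size, linearised");
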
TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
  SECTION("Alpaka N-dimensional kernels") {
    // get the list of devices on the current platform
    auto const& devices = cms::alpakatools::devices<Platform>();
    if (devices.empty()) {
      FAIL("No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
           "the test will be skipped.");
    }

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddKernel{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddKernel{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector addition with small block size\n";
    testVectorAddKernelND<Dim1D>({10000}, {32}, {32}, VectorAddKernel1D{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector addition with large block size\n";
    testVectorAddKernelND<Dim1D>({100}, {1}, {1024}, VectorAddKernel1D{});

    // launch the 2-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 2D vector addition with small block size\n";
    testVectorAddKernelND<Dim2D>({400, 250}, {4, 4}, {16, 16}, VectorAddKernel2D{});

    // launch the 2-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 2D vector addition with large block size\n";
    testVectorAddKernelND<Dim2D>({20, 20}, {1, 1}, {32, 32}, VectorAddKernel2D{});

    // launch the 3-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 3D vector addition with small block size\n";
    testVectorAddKernelND<Dim3D>({50, 125, 16}, {5, 5, 1}, {4, 4, 4}, VectorAddKernel3D{});

    // launch the 3-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 3D vector addition with large block size\n";
    testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector single-threaded serial addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks,
    // skipping the first elements; this relies on the kernel to loop over the "problem space"
    // and do more work per block
    std::cout << "Test 1D vector addition with small block size, using scalar dimensions, skipping the first elements\n";
    testVectorAddKernelSkip(20, 10000, 32, 32, VectorAddKernelSkip{});

    // launch the 1-dimensional kernel with a large block size and a single block, skipping the
    // first elements; this relies on the kernel to check the size of the "problem space" and
    // avoid accessing out-of-bounds data
    std::cout << "Test 1D vector addition with large block size, using scalar dimensions, skipping the first elements\n";
    testVectorAddKernelSkip(20, 100, 1, 1024, VectorAddKernelSkip{});
  }
}