test/alpaka/testIndependentKernel.dev.cc

0001 #include <cstdio>
0002 #include <random>
0003
0004 #include <alpaka/alpaka.hpp>
0005
0006 #define CATCH_CONFIG_MAIN
0007 #include <catch.hpp>
0008
0009 #include "FWCore/Utilities/interface/stringize.h"
0010 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0013
0014 // each test binary is built for a single Alpaka backend
0015 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0016
0017 /* Add the group id to te value of each element in the group.
0018  * Each group is composed by the elements first[group]..first[group+1]-1 .
0019  */
0020 struct IndependentWorkKernel {
0021   template <typename TAcc, typename T>
0022   ALPAKA_FN_ACC void operator()(TAcc const& acc,
0023                                 T const* __restrict__ in,
0024                                 T* __restrict__ out,
0025                                 size_t const* __restrict__ indices,
0026                                 size_t groups) const {
0027     for (auto group : cms::alpakatools::independent_groups(acc, groups)) {
0028       size_t first = indices[group];
0029       size_t last = indices[group + 1];
0030       size_t size = last - first;
0031       for (auto index : cms::alpakatools::independent_group_elements(acc, size)) {
0032         out[first + index] = in[first + index] + group;
0033       }
0034     }
0035   }
0036 };
0037
0038 /* Test the IndependentWorkKernel kernel on all devices
0039  */
0040 template <typename TKernel>
0041 void testIndependentWorkKernel(size_t groups, size_t grid_size, size_t block_size, TKernel kernel) {
0042   // random number generator with a gaussian distribution
0043   std::random_device rd{};
0044   std::default_random_engine engine{rd()};
0045
0046   // uniform distribution
0047   std::uniform_int_distribution<size_t> random_size{100, 201};
0048
0049   // gaussian distribution
0050   std::normal_distribution<float> dist{0., 1.};
0051
0052   // build the groups
0053   std::vector<size_t> sizes(groups);
0054   auto indices_h = cms::alpakatools::make_host_buffer<size_t[], Platform>(groups + 1);
0055   indices_h[0] = 0;
0056   for (size_t i = 0; i < groups; ++i) {
0057     auto size = random_size(engine);
0058     sizes[i] = size;
0059     indices_h[i + 1] = indices_h[i] + size;
0060   }
0061
0062   // tolerance
0063   constexpr float epsilon = 0.000001;
0064
0065   // buffer size
0066   const size_t size = indices_h[groups];
0067
0068   // allocate the input and output host buffer in pinned memory accessible by the Platform devices
0069   auto in_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0070   auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
0071
0072   // fill the input buffers with random data, and the output buffer with zeros
0073   for (size_t i = 0; i < size; ++i) {
0074     in_h[i] = dist(engine);
0075     out_h[i] = 0;
0076   }
0077
0078   // run the test on each device
0079   for (auto const& device : cms::alpakatools::devices<Platform>()) {
0080     std::cout << "Test IndependentWorkKernel on " << alpaka::getName(device) << " over " << size << " elements in "
0081               << groups << " independent groups with " << grid_size << " blocks of " << block_size << " elements\n";
0082     auto queue = Queue(device);
0083
0084     // allocate input and output buffers on the device
0085     auto indices_d = cms::alpakatools::make_device_buffer<size_t[]>(queue, groups + 1);
0086     auto in_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0087     auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
0088
0089     // copy the input data to the device; the size is known from the buffer objects
0090     alpaka::memcpy(queue, indices_d, indices_h);
0091     alpaka::memcpy(queue, in_d, in_h);
0092
0093     // fill the output buffer with zeros; the size is known from the buffer objects
0094     alpaka::memset(queue, out_d, 0.);
0095
0096     // launch the 1-dimensional kernel with independent work groups
0097     auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
0098     alpaka::exec<Acc1D>(queue, div, kernel, in_d.data(), out_d.data(), indices_d.data(), groups);
0099
0100     // copy the results from the device to the host
0101     alpaka::memcpy(queue, out_h, out_d);
0102
0103     // wait for all the operations to complete
0104     alpaka::wait(queue);
0105
0106     // check the results
0107     for (size_t g = 0; g < groups; ++g) {
0108       size_t first = indices_h[g];
0109       size_t last = indices_h[g + 1];
0110       for (size_t i = first; i < last; ++i) {
0111         float sum = in_h[i] + g;
0112         float delta = std::max(std::fabs(sum) * epsilon, epsilon);
0113         REQUIRE(out_h[i] < sum + delta);
0114         REQUIRE(out_h[i] > sum - delta);
0115       }
0116     }
0117   }
0118 }
0119
0120 TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
0121           "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
0122   SECTION("Independent work groups") {
0123     // get the list of devices on the current platform
0124     auto const& devices = cms::alpakatools::devices<Platform>();
0125     if (devices.empty()) {
0126       FAIL("No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0127            "the test will be skipped.");
0128     }
0129
0130     // launch the independent work kernel with a small block size and a small number of blocks;
0131     // this relies on the kernel to loop over the "problem space" and do more work per block
0132     std::cout << "Test independent work kernel with small block size, using scalar dimensions\n";
0133     testIndependentWorkKernel(100, 32, 32, IndependentWorkKernel{});
0134
0135     // launch the independent work kernel with a large block size and a single block;
0136     // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
0137     std::cout << "Test independent work kernel with large block size, using scalar dimensions\n";
0138     testIndependentWorkKernel(100, 1, 1024, IndependentWorkKernel{});
0139
0140     // launch the independent work kernel with a large block size and a large number of blocks;
0141     // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
0142     std::cout << "Test independent work kernel with large block size, using scalar dimensions\n";
0143     testIndependentWorkKernel(100, 1024, 1024, IndependentWorkKernel{});
0144   }
0145 }