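// CMSSW HeterogeneousCore/AlpakaInterface test: element-wise vector addition with alpaka
// kernels. Each supported Alpaka backend builds its own copy of this Catch2 test binary.
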
#include <algorithm>  // std::min
#include <cstdio>
#include <iostream>  // std::cout
#include <random>

#include <alpaka/alpaka.hpp>

#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

// each test binary is built for a single Alpaka backend
using namespace ALPAKA_ACCELERATOR_NAMESPACE;

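/* The kernels below rely on the loop helpers from
 * "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" instead of computing thread
 * indices by hand: cms::alpakatools::elements_with_stride(acc, size) visits every element
 * index in [0, size), striding over the grid, so the same kernel works whether the launch
 * configuration covers the problem space exactly, is smaller (each thread loops more than
 * once) or is larger (out-of-range indices are skipped). As a rough sketch, not the actual
 * implementation, the loop behaves like
 *
 *   for (auto index = firstElementOfThisThread; index < size; index += gridStride) { ... }
 *
 * where firstElementOfThisThread and gridStride are illustrative names derived from the
 * accelerator work division.
 */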
struct VectorAddKernel {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    for (auto index : cms::alpakatools::elements_with_stride(acc, size)) {
      out[index] = in1[index] + in2[index];
    }
  }
};

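/* The N-dimensional variants below take the problem size as a Vec1D/Vec2D/Vec3D and iterate
 * with cms::alpakatools::elements_with_stride_nd(acc, size), which yields N-dimensional
 * indices. The buffers themselves are flat, so each kernel linearises the index in row-major
 * order (the last dimension is the fastest-changing one), e.g. in 2D:
 * index = ndindex[0] * size[1] + ndindex[1].
 */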
struct VectorAddKernel1D {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec1D size) const {
    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
      auto index = ndindex[0];
      out[index] = in1[index] + in2[index];
    }
  }
};

struct VectorAddKernel2D {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const {
    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
      auto index = ndindex[0] * size[1] + ndindex[1];
      out[index] = in1[index] + in2[index];
    }
  }
};

struct VectorAddKernel3D {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const {
    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
      auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2];
      out[index] = in1[index] + in2[index];
    }
  }
};

/* This is not an efficient approach; it is only a test of using dynamic shared memory,
 * split block and element loops, and block-level synchronisation
 */

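/* Each iteration of blocks_with_stride(acc, size) corresponds to one "logical" block of work;
 * within it, elements_in_block(acc, block, size) yields index pairs whose .local field (offset
 * inside the block) addresses the shared-memory buffer and whose .global field (offset in the
 * full problem space) addresses the input and output arrays. The size of the dynamic
 * shared-memory buffer returned by alpaka::getDynSharedMem<T>(acc) is not set here: alpaka
 * obtains it from the BlockSharedMemDynSizeBytes trait specialised for this kernel further
 * down in this file.
 */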
struct VectorAddBlockKernel {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    // block size
    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
    // get the dynamic shared memory buffer
    T* buffer = alpaka::getDynSharedMem<T>(acc);
    // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
    // the inner loop is needed for backends that use more than one element per thread
    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
      // only one thread per block: initialise the shared memory
      if (cms::alpakatools::once_per_block(acc)) {
        // not really necessary, just to show how to use "once_per_block"
        for (Idx local = 0; local < blockSize; ++local)
          buffer[local] = 0.;
      }
      // synchronise all threads in the block
      alpaka::syncBlockThreads(acc);
      // read the first set of data into shared memory
      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
        buffer[index.local] = in1[index.global];
      }
      // synchronise all threads in the block
      alpaka::syncBlockThreads(acc);
      // add the second set of data into shared memory
      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
        buffer[index.local] += in2[index.global];
      }
      // synchronise all threads in the block
      alpaka::syncBlockThreads(acc);
      // store the results into global memory
      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
        out[index.global] = buffer[index.local];
      }
    }
  }
};

/* Run all operations in a single thread.
 * Written in an inefficient way to test "once_per_grid".
 */

struct VectorAddKernelSerial {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    // the operations are performed by a single thread
    if (cms::alpakatools::once_per_grid(acc)) {
      for (Idx index = 0; index < size; ++index) {
        out[index] += in1[index];
        out[index] += in2[index];
      }
    }
  }
};

/* Run all operations in one thread per block.
 * Written in an inefficient way to test "once_per_block".
 */

struct VectorAddKernelBlockSerial {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
    // block size
    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
    // the loop is used to repeat the "block" as many times as needed to cover the whole problem space
    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
      // the operations are performed by a single thread in each "logical" block
      const auto first = blockSize * block;
      const auto range = std::min<size_t>(first + blockSize, size);
      if (cms::alpakatools::once_per_block(acc)) {
        for (Idx index = first; index < range; ++index) {
          out[index] += in1[index];
          out[index] += in2[index];
        }
      }
    }
  }
};

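/* When a kernel is launched with alpaka::exec, alpaka looks up the BlockSharedMemDynSizeBytes
 * trait for the kernel type and calls getBlockSharedMemDynSizeBytes() with the block thread
 * and element extents plus the kernel arguments; the returned value is the number of bytes of
 * dynamic shared memory reserved per block. For VectorAddBlockKernel one T per element in a
 * block is enough for the buffer obtained with getDynSharedMem<T>().
 */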
namespace alpaka::trait {
  // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
  // block shared dynamic memory for the VectorAddBlockKernel kernel
  template <typename TAcc>
  struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc> {
    // the size in bytes of the shared memory allocated for a block
    template <typename T>
    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& /* kernel */,
                                                                        Vec1D threads,
                                                                        Vec1D elements,
                                                                        T const* __restrict__ /* in1 */,
                                                                        T const* __restrict__ /* in2 */,
                                                                        T* __restrict__ /* out */,
                                                                        size_t size) {
      return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
    }
  };
}  // namespace alpaka::trait

// test the 1-dimensional kernel on all devices
template <typename TKernel>
void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
  // random number generator with a gaussian distribution
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0., 1.};

  // tolerance
  constexpr float epsilon = 0.000001;

  // buffer size
  const size_t size = problem_size;

  // allocate input and output host buffers in pinned memory accessible by the Platform devices
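  // Note: make_host_buffer<float[], Platform>(size) returns host memory suitable for
  // asynchronous copies to the Platform's devices (typically pinned / page-locked memory for
  // GPU backends, plain host memory for the CPU backend), while make_device_buffer<float[]>(queue, size)
  // used below allocates the corresponding buffer on the queue's device.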
  auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);

  // fill the input buffers with random data, and the output buffer with zeros
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
    out_h[i] = 0.;
  }

  // run the test on each device
  for (auto const& device : cms::alpakatools::devices<Platform>()) {
    std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
              << grid_size << " blocks of " << block_size << " elements\n";
    auto queue = Queue(device);

    // allocate input and output buffers on the device
    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

    // copy the input data to the device; the size is known from the buffer objects
    alpaka::memcpy(queue, in1_d, in1_h);
    alpaka::memcpy(queue, in2_d, in2_h);

    // fill the output buffer with zeros; the size is known from the buffer objects
    alpaka::memset(queue, out_d, 0.);

    // launch the 1-dimensional kernel with scalar size
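    // make_workdiv<Acc1D>(blocks, elements) builds the alpaka work division for the launch;
    // roughly speaking (this is a simplification of the logic in workdivision.h), on backends
    // that run many threads per block (e.g. GPUs) "elements" becomes the number of threads per
    // block with one element per thread, while on backends with a single thread per block
    // (e.g. the serial CPU backend) it becomes the number of elements processed per thread.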
    auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
    alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);

    // copy the results from the device to the host
    alpaka::memcpy(queue, out_h, out_d);

    // wait for all the operations to complete
    alpaka::wait(queue);

    // check the results
    for (size_t i = 0; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      REQUIRE(out_h[i] < sum + epsilon);
      REQUIRE(out_h[i] > sum - epsilon);
    }
  }
}

// test the N-dimensional kernels on all devices
template <typename TDim, typename TKernel>
void testVectorAddKernelND(Vec<TDim> problem_size, Vec<TDim> grid_size, Vec<TDim> block_size, TKernel kernel) {
  // random number generator with a gaussian distribution
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0., 1.};

  // tolerance
  constexpr float epsilon = 0.000001;

  // linearised buffer size
  const size_t size = problem_size.prod();

  // allocate input and output host buffers in pinned memory accessible by the Platform devices
  auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);

  // fill the input buffers with random data, and the output buffer with zeros
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
    out_h[i] = 0.;
  }

  // run the test on each device
  for (auto const& device : cms::alpakatools::devices<Platform>()) {
    std::cout << "Test " << TDim::value << "D vector addition on " << alpaka::getName(device) << " over "
              << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
    auto queue = Queue(device);

    // allocate input and output buffers on the device
    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

    // copy the input data to the device; the size is known from the buffer objects
    alpaka::memcpy(queue, in1_d, in1_h);
    alpaka::memcpy(queue, in2_d, in2_h);

    // fill the output buffer with zeros; the size is known from the buffer objects
    alpaka::memset(queue, out_d, 0.);

    // launch the N-dimensional kernel
    using AccND = Acc<TDim>;
    auto div = cms::alpakatools::make_workdiv<AccND>(grid_size, block_size);
    alpaka::exec<AccND>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);

    // copy the results from the device to the host
    alpaka::memcpy(queue, out_h, out_d);

    // wait for all the operations to complete
    alpaka::wait(queue);

    // check the results
    for (size_t i = 0; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      REQUIRE(out_h[i] < sum + epsilon);
      REQUIRE(out_h[i] > sum - epsilon);
    }
  }
}

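// Catch2 test driver. Each Alpaka backend is compiled into its own test binary (see the
// "using namespace ALPAKA_ACCELERATOR_NAMESPACE" above); EDM_STRINGIZE expands the backend
// namespace macro into a string literal, so the test name and tag carry the backend name.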
TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
  SECTION("Alpaka N-dimensional kernels") {
    // get the list of devices on the current platform
    auto const& devices = cms::alpakatools::devices<Platform>();
    if (devices.empty()) {
      INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE));
      REQUIRE(not devices.empty());
    }

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddKernel{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddKernel{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector addition with small block size\n";
    testVectorAddKernelND<Dim1D>({10000}, {32}, {32}, VectorAddKernel1D{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector addition with large block size\n";
    testVectorAddKernelND<Dim1D>({100}, {1}, {1024}, VectorAddKernel1D{});

    // launch the 2-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 2D vector addition with small block size\n";
    testVectorAddKernelND<Dim2D>({400, 250}, {4, 4}, {16, 16}, VectorAddKernel2D{});

    // launch the 2-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 2D vector addition with large block size\n";
    testVectorAddKernelND<Dim2D>({20, 20}, {1, 1}, {32, 32}, VectorAddKernel2D{});

    // launch the 3-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 3D vector addition with small block size\n";
    testVectorAddKernelND<Dim3D>({50, 125, 16}, {5, 5, 1}, {4, 4, 4}, VectorAddKernel3D{});

    // launch the 3-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 3D vector addition with large block size\n";
    testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector single-threaded serial addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});

    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
    // this relies on the kernel to loop over the "problem space" and do more work per block
    std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
    testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});

    // launch the 1-dimensional kernel with a large block size and a single block;
    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
    std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
    testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});
  }
}