plugins/alpaka/TestAlgo.dev.cc

0001 // Check that ALPAKA_HOST_ONLY is not defined during device compilation:
0002 #ifdef ALPAKA_HOST_ONLY
0003 #error ALPAKA_HOST_ONLY defined in device compilation
0004 #endif
0005
0006 #include <alpaka/alpaka.hpp>
0007
0008 #include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
0009 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0010 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0011
0012 #include "TestAlgo.h"
0013
0014 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0015
0016   using namespace cms::alpakatools;
0017
0018   class TestAlgoKernel {
0019   public:
0020     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0021                                   portabletest::TestDeviceCollection::View view,
0022                                   double xvalue) const {
0023       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0024       const portabletest::Array flags = {{6, 4, 2, 0}};
0025
0026       // set this only once in the whole kernel grid
0027       if (once_per_grid(acc)) {
0028         view.r() = 1.;
0029       }
0030
0031       // make a strided loop over the kernel grid, covering up to "size" elements
0032       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0033         view[i] = {xvalue, 0., 0., i, flags, matrix * i};
0034       }
0035     }
0036   };
0037
0038   class TestAlgoMultiKernel2 {
0039   public:
0040     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0041                                   portabletest::TestDeviceMultiCollection2::View<1> view,
0042                                   double xvalue) const {
0043       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0044
0045       // set this only once in the whole kernel grid
0046       if (once_per_grid(acc)) {
0047         view.r2() = 2.;
0048       }
0049
0050       // make a strided loop over the kernel grid, covering up to "size" elements
0051       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0052         view[i] = {xvalue, 0., 0., i, matrix * i};
0053       }
0054     }
0055   };
0056
0057   class TestAlgoMultiKernel3 {
0058   public:
0059     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0060                                   portabletest::TestDeviceMultiCollection3::View<2> view,
0061                                   double xvalue) const {
0062       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0063
0064       // set this only once in the whole kernel grid
0065       if (once_per_grid(acc)) {
0066         view.r3() = 3.;
0067       }
0068
0069       // make a strided loop over the kernel grid, covering up to "size" elements
0070       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0071         view[i] = {xvalue, 0., 0., i, matrix * i};
0072       }
0073     }
0074   };
0075
0076   void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection, double xvalue) const {
0077     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0078     uint32_t items = 64;
0079
0080     // use as many groups as needed to cover the whole problem
0081     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0082
0083     // map items to
0084     //   - threads with a single element per thread on a GPU backend
0085     //   - elements within a single thread on a CPU backend
0086     auto workDiv = make_workdiv<Acc1D>(groups, items);
0087
0088     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), xvalue);
0089   }
0090
0091   void TestAlgo::fillMulti2(Queue& queue, portabletest::TestDeviceMultiCollection2& collection, double xvalue) const {
0092     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0093     uint32_t items = 64;
0094
0095     // use as many groups as needed to cover the whole problem
0096     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0097     uint32_t groups2 = divide_up_by(collection.view<1>().metadata().size(), items);
0098
0099     // map items to
0100     //   - threads with a single element per thread on a GPU backend
0101     //   - elements within a single thread on a CPU backend
0102     auto workDiv = make_workdiv<Acc1D>(groups, items);
0103     auto workDiv2 = make_workdiv<Acc1D>(groups2, items);
0104
0105     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view<portabletest::TestSoA>(), xvalue);
0106     alpaka::exec<Acc1D>(queue, workDiv2, TestAlgoMultiKernel2{}, collection.view<portabletest::TestSoA2>(), xvalue);
0107   }
0108
0109   class TestAlgoStructKernel {
0110   public:
0111     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0112                                   portabletest::TestDeviceObject::Product* data,
0113                                   double x,
0114                                   double y,
0115                                   double z,
0116                                   int32_t id) const {
0117       // run on a single thread
0118       if (once_per_grid(acc)) {
0119         data->x = x;
0120         data->y = y;
0121         data->z = z;
0122         data->id = id;
0123       }
0124     }
0125   };
0126
0127   void TestAlgo::fillObject(
0128       Queue& queue, portabletest::TestDeviceObject& object, double x, double y, double z, int32_t id) const {
0129     // run on a single thread
0130     auto workDiv = make_workdiv<Acc1D>(1, 1);
0131
0132     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoStructKernel{}, object.data(), x, y, z, id);
0133   }
0134
0135   void TestAlgo::fillMulti3(Queue& queue, portabletest::TestDeviceMultiCollection3& collection, double xvalue) const {
0136     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0137     uint32_t items = 64;
0138
0139     // use as many groups as needed to cover the whole problem
0140     uint32_t groups = divide_up_by(collection.view<portabletest::TestSoA>().metadata().size(), items);
0141     uint32_t groups2 = divide_up_by(collection.view<portabletest::TestSoA2>().metadata().size(), items);
0142     uint32_t groups3 = divide_up_by(collection.view<portabletest::TestSoA3>().metadata().size(), items);
0143
0144     // map items to
0145     //   - threads with a single element per thread on a GPU backend
0146     //   - elements within a single thread on a CPU backend
0147     auto workDiv = make_workdiv<Acc1D>(groups, items);
0148     auto workDiv2 = make_workdiv<Acc1D>(groups2, items);
0149     auto workDiv3 = make_workdiv<Acc1D>(groups3, items);
0150
0151     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view<portabletest::TestSoA>(), xvalue);
0152     alpaka::exec<Acc1D>(queue, workDiv2, TestAlgoMultiKernel2{}, collection.view<portabletest::TestSoA2>(), xvalue);
0153     alpaka::exec<Acc1D>(queue, workDiv3, TestAlgoMultiKernel3{}, collection.view<portabletest::TestSoA3>(), xvalue);
0154   }
0155
0156   class TestAlgoKernelUpdate {
0157   public:
0158     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0159                                   portabletest::TestDeviceCollection::ConstView input,
0160                                   AlpakaESTestDataEDevice::ConstView esData,
0161                                   portabletest::TestDeviceCollection::View output) const {
0162       // set this only once in the whole kernel grid
0163       if (once_per_grid(acc)) {
0164         output.r() = input.r();
0165       }
0166
0167       // make a strided loop over the kernel grid, covering up to "size" elements
0168       for (int32_t i : uniform_elements(acc, output.metadata().size())) {
0169         double x = input[i].x();
0170         if (i < esData.size()) {
0171           x += esData.val(i) + esData.val2(i);
0172         }
0173         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0174       }
0175     }
0176
0177     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0178                                   portabletest::TestDeviceCollection::ConstView input,
0179                                   TestAlgo::UpdateInfo const* updateInfo,
0180                                   portabletest::TestDeviceCollection::View output) const {
0181       // set this only once in the whole kernel grid
0182       if (once_per_grid(acc)) {
0183         output.r() = input.r();
0184       }
0185
0186       // make a strided loop over the kernel grid, covering up to "size" elements
0187       for (int32_t i : uniform_elements(acc, output.metadata().size())) {
0188         double x = input[i].x();
0189         x += updateInfo->x;
0190         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0191       }
0192     }
0193   };
0194
0195   class TestAlgoKernelUpdateMulti2 {
0196   public:
0197     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0198                                   portabletest::TestSoA::ConstView input,
0199                                   portabletest::TestSoA2::ConstView input2,
0200                                   AlpakaESTestDataEDevice::ConstView esData,
0201                                   portabletest::TestSoA::View output,
0202                                   portabletest::TestSoA2::View output2) const {
0203       // set this only once in the whole kernel grid
0204       if (once_per_grid(acc)) {
0205         output.r() = input.r();
0206         output2.r2() = input2.r2();
0207       }
0208
0209       // make a strided loop over the kernel grid, covering up to "size" elements
0210       for (int32_t i : uniform_elements(acc, output.metadata().size())) {
0211         double x = input[i].x();
0212         if (i < esData.size()) {
0213           x += esData.val(i) + esData.val2(i);
0214         }
0215         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0216       }
0217       for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
0218         double x2 = input2[i].x2();
0219         if (i < esData.size()) {
0220           x2 += esData.val(i) + esData.val2(i);
0221         }
0222         output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
0223       }
0224     }
0225
0226     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0227                                   portabletest::TestSoA::ConstView input,
0228                                   portabletest::TestSoA2::ConstView input2,
0229                                   TestAlgo::UpdateInfo const* updateInfo,
0230                                   portabletest::TestSoA::View output,
0231                                   portabletest::TestSoA2::View output2) const {
0232       // set this only once in the whole kernel grid
0233       if (once_per_grid(acc)) {
0234         output.r() = input.r();
0235         output2.r2() = input2.r2();
0236       }
0237
0238       // make a strided loop over the kernel grid, covering up to "size" elements
0239       for (int32_t i : uniform_elements(acc, output.metadata().size())) {
0240         double x = input[i].x();
0241         x += updateInfo->x;
0242         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0243       }
0244       for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
0245         double x2 = input2[i].x2();
0246         x2 += updateInfo->x;
0247         output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
0248       }
0249     }
0250   };
0251
0252   class TestAlgoKernelUpdateMulti3 {
0253   public:
0254     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0255                                   portabletest::TestSoA::ConstView input,
0256                                   portabletest::TestSoA2::ConstView input2,
0257                                   portabletest::TestSoA3::ConstView input3,
0258                                   AlpakaESTestDataEDevice::ConstView esData,
0259                                   portabletest::TestSoA::View output,
0260                                   portabletest::TestSoA2::View output2,
0261                                   portabletest::TestSoA3::View output3) const {
0262       // set this only once in the whole kernel grid
0263       if (once_per_grid(acc)) {
0264         output.r() = input.r();
0265         output2.r2() = input2.r2();
0266         output3.r3() = input3.r3();
0267       }
0268
0269       // make a strided loop over the kernel grid, covering up to "size" elements
0270       for (int32_t i : uniform_elements(acc, output.metadata().size())) {
0271         double x = input[i].x();
0272         if (i < esData.size()) {
0273           x += esData.val(i) + esData.val2(i);
0274           if (0 == i)
0275             printf("Setting x[0] to %f\n", x);
0276         }
0277         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0278       }
0279       for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
0280         double x2 = input2[i].x2();
0281         if (i < esData.size()) {
0282           x2 += esData.val(i) + esData.val2(i);
0283         }
0284         output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
0285       }
0286       for (int32_t i : uniform_elements(acc, output3.metadata().size())) {
0287         double x3 = input3[i].x3();
0288         if (i < esData.size()) {
0289           x3 += esData.val(i) + esData.val2(i);
0290         }
0291         output3[i] = {x3, input3[i].y3(), input3[i].z3(), input3[i].id3(), input3[i].m3()};
0292       }
0293     }
0294
0295     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0296                                   portabletest::TestSoA::ConstView input,
0297                                   portabletest::TestSoA2::ConstView input2,
0298                                   portabletest::TestSoA3::ConstView input3,
0299                                   TestAlgo::UpdateInfo const* updateInfo,
0300                                   portabletest::TestSoA::View output,
0301                                   portabletest::TestSoA2::View output2,
0302                                   portabletest::TestSoA3::View output3) const {
0303       // set this only once in the whole kernel grid
0304       if (once_per_grid(acc)) {
0305         output.r() = input.r();
0306         output2.r2() = input2.r2();
0307         output3.r3() = input3.r3();
0308       }
0309
0310       // make a strided loop over the kernel grid, covering up to "size" elements
0311       for (int32_t i : uniform_elements(acc, output.metadata().size())) {
0312         double x = input[i].x();
0313         x += updateInfo->x;
0314         if (0 == i)
0315           printf("Setting x[0] to %f\n", x);
0316         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0317       }
0318       for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
0319         double x2 = input2[i].x2();
0320         x2 += updateInfo->x;
0321         output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
0322       }
0323       for (int32_t i : uniform_elements(acc, output3.metadata().size())) {
0324         double x3 = input3[i].x3();
0325         x3 += updateInfo->x;
0326         output3[i] = {x3, input3[i].y3(), input3[i].z3(), input3[i].id3(), input3[i].m3()};
0327       }
0328     }
0329   };
0330
0331   portabletest::TestDeviceCollection TestAlgo::update(Queue& queue,
0332                                                       portabletest::TestDeviceCollection const& input,
0333                                                       AlpakaESTestDataEDevice const& esData) const {
0334     portabletest::TestDeviceCollection collection{input->metadata().size(), queue};
0335
0336     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0337     uint32_t items = 64;
0338
0339     // use as many groups as needed to cover the whole problem
0340     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0341
0342     // map items to
0343     //   - threads with a single element per thread on a GPU backend
0344     //   - elements within a single thread on a CPU backend
0345     auto workDiv = make_workdiv<Acc1D>(groups, items);
0346
0347     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernelUpdate{}, input.view(), esData.view(), collection.view());
0348
0349     return collection;
0350   }
0351
0352   portabletest::TestDeviceMultiCollection2 TestAlgo::updateMulti2(Queue& queue,
0353                                                                   portabletest::TestDeviceMultiCollection2 const& input,
0354                                                                   AlpakaESTestDataEDevice const& esData) const {
0355     portabletest::TestDeviceMultiCollection2 collection{input.sizes(), queue};
0356
0357     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0358     uint32_t items = 64;
0359
0360     // use as many groups as needed to cover the whole problem
0361     auto sizes = collection.sizes();
0362     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0363
0364     // map items to
0365     //   - threads with a single element per thread on a GPU backend
0366     //   - elements within a single thread on a CPU backend
0367     auto workDiv = make_workdiv<Acc1D>(groups, items);
0368
0369     alpaka::exec<Acc1D>(queue,
0370                         workDiv,
0371                         TestAlgoKernelUpdateMulti2{},
0372                         input.view<portabletest::TestSoA>(),
0373                         input.view<portabletest::TestSoA2>(),
0374                         esData.view(),
0375                         collection.view<portabletest::TestSoA>(),
0376                         collection.view<portabletest::TestSoA2>());
0377
0378     return collection;
0379   }
0380
0381   portabletest::TestDeviceMultiCollection3 TestAlgo::updateMulti3(Queue& queue,
0382                                                                   portabletest::TestDeviceMultiCollection3 const& input,
0383                                                                   AlpakaESTestDataEDevice const& esData) const {
0384     portabletest::TestDeviceMultiCollection3 collection{input.sizes(), queue};
0385
0386     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0387     uint32_t items = 64;
0388
0389     // use as many groups as needed to cover the whole problem
0390     auto sizes = collection.sizes();
0391     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0392
0393     // map items to
0394     //   - threads with a single element per thread on a GPU backend
0395     //   - elements within a single thread on a CPU backend
0396     auto workDiv = make_workdiv<Acc1D>(groups, items);
0397
0398     alpaka::exec<Acc1D>(queue,
0399                         workDiv,
0400                         TestAlgoKernelUpdateMulti3{},
0401                         input.view<portabletest::TestSoA>(),
0402                         input.view<portabletest::TestSoA2>(),
0403                         input.view<portabletest::TestSoA3>(),
0404                         esData.view(),
0405                         collection.view<portabletest::TestSoA>(),
0406                         collection.view<portabletest::TestSoA2>(),
0407                         collection.view<portabletest::TestSoA3>());
0408
0409     return collection;
0410   }
0411
0412   portabletest::TestDeviceCollection TestAlgo::update(Queue& queue,
0413                                                       portabletest::TestDeviceCollection const& input,
0414                                                       UpdateInfo const* d_updateInfo) const {
0415     portabletest::TestDeviceCollection collection{input->metadata().size(), queue};
0416
0417     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0418     uint32_t items = 64;
0419
0420     // use as many groups as needed to cover the whole problem
0421     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0422
0423     // map items to
0424     //   - threads with a single element per thread on a GPU backend
0425     //   - elements within a single thread on a CPU backend
0426     auto workDiv = make_workdiv<Acc1D>(groups, items);
0427
0428     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernelUpdate{}, input.view(), d_updateInfo, collection.view());
0429
0430     return collection;
0431   }
0432
0433   portabletest::TestDeviceMultiCollection2 TestAlgo::updateMulti2(Queue& queue,
0434                                                                   portabletest::TestDeviceMultiCollection2 const& input,
0435                                                                   UpdateInfo const* d_updateInfo) const {
0436     portabletest::TestDeviceMultiCollection2 collection{input.sizes(), queue};
0437
0438     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0439     uint32_t items = 64;
0440
0441     // use as many groups as needed to cover the whole problem
0442     auto sizes = collection.sizes();
0443     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0444
0445     // map items to
0446     //   - threads with a single element per thread on a GPU backend
0447     //   - elements within a single thread on a CPU backend
0448     auto workDiv = make_workdiv<Acc1D>(groups, items);
0449
0450     alpaka::exec<Acc1D>(queue,
0451                         workDiv,
0452                         TestAlgoKernelUpdateMulti2{},
0453                         input.view<portabletest::TestSoA>(),
0454                         input.view<portabletest::TestSoA2>(),
0455                         d_updateInfo,
0456                         collection.view<portabletest::TestSoA>(),
0457                         collection.view<portabletest::TestSoA2>());
0458
0459     return collection;
0460   }
0461
0462   portabletest::TestDeviceMultiCollection3 TestAlgo::updateMulti3(Queue& queue,
0463                                                                   portabletest::TestDeviceMultiCollection3 const& input,
0464                                                                   UpdateInfo const* d_updateInfo) const {
0465     portabletest::TestDeviceMultiCollection3 collection{input.sizes(), queue};
0466
0467     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0468     uint32_t items = 64;
0469
0470     // use as many groups as needed to cover the whole problem
0471     auto sizes = collection.sizes();
0472     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0473
0474     // map items to
0475     //   - threads with a single element per thread on a GPU backend
0476     //   - elements within a single thread on a CPU backend
0477     auto workDiv = make_workdiv<Acc1D>(groups, items);
0478
0479     alpaka::exec<Acc1D>(queue,
0480                         workDiv,
0481                         TestAlgoKernelUpdateMulti3{},
0482                         input.view<portabletest::TestSoA>(),
0483                         input.view<portabletest::TestSoA2>(),
0484                         input.view<portabletest::TestSoA3>(),
0485                         d_updateInfo,
0486                         collection.view<portabletest::TestSoA>(),
0487                         collection.view<portabletest::TestSoA2>(),
0488                         collection.view<portabletest::TestSoA3>());
0489
0490     return collection;
0491   }
0492
0493   class TestZeroCollectionKernel {
0494   public:
0495     ALPAKA_FN_ACC void operator()(Acc1D const& acc, portabletest::TestDeviceCollection::ConstView view) const {
0496       const portabletest::Matrix matrix{{0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}};
0497       const portabletest::Array flags = {{0, 0, 0, 0}};
0498
0499       // check this only once in the whole kernel grid
0500       if (once_per_grid(acc)) {
0501         ALPAKA_ASSERT(view.r() == 0.);
0502       }
0503
0504       // make a strided loop over the kernel grid, covering up to "size" elements
0505       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0506         auto element = view[i];
0507         ALPAKA_ASSERT(element.x() == 0.);
0508         ALPAKA_ASSERT(element.y() == 0.);
0509         ALPAKA_ASSERT(element.z() == 0.);
0510         ALPAKA_ASSERT(element.id() == 0.);
0511         ALPAKA_ASSERT(element.flags() == flags);
0512         ALPAKA_ASSERT(element.m() == matrix);
0513       }
0514     }
0515   };
0516
0517   class TestZeroMultiCollectionKernel2 {
0518   public:
0519     ALPAKA_FN_ACC void operator()(Acc1D const& acc, portabletest::TestDeviceMultiCollection2::ConstView<1> view) const {
0520       const portabletest::Matrix matrix{{0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}};
0521
0522       // check this only once in the whole kernel grid
0523       if (once_per_grid(acc)) {
0524         ALPAKA_ASSERT(view.r2() == 0.);
0525       }
0526
0527       // make a strided loop over the kernel grid, covering up to "size" elements
0528       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0529         auto element = view[i];
0530         ALPAKA_ASSERT(element.x2() == 0.);
0531         ALPAKA_ASSERT(element.y2() == 0.);
0532         ALPAKA_ASSERT(element.z2() == 0.);
0533         ALPAKA_ASSERT(element.id2() == 0.);
0534         ALPAKA_ASSERT(element.m2() == matrix);
0535       }
0536     }
0537   };
0538
0539   class TestZeroMultiCollectionKernel3 {
0540   public:
0541     ALPAKA_FN_ACC void operator()(Acc1D const& acc, portabletest::TestDeviceMultiCollection3::ConstView<2> view) const {
0542       const portabletest::Matrix matrix{{0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}};
0543
0544       // check this only once in the whole kernel grid
0545       if (once_per_grid(acc)) {
0546         ALPAKA_ASSERT(view.r3() == 0.);
0547       }
0548
0549       // make a strided loop over the kernel grid, covering up to "size" elements
0550       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0551         auto element = view[i];
0552         ALPAKA_ASSERT(element.x3() == 0.);
0553         ALPAKA_ASSERT(element.y3() == 0.);
0554         ALPAKA_ASSERT(element.z3() == 0.);
0555         ALPAKA_ASSERT(element.id3() == 0.);
0556         ALPAKA_ASSERT(element.m3() == matrix);
0557       }
0558     }
0559   };
0560
0561   class TestZeroStructKernel {
0562   public:
0563     ALPAKA_FN_ACC void operator()(Acc1D const& acc, portabletest::TestDeviceObject::Product const* data) const {
0564       // check this only once in the whole kernel grid
0565       if (once_per_grid(acc)) {
0566         ALPAKA_ASSERT(data->x == 0.);
0567         ALPAKA_ASSERT(data->y == 0.);
0568         ALPAKA_ASSERT(data->z == 0.);
0569         ALPAKA_ASSERT(data->id == 0);
0570       }
0571     }
0572   };
0573
0574   // Check that the collection has been filled with zeroes.
0575   void TestAlgo::checkZero(Queue& queue, portabletest::TestDeviceCollection const& collection) const {
0576     // create a work division with a single block and
0577     //   - 32 threads with a single element per thread on a GPU backend
0578     //   - 32 elements within a single thread on a CPU backend
0579     auto workDiv = make_workdiv<Acc1D>(1, 32);
0580
0581     // the kernel will make a strided loop over the launch grid to cover all elements in the collection
0582     alpaka::exec<Acc1D>(queue, workDiv, TestZeroCollectionKernel{}, collection.const_view());
0583   }
0584
0585   // Check that the collection has been filled with zeroes.
0586   void TestAlgo::checkZero(Queue& queue, portabletest::TestDeviceMultiCollection2 const& collection) const {
0587     // create a work division with a single block and
0588     //   - 32 threads with a single element per thread on a GPU backend
0589     //   - 32 elements within a single thread on a CPU backend
0590     auto workDiv = make_workdiv<Acc1D>(1, 32);
0591
0592     // the kernels will make a strided loop over the launch grid to cover all elements in the collection
0593     alpaka::exec<Acc1D>(queue, workDiv, TestZeroCollectionKernel{}, collection.const_view<portabletest::TestSoA>());
0594     alpaka::exec<Acc1D>(
0595         queue, workDiv, TestZeroMultiCollectionKernel2{}, collection.const_view<portabletest::TestSoA2>());
0596   }
0597
0598   // Check that the collection has been filled with zeroes.
0599   void TestAlgo::checkZero(Queue& queue, portabletest::TestDeviceMultiCollection3 const& collection) const {
0600     // create a work division with a single block and
0601     //   - 32 threads with a single element per thread on a GPU backend
0602     //   - 32 elements within a single thread on a CPU backend
0603     auto workDiv = make_workdiv<Acc1D>(1, 32);
0604
0605     // the kernels will make a strided loop over the launch grid to cover all elements in the collection
0606     alpaka::exec<Acc1D>(queue, workDiv, TestZeroCollectionKernel{}, collection.const_view<portabletest::TestSoA>());
0607     alpaka::exec<Acc1D>(
0608         queue, workDiv, TestZeroMultiCollectionKernel2{}, collection.const_view<portabletest::TestSoA2>());
0609     alpaka::exec<Acc1D>(
0610         queue, workDiv, TestZeroMultiCollectionKernel3{}, collection.const_view<portabletest::TestSoA3>());
0611   }
0612
0613   // Check that the object has been filled with zeroes.
0614   void TestAlgo::checkZero(Queue& queue, portabletest::TestDeviceObject const& object) const {
0615     // create a work division with a single block and
0616     //   - 32 threads with a single element per thread on a GPU backend
0617     //   - 32 elements within a single thread on a CPU backend
0618     auto workDiv = make_workdiv<Acc1D>(1, 32);
0619
0620     // the kernel will actually use a single thread
0621     alpaka::exec<Acc1D>(queue, workDiv, TestZeroStructKernel{}, object.data());
0622   }
0623
0624 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE