Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-02-14 04:11:50

0001 // Check that ALPAKA_HOST_ONLY is not defined during device compilation:
0002 #ifdef ALPAKA_HOST_ONLY
0003 #error ALPAKA_HOST_ONLY defined in device compilation
0004 #endif
0005 
0006 #include <alpaka/alpaka.hpp>
0007 
0008 #include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
0009 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0010 #include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0012 
0013 #include "TestAlgo.h"
0014 
0015 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0016 
0017   using namespace cms::alpakatools;
0018 
0019   class TestAlgoKernel {
0020   public:
0021     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0022     ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, double xvalue) const {
0023       // global index of the thread within the grid
0024       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0025       const portabletest::Array flags = {{6, 4, 2, 0}};
0026 
0027       // set this only once in the whole kernel grid
0028       if (once_per_grid(acc)) {
0029         view.r() = 1.;
0030       }
0031 
0032       // make a strided loop over the kernel grid, covering up to "size" elements
0033       for (int32_t i : elements_with_stride(acc, view.metadata().size())) {
0034         view[i] = {xvalue, 0., 0., i, flags, matrix * i};
0035       }
0036     }
0037   };
0038 
0039   class TestAlgoMultiKernel2 {
0040   public:
0041     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0042     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0043                                   portabletest::TestDeviceMultiCollection2::View<1> view,
0044                                   double xvalue) const {
0045       // global index of the thread within the grid
0046       const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
0047       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0048 
0049       // set this only once in the whole kernel grid
0050       if (thread == 0) {
0051         view.r2() = 2.;
0052       }
0053 
0054       // make a strided loop over the kernel grid, covering up to "size" elements
0055       for (int32_t i : elements_with_stride(acc, view.metadata().size())) {
0056         view[i] = {xvalue, 0., 0., i, matrix * i};
0057       }
0058     }
0059   };
0060 
0061   class TestAlgoMultiKernel3 {
0062   public:
0063     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0064     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0065                                   portabletest::TestDeviceMultiCollection3::View<2> view,
0066                                   double xvalue) const {
0067       // global index of the thread within the grid
0068       const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
0069       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0070 
0071       // set this only once in the whole kernel grid
0072       if (thread == 0) {
0073         view.r3() = 3.;
0074       }
0075 
0076       // make a strided loop over the kernel grid, covering up to "size" elements
0077       for (int32_t i : elements_with_stride(acc, view.metadata().size())) {
0078         view[i] = {xvalue, 0., 0., i, matrix * i};
0079       }
0080     }
0081   };
0082 
0083   void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection, double xvalue) const {
0084     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0085     uint32_t items = 64;
0086 
0087     // use as many groups as needed to cover the whole problem
0088     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0089 
0090     // map items to
0091     //   - threads with a single element per thread on a GPU backend
0092     //   - elements within a single thread on a CPU backend
0093     auto workDiv = make_workdiv<Acc1D>(groups, items);
0094 
0095     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), xvalue);
0096   }
0097 
0098   void TestAlgo::fillMulti2(Queue& queue, portabletest::TestDeviceMultiCollection2& collection, double xvalue) const {
0099     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0100     uint32_t items = 64;
0101 
0102     // use as many groups as needed to cover the whole problem
0103     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0104     uint32_t groups2 = divide_up_by(collection.view<1>().metadata().size(), items);
0105 
0106     // map items to
0107     //   - threads with a single element per thread on a GPU backend
0108     //   - elements within a single thread on a CPU backend
0109     auto workDiv = make_workdiv<Acc1D>(groups, items);
0110     auto workDiv2 = make_workdiv<Acc1D>(groups2, items);
0111 
0112     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view<portabletest::TestSoA>(), xvalue);
0113     alpaka::exec<Acc1D>(queue, workDiv2, TestAlgoMultiKernel2{}, collection.view<portabletest::TestSoA2>(), xvalue);
0114   }
0115 
0116   class TestAlgoStructKernel {
0117   public:
0118     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0119     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0120                                   portabletest::TestDeviceObject::Product* data,
0121                                   double x,
0122                                   double y,
0123                                   double z,
0124                                   int32_t id) const {
0125       // run on a single thread
0126       if (once_per_grid(acc)) {
0127         data->x = x;
0128         data->y = y;
0129         data->z = z;
0130         data->id = id;
0131       }
0132     }
0133   };
0134 
0135   void TestAlgo::fillObject(
0136       Queue& queue, portabletest::TestDeviceObject& object, double x, double y, double z, int32_t id) const {
0137     // run on a single thread
0138     auto workDiv = make_workdiv<Acc1D>(1, 1);
0139 
0140     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoStructKernel{}, object.data(), x, y, z, id);
0141   }
0142 
0143   void TestAlgo::fillMulti3(Queue& queue, portabletest::TestDeviceMultiCollection3& collection, double xvalue) const {
0144     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0145     uint32_t items = 64;
0146 
0147     // use as many groups as needed to cover the whole problem
0148     uint32_t groups = divide_up_by(collection.view<portabletest::TestSoA>().metadata().size(), items);
0149     uint32_t groups2 = divide_up_by(collection.view<portabletest::TestSoA2>().metadata().size(), items);
0150     uint32_t groups3 = divide_up_by(collection.view<portabletest::TestSoA3>().metadata().size(), items);
0151 
0152     // map items to
0153     //   - threads with a single element per thread on a GPU backend
0154     //   - elements within a single thread on a CPU backend
0155     auto workDiv = make_workdiv<Acc1D>(groups, items);
0156     auto workDiv2 = make_workdiv<Acc1D>(groups2, items);
0157     auto workDiv3 = make_workdiv<Acc1D>(groups3, items);
0158 
0159     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view<portabletest::TestSoA>(), xvalue);
0160     alpaka::exec<Acc1D>(queue, workDiv2, TestAlgoMultiKernel2{}, collection.view<portabletest::TestSoA2>(), xvalue);
0161     alpaka::exec<Acc1D>(queue, workDiv3, TestAlgoMultiKernel3{}, collection.view<portabletest::TestSoA3>(), xvalue);
0162   }
0163 
0164   class TestAlgoKernelUpdate {
0165   public:
0166     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0167     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0168                                   portabletest::TestDeviceCollection::ConstView input,
0169                                   AlpakaESTestDataEDevice::ConstView esData,
0170                                   portabletest::TestDeviceCollection::View output) const {
0171       // set this only once in the whole kernel grid
0172       if (once_per_grid(acc)) {
0173         output.r() = input.r();
0174       }
0175 
0176       // make a strided loop over the kernel grid, covering up to "size" elements
0177       for (int32_t i : elements_with_stride(acc, output.metadata().size())) {
0178         double x = input[i].x();
0179         if (i < esData.size()) {
0180           x += esData.val(i) + esData.val2(i);
0181         }
0182         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0183       }
0184     }
0185   };
0186 
0187   class TestAlgoKernelUpdateMulti2 {
0188   public:
0189     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0190     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0191                                   portabletest::TestSoA::ConstView input,
0192                                   portabletest::TestSoA2::ConstView input2,
0193                                   AlpakaESTestDataEDevice::ConstView esData,
0194                                   portabletest::TestSoA::View output,
0195                                   portabletest::TestSoA2::View output2) const {
0196       // set this only once in the whole kernel grid
0197       if (once_per_grid(acc)) {
0198         output.r() = input.r();
0199         output2.r2() = input2.r2();
0200       }
0201 
0202       // make a strided loop over the kernel grid, covering up to "size" elements
0203       for (int32_t i : elements_with_stride(acc, output.metadata().size())) {
0204         double x = input[i].x();
0205         if (i < esData.size()) {
0206           x += esData.val(i) + esData.val2(i);
0207         }
0208         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0209       }
0210       for (int32_t i : elements_with_stride(acc, output2.metadata().size())) {
0211         double x2 = input2[i].x2();
0212         if (i < esData.size()) {
0213           x2 += esData.val(i) + esData.val2(i);
0214         }
0215         output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
0216       }
0217     }
0218   };
0219 
0220   class TestAlgoKernelUpdateMulti3 {
0221   public:
0222     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0223     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0224                                   portabletest::TestSoA::ConstView input,
0225                                   portabletest::TestSoA2::ConstView input2,
0226                                   portabletest::TestSoA3::ConstView input3,
0227                                   AlpakaESTestDataEDevice::ConstView esData,
0228                                   portabletest::TestSoA::View output,
0229                                   portabletest::TestSoA2::View output2,
0230                                   portabletest::TestSoA3::View output3) const {
0231       // set this only once in the whole kernel grid
0232       if (once_per_grid(acc)) {
0233         output.r() = input.r();
0234         output2.r2() = input2.r2();
0235         output3.r3() = input3.r3();
0236       }
0237 
0238       // make a strided loop over the kernel grid, covering up to "size" elements
0239       for (int32_t i : elements_with_stride(acc, output.metadata().size())) {
0240         double x = input[i].x();
0241         if (i < esData.size()) {
0242           x += esData.val(i) + esData.val2(i);
0243           if (0 == i)
0244             printf("Setting x[0] to %f\n", x);
0245         }
0246         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0247       }
0248       for (int32_t i : elements_with_stride(acc, output2.metadata().size())) {
0249         double x2 = input2[i].x2();
0250         if (i < esData.size()) {
0251           x2 += esData.val(i) + esData.val2(i);
0252         }
0253         output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
0254       }
0255       for (int32_t i : elements_with_stride(acc, output3.metadata().size())) {
0256         double x3 = input3[i].x3();
0257         if (i < esData.size()) {
0258           x3 += esData.val(i) + esData.val2(i);
0259         }
0260         output3[i] = {x3, input3[i].y3(), input3[i].z3(), input3[i].id3(), input3[i].m3()};
0261       }
0262     }
0263   };
0264 
0265   portabletest::TestDeviceCollection TestAlgo::update(Queue& queue,
0266                                                       portabletest::TestDeviceCollection const& input,
0267                                                       AlpakaESTestDataEDevice const& esData) const {
0268     portabletest::TestDeviceCollection collection{input->metadata().size(), queue};
0269 
0270     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0271     uint32_t items = 64;
0272 
0273     // use as many groups as needed to cover the whole problem
0274     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0275 
0276     // map items to
0277     //   - threads with a single element per thread on a GPU backend
0278     //   - elements within a single thread on a CPU backend
0279     auto workDiv = make_workdiv<Acc1D>(groups, items);
0280 
0281     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernelUpdate{}, input.view(), esData.view(), collection.view());
0282 
0283     return collection;
0284   }
0285 
0286   portabletest::TestDeviceMultiCollection2 TestAlgo::updateMulti2(Queue& queue,
0287                                                                   portabletest::TestDeviceMultiCollection2 const& input,
0288                                                                   AlpakaESTestDataEDevice const& esData) const {
0289     portabletest::TestDeviceMultiCollection2 collection{input.sizes(), queue};
0290 
0291     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0292     uint32_t items = 64;
0293 
0294     // use as many groups as needed to cover the whole problem
0295     auto sizes = collection.sizes();
0296     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0297 
0298     // map items to
0299     //   - threads with a single element per thread on a GPU backend
0300     //   - elements within a single thread on a CPU backend
0301     auto workDiv = make_workdiv<Acc1D>(groups, items);
0302 
0303     alpaka::exec<Acc1D>(queue,
0304                         workDiv,
0305                         TestAlgoKernelUpdateMulti2{},
0306                         input.view<portabletest::TestSoA>(),
0307                         input.view<portabletest::TestSoA2>(),
0308                         esData.view(),
0309                         collection.view<portabletest::TestSoA>(),
0310                         collection.view<portabletest::TestSoA2>());
0311 
0312     return collection;
0313   }
0314 
0315   portabletest::TestDeviceMultiCollection3 TestAlgo::updateMulti3(Queue& queue,
0316                                                                   portabletest::TestDeviceMultiCollection3 const& input,
0317                                                                   AlpakaESTestDataEDevice const& esData) const {
0318     portabletest::TestDeviceMultiCollection3 collection{input.sizes(), queue};
0319 
0320     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0321     uint32_t items = 64;
0322 
0323     // use as many groups as needed to cover the whole problem
0324     auto sizes = collection.sizes();
0325     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0326 
0327     // map items to
0328     //   - threads with a single element per thread on a GPU backend
0329     //   - elements within a single thread on a CPU backend
0330     auto workDiv = make_workdiv<Acc1D>(groups, items);
0331 
0332     alpaka::exec<Acc1D>(queue,
0333                         workDiv,
0334                         TestAlgoKernelUpdateMulti3{},
0335                         input.view<portabletest::TestSoA>(),
0336                         input.view<portabletest::TestSoA2>(),
0337                         input.view<portabletest::TestSoA3>(),
0338                         esData.view(),
0339                         collection.view<portabletest::TestSoA>(),
0340                         collection.view<portabletest::TestSoA2>(),
0341                         collection.view<portabletest::TestSoA3>());
0342 
0343     return collection;
0344   }
0345 
0346 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE