Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-20 02:31:58

0001 // Check that ALPAKA_HOST_ONLY is not defined during device compilation:
0002 #ifdef ALPAKA_HOST_ONLY
0003 #error ALPAKA_HOST_ONLY defined in device compilation
0004 #endif
0005 
#include <algorithm>
#include <cstdint>

#include <alpaka/alpaka.hpp>

#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

#include "TestAlgo.h"
0013 
0014 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0015 
0016   using namespace cms::alpakatools;
0017 
0018   class TestAlgoKernel {
0019   public:
0020     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0021     ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, double xvalue) const {
0022       // global index of the thread within the grid
0023       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0024       const portabletest::Array flags = {{6, 4, 2, 0}};
0025 
0026       // set this only once in the whole kernel grid
0027       if (once_per_grid(acc)) {
0028         view.r() = 1.;
0029       }
0030 
0031       // make a strided loop over the kernel grid, covering up to "size" elements
0032       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0033         view[i] = {xvalue, 0., 0., i, flags, matrix * i};
0034       }
0035     }
0036   };
0037 
0038   class TestAlgoMultiKernel2 {
0039   public:
0040     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0041     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0042                                   portabletest::TestDeviceMultiCollection2::View<1> view,
0043                                   double xvalue) const {
0044       // global index of the thread within the grid
0045       const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
0046       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0047 
0048       // set this only once in the whole kernel grid
0049       if (thread == 0) {
0050         view.r2() = 2.;
0051       }
0052 
0053       // make a strided loop over the kernel grid, covering up to "size" elements
0054       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0055         view[i] = {xvalue, 0., 0., i, matrix * i};
0056       }
0057     }
0058   };
0059 
0060   class TestAlgoMultiKernel3 {
0061   public:
0062     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0063     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0064                                   portabletest::TestDeviceMultiCollection3::View<2> view,
0065                                   double xvalue) const {
0066       // global index of the thread within the grid
0067       const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
0068       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
0069 
0070       // set this only once in the whole kernel grid
0071       if (thread == 0) {
0072         view.r3() = 3.;
0073       }
0074 
0075       // make a strided loop over the kernel grid, covering up to "size" elements
0076       for (int32_t i : uniform_elements(acc, view.metadata().size())) {
0077         view[i] = {xvalue, 0., 0., i, matrix * i};
0078       }
0079     }
0080   };
0081 
0082   void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection, double xvalue) const {
0083     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0084     uint32_t items = 64;
0085 
0086     // use as many groups as needed to cover the whole problem
0087     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0088 
0089     // map items to
0090     //   - threads with a single element per thread on a GPU backend
0091     //   - elements within a single thread on a CPU backend
0092     auto workDiv = make_workdiv<Acc1D>(groups, items);
0093 
0094     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), xvalue);
0095   }
0096 
0097   void TestAlgo::fillMulti2(Queue& queue, portabletest::TestDeviceMultiCollection2& collection, double xvalue) const {
0098     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0099     uint32_t items = 64;
0100 
0101     // use as many groups as needed to cover the whole problem
0102     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0103     uint32_t groups2 = divide_up_by(collection.view<1>().metadata().size(), items);
0104 
0105     // map items to
0106     //   - threads with a single element per thread on a GPU backend
0107     //   - elements within a single thread on a CPU backend
0108     auto workDiv = make_workdiv<Acc1D>(groups, items);
0109     auto workDiv2 = make_workdiv<Acc1D>(groups2, items);
0110 
0111     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view<portabletest::TestSoA>(), xvalue);
0112     alpaka::exec<Acc1D>(queue, workDiv2, TestAlgoMultiKernel2{}, collection.view<portabletest::TestSoA2>(), xvalue);
0113   }
0114 
0115   class TestAlgoStructKernel {
0116   public:
0117     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0118     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0119                                   portabletest::TestDeviceObject::Product* data,
0120                                   double x,
0121                                   double y,
0122                                   double z,
0123                                   int32_t id) const {
0124       // run on a single thread
0125       if (once_per_grid(acc)) {
0126         data->x = x;
0127         data->y = y;
0128         data->z = z;
0129         data->id = id;
0130       }
0131     }
0132   };
0133 
0134   void TestAlgo::fillObject(
0135       Queue& queue, portabletest::TestDeviceObject& object, double x, double y, double z, int32_t id) const {
0136     // run on a single thread
0137     auto workDiv = make_workdiv<Acc1D>(1, 1);
0138 
0139     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoStructKernel{}, object.data(), x, y, z, id);
0140   }
0141 
0142   void TestAlgo::fillMulti3(Queue& queue, portabletest::TestDeviceMultiCollection3& collection, double xvalue) const {
0143     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0144     uint32_t items = 64;
0145 
0146     // use as many groups as needed to cover the whole problem
0147     uint32_t groups = divide_up_by(collection.view<portabletest::TestSoA>().metadata().size(), items);
0148     uint32_t groups2 = divide_up_by(collection.view<portabletest::TestSoA2>().metadata().size(), items);
0149     uint32_t groups3 = divide_up_by(collection.view<portabletest::TestSoA3>().metadata().size(), items);
0150 
0151     // map items to
0152     //   - threads with a single element per thread on a GPU backend
0153     //   - elements within a single thread on a CPU backend
0154     auto workDiv = make_workdiv<Acc1D>(groups, items);
0155     auto workDiv2 = make_workdiv<Acc1D>(groups2, items);
0156     auto workDiv3 = make_workdiv<Acc1D>(groups3, items);
0157 
0158     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view<portabletest::TestSoA>(), xvalue);
0159     alpaka::exec<Acc1D>(queue, workDiv2, TestAlgoMultiKernel2{}, collection.view<portabletest::TestSoA2>(), xvalue);
0160     alpaka::exec<Acc1D>(queue, workDiv3, TestAlgoMultiKernel3{}, collection.view<portabletest::TestSoA3>(), xvalue);
0161   }
0162 
0163   class TestAlgoKernelUpdate {
0164   public:
0165     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0166     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0167                                   portabletest::TestDeviceCollection::ConstView input,
0168                                   AlpakaESTestDataEDevice::ConstView esData,
0169                                   portabletest::TestDeviceCollection::View output) const {
0170       // set this only once in the whole kernel grid
0171       if (once_per_grid(acc)) {
0172         output.r() = input.r();
0173       }
0174 
0175       // make a strided loop over the kernel grid, covering up to "size" elements
0176       for (int32_t i : uniform_elements(acc, output.metadata().size())) {
0177         double x = input[i].x();
0178         if (i < esData.size()) {
0179           x += esData.val(i) + esData.val2(i);
0180         }
0181         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
0182       }
0183     }
0184   };
0185 
  class TestAlgoKernelUpdateMulti2 {
  public:
    // Copy both SoA layers of a TestDeviceMultiCollection2 from input to
    // output, adding the conditions data (val + val2) to the x / x2 of the
    // first esData.size() elements of each layer.
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc,
                                  portabletest::TestSoA::ConstView input,
                                  portabletest::TestSoA2::ConstView input2,
                                  AlpakaESTestDataEDevice::ConstView esData,
                                  portabletest::TestSoA::View output,
                                  portabletest::TestSoA2::View output2) const {
      // set this only once in the whole kernel grid
      if (once_per_grid(acc)) {
        output.r() = input.r();
        output2.r2() = input2.r2();
      }

      // make a strided loop over the kernel grid, covering up to "size" elements
      for (int32_t i : uniform_elements(acc, output.metadata().size())) {
        double x = input[i].x();
        // only elements covered by the conditions data receive the offset
        if (i < esData.size()) {
          x += esData.val(i) + esData.val2(i);
        }
        output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
      }
      // second layer: same pattern, sized independently of the first layer
      for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
        double x2 = input2[i].x2();
        if (i < esData.size()) {
          x2 += esData.val(i) + esData.val2(i);
        }
        output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
      }
    }
  };
0218 
  class TestAlgoKernelUpdateMulti3 {
  public:
    // Copy all three SoA layers of a TestDeviceMultiCollection3 from input to
    // output, adding the conditions data (val + val2) to the x / x2 / x3 of
    // the first esData.size() elements of each layer.
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc,
                                  portabletest::TestSoA::ConstView input,
                                  portabletest::TestSoA2::ConstView input2,
                                  portabletest::TestSoA3::ConstView input3,
                                  AlpakaESTestDataEDevice::ConstView esData,
                                  portabletest::TestSoA::View output,
                                  portabletest::TestSoA2::View output2,
                                  portabletest::TestSoA3::View output3) const {
      // set this only once in the whole kernel grid
      if (once_per_grid(acc)) {
        output.r() = input.r();
        output2.r2() = input2.r2();
        output3.r3() = input3.r3();
      }

      // make a strided loop over the kernel grid, covering up to "size" elements
      for (int32_t i : uniform_elements(acc, output.metadata().size())) {
        double x = input[i].x();
        // only elements covered by the conditions data receive the offset
        if (i < esData.size()) {
          x += esData.val(i) + esData.val2(i);
          // NOTE(review): looks like leftover debug output (the Multi2 kernel
          // has no equivalent print) — confirm whether any test relies on it
          if (0 == i)
            printf("Setting x[0] to %f\n", x);
        }
        output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
      }
      // second layer: same pattern, sized independently of the first layer
      for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
        double x2 = input2[i].x2();
        if (i < esData.size()) {
          x2 += esData.val(i) + esData.val2(i);
        }
        output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
      }
      // third layer: same pattern, sized independently of the other layers
      for (int32_t i : uniform_elements(acc, output3.metadata().size())) {
        double x3 = input3[i].x3();
        if (i < esData.size()) {
          x3 += esData.val(i) + esData.val2(i);
        }
        output3[i] = {x3, input3[i].y3(), input3[i].z3(), input3[i].id3(), input3[i].m3()};
      }
    }
  };
0263 
0264   portabletest::TestDeviceCollection TestAlgo::update(Queue& queue,
0265                                                       portabletest::TestDeviceCollection const& input,
0266                                                       AlpakaESTestDataEDevice const& esData) const {
0267     portabletest::TestDeviceCollection collection{input->metadata().size(), queue};
0268 
0269     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0270     uint32_t items = 64;
0271 
0272     // use as many groups as needed to cover the whole problem
0273     uint32_t groups = divide_up_by(collection->metadata().size(), items);
0274 
0275     // map items to
0276     //   - threads with a single element per thread on a GPU backend
0277     //   - elements within a single thread on a CPU backend
0278     auto workDiv = make_workdiv<Acc1D>(groups, items);
0279 
0280     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernelUpdate{}, input.view(), esData.view(), collection.view());
0281 
0282     return collection;
0283   }
0284 
0285   portabletest::TestDeviceMultiCollection2 TestAlgo::updateMulti2(Queue& queue,
0286                                                                   portabletest::TestDeviceMultiCollection2 const& input,
0287                                                                   AlpakaESTestDataEDevice const& esData) const {
0288     portabletest::TestDeviceMultiCollection2 collection{input.sizes(), queue};
0289 
0290     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0291     uint32_t items = 64;
0292 
0293     // use as many groups as needed to cover the whole problem
0294     auto sizes = collection.sizes();
0295     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0296 
0297     // map items to
0298     //   - threads with a single element per thread on a GPU backend
0299     //   - elements within a single thread on a CPU backend
0300     auto workDiv = make_workdiv<Acc1D>(groups, items);
0301 
0302     alpaka::exec<Acc1D>(queue,
0303                         workDiv,
0304                         TestAlgoKernelUpdateMulti2{},
0305                         input.view<portabletest::TestSoA>(),
0306                         input.view<portabletest::TestSoA2>(),
0307                         esData.view(),
0308                         collection.view<portabletest::TestSoA>(),
0309                         collection.view<portabletest::TestSoA2>());
0310 
0311     return collection;
0312   }
0313 
0314   portabletest::TestDeviceMultiCollection3 TestAlgo::updateMulti3(Queue& queue,
0315                                                                   portabletest::TestDeviceMultiCollection3 const& input,
0316                                                                   AlpakaESTestDataEDevice const& esData) const {
0317     portabletest::TestDeviceMultiCollection3 collection{input.sizes(), queue};
0318 
0319     // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
0320     uint32_t items = 64;
0321 
0322     // use as many groups as needed to cover the whole problem
0323     auto sizes = collection.sizes();
0324     uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items);
0325 
0326     // map items to
0327     //   - threads with a single element per thread on a GPU backend
0328     //   - elements within a single thread on a CPU backend
0329     auto workDiv = make_workdiv<Acc1D>(groups, items);
0330 
0331     alpaka::exec<Acc1D>(queue,
0332                         workDiv,
0333                         TestAlgoKernelUpdateMulti3{},
0334                         input.view<portabletest::TestSoA>(),
0335                         input.view<portabletest::TestSoA2>(),
0336                         input.view<portabletest::TestSoA3>(),
0337                         esData.view(),
0338                         collection.view<portabletest::TestSoA>(),
0339                         collection.view<portabletest::TestSoA2>(),
0340                         collection.view<portabletest::TestSoA3>());
0341 
0342     return collection;
0343   }
0344 
0345 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE