Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-22 07:34:11

0001 // #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0002 
0003 #include <cstddef>
0004 #include <cstdint>
0005 #include <cstdlib>
0006 #include <algorithm>
0007 
0008 #include <alpaka/alpaka.hpp>
0009 
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0016 
0017 using namespace cms::alpakatools;
0018 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0019 
0020 using FLOAT = double;
0021 
0022 // A templated unsigned integer type with N bytes
0023 template <int N>
0024 struct uintN;
0025 
0026 template <>
0027 struct uintN<8> {
0028   using type = uint8_t;
0029 };
0030 
0031 template <>
0032 struct uintN<16> {
0033   using type = uint16_t;
0034 };
0035 
0036 template <>
0037 struct uintN<32> {
0038   using type = uint32_t;
0039 };
0040 
0041 template <>
0042 struct uintN<64> {
0043   using type = uint64_t;
0044 };
0045 
0046 template <int N>
0047 using uintN_t = typename uintN<N>::type;
0048 
0049 // A templated unsigned integer type with the same size as T
0050 template <typename T>
0051 using uintT_t = uintN_t<sizeof(T) * 8>;
0052 
0053 // Keep only the `N` most significant bytes of `t`, and set the others to zero
0054 template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
0055 ALPAKA_FN_HOST_ACC T truncate(T const& t) {
0056   const int shift = 8 * (sizeof(T) - N);
0057   union {
0058     T t;
0059     uintT_t<T> u;
0060   } c;
0061   c.t = t;
0062   c.u = c.u >> shift << shift;
0063   return c.t;
0064 }
0065 
0066 namespace {
0067   struct testKernel {
0068     ALPAKA_FN_ACC void operator()(
0069         Acc1D const& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const {
0070       // radix sort works in a single block (and the assert macro does not like the comma of the template parameters).
0071       const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0072       const auto blocksIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0073       ALPAKA_ASSERT_ACC(1 == blocksPerGrid);
0074       ALPAKA_ASSERT_ACC(0 == blocksIdx);
0075       ALPAKA_ASSERT_ACC(elements <= 2048);
0076 
0077       auto& order = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0078       auto& sws = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0079       auto& z = alpaka::declareSharedVar<float[2048], __COUNTER__>(acc);
0080       auto& iz = alpaka::declareSharedVar<int[2048], __COUNTER__>(acc);
0081       for (auto itrack : uniform_elements(acc, elements)) {
0082         z[itrack] = gpu_input[itrack];
0083         iz[itrack] = 10000 * gpu_input[itrack];
0084         // order[itrack] = itrack;
0085       }
0086       alpaka::syncBlockThreads(acc);
0087       radixSort<Acc1D, float, 2>(acc, z, order, sws, elements);
0088       alpaka::syncBlockThreads(acc);
0089 
0090       //verify
0091       for (auto itrack : uniform_elements(acc, elements - 1)) {
0092         auto ntrack = order[itrack];
0093         auto mtrack = order[itrack + 1];
0094         ALPAKA_ASSERT_ACC(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack]));
0095       }
0096 
0097       alpaka::syncBlockThreads(acc);
0098 
0099       if (doPrint)
0100         if (once_per_grid(acc)) {
0101           for (int itrackO = 0; itrackO < elements; itrackO++) {
0102             int itrack = order[itrackO];
0103             printf(
0104                 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0105                 "radixSort %f\n",
0106                 elements,
0107                 itrackO,
0108                 itrack,
0109                 gpu_input[itrack],
0110                 z[itrack]);
0111             gpu_product[itrackO] = itrack;
0112           }
0113         }
0114 
0115       alpaka::syncBlockThreads(acc);
0116       radixSort<Acc1D, int, 4>(acc, iz, order, sws, elements);
0117       alpaka::syncBlockThreads(acc);
0118 
0119       for (auto itrack : uniform_elements(acc, elements - 1)) {
0120         auto ntrack = order[itrack];
0121         auto mtrack = order[itrack + 1];
0122         ALPAKA_ASSERT_ACC(iz[ntrack] <= iz[mtrack]);
0123       }
0124 
0125       if (doPrint)
0126         if (once_per_grid(acc)) {
0127           for (int itrackO = 0; itrackO < elements; itrackO++) {
0128             int itrack = order[itrackO];
0129             printf(
0130                 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0131                 "radixSort %d\n",
0132                 elements,
0133                 itrackO,
0134                 itrack,
0135                 gpu_input[itrack],
0136                 iz[itrack]);
0137             gpu_product[itrackO] = itrack;
0138           }
0139         }
0140     }
0141   };
0142 
0143   void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
0144     auto blockSize = 512;  // somewhat arbitrary
0145     auto gridSize = 1;     // round up to cover the sample size
0146     const auto workdiv = make_workdiv<Acc1D>(gridSize, blockSize);
0147     alpaka::enqueue(queue,
0148                     alpaka::createTaskKernel<Acc1D>(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint));
0149     alpaka::wait(queue);
0150   }
0151 }  // namespace
0152 
0153 int main() {
0154   // get the list of devices on the current platform
0155   auto const& devices = cms::alpakatools::devices<Platform>();
0156   if (devices.empty()) {
0157     std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0158       "the test will be skipped.\n";
0159     exit(EXIT_FAILURE);
0160   }
0161 
0162   std::random_device rd;
0163   std::mt19937 g(rd());
0164 
0165   // run the test on each device
0166   for (auto const& device : devices) {
0167     Queue queue(device);
0168 
0169     int nmax = 4 * 260;
0170     auto gpu_input_h = cms::alpakatools::make_host_buffer<FLOAT[]>(queue, nmax);
0171     auto i = gpu_input_h.data();
0172     for (auto v : {30.0,         30.0,         -4.4,         -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007,
0173                    2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555,  2.2821085453,  2.2852313519, 2.2877883911,
0174                    2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038,  2.6755006313,  2.7229132652, 2.783257246,
0175                    2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933,  2.927520752,   2.9422419071, 2.9453969002,
0176                    2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948,  2.9575133324,  2.9575304985, 2.9586606026,
0177                    2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377,  2.9646151066,  2.9676523209, 2.9708273411,
0178                    2.974111557,  2.9742531776, 2.9772830009, 2.9877333641,  2.9960610867,  3.013969183,  3.0187871456,
0179                    3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913,  3.0560519695,  3.0592908859, 3.0599737167,
0180                    3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904,  3.0643932819,  3.0645000935, 3.0666446686,
0181                    3.068046093,  3.0697011948, 3.0717656612, 3.0718104839,  3.0718348026,  3.0733406544, 3.0738227367,
0182                    3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264,  3.0758397579,  3.0767207146, 3.0773906708,
0183                    3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721,  3.0789675713,  3.0792205334, 3.0793278217,
0184                    3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494,  3.0815284252,  3.0817306042, 3.0819730759,
0185                    3.0820026398, 3.0838682652, 3.084009409,  3.0848178864,  3.0853257179,  3.0855510235, 3.0856611729,
0186                    3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093,  3.0895674229,  3.0901503563, 3.0903317928,
0187                    3.0912668705, 3.0920717716, 3.0954346657, 3.096424818,   3.0995628834,  3.1001036167, 3.1173279285,
0188                    3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508,  3.1676850319,  3.1886672974, 3.3769197464,
0189                    3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616,  3.5076274872,  3.5225863457, 3.5271244049,
0190                    3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587,  3.5473103523,  3.5480232239, 3.5526945591,
0191                    3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996,  3.5549693108,  3.5550665855, 3.5558729172,
0192                    3.5560717583, 3.5560848713, 3.5584278107, 3.558681488,   3.5587313175,  3.5592217445, 3.559384346,
0193                    3.5604712963, 3.5634038448, 3.563803196,  3.564593792,   3.5660364628,  3.5683133602, 3.5696356297,
0194                    3.569729805,  3.5740811825, 3.5757565498, 3.5760207176,  3.5760478973,  3.5836098194, 3.5839796066,
0195                    3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438,  3.7187042236,  3.9741659164, 4.4111995697,
0196                    4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405,  4.6868157387,  4.6868577003, 4.6879930496,
0197                    4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975,  4.698094368,   4.6997032166, 4.7017259598,
0198                    4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123,  4.7042069435,  4.7044086456, 4.7044372559,
0199                    4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399,  4.7065420151,  4.70657444,   4.7066287994,
0200                    4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582,  4.7075614929,  4.7075891495, 4.7076044083,
0201                    4.7077374458, 4.7080879211, 4.70819664,   4.7086658478,  4.708937645,   4.7092385292, 4.709479332,
0202                    4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566,  4.7105507851,  4.71118927,   4.7113513947,
0203                    4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361,  4.7117333412,  4.7117910385, 4.7119007111,
0204                    4.7120013237, 4.712003231,  4.712044239,  4.7122926712,  4.7135767937,  4.7143669128, 4.7145690918,
0205                    4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308,  4.7177276611,  4.717815876,  4.718059063,
0206                    4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472,  4.7239775658,  4.7243933678, 4.7243990898,
0207                    4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883,  4.7356877327,  4.740146637,  4.742254734,
0208                    4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065,  4.7723139628,  4.774812736,  4.8577151299,
0209                    4.890037536}) {
0210       *(i++) = v;
0211     }
0212     auto input = gpu_input_h.data();
0213     for (int i = 0; i < 260; i++) {
0214       input[i + 260] = -input[i];
0215       input[i + 2 * 260] = input[i] + 10;
0216       input[i + 3 * 260] = -input[i] - 10;
0217     }
0218     auto gpu_input_d = cms::alpakatools::make_device_buffer<FLOAT[]>(queue, nmax);
0219     auto gpu_product_d = cms::alpakatools::make_device_buffer<int[]>(queue, nmax);
0220     // copy the input data to the GPU
0221     alpaka::memcpy(queue, gpu_input_d, gpu_input_h);
0222 
0223     for (int k = 2; k <= nmax; k++) {
0224       std::shuffle(input, input + k, g);
0225       printf("Test with %d items\n", k);
0226       // sort on the GPU
0227       testWrapper(queue, gpu_input_d.data(), gpu_product_d.data(), k, false);
0228       alpaka::wait(queue);
0229     }
0230   }
0231   return 0;
0232 }