Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-23 22:56:17

0001 // #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0002 
0003 #include <cstddef>
0004 #include <cstdint>
0005 #include <cstdlib>
0006 #include <algorithm>
0007 
0008 #include <alpaka/alpaka.hpp>
0009 
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0016 
0017 using namespace cms::alpakatools;
0018 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0019 
0020 using FLOAT = double;
0021 
0022 // A templated unsigned integer type with N bytes
0023 template <int N>
0024 struct uintN;
0025 
0026 template <>
0027 struct uintN<8> {
0028   using type = uint8_t;
0029 };
0030 
0031 template <>
0032 struct uintN<16> {
0033   using type = uint16_t;
0034 };
0035 
0036 template <>
0037 struct uintN<32> {
0038   using type = uint32_t;
0039 };
0040 
0041 template <>
0042 struct uintN<64> {
0043   using type = uint64_t;
0044 };
0045 
0046 template <int N>
0047 using uintN_t = typename uintN<N>::type;
0048 
0049 // A templated unsigned integer type with the same size as T
0050 template <typename T>
0051 using uintT_t = uintN_t<sizeof(T) * 8>;
0052 
0053 // Keep only the `N` most significant bytes of `t`, and set the others to zero
0054 template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
0055 ALPAKA_FN_HOST_ACC T truncate(T const& t) {
0056   const int shift = 8 * (sizeof(T) - N);
0057   union {
0058     T t;
0059     uintT_t<T> u;
0060   } c;
0061   c.t = t;
0062   c.u = c.u >> shift << shift;
0063   return c.t;
0064 }
0065 
0066 namespace {
0067   struct testKernel {
0068     template <typename TAcc>
0069     ALPAKA_FN_ACC void operator()(
0070         const TAcc& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const {
0071       //size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x;  // This is going to be the track index
0072       //size_t gridSize = blockDim.x * gridDim.x;
0073       bool threadZero = !alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
0074 
0075       // radix sort works in a single block (and the assert macro does not like the comma of the template parameters).
0076       const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0077       const auto blocksIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0078       ALPAKA_ASSERT_ACC(1 == blocksPerGrid);
0079       ALPAKA_ASSERT_ACC(0 == blocksIdx);
0080       ALPAKA_ASSERT_ACC(elements <= 2048);
0081 
0082       auto& order = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0083       auto& sws = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0084       auto& z = alpaka::declareSharedVar<float[2048], __COUNTER__>(acc);
0085       auto& iz = alpaka::declareSharedVar<int[2048], __COUNTER__>(acc);
0086       //      __shared__ uint16_t order[2048];
0087       //      __shared__ uint16_t sws[2048];
0088       //      __shared__ float z[2048];
0089       //      __shared__ int iz[2048];
0090       for (auto itrack : uniform_elements(acc, elements)) {
0091         z[itrack] = gpu_input[itrack];
0092         iz[itrack] = 10000 * gpu_input[itrack];
0093         // order[itrack] = itrack;
0094       }
0095       alpaka::syncBlockThreads(acc);
0096       radixSort<TAcc, float, 2>(acc, z, order, sws, elements);
0097       alpaka::syncBlockThreads(acc);
0098 
0099       //verify
0100       for (auto itrack : uniform_elements(acc, elements - 1)) {
0101         auto ntrack = order[itrack];
0102         auto mtrack = order[itrack + 1];
0103         ALPAKA_ASSERT_ACC(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack]));
0104       }
0105 
0106       alpaka::syncBlockThreads(acc);
0107 
0108       if (doPrint)
0109         if (threadZero) {
0110           for (int itrackO = 0; itrackO < elements; itrackO++) {
0111             int itrack = order[itrackO];
0112             printf(
0113                 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0114                 "radixSort %f\n",
0115                 elements,
0116                 itrackO,
0117                 itrack,
0118                 gpu_input[itrack],
0119                 z[itrack]);
0120             gpu_product[itrackO] = itrack;
0121           }
0122         }
0123 
0124       alpaka::syncBlockThreads(acc);
0125       radixSort<TAcc, int, 4>(acc, iz, order, sws, elements);
0126       alpaka::syncBlockThreads(acc);
0127 
0128       for (auto itrack : uniform_elements(acc, elements - 1)) {
0129         auto ntrack = order[itrack];
0130         auto mtrack = order[itrack + 1];
0131         ALPAKA_ASSERT_ACC(iz[ntrack] <= iz[mtrack]);
0132       }
0133 
0134       if (doPrint)
0135         if (threadZero) {
0136           for (int itrackO = 0; itrackO < elements; itrackO++) {
0137             int itrack = order[itrackO];
0138             printf(
0139                 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0140                 "radixSort %d\n",
0141                 elements,
0142                 itrackO,
0143                 itrack,
0144                 gpu_input[itrack],
0145                 iz[itrack]);
0146             gpu_product[itrackO] = itrack;
0147           }
0148         }
0149     }
0150   };
0151 
0152   void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
0153     auto blockSize = 512;  // somewhat arbitrary
0154     auto gridSize = 1;     // round up to cover the sample size
0155     const auto workdiv = make_workdiv<Acc1D>(gridSize, blockSize);
0156     alpaka::enqueue(queue,
0157                     alpaka::createTaskKernel<Acc1D>(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint));
0158     alpaka::wait(queue);
0159   }
0160 }  // namespace
0161 
0162 int main() {
0163   // get the list of devices on the current platform
0164   auto const& devices = cms::alpakatools::devices<Platform>();
0165   if (devices.empty()) {
0166     std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0167       "the test will be skipped.\n";
0168     exit(EXIT_FAILURE);
0169   }
0170 
0171   std::random_device rd;
0172   std::mt19937 g(rd());
0173 
0174   // run the test on each device
0175   for (auto const& device : devices) {
0176     Queue queue(device);
0177 
0178     int nmax = 4 * 260;
0179     auto gpu_input_h = cms::alpakatools::make_host_buffer<FLOAT[]>(queue, nmax);
0180     auto i = gpu_input_h.data();
0181     for (auto v : {30.0,         30.0,         -4.4,         -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007,
0182                    2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555,  2.2821085453,  2.2852313519, 2.2877883911,
0183                    2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038,  2.6755006313,  2.7229132652, 2.783257246,
0184                    2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933,  2.927520752,   2.9422419071, 2.9453969002,
0185                    2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948,  2.9575133324,  2.9575304985, 2.9586606026,
0186                    2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377,  2.9646151066,  2.9676523209, 2.9708273411,
0187                    2.974111557,  2.9742531776, 2.9772830009, 2.9877333641,  2.9960610867,  3.013969183,  3.0187871456,
0188                    3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913,  3.0560519695,  3.0592908859, 3.0599737167,
0189                    3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904,  3.0643932819,  3.0645000935, 3.0666446686,
0190                    3.068046093,  3.0697011948, 3.0717656612, 3.0718104839,  3.0718348026,  3.0733406544, 3.0738227367,
0191                    3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264,  3.0758397579,  3.0767207146, 3.0773906708,
0192                    3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721,  3.0789675713,  3.0792205334, 3.0793278217,
0193                    3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494,  3.0815284252,  3.0817306042, 3.0819730759,
0194                    3.0820026398, 3.0838682652, 3.084009409,  3.0848178864,  3.0853257179,  3.0855510235, 3.0856611729,
0195                    3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093,  3.0895674229,  3.0901503563, 3.0903317928,
0196                    3.0912668705, 3.0920717716, 3.0954346657, 3.096424818,   3.0995628834,  3.1001036167, 3.1173279285,
0197                    3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508,  3.1676850319,  3.1886672974, 3.3769197464,
0198                    3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616,  3.5076274872,  3.5225863457, 3.5271244049,
0199                    3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587,  3.5473103523,  3.5480232239, 3.5526945591,
0200                    3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996,  3.5549693108,  3.5550665855, 3.5558729172,
0201                    3.5560717583, 3.5560848713, 3.5584278107, 3.558681488,   3.5587313175,  3.5592217445, 3.559384346,
0202                    3.5604712963, 3.5634038448, 3.563803196,  3.564593792,   3.5660364628,  3.5683133602, 3.5696356297,
0203                    3.569729805,  3.5740811825, 3.5757565498, 3.5760207176,  3.5760478973,  3.5836098194, 3.5839796066,
0204                    3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438,  3.7187042236,  3.9741659164, 4.4111995697,
0205                    4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405,  4.6868157387,  4.6868577003, 4.6879930496,
0206                    4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975,  4.698094368,   4.6997032166, 4.7017259598,
0207                    4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123,  4.7042069435,  4.7044086456, 4.7044372559,
0208                    4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399,  4.7065420151,  4.70657444,   4.7066287994,
0209                    4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582,  4.7075614929,  4.7075891495, 4.7076044083,
0210                    4.7077374458, 4.7080879211, 4.70819664,   4.7086658478,  4.708937645,   4.7092385292, 4.709479332,
0211                    4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566,  4.7105507851,  4.71118927,   4.7113513947,
0212                    4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361,  4.7117333412,  4.7117910385, 4.7119007111,
0213                    4.7120013237, 4.712003231,  4.712044239,  4.7122926712,  4.7135767937,  4.7143669128, 4.7145690918,
0214                    4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308,  4.7177276611,  4.717815876,  4.718059063,
0215                    4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472,  4.7239775658,  4.7243933678, 4.7243990898,
0216                    4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883,  4.7356877327,  4.740146637,  4.742254734,
0217                    4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065,  4.7723139628,  4.774812736,  4.8577151299,
0218                    4.890037536}) {
0219       *(i++) = v;
0220     }
0221     auto input = gpu_input_h.data();
0222     for (int i = 0; i < 260; i++) {
0223       input[i + 260] = -input[i];
0224       input[i + 2 * 260] = input[i] + 10;
0225       input[i + 3 * 260] = -input[i] - 10;
0226     }
0227     auto gpu_input_d = cms::alpakatools::make_device_buffer<FLOAT[]>(queue, nmax);
0228     auto gpu_product_d = cms::alpakatools::make_device_buffer<int[]>(queue, nmax);
0229     // copy the input data to the GPU
0230     alpaka::memcpy(queue, gpu_input_d, gpu_input_h);
0231 
0232     for (int k = 2; k <= nmax; k++) {
0233       std::shuffle(input, input + k, g);
0234       printf("Test with %d items\n", k);
0235       // sort on the GPU
0236       testWrapper(queue, gpu_input_d.data(), gpu_product_d.data(), k, false);
0237       alpaka::wait(queue);
0238     }
0239   }
0240   return 0;
0241 }