test/alpaka/testRadixSort.dev.cc

0001 #include <algorithm>
0002 #include <cassert>
0003 #include <chrono>
0004 using namespace std::chrono_literals;
0005 #include <cstdint>
0006 #include <cstdlib>
0007 #include <iomanip>
0008 #include <iostream>
0009 #include <limits>
0010 #include <memory>
0011 #include <random>
0012 #include <set>
0013 #include <type_traits>
0014
0015 #include <alpaka/alpaka.hpp>
0016
0017 #include "FWCore/Utilities/interface/stringize.h"
0018 #include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h"
0019 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
0020 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0021 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0022 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0023
0024 using namespace cms::alpakatools;
0025 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0026
0027 template <typename T>
0028 struct RS {
0029   using type = std::uniform_int_distribution<T>;
0030   static auto ud() { return type(std::numeric_limits<T>::min(), std::numeric_limits<T>::max()); }
0031   static constexpr T imax = std::numeric_limits<T>::max();
0032 };
0033
0034 template <>
0035 struct RS<float> {
0036   using T = float;
0037   using type = std::uniform_real_distribution<float>;
0038   static auto ud() { return type(-std::numeric_limits<T>::max() / 2, std::numeric_limits<T>::max() / 2); }
0039   //  static auto ud() { return type(0,std::numeric_limits<T>::max()/2);}
0040   static constexpr int imax = std::numeric_limits<int>::max();
0041 };
0042
0043 // A templated unsigned integer type with N bytes
0044 template <int N>
0045 struct uintN;
0046
0047 template <>
0048 struct uintN<8> {
0049   using type = uint8_t;
0050 };
0051
0052 template <>
0053 struct uintN<16> {
0054   using type = uint16_t;
0055 };
0056
0057 template <>
0058 struct uintN<32> {
0059   using type = uint32_t;
0060 };
0061
0062 template <>
0063 struct uintN<64> {
0064   using type = uint64_t;
0065 };
0066
0067 template <int N>
0068 using uintN_t = typename uintN<N>::type;
0069
0070 // A templated unsigned integer type with the same size as T
0071 template <typename T>
0072 using uintT_t = uintN_t<sizeof(T) * 8>;
0073
0074 // Keep only the `N` most significant bytes of `t`, and set the others to zero
0075 template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
0076 void truncate(T& t) {
0077   const int shift = 8 * (sizeof(T) - N);
0078   union {
0079     T t;
0080     uintT_t<T> u;
0081   } c;
0082   c.t = t;
0083   c.u = c.u >> shift << shift;
0084   t = c.t;
0085 }
0086
0087 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
0088 void go(Queue& queue, bool useShared) {
0089   std::mt19937 eng;
0090   auto rgen = RS<T>::ud();
0091
0092   std::chrono::high_resolution_clock::duration delta = 0ns;
0093   constexpr int blocks = 10;
0094   constexpr int blockSize = 256 * 32;
0095   constexpr int N = blockSize * blocks;
0096   auto v_h = cms::alpakatools::make_host_buffer<T[]>(queue, N);
0097
0098   constexpr bool sgn = T(-1) < T(0);
0099   std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned")
0100             << (std::numeric_limits<T>::is_integer ? " 'ints'" : " 'float'") << " of size " << sizeof(T) << " using "
0101             << NS << " significant bytes" << std::endl;
0102
0103   for (int i = 0; i < 50; ++i) {
0104     if (i == 49) {
0105       for (long long j = 0; j < N; j++)
0106         v_h[j] = 0;
0107     } else if (i > 30) {
0108       for (long long j = 0; j < N; j++)
0109         v_h[j] = rgen(eng);
0110     } else {
0111       uint64_t imax = (i < 15) ? uint64_t(RS<T>::imax) + 1LL : 255;
0112       for (uint64_t j = 0; j < N; j++) {
0113         v_h[j] = (j % imax);
0114         if (j % 2 && i % 2)
0115           v_h[j] = -v_h[j];
0116       }
0117     }
0118
0119     auto offsets_h = cms::alpakatools::make_host_buffer<uint32_t[]>(queue, blocks + 1);
0120     offsets_h[0] = 0;
0121     for (int j = 1; j < blocks + 1; ++j) {
0122       offsets_h[j] = offsets_h[j - 1] + blockSize - 3 * j;
0123       assert(offsets_h[j] <= N);
0124     }
0125
0126     if (i == 1) {  // special cases...
0127       offsets_h[0] = 0;
0128       offsets_h[1] = 0;
0129       offsets_h[2] = 19;
0130       offsets_h[3] = 32 + offsets_h[2];
0131       offsets_h[4] = 123 + offsets_h[3];
0132       offsets_h[5] = 256 + offsets_h[4];
0133       offsets_h[6] = 311 + offsets_h[5];
0134       offsets_h[7] = 2111 + offsets_h[6];
0135       offsets_h[8] = 256 * 11 + offsets_h[7];
0136       offsets_h[9] = 44 + offsets_h[8];
0137       offsets_h[10] = 3297 + offsets_h[9];
0138     }
0139
0140     std::shuffle(v_h.data(), v_h.data() + N, eng);
0141
0142     auto v_d = cms::alpakatools::make_device_buffer<U[]>(queue, N);
0143     auto ind_d = cms::alpakatools::make_device_buffer<uint16_t[]>(queue, N);
0144     auto ind_h = cms::alpakatools::make_host_buffer<uint16_t[]>(queue, N);
0145     auto ws_d = cms::alpakatools::make_device_buffer<uint16_t[]>(queue, N);
0146     auto off_d = cms::alpakatools::make_device_buffer<uint32_t[]>(queue, blocks + 1);
0147
0148     alpaka::memcpy(queue, v_d, v_h);
0149     alpaka::memcpy(queue, off_d, offsets_h);
0150
0151     if (i < 2)
0152       std::cout << "launch for " << offsets_h[blocks] << std::endl;
0153
0154     auto ntXBl = 1 == i % 4 ? 256 : 256;
0155
0156     auto start = std::chrono::high_resolution_clock::now();
0157     // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory).
0158     constexpr int MaxSize = 256 * 32;
0159     auto workdiv = make_workdiv<Acc1D>(blocks, ntXBl);
0160     if (useShared)
0161       // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier
0162       alpaka::enqueue(queue,
0163                       alpaka::createTaskKernel<Acc1D>(workdiv,
0164                                                       radixSortMultiWrapper<U, NS>{},
0165                                                       v_d.data(),
0166                                                       ind_d.data(),
0167                                                       off_d.data(),
0168                                                       nullptr,
0169                                                       MaxSize * sizeof(uint16_t)));
0170     else
0171       alpaka::enqueue(
0172           queue,
0173           alpaka::createTaskKernel<Acc1D>(
0174               workdiv, radixSortMultiWrapper2<U, NS>{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data()));
0175
0176     if (i < 2)
0177       std::cout << "launch done for " << offsets_h[blocks] << std::endl;
0178
0179     alpaka::memcpy(queue, ind_h, ind_d);
0180     alpaka::wait(queue);
0181
0182     delta += std::chrono::high_resolution_clock::now() - start;
0183
0184     if (i < 2)
0185       std::cout << "kernel and read back done for " << offsets_h[blocks] << std::endl;
0186
0187     if (32 == i) {
0188       std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl;
0189       std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]])
0190                 << std::endl;
0191       std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << LL(v_h[ind_h[blockSize / 2]]) << ' '
0192                 << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl;
0193     }
0194     for (int ib = 0; ib < blocks; ++ib) {
0195       std::set<uint16_t> inds;
0196       if (offsets_h[ib + 1] > offsets_h[ib])
0197         inds.insert(ind_h[offsets_h[ib]]);
0198       for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) {
0199         if (inds.count(ind_h[j]) != 0) {
0200           printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]);
0201           std::vector<int> counts;
0202           counts.resize(offsets_h[ib + 1] - offsets_h[ib], 0);
0203           for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) {
0204             counts[ind_h[j2]]++;
0205           }
0206           for (size_t j2 = 0; j2 < counts.size(); j2++) {
0207             if (counts[j2] != 1)
0208               printf("counts[%ld]=%d ", j2, counts[j2]);
0209           }
0210           printf("\n");
0211           printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j]));
0212         }
0213         assert(0 == inds.count(ind_h[j]));
0214         inds.insert(ind_h[j]);
0215         auto a = v_h.data() + offsets_h[ib];
0216         auto k1 = a[ind_h[j]];
0217         auto k2 = a[ind_h[j - 1]];
0218         truncate<NS>(k1);
0219         truncate<NS>(k2);
0220         if (k1 < k2) {
0221           std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", "
0222                     << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j]
0223                     << " (k1 < k2) : a1=" << (int64_t)a[ind_h[j]] << " k1=" << (int64_t)k1
0224                     << " a2= " << (int64_t)a[ind_h[j - 1]] << " k2=" << (int64_t)k2 << std::endl;
0225           assert(false);
0226         }
0227       }
0228       if (!inds.empty()) {
0229         assert(0 == *inds.begin());
0230         assert(inds.size() - 1 == *inds.rbegin());
0231       }
0232       if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib]))
0233         std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib])
0234                   << std::endl;
0235       assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib]));
0236     }
0237   }  // 50 times
0238   std::cout << "Kernel computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / 50.
0239             << " ms per pass" << std::endl;
0240 }
0241
0242 int main() {
0243   // get the list of devices on the current platform
0244   auto const& devices = cms::alpakatools::devices<Platform>();
0245   if (devices.empty()) {
0246     std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0247       "the test will be skipped.\n";
0248     exit(EXIT_FAILURE);
0249   }
0250
0251   for (auto const& device : devices) {
0252     Queue queue(device);
0253     bool useShared = false;
0254
0255     std::cout << "using Global memory" << std::endl;
0256
0257     go<int8_t>(queue, useShared);
0258     go<int16_t>(queue, useShared);
0259     go<int32_t>(queue, useShared);
0260     go<int32_t, 3>(queue, useShared);
0261     go<int64_t>(queue, useShared);
0262     go<float, 4, float, double>(queue, useShared);
0263     go<float, 2, float, double>(queue, useShared);
0264
0265     go<uint8_t>(queue, useShared);
0266     go<uint16_t>(queue, useShared);
0267     go<uint32_t>(queue, useShared);
0268     // go<uint64_t>(v_h);
0269
0270     useShared = true;
0271
0272     std::cout << "using Shared memory" << std::endl;
0273
0274     go<int8_t>(queue, useShared);
0275     go<int16_t>(queue, useShared);
0276     go<int32_t>(queue, useShared);
0277     go<int32_t, 3>(queue, useShared);
0278     go<int64_t>(queue, useShared);
0279     go<float, 4, float, double>(queue, useShared);
0280     go<float, 2, float, double>(queue, useShared);
0281
0282     go<uint8_t>(queue, useShared);
0283     go<uint16_t>(queue, useShared);
0284     go<uint32_t>(queue, useShared);
0285     // go<uint64_t>(v_h);
0286   }
0287   return 0;
0288 }