Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:15:41

0001 #include <algorithm>
0002 #include <cassert>
0003 #include <chrono>
0004 using namespace std::chrono_literals;
0005 #include <cstdint>
0006 #include <cstdlib>
0007 #include <iomanip>
0008 #include <iostream>
0009 #include <limits>
0010 #include <memory>
0011 #include <random>
0012 #include <set>
0013 #include <type_traits>
0014 
0015 #include <alpaka/alpaka.hpp>
0016 
0017 #include "FWCore/Utilities/interface/stringize.h"
0018 #include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h"
0019 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
0020 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0021 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0022 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0023 
0024 using namespace cms::alpakatools;
0025 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0026 
0027 template <typename T>
0028 struct RS {
0029   using type = std::uniform_int_distribution<T>;
0030   static auto ud() { return type(std::numeric_limits<T>::min(), std::numeric_limits<T>::max()); }
0031   static constexpr T imax = std::numeric_limits<T>::max();
0032 };
0033 
0034 template <>
0035 struct RS<float> {
0036   using T = float;
0037   using type = std::uniform_real_distribution<float>;
0038   static auto ud() { return type(-std::numeric_limits<T>::max() / 2, std::numeric_limits<T>::max() / 2); }
0039   //  static auto ud() { return type(0,std::numeric_limits<T>::max()/2);}
0040   static constexpr int imax = std::numeric_limits<int>::max();
0041 };
0042 
0043 // A templated unsigned integer type with N bytes
0044 template <int N>
0045 struct uintN;
0046 
0047 template <>
0048 struct uintN<8> {
0049   using type = uint8_t;
0050 };
0051 
0052 template <>
0053 struct uintN<16> {
0054   using type = uint16_t;
0055 };
0056 
0057 template <>
0058 struct uintN<32> {
0059   using type = uint32_t;
0060 };
0061 
0062 template <>
0063 struct uintN<64> {
0064   using type = uint64_t;
0065 };
0066 
0067 template <int N>
0068 using uintN_t = typename uintN<N>::type;
0069 
0070 // A templated unsigned integer type with the same size as T
0071 template <typename T>
0072 using uintT_t = uintN_t<sizeof(T) * 8>;
0073 
0074 // Keep only the `N` most significant bytes of `t`, and set the others to zero
0075 template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
0076 void truncate(T& t) {
0077   const int shift = 8 * (sizeof(T) - N);
0078   union {
0079     T t;
0080     uintT_t<T> u;
0081   } c;
0082   c.t = t;
0083   c.u = c.u >> shift << shift;
0084   t = c.t;
0085 }
0086 
0087 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
0088 void go(Queue& queue, bool useShared) {
0089   std::mt19937 eng;
0090   //std::mt19937 eng2;
0091   auto rgen = RS<T>::ud();
0092 
0093   std::chrono::high_resolution_clock::duration delta = 0ns;
0094   constexpr int blocks = 10;
0095   constexpr int blockSize = 256 * 32;
0096   constexpr int N = blockSize * blocks;
0097   auto v_h = cms::alpakatools::make_host_buffer<T[]>(queue, N);
0098   //uint16_t ind_h[N];
0099 
0100   constexpr bool sgn = T(-1) < T(0);
0101   std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned")
0102             << (std::numeric_limits<T>::is_integer ? " 'ints'" : " 'float'") << " of size " << sizeof(T) << " using "
0103             << NS << " significant bytes" << std::endl;
0104 
0105   for (int i = 0; i < 50; ++i) {
0106     if (i == 49) {
0107       for (long long j = 0; j < N; j++)
0108         v_h[j] = 0;
0109     } else if (i > 30) {
0110       for (long long j = 0; j < N; j++)
0111         v_h[j] = rgen(eng);
0112     } else {
0113       uint64_t imax = (i < 15) ? uint64_t(RS<T>::imax) + 1LL : 255;
0114       for (uint64_t j = 0; j < N; j++) {
0115         v_h[j] = (j % imax);
0116         if (j % 2 && i % 2)
0117           v_h[j] = -v_h[j];
0118       }
0119     }
0120 
0121     auto offsets_h = cms::alpakatools::make_host_buffer<uint32_t[]>(queue, blocks + 1);
0122     offsets_h[0] = 0;
0123     for (int j = 1; j < blocks + 1; ++j) {
0124       offsets_h[j] = offsets_h[j - 1] + blockSize - 3 * j;
0125       assert(offsets_h[j] <= N);
0126     }
0127 
0128     if (i == 1) {  // special cases...
0129       offsets_h[0] = 0;
0130       offsets_h[1] = 0;
0131       offsets_h[2] = 19;
0132       offsets_h[3] = 32 + offsets_h[2];
0133       offsets_h[4] = 123 + offsets_h[3];
0134       offsets_h[5] = 256 + offsets_h[4];
0135       offsets_h[6] = 311 + offsets_h[5];
0136       offsets_h[7] = 2111 + offsets_h[6];
0137       offsets_h[8] = 256 * 11 + offsets_h[7];
0138       offsets_h[9] = 44 + offsets_h[8];
0139       offsets_h[10] = 3297 + offsets_h[9];
0140     }
0141 
0142     std::shuffle(v_h.data(), v_h.data() + N, eng);
0143 
0144     auto v_d = cms::alpakatools::make_device_buffer<U[]>(queue, N);
0145     auto ind_d = cms::alpakatools::make_device_buffer<uint16_t[]>(queue, N);
0146     auto ind_h = cms::alpakatools::make_host_buffer<uint16_t[]>(queue, N);
0147     auto ws_d = cms::alpakatools::make_device_buffer<uint16_t[]>(queue, N);
0148     auto off_d = cms::alpakatools::make_device_buffer<uint32_t[]>(queue, blocks + 1);
0149 
0150     alpaka::memcpy(queue, v_d, v_h);
0151     alpaka::memcpy(queue, off_d, offsets_h);
0152 
0153     if (i < 2)
0154       std::cout << "launch for " << offsets_h[blocks] << std::endl;
0155 
0156     auto ntXBl = 1 == i % 4 ? 256 : 256;
0157 
0158     auto start = std::chrono::high_resolution_clock::now();
0159     // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory).
0160     constexpr int MaxSize = 256 * 32;
0161     auto workdiv = make_workdiv<Acc1D>(blocks, ntXBl);
0162     if (useShared)
0163       // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier
0164       //
0165       alpaka::enqueue(queue,
0166                       alpaka::createTaskKernel<Acc1D>(workdiv,
0167                                                       radixSortMultiWrapper<U, NS>{},
0168                                                       v_d.data(),
0169                                                       ind_d.data(),
0170                                                       off_d.data(),
0171                                                       nullptr,
0172                                                       MaxSize * sizeof(uint16_t)));
0173     else
0174       alpaka::enqueue(
0175           queue,
0176           alpaka::createTaskKernel<Acc1D>(
0177               workdiv, radixSortMultiWrapper2<U, NS>{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data()));
0178 
0179     if (i < 2)
0180       std::cout << "launch done for " << offsets_h[blocks] << std::endl;
0181 
0182     alpaka::memcpy(queue, ind_h, ind_d);
0183     alpaka::wait(queue);
0184 
0185     delta += std::chrono::high_resolution_clock::now() - start;
0186 
0187     if (i < 2)
0188       std::cout << "kernel and read back done for " << offsets_h[blocks] << std::endl;
0189 
0190     if (32 == i) {
0191       std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl;
0192       std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]])
0193                 << std::endl;
0194       std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << LL(v_h[ind_h[blockSize / 2]]) << ' '
0195                 << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl;
0196     }
0197     for (int ib = 0; ib < blocks; ++ib) {
0198       std::set<uint16_t> inds;
0199       if (offsets_h[ib + 1] > offsets_h[ib])
0200         inds.insert(ind_h[offsets_h[ib]]);
0201       for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) {
0202         if (inds.count(ind_h[j]) != 0) {
0203           printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]);
0204           std::vector<int> counts;
0205           counts.resize(offsets_h[ib + 1] - offsets_h[ib], 0);
0206           for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) {
0207             counts[ind_h[j2]]++;
0208           }
0209           for (size_t j2 = 0; j2 < counts.size(); j2++) {
0210             if (counts[j2] != 1)
0211               printf("counts[%ld]=%d ", j2, counts[j2]);
0212           }
0213           printf("\n");
0214           printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j]));
0215         }
0216         assert(0 == inds.count(ind_h[j]));
0217         inds.insert(ind_h[j]);
0218         auto a = v_h.data() + offsets_h[ib];
0219         auto k1 = a[ind_h[j]];
0220         auto k2 = a[ind_h[j - 1]];
0221         truncate<NS>(k1);
0222         truncate<NS>(k2);
0223         if (k1 < k2) {
0224           std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", "
0225                     << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j]
0226                     << " (k1 < k2) : a1=" << (int64_t)a[ind_h[j]] << " k1=" << (int64_t)k1
0227                     << " a2= " << (int64_t)a[ind_h[j - 1]] << " k2=" << (int64_t)k2 << std::endl;
0228           //sleep(2);
0229           assert(false);
0230         }
0231       }
0232       if (!inds.empty()) {
0233         assert(0 == *inds.begin());
0234         assert(inds.size() - 1 == *inds.rbegin());
0235       }
0236       if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib]))
0237         std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib])
0238                   << std::endl;
0239       //
0240       assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib]));
0241     }
0242   }  // 50 times
0243   std::cout << "Kernel computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / 50.
0244             << " ms per pass" << std::endl;
0245 }
0246 
0247 int main() {
0248   // get the list of devices on the current platform
0249   auto const& devices = cms::alpakatools::devices<Platform>();
0250   if (devices.empty()) {
0251     std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0252       "the test will be skipped.\n";
0253     exit(EXIT_FAILURE);
0254   }
0255 
0256   for (auto const& device : devices) {
0257     Queue queue(device);
0258     bool useShared = false;
0259 
0260     std::cout << "using Global memory" << std::endl;
0261 
0262     go<int8_t>(queue, useShared);
0263     go<int16_t>(queue, useShared);
0264     go<int32_t>(queue, useShared);
0265     go<int32_t, 3>(queue, useShared);
0266     go<int64_t>(queue, useShared);
0267     go<float, 4, float, double>(queue, useShared);
0268     go<float, 2, float, double>(queue, useShared);
0269 
0270     go<uint8_t>(queue, useShared);
0271     go<uint16_t>(queue, useShared);
0272     go<uint32_t>(queue, useShared);
0273     // go<uint64_t>(v_h);
0274 
0275     useShared = true;
0276 
0277     std::cout << "using Shared memory" << std::endl;
0278 
0279     go<int8_t>(queue, useShared);
0280     go<int16_t>(queue, useShared);
0281     go<int32_t>(queue, useShared);
0282     go<int32_t, 3>(queue, useShared);
0283     go<int64_t>(queue, useShared);
0284     go<float, 4, float, double>(queue, useShared);
0285     go<float, 2, float, double>(queue, useShared);
0286 
0287     go<uint8_t>(queue, useShared);
0288     go<uint16_t>(queue, useShared);
0289     go<uint32_t>(queue, useShared);
0290     // go<uint64_t>(v_h);
0291   }
0292   return 0;
0293 }