File indexing completed on 2025-01-22 07:34:11
0001
0002
0003 #include <cstddef>
0004 #include <cstdint>
0005 #include <cstdlib>
0006 #include <algorithm>
0007
0008 #include <alpaka/alpaka.hpp>
0009
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0016
0017 using namespace cms::alpakatools;
0018 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0019
0020 using FLOAT = double;
0021
0022
0023 template <int N>
0024 struct uintN;
0025
0026 template <>
0027 struct uintN<8> {
0028 using type = uint8_t;
0029 };
0030
0031 template <>
0032 struct uintN<16> {
0033 using type = uint16_t;
0034 };
0035
0036 template <>
0037 struct uintN<32> {
0038 using type = uint32_t;
0039 };
0040
0041 template <>
0042 struct uintN<64> {
0043 using type = uint64_t;
0044 };
0045
0046 template <int N>
0047 using uintN_t = typename uintN<N>::type;
0048
0049
0050 template <typename T>
0051 using uintT_t = uintN_t<sizeof(T) * 8>;
0052
0053
0054 template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
0055 ALPAKA_FN_HOST_ACC T truncate(T const& t) {
0056 const int shift = 8 * (sizeof(T) - N);
0057 union {
0058 T t;
0059 uintT_t<T> u;
0060 } c;
0061 c.t = t;
0062 c.u = c.u >> shift << shift;
0063 return c.t;
0064 }
0065
0066 namespace {
0067 struct testKernel {
0068 ALPAKA_FN_ACC void operator()(
0069 Acc1D const& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const {
0070
0071 const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0072 const auto blocksIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0073 ALPAKA_ASSERT_ACC(1 == blocksPerGrid);
0074 ALPAKA_ASSERT_ACC(0 == blocksIdx);
0075 ALPAKA_ASSERT_ACC(elements <= 2048);
0076
0077 auto& order = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0078 auto& sws = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0079 auto& z = alpaka::declareSharedVar<float[2048], __COUNTER__>(acc);
0080 auto& iz = alpaka::declareSharedVar<int[2048], __COUNTER__>(acc);
0081 for (auto itrack : uniform_elements(acc, elements)) {
0082 z[itrack] = gpu_input[itrack];
0083 iz[itrack] = 10000 * gpu_input[itrack];
0084
0085 }
0086 alpaka::syncBlockThreads(acc);
0087 radixSort<Acc1D, float, 2>(acc, z, order, sws, elements);
0088 alpaka::syncBlockThreads(acc);
0089
0090
0091 for (auto itrack : uniform_elements(acc, elements - 1)) {
0092 auto ntrack = order[itrack];
0093 auto mtrack = order[itrack + 1];
0094 ALPAKA_ASSERT_ACC(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack]));
0095 }
0096
0097 alpaka::syncBlockThreads(acc);
0098
0099 if (doPrint)
0100 if (once_per_grid(acc)) {
0101 for (int itrackO = 0; itrackO < elements; itrackO++) {
0102 int itrack = order[itrackO];
0103 printf(
0104 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0105 "radixSort %f\n",
0106 elements,
0107 itrackO,
0108 itrack,
0109 gpu_input[itrack],
0110 z[itrack]);
0111 gpu_product[itrackO] = itrack;
0112 }
0113 }
0114
0115 alpaka::syncBlockThreads(acc);
0116 radixSort<Acc1D, int, 4>(acc, iz, order, sws, elements);
0117 alpaka::syncBlockThreads(acc);
0118
0119 for (auto itrack : uniform_elements(acc, elements - 1)) {
0120 auto ntrack = order[itrack];
0121 auto mtrack = order[itrack + 1];
0122 ALPAKA_ASSERT_ACC(iz[ntrack] <= iz[mtrack]);
0123 }
0124
0125 if (doPrint)
0126 if (once_per_grid(acc)) {
0127 for (int itrackO = 0; itrackO < elements; itrackO++) {
0128 int itrack = order[itrackO];
0129 printf(
0130 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0131 "radixSort %d\n",
0132 elements,
0133 itrackO,
0134 itrack,
0135 gpu_input[itrack],
0136 iz[itrack]);
0137 gpu_product[itrackO] = itrack;
0138 }
0139 }
0140 }
0141 };
0142
0143 void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
0144 auto blockSize = 512;
0145 auto gridSize = 1;
0146 const auto workdiv = make_workdiv<Acc1D>(gridSize, blockSize);
0147 alpaka::enqueue(queue,
0148 alpaka::createTaskKernel<Acc1D>(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint));
0149 alpaka::wait(queue);
0150 }
0151 }
0152
0153 int main() {
0154
0155 auto const& devices = cms::alpakatools::devices<Platform>();
0156 if (devices.empty()) {
0157 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0158 "the test will be skipped.\n";
0159 exit(EXIT_FAILURE);
0160 }
0161
0162 std::random_device rd;
0163 std::mt19937 g(rd());
0164
0165
0166 for (auto const& device : devices) {
0167 Queue queue(device);
0168
0169 int nmax = 4 * 260;
0170 auto gpu_input_h = cms::alpakatools::make_host_buffer<FLOAT[]>(queue, nmax);
0171 auto i = gpu_input_h.data();
0172 for (auto v : {30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007,
0173 2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911,
0174 2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246,
0175 2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002,
0176 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, 2.9575133324, 2.9575304985, 2.9586606026,
0177 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, 2.9676523209, 2.9708273411,
0178 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, 3.0187871456,
0179 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167,
0180 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686,
0181 3.068046093, 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367,
0182 3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708,
0183 3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217,
0184 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, 3.0815284252, 3.0817306042, 3.0819730759,
0185 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, 3.0855510235, 3.0856611729,
0186 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, 3.0903317928,
0187 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285,
0188 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464,
0189 3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049,
0190 3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591,
0191 3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172,
0192 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, 3.5587313175, 3.5592217445, 3.559384346,
0193 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, 3.5683133602, 3.5696356297,
0194 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, 3.5839796066,
0195 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697,
0196 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496,
0197 4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598,
0198 4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559,
0199 4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994,
0200 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, 4.7075614929, 4.7075891495, 4.7076044083,
0201 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, 4.7092385292, 4.709479332,
0202 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, 4.7113513947,
0203 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111,
0204 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918,
0205 4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063,
0206 4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898,
0207 4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883, 4.7356877327, 4.740146637, 4.742254734,
0208 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, 4.7723139628, 4.774812736, 4.8577151299,
0209 4.890037536}) {
0210 *(i++) = v;
0211 }
0212 auto input = gpu_input_h.data();
0213 for (int i = 0; i < 260; i++) {
0214 input[i + 260] = -input[i];
0215 input[i + 2 * 260] = input[i] + 10;
0216 input[i + 3 * 260] = -input[i] - 10;
0217 }
0218 auto gpu_input_d = cms::alpakatools::make_device_buffer<FLOAT[]>(queue, nmax);
0219 auto gpu_product_d = cms::alpakatools::make_device_buffer<int[]>(queue, nmax);
0220
0221 alpaka::memcpy(queue, gpu_input_d, gpu_input_h);
0222
0223 for (int k = 2; k <= nmax; k++) {
0224 std::shuffle(input, input + k, g);
0225 printf("Test with %d items\n", k);
0226
0227 testWrapper(queue, gpu_input_d.data(), gpu_product_d.data(), k, false);
0228 alpaka::wait(queue);
0229 }
0230 }
0231 return 0;
0232 }