File indexing completed on 2024-04-23 22:56:17
0001
0002
0003 #include <cstddef>
0004 #include <cstdint>
0005 #include <cstdlib>
0006 #include <algorithm>
0007
0008 #include <alpaka/alpaka.hpp>
0009
0010 #include "FWCore/Utilities/interface/stringize.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0016
0017 using namespace cms::alpakatools;
0018 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0019
0020 using FLOAT = double;
0021
0022
0023 template <int N>
0024 struct uintN;
0025
0026 template <>
0027 struct uintN<8> {
0028 using type = uint8_t;
0029 };
0030
0031 template <>
0032 struct uintN<16> {
0033 using type = uint16_t;
0034 };
0035
0036 template <>
0037 struct uintN<32> {
0038 using type = uint32_t;
0039 };
0040
0041 template <>
0042 struct uintN<64> {
0043 using type = uint64_t;
0044 };
0045
0046 template <int N>
0047 using uintN_t = typename uintN<N>::type;
0048
0049
0050 template <typename T>
0051 using uintT_t = uintN_t<sizeof(T) * 8>;
0052
0053
0054 template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
0055 ALPAKA_FN_HOST_ACC T truncate(T const& t) {
0056 const int shift = 8 * (sizeof(T) - N);
0057 union {
0058 T t;
0059 uintT_t<T> u;
0060 } c;
0061 c.t = t;
0062 c.u = c.u >> shift << shift;
0063 return c.t;
0064 }
0065
0066 namespace {
0067 struct testKernel {
0068 template <typename TAcc>
0069 ALPAKA_FN_ACC void operator()(
0070 const TAcc& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const {
0071
0072
0073 bool threadZero = !alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
0074
0075
0076 const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0077 const auto blocksIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0078 ALPAKA_ASSERT_ACC(1 == blocksPerGrid);
0079 ALPAKA_ASSERT_ACC(0 == blocksIdx);
0080 ALPAKA_ASSERT_ACC(elements <= 2048);
0081
0082 auto& order = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0083 auto& sws = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
0084 auto& z = alpaka::declareSharedVar<float[2048], __COUNTER__>(acc);
0085 auto& iz = alpaka::declareSharedVar<int[2048], __COUNTER__>(acc);
0086
0087
0088
0089
0090 for (auto itrack : uniform_elements(acc, elements)) {
0091 z[itrack] = gpu_input[itrack];
0092 iz[itrack] = 10000 * gpu_input[itrack];
0093
0094 }
0095 alpaka::syncBlockThreads(acc);
0096 radixSort<TAcc, float, 2>(acc, z, order, sws, elements);
0097 alpaka::syncBlockThreads(acc);
0098
0099
0100 for (auto itrack : uniform_elements(acc, elements - 1)) {
0101 auto ntrack = order[itrack];
0102 auto mtrack = order[itrack + 1];
0103 ALPAKA_ASSERT_ACC(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack]));
0104 }
0105
0106 alpaka::syncBlockThreads(acc);
0107
0108 if (doPrint)
0109 if (threadZero) {
0110 for (int itrackO = 0; itrackO < elements; itrackO++) {
0111 int itrack = order[itrackO];
0112 printf(
0113 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0114 "radixSort %f\n",
0115 elements,
0116 itrackO,
0117 itrack,
0118 gpu_input[itrack],
0119 z[itrack]);
0120 gpu_product[itrackO] = itrack;
0121 }
0122 }
0123
0124 alpaka::syncBlockThreads(acc);
0125 radixSort<TAcc, int, 4>(acc, iz, order, sws, elements);
0126 alpaka::syncBlockThreads(acc);
0127
0128 for (auto itrack : uniform_elements(acc, elements - 1)) {
0129 auto ntrack = order[itrack];
0130 auto mtrack = order[itrack + 1];
0131 ALPAKA_ASSERT_ACC(iz[ntrack] <= iz[mtrack]);
0132 }
0133
0134 if (doPrint)
0135 if (threadZero) {
0136 for (int itrackO = 0; itrackO < elements; itrackO++) {
0137 int itrack = order[itrackO];
0138 printf(
0139 "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
0140 "radixSort %d\n",
0141 elements,
0142 itrackO,
0143 itrack,
0144 gpu_input[itrack],
0145 iz[itrack]);
0146 gpu_product[itrackO] = itrack;
0147 }
0148 }
0149 }
0150 };
0151
0152 void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
0153 auto blockSize = 512;
0154 auto gridSize = 1;
0155 const auto workdiv = make_workdiv<Acc1D>(gridSize, blockSize);
0156 alpaka::enqueue(queue,
0157 alpaka::createTaskKernel<Acc1D>(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint));
0158 alpaka::wait(queue);
0159 }
0160 }
0161
0162 int main() {
0163
0164 auto const& devices = cms::alpakatools::devices<Platform>();
0165 if (devices.empty()) {
0166 std::cerr << "No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
0167 "the test will be skipped.\n";
0168 exit(EXIT_FAILURE);
0169 }
0170
0171 std::random_device rd;
0172 std::mt19937 g(rd());
0173
0174
0175 for (auto const& device : devices) {
0176 Queue queue(device);
0177
0178 int nmax = 4 * 260;
0179 auto gpu_input_h = cms::alpakatools::make_host_buffer<FLOAT[]>(queue, nmax);
0180 auto i = gpu_input_h.data();
0181 for (auto v : {30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007,
0182 2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911,
0183 2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246,
0184 2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002,
0185 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, 2.9575133324, 2.9575304985, 2.9586606026,
0186 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, 2.9676523209, 2.9708273411,
0187 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, 3.0187871456,
0188 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167,
0189 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686,
0190 3.068046093, 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367,
0191 3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708,
0192 3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217,
0193 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, 3.0815284252, 3.0817306042, 3.0819730759,
0194 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, 3.0855510235, 3.0856611729,
0195 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, 3.0903317928,
0196 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285,
0197 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464,
0198 3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049,
0199 3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591,
0200 3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172,
0201 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, 3.5587313175, 3.5592217445, 3.559384346,
0202 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, 3.5683133602, 3.5696356297,
0203 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, 3.5839796066,
0204 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697,
0205 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496,
0206 4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598,
0207 4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559,
0208 4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994,
0209 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, 4.7075614929, 4.7075891495, 4.7076044083,
0210 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, 4.7092385292, 4.709479332,
0211 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, 4.7113513947,
0212 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111,
0213 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918,
0214 4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063,
0215 4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898,
0216 4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883, 4.7356877327, 4.740146637, 4.742254734,
0217 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, 4.7723139628, 4.774812736, 4.8577151299,
0218 4.890037536}) {
0219 *(i++) = v;
0220 }
0221 auto input = gpu_input_h.data();
0222 for (int i = 0; i < 260; i++) {
0223 input[i + 260] = -input[i];
0224 input[i + 2 * 260] = input[i] + 10;
0225 input[i + 3 * 260] = -input[i] - 10;
0226 }
0227 auto gpu_input_d = cms::alpakatools::make_device_buffer<FLOAT[]>(queue, nmax);
0228 auto gpu_product_d = cms::alpakatools::make_device_buffer<int[]>(queue, nmax);
0229
0230 alpaka::memcpy(queue, gpu_input_d, gpu_input_h);
0231
0232 for (int k = 2; k <= nmax; k++) {
0233 std::shuffle(input, input + k, g);
0234 printf("Test with %d items\n", k);
0235
0236 testWrapper(queue, gpu_input_d.data(), gpu_product_d.data(), k, false);
0237 alpaka::wait(queue);
0238 }
0239 }
0240 return 0;
0241 }