#ifndef HeterogeneousCore_AlpakaInterface_interface_radixSort_h
#define HeterogeneousCore_AlpakaInterface_interface_radixSort_h

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"

namespace cms::alpakatools {

  // no-op reorder, used for unsigned integer keys: their bytewise order is
  // already the numerical order
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {}

  // for signed integer keys, the bytewise (unsigned) order places the negative
  // values, whose most significant bit is set, after the positive ones; rotate
  // the index array so the negatives come first, preserving their relative order
  // (e.g. [0, 1, 2, -2, -1] becomes [-2, -1, 0, 1, 2])
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    // position of the first negative value in the bytewise-sorted order
    auto& firstNeg = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
    firstNeg = a[ind[0]] < 0 ? 0 : size;
    alpaka::syncBlockThreads(acc);

    // find the sign flip: the only pair of neighbours with different sign bits
    for (auto idx : independent_group_elements(acc, size - 1)) {
      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) {
        firstNeg = idx + 1;
      }
    }
    alpaka::syncBlockThreads(acc);

    // move the negative values to the front...
    for (auto idx : independent_group_elements(acc, firstNeg, size)) {
      ind2[idx - firstNeg] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // ...and the positive values behind them
    for (auto idx : independent_group_elements(acc, firstNeg)) {
      ind2[idx + size - firstNeg] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // copy the rotated permutation back into ind
    for (auto idx : independent_group_elements(acc, size)) {
      ind[idx] = ind2[idx];
    }
  }

  // for floating point keys reinterpreted as integers, the positive values sort
  // correctly but the negative ones end up last and in reverse order; move them
  // to the front while reversing them (e.g. [0.5, 1.5, -0.5, -1.5] becomes
  // [-1.5, -0.5, 0.5, 1.5])
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    // position of the first negative value in the bytewise-sorted order
    auto& firstNeg = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
    firstNeg = a[ind[0]] < 0 ? 0 : size;
    alpaka::syncBlockThreads(acc);

    // find the sign flip: the only pair of neighbours with different sign bits
    for (auto idx : independent_group_elements(acc, size - 1)) {
      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0)
        firstNeg = idx + 1;
    }
    alpaka::syncBlockThreads(acc);

    // move the negative values to the front, reversing their order...
    for (auto idx : independent_group_elements(acc, firstNeg, size)) {
      ind2[size - idx - 1] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // ...and the positive values behind them
    for (auto idx : independent_group_elements(acc, firstNeg)) {
      ind2[idx + size - firstNeg] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // copy the fixed-up permutation back into ind
    for (auto idx : independent_group_elements(acc, size)) {
      ind[idx] = ind2[idx];
    }
  }
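
  // Radix sort of the values a[0..size - 1], expressed as a permutation written
  // into the index array ind; the values themselves are never moved. Each pass
  // histograms one byte of the keys, prefix-scans the histogram into bucket
  // offsets, and stably scatters the indices into the other index buffer,
  // starting from the least significant of the bytes used. After the last pass
  // the indices follow the bytewise order, and the reorder functor fixes up the
  // result for signed integer or floating point keys. Runs within a single block.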
  template <typename TAcc,
            typename T,   // any integer type; signedness is handled by the reorder functor
            int NS,       // number of bytes to use in sorting: the NS most significant ones
            typename RF>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl(
      const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) {
    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
      const auto warpSize = alpaka::warp::getSize(acc);
      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
      [[maybe_unused]] const uint32_t blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
      // the warp size is expected to be a power of two
      assert(warpSize && (0 == (warpSize & (warpSize - 1))));
      const std::size_t warpMask = warpSize - 1;

      // 8-bit bins: one sorting pass per byte of the data type
      constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPasses = dataBits / binBits;
      // make sure the passes cover the data exactly
      static_assert(0 == dataBits % binBits);
      // make sure the NS parameter makes sense
      static_assert(NS > 0 && NS <= sizeof(T));
      constexpr int binsNumber = 1 << binBits;
      constexpr int binsMask = binsNumber - 1;
      // skip the passes over the (sizeof(T) - NS) least significant bytes,
      // sorting only on the NS most significant ones
      constexpr int initialSortingPass = int(sizeof(T)) - NS;

      // per-bin counts, turned into cumulative bucket offsets by the prefix scan
      auto& c = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
      // scratch for the warp-level prefix scan; reused in the scatter phase to
      // hold the bin of the element handled by each thread of a batch
      auto& ct = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
      // per bin, the highest element index seen in the current batch
      auto& cu = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
      // highest element index of the batch currently being scattered
      auto& ibs = alpaka::declareSharedVar<int, __COUNTER__>(acc);
      auto& currentSortingPass = alpaka::declareSharedVar<int, __COUNTER__>(acc);

      ALPAKA_ASSERT_ACC(size > 0);
      // the block must provide at least one thread (element) per bin
      ALPAKA_ASSERT_ACC(blockDimension >= binsNumber);

      currentSortingPass = initialSortingPass;

      // ping-pong index buffers: j holds the current order, k receives the scattered one
      auto j = ind;
      auto k = ind2;

      // start from the identity permutation
      for (auto idx : independent_group_elements(acc, size)) {
        j[idx] = idx;
      }
      alpaka::syncBlockThreads(acc);

      // one sorting pass per byte, least significant (of those used) first
      while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, (currentSortingPass < totalSortingPasses))) {
        for (auto idx : independent_group_elements(acc, binsNumber)) {
          c[idx] = 0;
        }
        alpaka::syncBlockThreads(acc);
        const auto sortingPassShift = binBits * currentSortingPass;

        // fill the histogram of the byte considered in this pass
        for (auto idx : independent_group_elements(acc, size)) {
          auto bin = (a[j[idx]] >> sortingPassShift) & binsMask;
          alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{});
        }
        alpaka::syncBlockThreads(acc);

        // sanity check: every element must have been counted exactly once
        if (!threadIdxLocal && 1 == alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]) {
          size_t total = 0;
          for (int i = 0; i < (int)binsNumber; i++) {
            total += c[i];
          }
          assert(total == size);
        }

        // prefix scan of the histogram: first an inclusive scan within each warp,
        // using warp shuffles...
        for (auto idx : independent_group_elements(acc, binsNumber)) {
          auto x = c[idx];
          auto laneId = idx & warpMask;
          for (int offset = 1; offset < warpSize; offset <<= 1) {
            auto y = alpaka::warp::shfl(acc, x, laneId - offset);
            if (laneId >= (uint32_t)offset)
              x += y;
          }
          ct[idx] = x;
        }
        alpaka::syncBlockThreads(acc);

        // ...then add the totals of the preceding warps, so that c[idx] becomes
        // the cumulative count of bins 0..idx, i.e. the end offset of bucket idx
        for (auto idx : independent_group_elements(acc, binsNumber)) {
          auto ss = (idx / warpSize) * warpSize - 1;
          c[idx] = ct[idx];
          for (int i = ss; i > 0; i -= warpSize)
            c[idx] += ct[i];
        }
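        // for example, with warpSize 4 and counts {2, 0, 3, 1} in the first warp,
        // the shuffle scan leaves ct = {2, 2, 5, 6}; the second loop then folds in
        // the totals of the preceding warps for the bins of the later warps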

        // scatter the indices into their buckets, walking the array from the end
        // downwards in batches of binsNumber elements; buckets are filled from
        // their end offsets, so processing the highest indices first keeps the
        // sort stable
        ibs = size - 1;
        alpaka::syncBlockThreads(acc);

        while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, ibs >= 0)) {
          // reset the per-bin bookkeeping for this batch
          for (auto idx : independent_group_elements(acc, binsNumber)) {
            cu[idx] = -1;
            ct[idx] = -1;
          }
          alpaka::syncBlockThreads(acc);

          // each thread of the batch records the bin of its element (i = ibs - idx)
          // and competes, via atomicMax, to own the highest element index of that bin
          for (auto idx : independent_group_elements(acc, binsNumber)) {
            int i = ibs - idx;
            int32_t bin = -1;
            if (i >= 0) {
              bin = (a[j[i]] >> sortingPassShift) & binsMask;
              ct[idx] = bin;
              alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Threads{});
            }
          }
          alpaka::syncBlockThreads(acc);

          // the winning thread of each bin scatters all batch elements that fall
          // into its bin, in decreasing element index order
          for (auto idx : independent_group_elements(acc, binsNumber)) {
            int i = ibs - idx;
            if (i >= 0) {
              int32_t bin = ct[idx];
              if (cu[bin] == i) {
                for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) {
                  if (ct[peerThreadIdx] == bin) {
                    k[--c[bin]] = j[ibs - peerThreadIdx];
                  }
                }
              }
            }
          }
          alpaka::syncBlockThreads(acc);

          // move on to the next batch
          if (threadIdxLocal == 0) {
            ibs -= binsNumber;
            alpaka::mem_fence(acc, alpaka::memory_scope::Grid{});
          }
          alpaka::syncBlockThreads(acc);
        }

        alpaka::syncBlockThreads(acc);
        // all buckets have been emptied down to their start offsets
        ALPAKA_ASSERT_ACC(c[0] == 0);

        // swap the ping-pong buffers for the next pass
        auto t = j;
        j = k;
        k = t;

        if (threadIdxLocal == 0)
          ++currentSortingPass;
        alpaka::syncBlockThreads(acc);
      }

      // an even number of passes leaves the result in the original index buffer
      if ((dataBits != 8) && (0 == (NS & 1)))
        ALPAKA_ASSERT_ACC(j == ind);

      // after an odd number of passes, copy the result back into ind
      if (j != ind)
        for (auto idx : independent_group_elements(acc, size)) {
          ind[idx] = ind2[idx];
        }

      alpaka::syncBlockThreads(acc);

      // fix up the bytewise order for signed integer or floating point keys
      reorder(acc, a, ind, ind2, size);
    } else {
      // single-thread-per-block back-ends are served by the CPU overload of radixSort below
    }
  }

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),  // number of most significant bytes to use in sorting
            typename std::enable_if<std::is_unsigned<T>::value && !requires_single_thread_per_block_v<TAcc>, T>::type* =
                nullptr>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, dummyReorder<TAcc, T>);
  }

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),  // number of most significant bytes to use in sorting
            typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value &&
                                        !requires_single_thread_per_block_v<TAcc>,
                                    T>::type* = nullptr>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, reorderSigned<TAcc, T>);
  }

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),  // number of most significant bytes to use in sorting
            typename std::enable_if<std::is_floating_point<T>::value && !requires_single_thread_per_block_v<TAcc>,
                                    T>::type* = nullptr>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size");
    // sort the floats by reinterpreting their bits as 32-bit integers;
    // reorderFloat then restores the correct order for the negative values
    using I = int;
    radixSortImpl<TAcc, I, NS>(acc, (I const*)(a), ind, ind2, size, reorderFloat<TAcc, I>);
  }
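
  // Usage sketch (illustrative only): a kernel functor whose block cooperatively
  // sorts one array of floats through an index array; the functor name and the
  // buffers are hypothetical, and ind2 must have the same size as ind.
  //
  //   struct SingleBlockSortKernel {
  //     template <typename TAcc>
  //     ALPAKA_FN_ACC void operator()(
  //         const TAcc& acc, float const* v, uint16_t* ind, uint16_t* ind2, uint32_t size) const {
  //       cms::alpakatools::radixSort<TAcc, float>(acc, v, ind, ind2, size);
  //       // here v[ind[0]] <= v[ind[1]] <= ... <= v[ind[size - 1]]
  //     }
  //   };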

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),
            typename std::enable_if<requires_single_thread_per_block_v<TAcc>, T>::type* = nullptr>
  // deliberately not ALPAKA_FN_ACC: the std::stable_sort based fallback is host-only code
  ALPAKA_FN_INLINE void radixSort(const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    static_assert(requires_single_thread_per_block_v<TAcc>, "CPU sort (not a radixSort) called with wrong accelerator");
    // on CPU back-ends each block runs a single thread, so a plain stable sort of
    // the indices by the pointed-to values yields the same result as the radix sort
    std::iota(ind, ind + size, 0);
    std::stable_sort(ind, ind + size, [a](uint16_t i0, uint16_t i1) { return a[i0] < a[i1]; });
  }

  // sort multiple contiguous slices of v in parallel: block b sorts the slice
  // [offsets[b], offsets[b + 1]); if workspace is null, the scratch index buffer
  // is taken from the block's dynamic shared memory
  template <typename TAcc, typename T, int NS = sizeof(T)>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti(
      const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
    uint16_t* ws = alpaka::getDynSharedMem<uint16_t>(acc);

    const uint32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    auto a = v + offsets[blockIdx];
    auto ind = index + offsets[blockIdx];
    auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx];
    auto size = offsets[blockIdx + 1] - offsets[blockIdx];
    assert(offsets[blockIdx + 1] >= offsets[blockIdx]);
    if (size > 0)
      radixSort<TAcc, T, NS>(acc, a, ind, ind2, size);
  }

  template <typename T, int NS = sizeof(T)>
  struct radixSortMultiWrapper {
    // kernel wrapper around radixSortMulti; sharedMemBytes is not used here, but
    // is read by the BlockSharedMemDynSizeBytes trait specialisation below to
    // size the dynamic shared memory when no external workspace is given
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                  T const* v,
                                  uint16_t* index,
                                  uint32_t const* offsets,
                                  uint16_t* workspace,
                                  size_t sharedMemBytes = 0) const {
      radixSortMulti<TAcc, T, NS>(acc, v, index, offsets, workspace);
    }
  };

  template <typename T, int NS = sizeof(T)>
  struct radixSortMultiWrapper2 {
    // same kernel as radixSortMultiWrapper, but not covered by the
    // BlockSharedMemDynSizeBytes specialisation below: the caller is expected to
    // provide an external workspace
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                  T const* v,
                                  uint16_t* index,
                                  uint32_t const* offsets,
                                  uint16_t* workspace,
                                  size_t sharedMemBytes = 0) const {
      radixSortMulti<TAcc, T, NS>(acc, v, index, offsets, workspace);
    }
  };
}  // namespace cms::alpakatools

namespace alpaka::trait {
  // specialise the BlockSharedMemDynSizeBytes trait to determine the amount of
  // dynamic shared memory needed by the radixSortMultiWrapper kernel
  template <typename TAcc, typename T, int NS>
  struct BlockSharedMemDynSizeBytes<cms::alpakatools::radixSortMultiWrapper<T, NS>, TAcc> {
    // return the size, in bytes, of the dynamic shared memory for a block
    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
        cms::alpakatools::radixSortMultiWrapper<T, NS> const& /* kernel */,
        alpaka_common::Vec1D /* blockThreadExtent */,
        alpaka_common::Vec1D /* threadElemExtent */,
        T const* /* v */,
        uint16_t* /* index */,
        uint32_t const* /* offsets */,
        uint16_t* workspace,
        size_t sharedMemBytes) {
      if (workspace != nullptr)
        return 0;
      // no external workspace: use the dynamic shared memory requested by the caller
      return sharedMemBytes;
    }
  };
}  // namespace alpaka::trait
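
// Usage sketch (illustrative only): launching radixSortMultiWrapper with the
// scratch indices in dynamic shared memory. The accelerator, queue, buffers and
// work division are hypothetical; sharedMemBytes must provide one uint16_t per
// element of the largest slice.
//
//   const size_t sharedMemBytes = maxSliceSize * sizeof(uint16_t);
//   const auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(numberOfSlices, 256);
//   alpaka::exec<Acc1D>(queue, workDiv, cms::alpakatools::radixSortMultiWrapper<float>{},
//                       values, index, offsets, nullptr, sharedMemBytes);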

#endif  // HeterogeneousCore_AlpakaInterface_interface_radixSort_h