#ifndef HeterogeneousCore_AlpakaInterface_interface_radixSort_h
#define HeterogeneousCore_AlpakaInterface_interface_radixSort_h

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"

namespace cms::alpakatools {

  // no-op reorder, used for unsigned integer keys: their bytewise order is
  // already the numerical order
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {}

  // for signed integer keys, the bytewise (unsigned) order places the negative
  // values, whose most significant bit is set, after the positive ones; rotate
  // the index array so the negatives come first, preserving their relative order
  // (e.g. [0, 1, 2, -2, -1] becomes [-2, -1, 0, 1, 2])
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    // position of the first negative value in the bytewise-sorted order
    auto& firstNeg = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
    firstNeg = a[ind[0]] < 0 ? 0 : size;
    alpaka::syncBlockThreads(acc);

    // find the sign flip: the only pair of neighbours with different sign bits
    for (auto idx : independent_group_elements(acc, size - 1)) {
      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) {
        firstNeg = idx + 1;
      }
    }
    alpaka::syncBlockThreads(acc);

    // move the negative values to the front...
    for (auto idx : independent_group_elements(acc, firstNeg, size)) {
      ind2[idx - firstNeg] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // ...and the positive values behind them
    for (auto idx : independent_group_elements(acc, firstNeg)) {
      ind2[idx + size - firstNeg] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // copy the rotated permutation back into ind
    for (auto idx : independent_group_elements(acc, size)) {
      ind[idx] = ind2[idx];
    }
  }

  // for floating point keys reinterpreted as integers, the positive values sort
  // correctly but the negative ones end up last and in reverse order; move them
  // to the front while reversing them (e.g. [0.5, 1.5, -0.5, -1.5] becomes
  // [-1.5, -0.5, 0.5, 1.5])
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    // position of the first negative value in the bytewise-sorted order
    auto& firstNeg = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
    firstNeg = a[ind[0]] < 0 ? 0 : size;
    alpaka::syncBlockThreads(acc);

    // find the sign flip: the only pair of neighbours with different sign bits
    for (auto idx : independent_group_elements(acc, size - 1)) {
      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0)
        firstNeg = idx + 1;
    }
    alpaka::syncBlockThreads(acc);

    // move the negative values to the front, reversing their order...
    for (auto idx : independent_group_elements(acc, firstNeg, size)) {
      ind2[size - idx - 1] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // ...and the positive values behind them
    for (auto idx : independent_group_elements(acc, firstNeg)) {
      ind2[idx + size - firstNeg] = ind[idx];
    }
    alpaka::syncBlockThreads(acc);

    // copy the fixed-up permutation back into ind
    for (auto idx : independent_group_elements(acc, size)) {
      ind[idx] = ind2[idx];
    }
  }
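
  // Radix sort of the values a[0..size - 1], expressed as a permutation written
  // into the index array ind; the values themselves are never moved. Each pass
  // histograms one byte of the keys, prefix-scans the histogram into bucket
  // offsets, and stably scatters the indices into the other index buffer,
  // starting from the least significant of the bytes used. After the last pass
  // the indices follow the bytewise order, and the reorder functor fixes up the
  // result for signed integer or floating point keys. Runs within a single block.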
  template <typename TAcc,
            typename T,   // any integer type; signedness is handled by the reorder functor
            int NS,       // number of bytes to use in sorting: the NS most significant ones
            typename RF>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl(
      const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) {
    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
      const auto warpSize = alpaka::warp::getSize(acc);
      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
      [[maybe_unused]] const uint32_t blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
      // the warp size is expected to be a power of two
      assert(warpSize && (0 == (warpSize & (warpSize - 1))));
      const std::size_t warpMask = warpSize - 1;

      // 8-bit bins: one sorting pass per byte of the data type
      constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPasses = dataBits / binBits;
      // make sure the passes cover the data exactly
      static_assert(0 == dataBits % binBits);
      // make sure the NS parameter makes sense
      static_assert(NS > 0 && NS <= sizeof(T));
      constexpr int binsNumber = 1 << binBits;
      constexpr int binsMask = binsNumber - 1;
      // skip the passes over the (sizeof(T) - NS) least significant bytes,
      // sorting only on the NS most significant ones
      constexpr int initialSortingPass = int(sizeof(T)) - NS;

      // per-bin counts, turned into cumulative bucket offsets by the prefix scan
      auto& c = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
      // scratch for the warp-level prefix scan; reused in the scatter phase to
      // hold the bin of the element handled by each thread of a batch
      auto& ct = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
      // per bin, the highest element index seen in the current batch
      auto& cu = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
      // highest element index of the batch currently being scattered
      auto& ibs = alpaka::declareSharedVar<int, __COUNTER__>(acc);
      auto& currentSortingPass = alpaka::declareSharedVar<int, __COUNTER__>(acc);

      ALPAKA_ASSERT_ACC(size > 0);
      // the block must provide at least one thread (element) per bin
      ALPAKA_ASSERT_ACC(blockDimension >= binsNumber);

      currentSortingPass = initialSortingPass;

      // ping-pong index buffers: j holds the current order, k receives the scattered one
      auto j = ind;
      auto k = ind2;

      // start from the identity permutation
      for (auto idx : independent_group_elements(acc, size)) {
        j[idx] = idx;
      }
      alpaka::syncBlockThreads(acc);

      // one sorting pass per byte, least significant (of those used) first
      while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, (currentSortingPass < totalSortingPasses))) {
        for (auto idx : independent_group_elements(acc, binsNumber)) {
          c[idx] = 0;
        }
        alpaka::syncBlockThreads(acc);
        const auto sortingPassShift = binBits * currentSortingPass;

        // fill the histogram of the byte considered in this pass
        for (auto idx : independent_group_elements(acc, size)) {
          auto bin = (a[j[idx]] >> sortingPassShift) & binsMask;
          alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{});
        }
        alpaka::syncBlockThreads(acc);

        // sanity check: every element must have been counted exactly once
        if (!threadIdxLocal && 1 == alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]) {
          size_t total = 0;
          for (int i = 0; i < (int)binsNumber; i++) {
            total += c[i];
          }
          assert(total == size);
        }

        // prefix scan of the histogram: first an inclusive scan within each warp,
        // using warp shuffles...
        for (auto idx : independent_group_elements(acc, binsNumber)) {
          auto x = c[idx];
          auto laneId = idx & warpMask;
          for (int offset = 1; offset < warpSize; offset <<= 1) {
            auto y = alpaka::warp::shfl(acc, x, laneId - offset);
            if (laneId >= (uint32_t)offset)
              x += y;
          }
          ct[idx] = x;
        }
        alpaka::syncBlockThreads(acc);

        // ...then add the totals of the preceding warps, so that c[idx] becomes
        // the cumulative count of bins 0..idx, i.e. the end offset of bucket idx
        for (auto idx : independent_group_elements(acc, binsNumber)) {
          auto ss = (idx / warpSize) * warpSize - 1;
          c[idx] = ct[idx];
          for (int i = ss; i > 0; i -= warpSize)
            c[idx] += ct[i];
        }
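        // for example, with warpSize 4 and counts {2, 0, 3, 1} in the first warp,
        // the shuffle scan leaves ct = {2, 2, 5, 6}; the second loop then folds in
        // the totals of the preceding warps for the bins of the later warps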

        // scatter the indices into their buckets, walking the array from the end
        // downwards in batches of binsNumber elements; buckets are filled from
        // their end offsets, so processing the highest indices first keeps the
        // sort stable
        ibs = size - 1;
        alpaka::syncBlockThreads(acc);

        while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, ibs >= 0)) {
          // reset the per-bin bookkeeping for this batch
          for (auto idx : independent_group_elements(acc, binsNumber)) {
            cu[idx] = -1;
            ct[idx] = -1;
          }
          alpaka::syncBlockThreads(acc);

          // each thread of the batch records the bin of its element (i = ibs - idx)
          // and competes, via atomicMax, to own the highest element index of that bin
          for (auto idx : independent_group_elements(acc, binsNumber)) {
            int i = ibs - idx;
            int32_t bin = -1;
            if (i >= 0) {
              bin = (a[j[i]] >> sortingPassShift) & binsMask;
              ct[idx] = bin;
              alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Threads{});
            }
          }
          alpaka::syncBlockThreads(acc);

          // the winning thread of each bin scatters all batch elements that fall
          // into its bin, in decreasing element index order
          for (auto idx : independent_group_elements(acc, binsNumber)) {
            int i = ibs - idx;
            if (i >= 0) {
              int32_t bin = ct[idx];
              if (cu[bin] == i) {
                for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) {
                  if (ct[peerThreadIdx] == bin) {
                    k[--c[bin]] = j[ibs - peerThreadIdx];
                  }
                }
              }
            }
          }
          alpaka::syncBlockThreads(acc);

          // move on to the next batch
          if (threadIdxLocal == 0) {
            ibs -= binsNumber;
            alpaka::mem_fence(acc, alpaka::memory_scope::Grid{});
          }
          alpaka::syncBlockThreads(acc);
        }

        alpaka::syncBlockThreads(acc);
        // all buckets have been emptied down to their start offsets
        ALPAKA_ASSERT_ACC(c[0] == 0);

        // swap the ping-pong buffers for the next pass
        auto t = j;
        j = k;
        k = t;

        if (threadIdxLocal == 0)
          ++currentSortingPass;
        alpaka::syncBlockThreads(acc);
      }

      // an even number of passes leaves the result in the original index buffer
      if ((dataBits != 8) && (0 == (NS & 1)))
        ALPAKA_ASSERT_ACC(j == ind);

      // after an odd number of passes, copy the result back into ind
      if (j != ind)
        for (auto idx : independent_group_elements(acc, size)) {
          ind[idx] = ind2[idx];
        }

      alpaka::syncBlockThreads(acc);

      // fix up the bytewise order for signed integer or floating point keys
      reorder(acc, a, ind, ind2, size);
    } else {
      // single-thread-per-block back-ends are served by the CPU overload of radixSort below
    }
  }

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),  // number of most significant bytes to use in sorting
            typename std::enable_if<std::is_unsigned<T>::value && !requires_single_thread_per_block_v<TAcc>, T>::type* =
                nullptr>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, dummyReorder<TAcc, T>);
  }

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),  // number of most significant bytes to use in sorting
            typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value &&
                                        !requires_single_thread_per_block_v<TAcc>,
                                    T>::type* = nullptr>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, reorderSigned<TAcc, T>);
  }

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),  // number of most significant bytes to use in sorting
            typename std::enable_if<std::is_floating_point<T>::value && !requires_single_thread_per_block_v<TAcc>,
                                    T>::type* = nullptr>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size");
    // sort the floats by reinterpreting their bits as 32-bit integers;
    // reorderFloat then restores the correct order for the negative values
    using I = int;
    radixSortImpl<TAcc, I, NS>(acc, (I const*)(a), ind, ind2, size, reorderFloat<TAcc, I>);
  }
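
  // Usage sketch (illustrative only): a kernel functor whose block cooperatively
  // sorts one array of floats through an index array; the functor name and the
  // buffers are hypothetical, and ind2 must have the same size as ind.
  //
  //   struct SingleBlockSortKernel {
  //     template <typename TAcc>
  //     ALPAKA_FN_ACC void operator()(
  //         const TAcc& acc, float const* v, uint16_t* ind, uint16_t* ind2, uint32_t size) const {
  //       cms::alpakatools::radixSort<TAcc, float>(acc, v, ind, ind2, size);
  //       // here v[ind[0]] <= v[ind[1]] <= ... <= v[ind[size - 1]]
  //     }
  //   };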

  template <typename TAcc,
            typename T,
            int NS = sizeof(T),
            typename std::enable_if<requires_single_thread_per_block_v<TAcc>, T>::type* = nullptr>
  // deliberately not ALPAKA_FN_ACC: the std::stable_sort based fallback is host-only code
  ALPAKA_FN_INLINE void radixSort(const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
    static_assert(requires_single_thread_per_block_v<TAcc>, "CPU sort (not a radixSort) called with wrong accelerator");
    // on CPU back-ends each block runs a single thread, so a plain stable sort of
    // the indices by the pointed-to values yields the same result as the radix sort
    std::iota(ind, ind + size, 0);
    std::stable_sort(ind, ind + size, [a](uint16_t i0, uint16_t i1) { return a[i0] < a[i1]; });
  }

  // sort multiple contiguous slices of v in parallel: block b sorts the slice
  // [offsets[b], offsets[b + 1]); if workspace is null, the scratch index buffer
  // is taken from the block's dynamic shared memory
  template <typename TAcc, typename T, int NS = sizeof(T)>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti(
      const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
    uint16_t* ws = alpaka::getDynSharedMem<uint16_t>(acc);

    const uint32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    auto a = v + offsets[blockIdx];
    auto ind = index + offsets[blockIdx];
    auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx];
    auto size = offsets[blockIdx + 1] - offsets[blockIdx];
    assert(offsets[blockIdx + 1] >= offsets[blockIdx]);
    if (size > 0)
      radixSort<TAcc, T, NS>(acc, a, ind, ind2, size);
  }

  template <typename T, int NS = sizeof(T)>
  struct radixSortMultiWrapper {
    // kernel wrapper around radixSortMulti; sharedMemBytes is not used here, but
    // is read by the BlockSharedMemDynSizeBytes trait specialisation below to
    // size the dynamic shared memory when no external workspace is given
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                  T const* v,
                                  uint16_t* index,
                                  uint32_t const* offsets,
                                  uint16_t* workspace,
                                  size_t sharedMemBytes = 0) const {
      radixSortMulti<TAcc, T, NS>(acc, v, index, offsets, workspace);
    }
  };

  template <typename T, int NS = sizeof(T)>
  struct radixSortMultiWrapper2 {
    // same kernel as radixSortMultiWrapper, but not covered by the
    // BlockSharedMemDynSizeBytes specialisation below: the caller is expected to
    // provide an external workspace
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                  T const* v,
                                  uint16_t* index,
                                  uint32_t const* offsets,
                                  uint16_t* workspace,
                                  size_t sharedMemBytes = 0) const {
      radixSortMulti<TAcc, T, NS>(acc, v, index, offsets, workspace);
    }
  };
}  // namespace cms::alpakatools

namespace alpaka::trait {
  // specialise the BlockSharedMemDynSizeBytes trait to determine the amount of
  // dynamic shared memory needed by the radixSortMultiWrapper kernel
  template <typename TAcc, typename T, int NS>
  struct BlockSharedMemDynSizeBytes<cms::alpakatools::radixSortMultiWrapper<T, NS>, TAcc> {
    // return the size, in bytes, of the dynamic shared memory for a block
    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
        cms::alpakatools::radixSortMultiWrapper<T, NS> const& /* kernel */,
        alpaka_common::Vec1D /* blockThreadExtent */,
        alpaka_common::Vec1D /* threadElemExtent */,
        T const* /* v */,
        uint16_t* /* index */,
        uint32_t const* /* offsets */,
        uint16_t* workspace,
        size_t sharedMemBytes) {
      if (workspace != nullptr)
        return 0;
      // no external workspace: use the dynamic shared memory requested by the caller
      return sharedMemBytes;
    }
  };
}  // namespace alpaka::trait
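
// Usage sketch (illustrative only): launching radixSortMultiWrapper with the
// scratch indices in dynamic shared memory. The accelerator, queue, buffers and
// work division are hypothetical; sharedMemBytes must provide one uint16_t per
// element of the largest slice.
//
//   const size_t sharedMemBytes = maxSliceSize * sizeof(uint16_t);
//   const auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(numberOfSlices, 256);
//   alpaka::exec<Acc1D>(queue, workDiv, cms::alpakatools::radixSortMultiWrapper<float>{},
//                       values, index, offsets, nullptr, sharedMemBytes);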

#endif  // HeterogeneousCore_AlpakaInterface_interface_radixSort_h