#ifndef HeterogeneousCoreCUDAUtilities_radixSort_H
#define HeterogeneousCoreCUDAUtilities_radixSort_H

#ifdef __CUDACC__

#include <cstdint>
#include <type_traits>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

// no-op reorder step, used for unsigned types that need no sign fix-up after the byte-wise sort
template <typename T>
__device__ inline void dummyReorder(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {}

template <typename T>
__device__ inline void reorderSigned(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  // move the negative values first...

  int32_t first = threadIdx.x;
  __shared__ uint32_t firstNeg;
  firstNeg = a[ind[0]] < 0 ? 0 : size;
  __syncthreads();

  // find the first negative value (after the unsigned sort the sign changes at most once)
  for (auto i = first; i < size - 1; i += blockDim.x) {
    if ((a[ind[i]] ^ a[ind[i + 1]]) < 0)
      firstNeg = i + 1;
  }

  __syncthreads();

  // copy the negative block [firstNeg, size) to the front of ind2, preserving its order
  auto ii = first;
  for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii += blockDim.x;
  }
  __syncthreads();
  // append the non-negative block [0, firstNeg) after it
  ii = size - firstNeg + threadIdx.x;
  assert(ii >= 0);
  for (auto i = first; i < firstNeg; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii += blockDim.x;
  }
  __syncthreads();
  // copy the reordered indices back
  for (auto i = first; i < size; i += blockDim.x)
    ind[i] = ind2[i];
}

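/*
  // Single-thread equivalent of reorderSigned (for documentation only; a hedged sketch, not used):
  // the byte-wise radix sort orders the raw two's-complement patterns as unsigned values, so the
  // negative entries end up in [firstNeg, size), already in the correct relative order, and a
  // simple rotation brings them to the front.
  //
  //   uint32_t o = 0;
  //   for (uint32_t i = firstNeg; i < size; ++i) ind2[o++] = ind[i];  // negatives to the front
  //   for (uint32_t i = 0; i < firstNeg; ++i) ind2[o++] = ind[i];     // non-negatives after them
  //   for (uint32_t i = 0; i < size; ++i) ind[i] = ind2[i];
*/
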
template <typename T>
__device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  // move the negative values first...

  int32_t first = threadIdx.x;
  __shared__ uint32_t firstNeg;
  firstNeg = a[ind[0]] < 0 ? 0 : size;
  __syncthreads();

  // find the first negative value (after the unsigned sort the sign changes at most once)
  for (auto i = first; i < size - 1; i += blockDim.x) {
    if ((a[ind[i]] ^ a[ind[i + 1]]) < 0)
      firstNeg = i + 1;
  }

  __syncthreads();

  // copy the negative block [firstNeg, size) to the front of ind2, reversing its order
  int ii = size - firstNeg - threadIdx.x - 1;
  for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii -= blockDim.x;
  }
  __syncthreads();
  // append the non-negative block [0, firstNeg) after it
  ii = size - firstNeg + threadIdx.x;
  assert(ii >= 0);
  for (auto i = first; i < firstNeg; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii += blockDim.x;
  }
  __syncthreads();
  // copy the reordered indices back
  for (auto i = first; i < size; i += blockDim.x)
    ind[i] = ind2[i];
}

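/*
  // Single-thread equivalent of reorderFloat (for documentation only; a hedged sketch, not used).
  // reorderFloat is invoked with the float bit patterns reinterpreted as signed integers (see the
  // floating-point radixSort overload below): for non-negative IEEE-754 values the integer order
  // matches the float order, while negative values come out reversed, so the negative block is
  // moved to the front *and* reversed.
  //
  //   uint32_t o = 0;
  //   for (int32_t i = size - 1; i >= int32_t(firstNeg); --i) ind2[o++] = ind[i];  // negatives, reversed
  //   for (uint32_t i = 0; i < firstNeg; ++i) ind2[o++] = ind[i];                  // non-negatives
  //   for (uint32_t i = 0; i < size; ++i) ind[i] = ind2[i];
*/
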
template <typename T,  // shall be an integer type
          int NS,      // number of significant bytes to use in sorting
          typename RF>
__device__ __forceinline__ void radixSortImpl(
    T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) {
  constexpr int d = 8, w = 8 * sizeof(T);
  constexpr int sb = 1 << d;
  constexpr int ps = int(sizeof(T)) - NS;

  __shared__ int32_t c[sb], ct[sb], cu[sb];

  __shared__ int ibs;
  __shared__ int p;

  assert(size > 0);
  assert(blockDim.x >= sb);

  // bool debug = false; // threadIdx.x==0 && blockIdx.x==5;

  p = ps;

  auto j = ind;
  auto k = ind2;

  int32_t first = threadIdx.x;
  for (auto i = first; i < size; i += blockDim.x)
    j[i] = i;
  __syncthreads();

  while (__syncthreads_and(p < w / d)) {
    if (threadIdx.x < sb)
      c[threadIdx.x] = 0;
    __syncthreads();

    // fill bins
    for (auto i = first; i < size; i += blockDim.x) {
      auto bin = (a[j[i]] >> d * p) & (sb - 1);
      atomicAdd(&c[bin], 1);
    }
    __syncthreads();

    // prefix scan "optimized"???...
    // step 1: inclusive scan of the bin counts within each warp, using shuffles
    if (threadIdx.x < sb) {
      auto x = c[threadIdx.x];
      auto laneId = threadIdx.x & 0x1f;
      CMS_UNROLL_LOOP
      for (int offset = 1; offset < 32; offset <<= 1) {
        auto y = __shfl_up_sync(0xffffffff, x, offset);
        if (laneId >= offset)
          x += y;
      }
      ct[threadIdx.x] = x;
    }
    __syncthreads();
    // step 2: add the totals of the preceding warps
    if (threadIdx.x < sb) {
      auto ss = (threadIdx.x / 32) * 32 - 1;
      c[threadIdx.x] = ct[threadIdx.x];
      for (int i = ss; i > 0; i -= 32)
        c[threadIdx.x] += ct[i];
    }
    /*
    // serial prefix scan, equivalent to the two steps above  (for documentation)
    if (threadIdx.x==0)
      for (int i = 1; i < sb; ++i) c[i] += c[i-1];
    */

    // broadcast: scatter the elements, walking backwards in chunks of sb so that
    // entries falling into the same bin keep their relative order
    ibs = size - 1;
    __syncthreads();
    while (__syncthreads_and(ibs >= 0)) {
      int i = ibs - threadIdx.x;
      if (threadIdx.x < sb) {
        cu[threadIdx.x] = -1;
        ct[threadIdx.x] = -1;
      }
      __syncthreads();
      int32_t bin = -1;
      if (threadIdx.x < sb) {
        if (i >= 0) {
          bin = (a[j[i]] >> d * p) & (sb - 1);
          ct[threadIdx.x] = bin;
          atomicMax(&cu[bin], int(i));
        }
      }
      __syncthreads();
      if (threadIdx.x < sb) {
        if (i >= 0 && i == cu[bin])  // ensure they are kept in order
          for (int ii = threadIdx.x; ii < sb; ++ii)
            if (ct[ii] == bin) {
              auto oi = ii - threadIdx.x;
              // assert(i>=oi);if(i>=oi)
              k[--c[bin]] = j[i - oi];
            }
      }
      __syncthreads();
      if (bin >= 0)
        assert(c[bin] >= 0);
      if (threadIdx.x == 0)
        ibs -= sb;
      __syncthreads();
    }

    /*
    // serial version of the broadcast above  (for documentation)
    if (threadIdx.x==0)
    for (int i=size-first-1; i>=0; i--) { // =blockDim.x) {
      auto bin = (a[j[i]] >> d*p)&(sb-1);
      auto ik = atomicSub(&c[bin],1);
      k[ik-1] = j[i];
    }
    */

    __syncthreads();
    assert(c[0] == 0);

    // swap (local, ok)
    auto t = j;
    j = k;
    k = t;

    if (threadIdx.x == 0)
      ++p;
    __syncthreads();
  }

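  /*
  // Single-thread equivalent of the loop above (for documentation only; a hedged sketch, not used):
  // NS counting-sort passes over 8-bit digits, from byte ps up to byte sizeof(T) - 1, ping-ponging
  // the two index buffers after each pass.
  //
  //   for (int pass = ps; pass < w / d; ++pass) {
  //     int32_t count[sb] = {0};
  //     for (uint32_t i = 0; i < size; ++i) ++count[(a[j[i]] >> d * pass) & (sb - 1)];
  //     for (int b = 1; b < sb; ++b) count[b] += count[b - 1];
  //     for (int32_t i = size - 1; i >= 0; --i) k[--count[(a[j[i]] >> d * pass) & (sb - 1)]] = j[i];
  //     auto t = j; j = k; k = t;
  //   }
  */
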
  if ((w != 8) && (0 == (NS & 1)))
    assert(j == ind);  // an even number of passes was done, so the result is already in ind

  if (j != ind)  // odd number of passes: copy the result back into ind
    for (auto i = first; i < size; i += blockDim.x)
      ind[i] = ind2[i];

  __syncthreads();

  // now move the negatives first... (if signed)
  reorder(a, ind, ind2, size);
}

template <typename T,
          int NS = sizeof(T),  // number of significant bytes to use in sorting
          typename std::enable_if<std::is_unsigned<T>::value, T>::type* = nullptr>
__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  radixSortImpl<T, NS>(a, ind, ind2, size, dummyReorder<T>);
}

template <typename T,
          int NS = sizeof(T),  // number of significant bytes to use in sorting
          typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, T>::type* = nullptr>
__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  radixSortImpl<T, NS>(a, ind, ind2, size, reorderSigned<T>);
}

template <typename T,
          int NS = sizeof(T),  // number of significant bytes to use in sorting
          typename std::enable_if<std::is_floating_point<T>::value, T>::type* = nullptr>
__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  using I = int;
  radixSortImpl<I, NS>((I const*)(a), ind, ind2, size, reorderFloat<I>);
}

template <typename T, int NS = sizeof(T)>
__device__ __forceinline__ void radixSortMulti(T const* v,
                                               uint16_t* index,
                                               uint32_t const* offsets,
                                               uint16_t* workspace) {
  // dynamic shared memory, used as the scratch index buffer when no workspace is provided
  extern __shared__ uint16_t ws[];

  auto a = v + offsets[blockIdx.x];
  auto ind = index + offsets[blockIdx.x];
  auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx.x];
  auto size = offsets[blockIdx.x + 1] - offsets[blockIdx.x];
  assert(offsets[blockIdx.x + 1] >= offsets[blockIdx.x]);
  if (size > 0)
    radixSort<T, NS>(a, ind, ind2, size);
}

namespace cms {
  namespace cuda {

    template <typename T, int NS = sizeof(T)>
    __global__ void __launch_bounds__(256, 4)
        radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
      radixSortMulti<T, NS>(v, index, offsets, workspace);
    }

    template <typename T, int NS = sizeof(T)>
    __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
      radixSortMulti<T, NS>(v, index, offsets, workspace);
    }

  }  // namespace cuda
}  // namespace cms
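
/*
  // Example launch (for documentation only; a hedged sketch, not part of this interface).
  // Each block sorts one contiguous range [offsets[b], offsets[b+1]); when workspace is nullptr
  // the dynamic shared-memory buffer is used as the scratch index array, so it must hold at
  // least as many uint16_t entries as the largest range. nRanges, maxRangeSize and stream are
  // placeholders for the caller's own bookkeeping.
  //
  //   constexpr int threadsPerBlock = 256;  // radixSortImpl requires blockDim.x >= 256
  //   int blocks = nRanges;                 // offsets has nRanges + 1 entries
  //   size_t shmem = maxRangeSize * sizeof(uint16_t);
  //   cms::cuda::radixSortMultiWrapper<float><<<blocks, threadsPerBlock, shmem, stream>>>(
  //       v, index, offsets, nullptr);
*/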

#endif  // __CUDACC__

#endif  // HeterogeneousCoreCUDAUtilities_radixSort_H