#ifndef HeterogeneousCoreCUDAUtilities_radixSort_H
#define HeterogeneousCoreCUDAUtilities_radixSort_H

#ifdef __CUDACC__

#include <cstdint>
#include <type_traits>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

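// Block-wide radix sort of the values in a[0, size): fills ind[0, size) with
// the permutation that lists the indices of the values in ascending order;
// the values themselves are not moved. ind2 must point to a scratch buffer of
// the same size. All threads of the block must take part in the call:
// blockDim.x must be at least 256, and size must fit in the 16-bit indices.
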
// reorder policy for unsigned types: nothing to do, the radix passes already
// leave the indices fully sorted
template <typename T>
__device__ inline void dummyReorder(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {}

// reorder policy for signed integers: the byte-wise passes sort by the
// unsigned representation, so negative values (sign bit set) end up at the
// end; rotate them to the front, preserving their relative order
template <typename T>
__device__ inline void reorderSigned(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  // move negative elements first...

  int32_t first = threadIdx.x;
  __shared__ uint32_t firstNeg;
  firstNeg = a[ind[0]] < 0 ? 0 : size;
  __syncthreads();

  // find the first negative element (sign change between neighbours)
  for (auto i = first; i < size - 1; i += blockDim.x) {
    if ((a[ind[i]] ^ a[ind[i + 1]]) < 0)
      firstNeg = i + 1;
  }

  __syncthreads();

  // copy the negative block to the front of ind2...
  auto ii = first;
  for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii += blockDim.x;
  }
  __syncthreads();
  // ...and the non-negative block after it
  ii = size - firstNeg + threadIdx.x;
  assert(ii >= 0);
  for (auto i = first; i < firstNeg; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii += blockDim.x;
  }
  __syncthreads();
  for (auto i = first; i < size; i += blockDim.x)
    ind[i] = ind2[i];
}
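// Illustration: for a = {2, -1, 5} the unsigned byte passes give ind = {0, 2, 1}
// (the negative value, with its sign bit set, sorts last); firstNeg = 2, and the
// rotation above yields ind = {1, 0, 2}, i.e. -1, 2, 5.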

// reorder policy for floats sorted by their integer bit pattern: negative
// values end up at the end and, because the integer representation of
// negative floats grows with the magnitude, in reverse order; move them to
// the front, reversing them back (T is the integer alias of the float type)
template <typename T>
__device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  // move negative elements first...

  int32_t first = threadIdx.x;
  __shared__ uint32_t firstNeg;
  firstNeg = a[ind[0]] < 0 ? 0 : size;
  __syncthreads();

  // find the first negative element (sign change between neighbours)
  for (auto i = first; i < size - 1; i += blockDim.x) {
    if ((a[ind[i]] ^ a[ind[i + 1]]) < 0)
      firstNeg = i + 1;
  }

  __syncthreads();

  // copy the negative block to the front of ind2, reversed...
  int ii = size - firstNeg - threadIdx.x - 1;
  for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii -= blockDim.x;
  }
  __syncthreads();
  // ...and the non-negative block after it
  ii = size - firstNeg + threadIdx.x;
  assert(ii >= 0);
  for (auto i = first; i < firstNeg; i += blockDim.x) {
    ind2[ii] = ind[i];
    ii += blockDim.x;
  }
  __syncthreads();
  for (auto i = first; i < size; i += blockDim.x)
    ind[i] = ind2[i];
}
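// Illustration: for a = {2.5f, -1.5f, -3.5f} the integer passes leave the
// negatives last and in reverse order, ind = {0, 1, 2} with firstNeg = 1;
// reversing the negative block to the front yields ind = {2, 1, 0},
// i.e. -3.5, -1.5, 2.5.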

template <typename T,  // shall be an integer type; signed or not does not matter here
          int NS,      // number of significant bytes to use in sorting
          typename RF>
__device__ __forceinline__ void radixSortImpl(
    T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) {
  constexpr int d = 8, w = 8 * sizeof(T);  // d bits per pass, w bits per value
  constexpr int sb = 1 << d;               // number of bins (256)
  constexpr int ps = int(sizeof(T)) - NS;  // passes to skip (the non-significant low bytes)

  __shared__ int32_t c[sb], ct[sb], cu[sb];

  __shared__ int ibs;
  __shared__ int p;

  assert(size > 0);
  assert(blockDim.x >= sb);

  p = ps;

  auto j = ind;
  auto k = ind2;

  // initialize the index buffer with the identity permutation
  int32_t first = threadIdx.x;
  for (auto i = first; i < size; i += blockDim.x)
    j[i] = i;
  __syncthreads();

  // one counting-sort pass per byte, least significant first; each pass is
  // stable, so the final permutation is sorted on all NS significant bytes
  while (__syncthreads_and(p < w / d)) {
    if (threadIdx.x < sb)
      c[threadIdx.x] = 0;
    __syncthreads();

    // fill the histogram of the current byte
    for (auto i = first; i < size; i += blockDim.x) {
      auto bin = (a[j[i]] >> d * p) & (sb - 1);
      atomicAdd(&c[bin], 1);
    }
    __syncthreads();

    // prefix scan of the histogram: warp-level inclusive scan first...
    if (threadIdx.x < sb) {
      auto x = c[threadIdx.x];
      auto laneId = threadIdx.x & 0x1f;
      CMS_UNROLL_LOOP
      for (int offset = 1; offset < 32; offset <<= 1) {
        auto y = __shfl_up_sync(0xffffffff, x, offset);
        if (laneId >= offset)
          x += y;
      }
      ct[threadIdx.x] = x;
    }
    __syncthreads();
    // ...then add the totals of all preceding warps
    if (threadIdx.x < sb) {
      auto ss = (threadIdx.x / 32) * 32 - 1;
      c[threadIdx.x] = ct[threadIdx.x];
      for (int i = ss; i > 0; i -= 32)
        c[threadIdx.x] += ct[i];
    }

    // scatter, walking the array backwards in chunks of sb elements so that
    // each bin fills from its upper end and the pass stays stable
    ibs = size - 1;
    __syncthreads();
    while (__syncthreads_and(ibs >= 0)) {
      int i = ibs - threadIdx.x;
      if (threadIdx.x < sb) {
        cu[threadIdx.x] = -1;
        ct[threadIdx.x] = -1;
      }
      __syncthreads();
      int32_t bin = -1;
      if (threadIdx.x < sb) {
        if (i >= 0) {
          bin = (a[j[i]] >> d * p) & (sb - 1);
          ct[threadIdx.x] = bin;
          atomicMax(&cu[bin], int(i));
        }
      }
      __syncthreads();
      if (threadIdx.x < sb) {
        if (i >= 0 && i == cu[bin])  // one thread per bin scatters the whole chunk, keeping the order
          for (int ii = threadIdx.x; ii < sb; ++ii)
            if (ct[ii] == bin) {
              auto oi = ii - threadIdx.x;
              k[--c[bin]] = j[i - oi];
            }
      }
      __syncthreads();
      if (bin >= 0)
        assert(c[bin] >= 0);
      if (threadIdx.x == 0)
        ibs -= sb;
      __syncthreads();
    }

    __syncthreads();
    assert(c[0] == 0);

    // swap input and output index buffers (local pointers, so this is safe)
    auto t = j;
    j = k;
    k = t;

    if (threadIdx.x == 0)
      ++p;
    __syncthreads();
  }

  if ((w != 8) && (0 == (NS & 1)))
    assert(j == ind);  // NS passes: an even number of swaps points j back at ind

  if (j != ind)  // odd number of passes: copy the result back
    for (auto i = first; i < size; i += blockDim.x)
      ind[i] = ind2[i];

  __syncthreads();

  // finally fix up the position of negative values (a no-op for unsigned types)
  reorder(a, ind, ind2, size);
}

template <typename T,
          int NS = sizeof(T),  // number of significant bytes to use in sorting
          typename std::enable_if<std::is_unsigned<T>::value, T>::type* = nullptr>
__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  radixSortImpl<T, NS>(a, ind, ind2, size, dummyReorder<T>);
}

template <typename T,
          int NS = sizeof(T),  // number of significant bytes to use in sorting
          typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, T>::type* = nullptr>
__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  radixSortImpl<T, NS>(a, ind, ind2, size, reorderSigned<T>);
}

template <typename T,
          int NS = sizeof(T),  // number of significant bytes to use in sorting
          typename std::enable_if<std::is_floating_point<T>::value, T>::type* = nullptr>
__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
  // sort the floats through their bit pattern, reinterpreted as 32-bit integers
  static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size");
  using I = int;
  radixSortImpl<I, NS>((I const*)(a), ind, ind2, size, reorderFloat<I>);
}

template <typename T, int NS = sizeof(T)>
__device__ __forceinline__ void radixSortMulti(T const* v,
                                               uint16_t* index,
                                               uint32_t const* offsets,
                                               uint16_t* workspace) {
  extern __shared__ uint16_t ws[];

  // each block sorts the contiguous slice [offsets[blockIdx.x], offsets[blockIdx.x + 1])
  auto a = v + offsets[blockIdx.x];
  auto ind = index + offsets[blockIdx.x];
  // with a null workspace the scratch buffer lives in dynamic shared memory
  auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx.x];
  auto size = offsets[blockIdx.x + 1] - offsets[blockIdx.x];
  assert(offsets[blockIdx.x + 1] >= offsets[blockIdx.x]);
  if (size > 0)
    radixSort<T, NS>(a, ind, ind2, size);
}

namespace cms {
  namespace cuda {

    template <typename T, int NS = sizeof(T)>
    __global__ void __launch_bounds__(256, 4)
        radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
      radixSortMulti<T, NS>(v, index, offsets, workspace);
    }

    template <typename T, int NS = sizeof(T)>
    __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
      radixSortMulti<T, NS>(v, index, offsets, workspace);
    }

  }  // namespace cuda
}  // namespace cms
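
// Usage sketch (illustrative only; the buffer names, maxSliceSize and stream
// below are assumptions, not part of this header). With a null workspace the
// kernel takes the per-block scratch buffer from dynamic shared memory, which
// must be sized for the largest slice:
//
//   // v_d, index_d, offsets_d: device buffers; offsets_d holds nSets + 1 entries
//   // maxSliceSize: upper bound on offsets[i + 1] - offsets[i]
//   cms::cuda::radixSortMultiWrapper<float><<<nSets, 256, maxSliceSize * sizeof(uint16_t), stream>>>(
//       v_d, index_d, offsets_d, nullptr);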

#endif  // __CUDACC__

#endif  // HeterogeneousCoreCUDAUtilities_radixSort_H