#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h

#include <algorithm>
#include <cstdint>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

namespace cms {
  namespace cuda {

    // First pass of the two-pass fill: for each input value, find which of the nh
    // histograms it belongs to (the offsets array gives the [begin, end) range of
    // each histogram) and increment the counter of the corresponding bin.
    template <typename Histo, typename T>
    __global__ void countFromVector(Histo *__restrict__ h,
                                    uint32_t nh,
                                    T const *__restrict__ v,
                                    uint32_t const *__restrict__ offsets) {
      int first = blockDim.x * blockIdx.x + threadIdx.x;
      // grid-stride loop over all elements of all histograms
      for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
        auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
        assert((*off) > 0);
        int32_t ih = off - offsets - 1;
        assert(ih >= 0);
        assert(ih < int(nh));
        (*h).count(v[i], ih);
      }
    }

    // Second pass of the two-pass fill: same element-to-histogram lookup as
    // countFromVector, but now store the element index into its (already counted) bin.
    template <typename Histo, typename T>
    __global__ void fillFromVector(Histo *__restrict__ h,
                                   uint32_t nh,
                                   T const *__restrict__ v,
                                   uint32_t const *__restrict__ offsets) {
      int first = blockDim.x * blockIdx.x + threadIdx.x;
      for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
        auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
        assert((*off) > 0);
        int32_t ih = off - offsets - 1;
        assert(ih >= 0);
        assert(ih < int(nh));
        (*h).fill(v[i], i, ih);
      }
    }

    // Orchestrates the full fill of (possibly several) histograms from a vector of
    // values: zero the counters, count the entries per bin, turn the counts into
    // bin offsets, then fill the bins with the element indices.
    template <typename Histo, typename T>
    inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
                                                                   uint32_t nh,
                                                                   T const *__restrict__ v,
                                                                   uint32_t const *__restrict__ offsets,
                                                                   int32_t totSize,
                                                                   int nthreads,
                                                                   typename Histo::index_type *mem,
                                                                   cudaStream_t stream
#ifndef __CUDACC__
                                                                   = cudaStreamDefault
#endif
    ) {
      typename Histo::View view = {h, nullptr, mem, -1, totSize};
      launchZero(view, stream);
#ifdef __CUDACC__
      auto nblocks = (totSize + nthreads - 1) / nthreads;
      assert(nblocks > 0);
      countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
      cudaCheck(cudaGetLastError());
      launchFinalize(view, stream);
      fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
      cudaCheck(cudaGetLastError());
#else
      countFromVector(h, nh, v, offsets);
      h->finalize();
      fillFromVector(h, nh, v, offsets);
#endif
    }
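    // Illustrative sketch (not part of the original header): one possible way a caller
    // could fill a single histogram over 16-bit keys on a CUDA stream.  The buffer names,
    // sizes and template arguments below are assumptions chosen for the example only.
    //
    //   using Hist = HistoContainer<uint16_t, 128, -1, 16, uint32_t, 1>;
    //   // d_hist, d_keys, d_offsets and d_storage are device pointers allocated by the caller;
    //   // d_offsets holds {0, nKeys} since there is a single (nh = 1) histogram.
    //   fillManyFromVector(d_hist, 1, d_keys, d_offsets, nKeys, 256, d_storage, stream);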

    // Iterate over the elements stored in the bin of `value` and in the `n` bins on
    // each side of it (clamped to the valid bin range), calling func on each stored index.
    template <typename Hist, typename V, typename Func>
    __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
      int bs = Hist::bin(value);
      int be = std::min(int(Hist::nbins() - 1), bs + n);
      bs = std::max(0, bs - n);
      assert(be >= bs);
      for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
        func(*pj);
      }
    }

    // Iterate over the elements stored in all bins spanned by the window [wmin, wmax],
    // calling func on each stored index.
    template <typename Hist, typename V, typename Func>
    __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
      auto bs = Hist::bin(wmin);
      auto be = Hist::bin(wmax);
      assert(be >= bs);
      for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
        func(*pj);
      }
    }
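    // Illustrative sketch (not part of the original header): visiting, from device code,
    // all stored indices whose keys fall inside a window; `hist`, `vmin` and `vmax` are
    // assumed to be provided by the caller, and uint32_t matches the default index_type.
    //
    //   forEachInWindow(hist, vmin, vmax, [&](uint32_t i) {
    //     // i is an index previously passed to fill(), binned in [bin(vmin), bin(vmax)]
    //   });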

    template <typename T,                  // the type of the discretized input values
              uint32_t NBINS,              // number of bins
              int32_t SIZE,                // max number of elements; if -1, storage is provided at runtime
              uint32_t S = sizeof(T) * 8,  // number of significant bits in T
              typename I = uint32_t,       // type stored in the container (usually an index into the input vector)
              uint32_t NHISTS = 1          // number of histograms stored
              >
    class HistoContainer : public OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE> {
    public:
      using Base = OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE>;
      using View = typename Base::View;
      using Counter = typename Base::Counter;
      using index_type = typename Base::index_type;
      using UT = typename std::make_unsigned<T>::type;

      // Integer log2: position of the most significant set bit of v, via a branchless table lookup.
      static constexpr uint32_t ilog2(uint32_t v) {
        constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
        constexpr uint32_t s[] = {1, 2, 4, 8, 16};

        uint32_t r = 0;
        for (auto i = 4; i >= 0; i--)
          if (v & b[i]) {
            v >>= s[i];
            r |= s[i];
          }
        return r;
      }

      static constexpr uint32_t sizeT() { return S; }
      static constexpr uint32_t nbins() { return NBINS; }
      static constexpr uint32_t nhists() { return NHISTS; }
      static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
      static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }

      // offset of the first bin of histogram nh within the flattened bin array
      static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }

      // bin index: the nbits() most significant bits of t
      static constexpr UT bin(T t) {
        constexpr uint32_t shift = sizeT() - nbits();
        constexpr uint32_t mask = (1 << nbits()) - 1;
        return (t >> shift) & mask;
      }

      // single-histogram count: increment the counter of the bin of t
      __host__ __device__ __forceinline__ void count(T t) {
        uint32_t b = bin(t);
        assert(b < nbins());
        Base::atomicIncrement(this->off[b]);
      }

      // single-histogram fill: store index j in the bin of t (counters must already be finalized)
      __host__ __device__ __forceinline__ void fill(T t, index_type j) {
        uint32_t b = bin(t);
        assert(b < nbins());
        auto w = Base::atomicDecrement(this->off[b]);
        assert(w > 0);
        this->content[w - 1] = j;
      }

      // multi-histogram count: as above, for the bin of t in histogram nh
      __host__ __device__ __forceinline__ void count(T t, uint32_t nh) {
        uint32_t b = bin(t);
        assert(b < nbins());
        b += histOff(nh);
        assert(b < totbins());
        Base::atomicIncrement(this->off[b]);
      }

      // multi-histogram fill: store index j in the bin of t in histogram nh
      __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) {
        uint32_t b = bin(t);
        assert(b < nbins());
        b += histOff(nh);
        assert(b < totbins());
        auto w = Base::atomicDecrement(this->off[b]);
        assert(w > 0);
        this->content[w - 1] = j;
      }
    };

  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h