CUDAUtilities/interface/HistoContainer.h

0001 #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
0002 #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
0003
0004 #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"
0005
0006 namespace cms {
0007   namespace cuda {
0008
0009     template <typename Histo, typename T>
0010     __global__ void countFromVector(Histo *__restrict__ h,
0011                                     uint32_t nh,
0012                                     T const *__restrict__ v,
0013                                     uint32_t const *__restrict__ offsets) {
0014       int first = blockDim.x * blockIdx.x + threadIdx.x;
0015       for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
0016         auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
0017         assert((*off) > 0);
0018         int32_t ih = off - offsets - 1;
0019         assert(ih >= 0);
0020         assert(ih < int(nh));
0021         (*h).count(v[i], ih);
0022       }
0023     }
0024
0025     template <typename Histo, typename T>
0026     __global__ void fillFromVector(Histo *__restrict__ h,
0027                                    uint32_t nh,
0028                                    T const *__restrict__ v,
0029                                    uint32_t const *__restrict__ offsets) {
0030       int first = blockDim.x * blockIdx.x + threadIdx.x;
0031       for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
0032         auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
0033         assert((*off) > 0);
0034         int32_t ih = off - offsets - 1;
0035         assert(ih >= 0);
0036         assert(ih < int(nh));
0037         (*h).fill(v[i], i, ih);
0038       }
0039     }
0040
0041     template <typename Histo, typename T>
0042     inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
0043                                                                   uint32_t nh,
0044                                                                   T const *__restrict__ v,
0045                                                                   uint32_t const *__restrict__ offsets,
0046                                                                   int32_t totSize,
0047                                                                   int nthreads,
0048                                                                   typename Histo::index_type *mem,
0049                                                                   cudaStream_t stream
0050 #ifndef __CUDACC__
0051                                                                   = cudaStreamDefault
0052 #endif
0053     ) {
0054       typename Histo::View view = {h, nullptr, mem, -1, totSize};
0055       launchZero(view, stream);
0056 #ifdef __CUDACC__
0057       auto nblocks = (totSize + nthreads - 1) / nthreads;
0058       assert(nblocks > 0);
0059       countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
0060       cudaCheck(cudaGetLastError());
0061       launchFinalize(view, stream);
0062       fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
0063       cudaCheck(cudaGetLastError());
0064 #else
0065       countFromVector(h, nh, v, offsets);
0066       h->finalize();
0067       fillFromVector(h, nh, v, offsets);
0068 #endif
0069     }
0070
0071     // iteratate over N bins left and right of the one containing "v"
0072     template <typename Hist, typename V, typename Func>
0073     __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
0074       int bs = Hist::bin(value);
0075       int be = std::min(int(Hist::nbins() - 1), bs + n);
0076       bs = std::max(0, bs - n);
0077       assert(be >= bs);
0078       for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
0079         func(*pj);
0080       }
0081     }
0082
0083     // iteratate over bins containing all values in window wmin, wmax
0084     template <typename Hist, typename V, typename Func>
0085     __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
0086       auto bs = Hist::bin(wmin);
0087       auto be = Hist::bin(wmax);
0088       assert(be >= bs);
0089       for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
0090         func(*pj);
0091       }
0092     }
0093
0094     template <typename T,      // the type of the discretized input values
0095               uint32_t NBINS,  // number of bins
0096               int32_t SIZE,    // max number of element. If -1 is initialized at runtime using external storage
0097               uint32_t S = sizeof(T) * 8,  // number of significant bits in T
0098               typename I = uint32_t,  // type stored in the container (usually an index in a vector of the input values)
0099               uint32_t NHISTS = 1     // number of histos stored
0100               >
0101     class HistoContainer : public OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE> {
0102     public:
0103       using Base = OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE>;
0104       using View = typename Base::View;
0105       using Counter = typename Base::Counter;
0106       using index_type = typename Base::index_type;
0107       using UT = typename std::make_unsigned<T>::type;
0108
0109       static constexpr uint32_t ilog2(uint32_t v) {
0110         constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
0111         constexpr uint32_t s[] = {1, 2, 4, 8, 16};
0112
0113         uint32_t r = 0;  // result of log2(v) will go here
0114         for (auto i = 4; i >= 0; i--)
0115           if (v & b[i]) {
0116             v >>= s[i];
0117             r |= s[i];
0118           }
0119         return r;
0120       }
0121
0122       static constexpr uint32_t sizeT() { return S; }
0123       static constexpr uint32_t nbins() { return NBINS; }
0124       static constexpr uint32_t nhists() { return NHISTS; }
0125       static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
0126       static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
0127
0128       // static_assert(int32_t(totbins())==Base::ctNOnes());
0129
0130       static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
0131
0132       static constexpr UT bin(T t) {
0133         constexpr uint32_t shift = sizeT() - nbits();
0134         constexpr uint32_t mask = (1 << nbits()) - 1;
0135         return (t >> shift) & mask;
0136       }
0137
0138       __host__ __device__ __forceinline__ void count(T t) {
0139         uint32_t b = bin(t);
0140         assert(b < nbins());
0141         Base::atomicIncrement(this->off[b]);
0142       }
0143
0144       __host__ __device__ __forceinline__ void fill(T t, index_type j) {
0145         uint32_t b = bin(t);
0146         assert(b < nbins());
0147         auto w = Base::atomicDecrement(this->off[b]);
0148         assert(w > 0);
0149         this->content[w - 1] = j;
0150       }
0151
0152       __host__ __device__ __forceinline__ void count(T t, uint32_t nh) {
0153         uint32_t b = bin(t);
0154         assert(b < nbins());
0155         b += histOff(nh);
0156         assert(b < totbins());
0157         Base::atomicIncrement(this->off[b]);
0158       }
0159
0160       __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) {
0161         uint32_t b = bin(t);
0162         assert(b < nbins());
0163         b += histOff(nh);
0164         assert(b < totbins());
0165         auto w = Base::atomicDecrement(this->off[b]);
0166         assert(w > 0);
0167         this->content[w - 1] = j;
0168       }
0169     };
0170
0171   }  // namespace cuda
0172 }  // namespace cms
0173
0174 #endif  // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h