1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
|
#ifndef HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
#define HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <alpaka/alpaka.hpp>
#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h"
#include "HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h"
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
namespace cms::alpakatools {
struct countFromVector {
template <typename TAcc, typename Histo, typename T>
ALPAKA_FN_ACC void operator()(const TAcc &acc,
Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) const {
const uint32_t nt = offsets[nh];
for (uint32_t i : uniform_elements(acc, nt)) {
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
ALPAKA_ASSERT_ACC((*off) > 0);
int32_t ih = off - offsets - 1;
ALPAKA_ASSERT_ACC(ih >= 0);
ALPAKA_ASSERT_ACC(ih < int(nh));
h->count(acc, v[i], ih);
}
}
};
struct fillFromVector {
template <typename TAcc, typename Histo, typename T>
ALPAKA_FN_ACC void operator()(const TAcc &acc,
Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) const {
const uint32_t nt = offsets[nh];
for (uint32_t i : uniform_elements(acc, nt)) {
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
ALPAKA_ASSERT_ACC((*off) > 0);
int32_t ih = off - offsets - 1;
ALPAKA_ASSERT_ACC(ih >= 0);
ALPAKA_ASSERT_ACC(ih < int(nh));
h->fill(acc, v[i], i, ih);
}
}
};
template <typename TAcc, typename Histo, typename T, typename TQueue>
ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
uint32_t totSize,
uint32_t nthreads,
TQueue &queue) {
Histo::template launchZero<TAcc>(h, queue);
const auto threadsPerBlockOrElementsPerThread = nthreads;
const auto blocksPerGrid = divide_up_by(totSize, nthreads);
const auto workDiv = make_workdiv<TAcc>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
alpaka::exec<TAcc>(queue, workDiv, countFromVector(), h, nh, v, offsets);
Histo::template launchFinalize<TAcc>(h, queue);
alpaka::exec<TAcc>(queue, workDiv, fillFromVector(), h, nh, v, offsets);
}
template <typename TAcc, typename Histo, typename T, typename TQueue>
ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h,
typename Histo::View hv,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
uint32_t totSize,
uint32_t nthreads,
TQueue &queue) {
Histo::template launchZero<TAcc>(hv, queue);
const auto threadsPerBlockOrElementsPerThread = nthreads;
const auto blocksPerGrid = divide_up_by(totSize, nthreads);
const auto workDiv = make_workdiv<TAcc>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
alpaka::exec<TAcc>(queue, workDiv, countFromVector(), h, nh, v, offsets);
Histo::template launchFinalize<TAcc>(h, queue);
alpaka::exec<TAcc>(queue, workDiv, fillFromVector(), h, nh, v, offsets);
}
// iteratate over N bins left and right of the one containing "v"
template <typename Hist, typename V, typename Func>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) {
int bs = Hist::bin(value);
int be = std::min(int(Hist::nbins() - 1), bs + n);
bs = std::max(0, bs - n);
ALPAKA_ASSERT_ACC(be >= bs);
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}
// iteratate over bins containing all values in window wmin, wmax
template <typename Hist, typename V, typename Func>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
auto bs = Hist::bin(wmin);
auto be = Hist::bin(wmax);
ALPAKA_ASSERT_ACC(be >= bs);
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}
template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
int32_t SIZE, // max number of element. If -1 is initialized at runtime using external storage
uint32_t S = sizeof(T) * 8, // number of significant bits in T
typename I = uint32_t, // type stored in the container (usually an index in a vector of the input values)
uint32_t NHISTS = 1 // number of histos stored
>
class HistoContainer : public OneToManyAssocRandomAccess<I, NHISTS * NBINS + 1, SIZE> {
public:
using Base = OneToManyAssocRandomAccess<I, NHISTS * NBINS + 1, SIZE>;
using View = typename Base::View;
using Counter = typename Base::Counter;
using index_type = typename Base::index_type;
using UT = typename std::make_unsigned<T>::type;
static constexpr uint32_t ilog2(uint32_t v) {
constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
constexpr uint32_t s[] = {1, 2, 4, 8, 16};
uint32_t r = 0; // result of log2(v) will go here
for (auto i = 4; i >= 0; i--)
if (v & b[i]) {
v >>= s[i];
r |= s[i];
}
return r;
}
static constexpr uint32_t sizeT() { return S; }
static constexpr int32_t nhists() { return NHISTS; }
static constexpr uint32_t nbins() { return NBINS; }
static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
static constexpr UT bin(T t) {
constexpr uint32_t shift = sizeT() - nbits();
constexpr uint32_t mask = (1 << nbits()) - 1;
return (t >> shift) & mask;
}
template <typename TAcc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t) {
uint32_t b = bin(t);
ALPAKA_ASSERT_ACC(b < nbins());
Base::atomicIncrement(acc, this->off[b]);
}
template <typename TAcc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j) {
uint32_t b = bin(t);
ALPAKA_ASSERT_ACC(b < nbins());
auto w = Base::atomicDecrement(acc, this->off[b]);
ALPAKA_ASSERT_ACC(w > 0);
this->content[w - 1] = j;
}
template <typename TAcc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t, uint32_t nh) {
uint32_t b = bin(t);
ALPAKA_ASSERT_ACC(b < nbins());
b += histOff(nh);
ALPAKA_ASSERT_ACC(b < totbins());
Base::atomicIncrement(acc, this->off[b]);
}
template <typename TAcc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j, uint32_t nh) {
uint32_t b = bin(t);
ALPAKA_ASSERT_ACC(b < nbins());
b += histOff(nh);
ALPAKA_ASSERT_ACC(b < totbins());
auto w = Base::atomicDecrement(acc, this->off[b]);
ALPAKA_ASSERT_ACC(w > 0);
this->content[w - 1] = j;
}
};
} // namespace cms::alpakatools
#endif // HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
|