File indexing completed on 2024-04-06 12:15:46
0001 #include <algorithm>
0002 #include <cassert>
0003 #include <iostream>
0004 #include <random>
0005 #include <limits>
0006 #include <array>
0007 #include <memory>
0008
0009 #ifdef __CUDACC__
0010 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0011 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0012 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0013 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
0014 #endif
0015
0016 #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"
0017 using cms::cuda::AtomicPairCounter;
0018
0019 constexpr uint32_t MaxElem = 64000;
0020 constexpr uint32_t MaxTk = 8000;
0021 constexpr uint32_t MaxAssocs = 4 * MaxTk;
0022
0023 #ifdef RUNTIME_SIZE
0024 using Assoc = cms::cuda::OneToManyAssoc<uint16_t, -1, -1>;
0025 using SmallAssoc = cms::cuda::OneToManyAssoc<uint16_t, 128, -1>;
0026 using Multiplicity = cms::cuda::OneToManyAssoc<uint16_t, 8, MaxTk>;
0027 #else
0028 using Assoc = cms::cuda::OneToManyAssoc<uint16_t, MaxElem, MaxAssocs>;
0029 using SmallAssoc = cms::cuda::OneToManyAssoc<uint16_t, 128, MaxAssocs>;
0030 using Multiplicity = cms::cuda::OneToManyAssoc<uint16_t, 8, MaxTk>;
0031 #endif
0032
0033 using TK = std::array<uint16_t, 4>;
0034
0035 __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
0036 int first = blockDim.x * blockIdx.x + threadIdx.x;
0037 for (int i = first; i < n; i += gridDim.x * blockDim.x) {
0038 __shared__ Multiplicity::CountersOnly local;
0039 if (threadIdx.x == 0)
0040 local.zero();
0041 __syncthreads();
0042 local.count(2 + i % 4);
0043 __syncthreads();
0044 if (threadIdx.x == 0)
0045 assoc->add(local);
0046 }
0047 }
0048
0049 __global__ void countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
0050 int first = blockDim.x * blockIdx.x + threadIdx.x;
0051 for (int i = first; i < n; i += gridDim.x * blockDim.x)
0052 assoc->count(2 + i % 4);
0053 }
0054
0055 __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) {
0056 auto first = blockDim.x * blockIdx.x + threadIdx.x;
0057 for (int i = first; i < m1->totOnes(); i += gridDim.x * blockDim.x)
0058 assert(m1->off[i] == m2->off[i]);
0059 }
0060
0061 __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) {
0062 int first = blockDim.x * blockIdx.x + threadIdx.x;
0063 for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) {
0064 auto k = i / 4;
0065 auto j = i - 4 * k;
0066 assert(j < 4);
0067 if (k >= n)
0068 return;
0069 if (tk[k][j] < MaxElem)
0070 assoc->count(tk[k][j]);
0071 }
0072 }
0073
0074 __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) {
0075 int first = blockDim.x * blockIdx.x + threadIdx.x;
0076 for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) {
0077 auto k = i / 4;
0078 auto j = i - 4 * k;
0079 assert(j < 4);
0080 if (k >= n)
0081 return;
0082 if (tk[k][j] < MaxElem)
0083 assoc->fill(tk[k][j], k);
0084 }
0085 }
0086
0087 __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); }
0088
0089 template <typename Assoc>
0090 __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) {
0091 int first = blockDim.x * blockIdx.x + threadIdx.x;
0092 for (int k = first; k < n; k += gridDim.x * blockDim.x) {
0093 auto m = tk[k][3] < MaxElem ? 4 : 3;
0094 assoc->bulkFill(*apc, &tk[k][0], m);
0095 }
0096 }
0097
0098 template <typename Assoc>
0099 __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) {
0100 if (int(apc->get().m) >= assoc->nOnes())
0101 printf("Overflow %d %d\n", apc->get().m, assoc->nOnes());
0102 assert(int(assoc->size()) < assoc->capacity());
0103 }
0104
0105 template <typename Assoc>
0106 __global__ void verifyFill(Assoc const* __restrict__ la, int n) {
0107 printf("assoc size %d\n", la->size());
0108 int imax = 0;
0109 long long ave = 0;
0110 int z = 0;
0111 for (int i = 0; i < n; ++i) {
0112 auto x = la->size(i);
0113 if (x == 0) {
0114 z++;
0115 continue;
0116 }
0117 ave += x;
0118 imax = std::max(imax, int(x));
0119 }
0120 assert(0 == la->size(n));
0121 printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z);
0122 }
0123
0124 template <typename Assoc>
0125 __global__ void verifyFinal(Assoc const* __restrict__ la, int N) {
0126 printf("assoc size %d\n", la->size());
0127
0128 int imax = 0;
0129 long long ave = 0;
0130 for (int i = 0; i < N; ++i) {
0131 auto x = la->size(i);
0132 if (!(x == 4 || x == 3))
0133 printf("%d %d\n", i, x);
0134 assert(x == 4 || x == 3);
0135 ave += x;
0136 imax = std::max(imax, int(x));
0137 }
0138 assert(0 == la->size(N));
0139 printf("found with ave occupancy %f %d\n", double(ave) / N, imax);
0140 }
0141
0142 template <typename T>
0143 auto make_unique(std::size_t size) {
0144 #ifdef __CUDACC__
0145 return cms::cuda::make_device_unique<T>(size, 0);
0146 #else
0147 return std::make_unique<T>(size);
0148 #endif
0149 }
0150
0151 int main() {
0152 #ifdef __CUDACC__
0153 cms::cudatest::requireDevices();
0154 auto current_device = cms::cuda::currentDevice();
0155 #else
0156
0157 std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl;
0158 std::cout << "cuda y's " << threadIdx.y << ' ' << blockIdx.y << ' ' << blockDim.y << ' ' << gridDim.y << std::endl;
0159 std::cout << "cuda z's " << threadIdx.z << ' ' << blockIdx.z << ' ' << blockDim.z << ' ' << gridDim.z << std::endl;
0160 assert(threadIdx.x == 0);
0161 assert(threadIdx.y == 0);
0162 assert(threadIdx.z == 0);
0163 assert(blockIdx.x == 0);
0164 assert(blockIdx.y == 0);
0165 assert(blockIdx.z == 0);
0166 assert(blockDim.x == 1);
0167 assert(blockDim.y == 1);
0168 assert(blockDim.z == 1);
0169 assert(gridDim.x == 1);
0170 assert(gridDim.y == 1);
0171 assert(gridDim.z == 1);
0172 #endif
0173
0174 std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::ctNOnes() << ' ' << Assoc::ctCapacity() << std::endl;
0175 std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::ctNOnes() << ' '
0176 << SmallAssoc::ctCapacity() << std::endl;
0177
0178 std::mt19937 eng;
0179
0180 std::geometric_distribution<int> rdm(0.8);
0181
0182 constexpr uint32_t N = 4000;
0183
0184 std::vector<std::array<uint16_t, 4>> tr(N);
0185
0186
0187 long long ave = 0;
0188 int imax = 0;
0189 auto n = 0U;
0190 auto z = 0U;
0191 auto nz = 0U;
0192 for (auto i = 0U; i < 4U; ++i) {
0193 auto j = 0U;
0194 while (j < N && n < MaxElem) {
0195 if (z == 11) {
0196 ++n;
0197 z = 0;
0198 ++nz;
0199 continue;
0200 }
0201 auto x = rdm(eng);
0202 auto k = std::min(j + x + 1, N);
0203 if (i == 3 && z == 3) {
0204 for (; j < k; ++j)
0205 tr[j][i] = MaxElem + 1;
0206 } else {
0207 ave += x + 1;
0208 imax = std::max(imax, x);
0209 for (; j < k; ++j)
0210 tr[j][i] = n;
0211 ++n;
0212 }
0213 ++z;
0214 }
0215 assert(n <= MaxElem);
0216 assert(j <= N);
0217 }
0218 std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
0219
0220 auto a_d = make_unique<Assoc[]>(1);
0221 auto sa_d = make_unique<SmallAssoc[]>(1);
0222 #ifdef __CUDACC__
0223 auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
0224 assert(v_d.get());
0225 cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
0226 #else
0227 auto v_d = tr.data();
0228 #endif
0229
0230 Assoc::Counter* a_st = nullptr;
0231 int a_n = MaxElem;
0232
0233 Assoc::index_type* a_st2 = nullptr;
0234 SmallAssoc::index_type* sa_st2 = nullptr;
0235 int a_n2 = MaxAssocs;
0236
0237
0238 #ifdef RUNTIME_SIZE
0239 auto a_st_d = make_unique<Assoc::Counter[]>(a_n);
0240 auto a_st2_d = make_unique<Assoc::index_type[]>(a_n2);
0241 auto sa_st2_d = make_unique<SmallAssoc::index_type[]>(a_n2);
0242 a_st = a_st_d.get();
0243 a_st2 = a_st2_d.get();
0244 sa_st2 = sa_st2_d.get();
0245 #endif
0246 Assoc::View aView = {a_d.get(), a_st, a_st2, a_n, a_n2};
0247 launchZero(aView, 0);
0248 SmallAssoc::View saView = {sa_d.get(), nullptr, sa_st2, -1, a_n2};
0249 launchZero(saView, 0);
0250
0251 #ifdef __CUDACC__
0252 auto nThreads = 256;
0253 auto nBlocks = (4 * N + nThreads - 1) / nThreads;
0254
0255 count<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
0256
0257 launchFinalize(aView, 0);
0258 verify<<<1, 1>>>(a_d.get());
0259 fill<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
0260 verifyFill<<<1, 1>>>(a_d.get(), n);
0261
0262 #else
0263 count(v_d, a_d.get(), N);
0264 launchFinalize(aView);
0265 verify(a_d.get());
0266 fill(v_d, a_d.get(), N);
0267 verifyFill(a_d.get(), n);
0268
0269 #endif
0270
0271
0272 AtomicPairCounter* dc_d;
0273 AtomicPairCounter dc(0);
0274
0275 #ifdef __CUDACC__
0276 cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
0277 cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
0278 nBlocks = (N + nThreads - 1) / nThreads;
0279 fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
0280 finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
0281 verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
0282
0283 cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
0284 verifyFinal<<<1, 1>>>(a_d.get(), N);
0285
0286 cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
0287 fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
0288 finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
0289 verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
0290
0291 #else
0292 dc_d = &dc;
0293 fillBulk(dc_d, v_d, a_d.get(), N);
0294 finalizeBulk(dc_d, a_d.get());
0295 verifyBulk(a_d.get(), dc_d);
0296
0297 verifyFinal(a_d.get(), N);
0298
0299 AtomicPairCounter sdc(0);
0300 fillBulk(&sdc, v_d, sa_d.get(), N);
0301 finalizeBulk(&sdc, sa_d.get());
0302 verifyBulk(sa_d.get(), &sdc);
0303
0304 #endif
0305
0306 std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl;
0307
0308
0309 auto m1_d = make_unique<Multiplicity[]>(1);
0310 auto m2_d = make_unique<Multiplicity[]>(1);
0311
0312 Multiplicity::index_type* m1_st = nullptr;
0313 Multiplicity::index_type* m2_st = nullptr;
0314 int m_n = 0;
0315
0316 #ifdef RUNTIME_SIZE
0317 m_n = MaxTk;
0318 auto m1_st_d = make_unique<Multiplicity::index_type[]>(m_n);
0319 auto m2_st_d = make_unique<Multiplicity::index_type[]>(m_n);
0320 m1_st = m1_st_d.get();
0321 m2_st = m1_st_d.get();
0322 #endif
0323 Multiplicity::View view1 = {m1_d.get(), nullptr, m1_st, -1, m_n};
0324 Multiplicity::View view2 = {m2_d.get(), nullptr, m2_st, -1, m_n};
0325 launchZero(view1, 0);
0326 launchZero(view2, 0);
0327
0328 #ifdef __CUDACC__
0329 nBlocks = (4 * N + nThreads - 1) / nThreads;
0330 countMulti<<<nBlocks, nThreads>>>(v_d.get(), m1_d.get(), N);
0331 countMultiLocal<<<nBlocks, nThreads>>>(v_d.get(), m2_d.get(), N);
0332 verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get());
0333
0334 launchFinalize(view1, 0);
0335 launchFinalize(view2, 0);
0336 verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get());
0337
0338 cudaCheck(cudaGetLastError());
0339 cudaCheck(cudaDeviceSynchronize());
0340 #else
0341 countMulti(v_d, m1_d.get(), N);
0342 countMultiLocal(v_d, m2_d.get(), N);
0343 verifyMulti(m1_d.get(), m2_d.get());
0344
0345 launchFinalize(view1, 0);
0346 launchFinalize(view2, 0);
0347 verifyMulti(m1_d.get(), m2_d.get());
0348 #endif
0349 return 0;
0350 }