CUDAUtilities/test/OneToManyAssoc_t.h

0001 #include <algorithm>
0002 #include <cassert>
0003 #include <iostream>
0004 #include <random>
0005 #include <limits>
0006 #include <array>
0007 #include <memory>
0008
0009 #ifdef __CUDACC__
0010 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0011 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0012 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0013 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
0014 #endif
0015
0016 #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"
0017 using cms::cuda::AtomicPairCounter;
0018
0019 constexpr uint32_t MaxElem = 64000;
0020 constexpr uint32_t MaxTk = 8000;
0021 constexpr uint32_t MaxAssocs = 4 * MaxTk;
0022
0023 #ifdef RUNTIME_SIZE
0024 using Assoc = cms::cuda::OneToManyAssoc<uint16_t, -1, -1>;
0025 using SmallAssoc = cms::cuda::OneToManyAssoc<uint16_t, 128, -1>;
0026 using Multiplicity = cms::cuda::OneToManyAssoc<uint16_t, 8, MaxTk>;
0027 #else
0028 using Assoc = cms::cuda::OneToManyAssoc<uint16_t, MaxElem, MaxAssocs>;
0029 using SmallAssoc = cms::cuda::OneToManyAssoc<uint16_t, 128, MaxAssocs>;
0030 using Multiplicity = cms::cuda::OneToManyAssoc<uint16_t, 8, MaxTk>;
0031 #endif
0032
0033 using TK = std::array<uint16_t, 4>;
0034
0035 __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
0036   int first = blockDim.x * blockIdx.x + threadIdx.x;
0037   for (int i = first; i < n; i += gridDim.x * blockDim.x) {
0038     __shared__ Multiplicity::CountersOnly local;
0039     if (threadIdx.x == 0)
0040       local.zero();
0041     __syncthreads();
0042     local.count(2 + i % 4);
0043     __syncthreads();
0044     if (threadIdx.x == 0)
0045       assoc->add(local);
0046   }
0047 }
0048
0049 __global__ void countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
0050   int first = blockDim.x * blockIdx.x + threadIdx.x;
0051   for (int i = first; i < n; i += gridDim.x * blockDim.x)
0052     assoc->count(2 + i % 4);
0053 }
0054
0055 __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) {
0056   auto first = blockDim.x * blockIdx.x + threadIdx.x;
0057   for (int i = first; i < m1->totOnes(); i += gridDim.x * blockDim.x)
0058     assert(m1->off[i] == m2->off[i]);
0059 }
0060
0061 __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) {
0062   int first = blockDim.x * blockIdx.x + threadIdx.x;
0063   for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) {
0064     auto k = i / 4;
0065     auto j = i - 4 * k;
0066     assert(j < 4);
0067     if (k >= n)
0068       return;
0069     if (tk[k][j] < MaxElem)
0070       assoc->count(tk[k][j]);
0071   }
0072 }
0073
0074 __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) {
0075   int first = blockDim.x * blockIdx.x + threadIdx.x;
0076   for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) {
0077     auto k = i / 4;
0078     auto j = i - 4 * k;
0079     assert(j < 4);
0080     if (k >= n)
0081       return;
0082     if (tk[k][j] < MaxElem)
0083       assoc->fill(tk[k][j], k);
0084   }
0085 }
0086
0087 __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); }
0088
0089 template <typename Assoc>
0090 __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) {
0091   int first = blockDim.x * blockIdx.x + threadIdx.x;
0092   for (int k = first; k < n; k += gridDim.x * blockDim.x) {
0093     auto m = tk[k][3] < MaxElem ? 4 : 3;
0094     assoc->bulkFill(*apc, &tk[k][0], m);
0095   }
0096 }
0097
0098 template <typename Assoc>
0099 __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) {
0100   if (int(apc->get().m) >= assoc->nOnes())
0101     printf("Overflow %d %d\n", apc->get().m, assoc->nOnes());
0102   assert(int(assoc->size()) < assoc->capacity());
0103 }
0104
0105 template <typename Assoc>
0106 __global__ void verifyFill(Assoc const* __restrict__ la, int n) {
0107   printf("assoc size %d\n", la->size());
0108   int imax = 0;
0109   long long ave = 0;
0110   int z = 0;
0111   for (int i = 0; i < n; ++i) {
0112     auto x = la->size(i);
0113     if (x == 0) {
0114       z++;
0115       continue;
0116     }
0117     ave += x;
0118     imax = std::max(imax, int(x));
0119   }
0120   assert(0 == la->size(n));
0121   printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z);
0122 }
0123
0124 template <typename Assoc>
0125 __global__ void verifyFinal(Assoc const* __restrict__ la, int N) {
0126   printf("assoc size %d\n", la->size());
0127
0128   int imax = 0;
0129   long long ave = 0;
0130   for (int i = 0; i < N; ++i) {
0131     auto x = la->size(i);
0132     if (!(x == 4 || x == 3))
0133       printf("%d %d\n", i, x);
0134     assert(x == 4 || x == 3);
0135     ave += x;
0136     imax = std::max(imax, int(x));
0137   }
0138   assert(0 == la->size(N));
0139   printf("found with ave occupancy %f %d\n", double(ave) / N, imax);
0140 }
0141
0142 template <typename T>
0143 auto make_unique(std::size_t size) {
0144 #ifdef __CUDACC__
0145   return cms::cuda::make_device_unique<T>(size, 0);
0146 #else
0147   return std::make_unique<T>(size);
0148 #endif
0149 }
0150
0151 int main() {
0152 #ifdef __CUDACC__
0153   cms::cudatest::requireDevices();
0154   auto current_device = cms::cuda::currentDevice();
0155 #else
0156   // make sure cuda emulation is working
0157   std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl;
0158   std::cout << "cuda y's " << threadIdx.y << ' ' << blockIdx.y << ' ' << blockDim.y << ' ' << gridDim.y << std::endl;
0159   std::cout << "cuda z's " << threadIdx.z << ' ' << blockIdx.z << ' ' << blockDim.z << ' ' << gridDim.z << std::endl;
0160   assert(threadIdx.x == 0);
0161   assert(threadIdx.y == 0);
0162   assert(threadIdx.z == 0);
0163   assert(blockIdx.x == 0);
0164   assert(blockIdx.y == 0);
0165   assert(blockIdx.z == 0);
0166   assert(blockDim.x == 1);
0167   assert(blockDim.y == 1);
0168   assert(blockDim.z == 1);
0169   assert(gridDim.x == 1);
0170   assert(gridDim.y == 1);
0171   assert(gridDim.z == 1);
0172 #endif
0173
0174   std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::ctNOnes() << ' ' << Assoc::ctCapacity() << std::endl;
0175   std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::ctNOnes() << ' '
0176             << SmallAssoc::ctCapacity() << std::endl;
0177
0178   std::mt19937 eng;
0179
0180   std::geometric_distribution<int> rdm(0.8);
0181
0182   constexpr uint32_t N = 4000;
0183
0184   std::vector<std::array<uint16_t, 4>> tr(N);
0185
0186   // fill with "index" to element
0187   long long ave = 0;
0188   int imax = 0;
0189   auto n = 0U;
0190   auto z = 0U;
0191   auto nz = 0U;
0192   for (auto i = 0U; i < 4U; ++i) {
0193     auto j = 0U;
0194     while (j < N && n < MaxElem) {
0195       if (z == 11) {
0196         ++n;
0197         z = 0;
0198         ++nz;
0199         continue;
0200       }  // a bit of not assoc
0201       auto x = rdm(eng);
0202       auto k = std::min(j + x + 1, N);
0203       if (i == 3 && z == 3) {  // some triplets time to time
0204         for (; j < k; ++j)
0205           tr[j][i] = MaxElem + 1;
0206       } else {
0207         ave += x + 1;
0208         imax = std::max(imax, x);
0209         for (; j < k; ++j)
0210           tr[j][i] = n;
0211         ++n;
0212       }
0213       ++z;
0214     }
0215     assert(n <= MaxElem);
0216     assert(j <= N);
0217   }
0218   std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
0219
0220   auto a_d = make_unique<Assoc[]>(1);
0221   auto sa_d = make_unique<SmallAssoc[]>(1);
0222 #ifdef __CUDACC__
0223   auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
0224   assert(v_d.get());
0225   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
0226 #else
0227   auto v_d = tr.data();
0228 #endif
0229
0230   Assoc::Counter* a_st = nullptr;
0231   int a_n = MaxElem;
0232
0233   Assoc::index_type* a_st2 = nullptr;
0234   SmallAssoc::index_type* sa_st2 = nullptr;
0235   int a_n2 = MaxAssocs;
0236
0237 // storage
0238 #ifdef RUNTIME_SIZE
0239   auto a_st_d = make_unique<Assoc::Counter[]>(a_n);
0240   auto a_st2_d = make_unique<Assoc::index_type[]>(a_n2);
0241   auto sa_st2_d = make_unique<SmallAssoc::index_type[]>(a_n2);
0242   a_st = a_st_d.get();
0243   a_st2 = a_st2_d.get();
0244   sa_st2 = sa_st2_d.get();
0245 #endif
0246   Assoc::View aView = {a_d.get(), a_st, a_st2, a_n, a_n2};
0247   launchZero(aView, 0);
0248   SmallAssoc::View saView = {sa_d.get(), nullptr, sa_st2, -1, a_n2};
0249   launchZero(saView, 0);
0250
0251 #ifdef __CUDACC__
0252   auto nThreads = 256;
0253   auto nBlocks = (4 * N + nThreads - 1) / nThreads;
0254
0255   count<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
0256
0257   launchFinalize(aView, 0);
0258   verify<<<1, 1>>>(a_d.get());
0259   fill<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
0260   verifyFill<<<1, 1>>>(a_d.get(), n);
0261
0262 #else
0263   count(v_d, a_d.get(), N);
0264   launchFinalize(aView);
0265   verify(a_d.get());
0266   fill(v_d, a_d.get(), N);
0267   verifyFill(a_d.get(), n);
0268
0269 #endif
0270
0271   // now the inverse map (actually this is the ....)
0272   AtomicPairCounter* dc_d;
0273   AtomicPairCounter dc(0);
0274
0275 #ifdef __CUDACC__
0276   cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
0277   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
0278   nBlocks = (N + nThreads - 1) / nThreads;
0279   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
0280   finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
0281   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
0282
0283   cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
0284   verifyFinal<<<1, 1>>>(a_d.get(), N);
0285
0286   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
0287   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
0288   finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
0289   verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
0290
0291 #else
0292   dc_d = &dc;
0293   fillBulk(dc_d, v_d, a_d.get(), N);
0294   finalizeBulk(dc_d, a_d.get());
0295   verifyBulk(a_d.get(), dc_d);
0296
0297   verifyFinal(a_d.get(), N);
0298
0299   AtomicPairCounter sdc(0);
0300   fillBulk(&sdc, v_d, sa_d.get(), N);
0301   finalizeBulk(&sdc, sa_d.get());
0302   verifyBulk(sa_d.get(), &sdc);
0303
0304 #endif
0305
0306   std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl;
0307
0308   // here verify use of block local counters
0309   auto m1_d = make_unique<Multiplicity[]>(1);
0310   auto m2_d = make_unique<Multiplicity[]>(1);
0311
0312   Multiplicity::index_type* m1_st = nullptr;
0313   Multiplicity::index_type* m2_st = nullptr;
0314   int m_n = 0;
0315
0316 #ifdef RUNTIME_SIZE
0317   m_n = MaxTk;
0318   auto m1_st_d = make_unique<Multiplicity::index_type[]>(m_n);
0319   auto m2_st_d = make_unique<Multiplicity::index_type[]>(m_n);
0320   m1_st = m1_st_d.get();
0321   m2_st = m1_st_d.get();
0322 #endif
0323   Multiplicity::View view1 = {m1_d.get(), nullptr, m1_st, -1, m_n};
0324   Multiplicity::View view2 = {m2_d.get(), nullptr, m2_st, -1, m_n};
0325   launchZero(view1, 0);
0326   launchZero(view2, 0);
0327
0328 #ifdef __CUDACC__
0329   nBlocks = (4 * N + nThreads - 1) / nThreads;
0330   countMulti<<<nBlocks, nThreads>>>(v_d.get(), m1_d.get(), N);
0331   countMultiLocal<<<nBlocks, nThreads>>>(v_d.get(), m2_d.get(), N);
0332   verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get());
0333
0334   launchFinalize(view1, 0);
0335   launchFinalize(view2, 0);
0336   verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get());
0337
0338   cudaCheck(cudaGetLastError());
0339   cudaCheck(cudaDeviceSynchronize());
0340 #else
0341   countMulti(v_d, m1_d.get(), N);
0342   countMultiLocal(v_d, m2_d.get(), N);
0343   verifyMulti(m1_d.get(), m2_d.get());
0344
0345   launchFinalize(view1, 0);
0346   launchFinalize(view2, 0);
0347   verifyMulti(m1_d.get(), m2_d.get());
0348 #endif
0349   return 0;
0350 }