Math/test/CholeskyInvert_t.cu

0001 // nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
0002 // add -DDOPROF to run  nvprof --metrics all
0003
0004 #include <algorithm>
0005 #include <cassert>
0006 #include <chrono>
0007 #include <iomanip>
0008 #include <iostream>
0009 #include <limits>
0010 #include <memory>
0011 #include <random>
0012
0013 #include <Eigen/Core>
0014 #include <Eigen/Eigenvalues>
0015
0016 #include "DataFormats/Math/interface/choleskyInversion.h"
0017 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0018 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0019 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0020 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0021
0022 constexpr int stride() { return 5 * 1024; }
0023 template <int DIM>
0024 using MXN = Eigen::Matrix<double, DIM, DIM>;
0025 template <int DIM>
0026 using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
0027
0028 template <int N>
0029 __global__ void invertSOA(double *__restrict__ p, unsigned int n) {
0030   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0031   if (i >= n)
0032     return;
0033
0034   MapMX<N> m(p + i);
0035   math::cholesky::invert(m, m);
0036 }
0037
0038 template <typename M, int N>
0039 __global__ void invert(M *mm, unsigned int n) {
0040   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0041   if (i >= n)
0042     return;
0043
0044   auto &m = mm[i];
0045   math::cholesky::invert(m, m);
0046 }
0047
0048 template <typename M, int N>
0049 __global__ void invertSeq(M *mm, unsigned int n) {
0050   if (threadIdx.x != 0)
0051     return;
0052   auto first = blockIdx.x * blockDim.x;
0053   auto last = std::min(first + blockDim.x, n);
0054
0055   for (auto i = first; i < last; ++i) {
0056     auto &m = mm[i];
0057     math::cholesky::invert(m, m);
0058   }
0059 }
0060
0061 // generate matrices
0062 template <class M>
0063 void genMatrix(M &m) {
0064   using T = typename std::remove_reference<decltype(m(0, 0))>::type;
0065   int n = M::ColsAtCompileTime;
0066   std::mt19937 eng;
0067   // std::mt19937 eng2;
0068   std::uniform_real_distribution<T> rgen(0., 1.);
0069
0070   // generate first diagonal elemets
0071   for (int i = 0; i < n; ++i) {
0072     double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
0073     m(i, i) = maxVal * rgen(eng);
0074   }
0075   for (int i = 0; i < n; ++i) {
0076     for (int j = 0; j < i; ++j) {
0077       double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
0078       m(i, j) = v * rgen(eng);
0079       m(j, i) = m(i, j);
0080     }
0081   }
0082 }
0083
0084 template <int N>
0085 void go(bool soa) {
0086   constexpr unsigned int DIM = N;
0087   using MX = MXN<DIM>;
0088   std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS")
0089             << " mode" << std::endl;
0090
0091   auto start = std::chrono::high_resolution_clock::now();
0092   auto delta = start - start;
0093   auto delta1 = delta;
0094   auto delta2 = delta;
0095
0096   constexpr unsigned int SIZE = 4 * 1024;
0097
0098   MX mm[stride()];  // just storage in case of SOA
0099   double *__restrict__ p = (double *)(mm);
0100
0101   if (soa) {
0102     for (unsigned int i = 0; i < SIZE; ++i) {
0103       MapMX<N> m(p + i);
0104       genMatrix(m);
0105     }
0106   } else {
0107     for (auto &m : mm)
0108       genMatrix(m);
0109   }
0110
0111   std::cout << mm[SIZE / 2](1, 1) << std::endl;
0112
0113   if (soa)
0114     for (unsigned int i = 0; i < SIZE; ++i) {
0115       MapMX<N> m(p + i);
0116       math::cholesky::invert(m, m);
0117       math::cholesky::invert(m, m);
0118     }
0119   else
0120     for (auto &m : mm) {
0121       math::cholesky::invert(m, m);
0122       math::cholesky::invert(m, m);
0123     }
0124
0125   std::cout << mm[SIZE / 2](1, 1) << std::endl;
0126
0127   auto m_d = cms::cuda::make_device_unique<double[]>(DIM * DIM * stride(), nullptr);
0128   cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));
0129
0130   constexpr int NKK =
0131 #ifdef DOPROF
0132       2;
0133 #else
0134       1000;
0135 #endif
0136   for (int kk = 0; kk < NKK; ++kk) {
0137     int threadsPerBlock = 128;
0138     int blocksPerGrid = SIZE / threadsPerBlock;
0139
0140     delta -= (std::chrono::high_resolution_clock::now() - start);
0141
0142     if (soa)
0143       cms::cuda::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
0144     else
0145       cms::cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
0146
0147     cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
0148
0149     delta += (std::chrono::high_resolution_clock::now() - start);
0150
0151     if (0 == kk)
0152       std::cout << mm[SIZE / 2](1, 1) << std::endl;
0153
0154     if (!soa) {
0155       delta1 -= (std::chrono::high_resolution_clock::now() - start);
0156
0157 #ifndef DOPROF
0158       cms::cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
0159       cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
0160 #endif
0161       delta1 += (std::chrono::high_resolution_clock::now() - start);
0162
0163       if (0 == kk)
0164         std::cout << mm[SIZE / 2](1, 1) << std::endl;
0165     }
0166
0167     delta2 -= (std::chrono::high_resolution_clock::now() - start);
0168     if (soa)
0169 #pragma GCC ivdep
0170       for (unsigned int i = 0; i < SIZE; ++i) {
0171         MapMX<N> m(p + i);
0172         math::cholesky::invert(m, m);
0173       }
0174     else
0175 #pragma GCC ivdep
0176       for (auto &m : mm) {
0177         math::cholesky::invert(m, m);
0178       }
0179
0180     delta2 += (std::chrono::high_resolution_clock::now() - start);
0181   }
0182
0183   std::cout << mm[SIZE / 2](1, 1) << std::endl;
0184
0185   double DNNK = NKK;
0186   std::cout << "cuda/cudaSeq/x86 computation took "
0187             << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
0188             << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
0189             << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
0190             << std::endl;
0191 }
0192
0193 int main() {
0194   cms::cudatest::requireDevices();
0195
0196   go<2>(false);
0197   go<3>(false);
0198   go<4>(false);
0199   go<5>(false);
0200   go<6>(false);
0201   go<7>(false);
0202   go<10>(false);
0203
0204   go<2>(true);
0205   go<3>(true);
0206   go<4>(true);
0207   go<5>(true);
0208   go<6>(true);
0209   go<7>(true);
0210   go<10>(true);
0211   return 0;
0212 }