Warning, /DataFormats/Math/test/CholeskyInvert_t.cu is written in an unsupported language. File is not indexed.
0001 // nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
0002 // add -DDOPROF to run nvprof --metrics all
0003
0004 #include <algorithm>
0005 #include <cassert>
0006 #include <chrono>
0007 #include <iomanip>
0008 #include <iostream>
0009 #include <limits>
0010 #include <memory>
0011 #include <random>
0012
0013 #include <Eigen/Core>
0014 #include <Eigen/Eigenvalues>
0015
0016 #include "DataFormats/Math/interface/choleskyInversion.h"
0017 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0018 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0019 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0020 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0021
0022 constexpr int stride() { return 5 * 1024; }
0023 template <int DIM>
0024 using MXN = Eigen::Matrix<double, DIM, DIM>;
0025 template <int DIM>
0026 using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
0027
0028 template <int N>
0029 __global__ void invertSOA(double *__restrict__ p, unsigned int n) {
0030 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0031 if (i >= n)
0032 return;
0033
0034 MapMX<N> m(p + i);
0035 math::cholesky::invert(m, m);
0036 }
0037
0038 template <typename M, int N>
0039 __global__ void invert(M *mm, unsigned int n) {
0040 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0041 if (i >= n)
0042 return;
0043
0044 auto &m = mm[i];
0045 math::cholesky::invert(m, m);
0046 }
0047
0048 template <typename M, int N>
0049 __global__ void invertSeq(M *mm, unsigned int n) {
0050 if (threadIdx.x != 0)
0051 return;
0052 auto first = blockIdx.x * blockDim.x;
0053 auto last = std::min(first + blockDim.x, n);
0054
0055 for (auto i = first; i < last; ++i) {
0056 auto &m = mm[i];
0057 math::cholesky::invert(m, m);
0058 }
0059 }
0060
0061 // generate matrices
0062 template <class M>
0063 void genMatrix(M &m) {
0064 using T = typename std::remove_reference<decltype(m(0, 0))>::type;
0065 int n = M::ColsAtCompileTime;
0066 std::mt19937 eng;
0067 // std::mt19937 eng2;
0068 std::uniform_real_distribution<T> rgen(0., 1.);
0069
0070 // generate first diagonal elemets
0071 for (int i = 0; i < n; ++i) {
0072 double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4
0073 m(i, i) = maxVal * rgen(eng);
0074 }
0075 for (int i = 0; i < n; ++i) {
0076 for (int j = 0; j < i; ++j) {
0077 double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined
0078 m(i, j) = v * rgen(eng);
0079 m(j, i) = m(i, j);
0080 }
0081 }
0082 }
0083
0084 template <int N>
0085 void go(bool soa) {
0086 constexpr unsigned int DIM = N;
0087 using MX = MXN<DIM>;
0088 std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS")
0089 << " mode" << std::endl;
0090
0091 auto start = std::chrono::high_resolution_clock::now();
0092 auto delta = start - start;
0093 auto delta1 = delta;
0094 auto delta2 = delta;
0095
0096 constexpr unsigned int SIZE = 4 * 1024;
0097
0098 MX mm[stride()]; // just storage in case of SOA
0099 double *__restrict__ p = (double *)(mm);
0100
0101 if (soa) {
0102 for (unsigned int i = 0; i < SIZE; ++i) {
0103 MapMX<N> m(p + i);
0104 genMatrix(m);
0105 }
0106 } else {
0107 for (auto &m : mm)
0108 genMatrix(m);
0109 }
0110
0111 std::cout << mm[SIZE / 2](1, 1) << std::endl;
0112
0113 if (soa)
0114 for (unsigned int i = 0; i < SIZE; ++i) {
0115 MapMX<N> m(p + i);
0116 math::cholesky::invert(m, m);
0117 math::cholesky::invert(m, m);
0118 }
0119 else
0120 for (auto &m : mm) {
0121 math::cholesky::invert(m, m);
0122 math::cholesky::invert(m, m);
0123 }
0124
0125 std::cout << mm[SIZE / 2](1, 1) << std::endl;
0126
0127 auto m_d = cms::cuda::make_device_unique<double[]>(DIM * DIM * stride(), nullptr);
0128 cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));
0129
0130 constexpr int NKK =
0131 #ifdef DOPROF
0132 2;
0133 #else
0134 1000;
0135 #endif
0136 for (int kk = 0; kk < NKK; ++kk) {
0137 int threadsPerBlock = 128;
0138 int blocksPerGrid = SIZE / threadsPerBlock;
0139
0140 delta -= (std::chrono::high_resolution_clock::now() - start);
0141
0142 if (soa)
0143 cms::cuda::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
0144 else
0145 cms::cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
0146
0147 cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
0148
0149 delta += (std::chrono::high_resolution_clock::now() - start);
0150
0151 if (0 == kk)
0152 std::cout << mm[SIZE / 2](1, 1) << std::endl;
0153
0154 if (!soa) {
0155 delta1 -= (std::chrono::high_resolution_clock::now() - start);
0156
0157 #ifndef DOPROF
0158 cms::cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
0159 cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
0160 #endif
0161 delta1 += (std::chrono::high_resolution_clock::now() - start);
0162
0163 if (0 == kk)
0164 std::cout << mm[SIZE / 2](1, 1) << std::endl;
0165 }
0166
0167 delta2 -= (std::chrono::high_resolution_clock::now() - start);
0168 if (soa)
0169 #pragma GCC ivdep
0170 for (unsigned int i = 0; i < SIZE; ++i) {
0171 MapMX<N> m(p + i);
0172 math::cholesky::invert(m, m);
0173 }
0174 else
0175 #pragma GCC ivdep
0176 for (auto &m : mm) {
0177 math::cholesky::invert(m, m);
0178 }
0179
0180 delta2 += (std::chrono::high_resolution_clock::now() - start);
0181 }
0182
0183 std::cout << mm[SIZE / 2](1, 1) << std::endl;
0184
0185 double DNNK = NKK;
0186 std::cout << "cuda/cudaSeq/x86 computation took "
0187 << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
0188 << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
0189 << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
0190 << std::endl;
0191 }
0192
0193 int main() {
0194 cms::cudatest::requireDevices();
0195
0196 go<2>(false);
0197 go<3>(false);
0198 go<4>(false);
0199 go<5>(false);
0200 go<6>(false);
0201 go<7>(false);
0202 go<10>(false);
0203
0204 go<2>(true);
0205 go<3>(true);
0206 go<4>(true);
0207 go<5>(true);
0208 go<6>(true);
0209 go<7>(true);
0210 go<10>(true);
0211 return 0;
0212 }