Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoTracker/PixelTrackFitting/test/testEigenGPU.cu is written in an unsupported language. File is not indexed.

0001 #include <iostream>
0002 
0003 #include <Eigen/Core>
0004 #include <Eigen/Eigenvalues>
0005 
0006 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0007 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0008 
0009 #ifdef USE_BL
0010 #include "RecoTracker/PixelTrackFitting/interface/BrokenLine.h"
0011 #else
0012 #include "RecoTracker/PixelTrackFitting/interface/RiemannFit.h"
0013 #endif
0014 
0015 #include "test_common.h"
0016 
0017 using namespace Eigen;
0018 
0019 namespace riemannFit {
0020   constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; }
0021   constexpr uint32_t stride() { return maxNumberOfTracks(); }
0022   // hits
0023   template <int N>
0024   using Matrix3xNd = Eigen::Matrix<double, 3, N>;
0025   template <int N>
0026   using Map3xNd = Eigen::Map<Matrix3xNd<N>, 0, Eigen::Stride<3 * stride(), stride()>>;
0027   // errors
0028   template <int N>
0029   using Matrix6xNf = Eigen::Matrix<float, 6, N>;
0030   template <int N>
0031   using Map6xNf = Eigen::Map<Matrix6xNf<N>, 0, Eigen::Stride<6 * stride(), stride()>>;
0032   // fast fit
0033   using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride()>>;
0034 
0035 }  // namespace riemannFit
0036 
0037 template <int N>
0038 __global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) {
0039   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0040   riemannFit::Map3xNd<N> hits(phits + i, 3, 4);
0041   riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, 4);
0042   if (i != 0)
0043     return;
0044   printf("GPU sizes %lu %lu %lu %lu %lu\n",
0045          sizeof(hits[i]),
0046          sizeof(hits_ge[i]),
0047          sizeof(Vector4d),
0048          sizeof(riemannFit::LineFit),
0049          sizeof(riemannFit::CircleFit));
0050 }
0051 
0052 template <int N>
0053 __global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) {
0054   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0055   riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0056   riemannFit::Map4d result(presults + i, 4);
0057 #ifdef USE_BL
0058   brokenline::fastFit(hits, result);
0059 #else
0060   riemannFit::fastFit(hits, result);
0061 #endif
0062 }
0063 
0064 #ifdef USE_BL
0065 
0066 template <int N>
0067 __global__ void kernelBrokenLineFit(double* __restrict__ phits,
0068                                     float* __restrict__ phits_ge,
0069                                     double* __restrict__ pfast_fit_input,
0070                                     double B,
0071                                     riemannFit::CircleFit* circle_fit,
0072                                     riemannFit::LineFit* line_fit) {
0073   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0074   riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0075   riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
0076   riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0077 
0078   brokenline::PreparedBrokenLineData<N> data;
0079   riemannFit::Matrix3d Jacob;
0080 
0081   auto& line_fit_results = line_fit[i];
0082   auto& circle_fit_results = circle_fit[i];
0083 
0084   brokenline::prepareBrokenLineData(hits, fast_fit_input, B, data);
0085   brokenline::lineFit(hits_ge, fast_fit_input, B, data, line_fit_results);
0086   brokenline::circleFit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results);
0087   Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
0088       -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
0089   circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
0090   circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
0091 
0092 #ifdef TEST_DEBUG
0093   if (0 == i) {
0094     printf("Circle param %f,%f,%f\n", circle_fit[i].par(0), circle_fit[i].par(1), circle_fit[i].par(2));
0095   }
0096 #endif
0097 }
0098 
0099 #else
0100 
0101 template <int N>
0102 __global__ void kernel_CircleFit(double* __restrict__ phits,
0103                                  float* __restrict__ phits_ge,
0104                                  double* __restrict__ pfast_fit_input,
0105                                  double B,
0106                                  riemannFit::CircleFit* circle_fit_resultsGPU) {
0107   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0108   riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0109   riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
0110   riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0111 
0112   constexpr auto n = N;
0113 
0114   riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, n).colwise().norm());
0115   riemannFit::Matrix2Nd<N> hits_cov = MatrixXd::Zero(2 * n, 2 * n);
0116   riemannFit::loadCovariance2D(hits_ge, hits_cov);
0117 
0118 #ifdef TEST_DEBUG
0119   if (0 == i) {
0120     printf("hits %f, %f\n", hits.block(0, 0, 2, n)(0, 0), hits.block(0, 0, 2, n)(0, 1));
0121     printf("hits %f, %f\n", hits.block(0, 0, 2, n)(1, 0), hits.block(0, 0, 2, n)(1, 1));
0122     printf("fast_fit_input(0): %f\n", fast_fit_input(0));
0123     printf("fast_fit_input(1): %f\n", fast_fit_input(1));
0124     printf("fast_fit_input(2): %f\n", fast_fit_input(2));
0125     printf("fast_fit_input(3): %f\n", fast_fit_input(3));
0126     printf("rad(0,0): %f\n", rad(0, 0));
0127     printf("rad(1,1): %f\n", rad(1, 1));
0128     printf("rad(2,2): %f\n", rad(2, 2));
0129     printf("hits_cov(0,0): %f\n", (*hits_cov)(0, 0));
0130     printf("hits_cov(1,1): %f\n", (*hits_cov)(1, 1));
0131     printf("hits_cov(2,2): %f\n", (*hits_cov)(2, 2));
0132     printf("hits_cov(11,11): %f\n", (*hits_cov)(11, 11));
0133     printf("B: %f\n", B);
0134   }
0135 #endif
0136   circle_fit_resultsGPU[i] = riemannFit::circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true);
0137 #ifdef TEST_DEBUG
0138   if (0 == i) {
0139     printf("Circle param %f,%f,%f\n",
0140            circle_fit_resultsGPU[i].par(0),
0141            circle_fit_resultsGPU[i].par(1),
0142            circle_fit_resultsGPU[i].par(2));
0143   }
0144 #endif
0145 }
0146 
0147 template <int N>
0148 __global__ void kernelLineFit(double* __restrict__ phits,
0149                               float* __restrict__ phits_ge,
0150                               double B,
0151                               riemannFit::CircleFit* circle_fit,
0152                               double* __restrict__ pfast_fit_input,
0153                               riemannFit::LineFit* line_fit) {
0154   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0155   riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0156   riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
0157   riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0158   line_fit[i] = riemannFit::lineFit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true);
0159 }
0160 #endif
0161 
0162 template <typename M3xN, typename M6xN>
0163 __device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) {
0164   constexpr uint32_t N = M3xN::ColsAtCompileTime;
0165 
0166   if (N == 5) {
0167     hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212,
0168         -10.980247, -23.162731, -32.759060, -38.061260, -47.518867;
0169     hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05;
0170     hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06;
0171     hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07;
0172     hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08;
0173     hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07;
0174     return;
0175   }
0176 
0177   if (N > 3)
0178     hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793;
0179   else
0180     hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808;
0181 
0182   hits_ge.col(0)[0] = 7.14652e-06;
0183   hits_ge.col(1)[0] = 2.15789e-06;
0184   hits_ge.col(2)[0] = 1.63328e-06;
0185   if (N > 3)
0186     hits_ge.col(3)[0] = 6.27919e-06;
0187   hits_ge.col(0)[2] = 6.10348e-06;
0188   hits_ge.col(1)[2] = 2.08211e-06;
0189   hits_ge.col(2)[2] = 1.61672e-06;
0190   if (N > 3)
0191     hits_ge.col(3)[2] = 6.28081e-06;
0192   hits_ge.col(0)[5] = 5.184e-05;
0193   hits_ge.col(1)[5] = 1.444e-05;
0194   hits_ge.col(2)[5] = 6.25e-06;
0195   if (N > 3)
0196     hits_ge.col(3)[5] = 3.136e-05;
0197   hits_ge.col(0)[1] = -5.60077e-06;
0198   hits_ge.col(1)[1] = -1.11936e-06;
0199   hits_ge.col(2)[1] = -6.24945e-07;
0200   if (N > 3)
0201     hits_ge.col(3)[1] = -5.28e-06;
0202 }
0203 
0204 template <int N>
0205 __global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) {
0206   auto i = blockIdx.x * blockDim.x + threadIdx.x;
0207   riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0208   riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0209   hits_ge = MatrixXf::Zero(6, N);
0210   fillHitsAndHitsCov(hits, hits_ge);
0211 }
0212 
0213 template <int N>
0214 void testFit() {
0215   constexpr double B = 0.0113921;
0216   riemannFit::Matrix3xNd<N> hits;
0217   riemannFit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);
0218   double* hitsGPU = nullptr;
0219   ;
0220   float* hits_geGPU = nullptr;
0221   double* fast_fit_resultsGPU = nullptr;
0222   double* fast_fit_resultsGPUret = new double[riemannFit::maxNumberOfTracks() * sizeof(Vector4d)];
0223   riemannFit::CircleFit* circle_fit_resultsGPU = nullptr;
0224   riemannFit::CircleFit* circle_fit_resultsGPUret = new riemannFit::CircleFit();
0225   riemannFit::LineFit* line_fit_resultsGPU = nullptr;
0226   riemannFit::LineFit* line_fit_resultsGPUret = new riemannFit::LineFit();
0227 
0228   fillHitsAndHitsCov(hits, hits_ge);
0229 
0230   std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' '
0231             << sizeof(riemannFit::LineFit) << ' ' << sizeof(riemannFit::CircleFit) << std::endl;
0232 
0233   std::cout << "Generated hits:\n" << hits << std::endl;
0234   std::cout << "Generated cov:\n" << hits_ge << std::endl;
0235 
0236   // FAST_FIT_CPU
0237 #ifdef USE_BL
0238   Vector4d fast_fit_results;
0239   brokenline::fastFit(hits, fast_fit_results);
0240 #else
0241   Vector4d fast_fit_results;
0242   riemannFit::fastFit(hits, fast_fit_results);
0243 #endif
0244   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl;
0245 
0246   // for timing    purposes we fit    4096 tracks
0247   constexpr uint32_t Ntracks = 4096;
0248   cudaCheck(cudaMalloc(&hitsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix3xNd<N>)));
0249   cudaCheck(cudaMalloc(&hits_geGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix6xNf<N>)));
0250   cudaCheck(cudaMalloc(&fast_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
0251   cudaCheck(cudaMalloc(&line_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
0252   cudaCheck(cudaMalloc(&circle_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::CircleFit)));
0253 
0254   cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
0255   cudaCheck(cudaMemset(line_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
0256 
0257   kernelPrintSizes<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
0258   kernelFillHitsAndHitsCov<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
0259 
0260   // FAST_FIT GPU
0261   kernelFastFit<N><<<Ntracks / 64, 64>>>(hitsGPU, fast_fit_resultsGPU);
0262   cudaDeviceSynchronize();
0263 
0264   cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
0265                        fast_fit_resultsGPU,
0266                        riemannFit::maxNumberOfTracks() * sizeof(Vector4d),
0267                        cudaMemcpyDeviceToHost));
0268   riemannFit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
0269   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
0270   assert(isEqualFuzzy(fast_fit_results, fast_fit));
0271 
0272 #ifdef USE_BL
0273   // CIRCLE AND LINE FIT CPU
0274   brokenline::PreparedBrokenLineData<N> data;
0275   brokenline::karimaki_circle_fit circle_fit_results;
0276   riemannFit::LineFit line_fit_results;
0277   riemannFit::Matrix3d Jacob;
0278   brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data);
0279   brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results);
0280   brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
0281   Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
0282       -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
0283   circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
0284   circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
0285 
0286   // fit on GPU
0287   kernelBrokenLineFit<N>
0288       <<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU);
0289   cudaDeviceSynchronize();
0290 
0291 #else
0292   // CIRCLE_FIT CPU
0293   riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
0294 
0295   riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
0296   riemannFit::loadCovariance2D(hits_ge, hits_cov);
0297   riemannFit::CircleFit circle_fit_results =
0298       riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
0299 
0300   // CIRCLE_FIT GPU
0301   kernel_CircleFit<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU);
0302   cudaDeviceSynchronize();
0303 
0304   // LINE_FIT CPU
0305   riemannFit::LineFit line_fit_results =
0306       riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
0307 
0308   kernelLineFit<N>
0309       <<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU);
0310   cudaDeviceSynchronize();
0311 #endif
0312 
0313   std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;
0314 
0315   cudaCheck(cudaMemcpy(
0316       circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(riemannFit::CircleFit), cudaMemcpyDeviceToHost));
0317   std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl;
0318   assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par));
0319 
0320   std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl;
0321   // LINE_FIT GPU
0322   cudaCheck(
0323       cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(riemannFit::LineFit), cudaMemcpyDeviceToHost));
0324   std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl;
0325   assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6));  // requires fma on CPU
0326 
0327   std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl;
0328   std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl;
0329   std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret->cov << std::endl;
0330   std::cout << "Fitted cov (LineFit): GPU\n" << line_fit_resultsGPUret->cov << std::endl;
0331 }
0332 
0333 int main(int argc, char* argv[]) {
0334   cms::cudatest::requireDevices();
0335 
0336   testFit<4>();
0337   testFit<3>();
0338   testFit<5>();
0339 
0340   std::cout << "TEST FIT, NO ERRORS" << std::endl;
0341 
0342   return 0;
0343 }