Warning, /RecoTracker/PixelTrackFitting/test/testEigenGPU.cu is written in an unsupported language. File is not indexed.
0001 #include <iostream>
0002
0003 #include <Eigen/Core>
0004 #include <Eigen/Eigenvalues>
0005
0006 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0007 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0008
0009 #ifdef USE_BL
0010 #include "RecoTracker/PixelTrackFitting/interface/BrokenLine.h"
0011 #else
0012 #include "RecoTracker/PixelTrackFitting/interface/RiemannFit.h"
0013 #endif
0014
0015 #include "test_common.h"
0016
0017 using namespace Eigen;
0018
0019 namespace riemannFit {
0020 constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; }
0021 constexpr uint32_t stride() { return maxNumberOfTracks(); }
0022 // hits
0023 template <int N>
0024 using Matrix3xNd = Eigen::Matrix<double, 3, N>;
0025 template <int N>
0026 using Map3xNd = Eigen::Map<Matrix3xNd<N>, 0, Eigen::Stride<3 * stride(), stride()>>;
0027 // errors
0028 template <int N>
0029 using Matrix6xNf = Eigen::Matrix<float, 6, N>;
0030 template <int N>
0031 using Map6xNf = Eigen::Map<Matrix6xNf<N>, 0, Eigen::Stride<6 * stride(), stride()>>;
0032 // fast fit
0033 using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride()>>;
0034
0035 } // namespace riemannFit
0036
0037 template <int N>
0038 __global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) {
0039 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0040 riemannFit::Map3xNd<N> hits(phits + i, 3, 4);
0041 riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, 4);
0042 if (i != 0)
0043 return;
0044 printf("GPU sizes %lu %lu %lu %lu %lu\n",
0045 sizeof(hits[i]),
0046 sizeof(hits_ge[i]),
0047 sizeof(Vector4d),
0048 sizeof(riemannFit::LineFit),
0049 sizeof(riemannFit::CircleFit));
0050 }
0051
0052 template <int N>
0053 __global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) {
0054 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0055 riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0056 riemannFit::Map4d result(presults + i, 4);
0057 #ifdef USE_BL
0058 brokenline::fastFit(hits, result);
0059 #else
0060 riemannFit::fastFit(hits, result);
0061 #endif
0062 }
0063
0064 #ifdef USE_BL
0065
0066 template <int N>
0067 __global__ void kernelBrokenLineFit(double* __restrict__ phits,
0068 float* __restrict__ phits_ge,
0069 double* __restrict__ pfast_fit_input,
0070 double B,
0071 riemannFit::CircleFit* circle_fit,
0072 riemannFit::LineFit* line_fit) {
0073 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0074 riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0075 riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
0076 riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0077
0078 brokenline::PreparedBrokenLineData<N> data;
0079 riemannFit::Matrix3d Jacob;
0080
0081 auto& line_fit_results = line_fit[i];
0082 auto& circle_fit_results = circle_fit[i];
0083
0084 brokenline::prepareBrokenLineData(hits, fast_fit_input, B, data);
0085 brokenline::lineFit(hits_ge, fast_fit_input, B, data, line_fit_results);
0086 brokenline::circleFit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results);
0087 Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
0088 -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
0089 circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
0090 circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
0091
0092 #ifdef TEST_DEBUG
0093 if (0 == i) {
0094 printf("Circle param %f,%f,%f\n", circle_fit[i].par(0), circle_fit[i].par(1), circle_fit[i].par(2));
0095 }
0096 #endif
0097 }
0098
0099 #else
0100
0101 template <int N>
0102 __global__ void kernel_CircleFit(double* __restrict__ phits,
0103 float* __restrict__ phits_ge,
0104 double* __restrict__ pfast_fit_input,
0105 double B,
0106 riemannFit::CircleFit* circle_fit_resultsGPU) {
0107 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0108 riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0109 riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
0110 riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0111
0112 constexpr auto n = N;
0113
0114 riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, n).colwise().norm());
0115 riemannFit::Matrix2Nd<N> hits_cov = MatrixXd::Zero(2 * n, 2 * n);
0116 riemannFit::loadCovariance2D(hits_ge, hits_cov);
0117
0118 #ifdef TEST_DEBUG
0119 if (0 == i) {
0120 printf("hits %f, %f\n", hits.block(0, 0, 2, n)(0, 0), hits.block(0, 0, 2, n)(0, 1));
0121 printf("hits %f, %f\n", hits.block(0, 0, 2, n)(1, 0), hits.block(0, 0, 2, n)(1, 1));
0122 printf("fast_fit_input(0): %f\n", fast_fit_input(0));
0123 printf("fast_fit_input(1): %f\n", fast_fit_input(1));
0124 printf("fast_fit_input(2): %f\n", fast_fit_input(2));
0125 printf("fast_fit_input(3): %f\n", fast_fit_input(3));
0126 printf("rad(0,0): %f\n", rad(0, 0));
0127 printf("rad(1,1): %f\n", rad(1, 1));
0128 printf("rad(2,2): %f\n", rad(2, 2));
0129 printf("hits_cov(0,0): %f\n", (*hits_cov)(0, 0));
0130 printf("hits_cov(1,1): %f\n", (*hits_cov)(1, 1));
0131 printf("hits_cov(2,2): %f\n", (*hits_cov)(2, 2));
0132 printf("hits_cov(11,11): %f\n", (*hits_cov)(11, 11));
0133 printf("B: %f\n", B);
0134 }
0135 #endif
0136 circle_fit_resultsGPU[i] = riemannFit::circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true);
0137 #ifdef TEST_DEBUG
0138 if (0 == i) {
0139 printf("Circle param %f,%f,%f\n",
0140 circle_fit_resultsGPU[i].par(0),
0141 circle_fit_resultsGPU[i].par(1),
0142 circle_fit_resultsGPU[i].par(2));
0143 }
0144 #endif
0145 }
0146
0147 template <int N>
0148 __global__ void kernelLineFit(double* __restrict__ phits,
0149 float* __restrict__ phits_ge,
0150 double B,
0151 riemannFit::CircleFit* circle_fit,
0152 double* __restrict__ pfast_fit_input,
0153 riemannFit::LineFit* line_fit) {
0154 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0155 riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0156 riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
0157 riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0158 line_fit[i] = riemannFit::lineFit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true);
0159 }
0160 #endif
0161
0162 template <typename M3xN, typename M6xN>
0163 __device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) {
0164 constexpr uint32_t N = M3xN::ColsAtCompileTime;
0165
0166 if (N == 5) {
0167 hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212,
0168 -10.980247, -23.162731, -32.759060, -38.061260, -47.518867;
0169 hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05;
0170 hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06;
0171 hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07;
0172 hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08;
0173 hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07;
0174 return;
0175 }
0176
0177 if (N > 3)
0178 hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793;
0179 else
0180 hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808;
0181
0182 hits_ge.col(0)[0] = 7.14652e-06;
0183 hits_ge.col(1)[0] = 2.15789e-06;
0184 hits_ge.col(2)[0] = 1.63328e-06;
0185 if (N > 3)
0186 hits_ge.col(3)[0] = 6.27919e-06;
0187 hits_ge.col(0)[2] = 6.10348e-06;
0188 hits_ge.col(1)[2] = 2.08211e-06;
0189 hits_ge.col(2)[2] = 1.61672e-06;
0190 if (N > 3)
0191 hits_ge.col(3)[2] = 6.28081e-06;
0192 hits_ge.col(0)[5] = 5.184e-05;
0193 hits_ge.col(1)[5] = 1.444e-05;
0194 hits_ge.col(2)[5] = 6.25e-06;
0195 if (N > 3)
0196 hits_ge.col(3)[5] = 3.136e-05;
0197 hits_ge.col(0)[1] = -5.60077e-06;
0198 hits_ge.col(1)[1] = -1.11936e-06;
0199 hits_ge.col(2)[1] = -6.24945e-07;
0200 if (N > 3)
0201 hits_ge.col(3)[1] = -5.28e-06;
0202 }
0203
0204 template <int N>
0205 __global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) {
0206 auto i = blockIdx.x * blockDim.x + threadIdx.x;
0207 riemannFit::Map3xNd<N> hits(phits + i, 3, N);
0208 riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
0209 hits_ge = MatrixXf::Zero(6, N);
0210 fillHitsAndHitsCov(hits, hits_ge);
0211 }
0212
0213 template <int N>
0214 void testFit() {
0215 constexpr double B = 0.0113921;
0216 riemannFit::Matrix3xNd<N> hits;
0217 riemannFit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);
0218 double* hitsGPU = nullptr;
0219 ;
0220 float* hits_geGPU = nullptr;
0221 double* fast_fit_resultsGPU = nullptr;
0222 double* fast_fit_resultsGPUret = new double[riemannFit::maxNumberOfTracks() * sizeof(Vector4d)];
0223 riemannFit::CircleFit* circle_fit_resultsGPU = nullptr;
0224 riemannFit::CircleFit* circle_fit_resultsGPUret = new riemannFit::CircleFit();
0225 riemannFit::LineFit* line_fit_resultsGPU = nullptr;
0226 riemannFit::LineFit* line_fit_resultsGPUret = new riemannFit::LineFit();
0227
0228 fillHitsAndHitsCov(hits, hits_ge);
0229
0230 std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' '
0231 << sizeof(riemannFit::LineFit) << ' ' << sizeof(riemannFit::CircleFit) << std::endl;
0232
0233 std::cout << "Generated hits:\n" << hits << std::endl;
0234 std::cout << "Generated cov:\n" << hits_ge << std::endl;
0235
0236 // FAST_FIT_CPU
0237 #ifdef USE_BL
0238 Vector4d fast_fit_results;
0239 brokenline::fastFit(hits, fast_fit_results);
0240 #else
0241 Vector4d fast_fit_results;
0242 riemannFit::fastFit(hits, fast_fit_results);
0243 #endif
0244 std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl;
0245
0246 // for timing purposes we fit 4096 tracks
0247 constexpr uint32_t Ntracks = 4096;
0248 cudaCheck(cudaMalloc(&hitsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix3xNd<N>)));
0249 cudaCheck(cudaMalloc(&hits_geGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix6xNf<N>)));
0250 cudaCheck(cudaMalloc(&fast_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
0251 cudaCheck(cudaMalloc(&line_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
0252 cudaCheck(cudaMalloc(&circle_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::CircleFit)));
0253
0254 cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
0255 cudaCheck(cudaMemset(line_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
0256
0257 kernelPrintSizes<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
0258 kernelFillHitsAndHitsCov<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
0259
0260 // FAST_FIT GPU
0261 kernelFastFit<N><<<Ntracks / 64, 64>>>(hitsGPU, fast_fit_resultsGPU);
0262 cudaDeviceSynchronize();
0263
0264 cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
0265 fast_fit_resultsGPU,
0266 riemannFit::maxNumberOfTracks() * sizeof(Vector4d),
0267 cudaMemcpyDeviceToHost));
0268 riemannFit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
0269 std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
0270 assert(isEqualFuzzy(fast_fit_results, fast_fit));
0271
0272 #ifdef USE_BL
0273 // CIRCLE AND LINE FIT CPU
0274 brokenline::PreparedBrokenLineData<N> data;
0275 brokenline::karimaki_circle_fit circle_fit_results;
0276 riemannFit::LineFit line_fit_results;
0277 riemannFit::Matrix3d Jacob;
0278 brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data);
0279 brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results);
0280 brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
0281 Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
0282 -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
0283 circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
0284 circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
0285
0286 // fit on GPU
0287 kernelBrokenLineFit<N>
0288 <<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU);
0289 cudaDeviceSynchronize();
0290
0291 #else
0292 // CIRCLE_FIT CPU
0293 riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
0294
0295 riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
0296 riemannFit::loadCovariance2D(hits_ge, hits_cov);
0297 riemannFit::CircleFit circle_fit_results =
0298 riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
0299
0300 // CIRCLE_FIT GPU
0301 kernel_CircleFit<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU);
0302 cudaDeviceSynchronize();
0303
0304 // LINE_FIT CPU
0305 riemannFit::LineFit line_fit_results =
0306 riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
0307
0308 kernelLineFit<N>
0309 <<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU);
0310 cudaDeviceSynchronize();
0311 #endif
0312
0313 std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;
0314
0315 cudaCheck(cudaMemcpy(
0316 circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(riemannFit::CircleFit), cudaMemcpyDeviceToHost));
0317 std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl;
0318 assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par));
0319
0320 std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl;
0321 // LINE_FIT GPU
0322 cudaCheck(
0323 cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(riemannFit::LineFit), cudaMemcpyDeviceToHost));
0324 std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl;
0325 assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6)); // requires fma on CPU
0326
0327 std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl;
0328 std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl;
0329 std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret->cov << std::endl;
0330 std::cout << "Fitted cov (LineFit): GPU\n" << line_fit_resultsGPUret->cov << std::endl;
0331 }
0332
0333 int main(int argc, char* argv[]) {
0334 cms::cudatest::requireDevices();
0335
0336 testFit<4>();
0337 testFit<3>();
0338 testFit<5>();
0339
0340 std::cout << "TEST FIT, NO ERRORS" << std::endl;
0341
0342 return 0;
0343 }