Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-18 03:42:18

0001 #include "KalmanUtilsMPlex.h"
0002 #include "PropagationMPlex.h"
0003 
0004 //#define DEBUG
0005 #include "Debug.h"
0006 
0007 #include "KalmanUtilsMPlex.icc"
0008 
0009 #include "RecoTracker/MkFitCore/interface/cms_common_macros.h"
0010 
0011 namespace {
0012   using namespace mkfit;
0013   using idx_t = Matriplex::idx_t;
0014 
0015   inline void MultResidualsAdd(const MPlexLH& A, const MPlexLV& B, const MPlex2V& C, MPlexLV& D) {
0016     // outPar = psPar + kalmanGain*(dPar)
0017     //   D    =   B         A         C
0018     // where right half of kalman gain is 0
0019 
0020     // XXX Regenerate with a script.
0021 
0022     MultResidualsAdd_imp(A, B, C, D, 0, NN);
0023   }
0024 
0025   inline void MultResidualsAdd(const MPlexL2& A, const MPlexLV& B, const MPlex2V& C, MPlexLV& D) {
0026     // outPar = psPar + kalmanGain*(dPar)
0027     //   D    =   B         A         C
0028 
0029     // XXX Regenerate with a script.
0030 
0031     typedef float T;
0032     const idx_t N = NN;
0033 
0034     const T* a = A.fArray;
0035     ASSUME_ALIGNED(a, 64);
0036     const T* b = B.fArray;
0037     ASSUME_ALIGNED(b, 64);
0038     const T* c = C.fArray;
0039     ASSUME_ALIGNED(c, 64);
0040     T* d = D.fArray;
0041     ASSUME_ALIGNED(d, 64);
0042 
0043 #pragma omp simd
0044     for (idx_t n = 0; n < N; ++n) {
0045       // generate loop (can also write it manually this time, it's not much)
0046       d[0 * N + n] = b[0 * N + n] + a[0 * N + n] * c[0 * N + n] + a[1 * N + n] * c[1 * N + n];
0047       d[1 * N + n] = b[1 * N + n] + a[2 * N + n] * c[0 * N + n] + a[3 * N + n] * c[1 * N + n];
0048       d[2 * N + n] = b[2 * N + n] + a[4 * N + n] * c[0 * N + n] + a[5 * N + n] * c[1 * N + n];
0049       d[3 * N + n] = b[3 * N + n] + a[6 * N + n] * c[0 * N + n] + a[7 * N + n] * c[1 * N + n];
0050       d[4 * N + n] = b[4 * N + n] + a[8 * N + n] * c[0 * N + n] + a[9 * N + n] * c[1 * N + n];
0051       d[5 * N + n] = b[5 * N + n] + a[10 * N + n] * c[0 * N + n] + a[11 * N + n] * c[1 * N + n];
0052     }
0053   }
0054 
0055   inline void MultResidualsAdd(const MPlex52& A, const MPlex5V& B, const MPlex2V& C, MPlex5V& D) {
0056     // outPar = psPar + kalmanGain*(dPar)
0057     //   D    =   B         A         C
0058 
0059     // XXX Regenerate with a script.
0060 
0061     typedef float T;
0062     const idx_t N = NN;
0063 
0064     const T* a = A.fArray;
0065     ASSUME_ALIGNED(a, 64);
0066     const T* b = B.fArray;
0067     ASSUME_ALIGNED(b, 64);
0068     const T* c = C.fArray;
0069     ASSUME_ALIGNED(c, 64);
0070     T* d = D.fArray;
0071     ASSUME_ALIGNED(d, 64);
0072 
0073 #pragma omp simd
0074     for (idx_t n = 0; n < N; ++n) {
0075       // generate loop (can also write it manually this time, it's not much)
0076       d[0 * N + n] = b[0 * N + n] + a[0 * N + n] * c[0 * N + n] + a[1 * N + n] * c[1 * N + n];
0077       d[1 * N + n] = b[1 * N + n] + a[2 * N + n] * c[0 * N + n] + a[3 * N + n] * c[1 * N + n];
0078       d[2 * N + n] = b[2 * N + n] + a[4 * N + n] * c[0 * N + n] + a[5 * N + n] * c[1 * N + n];
0079       d[3 * N + n] = b[3 * N + n] + a[6 * N + n] * c[0 * N + n] + a[7 * N + n] * c[1 * N + n];
0080       d[4 * N + n] = b[4 * N + n] + a[8 * N + n] * c[0 * N + n] + a[9 * N + n] * c[1 * N + n];
0081     }
0082   }
0083 
0084   //------------------------------------------------------------------------------
0085 
0086   inline void Chi2Similarity(const MPlex2V& A,  //resPar
0087                              const MPlex2S& C,  //resErr
0088                              MPlexQF& D)        //outChi2
0089   {
0090     // outChi2 = (resPar) * resErr * (resPar)
0091     //   D     =    A      *    C   *      A
0092 
0093     typedef float T;
0094     const idx_t N = NN;
0095 
0096     const T* a = A.fArray;
0097     ASSUME_ALIGNED(a, 64);
0098     const T* c = C.fArray;
0099     ASSUME_ALIGNED(c, 64);
0100     T* d = D.fArray;
0101     ASSUME_ALIGNED(d, 64);
0102 
0103 #pragma omp simd
0104     for (idx_t n = 0; n < N; ++n) {
0105       // generate loop (can also write it manually this time, it's not much)
0106       d[0 * N + n] = c[0 * N + n] * a[0 * N + n] * a[0 * N + n] + c[2 * N + n] * a[1 * N + n] * a[1 * N + n] +
0107                      2 * (c[1 * N + n] * a[1 * N + n] * a[0 * N + n]);
0108     }
0109   }
0110 
0111   //------------------------------------------------------------------------------
0112 
0113   inline void AddIntoUpperLeft3x3(const MPlexLS& A, const MPlexHS& B, MPlexHS& C) {
0114     // The rest of matrix is left untouched.
0115 
0116     typedef float T;
0117     const idx_t N = NN;
0118 
0119     const T* a = A.fArray;
0120     ASSUME_ALIGNED(a, 64);
0121     const T* b = B.fArray;
0122     ASSUME_ALIGNED(b, 64);
0123     T* c = C.fArray;
0124     ASSUME_ALIGNED(c, 64);
0125 
0126 #pragma omp simd
0127     for (idx_t n = 0; n < N; ++n) {
0128       c[0 * N + n] = a[0 * N + n] + b[0 * N + n];
0129       c[1 * N + n] = a[1 * N + n] + b[1 * N + n];
0130       c[2 * N + n] = a[2 * N + n] + b[2 * N + n];
0131       c[3 * N + n] = a[3 * N + n] + b[3 * N + n];
0132       c[4 * N + n] = a[4 * N + n] + b[4 * N + n];
0133       c[5 * N + n] = a[5 * N + n] + b[5 * N + n];
0134     }
0135   }
0136 
0137   inline void AddIntoUpperLeft2x2(const MPlexLS& A, const MPlexHS& B, MPlex2S& C) {
0138     // The rest of matrix is left untouched.
0139 
0140     typedef float T;
0141     const idx_t N = NN;
0142 
0143     const T* a = A.fArray;
0144     ASSUME_ALIGNED(a, 64);
0145     const T* b = B.fArray;
0146     ASSUME_ALIGNED(b, 64);
0147     T* c = C.fArray;
0148     ASSUME_ALIGNED(c, 64);
0149 
0150 #pragma omp simd
0151     for (idx_t n = 0; n < N; ++n) {
0152       c[0 * N + n] = a[0 * N + n] + b[0 * N + n];
0153       c[1 * N + n] = a[1 * N + n] + b[1 * N + n];
0154       c[2 * N + n] = a[2 * N + n] + b[2 * N + n];
0155     }
0156   }
0157 
0158   //------------------------------------------------------------------------------
0159 
0160   inline void SubtractFirst3(const MPlexHV& A, const MPlexLV& B, MPlexHV& C) {
0161     // The rest of matrix is left untouched.
0162 
0163     typedef float T;
0164     const idx_t N = NN;
0165 
0166     const T* a = A.fArray;
0167     ASSUME_ALIGNED(a, 64);
0168     const T* b = B.fArray;
0169     ASSUME_ALIGNED(b, 64);
0170     T* c = C.fArray;
0171     ASSUME_ALIGNED(c, 64);
0172 
0173 #pragma omp simd
0174     for (idx_t n = 0; n < N; ++n) {
0175       c[0 * N + n] = a[0 * N + n] - b[0 * N + n];
0176       c[1 * N + n] = a[1 * N + n] - b[1 * N + n];
0177       c[2 * N + n] = a[2 * N + n] - b[2 * N + n];
0178     }
0179   }
0180 
0181   inline void SubtractFirst2(const MPlexHV& A, const MPlexLV& B, MPlex2V& C) {
0182     // The rest of matrix is left untouched.
0183 
0184     typedef float T;
0185     const idx_t N = NN;
0186 
0187     const T* a = A.fArray;
0188     ASSUME_ALIGNED(a, 64);
0189     const T* b = B.fArray;
0190     ASSUME_ALIGNED(b, 64);
0191     T* c = C.fArray;
0192     ASSUME_ALIGNED(c, 64);
0193 
0194 #pragma omp simd
0195     for (idx_t n = 0; n < N; ++n) {
0196       c[0 * N + n] = a[0 * N + n] - b[0 * N + n];
0197       c[1 * N + n] = a[1 * N + n] - b[1 * N + n];
0198     }
0199   }
0200 
0201   //==============================================================================
0202 
0203   inline void ProjectResErr(const MPlexQF& A00, const MPlexQF& A01, const MPlexHS& B, MPlexHH& C) {
0204     // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym
0205 
0206     // Based on script generation and adapted to custom sizes.
0207 
0208     typedef float T;
0209     const idx_t N = NN;
0210 
0211     const T* a00 = A00.fArray;
0212     ASSUME_ALIGNED(a00, 64);
0213     const T* a01 = A01.fArray;
0214     ASSUME_ALIGNED(a01, 64);
0215     const T* b = B.fArray;
0216     ASSUME_ALIGNED(b, 64);
0217     T* c = C.fArray;
0218     ASSUME_ALIGNED(c, 64);
0219 
0220 #pragma omp simd
0221     for (int n = 0; n < N; ++n) {
0222       c[0 * N + n] = a00[n] * b[0 * N + n] + a01[n] * b[1 * N + n];
0223       c[1 * N + n] = a00[n] * b[1 * N + n] + a01[n] * b[2 * N + n];
0224       c[2 * N + n] = a00[n] * b[3 * N + n] + a01[n] * b[4 * N + n];
0225       c[3 * N + n] = b[3 * N + n];
0226       c[4 * N + n] = b[4 * N + n];
0227       c[5 * N + n] = b[5 * N + n];
0228       c[6 * N + n] = a01[n] * b[0 * N + n] - a00[n] * b[1 * N + n];
0229       c[7 * N + n] = a01[n] * b[1 * N + n] - a00[n] * b[2 * N + n];
0230       c[8 * N + n] = a01[n] * b[3 * N + n] - a00[n] * b[4 * N + n];
0231     }
0232   }
0233 
0234   inline void ProjectResErrTransp(const MPlexQF& A00, const MPlexQF& A01, const MPlexHH& B, MPlex2S& C) {
0235     // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3
0236 
0237     // Based on script generation and adapted to custom sizes.
0238 
0239     typedef float T;
0240     const idx_t N = NN;
0241 
0242     const T* a00 = A00.fArray;
0243     ASSUME_ALIGNED(a00, 64);
0244     const T* a01 = A01.fArray;
0245     ASSUME_ALIGNED(a01, 64);
0246     const T* b = B.fArray;
0247     ASSUME_ALIGNED(b, 64);
0248     T* c = C.fArray;
0249     ASSUME_ALIGNED(c, 64);
0250 
0251 #pragma omp simd
0252     for (int n = 0; n < N; ++n) {
0253       c[0 * N + n] = b[0 * N + n] * a00[n] + b[1 * N + n] * a01[n];
0254       c[1 * N + n] = b[3 * N + n] * a00[n] + b[4 * N + n] * a01[n];
0255       c[2 * N + n] = b[5 * N + n];
0256     }
0257   }
0258 
0259   inline void RotateResidualsOnTangentPlane(const MPlexQF& R00,  //r00
0260                                             const MPlexQF& R01,  //r01
0261                                             const MPlexHV& A,    //res_glo
0262                                             MPlex2V& B)          //res_loc
0263   {
0264     RotateResidualsOnTangentPlane_impl(R00, R01, A, B, 0, NN);
0265   }
0266 
0267   //==============================================================================
0268 
0269   /*
0270   inline void ProjectResErr(const MPlex2H& A, const MPlexHS& B, MPlex2H& C) {
0271     // C = A * B, C is 2x3, A is 2x3 , B is 3x3 sym
0272 
0273     //
0274     // A 0 1 2
0275     //   3 4 5
0276     // B 0 1 3
0277     //   1 2 4
0278     //   3 4 5
0279     //
0280 
0281     typedef float T;
0282     const idx_t N = NN;
0283 
0284     const T* a = A.fArray;
0285     ASSUME_ALIGNED(a, 64);
0286     const T* b = B.fArray;
0287     ASSUME_ALIGNED(b, 64);
0288     T* c = C.fArray;
0289     ASSUME_ALIGNED(c, 64);
0290 
0291 #pragma omp simd
0292     for (int n = 0; n < N; ++n) {
0293       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n];
0294       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[2 * N + n] + a[2 * N + n] * b[4 * N + n];
0295       c[2 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[5 * N + n];
0296       c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n] + a[5 * N + n] * b[3 * N + n];
0297       c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
0298       c[5 * N + n] = a[3 * N + n] * b[3 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[5 * N + n];
0299     }
0300   }
0301   */
0302 
0303   // inline void ProjectResErr(const MPlex2H& A, const MPlexLS& B, MPlex2H& C) {
0304   template <class T1, class T2>
0305   inline void ProjectResErr(const T1& A, const T2& B, MPlex2H& C) {
0306     // C = A * B, C is 2x3, A is 2x3 , B is 3x3 sym
0307 
0308     /*
0309     A 0 1 2
0310       3 4 5
0311     B 0 1 3
0312       1 2 4
0313       3 4 5
0314     */
0315 
0316     typedef float T;
0317     const idx_t N = NN;
0318 
0319     const T* a = A.fArray;
0320     ASSUME_ALIGNED(a, 64);
0321     const T* b = B.fArray;
0322     ASSUME_ALIGNED(b, 64);
0323     T* c = C.fArray;
0324     ASSUME_ALIGNED(c, 64);
0325 
0326 #pragma omp simd
0327     for (int n = 0; n < N; ++n) {
0328       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n];
0329       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[2 * N + n] + a[2 * N + n] * b[4 * N + n];
0330       c[2 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[5 * N + n];
0331       c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n] + a[5 * N + n] * b[3 * N + n];
0332       c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
0333       c[5 * N + n] = a[3 * N + n] * b[3 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[5 * N + n];
0334     }
0335   }
0336 
0337   //inline void ProjectResErrTransp(const MPlex2H& A, const MPlex2H& B, MPlex2S& C) {
0338   template <class T1>
0339   inline void ProjectResErrTransp(const T1& A, const MPlex2H& B, MPlex2S& C) {
0340     // C = B * A^T, C is 2x2 sym, A is 2x3 (A^T is 3x2), B is 2x3
0341 
0342     /*
0343     B   0 1 2
0344         3 4 5
0345     A^T 0 3
0346         1 4
0347         2 5
0348     */
0349 
0350     typedef float T;
0351     const idx_t N = NN;
0352 
0353     const T* a = A.fArray;
0354     ASSUME_ALIGNED(a, 64);
0355     const T* b = B.fArray;
0356     ASSUME_ALIGNED(b, 64);
0357     T* c = C.fArray;
0358     ASSUME_ALIGNED(c, 64);
0359 
0360 #pragma omp simd
0361     for (int n = 0; n < N; ++n) {
0362       c[0 * N + n] = b[0 * N + n] * a[0 * N + n] + b[1 * N + n] * a[1 * N + n] + b[2 * N + n] * a[2 * N + n];
0363       c[1 * N + n] = b[0 * N + n] * a[3 * N + n] + b[1 * N + n] * a[4 * N + n] + b[2 * N + n] * a[5 * N + n];
0364       c[2 * N + n] = b[3 * N + n] * a[3 * N + n] + b[4 * N + n] * a[4 * N + n] + b[5 * N + n] * a[5 * N + n];
0365     }
0366   }
0367 
0368   inline void RotateVectorOnPlane(const MPlexHH& R, const MPlexHV& A, MPlexHV& B) {
0369     // typedef float T;
0370     // const idx_t N = NN;
0371 
0372     // const T* a = A.fArray;
0373     // ASSUME_ALIGNED(a, 64);
0374     // T* b = B.fArray;
0375     // ASSUME_ALIGNED(b, 64);
0376     // const T* r = R.fArray;
0377     // ASSUME_ALIGNED(r, 64);
0378 
0379 #pragma omp simd
0380     for (int n = 0; n < NN; ++n) {
0381       B(n, 0, 0) = R(n, 0, 0) * A(n, 0, 0) + R(n, 0, 1) * A(n, 1, 0) + R(n, 0, 2) * A(n, 2, 0);
0382       B(n, 1, 0) = R(n, 1, 0) * A(n, 0, 0) + R(n, 1, 1) * A(n, 1, 0) + R(n, 1, 2) * A(n, 2, 0);
0383       B(n, 2, 0) = R(n, 2, 0) * A(n, 0, 0) + R(n, 2, 1) * A(n, 1, 0) + R(n, 2, 2) * A(n, 2, 0);
0384     }
0385   }
0386 
0387   inline void RotateVectorOnPlaneTransp(const MPlexHH& R, const MPlexHV& A, MPlexHV& B) {
0388     // typedef float T;
0389     // const idx_t N = NN;
0390 
0391     // const T* a = A.fArray;
0392     // ASSUME_ALIGNED(a, 64);
0393     // T* b = B.fArray;
0394     // ASSUME_ALIGNED(b, 64);
0395     // const T* r = R.fArray;
0396     // ASSUME_ALIGNED(r, 64);
0397 
0398 #pragma omp simd
0399     for (int n = 0; n < NN; ++n) {
0400       B(n, 0, 0) = R(n, 0, 0) * A(n, 0, 0) + R(n, 1, 0) * A(n, 1, 0) + R(n, 2, 0) * A(n, 2, 0);
0401       B(n, 1, 0) = R(n, 0, 1) * A(n, 0, 0) + R(n, 1, 1) * A(n, 1, 0) + R(n, 2, 1) * A(n, 2, 0);
0402       B(n, 2, 0) = R(n, 0, 2) * A(n, 0, 0) + R(n, 1, 2) * A(n, 1, 0) + R(n, 2, 2) * A(n, 2, 0);
0403     }
0404   }
0405 
0406   template <typename T1, typename T2>
0407   void RotateResidualsOnPlane(const T1& R,  //prj     - at least MPlex_2_3
0408                               const T2& A,  //res_glo - at least MPlex_3_1 (vector)
0409                               MPlex2V& B)   //res_loc - fixed as MPlex_2_1 (vector)
0410   {
0411 #pragma omp simd
0412     for (int n = 0; n < NN; ++n) {
0413       B(n, 0, 0) = R(n, 0, 0) * A(n, 0, 0) + R(n, 0, 1) * A(n, 1, 0) + R(n, 0, 2) * A(n, 2, 0);
0414       B(n, 1, 0) = R(n, 1, 0) * A(n, 0, 0) + R(n, 1, 1) * A(n, 1, 0) + R(n, 1, 2) * A(n, 2, 0);
0415     }
0416   }
0417 
0418   inline void KalmanHTG(const MPlexQF& A00, const MPlexQF& A01, const MPlex2S& B, MPlexHH& C) {
0419     // HTG  = rot * res_loc
0420     //   C  =  A  *    B
0421 
0422     // Based on script generation and adapted to custom sizes.
0423 
0424     typedef float T;
0425     const idx_t N = NN;
0426 
0427     const T* a00 = A00.fArray;
0428     ASSUME_ALIGNED(a00, 64);
0429     const T* a01 = A01.fArray;
0430     ASSUME_ALIGNED(a01, 64);
0431     const T* b = B.fArray;
0432     ASSUME_ALIGNED(b, 64);
0433     T* c = C.fArray;
0434     ASSUME_ALIGNED(c, 64);
0435 
0436 #pragma omp simd
0437     for (int n = 0; n < N; ++n) {
0438       c[0 * N + n] = a00[n] * b[0 * N + n];
0439       c[1 * N + n] = a00[n] * b[1 * N + n];
0440       c[2 * N + n] = 0.;
0441       c[3 * N + n] = a01[n] * b[0 * N + n];
0442       c[4 * N + n] = a01[n] * b[1 * N + n];
0443       c[5 * N + n] = 0.;
0444       c[6 * N + n] = b[1 * N + n];
0445       c[7 * N + n] = b[2 * N + n];
0446       c[8 * N + n] = 0.;
0447     }
0448   }
0449 
0450   inline void KalmanGain(const MPlexLS& A, const MPlexHH& B, MPlexLH& C) {
0451     // C = A * B, C is 6x3, A is 6x6 sym , B is 3x3
0452 
0453     typedef float T;
0454     const idx_t N = NN;
0455 
0456     const T* a = A.fArray;
0457     ASSUME_ALIGNED(a, 64);
0458     const T* b = B.fArray;
0459     ASSUME_ALIGNED(b, 64);
0460     T* c = C.fArray;
0461     ASSUME_ALIGNED(c, 64);
0462 
0463 #pragma omp simd
0464     for (int n = 0; n < N; ++n) {
0465       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[3 * N + n] + a[3 * N + n] * b[6 * N + n];
0466       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[4 * N + n] + a[3 * N + n] * b[7 * N + n];
0467       c[2 * N + n] = 0;
0468       c[3 * N + n] = a[1 * N + n] * b[0 * N + n] + a[2 * N + n] * b[3 * N + n] + a[4 * N + n] * b[6 * N + n];
0469       c[4 * N + n] = a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[4 * N + n] + a[4 * N + n] * b[7 * N + n];
0470       c[5 * N + n] = 0;
0471       c[6 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[6 * N + n];
0472       c[7 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[7 * N + n];
0473       c[8 * N + n] = 0;
0474       c[9 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[6 * N + n];
0475       c[10 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[4 * N + n] + a[8 * N + n] * b[7 * N + n];
0476       c[11 * N + n] = 0;
0477       c[12 * N + n] = a[10 * N + n] * b[0 * N + n] + a[11 * N + n] * b[3 * N + n] + a[12 * N + n] * b[6 * N + n];
0478       c[13 * N + n] = a[10 * N + n] * b[1 * N + n] + a[11 * N + n] * b[4 * N + n] + a[12 * N + n] * b[7 * N + n];
0479       c[14 * N + n] = 0;
0480       c[15 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[3 * N + n] + a[17 * N + n] * b[6 * N + n];
0481       c[16 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[4 * N + n] + a[17 * N + n] * b[7 * N + n];
0482       c[17 * N + n] = 0;
0483     }
0484   }
0485 
0486   inline void KalmanHTG(const MPlex2H& A, const MPlex2S& B, MPlexH2& C) {
0487     // HTG  = prj^T * res_loc
0488     //   C  =  A^T  *   B
0489 
0490     /*
0491     A^T 0 3
0492         1 4
0493         2 5
0494     B 0 1
0495       1 2
0496     C 0 1
0497       2 3
0498       4 5
0499     */
0500 
0501     typedef float T;
0502     const idx_t N = NN;
0503 
0504     const T* a = A.fArray;
0505     ASSUME_ALIGNED(a, 64);
0506     const T* b = B.fArray;
0507     ASSUME_ALIGNED(b, 64);
0508     T* c = C.fArray;
0509     ASSUME_ALIGNED(c, 64);
0510 
0511 #pragma omp simd
0512     for (int n = 0; n < N; ++n) {
0513       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[3 * N + n] * b[1 * N + n];
0514       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[3 * N + n] * b[2 * N + n];
0515       c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n];
0516       c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n];
0517       c[4 * N + n] = a[2 * N + n] * b[0 * N + n] + a[5 * N + n] * b[1 * N + n];
0518       c[5 * N + n] = a[2 * N + n] * b[1 * N + n] + a[5 * N + n] * b[2 * N + n];
0519     }
0520   }
0521 
0522   inline void KalmanGain(const MPlexLS& A, const MPlexH2& B, MPlexL2& C) {
0523     // C = A * B, C is 6x2, A is 6x6 sym , B is 3x2 (6x2 but half of it is zeros)
0524 
0525     /*
0526       A 0  1  3  6 10 15
0527         1  2  4  7 11 16
0528         3  4  5  8 12 17
0529         6  7  8  9 13 18
0530        10 11 12 13 14 19
0531        15 16 17 18 19 20
0532       B 0  1
0533         2  3
0534     4  5
0535         X  X with X=0, so not even included in B
0536         X  X
0537         X  X
0538       C 0  1
0539         2  3
0540     4  5
0541         6  7
0542         8  9
0543        10 11
0544      */
0545 
0546     typedef float T;
0547     const idx_t N = NN;
0548 
0549     const T* a = A.fArray;
0550     ASSUME_ALIGNED(a, 64);
0551     const T* b = B.fArray;
0552     ASSUME_ALIGNED(b, 64);
0553     T* c = C.fArray;
0554     ASSUME_ALIGNED(c, 64);
0555 
0556 #pragma omp simd
0557     for (int n = 0; n < N; ++n) {
0558       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[2 * N + n] + a[3 * N + n] * b[4 * N + n];
0559       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[3 * N + n] + a[3 * N + n] * b[5 * N + n];
0560       c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[2 * N + n] * b[2 * N + n] + a[4 * N + n] * b[4 * N + n];
0561       c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n] + a[4 * N + n] * b[5 * N + n];
0562       c[4 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
0563       c[5 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[5 * N + n];
0564       c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[2 * N + n] + a[8 * N + n] * b[4 * N + n];
0565       c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[5 * N + n];
0566       c[8 * N + n] = a[10 * N + n] * b[0 * N + n] + a[11 * N + n] * b[2 * N + n] + a[12 * N + n] * b[4 * N + n];
0567       c[9 * N + n] = a[10 * N + n] * b[1 * N + n] + a[11 * N + n] * b[3 * N + n] + a[12 * N + n] * b[5 * N + n];
0568       c[10 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[2 * N + n] + a[17 * N + n] * b[4 * N + n];
0569       c[11 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[3 * N + n] + a[17 * N + n] * b[5 * N + n];
0570     }
0571   }
0572 
0573   inline void CovXYconstrain(const MPlexQF& R00, const MPlexQF& R01, const MPlexLS& Ci, MPlexLS& Co) {
0574     // C is transformed to align along y after rotation and rotated back
0575 
0576     typedef float T;
0577     const idx_t N = NN;
0578 
0579     const T* r00 = R00.fArray;
0580     ASSUME_ALIGNED(r00, 64);
0581     const T* r01 = R01.fArray;
0582     ASSUME_ALIGNED(r01, 64);
0583     const T* ci = Ci.fArray;
0584     ASSUME_ALIGNED(ci, 64);
0585     T* co = Co.fArray;
0586     ASSUME_ALIGNED(co, 64);
0587 
0588 #pragma omp simd
0589     for (int n = 0; n < N; ++n) {
0590       // a bit loopy to avoid temporaries
0591       co[0 * N + n] =
0592           r00[n] * r00[n] * ci[0 * N + n] + 2 * r00[n] * r01[n] * ci[1 * N + n] + r01[n] * r01[n] * ci[2 * N + n];
0593       co[1 * N + n] = r00[n] * r01[n] * co[0 * N + n];
0594       co[2 * N + n] = r01[n] * r01[n] * co[0 * N + n];
0595       co[0 * N + n] = r00[n] * r00[n] * co[0 * N + n];
0596 
0597       co[3 * N + n] = r00[n] * ci[3 * N + n] + r01[n] * ci[4 * N + n];
0598       co[4 * N + n] = r01[n] * co[3 * N + n];
0599       co[3 * N + n] = r00[n] * co[3 * N + n];
0600 
0601       co[6 * N + n] = r00[n] * ci[6 * N + n] + r01[n] * ci[7 * N + n];
0602       co[7 * N + n] = r01[n] * co[6 * N + n];
0603       co[6 * N + n] = r00[n] * co[6 * N + n];
0604 
0605       co[10 * N + n] = r00[n] * ci[10 * N + n] + r01[n] * ci[11 * N + n];
0606       co[11 * N + n] = r01[n] * co[10 * N + n];
0607       co[10 * N + n] = r00[n] * co[10 * N + n];
0608 
0609       co[15 * N + n] = r00[n] * ci[15 * N + n] + r01[n] * ci[16 * N + n];
0610       co[16 * N + n] = r01[n] * co[15 * N + n];
0611       co[15 * N + n] = r00[n] * co[15 * N + n];
0612     }
0613   }
0614 
0615   void KalmanGain(const MPlexLS& A, const MPlex2S& B, MPlexL2& C) {
0616     // C = A * B, C is 6x2, A is 6x6 sym , B is 2x2
0617 
0618     typedef float T;
0619     const idx_t N = NN;
0620 
0621     const T* a = A.fArray;
0622     ASSUME_ALIGNED(a, 64);
0623     const T* b = B.fArray;
0624     ASSUME_ALIGNED(b, 64);
0625     T* c = C.fArray;
0626     ASSUME_ALIGNED(c, 64);
0627 
0628 #include "KalmanGain62.ah"
0629   }
0630 
0631   inline void KHMult(const MPlexLH& A, const MPlexQF& B00, const MPlexQF& B01, MPlexLL& C) {
0632     // C = A * B, C is 6x6, A is 6x3 , B is 3x6
0633     KHMult_imp(A, B00, B01, C, 0, NN);
0634   }
0635 
0636   inline void KHMult(const MPlexL2& A, const MPlex2H& B, MPlexLL& C) {
0637     // C = A * B, C is 6x6, A is 6x2 , B is 2x3 (2x6 but half of it made of zeros)
0638 
0639     /*
0640     A 0  1
0641       2  3
0642       4  5
0643       6  7
0644       8  9
0645      10 11
0646     B  0  1  2  X  X  X with X=0 so not included in B
0647        3  4  5  X  X  X
0648     C  0  1  2  3  4  5
0649        6  7  8  9 10 11
0650       12 13 14 15 16 17
0651       18 19 20 21 22 23
0652       24 25 26 27 28 29
0653       30 31 32 33 34 34
0654     */
0655 
0656     // typedef float T;
0657     // const idx_t N = NN;
0658 
0659     // const T* a = A.fArray;
0660     // ASSUME_ALIGNED(a, 64);
0661     // const T* b = B.fArray;
0662     // ASSUME_ALIGNED(b, 64);
0663     // T* c = C.fArray;
0664     // ASSUME_ALIGNED(c, 64);
0665 
0666 #pragma omp simd
0667     for (int n = 0; n < NN; ++n) {
0668       C(n, 0, 0) = A(n, 0, 0) * B(n, 0, 0) + A(n, 0, 1) * B(n, 1, 0);
0669       C(n, 0, 1) = A(n, 0, 0) * B(n, 0, 1) + A(n, 0, 1) * B(n, 1, 1);
0670       C(n, 0, 2) = A(n, 0, 0) * B(n, 0, 2) + A(n, 0, 1) * B(n, 1, 2);
0671       C(n, 0, 3) = 0;
0672       C(n, 0, 4) = 0;
0673       C(n, 0, 5) = 0;
0674       C(n, 0, 6) = A(n, 1, 0) * B(n, 0, 0) + A(n, 1, 1) * B(n, 1, 0);
0675       C(n, 0, 7) = A(n, 1, 0) * B(n, 0, 1) + A(n, 1, 1) * B(n, 1, 1);
0676       C(n, 0, 8) = A(n, 1, 0) * B(n, 0, 2) + A(n, 1, 1) * B(n, 1, 2);
0677       C(n, 0, 9) = 0;
0678       C(n, 0, 10) = 0;
0679       C(n, 0, 11) = 0;
0680       C(n, 0, 12) = A(n, 2, 0) * B(n, 0, 0) + A(n, 2, 1) * B(n, 1, 0);
0681       C(n, 0, 13) = A(n, 2, 0) * B(n, 0, 1) + A(n, 2, 1) * B(n, 1, 1);
0682       C(n, 0, 14) = A(n, 2, 0) * B(n, 0, 2) + A(n, 2, 1) * B(n, 1, 2);
0683       C(n, 0, 15) = 0;
0684       C(n, 0, 16) = 0;
0685       C(n, 0, 17) = 0;
0686       C(n, 0, 18) = A(n, 3, 0) * B(n, 0, 0) + A(n, 3, 1) * B(n, 1, 0);
0687       C(n, 0, 19) = A(n, 3, 0) * B(n, 0, 1) + A(n, 3, 1) * B(n, 1, 1);
0688       C(n, 0, 20) = A(n, 3, 0) * B(n, 0, 2) + A(n, 3, 1) * B(n, 1, 2);
0689       C(n, 0, 21) = 0;
0690       C(n, 0, 22) = 0;
0691       C(n, 0, 23) = 0;
0692       C(n, 0, 24) = A(n, 4, 0) * B(n, 0, 0) + A(n, 4, 1) * B(n, 1, 0);
0693       C(n, 0, 25) = A(n, 4, 0) * B(n, 0, 1) + A(n, 4, 1) * B(n, 1, 1);
0694       C(n, 0, 26) = A(n, 4, 0) * B(n, 0, 2) + A(n, 4, 1) * B(n, 1, 2);
0695       C(n, 0, 27) = 0;
0696       C(n, 0, 28) = 0;
0697       C(n, 0, 29) = 0;
0698       C(n, 0, 30) = A(n, 5, 0) * B(n, 0, 0) + A(n, 5, 1) * B(n, 1, 0);
0699       C(n, 0, 31) = A(n, 5, 0) * B(n, 0, 1) + A(n, 5, 1) * B(n, 1, 1);
0700       C(n, 0, 32) = A(n, 5, 0) * B(n, 0, 2) + A(n, 5, 1) * B(n, 1, 2);
0701       C(n, 0, 33) = 0;
0702       C(n, 0, 34) = 0;
0703       C(n, 0, 35) = 0;
0704     }
0705   }
0706 
0707   inline void KHC(const MPlexLL& A, const MPlexLS& B, MPlexLS& C) {
0708     // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym
0709 
0710     typedef float T;
0711     const idx_t N = NN;
0712 
0713     const T* a = A.fArray;
0714     ASSUME_ALIGNED(a, 64);
0715     const T* b = B.fArray;
0716     ASSUME_ALIGNED(b, 64);
0717     T* c = C.fArray;
0718     ASSUME_ALIGNED(c, 64);
0719 
0720 #include "KHC.ah"
0721   }
0722 
0723   inline void KHC(const MPlexL2& A, const MPlexLS& B, MPlexLS& C) {
0724     // C = A * B, C is 6x6 sym, A is 6x2 , B is 6x6 sym
0725 
0726     typedef float T;
0727     const idx_t N = NN;
0728 
0729     const T* a = A.fArray;
0730     ASSUME_ALIGNED(a, 64);
0731     const T* b = B.fArray;
0732     ASSUME_ALIGNED(b, 64);
0733     T* c = C.fArray;
0734     ASSUME_ALIGNED(c, 64);
0735 
0736 #include "K62HC.ah"
0737   }
0738 
0739   void JacCCS2Loc(const MPlex55& A, const MPlex56& B, MPlex56& C) {
0740     typedef float T;
0741     const idx_t N = NN;
0742 
0743     const T* a = A.fArray;
0744     ASSUME_ALIGNED(a, 64);
0745     const T* b = B.fArray;
0746     ASSUME_ALIGNED(b, 64);
0747     T* c = C.fArray;
0748     ASSUME_ALIGNED(c, 64);
0749 
0750 #include "JacCCS2Loc.ah"
0751   }
0752 
0753   void PsErrLoc(const MPlex56& A, const MPlexLS& B, MPlex56& C) {
0754     // C = A * B
0755 
0756     typedef float T;
0757     const Matriplex::idx_t N = NN;
0758 
0759     const T* a = A.fArray;
0760     ASSUME_ALIGNED(a, 64);
0761     const T* b = B.fArray;
0762     ASSUME_ALIGNED(b, 64);
0763     T* c = C.fArray;
0764     ASSUME_ALIGNED(c, 64);
0765 
0766 #include "PsErrLoc.ah"
0767   }
0768 
0769   void PsErrLocTransp(const MPlex56& B, const MPlex56& A, MPlex5S& C) {
0770     // C = B * AT;
0771 
0772     typedef float T;
0773     const Matriplex::idx_t N = NN;
0774 
0775     const T* a = A.fArray;
0776     ASSUME_ALIGNED(a, 64);
0777     const T* b = B.fArray;
0778     ASSUME_ALIGNED(b, 64);
0779     T* c = C.fArray;
0780     ASSUME_ALIGNED(c, 64);
0781 
0782 #include "PsErrLocTransp.ah"
0783   }
0784 
0785   void PsErrLocUpd(const MPlex55& A, const MPlex5S& B, MPlex5S& C) {
0786     // C = A * B;
0787 
0788     typedef float T;
0789     const Matriplex::idx_t N = NN;
0790 
0791     const T* a = A.fArray;
0792     ASSUME_ALIGNED(a, 64);
0793     const T* b = B.fArray;
0794     ASSUME_ALIGNED(b, 64);
0795     T* c = C.fArray;
0796     ASSUME_ALIGNED(c, 64);
0797 
0798 #include "PsErrLocUpd.ah"
0799   }
0800 
0801   void JacLoc2CCS(const MPlex65& A, const MPlex55& B, MPlex65& C) {
0802     // C = A * B;
0803 
0804     typedef float T;
0805     const Matriplex::idx_t N = NN;
0806 
0807     const T* a = A.fArray;
0808     ASSUME_ALIGNED(a, 64);
0809     const T* b = B.fArray;
0810     ASSUME_ALIGNED(b, 64);
0811     T* c = C.fArray;
0812     ASSUME_ALIGNED(c, 64);
0813 
0814 #include "JacLoc2CCS.ah"
0815   }
0816 
0817   void OutErrCCS(const MPlex65& A, const MPlex5S& B, MPlex65& C) {
0818     // C = A * B
0819 
0820     typedef float T;
0821     const Matriplex::idx_t N = NN;
0822 
0823     const T* a = A.fArray;
0824     ASSUME_ALIGNED(a, 64);
0825     const T* b = B.fArray;
0826     ASSUME_ALIGNED(b, 64);
0827     T* c = C.fArray;
0828     ASSUME_ALIGNED(c, 64);
0829 
0830 #include "OutErrCCS.ah"
0831   }
0832 
0833   void OutErrCCSTransp(const MPlex65& B, const MPlex65& A, MPlexLS& C) {
0834     // C = B * AT;
0835 
0836     typedef float T;
0837     const Matriplex::idx_t N = NN;
0838 
0839     const T* a = A.fArray;
0840     ASSUME_ALIGNED(a, 64);
0841     const T* b = B.fArray;
0842     ASSUME_ALIGNED(b, 64);
0843     T* c = C.fArray;
0844     ASSUME_ALIGNED(c, 64);
0845 
0846 #include "OutErrCCSTransp.ah"
0847   }
0848 
0849   //Warning: MultFull is not vectorized!
0850   template <typename T1, typename T2, typename T3>
0851   void MultFull(const T1& A, int nia, int nja, const T2& B, int nib, int njb, T3& C, int nic, int njc) {
0852 #ifdef DEBUG
0853     assert(nja == nib);
0854     assert(nia == nic);
0855     assert(njb == njc);
0856 #endif
0857     for (int n = 0; n < NN; ++n) {
0858       for (int i = 0; i < nia; ++i) {
0859         for (int j = 0; j < njb; ++j) {
0860           C(n, i, j) = 0.;
0861           for (int k = 0; k < nja; ++k)
0862             C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, k, j);
0863         }
0864       }
0865     }
0866   }
0867 
0868   //Warning: MultTranspFull is not vectorized!
0869   // (careful about which one is transposed, I think rows and cols are swapped and the one that is transposed is B)
0870   template <typename T1, typename T2, typename T3>
0871   void MultTranspFull(const T1& A, int nia, int nja, const T2& B, int nib, int njb, T3& C, int nic, int njc) {
0872 #ifdef DEBUG
0873     assert(nja == njb);
0874     assert(nia == nic);
0875     assert(nib == njc);
0876 #endif
0877     for (int n = 0; n < NN; ++n) {
0878       for (int i = 0; i < nia; ++i) {
0879         for (int j = 0; j < nib; ++j) {
0880           C(n, i, j) = 0.;
0881           for (int k = 0; k < nja; ++k)
0882             C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, j, k);
0883         }
0884       }
0885     }
0886   }
0887 
0888 }  // namespace
0889 
0890 //==============================================================================
0891 // Kalman operations - common dummy variables
0892 //==============================================================================
0893 
0894 namespace {
0895   // Dummy variables for parameter consistency to kalmanOperation.
0896   // Through KalmanFilterOperation enum parameter it is guaranteed that
0897   // those will never get accessed in the code (read from or written into).
0898 
0899   CMS_SA_ALLOW mkfit::MPlexLS dummy_err;
0900   CMS_SA_ALLOW mkfit::MPlexLV dummy_par;
0901   CMS_SA_ALLOW mkfit::MPlexQF dummy_chi2;
0902 }  // namespace
0903 
0904 namespace mkfit {
0905 
0906   //==============================================================================
0907   // Kalman operations - Barrel
0908   //==============================================================================
0909 
0910   void kalmanUpdate(const MPlexLS& psErr,
0911                     const MPlexLV& psPar,
0912                     const MPlexHS& msErr,
0913                     const MPlexHV& msPar,
0914                     MPlexLS& outErr,
0915                     MPlexLV& outPar,
0916                     const int N_proc) {
0917     kalmanOperation(KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0918   }
0919 
0920   void kalmanPropagateAndUpdate(const MPlexLS& psErr,
0921                                 const MPlexLV& psPar,
0922                                 MPlexQI& Chg,
0923                                 const MPlexHS& msErr,
0924                                 const MPlexHV& msPar,
0925                                 MPlexLS& outErr,
0926                                 MPlexLV& outPar,
0927                                 MPlexQI& outFailFlag,
0928                                 const int N_proc,
0929                                 const PropagationFlags& propFlags,
0930                                 const bool propToHit) {
0931     if (propToHit) {
0932       MPlexLS propErr;
0933       MPlexLV propPar;
0934       MPlexQF msRad;
0935 #pragma omp simd
0936       for (int n = 0; n < NN; ++n) {
0937         msRad.At(n, 0, 0) = hipo(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0938       }
0939 
0940       propagateHelixToRMPlex(psErr, psPar, Chg, msRad, propErr, propPar, outFailFlag, N_proc, propFlags);
0941 
0942       kalmanOperation(
0943           KFO_Update_Params | KFO_Local_Cov, propErr, propPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0944     } else {
0945       kalmanOperation(
0946           KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0947     }
0948     for (int n = 0; n < NN; ++n) {
0949       if (n < N_proc && outPar.At(n, 3, 0) < 0) {
0950         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
0951         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
0952       }
0953     }
0954   }
0955 
0956   //------------------------------------------------------------------------------
0957 
0958   void kalmanComputeChi2(const MPlexLS& psErr,
0959                          const MPlexLV& psPar,
0960                          const MPlexQI& inChg,
0961                          const MPlexHS& msErr,
0962                          const MPlexHV& msPar,
0963                          MPlexQF& outChi2,
0964                          const int N_proc) {
0965     kalmanOperation(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0966   }
0967 
0968   void kalmanPropagateAndComputeChi2(const MPlexLS& psErr,
0969                                      const MPlexLV& psPar,
0970                                      const MPlexQI& inChg,
0971                                      const MPlexHS& msErr,
0972                                      const MPlexHV& msPar,
0973                                      MPlexQF& outChi2,
0974                                      MPlexLV& propPar,
0975                                      MPlexQI& outFailFlag,
0976                                      const int N_proc,
0977                                      const PropagationFlags& propFlags,
0978                                      const bool propToHit) {
0979     propPar = psPar;
0980     if (propToHit) {
0981       MPlexLS propErr;
0982       MPlexQF msRad;
0983 #pragma omp simd
0984       for (int n = 0; n < NN; ++n) {
0985         if (n < N_proc) {
0986           msRad.At(n, 0, 0) = hipo(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0987         } else {
0988           msRad.At(n, 0, 0) = 0.0f;
0989         }
0990       }
0991 
0992       propagateHelixToRMPlex(psErr, psPar, inChg, msRad, propErr, propPar, outFailFlag, N_proc, propFlags);
0993 
0994       kalmanOperation(KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0995     } else {
0996       kalmanOperation(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0997     }
0998   }
0999 
1000   //------------------------------------------------------------------------------
1001 
1002   void kalmanOperation(const int kfOp,
1003                        const MPlexLS& psErr,
1004                        const MPlexLV& psPar,
1005                        const MPlexHS& msErr,
1006                        const MPlexHV& msPar,
1007                        MPlexLS& outErr,
1008                        MPlexLV& outPar,
1009                        MPlexQF& outChi2,
1010                        const int N_proc) {
1011 #ifdef DEBUG
1012     {
1013       dmutex_guard;
1014       printf("psPar:\n");
1015       for (int i = 0; i < 6; ++i) {
1016         printf("%8f ", psPar.constAt(0, 0, i));
1017         printf("\n");
1018       }
1019       printf("\n");
1020       printf("psErr:\n");
1021       for (int i = 0; i < 6; ++i) {
1022         for (int j = 0; j < 6; ++j)
1023           printf("%8f ", psErr.constAt(0, i, j));
1024         printf("\n");
1025       }
1026       printf("\n");
1027       printf("msPar:\n");
1028       for (int i = 0; i < 3; ++i) {
1029         printf("%8f ", msPar.constAt(0, 0, i));
1030         printf("\n");
1031       }
1032       printf("\n");
1033       printf("msErr:\n");
1034       for (int i = 0; i < 3; ++i) {
1035         for (int j = 0; j < 3; ++j)
1036           printf("%8f ", msErr.constAt(0, i, j));
1037         printf("\n");
1038       }
1039       printf("\n");
1040     }
1041 #endif
1042 
1043     // Rotate global point on tangent plane to cylinder
1044     // Tangent point is half way between hit and propagate position
1045 
1046     // Rotation matrix
1047     //  rotT00  0  rotT01
1048     //  rotT01  0 -rotT00
1049     //     0    1    0
1050     // Minimize temporaries: only two float are needed!
1051 
1052     MPlexQF rotT00;
1053     MPlexQF rotT01;
1054     for (int n = 0; n < NN; ++n) {
1055       if (n < N_proc) {
1056         const float r = hipo(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
1057         rotT00.At(n, 0, 0) = -(msPar.constAt(n, 1, 0) + psPar.constAt(n, 1, 0)) / (2 * r);
1058         rotT01.At(n, 0, 0) = (msPar.constAt(n, 0, 0) + psPar.constAt(n, 0, 0)) / (2 * r);
1059       } else {
1060         rotT00.At(n, 0, 0) = 0.0f;
1061         rotT01.At(n, 0, 0) = 0.0f;
1062       }
1063     }
1064 
1065     MPlexHV res_glo;  //position residual in global coordinates
1066     SubtractFirst3(msPar, psPar, res_glo);
1067 
1068     MPlexHS resErr_glo;  //covariance sum in global position coordinates
1069     AddIntoUpperLeft3x3(psErr, msErr, resErr_glo);
1070 
1071     MPlex2V res_loc;  //position residual in local coordinates
1072     RotateResidualsOnTangentPlane(rotT00, rotT01, res_glo, res_loc);
1073     MPlex2S resErr_loc;  //covariance sum in local position coordinates
1074     MPlexHH tempHH;
1075     ProjectResErr(rotT00, rotT01, resErr_glo, tempHH);
1076     ProjectResErrTransp(rotT00, rotT01, tempHH, resErr_loc);
1077 
1078 #ifdef DEBUG
1079     {
1080       dmutex_guard;
1081       printf("res_glo:\n");
1082       for (int i = 0; i < 3; ++i) {
1083         printf("%8f ", res_glo.At(0, i, 0));
1084       }
1085       printf("\n");
1086       printf("resErr_glo:\n");
1087       for (int i = 0; i < 3; ++i) {
1088         for (int j = 0; j < 3; ++j)
1089           printf("%8f ", resErr_glo.At(0, i, j));
1090         printf("\n");
1091       }
1092       printf("\n");
1093       printf("res_loc:\n");
1094       for (int i = 0; i < 2; ++i) {
1095         printf("%8f ", res_loc.At(0, i, 0));
1096       }
1097       printf("\n");
1098       printf("tempHH:\n");
1099       for (int i = 0; i < 3; ++i) {
1100         for (int j = 0; j < 3; ++j)
1101           printf("%8f ", tempHH.At(0, i, j));
1102         printf("\n");
1103       }
1104       printf("\n");
1105       printf("resErr_loc:\n");
1106       for (int i = 0; i < 2; ++i) {
1107         for (int j = 0; j < 2; ++j)
1108           printf("%8f ", resErr_loc.At(0, i, j));
1109         printf("\n");
1110       }
1111       printf("\n");
1112     }
1113 #endif
1114 
1115     //invert the 2x2 matrix
1116     Matriplex::invertCramerSym(resErr_loc);
1117 
1118     if (kfOp & KFO_Calculate_Chi2) {
1119       Chi2Similarity(res_loc, resErr_loc, outChi2);
1120 
1121 #ifdef DEBUG
1122       {
1123         dmutex_guard;
1124         printf("resErr_loc (Inv):\n");
1125         for (int i = 0; i < 2; ++i) {
1126           for (int j = 0; j < 2; ++j)
1127             printf("%8f ", resErr_loc.At(0, i, j));
1128           printf("\n");
1129         }
1130         printf("\n");
1131         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
1132       }
1133 #endif
1134     }
1135 
1136     if (kfOp & KFO_Update_Params) {
1137       MPlexLS psErrLoc = psErr;
1138       if (kfOp & KFO_Local_Cov)
1139         CovXYconstrain(rotT00, rotT01, psErr, psErrLoc);
1140 
1141       MPlexLH K;                                      // kalman gain, fixme should be L2
1142       KalmanHTG(rotT00, rotT01, resErr_loc, tempHH);  // intermediate term to get kalman gain (H^T*G)
1143       KalmanGain(psErrLoc, tempHH, K);
1144 
1145       MultResidualsAdd(K, psPar, res_loc, outPar);
1146 
1147       squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
1148 
1149       MPlexLL tempLL;
1150       KHMult(K, rotT00, rotT01, tempLL);
1151       KHC(tempLL, psErrLoc, outErr);
1152       outErr.subtract(psErrLoc, outErr);
1153 
1154 #ifdef DEBUG
1155       {
1156         dmutex_guard;
1157         if (kfOp & KFO_Local_Cov) {
1158           printf("psErrLoc:\n");
1159           for (int i = 0; i < 6; ++i) {
1160             for (int j = 0; j < 6; ++j)
1161               printf("% 8e ", psErrLoc.At(0, i, j));
1162             printf("\n");
1163           }
1164           printf("\n");
1165         }
1166         printf("resErr_loc (Inv):\n");
1167         for (int i = 0; i < 2; ++i) {
1168           for (int j = 0; j < 2; ++j)
1169             printf("%8f ", resErr_loc.At(0, i, j));
1170           printf("\n");
1171         }
1172         printf("\n");
1173         printf("tempHH:\n");
1174         for (int i = 0; i < 3; ++i) {
1175           for (int j = 0; j < 3; ++j)
1176             printf("%8f ", tempHH.At(0, i, j));
1177           printf("\n");
1178         }
1179         printf("\n");
1180         printf("K:\n");
1181         for (int i = 0; i < 6; ++i) {
1182           for (int j = 0; j < 3; ++j)
1183             printf("%8f ", K.At(0, i, j));
1184           printf("\n");
1185         }
1186         printf("\n");
1187         printf("tempLL:\n");
1188         for (int i = 0; i < 6; ++i) {
1189           for (int j = 0; j < 6; ++j)
1190             printf("%8f ", tempLL.At(0, i, j));
1191           printf("\n");
1192         }
1193         printf("\n");
1194         printf("outPar:\n");
1195         for (int i = 0; i < 6; ++i) {
1196           printf("%8f  ", outPar.At(0, i, 0));
1197         }
1198         printf("\n");
1199         printf("outErr:\n");
1200         for (int i = 0; i < 6; ++i) {
1201           for (int j = 0; j < 6; ++j)
1202             printf("%8f ", outErr.At(0, i, j));
1203           printf("\n");
1204         }
1205         printf("\n");
1206       }
1207 #endif
1208     }
1209   }
1210 
1211   //==============================================================================
1212   // Kalman operations - Plane
1213   //==============================================================================
1214 
1215   void kalmanUpdatePlane(const MPlexLS& psErr,
1216                          const MPlexLV& psPar,
1217                          const MPlexQI& Chg,
1218                          const MPlexHS& msErr,
1219                          const MPlexHV& msPar,
1220                          const MPlexHV& plNrm,
1221                          const MPlexHV& plDir,
1222                          const MPlexHV& plPnt,
1223                          MPlexLS& outErr,
1224                          MPlexLV& outPar,
1225                          const int N_proc) {
1226     kalmanOperationPlaneLocal(KFO_Update_Params | KFO_Local_Cov,
1227                               psErr,
1228                               psPar,
1229                               Chg,
1230                               msErr,
1231                               msPar,
1232                               plNrm,
1233                               plDir,
1234                               plPnt,
1235                               outErr,
1236                               outPar,
1237                               dummy_chi2,
1238                               N_proc);
1239   }
1240 
1241   void kalmanPropagateAndUpdatePlane(const MPlexLS& psErr,
1242                                      const MPlexLV& psPar,
1243                                      MPlexQI& Chg,
1244                                      const MPlexHS& msErr,
1245                                      const MPlexHV& msPar,
1246                                      const MPlexHV& plNrm,
1247                                      const MPlexHV& plDir,
1248                                      const MPlexHV& plPnt,
1249                                      MPlexLS& outErr,
1250                                      MPlexLV& outPar,
1251                                      MPlexQI& outFailFlag,
1252                                      const int N_proc,
1253                                      const PropagationFlags& propFlags,
1254                                      const bool propToHit) {
1255     if (propToHit) {
1256       MPlexLS propErr;
1257       MPlexLV propPar;
1258       propagateHelixToPlaneMPlex(psErr, psPar, Chg, plPnt, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
1259 
1260       kalmanOperationPlaneLocal(KFO_Update_Params | KFO_Local_Cov,
1261                                 propErr,
1262                                 propPar,
1263                                 Chg,
1264                                 msErr,
1265                                 msPar,
1266                                 plNrm,
1267                                 plDir,
1268                                 plPnt,
1269                                 outErr,
1270                                 outPar,
1271                                 dummy_chi2,
1272                                 N_proc);
1273     } else {
1274       kalmanOperationPlaneLocal(KFO_Update_Params | KFO_Local_Cov,
1275                                 psErr,
1276                                 psPar,
1277                                 Chg,
1278                                 msErr,
1279                                 msPar,
1280                                 plNrm,
1281                                 plDir,
1282                                 plPnt,
1283                                 outErr,
1284                                 outPar,
1285                                 dummy_chi2,
1286                                 N_proc);
1287     }
1288     for (int n = 0; n < NN; ++n) {
1289       if (outPar.At(n, 3, 0) < 0) {
1290         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
1291         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
1292       }
1293     }
1294   }
1295 
1296   //------------------------------------------------------------------------------
1297 
1298   void kalmanComputeChi2Plane(const MPlexLS& psErr,
1299                               const MPlexLV& psPar,
1300                               const MPlexQI& inChg,
1301                               const MPlexHS& msErr,
1302                               const MPlexHV& msPar,
1303                               const MPlexHV& plNrm,
1304                               const MPlexHV& plDir,
1305                               const MPlexHV& plPnt,
1306                               MPlexQF& outChi2,
1307                               const int N_proc) {
1308     kalmanOperationPlaneLocal(KFO_Calculate_Chi2,
1309                               psErr,
1310                               psPar,
1311                               inChg,
1312                               msErr,
1313                               msPar,
1314                               plNrm,
1315                               plDir,
1316                               plPnt,
1317                               dummy_err,
1318                               dummy_par,
1319                               outChi2,
1320                               N_proc);
1321   }
1322 
1323   void kalmanPropagateAndComputeChi2Plane(const MPlexLS& psErr,
1324                                           const MPlexLV& psPar,
1325                                           const MPlexQI& inChg,
1326                                           const MPlexHS& msErr,
1327                                           const MPlexHV& msPar,
1328                                           const MPlexHV& plNrm,
1329                                           const MPlexHV& plDir,
1330                                           const MPlexHV& plPnt,
1331                                           MPlexQF& outChi2,
1332                                           MPlexLV& propPar,
1333                                           MPlexQI& outFailFlag,
1334                                           const int N_proc,
1335                                           const PropagationFlags& propFlags,
1336                                           const bool propToHit) {
1337     propPar = psPar;
1338     if (propToHit) {
1339       MPlexLS propErr;
1340       propagateHelixToPlaneMPlex(psErr, psPar, inChg, plPnt, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
1341 
1342       kalmanOperationPlaneLocal(KFO_Calculate_Chi2,
1343                                 propErr,
1344                                 propPar,
1345                                 inChg,
1346                                 msErr,
1347                                 msPar,
1348                                 plNrm,
1349                                 plDir,
1350                                 plPnt,
1351                                 dummy_err,
1352                                 dummy_par,
1353                                 outChi2,
1354                                 N_proc);
1355     } else {
1356       kalmanOperationPlaneLocal(KFO_Calculate_Chi2,
1357                                 psErr,
1358                                 psPar,
1359                                 inChg,
1360                                 msErr,
1361                                 msPar,
1362                                 plNrm,
1363                                 plDir,
1364                                 plPnt,
1365                                 dummy_err,
1366                                 dummy_par,
1367                                 outChi2,
1368                                 N_proc);
1369     }
1370   }
1371 
1372   //------------------------------------------------------------------------------
1373 
1374   void kalmanOperationPlaneLocal(const int kfOp,
1375                                  const MPlexLS& psErr,
1376                                  const MPlexLV& psPar,
1377                                  const MPlexQI& inChg,
1378                                  const MPlexHS& msErr,
1379                                  const MPlexHV& msPar,
1380                                  const MPlexHV& plNrm,
1381                                  const MPlexHV& plDir,
1382                                  const MPlexHV& plPnt,
1383                                  MPlexLS& outErr,
1384                                  MPlexLV& outPar,
1385                                  MPlexQF& outChi2,
1386                                  const int N_proc) {
1387 #ifdef DEBUG
1388     {
1389       dmutex_guard;
1390       printf("psPar:\n");
1391       for (int i = 0; i < 6; ++i) {
1392         printf("%8f ", psPar.constAt(0, 0, i));
1393         printf("\n");
1394       }
1395       printf("\n");
1396       printf("psErr:\n");
1397       for (int i = 0; i < 6; ++i) {
1398         for (int j = 0; j < 6; ++j)
1399           printf("%8f ", psErr.constAt(0, i, j));
1400         printf("\n");
1401       }
1402       printf("\n");
1403       printf("msPar:\n");
1404       for (int i = 0; i < 3; ++i) {
1405         printf("%8f ", msPar.constAt(0, 0, i));
1406         printf("\n");
1407       }
1408       printf("\n");
1409       printf("msErr:\n");
1410       for (int i = 0; i < 3; ++i) {
1411         for (int j = 0; j < 3; ++j)
1412           printf("%8f ", msErr.constAt(0, i, j));
1413         printf("\n");
1414       }
1415       printf("\n");
1416     }
1417 #endif
1418 
1419     MPlexHH rot;
1420 #pragma omp simd
1421     for (int n = 0; n < NN; ++n) {
1422       rot(n, 0, 0) = plDir(n, 0, 0);
1423       rot(n, 0, 1) = plDir(n, 1, 0);
1424       rot(n, 0, 2) = plDir(n, 2, 0);
1425       rot(n, 1, 0) = plNrm(n, 1, 0) * plDir(n, 2, 0) - plNrm(n, 2, 0) * plDir(n, 1, 0);
1426       rot(n, 1, 1) = plNrm(n, 2, 0) * plDir(n, 0, 0) - plNrm(n, 0, 0) * plDir(n, 2, 0);
1427       rot(n, 1, 2) = plNrm(n, 0, 0) * plDir(n, 1, 0) - plNrm(n, 1, 0) * plDir(n, 0, 0);
1428       rot(n, 2, 0) = plNrm(n, 0, 0);
1429       rot(n, 2, 1) = plNrm(n, 1, 0);
1430       rot(n, 2, 2) = plNrm(n, 2, 0);
1431     }
1432 
1433     // get local parameters
1434     MPlexHV xd;
1435 #pragma omp simd
1436     for (int n = 0; n < NN; ++n) {
1437       xd(n, 0, 0) = psPar(n, 0, 0) - plPnt(n, 0, 0);
1438       xd(n, 0, 1) = psPar(n, 0, 1) - plPnt(n, 0, 1);
1439       xd(n, 0, 2) = psPar(n, 0, 2) - plPnt(n, 0, 2);
1440     }
1441     MPlex2V xlo;
1442     RotateResidualsOnPlane(rot, xd, xlo);
1443 
1444     MPlexQF sinP, sinT, cosP, cosT, pt;  //fixme VDT or something?
1445 #pragma omp simd
1446     for (int n = 0; n < NN; ++n) {
1447       pt(n, 0, 0) = 1.f / psPar(n, 3, 0);
1448       sinP(n, 0, 0) = std::sin(psPar(n, 4, 0));
1449       cosP(n, 0, 0) = std::cos(psPar(n, 4, 0));
1450       sinT(n, 0, 0) = std::sin(psPar(n, 5, 0));
1451       cosT(n, 0, 0) = std::cos(psPar(n, 5, 0));
1452     }
1453 
1454     MPlexHV pgl;
1455 #pragma omp simd
1456     for (int n = 0; n < NN; ++n) {
1457       pgl(n, 0, 0) = cosP(n, 0, 0) * pt(n, 0, 0);
1458       pgl(n, 0, 1) = sinP(n, 0, 0) * pt(n, 0, 0);
1459       pgl(n, 0, 2) = cosT(n, 0, 0) * pt(n, 0, 0) / sinT(n, 0, 0);
1460     }
1461 
1462     MPlexHV plo;
1463     RotateVectorOnPlane(rot, pgl, plo);
1464     MPlex5V lp;
1465 #pragma omp simd
1466     for (int n = 0; n < NN; ++n) {
1467       lp(n, 0, 0) = inChg(n, 0, 0) * psPar(n, 3, 0) * sinT(n, 0, 0);
1468       lp(n, 0, 1) = plo(n, 0, 0) / plo(n, 0, 2);
1469       lp(n, 0, 2) = plo(n, 0, 1) / plo(n, 0, 2);
1470       lp(n, 0, 3) = xlo(n, 0, 0);
1471       lp(n, 0, 4) = xlo(n, 0, 1);
1472     }
1473     MPlexQI pzSign;
1474 #pragma omp simd
1475     for (int n = 0; n < NN; ++n) {
1476       pzSign(n, 0, 0) = plo(n, 0, 2) > 0.f ? 1 : -1;
1477     }
1478 
1479     /*
1480     printf("rot:\n");
1481     for (int i = 0; i < 3; ++i) {
1482       for (int j = 0; j < 3; ++j)
1483     printf("%8f ", rot.At(0, i, j));
1484       printf("\n");
1485     }
1486     printf("\n");
1487     printf("plPnt:\n");
1488     for (int i = 0; i < 3; ++i) {
1489       printf("%8f ", plPnt.constAt(0, 0, i));
1490     }
1491     printf("\n");
1492     printf("xlo:\n");
1493     for (int i = 0; i < 2; ++i) {
1494       printf("%8f ", xlo.At(0, i, 0));
1495     }
1496     printf("\n");
1497     printf("pgl:\n");
1498     for (int i = 0; i < 3; ++i) {
1499       printf("%8f ", pgl.At(0, i, 0));
1500     }
1501     printf("\n");
1502     printf("plo:\n");
1503     for (int i = 0; i < 3; ++i) {
1504       printf("%8f ", plo.At(0, i, 0));
1505     }
1506     printf("\n");
1507     printf("lp:\n");
1508     for (int i = 0; i < 5; ++i) {
1509       printf("%8f ", lp.At(0, i, 0));
1510     }
1511     printf("\n");
1512     */
1513 
1514     //now we need the jacobian to convert from CCS to curvilinear
1515     // code from TrackState::jacobianCCSToCurvilinear
1516     MPlex56 jacCCS2Curv(0.f);
1517 #pragma omp simd
1518     for (int n = 0; n < NN; ++n) {
1519       jacCCS2Curv(n, 0, 3) = inChg(n, 0, 0) * sinT(n, 0, 0);
1520       jacCCS2Curv(n, 0, 5) = inChg(n, 0, 0) * cosT(n, 0, 0) * psPar(n, 3, 0);
1521       jacCCS2Curv(n, 1, 5) = -1.f;
1522       jacCCS2Curv(n, 2, 4) = 1.f;
1523       jacCCS2Curv(n, 3, 0) = -sinP(n, 0, 0);
1524       jacCCS2Curv(n, 3, 1) = cosP(n, 0, 0);
1525       jacCCS2Curv(n, 4, 0) = -cosP(n, 0, 0) * cosT(n, 0, 0);
1526       jacCCS2Curv(n, 4, 1) = -sinP(n, 0, 0) * cosT(n, 0, 0);
1527       jacCCS2Curv(n, 4, 2) = sinT(n, 0, 0);
1528     }
1529 
1530     //now we need the jacobian from curv to local
1531     // code from TrackingTools/AnalyticalJacobians/src/JacobianCurvilinearToLocal.cc
1532     MPlexHV un;
1533     MPlexHV vn;
1534 #pragma omp simd
1535     for (int n = 0; n < NN; ++n) {
1536       const float abslp00 = std::abs(lp(n, 0, 0));
1537       vn(n, 0, 2) = std::max(1.e-30f, abslp00 * pt(n, 0, 0));
1538       un(n, 0, 0) = -pgl(n, 0, 1) * abslp00 / vn(n, 0, 2);
1539       un(n, 0, 1) = pgl(n, 0, 0) * abslp00 / vn(n, 0, 2);
1540       un(n, 0, 2) = 0.f;
1541       vn(n, 0, 0) = -pgl(n, 0, 2) * abslp00 * un(n, 0, 1);
1542       vn(n, 0, 1) = pgl(n, 0, 2) * abslp00 * un(n, 0, 0);
1543     }
1544     MPlexHV u;
1545     RotateVectorOnPlane(rot, un, u);
1546     MPlexHV v;
1547     RotateVectorOnPlane(rot, vn, v);
1548     MPlex55 jacCurv2Loc(0.f);
1549 #pragma omp simd
1550     for (int n = 0; n < NN; ++n) {
1551       // fixme? //(pf.use_param_b_field ? 0.01f * Const::sol * Config::bFieldFromZR(psPar(n, 2, 0), hipo(psPar(n, 0, 0), psPar(n, 1, 0))) : 0.01f * Const::sol * Config::Bfield);
1552       const float bF = 0.01f * Const::sol * Config::Bfield;
1553       const float qh2 = bF * lp(n, 0, 0);
1554       const float t1r = std::sqrt(1.f + lp(n, 0, 1) * lp(n, 0, 1) + lp(n, 0, 2) * lp(n, 0, 2)) * pzSign(n, 0, 0);
1555       const float t2r = t1r * t1r;
1556       const float t3r = t1r * t2r;
1557       jacCurv2Loc(n, 0, 0) = 1.f;
1558       jacCurv2Loc(n, 1, 1) = -u(n, 0, 1) * t2r;
1559       jacCurv2Loc(n, 1, 2) = v(n, 0, 1) * vn(n, 0, 2) * t2r;
1560       jacCurv2Loc(n, 2, 1) = u(n, 0, 0) * t2r;
1561       jacCurv2Loc(n, 2, 2) = -v(n, 0, 0) * vn(n, 0, 2) * t2r;
1562       jacCurv2Loc(n, 3, 3) = v(n, 0, 1) * t1r;
1563       jacCurv2Loc(n, 3, 4) = -u(n, 0, 1) * t1r;
1564       jacCurv2Loc(n, 4, 3) = -v(n, 0, 0) * t1r;
1565       jacCurv2Loc(n, 4, 4) = u(n, 0, 0) * t1r;
1566       const float cosz = -vn(n, 0, 2) * qh2;
1567       const float ui = u(n, 0, 2) * t3r;
1568       const float vi = v(n, 0, 2) * t3r;
1569       jacCurv2Loc(n, 1, 3) = -ui * v(n, 0, 1) * cosz;
1570       jacCurv2Loc(n, 1, 4) = -vi * v(n, 0, 1) * cosz;
1571       jacCurv2Loc(n, 2, 3) = ui * v(n, 0, 0) * cosz;
1572       jacCurv2Loc(n, 2, 4) = vi * v(n, 0, 0) * cosz;
1573       //
1574     }
1575 
1576     // jacobian for converting from CCS to Loc (via Curv)
1577     MPlex56 jacCCS2Loc;
1578     JacCCS2Loc(jacCurv2Loc, jacCCS2Curv, jacCCS2Loc);
1579 
1580     // local error!
1581     MPlex5S psErrLoc;
1582     MPlex56 temp56;
1583     PsErrLoc(jacCCS2Loc, psErr, temp56);
1584     PsErrLocTransp(temp56, jacCCS2Loc, psErrLoc);
1585 
1586     MPlexHV md;
1587 #pragma omp simd
1588     for (int n = 0; n < NN; ++n) {
1589       md(n, 0, 0) = msPar(n, 0, 0) - plPnt(n, 0, 0);
1590       md(n, 0, 1) = msPar(n, 0, 1) - plPnt(n, 0, 1);
1591       md(n, 0, 2) = msPar(n, 0, 2) - plPnt(n, 0, 2);
1592     }
1593     MPlex2V mslo;
1594     RotateResidualsOnPlane(rot, md, mslo);
1595 
1596     MPlex2V res_loc;  //position residual in local coordinates
1597 #pragma omp simd
1598     for (int n = 0; n < NN; ++n) {
1599       res_loc(n, 0, 0) = mslo(n, 0, 0) - xlo(n, 0, 0);
1600       res_loc(n, 0, 1) = mslo(n, 0, 1) - xlo(n, 0, 1);
1601     }
1602 
1603     MPlex2S msErr_loc;
1604     MPlex2H temp2Hmsl;
1605     ProjectResErr(rot, msErr, temp2Hmsl);
1606     ProjectResErrTransp(rot, temp2Hmsl, msErr_loc);
1607 
1608     MPlex2S resErr_loc;  //covariance sum in local position coordinates
1609 #pragma omp simd
1610     for (int n = 0; n < NN; ++n) {
1611       resErr_loc(n, 0, 0) = psErrLoc(n, 3, 3) + msErr_loc(n, 0, 0);
1612       resErr_loc(n, 0, 1) = psErrLoc(n, 3, 4) + msErr_loc(n, 0, 1);
1613       resErr_loc(n, 1, 1) = psErrLoc(n, 4, 4) + msErr_loc(n, 1, 1);
1614     }
1615     /*
1616     printf("jacCCS2Curv:\n");
1617     for (int i = 0; i < 5; ++i) {
1618       for (int j = 0; j < 6; ++j)
1619     printf("%8f ", jacCCS2Curv.At(0, i, j));
1620       printf("\n");
1621     }
1622     printf("un:\n");
1623     for (int i = 0; i < 3; ++i) {
1624       printf("%8f ", un.At(0, i, 0));
1625     }
1626     printf("\n");
1627     printf("u:\n");
1628     for (int i = 0; i < 3; ++i) {
1629       printf("%8f ", u.At(0, i, 0));
1630     }
1631     printf("\n");
1632     printf("\n");
1633     printf("jacCurv2Loc:\n");
1634     for (int i = 0; i < 5; ++i) {
1635       for (int j = 0; j < 5; ++j)
1636     printf("%8f ", jacCurv2Loc.At(0, i, j));
1637       printf("\n");
1638     }
1639     printf("\n");
1640     printf("jacCCS2Loc:\n");
1641     for (int i = 0; i < 5; ++i) {
1642       for (int j = 0; j < 6; ++j)
1643     printf("%8f ", jacCCS2Loc.At(0, i, j));
1644       printf("\n");
1645     }
1646     printf("\n");
1647     printf("temp56:\n");
1648     for (int i = 0; i < 5; ++i) {
1649       for (int j = 0; j < 6; ++j)
1650     printf("%8f ", temp56.At(0, i, j));
1651       printf("\n");
1652     }
1653     printf("\n");
1654     printf("psErrLoc:\n");
1655     for (int i = 0; i < 5; ++i) {
1656       for (int j = 0; j < 5; ++j)
1657     printf("%8f ", psErrLoc.At(0, i, j));
1658       printf("\n");
1659     }
1660     printf("\n");
1661     printf("res_loc:\n");
1662     for (int i = 0; i < 2; ++i) {
1663       printf("%8f ", res_loc.At(0, i, 0));
1664     }
1665     printf("\n");
1666     printf("resErr_loc:\n");
1667     for (int i = 0; i < 2; ++i) {
1668       for (int j = 0; j < 2; ++j)
1669     printf("%8f ", resErr_loc.At(0, i, j));
1670       printf("\n");
1671     }
1672     printf("\n");
1673     */
1674     //invert the 2x2 matrix
1675     Matriplex::invertCramerSym(resErr_loc);
1676 
1677     if (kfOp & KFO_Calculate_Chi2) {
1678       Chi2Similarity(res_loc, resErr_loc, outChi2);
1679 
1680 #ifdef DEBUG
1681       {
1682         dmutex_guard;
1683         printf("resErr_loc (Inv):\n");
1684         for (int i = 0; i < 2; ++i) {
1685           for (int j = 0; j < 2; ++j)
1686             printf("%8f ", resErr_loc.At(0, i, j));
1687           printf("\n");
1688         }
1689         printf("\n");
1690         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
1691       }
1692 #endif
1693     }
1694 
1695     if (kfOp & KFO_Update_Params) {
1696       MPlex52 K;  // kalman gain
1697 #pragma omp simd
1698       for (int n = 0; n < NN; ++n) {
1699 #pragma GCC unroll 5
1700         for (int j = 0; j < 5; ++j) {
1701           K(n, j, 0) = resErr_loc(n, 0, 0) * psErrLoc(n, j, 3) + resErr_loc(n, 0, 1) * psErrLoc(n, j, 4);
1702           K(n, j, 1) = resErr_loc(n, 0, 1) * psErrLoc(n, j, 3) + resErr_loc(n, 1, 1) * psErrLoc(n, j, 4);
1703         }
1704       }
1705 
1706       MPlex5V lp_upd;
1707       MultResidualsAdd(K, lp, res_loc, lp_upd);
1708 
1709       MPlex55 ImKH(0.f);
1710 #pragma omp simd
1711       for (int n = 0; n < NN; ++n) {
1712 #pragma GCC unroll 5
1713         for (int j = 0; j < 5; ++j) {
1714           ImKH(n, j, j) = 1.f;
1715           ImKH(n, j, 3) -= K(n, j, 0);
1716           ImKH(n, j, 4) -= K(n, j, 1);
1717         }
1718       }
1719       MPlex5S psErrLoc_upd;
1720       PsErrLocUpd(ImKH, psErrLoc, psErrLoc_upd);
1721 
1722       //convert local updated parameters into CCS
1723       MPlexHV lxu;
1724       MPlexHV lpu;
1725 #pragma omp simd
1726       for (int n = 0; n < NN; ++n) {
1727         lxu(n, 0, 0) = lp_upd(n, 0, 3);
1728         lxu(n, 0, 1) = lp_upd(n, 0, 4);
1729         lxu(n, 0, 2) = 0.f;
1730         lpu(n, 0, 2) =
1731             pzSign(n, 0, 0) / (std::max(std::abs(lp_upd(n, 0, 0)), 1.e-9f) *
1732                                std::sqrt(1.f + lp_upd(n, 0, 1) * lp_upd(n, 0, 1) + lp_upd(n, 0, 2) * lp_upd(n, 0, 2)));
1733         lpu(n, 0, 0) = lpu(n, 0, 2) * lp_upd(n, 0, 1);
1734         lpu(n, 0, 1) = lpu(n, 0, 2) * lp_upd(n, 0, 2);
1735       }
1736       MPlexHV gxu;
1737       RotateVectorOnPlaneTransp(rot, lxu, gxu);
1738 #pragma omp simd
1739       for (int n = 0; n < NN; ++n) {
1740         gxu(n, 0, 0) += plPnt(n, 0, 0);
1741         gxu(n, 0, 1) += plPnt(n, 0, 1);
1742         gxu(n, 0, 2) += plPnt(n, 0, 2);
1743       }
1744       MPlexHV gpu;
1745       RotateVectorOnPlaneTransp(rot, lpu, gpu);
1746 
1747       MPlexQF p;
1748 #pragma omp simd
1749       for (int n = 0; n < NN; ++n) {
1750         pt(n, 0, 0) = std::sqrt(gpu.At(n, 0, 0) * gpu.At(n, 0, 0) + gpu.At(n, 0, 1) * gpu.At(n, 0, 1));
1751         p(n, 0, 0) = std::sqrt(pt.At(n, 0, 0) * pt.At(n, 0, 0) + gpu.At(n, 0, 2) * gpu.At(n, 0, 2));
1752         sinP(n, 0, 0) = gpu.At(n, 0, 1) / pt(n, 0, 0);
1753         cosP(n, 0, 0) = gpu.At(n, 0, 0) / pt(n, 0, 0);
1754         sinT(n, 0, 0) = pt(n, 0, 0) / p(n, 0, 0);
1755         cosT(n, 0, 0) = gpu.At(n, 0, 2) / p(n, 0, 0);
1756       }
1757 
1758 #pragma omp simd
1759       for (int n = 0; n < NN; ++n) {
1760         outPar(n, 0, 0) = gxu.At(n, 0, 0);
1761         outPar(n, 0, 1) = gxu.At(n, 0, 1);
1762         outPar(n, 0, 2) = gxu.At(n, 0, 2);
1763         outPar(n, 0, 3) = 1.f / pt(n, 0, 0);
1764         outPar(n, 0, 4) = getPhi(gpu.At(n, 0, 0), gpu.At(n, 0, 1));  //fixme VDT or something?
1765         outPar(n, 0, 5) = getTheta(pt(n, 0, 0), gpu.At(n, 0, 2));
1766       }
1767 
1768       //now we need the jacobian to convert from curvilinear to CCS
1769       // code from TrackState::jacobianCurvilinearToCCS
1770       MPlex65 jacCurv2CCS(0.f);
1771 #pragma omp simd
1772       for (int n = 0; n < NN; ++n) {
1773         jacCurv2CCS(n, 0, 3) = -sinP(n, 0, 0);
1774         jacCurv2CCS(n, 0, 4) = -cosT(n, 0, 0) * cosP(n, 0, 0);
1775         jacCurv2CCS(n, 1, 3) = cosP(n, 0, 0);
1776         jacCurv2CCS(n, 1, 4) = -cosT(n, 0, 0) * sinP(n, 0, 0);
1777         jacCurv2CCS(n, 2, 4) = sinT(n, 0, 0);
1778         jacCurv2CCS(n, 3, 0) = inChg(n, 0, 0) / sinT(n, 0, 0);
1779         jacCurv2CCS(n, 3, 1) = outPar(n, 3, 0) * cosT(n, 0, 0) / sinT(n, 0, 0);
1780         jacCurv2CCS(n, 4, 2) = 1.f;
1781         jacCurv2CCS(n, 5, 1) = -1.f;
1782       }
1783 
1784       //now we need the jacobian from local to curv
1785       // code from TrackingTools/AnalyticalJacobians/src/JacobianLocalToCurvilinear.cc
1786       MPlexHV tnl;
1787 #pragma omp simd
1788       for (int n = 0; n < NN; ++n) {
1789         const float abslpupd00 = std::max(std::abs(lp_upd(n, 0, 0)), 1.e-9f);
1790         tnl(n, 0, 0) = lpu(n, 0, 0) * abslpupd00;
1791         tnl(n, 0, 1) = lpu(n, 0, 1) * abslpupd00;
1792         tnl(n, 0, 2) = lpu(n, 0, 2) * abslpupd00;
1793       }
1794       MPlexHV tn;
1795       RotateVectorOnPlaneTransp(rot, tnl, tn);
1796 #pragma omp simd
1797       for (int n = 0; n < NN; ++n) {
1798         vn(n, 0, 2) = std::max(1.e-30f, std::sqrt(tn(n, 0, 0) * tn(n, 0, 0) + tn(n, 0, 1) * tn(n, 0, 1)));
1799         un(n, 0, 0) = -tn(n, 0, 1) / vn(n, 0, 2);
1800         un(n, 0, 1) = tn(n, 0, 0) / vn(n, 0, 2);
1801         un(n, 0, 2) = 0.f;
1802         vn(n, 0, 0) = -tn(n, 0, 2) * un(n, 0, 1);
1803         vn(n, 0, 1) = tn(n, 0, 2) * un(n, 0, 0);
1804       }
1805       MPlex55 jacLoc2Curv(0.f);
1806 #pragma omp simd
1807       for (int n = 0; n < NN; ++n) {
1808         // fixme? //(pf.use_param_b_field ? 0.01f * Const::sol * Config::bFieldFromZR(psPar(n, 2, 0), hipo(psPar(n, 0, 0), psPar(n, 1, 0))) : 0.01f * Const::sol * Config::Bfield);
1809         const float bF = 0.01f * Const::sol * Config::Bfield;  //fixme: cache?
1810         const float qh2 = bF * lp_upd(n, 0, 0);
1811         const float cosl1 = 1.f / vn(n, 0, 2);
1812         const float uj = un(n, 0, 0) * rot(n, 0, 0) + un(n, 0, 1) * rot(n, 0, 1);
1813         const float uk = un(n, 0, 0) * rot(n, 1, 0) + un(n, 0, 1) * rot(n, 1, 1);
1814         const float vj = vn(n, 0, 0) * rot(n, 0, 0) + vn(n, 0, 1) * rot(n, 0, 1) + vn(n, 0, 2) * rot(n, 0, 2);
1815         const float vk = vn(n, 0, 0) * rot(n, 1, 0) + vn(n, 0, 1) * rot(n, 1, 1) + vn(n, 0, 2) * rot(n, 1, 2);
1816         const float cosz = vn(n, 0, 2) * qh2;
1817         jacLoc2Curv(n, 0, 0) = 1.f;
1818         jacLoc2Curv(n, 1, 1) = tnl(n, 0, 2) * vj;
1819         jacLoc2Curv(n, 1, 2) = tnl(n, 0, 2) * vk;
1820         jacLoc2Curv(n, 2, 1) = tnl(n, 0, 2) * uj * cosl1;
1821         jacLoc2Curv(n, 2, 2) = tnl(n, 0, 2) * uk * cosl1;
1822         jacLoc2Curv(n, 3, 3) = uj;
1823         jacLoc2Curv(n, 3, 4) = uk;
1824         jacLoc2Curv(n, 4, 3) = vj;
1825         jacLoc2Curv(n, 4, 4) = vk;
1826         jacLoc2Curv(n, 2, 3) = tnl(n, 0, 0) * (cosz * cosl1);
1827         jacLoc2Curv(n, 2, 4) = tnl(n, 0, 1) * (cosz * cosl1);
1828       }
1829 
1830       // jacobian for converting from Loc to CCS (via Curv)
1831       MPlex65 jacLoc2CCS;
1832       JacLoc2CCS(jacCurv2CCS, jacLoc2Curv, jacLoc2CCS);
1833 
1834       // CCS error!
1835       MPlex65 temp65;
1836       OutErrCCS(jacLoc2CCS, psErrLoc_upd, temp65);
1837       OutErrCCSTransp(temp65, jacLoc2CCS, outErr);
1838 
1839       /*
1840       printf("\n");
1841       printf("lp_upd:\n");
1842       for (int i = 0; i < 5; ++i) {
1843     printf("%8f ", lp_upd.At(0, i, 0));
1844       }
1845       printf("\n");
1846       printf("psErrLoc_upd:\n");
1847       for (int i = 0; i < 5; ++i) {
1848         for (int j = 0; j < 5; ++j)
1849           printf("%8f ", psErrLoc_upd.At(0, i, j));
1850         printf("\n");
1851       }
1852       printf("\n");
1853       printf("lxu:\n");
1854       for (int i = 0; i < 3; ++i) {
1855     printf("%8f ", lxu.At(0, i, 0));
1856       }
1857       printf("\n");
1858       printf("lpu:\n");
1859       for (int i = 0; i < 3; ++i) {
1860     printf("%8f ", lpu.At(0, i, 0));
1861       }
1862       printf("\n");
1863       printf("gxu:\n");
1864       for (int i = 0; i < 3; ++i) {
1865     printf("%8f ", gxu.At(0, i, 0));
1866       }
1867       printf("\n");
1868       printf("gpu:\n");
1869       for (int i = 0; i < 3; ++i) {
1870     printf("%8f ", gpu.At(0, i, 0));
1871       }
1872       printf("\n");
1873       printf("outPar:\n");
1874       for (int i = 0; i < 6; ++i) {
1875     printf("%8f ", outPar.At(0, i, 0));
1876       }
1877       printf("\n");
1878       printf("tnl:\n");
1879       for (int i = 0; i < 3; ++i) {
1880     printf("%8f ", tnl.At(0, i, 0));
1881       }
1882       printf("\n");
1883       printf("tn:\n");
1884       for (int i = 0; i < 3; ++i) {
1885     printf("%8f ", tn.At(0, i, 0));
1886       }
1887       printf("\n");
1888       printf("un:\n");
1889       for (int i = 0; i < 3; ++i) {
1890     printf("%8f ", un.At(0, i, 0));
1891       }
1892       printf("\n");
1893       printf("vn:\n");
1894       for (int i = 0; i < 3; ++i) {
1895     printf("%8f ", vn.At(0, i, 0));
1896       }
1897       printf("\n");
1898       printf("jacLoc2Curv:\n");
1899       for (int i = 0; i < 5; ++i) {
1900     for (int j = 0; j < 5; ++j)
1901       printf("%8f ", jacLoc2Curv.At(0, i, j));
1902     printf("\n");
1903       }
1904       printf("\n");
1905       printf("outErr:\n");
1906       for (int i = 0; i < 6; ++i) {
1907         for (int j = 0; j < 6; ++j)
1908           printf("%8f ", outErr.At(0, i, j));
1909         printf("\n");
1910       }
1911       printf("\n");
1912       */
1913 
1914 #ifdef DEBUG
1915       {
1916         dmutex_guard;
1917         if (kfOp & KFO_Local_Cov) {
1918           printf("psErrLoc_upd:\n");
1919           for (int i = 0; i < 5; ++i) {
1920             for (int j = 0; j < 5; ++j)
1921               printf("% 8e ", psErrLoc_upd.At(0, i, j));
1922             printf("\n");
1923           }
1924           printf("\n");
1925         }
1926         printf("resErr_loc (Inv):\n");
1927         for (int i = 0; i < 2; ++i) {
1928           for (int j = 0; j < 2; ++j)
1929             printf("%8f ", resErr_loc.At(0, i, j));
1930           printf("\n");
1931         }
1932         printf("\n");
1933         printf("K:\n");
1934         for (int i = 0; i < 6; ++i) {
1935           for (int j = 0; j < 2; ++j)
1936             printf("%8f ", K.At(0, i, j));
1937           printf("\n");
1938         }
1939         printf("\n");
1940         printf("outPar:\n");
1941         for (int i = 0; i < 6; ++i) {
1942           printf("%8f  ", outPar.At(0, i, 0));
1943         }
1944         printf("\n");
1945         printf("outErr:\n");
1946         for (int i = 0; i < 6; ++i) {
1947           for (int j = 0; j < 6; ++j)
1948             printf("%8f ", outErr.At(0, i, j));
1949           printf("\n");
1950         }
1951         printf("\n");
1952       }
1953 #endif
1954     }
1955 
1956     return;
1957   }
1958 
1959   //------------------------------------------------------------------------------
1960 
1961   void kalmanOperationPlane(const int kfOp,
1962                             const MPlexLS& psErr,
1963                             const MPlexLV& psPar,
1964                             const MPlexQI& inChg,
1965                             const MPlexHS& msErr,
1966                             const MPlexHV& msPar,
1967                             const MPlexHV& plNrm,
1968                             const MPlexHV& plDir,
1969                             const MPlexHV& plPnt,  //not used, can be removed (fixme)
1970                             MPlexLS& outErr,
1971                             MPlexLV& outPar,
1972                             MPlexQF& outChi2,
1973                             const int N_proc) {
1974 #ifdef DEBUG
1975     {
1976       dmutex_guard;
1977       printf("psPar:\n");
1978       for (int i = 0; i < 6; ++i) {
1979         printf("%8f ", psPar.constAt(0, 0, i));
1980         printf("\n");
1981       }
1982       printf("\n");
1983       printf("psErr:\n");
1984       for (int i = 0; i < 6; ++i) {
1985         for (int j = 0; j < 6; ++j)
1986           printf("%8f ", psErr.constAt(0, i, j));
1987         printf("\n");
1988       }
1989       printf("\n");
1990       printf("msPar:\n");
1991       for (int i = 0; i < 3; ++i) {
1992         printf("%8f ", msPar.constAt(0, 0, i));
1993         printf("\n");
1994       }
1995       printf("\n");
1996       printf("msErr:\n");
1997       for (int i = 0; i < 3; ++i) {
1998         for (int j = 0; j < 3; ++j)
1999           printf("%8f ", msErr.constAt(0, i, j));
2000         printf("\n");
2001       }
2002       printf("\n");
2003     }
2004 #endif
2005 
2006     // Rotate global point on local plane
2007 
2008     // Rotation matrix
2009     //    D0  D1   D2
2010     //    X0  X1   X2
2011     //    N0  N1   N2
2012     // where D is the strip direction vector plDir, N is the normal plNrm, and X is the cross product between the two
2013 
2014     MPlex2H prj;
2015     for (int n = 0; n < NN; ++n) {
2016       prj(n, 0, 0) = plDir(n, 0, 0);
2017       prj(n, 0, 1) = plDir(n, 1, 0);
2018       prj(n, 0, 2) = plDir(n, 2, 0);
2019       prj(n, 1, 0) = plNrm(n, 1, 0) * plDir(n, 2, 0) - plNrm(n, 2, 0) * plDir(n, 1, 0);
2020       prj(n, 1, 1) = plNrm(n, 2, 0) * plDir(n, 0, 0) - plNrm(n, 0, 0) * plDir(n, 2, 0);
2021       prj(n, 1, 2) = plNrm(n, 0, 0) * plDir(n, 1, 0) - plNrm(n, 1, 0) * plDir(n, 0, 0);
2022     }
2023 
2024     MPlexHV res_glo;  //position residual in global coordinates
2025     SubtractFirst3(msPar, psPar, res_glo);
2026 
2027     MPlexHS resErr_glo;  //covariance sum in global position coordinates
2028     AddIntoUpperLeft3x3(psErr, msErr, resErr_glo);
2029 
2030     MPlex2V res_loc;  //position residual in local coordinates
2031     RotateResidualsOnPlane(prj, res_glo, res_loc);
2032     MPlex2S resErr_loc;  //covariance sum in local position coordinates
2033     MPlex2H temp2H;
2034     ProjectResErr(prj, resErr_glo, temp2H);
2035     ProjectResErrTransp(prj, temp2H, resErr_loc);
2036 
2037 #ifdef DEBUG
2038     {
2039       dmutex_guard;
2040       printf("prj:\n");
2041       for (int i = 0; i < 2; ++i) {
2042         for (int j = 0; j < 3; ++j)
2043           printf("%8f ", prj.At(0, i, j));
2044         printf("\n");
2045       }
2046       printf("\n");
2047       printf("res_glo:\n");
2048       for (int i = 0; i < 3; ++i) {
2049         printf("%8f ", res_glo.At(0, i, 0));
2050       }
2051       printf("\n");
2052       printf("resErr_glo:\n");
2053       for (int i = 0; i < 3; ++i) {
2054         for (int j = 0; j < 3; ++j)
2055           printf("%8f ", resErr_glo.At(0, i, j));
2056         printf("\n");
2057       }
2058       printf("\n");
2059       printf("res_loc:\n");
2060       for (int i = 0; i < 2; ++i) {
2061         printf("%8f ", res_loc.At(0, i, 0));
2062       }
2063       printf("\n");
2064       printf("temp2H:\n");
2065       for (int i = 0; i < 2; ++i) {
2066         for (int j = 0; j < 3; ++j)
2067           printf("%8f ", temp2H.At(0, i, j));
2068         printf("\n");
2069       }
2070       printf("\n");
2071       printf("resErr_loc:\n");
2072       for (int i = 0; i < 2; ++i) {
2073         for (int j = 0; j < 2; ++j)
2074           printf("%8f ", resErr_loc.At(0, i, j));
2075         printf("\n");
2076       }
2077       printf("\n");
2078     }
2079 #endif
2080 
2081     //invert the 2x2 matrix
2082     Matriplex::invertCramerSym(resErr_loc);
2083 
2084     if (kfOp & KFO_Calculate_Chi2) {
2085       Chi2Similarity(res_loc, resErr_loc, outChi2);
2086 
2087 #ifdef DEBUG
2088       {
2089         dmutex_guard;
2090         printf("resErr_loc (Inv):\n");
2091         for (int i = 0; i < 2; ++i) {
2092           for (int j = 0; j < 2; ++j)
2093             printf("%8f ", resErr_loc.At(0, i, j));
2094           printf("\n");
2095         }
2096         printf("\n");
2097         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
2098       }
2099 #endif
2100     }
2101 
2102     if (kfOp & KFO_Update_Params) {
2103       MPlexLS psErrLoc = psErr;
2104 
2105       MPlexH2 tempH2;
2106       MPlexL2 K;                           // kalman gain
2107       KalmanHTG(prj, resErr_loc, tempH2);  // intermediate term to get kalman gain (H^T*G)
2108       KalmanGain(psErrLoc, tempH2, K);
2109 
2110       MultResidualsAdd(K, psPar, res_loc, outPar);
2111 
2112       squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
2113 
2114       MPlexLL tempLL;
2115       KHMult(K, prj, tempLL);
2116       KHC(tempLL, psErrLoc, outErr);
2117       outErr.subtract(psErrLoc, outErr);
2118 
2119 #ifdef DEBUG
2120       {
2121         dmutex_guard;
2122         if (kfOp & KFO_Local_Cov) {
2123           printf("psErrLoc:\n");
2124           for (int i = 0; i < 6; ++i) {
2125             for (int j = 0; j < 6; ++j)
2126               printf("% 8e ", psErrLoc.At(0, i, j));
2127             printf("\n");
2128           }
2129           printf("\n");
2130         }
2131         printf("resErr_loc (Inv):\n");
2132         for (int i = 0; i < 2; ++i) {
2133           for (int j = 0; j < 2; ++j)
2134             printf("%8f ", resErr_loc.At(0, i, j));
2135           printf("\n");
2136         }
2137         printf("\n");
2138         printf("tempH2:\n");
2139         for (int i = 0; i < 3; ++i) {
2140           for (int j = 0; j < 2; ++j)
2141             printf("%8f ", tempH2.At(0, i, j));
2142           printf("\n");
2143         }
2144         printf("\n");
2145         printf("K:\n");
2146         for (int i = 0; i < 6; ++i) {
2147           for (int j = 0; j < 2; ++j)
2148             printf("%8f ", K.At(0, i, j));
2149           printf("\n");
2150         }
2151         printf("\n");
2152         printf("tempLL:\n");
2153         for (int i = 0; i < 6; ++i) {
2154           for (int j = 0; j < 6; ++j)
2155             printf("%8f ", tempLL.At(0, i, j));
2156           printf("\n");
2157         }
2158         printf("\n");
2159         printf("outPar:\n");
2160         for (int i = 0; i < 6; ++i) {
2161           printf("%8f  ", outPar.At(0, i, 0));
2162         }
2163         printf("\n");
2164         printf("outErr:\n");
2165         for (int i = 0; i < 6; ++i) {
2166           for (int j = 0; j < 6; ++j)
2167             printf("%8f ", outErr.At(0, i, j));
2168           printf("\n");
2169         }
2170         printf("\n");
2171       }
2172 #endif
2173     }
2174   }
2175 
2176   //==============================================================================
2177   // Kalman operations - Endcap
2178   //==============================================================================
2179 
2180   void kalmanUpdateEndcap(const MPlexLS& psErr,
2181                           const MPlexLV& psPar,
2182                           const MPlexHS& msErr,
2183                           const MPlexHV& msPar,
2184                           MPlexLS& outErr,
2185                           MPlexLV& outPar,
2186                           const int N_proc) {
2187     kalmanOperationEndcap(KFO_Update_Params, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
2188   }
2189 
2190   void kalmanPropagateAndUpdateEndcap(const MPlexLS& psErr,
2191                                       const MPlexLV& psPar,
2192                                       MPlexQI& Chg,
2193                                       const MPlexHS& msErr,
2194                                       const MPlexHV& msPar,
2195                                       MPlexLS& outErr,
2196                                       MPlexLV& outPar,
2197                                       MPlexQI& outFailFlag,
2198                                       const int N_proc,
2199                                       const PropagationFlags& propFlags,
2200                                       const bool propToHit) {
2201     if (propToHit) {
2202       MPlexLS propErr;
2203       MPlexLV propPar;
2204       MPlexQF msZ;
2205 #pragma omp simd
2206       for (int n = 0; n < NN; ++n) {
2207         msZ.At(n, 0, 0) = msPar.constAt(n, 2, 0);
2208       }
2209 
2210       propagateHelixToZMPlex(psErr, psPar, Chg, msZ, propErr, propPar, outFailFlag, N_proc, propFlags);
2211 
2212       kalmanOperationEndcap(KFO_Update_Params, propErr, propPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
2213     } else {
2214       kalmanOperationEndcap(KFO_Update_Params, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
2215     }
2216     for (int n = 0; n < NN; ++n) {
2217       if (n < N_proc && outPar.At(n, 3, 0) < 0) {
2218         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
2219         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
2220       }
2221     }
2222   }
2223 
2224   //------------------------------------------------------------------------------
2225 
2226   void kalmanComputeChi2Endcap(const MPlexLS& psErr,
2227                                const MPlexLV& psPar,
2228                                const MPlexQI& inChg,
2229                                const MPlexHS& msErr,
2230                                const MPlexHV& msPar,
2231                                MPlexQF& outChi2,
2232                                const int N_proc) {
2233     kalmanOperationEndcap(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
2234   }
2235 
2236   void kalmanPropagateAndComputeChi2Endcap(const MPlexLS& psErr,
2237                                            const MPlexLV& psPar,
2238                                            const MPlexQI& inChg,
2239                                            const MPlexHS& msErr,
2240                                            const MPlexHV& msPar,
2241                                            MPlexQF& outChi2,
2242                                            MPlexLV& propPar,
2243                                            MPlexQI& outFailFlag,
2244                                            const int N_proc,
2245                                            const PropagationFlags& propFlags,
2246                                            const bool propToHit) {
2247     propPar = psPar;
2248     if (propToHit) {
2249       MPlexLS propErr;
2250       MPlexQF msZ;
2251 #pragma omp simd
2252       for (int n = 0; n < NN; ++n) {
2253         msZ.At(n, 0, 0) = msPar.constAt(n, 2, 0);
2254       }
2255 
2256       propagateHelixToZMPlex(psErr, psPar, inChg, msZ, propErr, propPar, outFailFlag, N_proc, propFlags);
2257 
2258       kalmanOperationEndcap(KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
2259     } else {
2260       kalmanOperationEndcap(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
2261     }
2262   }
2263 
2264   //------------------------------------------------------------------------------
2265 
2266   void kalmanOperationEndcap(const int kfOp,
2267                              const MPlexLS& psErr,
2268                              const MPlexLV& psPar,
2269                              const MPlexHS& msErr,
2270                              const MPlexHV& msPar,
2271                              MPlexLS& outErr,
2272                              MPlexLV& outPar,
2273                              MPlexQF& outChi2,
2274                              const int N_proc) {
2275 #ifdef DEBUG
2276     {
2277       dmutex_guard;
2278       printf("updateParametersEndcapMPlex\n");
2279       printf("psPar:\n");
2280       for (int i = 0; i < 6; ++i) {
2281         printf("%8f ", psPar.constAt(0, 0, i));
2282         printf("\n");
2283       }
2284       printf("\n");
2285       printf("msPar:\n");
2286       for (int i = 0; i < 3; ++i) {
2287         printf("%8f ", msPar.constAt(0, 0, i));
2288         printf("\n");
2289       }
2290       printf("\n");
2291       printf("psErr:\n");
2292       for (int i = 0; i < 6; ++i) {
2293         for (int j = 0; j < 6; ++j)
2294           printf("%8f ", psErr.constAt(0, i, j));
2295         printf("\n");
2296       }
2297       printf("\n");
2298       printf("msErr:\n");
2299       for (int i = 0; i < 3; ++i) {
2300         for (int j = 0; j < 3; ++j)
2301           printf("%8f ", msErr.constAt(0, i, j));
2302         printf("\n");
2303       }
2304       printf("\n");
2305     }
2306 #endif
2307 
2308     MPlex2V res;
2309     SubtractFirst2(msPar, psPar, res);
2310 
2311     MPlex2S resErr;
2312     AddIntoUpperLeft2x2(psErr, msErr, resErr);
2313 
2314 #ifdef DEBUG
2315     {
2316       dmutex_guard;
2317       printf("resErr:\n");
2318       for (int i = 0; i < 2; ++i) {
2319         for (int j = 0; j < 2; ++j)
2320           printf("%8f ", resErr.At(0, i, j));
2321         printf("\n");
2322       }
2323       printf("\n");
2324     }
2325 #endif
2326 
2327     //invert the 2x2 matrix
2328     Matriplex::invertCramerSym(resErr);
2329 
2330     if (kfOp & KFO_Calculate_Chi2) {
2331       Chi2Similarity(res, resErr, outChi2);
2332 
2333 #ifdef DEBUG
2334       {
2335         dmutex_guard;
2336         printf("resErr_loc (Inv):\n");
2337         for (int i = 0; i < 2; ++i) {
2338           for (int j = 0; j < 2; ++j)
2339             printf("%8f ", resErr.At(0, i, j));
2340           printf("\n");
2341         }
2342         printf("\n");
2343         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
2344       }
2345 #endif
2346     }
2347 
2348     if (kfOp & KFO_Update_Params) {
2349       MPlexL2 K;
2350       KalmanGain(psErr, resErr, K);
2351 
2352       MultResidualsAdd(K, psPar, res, outPar);
2353 
2354       squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
2355 
2356       KHC(K, psErr, outErr);
2357 
2358 #ifdef DEBUG
2359       {
2360         dmutex_guard;
2361         printf("outErr before subtract:\n");
2362         for (int i = 0; i < 6; ++i) {
2363           for (int j = 0; j < 6; ++j)
2364             printf("%8f ", outErr.At(0, i, j));
2365           printf("\n");
2366         }
2367         printf("\n");
2368       }
2369 #endif
2370 
2371       outErr.subtract(psErr, outErr);
2372 
2373 #ifdef DEBUG
2374       {
2375         dmutex_guard;
2376         printf("res:\n");
2377         for (int i = 0; i < 2; ++i) {
2378           printf("%8f ", res.At(0, i, 0));
2379         }
2380         printf("\n");
2381         printf("resErr (Inv):\n");
2382         for (int i = 0; i < 2; ++i) {
2383           for (int j = 0; j < 2; ++j)
2384             printf("%8f ", resErr.At(0, i, j));
2385           printf("\n");
2386         }
2387         printf("\n");
2388         printf("K:\n");
2389         for (int i = 0; i < 6; ++i) {
2390           for (int j = 0; j < 2; ++j)
2391             printf("%8f ", K.At(0, i, j));
2392           printf("\n");
2393         }
2394         printf("\n");
2395         printf("outPar:\n");
2396         for (int i = 0; i < 6; ++i) {
2397           printf("%8f  ", outPar.At(0, i, 0));
2398         }
2399         printf("\n");
2400         printf("outErr:\n");
2401         for (int i = 0; i < 6; ++i) {
2402           for (int j = 0; j < 6; ++j)
2403             printf("%8f ", outErr.At(0, i, j));
2404           printf("\n");
2405         }
2406         printf("\n");
2407       }
2408 #endif
2409     }
2410   }
2411 
2412 }  // end namespace mkfit