0001 #include "KalmanUtilsMPlex.h"
0002 #include "PropagationMPlex.h"
0003 
0004 //#define DEBUG
0005 #include "Debug.h"
0006 
0007 #include "KalmanUtilsMPlex.icc"
0008 
0009 #include "RecoTracker/MkFitCore/interface/cms_common_macros.h"
0010 
0011 namespace {
0012   using namespace mkfit;
0013   using idx_t = Matriplex::idx_t;
0014 
0015   inline void MultResidualsAdd(const MPlexLH& A, const MPlexLV& B, const MPlex2V& C, MPlexLV& D) {
0016     // outPar = psPar + kalmanGain*(dPar)
0017     //   D    =   B         A         C
0018     // where the right half of the kalman gain is 0
0019 
0020     // XXX Regenerate with a script.
0021 
0022     MultResidualsAdd_imp(A, B, C, D, 0, NN);
0023   }
0024 
0025   inline void MultResidualsAdd(const MPlexL2& A, const MPlexLV& B, const MPlex2V& C, MPlexLV& D) {
0026     // outPar = psPar + kalmanGain*(dPar)
0027     //   D    =   B         A         C
0028     // where the right half of the kalman gain is 0
0029 
0030     // XXX Regenerate with a script.
0031 
0032     typedef float T;
0033     const idx_t N = NN;
0034 
0035     const T* a = A.fArray;
0036     ASSUME_ALIGNED(a, 64);
0037     const T* b = B.fArray;
0038     ASSUME_ALIGNED(b, 64);
0039     const T* c = C.fArray;
0040     ASSUME_ALIGNED(c, 64);
0041     T* d = D.fArray;
0042     ASSUME_ALIGNED(d, 64);
0043 
0044 #pragma omp simd
0045     for (idx_t n = 0; n < N; ++n) {
0046       // generated loop (small enough that it could also be written by hand)
0047       d[0 * N + n] = b[0 * N + n] + a[0 * N + n] * c[0 * N + n] + a[1 * N + n] * c[1 * N + n];
0048       d[1 * N + n] = b[1 * N + n] + a[2 * N + n] * c[0 * N + n] + a[3 * N + n] * c[1 * N + n];
0049       d[2 * N + n] = b[2 * N + n] + a[4 * N + n] * c[0 * N + n] + a[5 * N + n] * c[1 * N + n];
0050       d[3 * N + n] = b[3 * N + n] + a[6 * N + n] * c[0 * N + n] + a[7 * N + n] * c[1 * N + n];
0051       d[4 * N + n] = b[4 * N + n] + a[8 * N + n] * c[0 * N + n] + a[9 * N + n] * c[1 * N + n];
0052       d[5 * N + n] = b[5 * N + n] + a[10 * N + n] * c[0 * N + n] + a[11 * N + n] * c[1 * N + n];
0053     }
0054   }
0055 
0056   //------------------------------------------------------------------------------
0057 
0058   inline void Chi2Similarity(const MPlex2V& A,  //resPar
0059                              const MPlex2S& C,  //resErr
0060                              MPlexQF& D)        //outChi2
0061   {
0062     // outChi2 = (resPar)^T * resErr * (resPar)
0063     //   D     =    A      *    C   *      A
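         // Note: C is expected to already hold the *inverted* 2x2 residual covariance
         // (the callers run Matriplex::invertCramerSym on resErr first), so per slot n
         // this evaluates chi2 = C00*r0*r0 + C11*r1*r1 + 2*C01*r0*r1, i.e. the standard
         // chi2 = res^T * resErr^-1 * res for a 2-dimensional residual.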
0064 
0065     typedef float T;
0066     const idx_t N = NN;
0067 
0068     const T* a = A.fArray;
0069     ASSUME_ALIGNED(a, 64);
0070     const T* c = C.fArray;
0071     ASSUME_ALIGNED(c, 64);
0072     T* d = D.fArray;
0073     ASSUME_ALIGNED(d, 64);
0074 
0075 #pragma omp simd
0076     for (idx_t n = 0; n < N; ++n) {
0077       // generated loop (small enough that it could also be written by hand)
0078       d[0 * N + n] = c[0 * N + n] * a[0 * N + n] * a[0 * N + n] + c[2 * N + n] * a[1 * N + n] * a[1 * N + n] +
0079                      2 * (c[1 * N + n] * a[1 * N + n] * a[0 * N + n]);
0080     }
0081   }
0082 
0083   //------------------------------------------------------------------------------
0084 
0085   inline void AddIntoUpperLeft3x3(const MPlexLS& A, const MPlexHS& B, MPlexHS& C) {
0086     // The rest of the matrix is left untouched.
0087 
0088     typedef float T;
0089     const idx_t N = NN;
0090 
0091     const T* a = A.fArray;
0092     ASSUME_ALIGNED(a, 64);
0093     const T* b = B.fArray;
0094     ASSUME_ALIGNED(b, 64);
0095     T* c = C.fArray;
0096     ASSUME_ALIGNED(c, 64);
0097 
0098 #pragma omp simd
0099     for (idx_t n = 0; n < N; ++n) {
0100       c[0 * N + n] = a[0 * N + n] + b[0 * N + n];
0101       c[1 * N + n] = a[1 * N + n] + b[1 * N + n];
0102       c[2 * N + n] = a[2 * N + n] + b[2 * N + n];
0103       c[3 * N + n] = a[3 * N + n] + b[3 * N + n];
0104       c[4 * N + n] = a[4 * N + n] + b[4 * N + n];
0105       c[5 * N + n] = a[5 * N + n] + b[5 * N + n];
0106     }
0107   }
0108 
0109   inline void AddIntoUpperLeft2x2(const MPlexLS& A, const MPlexHS& B, MPlex2S& C) {
0110     // The rest of the matrix is left untouched.
0111 
0112     typedef float T;
0113     const idx_t N = NN;
0114 
0115     const T* a = A.fArray;
0116     ASSUME_ALIGNED(a, 64);
0117     const T* b = B.fArray;
0118     ASSUME_ALIGNED(b, 64);
0119     T* c = C.fArray;
0120     ASSUME_ALIGNED(c, 64);
0121 
0122 #pragma omp simd
0123     for (idx_t n = 0; n < N; ++n) {
0124       c[0 * N + n] = a[0 * N + n] + b[0 * N + n];
0125       c[1 * N + n] = a[1 * N + n] + b[1 * N + n];
0126       c[2 * N + n] = a[2 * N + n] + b[2 * N + n];
0127     }
0128   }
0129 
0130   //------------------------------------------------------------------------------
0131 
0132   inline void SubtractFirst3(const MPlexHV& A, const MPlexLV& B, MPlexHV& C) {
0133     // The rest of the matrix is left untouched.
0134 
0135     typedef float T;
0136     const idx_t N = NN;
0137 
0138     const T* a = A.fArray;
0139     ASSUME_ALIGNED(a, 64);
0140     const T* b = B.fArray;
0141     ASSUME_ALIGNED(b, 64);
0142     T* c = C.fArray;
0143     ASSUME_ALIGNED(c, 64);
0144 
0145 #pragma omp simd
0146     for (idx_t n = 0; n < N; ++n) {
0147       c[0 * N + n] = a[0 * N + n] - b[0 * N + n];
0148       c[1 * N + n] = a[1 * N + n] - b[1 * N + n];
0149       c[2 * N + n] = a[2 * N + n] - b[2 * N + n];
0150     }
0151   }
0152 
0153   inline void SubtractFirst2(const MPlexHV& A, const MPlexLV& B, MPlex2V& C) {
0154     // The rest of the matrix is left untouched.
0155 
0156     typedef float T;
0157     const idx_t N = NN;
0158 
0159     const T* a = A.fArray;
0160     ASSUME_ALIGNED(a, 64);
0161     const T* b = B.fArray;
0162     ASSUME_ALIGNED(b, 64);
0163     T* c = C.fArray;
0164     ASSUME_ALIGNED(c, 64);
0165 
0166 #pragma omp simd
0167     for (idx_t n = 0; n < N; ++n) {
0168       c[0 * N + n] = a[0 * N + n] - b[0 * N + n];
0169       c[1 * N + n] = a[1 * N + n] - b[1 * N + n];
0170     }
0171   }
0172 
0173   //==============================================================================
0174 
0175   inline void ProjectResErr(const MPlexQF& A00, const MPlexQF& A01, const MPlexHS& B, MPlexHH& C) {
0176     // C = A * B, C is 3x3, A is the 3x3 tangent-plane rotation assembled from (A00, A01), B is 3x3 sym
0177 
0178     // Based on script generation and adapted to custom sizes.
0179 
0180     typedef float T;
0181     const idx_t N = NN;
0182 
0183     const T* a00 = A00.fArray;
0184     ASSUME_ALIGNED(a00, 64);
0185     const T* a01 = A01.fArray;
0186     ASSUME_ALIGNED(a01, 64);
0187     const T* b = B.fArray;
0188     ASSUME_ALIGNED(b, 64);
0189     T* c = C.fArray;
0190     ASSUME_ALIGNED(c, 64);
0191 
0192 #pragma omp simd
0193     for (int n = 0; n < N; ++n) {
0194       c[0 * N + n] = a00[n] * b[0 * N + n] + a01[n] * b[1 * N + n];
0195       c[1 * N + n] = a00[n] * b[1 * N + n] + a01[n] * b[2 * N + n];
0196       c[2 * N + n] = a00[n] * b[3 * N + n] + a01[n] * b[4 * N + n];
0197       c[3 * N + n] = b[3 * N + n];
0198       c[4 * N + n] = b[4 * N + n];
0199       c[5 * N + n] = b[5 * N + n];
0200       c[6 * N + n] = a01[n] * b[0 * N + n] - a00[n] * b[1 * N + n];
0201       c[7 * N + n] = a01[n] * b[1 * N + n] - a00[n] * b[2 * N + n];
0202       c[8 * N + n] = a01[n] * b[3 * N + n] - a00[n] * b[4 * N + n];
0203     }
0204   }
0205 
0206   inline void ProjectResErrTransp(const MPlexQF& A00, const MPlexQF& A01, const MPlexHH& B, MPlex2S& C) {
0207     // C = B * A^T, C is 2x2 sym (only the upper-left block is kept), A is the 3x3 rotation, B is 3x3
0208 
0209     // Based on script generation and adapted to custom sizes.
0210 
0211     typedef float T;
0212     const idx_t N = NN;
0213 
0214     const T* a00 = A00.fArray;
0215     ASSUME_ALIGNED(a00, 64);
0216     const T* a01 = A01.fArray;
0217     ASSUME_ALIGNED(a01, 64);
0218     const T* b = B.fArray;
0219     ASSUME_ALIGNED(b, 64);
0220     T* c = C.fArray;
0221     ASSUME_ALIGNED(c, 64);
0222 
0223 #pragma omp simd
0224     for (int n = 0; n < N; ++n) {
0225       c[0 * N + n] = b[0 * N + n] * a00[n] + b[1 * N + n] * a01[n];
0226       c[1 * N + n] = b[3 * N + n] * a00[n] + b[4 * N + n] * a01[n];
0227       c[2 * N + n] = b[5 * N + n];
0228     }
0229   }
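       // Taken together, ProjectResErr followed by ProjectResErrTransp produce the 2x2
       // local block of R * resErr_glo * R^T, where R is the tangent-plane rotation
       // assembled from (A00, A01); see the rotation-matrix sketch in kalmanOperation below.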
0230 
0231   inline void RotateResidualsOnTangentPlane(const MPlexQF& R00,  //r00
0232                                             const MPlexQF& R01,  //r01
0233                                             const MPlexHV& A,    //res_glo
0234                                             MPlex2V& B)          //res_loc
0235   {
0236     RotateResidualsOnTangentPlane_impl(R00, R01, A, B, 0, NN);
0237   }
0238 
0239   //==============================================================================
0240 
0241   inline void ProjectResErr(const MPlex2H& A, const MPlexHS& B, MPlex2H& C) {
0242     // C = A * B, C is 2x3, A is 2x3 , B is 3x3 sym
0243 
0244     /*
0245     A 0 1 2
0246       3 4 5
0247     B 0 1 3
0248       1 2 4
0249       3 4 5
0250     */
0251 
0252     typedef float T;
0253     const idx_t N = NN;
0254 
0255     const T* a = A.fArray;
0256     ASSUME_ALIGNED(a, 64);
0257     const T* b = B.fArray;
0258     ASSUME_ALIGNED(b, 64);
0259     T* c = C.fArray;
0260     ASSUME_ALIGNED(c, 64);
0261 
0262 #pragma omp simd
0263     for (int n = 0; n < N; ++n) {
0264       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n];
0265       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[2 * N + n] + a[2 * N + n] * b[4 * N + n];
0266       c[2 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[5 * N + n];
0267       c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n] + a[5 * N + n] * b[3 * N + n];
0268       c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
0269       c[5 * N + n] = a[3 * N + n] * b[3 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[5 * N + n];
0270     }
0271   }
0272 
0273   inline void ProjectResErrTransp(const MPlex2H& A, const MPlex2H& B, MPlex2S& C) {
0274     // C = B * A^T, C is 2x2 sym, A is 2x3 (A^T is 3x2), B is 2x3
0275 
0276     /*
0277     B   0 1 2
0278         3 4 5
0279     A^T 0 3
0280         1 4
0281         2 5
0282     */
0283 
0284     typedef float T;
0285     const idx_t N = NN;
0286 
0287     const T* a = A.fArray;
0288     ASSUME_ALIGNED(a, 64);
0289     const T* b = B.fArray;
0290     ASSUME_ALIGNED(b, 64);
0291     T* c = C.fArray;
0292     ASSUME_ALIGNED(c, 64);
0293 
0294 #pragma omp simd
0295     for (int n = 0; n < N; ++n) {
0296       c[0 * N + n] = b[0 * N + n] * a[0 * N + n] + b[1 * N + n] * a[1 * N + n] + b[2 * N + n] * a[2 * N + n];
0297       c[1 * N + n] = b[0 * N + n] * a[3 * N + n] + b[1 * N + n] * a[4 * N + n] + b[2 * N + n] * a[5 * N + n];
0298       c[2 * N + n] = b[3 * N + n] * a[3 * N + n] + b[4 * N + n] * a[4 * N + n] + b[5 * N + n] * a[5 * N + n];
0299     }
0300   }
0301 
0302   inline void RotateResidualsOnPlane(const MPlex2H& R,  //prj
0303                                      const MPlexHV& A,  //res_glo
0304                                      MPlex2V& B)        //res_loc
0305   {
0306     // typedef float T;
0307     // const idx_t N = NN;
0308 
0309     // const T* a = A.fArray;
0310     // ASSUME_ALIGNED(a, 64);
0311     // T* b = B.fArray;
0312     // ASSUME_ALIGNED(b, 64);
0313     // const T* r = R.fArray;
0314     // ASSUME_ALIGNED(r, 64);
0315 
0316 #pragma omp simd
0317     for (int n = 0; n < NN; ++n) {
0318       B(n, 0, 0) = R(n, 0, 0) * A(n, 0, 0) + R(n, 0, 1) * A(n, 1, 0) + R(n, 0, 2) * A(n, 2, 0);
0319       B(n, 1, 0) = R(n, 1, 0) * A(n, 0, 0) + R(n, 1, 1) * A(n, 1, 0) + R(n, 1, 2) * A(n, 2, 0);
0320     }
0321   }
0322 
0323   inline void KalmanHTG(const MPlexQF& A00, const MPlexQF& A01, const MPlex2S& B, MPlexHH& C) {
0324     // HTG  = rot * resErr_loc  (H^T * G, with G the inverted local residual covariance)
0325     //   C  =  A  *    B
0326 
0327     // Based on script generation and adapted to custom sizes.
0328 
0329     typedef float T;
0330     const idx_t N = NN;
0331 
0332     const T* a00 = A00.fArray;
0333     ASSUME_ALIGNED(a00, 64);
0334     const T* a01 = A01.fArray;
0335     ASSUME_ALIGNED(a01, 64);
0336     const T* b = B.fArray;
0337     ASSUME_ALIGNED(b, 64);
0338     T* c = C.fArray;
0339     ASSUME_ALIGNED(c, 64);
0340 
0341 #pragma omp simd
0342     for (int n = 0; n < N; ++n) {
0343       c[0 * N + n] = a00[n] * b[0 * N + n];
0344       c[1 * N + n] = a00[n] * b[1 * N + n];
0345       c[2 * N + n] = 0.;
0346       c[3 * N + n] = a01[n] * b[0 * N + n];
0347       c[4 * N + n] = a01[n] * b[1 * N + n];
0348       c[5 * N + n] = 0.;
0349       c[6 * N + n] = b[1 * N + n];
0350       c[7 * N + n] = b[2 * N + n];
0351       c[8 * N + n] = 0.;
0352     }
0353   }
0354 
0355   inline void KalmanGain(const MPlexLS& A, const MPlexHH& B, MPlexLH& C) {
0356     // C = A * B, C is 6x3, A is 6x6 sym , B is 3x3
0357 
0358     typedef float T;
0359     const idx_t N = NN;
0360 
0361     const T* a = A.fArray;
0362     ASSUME_ALIGNED(a, 64);
0363     const T* b = B.fArray;
0364     ASSUME_ALIGNED(b, 64);
0365     T* c = C.fArray;
0366     ASSUME_ALIGNED(c, 64);
0367 
0368 #pragma omp simd
0369     for (int n = 0; n < N; ++n) {
0370       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[3 * N + n] + a[3 * N + n] * b[6 * N + n];
0371       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[4 * N + n] + a[3 * N + n] * b[7 * N + n];
0372       c[2 * N + n] = 0;
0373       c[3 * N + n] = a[1 * N + n] * b[0 * N + n] + a[2 * N + n] * b[3 * N + n] + a[4 * N + n] * b[6 * N + n];
0374       c[4 * N + n] = a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[4 * N + n] + a[4 * N + n] * b[7 * N + n];
0375       c[5 * N + n] = 0;
0376       c[6 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[6 * N + n];
0377       c[7 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[7 * N + n];
0378       c[8 * N + n] = 0;
0379       c[9 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[6 * N + n];
0380       c[10 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[4 * N + n] + a[8 * N + n] * b[7 * N + n];
0381       c[11 * N + n] = 0;
0382       c[12 * N + n] = a[10 * N + n] * b[0 * N + n] + a[11 * N + n] * b[3 * N + n] + a[12 * N + n] * b[6 * N + n];
0383       c[13 * N + n] = a[10 * N + n] * b[1 * N + n] + a[11 * N + n] * b[4 * N + n] + a[12 * N + n] * b[7 * N + n];
0384       c[14 * N + n] = 0;
0385       c[15 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[3 * N + n] + a[17 * N + n] * b[6 * N + n];
0386       c[16 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[4 * N + n] + a[17 * N + n] * b[7 * N + n];
0387       c[17 * N + n] = 0;
0388     }
0389   }
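       // In standard Kalman-filter notation this is the gain K = P * (H^T * S^-1), where
       // P = psErr, (H^T * S^-1) is the matrix produced by KalmanHTG above, and S is the
       // 2x2 local residual covariance resErr_loc, already inverted by the caller.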
0390 
0391   inline void KalmanHTG(const MPlex2H& A, const MPlex2S& B, MPlexH2& C) {
0392     // HTG  = prj^T * resErr_loc  (H^T * G, with G the inverted local residual covariance)
0393     //   C  =  A^T  *   B
0394 
0395     /*
0396     A^T 0 3
0397         1 4
0398         2 5
0399     B 0 1
0400       1 2
0401     C 0 1
0402       2 3
0403       4 5
0404     */
0405 
0406     typedef float T;
0407     const idx_t N = NN;
0408 
0409     const T* a = A.fArray;
0410     ASSUME_ALIGNED(a, 64);
0411     const T* b = B.fArray;
0412     ASSUME_ALIGNED(b, 64);
0413     T* c = C.fArray;
0414     ASSUME_ALIGNED(c, 64);
0415 
0416 #pragma omp simd
0417     for (int n = 0; n < N; ++n) {
0418       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[3 * N + n] * b[1 * N + n];
0419       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[3 * N + n] * b[2 * N + n];
0420       c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n];
0421       c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n];
0422       c[4 * N + n] = a[2 * N + n] * b[0 * N + n] + a[5 * N + n] * b[1 * N + n];
0423       c[5 * N + n] = a[2 * N + n] * b[1 * N + n] + a[5 * N + n] * b[2 * N + n];
0424     }
0425   }
0426 
0427   inline void KalmanGain(const MPlexLS& A, const MPlexH2& B, MPlexL2& C) {
0428     // C = A * B, C is 6x2, A is 6x6 sym, B is 3x2 (conceptually 6x2, but the lower half is all zeros and is not stored)
0429 
0430     /*
0431       A 0  1  3  6 10 15
0432         1  2  4  7 11 16
0433         3  4  5  8 12 17
0434         6  7  8  9 13 18
0435        10 11 12 13 14 19
0436        15 16 17 18 19 20
0437       B 0  1
0438         2  3
0439         4  5
0440         X  X with X=0, so not even included in B
0441         X  X
0442         X  X
0443       C 0  1
0444         2  3
0445         4  5
0446         6  7
0447         8  9
0448        10 11
0449      */
0450 
0451     typedef float T;
0452     const idx_t N = NN;
0453 
0454     const T* a = A.fArray;
0455     ASSUME_ALIGNED(a, 64);
0456     const T* b = B.fArray;
0457     ASSUME_ALIGNED(b, 64);
0458     T* c = C.fArray;
0459     ASSUME_ALIGNED(c, 64);
0460 
0461 #pragma omp simd
0462     for (int n = 0; n < N; ++n) {
0463       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[2 * N + n] + a[3 * N + n] * b[4 * N + n];
0464       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[3 * N + n] + a[3 * N + n] * b[5 * N + n];
0465       c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[2 * N + n] * b[2 * N + n] + a[4 * N + n] * b[4 * N + n];
0466       c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n] + a[4 * N + n] * b[5 * N + n];
0467       c[4 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
0468       c[5 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[5 * N + n];
0469       c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[2 * N + n] + a[8 * N + n] * b[4 * N + n];
0470       c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[5 * N + n];
0471       c[8 * N + n] = a[10 * N + n] * b[0 * N + n] + a[11 * N + n] * b[2 * N + n] + a[12 * N + n] * b[4 * N + n];
0472       c[9 * N + n] = a[10 * N + n] * b[1 * N + n] + a[11 * N + n] * b[3 * N + n] + a[12 * N + n] * b[5 * N + n];
0473       c[10 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[2 * N + n] + a[17 * N + n] * b[4 * N + n];
0474       c[11 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[3 * N + n] + a[17 * N + n] * b[5 * N + n];
0475     }
0476   }
0477 
0478   inline void CovXYconstrain(const MPlexQF& R00, const MPlexQF& R01, const MPlexLS& Ci, MPlexLS& Co) {
0479     // C is transformed to align along y after rotation and rotated back
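         // Effectively, per slot n, with u = (r00, r01) (approximately a unit vector):
         //   sigma = u^T * Ci_xy * u   (variance of the xy block along u),
         //   Co_xy = sigma * u * u^T   (only the component along u is kept),
         // and the covariances of x,y with the remaining parameters are projected onto u.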
0480 
0481     typedef float T;
0482     const idx_t N = NN;
0483 
0484     const T* r00 = R00.fArray;
0485     ASSUME_ALIGNED(r00, 64);
0486     const T* r01 = R01.fArray;
0487     ASSUME_ALIGNED(r01, 64);
0488     const T* ci = Ci.fArray;
0489     ASSUME_ALIGNED(ci, 64);
0490     T* co = Co.fArray;
0491     ASSUME_ALIGNED(co, 64);
0492 
0493 #pragma omp simd
0494     for (int n = 0; n < N; ++n) {
0495       // a bit loopy to avoid temporaries
0496       co[0 * N + n] =
0497           r00[n] * r00[n] * ci[0 * N + n] + 2 * r00[n] * r01[n] * ci[1 * N + n] + r01[n] * r01[n] * ci[2 * N + n];
0498       co[1 * N + n] = r00[n] * r01[n] * co[0 * N + n];
0499       co[2 * N + n] = r01[n] * r01[n] * co[0 * N + n];
0500       co[0 * N + n] = r00[n] * r00[n] * co[0 * N + n];
0501 
0502       co[3 * N + n] = r00[n] * ci[3 * N + n] + r01[n] * ci[4 * N + n];
0503       co[4 * N + n] = r01[n] * co[3 * N + n];
0504       co[3 * N + n] = r00[n] * co[3 * N + n];
0505 
0506       co[6 * N + n] = r00[n] * ci[6 * N + n] + r01[n] * ci[7 * N + n];
0507       co[7 * N + n] = r01[n] * co[6 * N + n];
0508       co[6 * N + n] = r00[n] * co[6 * N + n];
0509 
0510       co[10 * N + n] = r00[n] * ci[10 * N + n] + r01[n] * ci[11 * N + n];
0511       co[11 * N + n] = r01[n] * co[10 * N + n];
0512       co[10 * N + n] = r00[n] * co[10 * N + n];
0513 
0514       co[15 * N + n] = r00[n] * ci[15 * N + n] + r01[n] * ci[16 * N + n];
0515       co[16 * N + n] = r01[n] * co[15 * N + n];
0516       co[15 * N + n] = r00[n] * co[15 * N + n];
0517     }
0518   }
0519 
0520   void KalmanGain(const MPlexLS& A, const MPlex2S& B, MPlexL2& C) {
0521     // C = A * B, C is 6x2, A is 6x6 sym , B is 2x2
0522 
0523     typedef float T;
0524     const idx_t N = NN;
0525 
0526     const T* a = A.fArray;
0527     ASSUME_ALIGNED(a, 64);
0528     const T* b = B.fArray;
0529     ASSUME_ALIGNED(b, 64);
0530     T* c = C.fArray;
0531     ASSUME_ALIGNED(c, 64);
0532 
0533 #include "KalmanGain62.ah"
0534   }
0535 
0536   inline void KHMult(const MPlexLH& A, const MPlexQF& B00, const MPlexQF& B01, MPlexLL& C) {
0537     // C = A * B, C is 6x6, A is 6x3 , B is 3x6
0538     KHMult_imp(A, B00, B01, C, 0, NN);
0539   }
0540 
0541   inline void KHMult(const MPlexL2& A, const MPlex2H& B, MPlexLL& C) {
0542     // C = A * B, C is 6x6, A is 6x2, B is 2x3 (conceptually 2x6, but the right half is all zeros and is not stored)
0543 
0544     /*
0545     A 0  1
0546       2  3
0547       4  5
0548       6  7
0549       8  9
0550      10 11
0551     B  0  1  2  X  X  X with X=0 so not included in B
0552        3  4  5  X  X  X
0553     C  0  1  2  3  4  5
0554        6  7  8  9 10 11
0555       12 13 14 15 16 17
0556       18 19 20 21 22 23
0557       24 25 26 27 28 29
0558       30 31 32 33 34 35
0559     */
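         // The zero entries (columns 3-5 of every row of C) reflect that the 2x6 projection
         // H only involves the first three (position) components of the state.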
0560 
0561     // typedef float T;
0562     // const idx_t N = NN;
0563 
0564     // const T* a = A.fArray;
0565     // ASSUME_ALIGNED(a, 64);
0566     // const T* b = B.fArray;
0567     // ASSUME_ALIGNED(b, 64);
0568     // T* c = C.fArray;
0569     // ASSUME_ALIGNED(c, 64);
0570 
0571 #pragma omp simd
0572     for (int n = 0; n < NN; ++n) {
0573       C(n, 0, 0) = A(n, 0, 0) * B(n, 0, 0) + A(n, 0, 1) * B(n, 1, 0);
0574       C(n, 0, 1) = A(n, 0, 0) * B(n, 0, 1) + A(n, 0, 1) * B(n, 1, 1);
0575       C(n, 0, 2) = A(n, 0, 0) * B(n, 0, 2) + A(n, 0, 1) * B(n, 1, 2);
0576       C(n, 0, 3) = 0;
0577       C(n, 0, 4) = 0;
0578       C(n, 0, 5) = 0;
0579       C(n, 0, 6) = A(n, 1, 0) * B(n, 0, 0) + A(n, 1, 1) * B(n, 1, 0);
0580       C(n, 0, 7) = A(n, 1, 0) * B(n, 0, 1) + A(n, 1, 1) * B(n, 1, 1);
0581       C(n, 0, 8) = A(n, 1, 0) * B(n, 0, 2) + A(n, 1, 1) * B(n, 1, 2);
0582       C(n, 0, 9) = 0;
0583       C(n, 0, 10) = 0;
0584       C(n, 0, 11) = 0;
0585       C(n, 0, 12) = A(n, 2, 0) * B(n, 0, 0) + A(n, 2, 1) * B(n, 1, 0);
0586       C(n, 0, 13) = A(n, 2, 0) * B(n, 0, 1) + A(n, 2, 1) * B(n, 1, 1);
0587       C(n, 0, 14) = A(n, 2, 0) * B(n, 0, 2) + A(n, 2, 1) * B(n, 1, 2);
0588       C(n, 0, 15) = 0;
0589       C(n, 0, 16) = 0;
0590       C(n, 0, 17) = 0;
0591       C(n, 0, 18) = A(n, 3, 0) * B(n, 0, 0) + A(n, 3, 1) * B(n, 1, 0);
0592       C(n, 0, 19) = A(n, 3, 0) * B(n, 0, 1) + A(n, 3, 1) * B(n, 1, 1);
0593       C(n, 0, 20) = A(n, 3, 0) * B(n, 0, 2) + A(n, 3, 1) * B(n, 1, 2);
0594       C(n, 0, 21) = 0;
0595       C(n, 0, 22) = 0;
0596       C(n, 0, 23) = 0;
0597       C(n, 0, 24) = A(n, 4, 0) * B(n, 0, 0) + A(n, 4, 1) * B(n, 1, 0);
0598       C(n, 0, 25) = A(n, 4, 0) * B(n, 0, 1) + A(n, 4, 1) * B(n, 1, 1);
0599       C(n, 0, 26) = A(n, 4, 0) * B(n, 0, 2) + A(n, 4, 1) * B(n, 1, 2);
0600       C(n, 0, 27) = 0;
0601       C(n, 0, 28) = 0;
0602       C(n, 0, 29) = 0;
0603       C(n, 0, 30) = A(n, 5, 0) * B(n, 0, 0) + A(n, 5, 1) * B(n, 1, 0);
0604       C(n, 0, 31) = A(n, 5, 0) * B(n, 0, 1) + A(n, 5, 1) * B(n, 1, 1);
0605       C(n, 0, 32) = A(n, 5, 0) * B(n, 0, 2) + A(n, 5, 1) * B(n, 1, 2);
0606       C(n, 0, 33) = 0;
0607       C(n, 0, 34) = 0;
0608       C(n, 0, 35) = 0;
0609     }
0610   }
0611 
0612   inline void KHC(const MPlexLL& A, const MPlexLS& B, MPlexLS& C) {
0613     // C = A * B, C is 6x6 sym, A is 6x6, B is 6x6 sym
0614 
0615     typedef float T;
0616     const idx_t N = NN;
0617 
0618     const T* a = A.fArray;
0619     ASSUME_ALIGNED(a, 64);
0620     const T* b = B.fArray;
0621     ASSUME_ALIGNED(b, 64);
0622     T* c = C.fArray;
0623     ASSUME_ALIGNED(c, 64);
0624 
0625 #include "KHC.ah"
0626   }
0627 
0628   inline void KHC(const MPlexL2& A, const MPlexLS& B, MPlexLS& C) {
0629     // C = A * B, C is 6x6 sym, A is 6x2 , B is 6x6 sym
0630 
0631     typedef float T;
0632     const idx_t N = NN;
0633 
0634     const T* a = A.fArray;
0635     ASSUME_ALIGNED(a, 64);
0636     const T* b = B.fArray;
0637     ASSUME_ALIGNED(b, 64);
0638     T* c = C.fArray;
0639     ASSUME_ALIGNED(c, 64);
0640 
0641 #include "K62HC.ah"
0642   }
0643 
0644   //Warning: MultFull is not vectorized!
0645   template <typename T1, typename T2, typename T3>
0646   void MultFull(const T1& A, int nia, int nja, const T2& B, int nib, int njb, T3& C, int nic, int njc) {
0647 #ifdef DEBUG
0648     assert(nja == nib);
0649     assert(nia == nic);
0650     assert(njb == njc);
0651 #endif
0652     for (int n = 0; n < NN; ++n) {
0653       for (int i = 0; i < nia; ++i) {
0654         for (int j = 0; j < njb; ++j) {
0655           C(n, i, j) = 0.;
0656           for (int k = 0; k < nja; ++k)
0657             C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, k, j);
0658         }
0659       }
0660     }
0661   }
0662 
0663   //Warning: MultTranspFull is not vectorized!
0664   // (careful about which operand is transposed: B is accessed as B(n, j, k), so this computes C = A * B^T)
0665   template <typename T1, typename T2, typename T3>
0666   void MultTranspFull(const T1& A, int nia, int nja, const T2& B, int nib, int njb, T3& C, int nic, int njc) {
0667 #ifdef DEBUG
0668     assert(nja == njb);
0669     assert(nia == nic);
0670     assert(nib == njc);
0671 #endif
0672     for (int n = 0; n < NN; ++n) {
0673       for (int i = 0; i < nia; ++i) {
0674         for (int j = 0; j < nib; ++j) {
0675           C(n, i, j) = 0.;
0676           for (int k = 0; k < nja; ++k)
0677             C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, j, k);
0678         }
0679       }
0680     }
0681   }
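       // Hypothetical usage sketch (names not from this file), applied per Matriplex slot:
       //   MultFull      (A, 3, 3, B, 3, 2, C, 3, 2);   // C = A * B
       //   MultTranspFull(A, 3, 3, B, 2, 3, C, 3, 2);   // C = A * B^T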
0682 
0683 }  // namespace
0684 
0685 //==============================================================================
0686 // Kalman operations - common dummy variables
0687 //==============================================================================
0688 
0689 namespace {
0690   // Dummy variables passed to kalmanOperation for signature consistency.
0691   // The KalmanFilterOperation enum argument guarantees that they are never
0692   // accessed in the code (neither read from nor written to).
0693 
0694   CMS_SA_ALLOW mkfit::MPlexLS dummy_err;
0695   CMS_SA_ALLOW mkfit::MPlexLV dummy_par;
0696   CMS_SA_ALLOW mkfit::MPlexQF dummy_chi2;
0697 }  // namespace
0698 
0699 namespace mkfit {
0700 
0701   //==============================================================================
0702   // Kalman operations - Barrel
0703   //==============================================================================
0704 
0705   void kalmanUpdate(const MPlexLS& psErr,
0706                     const MPlexLV& psPar,
0707                     const MPlexHS& msErr,
0708                     const MPlexHV& msPar,
0709                     MPlexLS& outErr,
0710                     MPlexLV& outPar,
0711                     const int N_proc) {
0712     kalmanOperation(KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0713   }
0714 
0715   void kalmanPropagateAndUpdate(const MPlexLS& psErr,
0716                                 const MPlexLV& psPar,
0717                                 MPlexQI& Chg,
0718                                 const MPlexHS& msErr,
0719                                 const MPlexHV& msPar,
0720                                 MPlexLS& outErr,
0721                                 MPlexLV& outPar,
0722                                 MPlexQI& outFailFlag,
0723                                 const int N_proc,
0724                                 const PropagationFlags& propFlags,
0725                                 const bool propToHit) {
0726     if (propToHit) {
0727       MPlexLS propErr;
0728       MPlexLV propPar;
0729       MPlexQF msRad;
0730 #pragma omp simd
0731       for (int n = 0; n < NN; ++n) {
0732         msRad.At(n, 0, 0) = std::hypot(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0733       }
0734 
0735       propagateHelixToRMPlex(psErr, psPar, Chg, msRad, propErr, propPar, outFailFlag, N_proc, propFlags);
0736 
0737       kalmanOperation(
0738           KFO_Update_Params | KFO_Local_Cov, propErr, propPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0739     } else {
0740       kalmanOperation(
0741           KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0742     }
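         // If the updated parameter 3 (1/pT) came out negative, flip the stored charge and
         // negate the parameter so that it stays positive.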
0743     for (int n = 0; n < NN; ++n) {
0744       if (n < N_proc && outPar.At(n, 3, 0) < 0) {
0745         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
0746         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
0747       }
0748     }
0749   }
0750 
0751   //------------------------------------------------------------------------------
0752 
0753   void kalmanComputeChi2(const MPlexLS& psErr,
0754                          const MPlexLV& psPar,
0755                          const MPlexQI& inChg,
0756                          const MPlexHS& msErr,
0757                          const MPlexHV& msPar,
0758                          MPlexQF& outChi2,
0759                          const int N_proc) {
0760     kalmanOperation(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0761   }
0762 
0763   void kalmanPropagateAndComputeChi2(const MPlexLS& psErr,
0764                                      const MPlexLV& psPar,
0765                                      const MPlexQI& inChg,
0766                                      const MPlexHS& msErr,
0767                                      const MPlexHV& msPar,
0768                                      MPlexQF& outChi2,
0769                                      MPlexLV& propPar,
0770                                      MPlexQI& outFailFlag,
0771                                      const int N_proc,
0772                                      const PropagationFlags& propFlags,
0773                                      const bool propToHit) {
0774     propPar = psPar;
0775     if (propToHit) {
0776       MPlexLS propErr;
0777       MPlexQF msRad;
0778 #pragma omp simd
0779       for (int n = 0; n < NN; ++n) {
0780         if (n < N_proc) {
0781           msRad.At(n, 0, 0) = std::hypot(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0782         } else {
0783           msRad.At(n, 0, 0) = 0.0f;
0784         }
0785       }
0786 
0787       propagateHelixToRMPlex(psErr, psPar, inChg, msRad, propErr, propPar, outFailFlag, N_proc, propFlags);
0788 
0789       kalmanOperation(KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0790     } else {
0791       kalmanOperation(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0792     }
0793   }
0794 
0795   //------------------------------------------------------------------------------
0796 
0797   void kalmanOperation(const int kfOp,
0798                        const MPlexLS& psErr,
0799                        const MPlexLV& psPar,
0800                        const MPlexHS& msErr,
0801                        const MPlexHV& msPar,
0802                        MPlexLS& outErr,
0803                        MPlexLV& outPar,
0804                        MPlexQF& outChi2,
0805                        const int N_proc) {
0806 #ifdef DEBUG
0807     {
0808       dmutex_guard;
0809       printf("psPar:\n");
0810       for (int i = 0; i < 6; ++i) {
0811         printf("%8f ", psPar.constAt(0, 0, i));
0812         printf("\n");
0813       }
0814       printf("\n");
0815       printf("psErr:\n");
0816       for (int i = 0; i < 6; ++i) {
0817         for (int j = 0; j < 6; ++j)
0818           printf("%8f ", psErr.constAt(0, i, j));
0819         printf("\n");
0820       }
0821       printf("\n");
0822       printf("msPar:\n");
0823       for (int i = 0; i < 3; ++i) {
0824         printf("%8f ", msPar.constAt(0, 0, i));
0825         printf("\n");
0826       }
0827       printf("\n");
0828       printf("msErr:\n");
0829       for (int i = 0; i < 3; ++i) {
0830         for (int j = 0; j < 3; ++j)
0831           printf("%8f ", msErr.constAt(0, i, j));
0832         printf("\n");
0833       }
0834       printf("\n");
0835     }
0836 #endif
0837 
0838     // Rotate global coordinates onto the plane tangent to the cylinder.
0839     // The tangent point is halfway between the hit and the propagated position.
0840 
0841     // Rotation matrix
0842     //  rotT00  0  rotT01
0843     //  rotT01  0 -rotT00
0844     //     0    1    0
0845     // Minimize temporaries: only two floats are needed!
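         // (rotT00, rotT01) below is approximately the unit vector tangent to the cylinder
         // at the average azimuth of the hit and the propagated state, so the local frame
         // is roughly (tangential, z, radial), with the residual kept in the first two.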
0846 
0847     MPlexQF rotT00;
0848     MPlexQF rotT01;
0849     for (int n = 0; n < NN; ++n) {
0850       if (n < N_proc) {
0851         const float r = std::hypot(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0852         rotT00.At(n, 0, 0) = -(msPar.constAt(n, 1, 0) + psPar.constAt(n, 1, 0)) / (2 * r);
0853         rotT01.At(n, 0, 0) = (msPar.constAt(n, 0, 0) + psPar.constAt(n, 0, 0)) / (2 * r);
0854       } else {
0855         rotT00.At(n, 0, 0) = 0.0f;
0856         rotT01.At(n, 0, 0) = 0.0f;
0857       }
0858     }
0859 
0860     MPlexHV res_glo;  //position residual in global coordinates
0861     SubtractFirst3(msPar, psPar, res_glo);
0862 
0863     MPlexHS resErr_glo;  //covariance sum in global position coordinates
0864     AddIntoUpperLeft3x3(psErr, msErr, resErr_glo);
0865 
0866     MPlex2V res_loc;  //position residual in local coordinates
0867     RotateResidualsOnTangentPlane(rotT00, rotT01, res_glo, res_loc);
0868     MPlex2S resErr_loc;  //covariance sum in local position coordinates
0869     MPlexHH tempHH;
0870     ProjectResErr(rotT00, rotT01, resErr_glo, tempHH);
0871     ProjectResErrTransp(rotT00, rotT01, tempHH, resErr_loc);
0872 
0873 #ifdef DEBUG
0874     {
0875       dmutex_guard;
0876       printf("res_glo:\n");
0877       for (int i = 0; i < 3; ++i) {
0878         printf("%8f ", res_glo.At(0, i, 0));
0879       }
0880       printf("\n");
0881       printf("resErr_glo:\n");
0882       for (int i = 0; i < 3; ++i) {
0883         for (int j = 0; j < 3; ++j)
0884           printf("%8f ", resErr_glo.At(0, i, j));
0885         printf("\n");
0886       }
0887       printf("\n");
0888       printf("res_loc:\n");
0889       for (int i = 0; i < 2; ++i) {
0890         printf("%8f ", res_loc.At(0, i, 0));
0891       }
0892       printf("\n");
0893       printf("tempHH:\n");
0894       for (int i = 0; i < 3; ++i) {
0895         for (int j = 0; j < 3; ++j)
0896           printf("%8f ", tempHH.At(0, i, j));
0897         printf("\n");
0898       }
0899       printf("\n");
0900       printf("resErr_loc:\n");
0901       for (int i = 0; i < 2; ++i) {
0902         for (int j = 0; j < 2; ++j)
0903           printf("%8f ", resErr_loc.At(0, i, j));
0904         printf("\n");
0905       }
0906       printf("\n");
0907     }
0908 #endif
0909 
0910     //invert the 2x2 matrix
0911     Matriplex::invertCramerSym(resErr_loc);
0912 
0913     if (kfOp & KFO_Calculate_Chi2) {
0914       Chi2Similarity(res_loc, resErr_loc, outChi2);
0915 
0916 #ifdef DEBUG
0917       {
0918         dmutex_guard;
0919         printf("resErr_loc (Inv):\n");
0920         for (int i = 0; i < 2; ++i) {
0921           for (int j = 0; j < 2; ++j)
0922             printf("%8f ", resErr_loc.At(0, i, j));
0923           printf("\n");
0924         }
0925         printf("\n");
0926         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
0927       }
0928 #endif
0929     }
0930 
0931     if (kfOp & KFO_Update_Params) {
0932       MPlexLS psErrLoc = psErr;
0933       if (kfOp & KFO_Local_Cov)
0934         CovXYconstrain(rotT00, rotT01, psErr, psErrLoc);
0935 
0936       MPlexLH K;                                      // kalman gain, fixme should be L2
0937       KalmanHTG(rotT00, rotT01, resErr_loc, tempHH);  // intermediate term to get kalman gain (H^T*G)
0938       KalmanGain(psErrLoc, tempHH, K);
0939 
0940       MultResidualsAdd(K, psPar, res_loc, outPar);
0941 
0942       squashPhiMPlex(outPar, N_proc);  // ensure phi is within [-pi, pi]
0943 
0944       MPlexLL tempLL;
0945       KHMult(K, rotT00, rotT01, tempLL);
0946       KHC(tempLL, psErrLoc, outErr);
0947       outErr.subtract(psErrLoc, outErr);
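           // outErr now holds psErrLoc - K*H*psErrLoc, i.e. the standard covariance update
           // P' = (I - K*H) * P applied to the (possibly xy-constrained) input covariance.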
0948 
0949 #ifdef DEBUG
0950       {
0951         dmutex_guard;
0952         if (kfOp & KFO_Local_Cov) {
0953           printf("psErrLoc:\n");
0954           for (int i = 0; i < 6; ++i) {
0955             for (int j = 0; j < 6; ++j)
0956               printf("% 8e ", psErrLoc.At(0, i, j));
0957             printf("\n");
0958           }
0959           printf("\n");
0960         }
0961         printf("resErr_loc (Inv):\n");
0962         for (int i = 0; i < 2; ++i) {
0963           for (int j = 0; j < 2; ++j)
0964             printf("%8f ", resErr_loc.At(0, i, j));
0965           printf("\n");
0966         }
0967         printf("\n");
0968         printf("tempHH:\n");
0969         for (int i = 0; i < 3; ++i) {
0970           for (int j = 0; j < 3; ++j)
0971             printf("%8f ", tempHH.At(0, i, j));
0972           printf("\n");
0973         }
0974         printf("\n");
0975         printf("K:\n");
0976         for (int i = 0; i < 6; ++i) {
0977           for (int j = 0; j < 3; ++j)
0978             printf("%8f ", K.At(0, i, j));
0979           printf("\n");
0980         }
0981         printf("\n");
0982         printf("tempLL:\n");
0983         for (int i = 0; i < 6; ++i) {
0984           for (int j = 0; j < 6; ++j)
0985             printf("%8f ", tempLL.At(0, i, j));
0986           printf("\n");
0987         }
0988         printf("\n");
0989         printf("outPar:\n");
0990         for (int i = 0; i < 6; ++i) {
0991           printf("%8f  ", outPar.At(0, i, 0));
0992         }
0993         printf("\n");
0994         printf("outErr:\n");
0995         for (int i = 0; i < 6; ++i) {
0996           for (int j = 0; j < 6; ++j)
0997             printf("%8f ", outErr.At(0, i, j));
0998           printf("\n");
0999         }
1000         printf("\n");
1001       }
1002 #endif
1003     }
1004   }
1005 
1006   //==============================================================================
1007   // Kalman operations - Plane
1008   //==============================================================================
1009 
1010   void kalmanUpdatePlane(const MPlexLS& psErr,
1011                          const MPlexLV& psPar,
1012                          const MPlexHS& msErr,
1013                          const MPlexHV& msPar,
1014                          const MPlexHV& plNrm,
1015                          const MPlexHV& plDir,
1016                          MPlexLS& outErr,
1017                          MPlexLV& outPar,
1018                          const int N_proc) {
1019     kalmanOperationPlane(
1020         KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, plNrm, plDir, outErr, outPar, dummy_chi2, N_proc);
1021   }
1022 
1023   void kalmanPropagateAndUpdatePlane(const MPlexLS& psErr,
1024                                      const MPlexLV& psPar,
1025                                      MPlexQI& Chg,
1026                                      const MPlexHS& msErr,
1027                                      const MPlexHV& msPar,
1028                                      const MPlexHV& plNrm,
1029                                      const MPlexHV& plDir,
1030                                      MPlexLS& outErr,
1031                                      MPlexLV& outPar,
1032                                      MPlexQI& outFailFlag,
1033                                      const int N_proc,
1034                                      const PropagationFlags& propFlags,
1035                                      const bool propToHit) {
1036     if (propToHit) {
1037       MPlexLS propErr;
1038       MPlexLV propPar;
1039       propagateHelixToPlaneMPlex(psErr, psPar, Chg, msPar, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
1040 
1041       kalmanOperationPlane(KFO_Update_Params | KFO_Local_Cov,
1042                            propErr,
1043                            propPar,
1044                            msErr,
1045                            msPar,
1046                            plNrm,
1047                            plDir,
1048                            outErr,
1049                            outPar,
1050                            dummy_chi2,
1051                            N_proc);
1052     } else {
1053       kalmanOperationPlane(KFO_Update_Params | KFO_Local_Cov,
1054                            psErr,
1055                            psPar,
1056                            msErr,
1057                            msPar,
1058                            plNrm,
1059                            plDir,
1060                            outErr,
1061                            outPar,
1062                            dummy_chi2,
1063                            N_proc);
1064     }
1065     for (int n = 0; n < NN; ++n) {
1066       if (n < N_proc && outPar.At(n, 3, 0) < 0) {
1067         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
1068         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
1069       }
1070     }
1071   }
1072 
1073   //------------------------------------------------------------------------------
1074 
1075   void kalmanComputeChi2Plane(const MPlexLS& psErr,
1076                               const MPlexLV& psPar,
1077                               const MPlexQI& inChg,
1078                               const MPlexHS& msErr,
1079                               const MPlexHV& msPar,
1080                               const MPlexHV& plNrm,
1081                               const MPlexHV& plDir,
1082                               MPlexQF& outChi2,
1083                               const int N_proc) {
1084     kalmanOperationPlane(
1085         KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
1086   }
1087 
1088   void kalmanPropagateAndComputeChi2Plane(const MPlexLS& psErr,
1089                                           const MPlexLV& psPar,
1090                                           const MPlexQI& inChg,
1091                                           const MPlexHS& msErr,
1092                                           const MPlexHV& msPar,
1093                                           const MPlexHV& plNrm,
1094                                           const MPlexHV& plDir,
1095                                           MPlexQF& outChi2,
1096                                           MPlexLV& propPar,
1097                                           MPlexQI& outFailFlag,
1098                                           const int N_proc,
1099                                           const PropagationFlags& propFlags,
1100                                           const bool propToHit) {
1101     propPar = psPar;
1102     if (propToHit) {
1103       MPlexLS propErr;
1104       propagateHelixToPlaneMPlex(psErr, psPar, inChg, msPar, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
1105 
1106       kalmanOperationPlane(
1107           KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
1108     } else {
1109       kalmanOperationPlane(
1110           KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
1111     }
1112   }
1113 
1114   //------------------------------------------------------------------------------
1115 
1116   void kalmanOperationPlane(const int kfOp,
1117                             const MPlexLS& psErr,
1118                             const MPlexLV& psPar,
1119                             const MPlexHS& msErr,
1120                             const MPlexHV& msPar,
1121                             const MPlexHV& plNrm,
1122                             const MPlexHV& plDir,
1123                             MPlexLS& outErr,
1124                             MPlexLV& outPar,
1125                             MPlexQF& outChi2,
1126                             const int N_proc) {
1127 #ifdef DEBUG
1128     {
1129       dmutex_guard;
1130       printf("psPar:\n");
1131       for (int i = 0; i < 6; ++i) {
1132         printf("%8f ", psPar.constAt(0, 0, i));
1133         printf("\n");
1134       }
1135       printf("\n");
1136       printf("psErr:\n");
1137       for (int i = 0; i < 6; ++i) {
1138         for (int j = 0; j < 6; ++j)
1139           printf("%8f ", psErr.constAt(0, i, j));
1140         printf("\n");
1141       }
1142       printf("\n");
1143       printf("msPar:\n");
1144       for (int i = 0; i < 3; ++i) {
1145         printf("%8f ", msPar.constAt(0, 0, i));
1146         printf("\n");
1147       }
1148       printf("\n");
1149       printf("msErr:\n");
1150       for (int i = 0; i < 3; ++i) {
1151         for (int j = 0; j < 3; ++j)
1152           printf("%8f ", msErr.constAt(0, i, j));
1153         printf("\n");
1154       }
1155       printf("\n");
1156     }
1157 #endif
1158 
1159     // Rotate global coordinates into the local frame of the measurement plane,
1160     // spanned by the strip direction D and the in-plane axis X = N x D.
1161 
1162     // Rotation matrix
1163     //    D0  D1   D2
1164     //    X0  X1   X2
1165     //    N0  N1   N2
1166     // where D is the strip direction vector plDir, N is the normal plNrm, and X is the cross product between the two
1167 
1168     MPlex2H prj;
1169     for (int n = 0; n < NN; ++n) {
1170       prj(n, 0, 0) = plDir(n, 0, 0);
1171       prj(n, 0, 1) = plDir(n, 1, 0);
1172       prj(n, 0, 2) = plDir(n, 2, 0);
1173       prj(n, 1, 0) = plNrm(n, 1, 0) * plDir(n, 2, 0) - plNrm(n, 2, 0) * plDir(n, 1, 0);
1174       prj(n, 1, 1) = plNrm(n, 2, 0) * plDir(n, 0, 0) - plNrm(n, 0, 0) * plDir(n, 2, 0);
1175       prj(n, 1, 2) = plNrm(n, 0, 0) * plDir(n, 1, 0) - plNrm(n, 1, 0) * plDir(n, 0, 0);
1176     }
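         // Only the first two rows (D and X = N x D) are needed: the measurement is
         // two-dimensional in the plane, so the component along the normal N is not used.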
1177 
1178     MPlexHV res_glo;  //position residual in global coordinates
1179     SubtractFirst3(msPar, psPar, res_glo);
1180 
1181     MPlexHS resErr_glo;  //covariance sum in global position coordinates
1182     AddIntoUpperLeft3x3(psErr, msErr, resErr_glo);
1183 
1184     MPlex2V res_loc;  //position residual in local coordinates
1185     RotateResidualsOnPlane(prj, res_glo, res_loc);
1186     MPlex2S resErr_loc;  //covariance sum in local position coordinates
1187     MPlex2H temp2H;
1188     ProjectResErr(prj, resErr_glo, temp2H);
1189     ProjectResErrTransp(prj, temp2H, resErr_loc);
1190 
1191 #ifdef DEBUG
1192     {
1193       dmutex_guard;
1194       printf("prj:\n");
1195       for (int i = 0; i < 2; ++i) {
1196         for (int j = 0; j < 3; ++j)
1197           printf("%8f ", prj.At(0, i, j));
1198         printf("\n");
1199       }
1200       printf("\n");
1201       printf("res_glo:\n");
1202       for (int i = 0; i < 3; ++i) {
1203         printf("%8f ", res_glo.At(0, i, 0));
1204       }
1205       printf("\n");
1206       printf("resErr_glo:\n");
1207       for (int i = 0; i < 3; ++i) {
1208         for (int j = 0; j < 3; ++j)
1209           printf("%8f ", resErr_glo.At(0, i, j));
1210         printf("\n");
1211       }
1212       printf("\n");
1213       printf("res_loc:\n");
1214       for (int i = 0; i < 2; ++i) {
1215         printf("%8f ", res_loc.At(0, i, 0));
1216       }
1217       printf("\n");
1218       printf("temp2H:\n");
1219       for (int i = 0; i < 2; ++i) {
1220         for (int j = 0; j < 3; ++j)
1221           printf("%8f ", temp2H.At(0, i, j));
1222         printf("\n");
1223       }
1224       printf("\n");
1225       printf("resErr_loc:\n");
1226       for (int i = 0; i < 2; ++i) {
1227         for (int j = 0; j < 2; ++j)
1228           printf("%8f ", resErr_loc.At(0, i, j));
1229         printf("\n");
1230       }
1231       printf("\n");
1232     }
1233 #endif
1234 
1235     //invert the 2x2 matrix
1236     Matriplex::invertCramerSym(resErr_loc);
1237 
1238     if (kfOp & KFO_Calculate_Chi2) {
1239       Chi2Similarity(res_loc, resErr_loc, outChi2);
1240 
1241 #ifdef DEBUG
1242       {
1243         dmutex_guard;
1244         printf("resErr_loc (Inv):\n");
1245         for (int i = 0; i < 2; ++i) {
1246           for (int j = 0; j < 2; ++j)
1247             printf("%8f ", resErr_loc.At(0, i, j));
1248           printf("\n");
1249         }
1250         printf("\n");
1251         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
1252       }
1253 #endif
1254     }
1255 
1256     if (kfOp & KFO_Update_Params) {
1257       MPlexLS psErrLoc = psErr;
1258 
1259       MPlexH2 tempH2;
1260       MPlexL2 K;                           // kalman gain
1261       KalmanHTG(prj, resErr_loc, tempH2);  // intermediate term to get kalman gain (H^T*G)
1262       KalmanGain(psErrLoc, tempH2, K);
1263 
1264       MultResidualsAdd(K, psPar, res_loc, outPar);
1265 
1266       squashPhiMPlex(outPar, N_proc);  // ensure phi is within [-pi, pi]
1267 
1268       MPlexLL tempLL;
1269       KHMult(K, prj, tempLL);
1270       KHC(tempLL, psErrLoc, outErr);
1271       outErr.subtract(psErrLoc, outErr);
1272 
1273 #ifdef DEBUG
1274       {
1275         dmutex_guard;
1276         if (kfOp & KFO_Local_Cov) {
1277           printf("psErrLoc:\n");
1278           for (int i = 0; i < 6; ++i) {
1279             for (int j = 0; j < 6; ++j)
1280               printf("% 8e ", psErrLoc.At(0, i, j));
1281             printf("\n");
1282           }
1283           printf("\n");
1284         }
1285         printf("resErr_loc (Inv):\n");
1286         for (int i = 0; i < 2; ++i) {
1287           for (int j = 0; j < 2; ++j)
1288             printf("%8f ", resErr_loc.At(0, i, j));
1289           printf("\n");
1290         }
1291         printf("\n");
1292         printf("tempH2:\n");
1293         for (int i = 0; i < 3; ++i) {
1294           for (int j = 0; j < 2; ++j)
1295             printf("%8f ", tempH2.At(0, i, j));
1296           printf("\n");
1297         }
1298         printf("\n");
1299         printf("K:\n");
1300         for (int i = 0; i < 6; ++i) {
1301           for (int j = 0; j < 2; ++j)
1302             printf("%8f ", K.At(0, i, j));
1303           printf("\n");
1304         }
1305         printf("\n");
1306         printf("tempLL:\n");
1307         for (int i = 0; i < 6; ++i) {
1308           for (int j = 0; j < 6; ++j)
1309             printf("%8f ", tempLL.At(0, i, j));
1310           printf("\n");
1311         }
1312         printf("\n");
1313         printf("outPar:\n");
1314         for (int i = 0; i < 6; ++i) {
1315           printf("%8f  ", outPar.At(0, i, 0));
1316         }
1317         printf("\n");
1318         printf("outErr:\n");
1319         for (int i = 0; i < 6; ++i) {
1320           for (int j = 0; j < 6; ++j)
1321             printf("%8f ", outErr.At(0, i, j));
1322           printf("\n");
1323         }
1324         printf("\n");
1325       }
1326 #endif
1327     }
1328   }
1329 
1330   //==============================================================================
1331   // Kalman operations - Endcap
1332   //==============================================================================
1333 
1334   void kalmanUpdateEndcap(const MPlexLS& psErr,
1335                           const MPlexLV& psPar,
1336                           const MPlexHS& msErr,
1337                           const MPlexHV& msPar,
1338                           MPlexLS& outErr,
1339                           MPlexLV& outPar,
1340                           const int N_proc) {
1341     kalmanOperationEndcap(KFO_Update_Params, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
1342   }
1343 
1344   void kalmanPropagateAndUpdateEndcap(const MPlexLS& psErr,
1345                                       const MPlexLV& psPar,
1346                                       MPlexQI& Chg,
1347                                       const MPlexHS& msErr,
1348                                       const MPlexHV& msPar,
1349                                       MPlexLS& outErr,
1350                                       MPlexLV& outPar,
1351                                       MPlexQI& outFailFlag,
1352                                       const int N_proc,
1353                                       const PropagationFlags& propFlags,
1354                                       const bool propToHit) {
1355     if (propToHit) {
1356       MPlexLS propErr;
1357       MPlexLV propPar;
1358       MPlexQF msZ;
1359 #pragma omp simd
1360       for (int n = 0; n < NN; ++n) {
1361         msZ.At(n, 0, 0) = msPar.constAt(n, 2, 0);
1362       }
1363 
1364       propagateHelixToZMPlex(psErr, psPar, Chg, msZ, propErr, propPar, outFailFlag, N_proc, propFlags);
1365 
1366       kalmanOperationEndcap(KFO_Update_Params, propErr, propPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
1367     } else {
1368       kalmanOperationEndcap(KFO_Update_Params, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
1369     }
1370     for (int n = 0; n < NN; ++n) {
1371       if (n < N_proc && outPar.At(n, 3, 0) < 0) {
1372         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
1373         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
1374       }
1375     }
1376   }
1377 
1378   //------------------------------------------------------------------------------
1379 
1380   void kalmanComputeChi2Endcap(const MPlexLS& psErr,
1381                                const MPlexLV& psPar,
1382                                const MPlexQI& inChg,
1383                                const MPlexHS& msErr,
1384                                const MPlexHV& msPar,
1385                                MPlexQF& outChi2,
1386                                const int N_proc) {
1387     kalmanOperationEndcap(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
1388   }
1389 
1390   void kalmanPropagateAndComputeChi2Endcap(const MPlexLS& psErr,
1391                                            const MPlexLV& psPar,
1392                                            const MPlexQI& inChg,
1393                                            const MPlexHS& msErr,
1394                                            const MPlexHV& msPar,
1395                                            MPlexQF& outChi2,
1396                                            MPlexLV& propPar,
1397                                            MPlexQI& outFailFlag,
1398                                            const int N_proc,
1399                                            const PropagationFlags& propFlags,
1400                                            const bool propToHit) {
1401     propPar = psPar;
1402     if (propToHit) {
1403       MPlexLS propErr;
1404       MPlexQF msZ;
1405 #pragma omp simd
1406       for (int n = 0; n < NN; ++n) {
1407         msZ.At(n, 0, 0) = msPar.constAt(n, 2, 0);
1408       }
1409 
1410       propagateHelixToZMPlex(psErr, psPar, inChg, msZ, propErr, propPar, outFailFlag, N_proc, propFlags);
1411 
1412       kalmanOperationEndcap(KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
1413     } else {
1414       kalmanOperationEndcap(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
1415     }
1416   }
1417 
1418   //------------------------------------------------------------------------------
1419 
1420   void kalmanOperationEndcap(const int kfOp,
1421                              const MPlexLS& psErr,
1422                              const MPlexLV& psPar,
1423                              const MPlexHS& msErr,
1424                              const MPlexHV& msPar,
1425                              MPlexLS& outErr,
1426                              MPlexLV& outPar,
1427                              MPlexQF& outChi2,
1428                              const int N_proc) {
1429 #ifdef DEBUG
1430     {
1431       dmutex_guard;
1432       printf("updateParametersEndcapMPlex\n");
1433       printf("psPar:\n");
1434       for (int i = 0; i < 6; ++i) {
1435         printf("%8f ", psPar.constAt(0, 0, i));
1436         printf("\n");
1437       }
1438       printf("\n");
1439       printf("msPar:\n");
1440       for (int i = 0; i < 3; ++i) {
1441         printf("%8f ", msPar.constAt(0, 0, i));
1442         printf("\n");
1443       }
1444       printf("\n");
1445       printf("psErr:\n");
1446       for (int i = 0; i < 6; ++i) {
1447         for (int j = 0; j < 6; ++j)
1448           printf("%8f ", psErr.constAt(0, i, j));
1449         printf("\n");
1450       }
1451       printf("\n");
1452       printf("msErr:\n");
1453       for (int i = 0; i < 3; ++i) {
1454         for (int j = 0; j < 3; ++j)
1455           printf("%8f ", msErr.constAt(0, i, j));
1456         printf("\n");
1457       }
1458       printf("\n");
1459     }
1460 #endif
1461 
1462     MPlex2V res;
1463     SubtractFirst2(msPar, psPar, res);
1464 
1465     MPlex2S resErr;
1466     AddIntoUpperLeft2x2(psErr, msErr, resErr);
1467 
1468 #ifdef DEBUG
1469     {
1470       dmutex_guard;
1471       printf("resErr:\n");
1472       for (int i = 0; i < 2; ++i) {
1473         for (int j = 0; j < 2; ++j)
1474           printf("%8f ", resErr.At(0, i, j));
1475         printf("\n");
1476       }
1477       printf("\n");
1478     }
1479 #endif
1480 
1481     //invert the 2x2 matrix
1482     Matriplex::invertCramerSym(resErr);
1483 
1484     if (kfOp & KFO_Calculate_Chi2) {
1485       Chi2Similarity(res, resErr, outChi2);
1486 
1487 #ifdef DEBUG
1488       {
1489         dmutex_guard;
1490         printf("resErr_loc (Inv):\n");
1491         for (int i = 0; i < 2; ++i) {
1492           for (int j = 0; j < 2; ++j)
1493             printf("%8f ", resErr.At(0, i, j));
1494           printf("\n");
1495         }
1496         printf("\n");
1497         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
1498       }
1499 #endif
1500     }
1501 
1502     if (kfOp & KFO_Update_Params) {
1503       MPlexL2 K;
1504       KalmanGain(psErr, resErr, K);
1505 
1506       MultResidualsAdd(K, psPar, res, outPar);
1507 
1508       squashPhiMPlex(outPar, N_proc);  // ensure phi is within [-pi, pi]
1509 
1510       KHC(K, psErr, outErr);
1511 
1512 #ifdef DEBUG
1513       {
1514         dmutex_guard;
1515         printf("outErr before subtract:\n");
1516         for (int i = 0; i < 6; ++i) {
1517           for (int j = 0; j < 6; ++j)
1518             printf("%8f ", outErr.At(0, i, j));
1519           printf("\n");
1520         }
1521         printf("\n");
1522       }
1523 #endif
1524 
1525       outErr.subtract(psErr, outErr);
1526 
1527 #ifdef DEBUG
1528       {
1529         dmutex_guard;
1530         printf("res:\n");
1531         for (int i = 0; i < 2; ++i) {
1532           printf("%8f ", res.At(0, i, 0));
1533         }
1534         printf("\n");
1535         printf("resErr (Inv):\n");
1536         for (int i = 0; i < 2; ++i) {
1537           for (int j = 0; j < 2; ++j)
1538             printf("%8f ", resErr.At(0, i, j));
1539           printf("\n");
1540         }
1541         printf("\n");
1542         printf("K:\n");
1543         for (int i = 0; i < 6; ++i) {
1544           for (int j = 0; j < 2; ++j)
1545             printf("%8f ", K.At(0, i, j));
1546           printf("\n");
1547         }
1548         printf("\n");
1549         printf("outPar:\n");
1550         for (int i = 0; i < 6; ++i) {
1551           printf("%8f  ", outPar.At(0, i, 0));
1552         }
1553         printf("\n");
1554         printf("outErr:\n");
1555         for (int i = 0; i < 6; ++i) {
1556           for (int j = 0; j < 6; ++j)
1557             printf("%8f ", outErr.At(0, i, j));
1558           printf("\n");
1559         }
1560         printf("\n");
1561       }
1562 #endif
1563     }
1564   }
1565 
1566 }  // end namespace mkfit