Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2023-11-19 23:58:42

0001 #include "KalmanUtilsMPlex.h"
0002 #include "PropagationMPlex.h"
0003 
0004 //#define DEBUG
0005 #include "Debug.h"
0006 
0007 #include "KalmanUtilsMPlex.icc"
0008 
0009 #include "RecoTracker/MkFitCore/interface/cms_common_macros.h"
0010 
0011 namespace {
0012   using namespace mkfit;
0013   using idx_t = Matriplex::idx_t;
0014 
0015   inline void MultResidualsAdd(const MPlexLH& A, const MPlexLV& B, const MPlex2V& C, MPlexLV& D) {
0016     // outPar = psPar + kalmanGain*(dPar)
0017     //   D    =   B         A         C
0018     // where right half of kalman gain is 0
0019 
0020     // XXX Regenerate with a script.
0021 
0022     MultResidualsAdd_imp(A, B, C, D, 0, NN);
0023   }
0024 
0025   inline void MultResidualsAdd(const MPlexL2& A, const MPlexLV& B, const MPlex2V& C, MPlexLV& D) {
0026     // outPar = psPar + kalmanGain*(dPar)
0027     //   D    =   B         A         C
0028     // where right half of kalman gain is 0
0029 
0030     // XXX Regenerate with a script.
0031 
0032     typedef float T;
0033     const idx_t N = NN;
0034 
0035     const T* a = A.fArray;
0036     ASSUME_ALIGNED(a, 64);
0037     const T* b = B.fArray;
0038     ASSUME_ALIGNED(b, 64);
0039     const T* c = C.fArray;
0040     ASSUME_ALIGNED(c, 64);
0041     T* d = D.fArray;
0042     ASSUME_ALIGNED(d, 64);
0043 
0044 #pragma omp simd
0045     for (idx_t n = 0; n < N; ++n) {
0046       // generate loop (can also write it manually this time, it's not much)
0047       d[0 * N + n] = b[0 * N + n] + a[0 * N + n] * c[0 * N + n] + a[1 * N + n] * c[1 * N + n];
0048       d[1 * N + n] = b[1 * N + n] + a[2 * N + n] * c[0 * N + n] + a[3 * N + n] * c[1 * N + n];
0049       d[2 * N + n] = b[2 * N + n] + a[4 * N + n] * c[0 * N + n] + a[5 * N + n] * c[1 * N + n];
0050       d[3 * N + n] = b[3 * N + n] + a[6 * N + n] * c[0 * N + n] + a[7 * N + n] * c[1 * N + n];
0051       d[4 * N + n] = b[4 * N + n] + a[8 * N + n] * c[0 * N + n] + a[9 * N + n] * c[1 * N + n];
0052       d[5 * N + n] = b[5 * N + n] + a[10 * N + n] * c[0 * N + n] + a[11 * N + n] * c[1 * N + n];
0053     }
0054   }
0055 
0056   //------------------------------------------------------------------------------
0057 
0058   inline void Chi2Similarity(const MPlex2V& A,  //resPar
0059                              const MPlex2S& C,  //resErr
0060                              MPlexQF& D)        //outChi2
0061   {
0062     // outChi2 = (resPar) * resErr * (resPar)
0063     //   D     =    A      *    C   *      A
0064 
0065     typedef float T;
0066     const idx_t N = NN;
0067 
0068     const T* a = A.fArray;
0069     ASSUME_ALIGNED(a, 64);
0070     const T* c = C.fArray;
0071     ASSUME_ALIGNED(c, 64);
0072     T* d = D.fArray;
0073     ASSUME_ALIGNED(d, 64);
0074 
0075 #pragma omp simd
0076     for (idx_t n = 0; n < N; ++n) {
0077       // generate loop (can also write it manually this time, it's not much)
0078       d[0 * N + n] = c[0 * N + n] * a[0 * N + n] * a[0 * N + n] + c[2 * N + n] * a[1 * N + n] * a[1 * N + n] +
0079                      2 * (c[1 * N + n] * a[1 * N + n] * a[0 * N + n]);
0080     }
0081   }
0082 
0083   //------------------------------------------------------------------------------
0084 
0085   inline void AddIntoUpperLeft3x3(const MPlexLS& A, const MPlexHS& B, MPlexHS& C) {
0086     // The rest of matrix is left untouched.
0087 
0088     typedef float T;
0089     const idx_t N = NN;
0090 
0091     const T* a = A.fArray;
0092     ASSUME_ALIGNED(a, 64);
0093     const T* b = B.fArray;
0094     ASSUME_ALIGNED(b, 64);
0095     T* c = C.fArray;
0096     ASSUME_ALIGNED(c, 64);
0097 
0098 #pragma omp simd
0099     for (idx_t n = 0; n < N; ++n) {
0100       c[0 * N + n] = a[0 * N + n] + b[0 * N + n];
0101       c[1 * N + n] = a[1 * N + n] + b[1 * N + n];
0102       c[2 * N + n] = a[2 * N + n] + b[2 * N + n];
0103       c[3 * N + n] = a[3 * N + n] + b[3 * N + n];
0104       c[4 * N + n] = a[4 * N + n] + b[4 * N + n];
0105       c[5 * N + n] = a[5 * N + n] + b[5 * N + n];
0106     }
0107   }
0108 
0109   inline void AddIntoUpperLeft2x2(const MPlexLS& A, const MPlexHS& B, MPlex2S& C) {
0110     // The rest of matrix is left untouched.
0111 
0112     typedef float T;
0113     const idx_t N = NN;
0114 
0115     const T* a = A.fArray;
0116     ASSUME_ALIGNED(a, 64);
0117     const T* b = B.fArray;
0118     ASSUME_ALIGNED(b, 64);
0119     T* c = C.fArray;
0120     ASSUME_ALIGNED(c, 64);
0121 
0122 #pragma omp simd
0123     for (idx_t n = 0; n < N; ++n) {
0124       c[0 * N + n] = a[0 * N + n] + b[0 * N + n];
0125       c[1 * N + n] = a[1 * N + n] + b[1 * N + n];
0126       c[2 * N + n] = a[2 * N + n] + b[2 * N + n];
0127     }
0128   }
0129 
0130   //------------------------------------------------------------------------------
0131 
0132   inline void SubtractFirst3(const MPlexHV& A, const MPlexLV& B, MPlexHV& C) {
0133     // The rest of matrix is left untouched.
0134 
0135     typedef float T;
0136     const idx_t N = NN;
0137 
0138     const T* a = A.fArray;
0139     ASSUME_ALIGNED(a, 64);
0140     const T* b = B.fArray;
0141     ASSUME_ALIGNED(b, 64);
0142     T* c = C.fArray;
0143     ASSUME_ALIGNED(c, 64);
0144 
0145 #pragma omp simd
0146     for (idx_t n = 0; n < N; ++n) {
0147       c[0 * N + n] = a[0 * N + n] - b[0 * N + n];
0148       c[1 * N + n] = a[1 * N + n] - b[1 * N + n];
0149       c[2 * N + n] = a[2 * N + n] - b[2 * N + n];
0150     }
0151   }
0152 
0153   inline void SubtractFirst2(const MPlexHV& A, const MPlexLV& B, MPlex2V& C) {
0154     // The rest of matrix is left untouched.
0155 
0156     typedef float T;
0157     const idx_t N = NN;
0158 
0159     const T* a = A.fArray;
0160     ASSUME_ALIGNED(a, 64);
0161     const T* b = B.fArray;
0162     ASSUME_ALIGNED(b, 64);
0163     T* c = C.fArray;
0164     ASSUME_ALIGNED(c, 64);
0165 
0166 #pragma omp simd
0167     for (idx_t n = 0; n < N; ++n) {
0168       c[0 * N + n] = a[0 * N + n] - b[0 * N + n];
0169       c[1 * N + n] = a[1 * N + n] - b[1 * N + n];
0170     }
0171   }
0172 
0173   //==============================================================================
0174 
0175   inline void ProjectResErr(const MPlexQF& A00, const MPlexQF& A01, const MPlexHS& B, MPlexHH& C) {
0176     // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym
0177 
0178     // Based on script generation and adapted to custom sizes.
0179 
0180     typedef float T;
0181     const idx_t N = NN;
0182 
0183     const T* a00 = A00.fArray;
0184     ASSUME_ALIGNED(a00, 64);
0185     const T* a01 = A01.fArray;
0186     ASSUME_ALIGNED(a01, 64);
0187     const T* b = B.fArray;
0188     ASSUME_ALIGNED(b, 64);
0189     T* c = C.fArray;
0190     ASSUME_ALIGNED(c, 64);
0191 
0192 #pragma omp simd
0193     for (int n = 0; n < N; ++n) {
0194       c[0 * N + n] = a00[n] * b[0 * N + n] + a01[n] * b[1 * N + n];
0195       c[1 * N + n] = a00[n] * b[1 * N + n] + a01[n] * b[2 * N + n];
0196       c[2 * N + n] = a00[n] * b[3 * N + n] + a01[n] * b[4 * N + n];
0197       c[3 * N + n] = b[3 * N + n];
0198       c[4 * N + n] = b[4 * N + n];
0199       c[5 * N + n] = b[5 * N + n];
0200       c[6 * N + n] = a01[n] * b[0 * N + n] - a00[n] * b[1 * N + n];
0201       c[7 * N + n] = a01[n] * b[1 * N + n] - a00[n] * b[2 * N + n];
0202       c[8 * N + n] = a01[n] * b[3 * N + n] - a00[n] * b[4 * N + n];
0203     }
0204   }
0205 
0206   inline void ProjectResErrTransp(const MPlexQF& A00, const MPlexQF& A01, const MPlexHH& B, MPlex2S& C) {
0207     // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3
0208 
0209     // Based on script generation and adapted to custom sizes.
0210 
0211     typedef float T;
0212     const idx_t N = NN;
0213 
0214     const T* a00 = A00.fArray;
0215     ASSUME_ALIGNED(a00, 64);
0216     const T* a01 = A01.fArray;
0217     ASSUME_ALIGNED(a01, 64);
0218     const T* b = B.fArray;
0219     ASSUME_ALIGNED(b, 64);
0220     T* c = C.fArray;
0221     ASSUME_ALIGNED(c, 64);
0222 
0223 #pragma omp simd
0224     for (int n = 0; n < N; ++n) {
0225       c[0 * N + n] = b[0 * N + n] * a00[n] + b[1 * N + n] * a01[n];
0226       c[1 * N + n] = b[3 * N + n] * a00[n] + b[4 * N + n] * a01[n];
0227       c[2 * N + n] = b[5 * N + n];
0228     }
0229   }
0230 
0231   inline void RotateResidualsOnTangentPlane(const MPlexQF& R00,  //r00
0232                                             const MPlexQF& R01,  //r01
0233                                             const MPlexHV& A,    //res_glo
0234                                             MPlex2V& B)          //res_loc
0235   {
0236     RotateResidualsOnTangentPlane_impl(R00, R01, A, B, 0, NN);
0237   }
0238 
0239   //==============================================================================
0240 
0241   inline void ProjectResErr(const MPlex2H& A, const MPlexHS& B, MPlex2H& C) {
0242     // C = A * B, C is 2x3, A is 2x3 , B is 3x3 sym
0243 
0244     /*
0245     A 0 1 2
0246       3 4 5
0247     B 0 1 3
0248       1 2 4
0249       3 4 5
0250     */
0251 
0252     typedef float T;
0253     const idx_t N = NN;
0254 
0255     const T* a = A.fArray;
0256     ASSUME_ALIGNED(a, 64);
0257     const T* b = B.fArray;
0258     ASSUME_ALIGNED(b, 64);
0259     T* c = C.fArray;
0260     ASSUME_ALIGNED(c, 64);
0261 
0262 #pragma omp simd
0263     for (int n = 0; n < N; ++n) {
0264       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n];
0265       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[2 * N + n] + a[2 * N + n] * b[4 * N + n];
0266       c[2 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[5 * N + n];
0267       c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n] + a[5 * N + n] * b[3 * N + n];
0268       c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
0269       c[5 * N + n] = a[3 * N + n] * b[3 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[5 * N + n];
0270     }
0271   }
0272 
0273   inline void ProjectResErrTransp(const MPlex2H& A, const MPlex2H& B, MPlex2S& C) {
0274     // C = B * A^T, C is 2x2 sym, A is 2x3 (A^T is 3x2), B is 2x3
0275 
0276     /*
0277     B   0 1 2
0278         3 4 5
0279     A^T 0 3
0280         1 4
0281         2 5
0282     */
0283 
0284     typedef float T;
0285     const idx_t N = NN;
0286 
0287     const T* a = A.fArray;
0288     ASSUME_ALIGNED(a, 64);
0289     const T* b = B.fArray;
0290     ASSUME_ALIGNED(b, 64);
0291     T* c = C.fArray;
0292     ASSUME_ALIGNED(c, 64);
0293 
0294 #pragma omp simd
0295     for (int n = 0; n < N; ++n) {
0296       c[0 * N + n] = b[0 * N + n] * a[0 * N + n] + b[1 * N + n] * a[1 * N + n] + b[2 * N + n] * a[2 * N + n];
0297       c[1 * N + n] = b[0 * N + n] * a[3 * N + n] + b[1 * N + n] * a[4 * N + n] + b[2 * N + n] * a[5 * N + n];
0298       c[2 * N + n] = b[3 * N + n] * a[3 * N + n] + b[4 * N + n] * a[4 * N + n] + b[5 * N + n] * a[5 * N + n];
0299     }
0300   }
0301 
0302   inline void RotateResidualsOnPlane(const MPlex2H& R,  //prj
0303                                      const MPlexHV& A,  //res_glo
0304                                      MPlex2V& B)        //res_loc
0305   {
0306     // typedef float T;
0307     // const idx_t N = NN;
0308 
0309     // const T* a = A.fArray;
0310     // ASSUME_ALIGNED(a, 64);
0311     // T* b = B.fArray;
0312     // ASSUME_ALIGNED(b, 64);
0313     // const T* r = R.fArray;
0314     // ASSUME_ALIGNED(r, 64);
0315 
0316 #pragma omp simd
0317     for (int n = 0; n < NN; ++n) {
0318       B(n, 0, 0) = R(n, 0, 0) * A(n, 0, 0) + R(n, 0, 1) * A(n, 1, 0) + R(n, 0, 2) * A(n, 2, 0);
0319       B(n, 1, 0) = R(n, 1, 0) * A(n, 0, 0) + R(n, 1, 1) * A(n, 1, 0) + R(n, 1, 2) * A(n, 2, 0);
0320     }
0321   }
0322 
0323   inline void KalmanHTG(const MPlexQF& A00, const MPlexQF& A01, const MPlex2S& B, MPlexHH& C) {
0324     // HTG  = rot * res_loc
0325     //   C  =  A  *    B
0326 
0327     // Based on script generation and adapted to custom sizes.
0328 
0329     typedef float T;
0330     const idx_t N = NN;
0331 
0332     const T* a00 = A00.fArray;
0333     ASSUME_ALIGNED(a00, 64);
0334     const T* a01 = A01.fArray;
0335     ASSUME_ALIGNED(a01, 64);
0336     const T* b = B.fArray;
0337     ASSUME_ALIGNED(b, 64);
0338     T* c = C.fArray;
0339     ASSUME_ALIGNED(c, 64);
0340 
0341 #pragma omp simd
0342     for (int n = 0; n < N; ++n) {
0343       c[0 * N + n] = a00[n] * b[0 * N + n];
0344       c[1 * N + n] = a00[n] * b[1 * N + n];
0345       c[2 * N + n] = 0.;
0346       c[3 * N + n] = a01[n] * b[0 * N + n];
0347       c[4 * N + n] = a01[n] * b[1 * N + n];
0348       c[5 * N + n] = 0.;
0349       c[6 * N + n] = b[1 * N + n];
0350       c[7 * N + n] = b[2 * N + n];
0351       c[8 * N + n] = 0.;
0352     }
0353   }
0354 
0355   inline void KalmanGain(const MPlexLS& A, const MPlexHH& B, MPlexLH& C) {
0356     // C = A * B, C is 6x3, A is 6x6 sym , B is 3x3
0357 
0358     typedef float T;
0359     const idx_t N = NN;
0360 
0361     const T* a = A.fArray;
0362     ASSUME_ALIGNED(a, 64);
0363     const T* b = B.fArray;
0364     ASSUME_ALIGNED(b, 64);
0365     T* c = C.fArray;
0366     ASSUME_ALIGNED(c, 64);
0367 
0368 #pragma omp simd
0369     for (int n = 0; n < N; ++n) {
0370       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[3 * N + n] + a[3 * N + n] * b[6 * N + n];
0371       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[4 * N + n] + a[3 * N + n] * b[7 * N + n];
0372       c[2 * N + n] = 0;
0373       c[3 * N + n] = a[1 * N + n] * b[0 * N + n] + a[2 * N + n] * b[3 * N + n] + a[4 * N + n] * b[6 * N + n];
0374       c[4 * N + n] = a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[4 * N + n] + a[4 * N + n] * b[7 * N + n];
0375       c[5 * N + n] = 0;
0376       c[6 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[6 * N + n];
0377       c[7 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[7 * N + n];
0378       c[8 * N + n] = 0;
0379       c[9 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[6 * N + n];
0380       c[10 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[4 * N + n] + a[8 * N + n] * b[7 * N + n];
0381       c[11 * N + n] = 0;
0382       c[12 * N + n] = a[10 * N + n] * b[0 * N + n] + a[11 * N + n] * b[3 * N + n] + a[12 * N + n] * b[6 * N + n];
0383       c[13 * N + n] = a[10 * N + n] * b[1 * N + n] + a[11 * N + n] * b[4 * N + n] + a[12 * N + n] * b[7 * N + n];
0384       c[14 * N + n] = 0;
0385       c[15 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[3 * N + n] + a[17 * N + n] * b[6 * N + n];
0386       c[16 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[4 * N + n] + a[17 * N + n] * b[7 * N + n];
0387       c[17 * N + n] = 0;
0388     }
0389   }
0390 
0391   inline void KalmanHTG(const MPlex2H& A, const MPlex2S& B, MPlexH2& C) {
0392     // HTG  = prj^T * res_loc
0393     //   C  =  A^T  *   B
0394 
0395     /*
0396     A^T 0 3
0397         1 4
0398         2 5
0399     B 0 1
0400       1 2
0401     C 0 1
0402       2 3
0403       4 5
0404     */
0405 
0406     typedef float T;
0407     const idx_t N = NN;
0408 
0409     const T* a = A.fArray;
0410     ASSUME_ALIGNED(a, 64);
0411     const T* b = B.fArray;
0412     ASSUME_ALIGNED(b, 64);
0413     T* c = C.fArray;
0414     ASSUME_ALIGNED(c, 64);
0415 
0416 #pragma omp simd
0417     for (int n = 0; n < N; ++n) {
0418       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[3 * N + n] * b[1 * N + n];
0419       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[3 * N + n] * b[2 * N + n];
0420       c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n];
0421       c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n];
0422       c[4 * N + n] = a[2 * N + n] * b[0 * N + n] + a[5 * N + n] * b[1 * N + n];
0423       c[5 * N + n] = a[2 * N + n] * b[1 * N + n] + a[5 * N + n] * b[2 * N + n];
0424     }
0425   }
0426 
0427   inline void KalmanGain(const MPlexLS& A, const MPlexH2& B, MPlexL2& C) {
0428     // C = A * B, C is 6x2, A is 6x6 sym , B is 3x2 (6x2 but half of it is zeros)
0429 
0430     /*
0431       A 0  1  3  6 10 15
0432         1  2  4  7 11 16
0433         3  4  5  8 12 17
0434         6  7  8  9 13 18
0435        10 11 12 13 14 19
0436        15 16 17 18 19 20
0437       B 0  1
0438         2  3
0439     4  5
0440         X  X with X=0, so not even included in B
0441         X  X
0442         X  X
0443       C 0  1
0444         2  3
0445     4  5
0446         6  7
0447         8  9
0448        10 11
0449      */
0450 
0451     typedef float T;
0452     const idx_t N = NN;
0453 
0454     const T* a = A.fArray;
0455     ASSUME_ALIGNED(a, 64);
0456     const T* b = B.fArray;
0457     ASSUME_ALIGNED(b, 64);
0458     T* c = C.fArray;
0459     ASSUME_ALIGNED(c, 64);
0460 
0461 #pragma omp simd
0462     for (int n = 0; n < N; ++n) {
0463       c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[2 * N + n] + a[3 * N + n] * b[4 * N + n];
0464       c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[3 * N + n] + a[3 * N + n] * b[5 * N + n];
0465       c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[2 * N + n] * b[2 * N + n] + a[4 * N + n] * b[4 * N + n];
0466       c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n] + a[4 * N + n] * b[5 * N + n];
0467       c[4 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
0468       c[5 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[5 * N + n];
0469       c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[2 * N + n] + a[8 * N + n] * b[4 * N + n];
0470       c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[5 * N + n];
0471       c[8 * N + n] = a[10 * N + n] * b[0 * N + n] + a[11 * N + n] * b[2 * N + n] + a[12 * N + n] * b[4 * N + n];
0472       c[9 * N + n] = a[10 * N + n] * b[1 * N + n] + a[11 * N + n] * b[3 * N + n] + a[12 * N + n] * b[5 * N + n];
0473       c[10 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[2 * N + n] + a[17 * N + n] * b[4 * N + n];
0474       c[11 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[3 * N + n] + a[17 * N + n] * b[5 * N + n];
0475     }
0476   }
0477 
0478   inline void CovXYconstrain(const MPlexQF& R00, const MPlexQF& R01, const MPlexLS& Ci, MPlexLS& Co) {
0479     // C is transformed to align along y after rotation and rotated back
0480 
0481     typedef float T;
0482     const idx_t N = NN;
0483 
0484     const T* r00 = R00.fArray;
0485     ASSUME_ALIGNED(r00, 64);
0486     const T* r01 = R01.fArray;
0487     ASSUME_ALIGNED(r01, 64);
0488     const T* ci = Ci.fArray;
0489     ASSUME_ALIGNED(ci, 64);
0490     T* co = Co.fArray;
0491     ASSUME_ALIGNED(co, 64);
0492 
0493 #pragma omp simd
0494     for (int n = 0; n < N; ++n) {
0495       // a bit loopy to avoid temporaries
0496       co[0 * N + n] =
0497           r00[n] * r00[n] * ci[0 * N + n] + 2 * r00[n] * r01[n] * ci[1 * N + n] + r01[n] * r01[n] * ci[2 * N + n];
0498       co[1 * N + n] = r00[n] * r01[n] * co[0 * N + n];
0499       co[2 * N + n] = r01[n] * r01[n] * co[0 * N + n];
0500       co[0 * N + n] = r00[n] * r00[n] * co[0 * N + n];
0501 
0502       co[3 * N + n] = r00[n] * ci[3 * N + n] + r01[n] * ci[4 * N + n];
0503       co[4 * N + n] = r01[n] * co[3 * N + n];
0504       co[3 * N + n] = r00[n] * co[3 * N + n];
0505 
0506       co[6 * N + n] = r00[n] * ci[6 * N + n] + r01[n] * ci[7 * N + n];
0507       co[7 * N + n] = r01[n] * co[6 * N + n];
0508       co[6 * N + n] = r00[n] * co[6 * N + n];
0509 
0510       co[10 * N + n] = r00[n] * ci[10 * N + n] + r01[n] * ci[11 * N + n];
0511       co[11 * N + n] = r01[n] * co[10 * N + n];
0512       co[10 * N + n] = r00[n] * co[10 * N + n];
0513 
0514       co[15 * N + n] = r00[n] * ci[15 * N + n] + r01[n] * ci[16 * N + n];
0515       co[16 * N + n] = r01[n] * co[15 * N + n];
0516       co[15 * N + n] = r00[n] * co[15 * N + n];
0517     }
0518   }
0519 
0520   void KalmanGain(const MPlexLS& A, const MPlex2S& B, MPlexL2& C) {
0521     // C = A * B, C is 6x2, A is 6x6 sym , B is 2x2
0522 
0523     typedef float T;
0524     const idx_t N = NN;
0525 
0526     const T* a = A.fArray;
0527     ASSUME_ALIGNED(a, 64);
0528     const T* b = B.fArray;
0529     ASSUME_ALIGNED(b, 64);
0530     T* c = C.fArray;
0531     ASSUME_ALIGNED(c, 64);
0532 
0533 #include "KalmanGain62.ah"
0534   }
0535 
0536   inline void KHMult(const MPlexLH& A, const MPlexQF& B00, const MPlexQF& B01, MPlexLL& C) {
0537     // C = A * B, C is 6x6, A is 6x3 , B is 3x6
0538     KHMult_imp(A, B00, B01, C, 0, NN);
0539   }
0540 
0541   inline void KHMult(const MPlexL2& A, const MPlex2H& B, MPlexLL& C) {
0542     // C = A * B, C is 6x6, A is 6x2 , B is 2x3 (2x6 but half of it made of zeros)
0543 
0544     /*
0545     A 0  1
0546       2  3
0547       4  5
0548       6  7
0549       8  9
0550      10 11
0551     B  0  1  2  X  X  X with X=0 so not included in B
0552        3  4  5  X  X  X
0553     C  0  1  2  3  4  5
0554        6  7  8  9 10 11
0555       12 13 14 15 16 17
0556       18 19 20 21 22 23
0557       24 25 26 27 28 29
0558       30 31 32 33 34 34
0559     */
0560 
0561     // typedef float T;
0562     // const idx_t N = NN;
0563 
0564     // const T* a = A.fArray;
0565     // ASSUME_ALIGNED(a, 64);
0566     // const T* b = B.fArray;
0567     // ASSUME_ALIGNED(b, 64);
0568     // T* c = C.fArray;
0569     // ASSUME_ALIGNED(c, 64);
0570 
0571 #pragma omp simd
0572     for (int n = 0; n < NN; ++n) {
0573       C(n, 0, 0) = A(n, 0, 0) * B(n, 0, 0) + A(n, 0, 1) * B(n, 1, 0);
0574       C(n, 0, 1) = A(n, 0, 0) * B(n, 0, 1) + A(n, 0, 1) * B(n, 1, 1);
0575       C(n, 0, 2) = A(n, 0, 0) * B(n, 0, 2) + A(n, 0, 1) * B(n, 1, 2);
0576       C(n, 0, 3) = 0;
0577       C(n, 0, 4) = 0;
0578       C(n, 0, 5) = 0;
0579       C(n, 0, 6) = A(n, 1, 0) * B(n, 0, 0) + A(n, 1, 1) * B(n, 1, 0);
0580       C(n, 0, 7) = A(n, 1, 0) * B(n, 0, 1) + A(n, 1, 1) * B(n, 1, 1);
0581       C(n, 0, 8) = A(n, 1, 0) * B(n, 0, 2) + A(n, 1, 1) * B(n, 1, 2);
0582       C(n, 0, 9) = 0;
0583       C(n, 0, 10) = 0;
0584       C(n, 0, 11) = 0;
0585       C(n, 0, 12) = A(n, 2, 0) * B(n, 0, 0) + A(n, 2, 1) * B(n, 1, 0);
0586       C(n, 0, 13) = A(n, 2, 0) * B(n, 0, 1) + A(n, 2, 1) * B(n, 1, 1);
0587       C(n, 0, 14) = A(n, 2, 0) * B(n, 0, 2) + A(n, 2, 1) * B(n, 1, 2);
0588       C(n, 0, 15) = 0;
0589       C(n, 0, 16) = 0;
0590       C(n, 0, 17) = 0;
0591       C(n, 0, 18) = A(n, 3, 0) * B(n, 0, 0) + A(n, 3, 1) * B(n, 1, 0);
0592       C(n, 0, 19) = A(n, 3, 0) * B(n, 0, 1) + A(n, 3, 1) * B(n, 1, 1);
0593       C(n, 0, 20) = A(n, 3, 0) * B(n, 0, 2) + A(n, 3, 1) * B(n, 1, 2);
0594       C(n, 0, 21) = 0;
0595       C(n, 0, 22) = 0;
0596       C(n, 0, 23) = 0;
0597       C(n, 0, 24) = A(n, 4, 0) * B(n, 0, 0) + A(n, 4, 1) * B(n, 1, 0);
0598       C(n, 0, 25) = A(n, 4, 0) * B(n, 0, 1) + A(n, 4, 1) * B(n, 1, 1);
0599       C(n, 0, 26) = A(n, 4, 0) * B(n, 0, 2) + A(n, 4, 1) * B(n, 1, 2);
0600       C(n, 0, 27) = 0;
0601       C(n, 0, 28) = 0;
0602       C(n, 0, 29) = 0;
0603       C(n, 0, 30) = A(n, 5, 0) * B(n, 0, 0) + A(n, 5, 1) * B(n, 1, 0);
0604       C(n, 0, 31) = A(n, 5, 0) * B(n, 0, 1) + A(n, 5, 1) * B(n, 1, 1);
0605       C(n, 0, 32) = A(n, 5, 0) * B(n, 0, 2) + A(n, 5, 1) * B(n, 1, 2);
0606       C(n, 0, 33) = 0;
0607       C(n, 0, 34) = 0;
0608       C(n, 0, 35) = 0;
0609     }
0610   }
0611 
0612   inline void KHC(const MPlexLL& A, const MPlexLS& B, MPlexLS& C) {
0613     // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym
0614 
0615     typedef float T;
0616     const idx_t N = NN;
0617 
0618     const T* a = A.fArray;
0619     ASSUME_ALIGNED(a, 64);
0620     const T* b = B.fArray;
0621     ASSUME_ALIGNED(b, 64);
0622     T* c = C.fArray;
0623     ASSUME_ALIGNED(c, 64);
0624 
0625 #include "KHC.ah"
0626   }
0627 
0628   inline void KHC(const MPlexL2& A, const MPlexLS& B, MPlexLS& C) {
0629     // C = A * B, C is 6x6 sym, A is 6x2 , B is 6x6 sym
0630 
0631     typedef float T;
0632     const idx_t N = NN;
0633 
0634     const T* a = A.fArray;
0635     ASSUME_ALIGNED(a, 64);
0636     const T* b = B.fArray;
0637     ASSUME_ALIGNED(b, 64);
0638     T* c = C.fArray;
0639     ASSUME_ALIGNED(c, 64);
0640 
0641 #include "K62HC.ah"
0642   }
0643 
0644   //Warning: MultFull is not vectorized!
0645   template <typename T1, typename T2, typename T3>
0646   void MultFull(const T1& A, int nia, int nja, const T2& B, int nib, int njb, T3& C, int nic, int njc) {
0647 #ifdef DEBUG
0648     assert(nja == nib);
0649     assert(nia == nic);
0650     assert(njb == njc);
0651 #endif
0652     for (int n = 0; n < NN; ++n) {
0653       for (int i = 0; i < nia; ++i) {
0654         for (int j = 0; j < njb; ++j) {
0655           C(n, i, j) = 0.;
0656           for (int k = 0; k < nja; ++k)
0657             C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, k, j);
0658         }
0659       }
0660     }
0661   }
0662 
0663   //Warning: MultTranspFull is not vectorized!
0664   // (careful about which one is transposed, I think rows and cols are swapped and the one that is transposed is A)
0665   template <typename T1, typename T2, typename T3>
0666   void MultTranspFull(const T1& A, int nia, int nja, const T2& B, int nib, int njb, T3& C, int nic, int njc) {
0667 #ifdef DEBUG
0668     assert(nja == njb);
0669     assert(nia == nic);
0670     assert(nib == njc);
0671 #endif
0672     for (int n = 0; n < NN; ++n) {
0673       for (int i = 0; i < nia; ++i) {
0674         for (int j = 0; j < nib; ++j) {
0675           C(n, i, j) = 0.;
0676           for (int k = 0; k < nja; ++k)
0677             C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, j, k);
0678         }
0679       }
0680     }
0681   }
0682 
0683 }  // namespace
0684 
0685 //==============================================================================
0686 // Kalman operations - common dummy variables
0687 //==============================================================================
0688 
0689 namespace {
0690   // Dummy variables for parameter consistency to kalmanOperation.
0691   // Through KalmanFilterOperation enum parameter it is guaranteed that
0692   // those will never get accessed in the code (read from or written into).
0693 
0694   CMS_SA_ALLOW mkfit::MPlexLS dummy_err;
0695   CMS_SA_ALLOW mkfit::MPlexLV dummy_par;
0696   CMS_SA_ALLOW mkfit::MPlexQF dummy_chi2;
0697 }  // namespace
0698 
0699 namespace mkfit {
0700 
0701   //==============================================================================
0702   // Kalman operations - Barrel
0703   //==============================================================================
0704 
0705   void kalmanUpdate(const MPlexLS& psErr,
0706                     const MPlexLV& psPar,
0707                     const MPlexHS& msErr,
0708                     const MPlexHV& msPar,
0709                     MPlexLS& outErr,
0710                     MPlexLV& outPar,
0711                     const int N_proc) {
0712     kalmanOperation(KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0713   }
0714 
0715   void kalmanPropagateAndUpdate(const MPlexLS& psErr,
0716                                 const MPlexLV& psPar,
0717                                 MPlexQI& Chg,
0718                                 const MPlexHS& msErr,
0719                                 const MPlexHV& msPar,
0720                                 MPlexLS& outErr,
0721                                 MPlexLV& outPar,
0722                                 MPlexQI& outFailFlag,
0723                                 const int N_proc,
0724                                 const PropagationFlags& propFlags,
0725                                 const bool propToHit) {
0726     if (propToHit) {
0727       MPlexLS propErr;
0728       MPlexLV propPar;
0729       MPlexQF msRad;
0730 #pragma omp simd
0731       for (int n = 0; n < NN; ++n) {
0732         msRad.At(n, 0, 0) = std::hypot(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0733       }
0734 
0735       propagateHelixToRMPlex(psErr, psPar, Chg, msRad, propErr, propPar, outFailFlag, N_proc, propFlags);
0736 
0737       kalmanOperation(
0738           KFO_Update_Params | KFO_Local_Cov, propErr, propPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0739     } else {
0740       kalmanOperation(
0741           KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
0742     }
0743     for (int n = 0; n < NN; ++n) {
0744       if (outPar.At(n, 3, 0) < 0) {
0745         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
0746         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
0747       }
0748     }
0749   }
0750 
0751   //------------------------------------------------------------------------------
0752 
0753   void kalmanComputeChi2(const MPlexLS& psErr,
0754                          const MPlexLV& psPar,
0755                          const MPlexQI& inChg,
0756                          const MPlexHS& msErr,
0757                          const MPlexHV& msPar,
0758                          MPlexQF& outChi2,
0759                          const int N_proc) {
0760     kalmanOperation(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0761   }
0762 
0763   void kalmanPropagateAndComputeChi2(const MPlexLS& psErr,
0764                                      const MPlexLV& psPar,
0765                                      const MPlexQI& inChg,
0766                                      const MPlexHS& msErr,
0767                                      const MPlexHV& msPar,
0768                                      MPlexQF& outChi2,
0769                                      MPlexLV& propPar,
0770                                      MPlexQI& outFailFlag,
0771                                      const int N_proc,
0772                                      const PropagationFlags& propFlags,
0773                                      const bool propToHit) {
0774     propPar = psPar;
0775     if (propToHit) {
0776       MPlexLS propErr;
0777       MPlexQF msRad;
0778 #pragma omp simd
0779       for (int n = 0; n < NN; ++n) {
0780         msRad.At(n, 0, 0) = std::hypot(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0781       }
0782 
0783       propagateHelixToRMPlex(psErr, psPar, inChg, msRad, propErr, propPar, outFailFlag, N_proc, propFlags);
0784 
0785       kalmanOperation(KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0786     } else {
0787       kalmanOperation(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
0788     }
0789   }
0790 
0791   //------------------------------------------------------------------------------
0792 
0793   void kalmanOperation(const int kfOp,
0794                        const MPlexLS& psErr,
0795                        const MPlexLV& psPar,
0796                        const MPlexHS& msErr,
0797                        const MPlexHV& msPar,
0798                        MPlexLS& outErr,
0799                        MPlexLV& outPar,
0800                        MPlexQF& outChi2,
0801                        const int N_proc) {
0802 #ifdef DEBUG
0803     {
0804       dmutex_guard;
0805       printf("psPar:\n");
0806       for (int i = 0; i < 6; ++i) {
0807         printf("%8f ", psPar.constAt(0, 0, i));
0808         printf("\n");
0809       }
0810       printf("\n");
0811       printf("psErr:\n");
0812       for (int i = 0; i < 6; ++i) {
0813         for (int j = 0; j < 6; ++j)
0814           printf("%8f ", psErr.constAt(0, i, j));
0815         printf("\n");
0816       }
0817       printf("\n");
0818       printf("msPar:\n");
0819       for (int i = 0; i < 3; ++i) {
0820         printf("%8f ", msPar.constAt(0, 0, i));
0821         printf("\n");
0822       }
0823       printf("\n");
0824       printf("msErr:\n");
0825       for (int i = 0; i < 3; ++i) {
0826         for (int j = 0; j < 3; ++j)
0827           printf("%8f ", msErr.constAt(0, i, j));
0828         printf("\n");
0829       }
0830       printf("\n");
0831     }
0832 #endif
0833 
0834     // Rotate global point on tangent plane to cylinder
0835     // Tangent point is half way between hit and propagate position
0836 
0837     // Rotation matrix
0838     //  rotT00  0  rotT01
0839     //  rotT01  0 -rotT00
0840     //     0    1    0
0841     // Minimize temporaries: only two float are needed!
0842 
0843     MPlexQF rotT00;
0844     MPlexQF rotT01;
0845     for (int n = 0; n < NN; ++n) {
0846       const float r = std::hypot(msPar.constAt(n, 0, 0), msPar.constAt(n, 1, 0));
0847       rotT00.At(n, 0, 0) = -(msPar.constAt(n, 1, 0) + psPar.constAt(n, 1, 0)) / (2 * r);
0848       rotT01.At(n, 0, 0) = (msPar.constAt(n, 0, 0) + psPar.constAt(n, 0, 0)) / (2 * r);
0849     }
0850 
0851     MPlexHV res_glo;  //position residual in global coordinates
0852     SubtractFirst3(msPar, psPar, res_glo);
0853 
0854     MPlexHS resErr_glo;  //covariance sum in global position coordinates
0855     AddIntoUpperLeft3x3(psErr, msErr, resErr_glo);
0856 
0857     MPlex2V res_loc;  //position residual in local coordinates
0858     RotateResidualsOnTangentPlane(rotT00, rotT01, res_glo, res_loc);
0859     MPlex2S resErr_loc;  //covariance sum in local position coordinates
0860     MPlexHH tempHH;
0861     ProjectResErr(rotT00, rotT01, resErr_glo, tempHH);
0862     ProjectResErrTransp(rotT00, rotT01, tempHH, resErr_loc);
0863 
0864 #ifdef DEBUG
0865     {
0866       dmutex_guard;
0867       printf("res_glo:\n");
0868       for (int i = 0; i < 3; ++i) {
0869         printf("%8f ", res_glo.At(0, i, 0));
0870       }
0871       printf("\n");
0872       printf("resErr_glo:\n");
0873       for (int i = 0; i < 3; ++i) {
0874         for (int j = 0; j < 3; ++j)
0875           printf("%8f ", resErr_glo.At(0, i, j));
0876         printf("\n");
0877       }
0878       printf("\n");
0879       printf("res_loc:\n");
0880       for (int i = 0; i < 2; ++i) {
0881         printf("%8f ", res_loc.At(0, i, 0));
0882       }
0883       printf("\n");
0884       printf("tempHH:\n");
0885       for (int i = 0; i < 3; ++i) {
0886         for (int j = 0; j < 3; ++j)
0887           printf("%8f ", tempHH.At(0, i, j));
0888         printf("\n");
0889       }
0890       printf("\n");
0891       printf("resErr_loc:\n");
0892       for (int i = 0; i < 2; ++i) {
0893         for (int j = 0; j < 2; ++j)
0894           printf("%8f ", resErr_loc.At(0, i, j));
0895         printf("\n");
0896       }
0897       printf("\n");
0898     }
0899 #endif
0900 
0901     //invert the 2x2 matrix
0902     Matriplex::invertCramerSym(resErr_loc);
0903 
0904     if (kfOp & KFO_Calculate_Chi2) {
0905       Chi2Similarity(res_loc, resErr_loc, outChi2);
0906 
0907 #ifdef DEBUG
0908       {
0909         dmutex_guard;
0910         printf("resErr_loc (Inv):\n");
0911         for (int i = 0; i < 2; ++i) {
0912           for (int j = 0; j < 2; ++j)
0913             printf("%8f ", resErr_loc.At(0, i, j));
0914           printf("\n");
0915         }
0916         printf("\n");
0917         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
0918       }
0919 #endif
0920     }
0921 
0922     if (kfOp & KFO_Update_Params) {
0923       MPlexLS psErrLoc = psErr;
0924       if (kfOp & KFO_Local_Cov)
0925         CovXYconstrain(rotT00, rotT01, psErr, psErrLoc);
0926 
0927       MPlexLH K;                                      // kalman gain, fixme should be L2
0928       KalmanHTG(rotT00, rotT01, resErr_loc, tempHH);  // intermediate term to get kalman gain (H^T*G)
0929       KalmanGain(psErrLoc, tempHH, K);
0930 
0931       MultResidualsAdd(K, psPar, res_loc, outPar);
0932 
0933       squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
0934 
0935       MPlexLL tempLL;
0936       KHMult(K, rotT00, rotT01, tempLL);
0937       KHC(tempLL, psErrLoc, outErr);
0938       outErr.subtract(psErrLoc, outErr);
0939 
0940 #ifdef DEBUG
0941       {
0942         dmutex_guard;
0943         if (kfOp & KFO_Local_Cov) {
0944           printf("psErrLoc:\n");
0945           for (int i = 0; i < 6; ++i) {
0946             for (int j = 0; j < 6; ++j)
0947               printf("% 8e ", psErrLoc.At(0, i, j));
0948             printf("\n");
0949           }
0950           printf("\n");
0951         }
0952         printf("resErr_loc (Inv):\n");
0953         for (int i = 0; i < 2; ++i) {
0954           for (int j = 0; j < 2; ++j)
0955             printf("%8f ", resErr_loc.At(0, i, j));
0956           printf("\n");
0957         }
0958         printf("\n");
0959         printf("tempHH:\n");
0960         for (int i = 0; i < 3; ++i) {
0961           for (int j = 0; j < 3; ++j)
0962             printf("%8f ", tempHH.At(0, i, j));
0963           printf("\n");
0964         }
0965         printf("\n");
0966         printf("K:\n");
0967         for (int i = 0; i < 6; ++i) {
0968           for (int j = 0; j < 3; ++j)
0969             printf("%8f ", K.At(0, i, j));
0970           printf("\n");
0971         }
0972         printf("\n");
0973         printf("tempLL:\n");
0974         for (int i = 0; i < 6; ++i) {
0975           for (int j = 0; j < 6; ++j)
0976             printf("%8f ", tempLL.At(0, i, j));
0977           printf("\n");
0978         }
0979         printf("\n");
0980         printf("outPar:\n");
0981         for (int i = 0; i < 6; ++i) {
0982           printf("%8f  ", outPar.At(0, i, 0));
0983         }
0984         printf("\n");
0985         printf("outErr:\n");
0986         for (int i = 0; i < 6; ++i) {
0987           for (int j = 0; j < 6; ++j)
0988             printf("%8f ", outErr.At(0, i, j));
0989           printf("\n");
0990         }
0991         printf("\n");
0992       }
0993 #endif
0994     }
0995   }
0996 
0997   //==============================================================================
0998   // Kalman operations - Plane
0999   //==============================================================================
1000 
1001   void kalmanUpdatePlane(const MPlexLS& psErr,
1002                          const MPlexLV& psPar,
1003                          const MPlexHS& msErr,
1004                          const MPlexHV& msPar,
1005                          const MPlexHV& plNrm,
1006                          const MPlexHV& plDir,
1007                          MPlexLS& outErr,
1008                          MPlexLV& outPar,
1009                          const int N_proc) {
1010     kalmanOperationPlane(
1011         KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, plNrm, plDir, outErr, outPar, dummy_chi2, N_proc);
1012   }
1013 
1014   void kalmanPropagateAndUpdatePlane(const MPlexLS& psErr,
1015                                      const MPlexLV& psPar,
1016                                      MPlexQI& Chg,
1017                                      const MPlexHS& msErr,
1018                                      const MPlexHV& msPar,
1019                                      const MPlexHV& plNrm,
1020                                      const MPlexHV& plDir,
1021                                      MPlexLS& outErr,
1022                                      MPlexLV& outPar,
1023                                      MPlexQI& outFailFlag,
1024                                      const int N_proc,
1025                                      const PropagationFlags& propFlags,
1026                                      const bool propToHit) {
1027     if (propToHit) {
1028       MPlexLS propErr;
1029       MPlexLV propPar;
1030       propagateHelixToPlaneMPlex(psErr, psPar, Chg, msPar, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
1031 
1032       kalmanOperationPlane(KFO_Update_Params | KFO_Local_Cov,
1033                            propErr,
1034                            propPar,
1035                            msErr,
1036                            msPar,
1037                            plNrm,
1038                            plDir,
1039                            outErr,
1040                            outPar,
1041                            dummy_chi2,
1042                            N_proc);
1043     } else {
1044       kalmanOperationPlane(KFO_Update_Params | KFO_Local_Cov,
1045                            psErr,
1046                            psPar,
1047                            msErr,
1048                            msPar,
1049                            plNrm,
1050                            plDir,
1051                            outErr,
1052                            outPar,
1053                            dummy_chi2,
1054                            N_proc);
1055     }
1056     for (int n = 0; n < NN; ++n) {
1057       if (outPar.At(n, 3, 0) < 0) {
1058         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
1059         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
1060       }
1061     }
1062   }
1063 
1064   //------------------------------------------------------------------------------
1065 
1066   void kalmanComputeChi2Plane(const MPlexLS& psErr,
1067                               const MPlexLV& psPar,
1068                               const MPlexQI& inChg,
1069                               const MPlexHS& msErr,
1070                               const MPlexHV& msPar,
1071                               const MPlexHV& plNrm,
1072                               const MPlexHV& plDir,
1073                               MPlexQF& outChi2,
1074                               const int N_proc) {
1075     kalmanOperationPlane(
1076         KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
1077   }
1078 
1079   void kalmanPropagateAndComputeChi2Plane(const MPlexLS& psErr,
1080                                           const MPlexLV& psPar,
1081                                           const MPlexQI& inChg,
1082                                           const MPlexHS& msErr,
1083                                           const MPlexHV& msPar,
1084                                           const MPlexHV& plNrm,
1085                                           const MPlexHV& plDir,
1086                                           MPlexQF& outChi2,
1087                                           MPlexLV& propPar,
1088                                           MPlexQI& outFailFlag,
1089                                           const int N_proc,
1090                                           const PropagationFlags& propFlags,
1091                                           const bool propToHit) {
1092     propPar = psPar;
1093     if (propToHit) {
1094       MPlexLS propErr;
1095       propagateHelixToPlaneMPlex(psErr, psPar, inChg, msPar, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
1096 
1097       kalmanOperationPlane(
1098           KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
1099     } else {
1100       kalmanOperationPlane(
1101           KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
1102     }
1103   }
1104 
1105   //------------------------------------------------------------------------------
1106 
1107   void kalmanOperationPlane(const int kfOp,
1108                             const MPlexLS& psErr,
1109                             const MPlexLV& psPar,
1110                             const MPlexHS& msErr,
1111                             const MPlexHV& msPar,
1112                             const MPlexHV& plNrm,
1113                             const MPlexHV& plDir,
1114                             MPlexLS& outErr,
1115                             MPlexLV& outPar,
1116                             MPlexQF& outChi2,
1117                             const int N_proc) {
1118 #ifdef DEBUG
1119     {
1120       dmutex_guard;
1121       printf("psPar:\n");
1122       for (int i = 0; i < 6; ++i) {
1123         printf("%8f ", psPar.constAt(0, 0, i));
1124         printf("\n");
1125       }
1126       printf("\n");
1127       printf("psErr:\n");
1128       for (int i = 0; i < 6; ++i) {
1129         for (int j = 0; j < 6; ++j)
1130           printf("%8f ", psErr.constAt(0, i, j));
1131         printf("\n");
1132       }
1133       printf("\n");
1134       printf("msPar:\n");
1135       for (int i = 0; i < 3; ++i) {
1136         printf("%8f ", msPar.constAt(0, 0, i));
1137         printf("\n");
1138       }
1139       printf("\n");
1140       printf("msErr:\n");
1141       for (int i = 0; i < 3; ++i) {
1142         for (int j = 0; j < 3; ++j)
1143           printf("%8f ", msErr.constAt(0, i, j));
1144         printf("\n");
1145       }
1146       printf("\n");
1147     }
1148 #endif
1149 
1150     // Rotate global point on tangent plane to cylinder
1151     // Tangent point is half way between hit and propagate position
1152 
1153     // Rotation matrix
1154     //    D0  D1   D2
1155     //    X0  X1   X2
1156     //    N0  N1   N2
1157     // where D is the strip direction vector plDir, N is the normal plNrm, and X is the cross product between the two
1158 
1159     MPlex2H prj;
1160     for (int n = 0; n < NN; ++n) {
1161       prj(n, 0, 0) = plDir(n, 0, 0);
1162       prj(n, 0, 1) = plDir(n, 1, 0);
1163       prj(n, 0, 2) = plDir(n, 2, 0);
1164       prj(n, 1, 0) = plNrm(n, 1, 0) * plDir(n, 2, 0) - plNrm(n, 2, 0) * plDir(n, 1, 0);
1165       prj(n, 1, 1) = plNrm(n, 2, 0) * plDir(n, 0, 0) - plNrm(n, 0, 0) * plDir(n, 2, 0);
1166       prj(n, 1, 2) = plNrm(n, 0, 0) * plDir(n, 1, 0) - plNrm(n, 1, 0) * plDir(n, 0, 0);
1167     }
1168 
1169     MPlexHV res_glo;  //position residual in global coordinates
1170     SubtractFirst3(msPar, psPar, res_glo);
1171 
1172     MPlexHS resErr_glo;  //covariance sum in global position coordinates
1173     AddIntoUpperLeft3x3(psErr, msErr, resErr_glo);
1174 
1175     MPlex2V res_loc;  //position residual in local coordinates
1176     RotateResidualsOnPlane(prj, res_glo, res_loc);
1177     MPlex2S resErr_loc;  //covariance sum in local position coordinates
1178     MPlex2H temp2H;
1179     ProjectResErr(prj, resErr_glo, temp2H);
1180     ProjectResErrTransp(prj, temp2H, resErr_loc);
1181 
1182 #ifdef DEBUG
1183     {
1184       dmutex_guard;
1185       printf("prj:\n");
1186       for (int i = 0; i < 2; ++i) {
1187         for (int j = 0; j < 3; ++j)
1188           printf("%8f ", prj.At(0, i, j));
1189         printf("\n");
1190       }
1191       printf("\n");
1192       printf("res_glo:\n");
1193       for (int i = 0; i < 3; ++i) {
1194         printf("%8f ", res_glo.At(0, i, 0));
1195       }
1196       printf("\n");
1197       printf("resErr_glo:\n");
1198       for (int i = 0; i < 3; ++i) {
1199         for (int j = 0; j < 3; ++j)
1200           printf("%8f ", resErr_glo.At(0, i, j));
1201         printf("\n");
1202       }
1203       printf("\n");
1204       printf("res_loc:\n");
1205       for (int i = 0; i < 2; ++i) {
1206         printf("%8f ", res_loc.At(0, i, 0));
1207       }
1208       printf("\n");
1209       printf("temp2H:\n");
1210       for (int i = 0; i < 2; ++i) {
1211         for (int j = 0; j < 3; ++j)
1212           printf("%8f ", temp2H.At(0, i, j));
1213         printf("\n");
1214       }
1215       printf("\n");
1216       printf("resErr_loc:\n");
1217       for (int i = 0; i < 2; ++i) {
1218         for (int j = 0; j < 2; ++j)
1219           printf("%8f ", resErr_loc.At(0, i, j));
1220         printf("\n");
1221       }
1222       printf("\n");
1223     }
1224 #endif
1225 
1226     //invert the 2x2 matrix
1227     Matriplex::invertCramerSym(resErr_loc);
1228 
1229     if (kfOp & KFO_Calculate_Chi2) {
1230       Chi2Similarity(res_loc, resErr_loc, outChi2);
1231 
1232 #ifdef DEBUG
1233       {
1234         dmutex_guard;
1235         printf("resErr_loc (Inv):\n");
1236         for (int i = 0; i < 2; ++i) {
1237           for (int j = 0; j < 2; ++j)
1238             printf("%8f ", resErr_loc.At(0, i, j));
1239           printf("\n");
1240         }
1241         printf("\n");
1242         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
1243       }
1244 #endif
1245     }
1246 
1247     if (kfOp & KFO_Update_Params) {
1248       MPlexLS psErrLoc = psErr;
1249 
1250       MPlexH2 tempH2;
1251       MPlexL2 K;                           // kalman gain, fixme should be L2
1252       KalmanHTG(prj, resErr_loc, tempH2);  // intermediate term to get kalman gain (H^T*G)
1253       KalmanGain(psErrLoc, tempH2, K);
1254 
1255       MultResidualsAdd(K, psPar, res_loc, outPar);
1256 
1257       squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
1258 
1259       MPlexLL tempLL;
1260       KHMult(K, prj, tempLL);
1261       KHC(tempLL, psErrLoc, outErr);
1262       outErr.subtract(psErrLoc, outErr);
1263 
1264 #ifdef DEBUG
1265       {
1266         dmutex_guard;
1267         if (kfOp & KFO_Local_Cov) {
1268           printf("psErrLoc:\n");
1269           for (int i = 0; i < 6; ++i) {
1270             for (int j = 0; j < 6; ++j)
1271               printf("% 8e ", psErrLoc.At(0, i, j));
1272             printf("\n");
1273           }
1274           printf("\n");
1275         }
1276         printf("resErr_loc (Inv):\n");
1277         for (int i = 0; i < 2; ++i) {
1278           for (int j = 0; j < 2; ++j)
1279             printf("%8f ", resErr_loc.At(0, i, j));
1280           printf("\n");
1281         }
1282         printf("\n");
1283         printf("tempH2:\n");
1284         for (int i = 0; i < 3; ++i) {
1285           for (int j = 0; j < 2; ++j)
1286             printf("%8f ", tempH2.At(0, i, j));
1287           printf("\n");
1288         }
1289         printf("\n");
1290         printf("K:\n");
1291         for (int i = 0; i < 6; ++i) {
1292           for (int j = 0; j < 2; ++j)
1293             printf("%8f ", K.At(0, i, j));
1294           printf("\n");
1295         }
1296         printf("\n");
1297         printf("tempLL:\n");
1298         for (int i = 0; i < 6; ++i) {
1299           for (int j = 0; j < 6; ++j)
1300             printf("%8f ", tempLL.At(0, i, j));
1301           printf("\n");
1302         }
1303         printf("\n");
1304         printf("outPar:\n");
1305         for (int i = 0; i < 6; ++i) {
1306           printf("%8f  ", outPar.At(0, i, 0));
1307         }
1308         printf("\n");
1309         printf("outErr:\n");
1310         for (int i = 0; i < 6; ++i) {
1311           for (int j = 0; j < 6; ++j)
1312             printf("%8f ", outErr.At(0, i, j));
1313           printf("\n");
1314         }
1315         printf("\n");
1316       }
1317 #endif
1318     }
1319   }
1320 
1321   //==============================================================================
1322   // Kalman operations - Endcap
1323   //==============================================================================
1324 
1325   void kalmanUpdateEndcap(const MPlexLS& psErr,
1326                           const MPlexLV& psPar,
1327                           const MPlexHS& msErr,
1328                           const MPlexHV& msPar,
1329                           MPlexLS& outErr,
1330                           MPlexLV& outPar,
1331                           const int N_proc) {
1332     kalmanOperationEndcap(KFO_Update_Params, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
1333   }
1334 
1335   void kalmanPropagateAndUpdateEndcap(const MPlexLS& psErr,
1336                                       const MPlexLV& psPar,
1337                                       MPlexQI& Chg,
1338                                       const MPlexHS& msErr,
1339                                       const MPlexHV& msPar,
1340                                       MPlexLS& outErr,
1341                                       MPlexLV& outPar,
1342                                       MPlexQI& outFailFlag,
1343                                       const int N_proc,
1344                                       const PropagationFlags& propFlags,
1345                                       const bool propToHit) {
1346     if (propToHit) {
1347       MPlexLS propErr;
1348       MPlexLV propPar;
1349       MPlexQF msZ;
1350 #pragma omp simd
1351       for (int n = 0; n < NN; ++n) {
1352         msZ.At(n, 0, 0) = msPar.constAt(n, 2, 0);
1353       }
1354 
1355       propagateHelixToZMPlex(psErr, psPar, Chg, msZ, propErr, propPar, outFailFlag, N_proc, propFlags);
1356 
1357       kalmanOperationEndcap(KFO_Update_Params, propErr, propPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
1358     } else {
1359       kalmanOperationEndcap(KFO_Update_Params, psErr, psPar, msErr, msPar, outErr, outPar, dummy_chi2, N_proc);
1360     }
1361     for (int n = 0; n < NN; ++n) {
1362       if (outPar.At(n, 3, 0) < 0) {
1363         Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
1364         outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
1365       }
1366     }
1367   }
1368 
1369   //------------------------------------------------------------------------------
1370 
1371   void kalmanComputeChi2Endcap(const MPlexLS& psErr,
1372                                const MPlexLV& psPar,
1373                                const MPlexQI& inChg,
1374                                const MPlexHS& msErr,
1375                                const MPlexHV& msPar,
1376                                MPlexQF& outChi2,
1377                                const int N_proc) {
1378     kalmanOperationEndcap(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
1379   }
1380 
1381   void kalmanPropagateAndComputeChi2Endcap(const MPlexLS& psErr,
1382                                            const MPlexLV& psPar,
1383                                            const MPlexQI& inChg,
1384                                            const MPlexHS& msErr,
1385                                            const MPlexHV& msPar,
1386                                            MPlexQF& outChi2,
1387                                            MPlexLV& propPar,
1388                                            MPlexQI& outFailFlag,
1389                                            const int N_proc,
1390                                            const PropagationFlags& propFlags,
1391                                            const bool propToHit) {
1392     propPar = psPar;
1393     if (propToHit) {
1394       MPlexLS propErr;
1395       MPlexQF msZ;
1396 #pragma omp simd
1397       for (int n = 0; n < NN; ++n) {
1398         msZ.At(n, 0, 0) = msPar.constAt(n, 2, 0);
1399       }
1400 
1401       propagateHelixToZMPlex(psErr, psPar, inChg, msZ, propErr, propPar, outFailFlag, N_proc, propFlags);
1402 
1403       kalmanOperationEndcap(KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
1404     } else {
1405       kalmanOperationEndcap(KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, dummy_err, dummy_par, outChi2, N_proc);
1406     }
1407   }
1408 
1409   //------------------------------------------------------------------------------
1410 
1411   void kalmanOperationEndcap(const int kfOp,
1412                              const MPlexLS& psErr,
1413                              const MPlexLV& psPar,
1414                              const MPlexHS& msErr,
1415                              const MPlexHV& msPar,
1416                              MPlexLS& outErr,
1417                              MPlexLV& outPar,
1418                              MPlexQF& outChi2,
1419                              const int N_proc) {
1420 #ifdef DEBUG
1421     {
1422       dmutex_guard;
1423       printf("updateParametersEndcapMPlex\n");
1424       printf("psPar:\n");
1425       for (int i = 0; i < 6; ++i) {
1426         printf("%8f ", psPar.constAt(0, 0, i));
1427         printf("\n");
1428       }
1429       printf("\n");
1430       printf("msPar:\n");
1431       for (int i = 0; i < 3; ++i) {
1432         printf("%8f ", msPar.constAt(0, 0, i));
1433         printf("\n");
1434       }
1435       printf("\n");
1436       printf("psErr:\n");
1437       for (int i = 0; i < 6; ++i) {
1438         for (int j = 0; j < 6; ++j)
1439           printf("%8f ", psErr.constAt(0, i, j));
1440         printf("\n");
1441       }
1442       printf("\n");
1443       printf("msErr:\n");
1444       for (int i = 0; i < 3; ++i) {
1445         for (int j = 0; j < 3; ++j)
1446           printf("%8f ", msErr.constAt(0, i, j));
1447         printf("\n");
1448       }
1449       printf("\n");
1450     }
1451 #endif
1452 
1453     MPlex2V res;
1454     SubtractFirst2(msPar, psPar, res);
1455 
1456     MPlex2S resErr;
1457     AddIntoUpperLeft2x2(psErr, msErr, resErr);
1458 
1459 #ifdef DEBUG
1460     {
1461       dmutex_guard;
1462       printf("resErr:\n");
1463       for (int i = 0; i < 2; ++i) {
1464         for (int j = 0; j < 2; ++j)
1465           printf("%8f ", resErr.At(0, i, j));
1466         printf("\n");
1467       }
1468       printf("\n");
1469     }
1470 #endif
1471 
1472     //invert the 2x2 matrix
1473     Matriplex::invertCramerSym(resErr);
1474 
1475     if (kfOp & KFO_Calculate_Chi2) {
1476       Chi2Similarity(res, resErr, outChi2);
1477 
1478 #ifdef DEBUG
1479       {
1480         dmutex_guard;
1481         printf("resErr_loc (Inv):\n");
1482         for (int i = 0; i < 2; ++i) {
1483           for (int j = 0; j < 2; ++j)
1484             printf("%8f ", resErr.At(0, i, j));
1485           printf("\n");
1486         }
1487         printf("\n");
1488         printf("chi2: %8f\n", outChi2.At(0, 0, 0));
1489       }
1490 #endif
1491     }
1492 
1493     if (kfOp & KFO_Update_Params) {
1494       MPlexL2 K;
1495       KalmanGain(psErr, resErr, K);
1496 
1497       MultResidualsAdd(K, psPar, res, outPar);
1498 
1499       squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
1500 
1501       KHC(K, psErr, outErr);
1502 
1503 #ifdef DEBUG
1504       {
1505         dmutex_guard;
1506         printf("outErr before subtract:\n");
1507         for (int i = 0; i < 6; ++i) {
1508           for (int j = 0; j < 6; ++j)
1509             printf("%8f ", outErr.At(0, i, j));
1510           printf("\n");
1511         }
1512         printf("\n");
1513       }
1514 #endif
1515 
1516       outErr.subtract(psErr, outErr);
1517 
1518 #ifdef DEBUG
1519       {
1520         dmutex_guard;
1521         printf("res:\n");
1522         for (int i = 0; i < 2; ++i) {
1523           printf("%8f ", res.At(0, i, 0));
1524         }
1525         printf("\n");
1526         printf("resErr (Inv):\n");
1527         for (int i = 0; i < 2; ++i) {
1528           for (int j = 0; j < 2; ++j)
1529             printf("%8f ", resErr.At(0, i, j));
1530           printf("\n");
1531         }
1532         printf("\n");
1533         printf("K:\n");
1534         for (int i = 0; i < 6; ++i) {
1535           for (int j = 0; j < 2; ++j)
1536             printf("%8f ", K.At(0, i, j));
1537           printf("\n");
1538         }
1539         printf("\n");
1540         printf("outPar:\n");
1541         for (int i = 0; i < 6; ++i) {
1542           printf("%8f  ", outPar.At(0, i, 0));
1543         }
1544         printf("\n");
1545         printf("outErr:\n");
1546         for (int i = 0; i < 6; ++i) {
1547           for (int j = 0; j < 6; ++j)
1548             printf("%8f ", outErr.At(0, i, j));
1549           printf("\n");
1550         }
1551         printf("\n");
1552       }
1553 #endif
1554     }
1555   }
1556 
1557 }  // end namespace mkfit