Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoTracker/MkFitCore/src/KalmanGain.ah is written in an unsupported language. File is not indexed.

0001 #ifdef MPLEX_INTRINSICS
0002 
// Vectorized branch (compiled when MPLEX_INTRINSICS is defined).
//
// Each iteration advances n by MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T), i.e. by
// one vector's worth of elements, and fills c planes 0..17 from planes of a and b.
// The plane indices used here are the same ones used by the scalar loop in the
// #else branch of this file:
//   c[3*i + j] = a[r0]*b[j] + a[r1]*b[3 + j] + a[r2]*b[6 + j]   for j = 0, 1
//   c[3*i + 2] = 0
// with (r0, r1, r2) = (0,1,3), (1,2,4), (3,4,5), (6,7,8), (10,11,12), (15,16,17)
// for i = 0..5.
//
// LD, MUL, FMA, ST and IntrVec_t are defined outside this fragment.
// NOTE(review): presumably LD(x, k) / ST(x, k, v) load / store the vector of
// plane k at the current offset n, and FMA(p, q, r) yields p*q + r - confirm
// against the macro definitions.
// NOTE(review): stores of finished results are interleaved with the arithmetic of
// the next output row; this looks like deliberate (generated) scheduling, so the
// statement order is left untouched.
0003    for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T))
0004    {
           // Zero vector written to every third c plane (2, 5, 8, 11, 14, 17):
           // 16 initializers when AVX512_INTRINSICS is defined, 8 otherwise.
0005       #ifdef AVX512_INTRINSICS
0006       IntrVec_t all_zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
0007       #else
0008       IntrVec_t all_zeros = { 0, 0, 0, 0, 0, 0, 0, 0 };
0009       #endif
0010 
           // Output row 0 (c planes 0, 1): a planes 0, 1, 3.
           // The b planes loaded here (0, 1, 3, 4, 6, 7) are reused by all later rows.
0011       IntrVec_t a_0 = LD(a, 0);
0012       IntrVec_t b_0 = LD(b, 0);
0013       IntrVec_t c_0 = MUL(a_0, b_0);
0014       IntrVec_t b_1 = LD(b, 1);
0015       IntrVec_t c_1 = MUL(a_0, b_1);
0016 
0017       IntrVec_t a_1 = LD(a, 1);
0018       IntrVec_t b_3 = LD(b, 3);
0019       c_0 = FMA(a_1, b_3, c_0);
0020       IntrVec_t b_4 = LD(b, 4);
0021       c_1 = FMA(a_1, b_4, c_1);
0022 
0023       IntrVec_t a_3 = LD(a, 3);
0024       IntrVec_t b_6 = LD(b, 6);
0025       c_0 = FMA(a_3, b_6, c_0);
0026       IntrVec_t b_7 = LD(b, 7);
0027       c_1 = FMA(a_3, b_7, c_1);
0028 
0029 
0030 
0031       ST(c, 2, all_zeros);
0032 
           // Output row 1 (c planes 3, 4): a planes 1, 2, 4.
           // c_0 / c_1 from row 0 are stored in the middle of this row's arithmetic.
0033       IntrVec_t c_3 = MUL(a_1, b_0);
0034       IntrVec_t c_4 = MUL(a_1, b_1);
0035 
0036       IntrVec_t a_2 = LD(a, 2);
0037       c_3 = FMA(a_2, b_3, c_3);
0038       ST(c, 0, c_0);
0039       ST(c, 1, c_1);
0040       c_4 = FMA(a_2, b_4, c_4);
0041 
0042       IntrVec_t a_4 = LD(a, 4);
0043       c_3 = FMA(a_4, b_6, c_3);
0044       c_4 = FMA(a_4, b_7, c_4);
0045 
0046 
0047 
0048       ST(c, 5, all_zeros);
0049 
           // Output row 2 (c planes 6, 7): a planes 3, 4, 5 (a_3 and a_4 are reused).
0050       IntrVec_t c_6 = MUL(a_3, b_0);
0051       IntrVec_t c_7 = MUL(a_3, b_1);
0052 
0053       c_6 = FMA(a_4, b_3, c_6);
0054       c_7 = FMA(a_4, b_4, c_7);
0055       ST(c, 3, c_3);
0056       ST(c, 4, c_4);
0057 
0058       IntrVec_t a_5 = LD(a, 5);
0059       c_6 = FMA(a_5, b_6, c_6);
0060       c_7 = FMA(a_5, b_7, c_7);
0061 
0062 
0063 
0064       ST(c, 8, all_zeros);
0065 
           // Output row 3 (c planes 9, 10): a planes 6, 7, 8.
0066       IntrVec_t a_6 = LD(a, 6);
0067       IntrVec_t c_9 = MUL(a_6, b_0);
0068       IntrVec_t c_10 = MUL(a_6, b_1);
0069 
0070       IntrVec_t a_7 = LD(a, 7);
0071       c_9 = FMA(a_7, b_3, c_9);
0072       ST(c, 6, c_6);
0073       ST(c, 7, c_7);
0074       c_10 = FMA(a_7, b_4, c_10);
0075 
0076       IntrVec_t a_8 = LD(a, 8);
0077       c_9 = FMA(a_8, b_6, c_9);
0078       c_10 = FMA(a_8, b_7, c_10);
0079 
0080 
0081 
0082       ST(c, 11, all_zeros);
0083 
           // Output row 4 (c planes 12, 13): a planes 10, 11, 12 (plane 9 is not read).
0084       IntrVec_t a_10 = LD(a, 10);
0085       IntrVec_t c_12 = MUL(a_10, b_0);
0086       IntrVec_t c_13 = MUL(a_10, b_1);
0087 
0088       IntrVec_t a_11 = LD(a, 11);
0089       c_12 = FMA(a_11, b_3, c_12);
0090       ST(c, 9, c_9);
0091       ST(c, 10, c_10);
0092       c_13 = FMA(a_11, b_4, c_13);
0093 
0094       IntrVec_t a_12 = LD(a, 12);
0095       c_12 = FMA(a_12, b_6, c_12);
0096       c_13 = FMA(a_12, b_7, c_13);
0097 
0098 
0099 
0100       ST(c, 14, all_zeros);
0101 
           // Output row 5 (c planes 15, 16): a planes 15, 16, 17 (planes 13, 14 are not read).
0102       IntrVec_t a_15 = LD(a, 15);
0103       IntrVec_t c_15 = MUL(a_15, b_0);
0104       IntrVec_t c_16 = MUL(a_15, b_1);
0105 
0106       IntrVec_t a_16 = LD(a, 16);
0107       c_15 = FMA(a_16, b_3, c_15);
0108       ST(c, 12, c_12);
0109       ST(c, 13, c_13);
0110       c_16 = FMA(a_16, b_4, c_16);
0111 
0112       IntrVec_t a_17 = LD(a, 17);
0113       c_15 = FMA(a_17, b_6, c_15);
0114       c_16 = FMA(a_17, b_7, c_16);
0115 
0116 
0117 
           // Final stores: zero plane 17, then the last computed row.
0118       ST(c, 17, all_zeros);
0119       ST(c, 15, c_15);
0120       ST(c, 16, c_16);
0121    }
0122 
0123 #else
0124 
// Scalar fallback (compiled when MPLEX_INTRINSICS is not defined).
//
// Element k of item n is addressed as x[k*N + n] in all three arrays, so the
// loop over n walks N independent items; `#pragma omp simd` asks the compiler
// to vectorize that loop.
//
// For each item the loop writes 18 values of c, laid out as 6 rows of 3:
//   c[3*i + 0], c[3*i + 1] : three-term sums of products a*b
//   c[3*i + 2]             : always 0
// b is read only at indices 0, 1, 3, 4, 6, 7 (3*k + j with j = 0, 1); indices
// 2, 5, 8 are never read.
// a is read at (0,1,3), (1,2,4), (3,4,5), (6,7,8), (10,11,12), (15,16,17) for
// rows 0..5. These match packed lower-triangular indexing i*(i+1)/2 + j of a
// symmetric matrix restricted to its first three columns.
// NOTE(review): presumably a is a packed symmetric 6x6 and b a row-major 3x3
// whose last column is treated as zero - confirm against the including code.
0125 #pragma omp simd
0126    for (int n = 0; n < N; ++n)
0127    {
           // Row 0: a indices 0, 1, 3.
0128       c[ 0*N+n] = a[ 0*N+n]*b[ 0*N+n] + a[ 1*N+n]*b[ 3*N+n] + a[ 3*N+n]*b[ 6*N+n];
0129       c[ 1*N+n] = a[ 0*N+n]*b[ 1*N+n] + a[ 1*N+n]*b[ 4*N+n] + a[ 3*N+n]*b[ 7*N+n];
0130       c[ 2*N+n] = 0;
           // Row 1: a indices 1, 2, 4.
0131       c[ 3*N+n] = a[ 1*N+n]*b[ 0*N+n] + a[ 2*N+n]*b[ 3*N+n] + a[ 4*N+n]*b[ 6*N+n];
0132       c[ 4*N+n] = a[ 1*N+n]*b[ 1*N+n] + a[ 2*N+n]*b[ 4*N+n] + a[ 4*N+n]*b[ 7*N+n];
0133       c[ 5*N+n] = 0;
           // Row 2: a indices 3, 4, 5.
0134       c[ 6*N+n] = a[ 3*N+n]*b[ 0*N+n] + a[ 4*N+n]*b[ 3*N+n] + a[ 5*N+n]*b[ 6*N+n];
0135       c[ 7*N+n] = a[ 3*N+n]*b[ 1*N+n] + a[ 4*N+n]*b[ 4*N+n] + a[ 5*N+n]*b[ 7*N+n];
0136       c[ 8*N+n] = 0;
           // Row 3: a indices 6, 7, 8.
0137       c[ 9*N+n] = a[ 6*N+n]*b[ 0*N+n] + a[ 7*N+n]*b[ 3*N+n] + a[ 8*N+n]*b[ 6*N+n];
0138       c[10*N+n] = a[ 6*N+n]*b[ 1*N+n] + a[ 7*N+n]*b[ 4*N+n] + a[ 8*N+n]*b[ 7*N+n];
0139       c[11*N+n] = 0;
           // Row 4: a indices 10, 11, 12 (index 9 is not read).
0140       c[12*N+n] = a[10*N+n]*b[ 0*N+n] + a[11*N+n]*b[ 3*N+n] + a[12*N+n]*b[ 6*N+n];
0141       c[13*N+n] = a[10*N+n]*b[ 1*N+n] + a[11*N+n]*b[ 4*N+n] + a[12*N+n]*b[ 7*N+n];
0142       c[14*N+n] = 0;
           // Row 5: a indices 15, 16, 17 (indices 13, 14 are not read).
0143       c[15*N+n] = a[15*N+n]*b[ 0*N+n] + a[16*N+n]*b[ 3*N+n] + a[17*N+n]*b[ 6*N+n];
0144       c[16*N+n] = a[15*N+n]*b[ 1*N+n] + a[16*N+n]*b[ 4*N+n] + a[17*N+n]*b[ 7*N+n];
0145       c[17*N+n] = 0;
0146    }
0147 #endif