Warning, /RecoTracker/MkFitCore/src/KalmanGain.ah is written in an unsupported language. File is not indexed.
0001 #ifdef MPLEX_INTRINSICS
0002
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T))
0004 {
0005 #ifdef AVX512_INTRINSICS
0006 IntrVec_t all_zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
0007 #else
0008 IntrVec_t all_zeros = { 0, 0, 0, 0, 0, 0, 0, 0 };
0009 #endif
0010
0011 IntrVec_t a_0 = LD(a, 0);
0012 IntrVec_t b_0 = LD(b, 0);
0013 IntrVec_t c_0 = MUL(a_0, b_0);
0014 IntrVec_t b_1 = LD(b, 1);
0015 IntrVec_t c_1 = MUL(a_0, b_1);
0016
0017 IntrVec_t a_1 = LD(a, 1);
0018 IntrVec_t b_3 = LD(b, 3);
0019 c_0 = FMA(a_1, b_3, c_0);
0020 IntrVec_t b_4 = LD(b, 4);
0021 c_1 = FMA(a_1, b_4, c_1);
0022
0023 IntrVec_t a_3 = LD(a, 3);
0024 IntrVec_t b_6 = LD(b, 6);
0025 c_0 = FMA(a_3, b_6, c_0);
0026 IntrVec_t b_7 = LD(b, 7);
0027 c_1 = FMA(a_3, b_7, c_1);
0028
0029
0030
0031 ST(c, 2, all_zeros);
0032
0033 IntrVec_t c_3 = MUL(a_1, b_0);
0034 IntrVec_t c_4 = MUL(a_1, b_1);
0035
0036 IntrVec_t a_2 = LD(a, 2);
0037 c_3 = FMA(a_2, b_3, c_3);
0038 ST(c, 0, c_0);
0039 ST(c, 1, c_1);
0040 c_4 = FMA(a_2, b_4, c_4);
0041
0042 IntrVec_t a_4 = LD(a, 4);
0043 c_3 = FMA(a_4, b_6, c_3);
0044 c_4 = FMA(a_4, b_7, c_4);
0045
0046
0047
0048 ST(c, 5, all_zeros);
0049
0050 IntrVec_t c_6 = MUL(a_3, b_0);
0051 IntrVec_t c_7 = MUL(a_3, b_1);
0052
0053 c_6 = FMA(a_4, b_3, c_6);
0054 c_7 = FMA(a_4, b_4, c_7);
0055 ST(c, 3, c_3);
0056 ST(c, 4, c_4);
0057
0058 IntrVec_t a_5 = LD(a, 5);
0059 c_6 = FMA(a_5, b_6, c_6);
0060 c_7 = FMA(a_5, b_7, c_7);
0061
0062
0063
0064 ST(c, 8, all_zeros);
0065
0066 IntrVec_t a_6 = LD(a, 6);
0067 IntrVec_t c_9 = MUL(a_6, b_0);
0068 IntrVec_t c_10 = MUL(a_6, b_1);
0069
0070 IntrVec_t a_7 = LD(a, 7);
0071 c_9 = FMA(a_7, b_3, c_9);
0072 ST(c, 6, c_6);
0073 ST(c, 7, c_7);
0074 c_10 = FMA(a_7, b_4, c_10);
0075
0076 IntrVec_t a_8 = LD(a, 8);
0077 c_9 = FMA(a_8, b_6, c_9);
0078 c_10 = FMA(a_8, b_7, c_10);
0079
0080
0081
0082 ST(c, 11, all_zeros);
0083
0084 IntrVec_t a_10 = LD(a, 10);
0085 IntrVec_t c_12 = MUL(a_10, b_0);
0086 IntrVec_t c_13 = MUL(a_10, b_1);
0087
0088 IntrVec_t a_11 = LD(a, 11);
0089 c_12 = FMA(a_11, b_3, c_12);
0090 ST(c, 9, c_9);
0091 ST(c, 10, c_10);
0092 c_13 = FMA(a_11, b_4, c_13);
0093
0094 IntrVec_t a_12 = LD(a, 12);
0095 c_12 = FMA(a_12, b_6, c_12);
0096 c_13 = FMA(a_12, b_7, c_13);
0097
0098
0099
0100 ST(c, 14, all_zeros);
0101
0102 IntrVec_t a_15 = LD(a, 15);
0103 IntrVec_t c_15 = MUL(a_15, b_0);
0104 IntrVec_t c_16 = MUL(a_15, b_1);
0105
0106 IntrVec_t a_16 = LD(a, 16);
0107 c_15 = FMA(a_16, b_3, c_15);
0108 ST(c, 12, c_12);
0109 ST(c, 13, c_13);
0110 c_16 = FMA(a_16, b_4, c_16);
0111
0112 IntrVec_t a_17 = LD(a, 17);
0113 c_15 = FMA(a_17, b_6, c_15);
0114 c_16 = FMA(a_17, b_7, c_16);
0115
0116
0117
0118 ST(c, 17, all_zeros);
0119 ST(c, 15, c_15);
0120 ST(c, 16, c_16);
0121 }
0122
0123 #else
0124
0125 #pragma omp simd
0126 for (int n = 0; n < N; ++n)
0127 {
0128 c[ 0*N+n] = a[ 0*N+n]*b[ 0*N+n] + a[ 1*N+n]*b[ 3*N+n] + a[ 3*N+n]*b[ 6*N+n];
0129 c[ 1*N+n] = a[ 0*N+n]*b[ 1*N+n] + a[ 1*N+n]*b[ 4*N+n] + a[ 3*N+n]*b[ 7*N+n];
0130 c[ 2*N+n] = 0;
0131 c[ 3*N+n] = a[ 1*N+n]*b[ 0*N+n] + a[ 2*N+n]*b[ 3*N+n] + a[ 4*N+n]*b[ 6*N+n];
0132 c[ 4*N+n] = a[ 1*N+n]*b[ 1*N+n] + a[ 2*N+n]*b[ 4*N+n] + a[ 4*N+n]*b[ 7*N+n];
0133 c[ 5*N+n] = 0;
0134 c[ 6*N+n] = a[ 3*N+n]*b[ 0*N+n] + a[ 4*N+n]*b[ 3*N+n] + a[ 5*N+n]*b[ 6*N+n];
0135 c[ 7*N+n] = a[ 3*N+n]*b[ 1*N+n] + a[ 4*N+n]*b[ 4*N+n] + a[ 5*N+n]*b[ 7*N+n];
0136 c[ 8*N+n] = 0;
0137 c[ 9*N+n] = a[ 6*N+n]*b[ 0*N+n] + a[ 7*N+n]*b[ 3*N+n] + a[ 8*N+n]*b[ 6*N+n];
0138 c[10*N+n] = a[ 6*N+n]*b[ 1*N+n] + a[ 7*N+n]*b[ 4*N+n] + a[ 8*N+n]*b[ 7*N+n];
0139 c[11*N+n] = 0;
0140 c[12*N+n] = a[10*N+n]*b[ 0*N+n] + a[11*N+n]*b[ 3*N+n] + a[12*N+n]*b[ 6*N+n];
0141 c[13*N+n] = a[10*N+n]*b[ 1*N+n] + a[11*N+n]*b[ 4*N+n] + a[12*N+n]*b[ 7*N+n];
0142 c[14*N+n] = 0;
0143 c[15*N+n] = a[15*N+n]*b[ 0*N+n] + a[16*N+n]*b[ 3*N+n] + a[17*N+n]*b[ 6*N+n];
0144 c[16*N+n] = a[15*N+n]*b[ 1*N+n] + a[16*N+n]*b[ 4*N+n] + a[17*N+n]*b[ 7*N+n];
0145 c[17*N+n] = 0;
0146 }
0147 #endif