Warning, /RecoTracker/MkFitCore/src/MultHelixPropTransp.ah is written in an unsupported language. File is not indexed.
// Vector-intrinsics variant of the kernel. Each loop iteration advances n by
// MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T) and writes the 21 outputs
// c[0..20], built from selected elements of a and b.
// LD/ST/MUL/FMA/ADD and IntrVec_t are defined outside this fragment.
// NOTE(review): presumably LD(x, k) loads element k of x for the current
// n-lanes, FMA(x, y, z) is x * y + z, and ST(c, k, v) stores v as output k --
// confirm against the macro definitions. Under that reading every accumulator
// below matches, term for term and in the same order, the scalar formula for
// the same output in the #else branch of this file.
0001 #ifdef MPLEX_INTRINSICS
0002
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) {
// c_0 accumulates the products of b and a elements 0, 1, 3, 4.
0004 IntrVec_t b_0 = LD(b, 0);
0005 IntrVec_t a_0 = LD(a, 0);
0006 IntrVec_t c_0 = MUL(b_0, a_0);
0007
0008 IntrVec_t b_1 = LD(b, 1);
0009 IntrVec_t a_1 = LD(a, 1);
0010 c_0 = FMA(b_1, a_1, c_0);
0011
0012 IntrVec_t b_3 = LD(b, 3);
0013 IntrVec_t a_3 = LD(a, 3);
0014 c_0 = FMA(b_3, a_3, c_0);
0015
0016 IntrVec_t b_4 = LD(b, 4);
0017 IntrVec_t a_4 = LD(a, 4);
0018 c_0 = FMA(b_4, a_4, c_0);
0019
// b elements 6, 7, 9, 10 feed c_1 (with the already-loaded a_0, a_1, a_3, a_4)
// and c_2 (with a elements 6, 7, 9, 10). Each a element is loaded once and
// reused by later accumulators.
0020 IntrVec_t b_6 = LD(b, 6);
0021 IntrVec_t c_1 = MUL(b_6, a_0);
0022 IntrVec_t a_6 = LD(a, 6);
0023 IntrVec_t c_2 = MUL(b_6, a_6);
// c_0 is complete at this point; its store is placed among the next group's
// first operations.
0024 ST(c, 0, c_0);
0025
0026 IntrVec_t b_7 = LD(b, 7);
0027 c_1 = FMA(b_7, a_1, c_1);
0028 IntrVec_t a_7 = LD(a, 7);
0029 c_2 = FMA(b_7, a_7, c_2);
0030
0031 IntrVec_t b_9 = LD(b, 9);
0032 c_1 = FMA(b_9, a_3, c_1);
0033 IntrVec_t a_9 = LD(a, 9);
0034 c_2 = FMA(b_9, a_9, c_2);
0035
0036 IntrVec_t b_10 = LD(b, 10);
0037 c_1 = FMA(b_10, a_4, c_1);
0038 IntrVec_t a_10 = LD(a, 10);
0039 c_2 = FMA(b_10, a_10, c_2);
0040
// b elements 12..17 feed c_3, c_4 and c_5.
0041 IntrVec_t b_12 = LD(b, 12);
0042 IntrVec_t c_3 = MUL(b_12, a_0);
0043 IntrVec_t c_4 = MUL(b_12, a_6);
0044 IntrVec_t a_12 = LD(a, 12);
0045 IntrVec_t c_5 = MUL(b_12, a_12);
0046 ST(c, 1, c_1);
0047 ST(c, 2, c_2);
0048
0049 IntrVec_t b_13 = LD(b, 13);
0050 c_3 = FMA(b_13, a_1, c_3);
0051 c_4 = FMA(b_13, a_7, c_4);
0052 IntrVec_t a_13 = LD(a, 13);
0053 c_5 = FMA(b_13, a_13, c_5);
0054
// b element 14 is added to c_5 as-is; no a element is multiplied with it.
0055 IntrVec_t b_14 = LD(b, 14);
0056 c_5 = ADD(b_14, c_5);
0057
0058 IntrVec_t b_15 = LD(b, 15);
0059 c_3 = FMA(b_15, a_3, c_3);
0060 c_4 = FMA(b_15, a_9, c_4);
0061 IntrVec_t a_15 = LD(a, 15);
0062 c_5 = FMA(b_15, a_15, c_5);
0063
0064 IntrVec_t b_16 = LD(b, 16);
0065 c_3 = FMA(b_16, a_4, c_3);
0066 c_4 = FMA(b_16, a_10, c_4);
0067 IntrVec_t a_16 = LD(a, 16);
0068 c_5 = FMA(b_16, a_16, c_5);
0069
// a element 17 is the only a element paired with b elements 17, 23, 29, 35.
0070 IntrVec_t b_17 = LD(b, 17);
0071 IntrVec_t a_17 = LD(a, 17);
0072 c_5 = FMA(b_17, a_17, c_5);
0073
// b elements 18..23 feed c_6 .. c_9.
0074 IntrVec_t b_18 = LD(b, 18);
0075 IntrVec_t c_6 = MUL(b_18, a_0);
0076 ST(c, 3, c_3);
0077 ST(c, 4, c_4);
0078 ST(c, 5, c_5);
0079 IntrVec_t c_7 = MUL(b_18, a_6);
0080 IntrVec_t c_8 = MUL(b_18, a_12);
0081 IntrVec_t a_18 = LD(a, 18);
0082 IntrVec_t c_9 = MUL(b_18, a_18);
0083
0084 IntrVec_t b_19 = LD(b, 19);
0085 c_6 = FMA(b_19, a_1, c_6);
0086 c_7 = FMA(b_19, a_7, c_7);
0087 c_8 = FMA(b_19, a_13, c_8);
0088 IntrVec_t a_19 = LD(a, 19);
0089 c_9 = FMA(b_19, a_19, c_9);
0090
// b element 20 is a plain addend of c_8 (same role as b element 14 in c_5).
0091 IntrVec_t b_20 = LD(b, 20);
0092 c_8 = ADD(b_20, c_8);
0093
0094 IntrVec_t b_21 = LD(b, 21);
0095 c_6 = FMA(b_21, a_3, c_6);
0096 c_7 = FMA(b_21, a_9, c_7);
0097 c_8 = FMA(b_21, a_15, c_8);
0098 IntrVec_t a_21 = LD(a, 21);
0099 c_9 = FMA(b_21, a_21, c_9);
0100
0101 IntrVec_t b_22 = LD(b, 22);
0102 c_6 = FMA(b_22, a_4, c_6);
0103 c_7 = FMA(b_22, a_10, c_7);
0104 c_8 = FMA(b_22, a_16, c_8);
0105 IntrVec_t a_22 = LD(a, 22);
0106 c_9 = FMA(b_22, a_22, c_9);
0107
0108 IntrVec_t b_23 = LD(b, 23);
0109 c_8 = FMA(b_23, a_17, c_8);
0110
// b elements 24..29 feed c_10 .. c_14.
0111 IntrVec_t b_24 = LD(b, 24);
0112 IntrVec_t c_10 = MUL(b_24, a_0);
0113 ST(c, 6, c_6);
0114 ST(c, 7, c_7);
0115 ST(c, 8, c_8);
0116 ST(c, 9, c_9);
0117 IntrVec_t c_11 = MUL(b_24, a_6);
0118 IntrVec_t c_12 = MUL(b_24, a_12);
0119 IntrVec_t c_13 = MUL(b_24, a_18);
0120 IntrVec_t a_24 = LD(a, 24);
0121 IntrVec_t c_14 = MUL(b_24, a_24);
0122
0123 IntrVec_t b_25 = LD(b, 25);
0124 c_10 = FMA(b_25, a_1, c_10);
0125 c_11 = FMA(b_25, a_7, c_11);
0126 c_12 = FMA(b_25, a_13, c_12);
0127 c_13 = FMA(b_25, a_19, c_13);
0128 IntrVec_t a_25 = LD(a, 25);
0129 c_14 = FMA(b_25, a_25, c_14);
0130
// b element 26 is a plain addend of c_12.
0131 IntrVec_t b_26 = LD(b, 26);
0132 c_12 = ADD(b_26, c_12);
0133
0134 IntrVec_t b_27 = LD(b, 27);
0135 c_10 = FMA(b_27, a_3, c_10);
0136 c_11 = FMA(b_27, a_9, c_11);
0137 c_12 = FMA(b_27, a_15, c_12);
0138 c_13 = FMA(b_27, a_21, c_13);
0139 IntrVec_t a_27 = LD(a, 27);
0140 c_14 = FMA(b_27, a_27, c_14);
0141
0142 IntrVec_t b_28 = LD(b, 28);
0143 c_10 = FMA(b_28, a_4, c_10);
0144 c_11 = FMA(b_28, a_10, c_11);
0145 c_12 = FMA(b_28, a_16, c_12);
0146 c_13 = FMA(b_28, a_22, c_13);
0147 IntrVec_t a_28 = LD(a, 28);
0148 c_14 = FMA(b_28, a_28, c_14);
0149
0150 IntrVec_t b_29 = LD(b, 29);
0151 c_12 = FMA(b_29, a_17, c_12);
0152
// b elements 30..35 feed c_15 .. c_20. No further a elements are loaded from
// here on; only previously loaded ones are reused.
0153 IntrVec_t b_30 = LD(b, 30);
0154 IntrVec_t c_15 = MUL(b_30, a_0);
0155 ST(c, 10, c_10);
0156 ST(c, 11, c_11);
0157 ST(c, 12, c_12);
0158 ST(c, 13, c_13);
0159 ST(c, 14, c_14);
0160 IntrVec_t c_16 = MUL(b_30, a_6);
0161 IntrVec_t c_17 = MUL(b_30, a_12);
0162 IntrVec_t c_18 = MUL(b_30, a_18);
0163 IntrVec_t c_19 = MUL(b_30, a_24);
0164
0165 IntrVec_t b_31 = LD(b, 31);
0166 c_15 = FMA(b_31, a_1, c_15);
0167 c_16 = FMA(b_31, a_7, c_16);
0168 c_17 = FMA(b_31, a_13, c_17);
0169 c_18 = FMA(b_31, a_19, c_18);
0170 c_19 = FMA(b_31, a_25, c_19);
0171
// b element 32 is a plain addend of c_17.
0172 IntrVec_t b_32 = LD(b, 32);
0173 c_17 = ADD(b_32, c_17);
0174
0175 IntrVec_t b_33 = LD(b, 33);
0176 c_15 = FMA(b_33, a_3, c_15);
0177 c_16 = FMA(b_33, a_9, c_16);
0178 c_17 = FMA(b_33, a_15, c_17);
0179 c_18 = FMA(b_33, a_21, c_18);
0180 c_19 = FMA(b_33, a_27, c_19);
0181
0182 IntrVec_t b_34 = LD(b, 34);
0183 c_15 = FMA(b_34, a_4, c_15);
0184 c_16 = FMA(b_34, a_10, c_16);
0185 c_17 = FMA(b_34, a_16, c_17);
0186 c_18 = FMA(b_34, a_22, c_18);
0187 c_19 = FMA(b_34, a_28, c_19);
0188
// b element 35 contributes to c_17 via a_17 and is also written out unchanged
// as output 20.
0189 IntrVec_t b_35 = LD(b, 35);
0190 c_17 = FMA(b_35, a_17, c_17);
0191 IntrVec_t c_20 = b_35;
0192 ST(c, 15, c_15);
0193 ST(c, 16, c_16);
0194 ST(c, 17, c_17);
0195 ST(c, 18, c_18);
0196 ST(c, 19, c_19);
0197 ST(c, 20, c_20);
0198 }
0199
0200 #else
0201
// Scalar fallback used when MPLEX_INTRINSICS is not defined. For every n in
// [0, N) it writes the 21 outputs c[0..20]; element k of a, b and c for a
// given n lives at index k * N + n.
//
// Input usage, as read off the expressions below:
//  - b elements 2, 5, 8 and 11 are never read;
//  - b elements 14, 20, 26 and 32 are added without any a factor;
//  - b element 35 is multiplied by a element 17 in c[17] and copied unchanged
//    to c[20];
//  - of a, only elements 0,1,3,4, 6,7,9,10, 12,13,15,16,17, 18,19,21,22 and
//    24,25,27,28 are read.
//
// NOTE(review): this pattern is consistent with c(i,j) = sum_k b(i,k) * a(j,k)
// for row-major 6x6 a and b, with c stored as a packed lower triangle
// (i >= j) and several entries of a taken as constant 0 or 1 instead of being
// read -- an inference from the indices only; confirm against the caller.
//
// The terms of each sum are written in a fixed left-to-right order; keep it,
// since reordering floating-point additions can change rounding.
0202 #pragma omp simd
0203 for (int n = 0; n < N; ++n) {
0204 c[0 * N + n] = b[0 * N + n] * a[0 * N + n] + b[1 * N + n] * a[1 * N + n] + b[3 * N + n] * a[3 * N + n] +
0205 b[4 * N + n] * a[4 * N + n];
0206 c[1 * N + n] = b[6 * N + n] * a[0 * N + n] + b[7 * N + n] * a[1 * N + n] + b[9 * N + n] * a[3 * N + n] +
0207 b[10 * N + n] * a[4 * N + n];
0208 c[2 * N + n] = b[6 * N + n] * a[6 * N + n] + b[7 * N + n] * a[7 * N + n] + b[9 * N + n] * a[9 * N + n] +
0209 b[10 * N + n] * a[10 * N + n];
0210 c[3 * N + n] = b[12 * N + n] * a[0 * N + n] + b[13 * N + n] * a[1 * N + n] + b[15 * N + n] * a[3 * N + n] +
0211 b[16 * N + n] * a[4 * N + n];
0212 c[4 * N + n] = b[12 * N + n] * a[6 * N + n] + b[13 * N + n] * a[7 * N + n] + b[15 * N + n] * a[9 * N + n] +
0213 b[16 * N + n] * a[10 * N + n];
// c[5], c[8], c[12] and c[17] are the six-term outputs: each has one bare b
// addend (b 14 / 20 / 26 / 32) and one product with a element 17.
0214 c[5 * N + n] = b[12 * N + n] * a[12 * N + n] + b[13 * N + n] * a[13 * N + n] + b[14 * N + n] +
0215 b[15 * N + n] * a[15 * N + n] + b[16 * N + n] * a[16 * N + n] + b[17 * N + n] * a[17 * N + n];
0216 c[6 * N + n] = b[18 * N + n] * a[0 * N + n] + b[19 * N + n] * a[1 * N + n] + b[21 * N + n] * a[3 * N + n] +
0217 b[22 * N + n] * a[4 * N + n];
0218 c[7 * N + n] = b[18 * N + n] * a[6 * N + n] + b[19 * N + n] * a[7 * N + n] + b[21 * N + n] * a[9 * N + n] +
0219 b[22 * N + n] * a[10 * N + n];
0220 c[8 * N + n] = b[18 * N + n] * a[12 * N + n] + b[19 * N + n] * a[13 * N + n] + b[20 * N + n] +
0221 b[21 * N + n] * a[15 * N + n] + b[22 * N + n] * a[16 * N + n] + b[23 * N + n] * a[17 * N + n];
0222 c[9 * N + n] = b[18 * N + n] * a[18 * N + n] + b[19 * N + n] * a[19 * N + n] + b[21 * N + n] * a[21 * N + n] +
0223 b[22 * N + n] * a[22 * N + n];
0224 c[10 * N + n] = b[24 * N + n] * a[0 * N + n] + b[25 * N + n] * a[1 * N + n] + b[27 * N + n] * a[3 * N + n] +
0225 b[28 * N + n] * a[4 * N + n];
0226 c[11 * N + n] = b[24 * N + n] * a[6 * N + n] + b[25 * N + n] * a[7 * N + n] + b[27 * N + n] * a[9 * N + n] +
0227 b[28 * N + n] * a[10 * N + n];
0228 c[12 * N + n] = b[24 * N + n] * a[12 * N + n] + b[25 * N + n] * a[13 * N + n] + b[26 * N + n] +
0229 b[27 * N + n] * a[15 * N + n] + b[28 * N + n] * a[16 * N + n] + b[29 * N + n] * a[17 * N + n];
0230 c[13 * N + n] = b[24 * N + n] * a[18 * N + n] + b[25 * N + n] * a[19 * N + n] + b[27 * N + n] * a[21 * N + n] +
0231 b[28 * N + n] * a[22 * N + n];
0232 c[14 * N + n] = b[24 * N + n] * a[24 * N + n] + b[25 * N + n] * a[25 * N + n] + b[27 * N + n] * a[27 * N + n] +
0233 b[28 * N + n] * a[28 * N + n];
0234 c[15 * N + n] = b[30 * N + n] * a[0 * N + n] + b[31 * N + n] * a[1 * N + n] + b[33 * N + n] * a[3 * N + n] +
0235 b[34 * N + n] * a[4 * N + n];
0236 c[16 * N + n] = b[30 * N + n] * a[6 * N + n] + b[31 * N + n] * a[7 * N + n] + b[33 * N + n] * a[9 * N + n] +
0237 b[34 * N + n] * a[10 * N + n];
0238 c[17 * N + n] = b[30 * N + n] * a[12 * N + n] + b[31 * N + n] * a[13 * N + n] + b[32 * N + n] +
0239 b[33 * N + n] * a[15 * N + n] + b[34 * N + n] * a[16 * N + n] + b[35 * N + n] * a[17 * N + n];
0240 c[18 * N + n] = b[30 * N + n] * a[18 * N + n] + b[31 * N + n] * a[19 * N + n] + b[33 * N + n] * a[21 * N + n] +
0241 b[34 * N + n] * a[22 * N + n];
0242 c[19 * N + n] = b[30 * N + n] * a[24 * N + n] + b[31 * N + n] * a[25 * N + n] + b[33 * N + n] * a[27 * N + n] +
0243 b[34 * N + n] * a[28 * N + n];
// Last output is a straight copy of b element 35; no a element is involved.
0244 c[20 * N + n] = b[35 * N + n];
0245 }
0246 #endif