Warning, /RecoTracker/MkFitCore/src/MultHelixProp.ah is written in an unsupported language. File is not indexed.
0001 #ifdef MPLEX_INTRINSICS
0002
// Vectorized branch (compiled when MPLEX_INTRINSICS is defined).
// Each pass advances n by MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T) and produces
// the 36 outputs c_0..c_35 from selected elements of a and b, using the same
// index pattern as the scalar loop in the #else branch of this file.
//
// LD, ST, MUL, FMA and ADD are defined outside this fragment. Presumably
// LD(x, i) loads element-plane i of x at the current n, ST(x, i, v) writes it
// back, and FMA(p, q, r) is p * q + r -- TODO confirm against their definitions.
//
// NOTE(review): the indices are consistent with c = a * b where a is a 6x6
// row-major matrix (index 6*row + col), b is a symmetric 6x6 matrix in packed
// lower-triangular storage (21 elements) and c is a full 6x6 row-major matrix.
// Elements a[2], a[5], a[8], a[11], a[20], a[23], a[26], a[29] and a[30..34]
// are never read (treated as 0); a[14] and a[35] are never read either and act
// as an implicit 1. Confirm against the code generator before relying on this.
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) {
// Outputs c_0..c_5: start each accumulator with the a_0 product.
0004 IntrVec_t a_0 = LD(a, 0);
0005 IntrVec_t b_0 = LD(b, 0);
0006 IntrVec_t c_0 = MUL(a_0, b_0);
0007 IntrVec_t b_1 = LD(b, 1);
0008 IntrVec_t c_1 = MUL(a_0, b_1);
0009 IntrVec_t b_3 = LD(b, 3);
0010 IntrVec_t c_2 = MUL(a_0, b_3);
0011 IntrVec_t b_6 = LD(b, 6);
0012 IntrVec_t c_3 = MUL(a_0, b_6);
0013 IntrVec_t b_10 = LD(b, 10);
0014 IntrVec_t c_4 = MUL(a_0, b_10);
0015 IntrVec_t b_15 = LD(b, 15);
0016 IntrVec_t c_5 = MUL(a_0, b_15);
0017
// Accumulate the a_1 terms; each b vector is loaded once at first use and then
// reused by every later output group.
0018 IntrVec_t a_1 = LD(a, 1);
0019 c_0 = FMA(a_1, b_1, c_0);
0020 IntrVec_t b_2 = LD(b, 2);
0021 c_1 = FMA(a_1, b_2, c_1);
0022 IntrVec_t b_4 = LD(b, 4);
0023 c_2 = FMA(a_1, b_4, c_2);
0024 IntrVec_t b_7 = LD(b, 7);
0025 c_3 = FMA(a_1, b_7, c_3);
0026 IntrVec_t b_11 = LD(b, 11);
0027 c_4 = FMA(a_1, b_11, c_4);
0028 IntrVec_t b_16 = LD(b, 16);
0029 c_5 = FMA(a_1, b_16, c_5);
0030
// Accumulate the a_3 terms (a[2] is skipped).
0031 IntrVec_t a_3 = LD(a, 3);
0032 c_0 = FMA(a_3, b_6, c_0);
0033 c_1 = FMA(a_3, b_7, c_1);
0034 IntrVec_t b_8 = LD(b, 8);
0035 c_2 = FMA(a_3, b_8, c_2);
0036 IntrVec_t b_9 = LD(b, 9);
0037 c_3 = FMA(a_3, b_9, c_3);
0038 IntrVec_t b_13 = LD(b, 13);
0039 c_4 = FMA(a_3, b_13, c_4);
0040 IntrVec_t b_18 = LD(b, 18);
0041 c_5 = FMA(a_3, b_18, c_5);
0042
// Accumulate the a_4 terms; c_0..c_5 are complete after this group.
0043 IntrVec_t a_4 = LD(a, 4);
0044 c_0 = FMA(a_4, b_10, c_0);
0045 c_1 = FMA(a_4, b_11, c_1);
0046 IntrVec_t b_12 = LD(b, 12);
0047 c_2 = FMA(a_4, b_12, c_2);
0048 c_3 = FMA(a_4, b_13, c_3);
0049 IntrVec_t b_14 = LD(b, 14);
0050 c_4 = FMA(a_4, b_14, c_4);
0051 IntrVec_t b_19 = LD(b, 19);
0052 c_5 = FMA(a_4, b_19, c_5);
0053
// Outputs c_6..c_11: begin with the a_6 products; the stores of c_0..c_5 are
// emitted in between the first multiplies of this group.
0054 IntrVec_t a_6 = LD(a, 6);
0055 IntrVec_t c_6 = MUL(a_6, b_0);
0056 IntrVec_t c_7 = MUL(a_6, b_1);
0057 IntrVec_t c_8 = MUL(a_6, b_3);
0058 ST(c, 0, c_0);
0059 ST(c, 1, c_1);
0060 ST(c, 2, c_2);
0061 ST(c, 3, c_3);
0062 ST(c, 4, c_4);
0063 ST(c, 5, c_5);
0064 IntrVec_t c_9 = MUL(a_6, b_6);
0065 IntrVec_t c_10 = MUL(a_6, b_10);
0066 IntrVec_t c_11 = MUL(a_6, b_15);
0067
0068 IntrVec_t a_7 = LD(a, 7);
0069 c_6 = FMA(a_7, b_1, c_6);
0070 c_7 = FMA(a_7, b_2, c_7);
0071 c_8 = FMA(a_7, b_4, c_8);
0072 c_9 = FMA(a_7, b_7, c_9);
0073 c_10 = FMA(a_7, b_11, c_10);
0074 c_11 = FMA(a_7, b_16, c_11);
0075
0076 IntrVec_t a_9 = LD(a, 9);
0077 c_6 = FMA(a_9, b_6, c_6);
0078 c_7 = FMA(a_9, b_7, c_7);
0079 c_8 = FMA(a_9, b_8, c_8);
0080 c_9 = FMA(a_9, b_9, c_9);
0081 c_10 = FMA(a_9, b_13, c_10);
0082 c_11 = FMA(a_9, b_18, c_11);
0083
0084 IntrVec_t a_10 = LD(a, 10);
0085 c_6 = FMA(a_10, b_10, c_6);
0086 c_7 = FMA(a_10, b_11, c_7);
0087 c_8 = FMA(a_10, b_12, c_8);
0088 c_9 = FMA(a_10, b_13, c_9);
0089 c_10 = FMA(a_10, b_14, c_10);
0090 c_11 = FMA(a_10, b_19, c_11);
0091
// Outputs c_12..c_17: begin with the a_12 products; c_6..c_11 are stored here.
0092 IntrVec_t a_12 = LD(a, 12);
0093 IntrVec_t c_12 = MUL(a_12, b_0);
0094 IntrVec_t c_13 = MUL(a_12, b_1);
0095 IntrVec_t c_14 = MUL(a_12, b_3);
0096 ST(c, 6, c_6);
0097 ST(c, 7, c_7);
0098 ST(c, 8, c_8);
0099 ST(c, 9, c_9);
0100 ST(c, 10, c_10);
0101 ST(c, 11, c_11);
0102 IntrVec_t c_15 = MUL(a_12, b_6);
0103 IntrVec_t c_16 = MUL(a_12, b_10);
0104 IntrVec_t c_17 = MUL(a_12, b_15);
0105
0106 IntrVec_t a_13 = LD(a, 13);
0107 c_12 = FMA(a_13, b_1, c_12);
0108 c_13 = FMA(a_13, b_2, c_13);
0109 c_14 = FMA(a_13, b_4, c_14);
0110 c_15 = FMA(a_13, b_7, c_15);
0111 c_16 = FMA(a_13, b_11, c_16);
0112 c_17 = FMA(a_13, b_16, c_17);
0113
// These b terms are added without any a factor (plain ADD, no a[14] load);
// this is the only output group with such a term.
0114 c_12 = ADD(b_3, c_12);
0115 c_13 = ADD(b_4, c_13);
0116 IntrVec_t b_5 = LD(b, 5);
0117 c_14 = ADD(b_5, c_14);
0118 c_15 = ADD(b_8, c_15);
0119 c_16 = ADD(b_12, c_16);
0120 IntrVec_t b_17 = LD(b, 17);
0121 c_17 = ADD(b_17, c_17);
0122
0123 IntrVec_t a_15 = LD(a, 15);
0124 c_12 = FMA(a_15, b_6, c_12);
0125 c_13 = FMA(a_15, b_7, c_13);
0126 c_14 = FMA(a_15, b_8, c_14);
0127 c_15 = FMA(a_15, b_9, c_15);
0128 c_16 = FMA(a_15, b_13, c_16);
0129 c_17 = FMA(a_15, b_18, c_17);
0130
0131 IntrVec_t a_16 = LD(a, 16);
0132 c_12 = FMA(a_16, b_10, c_12);
0133 c_13 = FMA(a_16, b_11, c_13);
0134 c_14 = FMA(a_16, b_12, c_14);
0135 c_15 = FMA(a_16, b_13, c_15);
0136 c_16 = FMA(a_16, b_14, c_16);
0137 c_17 = FMA(a_16, b_19, c_17);
0138
// Final a_17 terms for c_12..c_17. c_17 needs b_20, which is loaded only after
// c_12..c_16 have been stored, so c_17 is finished and stored slightly later.
0139 IntrVec_t a_17 = LD(a, 17);
0140 c_12 = FMA(a_17, b_15, c_12);
0141 c_13 = FMA(a_17, b_16, c_13);
0142 c_14 = FMA(a_17, b_17, c_14);
0143 c_15 = FMA(a_17, b_18, c_15);
0144 c_16 = FMA(a_17, b_19, c_16);
0145 ST(c, 12, c_12);
0146 ST(c, 13, c_13);
0147 ST(c, 14, c_14);
0148 ST(c, 15, c_15);
0149 ST(c, 16, c_16);
0150 IntrVec_t b_20 = LD(b, 20);
0151 c_17 = FMA(a_17, b_20, c_17);
0152
// Outputs c_18..c_23: same four-term pattern using a_18, a_19, a_21, a_22.
0153 IntrVec_t a_18 = LD(a, 18);
0154 IntrVec_t c_18 = MUL(a_18, b_0);
0155 IntrVec_t c_19 = MUL(a_18, b_1);
0156 IntrVec_t c_20 = MUL(a_18, b_3);
0157 ST(c, 17, c_17);
0158 IntrVec_t c_21 = MUL(a_18, b_6);
0159 IntrVec_t c_22 = MUL(a_18, b_10);
0160 IntrVec_t c_23 = MUL(a_18, b_15);
0161
0162 IntrVec_t a_19 = LD(a, 19);
0163 c_18 = FMA(a_19, b_1, c_18);
0164 c_19 = FMA(a_19, b_2, c_19);
0165 c_20 = FMA(a_19, b_4, c_20);
0166 c_21 = FMA(a_19, b_7, c_21);
0167 c_22 = FMA(a_19, b_11, c_22);
0168 c_23 = FMA(a_19, b_16, c_23);
0169
0170 IntrVec_t a_21 = LD(a, 21);
0171 c_18 = FMA(a_21, b_6, c_18);
0172 c_19 = FMA(a_21, b_7, c_19);
0173 c_20 = FMA(a_21, b_8, c_20);
0174 c_21 = FMA(a_21, b_9, c_21);
0175 c_22 = FMA(a_21, b_13, c_22);
0176 c_23 = FMA(a_21, b_18, c_23);
0177
0178 IntrVec_t a_22 = LD(a, 22);
0179 c_18 = FMA(a_22, b_10, c_18);
0180 c_19 = FMA(a_22, b_11, c_19);
0181 c_20 = FMA(a_22, b_12, c_20);
0182 c_21 = FMA(a_22, b_13, c_21);
0183 c_22 = FMA(a_22, b_14, c_22);
0184 c_23 = FMA(a_22, b_19, c_23);
0185
// Outputs c_24..c_29: same four-term pattern using a_24, a_25, a_27, a_28;
// c_18..c_23 are stored here.
0186 IntrVec_t a_24 = LD(a, 24);
0187 IntrVec_t c_24 = MUL(a_24, b_0);
0188 IntrVec_t c_25 = MUL(a_24, b_1);
0189 IntrVec_t c_26 = MUL(a_24, b_3);
0190 ST(c, 18, c_18);
0191 ST(c, 19, c_19);
0192 ST(c, 20, c_20);
0193 ST(c, 21, c_21);
0194 ST(c, 22, c_22);
0195 ST(c, 23, c_23);
0196 IntrVec_t c_27 = MUL(a_24, b_6);
0197 IntrVec_t c_28 = MUL(a_24, b_10);
0198 IntrVec_t c_29 = MUL(a_24, b_15);
0199
0200 IntrVec_t a_25 = LD(a, 25);
0201 c_24 = FMA(a_25, b_1, c_24);
0202 c_25 = FMA(a_25, b_2, c_25);
0203 c_26 = FMA(a_25, b_4, c_26);
0204 c_27 = FMA(a_25, b_7, c_27);
0205 c_28 = FMA(a_25, b_11, c_28);
0206 c_29 = FMA(a_25, b_16, c_29);
0207
0208 IntrVec_t a_27 = LD(a, 27);
0209 c_24 = FMA(a_27, b_6, c_24);
0210 c_25 = FMA(a_27, b_7, c_25);
0211 c_26 = FMA(a_27, b_8, c_26);
0212 c_27 = FMA(a_27, b_9, c_27);
0213 c_28 = FMA(a_27, b_13, c_28);
0214 c_29 = FMA(a_27, b_18, c_29);
0215
0216 IntrVec_t a_28 = LD(a, 28);
0217 c_24 = FMA(a_28, b_10, c_24);
0218 c_25 = FMA(a_28, b_11, c_25);
0219 c_26 = FMA(a_28, b_12, c_26);
0220 c_27 = FMA(a_28, b_13, c_27);
0221 c_28 = FMA(a_28, b_14, c_28);
0222 c_29 = FMA(a_28, b_19, c_29);
0223
// Outputs c_30..c_35 are straight copies of b_15..b_20; no element of a is
// read for them.
0224 IntrVec_t c_30 = b_15;
0225 IntrVec_t c_31 = b_16;
0226 IntrVec_t c_32 = b_17;
0227 IntrVec_t c_33 = b_18;
0228 ST(c, 24, c_24);
0229 ST(c, 25, c_25);
0230 ST(c, 26, c_26);
0231 ST(c, 27, c_27);
0232 ST(c, 28, c_28);
0233 ST(c, 29, c_29);
0234 ST(c, 30, c_30);
0235 ST(c, 31, c_31);
0236 ST(c, 32, c_32);
0237 ST(c, 33, c_33);
0238 IntrVec_t c_34 = b_19;
0239 IntrVec_t c_35 = b_20;
0240 ST(c, 34, c_34);
0241 ST(c, 35, c_35);
0242 }
0243
0244 #else
0245
// Fallback branch (MPLEX_INTRINSICS not defined): plain indexed arithmetic,
// with the loop over n annotated for OpenMP SIMD vectorization.
// Element k of each operand for item n lives at index k * N + n, so all three
// arrays are accessed with a stride of N between consecutive elements.
//
// NOTE(review): the indices are consistent with c = a * b where a is a 6x6
// row-major matrix (index 6*row + col), b is a symmetric 6x6 matrix in packed
// lower-triangular storage (21 elements) and c is a full 6x6 row-major matrix.
// Unread elements of a are treated as 0, except a[14] and a[35] which act as
// an implicit 1. Confirm against the code generator before relying on this.
0246 #pragma omp simd
0247 for (int n = 0; n < N; ++n) {
// c[0..5]: four-term sums using a[0], a[1], a[3], a[4].
0248 c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[1 * N + n] + a[3 * N + n] * b[6 * N + n] +
0249 a[4 * N + n] * b[10 * N + n];
0250 c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[2 * N + n] + a[3 * N + n] * b[7 * N + n] +
0251 a[4 * N + n] * b[11 * N + n];
0252 c[2 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[4 * N + n] + a[3 * N + n] * b[8 * N + n] +
0253 a[4 * N + n] * b[12 * N + n];
0254 c[3 * N + n] = a[0 * N + n] * b[6 * N + n] + a[1 * N + n] * b[7 * N + n] + a[3 * N + n] * b[9 * N + n] +
0255 a[4 * N + n] * b[13 * N + n];
0256 c[4 * N + n] = a[0 * N + n] * b[10 * N + n] + a[1 * N + n] * b[11 * N + n] + a[3 * N + n] * b[13 * N + n] +
0257 a[4 * N + n] * b[14 * N + n];
0258 c[5 * N + n] = a[0 * N + n] * b[15 * N + n] + a[1 * N + n] * b[16 * N + n] + a[3 * N + n] * b[18 * N + n] +
0259 a[4 * N + n] * b[19 * N + n];
// c[6..11]: four-term sums using a[6], a[7], a[9], a[10].
0260 c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[1 * N + n] + a[9 * N + n] * b[6 * N + n] +
0261 a[10 * N + n] * b[10 * N + n];
0262 c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[2 * N + n] + a[9 * N + n] * b[7 * N + n] +
0263 a[10 * N + n] * b[11 * N + n];
0264 c[8 * N + n] = a[6 * N + n] * b[3 * N + n] + a[7 * N + n] * b[4 * N + n] + a[9 * N + n] * b[8 * N + n] +
0265 a[10 * N + n] * b[12 * N + n];
0266 c[9 * N + n] = a[6 * N + n] * b[6 * N + n] + a[7 * N + n] * b[7 * N + n] + a[9 * N + n] * b[9 * N + n] +
0267 a[10 * N + n] * b[13 * N + n];
0268 c[10 * N + n] = a[6 * N + n] * b[10 * N + n] + a[7 * N + n] * b[11 * N + n] + a[9 * N + n] * b[13 * N + n] +
0269 a[10 * N + n] * b[14 * N + n];
0270 c[11 * N + n] = a[6 * N + n] * b[15 * N + n] + a[7 * N + n] * b[16 * N + n] + a[9 * N + n] * b[18 * N + n] +
0271 a[10 * N + n] * b[19 * N + n];
// c[12..17]: five products using a[12], a[13], a[15], a[16], a[17] plus one
// bare b term (third operand of each sum) that carries no a factor.
0272 c[12 * N + n] = a[12 * N + n] * b[0 * N + n] + a[13 * N + n] * b[1 * N + n] + b[3 * N + n] +
0273 a[15 * N + n] * b[6 * N + n] + a[16 * N + n] * b[10 * N + n] + a[17 * N + n] * b[15 * N + n];
0274 c[13 * N + n] = a[12 * N + n] * b[1 * N + n] + a[13 * N + n] * b[2 * N + n] + b[4 * N + n] +
0275 a[15 * N + n] * b[7 * N + n] + a[16 * N + n] * b[11 * N + n] + a[17 * N + n] * b[16 * N + n];
0276 c[14 * N + n] = a[12 * N + n] * b[3 * N + n] + a[13 * N + n] * b[4 * N + n] + b[5 * N + n] +
0277 a[15 * N + n] * b[8 * N + n] + a[16 * N + n] * b[12 * N + n] + a[17 * N + n] * b[17 * N + n];
0278 c[15 * N + n] = a[12 * N + n] * b[6 * N + n] + a[13 * N + n] * b[7 * N + n] + b[8 * N + n] +
0279 a[15 * N + n] * b[9 * N + n] + a[16 * N + n] * b[13 * N + n] + a[17 * N + n] * b[18 * N + n];
0280 c[16 * N + n] = a[12 * N + n] * b[10 * N + n] + a[13 * N + n] * b[11 * N + n] + b[12 * N + n] +
0281 a[15 * N + n] * b[13 * N + n] + a[16 * N + n] * b[14 * N + n] + a[17 * N + n] * b[19 * N + n];
0282 c[17 * N + n] = a[12 * N + n] * b[15 * N + n] + a[13 * N + n] * b[16 * N + n] + b[17 * N + n] +
0283 a[15 * N + n] * b[18 * N + n] + a[16 * N + n] * b[19 * N + n] + a[17 * N + n] * b[20 * N + n];
// c[18..23]: four-term sums using a[18], a[19], a[21], a[22].
0284 c[18 * N + n] = a[18 * N + n] * b[0 * N + n] + a[19 * N + n] * b[1 * N + n] + a[21 * N + n] * b[6 * N + n] +
0285 a[22 * N + n] * b[10 * N + n];
0286 c[19 * N + n] = a[18 * N + n] * b[1 * N + n] + a[19 * N + n] * b[2 * N + n] + a[21 * N + n] * b[7 * N + n] +
0287 a[22 * N + n] * b[11 * N + n];
0288 c[20 * N + n] = a[18 * N + n] * b[3 * N + n] + a[19 * N + n] * b[4 * N + n] + a[21 * N + n] * b[8 * N + n] +
0289 a[22 * N + n] * b[12 * N + n];
0290 c[21 * N + n] = a[18 * N + n] * b[6 * N + n] + a[19 * N + n] * b[7 * N + n] + a[21 * N + n] * b[9 * N + n] +
0291 a[22 * N + n] * b[13 * N + n];
0292 c[22 * N + n] = a[18 * N + n] * b[10 * N + n] + a[19 * N + n] * b[11 * N + n] + a[21 * N + n] * b[13 * N + n] +
0293 a[22 * N + n] * b[14 * N + n];
0294 c[23 * N + n] = a[18 * N + n] * b[15 * N + n] + a[19 * N + n] * b[16 * N + n] + a[21 * N + n] * b[18 * N + n] +
0295 a[22 * N + n] * b[19 * N + n];
// c[24..29]: four-term sums using a[24], a[25], a[27], a[28].
0296 c[24 * N + n] = a[24 * N + n] * b[0 * N + n] + a[25 * N + n] * b[1 * N + n] + a[27 * N + n] * b[6 * N + n] +
0297 a[28 * N + n] * b[10 * N + n];
0298 c[25 * N + n] = a[24 * N + n] * b[1 * N + n] + a[25 * N + n] * b[2 * N + n] + a[27 * N + n] * b[7 * N + n] +
0299 a[28 * N + n] * b[11 * N + n];
0300 c[26 * N + n] = a[24 * N + n] * b[3 * N + n] + a[25 * N + n] * b[4 * N + n] + a[27 * N + n] * b[8 * N + n] +
0301 a[28 * N + n] * b[12 * N + n];
0302 c[27 * N + n] = a[24 * N + n] * b[6 * N + n] + a[25 * N + n] * b[7 * N + n] + a[27 * N + n] * b[9 * N + n] +
0303 a[28 * N + n] * b[13 * N + n];
0304 c[28 * N + n] = a[24 * N + n] * b[10 * N + n] + a[25 * N + n] * b[11 * N + n] + a[27 * N + n] * b[13 * N + n] +
0305 a[28 * N + n] * b[14 * N + n];
0306 c[29 * N + n] = a[24 * N + n] * b[15 * N + n] + a[25 * N + n] * b[16 * N + n] + a[27 * N + n] * b[18 * N + n] +
0307 a[28 * N + n] * b[19 * N + n];
// c[30..35]: direct copies of b[15..20]; a is not read.
0308 c[30 * N + n] = b[15 * N + n];
0309 c[31 * N + n] = b[16 * N + n];
0310 c[32 * N + n] = b[17 * N + n];
0311 c[33 * N + n] = b[18 * N + n];
0312 c[34 * N + n] = b[19 * N + n];
0313 c[35 * N + n] = b[20 * N + n];
0314 }
0315 #endif