Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoTracker/MkFitCore/src/MultHelixPlanePropTransp.ah is written in an unsupported language. File is not indexed.

0001 #ifdef MPLEX_INTRINSICS
0002 
// Intrinsics path of the fragment (selected when MPLEX_INTRINSICS is defined).
// Each iteration advances n by MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T) and
// produces 21 vector results c_0..c_20, written with ST(c, 0..20, ...), from
// entries of a and b fetched with LD.
//
// LD, MUL, FMA, ST and IntrVec_t are defined outside this fragment.
// NOTE(review): presumably LD(x, k) reads entry k of x at the current n,
// FMA(x, y, z) yields x * y + z and ST(c, k, v) writes entry k of c -- this
// matches the scalar #else branch term for term, but confirm against the
// macro definitions.
//
// Entries a_18..a_21 and a_30..a_33 are never loaded; c_9, c_13 and c_18 are
// instead seeded directly from b_21, b_27 and b_33.
// NOTE(review): the index pattern is consistent with C = B * A^T on 6x6
// matrices, C stored as a packed lower triangle and rows 3 and 5 of A having
// fixed 0/1 entries -- confirm against the generator.
//
// The statement order below (loads, arithmetic and stores interleaved) is kept
// exactly as generated.
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) {
  // c_0: pairs b_0..b_5 with a_0..a_5.
0004   IntrVec_t b_0 = LD(b, 0);
0005   IntrVec_t a_0 = LD(a, 0);
0006   IntrVec_t c_0 = MUL(b_0, a_0);
0007 
0008   IntrVec_t b_1 = LD(b, 1);
0009   IntrVec_t a_1 = LD(a, 1);
0010   c_0 = FMA(b_1, a_1, c_0);
0011 
0012   IntrVec_t b_2 = LD(b, 2);
0013   IntrVec_t a_2 = LD(a, 2);
0014   c_0 = FMA(b_2, a_2, c_0);
0015 
0016   IntrVec_t b_3 = LD(b, 3);
0017   IntrVec_t a_3 = LD(a, 3);
0018   c_0 = FMA(b_3, a_3, c_0);
0019 
0020   IntrVec_t b_4 = LD(b, 4);
0021   IntrVec_t a_4 = LD(a, 4);
0022   c_0 = FMA(b_4, a_4, c_0);
0023 
0024   IntrVec_t b_5 = LD(b, 5);
0025   IntrVec_t a_5 = LD(a, 5);
0026   c_0 = FMA(b_5, a_5, c_0);
0027 
  // c_1 pairs b_6..b_11 with the already-loaded a_0..a_5; c_2 pairs the same
  // b entries with a_6..a_11. The finished c_0 is stored along the way.
0028   IntrVec_t b_6 = LD(b, 6);
0029   IntrVec_t c_1 = MUL(b_6, a_0);
0030   IntrVec_t a_6 = LD(a, 6);
0031   IntrVec_t c_2 = MUL(b_6, a_6);
0032   ST(c, 0, c_0);
0033 
0034   IntrVec_t b_7 = LD(b, 7);
0035   c_1 = FMA(b_7, a_1, c_1);
0036   IntrVec_t a_7 = LD(a, 7);
0037   c_2 = FMA(b_7, a_7, c_2);
0038 
0039   IntrVec_t b_8 = LD(b, 8);
0040   c_1 = FMA(b_8, a_2, c_1);
0041   IntrVec_t a_8 = LD(a, 8);
0042   c_2 = FMA(b_8, a_8, c_2);
0043 
0044   IntrVec_t b_9 = LD(b, 9);
0045   c_1 = FMA(b_9, a_3, c_1);
0046   IntrVec_t a_9 = LD(a, 9);
0047   c_2 = FMA(b_9, a_9, c_2);
0048 
0049   IntrVec_t b_10 = LD(b, 10);
0050   c_1 = FMA(b_10, a_4, c_1);
0051   IntrVec_t a_10 = LD(a, 10);
0052   c_2 = FMA(b_10, a_10, c_2);
0053 
0054   IntrVec_t b_11 = LD(b, 11);
0055   c_1 = FMA(b_11, a_5, c_1);
0056   IntrVec_t a_11 = LD(a, 11);
0057   c_2 = FMA(b_11, a_11, c_2);
0058 
  // c_3, c_4, c_5 pair b_12..b_17 with a_0..a_5, a_6..a_11 and a_12..a_17
  // respectively; c_1 and c_2 are stored here.
0059   IntrVec_t b_12 = LD(b, 12);
0060   IntrVec_t c_3 = MUL(b_12, a_0);
0061   ST(c, 1, c_1);
0062   IntrVec_t c_4 = MUL(b_12, a_6);
0063   ST(c, 2, c_2);
0064   IntrVec_t a_12 = LD(a, 12);
0065   IntrVec_t c_5 = MUL(b_12, a_12);
0066 
0067   IntrVec_t b_13 = LD(b, 13);
0068   c_3 = FMA(b_13, a_1, c_3);
0069   c_4 = FMA(b_13, a_7, c_4);
0070   IntrVec_t a_13 = LD(a, 13);
0071   c_5 = FMA(b_13, a_13, c_5);
0072 
0073   IntrVec_t b_14 = LD(b, 14);
0074   c_3 = FMA(b_14, a_2, c_3);
0075   c_4 = FMA(b_14, a_8, c_4);
0076   IntrVec_t a_14 = LD(a, 14);
0077   c_5 = FMA(b_14, a_14, c_5);
0078 
0079   IntrVec_t b_15 = LD(b, 15);
0080   c_3 = FMA(b_15, a_3, c_3);
0081   c_4 = FMA(b_15, a_9, c_4);
0082   IntrVec_t a_15 = LD(a, 15);
0083   c_5 = FMA(b_15, a_15, c_5);
0084 
0085   IntrVec_t b_16 = LD(b, 16);
0086   c_3 = FMA(b_16, a_4, c_3);
0087   c_4 = FMA(b_16, a_10, c_4);
0088   IntrVec_t a_16 = LD(a, 16);
0089   c_5 = FMA(b_16, a_16, c_5);
0090 
0091   IntrVec_t b_17 = LD(b, 17);
0092   c_3 = FMA(b_17, a_5, c_3);
0093   c_4 = FMA(b_17, a_11, c_4);
0094   IntrVec_t a_17 = LD(a, 17);
0095   c_5 = FMA(b_17, a_17, c_5);
0096 
  // c_6, c_7, c_8 pair b_18..b_23 with a_0..a_5, a_6..a_11 and a_12..a_17;
  // c_3..c_5 are stored here. c_9 is seeded with b_21 itself (no multiply) and
  // then only picks up the b_22/a_22 and b_23/a_23 pairs.
0097   IntrVec_t b_18 = LD(b, 18);
0098   IntrVec_t c_6 = MUL(b_18, a_0);
0099   ST(c, 3, c_3);
0100   ST(c, 4, c_4);
0101   ST(c, 5, c_5);
0102   IntrVec_t c_7 = MUL(b_18, a_6);
0103   IntrVec_t c_8 = MUL(b_18, a_12);
0104 
0105   IntrVec_t b_19 = LD(b, 19);
0106   c_6 = FMA(b_19, a_1, c_6);
0107   c_7 = FMA(b_19, a_7, c_7);
0108   c_8 = FMA(b_19, a_13, c_8);
0109 
0110   IntrVec_t b_20 = LD(b, 20);
0111   c_6 = FMA(b_20, a_2, c_6);
0112   c_7 = FMA(b_20, a_8, c_7);
0113   c_8 = FMA(b_20, a_14, c_8);
0114 
0115   IntrVec_t b_21 = LD(b, 21);
0116   c_6 = FMA(b_21, a_3, c_6);
0117   c_7 = FMA(b_21, a_9, c_7);
0118   c_8 = FMA(b_21, a_15, c_8);
0119   IntrVec_t c_9 = b_21;
0120 
0121   IntrVec_t b_22 = LD(b, 22);
0122   c_6 = FMA(b_22, a_4, c_6);
0123   c_7 = FMA(b_22, a_10, c_7);
0124   c_8 = FMA(b_22, a_16, c_8);
0125   IntrVec_t a_22 = LD(a, 22);
0126   c_9 = FMA(b_22, a_22, c_9);
0127 
0128   IntrVec_t b_23 = LD(b, 23);
0129   c_6 = FMA(b_23, a_5, c_6);
0130   c_7 = FMA(b_23, a_11, c_7);
0131   c_8 = FMA(b_23, a_17, c_8);
0132   IntrVec_t a_23 = LD(a, 23);
0133   c_9 = FMA(b_23, a_23, c_9);
0134   ST(c, 6, c_6);
0135   ST(c, 7, c_7);
0136   ST(c, 8, c_8);
0137 
  // c_10, c_11, c_12, c_14 pair b_24..b_29 with a_0..a_5, a_6..a_11,
  // a_12..a_17 and a_24..a_29; c_9 is stored here. c_13 is seeded with b_27
  // and then picks up the b_28/a_22 and b_29/a_23 pairs (a_22, a_23 reused).
0138   IntrVec_t b_24 = LD(b, 24);
0139   IntrVec_t c_10 = MUL(b_24, a_0);
0140   ST(c, 9, c_9);
0141   IntrVec_t c_11 = MUL(b_24, a_6);
0142   IntrVec_t c_12 = MUL(b_24, a_12);
0143   IntrVec_t a_24 = LD(a, 24);
0144   IntrVec_t c_14 = MUL(b_24, a_24);
0145 
0146   IntrVec_t b_25 = LD(b, 25);
0147   c_10 = FMA(b_25, a_1, c_10);
0148   c_11 = FMA(b_25, a_7, c_11);
0149   c_12 = FMA(b_25, a_13, c_12);
0150   IntrVec_t a_25 = LD(a, 25);
0151   c_14 = FMA(b_25, a_25, c_14);
0152 
0153   IntrVec_t b_26 = LD(b, 26);
0154   c_10 = FMA(b_26, a_2, c_10);
0155   c_11 = FMA(b_26, a_8, c_11);
0156   c_12 = FMA(b_26, a_14, c_12);
0157   IntrVec_t a_26 = LD(a, 26);
0158   c_14 = FMA(b_26, a_26, c_14);
0159 
0160   IntrVec_t b_27 = LD(b, 27);
0161   c_10 = FMA(b_27, a_3, c_10);
0162   c_11 = FMA(b_27, a_9, c_11);
0163   c_12 = FMA(b_27, a_15, c_12);
0164   IntrVec_t c_13 = b_27;
0165   IntrVec_t a_27 = LD(a, 27);
0166   c_14 = FMA(b_27, a_27, c_14);
0167 
0168   IntrVec_t b_28 = LD(b, 28);
0169   c_10 = FMA(b_28, a_4, c_10);
0170   c_11 = FMA(b_28, a_10, c_11);
0171   c_12 = FMA(b_28, a_16, c_12);
0172   c_13 = FMA(b_28, a_22, c_13);
0173   IntrVec_t a_28 = LD(a, 28);
0174   c_14 = FMA(b_28, a_28, c_14);
0175 
0176   IntrVec_t b_29 = LD(b, 29);
0177   c_10 = FMA(b_29, a_5, c_10);
0178   c_11 = FMA(b_29, a_11, c_11);
0179   c_12 = FMA(b_29, a_17, c_12);
0180   c_13 = FMA(b_29, a_23, c_13);
0181   IntrVec_t a_29 = LD(a, 29);
0182   c_14 = FMA(b_29, a_29, c_14);
0183   ST(c, 10, c_10);
0184   ST(c, 11, c_11);
0185   ST(c, 12, c_12);
0186   ST(c, 13, c_13);
0187   ST(c, 14, c_14);
0188 
  // c_15, c_16, c_17, c_19 pair b_30..b_35 with a_0..a_5, a_6..a_11,
  // a_12..a_17 and a_24..a_29 (all reused from earlier loads). c_18 is seeded
  // with b_33 and picks up the b_34/a_22 and b_35/a_23 pairs. c_20 uses only
  // the b_34/a_34 and b_35/a_35 pairs.
0189   IntrVec_t b_30 = LD(b, 30);
0190   IntrVec_t c_15 = MUL(b_30, a_0);
0191   IntrVec_t c_16 = MUL(b_30, a_6);
0192   IntrVec_t c_17 = MUL(b_30, a_12);
0193   IntrVec_t c_19 = MUL(b_30, a_24);
0194 
0195   IntrVec_t b_31 = LD(b, 31);
0196   c_15 = FMA(b_31, a_1, c_15);
0197   c_16 = FMA(b_31, a_7, c_16);
0198   c_17 = FMA(b_31, a_13, c_17);
0199   c_19 = FMA(b_31, a_25, c_19);
0200 
0201   IntrVec_t b_32 = LD(b, 32);
0202   c_15 = FMA(b_32, a_2, c_15);
0203   c_16 = FMA(b_32, a_8, c_16);
0204   c_17 = FMA(b_32, a_14, c_17);
0205   c_19 = FMA(b_32, a_26, c_19);
0206 
0207   IntrVec_t b_33 = LD(b, 33);
0208   c_15 = FMA(b_33, a_3, c_15);
0209   c_16 = FMA(b_33, a_9, c_16);
0210   c_17 = FMA(b_33, a_15, c_17);
0211   IntrVec_t c_18 = b_33;
0212   c_19 = FMA(b_33, a_27, c_19);
0213 
0214   IntrVec_t b_34 = LD(b, 34);
0215   c_15 = FMA(b_34, a_4, c_15);
0216   c_16 = FMA(b_34, a_10, c_16);
0217   c_17 = FMA(b_34, a_16, c_17);
0218   c_18 = FMA(b_34, a_22, c_18);
0219   c_19 = FMA(b_34, a_28, c_19);
0220   IntrVec_t a_34 = LD(a, 34);
0221   IntrVec_t c_20 = MUL(b_34, a_34);
0222 
0223   IntrVec_t b_35 = LD(b, 35);
0224   c_15 = FMA(b_35, a_5, c_15);
0225   c_16 = FMA(b_35, a_11, c_16);
0226   c_17 = FMA(b_35, a_17, c_17);
0227   c_18 = FMA(b_35, a_23, c_18);
0228   c_19 = FMA(b_35, a_29, c_19);
0229   ST(c, 15, c_15);
0230   ST(c, 16, c_16);
0231   ST(c, 17, c_17);
0232   ST(c, 18, c_18);
0233   ST(c, 19, c_19);
0234   IntrVec_t a_35 = LD(a, 35);
0235   c_20 = FMA(b_35, a_35, c_20);
0236   ST(c, 20, c_20);
0237 }
0238 
0239 #else
0240 
0241 #pragma omp simd
0242 for (int n = 0; n < N; ++n) {
0243   c[0 * N + n] = b[0 * N + n] * a[0 * N + n] + b[1 * N + n] * a[1 * N + n] + b[2 * N + n] * a[2 * N + n] +
0244                  b[3 * N + n] * a[3 * N + n] + b[4 * N + n] * a[4 * N + n] + b[5 * N + n] * a[5 * N + n];
0245   c[1 * N + n] = b[6 * N + n] * a[0 * N + n] + b[7 * N + n] * a[1 * N + n] + b[8 * N + n] * a[2 * N + n] +
0246                  b[9 * N + n] * a[3 * N + n] + b[10 * N + n] * a[4 * N + n] + b[11 * N + n] * a[5 * N + n];
0247   c[2 * N + n] = b[6 * N + n] * a[6 * N + n] + b[7 * N + n] * a[7 * N + n] + b[8 * N + n] * a[8 * N + n] +
0248                  b[9 * N + n] * a[9 * N + n] + b[10 * N + n] * a[10 * N + n] + b[11 * N + n] * a[11 * N + n];
0249   c[3 * N + n] = b[12 * N + n] * a[0 * N + n] + b[13 * N + n] * a[1 * N + n] + b[14 * N + n] * a[2 * N + n] +
0250                  b[15 * N + n] * a[3 * N + n] + b[16 * N + n] * a[4 * N + n] + b[17 * N + n] * a[5 * N + n];
0251   c[4 * N + n] = b[12 * N + n] * a[6 * N + n] + b[13 * N + n] * a[7 * N + n] + b[14 * N + n] * a[8 * N + n] +
0252                  b[15 * N + n] * a[9 * N + n] + b[16 * N + n] * a[10 * N + n] + b[17 * N + n] * a[11 * N + n];
0253   c[5 * N + n] = b[12 * N + n] * a[12 * N + n] + b[13 * N + n] * a[13 * N + n] + b[14 * N + n] * a[14 * N + n] +
0254                  b[15 * N + n] * a[15 * N + n] + b[16 * N + n] * a[16 * N + n] + b[17 * N + n] * a[17 * N + n];
0255   c[6 * N + n] = b[18 * N + n] * a[0 * N + n] + b[19 * N + n] * a[1 * N + n] + b[20 * N + n] * a[2 * N + n] +
0256                  b[21 * N + n] * a[3 * N + n] + b[22 * N + n] * a[4 * N + n] + b[23 * N + n] * a[5 * N + n];
0257   c[7 * N + n] = b[18 * N + n] * a[6 * N + n] + b[19 * N + n] * a[7 * N + n] + b[20 * N + n] * a[8 * N + n] +
0258                  b[21 * N + n] * a[9 * N + n] + b[22 * N + n] * a[10 * N + n] + b[23 * N + n] * a[11 * N + n];
0259   c[8 * N + n] = b[18 * N + n] * a[12 * N + n] + b[19 * N + n] * a[13 * N + n] + b[20 * N + n] * a[14 * N + n] +
0260                  b[21 * N + n] * a[15 * N + n] + b[22 * N + n] * a[16 * N + n] + b[23 * N + n] * a[17 * N + n];
0261   c[9 * N + n] = b[21 * N + n] + b[22 * N + n] * a[22 * N + n] + b[23 * N + n] * a[23 * N + n];
0262   c[10 * N + n] = b[24 * N + n] * a[0 * N + n] + b[25 * N + n] * a[1 * N + n] + b[26 * N + n] * a[2 * N + n] +
0263                   b[27 * N + n] * a[3 * N + n] + b[28 * N + n] * a[4 * N + n] + b[29 * N + n] * a[5 * N + n];
0264   c[11 * N + n] = b[24 * N + n] * a[6 * N + n] + b[25 * N + n] * a[7 * N + n] + b[26 * N + n] * a[8 * N + n] +
0265                   b[27 * N + n] * a[9 * N + n] + b[28 * N + n] * a[10 * N + n] + b[29 * N + n] * a[11 * N + n];
0266   c[12 * N + n] = b[24 * N + n] * a[12 * N + n] + b[25 * N + n] * a[13 * N + n] + b[26 * N + n] * a[14 * N + n] +
0267                   b[27 * N + n] * a[15 * N + n] + b[28 * N + n] * a[16 * N + n] + b[29 * N + n] * a[17 * N + n];
0268   c[13 * N + n] = b[27 * N + n] + b[28 * N + n] * a[22 * N + n] + b[29 * N + n] * a[23 * N + n];
0269   c[14 * N + n] = b[24 * N + n] * a[24 * N + n] + b[25 * N + n] * a[25 * N + n] + b[26 * N + n] * a[26 * N + n] +
0270                   b[27 * N + n] * a[27 * N + n] + b[28 * N + n] * a[28 * N + n] + b[29 * N + n] * a[29 * N + n];
0271   c[15 * N + n] = b[30 * N + n] * a[0 * N + n] + b[31 * N + n] * a[1 * N + n] + b[32 * N + n] * a[2 * N + n] +
0272                   b[33 * N + n] * a[3 * N + n] + b[34 * N + n] * a[4 * N + n] + b[35 * N + n] * a[5 * N + n];
0273   c[16 * N + n] = b[30 * N + n] * a[6 * N + n] + b[31 * N + n] * a[7 * N + n] + b[32 * N + n] * a[8 * N + n] +
0274                   b[33 * N + n] * a[9 * N + n] + b[34 * N + n] * a[10 * N + n] + b[35 * N + n] * a[11 * N + n];
0275   c[17 * N + n] = b[30 * N + n] * a[12 * N + n] + b[31 * N + n] * a[13 * N + n] + b[32 * N + n] * a[14 * N + n] +
0276                   b[33 * N + n] * a[15 * N + n] + b[34 * N + n] * a[16 * N + n] + b[35 * N + n] * a[17 * N + n];
0277   c[18 * N + n] = b[33 * N + n] + b[34 * N + n] * a[22 * N + n] + b[35 * N + n] * a[23 * N + n];
0278   c[19 * N + n] = b[30 * N + n] * a[24 * N + n] + b[31 * N + n] * a[25 * N + n] + b[32 * N + n] * a[26 * N + n] +
0279                   b[33 * N + n] * a[27 * N + n] + b[34 * N + n] * a[28 * N + n] + b[35 * N + n] * a[29 * N + n];
0280   c[20 * N + n] = b[34 * N + n] * a[34 * N + n] + b[35 * N + n] * a[35 * N + n];
0281 }
0282 #endif