Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoTracker/MkFitCore/src/MultHelixPropTranspEndcap.ah is written in an unsupported language. File is not indexed.

0001 #ifdef MPLEX_INTRINSICS
0002 
// Intrinsics variant of the kernel: fills output slots 0..20 of c from slots
// of a and b. n advances by MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T) per pass.
// LD, ST, FMA, MUL, ADD and IntrVec_t are defined outside this fragment. The
// notes below assume LD(x, k) / ST(x, k, v) load / store slot k of x for the
// current n, FMA(p, q, r) = p * q + r, MUL(p, q) = p * q, ADD(p, q) = p + q
// -- TODO confirm against the macro definitions.
// Naming: a_k / b_k hold slot k loaded from a / b; c_k is the value stored to
// slot k of c.
// NOTE(review): a_* values are loaded once and reused after earlier
// ST(c, ...) calls, whereas the scalar branch re-reads a[] each time; the two
// presumably agree only if c does not overlap a or b -- confirm with the
// including function.
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) {
0004 #ifdef AVX512_INTRINSICS
       // Zero vector used for the stores to slots 5, 8, 12 and 17:
       // 16 initialisers when AVX512_INTRINSICS is defined, 8 otherwise.
0005   IntrVec_t all_zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
0006 #else
0007   IntrVec_t all_zeros = {0, 0, 0, 0, 0, 0, 0, 0};
0008 #endif
0009 
       // c_0 = b_0 + b_2*a_2 + b_3*a_3 + b_4*a_4 + b_5*a_5
0010   IntrVec_t b_0 = LD(b, 0);
0011   IntrVec_t c_0 = b_0;
0012 
0013   IntrVec_t b_2 = LD(b, 2);
0014   IntrVec_t a_2 = LD(a, 2);
0015   c_0 = FMA(b_2, a_2, c_0);
0016 
0017   IntrVec_t b_3 = LD(b, 3);
0018   IntrVec_t a_3 = LD(a, 3);
0019   c_0 = FMA(b_3, a_3, c_0);
0020 
0021   IntrVec_t b_4 = LD(b, 4);
0022   IntrVec_t a_4 = LD(a, 4);
0023   c_0 = FMA(b_4, a_4, c_0);
0024 
0025   IntrVec_t b_5 = LD(b, 5);
0026   IntrVec_t a_5 = LD(a, 5);
0027   c_0 = FMA(b_5, a_5, c_0);
0028 
       // c_1 and c_2 start from b_6 and b_7; both accumulate b_8..b_11,
       // c_1 against the already loaded a_2..a_5, c_2 against a_8..a_11.
0029   IntrVec_t b_6 = LD(b, 6);
0030   IntrVec_t c_1 = b_6;
0031 
0032   IntrVec_t b_7 = LD(b, 7);
0033   IntrVec_t c_2 = b_7;
0034   ST(c, 0, c_0);
0035 
0036   IntrVec_t b_8 = LD(b, 8);
0037   c_1 = FMA(b_8, a_2, c_1);
0038   IntrVec_t a_8 = LD(a, 8);
0039   c_2 = FMA(b_8, a_8, c_2);
0040 
0041   IntrVec_t b_9 = LD(b, 9);
0042   c_1 = FMA(b_9, a_3, c_1);
0043   IntrVec_t a_9 = LD(a, 9);
0044   c_2 = FMA(b_9, a_9, c_2);
0045 
0046   IntrVec_t b_10 = LD(b, 10);
0047   c_1 = FMA(b_10, a_4, c_1);
0048   IntrVec_t a_10 = LD(a, 10);
0049   c_2 = FMA(b_10, a_10, c_2);
0050 
0051   IntrVec_t b_11 = LD(b, 11);
0052   c_1 = FMA(b_11, a_5, c_1);
0053   IntrVec_t a_11 = LD(a, 11);
0054   c_2 = FMA(b_11, a_11, c_2);
0055 
       // c_3 and c_4 start from b_12 and b_13 and accumulate b_14..b_17
       // against a_2..a_5 and a_8..a_11 respectively.
0056   IntrVec_t b_12 = LD(b, 12);
0057   IntrVec_t c_3 = b_12;
0058   ST(c, 1, c_1);
0059 
0060   IntrVec_t b_13 = LD(b, 13);
0061   IntrVec_t c_4 = b_13;
0062   ST(c, 2, c_2);
0063 
0064   IntrVec_t b_14 = LD(b, 14);
0065   c_3 = FMA(b_14, a_2, c_3);
0066   c_4 = FMA(b_14, a_8, c_4);
0067 
0068   IntrVec_t b_15 = LD(b, 15);
0069   c_3 = FMA(b_15, a_3, c_3);
0070   c_4 = FMA(b_15, a_9, c_4);
0071 
0072   IntrVec_t b_16 = LD(b, 16);
0073   c_3 = FMA(b_16, a_4, c_3);
0074   c_4 = FMA(b_16, a_10, c_4);
0075 
0076   IntrVec_t b_17 = LD(b, 17);
0077   c_3 = FMA(b_17, a_5, c_3);
0078   c_4 = FMA(b_17, a_11, c_4);
0079   ST(c, 5, all_zeros);  // slot 5 of c is always written as zero
0080 
       // c_6 and c_7 start from b_18 and b_19 and accumulate b_20..b_23
       // against a_2..a_5 and a_8..a_11 respectively.
0081   IntrVec_t b_18 = LD(b, 18);
0082   IntrVec_t c_6 = b_18;
0083 
0084   IntrVec_t b_19 = LD(b, 19);
0085   IntrVec_t c_7 = b_19;
0086   ST(c, 3, c_3);
0087   ST(c, 4, c_4);
0088 
0089   IntrVec_t b_20 = LD(b, 20);
0090   c_6 = FMA(b_20, a_2, c_6);
0091   c_7 = FMA(b_20, a_8, c_7);
0092 
0093   IntrVec_t b_21 = LD(b, 21);
0094   c_6 = FMA(b_21, a_3, c_6);
0095   c_7 = FMA(b_21, a_9, c_7);
0096   IntrVec_t c_9 = b_21;  // slot 9 of c is a plain copy of b slot 21
0097 
0098   IntrVec_t b_22 = LD(b, 22);
0099   c_6 = FMA(b_22, a_4, c_6);
0100   c_7 = FMA(b_22, a_10, c_7);
0101 
0102   IntrVec_t b_23 = LD(b, 23);
0103   c_6 = FMA(b_23, a_5, c_6);
0104   c_7 = FMA(b_23, a_11, c_7);
0105   ST(c, 8, all_zeros);  // slot 8 of c is always written as zero
0106 
       // c_10 and c_11 start from b_24 and b_25 and accumulate b_26..b_29
       // against a_2..a_5 and a_8..a_11 respectively.
0107   IntrVec_t b_24 = LD(b, 24);
0108   IntrVec_t c_10 = b_24;
0109 
0110   IntrVec_t b_25 = LD(b, 25);
0111   IntrVec_t c_11 = b_25;
0112   ST(c, 6, c_6);
0113   ST(c, 7, c_7);
0114   ST(c, 9, c_9);
0115 
0116   IntrVec_t b_26 = LD(b, 26);
0117   c_10 = FMA(b_26, a_2, c_10);
0118   c_11 = FMA(b_26, a_8, c_11);
0119   IntrVec_t a_26 = LD(a, 26);
       // c_14 = b_26*a_26 + b_27*a_27 + b_28 + b_29*a_29
       // (b_28 is added without a multiplier from a).
0120   IntrVec_t c_14 = MUL(b_26, a_26);
0121 
0122   IntrVec_t b_27 = LD(b, 27);
0123   c_10 = FMA(b_27, a_3, c_10);
0124   c_11 = FMA(b_27, a_9, c_11);
0125   IntrVec_t c_13 = b_27;  // slot 13 of c is a plain copy of b slot 27
0126   IntrVec_t a_27 = LD(a, 27);
0127   c_14 = FMA(b_27, a_27, c_14);
0128 
0129   IntrVec_t b_28 = LD(b, 28);
0130   c_10 = FMA(b_28, a_4, c_10);
0131   c_11 = FMA(b_28, a_10, c_11);
0132   c_14 = ADD(b_28, c_14);
0133 
0134   IntrVec_t b_29 = LD(b, 29);
0135   c_10 = FMA(b_29, a_5, c_10);
0136   c_11 = FMA(b_29, a_11, c_11);
0137   ST(c, 12, all_zeros);  // slot 12 of c is always written as zero
0138   IntrVec_t a_29 = LD(a, 29);
0139   c_14 = FMA(b_29, a_29, c_14);
0140 
       // c_15 and c_16 start from b_30 and b_31 and accumulate b_32..b_35
       // against a_2..a_5 and a_8..a_11 respectively.
0141   IntrVec_t b_30 = LD(b, 30);
0142   IntrVec_t c_15 = b_30;
0143   ST(c, 10, c_10);
0144   ST(c, 11, c_11);
0145   ST(c, 13, c_13);
0146   ST(c, 14, c_14);
0147 
0148   IntrVec_t b_31 = LD(b, 31);
0149   IntrVec_t c_16 = b_31;
0150 
0151   IntrVec_t b_32 = LD(b, 32);
0152   c_15 = FMA(b_32, a_2, c_15);
0153   c_16 = FMA(b_32, a_8, c_16);
       // c_19 = b_32*a_26 + b_33*a_27 + b_34 + b_35*a_29, reusing the
       // a_26 / a_27 / a_29 values loaded for c_14.
0154   IntrVec_t c_19 = MUL(b_32, a_26);
0155 
0156   IntrVec_t b_33 = LD(b, 33);
0157   c_15 = FMA(b_33, a_3, c_15);
0158   c_16 = FMA(b_33, a_9, c_16);
0159   IntrVec_t c_18 = b_33;  // slot 18 of c is a plain copy of b slot 33
0160   c_19 = FMA(b_33, a_27, c_19);
0161 
0162   IntrVec_t b_34 = LD(b, 34);
0163   c_15 = FMA(b_34, a_4, c_15);
0164   c_16 = FMA(b_34, a_10, c_16);
0165   c_19 = ADD(b_34, c_19);
0166 
0167   IntrVec_t b_35 = LD(b, 35);
0168   c_15 = FMA(b_35, a_5, c_15);
0169   c_16 = FMA(b_35, a_11, c_16);
0170   ST(c, 17, all_zeros);  // slot 17 of c is always written as zero
0171   c_19 = FMA(b_35, a_29, c_19);
0172   IntrVec_t c_20 = b_35;  // slot 20 of c is a plain copy of b slot 35
0173   ST(c, 15, c_15);
0174   ST(c, 16, c_16);
0175   ST(c, 18, c_18);
0176   ST(c, 19, c_19);
0177   ST(c, 20, c_20);
0178 }
0179 
0180 #else
0181 
// Scalar variant of the kernel, used when MPLEX_INTRINSICS is not defined.
// For every n in [0, N) it writes the 21 output slots c[0..20]; slot k for
// index n is addressed as k * N + n in a, b and c. The OpenMP simd pragma
// asks the compiler to vectorise the loop over n.
// Only slots 2..5, 8..11, 26, 27 and 29 of a are read; all 36 slots of b
// except slot 1 are read.
// NOTE(review): the index pattern is consistent with C = B * transpose(A) for
// 6x6 row-major a and b, with c holding the packed lower triangle of a 6x6
// result and the unread entries of a treated as fixed 0/1 constants --
// inferred from the indices only, confirm against the generator / caller.
// NOTE(review): a[] and b[] are re-read after earlier stores to c[], so the
// result presumably assumes c does not overlap a or b -- confirm.
0182 #pragma omp simd
0183 for (int n = 0; n < N; ++n) {
       // Slots 0..4: a b term plus four b*a products, against a[2..5]
       // (slots 0, 1, 3) or a[8..11] (slots 2, 4).
0184   c[0 * N + n] = b[0 * N + n] + b[2 * N + n] * a[2 * N + n] + b[3 * N + n] * a[3 * N + n] +
0185                  b[4 * N + n] * a[4 * N + n] + b[5 * N + n] * a[5 * N + n];
0186   c[1 * N + n] = b[6 * N + n] + b[8 * N + n] * a[2 * N + n] + b[9 * N + n] * a[3 * N + n] +
0187                  b[10 * N + n] * a[4 * N + n] + b[11 * N + n] * a[5 * N + n];
0188   c[2 * N + n] = b[7 * N + n] + b[8 * N + n] * a[8 * N + n] + b[9 * N + n] * a[9 * N + n] +
0189                  b[10 * N + n] * a[10 * N + n] + b[11 * N + n] * a[11 * N + n];
0190   c[3 * N + n] = b[12 * N + n] + b[14 * N + n] * a[2 * N + n] + b[15 * N + n] * a[3 * N + n] +
0191                  b[16 * N + n] * a[4 * N + n] + b[17 * N + n] * a[5 * N + n];
0192   c[4 * N + n] = b[13 * N + n] + b[14 * N + n] * a[8 * N + n] + b[15 * N + n] * a[9 * N + n] +
0193                  b[16 * N + n] * a[10 * N + n] + b[17 * N + n] * a[11 * N + n];
       // Slots 5, 8, 12 and 17 are always written as zero.
0194   c[5 * N + n] = 0;
0195   c[6 * N + n] = b[18 * N + n] + b[20 * N + n] * a[2 * N + n] + b[21 * N + n] * a[3 * N + n] +
0196                  b[22 * N + n] * a[4 * N + n] + b[23 * N + n] * a[5 * N + n];
0197   c[7 * N + n] = b[19 * N + n] + b[20 * N + n] * a[8 * N + n] + b[21 * N + n] * a[9 * N + n] +
0198                  b[22 * N + n] * a[10 * N + n] + b[23 * N + n] * a[11 * N + n];
0199   c[8 * N + n] = 0;
       // Slots 9, 13, 18 and 20 are plain copies of b slots 21, 27, 33 and 35.
0200   c[9 * N + n] = b[21 * N + n];
0201   c[10 * N + n] = b[24 * N + n] + b[26 * N + n] * a[2 * N + n] + b[27 * N + n] * a[3 * N + n] +
0202                   b[28 * N + n] * a[4 * N + n] + b[29 * N + n] * a[5 * N + n];
0203   c[11 * N + n] = b[25 * N + n] + b[26 * N + n] * a[8 * N + n] + b[27 * N + n] * a[9 * N + n] +
0204                   b[28 * N + n] * a[10 * N + n] + b[29 * N + n] * a[11 * N + n];
0205   c[12 * N + n] = 0;
0206   c[13 * N + n] = b[27 * N + n];
       // Slots 14 and 19 use a[26], a[27] and a[29]; the third b term
       // (b[28] / b[34]) is added without a multiplier from a.
0207   c[14 * N + n] =
0208       b[26 * N + n] * a[26 * N + n] + b[27 * N + n] * a[27 * N + n] + b[28 * N + n] + b[29 * N + n] * a[29 * N + n];
0209   c[15 * N + n] = b[30 * N + n] + b[32 * N + n] * a[2 * N + n] + b[33 * N + n] * a[3 * N + n] +
0210                   b[34 * N + n] * a[4 * N + n] + b[35 * N + n] * a[5 * N + n];
0211   c[16 * N + n] = b[31 * N + n] + b[32 * N + n] * a[8 * N + n] + b[33 * N + n] * a[9 * N + n] +
0212                   b[34 * N + n] * a[10 * N + n] + b[35 * N + n] * a[11 * N + n];
0213   c[17 * N + n] = 0;
0214   c[18 * N + n] = b[33 * N + n];
0215   c[19 * N + n] =
0216       b[32 * N + n] * a[26 * N + n] + b[33 * N + n] * a[27 * N + n] + b[34 * N + n] + b[35 * N + n] * a[29 * N + n];
0217   c[20 * N + n] = b[35 * N + n];
0218 }
0219 #endif