Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoTracker/MkFitCore/src/OutErrCCSTransp.ah is written in an unsupported language. File is not indexed.

0001 #ifdef MPLEX_INTRINSICS
0002 
// Vectorized branch: compiled only when MPLEX_INTRINSICS is defined.
// Each pass of the loop advances n by MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T).
// LD, MUL, FMA and ST are defined outside this file. The formulas in the
// comments below assume LD(x, i) reads slot i of x, MUL(p, q) = p * q,
// FMA(p, q, r) = p * q + r and ST(c, i, v) writes v to slot i of c
// -- TODO confirm against their definitions. Under that reading every c_k
// equals c[k * N + n] as written out in the scalar #else branch of this file.
// n is never named inside the body; presumably LD/ST pick it up -- verify.
// NOTE(review): loads, arithmetic and stores are interleaved; this looks like
// a deliberate (generated) schedule, so keep the statement order when editing.
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) {
// c_0 = b_3 * a_3 + b_4 * a_4
0004   IntrVec_t b_3 = LD(b, 3);
0005   IntrVec_t a_3 = LD(a, 3);
0006   IntrVec_t c_0 = MUL(b_3, a_3);
0007 
0008   IntrVec_t b_4 = LD(b, 4);
0009   IntrVec_t a_4 = LD(a, 4);
0010   c_0 = FMA(b_4, a_4, c_0);
0011 
// c_1 and c_2 both start from b_8; c_1 reuses the a_3 loaded above.
0012   IntrVec_t b_8 = LD(b, 8);
0013   IntrVec_t c_1 = MUL(b_8, a_3);
0014   IntrVec_t a_8 = LD(a, 8);
0015   IntrVec_t c_2 = MUL(b_8, a_8);
0016   ST(c, 0, c_0);
0017 
0018   IntrVec_t b_9 = LD(b, 9);
0019   c_1 = FMA(b_9, a_4, c_1);
0020   IntrVec_t a_9 = LD(a, 9);
0021   c_2 = FMA(b_9, a_9, c_2);
0022 
// c_3..c_5 start from b_13 and are completed with the b_14 terms.
0023   IntrVec_t b_13 = LD(b, 13);
0024   IntrVec_t c_3 = MUL(b_13, a_3);
0025   ST(c, 1, c_1);
0026   IntrVec_t c_4 = MUL(b_13, a_8);
0027   ST(c, 2, c_2);
0028   IntrVec_t a_13 = LD(a, 13);
0029   IntrVec_t c_5 = MUL(b_13, a_13);
0030 
0031   IntrVec_t b_14 = LD(b, 14);
0032   c_3 = FMA(b_14, a_4, c_3);
0033   c_4 = FMA(b_14, a_9, c_4);
0034   IntrVec_t a_14 = LD(a, 14);
0035   c_5 = FMA(b_14, a_14, c_5);
0036 
// c_9 accumulates three products (b_15..b_17 with a_15..a_17); it is stored
// further down, after c_6..c_8.
0037   IntrVec_t b_15 = LD(b, 15);
0038   IntrVec_t a_15 = LD(a, 15);
0039   IntrVec_t c_9 = MUL(b_15, a_15);
0040   ST(c, 3, c_3);
0041   ST(c, 4, c_4);
0042   ST(c, 5, c_5);
0043 
0044   IntrVec_t b_16 = LD(b, 16);
0045   IntrVec_t a_16 = LD(a, 16);
0046   c_9 = FMA(b_16, a_16, c_9);
0047 
0048   IntrVec_t b_17 = LD(b, 17);
0049   IntrVec_t a_17 = LD(a, 17);
0050   c_9 = FMA(b_17, a_17, c_9);
0051 
// c_6..c_8: b_18 and b_19 against the (a_3, a_4), (a_8, a_9), (a_13, a_14) pairs.
0052   IntrVec_t b_18 = LD(b, 18);
0053   IntrVec_t c_6 = MUL(b_18, a_3);
0054   IntrVec_t c_7 = MUL(b_18, a_8);
0055   IntrVec_t c_8 = MUL(b_18, a_13);
0056 
0057   IntrVec_t b_19 = LD(b, 19);
0058   c_6 = FMA(b_19, a_4, c_6);
0059   c_7 = FMA(b_19, a_9, c_7);
0060   c_8 = FMA(b_19, a_14, c_8);
0061 
// c_13 = b_20 * a_15 + b_21 * a_16 + b_22 * a_17 (the last two terms are added below).
0062   IntrVec_t b_20 = LD(b, 20);
0063   IntrVec_t c_13 = MUL(b_20, a_15);
0064   ST(c, 6, c_6);
0065   ST(c, 7, c_7);
0066   ST(c, 8, c_8);
0067   ST(c, 9, c_9);
0068 
// c_14 accumulates four products: b_21*a_21, b_22*a_22, b_23*a_23, b_24*a_24.
0069   IntrVec_t b_21 = LD(b, 21);
0070   c_13 = FMA(b_21, a_16, c_13);
0071   IntrVec_t a_21 = LD(a, 21);
0072   IntrVec_t c_14 = MUL(b_21, a_21);
0073 
0074   IntrVec_t b_22 = LD(b, 22);
0075   c_13 = FMA(b_22, a_17, c_13);
0076   IntrVec_t a_22 = LD(a, 22);
0077   c_14 = FMA(b_22, a_22, c_14);
0078 
// c_10..c_12: b_23 and b_24 against the same a pairs as c_6..c_8.
0079   IntrVec_t b_23 = LD(b, 23);
0080   IntrVec_t c_10 = MUL(b_23, a_3);
0081   IntrVec_t c_11 = MUL(b_23, a_8);
0082   IntrVec_t c_12 = MUL(b_23, a_13);
0083   IntrVec_t a_23 = LD(a, 23);
0084   c_14 = FMA(b_23, a_23, c_14);
0085 
0086   IntrVec_t b_24 = LD(b, 24);
0087   c_10 = FMA(b_24, a_4, c_10);
0088   c_11 = FMA(b_24, a_9, c_11);
0089   c_12 = FMA(b_24, a_14, c_12);
0090   IntrVec_t a_24 = LD(a, 24);
0091   c_14 = FMA(b_24, a_24, c_14);
0092   ST(c, 10, c_10);
0093   ST(c, 11, c_11);
0094   ST(c, 12, c_12);
0095   ST(c, 13, c_13);
0096   ST(c, 14, c_14);
0097 
// c_18 = b_25 * a_15 + b_26 * a_16 + b_27 * a_17
0098   IntrVec_t b_25 = LD(b, 25);
0099   IntrVec_t c_18 = MUL(b_25, a_15);
0100 
// c_19 accumulates b_26*a_21, b_27*a_22, b_28*a_23, b_29*a_24;
// c_20 = b_26 * a_26 + b_27 * a_27.
0101   IntrVec_t b_26 = LD(b, 26);
0102   c_18 = FMA(b_26, a_16, c_18);
0103   IntrVec_t c_19 = MUL(b_26, a_21);
0104   IntrVec_t a_26 = LD(a, 26);
0105   IntrVec_t c_20 = MUL(b_26, a_26);
0106 
0107   IntrVec_t b_27 = LD(b, 27);
0108   c_18 = FMA(b_27, a_17, c_18);
0109   c_19 = FMA(b_27, a_22, c_19);
0110   IntrVec_t a_27 = LD(a, 27);
0111   c_20 = FMA(b_27, a_27, c_20);
0112 
// c_15..c_17: b_28 and b_29 against the same a pairs as c_6..c_8.
0113   IntrVec_t b_28 = LD(b, 28);
0114   IntrVec_t c_15 = MUL(b_28, a_3);
0115   IntrVec_t c_16 = MUL(b_28, a_8);
0116   IntrVec_t c_17 = MUL(b_28, a_13);
0117   c_19 = FMA(b_28, a_23, c_19);
0118 
0119   IntrVec_t b_29 = LD(b, 29);
0120   c_15 = FMA(b_29, a_4, c_15);
0121   c_16 = FMA(b_29, a_9, c_16);
0122   c_17 = FMA(b_29, a_14, c_17);
0123   c_19 = FMA(b_29, a_24, c_19);
// Final group of stores: slots 15..20 of c.
0124   ST(c, 15, c_15);
0125   ST(c, 16, c_16);
0126   ST(c, 17, c_17);
0127   ST(c, 18, c_18);
0128   ST(c, 19, c_19);
0129   ST(c, 20, c_20);
0130 }
0131 
0132 #else
0133 
0134 #pragma omp simd
0135 for (int n = 0; n < N; ++n) {
0136   c[0 * N + n] = b[3 * N + n] * a[3 * N + n] + b[4 * N + n] * a[4 * N + n];
0137   c[1 * N + n] = b[8 * N + n] * a[3 * N + n] + b[9 * N + n] * a[4 * N + n];
0138   c[2 * N + n] = b[8 * N + n] * a[8 * N + n] + b[9 * N + n] * a[9 * N + n];
0139   c[3 * N + n] = b[13 * N + n] * a[3 * N + n] + b[14 * N + n] * a[4 * N + n];
0140   c[4 * N + n] = b[13 * N + n] * a[8 * N + n] + b[14 * N + n] * a[9 * N + n];
0141   c[5 * N + n] = b[13 * N + n] * a[13 * N + n] + b[14 * N + n] * a[14 * N + n];
0142   c[6 * N + n] = b[18 * N + n] * a[3 * N + n] + b[19 * N + n] * a[4 * N + n];
0143   c[7 * N + n] = b[18 * N + n] * a[8 * N + n] + b[19 * N + n] * a[9 * N + n];
0144   c[8 * N + n] = b[18 * N + n] * a[13 * N + n] + b[19 * N + n] * a[14 * N + n];
0145   c[9 * N + n] = b[15 * N + n] * a[15 * N + n] + b[16 * N + n] * a[16 * N + n] + b[17 * N + n] * a[17 * N + n];
0146   c[10 * N + n] = b[23 * N + n] * a[3 * N + n] + b[24 * N + n] * a[4 * N + n];
0147   c[11 * N + n] = b[23 * N + n] * a[8 * N + n] + b[24 * N + n] * a[9 * N + n];
0148   c[12 * N + n] = b[23 * N + n] * a[13 * N + n] + b[24 * N + n] * a[14 * N + n];
0149   c[13 * N + n] = b[20 * N + n] * a[15 * N + n] + b[21 * N + n] * a[16 * N + n] + b[22 * N + n] * a[17 * N + n];
0150   c[14 * N + n] = b[21 * N + n] * a[21 * N + n] + b[22 * N + n] * a[22 * N + n] + b[23 * N + n] * a[23 * N + n] +
0151                   b[24 * N + n] * a[24 * N + n];
0152   c[15 * N + n] = b[28 * N + n] * a[3 * N + n] + b[29 * N + n] * a[4 * N + n];
0153   c[16 * N + n] = b[28 * N + n] * a[8 * N + n] + b[29 * N + n] * a[9 * N + n];
0154   c[17 * N + n] = b[28 * N + n] * a[13 * N + n] + b[29 * N + n] * a[14 * N + n];
0155   c[18 * N + n] = b[25 * N + n] * a[15 * N + n] + b[26 * N + n] * a[16 * N + n] + b[27 * N + n] * a[17 * N + n];
0156   c[19 * N + n] = b[26 * N + n] * a[21 * N + n] + b[27 * N + n] * a[22 * N + n] + b[28 * N + n] * a[23 * N + n] +
0157                   b[29 * N + n] * a[24 * N + n];
0158   c[20 * N + n] = b[26 * N + n] * a[26 * N + n] + b[27 * N + n] * a[27 * N + n];
0159 }
0160 #endif