Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoTracker/MkFitCore/src/OutErrCCS.ah is written in an unsupported language. File is not indexed.

0001 #ifdef MPLEX_INTRINSICS
0002 
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) {
0004   IntrVec_t a_3 = LD(a, 3);
0005   IntrVec_t b_6 = LD(b, 6);
0006   IntrVec_t c_0 = MUL(a_3, b_6);
0007   IntrVec_t b_7 = LD(b, 7);
0008   IntrVec_t c_1 = MUL(a_3, b_7);
0009   IntrVec_t b_8 = LD(b, 8);
0010   IntrVec_t c_2 = MUL(a_3, b_8);
0011   IntrVec_t b_9 = LD(b, 9);
0012   IntrVec_t c_3 = MUL(a_3, b_9);
0013   IntrVec_t b_13 = LD(b, 13);
0014   IntrVec_t c_4 = MUL(a_3, b_13);
0015 
0016   IntrVec_t a_4 = LD(a, 4);
0017   IntrVec_t b_10 = LD(b, 10);
0018   c_0 = FMA(a_4, b_10, c_0);
0019   IntrVec_t b_11 = LD(b, 11);
0020   c_1 = FMA(a_4, b_11, c_1);
0021   IntrVec_t b_12 = LD(b, 12);
0022   c_2 = FMA(a_4, b_12, c_2);
0023   ST(c, 0, c_0);
0024   c_3 = FMA(a_4, b_13, c_3);
0025   ST(c, 1, c_1);
0026   IntrVec_t b_14 = LD(b, 14);
0027   c_4 = FMA(a_4, b_14, c_4);
0028   ST(c, 2, c_2);
0029   ST(c, 3, c_3);
0030 
0031   IntrVec_t a_8 = LD(a, 8);
0032   IntrVec_t c_5 = MUL(a_8, b_6);
0033   ST(c, 4, c_4);
0034   IntrVec_t c_6 = MUL(a_8, b_7);
0035   IntrVec_t c_7 = MUL(a_8, b_8);
0036   IntrVec_t c_8 = MUL(a_8, b_9);
0037   IntrVec_t c_9 = MUL(a_8, b_13);
0038 
0039   IntrVec_t a_9 = LD(a, 9);
0040   c_5 = FMA(a_9, b_10, c_5);
0041   c_6 = FMA(a_9, b_11, c_6);
0042   c_7 = FMA(a_9, b_12, c_7);
0043   c_8 = FMA(a_9, b_13, c_8);
0044   c_9 = FMA(a_9, b_14, c_9);
0045   ST(c, 5, c_5);
0046   ST(c, 6, c_6);
0047   ST(c, 7, c_7);
0048   ST(c, 8, c_8);
0049   ST(c, 9, c_9);
0050 
0051   IntrVec_t a_13 = LD(a, 13);
0052   IntrVec_t c_10 = MUL(a_13, b_6);
0053   IntrVec_t c_11 = MUL(a_13, b_7);
0054   IntrVec_t c_12 = MUL(a_13, b_8);
0055   IntrVec_t c_13 = MUL(a_13, b_9);
0056   IntrVec_t c_14 = MUL(a_13, b_13);
0057 
0058   IntrVec_t a_14 = LD(a, 14);
0059   c_10 = FMA(a_14, b_10, c_10);
0060   c_11 = FMA(a_14, b_11, c_11);
0061   c_12 = FMA(a_14, b_12, c_12);
0062   c_13 = FMA(a_14, b_13, c_13);
0063   c_14 = FMA(a_14, b_14, c_14);
0064   ST(c, 10, c_10);
0065   ST(c, 11, c_11);
0066   ST(c, 12, c_12);
0067   ST(c, 13, c_13);
0068   ST(c, 14, c_14);
0069 
0070   IntrVec_t a_15 = LD(a, 15);
0071   IntrVec_t b_0 = LD(b, 0);
0072   IntrVec_t c_15 = MUL(a_15, b_0);
0073   IntrVec_t b_1 = LD(b, 1);
0074   IntrVec_t c_16 = MUL(a_15, b_1);
0075   IntrVec_t b_3 = LD(b, 3);
0076   IntrVec_t c_17 = MUL(a_15, b_3);
0077   IntrVec_t c_18 = MUL(a_15, b_6);
0078   IntrVec_t c_19 = MUL(a_15, b_10);
0079 
0080   IntrVec_t a_16 = LD(a, 16);
0081   c_15 = FMA(a_16, b_1, c_15);
0082   IntrVec_t b_2 = LD(b, 2);
0083   c_16 = FMA(a_16, b_2, c_16);
0084   IntrVec_t b_4 = LD(b, 4);
0085   c_17 = FMA(a_16, b_4, c_17);
0086   c_18 = FMA(a_16, b_7, c_18);
0087   c_19 = FMA(a_16, b_11, c_19);
0088 
0089   IntrVec_t a_17 = LD(a, 17);
0090   c_15 = FMA(a_17, b_3, c_15);
0091   c_16 = FMA(a_17, b_4, c_16);
0092   IntrVec_t b_5 = LD(b, 5);
0093   c_17 = FMA(a_17, b_5, c_17);
0094   c_18 = FMA(a_17, b_8, c_18);
0095   c_19 = FMA(a_17, b_12, c_19);
0096 
0097   IntrVec_t a_21 = LD(a, 21);
0098   IntrVec_t c_20 = MUL(a_21, b_1);
0099   IntrVec_t c_21 = MUL(a_21, b_2);
0100   IntrVec_t c_22 = MUL(a_21, b_4);
0101   ST(c, 15, c_15);
0102   ST(c, 16, c_16);
0103   ST(c, 17, c_17);
0104   ST(c, 18, c_18);
0105   ST(c, 19, c_19);
0106   IntrVec_t c_23 = MUL(a_21, b_7);
0107   IntrVec_t c_24 = MUL(a_21, b_11);
0108 
0109   IntrVec_t a_22 = LD(a, 22);
0110   c_20 = FMA(a_22, b_3, c_20);
0111   c_21 = FMA(a_22, b_4, c_21);
0112   c_22 = FMA(a_22, b_5, c_22);
0113   c_23 = FMA(a_22, b_8, c_23);
0114   c_24 = FMA(a_22, b_12, c_24);
0115 
0116   IntrVec_t a_23 = LD(a, 23);
0117   c_20 = FMA(a_23, b_6, c_20);
0118   c_21 = FMA(a_23, b_7, c_21);
0119   c_22 = FMA(a_23, b_8, c_22);
0120   c_23 = FMA(a_23, b_9, c_23);
0121   c_24 = FMA(a_23, b_13, c_24);
0122 
0123   IntrVec_t a_24 = LD(a, 24);
0124   c_20 = FMA(a_24, b_10, c_20);
0125   c_21 = FMA(a_24, b_11, c_21);
0126   c_22 = FMA(a_24, b_12, c_22);
0127   c_23 = FMA(a_24, b_13, c_23);
0128   c_24 = FMA(a_24, b_14, c_24);
0129   ST(c, 20, c_20);
0130   ST(c, 21, c_21);
0131   ST(c, 22, c_22);
0132   ST(c, 23, c_23);
0133   ST(c, 24, c_24);
0134 
0135   IntrVec_t a_26 = LD(a, 26);
0136   IntrVec_t c_25 = MUL(a_26, b_1);
0137   IntrVec_t c_26 = MUL(a_26, b_2);
0138   IntrVec_t c_27 = MUL(a_26, b_4);
0139   IntrVec_t c_28 = MUL(a_26, b_7);
0140   IntrVec_t c_29 = MUL(a_26, b_11);
0141 
0142   IntrVec_t a_27 = LD(a, 27);
0143   c_25 = FMA(a_27, b_3, c_25);
0144   c_26 = FMA(a_27, b_4, c_26);
0145   c_27 = FMA(a_27, b_5, c_27);
0146   c_28 = FMA(a_27, b_8, c_28);
0147   c_29 = FMA(a_27, b_12, c_29);
0148 
0149   ST(c, 25, c_25);
0150   ST(c, 26, c_26);
0151   ST(c, 27, c_27);
0152   ST(c, 28, c_28);
0153   ST(c, 29, c_29);
0154 }
0155 
0156 #else
0157 
0158 #pragma omp simd
0159 for (int n = 0; n < N; ++n) {
0160   c[0 * N + n] = a[3 * N + n] * b[6 * N + n] + a[4 * N + n] * b[10 * N + n];
0161   c[1 * N + n] = a[3 * N + n] * b[7 * N + n] + a[4 * N + n] * b[11 * N + n];
0162   c[2 * N + n] = a[3 * N + n] * b[8 * N + n] + a[4 * N + n] * b[12 * N + n];
0163   c[3 * N + n] = a[3 * N + n] * b[9 * N + n] + a[4 * N + n] * b[13 * N + n];
0164   c[4 * N + n] = a[3 * N + n] * b[13 * N + n] + a[4 * N + n] * b[14 * N + n];
0165   c[5 * N + n] = a[8 * N + n] * b[6 * N + n] + a[9 * N + n] * b[10 * N + n];
0166   c[6 * N + n] = a[8 * N + n] * b[7 * N + n] + a[9 * N + n] * b[11 * N + n];
0167   c[7 * N + n] = a[8 * N + n] * b[8 * N + n] + a[9 * N + n] * b[12 * N + n];
0168   c[8 * N + n] = a[8 * N + n] * b[9 * N + n] + a[9 * N + n] * b[13 * N + n];
0169   c[9 * N + n] = a[8 * N + n] * b[13 * N + n] + a[9 * N + n] * b[14 * N + n];
0170   c[10 * N + n] = a[13 * N + n] * b[6 * N + n] + a[14 * N + n] * b[10 * N + n];
0171   c[11 * N + n] = a[13 * N + n] * b[7 * N + n] + a[14 * N + n] * b[11 * N + n];
0172   c[12 * N + n] = a[13 * N + n] * b[8 * N + n] + a[14 * N + n] * b[12 * N + n];
0173   c[13 * N + n] = a[13 * N + n] * b[9 * N + n] + a[14 * N + n] * b[13 * N + n];
0174   c[14 * N + n] = a[13 * N + n] * b[13 * N + n] + a[14 * N + n] * b[14 * N + n];
0175   c[15 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[1 * N + n] + a[17 * N + n] * b[3 * N + n];
0176   c[16 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[2 * N + n] + a[17 * N + n] * b[4 * N + n];
0177   c[17 * N + n] = a[15 * N + n] * b[3 * N + n] + a[16 * N + n] * b[4 * N + n] + a[17 * N + n] * b[5 * N + n];
0178   c[18 * N + n] = a[15 * N + n] * b[6 * N + n] + a[16 * N + n] * b[7 * N + n] + a[17 * N + n] * b[8 * N + n];
0179   c[19 * N + n] = a[15 * N + n] * b[10 * N + n] + a[16 * N + n] * b[11 * N + n] + a[17 * N + n] * b[12 * N + n];
0180   c[20 * N + n] = a[21 * N + n] * b[1 * N + n] + a[22 * N + n] * b[3 * N + n] + a[23 * N + n] * b[6 * N + n] +
0181                   a[24 * N + n] * b[10 * N + n];
0182   c[21 * N + n] = a[21 * N + n] * b[2 * N + n] + a[22 * N + n] * b[4 * N + n] + a[23 * N + n] * b[7 * N + n] +
0183                   a[24 * N + n] * b[11 * N + n];
0184   c[22 * N + n] = a[21 * N + n] * b[4 * N + n] + a[22 * N + n] * b[5 * N + n] + a[23 * N + n] * b[8 * N + n] +
0185                   a[24 * N + n] * b[12 * N + n];
0186   c[23 * N + n] = a[21 * N + n] * b[7 * N + n] + a[22 * N + n] * b[8 * N + n] + a[23 * N + n] * b[9 * N + n] +
0187                   a[24 * N + n] * b[13 * N + n];
0188   c[24 * N + n] = a[21 * N + n] * b[11 * N + n] + a[22 * N + n] * b[12 * N + n] + a[23 * N + n] * b[13 * N + n] +
0189                   a[24 * N + n] * b[14 * N + n];
0190   c[25 * N + n] = a[26 * N + n] * b[1 * N + n] + a[27 * N + n] * b[3 * N + n];
0191   c[26 * N + n] = a[26 * N + n] * b[2 * N + n] + a[27 * N + n] * b[4 * N + n];
0192   c[27 * N + n] = a[26 * N + n] * b[4 * N + n] + a[27 * N + n] * b[5 * N + n];
0193   c[28 * N + n] = a[26 * N + n] * b[7 * N + n] + a[27 * N + n] * b[8 * N + n];
0194   c[29 * N + n] = a[26 * N + n] * b[11 * N + n] + a[27 * N + n] * b[12 * N + n];
0195 }
0196 #endif