Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoTracker/MkFitCore/src/PsErrLoc.ah is written in an unsupported language. File is not indexed.

0001 #ifdef MPLEX_INTRINSICS
0002 // Vector path: each pass covers MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T) values of n. LD, MUL, FMA, ST, IntrVec_t are not defined in this listing.
0003 for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) {  // NOTE(review): n is never passed to LD/ST; presumably the macros read it directly - confirm
0004   IntrVec_t a_3 = LD(a, 3);  // naming: a_k / b_k / c_k is the value tied to index k of a / b / c
0005   IntrVec_t b_6 = LD(b, 6);
0006   IntrVec_t c_0 = MUL(a_3, b_6);  // first term of output 0; presumably a_3 * b_6, as in the scalar #else branch
0007   IntrVec_t b_7 = LD(b, 7);
0008   IntrVec_t c_1 = MUL(a_3, b_7);
0009   IntrVec_t b_8 = LD(b, 8);
0010   IntrVec_t c_2 = MUL(a_3, b_8);
0011   IntrVec_t b_9 = LD(b, 9);
0012   IntrVec_t c_3 = MUL(a_3, b_9);
0013   IntrVec_t b_13 = LD(b, 13);
0014   IntrVec_t c_4 = MUL(a_3, b_13);
0015   IntrVec_t b_18 = LD(b, 18);
0016   IntrVec_t c_5 = MUL(a_3, b_18);
0017   // Second (last) term of outputs 0-5: a_5 paired with b_15..b_20. Finished sums are stored while later ones are still being built.
0018   IntrVec_t a_5 = LD(a, 5);
0019   IntrVec_t b_15 = LD(b, 15);
0020   c_0 = FMA(a_5, b_15, c_0);  // presumably a_5 * b_15 + c_0, matching the scalar form of output 0
0021   IntrVec_t b_16 = LD(b, 16);
0022   c_1 = FMA(a_5, b_16, c_1);
0023   IntrVec_t b_17 = LD(b, 17);
0024   c_2 = FMA(a_5, b_17, c_2);
0025   ST(c, 0, c_0);
0026   c_3 = FMA(a_5, b_18, c_3);
0027   ST(c, 1, c_1);
0028   IntrVec_t b_19 = LD(b, 19);
0029   c_4 = FMA(a_5, b_19, c_4);
0030   ST(c, 2, c_2);
0031   ST(c, 3, c_3);
0032   IntrVec_t b_20 = LD(b, 20);
0033   c_5 = FMA(a_5, b_20, c_5);
0034   ST(c, 4, c_4);
0035   // Outputs 6-11 start here: a_6 paired with b_0, b_1, b_3, b_6, b_10, b_15. The store of output 5 is issued in between.
0036   IntrVec_t a_6 = LD(a, 6);
0037   IntrVec_t b_0 = LD(b, 0);
0038   IntrVec_t c_6 = MUL(a_6, b_0);
0039   ST(c, 5, c_5);
0040   IntrVec_t b_1 = LD(b, 1);
0041   IntrVec_t c_7 = MUL(a_6, b_1);
0042   IntrVec_t b_3 = LD(b, 3);
0043   IntrVec_t c_8 = MUL(a_6, b_3);
0044   IntrVec_t c_9 = MUL(a_6, b_6);
0045   IntrVec_t b_10 = LD(b, 10);
0046   IntrVec_t c_10 = MUL(a_6, b_10);
0047   IntrVec_t c_11 = MUL(a_6, b_15);
0048   // Second term of outputs 6-11: a_7 paired with b_1, b_2, b_4, b_7, b_11, b_16.
0049   IntrVec_t a_7 = LD(a, 7);
0050   c_6 = FMA(a_7, b_1, c_6);
0051   IntrVec_t b_2 = LD(b, 2);
0052   c_7 = FMA(a_7, b_2, c_7);
0053   IntrVec_t b_4 = LD(b, 4);
0054   c_8 = FMA(a_7, b_4, c_8);
0055   c_9 = FMA(a_7, b_7, c_9);
0056   IntrVec_t b_11 = LD(b, 11);
0057   c_10 = FMA(a_7, b_11, c_10);
0058   c_11 = FMA(a_7, b_16, c_11);
0059   // Third term of outputs 6-11: a_8 paired with b_3, b_4, b_5, b_8, b_12, b_17.
0060   IntrVec_t a_8 = LD(a, 8);
0061   c_6 = FMA(a_8, b_3, c_6);
0062   c_7 = FMA(a_8, b_4, c_7);
0063   IntrVec_t b_5 = LD(b, 5);
0064   c_8 = FMA(a_8, b_5, c_8);
0065   c_9 = FMA(a_8, b_8, c_9);
0066   IntrVec_t b_12 = LD(b, 12);
0067   c_10 = FMA(a_8, b_12, c_10);
0068   c_11 = FMA(a_8, b_17, c_11);
0069   // Fourth term of outputs 6-11: a_10 paired with b_10, b_11, b_12, b_13, b_14, b_19 (index 9 of a is never loaded).
0070   IntrVec_t a_10 = LD(a, 10);
0071   c_6 = FMA(a_10, b_10, c_6);
0072   c_7 = FMA(a_10, b_11, c_7);
0073   c_8 = FMA(a_10, b_12, c_8);
0074   c_9 = FMA(a_10, b_13, c_9);
0075   IntrVec_t b_14 = LD(b, 14);
0076   c_10 = FMA(a_10, b_14, c_10);
0077   c_11 = FMA(a_10, b_19, c_11);
0078   // Fifth (last) term of outputs 6-11: a_11 paired with b_15..b_20. Outputs 6-10 are stored before c_11 gets its final FMA.
0079   IntrVec_t a_11 = LD(a, 11);
0080   c_6 = FMA(a_11, b_15, c_6);
0081   c_7 = FMA(a_11, b_16, c_7);
0082   c_8 = FMA(a_11, b_17, c_8);
0083   c_9 = FMA(a_11, b_18, c_9);
0084   c_10 = FMA(a_11, b_19, c_10);
0085   ST(c, 6, c_6);
0086   ST(c, 7, c_7);
0087   ST(c, 8, c_8);
0088   ST(c, 9, c_9);
0089   ST(c, 10, c_10);
0090   c_11 = FMA(a_11, b_20, c_11);
0091   // Outputs 12-17 reuse the b operands of outputs 6-11, now with a_12, a_13, a_14, a_16, a_17. No b value is reloaded from here on.
0092   IntrVec_t a_12 = LD(a, 12);
0093   IntrVec_t c_12 = MUL(a_12, b_0);
0094   IntrVec_t c_13 = MUL(a_12, b_1);
0095   IntrVec_t c_14 = MUL(a_12, b_3);
0096   ST(c, 11, c_11);
0097   IntrVec_t c_15 = MUL(a_12, b_6);
0098   IntrVec_t c_16 = MUL(a_12, b_10);
0099   IntrVec_t c_17 = MUL(a_12, b_15);
0100 
0101   IntrVec_t a_13 = LD(a, 13);
0102   c_12 = FMA(a_13, b_1, c_12);
0103   c_13 = FMA(a_13, b_2, c_13);
0104   c_14 = FMA(a_13, b_4, c_14);
0105   c_15 = FMA(a_13, b_7, c_15);
0106   c_16 = FMA(a_13, b_11, c_16);
0107   c_17 = FMA(a_13, b_16, c_17);
0108 
0109   IntrVec_t a_14 = LD(a, 14);
0110   c_12 = FMA(a_14, b_3, c_12);
0111   c_13 = FMA(a_14, b_4, c_13);
0112   c_14 = FMA(a_14, b_5, c_14);
0113   c_15 = FMA(a_14, b_8, c_15);
0114   c_16 = FMA(a_14, b_12, c_16);
0115   c_17 = FMA(a_14, b_17, c_17);
0116 
0117   IntrVec_t a_16 = LD(a, 16);
0118   c_12 = FMA(a_16, b_10, c_12);
0119   c_13 = FMA(a_16, b_11, c_13);
0120   c_14 = FMA(a_16, b_12, c_14);
0121   c_15 = FMA(a_16, b_13, c_15);
0122   c_16 = FMA(a_16, b_14, c_16);
0123   c_17 = FMA(a_16, b_19, c_17);
0124 
0125   IntrVec_t a_17 = LD(a, 17);
0126   c_12 = FMA(a_17, b_15, c_12);
0127   c_13 = FMA(a_17, b_16, c_13);
0128   c_14 = FMA(a_17, b_17, c_14);
0129   c_15 = FMA(a_17, b_18, c_15);
0130   c_16 = FMA(a_17, b_19, c_16);
0131   ST(c, 12, c_12);
0132   ST(c, 13, c_13);
0133   ST(c, 14, c_14);
0134   ST(c, 15, c_15);
0135   ST(c, 16, c_16);
0136   c_17 = FMA(a_17, b_20, c_17);
0137   // Outputs 18-23 have three terms each: a_18, a_19, a_20 against the same first three b operand sets used for outputs 6-17.
0138   IntrVec_t a_18 = LD(a, 18);
0139   IntrVec_t c_18 = MUL(a_18, b_0);
0140   IntrVec_t c_19 = MUL(a_18, b_1);
0141   IntrVec_t c_20 = MUL(a_18, b_3);
0142   ST(c, 17, c_17);
0143   IntrVec_t c_21 = MUL(a_18, b_6);
0144   IntrVec_t c_22 = MUL(a_18, b_10);
0145   IntrVec_t c_23 = MUL(a_18, b_15);
0146 
0147   IntrVec_t a_19 = LD(a, 19);
0148   c_18 = FMA(a_19, b_1, c_18);
0149   c_19 = FMA(a_19, b_2, c_19);
0150   c_20 = FMA(a_19, b_4, c_20);
0151   c_21 = FMA(a_19, b_7, c_21);
0152   c_22 = FMA(a_19, b_11, c_22);
0153   c_23 = FMA(a_19, b_16, c_23);
0154 
0155   IntrVec_t a_20 = LD(a, 20);
0156   c_18 = FMA(a_20, b_3, c_18);
0157   c_19 = FMA(a_20, b_4, c_19);
0158   c_20 = FMA(a_20, b_5, c_20);
0159   c_21 = FMA(a_20, b_8, c_21);
0160   c_22 = FMA(a_20, b_12, c_22);
0161   c_23 = FMA(a_20, b_17, c_23);
0162   // Outputs 24-29: same three-term pattern with a_24, a_25, a_26. The stores of outputs 18-23 are issued between the first products.
0163   IntrVec_t a_24 = LD(a, 24);
0164   IntrVec_t c_24 = MUL(a_24, b_0);
0165   IntrVec_t c_25 = MUL(a_24, b_1);
0166   IntrVec_t c_26 = MUL(a_24, b_3);
0167   ST(c, 18, c_18);
0168   ST(c, 19, c_19);
0169   ST(c, 20, c_20);
0170   ST(c, 21, c_21);
0171   ST(c, 22, c_22);
0172   ST(c, 23, c_23);
0173   IntrVec_t c_27 = MUL(a_24, b_6);
0174   IntrVec_t c_28 = MUL(a_24, b_10);
0175   IntrVec_t c_29 = MUL(a_24, b_15);
0176 
0177   IntrVec_t a_25 = LD(a, 25);
0178   c_24 = FMA(a_25, b_1, c_24);
0179   c_25 = FMA(a_25, b_2, c_25);
0180   c_26 = FMA(a_25, b_4, c_26);
0181   c_27 = FMA(a_25, b_7, c_27);
0182   c_28 = FMA(a_25, b_11, c_28);
0183   c_29 = FMA(a_25, b_16, c_29);
0184 
0185   IntrVec_t a_26 = LD(a, 26);
0186   c_24 = FMA(a_26, b_3, c_24);
0187   c_25 = FMA(a_26, b_4, c_25);
0188   c_26 = FMA(a_26, b_5, c_26);
0189   c_27 = FMA(a_26, b_8, c_27);
0190   c_28 = FMA(a_26, b_12, c_28);
0191   c_29 = FMA(a_26, b_17, c_29);
0192   // Write back the last six outputs (24-29).
0193   ST(c, 24, c_24);
0194   ST(c, 25, c_25);
0195   ST(c, 26, c_26);
0196   ST(c, 27, c_27);
0197   ST(c, 28, c_28);
0198   ST(c, 29, c_29);
0199 }
0200 
0201 #else
0202 
0203 #pragma omp simd
0204 for (int n = 0; n < N; ++n) {
0205   c[0 * N + n] = a[3 * N + n] * b[6 * N + n] + a[5 * N + n] * b[15 * N + n];
0206   c[1 * N + n] = a[3 * N + n] * b[7 * N + n] + a[5 * N + n] * b[16 * N + n];
0207   c[2 * N + n] = a[3 * N + n] * b[8 * N + n] + a[5 * N + n] * b[17 * N + n];
0208   c[3 * N + n] = a[3 * N + n] * b[9 * N + n] + a[5 * N + n] * b[18 * N + n];
0209   c[4 * N + n] = a[3 * N + n] * b[13 * N + n] + a[5 * N + n] * b[19 * N + n];
0210   c[5 * N + n] = a[3 * N + n] * b[18 * N + n] + a[5 * N + n] * b[20 * N + n];
0211   c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[1 * N + n] + a[8 * N + n] * b[3 * N + n] +
0212                  a[10 * N + n] * b[10 * N + n] + a[11 * N + n] * b[15 * N + n];
0213   c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[2 * N + n] + a[8 * N + n] * b[4 * N + n] +
0214                  a[10 * N + n] * b[11 * N + n] + a[11 * N + n] * b[16 * N + n];
0215   c[8 * N + n] = a[6 * N + n] * b[3 * N + n] + a[7 * N + n] * b[4 * N + n] + a[8 * N + n] * b[5 * N + n] +
0216                  a[10 * N + n] * b[12 * N + n] + a[11 * N + n] * b[17 * N + n];
0217   c[9 * N + n] = a[6 * N + n] * b[6 * N + n] + a[7 * N + n] * b[7 * N + n] + a[8 * N + n] * b[8 * N + n] +
0218                  a[10 * N + n] * b[13 * N + n] + a[11 * N + n] * b[18 * N + n];
0219   c[10 * N + n] = a[6 * N + n] * b[10 * N + n] + a[7 * N + n] * b[11 * N + n] + a[8 * N + n] * b[12 * N + n] +
0220                   a[10 * N + n] * b[14 * N + n] + a[11 * N + n] * b[19 * N + n];
0221   c[11 * N + n] = a[6 * N + n] * b[15 * N + n] + a[7 * N + n] * b[16 * N + n] + a[8 * N + n] * b[17 * N + n] +
0222                   a[10 * N + n] * b[19 * N + n] + a[11 * N + n] * b[20 * N + n];
0223   c[12 * N + n] = a[12 * N + n] * b[0 * N + n] + a[13 * N + n] * b[1 * N + n] + a[14 * N + n] * b[3 * N + n] +
0224                   a[16 * N + n] * b[10 * N + n] + a[17 * N + n] * b[15 * N + n];
0225   c[13 * N + n] = a[12 * N + n] * b[1 * N + n] + a[13 * N + n] * b[2 * N + n] + a[14 * N + n] * b[4 * N + n] +
0226                   a[16 * N + n] * b[11 * N + n] + a[17 * N + n] * b[16 * N + n];
0227   c[14 * N + n] = a[12 * N + n] * b[3 * N + n] + a[13 * N + n] * b[4 * N + n] + a[14 * N + n] * b[5 * N + n] +
0228                   a[16 * N + n] * b[12 * N + n] + a[17 * N + n] * b[17 * N + n];
0229   c[15 * N + n] = a[12 * N + n] * b[6 * N + n] + a[13 * N + n] * b[7 * N + n] + a[14 * N + n] * b[8 * N + n] +
0230                   a[16 * N + n] * b[13 * N + n] + a[17 * N + n] * b[18 * N + n];
0231   c[16 * N + n] = a[12 * N + n] * b[10 * N + n] + a[13 * N + n] * b[11 * N + n] + a[14 * N + n] * b[12 * N + n] +
0232                   a[16 * N + n] * b[14 * N + n] + a[17 * N + n] * b[19 * N + n];
0233   c[17 * N + n] = a[12 * N + n] * b[15 * N + n] + a[13 * N + n] * b[16 * N + n] + a[14 * N + n] * b[17 * N + n] +
0234                   a[16 * N + n] * b[19 * N + n] + a[17 * N + n] * b[20 * N + n];
0235   c[18 * N + n] = a[18 * N + n] * b[0 * N + n] + a[19 * N + n] * b[1 * N + n] + a[20 * N + n] * b[3 * N + n];
0236   c[19 * N + n] = a[18 * N + n] * b[1 * N + n] + a[19 * N + n] * b[2 * N + n] + a[20 * N + n] * b[4 * N + n];
0237   c[20 * N + n] = a[18 * N + n] * b[3 * N + n] + a[19 * N + n] * b[4 * N + n] + a[20 * N + n] * b[5 * N + n];
0238   c[21 * N + n] = a[18 * N + n] * b[6 * N + n] + a[19 * N + n] * b[7 * N + n] + a[20 * N + n] * b[8 * N + n];
0239   c[22 * N + n] = a[18 * N + n] * b[10 * N + n] + a[19 * N + n] * b[11 * N + n] + a[20 * N + n] * b[12 * N + n];
0240   c[23 * N + n] = a[18 * N + n] * b[15 * N + n] + a[19 * N + n] * b[16 * N + n] + a[20 * N + n] * b[17 * N + n];
0241   c[24 * N + n] = a[24 * N + n] * b[0 * N + n] + a[25 * N + n] * b[1 * N + n] + a[26 * N + n] * b[3 * N + n];
0242   c[25 * N + n] = a[24 * N + n] * b[1 * N + n] + a[25 * N + n] * b[2 * N + n] + a[26 * N + n] * b[4 * N + n];
0243   c[26 * N + n] = a[24 * N + n] * b[3 * N + n] + a[25 * N + n] * b[4 * N + n] + a[26 * N + n] * b[5 * N + n];
0244   c[27 * N + n] = a[24 * N + n] * b[6 * N + n] + a[25 * N + n] * b[7 * N + n] + a[26 * N + n] * b[8 * N + n];
0245   c[28 * N + n] = a[24 * N + n] * b[10 * N + n] + a[25 * N + n] * b[11 * N + n] + a[26 * N + n] * b[12 * N + n];
0246   c[29 * N + n] = a[24 * N + n] * b[15 * N + n] + a[25 * N + n] * b[16 * N + n] + a[26 * N + n] * b[17 * N + n];
0247 }
0248 #endif