#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

#include <cmath>      // std::sqrt (Cholesky inversion)
#include <cstring>    // memcpy (operator=)
#include <stdexcept>  // std::runtime_error (unsupported-dimension fallbacks)

//==============================================================================
// MatriplexSym
//==============================================================================

namespace Matriplex {

  const idx_t gSymOffsets[7][36] = {{},
                                    {},
                                    {0, 1, 1, 2},
                                    {0, 1, 3, 1, 2, 4, 3, 4, 5},  // 3
                                    {},
                                    {},
                                    {0, 1, 3, 6, 10, 15, 1,  2,  4,  7,  11, 16, 3,  4,  5,  8,  12, 17,
                                     6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
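
  // Note on gSymOffsets: row D of this table maps a full row-major index
  // (i * D + j) to the packed lower-triangle slot that stores element (i, j).
  // Because the matrix is symmetric, (i, j) and (j, i) map to the same slot.
  // For example, with D = 3: off(2 * 3 + 1) = 4, so element (2,1) -- and its
  // mirror (1,2) -- lives in packed slot 4 of the (D + 1) * D / 2 = 6 slots.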

  //------------------------------------------------------------------------------

  template <typename T, idx_t D, idx_t N>
  class MatriplexSym {
  public:
    typedef T value_type;

    /// no. of matrix rows
    static constexpr int kRows = D;
    /// no. of matrix columns
    static constexpr int kCols = D;
    /// no. of elements: lower triangle
    static constexpr int kSize = (D + 1) * D / 2;
    /// size of the whole matriplex
    static constexpr int kTotSize = N * kSize;

    T fArray[kTotSize] __attribute__((aligned(64)));

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }

    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }

    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }

    T operator[](idx_t xx) const { return fArray[xx]; }
    T& operator[](idx_t xx) { return fArray[xx]; }

    const idx_t* offsets() const { return gSymOffsets[D]; }
    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }

    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

    T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }
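
    // Storage layout note: element (i, j) of matrix n sits at
    // fArray[off(i * D + j) * N + n], i.e. the same element of all N matrices
    // is stored contiguously ("structure of arrays"). A loop over n therefore
    // touches consecutive memory and vectorizes cleanly across the matriplex.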

    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }

    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = m[in];
      }
    }

    void copy(idx_t n, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }

#if defined(AVX512_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);

      const __m512 src = {0};
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);

        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }

    // Experimental methods; slurpIn() seems to be at least as fast.
    // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().

    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }

    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

#elif defined(AVX2_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // Restore mask (docs say gather clears it but it doesn't seem to).
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }

#else

    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // Separate N_proc == N case (gains about 7% in fit test).
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif
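
    // All slurpIn() variants perform the same transpose-gather: for each of
    // the kSize packed elements they collect that element from N (or N_proc)
    // source matrices, the j-th of which starts at offset vi[j] from 'arr'
    // (in units of T for the scalar version, of U for the intrinsic ones).
    // Scalar-equivalent usage sketch, with hypothetical packed inputs laid
    // out in one array 'data' at element offsets starts[j]:
    //
    //   int vi[N];
    //   for (int j = 0; j < N; ++j) vi[j] = starts[j];
    //   plex.slurpIn(data, vi);  // plex(j, r, c) now holds matrix j's (r, c)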

    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }

    void setDiagonal3x3(idx_t n, T d) {
      T* p = fArray + n;

      p[0 * N] = d;
      p[1 * N] = 0;
      p[2 * N] = d;
      p[3 * N] = 0;
      p[4 * N] = 0;
      p[5 * N] = d;
    }
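
    // For D = 3 the six packed slots hold, in order:
    //   0:(0,0)  1:(1,0)  2:(1,1)  3:(2,0)  4:(2,1)  5:(2,2)
    // which is why setDiagonal3x3() writes d into slots 0, 2 and 5 and zeroes
    // the off-diagonal slots 1, 3 and 4. The 3x3 routines below use the same
    // slot numbering.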

    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
      // Does *this = a - b;

#pragma omp simd
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }

      return *this;
    }

    // ==================================================================
    // Operations specific to Kalman fit in 6 parameter space
    // ==================================================================

    void addNoiseIntoUpperLeft3x3(T noise) {
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }

    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Cofactors of the symmetric 3x3 block (packed slots 0..5).
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;
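
  // Usage sketch (illustrative only; real call sites live in the mkFit
  // tracking code, and the values and width 8 are made up for the example):
  //
  //   MPlexSym<float, 3, 8> cov(0.0f);  // 8 symmetric 3x3 matrices
  //   cov.setDiagonal3x3(0, 1.0f);      // matrix 0 becomes the identity
  //   cov(0, 2, 1) = 0.5f;              // also sets (1, 2) -- shared slot
  //   cov.invertUpperLeft3x3();         // invert all 8 in lock step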

  //==============================================================================
  // Multiplications
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<T, 6, N>& A, const MPlexSym<T, 6, N>& B, MPlex<T, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };

  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
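
  // Note that C is a full (non-symmetric) MPlex: the product of two symmetric
  // matrices is in general not symmetric. Only D = 3 and D = 6 are
  // implemented; other dimensions hit the runtime_error in the generic
  // template above.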

  //==============================================================================
  // Cramer inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }
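
  // If 'determ' is non-null it receives the per-matrix determinants, which
  // callers can use to detect singular or near-singular matrices; the
  // inversion itself does not guard against det == 0.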

  //==============================================================================
  // Cholesky inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };

  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // decomposition done

        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;

        // The row-2 entries m(2,x) come out all zero if anything went wrong
        // at l5; the whole result is zero if anything already went wrong at
        // l0 or l2.
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }

}  // end namespace Matriplex

#endif