#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

//==============================================================================
// MatriplexSym
//==============================================================================

namespace Matriplex {

  const idx_t gSymOffsets[7][36] = {{},
                                    {},
                                    {0, 1, 1, 2},
                                    {0, 1, 3, 1, 2, 4, 3, 4, 5},  // 3
                                    {},
                                    {},
                                    {0, 1, 3, 6, 10, 15, 1,  2,  4,  7,  11, 16, 3,  4,  5,  8,  12, 17,
                                     6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
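
  // gSymOffsets[D] maps a full D x D index (i * D + j) to the position of that
  // element in the packed lower-triangle storage; e.g. for D = 3, element
  // (2, 1) maps to gSymOffsets[3][7] == 4. Only D = 2, 3 and 6 are populated.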

  //------------------------------------------------------------------------------

  template <typename T, idx_t D, idx_t N>
  class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
  public:
    typedef T value_type;

    /// no. of matrix rows
    static constexpr int kRows = D;
    /// no. of matrix columns
    static constexpr int kCols = D;
    /// no. of elements: lower triangle
    static constexpr int kSize = (D + 1) * D / 2;
    /// size of the whole matriplex
    static constexpr int kTotSize = N * kSize;

    T fArray[kTotSize];
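
    // Storage is structure-of-arrays: packed element k of matrix (lane) n
    // lives at fArray[k * N + n], so the N lanes of a given element are
    // contiguous and loops over n vectorize.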

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }

    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }

    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }

    T operator[](idx_t xx) const { return fArray[xx]; }
    T& operator[](idx_t xx) { return fArray[xx]; }

    const idx_t* offsets() const { return gSymOffsets[D]; }
    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }

    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

    T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }

    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    MatriplexSym(const MatriplexSym& m) = default;

    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }

    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = m[in];
      }
    }

    void copy(idx_t n, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }

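    // slurpIn() transposes kSize packed elements from N (or N_proc) separate
    // source matrices into the matriplex: lane j reads consecutive values
    // starting at arr + vi[j]. The intrinsics variants below do the same with
    // masked gathers.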
#if defined(AVX512_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);

      const __m512 src = {0};
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);

        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }

    // Experimental methods, slurpIn() seems to be at least as fast.
    // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().

    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }

    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

#elif defined(AVX2_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // Restore mask (docs say gather clears it but it doesn't seem to).
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }

#else

    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // Separate N_proc == N case (gains about 7% in fit test).
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif

    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }

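    // In packed 3x3 symmetric storage the diagonal sits at offsets 0, 2 and 5;
    // setDiagonal3x3() writes d there and zeroes the off-diagonal slots of
    // lane n.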
    void setDiagonal3x3(idx_t n, T d) {
      T* p = fArray + n;

      p[0 * N] = d;
      p[1 * N] = 0;
      p[2 * N] = d;
      p[3 * N] = 0;
      p[4 * N] = 0;
      p[5 * N] = d;
    }

    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
      // Does *this = a - b;

#pragma omp simd
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }

      return *this;
    }

    // ==================================================================
    // Operations specific to Kalman fit in 6 parameter space
    // ==================================================================

    void addNoiseIntoUpperLeft3x3(T noise) {
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }

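    // Inverts the upper-left 3x3 block of every lane in place via the cofactor
    // (Cramer) formula; the determinant is accumulated in double precision for
    // stability. Note there is no guard against a singular block.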
    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }

    Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
      Matriplex<T, 1, 1, N> t;
      for (idx_t n = 0; n < N; ++n) {
        t[n] = constAt(n, i, j);
      }
      return t;
    }
  };

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;
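
  // Usage sketch (illustrative only):
  //   MPlexSym<float, 3, 8> S;
  //   for (idx_t n = 0; n < 8; ++n)
  //     S.setDiagonal3x3(n, 2.0f);   // each lane holds 2 * I
  //   S.invertUpperLeft3x3();        // each lane now holds 0.5 * I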

  //==============================================================================
  // Multiplications
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<T, 6, N>& A, const MPlexSym<T, 6, N>& B, MPlex<T, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };

  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
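
  // Note that the product of two symmetric matrices is in general not
  // symmetric, hence C is a full MPlex. Usage sketch (illustrative only):
  //   MPlexSym<float, 6, 16> A, B;
  //   MPlex<float, 6, 6, 16> C;
  //   multiply(A, B, C);  // per-lane C[n] = A[n] * B[n]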

  //==============================================================================
  // Cramer inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };

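  // 2x2 packed layout is {(0,0), (0,1), (1,1)}: inversion swaps the diagonal
  // entries, negates the off-diagonal one, and scales everything by 1/det.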
  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }
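
  // Usage sketch (illustrative only): pass a determ array to detect singular
  // lanes, which would otherwise silently produce inf/NaN entries.
  //   MPlexSym<float, 3, 8> A;
  //   double det[8];
  //   invertCramerSym(A, det);
  //   for (int n = 0; n < 8; ++n)
  //     if (det[n] == 0.0) { /* lane n was singular */ }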

  //==============================================================================
  // Cholesky inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };

  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Cholesky decomposition A = L L^T: l0, l2, l5 hold the reciprocals of
        // the diagonal of L; l1, l3, l4 its off-diagonal entries.
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // Decomposition done; invert L in place, then form A^-1 = L^-T L^-1.

        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;

        // The m(2,*) entries are all zero if anything went wrong at l5;
        // everything is zero if anything went wrong already at l0 or l2.
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }
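
  // Cholesky inversion is only valid for positive-definite input (e.g. a
  // well-formed covariance matrix); unlike invertCramerSym() there is no
  // determinant output to check, so failures surface only in the result.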

}  // end namespace Matriplex

#endif