#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

namespace Matriplex {

  // Offsets from full row-major (i, j) indexing into packed symmetric
  // storage, indexed as gSymOffsets[D][i * D + j]. Only D = 2, 3, 5, 6
  // are populated.
  const idx_t gSymOffsets[7][36] = {
      {},
      {},
      {0, 1, 1, 2},
      {0, 1, 3, 1, 2, 4, 3, 4, 5},
      {},
      {0, 1, 3, 6, 10, 1, 2, 4, 7, 11, 3, 4, 5, 8, 12, 6, 7, 8, 9, 13, 10, 11, 12, 13, 14},
      {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
       6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
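
  // Illustrative sketch (not part of the API): for D = 3 the packed lower
  // triangle is stored as
  //     | 0     |
  //     | 1 2   |
  //     | 3 4 5 |
  // so both (i, j) = (2, 1) and (1, 2) map through gSymOffsets[3][i * 3 + j]
  // to packed element 4.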

  //==============================================================================
  // MatriplexSym
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
  public:
    typedef T value_type;

    /// no. of matrix rows
    static constexpr int kRows = D;
    /// no. of matrix columns
    static constexpr int kCols = D;
    /// no. of packed elements per matrix: the lower triangle, D * (D + 1) / 2
    static constexpr int kSize = (D + 1) * D / 2;
    /// size of the whole matriplex: N matrices of kSize elements
    static constexpr int kTotSize = N * kSize;

    T fArray[kTotSize];

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }

    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }

    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }

    T operator[](idx_t xx) const { return fArray[xx]; }
    T& operator[](idx_t xx) { return fArray[xx]; }

    const idx_t* offsets() const { return gSymOffsets[D]; }
    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

    // Element (i, j) of matrix n: the packed offset selects the element row,
    // n selects the slot within that row of N values.
    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }

    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

    T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }
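
    // Indexing sketch (hypothetical sizes): with D = 3, N = 8, element (1, 1)
    // of slot n = 2 lives at fArray[gSymOffsets[3][4] * 8 + 2] = fArray[2 * 8 + 2];
    // all N copies of a given matrix element are contiguous in memory.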

    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    MatriplexSym(const MatriplexSym& m) = default;

    // Copy slot n of m into slot n of *this.
    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }

    // Scatter a packed kSize-element array into slot n.
    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    // Copy slot 'in' of m into slot n of *this.
    void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = m[in];
      }
    }

    // Copy slot 'in' into slot n within *this.
    void copy(idx_t n, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }

#if defined(AVX512_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      const __m512 src = {0};
      // Mask off lanes beyond N_proc so the gathers and stores never touch them.
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }

    // Experimental AOS-to-SOA loaders; slurpIn() seems to be at least as fast.

    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // Stage each source object (assumed to span one 64-byte cache line, i.e.
      // one ZMM vector) into a contiguous scratch buffer, then gather from it
      // into SOA order.
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

    // First half of ChewIn: stage the source objects into the scratch buffer.
    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }

    // Second half of ChewIn: gather from the scratch buffer into SOA order.
    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

#elif defined(AVX2_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      // Build a lane mask that is set for lanes 0 .. N_proc - 1.
      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // Restore the mask; the gather intrinsic may clobber it.
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }

#else

    // Scalar fallback: gather kSize elements per slot; vi[j] is the element
    // offset of slot j's source matrix relative to arr. The N_proc == N case
    // is kept separate so the common full-width loop has a compile-time trip
    // count.
    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif
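
    // Usage sketch for the scalar slurpIn (hypothetical names): gather one
    // packed symmetric matrix per slot from an array-of-structures source.
    //   int idx[N];                // element offset of each source matrix in 'vals'
    //   plex.slurpIn(vals, idx);   // slot j reads vals[idx[j]] .. vals[idx[j] + kSize - 1]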

    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }

    // Set the upper-left 3x3 block of slot n to d * identity; packed indices
    // 0, 2, 5 are the diagonal elements (0,0), (1,1), (2,2).
    void setDiagonal3x3(idx_t n, T d) {
      T* p = fArray + n;

      p[0 * N] = d;
      p[1 * N] = 0;
      p[2 * N] = d;
      p[3 * N] = 0;
      p[4 * N] = 0;
      p[5 * N] = d;
    }

    // Set *this = a - b.
    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
#pragma omp simd
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }

      return *this;
    }

    // Add noise to the diagonal of the upper-left 3x3 block of every slot.
    void addNoiseIntoUpperLeft3x3(T noise) {
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }

    // Invert the upper-left 3x3 block of every slot in place, via cofactors.
    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
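
    // Sketch of the math above (the standard adjugate formula, not specific
    // to this file): S^-1 = adj(S) / det(S), with
    //   det(S) = s00 * c00 + s01 * c01 + s02 * c02,
    // where the cij are cofactors; symmetry means only six of the nine
    // cofactors need to be computed.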

    // Extract element (i, j) of every slot into a 1x1 Matriplex.
    Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
      Matriplex<T, 1, 1, N> t;
      for (idx_t n = 0; n < N; ++n) {
        t[n] = constAt(n, i, j);
      }
      return t;
    }
  };

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;

  //==============================================================================
  // Multiplications
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<T, 6, N>& A, const MPlexSym<T, 6, N>& B, MPlex<T, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };

  // C = A * B. The product of two symmetric matrices is in general not
  // symmetric, so C is a full D x D Matriplex.
  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
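
  // Usage sketch (hypothetical width; only D = 3 and D = 6 have kernels):
  //   MPlexSym<float, 6, 8> A, B;
  //   MPlex<float, 6, 6, 8> C;
  //   multiply(A, B, C);  // per-slot: C[n] = A[n] * B[n]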

  //==============================================================================
  // Cramer inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };

  // Invert each slot of A in place by Cramer's rule; if determ is non-null,
  // store each slot's determinant in determ[n].
  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }
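
  // Usage sketch (hypothetical width; only D = 2 and D = 3 have kernels):
  //   MPlexSym<float, 3, 8> A;
  //   double det[8];
  //   invertCramerSym(A, det);  // A[n] <- A[n]^-1; det[n] can flag singular slots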

  //==============================================================================
  // Cholesky inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };

  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Cholesky decomposition A = L * L^T; l0, l2, l5 end up holding the
        // inverses of the diagonal entries of L.
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // Decomposition done; now form the off-diagonal entries of L^-1.
        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        // A^-1 = L^-T * L^-1, written back into packed storage.
        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;
      }
    }
  };

  // Invert each slot of A in place via Cholesky decomposition; valid only for
  // positive-definite matrices (no singularity checks are performed).
  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }
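
  // Usage sketch (hypothetical width; only D = 3 has a kernel):
  //   MPlexSym<float, 3, 8> A;   // must be positive definite per slot
  //   invertCholeskySym(A);      // A[n] <- A[n]^-1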

}  // end namespace Matriplex

#endif