// File indexing completed on 2024-04-06 12:28:18
0001 #ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
0002 #define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
0003
0004 #include "MatriplexCommon.h"
0005 #include "Matriplex.h"
0006
0007
0008
0009
0010
0011 namespace Matriplex {
0012
0013 const idx_t gSymOffsets[7][36] = {{},
0014 {},
0015 {0, 1, 1, 2},
0016 {0, 1, 3, 1, 2, 4, 3, 4, 5},
0017 {},
0018 {},
0019 {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
0020 6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
0021
0022
0023
// MatriplexSym: N symmetric DxD matrices stored in structure-of-arrays
// (matriplex) form for SIMD processing.  Only the lower triangle plus
// diagonal (kSize elements) is stored per matrix; packed element e of
// matrix n lives at fArray[e * N + n], so a vector lane sweeps across
// the N matrices for a fixed element.
template <typename T, idx_t D, idx_t N>
class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
public:
  typedef T value_type;

  /// return no. of matrix rows
  static constexpr int kRows = D;
  /// return no. of matrix columns
  static constexpr int kCols = D;
  /// return no of elements: lower triangle plus diagonal
  static constexpr int kSize = (D + 1) * D / 2;
  /// size of the whole matriplex: what is stored
  static constexpr int kTotSize = N * kSize;

  T fArray[kTotSize];

  MatriplexSym() {}
  // Fill-constructor; intentionally implicit so scalar literals convert.
  MatriplexSym(T v) { setVal(v); }

  // Number of matrices held in this plex.
  idx_t plexSize() const { return N; }

  // Set every element of every matrix to v.
  void setVal(T v) {
    for (idx_t i = 0; i < kTotSize; ++i) {
      fArray[i] = v;
    }
  }

  // Element-wise in-place addition of another plex.
  void add(const MatriplexSym& v) {
    for (idx_t i = 0; i < kTotSize; ++i) {
      fArray[i] += v.fArray[i];
    }
  }

  // Multiply every element by a scalar.
  void scale(T scale) {
    for (idx_t i = 0; i < kTotSize; ++i) {
      fArray[i] *= scale;
    }
  }

  // Raw access into the packed SoA array (xx = element * N + n).
  T operator[](idx_t xx) const { return fArray[xx]; }
  T& operator[](idx_t xx) { return fArray[xx]; }

  // Lookup table mapping full (i, j) positions to packed sym offsets.
  const idx_t* offsets() const { return gSymOffsets[D]; }
  idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

  // Element (i, j) of matrix n; symmetry is handled by the offset table,
  // so (i, j) and (j, i) alias the same storage.
  const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }

  T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

  T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
  const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }

  // Raw-memory assignment; fArray is the only member, so memcpy suffices
  // (assumes trivially copyable T, as does the whole class).
  MatriplexSym& operator=(const MatriplexSym& m) {
    memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
    return *this;
  }

  MatriplexSym(const MatriplexSym& m) = default;

  // Copy matrix n of m into matrix n of this (strided walk over one lane).
  void copySlot(idx_t n, const MatriplexSym& m) {
    for (idx_t i = n; i < kTotSize; i += N) {
      fArray[i] = m.fArray[i];
    }
  }

  // Scatter a contiguous packed array (kSize values) into lane n.
  void copyIn(idx_t n, const T* arr) {
    for (idx_t i = n; i < kTotSize; i += N) {
      fArray[i] = *(arr++);
    }
  }

  // Copy lane 'in' of m into lane n of this.
  void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
    for (idx_t i = n; i < kTotSize; i += N, in += N) {
      fArray[i] = m[in];
    }
  }

  // Copy lane 'in' into lane n within this plex.
  void copy(idx_t n, idx_t in) {
    for (idx_t i = n; i < kTotSize; i += N, in += N) {
      fArray[i] = fArray[in];
    }
  }

#if defined(AVX512_INTRINSICS)

  // Gather kSize packed elements for up to N_proc matrices from arr,
  // using per-lane byte offsets in vi, and store them lane-aligned into
  // fArray.  NOTE(review): uses 16-lane __m512/__mmask16 ops — presumably
  // N == 16 for float here; confirm for other T/N instantiations.
  template <typename U>
  void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
    // Zero source for masked-off gather lanes.
    const __m512 src = {0};
    // Mask with the low N_proc lanes active (-1 == all 16 lanes).
    const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

    for (int i = 0; i < kSize; ++i, ++arr) {
      // arr advances by one element per iteration; vi supplies the
      // per-matrix displacement, scaled by sizeof(U).
      __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
      _mm512_mask_store_ps(&fArray[i * N], k, reg);
    }
  }

  // Two-stage load: stage rows of arr into tmp with aligned 64-byte
  // vector copies, then gather from tmp into matriplex order.
  // NOTE(review): writes through the const char* tmp via a cast, and
  // assumes arr + vi[i] is 64-byte aligned — verify at call sites.
  void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
    // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

    for (int i = 0; i < N; ++i) {
      __m512 reg = _mm512_load_ps(arr + vi[i]);
      _mm512_store_ps((void*)(tmp + 64 * i), reg);
    }

    for (int i = 0; i < kSize; ++i) {
      __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
      _mm512_store_ps(&fArray[i * N], reg);
    }
  }

  // Stage-only half of ChewIn: copy one 64-byte row per lane into tmp.
  void Contaginate(const char* arr, int vi[N], const char* tmp) {
    // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

    for (int i = 0; i < N; ++i) {
      __m512 reg = _mm512_load_ps(arr + vi[i]);
      _mm512_store_ps((void*)(tmp + 64 * i), reg);
    }
  }

  // Gather-only half of ChewIn: transpose staged rows into plex order.
  void Plexify(const char* tmp, __m512i& ui) {
    for (int i = 0; i < kSize; ++i) {
      __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
      _mm512_store_ps(&fArray[i * N], reg);
    }
  }

#elif defined(AVX2_INTRINSICS)

  // AVX2 variant of slurpIn (8 lanes), same contract as the AVX512 one.
  template <typename U>
  void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
    const __m256 src = {0};

    // Build a lane mask with the low N_proc lanes set: lane j active iff j < N_proc.
    __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i k_sel = _mm256_set1_epi32(N_proc);
    __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

    k = k_master;
    for (int i = 0; i < kSize; ++i, ++arr) {
      __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
      // Restore the mask: the gather consumes/clobbers its mask operand.
      k = k_master;
      _mm256_maskstore_ps(&fArray[i * N], k, reg);
    }
  }

#else

  // Scalar fallback for slurpIn: arr + vi[j] addresses the packed data
  // of matrix j; element i is at displacement i from there.
  void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
    // Separate full-plex path so the common case has a fixed trip count.
    if (N_proc == N) {
      for (int i = 0; i < kSize; ++i) {
        for (int j = 0; j < N; ++j) {
          fArray[i * N + j] = *(arr + i + vi[j]);
        }
      }
    } else {
      for (int i = 0; i < kSize; ++i) {
        for (int j = 0; j < N_proc; ++j) {
          fArray[i * N + j] = *(arr + i + vi[j]);
        }
      }
    }
  }

#endif

  // Inverse of copyIn: pack lane n out into a contiguous kSize array.
  void copyOut(idx_t n, T* arr) const {
    for (idx_t i = n; i < kTotSize; i += N) {
      *(arr++) = fArray[i];
    }
  }

  // Set the upper-left 3x3 block of matrix n to d * Identity.  Packed
  // indices 0..5 are exactly that block for any D >= 3.
  void setDiagonal3x3(idx_t n, T d) {
    T* p = fArray + n;

    p[0 * N] = d;
    p[1 * N] = 0;
    p[2 * N] = d;
    p[3 * N] = 0;
    p[4 * N] = 0;
    p[5 * N] = d;
  }

  // this = a - b, element-wise over the whole plex.
  MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
#pragma omp simd
    for (idx_t i = 0; i < kTotSize; ++i) {
      fArray[i] = a.fArray[i] - b.fArray[i];
    }

    return *this;
  }

  // Add 'noise' to the three diagonal entries of the upper-left 3x3
  // block of every matrix (packed indices 0, 2, 5 are the diagonal).
  void addNoiseIntoUpperLeft3x3(T noise) {
    T* p = fArray;
    ASSUME_ALIGNED(p, 64);

#pragma omp simd
    for (idx_t n = 0; n < N; ++n) {
      p[0 * N + n] += noise;
      p[2 * N + n] += noise;
      p[5 * N + n] += noise;
    }
  }

  // In-place Cramer inversion of the upper-left 3x3 symmetric block of
  // every matrix; cofactors in T, determinant accumulated in double for
  // accuracy.  No singularity check — a zero determinant yields inf/nan.
  void invertUpperLeft3x3() {
    typedef T TT;

    T* a = fArray;
    ASSUME_ALIGNED(a, 64);

#pragma omp simd
    for (idx_t n = 0; n < N; ++n) {
      const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
      const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
      const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
      const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
      const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
      const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

      // Expansion along the first column (packed 0, 1, 3).
      const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
      const TT s = TT(1) / det;

      a[0 * N + n] = s * c00;
      a[1 * N + n] = s * c01;
      a[2 * N + n] = s * c11;
      a[3 * N + n] = s * c02;
      a[4 * N + n] = s * c12;
      a[5 * N + n] = s * c22;
    }
  }

  // Extract element (i, j) of every matrix as a 1x1 matriplex.
  Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
    Matriplex<T, 1, 1, N> t;
    for (idx_t n = 0; n < N; ++n) {
      t[n] = constAt(n, i, j);
    }
    return t;
  }
};
0277
// Short-hand alias used throughout mkFit code.
template <typename T, idx_t D, idx_t N>
using MPlexSym = MatriplexSym<T, D, N>;
0280
0281
0282
0283
0284
// Primary template: dispatch target for sym * sym multiplication.
// Only specializations for supported dimensions (3 and 6 below) do real
// work; any other D lands here and fails loudly at run time.
template <typename T, idx_t D, idx_t N>
struct SymMultiplyCls {
  static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    throw std::runtime_error("general symmetric multiplication not supported");
  }
};
0291
// 3x3 specialization: C = A * B with A, B symmetric.  The per-element
// product kernel is generated code pulled in via the .ah include —
// either an explicit-intrinsics version (vector-width strides) or a
// plain scalar loop that the compiler vectorizes via omp simd.
template <typename T, idx_t N>
struct SymMultiplyCls<T, 3, N> {
  static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
    const T* a = A.fArray;
    ASSUME_ALIGNED(a, 64);
    const T* b = B.fArray;
    ASSUME_ALIGNED(b, 64);
    T* c = C.fArray;
    ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

    // Step by one full vector register (64 bytes) worth of lanes.
    for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
    }

#else

#pragma omp simd
    for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
    }

#endif
  }
};
0318
0319 template <typename T, idx_t N>
0320 struct SymMultiplyCls<T, 6, N> {
0321 static void multiply(const MPlexSym<float, 6, N>& A, const MPlexSym<float, 6, N>& B, MPlex<float, 6, 6, N>& C) {
0322 const T* a = A.fArray;
0323 ASSUME_ALIGNED(a, 64);
0324 const T* b = B.fArray;
0325 ASSUME_ALIGNED(b, 64);
0326 T* c = C.fArray;
0327 ASSUME_ALIGNED(c, 64);
0328
0329 #ifdef MPLEX_INTRINSICS
0330
0331 for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
0332 #include "intr_sym_6x6.ah"
0333 }
0334
0335 #else
0336
0337 #pragma omp simd
0338 for (idx_t n = 0; n < N; ++n) {
0339 #include "std_sym_6x6.ah"
0340 }
0341
0342 #endif
0343 }
0344 };
0345
// Public entry point: C = A * B for symmetric A, B; dispatches to the
// dimension-specific specialization (throws for unsupported D).
template <typename T, idx_t D, idx_t N>
void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
  SymMultiplyCls<T, D, N>::multiply(A, B, C);
}
0350
0351
0352
0353
0354
// Primary template: Cramer-rule inversion dispatch target.  Only the
// 2x2 and 3x3 specializations below are implemented; other dimensions
// fail loudly at run time.
template <typename T, idx_t D, idx_t N>
struct CramerInverterSym {
  static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    throw std::runtime_error("general cramer inversion not supported");
  }
};
0361
0362 template <typename T, idx_t N>
0363 struct CramerInverterSym<T, 2, N> {
0364 static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
0365 typedef T TT;
0366
0367 T* a = A.fArray;
0368 ASSUME_ALIGNED(a, 64);
0369
0370 #pragma omp simd
0371 for (idx_t n = 0; n < N; ++n) {
0372
0373 const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
0374 if (determ)
0375 determ[n] = det;
0376
0377 const TT s = TT(1) / det;
0378 const TT tmp = s * a[2 * N + n];
0379 a[1 * N + n] *= -s;
0380 a[2 * N + n] = s * a[0 * N + n];
0381 a[0 * N + n] = tmp;
0382 }
0383 }
0384 };
0385
0386 template <typename T, idx_t N>
0387 struct CramerInverterSym<T, 3, N> {
0388 static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
0389 typedef T TT;
0390
0391 T* a = A.fArray;
0392 ASSUME_ALIGNED(a, 64);
0393
0394 #pragma omp simd
0395 for (idx_t n = 0; n < N; ++n) {
0396 const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
0397 const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
0398 const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
0399 const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
0400 const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
0401 const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];
0402
0403
0404 const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
0405 if (determ)
0406 determ[n] = det;
0407
0408 const TT s = TT(1) / det;
0409 a[0 * N + n] = s * c00;
0410 a[1 * N + n] = s * c01;
0411 a[2 * N + n] = s * c11;
0412 a[3 * N + n] = s * c02;
0413 a[4 * N + n] = s * c12;
0414 a[5 * N + n] = s * c22;
0415 }
0416 }
0417 };
0418
// Public entry point for in-place Cramer inversion; optional determ
// array (length N) receives per-matrix determinants.  Throws for
// unsupported D.
template <typename T, idx_t D, idx_t N>
void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
  CramerInverterSym<T, D, N>::invert(A, determ);
}
0423
0424
0425
0426
0427
// Primary template: Cholesky inversion dispatch target.  Only the 3x3
// specialization below is implemented; other dimensions fail loudly.
template <typename T, idx_t D, idx_t N>
struct CholeskyInverterSym {
  static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
};
0432
// 3x3 specialization: in-place inversion via Cholesky decomposition
// A = L L^T, then A^-1 = L^-T L^-1.  Requires A positive definite
// (sqrt of a non-positive value otherwise yields nan); no determinant
// output.  Packed layout per lane: [m00, m10, m11, m20, m21, m22].
template <typename T, idx_t N>
struct CholeskyInverterSym<T, 3, N> {
  static void invert(MPlexSym<T, 3, N>& A) {
    typedef T TT;

    T* a = A.fArray;

#pragma omp simd
    for (idx_t n = 0; n < N; ++n) {
      // Forward pass: l0 = 1/L00, l2 = 1/L11, l5 = 1/L22, and the
      // off-diagonal factors l1 = L10, l3 = L20, l4 = L21.
      TT l0 = std::sqrt(T(1) / a[0 * N + n]);
      TT l1 = a[1 * N + n] * l0;
      TT l2 = a[2 * N + n] - l1 * l1;
      l2 = std::sqrt(T(1) / l2);
      TT l3 = a[3 * N + n] * l0;
      TT l4 = (a[4 * N + n] - l1 * l3) * l2;
      TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
      l5 = std::sqrt(T(1) / l5);

      // Overwrite l1, l3, l4 with the off-diagonal entries of L^-1.
      // NOTE: order matters — l3 must be updated before l1 and l4,
      // which it reads in their original meaning.
      l3 = (l1 * l4 * l2 - l3) * l0 * l5;
      l1 = -l1 * l0 * l2;
      l4 = -l4 * l2 * l5;

      // A^-1 = L^-T L^-1, written back into the packed sym layout.
      a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
      a[1 * N + n] = l3 * l4 + l1 * l2;
      a[2 * N + n] = l4 * l4 + l2 * l2;
      a[3 * N + n] = l3 * l5;
      a[4 * N + n] = l4 * l5;
      a[5 * N + n] = l5 * l5;
    }
  }
};
0469
// Public entry point for in-place Cholesky inversion (positive-definite
// matrices only); throws for unsupported D.
template <typename T, idx_t D, idx_t N>
void invertCholeskySym(MPlexSym<T, D, N>& A) {
  CholeskyInverterSym<T, D, N>::invert(A);
}
0474
0475 }
0476
0477 #endif