#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

namespace Matriplex {

  // Offsets from a full row-major D x D index into the packed symmetric
  // storage (lower triangle plus diagonal). Only D = 2, 3 and 6 are filled
  // in; the other rows are intentionally empty.
  const idx_t gSymOffsets[7][36] = {{},
                                    {},
                                    {0, 1, 1, 2},
                                    {0, 1, 3, 1, 2, 4, 3, 4, 5},
                                    {},
                                    {},
                                    {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
                                     6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
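
  // Illustrative mapping for D = 3: the packed array holds the lower triangle
  // {a00, a10, a11, a20, a21, a22}, and gSymOffsets[3][i * 3 + j] folds any
  // full (i, j) index onto it, e.g.
  //
  //   gSymOffsets[3][0 * 3 + 0] == 0   // (0,0) -> a00
  //   gSymOffsets[3][1 * 3 + 2] == 4   // (1,2) aliases (2,1) -> a21
  //   gSymOffsets[3][2 * 3 + 2] == 5   // (2,2) -> a22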

  template <typename T, idx_t D, idx_t N>
  class MatriplexSym {
  public:
    typedef T value_type;

    // Matrix dimensions.
    static constexpr int kRows = D;
    static constexpr int kCols = D;
    // Unique elements per symmetric matrix: lower triangle plus diagonal.
    static constexpr int kSize = (D + 1) * D / 2;
    // Total elements held for all N matrices of the plex.
    static constexpr int kTotSize = N * kSize;

    // Struct-of-arrays storage: element k of all N matrices is contiguous.
    T fArray[kTotSize] __attribute__((aligned(64)));

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }

    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }

    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }

    T operator[](idx_t xx) const { return fArray[xx]; }
    T& operator[](idx_t xx) { return fArray[xx]; }

    const idx_t* offsets() const { return gSymOffsets[D]; }
    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

    // Element (i, j) of the matrix in slot n; (i, j) and (j, i) share storage.
    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }

    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

    T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }
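
    // Usage sketch (illustrative, assuming a 3x3 float plex of width 8):
    //
    //   MatriplexSym<float, 3, 8> S;
    //   S.setVal(0.0f);
    //   S(0, 1, 2) = 1.5f;                   // slot 0, element (1,2)
    //   assert(S.constAt(0, 2, 1) == 1.5f);  // (2,1) aliases the same element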

    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    // Copy slot n of m into slot n of *this.
    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }

    // Scatter a packed kSize-element array into slot n.
    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    // Copy slot 'in' of m into slot n of *this.
    void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = m[in];
      }
    }

    // Copy slot 'in' into slot n within the same plex.
    void copy(idx_t n, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }
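
    // Round-trip sketch (illustrative, assuming a 3x3 float plex; copyOut is
    // defined further below):
    //
    //   float packed[6] = {1, 0, 1, 0, 0, 1};  // identity, packed lower triangle
    //   MatriplexSym<float, 3, 8> S;
    //   S.copyIn(0, packed);   // scatter packed[] into slot 0
    //   S.copyOut(0, packed);  // gather slot 0 back into packed[]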

#if defined(AVX512_INTRINSICS)

    // Gather the kSize packed elements of up to N_proc source matrices into
    // matriplex order: vi holds one index per slot (in units of sizeof(U))
    // relative to arr.
    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      const __m512 src = {0};
      // All lanes active for a full plex, otherwise only the low N_proc lanes.
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }

    // Two-step variant: stage one 64-byte block per slot into the aligned
    // scratch buffer tmp, then gather from there into matriplex order.
    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

    // First half of ChewIn: stage one 64-byte block per slot into tmp.
    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }

    // Second half of ChewIn: gather from the staged buffer into matriplex order.
    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

#elif defined(AVX2_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      // Build a vector mask covering the low N_proc lanes.
      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // The gather consumes the mask, so restore it before the store.
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }

#else

    // Scalar fallback: element i of slot j is read from arr + i + vi[j].
    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // The full-plex case is kept separate so the inner loop has a
      // compile-time trip count.
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif
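
    // Usage sketch for the scalar fallback (illustrative): gather three
    // packed 3x3 matrices stored back-to-back in an array-of-structures
    // buffer into the first three slots.
    //
    //   float data[3 * 6];                      // 3 packed symmetric 3x3 matrices
    //   int vi[8] = {0, 6, 12, 0, 0, 0, 0, 0};  // start offset of each source
    //   MatriplexSym<float, 3, 8> S;
    //   S.slurpIn(data, vi, 3);                 // fill slots 0..2, leave 3..7 untouched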

    // Gather slot n into a packed kSize-element array.
    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }

    // Set the 3x3 matrix in slot n to d * I; packed indices 0, 2, 5 are the
    // diagonal, 1, 3, 4 the off-diagonal elements.
    void setDiagonal3x3(idx_t n, T d) {
      T* p = fArray + n;

      p[0 * N] = d;
      p[1 * N] = 0;
      p[2 * N] = d;
      p[3 * N] = 0;
      p[4 * N] = 0;
      p[5 * N] = d;
    }

    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
#pragma omp simd
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }

      return *this;
    }

    // Add noise to the diagonal of the upper-left 3x3 block of every matrix
    // in the plex (packed indices 0, 2, 5).
    void addNoiseIntoUpperLeft3x3(T noise) {
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }

    // Invert the upper-left 3x3 block of every matrix in place by cofactor
    // expansion; the determinant is accumulated in double for accuracy.
    // There is no guard against a vanishing determinant.
    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;

  // Symmetric times symmetric multiplication, C = A * B. Only D = 3 and
  // D = 6 have kernels (in the included .ah files); the product of two
  // symmetric matrices is general, hence the full MPlex result.
  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      // One 64-byte vector's worth of slots per iteration.
      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<T, 6, N>& A, const MPlexSym<T, 6, N>& B, MPlex<T, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };

  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
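
  // Usage sketch (illustrative):
  //
  //   MPlexSym<float, 3, 8> A, B;
  //   MPlex<float, 3, 3, 8> C;
  //   A.setVal(1.0f);
  //   B.setVal(2.0f);
  //   multiply(A, B, C);  // per slot n, C[n] = A[n] * B[n]; here every C element is 6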

  // In-place Cramer (adjugate) inversion for symmetric 2x2 and 3x3 plexes.
  // The determinant is computed in double and optionally reported through
  // determ; there is no guard against singular input.
  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Packed layout for D = 2: [0] = a00, [1] = a10, [2] = a11.
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Cofactors of the symmetric 3x3 matrix.
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }
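
  // Usage sketch (illustrative): invert a plex of 3x3 covariance matrices
  // and use the reported determinants to flag slots where the result is not
  // a valid inverse.
  //
  //   MPlexSym<float, 3, 8> cov;
  //   double det[8];
  //   // ... fill cov ...
  //   invertCramerSym(cov, det);
  //   // slots with det[n] == 0 were singular; their contents are garbage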

  // In-place Cholesky inversion for symmetric positive-definite 3x3 plexes:
  // factor A = L L^T, invert the triangular factor, then form
  // A^-1 = L^-T L^-1.
  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };

  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Cholesky factor L, with the diagonal kept as reciprocals
        // (l0, l2, l5).
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // Off-diagonal elements of L^-1.
        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        // A^-1 = L^-T L^-1, packed back into symmetric storage.
        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }
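
  // Usage sketch (illustrative): Cholesky inversion assumes each slot holds
  // a positive-definite matrix, as a well-formed covariance matrix is.
  //
  //   MPlexSym<float, 3, 8> cov;
  //   for (int n = 0; n < 8; ++n)
  //     cov.setDiagonal3x3(n, 4.0f);  // every slot = 4 * I
  //   invertCholeskySym(cov);         // every slot becomes 0.25 * I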

}  // end namespace Matriplex

#endif