CaloRecHit/interface/MultifitComputations.h

0001 #ifndef DataFormats_CaloRecHit_interface_MultifitComputations_h
0002 #define DataFormats_CaloRecHit_interface_MultifitComputations_h
0003
0004 #include <cmath>
0005 #include <limits>
0006 #include <type_traits>
0007
0008 #include <Eigen/Dense>
0009
0010 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0011
0012 namespace calo {
0013   namespace multifit {
0014
0015     template <int NROWS, int NCOLS>
0016     using ColMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::ColMajor>;
0017
0018     template <int NROWS, int NCOLS>
0019     using RowMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::RowMajor>;
0020
0021     template <int SIZE, typename T = float>
0022     using ColumnVector = Eigen::Matrix<T, SIZE, 1>;
0023
0024     template <int SIZE, typename T = float>
0025     using RowVector = Eigen::Matrix<T, 1, SIZE>;
0026
0027     // FIXME: provide specialization for Row Major layout
0028     template <typename T, int Stride, int Order = Eigen::ColMajor>
0029     struct MapSymM {
0030       using type = T;
0031       using base_type = typename std::remove_const<type>::type;
0032
0033       static constexpr int total = Stride * (Stride + 1) / 2;
0034       static constexpr int stride = Stride;
0035       T* data;
0036
0037       EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC MapSymM(T* data) : data{data} {}
0038
0039       EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC T const& operator()(int const row, int const col) const {
0040         auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
0041         auto const index = total - tmp + row - col;
0042         return data[index];
0043       }
0044
0045       template <typename U = T>
0046       EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
0047       operator()(int const row, int const col) {
0048         auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
0049         auto const index = total - tmp + row - col;
0050         return data[index];
0051       }
0052     };
0053
0054     // FIXME: either use/modify/improve eigen or make this more generic
0055     // this is a map for a pulse matrix to building a 2d matrix for each channel
0056     // and hide indexing
0057     template <typename T>
0058     struct MapMForPM {
0059       using type = T;
0060       using base_type = typename std::remove_cv<type>::type;
0061
0062       type* data;
0063       EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC MapMForPM(type* data) : data{data} {}
0064
0065       EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC base_type operator()(int const row, int const col) const {
0066         auto const index = 2 - col + row;
0067         return index >= 0 ? data[index] : 0;
0068       }
0069     };
0070
0071     // simple/trivial cholesky decomposition impl
0072     template <typename MatrixType1, typename MatrixType2>
0073     EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
0074       auto const sqrtm_0_0 = std::sqrt(M(0, 0));
0075       L(0, 0) = sqrtm_0_0;
0076       using T = typename MatrixType1::base_type;
0077
0078       CMS_UNROLL_LOOP
0079       for (int i = 1; i < MatrixType1::stride; i++) {
0080         T sumsq{0};
0081         for (int j = 0; j < i; j++) {
0082           T sumsq2{0};
0083           auto const m_i_j = M(i, j);
0084           for (int k = 0; k < j; ++k)
0085             sumsq2 += L(i, k) * L(j, k);
0086
0087           auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
0088           L(i, j) = value_i_j;
0089
0090           sumsq += value_i_j * value_i_j;
0091         }
0092
0093         auto const l_i_i = std::sqrt(M(i, i) - sumsq);
0094         L(i, i) = l_i_i;
0095       }
0096     }
0097
0098     template <typename MatrixType1, typename MatrixType2>
0099     EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition(MatrixType1& L,
0100                                                                      MatrixType2 const& M,
0101                                                                      int const N) {
0102       auto const sqrtm_0_0 = std::sqrt(M(0, 0));
0103       L(0, 0) = sqrtm_0_0;
0104       using T = typename MatrixType1::base_type;
0105
0106       for (int i = 1; i < N; i++) {
0107         T sumsq{0};
0108         for (int j = 0; j < i; j++) {
0109           T sumsq2{0};
0110           auto const m_i_j = M(i, j);
0111           for (int k = 0; k < j; ++k)
0112             sumsq2 += L(i, k) * L(j, k);
0113
0114           auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
0115           L(i, j) = value_i_j;
0116
0117           sumsq += value_i_j * value_i_j;
0118         }
0119
0120         auto const l_i_i = std::sqrt(M(i, i) - sumsq);
0121         L(i, i) = l_i_i;
0122       }
0123     }
0124
0125     template <typename MatrixType1, typename MatrixType2, typename VectorType>
0126     EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_forwardsubst_with_offsets(
0127         MatrixType1& L,
0128         MatrixType2 const& M,
0129         float b[MatrixType1::stride],
0130         VectorType const& Atb,
0131         int const N,
0132         ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
0133       auto const real_0 = pulseOffsets(0);
0134       auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0));
0135       L(0, 0) = sqrtm_0_0;
0136       using T = typename MatrixType1::base_type;
0137       b[0] = Atb(real_0) / sqrtm_0_0;
0138
0139       for (int i = 1; i < N; i++) {
0140         auto const i_real = pulseOffsets(i);
0141         T sumsq{0};
0142         T total = 0;
0143         auto const atb = Atb(i_real);
0144         for (int j = 0; j < i; j++) {
0145           auto const j_real = pulseOffsets(j);
0146           T sumsq2{0};
0147           auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
0148           for (int k = 0; k < j; ++k)
0149             sumsq2 += L(i, k) * L(j, k);
0150
0151           auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
0152           L(i, j) = value_i_j;
0153
0154           sumsq += value_i_j * value_i_j;
0155           total += value_i_j * b[j];
0156         }
0157
0158         auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
0159         L(i, i) = l_i_i;
0160         b[i] = (atb - total) / l_i_i;
0161       }
0162     }
0163
0164     template <typename MatrixType1, typename MatrixType2, typename VectorType>
0165     EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void update_decomposition_forwardsubst_with_offsets(
0166         MatrixType1& L,
0167         MatrixType2 const& M,
0168         float b[MatrixType1::stride],
0169         VectorType const& Atb,
0170         int const N,
0171         ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
0172       using T = typename MatrixType1::base_type;
0173       auto const i = N - 1;
0174       auto const i_real = pulseOffsets(i);
0175       T sumsq{0};
0176       T total = 0;
0177       for (int j = 0; j < i; j++) {
0178         auto const j_real = pulseOffsets(j);
0179         T sumsq2{0};
0180         auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
0181         for (int k = 0; k < j; ++k)
0182           sumsq2 += L(i, k) * L(j, k);
0183
0184         auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
0185         L(i, j) = value_i_j;
0186         sumsq += value_i_j * value_i_j;
0187
0188         total += value_i_j * b[j];
0189       }
0190
0191       auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
0192       L(i, i) = l_i_i;
0193       b[i] = (Atb(i_real) - total) / l_i_i;
0194     }
0195
0196     template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
0197     EIGEN_DEVICE_FUNC void solve_forward_subst_matrix(MatrixType1& A,
0198                                                       MatrixType2 const& pulseMatrixView,
0199                                                       MatrixType3 const& matrixL) {
0200       // FIXME: this assumes pulses are on columns and samples on rows
0201       constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
0202       constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
0203
0204       CMS_UNROLL_LOOP
0205       for (int icol = 0; icol < NPULSES; icol++) {
0206         float reg_b[NSAMPLES];
0207         float reg_L[NSAMPLES];
0208
0209         // preload a column and load column 0 of cholesky
0210         CMS_UNROLL_LOOP
0211         for (int i = 0; i < NSAMPLES; i++) {
0212 #ifdef __CUDA_ARCH__
0213           // load through the read-only cache
0214           reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
0215 #else
0216           reg_b[i] = pulseMatrixView.coeffRef(i, icol);
0217 #endif  // __CUDA_ARCH__
0218           reg_L[i] = matrixL(i, 0);
0219         }
0220
0221         // compute x0 and store it
0222         auto x_prev = reg_b[0] / reg_L[0];
0223         A(0, icol) = x_prev;
0224
0225         // iterate
0226         CMS_UNROLL_LOOP
0227         for (int iL = 1; iL < NSAMPLES; iL++) {
0228           // update accum
0229           CMS_UNROLL_LOOP
0230           for (int counter = iL; counter < NSAMPLES; counter++)
0231             reg_b[counter] -= x_prev * reg_L[counter];
0232
0233           // load the next column of cholesky
0234           CMS_UNROLL_LOOP
0235           for (int counter = iL; counter < NSAMPLES; counter++)
0236             reg_L[counter] = matrixL(counter, iL);
0237
0238           // compute the next x for M(iL, icol)
0239           x_prev = reg_b[iL] / reg_L[iL];
0240
0241           // store the result value
0242           A(iL, icol) = x_prev;
0243         }
0244       }
0245     }
0246
0247     template <typename MatrixType1, typename MatrixType2>
0248     EIGEN_DEVICE_FUNC void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
0249                                                       MatrixType1 inputAmplitudesView,
0250                                                       MatrixType2 matrixL) {
0251       constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
0252
0253       float reg_b_tmp[NSAMPLES];
0254       float reg_L[NSAMPLES];
0255
0256       // preload a column and load column 0 of cholesky
0257       CMS_UNROLL_LOOP
0258       for (int i = 0; i < NSAMPLES; i++) {
0259         reg_b_tmp[i] = inputAmplitudesView(i);
0260         reg_L[i] = matrixL(i, 0);
0261       }
0262
0263       // compute x0 and store it
0264       auto x_prev = reg_b_tmp[0] / reg_L[0];
0265       reg_b[0] = x_prev;
0266
0267       // iterate
0268       CMS_UNROLL_LOOP
0269       for (int iL = 1; iL < NSAMPLES; iL++) {
0270         // update accum
0271         CMS_UNROLL_LOOP
0272         for (int counter = iL; counter < NSAMPLES; counter++)
0273           reg_b_tmp[counter] -= x_prev * reg_L[counter];
0274
0275         // load the next column of cholesky
0276         CMS_UNROLL_LOOP
0277         for (int counter = iL; counter < NSAMPLES; counter++)
0278           reg_L[counter] = matrixL(counter, iL);
0279
0280         // compute the next x for M(iL, icol)
0281         x_prev = reg_b_tmp[iL] / reg_L[iL];
0282
0283         // store the result value
0284         reg_b[iL] = x_prev;
0285       }
0286     }
0287
0288     template <typename MatrixType1, typename MatrixType2, typename MatrixType3, typename MatrixType4>
0289     EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void calculateChiSq(MatrixType1 const& matrixL,
0290                                                               MatrixType2 const& pulseMatrixView,
0291                                                               MatrixType3 const& resultAmplitudesVector,
0292                                                               MatrixType4 const& inputAmplitudesView,
0293                                                               float& chi2) {
0294       // FIXME: this assumes pulses are on columns and samples on rows
0295       constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
0296       constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
0297
0298       // replace pulseMatrixView * resultAmplitudesVector - inputAmplitudesView
0299       // NOTE:
0300       float accum[NSAMPLES];
0301       {
0302         float results[NPULSES];
0303
0304         // preload results and permute according to the pulse offsets /////////////// ??? this is not done in ECAL
0305         CMS_UNROLL_LOOP
0306         for (int counter = 0; counter < NPULSES; counter++) {
0307           results[counter] = resultAmplitudesVector[counter];
0308         }
0309
0310         // load accum
0311         CMS_UNROLL_LOOP
0312         for (int counter = 0; counter < NSAMPLES; counter++)
0313           accum[counter] = -inputAmplitudesView(counter);
0314
0315         // iterate
0316         for (int icol = 0; icol < NPULSES; icol++) {
0317           float pm_col[NSAMPLES];
0318
0319           // preload a column of pulse matrix
0320           CMS_UNROLL_LOOP
0321           for (int counter = 0; counter < NSAMPLES; counter++)
0322 #ifdef __CUDA_ARCH__
0323             pm_col[counter] = __ldg(&pulseMatrixView.coeffRef(counter, icol));
0324 #else
0325             pm_col[counter] = pulseMatrixView.coeffRef(counter, icol);
0326 #endif
0327
0328           // accum
0329           CMS_UNROLL_LOOP
0330           for (int counter = 0; counter < NSAMPLES; counter++)
0331             accum[counter] += results[icol] * pm_col[counter];
0332         }
0333       }
0334
0335       // compute chi2 and check that there is no rotation
0336       // chi2 = matrixDecomposition
0337       //    .matrixL()
0338       //    . solve(mapAccum)
0339       //            .solve(pulseMatrixView * resultAmplitudesVector - inputAmplitudesView)
0340       //    .squaredNorm();
0341
0342       {
0343         float reg_L[NSAMPLES];
0344         float accumSum = 0;
0345
0346         // preload a column and load column 0 of cholesky
0347         CMS_UNROLL_LOOP
0348         for (int i = 0; i < NSAMPLES; i++) {
0349           reg_L[i] = matrixL(i, 0);
0350         }
0351
0352         // compute x0 and store it
0353         auto x_prev = accum[0] / reg_L[0];
0354         accumSum += x_prev * x_prev;
0355
0356         // iterate
0357         CMS_UNROLL_LOOP
0358         for (int iL = 1; iL < NSAMPLES; iL++) {
0359           // update accum
0360           CMS_UNROLL_LOOP
0361           for (int counter = iL; counter < NSAMPLES; counter++)
0362             accum[counter] -= x_prev * reg_L[counter];
0363
0364           // load the next column of cholesky
0365           CMS_UNROLL_LOOP
0366           for (int counter = iL; counter < NSAMPLES; counter++)
0367             reg_L[counter] = matrixL(counter, iL);
0368
0369           // compute the next x for M(iL, icol)
0370           x_prev = accum[iL] / reg_L[iL];
0371
0372           // store the result value
0373           accumSum += x_prev * x_prev;
0374         }
0375
0376         chi2 = accumSum;
0377       }
0378     }
0379
0380     // TODO: add active bxs
0381     template <typename MatrixType, typename VectorType>
0382     EIGEN_DEVICE_FUNC void fnnls(MatrixType const& AtA,
0383                                  VectorType const& Atb,
0384                                  VectorType& solution,
0385                                  int& npassive,
0386                                  ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
0387                                  MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
0388                                  double eps,                    // convergence condition
0389                                  const int maxIterations,       // maximum number of iterations
0390                                  const int relaxationPeriod,    // every "relaxationPeriod" iterations
0391                                  const int relaxationFactor) {  // multiply "eps" by "relaxationFactor"
0392       // constants
0393       constexpr auto NPULSES = VectorType::RowsAtCompileTime;
0394
0395       // to keep track of where to terminate if converged
0396       Eigen::Index w_max_idx_prev = 0;
0397       float w_max_prev = 0;
0398       bool recompute = false;
0399
0400       // used throughout
0401       VectorType s;
0402       float reg_b[NPULSES];
0403       //float matrixLStorage[MapSymM<float, NPULSES>::total];
0404       //MapSymM<float, NPULSES> matrixL{matrixLStorage};
0405
0406       int iter = 0;
0407       while (true) {
0408         if (iter > 0 || npassive == 0) {
0409           auto const nactive = NPULSES - npassive;
0410           // exit if there are no more pulses to constrain
0411           if (nactive == 0)
0412             break;
0413
0414           // compute the gradient
0415           //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
0416           Eigen::Index w_max_idx = 0;
0417           float w_max = -std::numeric_limits<float>::max();
0418           for (int icol = npassive; icol < NPULSES; icol++) {
0419             auto const icol_real = pulseOffsets(icol);
0420             auto const atb = Atb(icol_real);
0421             float sum = 0;
0422             CMS_UNROLL_LOOP
0423             for (int counter = 0; counter < NPULSES; counter++)
0424               sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
0425                                          : AtA(icol_real, counter) * solution(counter);
0426
0427             auto const w = atb - sum;
0428             if (w > w_max) {
0429               w_max = w;
0430               w_max_idx = icol - npassive;
0431             }
0432           }
0433
0434           // check for convergence
0435           if (w_max < eps || (w_max_idx == w_max_idx_prev && w_max == w_max_prev))
0436             break;
0437
0438           if (iter >= maxIterations)
0439             break;
0440
0441           w_max_prev = w_max;
0442           w_max_idx_prev = w_max_idx;
0443
0444           // move index to the right part of the vector
0445           w_max_idx += npassive;
0446
0447           Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
0448           ++npassive;
0449         }
0450
0451         // inner loop
0452         while (true) {
0453           if (npassive == 0)
0454             break;
0455
0456           //s.head(npassive)
0457           //auto const& matrixL =
0458           //    AtA.topLeftCorner(npassive, npassive)
0459           //        .llt().matrixL();
0460           //.solve(Atb.head(npassive));
0461           if (recompute || iter == 0)
0462             compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
0463           else
0464             update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
0465
0466           // run backward substituion
0467           s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1);
0468           for (int i = npassive - 2; i >= 0; --i) {
0469             float total = 0;
0470             for (int j = i + 1; j < npassive; j++)
0471               total += matrixL(j, i) * s(j);
0472
0473             s(i) = (reg_b[i] - total) / matrixL(i, i);
0474           }
0475
0476           // done if solution values are all positive
0477           bool hasNegative = false;
0478           bool hasNans = false;
0479           for (int counter = 0; counter < npassive; counter++) {
0480             auto const s_ii = s(counter);
0481             hasNegative |= s_ii <= 0;
0482             hasNans |= std::isnan(s_ii);
0483           }
0484
0485           // FIXME: temporary solution. my cholesky impl is unstable yielding nans
0486           // this check removes nans - do not accept solution unless all values
0487           // are stable
0488           if (hasNans)
0489             break;
0490           if (!hasNegative) {
0491             for (int i = 0; i < npassive; i++) {
0492               auto const i_real = pulseOffsets(i);
0493               solution(i_real) = s(i);
0494             }
0495             //solution.head(npassive) = s.head(npassive);
0496             recompute = false;
0497             break;
0498           }
0499
0500           // there were negative values -> have to recompute the whole decomp
0501           recompute = true;
0502
0503           auto alpha = std::numeric_limits<float>::max();
0504           Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
0505           for (int i = 0; i < npassive; i++) {
0506             if (s[i] <= 0.) {
0507               auto const i_real = pulseOffsets(i);
0508               auto const ratio = solution[i_real] / (solution[i_real] - s[i]);
0509               if (ratio < alpha) {
0510                 alpha = ratio;
0511                 alpha_idx = i;
0512                 alpha_idx_real = i_real;
0513               }
0514             }
0515           }
0516
0517           // upadte solution
0518           for (int i = 0; i < npassive; i++) {
0519             auto const i_real = pulseOffsets(i);
0520             solution(i_real) += alpha * (s(i) - solution(i_real));
0521           }
0522           //solution.head(npassive) += alpha *
0523           //    (s.head(npassive) - solution.head(npassive));
0524           solution[alpha_idx_real] = 0;
0525           --npassive;
0526
0527           Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
0528         }
0529
0530         // as in cpu
0531         ++iter;
0532         if (iter % relaxationPeriod == 0)
0533           eps *= relaxationFactor;
0534       }
0535     }
0536
0537   }  // namespace multifit
0538 }  // namespace calo
0539
0540 #endif  // DataFormats_CaloRecHit_interface_MultifitComputations_h