plugins/alpaka/AmplitudeComputationKernels.dev.cc

0001 #include <cmath>
0002 #include <limits>
0003 #include <alpaka/alpaka.hpp>
0004
0005 #include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
0006 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
0007 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0008 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0009
0010 #include "AmplitudeComputationKernels.h"
0011 #include "KernelHelpers.h"
0012 #include "EcalUncalibRecHitMultiFitAlgoPortable.h"
0013
0014 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
0015
0016   using namespace ::ecal::multifit;
0017
0018   template <typename MatrixType>
0019   ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(EcalPulseCovariance const& pulse_covariance,
0020                                                         MatrixType& inverse_cov,
0021                                                         SampleVector const& amplitudes) {
0022     constexpr auto nsamples = SampleVector::RowsAtCompileTime;
0023     constexpr auto npulses = BXVectorType::RowsAtCompileTime;
0024
0025     CMS_UNROLL_LOOP
0026     for (unsigned int ipulse = 0; ipulse < npulses; ++ipulse) {
0027       auto const amplitude = amplitudes.coeff(ipulse);
0028       if (amplitude == 0)
0029         continue;
0030
0031       // FIXME: ipulse - 5 -> ipulse - firstOffset
0032       int bx = ipulse - 5;
0033       int first_sample_t = std::max(0, bx + 3);
0034       int offset = -3 - bx;
0035
0036       auto const value_sq = amplitude * amplitude;
0037
0038       for (int col = first_sample_t; col < nsamples; ++col) {
0039         for (int row = col; row < nsamples; ++row) {
0040           inverse_cov(row, col) += value_sq * pulse_covariance.covval[row + offset][col + offset];
0041         }
0042       }
0043     }
0044   }
0045
0046   ///
0047   /// launch ctx parameters are (nchannels / block, blocks)
0048   /// TODO: trivial impl for now, there must be a way to improve
0049   ///
0050   /// Conventions:
0051   ///   - amplitudes -> solution vector, what we are fitting for
0052   ///   - samples -> raw detector responses
0053   ///   - passive constraint - satisfied constraint
0054   ///   - active constraint - unsatisfied (yet) constraint
0055   ///
0056   class Kernel_minimize {
0057   public:
0058     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0059                                   InputProduct::ConstView const& digisDevEB,
0060                                   InputProduct::ConstView const& digisDevEE,
0061                                   OutputProduct::View uncalibRecHitsEB,
0062                                   OutputProduct::View uncalibRecHitsEE,
0063                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0064                                   ::ecal::multifit::SampleMatrix const* noisecov,
0065                                   ::ecal::multifit::PulseMatrixType const* pulse_matrix,
0066                                   ::ecal::multifit::BXVectorType* bxs,
0067                                   ::ecal::multifit::SampleVector const* samples,
0068                                   bool* hasSwitchToGain6,
0069                                   bool* hasSwitchToGain1,
0070                                   bool* isSaturated,
0071                                   char* acState,
0072                                   int max_iterations) const {
0073       // FIXME: ecal has 10 samples and 10 pulses....
0074       // but this needs to be properly treated and renamed everywhere
0075       constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
0076       constexpr auto NPULSES = SampleMatrix::ColsAtCompileTime;
0077       static_assert(NSAMPLES == NPULSES);
0078
0079       using DataType = SampleVector::Scalar;
0080
0081       auto const elemsPerBlock(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
0082
0083       auto const nchannelsEB = digisDevEB.size();
0084       auto const nchannels = nchannelsEB + digisDevEE.size();
0085       auto const offsetForHashes = conditionsDev.offsetEE();
0086
0087       auto const* pulse_covariance = reinterpret_cast<const EcalPulseCovariance*>(conditionsDev.pulseCovariance());
0088
0089       // shared memory
0090       DataType* shrmem = alpaka::getDynSharedMem<DataType>(acc);
0091
0092       // channel
0093       for (auto idx : cms::alpakatools::uniform_elements(acc, nchannels)) {
0094         if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
0095           continue;
0096
0097         auto const elemIdx = idx % elemsPerBlock;
0098
0099         // shared memory pointers
0100         DataType* shrMatrixLForFnnlsStorage = shrmem + calo::multifit::MapSymM<DataType, NPULSES>::total * elemIdx;
0101         DataType* shrAtAStorage =
0102             shrmem + calo::multifit::MapSymM<DataType, NPULSES>::total * (elemIdx + elemsPerBlock);
0103
0104         auto* amplitudes =
0105             reinterpret_cast<SampleVector*>(idx >= nchannelsEB ? uncalibRecHitsEE.outOfTimeAmplitudes()->data()
0106                                                                : uncalibRecHitsEB.outOfTimeAmplitudes()->data());
0107         auto* energies = idx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
0108         auto* chi2s = idx >= nchannelsEB ? uncalibRecHitsEE.chi2() : uncalibRecHitsEB.chi2();
0109
0110         // get the hash
0111         int const inputCh = idx >= nchannelsEB ? idx - nchannelsEB : idx;
0112         auto const* dids = idx >= nchannelsEB ? digisDevEE.id() : digisDevEB.id();
0113         auto const did = DetId{dids[inputCh]};
0114         auto const isBarrel = did.subdetId() == EcalBarrel;
0115         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0116                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0117
0118         // inits
0119         int npassive = 0;
0120
0121         calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
0122         CMS_UNROLL_LOOP
0123         for (int i = 0; i < NPULSES; ++i)
0124           pulseOffsets(i) = i;
0125
0126         calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
0127         CMS_UNROLL_LOOP
0128         for (int counter = 0; counter < NPULSES; ++counter)
0129           resultAmplitudes(counter) = 0;
0130
0131         // inits
0132         //SampleDecompLLT covariance_decomposition;
0133         //SampleMatrix inverse_cov;
0134         //        SampleVector::Scalar chi2 = 0, chi2_now = 0;
0135         float chi2 = 0, chi2_now = 0;
0136
0137         // loop for up to max_iterations
0138         for (int iter = 0; iter < max_iterations; ++iter) {
0139           //inverse_cov = noisecov[idx];
0140           //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
0141           DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
0142           calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
0143           int counter = 0;
0144           CMS_UNROLL_LOOP
0145           for (int col = 0; col < NSAMPLES; ++col) {
0146             CMS_UNROLL_LOOP
0147             for (int row = col; row < NSAMPLES; ++row) {
0148               covMatrixStorage[counter++] = noisecov[idx].coeffRef(row, col);
0149             }
0150           }
0151           update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
0152
0153           // compute actual covariance decomposition
0154           //covariance_decomposition.compute(inverse_cov);
0155           //auto const& matrixL = covariance_decomposition.matrixL();
0156           DataType matrixLStorage[calo::multifit::MapSymM<DataType, NSAMPLES>::total];
0157           calo::multifit::MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
0158           calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix);
0159
0160           // L * A = P
0161           calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
0162           calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
0163
0164           // L b = s
0165           float reg_b[NSAMPLES];
0166           calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL);
0167
0168           // FIXME: shared mem
0169           //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
0170           calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
0171           //SampleMatrix AtA;
0172           SampleVector Atb;
0173           CMS_UNROLL_LOOP
0174           for (int icol = 0; icol < NPULSES; ++icol) {
0175             float reg_ai[NSAMPLES];
0176
0177             // load column icol
0178             CMS_UNROLL_LOOP
0179             for (int counter = 0; counter < NSAMPLES; ++counter)
0180               reg_ai[counter] = A(counter, icol);
0181
0182             // compute diagoanl
0183             float sum = 0.f;
0184             CMS_UNROLL_LOOP
0185             for (int counter = 0; counter < NSAMPLES; ++counter)
0186               sum += reg_ai[counter] * reg_ai[counter];
0187
0188             // store
0189             AtA(icol, icol) = sum;
0190
0191             // go thru the other columns
0192             CMS_UNROLL_LOOP
0193             for (int j = icol + 1; j < NPULSES; ++j) {
0194               // load column j
0195               float reg_aj[NSAMPLES];
0196               CMS_UNROLL_LOOP
0197               for (int counter = 0; counter < NSAMPLES; ++counter)
0198                 reg_aj[counter] = A(counter, j);
0199
0200               // accum
0201               float sum = 0.f;
0202               CMS_UNROLL_LOOP
0203               for (int counter = 0; counter < NSAMPLES; ++counter)
0204                 sum += reg_aj[counter] * reg_ai[counter];
0205
0206               // store
0207               //AtA(icol, j) = sum;
0208               AtA(j, icol) = sum;
0209             }
0210
0211             // Atb accum
0212             float sum_atb = 0.f;
0213             CMS_UNROLL_LOOP
0214             for (int counter = 0; counter < NSAMPLES; ++counter)
0215               sum_atb += reg_ai[counter] * reg_b[counter];
0216
0217             // store atb
0218             Atb(icol) = sum_atb;
0219           }
0220
0221           // FIXME: shared mem
0222           //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
0223           calo::multifit::MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
0224
0225           calo::multifit::fnnls(AtA,
0226                                 Atb,
0227                                 //amplitudes[idx],
0228                                 resultAmplitudes,
0229                                 npassive,
0230                                 pulseOffsets,
0231                                 matrixLForFnnls,
0232                                 1e-11,
0233                                 500,
0234                                 16,
0235                                 2);
0236
0237           calo::multifit::calculateChiSq(matrixL, pulse_matrix[idx], resultAmplitudes, samples[idx], chi2_now);
0238
0239           auto const deltachi2 = chi2_now - chi2;
0240           chi2 = chi2_now;
0241
0242           if (std::abs(deltachi2) < 1e-3)
0243             break;
0244         }
0245
0246         // store to global output values
0247         // FIXME: amplitudes are used in global directly
0248         chi2s[inputCh] = chi2;
0249         energies[inputCh] = resultAmplitudes(5);
0250
0251         CMS_UNROLL_LOOP
0252         for (int counter = 0; counter < NPULSES; ++counter)
0253           amplitudes[inputCh](counter) = resultAmplitudes(counter);
0254       }
0255     }
0256   };
0257
0258   void minimization_procedure(Queue& queue,
0259                               InputProduct const& digisDevEB,
0260                               InputProduct const& digisDevEE,
0261                               OutputProduct& uncalibRecHitsDevEB,
0262                               OutputProduct& uncalibRecHitsDevEE,
0263                               EventDataForScratchDevice& scratch,
0264                               EcalMultifitConditionsDevice const& conditionsDev,
0265                               ConfigurationParameters const& configParams,
0266                               uint32_t const totalChannels) {
0267     using DataType = SampleVector::Scalar;
0268     // TODO: configure from python
0269     auto threads_min = configParams.kernelMinimizeThreads[0];
0270     auto blocks_min = cms::alpakatools::divide_up_by(totalChannels, threads_min);
0271
0272     auto workDivMinimize = cms::alpakatools::make_workdiv<Acc1D>(blocks_min, threads_min);
0273     alpaka::exec<Acc1D>(queue,
0274                         workDivMinimize,
0275                         Kernel_minimize{},
0276                         digisDevEB.const_view(),
0277                         digisDevEE.const_view(),
0278                         uncalibRecHitsDevEB.view(),
0279                         uncalibRecHitsDevEE.view(),
0280                         conditionsDev.const_view(),
0281                         reinterpret_cast<::ecal::multifit::SampleMatrix*>(scratch.noisecovDevBuf.data()),
0282                         reinterpret_cast<::ecal::multifit::PulseMatrixType*>(scratch.pulse_matrixDevBuf.data()),
0283                         reinterpret_cast<::ecal::multifit::BXVectorType*>(scratch.activeBXsDevBuf.data()),
0284                         reinterpret_cast<::ecal::multifit::SampleVector*>(scratch.samplesDevBuf.data()),
0285                         scratch.hasSwitchToGain6DevBuf.data(),
0286                         scratch.hasSwitchToGain1DevBuf.data(),
0287                         scratch.isSaturatedDevBuf.data(),
0288                         scratch.acStateDevBuf.data(),
0289                         50);  // maximum number of fit iterations
0290   }
0291
0292 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
0293
0294 namespace alpaka::trait {
0295   using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0296   using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
0297
0298   //! The trait for getting the size of the block shared dynamic memory for Kernel_minimize.
0299   template <>
0300   struct BlockSharedMemDynSizeBytes<Kernel_minimize, Acc1D> {
0301     //! \return The size of the shared memory allocated for a block.
0302     template <typename TVec, typename... TArgs>
0303     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_minimize const&,
0304                                                                  TVec const& threadsPerBlock,
0305                                                                  TVec const& elemsPerThread,
0306                                                                  TArgs const&...) -> std::size_t {
0307       using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0308
0309       // return the amount of dynamic shared memory needed
0310       std::size_t bytes =
0311           2 * threadsPerBlock[0u] * elemsPerThread[0u] *
0312           calo::multifit::MapSymM<ScalarType, ::ecal::multifit::SampleVector::RowsAtCompileTime>::total *
0313           sizeof(ScalarType);
0314       return bytes;
0315     }
0316   };
0317 }  // namespace alpaka::trait