Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-20 02:32:07

0001 #include <cmath>
0002 #include <limits>
0003 #include <alpaka/alpaka.hpp>
0004 
0005 #include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
0006 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
0007 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0008 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0009 
0010 #include "AmplitudeComputationKernels.h"
0011 #include "KernelHelpers.h"
0012 #include "EcalUncalibRecHitMultiFitAlgoPortable.h"
0013 
0014 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
0015 
0016   using namespace ::ecal::multifit;
0017 
0018   template <typename MatrixType>
0019   ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(EcalPulseCovariance const& pulse_covariance,
0020                                                         MatrixType& inverse_cov,
0021                                                         SampleVector const& amplitudes) {
0022     constexpr auto nsamples = SampleVector::RowsAtCompileTime;
0023     constexpr auto npulses = BXVectorType::RowsAtCompileTime;
0024 
0025     CMS_UNROLL_LOOP
0026     for (unsigned int ipulse = 0; ipulse < npulses; ++ipulse) {
0027       auto const amplitude = amplitudes.coeff(ipulse);
0028       if (amplitude == 0)
0029         continue;
0030 
0031       // FIXME: ipulse - 5 -> ipulse - firstOffset
0032       int bx = ipulse - 5;
0033       int first_sample_t = std::max(0, bx + 3);
0034       int offset = -3 - bx;
0035 
0036       auto const value_sq = amplitude * amplitude;
0037 
0038       for (int col = first_sample_t; col < nsamples; ++col) {
0039         for (int row = col; row < nsamples; ++row) {
0040           inverse_cov(row, col) += value_sq * pulse_covariance.covval[row + offset][col + offset];
0041         }
0042       }
0043     }
0044   }
0045 
0046   ///
0047   /// launch ctx parameters are (nchannels / block, blocks)
0048   /// TODO: trivial impl for now, there must be a way to improve
0049   ///
0050   /// Conventions:
0051   ///   - amplitudes -> solution vector, what we are fitting for
0052   ///   - samples -> raw detector responses
0053   ///   - passive constraint - satisfied constraint
0054   ///   - active constraint - unsatisfied (yet) constraint
0055   ///
0056   class Kernel_minimize {
0057   public:
0058     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0059     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0060                                   InputProduct::ConstView const& digisDevEB,
0061                                   InputProduct::ConstView const& digisDevEE,
0062                                   OutputProduct::View uncalibRecHitsEB,
0063                                   OutputProduct::View uncalibRecHitsEE,
0064                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0065                                   ::ecal::multifit::SampleMatrix const* noisecov,
0066                                   ::ecal::multifit::PulseMatrixType const* pulse_matrix,
0067                                   ::ecal::multifit::BXVectorType* bxs,
0068                                   ::ecal::multifit::SampleVector const* samples,
0069                                   bool* hasSwitchToGain6,
0070                                   bool* hasSwitchToGain1,
0071                                   bool* isSaturated,
0072                                   char* acState,
0073                                   int max_iterations) const {
0074       // FIXME: ecal has 10 samples and 10 pulses....
0075       // but this needs to be properly treated and renamed everywhere
0076       constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
0077       constexpr auto NPULSES = SampleMatrix::ColsAtCompileTime;
0078       static_assert(NSAMPLES == NPULSES);
0079 
0080       using DataType = SampleVector::Scalar;
0081 
0082       auto const elemsPerBlock(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
0083 
0084       auto const nchannelsEB = digisDevEB.size();
0085       auto const nchannels = nchannelsEB + digisDevEE.size();
0086       auto const offsetForHashes = conditionsDev.offsetEE();
0087 
0088       auto const* pulse_covariance = reinterpret_cast<const EcalPulseCovariance*>(conditionsDev.pulseCovariance());
0089 
0090       // shared memory
0091       DataType* shrmem = alpaka::getDynSharedMem<DataType>(acc);
0092 
0093       // channel
0094       for (auto idx : cms::alpakatools::uniform_elements(acc, nchannels)) {
0095         if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
0096           continue;
0097 
0098         auto const elemIdx = idx % elemsPerBlock;
0099 
0100         // shared memory pointers
0101         DataType* shrMatrixLForFnnlsStorage = shrmem + calo::multifit::MapSymM<DataType, NPULSES>::total * elemIdx;
0102         DataType* shrAtAStorage =
0103             shrmem + calo::multifit::MapSymM<DataType, NPULSES>::total * (elemIdx + elemsPerBlock);
0104 
0105         auto* amplitudes =
0106             reinterpret_cast<SampleVector*>(idx >= nchannelsEB ? uncalibRecHitsEE.outOfTimeAmplitudes()->data()
0107                                                                : uncalibRecHitsEB.outOfTimeAmplitudes()->data());
0108         auto* energies = idx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
0109         auto* chi2s = idx >= nchannelsEB ? uncalibRecHitsEE.chi2() : uncalibRecHitsEB.chi2();
0110 
0111         // get the hash
0112         int const inputCh = idx >= nchannelsEB ? idx - nchannelsEB : idx;
0113         auto const* dids = idx >= nchannelsEB ? digisDevEE.id() : digisDevEB.id();
0114         auto const did = DetId{dids[inputCh]};
0115         auto const isBarrel = did.subdetId() == EcalBarrel;
0116         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0117                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0118 
0119         // inits
0120         int npassive = 0;
0121 
0122         calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
0123         CMS_UNROLL_LOOP
0124         for (int i = 0; i < NPULSES; ++i)
0125           pulseOffsets(i) = i;
0126 
0127         calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
0128         CMS_UNROLL_LOOP
0129         for (int counter = 0; counter < NPULSES; ++counter)
0130           resultAmplitudes(counter) = 0;
0131 
0132         // inits
0133         //SampleDecompLLT covariance_decomposition;
0134         //SampleMatrix inverse_cov;
0135         //        SampleVector::Scalar chi2 = 0, chi2_now = 0;
0136         float chi2 = 0, chi2_now = 0;
0137 
0138         // loop for up to max_iterations
0139         for (int iter = 0; iter < max_iterations; ++iter) {
0140           //inverse_cov = noisecov[idx];
0141           //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
0142           DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
0143           calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
0144           int counter = 0;
0145           CMS_UNROLL_LOOP
0146           for (int col = 0; col < NSAMPLES; ++col) {
0147             CMS_UNROLL_LOOP
0148             for (int row = col; row < NSAMPLES; ++row) {
0149               covMatrixStorage[counter++] = noisecov[idx].coeffRef(row, col);
0150             }
0151           }
0152           update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
0153 
0154           // compute actual covariance decomposition
0155           //covariance_decomposition.compute(inverse_cov);
0156           //auto const& matrixL = covariance_decomposition.matrixL();
0157           DataType matrixLStorage[calo::multifit::MapSymM<DataType, NSAMPLES>::total];
0158           calo::multifit::MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
0159           calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix);
0160 
0161           // L * A = P
0162           calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
0163           calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
0164 
0165           // L b = s
0166           float reg_b[NSAMPLES];
0167           calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL);
0168 
0169           // FIXME: shared mem
0170           //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
0171           calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
0172           //SampleMatrix AtA;
0173           SampleVector Atb;
0174           CMS_UNROLL_LOOP
0175           for (int icol = 0; icol < NPULSES; ++icol) {
0176             float reg_ai[NSAMPLES];
0177 
0178             // load column icol
0179             CMS_UNROLL_LOOP
0180             for (int counter = 0; counter < NSAMPLES; ++counter)
0181               reg_ai[counter] = A(counter, icol);
0182 
0183             // compute diagoanl
0184             float sum = 0.f;
0185             CMS_UNROLL_LOOP
0186             for (int counter = 0; counter < NSAMPLES; ++counter)
0187               sum += reg_ai[counter] * reg_ai[counter];
0188 
0189             // store
0190             AtA(icol, icol) = sum;
0191 
0192             // go thru the other columns
0193             CMS_UNROLL_LOOP
0194             for (int j = icol + 1; j < NPULSES; ++j) {
0195               // load column j
0196               float reg_aj[NSAMPLES];
0197               CMS_UNROLL_LOOP
0198               for (int counter = 0; counter < NSAMPLES; ++counter)
0199                 reg_aj[counter] = A(counter, j);
0200 
0201               // accum
0202               float sum = 0.f;
0203               CMS_UNROLL_LOOP
0204               for (int counter = 0; counter < NSAMPLES; ++counter)
0205                 sum += reg_aj[counter] * reg_ai[counter];
0206 
0207               // store
0208               //AtA(icol, j) = sum;
0209               AtA(j, icol) = sum;
0210             }
0211 
0212             // Atb accum
0213             float sum_atb = 0.f;
0214             CMS_UNROLL_LOOP
0215             for (int counter = 0; counter < NSAMPLES; ++counter)
0216               sum_atb += reg_ai[counter] * reg_b[counter];
0217 
0218             // store atb
0219             Atb(icol) = sum_atb;
0220           }
0221 
0222           // FIXME: shared mem
0223           //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
0224           calo::multifit::MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
0225 
0226           calo::multifit::fnnls(AtA,
0227                                 Atb,
0228                                 //amplitudes[idx],
0229                                 resultAmplitudes,
0230                                 npassive,
0231                                 pulseOffsets,
0232                                 matrixLForFnnls,
0233                                 1e-11,
0234                                 500,
0235                                 16,
0236                                 2);
0237 
0238           calo::multifit::calculateChiSq(matrixL, pulse_matrix[idx], resultAmplitudes, samples[idx], chi2_now);
0239 
0240           auto const deltachi2 = chi2_now - chi2;
0241           chi2 = chi2_now;
0242 
0243           if (std::abs(deltachi2) < 1e-3)
0244             break;
0245         }
0246 
0247         // store to global output values
0248         // FIXME: amplitudes are used in global directly
0249         chi2s[inputCh] = chi2;
0250         energies[inputCh] = resultAmplitudes(5);
0251 
0252         CMS_UNROLL_LOOP
0253         for (int counter = 0; counter < NPULSES; ++counter)
0254           amplitudes[inputCh](counter) = resultAmplitudes(counter);
0255       }
0256     }
0257   };
0258 
0259   void minimization_procedure(Queue& queue,
0260                               InputProduct const& digisDevEB,
0261                               InputProduct const& digisDevEE,
0262                               OutputProduct& uncalibRecHitsDevEB,
0263                               OutputProduct& uncalibRecHitsDevEE,
0264                               EventDataForScratchDevice& scratch,
0265                               EcalMultifitConditionsDevice const& conditionsDev,
0266                               ConfigurationParameters const& configParams,
0267                               uint32_t const totalChannels) {
0268     using DataType = SampleVector::Scalar;
0269     // TODO: configure from python
0270     auto threads_min = configParams.kernelMinimizeThreads[0];
0271     auto blocks_min = cms::alpakatools::divide_up_by(totalChannels, threads_min);
0272 
0273     auto workDivMinimize = cms::alpakatools::make_workdiv<Acc1D>(blocks_min, threads_min);
0274     alpaka::exec<Acc1D>(queue,
0275                         workDivMinimize,
0276                         Kernel_minimize{},
0277                         digisDevEB.const_view(),
0278                         digisDevEE.const_view(),
0279                         uncalibRecHitsDevEB.view(),
0280                         uncalibRecHitsDevEE.view(),
0281                         conditionsDev.const_view(),
0282                         reinterpret_cast<::ecal::multifit::SampleMatrix*>(scratch.noisecovDevBuf.data()),
0283                         reinterpret_cast<::ecal::multifit::PulseMatrixType*>(scratch.pulse_matrixDevBuf.data()),
0284                         reinterpret_cast<::ecal::multifit::BXVectorType*>(scratch.activeBXsDevBuf.data()),
0285                         reinterpret_cast<::ecal::multifit::SampleVector*>(scratch.samplesDevBuf.data()),
0286                         scratch.hasSwitchToGain6DevBuf.data(),
0287                         scratch.hasSwitchToGain1DevBuf.data(),
0288                         scratch.isSaturatedDevBuf.data(),
0289                         scratch.acStateDevBuf.data(),
0290                         50);  // maximum number of fit iterations
0291   }
0292 
0293 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
0294 
0295 namespace alpaka::trait {
0296   using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
0297 
0298   //! The trait for getting the size of the block shared dynamic memory for Kernel_minimize.
0299   template <typename TAcc>
0300   struct BlockSharedMemDynSizeBytes<Kernel_minimize, TAcc> {
0301     //! \return The size of the shared memory allocated for a block.
0302     template <typename TVec, typename... TArgs>
0303     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_minimize const&,
0304                                                                  TVec const& threadsPerBlock,
0305                                                                  TVec const& elemsPerThread,
0306                                                                  TArgs const&...) -> std::size_t {
0307       using ScalarType = ecal::multifit::SampleVector::Scalar;
0308 
0309       // return the amount of dynamic shared memory needed
0310       std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] *
0311                           calo::multifit::MapSymM<ScalarType, ecal::multifit::SampleVector::RowsAtCompileTime>::total *
0312                           sizeof(ScalarType);
0313       return bytes;
0314     }
0315   };
0316 }  // namespace alpaka::trait