EcalRecProducers/plugins/AmplitudeComputationKernels.cu

0001 #include <cmath>
0002 #include <limits>
0003
0004 #include <cuda.h>
0005
0006 #include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
0007 #include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
0008 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0009 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
0010 #include "DataFormats/Math/interface/approx_exp.h"
0011 #include "DataFormats/Math/interface/approx_log.h"
0012 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0013
0014 #include "AmplitudeComputationCommonKernels.h"
0015 #include "AmplitudeComputationKernels.h"
0016 #include "KernelHelpers.h"
0017
0018 namespace ecal {
0019   namespace multifit {
0020
0021     template <typename MatrixType>
0022     __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance,
0023                                                       MatrixType& inverse_cov,
0024                                                       SampleVector const& amplitudes) {
0025       constexpr int nsamples = SampleVector::RowsAtCompileTime;
0026       constexpr int npulses = BXVectorType::RowsAtCompileTime;
0027
0028       CMS_UNROLL_LOOP
0029       for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {
0030         auto const amplitude = amplitudes.coeff(ipulse);
0031         if (amplitude == 0)
0032           continue;
0033
0034         // FIXME: ipulse - 5 -> ipulse - firstOffset
0035         int bx = ipulse - 5;
0036         int first_sample_t = std::max(0, bx + 3);
0037         int offset = -3 - bx;
0038
0039         auto const value_sq = amplitude * amplitude;
0040
0041         for (int col = first_sample_t; col < nsamples; col++) {
0042           for (int row = col; row < nsamples; row++) {
0043             inverse_cov(row, col) += value_sq * __ldg(&pulse_covariance.covval[row + offset][col + offset]);
0044           }
0045         }
0046       }
0047
0048       return true;
0049     }
0050
0051     ///
0052     /// launch ctx parameters are (nchannels / block, blocks)
0053     /// TODO: trivial impl for now, there must be a way to improve
0054     ///
0055     /// Conventions:
0056     ///   - amplitudes -> solution vector, what we are fitting for
0057     ///   - samples -> raw detector responses
0058     ///   - passive constraint - satisfied constraint
0059     ///   - active constraint - unsatisfied (yet) constraint
0060     ///
0061     __global__ void kernel_minimize(uint32_t const* dids_eb,
0062                                     uint32_t const* dids_ee,
0063                                     SampleMatrix const* __restrict__ noisecov,
0064                                     EcalPulseCovariance const* __restrict__ pulse_covariance,
0065                                     BXVectorType* bxs,
0066                                     SampleVector const* __restrict__ samples,
0067                                     SampleVector* amplitudesEB,
0068                                     SampleVector* amplitudesEE,
0069                                     PulseMatrixType const* __restrict__ pulse_matrix,
0070                                     ::ecal::reco::StorageScalarType* chi2sEB,
0071                                     ::ecal::reco::StorageScalarType* chi2sEE,
0072                                     ::ecal::reco::StorageScalarType* energiesEB,
0073                                     ::ecal::reco::StorageScalarType* energiesEE,
0074                                     char* acState,
0075                                     int nchannels,
0076                                     int max_iterations,
0077                                     uint32_t const offsetForHashes,
0078                                     uint32_t const offsetForInputs) {
0079       // FIXME: ecal has 10 samples and 10 pulses....
0080       // but this needs to be properly treated and renamed everywhere
0081       constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
0082       constexpr auto NPULSES = SampleMatrix::ColsAtCompileTime;
0083       static_assert(NSAMPLES == NPULSES);
0084
0085       using DataType = SampleVector::Scalar;
0086
0087       extern __shared__ char shrmem[];
0088       DataType* shrMatrixLForFnnlsStorage =
0089           reinterpret_cast<DataType*>(shrmem) + calo::multifit::MapSymM<DataType, NPULSES>::total * threadIdx.x;
0090       DataType* shrAtAStorage = reinterpret_cast<DataType*>(shrmem) +
0091                                 calo::multifit::MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
0092
0093       // channel
0094       int idx = threadIdx.x + blockDim.x * blockIdx.x;
0095
0096 // ref the right ptr
0097 #define ARRANGE(var) auto* var = idx >= offsetForInputs ? var##EE : var##EB
0098       ARRANGE(amplitudes);
0099       ARRANGE(chi2s);
0100       ARRANGE(energies);
0101 #undef ARRANGE
0102
0103       if (idx < nchannels) {
0104         if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
0105           return;
0106
0107         // get the hash
0108         int const inputCh = idx >= offsetForInputs ? idx - offsetForInputs : idx;
0109         auto const* dids = idx >= offsetForInputs ? dids_ee : dids_eb;
0110         auto const did = DetId{dids[inputCh]};
0111         auto const isBarrel = did.subdetId() == EcalBarrel;
0112         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0113                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0114
0115         // inits
0116         int iter = 0;
0117         int npassive = 0;
0118
0119         calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
0120         CMS_UNROLL_LOOP
0121         for (int i = 0; i < NPULSES; ++i)
0122           pulseOffsets(i) = i;
0123
0124         calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
0125         CMS_UNROLL_LOOP
0126         for (int counter = 0; counter < NPULSES; counter++)
0127           resultAmplitudes(counter) = 0;
0128
0129         // inits
0130         //SampleDecompLLT covariance_decomposition;
0131         //SampleMatrix inverse_cov;
0132         //        SampleVector::Scalar chi2 = 0, chi2_now = 0;
0133         float chi2 = 0, chi2_now = 0;
0134
0135         // loop until ocnverge
0136         while (true) {
0137           if (iter >= max_iterations)
0138             break;
0139
0140           //inverse_cov = noisecov[idx];
0141           //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
0142           DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
0143           calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
0144           int counter = 0;
0145           CMS_UNROLL_LOOP
0146           for (int col = 0; col < NSAMPLES; col++) {
0147             CMS_UNROLL_LOOP
0148             for (int row = col; row < NSAMPLES; row++)
0149               covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col));
0150           }
0151           update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
0152
0153           // compute actual covariance decomposition
0154           //covariance_decomposition.compute(inverse_cov);
0155           //auto const& matrixL = covariance_decomposition.matrixL();
0156           DataType matrixLStorage[calo::multifit::MapSymM<DataType, NSAMPLES>::total];
0157           calo::multifit::MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
0158           calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix);
0159
0160           // L * A = P
0161           calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
0162           calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
0163
0164           // L b = s
0165           float reg_b[NSAMPLES];
0166           calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL);
0167
0168           // FIXME: shared mem
0169           //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
0170           calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
0171           //SampleMatrix AtA;
0172           SampleVector Atb;
0173           CMS_UNROLL_LOOP
0174           for (int icol = 0; icol < NPULSES; icol++) {
0175             float reg_ai[NSAMPLES];
0176
0177             // load column icol
0178             CMS_UNROLL_LOOP
0179             for (int counter = 0; counter < NSAMPLES; counter++)
0180               reg_ai[counter] = A(counter, icol);
0181
0182             // compute diagoanl
0183             float sum = 0.f;
0184             CMS_UNROLL_LOOP
0185             for (int counter = 0; counter < NSAMPLES; counter++)
0186               sum += reg_ai[counter] * reg_ai[counter];
0187
0188             // store
0189             AtA(icol, icol) = sum;
0190
0191             // go thru the other columns
0192             CMS_UNROLL_LOOP
0193             for (int j = icol + 1; j < NPULSES; j++) {
0194               // load column j
0195               float reg_aj[NSAMPLES];
0196               CMS_UNROLL_LOOP
0197               for (int counter = 0; counter < NSAMPLES; counter++)
0198                 reg_aj[counter] = A(counter, j);
0199
0200               // accum
0201               float sum = 0.f;
0202               CMS_UNROLL_LOOP
0203               for (int counter = 0; counter < NSAMPLES; counter++)
0204                 sum += reg_aj[counter] * reg_ai[counter];
0205
0206               // store
0207               //AtA(icol, j) = sum;
0208               AtA(j, icol) = sum;
0209             }
0210
0211             // Atb accum
0212             float sum_atb = 0.f;
0213             CMS_UNROLL_LOOP
0214             for (int counter = 0; counter < NSAMPLES; counter++)
0215               sum_atb += reg_ai[counter] * reg_b[counter];
0216
0217             // store atb
0218             Atb(icol) = sum_atb;
0219           }
0220
0221           // FIXME: shared mem
0222           //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
0223           calo::multifit::MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
0224
0225           calo::multifit::fnnls(AtA,
0226                                 Atb,
0227                                 //amplitudes[idx],
0228                                 resultAmplitudes,
0229                                 npassive,
0230                                 pulseOffsets,
0231                                 matrixLForFnnls,
0232                                 1e-11,
0233                                 500,
0234                                 16,
0235                                 2);
0236
0237           calo::multifit::calculateChiSq(matrixL, pulse_matrix[idx], resultAmplitudes, samples[idx], chi2_now);
0238
0239           auto deltachi2 = chi2_now - chi2;
0240           chi2 = chi2_now;
0241
0242           if (std::abs(deltachi2) < 1e-3)
0243             break;
0244
0245           //---- AM: TEST
0246           //---- it was 3 lines above, now here as in the CPU version
0247           ++iter;
0248         }
0249
0250         // store to global output values
0251         // FIXME: amplitudes are used in global directly
0252         chi2s[inputCh] = chi2;
0253         energies[inputCh] = resultAmplitudes(5);
0254
0255         CMS_UNROLL_LOOP
0256         for (int counter = 0; counter < NPULSES; counter++)
0257           amplitudes[inputCh](counter) = resultAmplitudes(counter);
0258       }
0259     }
0260
0261     namespace v1 {
0262
0263       void minimization_procedure(EventInputDataGPU const& eventInputGPU,
0264                                   EventOutputDataGPU& eventOutputGPU,
0265                                   EventDataForScratchGPU& scratch,
0266                                   ConditionsProducts const& conditions,
0267                                   ConfigurationParameters const& configParameters,
0268                                   cudaStream_t cudaStream) {
0269         using DataType = SampleVector::Scalar;
0270         unsigned int totalChannels = eventInputGPU.ebDigis.size + eventInputGPU.eeDigis.size;
0271         //    unsigned int threads_min = conf.threads.x;
0272         // TODO: configure from python
0273         unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
0274         unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
0275         uint32_t const offsetForHashes = conditions.offsetForHashes;
0276         uint32_t const offsetForInputs = eventInputGPU.ebDigis.size;
0277         auto const nbytesShared = 2 * threads_min *
0278                                   calo::multifit::MapSymM<DataType, SampleVector::RowsAtCompileTime>::total *
0279                                   sizeof(DataType);
0280         kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
0281             eventInputGPU.ebDigis.ids.get(),
0282             eventInputGPU.eeDigis.ids.get(),
0283             (SampleMatrix*)scratch.noisecov.get(),
0284             conditions.pulseCovariances.values,
0285             (BXVectorType*)scratch.activeBXs.get(),
0286             (SampleVector*)scratch.samples.get(),
0287             (SampleVector*)eventOutputGPU.recHitsEB.amplitudesAll.get(),
0288             (SampleVector*)eventOutputGPU.recHitsEE.amplitudesAll.get(),
0289             (PulseMatrixType*)scratch.pulse_matrix.get(),
0290             eventOutputGPU.recHitsEB.chi2.get(),
0291             eventOutputGPU.recHitsEE.chi2.get(),
0292             eventOutputGPU.recHitsEB.amplitude.get(),
0293             eventOutputGPU.recHitsEE.amplitude.get(),
0294             scratch.acState.get(),
0295             totalChannels,
0296             50,
0297             offsetForHashes,
0298             offsetForInputs);
0299         cudaCheck(cudaGetLastError());
0300       }
0301
0302     }  // namespace v1
0303
0304   }  // namespace multifit
0305 }  // namespace ecal