Warning, /RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu is written in an unsupported language. File is not indexed.
0001 #include <cmath>
0002 #include <limits>
0003
0004 #include <cuda.h>
0005
0006 #include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
0007 #include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
0008 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0009 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
0010 #include "DataFormats/Math/interface/approx_exp.h"
0011 #include "DataFormats/Math/interface/approx_log.h"
0012 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0013
0014 #include "AmplitudeComputationCommonKernels.h"
0015 #include "AmplitudeComputationKernels.h"
0016 #include "KernelHelpers.h"
0017
0018 namespace ecal {
0019 namespace multifit {
0020
0021 template <typename MatrixType>
0022 __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance,
0023 MatrixType& inverse_cov,
0024 SampleVector const& amplitudes) {
0025 constexpr int nsamples = SampleVector::RowsAtCompileTime;
0026 constexpr int npulses = BXVectorType::RowsAtCompileTime;
0027
0028 CMS_UNROLL_LOOP
0029 for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {
0030 auto const amplitude = amplitudes.coeff(ipulse);
0031 if (amplitude == 0)
0032 continue;
0033
0034 // FIXME: ipulse - 5 -> ipulse - firstOffset
0035 int bx = ipulse - 5;
0036 int first_sample_t = std::max(0, bx + 3);
0037 int offset = -3 - bx;
0038
0039 auto const value_sq = amplitude * amplitude;
0040
0041 for (int col = first_sample_t; col < nsamples; col++) {
0042 for (int row = col; row < nsamples; row++) {
0043 inverse_cov(row, col) += value_sq * __ldg(&pulse_covariance.covval[row + offset][col + offset]);
0044 }
0045 }
0046 }
0047
0048 return true;
0049 }
0050
0051 ///
0052 /// launch ctx parameters are (nchannels / block, blocks)
0053 /// TODO: trivial impl for now, there must be a way to improve
0054 ///
0055 /// Conventions:
0056 /// - amplitudes -> solution vector, what we are fitting for
0057 /// - samples -> raw detector responses
0058 /// - passive constraint - satisfied constraint
0059 /// - active constraint - unsatisfied (yet) constraint
0060 ///
0061 __global__ void kernel_minimize(uint32_t const* dids_eb,
0062 uint32_t const* dids_ee,
0063 SampleMatrix const* __restrict__ noisecov,
0064 EcalPulseCovariance const* __restrict__ pulse_covariance,
0065 BXVectorType* bxs,
0066 SampleVector const* __restrict__ samples,
0067 SampleVector* amplitudesEB,
0068 SampleVector* amplitudesEE,
0069 PulseMatrixType const* __restrict__ pulse_matrix,
0070 ::ecal::reco::StorageScalarType* chi2sEB,
0071 ::ecal::reco::StorageScalarType* chi2sEE,
0072 ::ecal::reco::StorageScalarType* energiesEB,
0073 ::ecal::reco::StorageScalarType* energiesEE,
0074 char* acState,
0075 int nchannels,
0076 int max_iterations,
0077 uint32_t const offsetForHashes,
0078 uint32_t const offsetForInputs) {
0079 // FIXME: ecal has 10 samples and 10 pulses....
0080 // but this needs to be properly treated and renamed everywhere
0081 constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
0082 constexpr auto NPULSES = SampleMatrix::ColsAtCompileTime;
0083 static_assert(NSAMPLES == NPULSES);
0084
0085 using DataType = SampleVector::Scalar;
0086
0087 extern __shared__ char shrmem[];
0088 DataType* shrMatrixLForFnnlsStorage =
0089 reinterpret_cast<DataType*>(shrmem) + calo::multifit::MapSymM<DataType, NPULSES>::total * threadIdx.x;
0090 DataType* shrAtAStorage = reinterpret_cast<DataType*>(shrmem) +
0091 calo::multifit::MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
0092
0093 // channel
0094 int idx = threadIdx.x + blockDim.x * blockIdx.x;
0095
0096 // ref the right ptr
0097 #define ARRANGE(var) auto* var = idx >= offsetForInputs ? var##EE : var##EB
0098 ARRANGE(amplitudes);
0099 ARRANGE(chi2s);
0100 ARRANGE(energies);
0101 #undef ARRANGE
0102
0103 if (idx < nchannels) {
0104 if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
0105 return;
0106
0107 // get the hash
0108 int const inputCh = idx >= offsetForInputs ? idx - offsetForInputs : idx;
0109 auto const* dids = idx >= offsetForInputs ? dids_ee : dids_eb;
0110 auto const did = DetId{dids[inputCh]};
0111 auto const isBarrel = did.subdetId() == EcalBarrel;
0112 auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0113 : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0114
0115 // inits
0116 int iter = 0;
0117 int npassive = 0;
0118
0119 calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
0120 CMS_UNROLL_LOOP
0121 for (int i = 0; i < NPULSES; ++i)
0122 pulseOffsets(i) = i;
0123
0124 calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
0125 CMS_UNROLL_LOOP
0126 for (int counter = 0; counter < NPULSES; counter++)
0127 resultAmplitudes(counter) = 0;
0128
0129 // inits
0130 //SampleDecompLLT covariance_decomposition;
0131 //SampleMatrix inverse_cov;
0132 // SampleVector::Scalar chi2 = 0, chi2_now = 0;
0133 float chi2 = 0, chi2_now = 0;
0134
0135 // loop until ocnverge
0136 while (true) {
0137 if (iter >= max_iterations)
0138 break;
0139
0140 //inverse_cov = noisecov[idx];
0141 //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
0142 DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
0143 calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
0144 int counter = 0;
0145 CMS_UNROLL_LOOP
0146 for (int col = 0; col < NSAMPLES; col++) {
0147 CMS_UNROLL_LOOP
0148 for (int row = col; row < NSAMPLES; row++)
0149 covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col));
0150 }
0151 update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
0152
0153 // compute actual covariance decomposition
0154 //covariance_decomposition.compute(inverse_cov);
0155 //auto const& matrixL = covariance_decomposition.matrixL();
0156 DataType matrixLStorage[calo::multifit::MapSymM<DataType, NSAMPLES>::total];
0157 calo::multifit::MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
0158 calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix);
0159
0160 // L * A = P
0161 calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
0162 calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
0163
0164 // L b = s
0165 float reg_b[NSAMPLES];
0166 calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL);
0167
0168 // FIXME: shared mem
0169 //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
0170 calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
0171 //SampleMatrix AtA;
0172 SampleVector Atb;
0173 CMS_UNROLL_LOOP
0174 for (int icol = 0; icol < NPULSES; icol++) {
0175 float reg_ai[NSAMPLES];
0176
0177 // load column icol
0178 CMS_UNROLL_LOOP
0179 for (int counter = 0; counter < NSAMPLES; counter++)
0180 reg_ai[counter] = A(counter, icol);
0181
0182 // compute diagoanl
0183 float sum = 0.f;
0184 CMS_UNROLL_LOOP
0185 for (int counter = 0; counter < NSAMPLES; counter++)
0186 sum += reg_ai[counter] * reg_ai[counter];
0187
0188 // store
0189 AtA(icol, icol) = sum;
0190
0191 // go thru the other columns
0192 CMS_UNROLL_LOOP
0193 for (int j = icol + 1; j < NPULSES; j++) {
0194 // load column j
0195 float reg_aj[NSAMPLES];
0196 CMS_UNROLL_LOOP
0197 for (int counter = 0; counter < NSAMPLES; counter++)
0198 reg_aj[counter] = A(counter, j);
0199
0200 // accum
0201 float sum = 0.f;
0202 CMS_UNROLL_LOOP
0203 for (int counter = 0; counter < NSAMPLES; counter++)
0204 sum += reg_aj[counter] * reg_ai[counter];
0205
0206 // store
0207 //AtA(icol, j) = sum;
0208 AtA(j, icol) = sum;
0209 }
0210
0211 // Atb accum
0212 float sum_atb = 0.f;
0213 CMS_UNROLL_LOOP
0214 for (int counter = 0; counter < NSAMPLES; counter++)
0215 sum_atb += reg_ai[counter] * reg_b[counter];
0216
0217 // store atb
0218 Atb(icol) = sum_atb;
0219 }
0220
0221 // FIXME: shared mem
0222 //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
0223 calo::multifit::MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
0224
0225 calo::multifit::fnnls(AtA,
0226 Atb,
0227 //amplitudes[idx],
0228 resultAmplitudes,
0229 npassive,
0230 pulseOffsets,
0231 matrixLForFnnls,
0232 1e-11,
0233 500,
0234 16,
0235 2);
0236
0237 calo::multifit::calculateChiSq(matrixL, pulse_matrix[idx], resultAmplitudes, samples[idx], chi2_now);
0238
0239 auto deltachi2 = chi2_now - chi2;
0240 chi2 = chi2_now;
0241
0242 if (std::abs(deltachi2) < 1e-3)
0243 break;
0244
0245 //---- AM: TEST
0246 //---- it was 3 lines above, now here as in the CPU version
0247 ++iter;
0248 }
0249
0250 // store to global output values
0251 // FIXME: amplitudes are used in global directly
0252 chi2s[inputCh] = chi2;
0253 energies[inputCh] = resultAmplitudes(5);
0254
0255 CMS_UNROLL_LOOP
0256 for (int counter = 0; counter < NPULSES; counter++)
0257 amplitudes[inputCh](counter) = resultAmplitudes(counter);
0258 }
0259 }
0260
0261 namespace v1 {
0262
0263 void minimization_procedure(EventInputDataGPU const& eventInputGPU,
0264 EventOutputDataGPU& eventOutputGPU,
0265 EventDataForScratchGPU& scratch,
0266 ConditionsProducts const& conditions,
0267 ConfigurationParameters const& configParameters,
0268 cudaStream_t cudaStream) {
0269 using DataType = SampleVector::Scalar;
0270 unsigned int totalChannels = eventInputGPU.ebDigis.size + eventInputGPU.eeDigis.size;
0271 // unsigned int threads_min = conf.threads.x;
0272 // TODO: configure from python
0273 unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
0274 unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
0275 uint32_t const offsetForHashes = conditions.offsetForHashes;
0276 uint32_t const offsetForInputs = eventInputGPU.ebDigis.size;
0277 auto const nbytesShared = 2 * threads_min *
0278 calo::multifit::MapSymM<DataType, SampleVector::RowsAtCompileTime>::total *
0279 sizeof(DataType);
0280 kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
0281 eventInputGPU.ebDigis.ids.get(),
0282 eventInputGPU.eeDigis.ids.get(),
0283 (SampleMatrix*)scratch.noisecov.get(),
0284 conditions.pulseCovariances.values,
0285 (BXVectorType*)scratch.activeBXs.get(),
0286 (SampleVector*)scratch.samples.get(),
0287 (SampleVector*)eventOutputGPU.recHitsEB.amplitudesAll.get(),
0288 (SampleVector*)eventOutputGPU.recHitsEE.amplitudesAll.get(),
0289 (PulseMatrixType*)scratch.pulse_matrix.get(),
0290 eventOutputGPU.recHitsEB.chi2.get(),
0291 eventOutputGPU.recHitsEE.chi2.get(),
0292 eventOutputGPU.recHitsEB.amplitude.get(),
0293 eventOutputGPU.recHitsEE.amplitude.get(),
0294 scratch.acState.get(),
0295 totalChannels,
0296 50,
0297 offsetForHashes,
0298 offsetForInputs);
0299 cudaCheck(cudaGetLastError());
0300 }
0301
0302 } // namespace v1
0303
0304 } // namespace multifit
0305 } // namespace ecal