Warning, /RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu is written in an unsupported language. File is not indexed.
0001 #include <cstdlib>
0002 #include <limits>
0003
0004 #include <cuda.h>
0005
0006 #include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
0007 #include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
0008 #include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
0009 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0010 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0011 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0012 #include "DataFormats/Math/interface/approx_exp.h"
0013 #include "DataFormats/Math/interface/approx_log.h"
0014 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0015
0016 #include "AmplitudeComputationCommonKernels.h"
0017 #include "KernelHelpers.h"
0018
0019 namespace ecal {
0020 namespace multifit {
0021
0022 ///
0023 /// assume kernel launch configuration is
0024 /// (MAXSAMPLES * nchannels, blocks)
0025 ///
0026 __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
0027 uint16_t const* digis_in_eb,
0028 uint32_t const* dids_eb,
0029 uint16_t const* digis_in_ee,
0030 uint32_t const* dids_ee,
0031 SampleVector* amplitudes,
0032 SampleVector* amplitudesForMinimizationEB,
0033 SampleVector* amplitudesForMinimizationEE,
0034 SampleGainVector* gainsNoise,
0035 float const* mean_x1,
0036 float const* mean_x12,
0037 float const* rms_x12,
0038 float const* mean_x6,
0039 float const* gain6Over1,
0040 float const* gain12Over6,
0041 bool* hasSwitchToGain6,
0042 bool* hasSwitchToGain1,
0043 bool* isSaturated,
0044 ::ecal::reco::StorageScalarType* energiesEB,
0045 ::ecal::reco::StorageScalarType* energiesEE,
0046 ::ecal::reco::StorageScalarType* chi2EB,
0047 ::ecal::reco::StorageScalarType* chi2EE,
0048 ::ecal::reco::StorageScalarType* g_pedestalEB,
0049 ::ecal::reco::StorageScalarType* g_pedestalEE,
0050 uint32_t* dids_outEB,
0051 uint32_t* dids_outEE,
0052 uint32_t* flagsEB,
0053 uint32_t* flagsEE,
0054 char* acState,
0055 BXVectorType* bxs,
0056 uint32_t const offsetForHashes,
0057 uint32_t const offsetForInputs,
0058 bool const gainSwitchUseMaxSampleEB,
0059 bool const gainSwitchUseMaxSampleEE,
0060 int const nchannels) {
0061 constexpr bool dynamicPedestal = false; //---- default to false, ok
0062 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0063 constexpr int sample_max = 5;
0064 constexpr int full_pulse_max = 9;
0065 int const tx = threadIdx.x + blockIdx.x * blockDim.x;
0066 int const nchannels_per_block = blockDim.x / nsamples;
0067 int const ch = tx / nsamples;
0068 // for accessing input arrays
0069 int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0070 int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx;
0071 // eb is first and then ee
0072 auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb;
0073 auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0074 int const sample = threadIdx.x % nsamples;
0075
0076 // need to ref the right ptr
0077 // macro is for clarity and safety
0078 #define ARRANGE(var) auto* var = ch >= offsetForInputs ? var##EE : var##EB
0079 ARRANGE(amplitudesForMinimization);
0080 ARRANGE(energies);
0081 ARRANGE(chi2);
0082 ARRANGE(g_pedestal);
0083 ARRANGE(dids_out);
0084 ARRANGE(flags);
0085 #undef ARRANGE
0086
0087 if (ch < nchannels) {
0088 // array of 10 x channels per block
0089 // TODO: any other way of doing simple reduction
0090 // assume bool is 1 byte, should be quite safe
0091 extern __shared__ char shared_mem[];
0092 bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
0093 bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples;
0094 bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples;
0095 bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples;
0096 bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples;
0097 char* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples;
0098
0099 //
0100 // indices
0101 //
0102 auto const did = DetId{dids[inputCh]};
0103 auto const isBarrel = did.subdetId() == EcalBarrel;
0104 // TODO offset for ee, 0 for eb
0105 auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0106 : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0107
0108 //
0109 // pulse shape template
0110
0111 // will be used in the future for setting state
0112 auto const rmsForChecking = rms_x12[hashedId];
0113
0114 //
0115 // amplitudes
0116 //
0117 int const adc = ecalMGPA::adc(digis_in[inputTx]);
0118 int const gainId = ecalMGPA::gainId(digis_in[inputTx]);
0119 SampleVector::Scalar amplitude = 0.;
0120 SampleVector::Scalar pedestal = 0.;
0121 SampleVector::Scalar gainratio = 0.;
0122
0123 // store into shared mem for initialization
0124 shr_hasSwitchToGain6[threadIdx.x] = gainId == EcalMgpaBitwiseGain6;
0125 shr_hasSwitchToGain1[threadIdx.x] = gainId == EcalMgpaBitwiseGain1;
0126 shr_hasSwitchToGain0_tmp[threadIdx.x] = gainId == EcalMgpaBitwiseGain0;
0127 shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x];
0128 shr_counts[threadIdx.x] = 0;
0129 __syncthreads();
0130
0131 // non-divergent branch (except for the last 4 threads)
0132 if (threadIdx.x <= blockDim.x - 5) {
0133 CMS_UNROLL_LOOP
0134 for (int i = 0; i < 5; i++)
0135 shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i];
0136 }
0137 shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5;
0138
0139 //
0140 // unrolled reductions
0141 //
0142 if (sample < 5) {
0143 shr_hasSwitchToGain6[threadIdx.x] =
0144 shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5];
0145 shr_hasSwitchToGain1[threadIdx.x] =
0146 shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5];
0147
0148 // duplication of hasSwitchToGain0 in order not to
0149 // introduce another syncthreads
0150 shr_hasSwitchToGain0_tmp[threadIdx.x] =
0151 shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5];
0152 }
0153 __syncthreads();
0154
0155 if (sample < 2) {
0156 // note, both threads per channel take value [3] twice to avoid another if
0157 shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] ||
0158 shr_hasSwitchToGain6[threadIdx.x + 2] ||
0159 shr_hasSwitchToGain6[threadIdx.x + 3];
0160 shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] ||
0161 shr_hasSwitchToGain1[threadIdx.x + 2] ||
0162 shr_hasSwitchToGain1[threadIdx.x + 3];
0163
0164 shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] ||
0165 shr_hasSwitchToGain0_tmp[threadIdx.x + 2] ||
0166 shr_hasSwitchToGain0_tmp[threadIdx.x + 3];
0167
0168 // sample < 2 -> first 2 threads of each channel will be used here
0169 // => 0 -> will compare 3 and 4 and put into 0
0170 // => 1 -> will compare 4 and 5 and put into 1
0171 shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4];
0172 }
0173 __syncthreads();
0174
0175 bool check_hasSwitchToGain0 = false;
0176
0177 if (sample == 0) {
0178 shr_hasSwitchToGain6[threadIdx.x] =
0179 shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1];
0180 shr_hasSwitchToGain1[threadIdx.x] =
0181 shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1];
0182 shr_hasSwitchToGain0_tmp[threadIdx.x] =
0183 shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1];
0184
0185 hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
0186 hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
0187
0188 // set only for the threadIdx.x corresponding to sample==0
0189 check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
0190
0191 shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1];
0192 isSaturated[ch] = shr_isSaturated[threadIdx.x + 3];
0193 }
0194
0195 // TODO: w/o this sync, there is a race
0196 // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample
0197 // check if we can remove it
0198 __syncthreads();
0199
0200 // TODO: divergent branch
0201 if (gainId == 0 || gainId == 3) {
0202 pedestal = mean_x1[hashedId];
0203 gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
0204 gainsNoise[ch](sample) = 2;
0205 } else if (gainId == 1) {
0206 pedestal = mean_x12[hashedId];
0207 gainratio = 1.;
0208 gainsNoise[ch](sample) = 0;
0209 } else if (gainId == 2) {
0210 pedestal = mean_x6[hashedId];
0211 gainratio = gain12Over6[hashedId];
0212 gainsNoise[ch](sample) = 1;
0213 }
0214
0215 // TODO: compile time constant -> branch should be non-divergent
0216 if (dynamicPedestal)
0217 amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
0218 else
0219 amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
0220 amplitudes[ch][sample] = amplitude;
0221
0222 #ifdef ECAL_RECO_CUDA_DEBUG
0223 printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio);
0224 if (adc == 0)
0225 printf("adc is zero\n");
0226 #endif
0227
0228 //
0229 // initialization
0230 //
0231 amplitudesForMinimization[inputCh](sample) = 0;
0232 bxs[ch](sample) = sample - 5;
0233
0234 // select the thread for the max sample
0235 //---> hardcoded above to be 5th sample, ok
0236 if (sample == sample_max) {
0237 //
0238 // initialization
0239 //
0240 acState[ch] = static_cast<char>(MinimizationState::NotFinished);
0241 energies[inputCh] = 0;
0242 chi2[inputCh] = 0;
0243 g_pedestal[inputCh] = 0;
0244 uint32_t flag = 0;
0245 dids_out[inputCh] = did.rawId();
0246
0247 // start of this channel in shared mem
0248 int const chStart = threadIdx.x - sample_max;
0249 // thread for the max sample in shared mem
0250 int const threadMax = threadIdx.x;
0251 auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
0252
0253 // this flag setting is applied to all of the cases
0254 if (shr_hasSwitchToGain6[chStart])
0255 flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
0256 if (shr_hasSwitchToGain1[chStart])
0257 flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
0258
0259 // this corresponds to cpu branching on lastSampleBeforeSaturation
0260 // likely false
0261 if (check_hasSwitchToGain0) {
0262 // assign for the case some sample having gainId == 0
0263 //energies[inputCh] = amplitudes[ch][sample_max];
0264 energies[inputCh] = amplitude;
0265
0266 // check if samples before sample_max have true
0267 bool saturated_before_max = false;
0268 CMS_UNROLL_LOOP
0269 for (char ii = 0; ii < 5; ii++)
0270 saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
0271
0272 // if saturation is in the max sample and not in the first 5
0273 if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
0274 energies[inputCh] = 49140; // 4095 * 12 (maximum ADC range * MultiGainPreAmplifier (MGPA) gain)
0275 // This is the actual maximum range that is set when we saturate.
0276 //---- AM FIXME : no pedestal subtraction???
0277 //It should be "(4095. - pedestal) * gainratio"
0278
0279 // set state flag to terminate further processing of this channel
0280 acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0281 flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
0282 flags[inputCh] = flag;
0283 return;
0284 }
0285
0286 // according to cpu version
0287 // auto max_amplitude = amplitudes[ch][sample_max];
0288 auto const max_amplitude = amplitude;
0289 // according to cpu version
0290 auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7];
0291 // note, no syncing as the same thread will be accessing here
0292 bool hasGainSwitch =
0293 shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
0294
0295 // pedestal is final unconditionally
0296 g_pedestal[inputCh] = pedestal;
0297 if (hasGainSwitch && gainSwitchUseMaxSample) {
0298 // thread for sample=0 will access the right guys
0299 energies[inputCh] = max_amplitude / shape_value;
0300 acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0301 flags[inputCh] = flag;
0302 return;
0303 }
0304
0305 // this happens cause sometimes rms_x12 is 0...
0306 // needs to be checkec why this is the case
0307 // general case here is that noisecov is a Zero matrix
0308 if (rmsForChecking == 0) {
0309 acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0310 flags[inputCh] = flag;
0311 return;
0312 }
0313
0314 // for the case when no shortcuts were taken
0315 flags[inputCh] = flag;
0316 }
0317 }
0318 }
0319
0320 ///
0321 /// assume kernel launch configuration is
0322 /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
0323 ///
0324 __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
0325 uint32_t const* dids_eb,
0326 uint32_t const* dids_ee,
0327 float const* rms_x12,
0328 float const* rms_x6,
0329 float const* rms_x1,
0330 float const* gain12Over6,
0331 float const* gain6Over1,
0332 double const* G12SamplesCorrelationEB,
0333 double const* G6SamplesCorrelationEB,
0334 double const* G1SamplesCorrelationEB,
0335 double const* G12SamplesCorrelationEE,
0336 double const* G6SamplesCorrelationEE,
0337 double const* G1SamplesCorrelationEE,
0338 SampleMatrix* noisecov,
0339 PulseMatrixType* pulse_matrix,
0340 EcalPulseShape const* pulse_shape,
0341 bool const* hasSwitchToGain6,
0342 bool const* hasSwitchToGain1,
0343 bool const* isSaturated,
0344 uint32_t const offsetForHashes,
0345 uint32_t const offsetForInputs) {
0346 int const ch = blockIdx.x;
0347 int const tx = threadIdx.x;
0348 int const ty = threadIdx.y;
0349 constexpr float addPedestalUncertainty = 0.f;
0350 constexpr bool dynamicPedestal = false;
0351 constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true
0352
0353 // to access input arrays (ids and digis only)
0354 int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0355 auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0356
0357 bool tmp0 = hasSwitchToGain6[ch];
0358 bool tmp1 = hasSwitchToGain1[ch];
0359 auto const did = DetId{dids[inputCh]};
0360 auto const isBarrel = did.subdetId() == EcalBarrel;
0361 auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0362 : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0363 auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
0364 auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
0365 auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
0366 bool tmp2 = isSaturated[ch];
0367 bool hasGainSwitch = tmp0 || tmp1 || tmp2;
0368 auto const vidx = std::abs(ty - tx);
0369
0370 // non-divergent branch for all threads per block
0371 if (hasGainSwitch) {
0372 // TODO: did not include simplified noise model
0373 float noise_value = 0;
0374
0375 // non-divergent branch - all threads per block
0376 // TODO: all of these constants indicate that
0377 // that these parts could be splitted into completely different
0378 // kernels and run one of them only depending on the config
0379 if (simplifiedNoiseModelForGainSwitch) {
0380 int isample_max = 5; // according to cpu defs
0381 int gainidx = gainNoise[ch][isample_max];
0382
0383 // non-divergent branches
0384 if (gainidx == 0)
0385 noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx];
0386 if (gainidx == 1)
0387 noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
0388 G6SamplesCorrelation[vidx];
0389 if (gainidx == 2)
0390 noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] *
0391 rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx];
0392 if (!dynamicPedestal && addPedestalUncertainty > 0.f)
0393 noise_value += addPedestalUncertainty * addPedestalUncertainty;
0394 } else {
0395 int gainidx = 0;
0396 char mask = gainidx;
0397 int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
0398 // NB: gainratio is 1, that is why it does not appear in the formula
0399 noise_value += rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
0400 // non-divergent branch
0401 if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0402 noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal; // gainratio is 1
0403 }
0404
0405 //
0406 gainidx = 1;
0407 mask = gainidx;
0408 pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
0409 noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
0410 pedestal * G6SamplesCorrelation[vidx];
0411 // non-divergent branch
0412 if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0413 noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty *
0414 addPedestalUncertainty * pedestal;
0415 }
0416
0417 //
0418 gainidx = 2;
0419 mask = gainidx;
0420 pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
0421 float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
0422 noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx];
0423 // non-divergent branch
0424 if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0425 noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal;
0426 }
0427 }
0428
0429 noisecov[ch](ty, tx) = noise_value;
0430 } else {
0431 auto rms = rms_x12[hashedId];
0432 float noise_value = rms * rms * G12SamplesCorrelation[vidx];
0433 if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0434 //---- add fully correlated component to noise covariance to inflate pedestal uncertainty
0435 noise_value += addPedestalUncertainty * addPedestalUncertainty;
0436 }
0437 noisecov[ch](ty, tx) = noise_value;
0438 }
0439
0440 // pulse matrix
0441 int const posToAccess = 9 - tx + ty; // see cpu for reference
0442 float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0;
0443 pulse_matrix[ch](ty, tx) = value;
0444 }
0445
0446 __global__ void kernel_permute_results(SampleVector* amplitudes,
0447 BXVectorType const* activeBXs,
0448 ::ecal::reco::StorageScalarType* energies,
0449 char const* acState,
0450 int const nchannels) {
0451 // constants
0452 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0453
0454 // indices
0455 int const tx = threadIdx.x + blockIdx.x * blockDim.x;
0456 int const ch = tx / nsamples;
0457 int const sampleidx = tx % nsamples; // this is to address activeBXs
0458
0459 if (ch >= nchannels)
0460 return;
0461
0462 // channels that have amplitude precomputed do not need results to be permuted
0463 auto const state = static_cast<MinimizationState>(acState[ch]);
0464 if (state == MinimizationState::Precomputed)
0465 return;
0466
0467 // configure shared memory and cp into it
0468 extern __shared__ char smem[];
0469 SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(smem);
0470 values[threadIdx.x] = amplitudes[ch](sampleidx);
0471 __syncthreads();
0472
0473 // get the sample for this bx
0474 auto const sample = static_cast<int>(activeBXs[ch](sampleidx)) + 5;
0475
0476 // store back to global
0477 amplitudes[ch](sample) = values[threadIdx.x];
0478
0479 // store sample 5 separately
0480 // only for the case when minimization was performed
0481 // not for cases with precomputed amplitudes
0482 if (sample == 5)
0483 energies[ch] = values[threadIdx.x];
0484 }
0485
0486 } // namespace multifit
0487 } // namespace ecal