Warning, /RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu is written in an unsupported language. File is not indexed.
0001 #include <cmath>
0002 #include <limits>
0003
0004 #include <cuda.h>
0005
0006 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0007 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0008 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0009 #include "DataFormats/Math/interface/approx_exp.h"
0010 #include "DataFormats/Math/interface/approx_log.h"
0011 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0012
0013 #include "TimeComputationKernels.h"
0014 #include "KernelHelpers.h"
0015
0016 //#define DEBUG
0017
0018 //#define ECAL_RECO_CUDA_DEBUG
0019
0020 namespace ecal {
0021 namespace multifit {
0022
0023 __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) {
0024 return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
0025 }
0026
0027 __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
0028 SampleVector::Scalar const* sample_value_errors,
0029 bool const* useless_sample_values,
0030 SampleVector::Scalar* chi2s,
0031 SampleVector::Scalar* sum0s,
0032 SampleVector::Scalar* sumAAs,
0033 const int nchannels) {
0034 using ScalarType = SampleVector::Scalar;
0035 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0036
0037 // indices
0038 int tx = threadIdx.x + blockDim.x * blockIdx.x;
0039 int ltx = threadIdx.x;
0040 int ch = tx / nsamples;
0041 int nchannels_per_block = blockDim.x / nsamples;
0042
0043 // threads that return here should not affect the __syncthreads() below since they have exitted the kernel
0044 if (ch >= nchannels)
0045 return;
0046
0047 int sample = tx % nsamples;
0048
0049 // shared mem inits
0050 extern __shared__ char sdata[];
0051 char* s_sum0 = sdata;
0052 SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(s_sum0 + nchannels_per_block * nsamples);
0053 SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples;
0054 SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples;
0055
0056 // TODO make sure no div by 0
0057 const auto inv_error =
0058 useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
0059 const auto sample_value = sample_values[tx];
0060 s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
0061 s_sum1[ltx] = inv_error;
0062 s_sumA[ltx] = sample_value * inv_error;
0063 s_sumAA[ltx] = sample_value * sample_value * inv_error;
0064 __syncthreads();
0065
0066 // 5 threads for [0, 4] samples
0067 if (sample < 5) {
0068 s_sum0[ltx] += s_sum0[ltx + 5];
0069 s_sum1[ltx] += s_sum1[ltx + 5];
0070 s_sumA[ltx] += s_sumA[ltx + 5];
0071 s_sumAA[ltx] += s_sumAA[ltx + 5];
0072 }
0073 __syncthreads();
0074
0075 if (sample < 2) {
0076 // note double counting of sample 3
0077 s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
0078 s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
0079 s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
0080 s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
0081 }
0082 __syncthreads();
0083
0084 if (sample == 0) {
0085 // note, subtract to remove the double counting of sample == 3
0086 const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
0087 const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
0088 const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
0089 const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
0090 const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
0091 chi2s[ch] = chi2;
0092 sum0s[ch] = sum0;
0093 sumAAs[ch] = sumAA;
0094
0095 #ifdef DEBUG_TC_NULLHYPOT
0096 if (ch == 0) {
0097 printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
0098 }
0099 #endif
0100 }
0101 }
0102
0103 constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
0104 constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
0105
0106 //#define DEBUG_TC_MAKERATIO
0107 //
0108 // launch ctx parameters are
0109 // 45 threads per channel, X channels per block, Y blocks
0110 // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
0111 // TODO: it might be much beter to use 32 threads per channel instead of 45
0112 // to simplify the synchronization
0113 //
0114 __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
0115 SampleVector::Scalar const* sample_value_errors,
0116 uint32_t const* dids_eb,
0117 uint32_t const* dids_ee,
0118 bool const* useless_sample_values,
0119 char const* pedestal_nums,
0120 ConfigurationParameters::type const* amplitudeFitParametersEB,
0121 ConfigurationParameters::type const* amplitudeFitParametersEE,
0122 ConfigurationParameters::type const* timeFitParametersEB,
0123 ConfigurationParameters::type const* timeFitParametersEE,
0124 SampleVector::Scalar const* sumAAsNullHypot,
0125 SampleVector::Scalar const* sum0sNullHypot,
0126 SampleVector::Scalar* tMaxAlphaBetas,
0127 SampleVector::Scalar* tMaxErrorAlphaBetas,
0128 SampleVector::Scalar* g_accTimeMax,
0129 SampleVector::Scalar* g_accTimeWgt,
0130 TimeComputationState* g_state,
0131 unsigned const int timeFitParameters_sizeEB,
0132 unsigned const int timeFitParameters_sizeEE,
0133 ConfigurationParameters::type const timeFitLimits_firstEB,
0134 ConfigurationParameters::type const timeFitLimits_firstEE,
0135 ConfigurationParameters::type const timeFitLimits_secondEB,
0136 ConfigurationParameters::type const timeFitLimits_secondEE,
0137 const int nchannels,
0138 uint32_t const offsetForInputs) {
0139 using ScalarType = SampleVector::Scalar;
0140
0141 // constants
0142 constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2
0143 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0144
0145 // indices
0146 const int gtx = threadIdx.x + blockDim.x * blockIdx.x;
0147 const int ch = gtx / nthreads_per_channel;
0148 const int ltx = threadIdx.x % nthreads_per_channel;
0149 const int ch_start = ch * nsamples;
0150 const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0151 const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0152
0153 // remove inactive threads
0154 // threads that return here should not affect the __syncthreads() below since they have exitted the kernel
0155 if (ch >= nchannels)
0156 return;
0157
0158 const auto did = DetId{dids[inputCh]};
0159 const auto isBarrel = did.subdetId() == EcalBarrel;
0160 const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
0161 const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE;
0162 const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE;
0163 const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
0164 const auto timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
0165
0166 extern __shared__ char smem[];
0167 ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
0168 ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
0169 ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
0170 ScalarType* shrTimeMax = shr_time_max + blockDim.x;
0171 ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
0172
0173 // map tx -> (sample_i, sample_j)
0174 int sample_i, sample_j = 0;
0175 if (ltx >= 0 && ltx <= 8) {
0176 sample_i = 0;
0177 sample_j = 1 + ltx;
0178 } else if (ltx <= 16) {
0179 sample_i = 1;
0180 sample_j = 2 + ltx - 9;
0181 } else if (ltx <= 23) {
0182 sample_i = 2;
0183 sample_j = 3 + ltx - 17;
0184 } else if (ltx <= 29) {
0185 sample_i = 3;
0186 sample_j = 4 + ltx - 24;
0187 } else if (ltx <= 34) {
0188 sample_i = 4;
0189 sample_j = 5 + ltx - 30;
0190 } else if (ltx <= 38) {
0191 sample_i = 5;
0192 sample_j = 6 + ltx - 35;
0193 } else if (ltx <= 41) {
0194 sample_i = 6;
0195 sample_j = 7 + ltx - 39;
0196 } else if (ltx <= 43) {
0197 sample_i = 7;
0198 sample_j = 8 + ltx - 42;
0199 } else if (ltx <= 44) {
0200 sample_i = 8;
0201 sample_j = 9;
0202 } else
0203 assert(false);
0204
0205 const auto tx_i = ch_start + sample_i;
0206 const auto tx_j = ch_start + sample_j;
0207
0208 //
0209 // note, given the way we partition the block, with 45 threads per channel
0210 // we will end up with inactive threads which need to be dragged along
0211 // through the synching point
0212 //
0213 bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
0214 sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
0215
0216 //
0217 // see cpu implementation for explanation
0218 //
0219 ScalarType chi2 = std::numeric_limits<ScalarType>::max();
0220 ScalarType tmax = 0;
0221 ScalarType tmaxerr = 0;
0222 shrTimeMax[threadIdx.x] = 0;
0223 shrTimeWgt[threadIdx.x] = 0;
0224 bool internalCondForSkipping1 = true;
0225 bool internalCondForSkipping2 = true;
0226 if (!condForUselessSamples) {
0227 const auto rtmp = sample_values[tx_i] / sample_values[tx_j];
0228 const auto invampl_i = 1.0 / sample_values[tx_i];
0229 const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
0230 const auto invampl_j = 1.0 / sample_values[tx_j];
0231 const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
0232 const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
0233 auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
0234 // TODO non-divergent branch for a block if each block has 1 channel
0235 // otherwise non-divergent for groups of 45 threads
0236 // at this point, pedestal_nums[ch] can be either 0, 1 or 2
0237 if (pedestal_nums[ch] == 2)
0238 err2 *= err2 * 0.5;
0239 const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
0240 const auto total_error = std::sqrt(err1 + err2 + err3);
0241
0242 const auto alpha = amplitudeFitParameters[0];
0243 const auto beta = amplitudeFitParameters[1];
0244 const auto alphabeta = alpha * beta;
0245 const auto invalphabeta = 1.0 / alphabeta;
0246
0247 // variables instead of a struct
0248 const auto ratio_index = sample_i;
0249 const auto ratio_step = sample_j - sample_i;
0250 const auto ratio_value = rtmp;
0251 const auto ratio_error = total_error;
0252
0253 const auto rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
0254 internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j);
0255 if (!internalCondForSkipping1) {
0256 //
0257 // precompute.
0258 // in cpu version this was done conditionally
0259 // however easier to do it here (precompute) and then just filter out
0260 // if not needed
0261 //
0262 const auto l_timeFitLimits_first = timeFitLimits_first;
0263 const auto l_timeFitLimits_second = timeFitLimits_second;
0264 if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
0265 const auto time_max_i = static_cast<ScalarType>(ratio_index);
0266 auto u = timeFitParameters[timeFitParameters_size - 1];
0267 CMS_UNROLL_LOOP
0268 for (int k = timeFitParameters_size - 2; k >= 0; k--)
0269 u = u * ratio_value + timeFitParameters[k];
0270
0271 auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
0272 for (int k = timeFitParameters_size - 2; k >= 1; k--)
0273 du = du * ratio_value + k * timeFitParameters[k];
0274
0275 const auto error2 = ratio_error * ratio_error * du * du;
0276 const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
0277 const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast<ScalarType>(0);
0278
0279 // store into shared mem
0280 // note, this name is essentially identical to the one used
0281 // below.
0282 shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
0283 shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
0284 } else {
0285 shrTimeMax[threadIdx.x] = 0;
0286 shrTimeWgt[threadIdx.x] = 0;
0287 }
0288
0289 // continue with ratios
0290 const auto stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
0291 const auto offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
0292 const auto rmin = std::max(ratio_value - ratio_error, 0.001);
0293 const auto rmax = std::min(ratio_value + ratio_error,
0294 fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta) - 0.001);
0295 const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0);
0296 const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0);
0297
0298 // set these guys
0299 tmax = 0.5 * (time1 + time2);
0300 tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
0301 #ifdef DEBUG_TC_MAKERATIO
0302 if (ch == 1 || ch == 0)
0303 printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
0304 ch,
0305 ltx,
0306 tmax,
0307 tmaxerr,
0308 time1,
0309 time2,
0310 offset,
0311 rmin,
0312 rmax);
0313 #endif
0314
0315 SampleVector::Scalar sumAf = 0;
0316 SampleVector::Scalar sumff = 0;
0317 const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
0318 auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
0319 // TODO: data dependence
0320 for (int it = itmin + 1; it < nsamples; it++) {
0321 loffset += invalphabeta;
0322 if (useless_sample_values[ch_start + it])
0323 continue;
0324 const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
0325 const auto term1 = 1.0 + loffset;
0326 const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
0327 sumAf += sample_values[ch_start + it] * (f * inverr2);
0328 sumff += f * (f * inverr2);
0329 }
0330
0331 const auto sumAA = sumAAsNullHypot[ch];
0332 const auto sum0 = sum0sNullHypot[ch];
0333 chi2 = sumAA;
0334 // TODO: sum0 can not be 0 below, need to introduce the check upfront
0335 if (sumff > 0) {
0336 chi2 = sumAA - sumAf * (sumAf / sumff);
0337 }
0338 chi2 /= sum0;
0339
0340 #ifdef DEBUG_TC_MAKERATIO
0341 if (ch == 1 || ch == 0)
0342 printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
0343 ch,
0344 ltx,
0345 sumAf,
0346 sumff,
0347 sumAA,
0348 static_cast<int>(sum0),
0349 tmax,
0350 tmaxerr,
0351 chi2);
0352 #endif
0353
0354 if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
0355 internalCondForSkipping2 = false;
0356 else
0357 chi2 = std::numeric_limits<ScalarType>::max();
0358 }
0359 }
0360
0361 // store into smem
0362 shr_chi2s[threadIdx.x] = chi2;
0363 __syncthreads();
0364
0365 // find min chi2 - quite crude for now
0366 // TODO validate/check
0367 char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0368 bool oddElements = nthreads_per_channel % 2;
0369 CMS_UNROLL_LOOP
0370 while (iter >= 1) {
0371 if (ltx < iter)
0372 // for odd ns, the last guy will just store itself
0373 // exception is for ltx == 0 and iter==1
0374 shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0375 ? shr_chi2s[threadIdx.x]
0376 : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]);
0377 __syncthreads();
0378 oddElements = iter % 2;
0379 iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0380 }
0381
0382 // filter out inactive or useless samples threads
0383 if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
0384 // min chi2, now compute weighted average of tmax measurements
0385 // see cpu version for more explanation
0386 const auto chi2min = shr_chi2s[threadIdx.x - ltx];
0387 const auto chi2Limit = chi2min + 1.0;
0388 const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0;
0389
0390 #ifdef DEBUG_TC_MAKERATIO
0391 if (ch == 1 || ch == 0)
0392 printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
0393 ch,
0394 ltx,
0395 chi2min,
0396 chi2Limit,
0397 inverseSigmaSquared);
0398 #endif
0399
0400 // store into shared mem and run reduction
0401 // TODO: check if cooperative groups would be better
0402 // TODO: check if shuffling intrinsics are better
0403 shr_time_wgt[threadIdx.x] = inverseSigmaSquared;
0404 shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared;
0405 } else {
0406 shr_time_wgt[threadIdx.x] = 0;
0407 shr_time_max[threadIdx.x] = 0;
0408 }
0409 __syncthreads();
0410
0411 // reduce to compute time_max and time_wgt
0412 iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0413 oddElements = nthreads_per_channel % 2;
0414 CMS_UNROLL_LOOP
0415 while (iter >= 1) {
0416 if (ltx < iter) {
0417 shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0418 ? shr_time_wgt[threadIdx.x]
0419 : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter];
0420 shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0421 ? shr_time_max[threadIdx.x]
0422 : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter];
0423 shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0424 ? shrTimeMax[threadIdx.x]
0425 : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter];
0426 shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0427 ? shrTimeWgt[threadIdx.x]
0428 : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter];
0429 }
0430
0431 __syncthreads();
0432 oddElements = iter % 2;
0433 iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0434 }
0435
0436 // load from shared memory the 0th guy (will contain accumulated values)
0437 // compute
0438 // store into global mem
0439 if (ltx == 0) {
0440 const auto tmp_time_max = shr_time_max[threadIdx.x];
0441 const auto tmp_time_wgt = shr_time_wgt[threadIdx.x];
0442
0443 // we are done if there number of time ratios is 0
0444 if (tmp_time_wgt == 0 && tmp_time_max == 0) {
0445 g_state[ch] = TimeComputationState::Finished;
0446 return;
0447 }
0448
0449 // no div by 0
0450 const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
0451 const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt);
0452
0453 tMaxAlphaBetas[ch] = tMaxAlphaBeta;
0454 tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
0455 g_accTimeMax[ch] = shrTimeMax[threadIdx.x];
0456 g_accTimeWgt[ch] = shrTimeWgt[threadIdx.x];
0457 g_state[ch] = TimeComputationState::NotFinished;
0458
0459 #ifdef DEBUG_TC_MAKERATIO
0460 printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
0461 printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
0462 ch,
0463 tMaxAlphaBeta,
0464 tMaxErrorAlphaBeta,
0465 shrTimeMax[threadIdx.x],
0466 shrTimeWgt[threadIdx.x]);
0467 #endif
0468 }
0469 }
0470
0471 /// launch ctx parameters are
0472 /// 10 threads per channel, N channels per block, Y blocks
0473 /// TODO: do we need to keep the state around or can be removed?!
0474 //#define DEBUG_FINDAMPLCHI2_AND_FINISH
0475 __global__ void kernel_time_compute_findamplchi2_and_finish(
0476 SampleVector::Scalar const* sample_values,
0477 SampleVector::Scalar const* sample_value_errors,
0478 uint32_t const* dids_eb,
0479 uint32_t const* dids_ee,
0480 bool const* useless_samples,
0481 SampleVector::Scalar const* g_tMaxAlphaBeta,
0482 SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
0483 SampleVector::Scalar const* g_accTimeMax,
0484 SampleVector::Scalar const* g_accTimeWgt,
0485 ConfigurationParameters::type const* amplitudeFitParametersEB,
0486 ConfigurationParameters::type const* amplitudeFitParametersEE,
0487 SampleVector::Scalar const* sumAAsNullHypot,
0488 SampleVector::Scalar const* sum0sNullHypot,
0489 SampleVector::Scalar const* chi2sNullHypot,
0490 TimeComputationState* g_state,
0491 SampleVector::Scalar* g_ampMaxAlphaBeta,
0492 SampleVector::Scalar* g_ampMaxError,
0493 SampleVector::Scalar* g_timeMax,
0494 SampleVector::Scalar* g_timeError,
0495 const int nchannels,
0496 uint32_t const offsetForInputs) {
0497 using ScalarType = SampleVector::Scalar;
0498
0499 // constants
0500 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0501
0502 // indices
0503 const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
0504 const int ch = gtx / nsamples;
0505 const int sample = threadIdx.x % nsamples;
0506 const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0507 const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0508
0509 // configure shared mem
0510 // per block, we need #threads per block * 2 * sizeof(ScalarType)
0511 // we run with N channels per block
0512 extern __shared__ char smem[];
0513 ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
0514 ScalarType* shr_sumff = shr_sumAf + blockDim.x;
0515
0516 if (ch >= nchannels)
0517 return;
0518
0519 auto state = g_state[ch];
0520 const auto did = DetId{dids[inputCh]};
0521 const auto* amplitudeFitParameters =
0522 did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
0523
0524 // TODO is that better than storing into global and launching another kernel
0525 // for the first 10 threads
0526 if (state == TimeComputationState::NotFinished) {
0527 const auto alpha = amplitudeFitParameters[0];
0528 const auto beta = amplitudeFitParameters[1];
0529 const auto alphabeta = alpha * beta;
0530 const auto invalphabeta = 1.0 / alphabeta;
0531 const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0532 const auto sample_value = sample_values[gtx];
0533 const auto sample_value_error = sample_value_errors[gtx];
0534 const auto inverr2 =
0535 useless_samples[gtx] ? static_cast<ScalarType>(0) : 1.0 / (sample_value_error * sample_value_error);
0536 const auto offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
0537 const auto term1 = 1.0 + offset;
0538 const auto f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.0);
0539 const auto sumAf = sample_value * (f * inverr2);
0540 const auto sumff = f * (f * inverr2);
0541
0542 // store into shared mem
0543 shr_sumAf[threadIdx.x] = sumAf;
0544 shr_sumff[threadIdx.x] = sumff;
0545 } else {
0546 shr_sumAf[threadIdx.x] = 0;
0547 shr_sumff[threadIdx.x] = 0;
0548 }
0549 __syncthreads();
0550
0551 // reduce
0552 // unroll completely here (but hardcoded)
0553 if (sample < 5) {
0554 shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5];
0555 shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5];
0556 }
0557 __syncthreads();
0558
0559 if (sample < 2) {
0560 // will need to subtract for ltx = 3, we double count here
0561 shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3];
0562 shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3];
0563 }
0564 __syncthreads();
0565
0566 if (sample == 0) {
0567 // exit if the state is done
0568 // note, we do not exit before all __synchtreads are finished
0569 if (state == TimeComputationState::Finished) {
0570 g_timeMax[ch] = 5;
0571 g_timeError[ch] = -999;
0572 return;
0573 }
0574
0575 // subtract to avoid double counting
0576 const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3];
0577 const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3];
0578
0579 const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
0580 const auto sumAA = sumAAsNullHypot[ch];
0581 const auto sum0 = sum0sNullHypot[ch];
0582 const auto nullChi2 = chi2sNullHypot[ch];
0583 if (sumff > 0) {
0584 const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
0585 if (chi2AlphaBeta > nullChi2) {
0586 // null hypothesis is better
0587 state = TimeComputationState::Finished;
0588 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0589 printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
0590 ch,
0591 chi2AlphaBeta,
0592 nullChi2,
0593 sumAA,
0594 sumAf,
0595 sumff,
0596 sum0);
0597 #endif
0598 }
0599
0600 // store to global
0601 g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
0602 } else {
0603 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0604 printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
0605 #endif
0606 state = TimeComputationState::Finished;
0607 }
0608
0609 // store the state to global and finish calcs
0610 g_state[ch] = state;
0611 if (state == TimeComputationState::Finished) {
0612 // store default values into global
0613 g_timeMax[ch] = 5;
0614 g_timeError[ch] = -999;
0615 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0616 printf("ch = %d finished state\n", ch);
0617 #endif
0618 return;
0619 }
0620
0621 const auto ampMaxError = g_ampMaxError[ch];
0622 const auto test_ratio = ampMaxAlphaBeta / ampMaxError;
0623 const auto accTimeMax = g_accTimeMax[ch];
0624 const auto accTimeWgt = g_accTimeWgt[ch];
0625 const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0626 const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
0627 // branch to separate large vs small pulses
0628 // see cpu version for more info
0629 if (test_ratio > 5.0 && accTimeWgt > 0) {
0630 const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
0631 const auto tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
0632
0633 if (test_ratio > 10.0) {
0634 g_timeMax[ch] = tMaxRatio;
0635 g_timeError[ch] = tMaxErrorRatio;
0636
0637 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0638 printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
0639 #endif
0640 } else {
0641 const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
0642 tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
0643 5.0;
0644 const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
0645 tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
0646 5.0;
0647 state = TimeComputationState::Finished;
0648 g_state[ch] = state;
0649 g_timeMax[ch] = timeMax;
0650 g_timeError[ch] = timeError;
0651
0652 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0653 printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
0654 #endif
0655 }
0656 } else {
0657 state = TimeComputationState::Finished;
0658 g_state[ch] = state;
0659 g_timeMax[ch] = tMaxAlphaBeta;
0660 g_timeError[ch] = tMaxErrorAlphaBeta;
0661
0662 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0663 printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
0664 #endif
0665 }
0666 }
0667 }
0668
0669 __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
0670 uint16_t const* digis_ee,
0671 SampleVector::Scalar* sample_values,
0672 SampleVector::Scalar* sample_value_errors,
0673 bool* useless_sample_values,
0674 unsigned const int sample_mask,
0675 const int nchannels,
0676 uint32_t const offsetForInputs) {
0677 using ScalarType = SampleVector::Scalar;
0678
0679 // constants
0680 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0681
0682 // indices
0683 const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
0684 const int ch = gtx / nsamples;
0685 const int sample = threadIdx.x % nsamples;
0686 const int inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
0687 const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
0688
0689 // remove thread for sample 0, oversubscribing is easier than ....
0690 if (ch >= nchannels || sample == 0)
0691 return;
0692
0693 if (!use_sample(sample_mask, sample))
0694 return;
0695
0696 const auto gainIdPrev = ecalMGPA::gainId(digis[inputGtx - 1]);
0697 const auto gainIdNext = ecalMGPA::gainId(digis[inputGtx]);
0698 if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
0699 sample_values[gtx - 1] = 0;
0700 sample_value_errors[gtx - 1] = 1e+9;
0701 useless_sample_values[gtx - 1] = true;
0702 }
0703 }
0704
0705 __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
0706 SampleVector::Scalar const* sample_value_errors,
0707 uint32_t const* dids,
0708 bool const* useless_samples,
0709 SampleVector::Scalar const* g_timeMax,
0710 SampleVector::Scalar const* amplitudeFitParametersEB,
0711 SampleVector::Scalar const* amplitudeFitParametersEE,
0712 SampleVector::Scalar* g_amplitudeMax,
0713 const int nchannels) {
0714 using ScalarType = SampleVector::Scalar;
0715
0716 // constants
0717 constexpr ScalarType corr4 = 1.;
0718 constexpr ScalarType corr6 = 1.;
0719 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0720
0721 // indices
0722 const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
0723 const int ch = gtx / nsamples;
0724 const int sample = threadIdx.x % nsamples;
0725
0726 if (ch >= nchannels)
0727 return;
0728
0729 const auto did = DetId{dids[ch]};
0730 const auto* amplitudeFitParameters =
0731 did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
0732
0733 // configure shared mem
0734 extern __shared__ char smem[];
0735 ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
0736 auto* shr_sumA = shr_sum1 + blockDim.x;
0737 auto* shr_sumF = shr_sumA + blockDim.x;
0738 auto* shr_sumAF = shr_sumF + blockDim.x;
0739 auto* shr_sumFF = shr_sumAF + blockDim.x;
0740
0741 const auto alpha = amplitudeFitParameters[0];
0742 const auto beta = amplitudeFitParameters[1];
0743 const auto timeMax = g_timeMax[ch];
0744 const auto pedestalLimit = timeMax - (alpha * beta) - 1.0;
0745 const auto sample_value = sample_values[gtx];
0746 const auto sample_value_error = sample_value_errors[gtx];
0747 const auto inverr2 =
0748 sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast<ScalarType>(0);
0749 const auto termOne = 1 + (sample - timeMax) / (alpha * beta);
0750 const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
0751 : static_cast<ScalarType>(0.);
0752
0753 bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) ||
0754 (f > 0.4 * corr4 && sample >= timeMax)) &&
0755 !useless_samples[gtx];
0756
0757 // store into shared mem
0758 shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
0759 shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast<ScalarType>(0);
0760 shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast<ScalarType>(0);
0761 shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast<ScalarType>(0);
0762 shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast<ScalarType>(0);
0763
0764 // reduction
0765 if (sample <= 4) {
0766 shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5];
0767 shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5];
0768 shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5];
0769 shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5];
0770 shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5];
0771 }
0772 __syncthreads();
0773
0774 if (sample < 2) {
0775 // note: we double count sample 3
0776 shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3];
0777 shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3];
0778 shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3];
0779 shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3];
0780 shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3];
0781 }
0782 __syncthreads();
0783
0784 if (sample == 0) {
0785 const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3];
0786 const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3];
0787 const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3];
0788 const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3];
0789 const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3];
0790
0791 const auto denom = sumFF * sum1 - sumF * sumF;
0792 const auto condForDenom = sum1 > 0 && std::abs(denom) > 1.e-20;
0793 const auto amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast<ScalarType>(0.);
0794
0795 // store into global mem
0796 g_amplitudeMax[ch] = amplitudeMax;
0797 }
0798 }
0799
0800 //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
0801 __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
0802 uint32_t const* dids_eb,
0803 uint16_t const* digis_ee,
0804 uint32_t const* dids_ee,
0805 float const* rms_x12,
0806 float const* rms_x6,
0807 float const* rms_x1,
0808 float const* mean_x12,
0809 float const* mean_x6,
0810 float const* mean_x1,
0811 float const* gain12Over6,
0812 float const* gain6Over1,
0813 SampleVector::Scalar* sample_values,
0814 SampleVector::Scalar* sample_value_errors,
0815 SampleVector::Scalar* ampMaxError,
0816 bool* useless_sample_values,
0817 char* pedestal_nums,
0818 uint32_t const offsetForHashes,
0819 uint32_t const offsetForInputs,
0820 unsigned const int sample_maskEB,
0821 unsigned const int sample_maskEE,
0822 int nchannels) {
0823 using ScalarType = SampleVector::Scalar;
0824
0825 // constants
0826 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0827
0828 // indices
0829 const int tx = threadIdx.x + blockDim.x * blockIdx.x;
0830 const int ch = tx / nsamples;
0831 const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
0832 const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0833 const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
0834 const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0835
0836 // threads that return here should not affect the __syncthreads() below since they have exitted the kernel
0837 if (ch >= nchannels)
0838 return;
0839
0840 // indices/inits
0841 const int sample = tx % nsamples;
0842 const int input_ch_start = inputCh * nsamples;
0843 SampleVector::Scalar pedestal = 0.;
0844 int num = 0;
0845
0846 // configure shared mem
0847 extern __shared__ char smem[];
0848 ScalarType* shrSampleValues = reinterpret_cast<SampleVector::Scalar*>(smem);
0849 ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x;
0850
0851 // 0 and 1 sample values
0852 const auto adc0 = ecalMGPA::adc(digis[input_ch_start]);
0853 const auto gainId0 = ecalMGPA::gainId(digis[input_ch_start]);
0854 const auto adc1 = ecalMGPA::adc(digis[input_ch_start + 1]);
0855 const auto gainId1 = ecalMGPA::gainId(digis[input_ch_start + 1]);
0856 const auto did = DetId{dids[inputCh]};
0857 const auto isBarrel = did.subdetId() == EcalBarrel;
0858 const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
0859 const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0860 : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0861
0862 // set pedestal
0863 // TODO this branch is non-divergent for a group of 10 threads
0864 if (gainId0 == 1 && use_sample(sample_mask, 0)) {
0865 pedestal = static_cast<SampleVector::Scalar>(adc0);
0866 num = 1;
0867
0868 const auto diff = adc1 - adc0;
0869 if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) {
0870 pedestal = (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
0871 num = 2;
0872 }
0873 } else {
0874 pedestal = mean_x12[ch];
0875 }
0876
0877 // ped subtracted and gain-renormalized samples.
0878 const auto gainId = ecalMGPA::gainId(digis[inputTx]);
0879 const auto adc = ecalMGPA::adc(digis[inputTx]);
0880
0881 bool bad = false;
0882 SampleVector::Scalar sample_value, sample_value_error;
0883 // TODO divergent branch
0884 // TODO: piece below is general both for amplitudes and timing
0885 // potentially there is a way to reduce the amount of code...
0886 if (!use_sample(sample_mask, sample)) {
0887 bad = true;
0888 sample_value = 0;
0889 sample_value_error = 0;
0890 } else if (gainId == 1) {
0891 sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
0892 sample_value_error = rms_x12[hashedId];
0893 } else if (gainId == 2) {
0894 sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
0895 sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
0896 } else if (gainId == 3) {
0897 sample_value =
0898 (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId];
0899 sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
0900 } else {
0901 sample_value = 0;
0902 sample_value_error = 0;
0903 bad = true;
0904 }
0905
0906 // TODO: make sure we save things correctly when sample is useless
0907 const auto useless_sample = (sample_value_error <= 0) | bad;
0908 useless_sample_values[tx] = useless_sample;
0909 sample_values[tx] = sample_value;
0910 sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
0911
0912 // DEBUG
0913 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
0914 if (ch == 0) {
0915 printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
0916 sample,
0917 sample_value,
0918 sample_value_error,
0919 useless_sample ? '1' : '0');
0920 }
0921 #endif
0922
0923 // store into the shared mem
0924 shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
0925 shrSampleValueErrors[threadIdx.x] = sample_value_error;
0926 __syncthreads();
0927
0928 // perform the reduction with min
0929 if (sample < 5) {
0930 // note, if equal -> we keep the value with lower sample as for cpu
0931 shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5]
0932 ? shrSampleValueErrors[threadIdx.x + 5]
0933 : shrSampleValueErrors[threadIdx.x];
0934 shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]);
0935 }
0936 __syncthreads();
0937
0938 // a bit of an overkill, but easier than to compare across 3 values
0939 if (sample < 3) {
0940 shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3]
0941 ? shrSampleValueErrors[threadIdx.x + 3]
0942 : shrSampleValueErrors[threadIdx.x];
0943 shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]);
0944 }
0945 __syncthreads();
0946
0947 if (sample < 2) {
0948 shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2]
0949 ? shrSampleValueErrors[threadIdx.x + 2]
0950 : shrSampleValueErrors[threadIdx.x];
0951 shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]);
0952 }
0953 __syncthreads();
0954
0955 if (sample == 0) {
0956 // we only needd the max error
0957 const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1]
0958 ? shrSampleValueErrors[threadIdx.x + 1]
0959 : shrSampleValueErrors[threadIdx.x];
0960
0961 // # pedestal samples used
0962 pedestal_nums[ch] = num;
0963 // this is used downstream
0964 ampMaxError[ch] = maxSampleValueError;
0965
0966 // DEBUG
0967 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
0968 if (ch == 0) {
0969 printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
0970 }
0971 #endif
0972 }
0973 }
0974
0975 ///
0976 /// launch context parameters: 1 thread per channel
0977 ///
0978 //#define DEBUG_TIME_CORRECTION
0979 __global__ void kernel_time_correction_and_finalize(
0980 // SampleVector::Scalar const* g_amplitude,
0981 ::ecal::reco::StorageScalarType const* g_amplitudeEB,
0982 ::ecal::reco::StorageScalarType const* g_amplitudeEE,
0983 uint16_t const* digis_eb,
0984 uint32_t const* dids_eb,
0985 uint16_t const* digis_ee,
0986 uint32_t const* dids_ee,
0987 float const* amplitudeBinsEB,
0988 float const* amplitudeBinsEE,
0989 float const* shiftBinsEB,
0990 float const* shiftBinsEE,
0991 SampleVector::Scalar const* g_timeMax,
0992 SampleVector::Scalar const* g_timeError,
0993 float const* g_rms_x12,
0994 float const* timeCalibConstant,
0995 float* g_jitterEB,
0996 float* g_jitterEE,
0997 float* g_jitterErrorEB,
0998 float* g_jitterErrorEE,
0999 uint32_t* flagsEB,
1000 uint32_t* flagsEE,
1001 const int amplitudeBinsSizeEB,
1002 const int amplitudeBinsSizeEE,
1003 ConfigurationParameters::type const timeConstantTermEB,
1004 ConfigurationParameters::type const timeConstantTermEE,
1005 float const offsetTimeValueEB,
1006 float const offsetTimeValueEE,
1007 ConfigurationParameters::type const timeNconstEB,
1008 ConfigurationParameters::type const timeNconstEE,
1009 ConfigurationParameters::type const amplitudeThresholdEB,
1010 ConfigurationParameters::type const amplitudeThresholdEE,
1011 ConfigurationParameters::type const outOfTimeThreshG12pEB,
1012 ConfigurationParameters::type const outOfTimeThreshG12pEE,
1013 ConfigurationParameters::type const outOfTimeThreshG12mEB,
1014 ConfigurationParameters::type const outOfTimeThreshG12mEE,
1015 ConfigurationParameters::type const outOfTimeThreshG61pEB,
1016 ConfigurationParameters::type const outOfTimeThreshG61pEE,
1017 ConfigurationParameters::type const outOfTimeThreshG61mEB,
1018 ConfigurationParameters::type const outOfTimeThreshG61mEE,
1019 uint32_t const offsetForHashes,
1020 uint32_t const offsetForInputs,
1021 const int nchannels) {
1022 using ScalarType = SampleVector::Scalar;
1023
1024 // constants
1025 constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
1026
1027 // indices
1028 const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
1029 const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
1030 const auto* dids = gtx >= offsetForInputs ? dids_ee : dids_eb;
1031 const auto& digis = gtx >= offsetForInputs ? digis_ee : digis_eb;
1032
1033 // filter out outside of range threads
1034 if (gtx >= nchannels)
1035 return;
1036
1037 // need to ref the right ptrs
1038 #define ARRANGE(var) auto* var = gtx >= offsetForInputs ? var##EE : var##EB
1039 ARRANGE(g_amplitude);
1040 ARRANGE(g_jitter);
1041 ARRANGE(g_jitterError);
1042 ARRANGE(flags);
1043 #undef ARRANGE
1044
1045 const auto did = DetId{dids[inputGtx]};
1046 const auto isBarrel = did.subdetId() == EcalBarrel;
1047 const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
1048 : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
1049 const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
1050 const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
1051 const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
1052 const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
1053 const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
1054 const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE;
1055 const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
1056 const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
1057 const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
1058 const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
1059 const auto outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
1060
1061 // load some
1062 const auto amplitude = g_amplitude[inputGtx];
1063 const auto rms_x12 = g_rms_x12[hashedId];
1064 const auto timeCalibConst = timeCalibConstant[hashedId];
1065
1066 int myBin = -1;
1067 for (int bin = 0; bin < amplitudeBinsSize; bin++) {
1068 if (amplitude > amplitudeBins[bin])
1069 myBin = bin;
1070 else
1071 break;
1072 }
1073
1074 ScalarType correction = 0;
1075 if (myBin == -1) {
1076 correction = shiftBins[0];
1077 } else if (myBin == amplitudeBinsSize - 1) {
1078 correction = shiftBins[myBin];
1079 } else {
1080 correction = shiftBins[myBin + 1] - shiftBins[myBin];
1081 correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
1082 correction += shiftBins[myBin];
1083 }
1084
1085 // correction * 1./25.
1086 correction = correction * 0.04;
1087 const auto timeMax = g_timeMax[gtx];
1088 const auto timeError = g_timeError[gtx];
1089 const auto jitter = timeMax - 5 + correction;
1090 const auto jitterError =
1091 std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25.
1092
1093 #ifdef DEBUG_TIME_CORRECTION
1094 printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
1095 gtx,
1096 timeMax,
1097 timeError,
1098 jitter,
1099 correction);
1100 // }
1101 #endif
1102
1103 // store back to global
1104 g_jitter[inputGtx] = jitter;
1105 g_jitterError[inputGtx] = jitterError;
1106
1107 // set the flag
1108 // TODO: replace with something more efficient (if required),
1109 // for now just to make it work
1110 if (amplitude > amplitudeThreshold * rms_x12) {
1111 auto threshP = outOfTimeThreshG12p;
1112 auto threshM = outOfTimeThreshG12m;
1113 if (amplitude > 3000.) {
1114 for (int isample = 0; isample < nsamples; isample++) {
1115 int gainid = ecalMGPA::gainId(digis[nsamples * inputGtx + isample]);
1116 if (gainid != 1) {
1117 threshP = outOfTimeThreshG61p;
1118 threshM = outOfTimeThreshG61m;
1119 break;
1120 }
1121 }
1122 }
1123
1124 const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
1125 const auto nterm = timeNconst * rms_x12 / amplitude;
1126 const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
1127 if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
1128 flags[inputGtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
1129 }
1130 }
1131
1132 } // namespace multifit
1133 } // namespace ecal