Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu is written in an unsupported language. File is not indexed.

0001 #include <cmath>
0002 #include <limits>
0003 
0004 #include <cuda.h>
0005 
0006 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0007 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0008 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0009 #include "DataFormats/Math/interface/approx_exp.h"
0010 #include "DataFormats/Math/interface/approx_log.h"
0011 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0012 
0013 #include "TimeComputationKernels.h"
0014 #include "KernelHelpers.h"
0015 
0016 //#define DEBUG
0017 
0018 //#define ECAL_RECO_CUDA_DEBUG
0019 
0020 namespace ecal {
0021   namespace multifit {
0022 
0023     __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) {
0024       return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
0025     }
0026 
0027     __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
0028                                                   SampleVector::Scalar const* sample_value_errors,
0029                                                   bool const* useless_sample_values,
0030                                                   SampleVector::Scalar* chi2s,
0031                                                   SampleVector::Scalar* sum0s,
0032                                                   SampleVector::Scalar* sumAAs,
0033                                                   const int nchannels) {
0034       using ScalarType = SampleVector::Scalar;
0035       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0036 
0037       // indices
0038       int tx = threadIdx.x + blockDim.x * blockIdx.x;
0039       int ltx = threadIdx.x;
0040       int ch = tx / nsamples;
0041       int nchannels_per_block = blockDim.x / nsamples;
0042 
0043       // threads that return here should not affect the __syncthreads() below since they have exitted the kernel
0044       if (ch >= nchannels)
0045         return;
0046 
0047       int sample = tx % nsamples;
0048 
0049       // shared mem inits
0050       extern __shared__ char sdata[];
0051       char* s_sum0 = sdata;
0052       SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(s_sum0 + nchannels_per_block * nsamples);
0053       SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples;
0054       SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples;
0055 
0056       // TODO make sure no div by 0
0057       const auto inv_error =
0058           useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
0059       const auto sample_value = sample_values[tx];
0060       s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
0061       s_sum1[ltx] = inv_error;
0062       s_sumA[ltx] = sample_value * inv_error;
0063       s_sumAA[ltx] = sample_value * sample_value * inv_error;
0064       __syncthreads();
0065 
0066       // 5 threads for [0, 4] samples
0067       if (sample < 5) {
0068         s_sum0[ltx] += s_sum0[ltx + 5];
0069         s_sum1[ltx] += s_sum1[ltx + 5];
0070         s_sumA[ltx] += s_sumA[ltx + 5];
0071         s_sumAA[ltx] += s_sumAA[ltx + 5];
0072       }
0073       __syncthreads();
0074 
0075       if (sample < 2) {
0076         // note double counting of sample 3
0077         s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
0078         s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
0079         s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
0080         s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
0081       }
0082       __syncthreads();
0083 
0084       if (sample == 0) {
0085         // note, subtract to remove the double counting of sample == 3
0086         const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
0087         const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
0088         const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
0089         const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
0090         const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
0091         chi2s[ch] = chi2;
0092         sum0s[ch] = sum0;
0093         sumAAs[ch] = sumAA;
0094 
0095 #ifdef DEBUG_TC_NULLHYPOT
0096         if (ch == 0) {
0097           printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
0098         }
0099 #endif
0100       }
0101     }
0102 
0103     constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
0104     constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
0105 
0106     //#define DEBUG_TC_MAKERATIO
0107     //
0108     // launch ctx parameters are
0109     // 45 threads per channel, X channels per block, Y blocks
0110     // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
0111     // TODO: it might be much beter to use 32 threads per channel instead of 45
0112     // to simplify the synchronization
0113     //
0114     __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
0115                                                   SampleVector::Scalar const* sample_value_errors,
0116                                                   uint32_t const* dids_eb,
0117                                                   uint32_t const* dids_ee,
0118                                                   bool const* useless_sample_values,
0119                                                   char const* pedestal_nums,
0120                                                   ConfigurationParameters::type const* amplitudeFitParametersEB,
0121                                                   ConfigurationParameters::type const* amplitudeFitParametersEE,
0122                                                   ConfigurationParameters::type const* timeFitParametersEB,
0123                                                   ConfigurationParameters::type const* timeFitParametersEE,
0124                                                   SampleVector::Scalar const* sumAAsNullHypot,
0125                                                   SampleVector::Scalar const* sum0sNullHypot,
0126                                                   SampleVector::Scalar* tMaxAlphaBetas,
0127                                                   SampleVector::Scalar* tMaxErrorAlphaBetas,
0128                                                   SampleVector::Scalar* g_accTimeMax,
0129                                                   SampleVector::Scalar* g_accTimeWgt,
0130                                                   TimeComputationState* g_state,
0131                                                   unsigned const int timeFitParameters_sizeEB,
0132                                                   unsigned const int timeFitParameters_sizeEE,
0133                                                   ConfigurationParameters::type const timeFitLimits_firstEB,
0134                                                   ConfigurationParameters::type const timeFitLimits_firstEE,
0135                                                   ConfigurationParameters::type const timeFitLimits_secondEB,
0136                                                   ConfigurationParameters::type const timeFitLimits_secondEE,
0137                                                   const int nchannels,
0138                                                   uint32_t const offsetForInputs) {
0139       using ScalarType = SampleVector::Scalar;
0140 
0141       // constants
0142       constexpr int nthreads_per_channel = 45;  // n=10, n(n-1)/2
0143       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0144 
0145       // indices
0146       const int gtx = threadIdx.x + blockDim.x * blockIdx.x;
0147       const int ch = gtx / nthreads_per_channel;
0148       const int ltx = threadIdx.x % nthreads_per_channel;
0149       const int ch_start = ch * nsamples;
0150       const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0151       const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0152 
0153       // remove inactive threads
0154       // threads that return here should not affect the __syncthreads() below since they have exitted the kernel
0155       if (ch >= nchannels)
0156         return;
0157 
0158       const auto did = DetId{dids[inputCh]};
0159       const auto isBarrel = did.subdetId() == EcalBarrel;
0160       const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
0161       const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE;
0162       const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE;
0163       const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
0164       const auto timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
0165 
0166       extern __shared__ char smem[];
0167       ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
0168       ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
0169       ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
0170       ScalarType* shrTimeMax = shr_time_max + blockDim.x;
0171       ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
0172 
0173       // map tx -> (sample_i, sample_j)
0174       int sample_i, sample_j = 0;
0175       if (ltx >= 0 && ltx <= 8) {
0176         sample_i = 0;
0177         sample_j = 1 + ltx;
0178       } else if (ltx <= 16) {
0179         sample_i = 1;
0180         sample_j = 2 + ltx - 9;
0181       } else if (ltx <= 23) {
0182         sample_i = 2;
0183         sample_j = 3 + ltx - 17;
0184       } else if (ltx <= 29) {
0185         sample_i = 3;
0186         sample_j = 4 + ltx - 24;
0187       } else if (ltx <= 34) {
0188         sample_i = 4;
0189         sample_j = 5 + ltx - 30;
0190       } else if (ltx <= 38) {
0191         sample_i = 5;
0192         sample_j = 6 + ltx - 35;
0193       } else if (ltx <= 41) {
0194         sample_i = 6;
0195         sample_j = 7 + ltx - 39;
0196       } else if (ltx <= 43) {
0197         sample_i = 7;
0198         sample_j = 8 + ltx - 42;
0199       } else if (ltx <= 44) {
0200         sample_i = 8;
0201         sample_j = 9;
0202       } else
0203         assert(false);
0204 
0205       const auto tx_i = ch_start + sample_i;
0206       const auto tx_j = ch_start + sample_j;
0207 
0208       //
0209       // note, given the way we partition the block, with 45 threads per channel
0210       // we will end up with inactive threads which need to be dragged along
0211       // through the synching point
0212       //
0213       bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
0214                                          sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
0215 
0216       //
0217       // see cpu implementation for explanation
0218       //
0219       ScalarType chi2 = std::numeric_limits<ScalarType>::max();
0220       ScalarType tmax = 0;
0221       ScalarType tmaxerr = 0;
0222       shrTimeMax[threadIdx.x] = 0;
0223       shrTimeWgt[threadIdx.x] = 0;
0224       bool internalCondForSkipping1 = true;
0225       bool internalCondForSkipping2 = true;
0226       if (!condForUselessSamples) {
0227         const auto rtmp = sample_values[tx_i] / sample_values[tx_j];
0228         const auto invampl_i = 1.0 / sample_values[tx_i];
0229         const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
0230         const auto invampl_j = 1.0 / sample_values[tx_j];
0231         const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
0232         const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
0233         auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
0234         // TODO non-divergent branch for a block if each block has 1 channel
0235         // otherwise non-divergent for groups of 45 threads
0236         // at this point, pedestal_nums[ch] can be either 0, 1 or 2
0237         if (pedestal_nums[ch] == 2)
0238           err2 *= err2 * 0.5;
0239         const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
0240         const auto total_error = std::sqrt(err1 + err2 + err3);
0241 
0242         const auto alpha = amplitudeFitParameters[0];
0243         const auto beta = amplitudeFitParameters[1];
0244         const auto alphabeta = alpha * beta;
0245         const auto invalphabeta = 1.0 / alphabeta;
0246 
0247         // variables instead of a struct
0248         const auto ratio_index = sample_i;
0249         const auto ratio_step = sample_j - sample_i;
0250         const auto ratio_value = rtmp;
0251         const auto ratio_error = total_error;
0252 
0253         const auto rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
0254         internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j);
0255         if (!internalCondForSkipping1) {
0256           //
0257           // precompute.
0258           // in cpu version this was done conditionally
0259           // however easier to do it here (precompute) and then just filter out
0260           // if not needed
0261           //
0262           const auto l_timeFitLimits_first = timeFitLimits_first;
0263           const auto l_timeFitLimits_second = timeFitLimits_second;
0264           if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
0265             const auto time_max_i = static_cast<ScalarType>(ratio_index);
0266             auto u = timeFitParameters[timeFitParameters_size - 1];
0267             CMS_UNROLL_LOOP
0268             for (int k = timeFitParameters_size - 2; k >= 0; k--)
0269               u = u * ratio_value + timeFitParameters[k];
0270 
0271             auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
0272             for (int k = timeFitParameters_size - 2; k >= 1; k--)
0273               du = du * ratio_value + k * timeFitParameters[k];
0274 
0275             const auto error2 = ratio_error * ratio_error * du * du;
0276             const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
0277             const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast<ScalarType>(0);
0278 
0279             // store into shared mem
0280             // note, this name is essentially identical to the one used
0281             // below.
0282             shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
0283             shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
0284           } else {
0285             shrTimeMax[threadIdx.x] = 0;
0286             shrTimeWgt[threadIdx.x] = 0;
0287           }
0288 
0289           // continue with ratios
0290           const auto stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
0291           const auto offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
0292           const auto rmin = std::max(ratio_value - ratio_error, 0.001);
0293           const auto rmax = std::min(ratio_value + ratio_error,
0294                                      fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta) - 0.001);
0295           const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0);
0296           const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0);
0297 
0298           // set these guys
0299           tmax = 0.5 * (time1 + time2);
0300           tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
0301 #ifdef DEBUG_TC_MAKERATIO
0302           if (ch == 1 || ch == 0)
0303             printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
0304                    ch,
0305                    ltx,
0306                    tmax,
0307                    tmaxerr,
0308                    time1,
0309                    time2,
0310                    offset,
0311                    rmin,
0312                    rmax);
0313 #endif
0314 
0315           SampleVector::Scalar sumAf = 0;
0316           SampleVector::Scalar sumff = 0;
0317           const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
0318           auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
0319           // TODO: data dependence
0320           for (int it = itmin + 1; it < nsamples; it++) {
0321             loffset += invalphabeta;
0322             if (useless_sample_values[ch_start + it])
0323               continue;
0324             const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
0325             const auto term1 = 1.0 + loffset;
0326             const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
0327             sumAf += sample_values[ch_start + it] * (f * inverr2);
0328             sumff += f * (f * inverr2);
0329           }
0330 
0331           const auto sumAA = sumAAsNullHypot[ch];
0332           const auto sum0 = sum0sNullHypot[ch];
0333           chi2 = sumAA;
0334           // TODO: sum0 can not be 0 below, need to introduce the check upfront
0335           if (sumff > 0) {
0336             chi2 = sumAA - sumAf * (sumAf / sumff);
0337           }
0338           chi2 /= sum0;
0339 
0340 #ifdef DEBUG_TC_MAKERATIO
0341           if (ch == 1 || ch == 0)
0342             printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
0343                    ch,
0344                    ltx,
0345                    sumAf,
0346                    sumff,
0347                    sumAA,
0348                    static_cast<int>(sum0),
0349                    tmax,
0350                    tmaxerr,
0351                    chi2);
0352 #endif
0353 
0354           if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
0355             internalCondForSkipping2 = false;
0356           else
0357             chi2 = std::numeric_limits<ScalarType>::max();
0358         }
0359       }
0360 
0361       // store into smem
0362       shr_chi2s[threadIdx.x] = chi2;
0363       __syncthreads();
0364 
0365       // find min chi2 - quite crude for now
0366       // TODO validate/check
0367       char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0368       bool oddElements = nthreads_per_channel % 2;
0369       CMS_UNROLL_LOOP
0370       while (iter >= 1) {
0371         if (ltx < iter)
0372           // for odd ns, the last guy will just store itself
0373           // exception is for ltx == 0 and iter==1
0374           shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0375                                        ? shr_chi2s[threadIdx.x]
0376                                        : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]);
0377         __syncthreads();
0378         oddElements = iter % 2;
0379         iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0380       }
0381 
0382       // filter out inactive or useless samples threads
0383       if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
0384         // min chi2, now compute weighted average of tmax measurements
0385         // see cpu version for more explanation
0386         const auto chi2min = shr_chi2s[threadIdx.x - ltx];
0387         const auto chi2Limit = chi2min + 1.0;
0388         const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0;
0389 
0390 #ifdef DEBUG_TC_MAKERATIO
0391         if (ch == 1 || ch == 0)
0392           printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
0393                  ch,
0394                  ltx,
0395                  chi2min,
0396                  chi2Limit,
0397                  inverseSigmaSquared);
0398 #endif
0399 
0400         // store into shared mem and run reduction
0401         // TODO: check if cooperative groups would be better
0402         // TODO: check if shuffling intrinsics are better
0403         shr_time_wgt[threadIdx.x] = inverseSigmaSquared;
0404         shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared;
0405       } else {
0406         shr_time_wgt[threadIdx.x] = 0;
0407         shr_time_max[threadIdx.x] = 0;
0408       }
0409       __syncthreads();
0410 
0411       // reduce to compute time_max and time_wgt
0412       iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0413       oddElements = nthreads_per_channel % 2;
0414       CMS_UNROLL_LOOP
0415       while (iter >= 1) {
0416         if (ltx < iter) {
0417           shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0418                                           ? shr_time_wgt[threadIdx.x]
0419                                           : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter];
0420           shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0421                                           ? shr_time_max[threadIdx.x]
0422                                           : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter];
0423           shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0424                                         ? shrTimeMax[threadIdx.x]
0425                                         : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter];
0426           shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
0427                                         ? shrTimeWgt[threadIdx.x]
0428                                         : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter];
0429         }
0430 
0431         __syncthreads();
0432         oddElements = iter % 2;
0433         iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0434       }
0435 
0436       // load from shared memory the 0th guy (will contain accumulated values)
0437       // compute
0438       // store into global mem
0439       if (ltx == 0) {
0440         const auto tmp_time_max = shr_time_max[threadIdx.x];
0441         const auto tmp_time_wgt = shr_time_wgt[threadIdx.x];
0442 
0443         // we are done if there number of time ratios is 0
0444         if (tmp_time_wgt == 0 && tmp_time_max == 0) {
0445           g_state[ch] = TimeComputationState::Finished;
0446           return;
0447         }
0448 
0449         // no div by 0
0450         const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
0451         const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt);
0452 
0453         tMaxAlphaBetas[ch] = tMaxAlphaBeta;
0454         tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
0455         g_accTimeMax[ch] = shrTimeMax[threadIdx.x];
0456         g_accTimeWgt[ch] = shrTimeWgt[threadIdx.x];
0457         g_state[ch] = TimeComputationState::NotFinished;
0458 
0459 #ifdef DEBUG_TC_MAKERATIO
0460         printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
0461         printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
0462                ch,
0463                tMaxAlphaBeta,
0464                tMaxErrorAlphaBeta,
0465                shrTimeMax[threadIdx.x],
0466                shrTimeWgt[threadIdx.x]);
0467 #endif
0468       }
0469     }
0470 
0471     /// launch ctx parameters are
0472     /// 10 threads per channel, N channels per block, Y blocks
0473     /// TODO: do we need to keep the state around or can be removed?!
0474     //#define DEBUG_FINDAMPLCHI2_AND_FINISH
0475     __global__ void kernel_time_compute_findamplchi2_and_finish(
0476         SampleVector::Scalar const* sample_values,
0477         SampleVector::Scalar const* sample_value_errors,
0478         uint32_t const* dids_eb,
0479         uint32_t const* dids_ee,
0480         bool const* useless_samples,
0481         SampleVector::Scalar const* g_tMaxAlphaBeta,
0482         SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
0483         SampleVector::Scalar const* g_accTimeMax,
0484         SampleVector::Scalar const* g_accTimeWgt,
0485         ConfigurationParameters::type const* amplitudeFitParametersEB,
0486         ConfigurationParameters::type const* amplitudeFitParametersEE,
0487         SampleVector::Scalar const* sumAAsNullHypot,
0488         SampleVector::Scalar const* sum0sNullHypot,
0489         SampleVector::Scalar const* chi2sNullHypot,
0490         TimeComputationState* g_state,
0491         SampleVector::Scalar* g_ampMaxAlphaBeta,
0492         SampleVector::Scalar* g_ampMaxError,
0493         SampleVector::Scalar* g_timeMax,
0494         SampleVector::Scalar* g_timeError,
0495         const int nchannels,
0496         uint32_t const offsetForInputs) {
0497       using ScalarType = SampleVector::Scalar;
0498 
0499       // constants
0500       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0501 
0502       // indices
0503       const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
0504       const int ch = gtx / nsamples;
0505       const int sample = threadIdx.x % nsamples;
0506       const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0507       const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0508 
0509       // configure shared mem
0510       // per block, we need #threads per block * 2 * sizeof(ScalarType)
0511       // we run with N channels per block
0512       extern __shared__ char smem[];
0513       ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
0514       ScalarType* shr_sumff = shr_sumAf + blockDim.x;
0515 
0516       if (ch >= nchannels)
0517         return;
0518 
0519       auto state = g_state[ch];
0520       const auto did = DetId{dids[inputCh]};
0521       const auto* amplitudeFitParameters =
0522           did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
0523 
0524       // TODO is that better than storing into global and launching another kernel
0525       // for the first 10 threads
0526       if (state == TimeComputationState::NotFinished) {
0527         const auto alpha = amplitudeFitParameters[0];
0528         const auto beta = amplitudeFitParameters[1];
0529         const auto alphabeta = alpha * beta;
0530         const auto invalphabeta = 1.0 / alphabeta;
0531         const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0532         const auto sample_value = sample_values[gtx];
0533         const auto sample_value_error = sample_value_errors[gtx];
0534         const auto inverr2 =
0535             useless_samples[gtx] ? static_cast<ScalarType>(0) : 1.0 / (sample_value_error * sample_value_error);
0536         const auto offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
0537         const auto term1 = 1.0 + offset;
0538         const auto f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.0);
0539         const auto sumAf = sample_value * (f * inverr2);
0540         const auto sumff = f * (f * inverr2);
0541 
0542         // store into shared mem
0543         shr_sumAf[threadIdx.x] = sumAf;
0544         shr_sumff[threadIdx.x] = sumff;
0545       } else {
0546         shr_sumAf[threadIdx.x] = 0;
0547         shr_sumff[threadIdx.x] = 0;
0548       }
0549       __syncthreads();
0550 
0551       // reduce
0552       // unroll completely here (but hardcoded)
0553       if (sample < 5) {
0554         shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5];
0555         shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5];
0556       }
0557       __syncthreads();
0558 
0559       if (sample < 2) {
0560         // will need to subtract for ltx = 3, we double count here
0561         shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3];
0562         shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3];
0563       }
0564       __syncthreads();
0565 
0566       if (sample == 0) {
0567         // exit if the state is done
0568         // note, we do not exit before all __synchtreads are finished
0569         if (state == TimeComputationState::Finished) {
0570           g_timeMax[ch] = 5;
0571           g_timeError[ch] = -999;
0572           return;
0573         }
0574 
0575         // subtract to avoid double counting
0576         const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3];
0577         const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3];
0578 
0579         const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
0580         const auto sumAA = sumAAsNullHypot[ch];
0581         const auto sum0 = sum0sNullHypot[ch];
0582         const auto nullChi2 = chi2sNullHypot[ch];
0583         if (sumff > 0) {
0584           const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
0585           if (chi2AlphaBeta > nullChi2) {
0586             // null hypothesis is better
0587             state = TimeComputationState::Finished;
0588 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0589             printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
0590                    ch,
0591                    chi2AlphaBeta,
0592                    nullChi2,
0593                    sumAA,
0594                    sumAf,
0595                    sumff,
0596                    sum0);
0597 #endif
0598           }
0599 
0600           // store to global
0601           g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
0602         } else {
0603 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0604           printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
0605 #endif
0606           state = TimeComputationState::Finished;
0607         }
0608 
0609         // store the state to global and finish calcs
0610         g_state[ch] = state;
0611         if (state == TimeComputationState::Finished) {
0612           // store default values into global
0613           g_timeMax[ch] = 5;
0614           g_timeError[ch] = -999;
0615 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0616           printf("ch = %d finished state\n", ch);
0617 #endif
0618           return;
0619         }
0620 
0621         const auto ampMaxError = g_ampMaxError[ch];
0622         const auto test_ratio = ampMaxAlphaBeta / ampMaxError;
0623         const auto accTimeMax = g_accTimeMax[ch];
0624         const auto accTimeWgt = g_accTimeWgt[ch];
0625         const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0626         const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
0627         // branch to separate large vs small pulses
0628         // see cpu version for more info
0629         if (test_ratio > 5.0 && accTimeWgt > 0) {
0630           const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
0631           const auto tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
0632 
0633           if (test_ratio > 10.0) {
0634             g_timeMax[ch] = tMaxRatio;
0635             g_timeError[ch] = tMaxErrorRatio;
0636 
0637 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0638             printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
0639 #endif
0640           } else {
0641             const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
0642                                   tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
0643                                  5.0;
0644             const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
0645                                     tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
0646                                    5.0;
0647             state = TimeComputationState::Finished;
0648             g_state[ch] = state;
0649             g_timeMax[ch] = timeMax;
0650             g_timeError[ch] = timeError;
0651 
0652 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0653             printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
0654 #endif
0655           }
0656         } else {
0657           state = TimeComputationState::Finished;
0658           g_state[ch] = state;
0659           g_timeMax[ch] = tMaxAlphaBeta;
0660           g_timeError[ch] = tMaxErrorAlphaBeta;
0661 
0662 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0663           printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
0664 #endif
0665         }
0666       }
0667     }
0668 
0669     __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
0670                                                     uint16_t const* digis_ee,
0671                                                     SampleVector::Scalar* sample_values,
0672                                                     SampleVector::Scalar* sample_value_errors,
0673                                                     bool* useless_sample_values,
0674                                                     unsigned const int sample_mask,
0675                                                     const int nchannels,
0676                                                     uint32_t const offsetForInputs) {
0677       using ScalarType = SampleVector::Scalar;
0678 
0679       // constants
0680       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0681 
0682       // indices
0683       const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
0684       const int ch = gtx / nsamples;
0685       const int sample = threadIdx.x % nsamples;
0686       const int inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
0687       const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
0688 
0689       // remove thread for sample 0, oversubscribing is easier than ....
0690       if (ch >= nchannels || sample == 0)
0691         return;
0692 
0693       if (!use_sample(sample_mask, sample))
0694         return;
0695 
0696       const auto gainIdPrev = ecalMGPA::gainId(digis[inputGtx - 1]);
0697       const auto gainIdNext = ecalMGPA::gainId(digis[inputGtx]);
0698       if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
0699         sample_values[gtx - 1] = 0;
0700         sample_value_errors[gtx - 1] = 1e+9;
0701         useless_sample_values[gtx - 1] = true;
0702       }
0703     }
0704 
0705     __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
0706                                              SampleVector::Scalar const* sample_value_errors,
0707                                              uint32_t const* dids,
0708                                              bool const* useless_samples,
0709                                              SampleVector::Scalar const* g_timeMax,
0710                                              SampleVector::Scalar const* amplitudeFitParametersEB,
0711                                              SampleVector::Scalar const* amplitudeFitParametersEE,
0712                                              SampleVector::Scalar* g_amplitudeMax,
0713                                              const int nchannels) {
0714       using ScalarType = SampleVector::Scalar;
0715 
0716       // constants
0717       constexpr ScalarType corr4 = 1.;
0718       constexpr ScalarType corr6 = 1.;
0719       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0720 
0721       // indices
0722       const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
0723       const int ch = gtx / nsamples;
0724       const int sample = threadIdx.x % nsamples;
0725 
0726       if (ch >= nchannels)
0727         return;
0728 
0729       const auto did = DetId{dids[ch]};
0730       const auto* amplitudeFitParameters =
0731           did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
0732 
0733       // configure shared mem
0734       extern __shared__ char smem[];
0735       ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
0736       auto* shr_sumA = shr_sum1 + blockDim.x;
0737       auto* shr_sumF = shr_sumA + blockDim.x;
0738       auto* shr_sumAF = shr_sumF + blockDim.x;
0739       auto* shr_sumFF = shr_sumAF + blockDim.x;
0740 
0741       const auto alpha = amplitudeFitParameters[0];
0742       const auto beta = amplitudeFitParameters[1];
0743       const auto timeMax = g_timeMax[ch];
0744       const auto pedestalLimit = timeMax - (alpha * beta) - 1.0;
0745       const auto sample_value = sample_values[gtx];
0746       const auto sample_value_error = sample_value_errors[gtx];
0747       const auto inverr2 =
0748           sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast<ScalarType>(0);
0749       const auto termOne = 1 + (sample - timeMax) / (alpha * beta);
0750       const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
0751                                      : static_cast<ScalarType>(0.);
0752 
0753       bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) ||
0754                          (f > 0.4 * corr4 && sample >= timeMax)) &&
0755                         !useless_samples[gtx];
0756 
0757       // store into shared mem
0758       shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
0759       shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast<ScalarType>(0);
0760       shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast<ScalarType>(0);
0761       shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast<ScalarType>(0);
0762       shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast<ScalarType>(0);
0763 
0764       // reduction
0765       if (sample <= 4) {
0766         shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5];
0767         shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5];
0768         shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5];
0769         shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5];
0770         shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5];
0771       }
0772       __syncthreads();
0773 
0774       if (sample < 2) {
0775         // note: we double count sample 3
0776         shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3];
0777         shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3];
0778         shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3];
0779         shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3];
0780         shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3];
0781       }
0782       __syncthreads();
0783 
0784       if (sample == 0) {
0785         const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3];
0786         const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3];
0787         const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3];
0788         const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3];
0789         const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3];
0790 
0791         const auto denom = sumFF * sum1 - sumF * sumF;
0792         const auto condForDenom = sum1 > 0 && std::abs(denom) > 1.e-20;
0793         const auto amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast<ScalarType>(0.);
0794 
0795         // store into global mem
0796         g_amplitudeMax[ch] = amplitudeMax;
0797       }
0798     }
0799 
0800     //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
0801     __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
0802                                                  uint32_t const* dids_eb,
0803                                                  uint16_t const* digis_ee,
0804                                                  uint32_t const* dids_ee,
0805                                                  float const* rms_x12,
0806                                                  float const* rms_x6,
0807                                                  float const* rms_x1,
0808                                                  float const* mean_x12,
0809                                                  float const* mean_x6,
0810                                                  float const* mean_x1,
0811                                                  float const* gain12Over6,
0812                                                  float const* gain6Over1,
0813                                                  SampleVector::Scalar* sample_values,
0814                                                  SampleVector::Scalar* sample_value_errors,
0815                                                  SampleVector::Scalar* ampMaxError,
0816                                                  bool* useless_sample_values,
0817                                                  char* pedestal_nums,
0818                                                  uint32_t const offsetForHashes,
0819                                                  uint32_t const offsetForInputs,
0820                                                  unsigned const int sample_maskEB,
0821                                                  unsigned const int sample_maskEE,
0822                                                  int nchannels) {
0823       using ScalarType = SampleVector::Scalar;
0824 
0825       // constants
0826       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0827 
0828       // indices
0829       const int tx = threadIdx.x + blockDim.x * blockIdx.x;
0830       const int ch = tx / nsamples;
0831       const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
0832       const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0833       const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
0834       const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0835 
0836       // threads that return here should not affect the __syncthreads() below since they have exitted the kernel
0837       if (ch >= nchannels)
0838         return;
0839 
0840       // indices/inits
0841       const int sample = tx % nsamples;
0842       const int input_ch_start = inputCh * nsamples;
0843       SampleVector::Scalar pedestal = 0.;
0844       int num = 0;
0845 
0846       // configure shared mem
0847       extern __shared__ char smem[];
0848       ScalarType* shrSampleValues = reinterpret_cast<SampleVector::Scalar*>(smem);
0849       ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x;
0850 
0851       // 0 and 1 sample values
0852       const auto adc0 = ecalMGPA::adc(digis[input_ch_start]);
0853       const auto gainId0 = ecalMGPA::gainId(digis[input_ch_start]);
0854       const auto adc1 = ecalMGPA::adc(digis[input_ch_start + 1]);
0855       const auto gainId1 = ecalMGPA::gainId(digis[input_ch_start + 1]);
0856       const auto did = DetId{dids[inputCh]};
0857       const auto isBarrel = did.subdetId() == EcalBarrel;
0858       const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
0859       const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0860                                      : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0861 
0862       // set pedestal
0863       // TODO this branch is non-divergent for a group of 10 threads
0864       if (gainId0 == 1 && use_sample(sample_mask, 0)) {
0865         pedestal = static_cast<SampleVector::Scalar>(adc0);
0866         num = 1;
0867 
0868         const auto diff = adc1 - adc0;
0869         if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) {
0870           pedestal = (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
0871           num = 2;
0872         }
0873       } else {
0874         pedestal = mean_x12[ch];
0875       }
0876 
0877       // ped subtracted and gain-renormalized samples.
0878       const auto gainId = ecalMGPA::gainId(digis[inputTx]);
0879       const auto adc = ecalMGPA::adc(digis[inputTx]);
0880 
0881       bool bad = false;
0882       SampleVector::Scalar sample_value, sample_value_error;
0883       // TODO divergent branch
0884       // TODO: piece below is general both for amplitudes and timing
0885       // potentially there is a way to reduce the amount of code...
0886       if (!use_sample(sample_mask, sample)) {
0887         bad = true;
0888         sample_value = 0;
0889         sample_value_error = 0;
0890       } else if (gainId == 1) {
0891         sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
0892         sample_value_error = rms_x12[hashedId];
0893       } else if (gainId == 2) {
0894         sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
0895         sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
0896       } else if (gainId == 3) {
0897         sample_value =
0898             (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId];
0899         sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
0900       } else {
0901         sample_value = 0;
0902         sample_value_error = 0;
0903         bad = true;
0904       }
0905 
0906       // TODO: make sure we save things correctly when sample is useless
0907       const auto useless_sample = (sample_value_error <= 0) | bad;
0908       useless_sample_values[tx] = useless_sample;
0909       sample_values[tx] = sample_value;
0910       sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
0911 
0912       // DEBUG
0913 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
0914       if (ch == 0) {
0915         printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
0916                sample,
0917                sample_value,
0918                sample_value_error,
0919                useless_sample ? '1' : '0');
0920       }
0921 #endif
0922 
0923       // store into the shared mem
0924       shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
0925       shrSampleValueErrors[threadIdx.x] = sample_value_error;
0926       __syncthreads();
0927 
0928       // perform the reduction with min
0929       if (sample < 5) {
0930         // note, if equal -> we keep the value with lower sample as for cpu
0931         shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5]
0932                                                 ? shrSampleValueErrors[threadIdx.x + 5]
0933                                                 : shrSampleValueErrors[threadIdx.x];
0934         shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]);
0935       }
0936       __syncthreads();
0937 
0938       // a bit of an overkill, but easier than to compare across 3 values
0939       if (sample < 3) {
0940         shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3]
0941                                                 ? shrSampleValueErrors[threadIdx.x + 3]
0942                                                 : shrSampleValueErrors[threadIdx.x];
0943         shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]);
0944       }
0945       __syncthreads();
0946 
0947       if (sample < 2) {
0948         shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2]
0949                                                 ? shrSampleValueErrors[threadIdx.x + 2]
0950                                                 : shrSampleValueErrors[threadIdx.x];
0951         shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]);
0952       }
0953       __syncthreads();
0954 
0955       if (sample == 0) {
0956         // we only needd the max error
0957         const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1]
0958                                              ? shrSampleValueErrors[threadIdx.x + 1]
0959                                              : shrSampleValueErrors[threadIdx.x];
0960 
0961         // # pedestal samples used
0962         pedestal_nums[ch] = num;
0963         // this is used downstream
0964         ampMaxError[ch] = maxSampleValueError;
0965 
0966         // DEBUG
0967 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
0968         if (ch == 0) {
0969           printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
0970         }
0971 #endif
0972       }
0973     }
0974 
0975     ///
0976     /// launch context parameters: 1 thread per channel
0977     ///
0978     //#define DEBUG_TIME_CORRECTION
0979     __global__ void kernel_time_correction_and_finalize(
0980         //        SampleVector::Scalar const* g_amplitude,
0981         ::ecal::reco::StorageScalarType const* g_amplitudeEB,
0982         ::ecal::reco::StorageScalarType const* g_amplitudeEE,
0983         uint16_t const* digis_eb,
0984         uint32_t const* dids_eb,
0985         uint16_t const* digis_ee,
0986         uint32_t const* dids_ee,
0987         float const* amplitudeBinsEB,
0988         float const* amplitudeBinsEE,
0989         float const* shiftBinsEB,
0990         float const* shiftBinsEE,
0991         SampleVector::Scalar const* g_timeMax,
0992         SampleVector::Scalar const* g_timeError,
0993         float const* g_rms_x12,
0994         float const* timeCalibConstant,
0995         float* g_jitterEB,
0996         float* g_jitterEE,
0997         float* g_jitterErrorEB,
0998         float* g_jitterErrorEE,
0999         uint32_t* flagsEB,
1000         uint32_t* flagsEE,
1001         const int amplitudeBinsSizeEB,
1002         const int amplitudeBinsSizeEE,
1003         ConfigurationParameters::type const timeConstantTermEB,
1004         ConfigurationParameters::type const timeConstantTermEE,
1005         float const offsetTimeValueEB,
1006         float const offsetTimeValueEE,
1007         ConfigurationParameters::type const timeNconstEB,
1008         ConfigurationParameters::type const timeNconstEE,
1009         ConfigurationParameters::type const amplitudeThresholdEB,
1010         ConfigurationParameters::type const amplitudeThresholdEE,
1011         ConfigurationParameters::type const outOfTimeThreshG12pEB,
1012         ConfigurationParameters::type const outOfTimeThreshG12pEE,
1013         ConfigurationParameters::type const outOfTimeThreshG12mEB,
1014         ConfigurationParameters::type const outOfTimeThreshG12mEE,
1015         ConfigurationParameters::type const outOfTimeThreshG61pEB,
1016         ConfigurationParameters::type const outOfTimeThreshG61pEE,
1017         ConfigurationParameters::type const outOfTimeThreshG61mEB,
1018         ConfigurationParameters::type const outOfTimeThreshG61mEE,
1019         uint32_t const offsetForHashes,
1020         uint32_t const offsetForInputs,
1021         const int nchannels) {
1022       using ScalarType = SampleVector::Scalar;
1023 
1024       // constants
1025       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
1026 
1027       // indices
1028       const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
1029       const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
1030       const auto* dids = gtx >= offsetForInputs ? dids_ee : dids_eb;
1031       const auto& digis = gtx >= offsetForInputs ? digis_ee : digis_eb;
1032 
1033       // filter out outside of range threads
1034       if (gtx >= nchannels)
1035         return;
1036 
1037 // need to ref the right ptrs
1038 #define ARRANGE(var) auto* var = gtx >= offsetForInputs ? var##EE : var##EB
1039       ARRANGE(g_amplitude);
1040       ARRANGE(g_jitter);
1041       ARRANGE(g_jitterError);
1042       ARRANGE(flags);
1043 #undef ARRANGE
1044 
1045       const auto did = DetId{dids[inputGtx]};
1046       const auto isBarrel = did.subdetId() == EcalBarrel;
1047       const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
1048                                      : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
1049       const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
1050       const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
1051       const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
1052       const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
1053       const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
1054       const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE;
1055       const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
1056       const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
1057       const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
1058       const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
1059       const auto outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
1060 
1061       // load some
1062       const auto amplitude = g_amplitude[inputGtx];
1063       const auto rms_x12 = g_rms_x12[hashedId];
1064       const auto timeCalibConst = timeCalibConstant[hashedId];
1065 
1066       int myBin = -1;
1067       for (int bin = 0; bin < amplitudeBinsSize; bin++) {
1068         if (amplitude > amplitudeBins[bin])
1069           myBin = bin;
1070         else
1071           break;
1072       }
1073 
1074       ScalarType correction = 0;
1075       if (myBin == -1) {
1076         correction = shiftBins[0];
1077       } else if (myBin == amplitudeBinsSize - 1) {
1078         correction = shiftBins[myBin];
1079       } else {
1080         correction = shiftBins[myBin + 1] - shiftBins[myBin];
1081         correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
1082         correction += shiftBins[myBin];
1083       }
1084 
1085       // correction * 1./25.
1086       correction = correction * 0.04;
1087       const auto timeMax = g_timeMax[gtx];
1088       const auto timeError = g_timeError[gtx];
1089       const auto jitter = timeMax - 5 + correction;
1090       const auto jitterError =
1091           std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.
1092 
1093 #ifdef DEBUG_TIME_CORRECTION
1094       printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
1095              gtx,
1096              timeMax,
1097              timeError,
1098              jitter,
1099              correction);
1100 //    }
1101 #endif
1102 
1103       // store back to  global
1104       g_jitter[inputGtx] = jitter;
1105       g_jitterError[inputGtx] = jitterError;
1106 
1107       // set the flag
1108       // TODO: replace with something more efficient (if required),
1109       // for now just to make it work
1110       if (amplitude > amplitudeThreshold * rms_x12) {
1111         auto threshP = outOfTimeThreshG12p;
1112         auto threshM = outOfTimeThreshG12m;
1113         if (amplitude > 3000.) {
1114           for (int isample = 0; isample < nsamples; isample++) {
1115             int gainid = ecalMGPA::gainId(digis[nsamples * inputGtx + isample]);
1116             if (gainid != 1) {
1117               threshP = outOfTimeThreshG61p;
1118               threshM = outOfTimeThreshG61m;
1119               break;
1120             }
1121           }
1122         }
1123 
1124         const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
1125         const auto nterm = timeNconst * rms_x12 / amplitude;
1126         const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
1127         if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
1128           flags[inputGtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
1129       }
1130     }
1131 
1132   }  // namespace multifit
1133 }  // namespace ecal