Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu is written in an unsupported language. File is not indexed.

0001 #include <cstdlib>
0002 #include <limits>
0003 
0004 #include <cuda.h>
0005 
0006 #include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
0007 #include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
0008 #include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
0009 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0010 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0011 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0012 #include "DataFormats/Math/interface/approx_exp.h"
0013 #include "DataFormats/Math/interface/approx_log.h"
0014 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0015 
0016 #include "AmplitudeComputationCommonKernels.h"
0017 #include "KernelHelpers.h"
0018 
0019 namespace ecal {
0020   namespace multifit {
0021 
0022     ///
0023     /// assume kernel launch configuration is
0024     /// (MAXSAMPLES * nchannels, blocks)
0025     ///
0026     __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
0027                                                   uint16_t const* digis_in_eb,
0028                                                   uint32_t const* dids_eb,
0029                                                   uint16_t const* digis_in_ee,
0030                                                   uint32_t const* dids_ee,
0031                                                   SampleVector* amplitudes,
0032                                                   SampleVector* amplitudesForMinimizationEB,
0033                                                   SampleVector* amplitudesForMinimizationEE,
0034                                                   SampleGainVector* gainsNoise,
0035                                                   float const* mean_x1,
0036                                                   float const* mean_x12,
0037                                                   float const* rms_x12,
0038                                                   float const* mean_x6,
0039                                                   float const* gain6Over1,
0040                                                   float const* gain12Over6,
0041                                                   bool* hasSwitchToGain6,
0042                                                   bool* hasSwitchToGain1,
0043                                                   bool* isSaturated,
0044                                                   ::ecal::reco::StorageScalarType* energiesEB,
0045                                                   ::ecal::reco::StorageScalarType* energiesEE,
0046                                                   ::ecal::reco::StorageScalarType* chi2EB,
0047                                                   ::ecal::reco::StorageScalarType* chi2EE,
0048                                                   ::ecal::reco::StorageScalarType* g_pedestalEB,
0049                                                   ::ecal::reco::StorageScalarType* g_pedestalEE,
0050                                                   uint32_t* dids_outEB,
0051                                                   uint32_t* dids_outEE,
0052                                                   uint32_t* flagsEB,
0053                                                   uint32_t* flagsEE,
0054                                                   char* acState,
0055                                                   BXVectorType* bxs,
0056                                                   uint32_t const offsetForHashes,
0057                                                   uint32_t const offsetForInputs,
0058                                                   bool const gainSwitchUseMaxSampleEB,
0059                                                   bool const gainSwitchUseMaxSampleEE,
0060                                                   int const nchannels) {
0061       constexpr bool dynamicPedestal = false;  //---- default to false, ok
0062       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0063       constexpr int sample_max = 5;
0064       constexpr int full_pulse_max = 9;
0065       int const tx = threadIdx.x + blockIdx.x * blockDim.x;
0066       int const nchannels_per_block = blockDim.x / nsamples;
0067       int const ch = tx / nsamples;
0068       // for accessing input arrays
0069       int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0070       int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx;
0071       // eb is first and then ee
0072       auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb;
0073       auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0074       int const sample = threadIdx.x % nsamples;
0075 
0076       // need to ref the right ptr
0077       // macro is for clarity and safety
0078 #define ARRANGE(var) auto* var = ch >= offsetForInputs ? var##EE : var##EB
0079       ARRANGE(amplitudesForMinimization);
0080       ARRANGE(energies);
0081       ARRANGE(chi2);
0082       ARRANGE(g_pedestal);
0083       ARRANGE(dids_out);
0084       ARRANGE(flags);
0085 #undef ARRANGE
0086 
0087       if (ch < nchannels) {
0088         // array of 10 x channels per block
0089         // TODO: any other way of doing simple reduction
0090         // assume bool is 1 byte, should be quite safe
0091         extern __shared__ char shared_mem[];
0092         bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
0093         bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples;
0094         bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples;
0095         bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples;
0096         bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples;
0097         char* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples;
0098 
0099         //
0100         // indices
0101         //
0102         auto const did = DetId{dids[inputCh]};
0103         auto const isBarrel = did.subdetId() == EcalBarrel;
0104         // TODO offset for ee, 0 for eb
0105         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0106                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0107 
0108         //
0109         // pulse shape template
0110 
0111         // will be used in the future for setting state
0112         auto const rmsForChecking = rms_x12[hashedId];
0113 
0114         //
0115         // amplitudes
0116         //
0117         int const adc = ecalMGPA::adc(digis_in[inputTx]);
0118         int const gainId = ecalMGPA::gainId(digis_in[inputTx]);
0119         SampleVector::Scalar amplitude = 0.;
0120         SampleVector::Scalar pedestal = 0.;
0121         SampleVector::Scalar gainratio = 0.;
0122 
0123         // store into shared mem for initialization
0124         shr_hasSwitchToGain6[threadIdx.x] = gainId == EcalMgpaBitwiseGain6;
0125         shr_hasSwitchToGain1[threadIdx.x] = gainId == EcalMgpaBitwiseGain1;
0126         shr_hasSwitchToGain0_tmp[threadIdx.x] = gainId == EcalMgpaBitwiseGain0;
0127         shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x];
0128         shr_counts[threadIdx.x] = 0;
0129         __syncthreads();
0130 
0131         // non-divergent branch (except for the last 4 threads)
0132         if (threadIdx.x <= blockDim.x - 5) {
0133           CMS_UNROLL_LOOP
0134           for (int i = 0; i < 5; i++)
0135             shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i];
0136         }
0137         shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5;
0138 
0139         //
0140         // unrolled reductions
0141         //
0142         if (sample < 5) {
0143           shr_hasSwitchToGain6[threadIdx.x] =
0144               shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5];
0145           shr_hasSwitchToGain1[threadIdx.x] =
0146               shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5];
0147 
0148           // duplication of hasSwitchToGain0 in order not to
0149           // introduce another syncthreads
0150           shr_hasSwitchToGain0_tmp[threadIdx.x] =
0151               shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5];
0152         }
0153         __syncthreads();
0154 
0155         if (sample < 2) {
0156           // note, both threads per channel take value [3] twice to avoid another if
0157           shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] ||
0158                                               shr_hasSwitchToGain6[threadIdx.x + 2] ||
0159                                               shr_hasSwitchToGain6[threadIdx.x + 3];
0160           shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] ||
0161                                               shr_hasSwitchToGain1[threadIdx.x + 2] ||
0162                                               shr_hasSwitchToGain1[threadIdx.x + 3];
0163 
0164           shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] ||
0165                                                   shr_hasSwitchToGain0_tmp[threadIdx.x + 2] ||
0166                                                   shr_hasSwitchToGain0_tmp[threadIdx.x + 3];
0167 
0168           // sample < 2 -> first 2 threads of each channel will be used here
0169           // => 0 -> will compare 3 and 4 and put into 0
0170           // => 1 -> will compare 4 and 5 and put into 1
0171           shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4];
0172         }
0173         __syncthreads();
0174 
0175         bool check_hasSwitchToGain0 = false;
0176 
0177         if (sample == 0) {
0178           shr_hasSwitchToGain6[threadIdx.x] =
0179               shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1];
0180           shr_hasSwitchToGain1[threadIdx.x] =
0181               shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1];
0182           shr_hasSwitchToGain0_tmp[threadIdx.x] =
0183               shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1];
0184 
0185           hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
0186           hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
0187 
0188           // set only for the threadIdx.x corresponding to sample==0
0189           check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
0190 
0191           shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1];
0192           isSaturated[ch] = shr_isSaturated[threadIdx.x + 3];
0193         }
0194 
0195         // TODO: w/o this sync, there is a race
0196         // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample
0197         // check if we can remove it
0198         __syncthreads();
0199 
0200         // TODO: divergent branch
0201         if (gainId == 0 || gainId == 3) {
0202           pedestal = mean_x1[hashedId];
0203           gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
0204           gainsNoise[ch](sample) = 2;
0205         } else if (gainId == 1) {
0206           pedestal = mean_x12[hashedId];
0207           gainratio = 1.;
0208           gainsNoise[ch](sample) = 0;
0209         } else if (gainId == 2) {
0210           pedestal = mean_x6[hashedId];
0211           gainratio = gain12Over6[hashedId];
0212           gainsNoise[ch](sample) = 1;
0213         }
0214 
0215         // TODO: compile time constant -> branch should be non-divergent
0216         if (dynamicPedestal)
0217           amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
0218         else
0219           amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
0220         amplitudes[ch][sample] = amplitude;
0221 
0222 #ifdef ECAL_RECO_CUDA_DEBUG
0223         printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio);
0224         if (adc == 0)
0225           printf("adc is zero\n");
0226 #endif
0227 
0228         //
0229         // initialization
0230         //
0231         amplitudesForMinimization[inputCh](sample) = 0;
0232         bxs[ch](sample) = sample - 5;
0233 
0234         // select the thread for the max sample
0235         //---> hardcoded above to be 5th sample, ok
0236         if (sample == sample_max) {
0237           //
0238           // initialization
0239           //
0240           acState[ch] = static_cast<char>(MinimizationState::NotFinished);
0241           energies[inputCh] = 0;
0242           chi2[inputCh] = 0;
0243           g_pedestal[inputCh] = 0;
0244           uint32_t flag = 0;
0245           dids_out[inputCh] = did.rawId();
0246 
0247           // start of this channel in shared mem
0248           int const chStart = threadIdx.x - sample_max;
0249           // thread for the max sample in shared mem
0250           int const threadMax = threadIdx.x;
0251           auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
0252 
0253           // this flag setting is applied to all of the cases
0254           if (shr_hasSwitchToGain6[chStart])
0255             flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
0256           if (shr_hasSwitchToGain1[chStart])
0257             flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
0258 
0259           // this corresponds to cpu branching on lastSampleBeforeSaturation
0260           // likely false
0261           if (check_hasSwitchToGain0) {
0262             // assign for the case some sample having gainId == 0
0263             //energies[inputCh] = amplitudes[ch][sample_max];
0264             energies[inputCh] = amplitude;
0265 
0266             // check if samples before sample_max have true
0267             bool saturated_before_max = false;
0268             CMS_UNROLL_LOOP
0269             for (char ii = 0; ii < 5; ii++)
0270               saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
0271 
0272             // if saturation is in the max sample and not in the first 5
0273             if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
0274               energies[inputCh] = 49140;  // 4095 * 12 (maximum ADC range * MultiGainPreAmplifier (MGPA) gain)
0275                                           // This is the actual maximum range that is set when we saturate.
0276                                           //---- AM FIXME : no pedestal subtraction???
0277                                           //It should be "(4095. - pedestal) * gainratio"
0278 
0279             // set state flag to terminate further processing of this channel
0280             acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0281             flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
0282             flags[inputCh] = flag;
0283             return;
0284           }
0285 
0286           // according to cpu version
0287           //            auto max_amplitude = amplitudes[ch][sample_max];
0288           auto const max_amplitude = amplitude;
0289           // according to cpu version
0290           auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7];
0291           // note, no syncing as the same thread will be accessing here
0292           bool hasGainSwitch =
0293               shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
0294 
0295           // pedestal is final unconditionally
0296           g_pedestal[inputCh] = pedestal;
0297           if (hasGainSwitch && gainSwitchUseMaxSample) {
0298             // thread for sample=0 will access the right guys
0299             energies[inputCh] = max_amplitude / shape_value;
0300             acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0301             flags[inputCh] = flag;
0302             return;
0303           }
0304 
0305           // this happens cause sometimes rms_x12 is 0...
0306           // needs to be checkec why this is the case
0307           // general case here is that noisecov is a Zero matrix
0308           if (rmsForChecking == 0) {
0309             acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0310             flags[inputCh] = flag;
0311             return;
0312           }
0313 
0314           // for the case when no shortcuts were taken
0315           flags[inputCh] = flag;
0316         }
0317       }
0318     }
0319 
0320     ///
0321     /// assume kernel launch configuration is
0322     /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
0323     ///
0324     __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
0325                                    uint32_t const* dids_eb,
0326                                    uint32_t const* dids_ee,
0327                                    float const* rms_x12,
0328                                    float const* rms_x6,
0329                                    float const* rms_x1,
0330                                    float const* gain12Over6,
0331                                    float const* gain6Over1,
0332                                    double const* G12SamplesCorrelationEB,
0333                                    double const* G6SamplesCorrelationEB,
0334                                    double const* G1SamplesCorrelationEB,
0335                                    double const* G12SamplesCorrelationEE,
0336                                    double const* G6SamplesCorrelationEE,
0337                                    double const* G1SamplesCorrelationEE,
0338                                    SampleMatrix* noisecov,
0339                                    PulseMatrixType* pulse_matrix,
0340                                    EcalPulseShape const* pulse_shape,
0341                                    bool const* hasSwitchToGain6,
0342                                    bool const* hasSwitchToGain1,
0343                                    bool const* isSaturated,
0344                                    uint32_t const offsetForHashes,
0345                                    uint32_t const offsetForInputs) {
0346       int const ch = blockIdx.x;
0347       int const tx = threadIdx.x;
0348       int const ty = threadIdx.y;
0349       constexpr float addPedestalUncertainty = 0.f;
0350       constexpr bool dynamicPedestal = false;
0351       constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
0352 
0353       // to access input arrays (ids and digis only)
0354       int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0355       auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
0356 
0357       bool tmp0 = hasSwitchToGain6[ch];
0358       bool tmp1 = hasSwitchToGain1[ch];
0359       auto const did = DetId{dids[inputCh]};
0360       auto const isBarrel = did.subdetId() == EcalBarrel;
0361       auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0362                                      : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0363       auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
0364       auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
0365       auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
0366       bool tmp2 = isSaturated[ch];
0367       bool hasGainSwitch = tmp0 || tmp1 || tmp2;
0368       auto const vidx = std::abs(ty - tx);
0369 
0370       // non-divergent branch for all threads per block
0371       if (hasGainSwitch) {
0372         // TODO: did not include simplified noise model
0373         float noise_value = 0;
0374 
0375         // non-divergent branch - all threads per block
0376         // TODO: all of these constants indicate that
0377         // that these parts could be splitted into completely different
0378         // kernels and run one of them only depending on the config
0379         if (simplifiedNoiseModelForGainSwitch) {
0380           int isample_max = 5;  // according to cpu defs
0381           int gainidx = gainNoise[ch][isample_max];
0382 
0383           // non-divergent branches
0384           if (gainidx == 0)
0385             noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx];
0386           if (gainidx == 1)
0387             noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
0388                           G6SamplesCorrelation[vidx];
0389           if (gainidx == 2)
0390             noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] *
0391                           rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx];
0392           if (!dynamicPedestal && addPedestalUncertainty > 0.f)
0393             noise_value += addPedestalUncertainty * addPedestalUncertainty;
0394         } else {
0395           int gainidx = 0;
0396           char mask = gainidx;
0397           int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
0398           //            NB: gainratio is 1, that is why it does not appear in the formula
0399           noise_value += rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
0400           // non-divergent branch
0401           if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0402             noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal;  // gainratio is 1
0403           }
0404 
0405           //
0406           gainidx = 1;
0407           mask = gainidx;
0408           pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
0409           noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
0410                          pedestal * G6SamplesCorrelation[vidx];
0411           // non-divergent branch
0412           if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0413             noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty *
0414                            addPedestalUncertainty * pedestal;
0415           }
0416 
0417           //
0418           gainidx = 2;
0419           mask = gainidx;
0420           pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
0421           float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
0422           noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx];
0423           // non-divergent branch
0424           if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0425             noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal;
0426           }
0427         }
0428 
0429         noisecov[ch](ty, tx) = noise_value;
0430       } else {
0431         auto rms = rms_x12[hashedId];
0432         float noise_value = rms * rms * G12SamplesCorrelation[vidx];
0433         if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0434           //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
0435           noise_value += addPedestalUncertainty * addPedestalUncertainty;
0436         }
0437         noisecov[ch](ty, tx) = noise_value;
0438       }
0439 
0440       // pulse matrix
0441       int const posToAccess = 9 - tx + ty;  // see cpu for reference
0442       float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0;
0443       pulse_matrix[ch](ty, tx) = value;
0444     }
0445 
0446     __global__ void kernel_permute_results(SampleVector* amplitudes,
0447                                            BXVectorType const* activeBXs,
0448                                            ::ecal::reco::StorageScalarType* energies,
0449                                            char const* acState,
0450                                            int const nchannels) {
0451       // constants
0452       constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
0453 
0454       // indices
0455       int const tx = threadIdx.x + blockIdx.x * blockDim.x;
0456       int const ch = tx / nsamples;
0457       int const sampleidx = tx % nsamples;  // this is to address activeBXs
0458 
0459       if (ch >= nchannels)
0460         return;
0461 
0462       // channels that have amplitude precomputed do not need results to be permuted
0463       auto const state = static_cast<MinimizationState>(acState[ch]);
0464       if (state == MinimizationState::Precomputed)
0465         return;
0466 
0467       // configure shared memory and cp into it
0468       extern __shared__ char smem[];
0469       SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(smem);
0470       values[threadIdx.x] = amplitudes[ch](sampleidx);
0471       __syncthreads();
0472 
0473       // get the sample for this bx
0474       auto const sample = static_cast<int>(activeBXs[ch](sampleidx)) + 5;
0475 
0476       // store back to global
0477       amplitudes[ch](sample) = values[threadIdx.x];
0478 
0479       // store sample 5 separately
0480       // only for the case when minimization was performed
0481       // not for cases with precomputed amplitudes
0482       if (sample == 5)
0483         energies[ch] = values[threadIdx.x];
0484     }
0485 
0486   }  // namespace multifit
0487 }  // namespace ecal