Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-22 07:34:23

0001 #include <iostream>
0002 #include <alpaka/alpaka.hpp>
0003 #include <Eigen/Dense>
0004 
0005 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0006 #include "HeterogeneousCore/AlpakaInterface/interface/atomicMaxPair.h"
0007 
0008 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
0009 // needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
0010 #include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
0011 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0012 #include "FWCore/Utilities/interface/HostDeviceConstant.h"
0013 // TODO reuse some of the HCAL constats from
0014 //#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
0015 
0016 #include "Mahi.h"
0017 
0018 #ifdef HCAL_MAHI_GPUDEBUG
0019 #define DETID_TO_DEBUG 1125647428
0020 #endif
0021 
0022 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0023 
0024   using namespace cms::alpakatools;
0025   namespace hcal::reconstruction {
0026     namespace mahi {
0027 
0028       ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float uint_as_float(uint32_t val) {
0029 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
0030         return __uint_as_float(val);
0031 #else
0032         return edm::bit_cast<float>(val);
0033 #endif
0034       }
0035 
0036       ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t float_as_uint(float val) {
0037 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
0038         return __float_as_uint(val);
0039 #else
0040         return edm::bit_cast<unsigned int>(val);
0041 #endif
0042       }
0043 
0044       ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_time_slew_delay(float const fC,
0045                                                                    float const tzero,
0046                                                                    float const slope,
0047                                                                    float const tmax) {
0048         auto const rawDelay = tzero + slope * std::log(fC);
0049         return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);
0050       }
0051 
0052       // HcalQIEShapes are hardcoded in HcalQIEData.cc basically
0053       // + some logic to generate 128 and 256 value arrays...
0054       HOST_DEVICE_CONSTANT float qie8shape[129] = {
0055           -1,   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   16,
0056           18,   20,   22,   24,   26,   28,   31,   34,   37,   40,   44,   48,   52,   57,   62,   57,   62,
0057           67,   72,   77,   82,   87,   92,   97,   102,  107,  112,  117,  122,  127,  132,  142,  152,  162,
0058           172,  182,  192,  202,  217,  232,  247,  262,  282,  302,  322,  347,  372,  347,  372,  397,  422,
0059           447,  472,  497,  522,  547,  572,  597,  622,  647,  672,  697,  722,  772,  822,  872,  922,  972,
0060           1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
0061           2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
0062           5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
0063 
0064       HOST_DEVICE_CONSTANT float qie11shape[257] = {
0065           -0.5,    0.5,     1.5,     2.5,     3.5,     4.5,     5.5,     6.5,     7.5,     8.5,     9.5,     10.5,
0066           11.5,    12.5,    13.5,    14.5,    15.5,    17.5,    19.5,    21.5,    23.5,    25.5,    27.5,    29.5,
0067           31.5,    33.5,    35.5,    37.5,    39.5,    41.5,    43.5,    45.5,    47.5,    49.5,    51.5,    53.5,
0068           55.5,    59.5,    63.5,    67.5,    71.5,    75.5,    79.5,    83.5,    87.5,    91.5,    95.5,    99.5,
0069           103.5,   107.5,   111.5,   115.5,   119.5,   123.5,   127.5,   131.5,   135.5,   139.5,   147.5,   155.5,
0070           163.5,   171.5,   179.5,   187.5,   171.5,   179.5,   187.5,   195.5,   203.5,   211.5,   219.5,   227.5,
0071           235.5,   243.5,   251.5,   259.5,   267.5,   275.5,   283.5,   291.5,   299.5,   315.5,   331.5,   347.5,
0072           363.5,   379.5,   395.5,   411.5,   427.5,   443.5,   459.5,   475.5,   491.5,   507.5,   523.5,   539.5,
0073           555.5,   571.5,   587.5,   603.5,   619.5,   651.5,   683.5,   715.5,   747.5,   779.5,   811.5,   843.5,
0074           875.5,   907.5,   939.5,   971.5,   1003.5,  1035.5,  1067.5,  1099.5,  1131.5,  1163.5,  1195.5,  1227.5,
0075           1259.5,  1291.5,  1355.5,  1419.5,  1483.5,  1547.5,  1611.5,  1675.5,  1547.5,  1611.5,  1675.5,  1739.5,
0076           1803.5,  1867.5,  1931.5,  1995.5,  2059.5,  2123.5,  2187.5,  2251.5,  2315.5,  2379.5,  2443.5,  2507.5,
0077           2571.5,  2699.5,  2827.5,  2955.5,  3083.5,  3211.5,  3339.5,  3467.5,  3595.5,  3723.5,  3851.5,  3979.5,
0078           4107.5,  4235.5,  4363.5,  4491.5,  4619.5,  4747.5,  4875.5,  5003.5,  5131.5,  5387.5,  5643.5,  5899.5,
0079           6155.5,  6411.5,  6667.5,  6923.5,  7179.5,  7435.5,  7691.5,  7947.5,  8203.5,  8459.5,  8715.5,  8971.5,
0080           9227.5,  9483.5,  9739.5,  9995.5,  10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
0081           12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
0082           18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
0083           28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
0084           41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
0085           65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
0086           96523.5, 100620,  104716,  108812,  112908};
0087 
0088       constexpr int32_t IPHI_MAX = 72;
0089 
0090       ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHB(
0091           uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
0092         HcalDetId did{didraw};
0093         uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
0094         return did.ieta() > 0
0095                    ? value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() - firstHBRing)
0096                    : value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
0097       }
0098       ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHE(uint32_t const didraw,
0099                                                                 int const maxDepthHE,
0100                                                                 int const maxPhiHE,
0101                                                                 int const firstHERing,
0102                                                                 int const lastHERing,
0103                                                                 int const nEtaHE) {
0104         HcalDetId did{didraw};
0105         uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);
0106         return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)
0107                               : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);
0108       }
0109       ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
0110         return capid * 4 + range;
0111       }
0112 
0113       ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_reco_correction_factor(float const par1,
0114                                                                           float const par2,
0115                                                                           float const par3,
0116                                                                           float const x) {
0117         return par3 * x * x + par2 * x + par1;
0118       }
0119 
0120       // compute the charge using the adc, qie type and the appropriate qie shape array
0121       ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_coder_charge(
0122           int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
0123         auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;
0124         auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;
0125         auto const nbins = qieType == 0 ? 32 : 64;
0126         auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])
0127                                                      : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);
0128         auto const index = get_qiecoder_index(capid, range);
0129         return (center - qieOffsets[index]) / qieSlopes[index];
0130       }
0131 
0132       // this is from
0133       //  https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc#L140
0134 
0135       ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_diff_charge_gain(int const qieType,
0136                                                                     uint8_t adc,
0137                                                                     uint8_t const capid,
0138                                                                     float const* qieOffsets,
0139                                                                     float const* qieSlopes,
0140                                                                     bool const isqie11) {
0141         constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
0142         constexpr uint32_t mantissaMaskQIE11 = 0x3f;
0143         auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;
0144         auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0145         auto const mantissa = adc & mantissaMask;
0146 
0147         if (mantissa == 0u || mantissa == mantissaMask - 1u)
0148           return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
0149         else if (mantissa == 1u || mantissa == mantissaMask)
0150           return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
0151         else {
0152           auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
0153           auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
0154           auto const upgain = qup - q;
0155           auto const downgain = q - qdown;
0156           auto const averagegain = (qup - qdown) / 2.f;
0157           if (std::abs(upgain - downgain) < 0.01f * averagegain)
0158             return averagegain;
0159           else {
0160             auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
0161             auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
0162             auto const upgain2 = q2up - qup;
0163             auto const downgain2 = qdown - q2down;
0164             if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))
0165               return upgain;
0166             else
0167               return downgain;
0168           }
0169         }
0170       }
0171 
0172       using PulseShapeConstElement = typename HcalPulseShapeSoA::ConstView::const_element;
0173       // TODO: remove what's not needed
0174       // originally from from RecoLocalCalo/HcalRecAlgos/src/PulseShapeFunctor.cc
0175       ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_pulse_shape_value(PulseShapeConstElement const& pulseShape,
0176                                                                      float const pulse_time,
0177                                                                      int const sample,
0178                                                                      int const shift) {
0179         auto const& acc25nsVec = pulseShape.acc25nsVec();
0180         auto const& diff25nsItvlVec = pulseShape.diff25nsItvlVec();
0181         auto const& accVarLenIdxMinusOneVec = pulseShape.accVarLenIdxMinusOneVec();
0182         auto const& diffVarItvlIdxMinusOneVec = pulseShape.diffVarItvlIdxMinusOneVec();
0183         auto const& accVarLenIdxZeroVec = pulseShape.accVarLenIdxZEROVec();
0184         auto const& diffVarItvlIdxZeroVec = pulseShape.diffVarItvlIdxZEROVec();
0185 
0186         // constants
0187         constexpr float slew = 0.f;
0188         constexpr auto ns_per_bx = ::hcal::constants::nsPerBX;
0189 
0190         // FIXME: clean up all the rounding... this is coming from original cpu version
0191         float const i_start_float = -::hcal::constants::iniTimeShift - pulse_time - slew > 0.f
0192                                         ? 0.f
0193                                         : std::abs(-::hcal::constants::iniTimeShift - pulse_time - slew) + 1.f;
0194         int i_start = static_cast<int>(i_start_float);
0195         float offset_start = static_cast<float>(i_start) - ::hcal::constants::iniTimeShift - pulse_time - slew;
0196 
0197         // boundary
0198         if (offset_start == 1.0f) {
0199           offset_start = 0.f;
0200           i_start -= 1;
0201         }
0202 
0203         int const bin_start = static_cast<int>(offset_start);
0204         float const bin_start_up = static_cast<float>(bin_start) + 0.5f;
0205         int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
0206         int const its_start = i_start / ns_per_bx;
0207         int const distTo25ns_start = ns_per_bx - 1 - i_start % ns_per_bx;
0208         auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
0209 
0210         auto const sample_over10ts = sample + shift;
0211         float value = 0.0f;
0212         if (sample_over10ts == its_start) {
0213           value = bin_0_start == -1
0214                       ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
0215                       : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
0216         } else if (sample_over10ts > its_start) {
0217           int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
0218           value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
0219         }
0220         return value;
0221       }
0222 
0223       // TODO: provide constants from configuration
0224       // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py
0225       constexpr int nMaxItersMin = 50;
0226       constexpr int nMaxItersNNLS = 500;
0227       constexpr double nnlsThresh = 1e-11;
0228       constexpr float deltaChi2Threashold = 1e-3;
0229 
0230       // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc
0231       ALPAKA_FN_ACC float get_raw_charge(double const charge,
0232                                          double const pedestal,
0233                                          float const* shrChargeMinusPedestal,
0234                                          float const parLin1,
0235                                          float const parLin2,
0236                                          float const parLin3,
0237                                          int32_t const nsamplesForCompute,
0238                                          int32_t const soi,
0239                                          int const sipmQTSShift,
0240                                          int const sipmQNTStoSum,
0241                                          float const fcByPE,
0242                                          int32_t const lch,
0243                                          bool const isqie11) {
0244         float rawCharge;
0245 
0246         if (!isqie11)
0247           rawCharge = charge;
0248         else {
0249           int const first = std::max(soi + sipmQTSShift, 0);
0250           int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
0251           float sipmq = 0.0f;
0252           for (auto ts = first; ts < last; ts++)
0253             sipmq += shrChargeMinusPedestal[lch * nsamplesForCompute + ts];
0254           auto const effectivePixelsFired = sipmq / fcByPE;
0255           auto const factor = compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
0256           rawCharge = (charge - pedestal) * factor + pedestal;
0257 
0258 #ifdef HCAL_MAHI_GPUDEBUG
0259           printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
0260 #endif
0261         }
0262         return rawCharge;
0263       }
0264 
0265       // Tried using lambda, but strange error from  with nvcc
0266       inline constexpr bool TSenergyCompare(std::pair<unsigned int, float> a, std::pair<unsigned int, float> b) {
0267         return a.second > b.second;
0268       };
0269 
0270       class Kernel_prep1d_sameNumberOfSamples {
0271       public:
0272         ALPAKA_FN_ACC void operator()(Acc2D const& acc,
0273                                       OProductType::View outputGPU,
0274                                       IProductTypef01::ConstView f01HEDigis,
0275                                       IProductTypef5::ConstView f5HBDigis,
0276                                       IProductTypef3::ConstView f3HBDigis,
0277                                       HcalMahiConditionsPortableDevice::ConstView mahi,
0278                                       HcalSiPMCharacteristicsPortableDevice::ConstView sipmCharacteristics,
0279                                       HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS,
0280                                       bool const useEffectivePedestals,
0281                                       int const sipmQTSShift,
0282                                       int const sipmQNTStoSum,
0283                                       int const firstSampleShift,
0284                                       float const ts4Thresh,
0285                                       float* amplitudes,
0286                                       float* noiseTerms,
0287                                       float* electronicNoiseTerms,
0288                                       int8_t* soiSamples,
0289                                       int const windowSize) const {
0290           auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
0291           auto const startingSample = compute_nsamples<Flavor1>(f01HEDigis.stride()) - windowSize;
0292           auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
0293 
0294           //first index = groups of channel
0295           auto const nchannels_per_block(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
0296           //2nd index = groups of sample
0297           auto const nsamplesForCompute(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
0298 
0299           // configure shared mem
0300           float* shrEnergyM0PerTS = alpaka::getDynSharedMem<float>(acc);
0301           float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
0302           float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
0303           float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
0304           unsigned long long int* shrMethod0EnergySamplePair =
0305               reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
0306 
0307           //Loop over all groups of channels
0308           for (auto group : uniform_groups_y(acc, nchannels)) {
0309             //Loop over each channel, first compute soiSamples and shrMem for all channels
0310             for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
0311               auto const gch = channel.global;
0312               auto const lch = channel.local;
0313 
0314               for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
0315                 auto const sampleWithinWindow = i_sample;
0316                 auto const sample = i_sample + startingSample;
0317                 auto const linearThPerBlock = i_sample + channel.local * nsamplesForCompute;
0318                 // initialize
0319                 if (sampleWithinWindow == 0) {
0320                   soiSamples[gch] = -1;
0321                   shrMethod0EnergyAccum[lch] = 0;
0322                   shrMethod0EnergySamplePair[lch] = 0;
0323                   shrEnergyM0TotalAccum[lch] = 0;
0324                 }
0325 
0326                 // compute soiSamples
0327                 if (gch < f01HEDigis.size()) {
0328                   // NOTE: assume that soi is high only for a single guy!
0329                   //   which must be the case. cpu version does not check for that
0330                   //   if that is not the case, we will see that with cuda mmecheck
0331                   auto const soibit = soibit_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample);
0332                   if (soibit == 1)
0333                     soiSamples[gch] = sampleWithinWindow;
0334                 } else if (gch >= nchannelsf015) {
0335                   auto const soibit = soibit_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample);
0336                   if (soibit == 1)
0337                     soiSamples[gch] = sampleWithinWindow;
0338                 }
0339 
0340                 // compute shrMem
0341                 auto const id = gch < f01HEDigis.size()
0342                                     ? f01HEDigis.ids()[gch]
0343                                     : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
0344                                                            : f3HBDigis.ids()[gch - nchannelsf015]);
0345                 auto const did = HcalDetId{id};
0346 
0347                 auto const adc =
0348                     gch < f01HEDigis.size()
0349                         ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0350                         : (gch < nchannelsf015
0351                                ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0352                                : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0353                 auto const capid =
0354                     gch < f01HEDigis.size()
0355                         ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0356                         : (gch < nchannelsf015
0357                                ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0358                                : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0359 
0360                 // compute hash for this did
0361                 // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
0362                 auto const hashedId =
0363                     did.subdetId() == HcalBarrel
0364                         ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0365                         : did2linearIndexHE(id,
0366                                             mahi.maxDepthHE(),
0367                                             mahi.maxPhiHE(),
0368                                             mahi.firstHERing(),
0369                                             mahi.lastHERing(),
0370                                             mahi.nEtaHE()) +
0371                               mahi.offsetForHashes();
0372 
0373                 // conditions based on the hash
0374                 auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0;  // 2 types at this point
0375                 auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
0376                 auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
0377                 auto const pedestal = mahi.pedestals_value()[hashedId][capid];
0378 
0379                 // compute charge
0380                 auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0381 
0382                 shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
0383               }
0384             }
0385             alpaka::syncBlockThreads(acc);
0386 
0387             //Loop over each channel, compute input for multifit using shrChargeMinusPedestal
0388             for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
0389               auto const gch = channel.global;
0390               auto const lch = channel.local;
0391               for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
0392                 auto const sampleWithinWindow = i_sample;
0393                 auto const sample = i_sample + startingSample;
0394 
0395                 // initialize all output buffers
0396                 if (sampleWithinWindow == 0) {
0397                   outputGPU.detId()[gch] = 0;
0398                   outputGPU.energyM0()[gch] = 0;
0399                   outputGPU.timeM0()[gch] = 0;
0400                   outputGPU.energy()[gch] = 0;
0401                   outputGPU.chi2()[gch] = 0;
0402                 }
0403 
0404                 // offset output
0405                 auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
0406                 auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
0407                 auto* electronicNoiseTermsForChannel = electronicNoiseTerms + nsamplesForCompute * gch;
0408 
0409                 // get event input quantities
0410                 auto const stride = gch < f01HEDigis.size()
0411                                         ? f01HEDigis.stride()
0412                                         : (gch < f5HBDigis.size() ? f5HBDigis.stride() : f3HBDigis.stride());
0413                 auto const nsamples =
0414                     gch < f01HEDigis.size()
0415                         ? compute_nsamples<Flavor1>(stride)
0416                         : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride) : compute_nsamples<Flavor3>(stride));
0417 
0418                 ALPAKA_ASSERT_ACC(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
0419 
0420                 auto const id = gch < f01HEDigis.size()
0421                                     ? f01HEDigis.ids()[gch]
0422                                     : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
0423                                                            : f3HBDigis.ids()[gch - nchannelsf015]);
0424                 auto const did = HcalDetId{id};
0425 
0426                 auto const adc =
0427                     gch < f01HEDigis.size()
0428                         ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0429                         : (gch < nchannelsf015
0430                                ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0431                                : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0432                 auto const capid =
0433                     gch < f01HEDigis.size()
0434                         ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0435                         : (gch < nchannelsf015
0436                                ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0437                                : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0438 
0439                 // compute hash for this did
0440                 // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
0441                 auto const hashedId =
0442                     did.subdetId() == HcalBarrel
0443                         ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0444                         : did2linearIndexHE(id,
0445                                             mahi.maxDepthHE(),
0446                                             mahi.maxPhiHE(),
0447                                             mahi.firstHERing(),
0448                                             mahi.lastHERing(),
0449                                             mahi.nEtaHE()) +
0450                               mahi.offsetForHashes();
0451 
0452                 // conditions based on the hash
0453                 auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0;  // 2 types at this point
0454                 auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
0455                 auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
0456                 auto const* pedestalWidthsForChannel =
0457                     useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
0458                         ? mahi.effectivePedestalWidths()[hashedId].data()
0459                         : mahi.pedestals_width()[hashedId].data();
0460 
0461                 auto const gain = mahi.gains_value()[hashedId][capid];
0462                 auto const gain0 = mahi.gains_value()[hashedId][0];
0463                 auto const respCorrection = mahi.respCorrs_values()[hashedId];
0464                 auto const pedestal = mahi.pedestals_value()[hashedId][capid];
0465                 auto const pedestalWidth = pedestalWidthsForChannel[capid];
0466                 // if needed, only use effective pedestals for f01
0467                 auto const pedestalToUseForMethod0 =
0468                     useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
0469                         ? mahi.effectivePedestals()[hashedId][capid]
0470                         : pedestal;
0471                 auto const sipmType = mahi.sipmPar_type()[hashedId];
0472                 auto const fcByPE = mahi.sipmPar_fcByPE()[hashedId];
0473                 auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
0474                 auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
0475 
0476 #ifdef HCAL_MAHI_GPUDEBUG
0477                 printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
0478                        qieType,
0479                        qieOffsets[0],
0480                        qieOffsets[1],
0481                        qieSlopes[0],
0482                        qieSlopes[1]);
0483 #endif
0484 
0485                 // compute charge
0486                 auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0487 
0488                 int32_t const soi =
0489                     gch < f01HEDigis.size()
0490                         ? soiSamples[gch]
0491                         : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
0492 
0493                 bool badSOI = (soi < 0 or static_cast<unsigned>(soi) >= nsamplesForCompute);
0494                 if (badSOI and sampleWithinWindow == 0) {
0495 #ifdef GPU_DEBUG
0496                   printf("Found HBHE channel %d with invalid SOI %d\n", gch, soi);
0497 #endif
0498                   // mark the channel as bad
0499                   outputGPU.chi2()[gch] = -9999.f;
0500                 }
0501 
0502                 // type index starts from 1 .. 6
0503                 // precicionItem index starts from 0 .. 5
0504                 auto const precisionItem = sipmCharacteristics.precisionItem()[sipmType - 1];
0505                 auto const parLin1 = precisionItem.parLin1_;
0506                 auto const parLin2 = precisionItem.parLin2_;
0507                 auto const parLin3 = precisionItem.parLin3_;
0508 
0509                 //int32_t const soi = gch >= nchannelsf01HE
0510                 //    ? npresamplesf5HB[gch - nchannelsf01HE]
0511                 //    : soiSamples[gch];
0512                 // this is here just to make things uniform...
0513                 if (gch >= f01HEDigis.size() && gch < nchannelsf015 && sampleWithinWindow == 0)
0514                   soiSamples[gch] = f5HBDigis.npresamples()[gch - f01HEDigis.size()];
0515 
0516                 //
0517                 // compute various quantities (raw charge and tdc stuff)
0518                 // NOTE: this branch will be divergent only for a single warp that
0519                 // sits on the boundary when flavor 01 channels end and flavor 5 start
0520                 //
0521                 float const rawCharge = get_raw_charge(charge,
0522                                                        pedestal,
0523                                                        shrChargeMinusPedestal,
0524                                                        parLin1,
0525                                                        parLin2,
0526                                                        parLin3,
0527                                                        nsamplesForCompute,
0528                                                        soi,
0529                                                        sipmQTSShift,
0530                                                        sipmQNTStoSum,
0531                                                        fcByPE,
0532                                                        lch,
0533                                                        gch < f01HEDigis.size() || gch >= nchannelsf015);
0534 
0535                 auto const dfc = compute_diff_charge_gain(
0536                     qieType, adc, capid, qieOffsets, qieSlopes, gch < f01HEDigis.size() || gch >= nchannelsf015);
0537 
0538 #ifdef COMPUTE_TDC_TIME
0539                 float tdcTime;
0540                 if (gch >= f01HEDigis.size() && gch < nchannelsf015) {
0541                   tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
0542                 } else {
0543                   if (gch < f01HEDigis.size())
0544                     tdcTime =
0545                         HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample));
0546                   else if (gch >= nchannelsf015)
0547                     tdcTime = HcalSpecialTimes::getTDCTime(
0548                         tdc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0549                 }
0550 #endif  // COMPUTE_TDC_TIME
0551 
0552                 // compute method 0 quantities
0553                 // TODO: need to apply containment
0554                 // TODO: need to apply time slew
0555                 // TODO: for < run 3, apply HBM legacy energy correction
0556                 auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
0557                 auto const startSampleTmp = soi + firstSampleShift;
0558                 auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
0559                 auto const endSample =
0560                     startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
0561                 // NOTE: gain is a small number < 10^-3, multiply it last
0562                 auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0563                 auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0564                 // store to shared mem
0565                 shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
0566 
0567                 alpaka::atomicAdd(
0568                     acc, &shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, alpaka::hierarchy::Threads{});
0569 
0570 #ifdef HCAL_MAHI_GPUDEBUG
0571                 printf(
0572                     "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
0573                     "   charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
0574                     "   gain = %f respCorrection = %f energym0_per_ts = %f\n",
0575                     id,
0576                     sample,
0577                     gch,
0578                     hashedId,
0579                     adc,
0580                     capid,
0581                     charge,
0582                     rawCharge,
0583                     dfc,
0584                     pedestalToUseForMethod0,
0585                     gain,
0586                     respCorrection,
0587                     energym0_per_ts);
0588                 printf("startSample = %d endSample = %d param1 = %u param2 = %u\n",
0589                        startSample,
0590                        endSample,
0591                        recoParam1,
0592                        recoParam2);
0593 #endif
0594                 //Find the max energy of lch channel and the corresponding TS
0595                 if (sampleWithinWindow >= static_cast<unsigned>(startSample) &&
0596                     sampleWithinWindow < static_cast<unsigned>(endSample)) {
0597                   //sum the energys of all TS
0598                   alpaka::atomicAdd(acc, &shrMethod0EnergyAccum[lch], energym0_per_ts, alpaka::hierarchy::Threads{});
0599                   // pair TS and E for all TSs, find max pair according to E
0600                   // TODO: Non-deterministic behavior for TS with equal energy
0601                   atomicMaxPair(
0602                       acc, &shrMethod0EnergySamplePair[lch], {sampleWithinWindow, energym0_per_ts}, TSenergyCompare);
0603                 }
0604 
0605                 // NOTE: must take soi, as values for that thread are used...
0606                 // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
0607                 if (sampleWithinWindow == static_cast<unsigned>(soi)) {
0608                   // Channel quality check
0609                   //    https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecAlgos/plugins/HcalChannelPropertiesEP.cc#L107-L109
0610                   //    https://github.com/cms-sw/cmssw/blob/6d2f66057131baacc2fcbdd203588c41c885b42c/CondCore/HcalPlugins/plugins/HcalChannelQuality_PayloadInspector.cc#L30
0611                   //      const bool taggedBadByDb = severity.dropChannel(digistatus->getValue());
0612                   //  do not run MAHI if taggedBadByDb = true
0613                   auto const digiStatus_ = mahi.channelQuality_status()[hashedId];
0614                   const bool taggedBadByDb = (digiStatus_ / 32770);
0615 
0616                   if (taggedBadByDb)
0617                     outputGPU.chi2()[gch] = -9999.f;
0618 
0619                   // check as in cpu version if mahi is not needed
0620                   // (use "not" and ">", instead of "<=", to ensure that a NaN value will pass the check, and the hit be flagged as invalid)
0621                   if (not(energym0_per_ts_gain0 > ts4Thresh)) {
0622                     outputGPU.chi2()[gch] = -9999.f;
0623                   }
0624                 }
0625                 //
0626                 // preparations for mahi fit
0627                 //
0628                 auto const amplitude = rawCharge - pedestalToUseForMethod0;
0629                 auto const noiseADC = (1. / std::sqrt(12)) * dfc;
0630                 auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
0631                 auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
0632 
0633                 // store to global memory
0634                 amplitudesForChannel[sampleWithinWindow] = amplitude;
0635                 noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
0636                 electronicNoiseTermsForChannel[sampleWithinWindow] = pedestalWidth;
0637 
0638               }  //end sample loop
0639             }  // end channel loop
0640             alpaka::syncBlockThreads(acc);
0641 
0642             // compute energy using shrMethod0EnergySamplePair
0643             for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
0644               auto const gch = channel.global;
0645               auto const lch = channel.local;
0646               for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
0647                 auto const sampleWithinWindow = i_sample;
0648 
0649                 int32_t const soi =
0650                     gch < f01HEDigis.size()
0651                         ? soiSamples[gch]
0652                         : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
0653 
0654                 auto const id = gch < f01HEDigis.size()
0655                                     ? f01HEDigis.ids()[gch]
0656                                     : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
0657                                                            : f3HBDigis.ids()[gch - nchannelsf015]);
0658                 auto const did = HcalDetId{id};
0659                 auto const hashedId =
0660                     did.subdetId() == HcalBarrel
0661                         ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0662                         : did2linearIndexHE(id,
0663                                             mahi.maxDepthHE(),
0664                                             mahi.maxPhiHE(),
0665                                             mahi.firstHERing(),
0666                                             mahi.lastHERing(),
0667                                             mahi.nEtaHE()) +
0668                               mahi.offsetForHashes();
0669 
0670                 auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
0671                 auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
0672 
0673                 auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
0674                 // NOTE: must take soi, as values for that thread are used...
0675                 // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
0676                 if (sampleWithinWindow == static_cast<unsigned>(soi)) {
0677                   auto const method0_energy = shrMethod0EnergyAccum[lch];
0678                   auto const val = shrMethod0EnergySamplePair[lch];
0679                   int const max_sample = (val >> 32) & 0xffffffff;
0680 
0681                   float const max_energy = uint_as_float(static_cast<uint32_t>(val & 0xffffffff));
0682 
0683                   float const max_energy_1 = static_cast<unsigned>(max_sample) < nsamplesForCompute - 1
0684                                                  ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1]
0685                                                  : 0.f;
0686                   float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
0687                   auto const sum = max_energy + max_energy_1;
0688                   // FIXME: for full comparison with cpu method 0  timing,
0689                   // need to correct by slew
0690                   // requires an accumulator -> more shared mem -> omit here unless
0691                   // really needed
0692                   float const time =
0693                       max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
0694 
0695                   // store method0 quantities to global mem
0696                   outputGPU.detId()[gch] = id;
0697                   outputGPU.energyM0()[gch] = method0_energy;
0698                   outputGPU.timeM0()[gch] = time;
0699 
0700                   // check as in cpu version if mahi is not needed
0701                   // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
0702                   // are basically equal and generate -0.00000...
0703                   // needs to be treated properly
0704                   // (use "not" and ">", instead of "<=", to ensure that a NaN value will pass the check, and the hit be flagged as invalid)
0705                   if (not(shrEnergyM0TotalAccum[lch] > 0)) {
0706                     outputGPU.chi2()[gch] = -9999.f;
0707                   }
0708 
0709 #ifdef HCAL_MAHI_GPUDEBUG
0710                   printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n",
0711                          shrEnergyM0TotalAccum[lch],
0712                          energym0_per_ts_gain0,
0713                          ts4Thresh);
0714 #endif
0715 
0716 #ifdef HCAL_MAHI_GPUDEBUG
0717                   printf(" method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
0718                          method0_energy,
0719                          max_sample,
0720                          max_energy,
0721                          time);
0722 #endif
0723                 }
0724 #ifdef HCAL_MAHI_GPUDEBUG
0725                 printf(
0726                     "charge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f "
0727                     "noisPhoto(%d) =%f outputGPU chi2[gch] = %f \n",
0728                     sample,
0729                     rawCharge,
0730                     sample,
0731                     pedestalToUseForMethod0,
0732                     sample,
0733                     dfc,
0734                     sample,
0735                     pedestalWidth,
0736                     sample,
0737                     noiseADC,
0738                     sample,
0739                     noisePhotoSq,
0740                     outputGPU.chi2()[gch]);
0741 #endif
0742 
0743               }  // loop for sample
0744             }  // loop for channels
0745           }  // loop for channgel groups
0746         }
0747       };  //Kernel_prep1d_sameNumberOfSamples
0748 
0749       class Kernel_prep_pulseMatrices_sameNumberOfSamples {
0750       public:
0751         ALPAKA_FN_ACC void operator()(Acc3D const& acc,
0752                                       float* pulseMatrices,
0753                                       float* pulseMatricesM,
0754                                       float* pulseMatricesP,
0755                                       HcalMahiPulseOffsetsSoA::ConstView pulseOffsets,
0756                                       float const* amplitudes,
0757                                       IProductTypef01::ConstView f01HEDigis,
0758                                       IProductTypef5::ConstView f5HBDigis,
0759                                       IProductTypef3::ConstView f3HBDigis,
0760                                       int8_t* soiSamples,
0761                                       HcalMahiConditionsPortableDevice::ConstView mahi,
0762                                       HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS,
0763                                       float const meanTime,
0764                                       float const timeSigmaSiPM,
0765                                       float const timeSigmaHPD,
0766                                       bool const applyTimeSlew,
0767                                       float const tzeroTimeSlew,
0768                                       float const slopeTimeSlew,
0769                                       float const tmaxTimeSlew) const {
0770           //2nd index = pulse
0771           auto const npulses(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
0772           //3rd index = sample
0773           auto const nsamples(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[2u]);
0774 
0775           auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
0776           auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
0777 
0778           //Loop over each channel
0779           for (auto channel : uniform_elements_z(acc, nchannels)) {
0780             //Loop over pulses
0781             for (auto ipulse : independent_group_elements_y(acc, npulses)) {
0782               //Loop over sample
0783               for (auto sample : independent_group_elements_x(acc, nsamples)) {
0784                 // conditions
0785                 auto const id = channel < f01HEDigis.size()
0786                                     ? f01HEDigis.ids()[channel]
0787                                     : (channel < nchannelsf015 ? f5HBDigis.ids()[channel - f01HEDigis.size()]
0788                                                                : f3HBDigis.ids()[channel - nchannelsf015]);
0789                 auto const deltaT =
0790                     channel >= f01HEDigis.size() && channel < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
0791 
0792                 // compute hash for this did
0793                 // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
0794                 auto const did = DetId{id};
0795                 auto const hashedId =
0796                     did.subdetId() == HcalBarrel
0797                         ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0798                         : did2linearIndexHE(id,
0799                                             mahi.maxDepthHE(),
0800                                             mahi.maxPhiHE(),
0801                                             mahi.firstHERing(),
0802                                             mahi.lastHERing(),
0803                                             mahi.nEtaHE()) +
0804                               mahi.offsetForHashes();
0805                 auto const pulseShape = recoParamsWithPS.getPulseShape(hashedId);
0806 
0807                 // offset output arrays
0808                 auto* pulseMatrix = pulseMatrices + nsamples * npulses * channel;
0809                 auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * channel;
0810                 auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * channel;
0811 
0812                 // amplitude per ipulse
0813                 int const soi = soiSamples[channel];
0814                 int const pulseOffset = pulseOffsets.offsets()[ipulse];
0815                 auto const amplitude = amplitudes[channel * nsamples + pulseOffset + soi];
0816 
0817                 if (amplitude <= 0.f) {
0818                   pulseMatrix[ipulse * nsamples + sample] = 0.f;
0819                   pulseMatrixM[ipulse * nsamples + sample] = 0.f;
0820                   pulseMatrixP[ipulse * nsamples + sample] = 0.f;
0821                   continue;
0822                 }
0823 #ifdef HCAL_MAHI_GPUDEBUG
0824 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0825                 if (id != DETID_TO_DEBUG)
0826                   return;
0827 #endif
0828                 if (sample == 0 && ipulse == 0) {
0829                   for (int i = 0; i < 8; i++)
0830                     printf("amplitude(%d) = %f\n", i, amplitudes[channel * nsamples + i]);
0831                   printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
0832                   for (int i = 0; i < 256; i++) {
0833                     printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
0834                   }
0835                   printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
0836                   for (int i = 0; i < 25; i++) {
0837                     printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
0838                            i,
0839                            accVarLenIdxZeroVec[i],
0840                            i,
0841                            accVarLenIdxMinusOneVec[i]);
0842                   }
0843                   printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
0844                   for (int i = 0; i < 25; i++) {
0845                     printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
0846                            i,
0847                            diffVarItvlIdxZeroVec[i],
0848                            i,
0849                            diffVarItvlIdxMinusOneVec[i]);
0850                   }
0851                 }
0852 #endif
0853 
0854                 auto t0 = meanTime;
0855                 if (applyTimeSlew) {
0856                   if (amplitude <= 1.0f)
0857                     t0 += compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0858                   else
0859                     t0 += compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0860                 }
0861                 auto const t0m = -deltaT + t0;
0862                 auto const t0p = deltaT + t0;
0863 
0864 #ifdef HCAL_MAHI_GPUDEBUG
0865                 if (sample == 0 && ipulse == 0) {
0866                   printf("time values: %f %f %f\n", t0, t0m, t0p);
0867                 }
0868 
0869                 if (sample == 0 && ipulse == 0) {
0870                   for (int i = 0; i < hcal::constants::maxSamples; i++) {
0871                     auto const value = compute_pulse_shape_value(pulseShape, t0, i, 0);
0872                   }
0873                   printf("\n");
0874                   for (int i = 0; i < hcal::constants::maxSamples; i++) {
0875                     auto const value = compute_pulse_shape_value(pulseShape, t0p, i, 0);
0876                   }
0877                   printf("\n");
0878                   for (int i = 0; i < hcal::constants::maxSamples; i++) {
0879                     auto const value = compute_pulse_shape_value(pulseShape, t0m, i, 0);
0880                   }
0881                 }
0882 #endif
0883 
0884                 // FIXME: shift should be treated properly,
0885                 // here assume 8 time slices and 8 samples
0886                 auto const shift = 4 - soi;  // as in cpu version!
0887 
0888                 int32_t const idx = sample - pulseOffset;
0889                 auto const value = idx >= 0 && static_cast<unsigned>(idx) < nsamples
0890                                        ? compute_pulse_shape_value(pulseShape, t0, idx, shift)
0891                                        : 0;
0892                 auto const value_t0m = idx >= 0 && static_cast<unsigned>(idx) < nsamples
0893                                            ? compute_pulse_shape_value(pulseShape, t0m, idx, shift)
0894                                            : 0;
0895                 auto const value_t0p = idx >= 0 && static_cast<unsigned>(idx) < nsamples
0896                                            ? compute_pulse_shape_value(pulseShape, t0p, idx, shift)
0897                                            : 0;
0898 
0899                 // store to global
0900                 pulseMatrix[ipulse * nsamples + sample] = value;
0901                 pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
0902                 pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
0903 
0904               }  // loop over sample
0905             }  // loop over pulse
0906           }  // loop over channels
0907         }
0908       };  // Kernel_prep_pulseMatrices_sameNumberOfSamples
0909 
0910       template <int NSAMPLES, int NPULSES>
0911       ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(
0912           calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
0913           calo::multifit::MapSymM<float, NSAMPLES>& covarianceMatrix,
0914           Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
0915           Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
0916           Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
0917         CMS_UNROLL_LOOP
0918         for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
0919           auto const resultAmplitude = resultAmplitudesVector(ipulse);
0920           if (resultAmplitude == 0)
0921             continue;
0922 
0923 #ifdef HCAL_MAHI_GPUDEBUG
0924           printf("pulse cov array for ibx = %d\n", ipulse);
0925 #endif
0926 
0927           // preload a column
0928           float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
0929           CMS_UNROLL_LOOP
0930           for (int counter = 0; counter < NSAMPLES; counter++) {
0931             pmcol[counter] = pulseMatrix.coeffRef(counter, ipulse);
0932             pmpcol[counter] = pulseMatrixP.coeffRef(counter, ipulse);
0933             pmmcol[counter] = pulseMatrixM.coeffRef(counter, ipulse);
0934           }
0935 
0936           auto const ampl2 = resultAmplitude * resultAmplitude;
0937           CMS_UNROLL_LOOP
0938           for (int col = 0; col < NSAMPLES; col++) {
0939             auto const valueP_col = pmpcol[col];
0940             auto const valueM_col = pmmcol[col];
0941             auto const value_col = pmcol[col];
0942             auto const tmppcol = valueP_col - value_col;
0943             auto const tmpmcol = valueM_col - value_col;
0944 
0945             // diagonal
0946             auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
0947             covarianceMatrix(col, col) += ampl2 * tmp_value;
0948 
0949             // FIXME: understand if this actually gets unrolled
0950             CMS_UNROLL_LOOP
0951             for (int row = col + 1; row < NSAMPLES; row++) {
0952               float const valueP_row = pmpcol[row];  //pulseMatrixP(j, ipulseReal);
0953               float const value_row = pmcol[row];    //pulseMatrix(j, ipulseReal);
0954               float const valueM_row = pmmcol[row];  //pulseMatrixM(j, ipulseReal);
0955 
0956               float tmpprow = valueP_row - value_row;
0957               float tmpmrow = valueM_row - value_row;
0958 
0959               auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
0960 
0961               covarianceMatrix(row, col) += ampl2 * covValue;
0962             }
0963           }
0964         }
0965       }
0966 
0967       template <int NSAMPLES, int NPULSES>
0968       class Kernel_minimize {
0969       public:
0970         ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0971                                       OProductType::View outputGPU,
0972                                       float const* amplitudes,
0973                                       float* pulseMatrices,
0974                                       float* pulseMatricesM,
0975                                       float* pulseMatricesP,
0976                                       HcalMahiPulseOffsetsSoA::ConstView pulseOffsetsView,
0977                                       float* noiseTerms,
0978                                       float* electronicNoiseTerms,
0979                                       int8_t* soiSamples,
0980                                       HcalMahiConditionsPortableDevice::ConstView mahi,
0981                                       bool const useEffectivePedestals,
0982                                       IProductTypef01::ConstView f01HEDigis,
0983                                       IProductTypef5::ConstView f5HBDigis,
0984                                       IProductTypef3::ConstView f3HBDigis) const {
0985           // can be relaxed if needed - minor updates are needed in that case!
0986           static_assert(NPULSES == NSAMPLES);
0987 
0988           auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
0989 
0990           auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
0991 
0992           auto const threadsPerBlock(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
0993 
0994           //Loop over all groups of channels
0995           for (auto group : uniform_groups_x(acc, nchannels)) {
0996             //Loop over each channel
0997             for (auto channel : uniform_group_elements_x(acc, group, nchannels)) {
0998               auto const gch = channel.global;
0999               auto const lch = channel.local;
1000 
1001               // if chi2 is set to -9999 do not run minimization
1002               if (outputGPU.chi2()[gch] == -9999.f)
1003                 continue;
1004 
1005               // configure shared mem
1006               float* shrmem = alpaka::getDynSharedMem<float>(acc);
1007               float* shrMatrixLFnnlsStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * lch;
1008               float* shrAtAStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * (lch + threadsPerBlock);
1009 
1010               // conditions for pedestal widths
1011               auto const id = gch < f01HEDigis.size() ? f01HEDigis.ids()[gch]
1012                                                       : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
1013                                                                              : f3HBDigis.ids()[gch - nchannelsf015]);
1014               auto const did = DetId{id};
1015               auto const hashedId =
1016                   did.subdetId() == HcalBarrel
1017                       ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
1018                       : did2linearIndexHE(id,
1019                                           mahi.maxDepthHE(),
1020                                           mahi.maxPhiHE(),
1021                                           mahi.firstHERing(),
1022                                           mahi.lastHERing(),
1023                                           mahi.nEtaHE()) +
1024                             mahi.offsetForHashes();
1025 
1026               // conditions based on the hash
1027               auto const* pedestalWidthsForChannel =
1028                   useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
1029                       ? mahi.effectivePedestalWidths()[hashedId].data()
1030                       : mahi.pedestals_width()[hashedId].data();
1031               auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
1032                                                          pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
1033                                                          pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
1034                                                          pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
1035 
1036               // FIXME on cpu ts 0 capid was used - does it make any difference
1037               auto const gain = mahi.gains_value()[hashedId][0];
1038 
1039               auto const respCorrection = mahi.respCorrs_values()[hashedId];
1040               auto const noisecorr = mahi.sipmPar_auxi2()[hashedId];
1041 
1042 #ifdef HCAL_MAHI_GPUDEBUG
1043 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
1044               if (id != DETID_TO_DEBUG)
1045                 return;
1046 #endif
1047 #endif
1048 
1049               calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
1050               CMS_UNROLL_LOOP
1051               for (int i = 0; i < NPULSES; ++i)
1052                 pulseOffsets(i) = i;
1053 
1054               // output amplitudes/weights
1055               calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector =
1056                   calo::multifit::ColumnVector<NPULSES>::Zero();
1057 
1058               // map views
1059               Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{amplitudes + gch * NSAMPLES};
1060               Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
1061               Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseElectronicView{electronicNoiseTerms +
1062                                                                                            gch * NSAMPLES};
1063               Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{
1064                   pulseMatricesM + gch * NSAMPLES * NPULSES};
1065               Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{
1066                   pulseMatricesP + gch * NSAMPLES * NPULSES};
1067               Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{
1068                   pulseMatrices + gch * NSAMPLES * NPULSES};
1069 #ifdef HCAL_MAHI_GPUDEBUG
1070               for (int i = 0; i < NSAMPLES; i++)
1071                 printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
1072               for (int i = 0; i < NSAMPLES; i++) {
1073                 for (int j = 0; j < NPULSES; j++)
1074                   printf("%f ", glbPulseMatrixView(i, j));
1075                 printf("\n");
1076               }
1077               printf("\n");
1078               for (int i = 0; i < NSAMPLES; i++) {
1079                 for (int j = 0; j < NPULSES; j++)
1080                   printf("%f ", glbPulseMatrixMView(i, j));
1081                 printf("\n");
1082               }
1083               printf("\n");
1084               for (int i = 0; i < NSAMPLES; i++) {
1085                 for (int j = 0; j < NPULSES; j++)
1086                   printf("%f ", glbPulseMatrixPView(i, j));
1087                 printf("\n");
1088               }
1089 #endif
1090 
1091               int npassive = 0;
1092               float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
1093               for (int iter = 1; iter < nMaxItersMin; iter++) {
1094                 //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
1095                 // NOTE: only works when NSAMPLES == NPULSES
1096                 // if does not hold -> slightly rearrange shared mem to still reuse
1097                 // shared memory
1098                 float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
1099                 calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
1100                 CMS_UNROLL_LOOP
1101                 for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
1102                   covarianceMatrixStorage[counter] = (noisecorr != 0.f) ? 0.f : averagePedestalWidth2;
1103                 CMS_UNROLL_LOOP
1104                 for (unsigned int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++) {
1105                   covarianceMatrix(counter, counter) += noiseTermsView.coeffRef(counter);
1106                   if (counter != 0)
1107                     covarianceMatrix(counter, counter - 1) +=
1108                         noisecorr * noiseElectronicView.coeffRef(counter - 1) * noiseElectronicView.coeffRef(counter);
1109                 }
1110 
1111                 // update covariance matrix
1112                 update_covariance(resultAmplitudesVector,
1113                                   covarianceMatrix,
1114                                   glbPulseMatrixView,
1115                                   glbPulseMatrixMView,
1116                                   glbPulseMatrixPView);
1117 
1118 #ifdef HCAL_MAHI_GPUDEBUG
1119                 printf("covariance matrix\n");
1120                 for (int i = 0; i < 8; i++) {
1121                   for (int j = 0; j < 8; j++)
1122                     printf("%f ", covarianceMatrix(i, j));
1123                   printf("\n");
1124                 }
1125 #endif
1126 
1127                 // compute Cholesky Decomposition L matrix
1128                 //matrixDecomposition.compute(covarianceMatrix);
1129                 //auto const& matrixL = matrixDecomposition.matrixL();
1130                 float matrixLStorage[calo::multifit::MapSymM<float, NSAMPLES>::total];
1131                 calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
1132                 calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
1133 
1134                 //
1135                 // replace eigen
1136                 //
1137                 //auto const& A = matrixDecomposition
1138                 //    .matrixL()
1139                 //    .solve(pulseMatrixView);
1140                 calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
1141                 calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
1142 
1143                 //
1144                 // remove eigen
1145                 //
1146                 //auto const& b = matrixL
1147                 //   .solve(inputAmplitudesView);
1148                 //
1149                 float reg_b[NSAMPLES];
1150                 calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
1151 
1152                 // TODO: we do not really need to change these matrcies
1153                 // will be fixed in the optimized version
1154                 //ColMajorMatrix<NPULSES, NPULSES> AtA = A.transpose() * A;
1155                 //ColumnVector<NPULSES> Atb = A.transpose() * b;
1156                 //ColMajorMatrix<NPULSES, NPULSES> AtA;
1157                 //float AtAStorage[MapSymM<float, NPULSES>::total];
1158                 calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
1159                 calo::multifit::ColumnVector<NPULSES> Atb;
1160                 CMS_UNROLL_LOOP
1161                 for (int icol = 0; icol < NPULSES; icol++) {
1162                   float reg_ai[NSAMPLES];
1163 
1164                   // load column icol
1165                   CMS_UNROLL_LOOP
1166                   for (int counter = 0; counter < NSAMPLES; counter++)
1167                     reg_ai[counter] = A(counter, icol);
1168 
1169                   // compute diagonal
1170                   float sum = 0.f;
1171                   CMS_UNROLL_LOOP
1172                   for (int counter = 0; counter < NSAMPLES; counter++)
1173                     sum += reg_ai[counter] * reg_ai[counter];
1174 
1175                   // store
1176                   AtA(icol, icol) = sum;
1177 
1178                   // go thru the other columns
1179                   CMS_UNROLL_LOOP
1180                   for (int j = icol + 1; j < NPULSES; j++) {
1181                     // load column j
1182                     float reg_aj[NSAMPLES];
1183                     CMS_UNROLL_LOOP
1184                     for (int counter = 0; counter < NSAMPLES; counter++)
1185                       reg_aj[counter] = A(counter, j);
1186 
1187                     // accum
1188                     float sum = 0.f;
1189                     CMS_UNROLL_LOOP
1190                     for (int counter = 0; counter < NSAMPLES; counter++)
1191                       sum += reg_aj[counter] * reg_ai[counter];
1192 
1193                     // store
1194                     //AtA(icol, j) = sum;
1195                     AtA(j, icol) = sum;
1196                   }
1197 
1198                   // Atb accum
1199                   float sum_atb = 0;
1200                   CMS_UNROLL_LOOP
1201                   for (int counter = 0; counter < NSAMPLES; counter++)
1202                     sum_atb += reg_ai[counter] * reg_b[counter];
1203 
1204                   // store atb
1205                   Atb(icol) = sum_atb;
1206                 }
1207 
1208 #ifdef HCAL_MAHI_GPUDEBUG
1209                 printf("AtA\n");
1210                 for (int i = 0; i < 8; i++) {
1211                   for (int j = 0; j < 8; j++)
1212                     printf("%f ", AtA(i, j));
1213                   printf("\n");
1214                 }
1215                 printf("Atb\n");
1216                 for (int i = 0; i < 8; i++)
1217                   printf("%f ", Atb(i));
1218                 printf("\n");
1219                 printf("result Amplitudes before nnls\n");
1220                 for (int i = 0; i < 8; i++)
1221                   printf("%f ", resultAmplitudesVector(i));
1222                 printf("\n");
1223 #endif
1224 
1225                 // for fnnls
1226                 calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
1227 
1228                 // run fast nnls
1229                 calo::multifit::fnnls(AtA,
1230                                       Atb,
1231                                       resultAmplitudesVector,
1232                                       npassive,
1233                                       pulseOffsets,
1234                                       matrixLForFnnls,
1235                                       nnlsThresh,
1236                                       nMaxItersNNLS,
1237                                       10,
1238                                       10);
1239 
1240 #ifdef HCAL_MAHI_GPUDEBUG
1241                 printf("result Amplitudes\n");
1242                 for (int i = 0; i < 8; i++)
1243                   printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
1244 #endif
1245 
1246                 calo::multifit::calculateChiSq(
1247                     matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
1248 
1249                 auto const deltaChi2 = std::abs(chi2 - previous_chi2);
1250                 if (chi2 == chi2_2itersback && chi2 < previous_chi2)
1251                   break;
1252 
1253                 // update
1254                 chi2_2itersback = previous_chi2;
1255                 previous_chi2 = chi2;
1256 
1257                 // exit condition
1258                 if (deltaChi2 < deltaChi2Threashold)
1259                   break;
1260               }
1261 
1262 #ifdef HCAL_MAHI_GPUDEBUG
1263               for (int i = 0; i < NPULSES; i++)
1264                 printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n",
1265                        i,
1266                        pulseOffsets(i),
1267                        i,
1268                        resultAmplitudesVector(i));
1269               printf("chi2 = %f\n", chi2);
1270 #endif
1271 
1272               outputGPU.chi2()[gch] = chi2;
1273               auto const idx_for_energy = std::abs(pulseOffsetsView.offsets()[0]);
1274               outputGPU.energy()[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
1275 
1276             }  // loop over channels
1277           }  //loop over group of channels
1278         }
1279       };  // Kernel_minimize
1280 
1281     }  // namespace mahi
1282 
1283     void runMahiAsync(Queue& queue,
1284                       IProductTypef01::ConstView const& f01HEDigis,
1285                       IProductTypef5::ConstView const& f5HBDigis,
1286                       IProductTypef3::ConstView const& f3HBDigis,
1287                       OProductType::View outputGPU,
1288                       HcalMahiConditionsPortableDevice::ConstView const& mahi,
1289                       HcalSiPMCharacteristicsPortableDevice::ConstView const& sipmCharacteristics,
1290                       HcalRecoParamWithPulseShapeDevice::ConstView const& recoParamsWithPS,
1291                       HcalMahiPulseOffsetsSoA::ConstView const& mahiPulseOffsets,
1292                       ConfigParameters const& configParameters) {
1293       auto const totalChannels =
1294           f01HEDigis.metadata().size() + f5HBDigis.metadata().size() + f3HBDigis.metadata().size();
1295       // FIXME: the number of channels in output might change given that some channesl might be filtered out
1296 
1297       // TODO: this can be lifted by implementing a separate kernel
1298       // similar to the default one, but properly handling the diff in #sample
1299       // or modifying existing one
1300       // TODO: assert startingSample = f01nsamples - windowSize to be 0 or 2
1301       // assert f01nsamples == f5nsamples
1302       // assert f01nsamples == f3nsamples
1303       int constexpr windowSize = 8;
1304 
1305       //compute work division
1306       uint32_t nchannels_per_block = configParameters.kprep1dChannelsPerBlock;
1307       auto const blocks_y = cms::alpakatools::divide_up_by(totalChannels, nchannels_per_block);
1308 
1309       Vec2D const blocks_2d{blocks_y, 1u};  // {y, x} coordiantes
1310       Vec2D const threads_2d{nchannels_per_block, windowSize};
1311       auto workDivPrep2D = cms::alpakatools::make_workdiv<Acc2D>(blocks_2d, threads_2d);
1312 
1313       //Device buffer for output
1314       auto amplitudes = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1315       auto noiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1316       auto electronicNoiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1317       auto soiSamples = cms::alpakatools::make_device_buffer<int8_t[]>(queue, totalChannels * windowSize);
1318 
1319       alpaka::exec<Acc2D>(queue,
1320                           workDivPrep2D,
1321                           mahi::Kernel_prep1d_sameNumberOfSamples{},
1322                           outputGPU,
1323                           f01HEDigis,
1324                           f5HBDigis,
1325                           f3HBDigis,
1326                           mahi,
1327                           sipmCharacteristics,
1328                           recoParamsWithPS,
1329                           configParameters.useEffectivePedestals,
1330                           configParameters.sipmQTSShift,
1331                           configParameters.sipmQNTStoSum,
1332                           configParameters.firstSampleShift,
1333                           configParameters.ts4Thresh,
1334                           amplitudes.data(),
1335                           noiseTerms.data(),
1336                           electronicNoiseTerms.data(),
1337                           soiSamples.data(),
1338                           windowSize);
1339 
1340       //// 1024 is the max threads per block for gtx1080
1341       //// FIXME: Take this from Alpaka in a way that does not need to query deviceProperty for every event
1342       uint32_t const channelsPerBlock = 1024 / (windowSize * mahiPulseOffsets.metadata().size());
1343 
1344       //launch 1D blocks of 3D threads
1345       auto const blocks_z = cms::alpakatools::divide_up_by(totalChannels, channelsPerBlock);
1346       Vec3D const blocks_3d{blocks_z, 1u, 1u};  // 1D block in z {z,y,x} coordinates
1347       Vec3D const threads_3d{channelsPerBlock, mahiPulseOffsets.metadata().size(), windowSize};
1348 
1349       auto workDivPrep3D = cms::alpakatools::make_workdiv<Acc3D>(blocks_3d, threads_3d);
1350 
1351       //Device buffer for output
1352       auto pulseMatrices = cms::alpakatools::make_device_buffer<float[]>(
1353           queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1354       auto pulseMatricesM = cms::alpakatools::make_device_buffer<float[]>(
1355           queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1356       auto pulseMatricesP = cms::alpakatools::make_device_buffer<float[]>(
1357           queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1358 
1359       alpaka::exec<Acc3D>(queue,
1360                           workDivPrep3D,
1361                           mahi::Kernel_prep_pulseMatrices_sameNumberOfSamples{},
1362                           pulseMatrices.data(),
1363                           pulseMatricesM.data(),
1364                           pulseMatricesP.data(),
1365                           mahiPulseOffsets,
1366                           amplitudes.data(),
1367                           f01HEDigis,
1368                           f5HBDigis,
1369                           f3HBDigis,
1370                           soiSamples.data(),
1371                           mahi,
1372                           recoParamsWithPS,
1373                           configParameters.meanTime,
1374                           configParameters.timeSigmaSiPM,
1375                           configParameters.timeSigmaHPD,
1376                           configParameters.applyTimeSlew,
1377                           configParameters.tzeroTimeSlew,
1378                           configParameters.slopeTimeSlew,
1379                           configParameters.tmaxTimeSlew);
1380 
1381       uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
1382       auto blocks_1d = cms::alpakatools::divide_up_by(totalChannels, threadsPerBlock);
1383 
1384       auto workDivPrep1D = cms::alpakatools::make_workdiv<Acc1D>(blocks_1d, threadsPerBlock);
1385 
1386       alpaka::exec<Acc1D>(queue,
1387                           workDivPrep1D,
1388                           mahi::Kernel_minimize<8, 8>{},
1389                           outputGPU,
1390                           amplitudes.data(),
1391                           pulseMatrices.data(),
1392                           pulseMatricesM.data(),
1393                           pulseMatricesP.data(),
1394                           mahiPulseOffsets,
1395                           noiseTerms.data(),
1396                           electronicNoiseTerms.data(),
1397                           soiSamples.data(),
1398                           mahi,
1399                           configParameters.useEffectivePedestals,
1400                           f01HEDigis,
1401                           f5HBDigis,
1402                           f3HBDigis);
1403     }
1404 
1405   }  // namespace hcal::reconstruction
1406 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE
1407 
1408 namespace alpaka::trait {
1409   using namespace ALPAKA_ACCELERATOR_NAMESPACE;
1410   using namespace ALPAKA_ACCELERATOR_NAMESPACE::hcal::reconstruction::mahi;
1411 
1412   //! The trait for getting the size of the block shared dynamic memory for Kernel_prep_1d_and_initialize.
1413   template <>
1414   struct BlockSharedMemDynSizeBytes<Kernel_prep1d_sameNumberOfSamples, Acc2D> {
1415     //! \return The size of the shared memory allocated for a block.
1416     template <typename TVec, typename... TArgs>
1417     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_prep1d_sameNumberOfSamples const&,
1418                                                                  TVec const& threadsPerBlock,
1419                                                                  TVec const& elemsPerThread,
1420                                                                  TArgs const&...) -> std::size_t {
1421       // return the amount of dynamic shared memory needed
1422 
1423       // threadsPerBlock[1] = threads2d.x = windowSize = 8
1424       // threadsPerBlock[0] = threads2d.y = nchannels_per_block = 32
1425       // elemsPerThread = 1
1426       std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] *
1427                           ((2 * threadsPerBlock[1u] * elemsPerThread[1u] + 2) * sizeof(float) + sizeof(uint64_t));
1428       return bytes;
1429     }
1430   };
1431 
1432   //! The trait for getting the size of the block shared dynamic memory for kernel_minimize.
1433   template <int NSAMPLES, int NPULSES>
1434   struct BlockSharedMemDynSizeBytes<Kernel_minimize<NSAMPLES, NPULSES>, Acc1D> {
1435     //! \return The size of the shared memory allocated for a block.
1436     template <typename TVec, typename... TArgs>
1437     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_minimize<NSAMPLES, NPULSES> const&,
1438                                                                  TVec const& threadsPerBlock,
1439                                                                  TVec const& elemsPerThread,
1440                                                                  TArgs const&...) -> std::size_t {
1441       // return the amount of dynamic shared memory needed
1442 
1443       std::size_t bytes =
1444           2 * threadsPerBlock[0u] * elemsPerThread[0u] * (calo::multifit::MapSymM<float, 8>::total * sizeof(float));
1445       return bytes;
1446     }
1447   };
1448 
1449 }  // namespace alpaka::trait