Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu is written in an unsupported language. File is not indexed.

0001 #include <Eigen/Dense>
0002 
0003 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
0004 // needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
0005 #include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
0006 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0007 
0008 // TODO reuse some of the HCAL constats from
0009 //#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
0010 
0011 #include "SimpleAlgoGPU.h"
0012 #include "KernelHelpers.h"
0013 
0014 #ifdef HCAL_MAHI_GPUDEBUG
0015 #define DETID_TO_DEBUG 1125647428
0016 #endif
0017 
0018 namespace hcal {
0019   namespace mahi {
0020 
0021     // TODO: provide constants from configuration
0022     // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py
0023     constexpr int nMaxItersMin = 50;
0024     constexpr int nMaxItersNNLS = 500;
0025     constexpr double nnlsThresh = 1e-11;
0026     constexpr float deltaChi2Threashold = 1e-3;
0027 
0028     // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc
0029     __forceinline__ __device__ float get_raw_charge(double const charge,
0030                                                     double const pedestal,
0031                                                     float const* shrChargeMinusPedestal,
0032                                                     float const* parLin1Values,
0033                                                     float const* parLin2Values,
0034                                                     float const* parLin3Values,
0035                                                     int32_t const nsamplesForCompute,
0036                                                     int32_t const soi,
0037                                                     int const sipmQTSShift,
0038                                                     int const sipmQNTStoSum,
0039                                                     int const sipmType,
0040                                                     float const fcByPE,
0041                                                     bool const isqie11) {
0042       float rawCharge;
0043 
0044       if (!isqie11)
0045         rawCharge = charge;
0046       else {
0047         auto const parLin1 = parLin1Values[sipmType - 1];
0048         auto const parLin2 = parLin2Values[sipmType - 1];
0049         auto const parLin3 = parLin3Values[sipmType - 1];
0050 
0051         int const first = std::max(soi + sipmQTSShift, 0);
0052         int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
0053         float sipmq = 0.0f;
0054         for (auto ts = first; ts < last; ts++)
0055           sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts];
0056         auto const effectivePixelsFired = sipmq / fcByPE;
0057         auto const factor =
0058             hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
0059         rawCharge = (charge - pedestal) * factor + pedestal;
0060 
0061 #ifdef HCAL_MAHI_GPUDEBUG
0062         printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
0063 #endif
0064       }
0065       return rawCharge;
0066     }
0067 
0068     // Assume: same number of samples for HB and HE
0069     // TODO: add/validate restrict (will increase #registers in use by the kernel)
0070     __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes,
0071                                                       float* noiseTerms,
0072                                                       float* electronicNoiseTerms,
0073                                                       float* outputEnergy,
0074                                                       float* outputChi2,
0075                                                       uint16_t const* dataf01HE,
0076                                                       uint16_t const* dataf5HB,
0077                                                       uint16_t const* dataf3HB,
0078                                                       uint32_t const* idsf01HE,
0079                                                       uint32_t const* idsf5HB,
0080                                                       uint32_t const* idsf3HB,
0081                                                       uint32_t const stridef01HE,
0082                                                       uint32_t const stridef5HB,
0083                                                       uint32_t const stridef3HB,
0084                                                       uint32_t const nchannelsf01HE,
0085                                                       uint32_t const nchannelsf5HB,
0086                                                       uint8_t const* npresamplesf5HB,
0087                                                       int8_t* soiSamples,
0088                                                       float* method0Energy,
0089                                                       float* method0Time,
0090                                                       uint32_t* outputdid,
0091                                                       uint32_t const nchannels,
0092                                                       uint32_t const* qualityStatus,
0093                                                       uint32_t const* recoParam1Values,
0094                                                       uint32_t const* recoParam2Values,
0095                                                       float const* qieCoderOffsets,
0096                                                       float const* qieCoderSlopes,
0097                                                       int const* qieTypes,
0098                                                       float const* pedestalWidths,
0099                                                       float const* effectivePedestalWidths,
0100                                                       float const* pedestals,
0101                                                       float const* effectivePedestals,
0102                                                       bool const useEffectivePedestals,
0103                                                       int const* sipmTypeValues,
0104                                                       float const* fcByPEValues,
0105                                                       float const* parLin1Values,
0106                                                       float const* parLin2Values,
0107                                                       float const* parLin3Values,
0108                                                       float const* gainValues,
0109                                                       float const* respCorrectionValues,
0110                                                       int const maxDepthHB,
0111                                                       int const maxDepthHE,
0112                                                       int const maxPhiHE,
0113                                                       int const firstHBRing,
0114                                                       int const lastHBRing,
0115                                                       int const firstHERing,
0116                                                       int const lastHERing,
0117                                                       int const nEtaHB,
0118                                                       int const nEtaHE,
0119                                                       int const sipmQTSShift,
0120                                                       int const sipmQNTStoSum,
0121                                                       int const firstSampleShift,
0122                                                       uint32_t const offsetForHashes,
0123                                                       float const ts4Thresh,
0124                                                       int const startingSample) {
0125       // indices + runtime constants
0126       auto const sample = threadIdx.x + startingSample;
0127       auto const sampleWithinWindow = threadIdx.x;
0128       int32_t const nsamplesForCompute = blockDim.x;
0129       auto const lch = threadIdx.y;
0130       auto const gch = lch + blockDim.y * blockIdx.x;
0131       auto const nchannels_per_block = blockDim.y;
0132       auto const linearThPerBlock = threadIdx.x + threadIdx.y * blockDim.x;
0133 
0134       // remove
0135       if (gch >= nchannels)
0136         return;
0137 
0138       // initialize all output buffers
0139       if (sampleWithinWindow == 0) {
0140         outputdid[gch] = 0;
0141         method0Energy[gch] = 0;
0142         method0Time[gch] = 0;
0143         outputEnergy[gch] = 0;
0144         outputChi2[gch] = 0;
0145         soiSamples[gch] = -1;
0146       }
0147 
0148 #ifdef HCAL_MAHI_GPUDEBUG
0149 #ifdef HCAL_MAHI_GPUDEBUG_SINGLECHANNEL
0150       if (gch > 0)
0151         return;
0152 #endif
0153 #endif
0154 
0155       // configure shared mem
0156       extern __shared__ char smem[];
0157       float* shrEnergyM0PerTS = reinterpret_cast<float*>(smem);
0158       float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
0159       float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
0160       float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
0161       unsigned long long int* shrMethod0EnergySamplePair =
0162           reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
0163       if (sampleWithinWindow == 0) {
0164         shrMethod0EnergyAccum[lch] = 0;
0165         shrMethod0EnergySamplePair[lch] = __float_as_uint(std::numeric_limits<float>::min());
0166         shrEnergyM0TotalAccum[lch] = 0;
0167       }
0168 
0169       // offset output
0170       auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
0171       auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
0172       auto* electronicNoiseTermsForChannel = electronicNoiseTerms + nsamplesForCompute * gch;
0173       auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
0174 
0175       // get event input quantities
0176       auto const stride = gch < nchannelsf01HE ? stridef01HE : (gch < nchannelsf015 ? stridef5HB : stridef3HB);
0177       auto const nsamples = gch < nchannelsf01HE ? compute_nsamples<Flavor1>(stride)
0178                                                  : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride)
0179                                                                         : compute_nsamples<Flavor3>(stride));
0180 
0181 #ifdef HCAL_MAHI_GPUDEBUG
0182       assert(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
0183 #endif
0184 
0185       auto const id = gch < nchannelsf01HE
0186                           ? idsf01HE[gch]
0187                           : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
0188       auto const did = HcalDetId{id};
0189 
0190       auto const adc =
0191           gch < nchannelsf01HE
0192               ? adc_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
0193               : (gch < nchannelsf015 ? adc_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
0194                                      : adc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
0195       auto const capid =
0196           gch < nchannelsf01HE
0197               ? capid_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
0198               : (gch < nchannelsf015 ? capid_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
0199                                      : capid_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
0200 
0201 #ifdef HCAL_MAHI_GPUDEBUG
0202 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0203       if (id != DETID_TO_DEBUG)
0204         return;
0205 #endif
0206 #endif
0207 
0208       // compute hash for this did
0209       auto const hashedId =
0210           did.subdetId() == HcalBarrel
0211               ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
0212               : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
0213                     offsetForHashes;
0214 
0215       // conditions based on the hash
0216       // FIXME: remove hardcoded values
0217       auto const qieType = qieTypes[hashedId] > 0 ? 1 : 0;  // 2 types at this point
0218       auto const* qieOffsets = qieCoderOffsets + hashedId * HcalQIECodersGPU::numValuesPerChannel;
0219       auto const* qieSlopes = qieCoderSlopes + hashedId * HcalQIECodersGPU::numValuesPerChannel;
0220       auto const* pedestalsForChannel = pedestals + hashedId * 4;
0221       auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
0222                                                  ? effectivePedestalWidths + hashedId * 4
0223                                                  : pedestalWidths + hashedId * 4;
0224 
0225       auto const* gains = gainValues + hashedId * 4;
0226       auto const gain = gains[capid];
0227       auto const gain0 = gains[0];
0228       auto const respCorrection = respCorrectionValues[hashedId];
0229       auto const pedestal = pedestalsForChannel[capid];
0230       auto const pedestalWidth = pedestalWidthsForChannel[capid];
0231       // if needed, only use effective pedestals for f01
0232       auto const pedestalToUseForMethod0 = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
0233                                                ? effectivePedestals[hashedId * 4 + capid]
0234                                                : pedestal;
0235       auto const sipmType = sipmTypeValues[hashedId];
0236       auto const fcByPE = fcByPEValues[hashedId];
0237       auto const recoParam1 = recoParam1Values[hashedId];
0238       auto const recoParam2 = recoParam2Values[hashedId];
0239 
0240 #ifdef HCAL_MAHI_GPUDEBUG
0241       printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
0242              qieType,
0243              qieOffsets[0],
0244              qieOffsets[1],
0245              qieSlopes[0],
0246              qieSlopes[1]);
0247 #endif
0248 
0249       // compute charge
0250       auto const charge = hcal::reconstruction::compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0251 
0252       shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
0253       if (gch < nchannelsf01HE) {
0254         // NOTE: assume that soi is high only for a single guy!
0255         //   which must be the case. cpu version does not check for that
0256         //   if that is not the case, we will see that with cuda mmecheck
0257         auto const soibit = soibit_for_sample<Flavor1>(dataf01HE + stride * gch, sample);
0258         if (soibit == 1)
0259           soiSamples[gch] = sampleWithinWindow;
0260       } else if (gch >= nchannelsf015) {
0261         auto const soibit = soibit_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample);
0262         if (soibit == 1)
0263           soiSamples[gch] = sampleWithinWindow;
0264       }
0265       __syncthreads();
0266       int32_t const soi = gch < nchannelsf01HE
0267                               ? soiSamples[gch]
0268                               : (gch < nchannelsf015 ? npresamplesf5HB[gch - nchannelsf01HE] : soiSamples[gch]);
0269 
0270       bool badSOI = (soi < 0 or soi >= nsamplesForCompute);
0271       if (badSOI and sampleWithinWindow == 0) {
0272 #ifdef GPU_DEBUG
0273         printf("Found HBHE channel %d with invalid SOI %d\n", gch, soi);
0274 #endif
0275         // mark the channel as bad
0276         outputChi2[gch] = -9999.f;
0277       }
0278 
0279       //int32_t const soi = gch >= nchannelsf01HE
0280       //    ? npresamplesf5HB[gch - nchannelsf01HE]
0281       //    : soiSamples[gch];
0282       // this is here just to make things uniform...
0283       if (gch >= nchannelsf01HE && gch < nchannelsf015 && sampleWithinWindow == 0)
0284         soiSamples[gch] = npresamplesf5HB[gch - nchannelsf01HE];
0285 
0286       //
0287       // compute various quantities (raw charge and tdc stuff)
0288       // NOTE: this branch will be divergent only for a single warp that
0289       // sits on the boundary when flavor 01 channels end and flavor 5 start
0290       //
0291       float const rawCharge = get_raw_charge(charge,
0292                                              pedestal,
0293                                              shrChargeMinusPedestal,
0294                                              parLin1Values,
0295                                              parLin2Values,
0296                                              parLin3Values,
0297                                              nsamplesForCompute,
0298                                              soi,
0299                                              sipmQTSShift,
0300                                              sipmQNTStoSum,
0301                                              sipmType,
0302                                              fcByPE,
0303                                              gch < nchannelsf01HE || gch >= nchannelsf015);
0304 
0305       auto const dfc = hcal::reconstruction::compute_diff_charge_gain(
0306           qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015);
0307 
0308 #ifdef COMPUTE_TDC_TIME
0309       float tdcTime;
0310       if (gch >= nchannelsf01HE && gch < nchannelsf015) {
0311         tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
0312       } else {
0313         if (gch < nchannelsf01HE)
0314           tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(dataf01HE + stride * gch, sample));
0315         else if (gch >= nchannelsf015)
0316           tdcTime =
0317               HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
0318       }
0319 #endif  // COMPUTE_TDC_TIME
0320 
0321       // compute method 0 quantities
0322       // TODO: need to apply containment
0323       // TODO: need to apply time slew
0324       // TODO: for < run 3, apply HBM legacy energy correction
0325       auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
0326       auto const startSampleTmp = soi + firstSampleShift;
0327       auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
0328       auto const endSample =
0329           startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
0330       // NOTE: gain is a small number < 10^-3, multiply it last
0331       auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0332       auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0333       // store to shared mem
0334       shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
0335       atomicAdd(&shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0);
0336 
0337 #ifdef HCAL_MAHI_GPUDEBUG
0338       printf(
0339           "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
0340           "   charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
0341           "   gain = %f respCorrection = %f energym0_per_ts = %f\n",
0342           id,
0343           sample,
0344           gch,
0345           hashedId,
0346           adc,
0347           capid,
0348           charge,
0349           rawCharge,
0350           dfc,
0351           pedestalToUseForMethod0,
0352           gain,
0353           respCorrection,
0354           energym0_per_ts);
0355       printf(
0356           "startSample = %d endSample = %d param1 = %u param2 = %u\n", startSample, endSample, recoParam1, recoParam2);
0357 #endif
0358 
0359       if (sampleWithinWindow >= startSample && sampleWithinWindow < endSample) {
0360         atomicAdd(&shrMethod0EnergyAccum[lch], energym0_per_ts);
0361         // pack sample, energy as 64 bit value
0362         unsigned long long int old = shrMethod0EnergySamplePair[lch], assumed;
0363         unsigned long long int val =
0364             (static_cast<unsigned long long int>(sampleWithinWindow) << 32) + __float_as_uint(energym0_per_ts);
0365         do {
0366           assumed = old;
0367           // decode energy, sample values
0368           //int const current_sample = (assumed >> 32) & 0xffffffff;
0369           float const current_energy = __uint_as_float(assumed & 0xffffffff);
0370           if (energym0_per_ts > current_energy)
0371             old = atomicCAS(&shrMethod0EnergySamplePair[lch], assumed, val);
0372           else
0373             break;
0374         } while (assumed != old);
0375       }
0376       __syncthreads();
0377 
0378       // NOTE: must take soi, as values for that thread are used...
0379       // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
0380       if (sampleWithinWindow == soi) {
0381         auto const method0_energy = shrMethod0EnergyAccum[lch];
0382         auto const val = shrMethod0EnergySamplePair[lch];
0383         int const max_sample = (val >> 32) & 0xffffffff;
0384         float const max_energy = __uint_as_float(val & 0xffffffff);
0385         float const max_energy_1 =
0386             max_sample < nsamplesForCompute - 1 ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1] : 0.f;
0387         float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
0388         auto const sum = max_energy + max_energy_1;
0389         // FIXME: for full comparison with cpu method 0  timing,
0390         // need to correct by slew
0391         // requires an accumulator -> more shared mem -> omit here unless
0392         // really needed
0393         float const time =
0394             max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
0395 
0396         // store method0 quantities to global mem
0397         outputdid[gch] = id;
0398         method0Energy[gch] = method0_energy;
0399         method0Time[gch] = time;
0400 
0401 #ifdef HCAL_MAHI_GPUDEBUG
0402         printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n", shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, ts4Thresh);
0403 #endif
0404 
0405         // Channel quality check
0406         //    https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecAlgos/plugins/HcalChannelPropertiesEP.cc#L107-L109
0407         //    https://github.com/cms-sw/cmssw/blob/6d2f66057131baacc2fcbdd203588c41c885b42c/CondCore/HcalPlugins/plugins/HcalChannelQuality_PayloadInspector.cc#L30
0408         //      const bool taggedBadByDb = severity.dropChannel(digistatus->getValue());
0409         //  do not run MAHI if taggedBadByDb = true
0410 
0411         auto const digiStatus_ = qualityStatus[hashedId];
0412         const bool taggedBadByDb = (digiStatus_ / 32770);
0413 
0414         if (taggedBadByDb)
0415           outputChi2[gch] = -9999.f;
0416 
0417         // check as in cpu version if mahi is not needed
0418         // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
0419         // are basically equal and generate -0.00000...
0420         // needs to be treated properly
0421         if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 > ts4Thresh)) {
0422           // do not need to run mahi minimization
0423           //outputEnergy[gch] = 0; energy already inited to 0
0424           outputChi2[gch] = -9999.f;
0425         }
0426 
0427 #ifdef HCAL_MAHI_GPUDEBUG
0428         printf("method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
0429                method0_energy,
0430                max_sample,
0431                max_energy,
0432                time);
0433 #endif
0434       }
0435 
0436       //
0437       // preparations for mahi fit
0438       //
0439       auto const amplitude = rawCharge - pedestalToUseForMethod0;
0440       auto const noiseADC = (1. / std::sqrt(12)) * dfc;
0441       auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
0442       auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
0443 
0444 #ifdef HCAL_MAHI_GPUDEBUG
0445       printf(
0446           "charge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f noisPhoto(%d) = "
0447           "%f\n",
0448           sample,
0449           rawCharge,
0450           sample,
0451           pedestalToUseForMethod0,
0452           sample,
0453           dfc,
0454           sample,
0455           pedestalWidth,
0456           sample,
0457           noiseADC,
0458           sample,
0459           noisePhotoSq);
0460 #endif
0461 
0462       // store to global memory
0463       amplitudesForChannel[sampleWithinWindow] = amplitude;
0464       noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
0465       electronicNoiseTermsForChannel[sampleWithinWindow] = pedestalWidth;
0466     }
0467 
0468     // TODO: need to add an array of offsets for pulses (a la activeBXs...)
0469     // Assume for now 8 pulses
0470     __global__ void kernel_prep_pulseMatrices_sameNumberOfSamples(float* pulseMatrices,
0471                                                                   float* pulseMatricesM,
0472                                                                   float* pulseMatricesP,
0473                                                                   int const* pulseOffsets,
0474                                                                   float const* amplitudes,
0475                                                                   uint32_t const* idsf01HE,
0476                                                                   uint32_t const* idsf5HB,
0477                                                                   uint32_t const* idsf3HB,
0478                                                                   uint32_t const nchannelsf01HE,
0479                                                                   uint32_t const nchannelsf5HB,
0480                                                                   uint32_t const nchannelsTotal,
0481                                                                   int8_t const* soiSamples,
0482                                                                   uint32_t const* recoPulseShapeIds,
0483                                                                   float const* acc25nsVecValues,
0484                                                                   float const* diff25nsItvlVecValues,
0485                                                                   float const* accVarLenIdxMinusOneVecValues,
0486                                                                   float const* diffVarItvlIdxMinusOneVecValues,
0487                                                                   float const* accVarLenIdxZeroVecValues,
0488                                                                   float const* diffVarItvlIdxZeroVecValues,
0489                                                                   float const meanTime,
0490                                                                   float const timeSigmaSiPM,
0491                                                                   float const timeSigmaHPD,
0492                                                                   int const maxDepthHB,
0493                                                                   int const maxDepthHE,
0494                                                                   int const maxPhiHE,
0495                                                                   int const firstHBRing,
0496                                                                   int const lastHBRing,
0497                                                                   int const firstHERing,
0498                                                                   int const lastHERing,
0499                                                                   int const nEtaHB,
0500                                                                   int const nEtaHE,
0501                                                                   uint32_t const offsetForHashes,
0502                                                                   bool const applyTimeSlew,
0503                                                                   float const tzeroTimeSlew,
0504                                                                   float const slopeTimeSlew,
0505                                                                   float const tmaxTimeSlew) {
0506       // indices
0507       auto const ipulse = threadIdx.y;
0508       auto const npulses = blockDim.y;
0509       auto const sample = threadIdx.x;
0510       auto const nsamples = blockDim.x;
0511       auto const lch = threadIdx.z;
0512       auto const gch = lch + blockIdx.x * blockDim.z;
0513       auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
0514 
0515       if (gch >= nchannelsTotal)
0516         return;
0517 
0518       // conditions
0519       auto const id = gch < nchannelsf01HE
0520                           ? idsf01HE[gch]
0521                           : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
0522       //auto const id = gch >= nchannelsf01HE
0523       //    ? idsf5HB[gch - nchannelsf01HE]
0524       //    : idsf01HE[gch];
0525       auto const deltaT = gch >= nchannelsf01HE && gch < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
0526       auto const did = DetId{id};
0527       auto const hashedId =
0528           did.subdetId() == HcalBarrel
0529               ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
0530               : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
0531                     offsetForHashes;
0532       auto const recoPulseShapeId = recoPulseShapeIds[hashedId];
0533       auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
0534       auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
0535       auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0536       auto const* diffVarItvlIdxMinusOneVec =
0537           diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0538       auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0539       auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0540 
0541       // offset output arrays
0542       auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch;
0543       auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * gch;
0544       auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * gch;
0545 
0546       // amplitude per ipulse
0547       int const soi = soiSamples[gch];
0548       int const pulseOffset = pulseOffsets[ipulse];
0549       auto const amplitude = amplitudes[gch * nsamples + pulseOffset + soi];
0550 
0551 #ifdef HCAL_MAHI_GPUDEBUG
0552 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0553       if (id != DETID_TO_DEBUG)
0554         return;
0555 #endif
0556 #endif
0557 
0558 #ifdef HCAL_MAHI_GPUDEBUG
0559       if (sample == 0 && ipulse == 0) {
0560         for (int i = 0; i < 8; i++)
0561           printf("amplitude(%d) = %f\n", i, amplitudes[gch * nsamples + i]);
0562         printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
0563         for (int i = 0; i < 256; i++) {
0564           printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
0565         }
0566         printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
0567         for (int i = 0; i < 25; i++) {
0568           printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
0569                  i,
0570                  accVarLenIdxZeroVec[i],
0571                  i,
0572                  accVarLenIdxMinusOneVec[i]);
0573         }
0574         printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
0575         for (int i = 0; i < 25; i++) {
0576           printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
0577                  i,
0578                  diffVarItvlIdxZeroVec[i],
0579                  i,
0580                  diffVarItvlIdxMinusOneVec[i]);
0581         }
0582       }
0583 #endif
0584 
0585       auto t0 = meanTime;
0586       if (applyTimeSlew) {
0587         if (amplitude <= 1.0f)
0588           t0 += hcal::reconstruction::compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0589         else
0590           t0 += hcal::reconstruction::compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0591       }
0592       auto const t0m = -deltaT + t0;
0593       auto const t0p = deltaT + t0;
0594 
0595 #ifdef HCAL_MAHI_GPUDEBUG
0596       if (sample == 0 && ipulse == 0) {
0597         printf("time values: %f %f %f\n", t0, t0m, t0p);
0598       }
0599 
0600       if (sample == 0 && ipulse == 0) {
0601         for (int i = 0; i < hcal::constants::maxSamples; i++) {
0602           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0,
0603                                                                              i,
0604                                                                              0,
0605                                                                              acc25nsVec,
0606                                                                              diff25nsItvlVec,
0607                                                                              accVarLenIdxMinusOneVec,
0608                                                                              diffVarItvlIdxMinusOneVec,
0609                                                                              accVarLenIdxZeroVec,
0610                                                                              diffVarItvlIdxZeroVec);
0611           printf("pulse(%d) = %f\n", i, value);
0612         }
0613         printf("\n");
0614         for (int i = 0; i < hcal::constants::maxSamples; i++) {
0615           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0p,
0616                                                                              i,
0617                                                                              0,
0618                                                                              acc25nsVec,
0619                                                                              diff25nsItvlVec,
0620                                                                              accVarLenIdxMinusOneVec,
0621                                                                              diffVarItvlIdxMinusOneVec,
0622                                                                              accVarLenIdxZeroVec,
0623                                                                              diffVarItvlIdxZeroVec);
0624           printf("pulseP(%d) = %f\n", i, value);
0625         }
0626         printf("\n");
0627         for (int i = 0; i < hcal::constants::maxSamples; i++) {
0628           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0m,
0629                                                                              i,
0630                                                                              0,
0631                                                                              acc25nsVec,
0632                                                                              diff25nsItvlVec,
0633                                                                              accVarLenIdxMinusOneVec,
0634                                                                              diffVarItvlIdxMinusOneVec,
0635                                                                              accVarLenIdxZeroVec,
0636                                                                              diffVarItvlIdxZeroVec);
0637           printf("pulseM(%d) = %f\n", i, value);
0638         }
0639       }
0640 #endif
0641 
0642       // FIXME: shift should be treated properly,
0643       // here assume 8 time slices and 8 samples
0644       auto const shift = 4 - soi;  // as in cpu version!
0645 
0646       // auto const offset = ipulse - soi;
0647       // auto const idx = sample - offset;
0648       int32_t const idx = sample - pulseOffset;
0649       auto const value = idx >= 0 && idx < nsamples
0650                              ? hcal::reconstruction::compute_pulse_shape_value(t0,
0651                                                                                idx,
0652                                                                                shift,
0653                                                                                acc25nsVec,
0654                                                                                diff25nsItvlVec,
0655                                                                                accVarLenIdxMinusOneVec,
0656                                                                                diffVarItvlIdxMinusOneVec,
0657                                                                                accVarLenIdxZeroVec,
0658                                                                                diffVarItvlIdxZeroVec)
0659                              : 0;
0660       auto const value_t0m = idx >= 0 && idx < nsamples
0661                                  ? hcal::reconstruction::compute_pulse_shape_value(t0m,
0662                                                                                    idx,
0663                                                                                    shift,
0664                                                                                    acc25nsVec,
0665                                                                                    diff25nsItvlVec,
0666                                                                                    accVarLenIdxMinusOneVec,
0667                                                                                    diffVarItvlIdxMinusOneVec,
0668                                                                                    accVarLenIdxZeroVec,
0669                                                                                    diffVarItvlIdxZeroVec)
0670                                  : 0;
0671       auto const value_t0p = idx >= 0 && idx < nsamples
0672                                  ? hcal::reconstruction::compute_pulse_shape_value(t0p,
0673                                                                                    idx,
0674                                                                                    shift,
0675                                                                                    acc25nsVec,
0676                                                                                    diff25nsItvlVec,
0677                                                                                    accVarLenIdxMinusOneVec,
0678                                                                                    diffVarItvlIdxMinusOneVec,
0679                                                                                    accVarLenIdxZeroVec,
0680                                                                                    diffVarItvlIdxZeroVec)
0681                                  : 0;
0682 
0683       // store to global
0684       if (amplitude > 0.f) {
0685         pulseMatrix[ipulse * nsamples + sample] = value;
0686         pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
0687         pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
0688       } else {
0689         pulseMatrix[ipulse * nsamples + sample] = 0.f;
0690         pulseMatrixM[ipulse * nsamples + sample] = 0.f;
0691         pulseMatrixP[ipulse * nsamples + sample] = 0.f;
0692       }
0693     }
0694 
0695     template <int NSAMPLES, int NPULSES>
0696     __forceinline__ __device__ void update_covariance(
0697         calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
0698         calo::multifit::MapSymM<float, NSAMPLES>& covarianceMatrix,
0699         Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
0700         Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
0701         Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
0702       CMS_UNROLL_LOOP
0703       for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
0704         auto const resultAmplitude = resultAmplitudesVector(ipulse);
0705         if (resultAmplitude == 0)
0706           continue;
0707 
0708 #ifdef HCAL_MAHI_GPUDEBUG
0709         printf("pulse cov array for ibx = %d\n", ipulse);
0710 #endif
0711 
0712         // preload a column
0713         float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
0714         CMS_UNROLL_LOOP
0715         for (int counter = 0; counter < NSAMPLES; counter++) {
0716           pmcol[counter] = __ldg(&pulseMatrix.coeffRef(counter, ipulse));
0717           pmpcol[counter] = __ldg(&pulseMatrixP.coeffRef(counter, ipulse));
0718           pmmcol[counter] = __ldg(&pulseMatrixM.coeffRef(counter, ipulse));
0719         }
0720 
0721         auto const ampl2 = resultAmplitude * resultAmplitude;
0722         CMS_UNROLL_LOOP
0723         for (int col = 0; col < NSAMPLES; col++) {
0724           auto const valueP_col = pmpcol[col];
0725           auto const valueM_col = pmmcol[col];
0726           auto const value_col = pmcol[col];
0727           auto const tmppcol = valueP_col - value_col;
0728           auto const tmpmcol = valueM_col - value_col;
0729 
0730           // diagonal
0731           auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
0732           covarianceMatrix(col, col) += ampl2 * tmp_value;
0733 
0734           // FIXME: understand if this actually gets unrolled
0735           CMS_UNROLL_LOOP
0736           for (int row = col + 1; row < NSAMPLES; row++) {
0737             float const valueP_row = pmpcol[row];  //pulseMatrixP(j, ipulseReal);
0738             float const value_row = pmcol[row];    //pulseMatrix(j, ipulseReal);
0739             float const valueM_row = pmmcol[row];  //pulseMatrixM(j, ipulseReal);
0740 
0741             float tmpprow = valueP_row - value_row;
0742             float tmpmrow = valueM_row - value_row;
0743 
0744             auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
0745 
0746             covarianceMatrix(row, col) += ampl2 * covValue;
0747           }
0748         }
0749       }
0750     }
0751 
0752     template <int NSAMPLES, int NPULSES>
0753     __global__ void kernel_minimize(float* outputEnergy,
0754                                     float* outputChi2,
0755                                     float const* __restrict__ inputAmplitudes,
0756                                     float const* __restrict__ pulseMatrices,
0757                                     float const* __restrict__ pulseMatricesM,
0758                                     float const* __restrict__ pulseMatricesP,
0759                                     int const* __restrict__ pulseOffsetValues,
0760                                     float const* __restrict__ noiseTerms,
0761                                     float const* __restrict__ electronicNoiseTerms,
0762                                     int8_t const* __restrict__ soiSamples,
0763                                     float const* __restrict__ noiseCorrelationValues,
0764                                     float const* __restrict__ pedestalWidths,
0765                                     float const* __restrict__ effectivePedestalWidths,
0766                                     bool const useEffectivePedestals,
0767                                     uint32_t const* __restrict__ idsf01HE,
0768                                     uint32_t const* __restrict__ idsf5HB,
0769                                     uint32_t const* __restrict__ idsf3HB,
0770                                     float const* __restrict__ gainValues,
0771                                     float const* __restrict__ respCorrectionValues,
0772                                     uint32_t const nchannelsf01HE,
0773                                     uint32_t const nchannelsf5HB,
0774                                     uint32_t const nchannelsTotal,
0775                                     uint32_t const offsetForHashes,
0776                                     int const maxDepthHB,
0777                                     int const maxDepthHE,
0778                                     int const maxPhiHE,
0779                                     int const firstHBRing,
0780                                     int const lastHBRing,
0781                                     int const firstHERing,
0782                                     int const lastHERing,
0783                                     int const nEtaHB,
0784                                     int const nEtaHE) {
0785       // can be relaxed if needed - minor updates are needed in that case!
0786       static_assert(NPULSES == NSAMPLES);
0787 
0788       // indices
0789       auto const gch = threadIdx.x + blockIdx.x * blockDim.x;
0790       auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
0791       if (gch >= nchannelsTotal)
0792         return;
0793 
0794       // if chi2 is set to -9999 do not run minimization
0795       if (outputChi2[gch] == -9999.f)
0796         return;
0797 
0798       // configure shared mem
0799       extern __shared__ char shrmem[];
0800       float* shrMatrixLFnnlsStorage =
0801           reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * threadIdx.x;
0802       float* shrAtAStorage = reinterpret_cast<float*>(shrmem) +
0803                              calo::multifit::MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
0804 
0805       // conditions for pedestal widths
0806       auto const id = gch < nchannelsf01HE
0807                           ? idsf01HE[gch]
0808                           : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
0809       auto const did = DetId{id};
0810       auto const hashedId =
0811           did.subdetId() == HcalBarrel
0812               ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
0813               : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
0814                     offsetForHashes;
0815 
0816       auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
0817                                                  ? effectivePedestalWidths + hashedId * 4
0818                                                  : pedestalWidths + hashedId * 4;
0819       auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
0820                                                  pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
0821                                                  pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
0822                                                  pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
0823 
0824       auto const* gains = gainValues + hashedId * 4;
0825       // FIXME on cpu ts 0 capid was used - does it make any difference
0826       auto const gain = gains[0];
0827       auto const respCorrection = respCorrectionValues[hashedId];
0828 
0829       auto const noisecorr = noiseCorrelationValues[hashedId];
0830 
0831 #ifdef HCAL_MAHI_GPUDEBUG
0832 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0833       if (id != DETID_TO_DEBUG)
0834         return;
0835 #endif
0836 #endif
0837 
0838       /*
0839       // TODO: provide this properly
0840       int const soi = soiSamples[gch];
0841       */
0842       calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
0843       CMS_UNROLL_LOOP
0844       for (int i = 0; i < NPULSES; ++i)
0845         pulseOffsets(i) = i;
0846       //        pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0];
0847 
0848       // output amplitudes/weights
0849       calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector = calo::multifit::ColumnVector<NPULSES>::Zero();
0850 
0851       // map views
0852       Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES};
0853       Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
0854       Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseElectronicView{electronicNoiseTerms +
0855                                                                                    gch * NSAMPLES};
0856       Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{pulseMatricesM +
0857                                                                                               gch * NSAMPLES * NPULSES};
0858       Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{pulseMatricesP +
0859                                                                                               gch * NSAMPLES * NPULSES};
0860       Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices +
0861                                                                                              gch * NSAMPLES * NPULSES};
0862 
0863 #ifdef HCAL_MAHI_GPUDEBUG
0864       for (int i = 0; i < NSAMPLES; i++)
0865         printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
0866       for (int i = 0; i < NSAMPLES; i++) {
0867         for (int j = 0; j < NPULSES; j++)
0868           printf("%f ", glbPulseMatrixView(i, j));
0869         printf("\n");
0870       }
0871       printf("\n");
0872       for (int i = 0; i < NSAMPLES; i++) {
0873         for (int j = 0; j < NPULSES; j++)
0874           printf("%f ", glbPulseMatrixMView(i, j));
0875         printf("\n");
0876       }
0877       printf("\n");
0878       for (int i = 0; i < NSAMPLES; i++) {
0879         for (int j = 0; j < NPULSES; j++)
0880           printf("%f ", glbPulseMatrixPView(i, j));
0881         printf("\n");
0882       }
0883 #endif
0884 
0885       int npassive = 0;
0886       float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
0887       for (int iter = 1; iter < nMaxItersMin; iter++) {
0888         //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
0889         // NOTE: only works when NSAMPLES == NPULSES
0890         // if does not hold -> slightly rearrange shared mem to still reuse
0891         // shared memory
0892         float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
0893         calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
0894         CMS_UNROLL_LOOP
0895         for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
0896           covarianceMatrixStorage[counter] = (noisecorr != 0.f) ? 0.f : averagePedestalWidth2;
0897         CMS_UNROLL_LOOP
0898         for (unsigned int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++) {
0899           covarianceMatrix(counter, counter) += noiseTermsView.coeffRef(counter);
0900           if (counter != 0)
0901             covarianceMatrix(counter, counter - 1) += noisecorr * __ldg(&noiseElectronicView.coeffRef(counter - 1)) *
0902                                                       __ldg(&noiseElectronicView.coeffRef(counter));
0903         }
0904 
0905         // update covariance matrix
0906         update_covariance(
0907             resultAmplitudesVector, covarianceMatrix, glbPulseMatrixView, glbPulseMatrixMView, glbPulseMatrixPView);
0908 
0909 #ifdef HCAL_MAHI_GPUDEBUG
0910         printf("covariance matrix\n");
0911         for (int i = 0; i < 8; i++) {
0912           for (int j = 0; j < 8; j++)
0913             printf("%f ", covarianceMatrix(i, j));
0914           printf("\n");
0915         }
0916 #endif
0917 
0918         // compute Cholesky Decomposition L matrix
0919         //matrixDecomposition.compute(covarianceMatrix);
0920         //auto const& matrixL = matrixDecomposition.matrixL();
0921         float matrixLStorage[calo::multifit::MapSymM<float, NSAMPLES>::total];
0922         calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
0923         calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
0924 
0925         //
0926         // replace eigen
0927         //
0928         //auto const& A = matrixDecomposition
0929         //    .matrixL()
0930         //    .solve(pulseMatrixView);
0931         calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
0932         calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
0933 
0934         //
0935         // remove eigen
0936         //
0937         //auto const& b = matrixL
0938         //   .solve(inputAmplitudesView);
0939         //
0940         float reg_b[NSAMPLES];
0941         calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
0942 
0943         // TODO: we do not really need to change these matrcies
0944         // will be fixed in the optimized version
0945         //ColMajorMatrix<NPULSES, NPULSES> AtA = A.transpose() * A;
0946         //ColumnVector<NPULSES> Atb = A.transpose() * b;
0947         //ColMajorMatrix<NPULSES, NPULSES> AtA;
0948         //float AtAStorage[MapSymM<float, NPULSES>::total];
0949         calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
0950         calo::multifit::ColumnVector<NPULSES> Atb;
0951         CMS_UNROLL_LOOP
0952         for (int icol = 0; icol < NPULSES; icol++) {
0953           float reg_ai[NSAMPLES];
0954 
0955           // load column icol
0956           CMS_UNROLL_LOOP
0957           for (int counter = 0; counter < NSAMPLES; counter++)
0958             reg_ai[counter] = A(counter, icol);
0959 
0960           // compute diagonal
0961           float sum = 0.f;
0962           CMS_UNROLL_LOOP
0963           for (int counter = 0; counter < NSAMPLES; counter++)
0964             sum += reg_ai[counter] * reg_ai[counter];
0965 
0966           // store
0967           AtA(icol, icol) = sum;
0968 
0969           // go thru the other columns
0970           CMS_UNROLL_LOOP
0971           for (int j = icol + 1; j < NPULSES; j++) {
0972             // load column j
0973             float reg_aj[NSAMPLES];
0974             CMS_UNROLL_LOOP
0975             for (int counter = 0; counter < NSAMPLES; counter++)
0976               reg_aj[counter] = A(counter, j);
0977 
0978             // accum
0979             float sum = 0.f;
0980             CMS_UNROLL_LOOP
0981             for (int counter = 0; counter < NSAMPLES; counter++)
0982               sum += reg_aj[counter] * reg_ai[counter];
0983 
0984             // store
0985             //AtA(icol, j) = sum;
0986             AtA(j, icol) = sum;
0987           }
0988 
0989           // Atb accum
0990           float sum_atb = 0;
0991           CMS_UNROLL_LOOP
0992           for (int counter = 0; counter < NSAMPLES; counter++)
0993             sum_atb += reg_ai[counter] * reg_b[counter];
0994 
0995           // store atb
0996           Atb(icol) = sum_atb;
0997         }
0998 
0999 #ifdef HCAL_MAHI_GPUDEBUG
1000         printf("AtA\n");
1001         for (int i = 0; i < 8; i++) {
1002           for (int j = 0; j < 8; j++)
1003             printf("%f ", AtA(i, j));
1004           printf("\n");
1005         }
1006         printf("Atb\n");
1007         for (int i = 0; i < 8; i++)
1008           printf("%f ", Atb(i));
1009         printf("\n");
1010         printf("result Amplitudes before nnls\n");
1011         for (int i = 0; i < 8; i++)
1012           printf("%f ", resultAmplitudesVector(i));
1013         printf("\n");
1014 #endif
1015 
1016         // for fnnls
1017         calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
1018 
1019         // run fast nnls
1020         calo::multifit::fnnls(
1021             AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, nnlsThresh, nMaxItersNNLS, 10, 10);
1022 
1023 #ifdef HCAL_MAHI_GPUDEBUG
1024         printf("result Amplitudes\n");
1025         for (int i = 0; i < 8; i++)
1026           printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
1027 #endif
1028 
1029         calo::multifit::calculateChiSq(matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
1030 
1031         auto const deltaChi2 = std::abs(chi2 - previous_chi2);
1032         if (chi2 == chi2_2itersback && chi2 < previous_chi2)
1033           break;
1034 
1035         // update
1036         chi2_2itersback = previous_chi2;
1037         previous_chi2 = chi2;
1038 
1039         // exit condition
1040         if (deltaChi2 < deltaChi2Threashold)
1041           break;
1042       }
1043 
1044 #ifdef HCAL_MAHI_GPUDEBUG
1045       for (int i = 0; i < NPULSES; i++)
1046         printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n", i, pulseOffsets(i), i, resultAmplitudesVector(i));
1047       printf("chi2 = %f\n", chi2);
1048 #endif
1049 
1050       outputChi2[gch] = chi2;
1051       auto const idx_for_energy = std::abs(pulseOffsetValues[0]);
1052       outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
1053       /*
1054       CMS_UNROLL_LOOP
1055       for (int i=0; i<NPULSES; i++)
1056           if (pulseOffsets[i] == soi)
1057               // NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues
1058               outputEnergy[gch] = (gain*resultAmplitudesVector(i))*respCorrection;
1059       */
1060     }
1061 
1062   }  // namespace mahi
1063 }  // namespace hcal
1064 
1065 namespace hcal {
1066   namespace reconstruction {
1067 
1068     void entryPoint(InputDataGPU const& inputGPU,
1069                     OutputDataGPU& outputGPU,
1070                     ConditionsProducts const& conditions,
1071                     ScratchDataGPU& scratch,
1072                     ConfigParameters const& configParameters,
1073                     cudaStream_t cudaStream) {
1074       auto const totalChannels = inputGPU.f01HEDigis.size + inputGPU.f5HBDigis.size + inputGPU.f3HBDigis.size;
1075 
1076       // FIXME: the number of channels in output might change given that some channesl might be filtered out
1077 
1078       // do not run when there are no rechits (e.g. if HCAL is not being read),
1079       // but do set the size of the output collection to 0
1080       outputGPU.recHits.size = totalChannels;
1081       if (totalChannels == 0) {
1082         return;
1083       }
1084 
1085       // TODO: this can be lifted by implementing a separate kernel
1086       // similar to the default one, but properly handling the diff in #sample
1087       // or modifying existing one
1088       auto const f01nsamples = compute_nsamples<Flavor1>(inputGPU.f01HEDigis.stride);
1089       auto const f5nsamples = compute_nsamples<Flavor5>(inputGPU.f5HBDigis.stride);
1090       auto const f3nsamples = compute_nsamples<Flavor3>(inputGPU.f3HBDigis.stride);
1091       int constexpr windowSize = 8;
1092       int const startingSample = f01nsamples - windowSize;
1093       assert(startingSample == 0 || startingSample == 2);
1094       if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride > 0)
1095         assert(f01nsamples == f5nsamples);
1096       if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f3HBDigis.stride > 0)
1097         assert(f01nsamples == f3nsamples);
1098 
1099       dim3 threadsPerBlock{windowSize, configParameters.kprep1dChannelsPerBlock};
1100       int blocks = static_cast<uint32_t>(threadsPerBlock.y) > totalChannels
1101                        ? 1
1102                        : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y;
1103       int nbytesShared =
1104           ((2 * windowSize + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
1105       hcal::mahi::kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
1106           scratch.amplitudes.get(),
1107           scratch.noiseTerms.get(),
1108           scratch.electronicNoiseTerms.get(),
1109           outputGPU.recHits.energy.get(),
1110           outputGPU.recHits.chi2.get(),
1111           inputGPU.f01HEDigis.data.get(),
1112           inputGPU.f5HBDigis.data.get(),
1113           inputGPU.f3HBDigis.data.get(),
1114           inputGPU.f01HEDigis.ids.get(),
1115           inputGPU.f5HBDigis.ids.get(),
1116           inputGPU.f3HBDigis.ids.get(),
1117           inputGPU.f01HEDigis.stride,
1118           inputGPU.f5HBDigis.stride,
1119           inputGPU.f3HBDigis.stride,
1120           inputGPU.f01HEDigis.size,
1121           inputGPU.f5HBDigis.size,
1122           inputGPU.f5HBDigis.npresamples.get(),
1123           scratch.soiSamples.get(),
1124           outputGPU.recHits.energyM0.get(),
1125           outputGPU.recHits.timeM0.get(),
1126           outputGPU.recHits.did.get(),
1127           totalChannels,
1128           conditions.channelQuality.status,
1129           conditions.recoParams.param1,
1130           conditions.recoParams.param2,
1131           conditions.qieCoders.offsets,
1132           conditions.qieCoders.slopes,
1133           conditions.qieTypes.values,
1134           conditions.pedestalWidths.values,
1135           conditions.effectivePedestalWidths.values,
1136           conditions.pedestals.values,
1137           conditions.convertedEffectivePedestals ? conditions.convertedEffectivePedestals->values
1138                                                  : conditions.pedestals.values,
1139           configParameters.useEffectivePedestals,
1140           conditions.sipmParameters.type,
1141           conditions.sipmParameters.fcByPE,
1142           conditions.sipmCharacteristics.parLin1,
1143           conditions.sipmCharacteristics.parLin2,
1144           conditions.sipmCharacteristics.parLin3,
1145           conditions.gains.values,
1146           conditions.respCorrs.values,
1147           conditions.topology->maxDepthHB(),
1148           conditions.topology->maxDepthHE(),
1149           conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
1150                                                                                : hcal::reconstruction::IPHI_MAX,
1151           conditions.topology->firstHBRing(),
1152           conditions.topology->lastHBRing(),
1153           conditions.topology->firstHERing(),
1154           conditions.topology->lastHERing(),
1155           conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
1156           conditions.topology->firstHERing() > conditions.topology->lastHERing()
1157               ? 0
1158               : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
1159           configParameters.sipmQTSShift,
1160           configParameters.sipmQNTStoSum,
1161           configParameters.firstSampleShift,
1162           conditions.offsetForHashes,
1163           configParameters.ts4Thresh,
1164           startingSample);
1165       cudaCheck(cudaGetLastError());
1166 
1167       // 1024 is the max threads per block for gtx1080
1168       // FIXME: take this from cuda service or something like that
1169       uint32_t const channelsPerBlock = 1024 / (windowSize * conditions.pulseOffsetsHost.size());
1170       dim3 threadsPerBlock2{windowSize, static_cast<uint32_t>(conditions.pulseOffsetsHost.size()), channelsPerBlock};
1171       int blocks2 =
1172           threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z;
1173 
1174 #ifdef HCAL_MAHI_CPUDEBUG
1175       std::cout << "threads: " << threadsPerBlock2.x << " " << threadsPerBlock2.y << "  " << threadsPerBlock2.z
1176                 << std::endl;
1177       std::cout << "blocks: " << blocks2 << std::endl;
1178 #endif
1179 
1180       hcal::mahi::kernel_prep_pulseMatrices_sameNumberOfSamples<<<blocks2, threadsPerBlock2, 0, cudaStream>>>(
1181           scratch.pulseMatrices.get(),
1182           scratch.pulseMatricesM.get(),
1183           scratch.pulseMatricesP.get(),
1184           conditions.pulseOffsets.values,
1185           scratch.amplitudes.get(),
1186           inputGPU.f01HEDigis.ids.get(),
1187           inputGPU.f5HBDigis.ids.get(),
1188           inputGPU.f3HBDigis.ids.get(),
1189           inputGPU.f01HEDigis.size,
1190           inputGPU.f5HBDigis.size,
1191           totalChannels,
1192           scratch.soiSamples.get(),
1193           conditions.recoParams.ids,
1194           conditions.recoParams.acc25nsVec,
1195           conditions.recoParams.diff25nsItvlVec,
1196           conditions.recoParams.accVarLenIdxMinusOneVec,
1197           conditions.recoParams.diffVarItvlIdxMinusOneVec,
1198           conditions.recoParams.accVarLenIdxZEROVec,
1199           conditions.recoParams.diffVarItvlIdxZEROVec,
1200           configParameters.meanTime,
1201           configParameters.timeSigmaSiPM,
1202           configParameters.timeSigmaHPD,
1203           conditions.topology->maxDepthHB(),
1204           conditions.topology->maxDepthHE(),
1205           conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
1206                                                                                : hcal::reconstruction::IPHI_MAX,
1207           conditions.topology->firstHBRing(),
1208           conditions.topology->lastHBRing(),
1209           conditions.topology->firstHERing(),
1210           conditions.topology->lastHERing(),
1211           conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
1212           conditions.topology->firstHERing() > conditions.topology->lastHERing()
1213               ? 0
1214               : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
1215           conditions.offsetForHashes,
1216           configParameters.applyTimeSlew,
1217           configParameters.tzeroTimeSlew,
1218           configParameters.slopeTimeSlew,
1219           configParameters.tmaxTimeSlew);
1220       cudaCheck(cudaGetLastError());
1221 
1222       // number of samples is checked in above assert
1223       if (conditions.pulseOffsetsHost.size() == 8u) {
1224         // FIXME: provide constants from configuration
1225         uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
1226         uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
1227         auto const nbytesShared = 2 * threadsPerBlock * calo::multifit::MapSymM<float, 8>::total * sizeof(float);
1228         hcal::mahi::kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
1229             outputGPU.recHits.energy.get(),
1230             outputGPU.recHits.chi2.get(),
1231             scratch.amplitudes.get(),
1232             scratch.pulseMatrices.get(),
1233             scratch.pulseMatricesM.get(),
1234             scratch.pulseMatricesP.get(),
1235             conditions.pulseOffsets.values,
1236             scratch.noiseTerms.get(),
1237             scratch.electronicNoiseTerms.get(),
1238             scratch.soiSamples.get(),
1239             conditions.sipmParameters.auxi2,
1240             conditions.pedestalWidths.values,
1241             conditions.effectivePedestalWidths.values,
1242             configParameters.useEffectivePedestals,
1243             inputGPU.f01HEDigis.ids.get(),
1244             inputGPU.f5HBDigis.ids.get(),
1245             inputGPU.f3HBDigis.ids.get(),
1246             conditions.gains.values,
1247             conditions.respCorrs.values,
1248             inputGPU.f01HEDigis.size,
1249             inputGPU.f5HBDigis.size,
1250             totalChannels,
1251             conditions.offsetForHashes,
1252             conditions.topology->maxDepthHB(),
1253             conditions.topology->maxDepthHE(),
1254             conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
1255                                                                                  : hcal::reconstruction::IPHI_MAX,
1256             conditions.topology->firstHBRing(),
1257             conditions.topology->lastHBRing(),
1258             conditions.topology->firstHERing(),
1259             conditions.topology->lastHERing(),
1260             conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
1261             conditions.topology->firstHERing() > conditions.topology->lastHERing()
1262                 ? 0
1263                 : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1));
1264       } else {
1265         throw cms::Exception("Invalid MahiGPU configuration")
1266             << "Currently support only 8 pulses and 8 time samples and provided: " << f01nsamples << " samples and "
1267             << conditions.pulseOffsetsHost.size() << " pulses" << std::endl;
1268       }
1269     }
1270 
1271   }  // namespace reconstruction
1272 }  // namespace hcal