File indexing completed on 2025-01-22 07:34:23
0001 #include <iostream>
0002 #include <alpaka/alpaka.hpp>
0003 #include <Eigen/Dense>
0004
0005 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0006 #include "HeterogeneousCore/AlpakaInterface/interface/atomicMaxPair.h"
0007
0008 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
0009
0010 #include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
0011 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0012 #include "FWCore/Utilities/interface/HostDeviceConstant.h"
0013
0014
0015
0016 #include "Mahi.h"
0017
0018 #ifdef HCAL_MAHI_GPUDEBUG
0019 #define DETID_TO_DEBUG 1125647428
0020 #endif
0021
0022 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0023
0024 using namespace cms::alpakatools;
0025 namespace hcal::reconstruction {
0026 namespace mahi {
0027
0028 ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float uint_as_float(uint32_t val) {
0029 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
0030 return __uint_as_float(val);
0031 #else
0032 return edm::bit_cast<float>(val);
0033 #endif
0034 }
0035
0036 ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t float_as_uint(float val) {
0037 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
0038 return __float_as_uint(val);
0039 #else
0040 return edm::bit_cast<unsigned int>(val);
0041 #endif
0042 }
0043
0044 ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_time_slew_delay(float const fC,
0045 float const tzero,
0046 float const slope,
0047 float const tmax) {
0048 auto const rawDelay = tzero + slope * std::log(fC);
0049 return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);
0050 }
0051
0052
0053
0054 HOST_DEVICE_CONSTANT float qie8shape[129] = {
0055 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
0056 18, 20, 22, 24, 26, 28, 31, 34, 37, 40, 44, 48, 52, 57, 62, 57, 62,
0057 67, 72, 77, 82, 87, 92, 97, 102, 107, 112, 117, 122, 127, 132, 142, 152, 162,
0058 172, 182, 192, 202, 217, 232, 247, 262, 282, 302, 322, 347, 372, 347, 372, 397, 422,
0059 447, 472, 497, 522, 547, 572, 597, 622, 647, 672, 697, 722, 772, 822, 872, 922, 972,
0060 1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
0061 2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
0062 5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
0063
0064 HOST_DEVICE_CONSTANT float qie11shape[257] = {
0065 -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5,
0066 11.5, 12.5, 13.5, 14.5, 15.5, 17.5, 19.5, 21.5, 23.5, 25.5, 27.5, 29.5,
0067 31.5, 33.5, 35.5, 37.5, 39.5, 41.5, 43.5, 45.5, 47.5, 49.5, 51.5, 53.5,
0068 55.5, 59.5, 63.5, 67.5, 71.5, 75.5, 79.5, 83.5, 87.5, 91.5, 95.5, 99.5,
0069 103.5, 107.5, 111.5, 115.5, 119.5, 123.5, 127.5, 131.5, 135.5, 139.5, 147.5, 155.5,
0070 163.5, 171.5, 179.5, 187.5, 171.5, 179.5, 187.5, 195.5, 203.5, 211.5, 219.5, 227.5,
0071 235.5, 243.5, 251.5, 259.5, 267.5, 275.5, 283.5, 291.5, 299.5, 315.5, 331.5, 347.5,
0072 363.5, 379.5, 395.5, 411.5, 427.5, 443.5, 459.5, 475.5, 491.5, 507.5, 523.5, 539.5,
0073 555.5, 571.5, 587.5, 603.5, 619.5, 651.5, 683.5, 715.5, 747.5, 779.5, 811.5, 843.5,
0074 875.5, 907.5, 939.5, 971.5, 1003.5, 1035.5, 1067.5, 1099.5, 1131.5, 1163.5, 1195.5, 1227.5,
0075 1259.5, 1291.5, 1355.5, 1419.5, 1483.5, 1547.5, 1611.5, 1675.5, 1547.5, 1611.5, 1675.5, 1739.5,
0076 1803.5, 1867.5, 1931.5, 1995.5, 2059.5, 2123.5, 2187.5, 2251.5, 2315.5, 2379.5, 2443.5, 2507.5,
0077 2571.5, 2699.5, 2827.5, 2955.5, 3083.5, 3211.5, 3339.5, 3467.5, 3595.5, 3723.5, 3851.5, 3979.5,
0078 4107.5, 4235.5, 4363.5, 4491.5, 4619.5, 4747.5, 4875.5, 5003.5, 5131.5, 5387.5, 5643.5, 5899.5,
0079 6155.5, 6411.5, 6667.5, 6923.5, 7179.5, 7435.5, 7691.5, 7947.5, 8203.5, 8459.5, 8715.5, 8971.5,
0080 9227.5, 9483.5, 9739.5, 9995.5, 10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
0081 12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
0082 18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
0083 28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
0084 41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
0085 65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
0086 96523.5, 100620, 104716, 108812, 112908};
0087
0088 constexpr int32_t IPHI_MAX = 72;
0089
0090 ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHB(
0091 uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
0092 HcalDetId did{didraw};
0093 uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
0094 return did.ieta() > 0
0095 ? value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() - firstHBRing)
0096 : value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
0097 }
0098 ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHE(uint32_t const didraw,
0099 int const maxDepthHE,
0100 int const maxPhiHE,
0101 int const firstHERing,
0102 int const lastHERing,
0103 int const nEtaHE) {
0104 HcalDetId did{didraw};
0105 uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);
0106 return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)
0107 : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);
0108 }
0109 ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
0110 return capid * 4 + range;
0111 }
0112
0113 ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_reco_correction_factor(float const par1,
0114 float const par2,
0115 float const par3,
0116 float const x) {
0117 return par3 * x * x + par2 * x + par1;
0118 }
0119
0120
0121 ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_coder_charge(
0122 int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
0123 auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;
0124 auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;
0125 auto const nbins = qieType == 0 ? 32 : 64;
0126 auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])
0127 : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);
0128 auto const index = get_qiecoder_index(capid, range);
0129 return (center - qieOffsets[index]) / qieSlopes[index];
0130 }
0131
0132
0133
0134
0135 ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_diff_charge_gain(int const qieType,
0136 uint8_t adc,
0137 uint8_t const capid,
0138 float const* qieOffsets,
0139 float const* qieSlopes,
0140 bool const isqie11) {
0141 constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
0142 constexpr uint32_t mantissaMaskQIE11 = 0x3f;
0143 auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;
0144 auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0145 auto const mantissa = adc & mantissaMask;
0146
0147 if (mantissa == 0u || mantissa == mantissaMask - 1u)
0148 return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
0149 else if (mantissa == 1u || mantissa == mantissaMask)
0150 return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
0151 else {
0152 auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
0153 auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
0154 auto const upgain = qup - q;
0155 auto const downgain = q - qdown;
0156 auto const averagegain = (qup - qdown) / 2.f;
0157 if (std::abs(upgain - downgain) < 0.01f * averagegain)
0158 return averagegain;
0159 else {
0160 auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
0161 auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
0162 auto const upgain2 = q2up - qup;
0163 auto const downgain2 = qdown - q2down;
0164 if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))
0165 return upgain;
0166 else
0167 return downgain;
0168 }
0169 }
0170 }
0171
0172 using PulseShapeConstElement = typename HcalPulseShapeSoA::ConstView::const_element;
0173
0174
0175 ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_pulse_shape_value(PulseShapeConstElement const& pulseShape,
0176 float const pulse_time,
0177 int const sample,
0178 int const shift) {
0179 auto const& acc25nsVec = pulseShape.acc25nsVec();
0180 auto const& diff25nsItvlVec = pulseShape.diff25nsItvlVec();
0181 auto const& accVarLenIdxMinusOneVec = pulseShape.accVarLenIdxMinusOneVec();
0182 auto const& diffVarItvlIdxMinusOneVec = pulseShape.diffVarItvlIdxMinusOneVec();
0183 auto const& accVarLenIdxZeroVec = pulseShape.accVarLenIdxZEROVec();
0184 auto const& diffVarItvlIdxZeroVec = pulseShape.diffVarItvlIdxZEROVec();
0185
0186
0187 constexpr float slew = 0.f;
0188 constexpr auto ns_per_bx = ::hcal::constants::nsPerBX;
0189
0190
0191 float const i_start_float = -::hcal::constants::iniTimeShift - pulse_time - slew > 0.f
0192 ? 0.f
0193 : std::abs(-::hcal::constants::iniTimeShift - pulse_time - slew) + 1.f;
0194 int i_start = static_cast<int>(i_start_float);
0195 float offset_start = static_cast<float>(i_start) - ::hcal::constants::iniTimeShift - pulse_time - slew;
0196
0197
0198 if (offset_start == 1.0f) {
0199 offset_start = 0.f;
0200 i_start -= 1;
0201 }
0202
0203 int const bin_start = static_cast<int>(offset_start);
0204 float const bin_start_up = static_cast<float>(bin_start) + 0.5f;
0205 int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
0206 int const its_start = i_start / ns_per_bx;
0207 int const distTo25ns_start = ns_per_bx - 1 - i_start % ns_per_bx;
0208 auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
0209
0210 auto const sample_over10ts = sample + shift;
0211 float value = 0.0f;
0212 if (sample_over10ts == its_start) {
0213 value = bin_0_start == -1
0214 ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
0215 : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
0216 } else if (sample_over10ts > its_start) {
0217 int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
0218 value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
0219 }
0220 return value;
0221 }
0222
0223
0224
0225 constexpr int nMaxItersMin = 50;
0226 constexpr int nMaxItersNNLS = 500;
0227 constexpr double nnlsThresh = 1e-11;
0228 constexpr float deltaChi2Threashold = 1e-3;
0229
0230
0231 ALPAKA_FN_ACC float get_raw_charge(double const charge,
0232 double const pedestal,
0233 float const* shrChargeMinusPedestal,
0234 float const parLin1,
0235 float const parLin2,
0236 float const parLin3,
0237 int32_t const nsamplesForCompute,
0238 int32_t const soi,
0239 int const sipmQTSShift,
0240 int const sipmQNTStoSum,
0241 float const fcByPE,
0242 int32_t const lch,
0243 bool const isqie11) {
0244 float rawCharge;
0245
0246 if (!isqie11)
0247 rawCharge = charge;
0248 else {
0249 int const first = std::max(soi + sipmQTSShift, 0);
0250 int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
0251 float sipmq = 0.0f;
0252 for (auto ts = first; ts < last; ts++)
0253 sipmq += shrChargeMinusPedestal[lch * nsamplesForCompute + ts];
0254 auto const effectivePixelsFired = sipmq / fcByPE;
0255 auto const factor = compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
0256 rawCharge = (charge - pedestal) * factor + pedestal;
0257
0258 #ifdef HCAL_MAHI_GPUDEBUG
0259 printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
0260 #endif
0261 }
0262 return rawCharge;
0263 }
0264
0265
0266 inline constexpr bool TSenergyCompare(std::pair<unsigned int, float> a, std::pair<unsigned int, float> b) {
0267 return a.second > b.second;
0268 };
0269
0270 class Kernel_prep1d_sameNumberOfSamples {
0271 public:
0272 ALPAKA_FN_ACC void operator()(Acc2D const& acc,
0273 OProductType::View outputGPU,
0274 IProductTypef01::ConstView f01HEDigis,
0275 IProductTypef5::ConstView f5HBDigis,
0276 IProductTypef3::ConstView f3HBDigis,
0277 HcalMahiConditionsPortableDevice::ConstView mahi,
0278 HcalSiPMCharacteristicsPortableDevice::ConstView sipmCharacteristics,
0279 HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS,
0280 bool const useEffectivePedestals,
0281 int const sipmQTSShift,
0282 int const sipmQNTStoSum,
0283 int const firstSampleShift,
0284 float const ts4Thresh,
0285 float* amplitudes,
0286 float* noiseTerms,
0287 float* electronicNoiseTerms,
0288 int8_t* soiSamples,
0289 int const windowSize) const {
0290 auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
0291 auto const startingSample = compute_nsamples<Flavor1>(f01HEDigis.stride()) - windowSize;
0292 auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
0293
0294
0295 auto const nchannels_per_block(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
0296
0297 auto const nsamplesForCompute(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
0298
0299
0300 float* shrEnergyM0PerTS = alpaka::getDynSharedMem<float>(acc);
0301 float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
0302 float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
0303 float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
0304 unsigned long long int* shrMethod0EnergySamplePair =
0305 reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
0306
0307
0308 for (auto group : uniform_groups_y(acc, nchannels)) {
0309
0310 for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
0311 auto const gch = channel.global;
0312 auto const lch = channel.local;
0313
0314 for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
0315 auto const sampleWithinWindow = i_sample;
0316 auto const sample = i_sample + startingSample;
0317 auto const linearThPerBlock = i_sample + channel.local * nsamplesForCompute;
0318
0319 if (sampleWithinWindow == 0) {
0320 soiSamples[gch] = -1;
0321 shrMethod0EnergyAccum[lch] = 0;
0322 shrMethod0EnergySamplePair[lch] = 0;
0323 shrEnergyM0TotalAccum[lch] = 0;
0324 }
0325
0326
0327 if (gch < f01HEDigis.size()) {
0328
0329
0330
0331 auto const soibit = soibit_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample);
0332 if (soibit == 1)
0333 soiSamples[gch] = sampleWithinWindow;
0334 } else if (gch >= nchannelsf015) {
0335 auto const soibit = soibit_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample);
0336 if (soibit == 1)
0337 soiSamples[gch] = sampleWithinWindow;
0338 }
0339
0340
0341 auto const id = gch < f01HEDigis.size()
0342 ? f01HEDigis.ids()[gch]
0343 : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
0344 : f3HBDigis.ids()[gch - nchannelsf015]);
0345 auto const did = HcalDetId{id};
0346
0347 auto const adc =
0348 gch < f01HEDigis.size()
0349 ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0350 : (gch < nchannelsf015
0351 ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0352 : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0353 auto const capid =
0354 gch < f01HEDigis.size()
0355 ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0356 : (gch < nchannelsf015
0357 ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0358 : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0359
0360
0361
0362 auto const hashedId =
0363 did.subdetId() == HcalBarrel
0364 ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0365 : did2linearIndexHE(id,
0366 mahi.maxDepthHE(),
0367 mahi.maxPhiHE(),
0368 mahi.firstHERing(),
0369 mahi.lastHERing(),
0370 mahi.nEtaHE()) +
0371 mahi.offsetForHashes();
0372
0373
0374 auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0;
0375 auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
0376 auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
0377 auto const pedestal = mahi.pedestals_value()[hashedId][capid];
0378
0379
0380 auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0381
0382 shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
0383 }
0384 }
0385 alpaka::syncBlockThreads(acc);
0386
0387
0388 for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
0389 auto const gch = channel.global;
0390 auto const lch = channel.local;
0391 for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
0392 auto const sampleWithinWindow = i_sample;
0393 auto const sample = i_sample + startingSample;
0394
0395
0396 if (sampleWithinWindow == 0) {
0397 outputGPU.detId()[gch] = 0;
0398 outputGPU.energyM0()[gch] = 0;
0399 outputGPU.timeM0()[gch] = 0;
0400 outputGPU.energy()[gch] = 0;
0401 outputGPU.chi2()[gch] = 0;
0402 }
0403
0404
0405 auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
0406 auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
0407 auto* electronicNoiseTermsForChannel = electronicNoiseTerms + nsamplesForCompute * gch;
0408
0409
0410 auto const stride = gch < f01HEDigis.size()
0411 ? f01HEDigis.stride()
0412 : (gch < f5HBDigis.size() ? f5HBDigis.stride() : f3HBDigis.stride());
0413 auto const nsamples =
0414 gch < f01HEDigis.size()
0415 ? compute_nsamples<Flavor1>(stride)
0416 : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride) : compute_nsamples<Flavor3>(stride));
0417
0418 ALPAKA_ASSERT_ACC(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
0419
0420 auto const id = gch < f01HEDigis.size()
0421 ? f01HEDigis.ids()[gch]
0422 : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
0423 : f3HBDigis.ids()[gch - nchannelsf015]);
0424 auto const did = HcalDetId{id};
0425
0426 auto const adc =
0427 gch < f01HEDigis.size()
0428 ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0429 : (gch < nchannelsf015
0430 ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0431 : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0432 auto const capid =
0433 gch < f01HEDigis.size()
0434 ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
0435 : (gch < nchannelsf015
0436 ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
0437 : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0438
0439
0440
0441 auto const hashedId =
0442 did.subdetId() == HcalBarrel
0443 ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0444 : did2linearIndexHE(id,
0445 mahi.maxDepthHE(),
0446 mahi.maxPhiHE(),
0447 mahi.firstHERing(),
0448 mahi.lastHERing(),
0449 mahi.nEtaHE()) +
0450 mahi.offsetForHashes();
0451
0452
0453 auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0;
0454 auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
0455 auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
0456 auto const* pedestalWidthsForChannel =
0457 useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
0458 ? mahi.effectivePedestalWidths()[hashedId].data()
0459 : mahi.pedestals_width()[hashedId].data();
0460
0461 auto const gain = mahi.gains_value()[hashedId][capid];
0462 auto const gain0 = mahi.gains_value()[hashedId][0];
0463 auto const respCorrection = mahi.respCorrs_values()[hashedId];
0464 auto const pedestal = mahi.pedestals_value()[hashedId][capid];
0465 auto const pedestalWidth = pedestalWidthsForChannel[capid];
0466
0467 auto const pedestalToUseForMethod0 =
0468 useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
0469 ? mahi.effectivePedestals()[hashedId][capid]
0470 : pedestal;
0471 auto const sipmType = mahi.sipmPar_type()[hashedId];
0472 auto const fcByPE = mahi.sipmPar_fcByPE()[hashedId];
0473 auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
0474 auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
0475
0476 #ifdef HCAL_MAHI_GPUDEBUG
0477 printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
0478 qieType,
0479 qieOffsets[0],
0480 qieOffsets[1],
0481 qieSlopes[0],
0482 qieSlopes[1]);
0483 #endif
0484
0485
0486 auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0487
0488 int32_t const soi =
0489 gch < f01HEDigis.size()
0490 ? soiSamples[gch]
0491 : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
0492
0493 bool badSOI = (soi < 0 or static_cast<unsigned>(soi) >= nsamplesForCompute);
0494 if (badSOI and sampleWithinWindow == 0) {
0495 #ifdef GPU_DEBUG
0496 printf("Found HBHE channel %d with invalid SOI %d\n", gch, soi);
0497 #endif
0498
0499 outputGPU.chi2()[gch] = -9999.f;
0500 }
0501
0502
0503
0504 auto const precisionItem = sipmCharacteristics.precisionItem()[sipmType - 1];
0505 auto const parLin1 = precisionItem.parLin1_;
0506 auto const parLin2 = precisionItem.parLin2_;
0507 auto const parLin3 = precisionItem.parLin3_;
0508
0509
0510
0511
0512
0513 if (gch >= f01HEDigis.size() && gch < nchannelsf015 && sampleWithinWindow == 0)
0514 soiSamples[gch] = f5HBDigis.npresamples()[gch - f01HEDigis.size()];
0515
0516
0517
0518
0519
0520
0521 float const rawCharge = get_raw_charge(charge,
0522 pedestal,
0523 shrChargeMinusPedestal,
0524 parLin1,
0525 parLin2,
0526 parLin3,
0527 nsamplesForCompute,
0528 soi,
0529 sipmQTSShift,
0530 sipmQNTStoSum,
0531 fcByPE,
0532 lch,
0533 gch < f01HEDigis.size() || gch >= nchannelsf015);
0534
0535 auto const dfc = compute_diff_charge_gain(
0536 qieType, adc, capid, qieOffsets, qieSlopes, gch < f01HEDigis.size() || gch >= nchannelsf015);
0537
0538 #ifdef COMPUTE_TDC_TIME
0539 float tdcTime;
0540 if (gch >= f01HEDigis.size() && gch < nchannelsf015) {
0541 tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
0542 } else {
0543 if (gch < f01HEDigis.size())
0544 tdcTime =
0545 HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample));
0546 else if (gch >= nchannelsf015)
0547 tdcTime = HcalSpecialTimes::getTDCTime(
0548 tdc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
0549 }
0550 #endif
0551
0552
0553
0554
0555
0556 auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
0557 auto const startSampleTmp = soi + firstSampleShift;
0558 auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
0559 auto const endSample =
0560 startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
0561
0562 auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0563 auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0564
0565 shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
0566
0567 alpaka::atomicAdd(
0568 acc, &shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, alpaka::hierarchy::Threads{});
0569
0570 #ifdef HCAL_MAHI_GPUDEBUG
0571 printf(
0572 "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
0573 " charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
0574 " gain = %f respCorrection = %f energym0_per_ts = %f\n",
0575 id,
0576 sample,
0577 gch,
0578 hashedId,
0579 adc,
0580 capid,
0581 charge,
0582 rawCharge,
0583 dfc,
0584 pedestalToUseForMethod0,
0585 gain,
0586 respCorrection,
0587 energym0_per_ts);
0588 printf("startSample = %d endSample = %d param1 = %u param2 = %u\n",
0589 startSample,
0590 endSample,
0591 recoParam1,
0592 recoParam2);
0593 #endif
0594
0595 if (sampleWithinWindow >= static_cast<unsigned>(startSample) &&
0596 sampleWithinWindow < static_cast<unsigned>(endSample)) {
0597
0598 alpaka::atomicAdd(acc, &shrMethod0EnergyAccum[lch], energym0_per_ts, alpaka::hierarchy::Threads{});
0599
0600
0601 atomicMaxPair(
0602 acc, &shrMethod0EnergySamplePair[lch], {sampleWithinWindow, energym0_per_ts}, TSenergyCompare);
0603 }
0604
0605
0606
0607 if (sampleWithinWindow == static_cast<unsigned>(soi)) {
0608
0609
0610
0611
0612
0613 auto const digiStatus_ = mahi.channelQuality_status()[hashedId];
0614 const bool taggedBadByDb = (digiStatus_ / 32770);
0615
0616 if (taggedBadByDb)
0617 outputGPU.chi2()[gch] = -9999.f;
0618
0619
0620
0621 if (not(energym0_per_ts_gain0 > ts4Thresh)) {
0622 outputGPU.chi2()[gch] = -9999.f;
0623 }
0624 }
0625
0626
0627
0628 auto const amplitude = rawCharge - pedestalToUseForMethod0;
0629 auto const noiseADC = (1. / std::sqrt(12)) * dfc;
0630 auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
0631 auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
0632
0633
0634 amplitudesForChannel[sampleWithinWindow] = amplitude;
0635 noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
0636 electronicNoiseTermsForChannel[sampleWithinWindow] = pedestalWidth;
0637
0638 }
0639 }
0640 alpaka::syncBlockThreads(acc);
0641
0642
0643 for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
0644 auto const gch = channel.global;
0645 auto const lch = channel.local;
0646 for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
0647 auto const sampleWithinWindow = i_sample;
0648
0649 int32_t const soi =
0650 gch < f01HEDigis.size()
0651 ? soiSamples[gch]
0652 : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
0653
0654 auto const id = gch < f01HEDigis.size()
0655 ? f01HEDigis.ids()[gch]
0656 : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
0657 : f3HBDigis.ids()[gch - nchannelsf015]);
0658 auto const did = HcalDetId{id};
0659 auto const hashedId =
0660 did.subdetId() == HcalBarrel
0661 ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0662 : did2linearIndexHE(id,
0663 mahi.maxDepthHE(),
0664 mahi.maxPhiHE(),
0665 mahi.firstHERing(),
0666 mahi.lastHERing(),
0667 mahi.nEtaHE()) +
0668 mahi.offsetForHashes();
0669
0670 auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
0671 auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
0672
0673 auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
0674
0675
0676 if (sampleWithinWindow == static_cast<unsigned>(soi)) {
0677 auto const method0_energy = shrMethod0EnergyAccum[lch];
0678 auto const val = shrMethod0EnergySamplePair[lch];
0679 int const max_sample = (val >> 32) & 0xffffffff;
0680
0681 float const max_energy = uint_as_float(static_cast<uint32_t>(val & 0xffffffff));
0682
0683 float const max_energy_1 = static_cast<unsigned>(max_sample) < nsamplesForCompute - 1
0684 ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1]
0685 : 0.f;
0686 float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
0687 auto const sum = max_energy + max_energy_1;
0688
0689
0690
0691
0692 float const time =
0693 max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
0694
0695
0696 outputGPU.detId()[gch] = id;
0697 outputGPU.energyM0()[gch] = method0_energy;
0698 outputGPU.timeM0()[gch] = time;
0699
0700
0701
0702
0703
0704
0705 if (not(shrEnergyM0TotalAccum[lch] > 0)) {
0706 outputGPU.chi2()[gch] = -9999.f;
0707 }
0708
0709 #ifdef HCAL_MAHI_GPUDEBUG
0710 printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n",
0711 shrEnergyM0TotalAccum[lch],
0712 energym0_per_ts_gain0,
0713 ts4Thresh);
0714 #endif
0715
0716 #ifdef HCAL_MAHI_GPUDEBUG
0717 printf(" method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
0718 method0_energy,
0719 max_sample,
0720 max_energy,
0721 time);
0722 #endif
0723 }
0724 #ifdef HCAL_MAHI_GPUDEBUG
0725 printf(
0726 "charge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f "
0727 "noisPhoto(%d) =%f outputGPU chi2[gch] = %f \n",
0728 sample,
0729 rawCharge,
0730 sample,
0731 pedestalToUseForMethod0,
0732 sample,
0733 dfc,
0734 sample,
0735 pedestalWidth,
0736 sample,
0737 noiseADC,
0738 sample,
0739 noisePhotoSq,
0740 outputGPU.chi2()[gch]);
0741 #endif
0742
0743 }
0744 }
0745 }
0746 }
0747 };
0748
0749 class Kernel_prep_pulseMatrices_sameNumberOfSamples {
0750 public:
0751 ALPAKA_FN_ACC void operator()(Acc3D const& acc,
0752 float* pulseMatrices,
0753 float* pulseMatricesM,
0754 float* pulseMatricesP,
0755 HcalMahiPulseOffsetsSoA::ConstView pulseOffsets,
0756 float const* amplitudes,
0757 IProductTypef01::ConstView f01HEDigis,
0758 IProductTypef5::ConstView f5HBDigis,
0759 IProductTypef3::ConstView f3HBDigis,
0760 int8_t* soiSamples,
0761 HcalMahiConditionsPortableDevice::ConstView mahi,
0762 HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS,
0763 float const meanTime,
0764 float const timeSigmaSiPM,
0765 float const timeSigmaHPD,
0766 bool const applyTimeSlew,
0767 float const tzeroTimeSlew,
0768 float const slopeTimeSlew,
0769 float const tmaxTimeSlew) const {
0770
0771 auto const npulses(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
0772
0773 auto const nsamples(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[2u]);
0774
0775 auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
0776 auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
0777
0778
0779 for (auto channel : uniform_elements_z(acc, nchannels)) {
0780
0781 for (auto ipulse : independent_group_elements_y(acc, npulses)) {
0782
0783 for (auto sample : independent_group_elements_x(acc, nsamples)) {
0784
0785 auto const id = channel < f01HEDigis.size()
0786 ? f01HEDigis.ids()[channel]
0787 : (channel < nchannelsf015 ? f5HBDigis.ids()[channel - f01HEDigis.size()]
0788 : f3HBDigis.ids()[channel - nchannelsf015]);
0789 auto const deltaT =
0790 channel >= f01HEDigis.size() && channel < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
0791
0792
0793
0794 auto const did = DetId{id};
0795 auto const hashedId =
0796 did.subdetId() == HcalBarrel
0797 ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
0798 : did2linearIndexHE(id,
0799 mahi.maxDepthHE(),
0800 mahi.maxPhiHE(),
0801 mahi.firstHERing(),
0802 mahi.lastHERing(),
0803 mahi.nEtaHE()) +
0804 mahi.offsetForHashes();
0805 auto const pulseShape = recoParamsWithPS.getPulseShape(hashedId);
0806
0807
0808 auto* pulseMatrix = pulseMatrices + nsamples * npulses * channel;
0809 auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * channel;
0810 auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * channel;
0811
0812
0813 int const soi = soiSamples[channel];
0814 int const pulseOffset = pulseOffsets.offsets()[ipulse];
0815 auto const amplitude = amplitudes[channel * nsamples + pulseOffset + soi];
0816
0817 if (amplitude <= 0.f) {
0818 pulseMatrix[ipulse * nsamples + sample] = 0.f;
0819 pulseMatrixM[ipulse * nsamples + sample] = 0.f;
0820 pulseMatrixP[ipulse * nsamples + sample] = 0.f;
0821 continue;
0822 }
0823 #ifdef HCAL_MAHI_GPUDEBUG
0824 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0825 if (id != DETID_TO_DEBUG)
0826 return;
0827 #endif
0828 if (sample == 0 && ipulse == 0) {
0829 for (int i = 0; i < 8; i++)
0830 printf("amplitude(%d) = %f\n", i, amplitudes[channel * nsamples + i]);
0831 printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
0832 for (int i = 0; i < 256; i++) {
0833 printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
0834 }
0835 printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
0836 for (int i = 0; i < 25; i++) {
0837 printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
0838 i,
0839 accVarLenIdxZeroVec[i],
0840 i,
0841 accVarLenIdxMinusOneVec[i]);
0842 }
0843 printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
0844 for (int i = 0; i < 25; i++) {
0845 printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
0846 i,
0847 diffVarItvlIdxZeroVec[i],
0848 i,
0849 diffVarItvlIdxMinusOneVec[i]);
0850 }
0851 }
0852 #endif
0853
0854 auto t0 = meanTime;
0855 if (applyTimeSlew) {
0856 if (amplitude <= 1.0f)
0857 t0 += compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0858 else
0859 t0 += compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0860 }
0861 auto const t0m = -deltaT + t0;
0862 auto const t0p = deltaT + t0;
0863
0864 #ifdef HCAL_MAHI_GPUDEBUG
0865 if (sample == 0 && ipulse == 0) {
0866 printf("time values: %f %f %f\n", t0, t0m, t0p);
0867 }
0868
0869 if (sample == 0 && ipulse == 0) {
0870 for (int i = 0; i < hcal::constants::maxSamples; i++) {
0871 auto const value = compute_pulse_shape_value(pulseShape, t0, i, 0);
0872 }
0873 printf("\n");
0874 for (int i = 0; i < hcal::constants::maxSamples; i++) {
0875 auto const value = compute_pulse_shape_value(pulseShape, t0p, i, 0);
0876 }
0877 printf("\n");
0878 for (int i = 0; i < hcal::constants::maxSamples; i++) {
0879 auto const value = compute_pulse_shape_value(pulseShape, t0m, i, 0);
0880 }
0881 }
0882 #endif
0883
0884
0885
0886 auto const shift = 4 - soi;
0887
0888 int32_t const idx = sample - pulseOffset;
0889 auto const value = idx >= 0 && static_cast<unsigned>(idx) < nsamples
0890 ? compute_pulse_shape_value(pulseShape, t0, idx, shift)
0891 : 0;
0892 auto const value_t0m = idx >= 0 && static_cast<unsigned>(idx) < nsamples
0893 ? compute_pulse_shape_value(pulseShape, t0m, idx, shift)
0894 : 0;
0895 auto const value_t0p = idx >= 0 && static_cast<unsigned>(idx) < nsamples
0896 ? compute_pulse_shape_value(pulseShape, t0p, idx, shift)
0897 : 0;
0898
0899
0900 pulseMatrix[ipulse * nsamples + sample] = value;
0901 pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
0902 pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
0903
0904 }
0905 }
0906 }
0907 }
0908 };
0909
0910 template <int NSAMPLES, int NPULSES>
0911 ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(
0912 calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
0913 calo::multifit::MapSymM<float, NSAMPLES>& covarianceMatrix,
0914 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
0915 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
0916 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
0917 CMS_UNROLL_LOOP
0918 for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
0919 auto const resultAmplitude = resultAmplitudesVector(ipulse);
0920 if (resultAmplitude == 0)
0921 continue;
0922
0923 #ifdef HCAL_MAHI_GPUDEBUG
0924 printf("pulse cov array for ibx = %d\n", ipulse);
0925 #endif
0926
0927
0928 float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
0929 CMS_UNROLL_LOOP
0930 for (int counter = 0; counter < NSAMPLES; counter++) {
0931 pmcol[counter] = pulseMatrix.coeffRef(counter, ipulse);
0932 pmpcol[counter] = pulseMatrixP.coeffRef(counter, ipulse);
0933 pmmcol[counter] = pulseMatrixM.coeffRef(counter, ipulse);
0934 }
0935
0936 auto const ampl2 = resultAmplitude * resultAmplitude;
0937 CMS_UNROLL_LOOP
0938 for (int col = 0; col < NSAMPLES; col++) {
0939 auto const valueP_col = pmpcol[col];
0940 auto const valueM_col = pmmcol[col];
0941 auto const value_col = pmcol[col];
0942 auto const tmppcol = valueP_col - value_col;
0943 auto const tmpmcol = valueM_col - value_col;
0944
0945
0946 auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
0947 covarianceMatrix(col, col) += ampl2 * tmp_value;
0948
0949
0950 CMS_UNROLL_LOOP
0951 for (int row = col + 1; row < NSAMPLES; row++) {
0952 float const valueP_row = pmpcol[row];
0953 float const value_row = pmcol[row];
0954 float const valueM_row = pmmcol[row];
0955
0956 float tmpprow = valueP_row - value_row;
0957 float tmpmrow = valueM_row - value_row;
0958
0959 auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
0960
0961 covarianceMatrix(row, col) += ampl2 * covValue;
0962 }
0963 }
0964 }
0965 }
0966
0967 template <int NSAMPLES, int NPULSES>
0968 class Kernel_minimize {
0969 public:
0970 ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0971 OProductType::View outputGPU,
0972 float const* amplitudes,
0973 float* pulseMatrices,
0974 float* pulseMatricesM,
0975 float* pulseMatricesP,
0976 HcalMahiPulseOffsetsSoA::ConstView pulseOffsetsView,
0977 float* noiseTerms,
0978 float* electronicNoiseTerms,
0979 int8_t* soiSamples,
0980 HcalMahiConditionsPortableDevice::ConstView mahi,
0981 bool const useEffectivePedestals,
0982 IProductTypef01::ConstView f01HEDigis,
0983 IProductTypef5::ConstView f5HBDigis,
0984 IProductTypef3::ConstView f3HBDigis) const {
0985
0986 static_assert(NPULSES == NSAMPLES);
0987
0988 auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
0989
0990 auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
0991
0992 auto const threadsPerBlock(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
0993
0994
0995 for (auto group : uniform_groups_x(acc, nchannels)) {
0996
0997 for (auto channel : uniform_group_elements_x(acc, group, nchannels)) {
0998 auto const gch = channel.global;
0999 auto const lch = channel.local;
1000
1001
1002 if (outputGPU.chi2()[gch] == -9999.f)
1003 continue;
1004
1005
1006 float* shrmem = alpaka::getDynSharedMem<float>(acc);
1007 float* shrMatrixLFnnlsStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * lch;
1008 float* shrAtAStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * (lch + threadsPerBlock);
1009
1010
1011 auto const id = gch < f01HEDigis.size() ? f01HEDigis.ids()[gch]
1012 : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
1013 : f3HBDigis.ids()[gch - nchannelsf015]);
1014 auto const did = DetId{id};
1015 auto const hashedId =
1016 did.subdetId() == HcalBarrel
1017 ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
1018 : did2linearIndexHE(id,
1019 mahi.maxDepthHE(),
1020 mahi.maxPhiHE(),
1021 mahi.firstHERing(),
1022 mahi.lastHERing(),
1023 mahi.nEtaHE()) +
1024 mahi.offsetForHashes();
1025
1026
1027 auto const* pedestalWidthsForChannel =
1028 useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
1029 ? mahi.effectivePedestalWidths()[hashedId].data()
1030 : mahi.pedestals_width()[hashedId].data();
1031 auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
1032 pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
1033 pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
1034 pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
1035
1036
1037 auto const gain = mahi.gains_value()[hashedId][0];
1038
1039 auto const respCorrection = mahi.respCorrs_values()[hashedId];
1040 auto const noisecorr = mahi.sipmPar_auxi2()[hashedId];
1041
1042 #ifdef HCAL_MAHI_GPUDEBUG
1043 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
1044 if (id != DETID_TO_DEBUG)
1045 return;
1046 #endif
1047 #endif
1048
1049 calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
1050 CMS_UNROLL_LOOP
1051 for (int i = 0; i < NPULSES; ++i)
1052 pulseOffsets(i) = i;
1053
1054
1055 calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector =
1056 calo::multifit::ColumnVector<NPULSES>::Zero();
1057
1058
1059 Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{amplitudes + gch * NSAMPLES};
1060 Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
1061 Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseElectronicView{electronicNoiseTerms +
1062 gch * NSAMPLES};
1063 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{
1064 pulseMatricesM + gch * NSAMPLES * NPULSES};
1065 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{
1066 pulseMatricesP + gch * NSAMPLES * NPULSES};
1067 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{
1068 pulseMatrices + gch * NSAMPLES * NPULSES};
1069 #ifdef HCAL_MAHI_GPUDEBUG
1070 for (int i = 0; i < NSAMPLES; i++)
1071 printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
1072 for (int i = 0; i < NSAMPLES; i++) {
1073 for (int j = 0; j < NPULSES; j++)
1074 printf("%f ", glbPulseMatrixView(i, j));
1075 printf("\n");
1076 }
1077 printf("\n");
1078 for (int i = 0; i < NSAMPLES; i++) {
1079 for (int j = 0; j < NPULSES; j++)
1080 printf("%f ", glbPulseMatrixMView(i, j));
1081 printf("\n");
1082 }
1083 printf("\n");
1084 for (int i = 0; i < NSAMPLES; i++) {
1085 for (int j = 0; j < NPULSES; j++)
1086 printf("%f ", glbPulseMatrixPView(i, j));
1087 printf("\n");
1088 }
1089 #endif
1090
1091 int npassive = 0;
1092 float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
1093 for (int iter = 1; iter < nMaxItersMin; iter++) {
1094
1095
1096
1097
1098 float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
1099 calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
1100 CMS_UNROLL_LOOP
1101 for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
1102 covarianceMatrixStorage[counter] = (noisecorr != 0.f) ? 0.f : averagePedestalWidth2;
1103 CMS_UNROLL_LOOP
1104 for (unsigned int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++) {
1105 covarianceMatrix(counter, counter) += noiseTermsView.coeffRef(counter);
1106 if (counter != 0)
1107 covarianceMatrix(counter, counter - 1) +=
1108 noisecorr * noiseElectronicView.coeffRef(counter - 1) * noiseElectronicView.coeffRef(counter);
1109 }
1110
1111
1112 update_covariance(resultAmplitudesVector,
1113 covarianceMatrix,
1114 glbPulseMatrixView,
1115 glbPulseMatrixMView,
1116 glbPulseMatrixPView);
1117
1118 #ifdef HCAL_MAHI_GPUDEBUG
1119 printf("covariance matrix\n");
1120 for (int i = 0; i < 8; i++) {
1121 for (int j = 0; j < 8; j++)
1122 printf("%f ", covarianceMatrix(i, j));
1123 printf("\n");
1124 }
1125 #endif
1126
1127
1128
1129
1130 float matrixLStorage[calo::multifit::MapSymM<float, NSAMPLES>::total];
1131 calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
1132 calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
1133
1134
1135
1136
1137
1138
1139
1140 calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
1141 calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
1142
1143
1144
1145
1146
1147
1148
1149 float reg_b[NSAMPLES];
1150 calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
1151
1152
1153
1154
1155
1156
1157
1158 calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
1159 calo::multifit::ColumnVector<NPULSES> Atb;
1160 CMS_UNROLL_LOOP
1161 for (int icol = 0; icol < NPULSES; icol++) {
1162 float reg_ai[NSAMPLES];
1163
1164
1165 CMS_UNROLL_LOOP
1166 for (int counter = 0; counter < NSAMPLES; counter++)
1167 reg_ai[counter] = A(counter, icol);
1168
1169
1170 float sum = 0.f;
1171 CMS_UNROLL_LOOP
1172 for (int counter = 0; counter < NSAMPLES; counter++)
1173 sum += reg_ai[counter] * reg_ai[counter];
1174
1175
1176 AtA(icol, icol) = sum;
1177
1178
1179 CMS_UNROLL_LOOP
1180 for (int j = icol + 1; j < NPULSES; j++) {
1181
1182 float reg_aj[NSAMPLES];
1183 CMS_UNROLL_LOOP
1184 for (int counter = 0; counter < NSAMPLES; counter++)
1185 reg_aj[counter] = A(counter, j);
1186
1187
1188 float sum = 0.f;
1189 CMS_UNROLL_LOOP
1190 for (int counter = 0; counter < NSAMPLES; counter++)
1191 sum += reg_aj[counter] * reg_ai[counter];
1192
1193
1194
1195 AtA(j, icol) = sum;
1196 }
1197
1198
1199 float sum_atb = 0;
1200 CMS_UNROLL_LOOP
1201 for (int counter = 0; counter < NSAMPLES; counter++)
1202 sum_atb += reg_ai[counter] * reg_b[counter];
1203
1204
1205 Atb(icol) = sum_atb;
1206 }
1207
1208 #ifdef HCAL_MAHI_GPUDEBUG
1209 printf("AtA\n");
1210 for (int i = 0; i < 8; i++) {
1211 for (int j = 0; j < 8; j++)
1212 printf("%f ", AtA(i, j));
1213 printf("\n");
1214 }
1215 printf("Atb\n");
1216 for (int i = 0; i < 8; i++)
1217 printf("%f ", Atb(i));
1218 printf("\n");
1219 printf("result Amplitudes before nnls\n");
1220 for (int i = 0; i < 8; i++)
1221 printf("%f ", resultAmplitudesVector(i));
1222 printf("\n");
1223 #endif
1224
1225
1226 calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
1227
1228
1229 calo::multifit::fnnls(AtA,
1230 Atb,
1231 resultAmplitudesVector,
1232 npassive,
1233 pulseOffsets,
1234 matrixLForFnnls,
1235 nnlsThresh,
1236 nMaxItersNNLS,
1237 10,
1238 10);
1239
1240 #ifdef HCAL_MAHI_GPUDEBUG
1241 printf("result Amplitudes\n");
1242 for (int i = 0; i < 8; i++)
1243 printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
1244 #endif
1245
1246 calo::multifit::calculateChiSq(
1247 matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
1248
1249 auto const deltaChi2 = std::abs(chi2 - previous_chi2);
1250 if (chi2 == chi2_2itersback && chi2 < previous_chi2)
1251 break;
1252
1253
1254 chi2_2itersback = previous_chi2;
1255 previous_chi2 = chi2;
1256
1257
1258 if (deltaChi2 < deltaChi2Threashold)
1259 break;
1260 }
1261
1262 #ifdef HCAL_MAHI_GPUDEBUG
1263 for (int i = 0; i < NPULSES; i++)
1264 printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n",
1265 i,
1266 pulseOffsets(i),
1267 i,
1268 resultAmplitudesVector(i));
1269 printf("chi2 = %f\n", chi2);
1270 #endif
1271
1272 outputGPU.chi2()[gch] = chi2;
1273 auto const idx_for_energy = std::abs(pulseOffsetsView.offsets()[0]);
1274 outputGPU.energy()[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
1275
1276 }
1277 }
1278 }
1279 };
1280
1281 }
1282
1283 void runMahiAsync(Queue& queue,
1284 IProductTypef01::ConstView const& f01HEDigis,
1285 IProductTypef5::ConstView const& f5HBDigis,
1286 IProductTypef3::ConstView const& f3HBDigis,
1287 OProductType::View outputGPU,
1288 HcalMahiConditionsPortableDevice::ConstView const& mahi,
1289 HcalSiPMCharacteristicsPortableDevice::ConstView const& sipmCharacteristics,
1290 HcalRecoParamWithPulseShapeDevice::ConstView const& recoParamsWithPS,
1291 HcalMahiPulseOffsetsSoA::ConstView const& mahiPulseOffsets,
1292 ConfigParameters const& configParameters) {
1293 auto const totalChannels =
1294 f01HEDigis.metadata().size() + f5HBDigis.metadata().size() + f3HBDigis.metadata().size();
1295
1296
1297
1298
1299
1300
1301
1302
1303 int constexpr windowSize = 8;
1304
1305
1306 uint32_t nchannels_per_block = configParameters.kprep1dChannelsPerBlock;
1307 auto const blocks_y = cms::alpakatools::divide_up_by(totalChannels, nchannels_per_block);
1308
1309 Vec2D const blocks_2d{blocks_y, 1u};
1310 Vec2D const threads_2d{nchannels_per_block, windowSize};
1311 auto workDivPrep2D = cms::alpakatools::make_workdiv<Acc2D>(blocks_2d, threads_2d);
1312
1313
1314 auto amplitudes = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1315 auto noiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1316 auto electronicNoiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1317 auto soiSamples = cms::alpakatools::make_device_buffer<int8_t[]>(queue, totalChannels * windowSize);
1318
1319 alpaka::exec<Acc2D>(queue,
1320 workDivPrep2D,
1321 mahi::Kernel_prep1d_sameNumberOfSamples{},
1322 outputGPU,
1323 f01HEDigis,
1324 f5HBDigis,
1325 f3HBDigis,
1326 mahi,
1327 sipmCharacteristics,
1328 recoParamsWithPS,
1329 configParameters.useEffectivePedestals,
1330 configParameters.sipmQTSShift,
1331 configParameters.sipmQNTStoSum,
1332 configParameters.firstSampleShift,
1333 configParameters.ts4Thresh,
1334 amplitudes.data(),
1335 noiseTerms.data(),
1336 electronicNoiseTerms.data(),
1337 soiSamples.data(),
1338 windowSize);
1339
1340
1341
1342 uint32_t const channelsPerBlock = 1024 / (windowSize * mahiPulseOffsets.metadata().size());
1343
1344
1345 auto const blocks_z = cms::alpakatools::divide_up_by(totalChannels, channelsPerBlock);
1346 Vec3D const blocks_3d{blocks_z, 1u, 1u};
1347 Vec3D const threads_3d{channelsPerBlock, mahiPulseOffsets.metadata().size(), windowSize};
1348
1349 auto workDivPrep3D = cms::alpakatools::make_workdiv<Acc3D>(blocks_3d, threads_3d);
1350
1351
1352 auto pulseMatrices = cms::alpakatools::make_device_buffer<float[]>(
1353 queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1354 auto pulseMatricesM = cms::alpakatools::make_device_buffer<float[]>(
1355 queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1356 auto pulseMatricesP = cms::alpakatools::make_device_buffer<float[]>(
1357 queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1358
1359 alpaka::exec<Acc3D>(queue,
1360 workDivPrep3D,
1361 mahi::Kernel_prep_pulseMatrices_sameNumberOfSamples{},
1362 pulseMatrices.data(),
1363 pulseMatricesM.data(),
1364 pulseMatricesP.data(),
1365 mahiPulseOffsets,
1366 amplitudes.data(),
1367 f01HEDigis,
1368 f5HBDigis,
1369 f3HBDigis,
1370 soiSamples.data(),
1371 mahi,
1372 recoParamsWithPS,
1373 configParameters.meanTime,
1374 configParameters.timeSigmaSiPM,
1375 configParameters.timeSigmaHPD,
1376 configParameters.applyTimeSlew,
1377 configParameters.tzeroTimeSlew,
1378 configParameters.slopeTimeSlew,
1379 configParameters.tmaxTimeSlew);
1380
1381 uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
1382 auto blocks_1d = cms::alpakatools::divide_up_by(totalChannels, threadsPerBlock);
1383
1384 auto workDivPrep1D = cms::alpakatools::make_workdiv<Acc1D>(blocks_1d, threadsPerBlock);
1385
1386 alpaka::exec<Acc1D>(queue,
1387 workDivPrep1D,
1388 mahi::Kernel_minimize<8, 8>{},
1389 outputGPU,
1390 amplitudes.data(),
1391 pulseMatrices.data(),
1392 pulseMatricesM.data(),
1393 pulseMatricesP.data(),
1394 mahiPulseOffsets,
1395 noiseTerms.data(),
1396 electronicNoiseTerms.data(),
1397 soiSamples.data(),
1398 mahi,
1399 configParameters.useEffectivePedestals,
1400 f01HEDigis,
1401 f5HBDigis,
1402 f3HBDigis);
1403 }
1404
1405 }
1406 }
1407
1408 namespace alpaka::trait {
1409 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
1410 using namespace ALPAKA_ACCELERATOR_NAMESPACE::hcal::reconstruction::mahi;
1411
1412
1413 template <>
1414 struct BlockSharedMemDynSizeBytes<Kernel_prep1d_sameNumberOfSamples, Acc2D> {
1415
1416 template <typename TVec, typename... TArgs>
1417 ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_prep1d_sameNumberOfSamples const&,
1418 TVec const& threadsPerBlock,
1419 TVec const& elemsPerThread,
1420 TArgs const&...) -> std::size_t {
1421
1422
1423
1424
1425
1426 std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] *
1427 ((2 * threadsPerBlock[1u] * elemsPerThread[1u] + 2) * sizeof(float) + sizeof(uint64_t));
1428 return bytes;
1429 }
1430 };
1431
1432
1433 template <int NSAMPLES, int NPULSES>
1434 struct BlockSharedMemDynSizeBytes<Kernel_minimize<NSAMPLES, NPULSES>, Acc1D> {
1435
1436 template <typename TVec, typename... TArgs>
1437 ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_minimize<NSAMPLES, NPULSES> const&,
1438 TVec const& threadsPerBlock,
1439 TVec const& elemsPerThread,
1440 TArgs const&...) -> std::size_t {
1441
1442
1443 std::size_t bytes =
1444 2 * threadsPerBlock[0u] * elemsPerThread[0u] * (calo::multifit::MapSymM<float, 8>::total * sizeof(float));
1445 return bytes;
1446 }
1447 };
1448
1449 }