Warning, /RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu is written in an unsupported language. File is not indexed.
0001 #include <Eigen/Dense>
0002
0003 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
0004 // needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
0005 #include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
0006 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0007
0008 // TODO reuse some of the HCAL constats from
0009 //#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
0010
0011 #include "SimpleAlgoGPU.h"
0012 #include "KernelHelpers.h"
0013
0014 #ifdef HCAL_MAHI_GPUDEBUG
0015 #define DETID_TO_DEBUG 1125647428
0016 #endif
0017
0018 namespace hcal {
0019 namespace mahi {
0020
0021 // TODO: provide constants from configuration
0022 // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py
0023 constexpr int nMaxItersMin = 50;
0024 constexpr int nMaxItersNNLS = 500;
0025 constexpr double nnlsThresh = 1e-11;
0026 constexpr float deltaChi2Threashold = 1e-3;
0027
0028 // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc
0029 __forceinline__ __device__ float get_raw_charge(double const charge,
0030 double const pedestal,
0031 float const* shrChargeMinusPedestal,
0032 float const* parLin1Values,
0033 float const* parLin2Values,
0034 float const* parLin3Values,
0035 int32_t const nsamplesForCompute,
0036 int32_t const soi,
0037 int const sipmQTSShift,
0038 int const sipmQNTStoSum,
0039 int const sipmType,
0040 float const fcByPE,
0041 bool const isqie11) {
0042 float rawCharge;
0043
0044 if (!isqie11)
0045 rawCharge = charge;
0046 else {
0047 auto const parLin1 = parLin1Values[sipmType - 1];
0048 auto const parLin2 = parLin2Values[sipmType - 1];
0049 auto const parLin3 = parLin3Values[sipmType - 1];
0050
0051 int const first = std::max(soi + sipmQTSShift, 0);
0052 int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
0053 float sipmq = 0.0f;
0054 for (auto ts = first; ts < last; ts++)
0055 sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts];
0056 auto const effectivePixelsFired = sipmq / fcByPE;
0057 auto const factor =
0058 hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
0059 rawCharge = (charge - pedestal) * factor + pedestal;
0060
0061 #ifdef HCAL_MAHI_GPUDEBUG
0062 printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
0063 #endif
0064 }
0065 return rawCharge;
0066 }
0067
0068 // Assume: same number of samples for HB and HE
0069 // TODO: add/validate restrict (will increase #registers in use by the kernel)
0070 __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes,
0071 float* noiseTerms,
0072 float* electronicNoiseTerms,
0073 float* outputEnergy,
0074 float* outputChi2,
0075 uint16_t const* dataf01HE,
0076 uint16_t const* dataf5HB,
0077 uint16_t const* dataf3HB,
0078 uint32_t const* idsf01HE,
0079 uint32_t const* idsf5HB,
0080 uint32_t const* idsf3HB,
0081 uint32_t const stridef01HE,
0082 uint32_t const stridef5HB,
0083 uint32_t const stridef3HB,
0084 uint32_t const nchannelsf01HE,
0085 uint32_t const nchannelsf5HB,
0086 uint8_t const* npresamplesf5HB,
0087 int8_t* soiSamples,
0088 float* method0Energy,
0089 float* method0Time,
0090 uint32_t* outputdid,
0091 uint32_t const nchannels,
0092 uint32_t const* qualityStatus,
0093 uint32_t const* recoParam1Values,
0094 uint32_t const* recoParam2Values,
0095 float const* qieCoderOffsets,
0096 float const* qieCoderSlopes,
0097 int const* qieTypes,
0098 float const* pedestalWidths,
0099 float const* effectivePedestalWidths,
0100 float const* pedestals,
0101 float const* effectivePedestals,
0102 bool const useEffectivePedestals,
0103 int const* sipmTypeValues,
0104 float const* fcByPEValues,
0105 float const* parLin1Values,
0106 float const* parLin2Values,
0107 float const* parLin3Values,
0108 float const* gainValues,
0109 float const* respCorrectionValues,
0110 int const maxDepthHB,
0111 int const maxDepthHE,
0112 int const maxPhiHE,
0113 int const firstHBRing,
0114 int const lastHBRing,
0115 int const firstHERing,
0116 int const lastHERing,
0117 int const nEtaHB,
0118 int const nEtaHE,
0119 int const sipmQTSShift,
0120 int const sipmQNTStoSum,
0121 int const firstSampleShift,
0122 uint32_t const offsetForHashes,
0123 float const ts4Thresh,
0124 int const startingSample) {
0125 // indices + runtime constants
0126 auto const sample = threadIdx.x + startingSample;
0127 auto const sampleWithinWindow = threadIdx.x;
0128 int32_t const nsamplesForCompute = blockDim.x;
0129 auto const lch = threadIdx.y;
0130 auto const gch = lch + blockDim.y * blockIdx.x;
0131 auto const nchannels_per_block = blockDim.y;
0132 auto const linearThPerBlock = threadIdx.x + threadIdx.y * blockDim.x;
0133
0134 // remove
0135 if (gch >= nchannels)
0136 return;
0137
0138 // initialize all output buffers
0139 if (sampleWithinWindow == 0) {
0140 outputdid[gch] = 0;
0141 method0Energy[gch] = 0;
0142 method0Time[gch] = 0;
0143 outputEnergy[gch] = 0;
0144 outputChi2[gch] = 0;
0145 soiSamples[gch] = -1;
0146 }
0147
0148 #ifdef HCAL_MAHI_GPUDEBUG
0149 #ifdef HCAL_MAHI_GPUDEBUG_SINGLECHANNEL
0150 if (gch > 0)
0151 return;
0152 #endif
0153 #endif
0154
0155 // configure shared mem
0156 extern __shared__ char smem[];
0157 float* shrEnergyM0PerTS = reinterpret_cast<float*>(smem);
0158 float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
0159 float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
0160 float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
0161 unsigned long long int* shrMethod0EnergySamplePair =
0162 reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
0163 if (sampleWithinWindow == 0) {
0164 shrMethod0EnergyAccum[lch] = 0;
0165 shrMethod0EnergySamplePair[lch] = __float_as_uint(std::numeric_limits<float>::min());
0166 shrEnergyM0TotalAccum[lch] = 0;
0167 }
0168
0169 // offset output
0170 auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
0171 auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
0172 auto* electronicNoiseTermsForChannel = electronicNoiseTerms + nsamplesForCompute * gch;
0173 auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
0174
0175 // get event input quantities
0176 auto const stride = gch < nchannelsf01HE ? stridef01HE : (gch < nchannelsf015 ? stridef5HB : stridef3HB);
0177 auto const nsamples = gch < nchannelsf01HE ? compute_nsamples<Flavor1>(stride)
0178 : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride)
0179 : compute_nsamples<Flavor3>(stride));
0180
0181 #ifdef HCAL_MAHI_GPUDEBUG
0182 assert(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
0183 #endif
0184
0185 auto const id = gch < nchannelsf01HE
0186 ? idsf01HE[gch]
0187 : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
0188 auto const did = HcalDetId{id};
0189
0190 auto const adc =
0191 gch < nchannelsf01HE
0192 ? adc_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
0193 : (gch < nchannelsf015 ? adc_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
0194 : adc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
0195 auto const capid =
0196 gch < nchannelsf01HE
0197 ? capid_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
0198 : (gch < nchannelsf015 ? capid_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
0199 : capid_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
0200
0201 #ifdef HCAL_MAHI_GPUDEBUG
0202 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0203 if (id != DETID_TO_DEBUG)
0204 return;
0205 #endif
0206 #endif
0207
0208 // compute hash for this did
0209 auto const hashedId =
0210 did.subdetId() == HcalBarrel
0211 ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
0212 : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
0213 offsetForHashes;
0214
0215 // conditions based on the hash
0216 // FIXME: remove hardcoded values
0217 auto const qieType = qieTypes[hashedId] > 0 ? 1 : 0; // 2 types at this point
0218 auto const* qieOffsets = qieCoderOffsets + hashedId * HcalQIECodersGPU::numValuesPerChannel;
0219 auto const* qieSlopes = qieCoderSlopes + hashedId * HcalQIECodersGPU::numValuesPerChannel;
0220 auto const* pedestalsForChannel = pedestals + hashedId * 4;
0221 auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
0222 ? effectivePedestalWidths + hashedId * 4
0223 : pedestalWidths + hashedId * 4;
0224
0225 auto const* gains = gainValues + hashedId * 4;
0226 auto const gain = gains[capid];
0227 auto const gain0 = gains[0];
0228 auto const respCorrection = respCorrectionValues[hashedId];
0229 auto const pedestal = pedestalsForChannel[capid];
0230 auto const pedestalWidth = pedestalWidthsForChannel[capid];
0231 // if needed, only use effective pedestals for f01
0232 auto const pedestalToUseForMethod0 = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
0233 ? effectivePedestals[hashedId * 4 + capid]
0234 : pedestal;
0235 auto const sipmType = sipmTypeValues[hashedId];
0236 auto const fcByPE = fcByPEValues[hashedId];
0237 auto const recoParam1 = recoParam1Values[hashedId];
0238 auto const recoParam2 = recoParam2Values[hashedId];
0239
0240 #ifdef HCAL_MAHI_GPUDEBUG
0241 printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
0242 qieType,
0243 qieOffsets[0],
0244 qieOffsets[1],
0245 qieSlopes[0],
0246 qieSlopes[1]);
0247 #endif
0248
0249 // compute charge
0250 auto const charge = hcal::reconstruction::compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
0251
0252 shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
0253 if (gch < nchannelsf01HE) {
0254 // NOTE: assume that soi is high only for a single guy!
0255 // which must be the case. cpu version does not check for that
0256 // if that is not the case, we will see that with cuda mmecheck
0257 auto const soibit = soibit_for_sample<Flavor1>(dataf01HE + stride * gch, sample);
0258 if (soibit == 1)
0259 soiSamples[gch] = sampleWithinWindow;
0260 } else if (gch >= nchannelsf015) {
0261 auto const soibit = soibit_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample);
0262 if (soibit == 1)
0263 soiSamples[gch] = sampleWithinWindow;
0264 }
0265 __syncthreads();
0266 int32_t const soi = gch < nchannelsf01HE
0267 ? soiSamples[gch]
0268 : (gch < nchannelsf015 ? npresamplesf5HB[gch - nchannelsf01HE] : soiSamples[gch]);
0269
0270 bool badSOI = (soi < 0 or soi >= nsamplesForCompute);
0271 if (badSOI and sampleWithinWindow == 0) {
0272 #ifdef GPU_DEBUG
0273 printf("Found HBHE channel %d with invalid SOI %d\n", gch, soi);
0274 #endif
0275 // mark the channel as bad
0276 outputChi2[gch] = -9999.f;
0277 }
0278
0279 //int32_t const soi = gch >= nchannelsf01HE
0280 // ? npresamplesf5HB[gch - nchannelsf01HE]
0281 // : soiSamples[gch];
0282 // this is here just to make things uniform...
0283 if (gch >= nchannelsf01HE && gch < nchannelsf015 && sampleWithinWindow == 0)
0284 soiSamples[gch] = npresamplesf5HB[gch - nchannelsf01HE];
0285
0286 //
0287 // compute various quantities (raw charge and tdc stuff)
0288 // NOTE: this branch will be divergent only for a single warp that
0289 // sits on the boundary when flavor 01 channels end and flavor 5 start
0290 //
0291 float const rawCharge = get_raw_charge(charge,
0292 pedestal,
0293 shrChargeMinusPedestal,
0294 parLin1Values,
0295 parLin2Values,
0296 parLin3Values,
0297 nsamplesForCompute,
0298 soi,
0299 sipmQTSShift,
0300 sipmQNTStoSum,
0301 sipmType,
0302 fcByPE,
0303 gch < nchannelsf01HE || gch >= nchannelsf015);
0304
0305 auto const dfc = hcal::reconstruction::compute_diff_charge_gain(
0306 qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015);
0307
0308 #ifdef COMPUTE_TDC_TIME
0309 float tdcTime;
0310 if (gch >= nchannelsf01HE && gch < nchannelsf015) {
0311 tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
0312 } else {
0313 if (gch < nchannelsf01HE)
0314 tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(dataf01HE + stride * gch, sample));
0315 else if (gch >= nchannelsf015)
0316 tdcTime =
0317 HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
0318 }
0319 #endif // COMPUTE_TDC_TIME
0320
0321 // compute method 0 quantities
0322 // TODO: need to apply containment
0323 // TODO: need to apply time slew
0324 // TODO: for < run 3, apply HBM legacy energy correction
0325 auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
0326 auto const startSampleTmp = soi + firstSampleShift;
0327 auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
0328 auto const endSample =
0329 startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
0330 // NOTE: gain is a small number < 10^-3, multiply it last
0331 auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0332 auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
0333 // store to shared mem
0334 shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
0335 atomicAdd(&shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0);
0336
0337 #ifdef HCAL_MAHI_GPUDEBUG
0338 printf(
0339 "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
0340 " charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
0341 " gain = %f respCorrection = %f energym0_per_ts = %f\n",
0342 id,
0343 sample,
0344 gch,
0345 hashedId,
0346 adc,
0347 capid,
0348 charge,
0349 rawCharge,
0350 dfc,
0351 pedestalToUseForMethod0,
0352 gain,
0353 respCorrection,
0354 energym0_per_ts);
0355 printf(
0356 "startSample = %d endSample = %d param1 = %u param2 = %u\n", startSample, endSample, recoParam1, recoParam2);
0357 #endif
0358
0359 if (sampleWithinWindow >= startSample && sampleWithinWindow < endSample) {
0360 atomicAdd(&shrMethod0EnergyAccum[lch], energym0_per_ts);
0361 // pack sample, energy as 64 bit value
0362 unsigned long long int old = shrMethod0EnergySamplePair[lch], assumed;
0363 unsigned long long int val =
0364 (static_cast<unsigned long long int>(sampleWithinWindow) << 32) + __float_as_uint(energym0_per_ts);
0365 do {
0366 assumed = old;
0367 // decode energy, sample values
0368 //int const current_sample = (assumed >> 32) & 0xffffffff;
0369 float const current_energy = __uint_as_float(assumed & 0xffffffff);
0370 if (energym0_per_ts > current_energy)
0371 old = atomicCAS(&shrMethod0EnergySamplePair[lch], assumed, val);
0372 else
0373 break;
0374 } while (assumed != old);
0375 }
0376 __syncthreads();
0377
0378 // NOTE: must take soi, as values for that thread are used...
0379 // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
0380 if (sampleWithinWindow == soi) {
0381 auto const method0_energy = shrMethod0EnergyAccum[lch];
0382 auto const val = shrMethod0EnergySamplePair[lch];
0383 int const max_sample = (val >> 32) & 0xffffffff;
0384 float const max_energy = __uint_as_float(val & 0xffffffff);
0385 float const max_energy_1 =
0386 max_sample < nsamplesForCompute - 1 ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1] : 0.f;
0387 float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
0388 auto const sum = max_energy + max_energy_1;
0389 // FIXME: for full comparison with cpu method 0 timing,
0390 // need to correct by slew
0391 // requires an accumulator -> more shared mem -> omit here unless
0392 // really needed
0393 float const time =
0394 max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
0395
0396 // store method0 quantities to global mem
0397 outputdid[gch] = id;
0398 method0Energy[gch] = method0_energy;
0399 method0Time[gch] = time;
0400
0401 #ifdef HCAL_MAHI_GPUDEBUG
0402 printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n", shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, ts4Thresh);
0403 #endif
0404
0405 // Channel quality check
0406 // https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecAlgos/plugins/HcalChannelPropertiesEP.cc#L107-L109
0407 // https://github.com/cms-sw/cmssw/blob/6d2f66057131baacc2fcbdd203588c41c885b42c/CondCore/HcalPlugins/plugins/HcalChannelQuality_PayloadInspector.cc#L30
0408 // const bool taggedBadByDb = severity.dropChannel(digistatus->getValue());
0409 // do not run MAHI if taggedBadByDb = true
0410
0411 auto const digiStatus_ = qualityStatus[hashedId];
0412 const bool taggedBadByDb = (digiStatus_ / 32770);
0413
0414 if (taggedBadByDb)
0415 outputChi2[gch] = -9999.f;
0416
0417 // check as in cpu version if mahi is not needed
0418 // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
0419 // are basically equal and generate -0.00000...
0420 // needs to be treated properly
0421 if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 > ts4Thresh)) {
0422 // do not need to run mahi minimization
0423 //outputEnergy[gch] = 0; energy already inited to 0
0424 outputChi2[gch] = -9999.f;
0425 }
0426
0427 #ifdef HCAL_MAHI_GPUDEBUG
0428 printf("method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
0429 method0_energy,
0430 max_sample,
0431 max_energy,
0432 time);
0433 #endif
0434 }
0435
0436 //
0437 // preparations for mahi fit
0438 //
0439 auto const amplitude = rawCharge - pedestalToUseForMethod0;
0440 auto const noiseADC = (1. / std::sqrt(12)) * dfc;
0441 auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
0442 auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
0443
0444 #ifdef HCAL_MAHI_GPUDEBUG
0445 printf(
0446 "charge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f noisPhoto(%d) = "
0447 "%f\n",
0448 sample,
0449 rawCharge,
0450 sample,
0451 pedestalToUseForMethod0,
0452 sample,
0453 dfc,
0454 sample,
0455 pedestalWidth,
0456 sample,
0457 noiseADC,
0458 sample,
0459 noisePhotoSq);
0460 #endif
0461
0462 // store to global memory
0463 amplitudesForChannel[sampleWithinWindow] = amplitude;
0464 noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
0465 electronicNoiseTermsForChannel[sampleWithinWindow] = pedestalWidth;
0466 }
0467
0468 // TODO: need to add an array of offsets for pulses (a la activeBXs...)
0469 // Assume for now 8 pulses
0470 __global__ void kernel_prep_pulseMatrices_sameNumberOfSamples(float* pulseMatrices,
0471 float* pulseMatricesM,
0472 float* pulseMatricesP,
0473 int const* pulseOffsets,
0474 float const* amplitudes,
0475 uint32_t const* idsf01HE,
0476 uint32_t const* idsf5HB,
0477 uint32_t const* idsf3HB,
0478 uint32_t const nchannelsf01HE,
0479 uint32_t const nchannelsf5HB,
0480 uint32_t const nchannelsTotal,
0481 int8_t const* soiSamples,
0482 uint32_t const* recoPulseShapeIds,
0483 float const* acc25nsVecValues,
0484 float const* diff25nsItvlVecValues,
0485 float const* accVarLenIdxMinusOneVecValues,
0486 float const* diffVarItvlIdxMinusOneVecValues,
0487 float const* accVarLenIdxZeroVecValues,
0488 float const* diffVarItvlIdxZeroVecValues,
0489 float const meanTime,
0490 float const timeSigmaSiPM,
0491 float const timeSigmaHPD,
0492 int const maxDepthHB,
0493 int const maxDepthHE,
0494 int const maxPhiHE,
0495 int const firstHBRing,
0496 int const lastHBRing,
0497 int const firstHERing,
0498 int const lastHERing,
0499 int const nEtaHB,
0500 int const nEtaHE,
0501 uint32_t const offsetForHashes,
0502 bool const applyTimeSlew,
0503 float const tzeroTimeSlew,
0504 float const slopeTimeSlew,
0505 float const tmaxTimeSlew) {
0506 // indices
0507 auto const ipulse = threadIdx.y;
0508 auto const npulses = blockDim.y;
0509 auto const sample = threadIdx.x;
0510 auto const nsamples = blockDim.x;
0511 auto const lch = threadIdx.z;
0512 auto const gch = lch + blockIdx.x * blockDim.z;
0513 auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
0514
0515 if (gch >= nchannelsTotal)
0516 return;
0517
0518 // conditions
0519 auto const id = gch < nchannelsf01HE
0520 ? idsf01HE[gch]
0521 : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
0522 //auto const id = gch >= nchannelsf01HE
0523 // ? idsf5HB[gch - nchannelsf01HE]
0524 // : idsf01HE[gch];
0525 auto const deltaT = gch >= nchannelsf01HE && gch < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
0526 auto const did = DetId{id};
0527 auto const hashedId =
0528 did.subdetId() == HcalBarrel
0529 ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
0530 : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
0531 offsetForHashes;
0532 auto const recoPulseShapeId = recoPulseShapeIds[hashedId];
0533 auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
0534 auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
0535 auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0536 auto const* diffVarItvlIdxMinusOneVec =
0537 diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0538 auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0539 auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
0540
0541 // offset output arrays
0542 auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch;
0543 auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * gch;
0544 auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * gch;
0545
0546 // amplitude per ipulse
0547 int const soi = soiSamples[gch];
0548 int const pulseOffset = pulseOffsets[ipulse];
0549 auto const amplitude = amplitudes[gch * nsamples + pulseOffset + soi];
0550
0551 #ifdef HCAL_MAHI_GPUDEBUG
0552 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0553 if (id != DETID_TO_DEBUG)
0554 return;
0555 #endif
0556 #endif
0557
0558 #ifdef HCAL_MAHI_GPUDEBUG
0559 if (sample == 0 && ipulse == 0) {
0560 for (int i = 0; i < 8; i++)
0561 printf("amplitude(%d) = %f\n", i, amplitudes[gch * nsamples + i]);
0562 printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
0563 for (int i = 0; i < 256; i++) {
0564 printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
0565 }
0566 printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
0567 for (int i = 0; i < 25; i++) {
0568 printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
0569 i,
0570 accVarLenIdxZeroVec[i],
0571 i,
0572 accVarLenIdxMinusOneVec[i]);
0573 }
0574 printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
0575 for (int i = 0; i < 25; i++) {
0576 printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
0577 i,
0578 diffVarItvlIdxZeroVec[i],
0579 i,
0580 diffVarItvlIdxMinusOneVec[i]);
0581 }
0582 }
0583 #endif
0584
0585 auto t0 = meanTime;
0586 if (applyTimeSlew) {
0587 if (amplitude <= 1.0f)
0588 t0 += hcal::reconstruction::compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0589 else
0590 t0 += hcal::reconstruction::compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
0591 }
0592 auto const t0m = -deltaT + t0;
0593 auto const t0p = deltaT + t0;
0594
0595 #ifdef HCAL_MAHI_GPUDEBUG
0596 if (sample == 0 && ipulse == 0) {
0597 printf("time values: %f %f %f\n", t0, t0m, t0p);
0598 }
0599
0600 if (sample == 0 && ipulse == 0) {
0601 for (int i = 0; i < hcal::constants::maxSamples; i++) {
0602 auto const value = hcal::reconstruction::compute_pulse_shape_value(t0,
0603 i,
0604 0,
0605 acc25nsVec,
0606 diff25nsItvlVec,
0607 accVarLenIdxMinusOneVec,
0608 diffVarItvlIdxMinusOneVec,
0609 accVarLenIdxZeroVec,
0610 diffVarItvlIdxZeroVec);
0611 printf("pulse(%d) = %f\n", i, value);
0612 }
0613 printf("\n");
0614 for (int i = 0; i < hcal::constants::maxSamples; i++) {
0615 auto const value = hcal::reconstruction::compute_pulse_shape_value(t0p,
0616 i,
0617 0,
0618 acc25nsVec,
0619 diff25nsItvlVec,
0620 accVarLenIdxMinusOneVec,
0621 diffVarItvlIdxMinusOneVec,
0622 accVarLenIdxZeroVec,
0623 diffVarItvlIdxZeroVec);
0624 printf("pulseP(%d) = %f\n", i, value);
0625 }
0626 printf("\n");
0627 for (int i = 0; i < hcal::constants::maxSamples; i++) {
0628 auto const value = hcal::reconstruction::compute_pulse_shape_value(t0m,
0629 i,
0630 0,
0631 acc25nsVec,
0632 diff25nsItvlVec,
0633 accVarLenIdxMinusOneVec,
0634 diffVarItvlIdxMinusOneVec,
0635 accVarLenIdxZeroVec,
0636 diffVarItvlIdxZeroVec);
0637 printf("pulseM(%d) = %f\n", i, value);
0638 }
0639 }
0640 #endif
0641
0642 // FIXME: shift should be treated properly,
0643 // here assume 8 time slices and 8 samples
0644 auto const shift = 4 - soi; // as in cpu version!
0645
0646 // auto const offset = ipulse - soi;
0647 // auto const idx = sample - offset;
0648 int32_t const idx = sample - pulseOffset;
0649 auto const value = idx >= 0 && idx < nsamples
0650 ? hcal::reconstruction::compute_pulse_shape_value(t0,
0651 idx,
0652 shift,
0653 acc25nsVec,
0654 diff25nsItvlVec,
0655 accVarLenIdxMinusOneVec,
0656 diffVarItvlIdxMinusOneVec,
0657 accVarLenIdxZeroVec,
0658 diffVarItvlIdxZeroVec)
0659 : 0;
0660 auto const value_t0m = idx >= 0 && idx < nsamples
0661 ? hcal::reconstruction::compute_pulse_shape_value(t0m,
0662 idx,
0663 shift,
0664 acc25nsVec,
0665 diff25nsItvlVec,
0666 accVarLenIdxMinusOneVec,
0667 diffVarItvlIdxMinusOneVec,
0668 accVarLenIdxZeroVec,
0669 diffVarItvlIdxZeroVec)
0670 : 0;
0671 auto const value_t0p = idx >= 0 && idx < nsamples
0672 ? hcal::reconstruction::compute_pulse_shape_value(t0p,
0673 idx,
0674 shift,
0675 acc25nsVec,
0676 diff25nsItvlVec,
0677 accVarLenIdxMinusOneVec,
0678 diffVarItvlIdxMinusOneVec,
0679 accVarLenIdxZeroVec,
0680 diffVarItvlIdxZeroVec)
0681 : 0;
0682
0683 // store to global
0684 if (amplitude > 0.f) {
0685 pulseMatrix[ipulse * nsamples + sample] = value;
0686 pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
0687 pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
0688 } else {
0689 pulseMatrix[ipulse * nsamples + sample] = 0.f;
0690 pulseMatrixM[ipulse * nsamples + sample] = 0.f;
0691 pulseMatrixP[ipulse * nsamples + sample] = 0.f;
0692 }
0693 }
0694
0695 template <int NSAMPLES, int NPULSES>
0696 __forceinline__ __device__ void update_covariance(
0697 calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
0698 calo::multifit::MapSymM<float, NSAMPLES>& covarianceMatrix,
0699 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
0700 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
0701 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
0702 CMS_UNROLL_LOOP
0703 for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
0704 auto const resultAmplitude = resultAmplitudesVector(ipulse);
0705 if (resultAmplitude == 0)
0706 continue;
0707
0708 #ifdef HCAL_MAHI_GPUDEBUG
0709 printf("pulse cov array for ibx = %d\n", ipulse);
0710 #endif
0711
0712 // preload a column
0713 float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
0714 CMS_UNROLL_LOOP
0715 for (int counter = 0; counter < NSAMPLES; counter++) {
0716 pmcol[counter] = __ldg(&pulseMatrix.coeffRef(counter, ipulse));
0717 pmpcol[counter] = __ldg(&pulseMatrixP.coeffRef(counter, ipulse));
0718 pmmcol[counter] = __ldg(&pulseMatrixM.coeffRef(counter, ipulse));
0719 }
0720
0721 auto const ampl2 = resultAmplitude * resultAmplitude;
0722 CMS_UNROLL_LOOP
0723 for (int col = 0; col < NSAMPLES; col++) {
0724 auto const valueP_col = pmpcol[col];
0725 auto const valueM_col = pmmcol[col];
0726 auto const value_col = pmcol[col];
0727 auto const tmppcol = valueP_col - value_col;
0728 auto const tmpmcol = valueM_col - value_col;
0729
0730 // diagonal
0731 auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
0732 covarianceMatrix(col, col) += ampl2 * tmp_value;
0733
0734 // FIXME: understand if this actually gets unrolled
0735 CMS_UNROLL_LOOP
0736 for (int row = col + 1; row < NSAMPLES; row++) {
0737 float const valueP_row = pmpcol[row]; //pulseMatrixP(j, ipulseReal);
0738 float const value_row = pmcol[row]; //pulseMatrix(j, ipulseReal);
0739 float const valueM_row = pmmcol[row]; //pulseMatrixM(j, ipulseReal);
0740
0741 float tmpprow = valueP_row - value_row;
0742 float tmpmrow = valueM_row - value_row;
0743
0744 auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
0745
0746 covarianceMatrix(row, col) += ampl2 * covValue;
0747 }
0748 }
0749 }
0750 }
0751
0752 template <int NSAMPLES, int NPULSES>
0753 __global__ void kernel_minimize(float* outputEnergy,
0754 float* outputChi2,
0755 float const* __restrict__ inputAmplitudes,
0756 float const* __restrict__ pulseMatrices,
0757 float const* __restrict__ pulseMatricesM,
0758 float const* __restrict__ pulseMatricesP,
0759 int const* __restrict__ pulseOffsetValues,
0760 float const* __restrict__ noiseTerms,
0761 float const* __restrict__ electronicNoiseTerms,
0762 int8_t const* __restrict__ soiSamples,
0763 float const* __restrict__ noiseCorrelationValues,
0764 float const* __restrict__ pedestalWidths,
0765 float const* __restrict__ effectivePedestalWidths,
0766 bool const useEffectivePedestals,
0767 uint32_t const* __restrict__ idsf01HE,
0768 uint32_t const* __restrict__ idsf5HB,
0769 uint32_t const* __restrict__ idsf3HB,
0770 float const* __restrict__ gainValues,
0771 float const* __restrict__ respCorrectionValues,
0772 uint32_t const nchannelsf01HE,
0773 uint32_t const nchannelsf5HB,
0774 uint32_t const nchannelsTotal,
0775 uint32_t const offsetForHashes,
0776 int const maxDepthHB,
0777 int const maxDepthHE,
0778 int const maxPhiHE,
0779 int const firstHBRing,
0780 int const lastHBRing,
0781 int const firstHERing,
0782 int const lastHERing,
0783 int const nEtaHB,
0784 int const nEtaHE) {
0785 // can be relaxed if needed - minor updates are needed in that case!
0786 static_assert(NPULSES == NSAMPLES);
0787
0788 // indices
0789 auto const gch = threadIdx.x + blockIdx.x * blockDim.x;
0790 auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
0791 if (gch >= nchannelsTotal)
0792 return;
0793
0794 // if chi2 is set to -9999 do not run minimization
0795 if (outputChi2[gch] == -9999.f)
0796 return;
0797
0798 // configure shared mem
0799 extern __shared__ char shrmem[];
0800 float* shrMatrixLFnnlsStorage =
0801 reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * threadIdx.x;
0802 float* shrAtAStorage = reinterpret_cast<float*>(shrmem) +
0803 calo::multifit::MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
0804
0805 // conditions for pedestal widths
0806 auto const id = gch < nchannelsf01HE
0807 ? idsf01HE[gch]
0808 : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
0809 auto const did = DetId{id};
0810 auto const hashedId =
0811 did.subdetId() == HcalBarrel
0812 ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
0813 : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
0814 offsetForHashes;
0815
0816 auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
0817 ? effectivePedestalWidths + hashedId * 4
0818 : pedestalWidths + hashedId * 4;
0819 auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
0820 pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
0821 pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
0822 pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
0823
0824 auto const* gains = gainValues + hashedId * 4;
0825 // FIXME on cpu ts 0 capid was used - does it make any difference
0826 auto const gain = gains[0];
0827 auto const respCorrection = respCorrectionValues[hashedId];
0828
0829 auto const noisecorr = noiseCorrelationValues[hashedId];
0830
0831 #ifdef HCAL_MAHI_GPUDEBUG
0832 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
0833 if (id != DETID_TO_DEBUG)
0834 return;
0835 #endif
0836 #endif
0837
0838 /*
0839 // TODO: provide this properly
0840 int const soi = soiSamples[gch];
0841 */
0842 calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
0843 CMS_UNROLL_LOOP
0844 for (int i = 0; i < NPULSES; ++i)
0845 pulseOffsets(i) = i;
0846 // pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0];
0847
0848 // output amplitudes/weights
0849 calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector = calo::multifit::ColumnVector<NPULSES>::Zero();
0850
0851 // map views
0852 Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES};
0853 Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
0854 Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseElectronicView{electronicNoiseTerms +
0855 gch * NSAMPLES};
0856 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{pulseMatricesM +
0857 gch * NSAMPLES * NPULSES};
0858 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{pulseMatricesP +
0859 gch * NSAMPLES * NPULSES};
0860 Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices +
0861 gch * NSAMPLES * NPULSES};
0862
0863 #ifdef HCAL_MAHI_GPUDEBUG
0864 for (int i = 0; i < NSAMPLES; i++)
0865 printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
0866 for (int i = 0; i < NSAMPLES; i++) {
0867 for (int j = 0; j < NPULSES; j++)
0868 printf("%f ", glbPulseMatrixView(i, j));
0869 printf("\n");
0870 }
0871 printf("\n");
0872 for (int i = 0; i < NSAMPLES; i++) {
0873 for (int j = 0; j < NPULSES; j++)
0874 printf("%f ", glbPulseMatrixMView(i, j));
0875 printf("\n");
0876 }
0877 printf("\n");
0878 for (int i = 0; i < NSAMPLES; i++) {
0879 for (int j = 0; j < NPULSES; j++)
0880 printf("%f ", glbPulseMatrixPView(i, j));
0881 printf("\n");
0882 }
0883 #endif
0884
0885 int npassive = 0;
0886 float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
0887 for (int iter = 1; iter < nMaxItersMin; iter++) {
0888 //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
0889 // NOTE: only works when NSAMPLES == NPULSES
0890 // if does not hold -> slightly rearrange shared mem to still reuse
0891 // shared memory
0892 float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
0893 calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
0894 CMS_UNROLL_LOOP
0895 for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
0896 covarianceMatrixStorage[counter] = (noisecorr != 0.f) ? 0.f : averagePedestalWidth2;
0897 CMS_UNROLL_LOOP
0898 for (unsigned int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++) {
0899 covarianceMatrix(counter, counter) += noiseTermsView.coeffRef(counter);
0900 if (counter != 0)
0901 covarianceMatrix(counter, counter - 1) += noisecorr * __ldg(&noiseElectronicView.coeffRef(counter - 1)) *
0902 __ldg(&noiseElectronicView.coeffRef(counter));
0903 }
0904
0905 // update covariance matrix
0906 update_covariance(
0907 resultAmplitudesVector, covarianceMatrix, glbPulseMatrixView, glbPulseMatrixMView, glbPulseMatrixPView);
0908
0909 #ifdef HCAL_MAHI_GPUDEBUG
0910 printf("covariance matrix\n");
0911 for (int i = 0; i < 8; i++) {
0912 for (int j = 0; j < 8; j++)
0913 printf("%f ", covarianceMatrix(i, j));
0914 printf("\n");
0915 }
0916 #endif
0917
0918 // compute Cholesky Decomposition L matrix
0919 //matrixDecomposition.compute(covarianceMatrix);
0920 //auto const& matrixL = matrixDecomposition.matrixL();
0921 float matrixLStorage[calo::multifit::MapSymM<float, NSAMPLES>::total];
0922 calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
0923 calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
0924
0925 //
0926 // replace eigen
0927 //
0928 //auto const& A = matrixDecomposition
0929 // .matrixL()
0930 // .solve(pulseMatrixView);
0931 calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
0932 calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
0933
0934 //
0935 // remove eigen
0936 //
0937 //auto const& b = matrixL
0938 // .solve(inputAmplitudesView);
0939 //
0940 float reg_b[NSAMPLES];
0941 calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
0942
0943 // TODO: we do not really need to change these matrcies
0944 // will be fixed in the optimized version
0945 //ColMajorMatrix<NPULSES, NPULSES> AtA = A.transpose() * A;
0946 //ColumnVector<NPULSES> Atb = A.transpose() * b;
0947 //ColMajorMatrix<NPULSES, NPULSES> AtA;
0948 //float AtAStorage[MapSymM<float, NPULSES>::total];
0949 calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
0950 calo::multifit::ColumnVector<NPULSES> Atb;
0951 CMS_UNROLL_LOOP
0952 for (int icol = 0; icol < NPULSES; icol++) {
0953 float reg_ai[NSAMPLES];
0954
0955 // load column icol
0956 CMS_UNROLL_LOOP
0957 for (int counter = 0; counter < NSAMPLES; counter++)
0958 reg_ai[counter] = A(counter, icol);
0959
0960 // compute diagonal
0961 float sum = 0.f;
0962 CMS_UNROLL_LOOP
0963 for (int counter = 0; counter < NSAMPLES; counter++)
0964 sum += reg_ai[counter] * reg_ai[counter];
0965
0966 // store
0967 AtA(icol, icol) = sum;
0968
0969 // go thru the other columns
0970 CMS_UNROLL_LOOP
0971 for (int j = icol + 1; j < NPULSES; j++) {
0972 // load column j
0973 float reg_aj[NSAMPLES];
0974 CMS_UNROLL_LOOP
0975 for (int counter = 0; counter < NSAMPLES; counter++)
0976 reg_aj[counter] = A(counter, j);
0977
0978 // accum
0979 float sum = 0.f;
0980 CMS_UNROLL_LOOP
0981 for (int counter = 0; counter < NSAMPLES; counter++)
0982 sum += reg_aj[counter] * reg_ai[counter];
0983
0984 // store
0985 //AtA(icol, j) = sum;
0986 AtA(j, icol) = sum;
0987 }
0988
0989 // Atb accum
0990 float sum_atb = 0;
0991 CMS_UNROLL_LOOP
0992 for (int counter = 0; counter < NSAMPLES; counter++)
0993 sum_atb += reg_ai[counter] * reg_b[counter];
0994
0995 // store atb
0996 Atb(icol) = sum_atb;
0997 }
0998
0999 #ifdef HCAL_MAHI_GPUDEBUG
1000 printf("AtA\n");
1001 for (int i = 0; i < 8; i++) {
1002 for (int j = 0; j < 8; j++)
1003 printf("%f ", AtA(i, j));
1004 printf("\n");
1005 }
1006 printf("Atb\n");
1007 for (int i = 0; i < 8; i++)
1008 printf("%f ", Atb(i));
1009 printf("\n");
1010 printf("result Amplitudes before nnls\n");
1011 for (int i = 0; i < 8; i++)
1012 printf("%f ", resultAmplitudesVector(i));
1013 printf("\n");
1014 #endif
1015
1016 // for fnnls
1017 calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
1018
1019 // run fast nnls
1020 calo::multifit::fnnls(
1021 AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, nnlsThresh, nMaxItersNNLS, 10, 10);
1022
1023 #ifdef HCAL_MAHI_GPUDEBUG
1024 printf("result Amplitudes\n");
1025 for (int i = 0; i < 8; i++)
1026 printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
1027 #endif
1028
1029 calo::multifit::calculateChiSq(matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
1030
1031 auto const deltaChi2 = std::abs(chi2 - previous_chi2);
1032 if (chi2 == chi2_2itersback && chi2 < previous_chi2)
1033 break;
1034
1035 // update
1036 chi2_2itersback = previous_chi2;
1037 previous_chi2 = chi2;
1038
1039 // exit condition
1040 if (deltaChi2 < deltaChi2Threashold)
1041 break;
1042 }
1043
1044 #ifdef HCAL_MAHI_GPUDEBUG
1045 for (int i = 0; i < NPULSES; i++)
1046 printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n", i, pulseOffsets(i), i, resultAmplitudesVector(i));
1047 printf("chi2 = %f\n", chi2);
1048 #endif
1049
1050 outputChi2[gch] = chi2;
1051 auto const idx_for_energy = std::abs(pulseOffsetValues[0]);
1052 outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
1053 /*
1054 CMS_UNROLL_LOOP
1055 for (int i=0; i<NPULSES; i++)
1056 if (pulseOffsets[i] == soi)
1057 // NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues
1058 outputEnergy[gch] = (gain*resultAmplitudesVector(i))*respCorrection;
1059 */
1060 }
1061
1062 } // namespace mahi
1063 } // namespace hcal
1064
1065 namespace hcal {
1066 namespace reconstruction {
1067
1068 void entryPoint(InputDataGPU const& inputGPU,
1069 OutputDataGPU& outputGPU,
1070 ConditionsProducts const& conditions,
1071 ScratchDataGPU& scratch,
1072 ConfigParameters const& configParameters,
1073 cudaStream_t cudaStream) {
1074 auto const totalChannels = inputGPU.f01HEDigis.size + inputGPU.f5HBDigis.size + inputGPU.f3HBDigis.size;
1075
1076 // FIXME: the number of channels in output might change given that some channesl might be filtered out
1077
1078 // do not run when there are no rechits (e.g. if HCAL is not being read),
1079 // but do set the size of the output collection to 0
1080 outputGPU.recHits.size = totalChannels;
1081 if (totalChannels == 0) {
1082 return;
1083 }
1084
1085 // TODO: this can be lifted by implementing a separate kernel
1086 // similar to the default one, but properly handling the diff in #sample
1087 // or modifying existing one
1088 auto const f01nsamples = compute_nsamples<Flavor1>(inputGPU.f01HEDigis.stride);
1089 auto const f5nsamples = compute_nsamples<Flavor5>(inputGPU.f5HBDigis.stride);
1090 auto const f3nsamples = compute_nsamples<Flavor3>(inputGPU.f3HBDigis.stride);
1091 int constexpr windowSize = 8;
1092 int const startingSample = f01nsamples - windowSize;
1093 assert(startingSample == 0 || startingSample == 2);
1094 if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride > 0)
1095 assert(f01nsamples == f5nsamples);
1096 if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f3HBDigis.stride > 0)
1097 assert(f01nsamples == f3nsamples);
1098
1099 dim3 threadsPerBlock{windowSize, configParameters.kprep1dChannelsPerBlock};
1100 int blocks = static_cast<uint32_t>(threadsPerBlock.y) > totalChannels
1101 ? 1
1102 : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y;
1103 int nbytesShared =
1104 ((2 * windowSize + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
1105 hcal::mahi::kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
1106 scratch.amplitudes.get(),
1107 scratch.noiseTerms.get(),
1108 scratch.electronicNoiseTerms.get(),
1109 outputGPU.recHits.energy.get(),
1110 outputGPU.recHits.chi2.get(),
1111 inputGPU.f01HEDigis.data.get(),
1112 inputGPU.f5HBDigis.data.get(),
1113 inputGPU.f3HBDigis.data.get(),
1114 inputGPU.f01HEDigis.ids.get(),
1115 inputGPU.f5HBDigis.ids.get(),
1116 inputGPU.f3HBDigis.ids.get(),
1117 inputGPU.f01HEDigis.stride,
1118 inputGPU.f5HBDigis.stride,
1119 inputGPU.f3HBDigis.stride,
1120 inputGPU.f01HEDigis.size,
1121 inputGPU.f5HBDigis.size,
1122 inputGPU.f5HBDigis.npresamples.get(),
1123 scratch.soiSamples.get(),
1124 outputGPU.recHits.energyM0.get(),
1125 outputGPU.recHits.timeM0.get(),
1126 outputGPU.recHits.did.get(),
1127 totalChannels,
1128 conditions.channelQuality.status,
1129 conditions.recoParams.param1,
1130 conditions.recoParams.param2,
1131 conditions.qieCoders.offsets,
1132 conditions.qieCoders.slopes,
1133 conditions.qieTypes.values,
1134 conditions.pedestalWidths.values,
1135 conditions.effectivePedestalWidths.values,
1136 conditions.pedestals.values,
1137 conditions.convertedEffectivePedestals ? conditions.convertedEffectivePedestals->values
1138 : conditions.pedestals.values,
1139 configParameters.useEffectivePedestals,
1140 conditions.sipmParameters.type,
1141 conditions.sipmParameters.fcByPE,
1142 conditions.sipmCharacteristics.parLin1,
1143 conditions.sipmCharacteristics.parLin2,
1144 conditions.sipmCharacteristics.parLin3,
1145 conditions.gains.values,
1146 conditions.respCorrs.values,
1147 conditions.topology->maxDepthHB(),
1148 conditions.topology->maxDepthHE(),
1149 conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
1150 : hcal::reconstruction::IPHI_MAX,
1151 conditions.topology->firstHBRing(),
1152 conditions.topology->lastHBRing(),
1153 conditions.topology->firstHERing(),
1154 conditions.topology->lastHERing(),
1155 conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
1156 conditions.topology->firstHERing() > conditions.topology->lastHERing()
1157 ? 0
1158 : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
1159 configParameters.sipmQTSShift,
1160 configParameters.sipmQNTStoSum,
1161 configParameters.firstSampleShift,
1162 conditions.offsetForHashes,
1163 configParameters.ts4Thresh,
1164 startingSample);
1165 cudaCheck(cudaGetLastError());
1166
1167 // 1024 is the max threads per block for gtx1080
1168 // FIXME: take this from cuda service or something like that
1169 uint32_t const channelsPerBlock = 1024 / (windowSize * conditions.pulseOffsetsHost.size());
1170 dim3 threadsPerBlock2{windowSize, static_cast<uint32_t>(conditions.pulseOffsetsHost.size()), channelsPerBlock};
1171 int blocks2 =
1172 threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z;
1173
1174 #ifdef HCAL_MAHI_CPUDEBUG
1175 std::cout << "threads: " << threadsPerBlock2.x << " " << threadsPerBlock2.y << " " << threadsPerBlock2.z
1176 << std::endl;
1177 std::cout << "blocks: " << blocks2 << std::endl;
1178 #endif
1179
1180 hcal::mahi::kernel_prep_pulseMatrices_sameNumberOfSamples<<<blocks2, threadsPerBlock2, 0, cudaStream>>>(
1181 scratch.pulseMatrices.get(),
1182 scratch.pulseMatricesM.get(),
1183 scratch.pulseMatricesP.get(),
1184 conditions.pulseOffsets.values,
1185 scratch.amplitudes.get(),
1186 inputGPU.f01HEDigis.ids.get(),
1187 inputGPU.f5HBDigis.ids.get(),
1188 inputGPU.f3HBDigis.ids.get(),
1189 inputGPU.f01HEDigis.size,
1190 inputGPU.f5HBDigis.size,
1191 totalChannels,
1192 scratch.soiSamples.get(),
1193 conditions.recoParams.ids,
1194 conditions.recoParams.acc25nsVec,
1195 conditions.recoParams.diff25nsItvlVec,
1196 conditions.recoParams.accVarLenIdxMinusOneVec,
1197 conditions.recoParams.diffVarItvlIdxMinusOneVec,
1198 conditions.recoParams.accVarLenIdxZEROVec,
1199 conditions.recoParams.diffVarItvlIdxZEROVec,
1200 configParameters.meanTime,
1201 configParameters.timeSigmaSiPM,
1202 configParameters.timeSigmaHPD,
1203 conditions.topology->maxDepthHB(),
1204 conditions.topology->maxDepthHE(),
1205 conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
1206 : hcal::reconstruction::IPHI_MAX,
1207 conditions.topology->firstHBRing(),
1208 conditions.topology->lastHBRing(),
1209 conditions.topology->firstHERing(),
1210 conditions.topology->lastHERing(),
1211 conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
1212 conditions.topology->firstHERing() > conditions.topology->lastHERing()
1213 ? 0
1214 : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
1215 conditions.offsetForHashes,
1216 configParameters.applyTimeSlew,
1217 configParameters.tzeroTimeSlew,
1218 configParameters.slopeTimeSlew,
1219 configParameters.tmaxTimeSlew);
1220 cudaCheck(cudaGetLastError());
1221
1222 // number of samples is checked in above assert
1223 if (conditions.pulseOffsetsHost.size() == 8u) {
1224 // FIXME: provide constants from configuration
1225 uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
1226 uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
1227 auto const nbytesShared = 2 * threadsPerBlock * calo::multifit::MapSymM<float, 8>::total * sizeof(float);
1228 hcal::mahi::kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
1229 outputGPU.recHits.energy.get(),
1230 outputGPU.recHits.chi2.get(),
1231 scratch.amplitudes.get(),
1232 scratch.pulseMatrices.get(),
1233 scratch.pulseMatricesM.get(),
1234 scratch.pulseMatricesP.get(),
1235 conditions.pulseOffsets.values,
1236 scratch.noiseTerms.get(),
1237 scratch.electronicNoiseTerms.get(),
1238 scratch.soiSamples.get(),
1239 conditions.sipmParameters.auxi2,
1240 conditions.pedestalWidths.values,
1241 conditions.effectivePedestalWidths.values,
1242 configParameters.useEffectivePedestals,
1243 inputGPU.f01HEDigis.ids.get(),
1244 inputGPU.f5HBDigis.ids.get(),
1245 inputGPU.f3HBDigis.ids.get(),
1246 conditions.gains.values,
1247 conditions.respCorrs.values,
1248 inputGPU.f01HEDigis.size,
1249 inputGPU.f5HBDigis.size,
1250 totalChannels,
1251 conditions.offsetForHashes,
1252 conditions.topology->maxDepthHB(),
1253 conditions.topology->maxDepthHE(),
1254 conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
1255 : hcal::reconstruction::IPHI_MAX,
1256 conditions.topology->firstHBRing(),
1257 conditions.topology->lastHBRing(),
1258 conditions.topology->firstHERing(),
1259 conditions.topology->lastHERing(),
1260 conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
1261 conditions.topology->firstHERing() > conditions.topology->lastHERing()
1262 ? 0
1263 : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1));
1264 } else {
1265 throw cms::Exception("Invalid MahiGPU configuration")
1266 << "Currently support only 8 pulses and 8 time samples and provided: " << f01nsamples << " samples and "
1267 << conditions.pulseOffsetsHost.size() << " pulses" << std::endl;
1268 }
1269 }
1270
1271 } // namespace reconstruction
1272 } // namespace hcal