Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-10-04 05:19:00

0001 #include <alpaka/alpaka.hpp>
0002 
0003 #include "FWCore/Utilities/interface/bit_cast.h"
0004 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0005 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0006 #include "HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h"
0007 
0008 #include "DataFormats/ParticleFlowReco/interface/PFLayer.h"
0009 #include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterSoAProducerKernel.h"
0010 #include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterECLCC.h"
0011 
0012 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0013 
0014   using namespace cms::alpakatools;
0015 
0016   using namespace reco::pfClustering;
0017 
0018   static constexpr int threadsPerBlockForClustering = 512;
0019   static constexpr uint32_t blocksForExoticClusters = 4;
0020 
0021   // cutoffFraction -> Is a rechit almost entirely attributed to one cluster
0022   // cutoffDistance -> Is a rechit close enough to a cluster to be associated
0023   // Values are from RecoParticleFlow/PFClusterProducer/plugins/Basic2DGenericPFlowClusterizer.cc
0024   static constexpr float cutoffDistance = 100.;
0025   static constexpr float cutoffFraction = 0.9999;
0026 
0027   static constexpr uint32_t kHBHalf = 1296;
0028   static constexpr uint32_t maxTopoInput = 2 * kHBHalf;
0029 
0030   // Calculation of dR2 for Clustering
0031   ALPAKA_FN_ACC ALPAKA_FN_INLINE static float dR2(Position4 pos1, Position4 pos2) {
0032     float mag1 = sqrtf(pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
0033     float cosTheta1 = mag1 > 0.0 ? pos1.z / mag1 : 1.0;
0034     float eta1 = 0.5f * logf((1.0f + cosTheta1) / (1.0f - cosTheta1));
0035     float phi1 = atan2f(pos1.y, pos1.x);
0036 
0037     float mag2 = sqrtf(pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);
0038     float cosTheta2 = mag2 > 0.0 ? pos2.z / mag2 : 1.0;
0039     float eta2 = 0.5f * logf((1.0f + cosTheta2) / (1.0f - cosTheta2));
0040     float phi2 = atan2f(pos2.y, pos2.x);
0041 
0042     float deta = eta2 - eta1;
0043     constexpr const float fPI = M_PI;
0044     float dphi = std::abs(std::abs(phi2 - phi1) - fPI) - fPI;
0045     return (deta * deta + dphi * dphi);
0046   }
0047 
0048   // Get index of seed
0049   ALPAKA_FN_ACC static auto getSeedRhIdx(int* seeds, int seedNum) { return seeds[seedNum]; }
0050 
0051   // Get rechit fraction of a given rechit for a given seed
0052   ALPAKA_FN_ACC static auto getRhFrac(reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0053                                       int topoSeedBegin,
0054                                       reco::PFRecHitFractionDeviceCollection::View fracView,
0055                                       int seedNum,
0056                                       int rhNum) {
0057     int seedIdx = pfClusteringVars[topoSeedBegin + seedNum].topoSeedList();
0058     return fracView[pfClusteringVars[seedIdx].seedFracOffsets() + rhNum].frac();
0059   }
0060 
0061   // Cluster position calculation
0062   template <bool debug = false>
0063   ALPAKA_FN_ACC static void updateClusterPos(reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
0064                                              Position4& pos4,
0065                                              float frac,
0066                                              int rhInd,
0067                                              reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0068                                              float rhENormInv) {
0069     Position4 rechitPos = Position4{pfRecHits[rhInd].x(), pfRecHits[rhInd].y(), pfRecHits[rhInd].z(), 1.0};
0070     const auto rh_energy = pfRecHits[rhInd].energy() * frac;
0071     const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : std::max(0.0f, logf(rh_energy * rhENormInv)));
0072     if constexpr (debug)
0073       printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
0074              rhInd,
0075              norm,
0076              frac,
0077              rh_energy,
0078              rechitPos.x,
0079              rechitPos.y,
0080              rechitPos.z);
0081 
0082     pos4.x += rechitPos.x * norm;
0083     pos4.y += rechitPos.y * norm;
0084     pos4.z += rechitPos.z * norm;
0085     pos4.w += norm;  //  position_norm
0086   }
0087 
0088   // Processing single seed clusters
0089   // Device function designed to be called by all threads of a given block
0090   template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0091   ALPAKA_FN_ACC static void hcalFastCluster_singleSeed(
0092       const TAcc& acc,
0093       reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
0094       const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0095       int topoId,   // from selection
0096       int nRHTopo,  // from selection
0097       reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0098       reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0099       reco::PFClusterDeviceCollection::View clusterView,
0100       reco::PFRecHitFractionDeviceCollection::View fracView) {
0101     int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];  // thread index is rechit number
0102     // Declaration of shared variables
0103     int& i = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0104     int& nRHOther = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0105     unsigned int& iter = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0106     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0107     float& clusterEnergy = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0108     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0109     float& seedEnergy = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0110     Position4& clusterPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
0111     Position4& prevClusterPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
0112     Position4& seedPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
0113     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0114     if (once_per_block(acc)) {
0115       i = pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();  // i is the seed rechit index
0116       nRHOther = nRHTopo - 1;                                                           // number of non-seed rechits
0117       seedPos = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.};
0118       clusterPos = seedPos;  // Initial cluster position is just the seed
0119       prevClusterPos = seedPos;
0120       seedEnergy = pfRecHits[i].energy();
0121       clusterEnergy = seedEnergy;
0122       tol = pfClusParams.stoppingTolerance();  // stopping tolerance * tolerance scaling
0123 
0124       if (topology.cutsFromDB()) {
0125         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0126       } else {
0127         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0128           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0129         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0130           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0131         else {
0132           rhENormInv = 0.;
0133           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0134         }
0135       }
0136 
0137       iter = 0;
0138       notDone = true;
0139     }
0140     alpaka::syncBlockThreads(acc);  // all threads call sync
0141 
0142     int j = -1;  // j is the rechit index for this thread
0143     int rhFracOffset = -1;
0144     Position4 rhPos;
0145     float rhEnergy = -1., rhPosNorm = -1.;
0146 
0147     if (tid < nRHOther) {
0148       rhFracOffset =
0149           pfClusteringVars[i].seedFracOffsets() + tid + 1;  // Offset for this rechit in pcrhfrac, pcrhfracidx arrays
0150       j = fracView[rhFracOffset].pfrhIdx();                 // rechit index for this thread
0151       rhPos = Position4{pfRecHits[j].x(), pfRecHits[j].y(), pfRecHits[j].z(), 1.};
0152       rhEnergy = pfRecHits[j].energy();
0153       rhPosNorm = fmaxf(0., logf(rhEnergy * rhENormInv));
0154     }
0155     alpaka::syncBlockThreads(acc);  // all threads call sync
0156 
0157     do {
0158       if constexpr (debug) {
0159         if (once_per_block(acc))
0160           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0161       }
0162       float dist2 = -1., d2 = -1., fraction = -1.;
0163       if (tid < nRHOther) {
0164         // Rechit distance calculation
0165         dist2 = (clusterPos.x - rhPos.x) * (clusterPos.x - rhPos.x) +
0166                 (clusterPos.y - rhPos.y) * (clusterPos.y - rhPos.y) +
0167                 (clusterPos.z - rhPos.z) * (clusterPos.z - rhPos.z);
0168 
0169         d2 = dist2 / pfClusParams.showerSigma2();
0170         fraction = clusterEnergy * rhENormInv * expf(-0.5f * d2);
0171 
0172         // For single seed clusters, rechit fraction is either 1 (100%) or -1 (not included)
0173         if (fraction > pfClusParams.minFracTot() && d2 < cutoffDistance)
0174           fraction = 1.;
0175         else
0176           fraction = -1.;
0177         fracView[rhFracOffset].frac() = fraction;
0178       }
0179       alpaka::syncBlockThreads(acc);  // all threads call sync
0180 
0181       if constexpr (debug) {
0182         if (once_per_block(acc))
0183           printf("Computing cluster position for topoId %d\n", topoId);
0184       }
0185 
0186       if (once_per_block(acc)) {
0187         // Reset cluster position and energy
0188         clusterPos = seedPos;
0189         clusterEnergy = seedEnergy;
0190       }
0191       alpaka::syncBlockThreads(acc);  // all threads call sync
0192 
0193       // Recalculate cluster position and energy
0194       if (fraction > -0.5) {
0195         alpaka::atomicAdd(acc, &clusterEnergy, rhEnergy, alpaka::hierarchy::Threads{});
0196         alpaka::atomicAdd(acc, &clusterPos.x, rhPos.x * rhPosNorm, alpaka::hierarchy::Threads{});
0197         alpaka::atomicAdd(acc, &clusterPos.y, rhPos.y * rhPosNorm, alpaka::hierarchy::Threads{});
0198         alpaka::atomicAdd(acc, &clusterPos.z, rhPos.z * rhPosNorm, alpaka::hierarchy::Threads{});
0199         alpaka::atomicAdd(acc, &clusterPos.w, rhPosNorm, alpaka::hierarchy::Threads{});  // position_norm
0200       }
0201       alpaka::syncBlockThreads(acc);  // all threads call sync
0202 
0203       if (once_per_block(acc)) {
0204         // Normalize the seed postiion
0205         if (clusterPos.w >= pfClusParams.minAllowedNormalization()) {
0206           // Divide by position norm
0207           clusterPos.x /= clusterPos.w;
0208           clusterPos.y /= clusterPos.w;
0209           clusterPos.z /= clusterPos.w;
0210 
0211           if constexpr (debug)
0212             printf("\tPF cluster (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
0213                    i,
0214                    clusterEnergy,
0215                    clusterPos.x,
0216                    clusterPos.y,
0217                    clusterPos.z);
0218         } else {
0219           if constexpr (debug)
0220             printf("\tPF cluster (seed %d) position norm (%f) less than minimum (%f)\n",
0221                    i,
0222                    clusterPos.w,
0223                    pfClusParams.minAllowedNormalization());
0224           clusterPos.x = 0.;
0225           clusterPos.y = 0.;
0226           clusterPos.z = 0.;
0227         }
0228         float diff2 = dR2(prevClusterPos, clusterPos);
0229         if constexpr (debug)
0230           printf("\tPF cluster (seed %d) has diff2 = %f\n", i, diff2);
0231         prevClusterPos = clusterPos;  // Save clusterPos
0232 
0233         float tol2 = tol * tol;
0234         iter++;
0235         notDone = (diff2 > tol2) && (iter < pfClusParams.maxIterations());
0236         if constexpr (debug) {
0237           if (diff2 > tol2)
0238             printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
0239           else if constexpr (debug)
0240             printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
0241         }
0242       }
0243       alpaka::syncBlockThreads(acc);  // all threads call sync
0244     } while (notDone);  // shared variable condition ensures synchronization is well defined
0245     if (once_per_block(acc)) {  // Cluster is finalized, assign cluster information to te SoA
0246       int rhIdx =
0247           pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();  // i is the seed rechit index
0248       int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
0249       clusterView[seedIdx].energy() = clusterEnergy;
0250       clusterView[seedIdx].x() = clusterPos.x;
0251       clusterView[seedIdx].y() = clusterPos.y;
0252       clusterView[seedIdx].z() = clusterPos.z;
0253     }
0254   }
0255 
0256   // Processing clusters up to 100 seeds and 512 non-seed rechits using shared memory accesses
0257   // Device function designed to be called by all threads of a given block
0258   template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0259   ALPAKA_FN_ACC static void hcalFastCluster_multiSeedParallel(
0260       const TAcc& acc,
0261       reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
0262       const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0263       int topoId,   // from selection
0264       int nSeeds,   // from selection
0265       int nRHTopo,  // from selection
0266       reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0267       reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0268       reco::PFClusterDeviceCollection::View clusterView,
0269       reco::PFRecHitFractionDeviceCollection::View fracView) {
0270     int tid = alpaka::getIdx<alpaka::Block,
0271                              alpaka::Threads>(  // Thread index corresponds to a single rechit of the topo cluster
0272         acc)[0u];
0273 
0274     int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0275     int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0276     int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0277     int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0278     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0279     float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0280     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0281     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0282     auto& clusterPos = alpaka::declareSharedVar<Position4[100], __COUNTER__>(acc);
0283     auto& prevClusterPos = alpaka::declareSharedVar<Position4[100], __COUNTER__>(acc);
0284     auto& clusterEnergy = alpaka::declareSharedVar<float[100], __COUNTER__>(acc);
0285     auto& rhFracSum = alpaka::declareSharedVar<float[threadsPerBlockForClustering], __COUNTER__>(acc);
0286     auto& seeds = alpaka::declareSharedVar<int[100], __COUNTER__>(acc);
0287     auto& rechits = alpaka::declareSharedVar<int[threadsPerBlockForClustering], __COUNTER__>(acc);
0288 
0289     if (once_per_block(acc)) {
0290       nRHNotSeed = nRHTopo - nSeeds + 1;  // 1 + (# rechits per topoId that are NOT seeds)
0291       topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
0292       tol = pfClusParams.stoppingTolerance() *
0293             powf(fmaxf(1.0, nSeeds - 1), 2.0);  // stopping tolerance * tolerance scaling
0294       stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0295       iter = 0;
0296       notDone = true;
0297 
0298       int i = pfClusteringVars[topoSeedBegin].topoSeedList();
0299 
0300       if (topology.cutsFromDB()) {
0301         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0302       } else {
0303         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0304           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0305         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0306           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0307         else {
0308           rhENormInv = 0.;
0309           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0310         }
0311       }
0312     }
0313     alpaka::syncBlockThreads(acc);  // all threads call sync
0314 
0315     if (tid < nSeeds)
0316       seeds[tid] = pfClusteringVars[topoSeedBegin + tid].topoSeedList();
0317     if (tid < nRHNotSeed - 1)
0318       rechits[tid] =
0319           fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + tid + 1]
0320               .pfrhIdx();
0321 
0322     alpaka::syncBlockThreads(acc);  // all threads call sync
0323 
0324     if constexpr (debug) {
0325       if (once_per_block(acc)) {
0326         printf("\n===========================================================================================\n");
0327         printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
0328         for (int s = 0; s < nSeeds; s++) {
0329           if (s != 0)
0330             printf(", ");
0331           printf("%d", getSeedRhIdx(seeds, s));
0332         }
0333         if (nRHTopo == nSeeds) {
0334           printf(")\n\n");
0335         } else {
0336           printf(") and other rechits (");
0337           for (int r = 1; r < nRHNotSeed; r++) {
0338             if (r != 1)
0339               printf(", ");
0340             if (r <= 0) {
0341               printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
0342             }
0343             printf("%d", rechits[r - 1]);
0344           }
0345           printf(")\n\n");
0346         }
0347       }
0348       alpaka::syncBlockThreads(acc);  // all (or none) threads call sync
0349     }
0350 
0351     // Set initial cluster position (energy) to seed rechit position (energy)
0352     if (tid < nSeeds) {
0353       int i = getSeedRhIdx(seeds, tid);
0354       clusterPos[tid] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
0355       prevClusterPos[tid] = clusterPos[tid];
0356       clusterEnergy[tid] = pfRecHits[i].energy();
0357       for (int r = 0; r < (nRHNotSeed - 1); r++) {
0358         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
0359         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
0360       }
0361     }
0362     alpaka::syncBlockThreads(acc);  // all threads call sync
0363 
0364     int rhThreadIdx = -1;
0365     Position4 rhThreadPos;
0366     if (tid < (nRHNotSeed - 1)) {
0367       rhThreadIdx = rechits[tid];  // Index when thread represents rechit
0368       rhThreadPos = Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0369     }
0370 
0371     // Neighbors when threadIdx represents seed
0372     int seedThreadIdx = -1;
0373     Neighbours4 seedNeighbors = Neighbours4{-9, -9, -9, -9};
0374     float seedEnergy = -1.;
0375     Position4 seedInitClusterPos = Position4{0., 0., 0., 0.};
0376     if (tid < nSeeds) {
0377       if constexpr (debug)
0378         printf("tid: %d\n", tid);
0379       seedThreadIdx = getSeedRhIdx(seeds, tid);
0380       seedNeighbors = Neighbours4{pfRecHits[seedThreadIdx].neighbours()(0),
0381                                   pfRecHits[seedThreadIdx].neighbours()(1),
0382                                   pfRecHits[seedThreadIdx].neighbours()(2),
0383                                   pfRecHits[seedThreadIdx].neighbours()(3)};
0384       seedEnergy = pfRecHits[seedThreadIdx].energy();
0385 
0386       // Compute initial cluster position shift for seed
0387       updateClusterPos(pfClusParams, seedInitClusterPos, 1., seedThreadIdx, pfRecHits, rhENormInv);
0388     }
0389 
0390     do {
0391       if constexpr (debug) {
0392         if (once_per_block(acc))
0393           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0394       }
0395 
0396       // Reset rhFracSum
0397       rhFracSum[tid] = 0.;
0398       if (once_per_block(acc))
0399         diff2 = -1;
0400 
0401       if (tid < (nRHNotSeed - 1)) {
0402         for (int s = 0; s < nSeeds; s++) {
0403           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0404                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0405                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0406 
0407           float d2 = dist2 / pfClusParams.showerSigma2();
0408           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0409 
0410           rhFracSum[tid] += fraction;
0411         }
0412       }
0413       alpaka::syncBlockThreads(acc);  // all threads call sync
0414 
0415       if (tid < (nRHNotSeed - 1)) {
0416         for (int s = 0; s < nSeeds; s++) {
0417           int i = seeds[s];
0418           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0419                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0420                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0421 
0422           float d2 = dist2 / pfClusParams.showerSigma2();
0423           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0424 
0425           if (rhFracSum[tid] > pfClusParams.minFracTot()) {
0426             float fracpct = fraction / rhFracSum[tid];
0427             if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
0428               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
0429             } else {
0430               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0431             }
0432           } else {
0433             fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0434           }
0435         }
0436       }
0437       alpaka::syncBlockThreads(acc);  // all threads call sync
0438 
0439       if constexpr (debug) {
0440         if (once_per_block(acc))
0441           printf("Computing cluster position for topoId %d\n", topoId);
0442       }
0443 
0444       // Reset cluster position and energy
0445       if (tid < nSeeds) {
0446         clusterPos[tid] = seedInitClusterPos;
0447         clusterEnergy[tid] = seedEnergy;
0448         if constexpr (debug) {
0449           printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
0450                  tid,
0451                  seeds[tid],
0452                  clusterEnergy[tid],
0453                  clusterPos[tid].x,
0454                  clusterPos[tid].y,
0455                  clusterPos[tid].z,
0456                  clusterPos[tid].w);
0457         }
0458       }
0459       alpaka::syncBlockThreads(acc);  // all threads call sync
0460 
0461       // Recalculate position
0462       if (tid < nSeeds) {
0463         for (int r = 0; r < nRHNotSeed - 1; r++) {
0464           int j = rechits[r];
0465           float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, tid, r + 1);
0466 
0467           if (frac > -0.5) {
0468             clusterEnergy[tid] += frac * pfRecHits[j].energy();
0469 
0470             if (nSeeds == 1 || j == seedNeighbors.x || j == seedNeighbors.y || j == seedNeighbors.z ||
0471                 j == seedNeighbors.w)
0472               updateClusterPos(pfClusParams, clusterPos[tid], frac, j, pfRecHits, rhENormInv);
0473           }
0474         }
0475       }
0476       alpaka::syncBlockThreads(acc);  // all threads call sync
0477 
0478       // Position normalization
0479       if (tid < nSeeds) {
0480         if (clusterPos[tid].w >= pfClusParams.minAllowedNormalization()) {
0481           // Divide by position norm
0482           clusterPos[tid].x /= clusterPos[tid].w;
0483           clusterPos[tid].y /= clusterPos[tid].w;
0484           clusterPos[tid].z /= clusterPos[tid].w;
0485 
0486           if constexpr (debug)
0487             printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
0488                    tid,
0489                    seedThreadIdx,
0490                    clusterEnergy[tid],
0491                    clusterPos[tid].x,
0492                    clusterPos[tid].y,
0493                    clusterPos[tid].z);
0494         } else {
0495           if constexpr (debug)
0496             printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
0497                    tid,
0498                    seedThreadIdx,
0499                    clusterPos[tid].w,
0500                    pfClusParams.minAllowedNormalization());
0501           clusterPos[tid].x = 0.0;
0502           clusterPos[tid].y = 0.0;
0503           clusterPos[tid].z = 0.0;
0504         }
0505       }
0506       alpaka::syncBlockThreads(acc);  // all threads call sync
0507 
0508       if (tid < nSeeds) {
0509         float delta2 = dR2(prevClusterPos[tid], clusterPos[tid]);
0510         if constexpr (debug)
0511           printf("\tCluster %d (seed %d) has delta2 = %f\n", tid, seeds[tid], delta2);
0512         atomicMaxF(acc, &diff2, delta2);
0513         prevClusterPos[tid] = clusterPos[tid];  // Save clusterPos
0514       }
0515       alpaka::syncBlockThreads(acc);  // all threads call sync
0516 
0517       if (once_per_block(acc)) {
0518         float tol2 = tol * tol;
0519         iter++;
0520         notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
0521         if constexpr (debug) {
0522           if (diff2 > tol2)
0523             printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
0524           else if constexpr (debug)
0525             printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
0526         }
0527       }
0528       alpaka::syncBlockThreads(acc);  // all threads call sync
0529     } while (notDone);  // shared variable condition ensures synchronization is well defined
0530     if (once_per_block(acc))
0531       // Fill PFCluster-level info
0532       if (tid < nSeeds) {
0533         int rhIdx = pfClusteringVars[tid + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
0534         int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
0535         clusterView[seedIdx].energy() = clusterEnergy[tid];
0536         clusterView[seedIdx].x() = clusterPos[tid].x;
0537         clusterView[seedIdx].y() = clusterPos[tid].y;
0538         clusterView[seedIdx].z() = clusterPos[tid].z;
0539       }
0540   }
0541 
0542   // Process very large exotic clusters, from nSeeds > 400 and non-seeds > 1500
0543   // Uses global memory access
0544   // Device function designed to be called by all threads of a given block
0545   template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0546   ALPAKA_FN_ACC static void hcalFastCluster_exotic(const TAcc& acc,
0547                                                    reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
0548                                                    const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0549                                                    int topoId,
0550                                                    int nSeeds,
0551                                                    int nRHTopo,
0552                                                    reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0553                                                    reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0554                                                    reco::PFClusterDeviceCollection::View clusterView,
0555                                                    reco::PFRecHitFractionDeviceCollection::View fracView,
0556                                                    Position4* __restrict__ globalClusterPos,
0557                                                    Position4* __restrict__ globalPrevClusterPos,
0558                                                    float* __restrict__ globalClusterEnergy,
0559                                                    float* __restrict__ globalRhFracSum,
0560                                                    int* __restrict__ globalSeeds,
0561                                                    int* __restrict__ globalRechits) {
0562     int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0563     int& blockIdx = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0564     int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0565     int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0566     int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0567     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0568     float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0569     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0570     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0571 
0572     blockIdx = maxTopoInput * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0573     Position4* clusterPos = globalClusterPos + blockIdx;
0574     Position4* prevClusterPos = globalPrevClusterPos + blockIdx;
0575     float* clusterEnergy = globalClusterEnergy + blockIdx;
0576     float* rhFracSum = globalRhFracSum + blockIdx;
0577     int* seeds = globalSeeds + blockIdx;
0578     int* rechits = globalRechits + blockIdx;
0579 
0580     if (once_per_block(acc)) {
0581       nRHNotSeed = nRHTopo - nSeeds + 1;  // 1 + (# rechits per topoId that are NOT seeds)
0582       topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
0583       tol = pfClusParams.stoppingTolerance() *
0584             powf(fmaxf(1.0, nSeeds - 1), 2.0);  // stopping tolerance * tolerance scaling
0585       stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0586       iter = 0;
0587       notDone = true;
0588 
0589       int i = pfClusteringVars[topoSeedBegin].topoSeedList();
0590 
0591       if (topology.cutsFromDB()) {
0592         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0593       } else {
0594         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0595           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0596         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0597           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0598         else {
0599           rhENormInv = 0.;
0600           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0601         }
0602       }
0603     }
0604     alpaka::syncBlockThreads(acc);  // all threads call sync
0605 
0606     for (int n = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; n < nRHTopo; n += stride) {
0607       if (n < nSeeds)
0608         seeds[n] = pfClusteringVars[topoSeedBegin + n].topoSeedList();
0609       if (n < nRHNotSeed - 1)
0610         rechits[n] =
0611             fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + n + 1]
0612                 .pfrhIdx();
0613     }
0614     alpaka::syncBlockThreads(acc);  // all threads call sync
0615 
0616     if constexpr (debug) {
0617       if (once_per_block(acc)) {
0618         printf("\n===========================================================================================\n");
0619         printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
0620         for (int s = 0; s < nSeeds; s++) {
0621           if (s != 0)
0622             printf(", ");
0623           printf("%d", getSeedRhIdx(seeds, s));
0624         }
0625         if (nRHTopo == nSeeds) {
0626           printf(")\n\n");
0627         } else {
0628           printf(") and other rechits (");
0629           for (int r = 1; r < nRHNotSeed; r++) {
0630             if (r != 1)
0631               printf(", ");
0632             if (r <= 0) {
0633               printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
0634             }
0635             printf("%d", rechits[r - 1]);
0636           }
0637           printf(")\n\n");
0638         }
0639       }
0640       alpaka::syncBlockThreads(acc);  // all (or none) threads call sync
0641     }
0642 
0643     // Set initial cluster position (energy) to seed rechit position (energy)
0644     for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0645       int i = seeds[s];
0646       clusterPos[s] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
0647       prevClusterPos[s] = clusterPos[s];
0648       clusterEnergy[s] = pfRecHits[i].energy();
0649       for (int r = 0; r < (nRHNotSeed - 1); r++) {
0650         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
0651         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
0652       }
0653     }
0654     alpaka::syncBlockThreads(acc);  // all threads call sync
0655 
0656     do {
0657       if constexpr (debug) {
0658         if (once_per_block(acc))
0659           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0660       }
0661 
0662       if (once_per_block(acc))
0663         diff2 = -1;
0664       // Reset rhFracSum
0665       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0666         rhFracSum[tid] = 0.;
0667         int rhThreadIdx = rechits[tid];
0668         Position4 rhThreadPos =
0669             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0670         for (int s = 0; s < nSeeds; s++) {
0671           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0672                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0673                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0674 
0675           float d2 = dist2 / pfClusParams.showerSigma2();
0676           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0677 
0678           rhFracSum[tid] += fraction;
0679         }
0680       }
0681       alpaka::syncBlockThreads(acc);  // all threads call sync
0682 
0683       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0684         int rhThreadIdx = rechits[tid];
0685         Position4 rhThreadPos =
0686             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0687         for (int s = 0; s < nSeeds; s++) {
0688           int i = seeds[s];
0689           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0690                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0691                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0692 
0693           float d2 = dist2 / pfClusParams.showerSigma2();
0694           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0695 
0696           if (rhFracSum[tid] > pfClusParams.minFracTot()) {
0697             float fracpct = fraction / rhFracSum[tid];
0698             if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
0699               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
0700             } else {
0701               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0702             }
0703           } else {
0704             fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0705           }
0706         }
0707       }
0708       alpaka::syncBlockThreads(acc);  // all threads call sync
0709 
0710       if constexpr (debug) {
0711         if (once_per_block(acc))
0712           printf("Computing cluster position for topoId %d\n", topoId);
0713       }
0714 
0715       // Reset cluster position and energy
0716       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0717         int seedRhIdx = getSeedRhIdx(seeds, s);
0718         float norm = logf(pfRecHits[seedRhIdx].energy() * rhENormInv);
0719         clusterPos[s] = Position4{
0720             pfRecHits[seedRhIdx].x() * norm, pfRecHits[seedRhIdx].y() * norm, pfRecHits[seedRhIdx].z() * norm, norm};
0721         clusterEnergy[s] = pfRecHits[seedRhIdx].energy();
0722         if constexpr (debug) {
0723           printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
0724                  s,
0725                  seeds[s],
0726                  clusterEnergy[s],
0727                  clusterPos[s].x,
0728                  clusterPos[s].y,
0729                  clusterPos[s].z,
0730                  clusterPos[s].w);
0731         }
0732       }
0733       alpaka::syncBlockThreads(acc);  // all threads call sync
0734 
0735       // Recalculate position
0736       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0737         int seedRhIdx = getSeedRhIdx(seeds, s);
0738         for (int r = 0; r < nRHNotSeed - 1; r++) {
0739           int j = rechits[r];
0740           float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, s, r + 1);
0741 
0742           if (frac > -0.5) {
0743             clusterEnergy[s] += frac * pfRecHits[j].energy();
0744 
0745             if (nSeeds == 1 || j == pfRecHits[seedRhIdx].neighbours()(0) || j == pfRecHits[seedRhIdx].neighbours()(1) ||
0746                 j == pfRecHits[seedRhIdx].neighbours()(2) || j == pfRecHits[seedRhIdx].neighbours()(3))
0747               updateClusterPos(pfClusParams, clusterPos[s], frac, j, pfRecHits, rhENormInv);
0748           }
0749         }
0750       }
0751       alpaka::syncBlockThreads(acc);  // all threads call sync
0752 
0753       // Position normalization
0754       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0755         if (clusterPos[s].w >= pfClusParams.minAllowedNormalization()) {
0756           // Divide by position norm
0757           clusterPos[s].x /= clusterPos[s].w;
0758           clusterPos[s].y /= clusterPos[s].w;
0759           clusterPos[s].z /= clusterPos[s].w;
0760 
0761           if constexpr (debug)
0762             printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
0763                    s,
0764                    seeds[s],
0765                    clusterEnergy[s],
0766                    clusterPos[s].x,
0767                    clusterPos[s].y,
0768                    clusterPos[s].z);
0769         } else {
0770           if constexpr (debug)
0771             printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
0772                    s,
0773                    seeds[s],
0774                    clusterPos[s].w,
0775                    pfClusParams.minAllowedNormalization());
0776           clusterPos[s].x = 0.0;
0777           clusterPos[s].y = 0.0;
0778           clusterPos[s].z = 0.0;
0779         }
0780       }
0781       alpaka::syncBlockThreads(acc);  // all threads call sync
0782 
0783       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0784         float delta2 = dR2(prevClusterPos[s], clusterPos[s]);
0785         if constexpr (debug)
0786           printf("\tCluster %d (seed %d) has delta2 = %f\n", s, seeds[s], delta2);
0787         atomicMaxF(acc, &diff2, delta2);
0788         prevClusterPos[s] = clusterPos[s];  // Save clusterPos
0789       }
0790       alpaka::syncBlockThreads(acc);  // all threads call sync
0791 
0792       if (once_per_block(acc)) {
0793         float tol2 = tol * tol;
0794         iter++;
0795         notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
0796         if constexpr (debug) {
0797           if (diff2 > tol2)
0798             printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
0799           else if constexpr (debug)
0800             printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
0801         }
0802       }
0803       alpaka::syncBlockThreads(acc);  // all threads call sync
0804     } while (notDone);  // shared variable ensures synchronization is well defined
0805     if (once_per_block(acc))
0806       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0807         int rhIdx = pfClusteringVars[s + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
0808         int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
0809         clusterView[seedIdx].energy() = pfRecHits[s].energy();
0810         clusterView[seedIdx].x() = pfRecHits[s].x();
0811         clusterView[seedIdx].y() = pfRecHits[s].y();
0812         clusterView[seedIdx].z() = pfRecHits[s].z();
0813       }
0814     alpaka::syncBlockThreads(acc);  // all threads call sync
0815   }
0816 
0817   // Process clusters with up to 400 seeds and 1500 non seeds using shared memory
0818   // Device function designed to be called by all threads of a given block
0819   template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0820   ALPAKA_FN_ACC static void hcalFastCluster_multiSeedIterative(
0821       const TAcc& acc,
0822       reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
0823       const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0824       int topoId,
0825       int nSeeds,
0826       int nRHTopo,
0827       reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0828       reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0829       reco::PFClusterDeviceCollection::View clusterView,
0830       reco::PFRecHitFractionDeviceCollection::View fracView) {
0831     int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0832     int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0833     int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0834     int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0835     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0836     float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0837     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0838     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0839 
0840     auto& clusterPos = alpaka::declareSharedVar<Position4[400], __COUNTER__>(acc);
0841     auto& prevClusterPos = alpaka::declareSharedVar<Position4[400], __COUNTER__>(acc);
0842     auto& clusterEnergy = alpaka::declareSharedVar<float[400], __COUNTER__>(acc);
0843     auto& rhFracSum = alpaka::declareSharedVar<float[1500], __COUNTER__>(acc);
0844     auto& seeds = alpaka::declareSharedVar<int[400], __COUNTER__>(acc);
0845     auto& rechits = alpaka::declareSharedVar<int[1500], __COUNTER__>(acc);
0846 
0847     if (once_per_block(acc)) {
0848       nRHNotSeed = nRHTopo - nSeeds + 1;  // 1 + (# rechits per topoId that are NOT seeds)
0849       topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
0850       tol = pfClusParams.stoppingTolerance() *  // stopping tolerance * tolerance scaling
0851             powf(fmaxf(1.0, nSeeds - 1), 2.0);
0852       stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0853       iter = 0;
0854       notDone = true;
0855 
0856       int i = pfClusteringVars[topoSeedBegin].topoSeedList();
0857 
0858       if (topology.cutsFromDB()) {
0859         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0860       } else {
0861         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0862           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0863         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0864           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0865         else {
0866           rhENormInv = 0.;
0867           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0868         }
0869       }
0870     }
0871     alpaka::syncBlockThreads(acc);  // all threads call sync
0872 
0873     for (int n = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; n < nRHTopo; n += stride) {
0874       if (n < nSeeds)
0875         seeds[n] = pfClusteringVars[topoSeedBegin + n].topoSeedList();
0876       if (n < nRHNotSeed - 1)
0877         rechits[n] =
0878             fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + n + 1]
0879                 .pfrhIdx();
0880     }
0881     alpaka::syncBlockThreads(acc);  // all threads call sync
0882 
0883     if constexpr (debug) {
0884       if (once_per_block(acc)) {
0885         printf("\n===========================================================================================\n");
0886         printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
0887         for (int s = 0; s < nSeeds; s++) {
0888           if (s != 0)
0889             printf(", ");
0890           printf("%d", getSeedRhIdx(seeds, s));
0891         }
0892         if (nRHTopo == nSeeds) {
0893           printf(")\n\n");
0894         } else {
0895           printf(") and other rechits (");
0896           for (int r = 1; r < nRHNotSeed; r++) {
0897             if (r != 1)
0898               printf(", ");
0899             if (r <= 0) {
0900               printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
0901             }
0902             printf("%d", rechits[r - 1]);
0903           }
0904           printf(")\n\n");
0905         }
0906       }
0907       alpaka::syncBlockThreads(acc);  // all (or none) threads call sync
0908     }
0909 
0910     // Set initial cluster position (energy) to seed rechit position (energy)
0911     for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0912       int i = seeds[s];
0913       clusterPos[s] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
0914       prevClusterPos[s] = clusterPos[s];
0915       clusterEnergy[s] = pfRecHits[i].energy();
0916       for (int r = 0; r < (nRHNotSeed - 1); r++) {
0917         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
0918         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
0919       }
0920     }
0921     alpaka::syncBlockThreads(acc);  // all threads call sync
0922 
0923     do {
0924       if constexpr (debug) {
0925         if (once_per_block(acc))
0926           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0927       }
0928 
0929       if (once_per_block(acc))
0930         diff2 = -1;
0931       // Reset rhFracSum
0932       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0933         rhFracSum[tid] = 0.;
0934         int rhThreadIdx = rechits[tid];
0935         Position4 rhThreadPos =
0936             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0937         for (int s = 0; s < nSeeds; s++) {
0938           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0939                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0940                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0941 
0942           float d2 = dist2 / pfClusParams.showerSigma2();
0943           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0944 
0945           rhFracSum[tid] += fraction;
0946         }
0947       }
0948       alpaka::syncBlockThreads(acc);  // all threads call sync
0949 
0950       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0951         int rhThreadIdx = rechits[tid];
0952         Position4 rhThreadPos =
0953             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0954         for (int s = 0; s < nSeeds; s++) {
0955           int i = seeds[s];
0956           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0957                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0958                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0959 
0960           float d2 = dist2 / pfClusParams.showerSigma2();
0961           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0962 
0963           if (rhFracSum[tid] > pfClusParams.minFracTot()) {
0964             float fracpct = fraction / rhFracSum[tid];
0965             if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
0966               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
0967             } else {
0968               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0969             }
0970           } else {
0971             fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0972           }
0973         }
0974       }
0975       alpaka::syncBlockThreads(acc);  // all threads call sync
0976 
0977       if constexpr (debug) {
0978         if (once_per_block(acc))
0979           printf("Computing cluster position for topoId %d\n", topoId);
0980       }
0981 
0982       // Reset cluster position and energy
0983       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0984         int seedRhIdx = getSeedRhIdx(seeds, s);
0985         float norm = logf(pfRecHits[seedRhIdx].energy() * rhENormInv);
0986         clusterPos[s] = Position4{
0987             pfRecHits[seedRhIdx].x() * norm, pfRecHits[seedRhIdx].y() * norm, pfRecHits[seedRhIdx].z() * norm, norm};
0988         clusterEnergy[s] = pfRecHits[seedRhIdx].energy();
0989         if constexpr (debug) {
0990           printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
0991                  s,
0992                  seeds[s],
0993                  clusterEnergy[s],
0994                  clusterPos[s].x,
0995                  clusterPos[s].y,
0996                  clusterPos[s].z,
0997                  clusterPos[s].w);
0998         }
0999       }
1000       alpaka::syncBlockThreads(acc);  // all threads call sync
1001 
1002       // Recalculate position
1003       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1004         int seedRhIdx = getSeedRhIdx(seeds, s);
1005         for (int r = 0; r < nRHNotSeed - 1; r++) {
1006           int j = rechits[r];
1007           float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, s, r + 1);
1008 
1009           if (frac > -0.5) {
1010             clusterEnergy[s] += frac * pfRecHits[j].energy();
1011 
1012             if (nSeeds == 1 || j == pfRecHits[seedRhIdx].neighbours()(0) || j == pfRecHits[seedRhIdx].neighbours()(1) ||
1013                 j == pfRecHits[seedRhIdx].neighbours()(2) || j == pfRecHits[seedRhIdx].neighbours()(3))
1014               updateClusterPos(pfClusParams, clusterPos[s], frac, j, pfRecHits, rhENormInv);
1015           }
1016         }
1017       }
1018       alpaka::syncBlockThreads(acc);  // all threads call sync
1019 
1020       // Position normalization
1021       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1022         if (clusterPos[s].w >= pfClusParams.minAllowedNormalization()) {
1023           // Divide by position norm
1024           clusterPos[s].x /= clusterPos[s].w;
1025           clusterPos[s].y /= clusterPos[s].w;
1026           clusterPos[s].z /= clusterPos[s].w;
1027 
1028           if constexpr (debug)
1029             printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
1030                    s,
1031                    seeds[s],
1032                    clusterEnergy[s],
1033                    clusterPos[s].x,
1034                    clusterPos[s].y,
1035                    clusterPos[s].z);
1036         } else {
1037           if constexpr (debug)
1038             printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
1039                    s,
1040                    seeds[s],
1041                    clusterPos[s].w,
1042                    pfClusParams.minAllowedNormalization());
1043           clusterPos[s].x = 0.0;
1044           clusterPos[s].y = 0.0;
1045           clusterPos[s].z = 0.0;
1046         }
1047       }
1048       alpaka::syncBlockThreads(acc);  // all threads call sync
1049 
1050       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1051         float delta2 = dR2(prevClusterPos[s], clusterPos[s]);
1052         if constexpr (debug)
1053           printf("\tCluster %d (seed %d) has delta2 = %f\n", s, seeds[s], delta2);
1054         atomicMaxF(acc, &diff2, delta2);
1055         prevClusterPos[s] = clusterPos[s];  // Save clusterPos
1056       }
1057       alpaka::syncBlockThreads(acc);  // all threads call sync
1058 
1059       if (once_per_block(acc)) {
1060         float tol2 = tol * tol;
1061         iter++;
1062         notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
1063         if constexpr (debug) {
1064           if (diff2 > tol2)
1065             printf("\tTopoId %d has diff2 = %f greater than tolerance %f (continuing)\n", topoId, diff2, tol2);
1066           else if constexpr (debug)
1067             printf("\tTopoId %d has diff2 = %f LESS than tolerance %f (terminating!)\n", topoId, diff2, tol2);
1068         }
1069       }
1070       alpaka::syncBlockThreads(acc);  // all threads call sync
1071     } while (notDone);  // shared variable ensures synchronization is well defined
1072     if (once_per_block(acc))
1073       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1074         int rhIdx = pfClusteringVars[s + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
1075         int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
1076         clusterView[seedIdx].energy() = pfRecHits[s].energy();
1077         clusterView[seedIdx].x() = pfRecHits[s].x();
1078         clusterView[seedIdx].y() = pfRecHits[s].y();
1079         clusterView[seedIdx].z() = pfRecHits[s].z();
1080       }
1081   }
1082 
1083   // Seeding using local energy maxima
1084   class SeedingTopoThresh {
1085   public:
1086     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1087     ALPAKA_FN_ACC void operator()(const TAcc& acc,
1088                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1089                                   const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
1090                                   const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1091                                   const reco::PFRecHitHostCollection::ConstView pfRecHits,
1092                                   reco::PFClusterDeviceCollection::View clusterView,
1093                                   uint32_t* __restrict__ nSeeds) const {
1094       const int nRH = pfRecHits.size();
1095 
1096       if (once_per_grid(acc)) {
1097         clusterView.size() = nRH;
1098       }
1099 
1100       for (auto i : uniform_elements(acc, nRH)) {
1101         // Initialize arrays
1102         pfClusteringVars[i].pfrh_isSeed() = 0;
1103         pfClusteringVars[i].rhCount() = 0;
1104         pfClusteringVars[i].topoSeedCount() = 0;
1105         pfClusteringVars[i].topoRHCount() = 0;
1106         pfClusteringVars[i].seedFracOffsets() = -1;
1107         pfClusteringVars[i].topoSeedOffsets() = -1;
1108         pfClusteringVars[i].topoSeedList() = -1;
1109         clusterView[i].seedRHIdx() = -1;
1110 
1111         int layer = pfRecHits[i].layer();
1112         int depthOffset = pfRecHits[i].depth() - 1;
1113         float energy = pfRecHits[i].energy();
1114         Position3 pos = Position3{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z()};
1115         float seedThreshold = 9999.;
1116         float topoThreshold = 9999.;
1117 
1118         if (topology.cutsFromDB()) {
1119           seedThreshold = topology[pfRecHits[i].denseId()].seedThreshold();
1120           topoThreshold = topology[pfRecHits[i].denseId()].noiseThreshold();
1121         } else {
1122           if (layer == PFLayer::HCAL_BARREL1) {
1123             seedThreshold = pfClusParams.seedEThresholdHB_vec()[depthOffset];
1124             topoThreshold = pfClusParams.topoEThresholdHB_vec()[depthOffset];
1125           } else if (layer == PFLayer::HCAL_ENDCAP) {
1126             seedThreshold = pfClusParams.seedEThresholdHE_vec()[depthOffset];
1127             topoThreshold = pfClusParams.topoEThresholdHE_vec()[depthOffset];
1128           }
1129         }
1130 
1131         // cmssdt.cern.ch/lxr/source/DataFormats/ParticleFlowReco/interface/PFRecHit.h#0108
1132         float pt2 = energy * energy * (pos.x * pos.x + pos.y * pos.y) / (pos.x * pos.x + pos.y * pos.y + pos.z * pos.z);
1133 
1134         // Seeding threshold test
1135         if ((layer == PFLayer::HCAL_BARREL1 && energy > seedThreshold && pt2 > pfClusParams.seedPt2ThresholdHB()) ||
1136             (layer == PFLayer::HCAL_ENDCAP && energy > seedThreshold && pt2 > pfClusParams.seedPt2ThresholdHE())) {
1137           pfClusteringVars[i].pfrh_isSeed() = 1;
1138           for (int k = 0; k < 4; k++) {  // Does this seed candidate have a higher energy than four neighbours
1139             if (pfRecHits[i].neighbours()(k) < 0)
1140               continue;
1141             if (energy < pfRecHits[pfRecHits[i].neighbours()(k)].energy()) {
1142               pfClusteringVars[i].pfrh_isSeed() = 0;
1143               break;
1144             }
1145           }
1146           if (pfClusteringVars[i].pfrh_isSeed())
1147             alpaka::atomicAdd(acc, nSeeds, 1u);
1148         }
1149         // Topo clustering threshold test
1150 
1151         if ((layer == PFLayer::HCAL_ENDCAP && energy > topoThreshold) ||
1152             (layer == PFLayer::HCAL_BARREL1 && energy > topoThreshold)) {
1153           pfClusteringVars[i].pfrh_passTopoThresh() = true;
1154           pfClusteringVars[i].pfrh_topoId() = i;
1155         } else {
1156           pfClusteringVars[i].pfrh_passTopoThresh() = false;
1157           pfClusteringVars[i].pfrh_topoId() = -1;
1158         }
1159       }
1160     }
1161   };
1162 
1163   // Preparation of topo inputs. Initializing topoId, egdeIdx, nEdges, edgeList
1164   class PrepareTopoInputs {
1165   public:
1166     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1167     ALPAKA_FN_ACC void operator()(const TAcc& acc,
1168                                   const reco::PFRecHitHostCollection::ConstView pfRecHits,
1169                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1170                                   reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars,
1171                                   uint32_t* __restrict__ nSeeds) const {
1172       const int nRH = pfRecHits.size();
1173 
1174       if (once_per_grid(acc)) {
1175         pfClusteringVars.nEdges() = nRH * 8;
1176         pfClusteringEdgeVars[nRH].pfrh_edgeIdx() = nRH * 8;
1177       }
1178       for (uint32_t i : cms::alpakatools::uniform_elements(acc, nRH)) {
1179         pfClusteringEdgeVars[i].pfrh_edgeIdx() = i * 8;
1180         pfClusteringVars[i].pfrh_topoId() = 0;
1181         for (int j = 0; j < 8; j++) {  // checking if neighbours exist and assigning neighbours as edges
1182           if (pfRecHits[i].neighbours()(j) == -1)
1183             pfClusteringEdgeVars[i * 8 + j].pfrh_edgeList() = i;
1184           else
1185             pfClusteringEdgeVars[i * 8 + j].pfrh_edgeList() = pfRecHits[i].neighbours()(j);
1186         }
1187       }
1188 
1189       return;
1190     }
1191   };
1192 
1193   // Contraction in a single block
1194   class TopoClusterContraction {
1195   public:
1196     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1197     ALPAKA_FN_ACC void operator()(const TAcc& acc,
1198                                   const reco::PFRecHitHostCollection::ConstView pfRecHits,
1199                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1200                                   reco::PFClusterDeviceCollection::View clusterView,
1201                                   uint32_t* __restrict__ nSeeds,
1202                                   uint32_t* __restrict__ nRHF) const {
1203       const int nRH = pfRecHits.size();
1204       int& totalSeedOffset = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1205       int& totalSeedFracOffset = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1206 
1207       // rhCount, topoRHCount, topoSeedCount initialized earlier
1208       if (once_per_block(acc)) {
1209         pfClusteringVars.nTopos() = 0;
1210         pfClusteringVars.nRHFracs() = 0;
1211         totalSeedOffset = 0;
1212         totalSeedFracOffset = 0;
1213         pfClusteringVars.pcrhFracSize() = 0;
1214       }
1215 
1216       alpaka::syncBlockThreads(acc);  // all threads call sync
1217 
1218       // Now determine the number of seeds and rechits in each topo cluster [topoRHCount, topoSeedCount]
1219       // Also get the list of topoIds (smallest rhIdx of each topo cluser)
1220       for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1221            rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1222         pfClusteringVars[rhIdx].rhIdxToSeedIdx() = -1;
1223         int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1224         if (topoId > -1) {
1225           // Valid topo cluster
1226           alpaka::atomicAdd(acc, &pfClusteringVars[topoId].topoRHCount(), 1);
1227           // Valid topoId not counted yet
1228           if (topoId == rhIdx) {  // For every topo cluster, there is one rechit that meets this condition.
1229             int topoIdx = alpaka::atomicAdd(acc, &pfClusteringVars.nTopos(), 1);
1230             pfClusteringVars[topoIdx].topoIds() =
1231                 topoId;  // topoId: the smallest index of rechits that belong to a topo cluster.
1232           }
1233           // This is a cluster seed
1234           if (pfClusteringVars[rhIdx].pfrh_isSeed()) {  // # of seeds in this topo cluster
1235             alpaka::atomicAdd(acc, &pfClusteringVars[topoId].topoSeedCount(), 1);
1236           }
1237         }
1238       }
1239 
1240       alpaka::syncBlockThreads(acc);  // all threads call sync
1241 
1242       // Determine offsets for topo ID seed array [topoSeedOffsets]
1243       for (int topoId = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; topoId < nRH;
1244            topoId += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1245         if (pfClusteringVars[topoId].topoSeedCount() > 0) {
1246           // This is a valid topo ID
1247           int offset = alpaka::atomicAdd(acc, &totalSeedOffset, pfClusteringVars[topoId].topoSeedCount());
1248           pfClusteringVars[topoId].topoSeedOffsets() = offset;
1249         }
1250       }
1251       alpaka::syncBlockThreads(acc);  // all threads call sync
1252 
1253       // Fill arrays of rechit indicies for each seed [topoSeedList] and rhIdx->seedIdx conversion for each seed [rhIdxToSeedIdx]
1254       // Also fill seedRHIdx, topoId, depth
1255       for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1256            rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1257         int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1258         if (pfClusteringVars[rhIdx].pfrh_isSeed()) {
1259           // Valid topo cluster and this rhIdx corresponds to a seed
1260           int k = alpaka::atomicAdd(acc, &pfClusteringVars[topoId].rhCount(), 1);
1261           int seedIdx = pfClusteringVars[topoId].topoSeedOffsets() + k;
1262           if ((unsigned int)seedIdx >= *nSeeds)
1263             printf("Warning(contraction) %8d > %8d should not happen, check topoId: %d has %d rh\n",
1264                    seedIdx,
1265                    *nSeeds,
1266                    topoId,
1267                    k);
1268           pfClusteringVars[seedIdx].topoSeedList() = rhIdx;
1269           pfClusteringVars[rhIdx].rhIdxToSeedIdx() = seedIdx;
1270           clusterView[seedIdx].topoId() = topoId;
1271           clusterView[seedIdx].seedRHIdx() = rhIdx;
1272           clusterView[seedIdx].depth() = pfRecHits[rhIdx].depth();
1273         }
1274       }
1275 
1276       alpaka::syncBlockThreads(acc);  // all threads call sync
1277 
1278       // Determine seed offsets for rechit fraction array
1279       for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1280            rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1281         pfClusteringVars[rhIdx].rhCount() = 1;  // Reset this counter array
1282 
1283         int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1284         if (pfClusteringVars[rhIdx].pfrh_isSeed() && topoId > -1) {
1285           // Allot the total number of rechits for this topo cluster for rh fractions
1286           int offset = alpaka::atomicAdd(acc, &totalSeedFracOffset, pfClusteringVars[topoId].topoRHCount());
1287 
1288           // Add offset for this PF cluster seed
1289           pfClusteringVars[rhIdx].seedFracOffsets() = offset;
1290 
1291           // Store recHitFraction offset & size information for each seed
1292           clusterView[pfClusteringVars[rhIdx].rhIdxToSeedIdx()].rhfracOffset() =
1293               pfClusteringVars[rhIdx].seedFracOffsets();
1294           clusterView[pfClusteringVars[rhIdx].rhIdxToSeedIdx()].rhfracSize() =
1295               pfClusteringVars[topoId].topoRHCount() - pfClusteringVars[topoId].topoSeedCount() + 1;
1296         }
1297       }
1298 
1299       alpaka::syncBlockThreads(acc);  // all threads call sync
1300 
1301       if (once_per_block(acc)) {
1302         pfClusteringVars.pcrhFracSize() = totalSeedFracOffset;
1303         pfClusteringVars.nRHFracs() = totalSeedFracOffset;
1304         clusterView.nRHFracs() = totalSeedFracOffset;
1305         *nRHF = totalSeedFracOffset;
1306         clusterView.nSeeds() = *nSeeds;
1307         clusterView.nTopos() = pfClusteringVars.nTopos();
1308 
1309         if (pfClusteringVars.pcrhFracSize() > 200000)  // Warning in case the fraction is too large
1310           printf("At the end of topoClusterContraction, found large *pcrhFracSize = %d\n",
1311                  pfClusteringVars.pcrhFracSize());
1312       }
1313     }
1314   };
1315 
1316   // Prefill the rechit index for all PFCluster fractions
1317   // Optimized for GPU parallel, but works on any backend
1318   class FillRhfIndex {
1319   public:
1320     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1321     ALPAKA_FN_ACC void operator()(const TAcc& acc,
1322                                   const reco::PFRecHitHostCollection::ConstView pfRecHits,
1323                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1324                                   reco::PFRecHitFractionDeviceCollection::View fracView) const {
1325       const int nRH = pfRecHits.size();
1326 
1327       for (auto index : uniform_elements_nd(acc, {nRH, nRH})) {
1328         const int i = index[0u];  // i is a seed index
1329         const int j = index[1u];  // j is NOT a seed
1330         int topoId = pfClusteringVars[i].pfrh_topoId();
1331         if (topoId > -1 && pfClusteringVars[i].pfrh_isSeed() && topoId == pfClusteringVars[j].pfrh_topoId()) {
1332           if (!pfClusteringVars[j].pfrh_isSeed()) {  // NOT a seed
1333             int k = alpaka::atomicAdd(
1334                 acc, &pfClusteringVars[i].rhCount(), 1);  // Increment the number of rechit fractions for this seed
1335             auto fraction = fracView[pfClusteringVars[i].seedFracOffsets() + k];
1336             fraction.pfrhIdx() = j;
1337             fraction.pfcIdx() = pfClusteringVars[i].rhIdxToSeedIdx();
1338           } else if (i == j) {  // i==j is a seed rechit index
1339             auto seed = fracView[pfClusteringVars[i].seedFracOffsets()];
1340             seed.pfrhIdx() = j;
1341             seed.frac() = 1;
1342             seed.pfcIdx() = pfClusteringVars[i].rhIdxToSeedIdx();
1343           }
1344         }
1345       }
1346     }
1347   };
1348 
1349   class FastCluster {
1350   public:
1351     template <bool debug = false, typename TAcc, typename = std::enable_if<!std::is_same_v<Device, alpaka::DevCpu>>>
1352     ALPAKA_FN_ACC void operator()(const TAcc& acc,
1353                                   const reco::PFRecHitHostCollection::ConstView pfRecHits,
1354                                   const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
1355                                   const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1356                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1357                                   reco::PFClusterDeviceCollection::View clusterView,
1358                                   reco::PFRecHitFractionDeviceCollection::View fracView) const {
1359       const int nRH = pfRecHits.size();
1360       int& topoId = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1361       int& nRHTopo = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1362       int& nSeeds = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1363 
1364       if (once_per_block(acc)) {
1365         topoId = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
1366         nRHTopo = pfClusteringVars[topoId].topoRHCount();
1367         nSeeds = pfClusteringVars[topoId].topoSeedCount();
1368       }
1369 
1370       alpaka::syncBlockThreads(acc);  // all threads call sync
1371 
1372       if (topoId < nRH && nRHTopo > 0 && nSeeds > 0) {
1373         if (nRHTopo == nSeeds) {
1374           // PF cluster is isolated seed. No iterations needed
1375           if (once_per_block(acc)) {
1376             // Fill PFCluster-level information
1377             int rhIdx = pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
1378             int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
1379             clusterView[seedIdx].energy() = pfRecHits[rhIdx].energy();
1380             clusterView[seedIdx].x() = pfRecHits[rhIdx].x();
1381             clusterView[seedIdx].y() = pfRecHits[rhIdx].y();
1382             clusterView[seedIdx].z() = pfRecHits[rhIdx].z();
1383           }
1384           // singleSeed and multiSeedParallel functions work only for GPU backend
1385         } else if ((not std::is_same_v<Device, alpaka::DevCpu>) && nSeeds == 1) {
1386           // Single seed cluster
1387           hcalFastCluster_singleSeed(
1388               acc, pfClusParams, topology, topoId, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1389         } else if ((not std::is_same_v<Device, alpaka::DevCpu>) && nSeeds <= 100 &&
1390                    nRHTopo - nSeeds < threadsPerBlockForClustering) {
1391           hcalFastCluster_multiSeedParallel(
1392               acc, pfClusParams, topology, topoId, nSeeds, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1393         } else if (nSeeds <= 400 && nRHTopo - nSeeds <= 1500) {
1394           // nSeeds value must match exotic in FastClusterExotic
1395           hcalFastCluster_multiSeedIterative(
1396               acc, pfClusParams, topology, topoId, nSeeds, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1397         } else {
1398           if constexpr (debug) {
1399             if (once_per_block(acc))
1400               printf("Topo cluster %d has %d seeds and %d rechits. Will be processed in next kernel.\n",
1401                      topoId,
1402                      nSeeds,
1403                      nRHTopo);
1404           }
1405         }
1406       }
1407     }
1408   };
1409 
1410   // Process very large, exotic topo clusters
1411   class FastClusterExotic {
1412   public:
1413     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1414     ALPAKA_FN_ACC void operator()(const TAcc& acc,
1415                                   const reco::PFRecHitHostCollection::ConstView pfRecHits,
1416                                   const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
1417                                   const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1418                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1419                                   reco::PFClusterDeviceCollection::View clusterView,
1420                                   reco::PFRecHitFractionDeviceCollection::View fracView,
1421                                   Position4* __restrict__ globalClusterPos,
1422                                   Position4* __restrict__ globalPrevClusterPos,
1423                                   float* __restrict__ globalClusterEnergy,
1424                                   float* __restrict__ globalRhFracSum,
1425                                   int* __restrict__ globalSeeds,
1426                                   int* __restrict__ globalRechits) const {
1427       const int nRH = pfRecHits.size();
1428       for (int topoId = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]; topoId < nRH;
1429            topoId += blocksForExoticClusters) {
1430         int nRHTopo = pfClusteringVars[topoId].topoRHCount();
1431         int nSeeds = pfClusteringVars[topoId].topoSeedCount();
1432 
1433         // nSeeds value must match multiSeedIterative in FastCluster
1434         if (nRHTopo > 0 && (nSeeds > 400 || nRHTopo - nSeeds > 1500)) {
1435           hcalFastCluster_exotic(acc,
1436                                  pfClusParams,
1437                                  topology,
1438                                  topoId,
1439                                  nSeeds,
1440                                  nRHTopo,
1441                                  pfRecHits,
1442                                  pfClusteringVars,
1443                                  clusterView,
1444                                  fracView,
1445                                  globalClusterPos,
1446                                  globalPrevClusterPos,
1447                                  globalClusterEnergy,
1448                                  globalRhFracSum,
1449                                  globalSeeds,
1450                                  globalRechits);
1451         }
1452         alpaka::syncBlockThreads(acc);  // all threads call sync
1453       }
1454     }
1455   };
1456 
1457   PFClusterProducerKernel::PFClusterProducerKernel(Queue& queue, const reco::PFRecHitHostCollection& pfRecHits)
1458       : nSeeds(cms::alpakatools::make_device_buffer<uint32_t>(queue)),
1459         globalClusterPos(
1460             cms::alpakatools::make_device_buffer<Position4[]>(queue, blocksForExoticClusters * maxTopoInput)),
1461         globalPrevClusterPos(
1462             cms::alpakatools::make_device_buffer<Position4[]>(queue, blocksForExoticClusters * maxTopoInput)),
1463         globalClusterEnergy(
1464             cms::alpakatools::make_device_buffer<float[]>(queue, blocksForExoticClusters * maxTopoInput)),
1465         globalRhFracSum(cms::alpakatools::make_device_buffer<float[]>(queue, blocksForExoticClusters * maxTopoInput)),
1466         globalSeeds(cms::alpakatools::make_device_buffer<int[]>(queue, blocksForExoticClusters * maxTopoInput)),
1467         globalRechits(cms::alpakatools::make_device_buffer<int[]>(queue, blocksForExoticClusters * maxTopoInput)) {
1468     alpaka::memset(queue, nSeeds, 0x00);  // Reset nSeeds
1469   }
1470 
1471   void PFClusterProducerKernel::seedTopoAndContract(Queue& queue,
1472                                                     const reco::PFClusterParamsDeviceCollection& params,
1473                                                     const reco::PFRecHitHCALTopologyDeviceCollection& topology,
1474                                                     reco::PFClusteringVarsDeviceCollection& pfClusteringVars,
1475                                                     reco::PFClusteringEdgeVarsDeviceCollection& pfClusteringEdgeVars,
1476                                                     const reco::PFRecHitHostCollection& pfRecHits,
1477                                                     reco::PFClusterDeviceCollection& pfClusters,
1478                                                     uint32_t* __restrict__ nRHF) {
1479     const int nRH = pfRecHits->size();
1480     const int threadsPerBlock = 256;
1481     const int blocks = divide_up_by(nRH, threadsPerBlock);
1482 
1483     // seedingTopoThresh
1484     alpaka::exec<Acc1D>(queue,
1485                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1486                         SeedingTopoThresh{},
1487                         pfClusteringVars.view(),
1488                         params.view(),
1489                         topology.view(),
1490                         pfRecHits.view(),
1491                         pfClusters.view(),
1492                         nSeeds.data());
1493     // prepareTopoInputs
1494     alpaka::exec<Acc1D>(queue,
1495                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1496                         PrepareTopoInputs{},
1497                         pfRecHits.view(),
1498                         pfClusteringVars.view(),
1499                         pfClusteringEdgeVars.view(),
1500                         nSeeds.data());
1501     // ECLCC
1502     alpaka::exec<Acc1D>(queue,
1503                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1504                         ECLCCInit{},
1505                         pfRecHits.view(),
1506                         pfClusteringVars.view(),
1507                         pfClusteringEdgeVars.view());
1508     alpaka::exec<Acc1D>(queue,
1509                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1510                         ECLCCCompute1{},
1511                         pfRecHits.view(),
1512                         pfClusteringVars.view(),
1513                         pfClusteringEdgeVars.view());
1514     alpaka::exec<Acc1D>(queue,
1515                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1516                         ECLCCFlatten{},
1517                         pfRecHits.view(),
1518                         pfClusteringVars.view(),
1519                         pfClusteringEdgeVars.view());
1520     // topoClusterContraction
1521     alpaka::exec<Acc1D>(queue,
1522                         make_workdiv<Acc1D>(1, threadsPerBlockForClustering),
1523                         TopoClusterContraction{},
1524                         pfRecHits.view(),
1525                         pfClusteringVars.view(),
1526                         pfClusters.view(),
1527                         nSeeds.data(),
1528                         nRHF);
1529   }
1530 
1531   void PFClusterProducerKernel::cluster(Queue& queue,
1532                                         const reco::PFClusterParamsDeviceCollection& params,
1533                                         const reco::PFRecHitHCALTopologyDeviceCollection& topology,
1534                                         reco::PFClusteringVarsDeviceCollection& pfClusteringVars,
1535                                         reco::PFClusteringEdgeVarsDeviceCollection& pfClusteringEdgeVars,
1536                                         const reco::PFRecHitHostCollection& pfRecHits,
1537                                         reco::PFClusterDeviceCollection& pfClusters,
1538                                         reco::PFRecHitFractionDeviceCollection& pfrhFractions) {
1539     const int nRH = pfRecHits->size();
1540 
1541     // fillRhfIndex
1542     alpaka::exec<Acc2D>(queue,
1543                         make_workdiv<Acc2D>({divide_up_by(nRH, 32), divide_up_by(nRH, 32)}, {32, 32}),
1544                         FillRhfIndex{},
1545                         pfRecHits.view(),
1546                         pfClusteringVars.view(),
1547                         pfrhFractions.view());
1548 
1549     // Run fastCluster
1550     alpaka::exec<Acc1D>(queue,
1551                         make_workdiv<Acc1D>(nRH, threadsPerBlockForClustering),
1552                         FastCluster{},
1553                         pfRecHits.view(),
1554                         params.view(),
1555                         topology.view(),
1556                         pfClusteringVars.view(),
1557                         pfClusters.view(),
1558                         pfrhFractions.view());
1559     // exotic clustering kernel
1560     alpaka::exec<Acc1D>(queue,
1561                         make_workdiv<Acc1D>(blocksForExoticClusters,
1562                                             threadsPerBlockForClustering),  // uses 4 blocks to minimize memory usage
1563                         FastClusterExotic{},
1564                         pfRecHits.view(),
1565                         params.view(),
1566                         topology.view(),
1567                         pfClusteringVars.view(),
1568                         pfClusters.view(),
1569                         pfrhFractions.view(),
1570                         globalClusterPos.data(),
1571                         globalPrevClusterPos.data(),
1572                         globalClusterEnergy.data(),
1573                         globalRhFracSum.data(),
1574                         globalSeeds.data(),
1575                         globalRechits.data());
1576   }
1577 
1578 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE