Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-22 07:34:27

0001 #include <alpaka/alpaka.hpp>
0002 
0003 #include "DataFormats/ParticleFlowReco/interface/PFLayer.h"
0004 #include "FWCore/Utilities/interface/bit_cast.h"
0005 #include "HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h"
0006 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0007 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0008 #include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterECLCC.h"
0009 #include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterSoAProducerKernel.h"
0010 
0011 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0012 
0013   using namespace cms::alpakatools;
0014 
0015   using namespace reco::pfClustering;
0016 
0017   static constexpr int threadsPerBlockForClustering = 512;
0018   static constexpr uint32_t blocksForExoticClusters = 4;
0019 
0020   // cutoffFraction -> Is a rechit almost entirely attributed to one cluster
0021   // cutoffDistance -> Is a rechit close enough to a cluster to be associated
0022   // Values are from RecoParticleFlow/PFClusterProducer/plugins/Basic2DGenericPFlowClusterizer.cc
0023   static constexpr float cutoffDistance = 100.;
0024   static constexpr float cutoffFraction = 0.9999;
0025 
0026   static constexpr uint32_t kHBHalf = 1296;
0027   static constexpr uint32_t maxTopoInput = 2 * kHBHalf;
0028 
0029   // Calculation of dR2 for Clustering
0030   ALPAKA_FN_ACC ALPAKA_FN_INLINE static float dR2(Position4 pos1, Position4 pos2) {
0031     float mag1 = sqrtf(pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
0032     float cosTheta1 = mag1 > 0.0 ? pos1.z / mag1 : 1.0;
0033     float eta1 = 0.5f * logf((1.0f + cosTheta1) / (1.0f - cosTheta1));
0034     float phi1 = atan2f(pos1.y, pos1.x);
0035 
0036     float mag2 = sqrtf(pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);
0037     float cosTheta2 = mag2 > 0.0 ? pos2.z / mag2 : 1.0;
0038     float eta2 = 0.5f * logf((1.0f + cosTheta2) / (1.0f - cosTheta2));
0039     float phi2 = atan2f(pos2.y, pos2.x);
0040 
0041     float deta = eta2 - eta1;
0042     constexpr const float fPI = M_PI;
0043     float dphi = std::abs(std::abs(phi2 - phi1) - fPI) - fPI;
0044     return (deta * deta + dphi * dphi);
0045   }
0046 
0047   // Get index of seed
0048   ALPAKA_FN_ACC static auto getSeedRhIdx(int* seeds, int seedNum) { return seeds[seedNum]; }
0049 
0050   // Get rechit fraction of a given rechit for a given seed
0051   ALPAKA_FN_ACC static auto getRhFrac(reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0052                                       int topoSeedBegin,
0053                                       reco::PFRecHitFractionDeviceCollection::View fracView,
0054                                       int seedNum,
0055                                       int rhNum) {
0056     int seedIdx = pfClusteringVars[topoSeedBegin + seedNum].topoSeedList();
0057     return fracView[pfClusteringVars[seedIdx].seedFracOffsets() + rhNum].frac();
0058   }
0059 
0060   // Cluster position calculation
0061   template <bool debug = false>
0062   ALPAKA_FN_ACC static void updateClusterPos(::reco::PFClusterParamsSoA::ConstView pfClusParams,
0063                                              Position4& pos4,
0064                                              float frac,
0065                                              int rhInd,
0066                                              reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0067                                              float rhENormInv) {
0068     Position4 rechitPos = Position4{pfRecHits[rhInd].x(), pfRecHits[rhInd].y(), pfRecHits[rhInd].z(), 1.0};
0069     const auto rh_energy = pfRecHits[rhInd].energy() * frac;
0070     const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : std::max(0.0f, logf(rh_energy * rhENormInv)));
0071     if constexpr (debug)
0072       printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
0073              rhInd,
0074              norm,
0075              frac,
0076              rh_energy,
0077              rechitPos.x,
0078              rechitPos.y,
0079              rechitPos.z);
0080 
0081     pos4.x += rechitPos.x * norm;
0082     pos4.y += rechitPos.y * norm;
0083     pos4.z += rechitPos.z * norm;
0084     pos4.w += norm;  //  position_norm
0085   }
0086 
0087   // Processing single seed clusters
0088   // Device function designed to be called by all threads of a given block
0089   template <bool debug = false>
0090   ALPAKA_FN_ACC static void hcalFastCluster_singleSeed(
0091       const Acc1D& acc,
0092       ::reco::PFClusterParamsSoA::ConstView pfClusParams,
0093       const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0094       int topoId,   // from selection
0095       int nRHTopo,  // from selection
0096       reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0097       reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0098       reco::PFClusterDeviceCollection::View clusterView,
0099       reco::PFRecHitFractionDeviceCollection::View fracView) {
0100     int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];  // thread index is rechit number
0101     // Declaration of shared variables
0102     int& i = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0103     int& nRHOther = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0104     unsigned int& iter = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0105     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0106     float& clusterEnergy = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0107     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0108     float& seedEnergy = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0109     Position4& clusterPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
0110     Position4& prevClusterPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
0111     Position4& seedPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
0112     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0113     if (once_per_block(acc)) {
0114       i = pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();  // i is the seed rechit index
0115       nRHOther = nRHTopo - 1;                                                           // number of non-seed rechits
0116       seedPos = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.};
0117       clusterPos = seedPos;  // Initial cluster position is just the seed
0118       prevClusterPos = seedPos;
0119       seedEnergy = pfRecHits[i].energy();
0120       clusterEnergy = seedEnergy;
0121       tol = pfClusParams.stoppingTolerance();  // stopping tolerance * tolerance scaling
0122 
0123       if (topology.cutsFromDB()) {
0124         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0125       } else {
0126         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0127           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0128         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0129           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0130         else {
0131           rhENormInv = 0.;
0132           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0133         }
0134       }
0135 
0136       iter = 0;
0137       notDone = true;
0138     }
0139     alpaka::syncBlockThreads(acc);  // all threads call sync
0140 
0141     int j = -1;  // j is the rechit index for this thread
0142     int rhFracOffset = -1;
0143     Position4 rhPos;
0144     float rhEnergy = -1., rhPosNorm = -1.;
0145 
0146     if (tid < nRHOther) {
0147       rhFracOffset =
0148           pfClusteringVars[i].seedFracOffsets() + tid + 1;  // Offset for this rechit in pcrhfrac, pcrhfracidx arrays
0149       j = fracView[rhFracOffset].pfrhIdx();                 // rechit index for this thread
0150       rhPos = Position4{pfRecHits[j].x(), pfRecHits[j].y(), pfRecHits[j].z(), 1.};
0151       rhEnergy = pfRecHits[j].energy();
0152       rhPosNorm = fmaxf(0., logf(rhEnergy * rhENormInv));
0153     }
0154     alpaka::syncBlockThreads(acc);  // all threads call sync
0155 
0156     do {
0157       if constexpr (debug) {
0158         if (once_per_block(acc))
0159           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0160       }
0161       float dist2 = -1., d2 = -1., fraction = -1.;
0162       if (tid < nRHOther) {
0163         // Rechit distance calculation
0164         dist2 = (clusterPos.x - rhPos.x) * (clusterPos.x - rhPos.x) +
0165                 (clusterPos.y - rhPos.y) * (clusterPos.y - rhPos.y) +
0166                 (clusterPos.z - rhPos.z) * (clusterPos.z - rhPos.z);
0167 
0168         d2 = dist2 / pfClusParams.showerSigma2();
0169         fraction = clusterEnergy * rhENormInv * expf(-0.5f * d2);
0170 
0171         // For single seed clusters, rechit fraction is either 1 (100%) or -1 (not included)
0172         if (fraction > pfClusParams.minFracTot() && d2 < cutoffDistance)
0173           fraction = 1.;
0174         else
0175           fraction = -1.;
0176         fracView[rhFracOffset].frac() = fraction;
0177       }
0178       alpaka::syncBlockThreads(acc);  // all threads call sync
0179 
0180       if constexpr (debug) {
0181         if (once_per_block(acc))
0182           printf("Computing cluster position for topoId %d\n", topoId);
0183       }
0184 
0185       if (once_per_block(acc)) {
0186         // Reset cluster position and energy
0187         clusterPos = seedPos;
0188         clusterEnergy = seedEnergy;
0189       }
0190       alpaka::syncBlockThreads(acc);  // all threads call sync
0191 
0192       // Recalculate cluster position and energy
0193       if (fraction > -0.5) {
0194         alpaka::atomicAdd(acc, &clusterEnergy, rhEnergy, alpaka::hierarchy::Threads{});
0195         alpaka::atomicAdd(acc, &clusterPos.x, rhPos.x * rhPosNorm, alpaka::hierarchy::Threads{});
0196         alpaka::atomicAdd(acc, &clusterPos.y, rhPos.y * rhPosNorm, alpaka::hierarchy::Threads{});
0197         alpaka::atomicAdd(acc, &clusterPos.z, rhPos.z * rhPosNorm, alpaka::hierarchy::Threads{});
0198         alpaka::atomicAdd(acc, &clusterPos.w, rhPosNorm, alpaka::hierarchy::Threads{});  // position_norm
0199       }
0200       alpaka::syncBlockThreads(acc);  // all threads call sync
0201 
0202       if (once_per_block(acc)) {
0203         // Normalize the seed postiion
0204         if (clusterPos.w >= pfClusParams.minAllowedNormalization()) {
0205           // Divide by position norm
0206           clusterPos.x /= clusterPos.w;
0207           clusterPos.y /= clusterPos.w;
0208           clusterPos.z /= clusterPos.w;
0209 
0210           if constexpr (debug)
0211             printf("\tPF cluster (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
0212                    i,
0213                    clusterEnergy,
0214                    clusterPos.x,
0215                    clusterPos.y,
0216                    clusterPos.z);
0217         } else {
0218           if constexpr (debug)
0219             printf("\tPF cluster (seed %d) position norm (%f) less than minimum (%f)\n",
0220                    i,
0221                    clusterPos.w,
0222                    pfClusParams.minAllowedNormalization());
0223           clusterPos.x = 0.;
0224           clusterPos.y = 0.;
0225           clusterPos.z = 0.;
0226         }
0227         float diff2 = dR2(prevClusterPos, clusterPos);
0228         if constexpr (debug)
0229           printf("\tPF cluster (seed %d) has diff2 = %f\n", i, diff2);
0230         prevClusterPos = clusterPos;  // Save clusterPos
0231 
0232         float tol2 = tol * tol;
0233         iter++;
0234         notDone = (diff2 > tol2) && (iter < pfClusParams.maxIterations());
0235         if constexpr (debug) {
0236           if (diff2 > tol2)
0237             printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
0238           else if constexpr (debug)
0239             printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
0240         }
0241       }
0242       alpaka::syncBlockThreads(acc);  // all threads call sync
0243     } while (notDone);  // shared variable condition ensures synchronization is well defined
0244     if (once_per_block(acc)) {  // Cluster is finalized, assign cluster information to te SoA
0245       int rhIdx =
0246           pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();  // i is the seed rechit index
0247       int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
0248       clusterView[seedIdx].energy() = clusterEnergy;
0249       clusterView[seedIdx].x() = clusterPos.x;
0250       clusterView[seedIdx].y() = clusterPos.y;
0251       clusterView[seedIdx].z() = clusterPos.z;
0252     }
0253   }
0254 
0255   // Processing clusters up to 100 seeds and 512 non-seed rechits using shared memory accesses
0256   // Device function designed to be called by all threads of a given block
0257   template <bool debug = false>
0258   ALPAKA_FN_ACC static void hcalFastCluster_multiSeedParallel(
0259       const Acc1D& acc,
0260       ::reco::PFClusterParamsSoA::ConstView pfClusParams,
0261       const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0262       int topoId,   // from selection
0263       int nSeeds,   // from selection
0264       int nRHTopo,  // from selection
0265       reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0266       reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0267       reco::PFClusterDeviceCollection::View clusterView,
0268       reco::PFRecHitFractionDeviceCollection::View fracView) {
0269     int tid = alpaka::getIdx<alpaka::Block,
0270                              alpaka::Threads>(  // Thread index corresponds to a single rechit of the topo cluster
0271         acc)[0u];
0272 
0273     int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0274     int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0275     int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0276     int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0277     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0278     float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0279     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0280     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0281     auto& clusterPos = alpaka::declareSharedVar<Position4[100], __COUNTER__>(acc);
0282     auto& prevClusterPos = alpaka::declareSharedVar<Position4[100], __COUNTER__>(acc);
0283     auto& clusterEnergy = alpaka::declareSharedVar<float[100], __COUNTER__>(acc);
0284     auto& rhFracSum = alpaka::declareSharedVar<float[threadsPerBlockForClustering], __COUNTER__>(acc);
0285     auto& seeds = alpaka::declareSharedVar<int[100], __COUNTER__>(acc);
0286     auto& rechits = alpaka::declareSharedVar<int[threadsPerBlockForClustering], __COUNTER__>(acc);
0287 
0288     if (once_per_block(acc)) {
0289       nRHNotSeed = nRHTopo - nSeeds + 1;  // 1 + (# rechits per topoId that are NOT seeds)
0290       topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
0291       tol = pfClusParams.stoppingTolerance() *
0292             powf(fmaxf(1.0, nSeeds - 1), 2.0);  // stopping tolerance * tolerance scaling
0293       stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0294       iter = 0;
0295       notDone = true;
0296 
0297       int i = pfClusteringVars[topoSeedBegin].topoSeedList();
0298 
0299       if (topology.cutsFromDB()) {
0300         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0301       } else {
0302         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0303           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0304         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0305           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0306         else {
0307           rhENormInv = 0.;
0308           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0309         }
0310       }
0311     }
0312     alpaka::syncBlockThreads(acc);  // all threads call sync
0313 
0314     if (tid < nSeeds)
0315       seeds[tid] = pfClusteringVars[topoSeedBegin + tid].topoSeedList();
0316     if (tid < nRHNotSeed - 1)
0317       rechits[tid] =
0318           fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + tid + 1]
0319               .pfrhIdx();
0320 
0321     alpaka::syncBlockThreads(acc);  // all threads call sync
0322 
0323     if constexpr (debug) {
0324       if (once_per_block(acc)) {
0325         printf("\n===========================================================================================\n");
0326         printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
0327         for (int s = 0; s < nSeeds; s++) {
0328           if (s != 0)
0329             printf(", ");
0330           printf("%d", getSeedRhIdx(seeds, s));
0331         }
0332         if (nRHTopo == nSeeds) {
0333           printf(")\n\n");
0334         } else {
0335           printf(") and other rechits (");
0336           for (int r = 1; r < nRHNotSeed; r++) {
0337             if (r != 1)
0338               printf(", ");
0339             if (r <= 0) {
0340               printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
0341             }
0342             printf("%d", rechits[r - 1]);
0343           }
0344           printf(")\n\n");
0345         }
0346       }
0347       alpaka::syncBlockThreads(acc);  // all (or none) threads call sync
0348     }
0349 
0350     // Set initial cluster position (energy) to seed rechit position (energy)
0351     if (tid < nSeeds) {
0352       int i = getSeedRhIdx(seeds, tid);
0353       clusterPos[tid] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
0354       prevClusterPos[tid] = clusterPos[tid];
0355       clusterEnergy[tid] = pfRecHits[i].energy();
0356       for (int r = 0; r < (nRHNotSeed - 1); r++) {
0357         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
0358         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
0359       }
0360     }
0361     alpaka::syncBlockThreads(acc);  // all threads call sync
0362 
0363     int rhThreadIdx = -1;
0364     Position4 rhThreadPos;
0365     if (tid < (nRHNotSeed - 1)) {
0366       rhThreadIdx = rechits[tid];  // Index when thread represents rechit
0367       rhThreadPos = Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0368     }
0369 
0370     // Neighbors when threadIdx represents seed
0371     int seedThreadIdx = -1;
0372     Neighbours4 seedNeighbors = Neighbours4{-9, -9, -9, -9};
0373     float seedEnergy = -1.;
0374     Position4 seedInitClusterPos = Position4{0., 0., 0., 0.};
0375     if (tid < nSeeds) {
0376       if constexpr (debug)
0377         printf("tid: %d\n", tid);
0378       seedThreadIdx = getSeedRhIdx(seeds, tid);
0379       seedNeighbors = Neighbours4{pfRecHits[seedThreadIdx].neighbours()(0),
0380                                   pfRecHits[seedThreadIdx].neighbours()(1),
0381                                   pfRecHits[seedThreadIdx].neighbours()(2),
0382                                   pfRecHits[seedThreadIdx].neighbours()(3)};
0383       seedEnergy = pfRecHits[seedThreadIdx].energy();
0384 
0385       // Compute initial cluster position shift for seed
0386       updateClusterPos(pfClusParams, seedInitClusterPos, 1., seedThreadIdx, pfRecHits, rhENormInv);
0387     }
0388 
0389     do {
0390       if constexpr (debug) {
0391         if (once_per_block(acc))
0392           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0393       }
0394 
0395       // Reset rhFracSum
0396       rhFracSum[tid] = 0.;
0397       if (once_per_block(acc))
0398         diff2 = -1;
0399 
0400       if (tid < (nRHNotSeed - 1)) {
0401         for (int s = 0; s < nSeeds; s++) {
0402           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0403                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0404                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0405 
0406           float d2 = dist2 / pfClusParams.showerSigma2();
0407           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0408 
0409           rhFracSum[tid] += fraction;
0410         }
0411       }
0412       alpaka::syncBlockThreads(acc);  // all threads call sync
0413 
0414       if (tid < (nRHNotSeed - 1)) {
0415         for (int s = 0; s < nSeeds; s++) {
0416           int i = seeds[s];
0417           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0418                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0419                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0420 
0421           float d2 = dist2 / pfClusParams.showerSigma2();
0422           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0423 
0424           if (rhFracSum[tid] > pfClusParams.minFracTot()) {
0425             float fracpct = fraction / rhFracSum[tid];
0426             if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
0427               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
0428             } else {
0429               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0430             }
0431           } else {
0432             fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0433           }
0434         }
0435       }
0436       alpaka::syncBlockThreads(acc);  // all threads call sync
0437 
0438       if constexpr (debug) {
0439         if (once_per_block(acc))
0440           printf("Computing cluster position for topoId %d\n", topoId);
0441       }
0442 
0443       // Reset cluster position and energy
0444       if (tid < nSeeds) {
0445         clusterPos[tid] = seedInitClusterPos;
0446         clusterEnergy[tid] = seedEnergy;
0447         if constexpr (debug) {
0448           printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
0449                  tid,
0450                  seeds[tid],
0451                  clusterEnergy[tid],
0452                  clusterPos[tid].x,
0453                  clusterPos[tid].y,
0454                  clusterPos[tid].z,
0455                  clusterPos[tid].w);
0456         }
0457       }
0458       alpaka::syncBlockThreads(acc);  // all threads call sync
0459 
0460       // Recalculate position
0461       if (tid < nSeeds) {
0462         for (int r = 0; r < nRHNotSeed - 1; r++) {
0463           int j = rechits[r];
0464           float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, tid, r + 1);
0465 
0466           if (frac > -0.5) {
0467             clusterEnergy[tid] += frac * pfRecHits[j].energy();
0468 
0469             if (nSeeds == 1 || j == seedNeighbors.x || j == seedNeighbors.y || j == seedNeighbors.z ||
0470                 j == seedNeighbors.w)
0471               updateClusterPos(pfClusParams, clusterPos[tid], frac, j, pfRecHits, rhENormInv);
0472           }
0473         }
0474       }
0475       alpaka::syncBlockThreads(acc);  // all threads call sync
0476 
0477       // Position normalization
0478       if (tid < nSeeds) {
0479         if (clusterPos[tid].w >= pfClusParams.minAllowedNormalization()) {
0480           // Divide by position norm
0481           clusterPos[tid].x /= clusterPos[tid].w;
0482           clusterPos[tid].y /= clusterPos[tid].w;
0483           clusterPos[tid].z /= clusterPos[tid].w;
0484 
0485           if constexpr (debug)
0486             printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
0487                    tid,
0488                    seedThreadIdx,
0489                    clusterEnergy[tid],
0490                    clusterPos[tid].x,
0491                    clusterPos[tid].y,
0492                    clusterPos[tid].z);
0493         } else {
0494           if constexpr (debug)
0495             printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
0496                    tid,
0497                    seedThreadIdx,
0498                    clusterPos[tid].w,
0499                    pfClusParams.minAllowedNormalization());
0500           clusterPos[tid].x = 0.0;
0501           clusterPos[tid].y = 0.0;
0502           clusterPos[tid].z = 0.0;
0503         }
0504       }
0505       alpaka::syncBlockThreads(acc);  // all threads call sync
0506 
0507       if (tid < nSeeds) {
0508         float delta2 = dR2(prevClusterPos[tid], clusterPos[tid]);
0509         if constexpr (debug)
0510           printf("\tCluster %d (seed %d) has delta2 = %f\n", tid, seeds[tid], delta2);
0511         atomicMaxF(acc, &diff2, delta2);
0512         prevClusterPos[tid] = clusterPos[tid];  // Save clusterPos
0513       }
0514       alpaka::syncBlockThreads(acc);  // all threads call sync
0515 
0516       if (once_per_block(acc)) {
0517         float tol2 = tol * tol;
0518         iter++;
0519         notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
0520         if constexpr (debug) {
0521           if (diff2 > tol2)
0522             printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
0523           else if constexpr (debug)
0524             printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
0525         }
0526       }
0527       alpaka::syncBlockThreads(acc);  // all threads call sync
0528     } while (notDone);  // shared variable condition ensures synchronization is well defined
0529     if (once_per_block(acc))
0530       // Fill PFCluster-level info
0531       if (tid < nSeeds) {
0532         int rhIdx = pfClusteringVars[tid + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
0533         int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
0534         clusterView[seedIdx].energy() = clusterEnergy[tid];
0535         clusterView[seedIdx].x() = clusterPos[tid].x;
0536         clusterView[seedIdx].y() = clusterPos[tid].y;
0537         clusterView[seedIdx].z() = clusterPos[tid].z;
0538       }
0539   }
0540 
0541   // Process very large exotic clusters, from nSeeds > 400 and non-seeds > 1500
0542   // Uses global memory access
0543   // Device function designed to be called by all threads of a given block
0544   template <bool debug = false>
0545   ALPAKA_FN_ACC static void hcalFastCluster_exotic(const Acc1D& acc,
0546                                                    ::reco::PFClusterParamsSoA::ConstView pfClusParams,
0547                                                    const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0548                                                    int topoId,
0549                                                    int nSeeds,
0550                                                    int nRHTopo,
0551                                                    reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0552                                                    reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0553                                                    reco::PFClusterDeviceCollection::View clusterView,
0554                                                    reco::PFRecHitFractionDeviceCollection::View fracView,
0555                                                    Position4* __restrict__ globalClusterPos,
0556                                                    Position4* __restrict__ globalPrevClusterPos,
0557                                                    float* __restrict__ globalClusterEnergy,
0558                                                    float* __restrict__ globalRhFracSum,
0559                                                    int* __restrict__ globalSeeds,
0560                                                    int* __restrict__ globalRechits) {
0561     int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0562     int& blockIdx = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0563     int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0564     int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0565     int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0566     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0567     float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0568     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0569     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0570 
0571     blockIdx = maxTopoInput * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
0572     Position4* clusterPos = globalClusterPos + blockIdx;
0573     Position4* prevClusterPos = globalPrevClusterPos + blockIdx;
0574     float* clusterEnergy = globalClusterEnergy + blockIdx;
0575     float* rhFracSum = globalRhFracSum + blockIdx;
0576     int* seeds = globalSeeds + blockIdx;
0577     int* rechits = globalRechits + blockIdx;
0578 
0579     if (once_per_block(acc)) {
0580       nRHNotSeed = nRHTopo - nSeeds + 1;  // 1 + (# rechits per topoId that are NOT seeds)
0581       topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
0582       tol = pfClusParams.stoppingTolerance() *
0583             powf(fmaxf(1.0, nSeeds - 1), 2.0);  // stopping tolerance * tolerance scaling
0584       stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0585       iter = 0;
0586       notDone = true;
0587 
0588       int i = pfClusteringVars[topoSeedBegin].topoSeedList();
0589 
0590       if (topology.cutsFromDB()) {
0591         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0592       } else {
0593         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0594           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0595         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0596           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0597         else {
0598           rhENormInv = 0.;
0599           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0600         }
0601       }
0602     }
0603     alpaka::syncBlockThreads(acc);  // all threads call sync
0604 
0605     for (int n = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; n < nRHTopo; n += stride) {
0606       if (n < nSeeds)
0607         seeds[n] = pfClusteringVars[topoSeedBegin + n].topoSeedList();
0608       if (n < nRHNotSeed - 1)
0609         rechits[n] =
0610             fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + n + 1]
0611                 .pfrhIdx();
0612     }
0613     alpaka::syncBlockThreads(acc);  // all threads call sync
0614 
0615     if constexpr (debug) {
0616       if (once_per_block(acc)) {
0617         printf("\n===========================================================================================\n");
0618         printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
0619         for (int s = 0; s < nSeeds; s++) {
0620           if (s != 0)
0621             printf(", ");
0622           printf("%d", getSeedRhIdx(seeds, s));
0623         }
0624         if (nRHTopo == nSeeds) {
0625           printf(")\n\n");
0626         } else {
0627           printf(") and other rechits (");
0628           for (int r = 1; r < nRHNotSeed; r++) {
0629             if (r != 1)
0630               printf(", ");
0631             if (r <= 0) {
0632               printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
0633             }
0634             printf("%d", rechits[r - 1]);
0635           }
0636           printf(")\n\n");
0637         }
0638       }
0639       alpaka::syncBlockThreads(acc);  // all (or none) threads call sync
0640     }
0641 
0642     // Set initial cluster position (energy) to seed rechit position (energy)
0643     for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0644       int i = seeds[s];
0645       clusterPos[s] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
0646       prevClusterPos[s] = clusterPos[s];
0647       clusterEnergy[s] = pfRecHits[i].energy();
0648       for (int r = 0; r < (nRHNotSeed - 1); r++) {
0649         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
0650         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
0651       }
0652     }
0653     alpaka::syncBlockThreads(acc);  // all threads call sync
0654 
0655     do {
0656       if constexpr (debug) {
0657         if (once_per_block(acc))
0658           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0659       }
0660 
0661       if (once_per_block(acc))
0662         diff2 = -1;
0663       // Reset rhFracSum
0664       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0665         rhFracSum[tid] = 0.;
0666         int rhThreadIdx = rechits[tid];
0667         Position4 rhThreadPos =
0668             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0669         for (int s = 0; s < nSeeds; s++) {
0670           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0671                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0672                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0673 
0674           float d2 = dist2 / pfClusParams.showerSigma2();
0675           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0676 
0677           rhFracSum[tid] += fraction;
0678         }
0679       }
0680       alpaka::syncBlockThreads(acc);  // all threads call sync
0681 
0682       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0683         int rhThreadIdx = rechits[tid];
0684         Position4 rhThreadPos =
0685             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0686         for (int s = 0; s < nSeeds; s++) {
0687           int i = seeds[s];
0688           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0689                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0690                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0691 
0692           float d2 = dist2 / pfClusParams.showerSigma2();
0693           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0694 
0695           if (rhFracSum[tid] > pfClusParams.minFracTot()) {
0696             float fracpct = fraction / rhFracSum[tid];
0697             if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
0698               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
0699             } else {
0700               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0701             }
0702           } else {
0703             fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0704           }
0705         }
0706       }
0707       alpaka::syncBlockThreads(acc);  // all threads call sync
0708 
0709       if constexpr (debug) {
0710         if (once_per_block(acc))
0711           printf("Computing cluster position for topoId %d\n", topoId);
0712       }
0713 
0714       // Reset cluster position and energy
0715       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0716         int seedRhIdx = getSeedRhIdx(seeds, s);
0717         float norm = logf(pfRecHits[seedRhIdx].energy() * rhENormInv);
0718         clusterPos[s] = Position4{
0719             pfRecHits[seedRhIdx].x() * norm, pfRecHits[seedRhIdx].y() * norm, pfRecHits[seedRhIdx].z() * norm, norm};
0720         clusterEnergy[s] = pfRecHits[seedRhIdx].energy();
0721         if constexpr (debug) {
0722           printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
0723                  s,
0724                  seeds[s],
0725                  clusterEnergy[s],
0726                  clusterPos[s].x,
0727                  clusterPos[s].y,
0728                  clusterPos[s].z,
0729                  clusterPos[s].w);
0730         }
0731       }
0732       alpaka::syncBlockThreads(acc);  // all threads call sync
0733 
0734       // Recalculate position
0735       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0736         int seedRhIdx = getSeedRhIdx(seeds, s);
0737         for (int r = 0; r < nRHNotSeed - 1; r++) {
0738           int j = rechits[r];
0739           float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, s, r + 1);
0740 
0741           if (frac > -0.5) {
0742             clusterEnergy[s] += frac * pfRecHits[j].energy();
0743 
0744             if (nSeeds == 1 || j == pfRecHits[seedRhIdx].neighbours()(0) || j == pfRecHits[seedRhIdx].neighbours()(1) ||
0745                 j == pfRecHits[seedRhIdx].neighbours()(2) || j == pfRecHits[seedRhIdx].neighbours()(3))
0746               updateClusterPos(pfClusParams, clusterPos[s], frac, j, pfRecHits, rhENormInv);
0747           }
0748         }
0749       }
0750       alpaka::syncBlockThreads(acc);  // all threads call sync
0751 
0752       // Position normalization
0753       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0754         if (clusterPos[s].w >= pfClusParams.minAllowedNormalization()) {
0755           // Divide by position norm
0756           clusterPos[s].x /= clusterPos[s].w;
0757           clusterPos[s].y /= clusterPos[s].w;
0758           clusterPos[s].z /= clusterPos[s].w;
0759 
0760           if constexpr (debug)
0761             printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
0762                    s,
0763                    seeds[s],
0764                    clusterEnergy[s],
0765                    clusterPos[s].x,
0766                    clusterPos[s].y,
0767                    clusterPos[s].z);
0768         } else {
0769           if constexpr (debug)
0770             printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
0771                    s,
0772                    seeds[s],
0773                    clusterPos[s].w,
0774                    pfClusParams.minAllowedNormalization());
0775           clusterPos[s].x = 0.0;
0776           clusterPos[s].y = 0.0;
0777           clusterPos[s].z = 0.0;
0778         }
0779       }
0780       alpaka::syncBlockThreads(acc);  // all threads call sync
0781 
0782       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0783         float delta2 = dR2(prevClusterPos[s], clusterPos[s]);
0784         if constexpr (debug)
0785           printf("\tCluster %d (seed %d) has delta2 = %f\n", s, seeds[s], delta2);
0786         atomicMaxF(acc, &diff2, delta2);
0787         prevClusterPos[s] = clusterPos[s];  // Save clusterPos
0788       }
0789       alpaka::syncBlockThreads(acc);  // all threads call sync
0790 
0791       if (once_per_block(acc)) {
0792         float tol2 = tol * tol;
0793         iter++;
0794         notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
0795         if constexpr (debug) {
0796           if (diff2 > tol2)
0797             printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
0798           else if constexpr (debug)
0799             printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
0800         }
0801       }
0802       alpaka::syncBlockThreads(acc);  // all threads call sync
0803     } while (notDone);  // shared variable ensures synchronization is well defined
0804     if (once_per_block(acc))
0805       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0806         int rhIdx = pfClusteringVars[s + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
0807         int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
0808         clusterView[seedIdx].energy() = pfRecHits[s].energy();
0809         clusterView[seedIdx].x() = pfRecHits[s].x();
0810         clusterView[seedIdx].y() = pfRecHits[s].y();
0811         clusterView[seedIdx].z() = pfRecHits[s].z();
0812       }
0813     alpaka::syncBlockThreads(acc);  // all threads call sync
0814   }
0815 
0816   // Process clusters with up to 400 seeds and 1500 non seeds using shared memory
0817   // Device function designed to be called by all threads of a given block
0818   template <bool debug = false>
0819   ALPAKA_FN_ACC static void hcalFastCluster_multiSeedIterative(
0820       const Acc1D& acc,
0821       ::reco::PFClusterParamsSoA::ConstView pfClusParams,
0822       const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
0823       int topoId,
0824       int nSeeds,
0825       int nRHTopo,
0826       reco::PFRecHitDeviceCollection::ConstView pfRecHits,
0827       reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
0828       reco::PFClusterDeviceCollection::View clusterView,
0829       reco::PFRecHitFractionDeviceCollection::View fracView) {
0830     int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0831     int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0832     int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0833     int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0834     float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0835     float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0836     float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
0837     bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
0838 
0839     auto& clusterPos = alpaka::declareSharedVar<Position4[400], __COUNTER__>(acc);
0840     auto& prevClusterPos = alpaka::declareSharedVar<Position4[400], __COUNTER__>(acc);
0841     auto& clusterEnergy = alpaka::declareSharedVar<float[400], __COUNTER__>(acc);
0842     auto& rhFracSum = alpaka::declareSharedVar<float[1500], __COUNTER__>(acc);
0843     auto& seeds = alpaka::declareSharedVar<int[400], __COUNTER__>(acc);
0844     auto& rechits = alpaka::declareSharedVar<int[1500], __COUNTER__>(acc);
0845 
0846     if (once_per_block(acc)) {
0847       nRHNotSeed = nRHTopo - nSeeds + 1;  // 1 + (# rechits per topoId that are NOT seeds)
0848       topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
0849       tol = pfClusParams.stoppingTolerance() *  // stopping tolerance * tolerance scaling
0850             powf(fmaxf(1.0, nSeeds - 1), 2.0);
0851       stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
0852       iter = 0;
0853       notDone = true;
0854 
0855       int i = pfClusteringVars[topoSeedBegin].topoSeedList();
0856 
0857       if (topology.cutsFromDB()) {
0858         rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
0859       } else {
0860         if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
0861           rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
0862         else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
0863           rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
0864         else {
0865           rhENormInv = 0.;
0866           printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
0867         }
0868       }
0869     }
0870     alpaka::syncBlockThreads(acc);  // all threads call sync
0871 
0872     for (int n = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; n < nRHTopo; n += stride) {
0873       if (n < nSeeds)
0874         seeds[n] = pfClusteringVars[topoSeedBegin + n].topoSeedList();
0875       if (n < nRHNotSeed - 1)
0876         rechits[n] =
0877             fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + n + 1]
0878                 .pfrhIdx();
0879     }
0880     alpaka::syncBlockThreads(acc);  // all threads call sync
0881 
0882     if constexpr (debug) {
0883       if (once_per_block(acc)) {
0884         printf("\n===========================================================================================\n");
0885         printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
0886         for (int s = 0; s < nSeeds; s++) {
0887           if (s != 0)
0888             printf(", ");
0889           printf("%d", getSeedRhIdx(seeds, s));
0890         }
0891         if (nRHTopo == nSeeds) {
0892           printf(")\n\n");
0893         } else {
0894           printf(") and other rechits (");
0895           for (int r = 1; r < nRHNotSeed; r++) {
0896             if (r != 1)
0897               printf(", ");
0898             if (r <= 0) {
0899               printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
0900             }
0901             printf("%d", rechits[r - 1]);
0902           }
0903           printf(")\n\n");
0904         }
0905       }
0906       alpaka::syncBlockThreads(acc);  // all (or none) threads call sync
0907     }
0908 
0909     // Set initial cluster position (energy) to seed rechit position (energy)
0910     for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0911       int i = seeds[s];
0912       clusterPos[s] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
0913       prevClusterPos[s] = clusterPos[s];
0914       clusterEnergy[s] = pfRecHits[i].energy();
0915       for (int r = 0; r < (nRHNotSeed - 1); r++) {
0916         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
0917         fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
0918       }
0919     }
0920     alpaka::syncBlockThreads(acc);  // all threads call sync
0921 
0922     do {
0923       if constexpr (debug) {
0924         if (once_per_block(acc))
0925           printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
0926       }
0927 
0928       if (once_per_block(acc))
0929         diff2 = -1;
0930       // Reset rhFracSum
0931       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0932         rhFracSum[tid] = 0.;
0933         int rhThreadIdx = rechits[tid];
0934         Position4 rhThreadPos =
0935             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0936         for (int s = 0; s < nSeeds; s++) {
0937           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0938                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0939                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0940 
0941           float d2 = dist2 / pfClusParams.showerSigma2();
0942           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0943 
0944           rhFracSum[tid] += fraction;
0945         }
0946       }
0947       alpaka::syncBlockThreads(acc);  // all threads call sync
0948 
0949       for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
0950         int rhThreadIdx = rechits[tid];
0951         Position4 rhThreadPos =
0952             Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
0953         for (int s = 0; s < nSeeds; s++) {
0954           int i = seeds[s];
0955           float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
0956                         (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
0957                         (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
0958 
0959           float d2 = dist2 / pfClusParams.showerSigma2();
0960           float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
0961 
0962           if (rhFracSum[tid] > pfClusParams.minFracTot()) {
0963             float fracpct = fraction / rhFracSum[tid];
0964             if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
0965               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
0966             } else {
0967               fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0968             }
0969           } else {
0970             fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
0971           }
0972         }
0973       }
0974       alpaka::syncBlockThreads(acc);  // all threads call sync
0975 
0976       if constexpr (debug) {
0977         if (once_per_block(acc))
0978           printf("Computing cluster position for topoId %d\n", topoId);
0979       }
0980 
0981       // Reset cluster position and energy
0982       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
0983         int seedRhIdx = getSeedRhIdx(seeds, s);
0984         float norm = logf(pfRecHits[seedRhIdx].energy() * rhENormInv);
0985         clusterPos[s] = Position4{
0986             pfRecHits[seedRhIdx].x() * norm, pfRecHits[seedRhIdx].y() * norm, pfRecHits[seedRhIdx].z() * norm, norm};
0987         clusterEnergy[s] = pfRecHits[seedRhIdx].energy();
0988         if constexpr (debug) {
0989           printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
0990                  s,
0991                  seeds[s],
0992                  clusterEnergy[s],
0993                  clusterPos[s].x,
0994                  clusterPos[s].y,
0995                  clusterPos[s].z,
0996                  clusterPos[s].w);
0997         }
0998       }
0999       alpaka::syncBlockThreads(acc);  // all threads call sync
1000 
1001       // Recalculate position
1002       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1003         int seedRhIdx = getSeedRhIdx(seeds, s);
1004         for (int r = 0; r < nRHNotSeed - 1; r++) {
1005           int j = rechits[r];
1006           float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, s, r + 1);
1007 
1008           if (frac > -0.5) {
1009             clusterEnergy[s] += frac * pfRecHits[j].energy();
1010 
1011             if (nSeeds == 1 || j == pfRecHits[seedRhIdx].neighbours()(0) || j == pfRecHits[seedRhIdx].neighbours()(1) ||
1012                 j == pfRecHits[seedRhIdx].neighbours()(2) || j == pfRecHits[seedRhIdx].neighbours()(3))
1013               updateClusterPos(pfClusParams, clusterPos[s], frac, j, pfRecHits, rhENormInv);
1014           }
1015         }
1016       }
1017       alpaka::syncBlockThreads(acc);  // all threads call sync
1018 
1019       // Position normalization
1020       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1021         if (clusterPos[s].w >= pfClusParams.minAllowedNormalization()) {
1022           // Divide by position norm
1023           clusterPos[s].x /= clusterPos[s].w;
1024           clusterPos[s].y /= clusterPos[s].w;
1025           clusterPos[s].z /= clusterPos[s].w;
1026 
1027           if constexpr (debug)
1028             printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
1029                    s,
1030                    seeds[s],
1031                    clusterEnergy[s],
1032                    clusterPos[s].x,
1033                    clusterPos[s].y,
1034                    clusterPos[s].z);
1035         } else {
1036           if constexpr (debug)
1037             printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
1038                    s,
1039                    seeds[s],
1040                    clusterPos[s].w,
1041                    pfClusParams.minAllowedNormalization());
1042           clusterPos[s].x = 0.0;
1043           clusterPos[s].y = 0.0;
1044           clusterPos[s].z = 0.0;
1045         }
1046       }
1047       alpaka::syncBlockThreads(acc);  // all threads call sync
1048 
1049       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1050         float delta2 = dR2(prevClusterPos[s], clusterPos[s]);
1051         if constexpr (debug)
1052           printf("\tCluster %d (seed %d) has delta2 = %f\n", s, seeds[s], delta2);
1053         atomicMaxF(acc, &diff2, delta2);
1054         prevClusterPos[s] = clusterPos[s];  // Save clusterPos
1055       }
1056       alpaka::syncBlockThreads(acc);  // all threads call sync
1057 
1058       if (once_per_block(acc)) {
1059         float tol2 = tol * tol;
1060         iter++;
1061         notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
1062         if constexpr (debug) {
1063           if (diff2 > tol2)
1064             printf("\tTopoId %d has diff2 = %f greater than tolerance %f (continuing)\n", topoId, diff2, tol2);
1065           else if constexpr (debug)
1066             printf("\tTopoId %d has diff2 = %f LESS than tolerance %f (terminating!)\n", topoId, diff2, tol2);
1067         }
1068       }
1069       alpaka::syncBlockThreads(acc);  // all threads call sync
1070     } while (notDone);  // shared variable ensures synchronization is well defined
1071     if (once_per_block(acc))
1072       for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1073         int rhIdx = pfClusteringVars[s + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
1074         int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
1075         clusterView[seedIdx].energy() = pfRecHits[s].energy();
1076         clusterView[seedIdx].x() = pfRecHits[s].x();
1077         clusterView[seedIdx].y() = pfRecHits[s].y();
1078         clusterView[seedIdx].z() = pfRecHits[s].z();
1079       }
1080   }
1081 
1082   // Seeding using local energy maxima
1083   class SeedingTopoThresh {
1084   public:
1085     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
1086                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1087                                   const ::reco::PFClusterParamsSoA::ConstView pfClusParams,
1088                                   const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1089                                   const reco::PFRecHitDeviceCollection::ConstView pfRecHits,
1090                                   reco::PFClusterDeviceCollection::View clusterView,
1091                                   uint32_t* __restrict__ nSeeds) const {
1092       const int nRH = pfRecHits.size();
1093 
1094       if (once_per_grid(acc)) {
1095         clusterView.size() = nRH;
1096       }
1097 
1098       for (auto i : uniform_elements(acc, nRH)) {
1099         // Initialize arrays
1100         pfClusteringVars[i].pfrh_isSeed() = 0;
1101         pfClusteringVars[i].rhCount() = 0;
1102         pfClusteringVars[i].topoSeedCount() = 0;
1103         pfClusteringVars[i].topoRHCount() = 0;
1104         pfClusteringVars[i].seedFracOffsets() = -1;
1105         pfClusteringVars[i].topoSeedOffsets() = -1;
1106         pfClusteringVars[i].topoSeedList() = -1;
1107         clusterView[i].seedRHIdx() = -1;
1108 
1109         int layer = pfRecHits[i].layer();
1110         int depthOffset = pfRecHits[i].depth() - 1;
1111         float energy = pfRecHits[i].energy();
1112         Position3 pos = Position3{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z()};
1113         float seedThreshold = 9999.;
1114         float topoThreshold = 9999.;
1115 
1116         if (topology.cutsFromDB()) {
1117           seedThreshold = topology[pfRecHits[i].denseId()].seedThreshold();
1118           topoThreshold = topology[pfRecHits[i].denseId()].noiseThreshold();
1119         } else {
1120           if (layer == PFLayer::HCAL_BARREL1) {
1121             seedThreshold = pfClusParams.seedEThresholdHB_vec()[depthOffset];
1122             topoThreshold = pfClusParams.topoEThresholdHB_vec()[depthOffset];
1123           } else if (layer == PFLayer::HCAL_ENDCAP) {
1124             seedThreshold = pfClusParams.seedEThresholdHE_vec()[depthOffset];
1125             topoThreshold = pfClusParams.topoEThresholdHE_vec()[depthOffset];
1126           }
1127         }
1128 
1129         // cmssdt.cern.ch/lxr/source/DataFormats/ParticleFlowReco/interface/PFRecHit.h#0108
1130         float pt2 = energy * energy * (pos.x * pos.x + pos.y * pos.y) / (pos.x * pos.x + pos.y * pos.y + pos.z * pos.z);
1131 
1132         // Seeding threshold test
1133         if ((layer == PFLayer::HCAL_BARREL1 && energy > seedThreshold && pt2 > pfClusParams.seedPt2ThresholdHB()) ||
1134             (layer == PFLayer::HCAL_ENDCAP && energy > seedThreshold && pt2 > pfClusParams.seedPt2ThresholdHE())) {
1135           pfClusteringVars[i].pfrh_isSeed() = 1;
1136           for (int k = 0; k < 4; k++) {  // Does this seed candidate have a higher energy than four neighbours
1137             if (pfRecHits[i].neighbours()(k) < 0)
1138               continue;
1139             if (energy < pfRecHits[pfRecHits[i].neighbours()(k)].energy()) {
1140               pfClusteringVars[i].pfrh_isSeed() = 0;
1141               break;
1142             }
1143           }
1144           if (pfClusteringVars[i].pfrh_isSeed())
1145             alpaka::atomicAdd(acc, nSeeds, 1u);
1146         }
1147         // Topo clustering threshold test
1148 
1149         if ((layer == PFLayer::HCAL_ENDCAP && energy > topoThreshold) ||
1150             (layer == PFLayer::HCAL_BARREL1 && energy > topoThreshold)) {
1151           pfClusteringVars[i].pfrh_passTopoThresh() = true;
1152           pfClusteringVars[i].pfrh_topoId() = i;
1153         } else {
1154           pfClusteringVars[i].pfrh_passTopoThresh() = false;
1155           pfClusteringVars[i].pfrh_topoId() = -1;
1156         }
1157       }
1158     }
1159   };
1160 
1161   // Preparation of topo inputs. Initializing topoId, egdeIdx, nEdges, edgeList
1162   class PrepareTopoInputs {
1163   public:
1164     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
1165                                   const reco::PFRecHitDeviceCollection::ConstView pfRecHits,
1166                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1167                                   reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars,
1168                                   uint32_t* __restrict__ nSeeds) const {
1169       const int nRH = pfRecHits.size();
1170 
1171       if (once_per_grid(acc)) {
1172         pfClusteringVars.nEdges() = nRH * 8;
1173         pfClusteringEdgeVars[nRH].pfrh_edgeIdx() = nRH * 8;
1174       }
1175       for (uint32_t i : cms::alpakatools::uniform_elements(acc, nRH)) {
1176         pfClusteringEdgeVars[i].pfrh_edgeIdx() = i * 8;
1177         pfClusteringVars[i].pfrh_topoId() = 0;
1178         for (int j = 0; j < 8; j++) {  // checking if neighbours exist and assigning neighbours as edges
1179           if (pfRecHits[i].neighbours()(j) == -1)
1180             pfClusteringEdgeVars[i * 8 + j].pfrh_edgeList() = i;
1181           else
1182             pfClusteringEdgeVars[i * 8 + j].pfrh_edgeList() = pfRecHits[i].neighbours()(j);
1183         }
1184       }
1185 
1186       return;
1187     }
1188   };
1189 
1190   // Contraction in a single block
1191   class TopoClusterContraction {
1192   public:
1193     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
1194                                   const reco::PFRecHitDeviceCollection::ConstView pfRecHits,
1195                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1196                                   reco::PFClusterDeviceCollection::View clusterView,
1197                                   uint32_t* __restrict__ nSeeds,
1198                                   uint32_t* __restrict__ nRHF) const {
1199       const int nRH = pfRecHits.size();
1200       int& totalSeedOffset = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1201       int& totalSeedFracOffset = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1202 
1203       // rhCount, topoRHCount, topoSeedCount initialized earlier
1204       if (once_per_block(acc)) {
1205         pfClusteringVars.nTopos() = 0;
1206         pfClusteringVars.nRHFracs() = 0;
1207         totalSeedOffset = 0;
1208         totalSeedFracOffset = 0;
1209         pfClusteringVars.pcrhFracSize() = 0;
1210       }
1211 
1212       alpaka::syncBlockThreads(acc);  // all threads call sync
1213 
1214       // Now determine the number of seeds and rechits in each topo cluster [topoRHCount, topoSeedCount]
1215       // Also get the list of topoIds (smallest rhIdx of each topo cluser)
1216       for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1217            rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1218         pfClusteringVars[rhIdx].rhIdxToSeedIdx() = -1;
1219         int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1220         if (topoId > -1) {
1221           // Valid topo cluster
1222           alpaka::atomicAdd(acc, &pfClusteringVars[topoId].topoRHCount(), 1);
1223           // Valid topoId not counted yet
1224           if (topoId == rhIdx) {  // For every topo cluster, there is one rechit that meets this condition.
1225             int topoIdx = alpaka::atomicAdd(acc, &pfClusteringVars.nTopos(), 1);
1226             pfClusteringVars[topoIdx].topoIds() =
1227                 topoId;  // topoId: the smallest index of rechits that belong to a topo cluster.
1228           }
1229           // This is a cluster seed
1230           if (pfClusteringVars[rhIdx].pfrh_isSeed()) {  // # of seeds in this topo cluster
1231             alpaka::atomicAdd(acc, &pfClusteringVars[topoId].topoSeedCount(), 1);
1232           }
1233         }
1234       }
1235 
1236       alpaka::syncBlockThreads(acc);  // all threads call sync
1237 
1238       // Determine offsets for topo ID seed array [topoSeedOffsets]
1239       for (int topoId = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; topoId < nRH;
1240            topoId += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1241         if (pfClusteringVars[topoId].topoSeedCount() > 0) {
1242           // This is a valid topo ID
1243           int offset = alpaka::atomicAdd(acc, &totalSeedOffset, pfClusteringVars[topoId].topoSeedCount());
1244           pfClusteringVars[topoId].topoSeedOffsets() = offset;
1245         }
1246       }
1247       alpaka::syncBlockThreads(acc);  // all threads call sync
1248 
1249       // Fill arrays of rechit indicies for each seed [topoSeedList] and rhIdx->seedIdx conversion for each seed [rhIdxToSeedIdx]
1250       // Also fill seedRHIdx, topoId, depth
1251       for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1252            rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1253         int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1254         if (pfClusteringVars[rhIdx].pfrh_isSeed()) {
1255           // Valid topo cluster and this rhIdx corresponds to a seed
1256           int k = alpaka::atomicAdd(acc, &pfClusteringVars[topoId].rhCount(), 1);
1257           int seedIdx = pfClusteringVars[topoId].topoSeedOffsets() + k;
1258           if ((unsigned int)seedIdx >= *nSeeds)
1259             printf("Warning(contraction) %8d > %8d should not happen, check topoId: %d has %d rh\n",
1260                    seedIdx,
1261                    *nSeeds,
1262                    topoId,
1263                    k);
1264           pfClusteringVars[seedIdx].topoSeedList() = rhIdx;
1265           pfClusteringVars[rhIdx].rhIdxToSeedIdx() = seedIdx;
1266           clusterView[seedIdx].topoId() = topoId;
1267           clusterView[seedIdx].seedRHIdx() = rhIdx;
1268           clusterView[seedIdx].depth() = pfRecHits[rhIdx].depth();
1269         }
1270       }
1271 
1272       alpaka::syncBlockThreads(acc);  // all threads call sync
1273 
1274       // Determine seed offsets for rechit fraction array
1275       for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1276            rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1277         pfClusteringVars[rhIdx].rhCount() = 1;  // Reset this counter array
1278 
1279         int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1280         if (pfClusteringVars[rhIdx].pfrh_isSeed() && topoId > -1) {
1281           // Allot the total number of rechits for this topo cluster for rh fractions
1282           int offset = alpaka::atomicAdd(acc, &totalSeedFracOffset, pfClusteringVars[topoId].topoRHCount());
1283 
1284           // Add offset for this PF cluster seed
1285           pfClusteringVars[rhIdx].seedFracOffsets() = offset;
1286 
1287           // Store recHitFraction offset & size information for each seed
1288           clusterView[pfClusteringVars[rhIdx].rhIdxToSeedIdx()].rhfracOffset() =
1289               pfClusteringVars[rhIdx].seedFracOffsets();
1290           clusterView[pfClusteringVars[rhIdx].rhIdxToSeedIdx()].rhfracSize() =
1291               pfClusteringVars[topoId].topoRHCount() - pfClusteringVars[topoId].topoSeedCount() + 1;
1292         }
1293       }
1294 
1295       alpaka::syncBlockThreads(acc);  // all threads call sync
1296 
1297       if (once_per_block(acc)) {
1298         pfClusteringVars.pcrhFracSize() = totalSeedFracOffset;
1299         pfClusteringVars.nRHFracs() = totalSeedFracOffset;
1300         clusterView.nRHFracs() = totalSeedFracOffset;
1301         *nRHF = totalSeedFracOffset;
1302         clusterView.nSeeds() = *nSeeds;
1303         clusterView.nTopos() = pfClusteringVars.nTopos();
1304       }
1305     }
1306   };
1307 
1308   // Prefill the rechit index for all PFCluster fractions
1309   // Optimized for GPU parallel, but works on any backend
1310   class FillRhfIndex {
1311   public:
1312     ALPAKA_FN_ACC void operator()(Acc2D const& acc,
1313                                   const reco::PFRecHitDeviceCollection::ConstView pfRecHits,
1314                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1315                                   reco::PFRecHitFractionDeviceCollection::View fracView) const {
1316       const int nRH = pfRecHits.size();
1317 
1318       for (auto index : uniform_elements_nd(acc, {nRH, nRH})) {
1319         const int i = index[0u];  // i is a seed index
1320         const int j = index[1u];  // j is NOT a seed
1321         int topoId = pfClusteringVars[i].pfrh_topoId();
1322         if (topoId > -1 && pfClusteringVars[i].pfrh_isSeed() && topoId == pfClusteringVars[j].pfrh_topoId()) {
1323           if (!pfClusteringVars[j].pfrh_isSeed()) {  // NOT a seed
1324             int k = alpaka::atomicAdd(
1325                 acc, &pfClusteringVars[i].rhCount(), 1);  // Increment the number of rechit fractions for this seed
1326             auto fraction = fracView[pfClusteringVars[i].seedFracOffsets() + k];
1327             fraction.pfrhIdx() = j;
1328             fraction.pfcIdx() = pfClusteringVars[i].rhIdxToSeedIdx();
1329           } else if (i == j) {  // i==j is a seed rechit index
1330             auto seed = fracView[pfClusteringVars[i].seedFracOffsets()];
1331             seed.pfrhIdx() = j;
1332             seed.frac() = 1;
1333             seed.pfcIdx() = pfClusteringVars[i].rhIdxToSeedIdx();
1334           }
1335         }
1336       }
1337     }
1338   };
1339 
1340   class FastCluster {
1341   public:
1342     template <bool debug = false, typename = std::enable_if<!std::is_same_v<Device, alpaka::DevCpu>>>
1343     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
1344                                   const reco::PFRecHitDeviceCollection::ConstView pfRecHits,
1345                                   const ::reco::PFClusterParamsSoA::ConstView pfClusParams,
1346                                   const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1347                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1348                                   reco::PFClusterDeviceCollection::View clusterView,
1349                                   reco::PFRecHitFractionDeviceCollection::View fracView) const {
1350       const int nRH = pfRecHits.size();
1351       int& topoId = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1352       int& nRHTopo = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1353       int& nSeeds = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1354 
1355       if (once_per_block(acc)) {
1356         topoId = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
1357         nRHTopo = pfClusteringVars[topoId].topoRHCount();
1358         nSeeds = pfClusteringVars[topoId].topoSeedCount();
1359       }
1360 
1361       alpaka::syncBlockThreads(acc);  // all threads call sync
1362 
1363       if (topoId < nRH && nRHTopo > 0 && nSeeds > 0) {
1364         if (nRHTopo == nSeeds) {
1365           // PF cluster is isolated seed. No iterations needed
1366           if (once_per_block(acc)) {
1367             // Fill PFCluster-level information
1368             int rhIdx = pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
1369             int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
1370             clusterView[seedIdx].energy() = pfRecHits[rhIdx].energy();
1371             clusterView[seedIdx].x() = pfRecHits[rhIdx].x();
1372             clusterView[seedIdx].y() = pfRecHits[rhIdx].y();
1373             clusterView[seedIdx].z() = pfRecHits[rhIdx].z();
1374           }
1375           // singleSeed and multiSeedParallel functions work only for GPU backend
1376         } else if ((not std::is_same_v<Device, alpaka::DevCpu>) && nSeeds == 1) {
1377           // Single seed cluster
1378           hcalFastCluster_singleSeed(
1379               acc, pfClusParams, topology, topoId, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1380         } else if ((not std::is_same_v<Device, alpaka::DevCpu>) && nSeeds <= 100 &&
1381                    nRHTopo - nSeeds < threadsPerBlockForClustering) {
1382           hcalFastCluster_multiSeedParallel(
1383               acc, pfClusParams, topology, topoId, nSeeds, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1384         } else if (nSeeds <= 400 && nRHTopo - nSeeds <= 1500) {
1385           // nSeeds value must match exotic in FastClusterExotic
1386           hcalFastCluster_multiSeedIterative(
1387               acc, pfClusParams, topology, topoId, nSeeds, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1388         } else {
1389           if constexpr (debug) {
1390             if (once_per_block(acc))
1391               printf("Topo cluster %d has %d seeds and %d rechits. Will be processed in next kernel.\n",
1392                      topoId,
1393                      nSeeds,
1394                      nRHTopo);
1395           }
1396         }
1397       }
1398     }
1399   };
1400 
1401   // Process very large, exotic topo clusters
1402   class FastClusterExotic {
1403   public:
1404     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
1405                                   const reco::PFRecHitDeviceCollection::ConstView pfRecHits,
1406                                   const ::reco::PFClusterParamsSoA::ConstView pfClusParams,
1407                                   const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1408                                   reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
1409                                   reco::PFClusterDeviceCollection::View clusterView,
1410                                   reco::PFRecHitFractionDeviceCollection::View fracView,
1411                                   Position4* __restrict__ globalClusterPos,
1412                                   Position4* __restrict__ globalPrevClusterPos,
1413                                   float* __restrict__ globalClusterEnergy,
1414                                   float* __restrict__ globalRhFracSum,
1415                                   int* __restrict__ globalSeeds,
1416                                   int* __restrict__ globalRechits) const {
1417       const int nRH = pfRecHits.size();
1418       for (int topoId = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]; topoId < nRH;
1419            topoId += blocksForExoticClusters) {
1420         int nRHTopo = pfClusteringVars[topoId].topoRHCount();
1421         int nSeeds = pfClusteringVars[topoId].topoSeedCount();
1422 
1423         // nSeeds value must match multiSeedIterative in FastCluster
1424         if (nRHTopo > 0 && (nSeeds > 400 || nRHTopo - nSeeds > 1500)) {
1425           hcalFastCluster_exotic(acc,
1426                                  pfClusParams,
1427                                  topology,
1428                                  topoId,
1429                                  nSeeds,
1430                                  nRHTopo,
1431                                  pfRecHits,
1432                                  pfClusteringVars,
1433                                  clusterView,
1434                                  fracView,
1435                                  globalClusterPos,
1436                                  globalPrevClusterPos,
1437                                  globalClusterEnergy,
1438                                  globalRhFracSum,
1439                                  globalSeeds,
1440                                  globalRechits);
1441         }
1442         alpaka::syncBlockThreads(acc);  // all threads call sync
1443       }
1444     }
1445   };
1446 
1447   PFClusterProducerKernel::PFClusterProducerKernel(Queue& queue)
1448       : nSeeds(cms::alpakatools::make_device_buffer<uint32_t>(queue)),
1449         globalClusterPos(
1450             cms::alpakatools::make_device_buffer<Position4[]>(queue, blocksForExoticClusters * maxTopoInput)),
1451         globalPrevClusterPos(
1452             cms::alpakatools::make_device_buffer<Position4[]>(queue, blocksForExoticClusters * maxTopoInput)),
1453         globalClusterEnergy(
1454             cms::alpakatools::make_device_buffer<float[]>(queue, blocksForExoticClusters * maxTopoInput)),
1455         globalRhFracSum(cms::alpakatools::make_device_buffer<float[]>(queue, blocksForExoticClusters * maxTopoInput)),
1456         globalSeeds(cms::alpakatools::make_device_buffer<int[]>(queue, blocksForExoticClusters * maxTopoInput)),
1457         globalRechits(cms::alpakatools::make_device_buffer<int[]>(queue, blocksForExoticClusters * maxTopoInput)) {
1458     alpaka::memset(queue, nSeeds, 0x00);  // Reset nSeeds
1459   }
1460 
1461   void PFClusterProducerKernel::seedTopoAndContract(Queue& queue,
1462                                                     const ::reco::PFClusterParamsSoA::ConstView params,
1463                                                     const reco::PFRecHitHCALTopologyDeviceCollection& topology,
1464                                                     reco::PFClusteringVarsDeviceCollection& pfClusteringVars,
1465                                                     reco::PFClusteringEdgeVarsDeviceCollection& pfClusteringEdgeVars,
1466                                                     const reco::PFRecHitDeviceCollection& pfRecHits,
1467                                                     int nRH,
1468                                                     reco::PFClusterDeviceCollection& pfClusters,
1469                                                     uint32_t* __restrict__ nRHF) {
1470     const int threadsPerBlock = 256;
1471     const int blocks = divide_up_by(nRH, threadsPerBlock);
1472 
1473     // seedingTopoThresh
1474     alpaka::exec<Acc1D>(queue,
1475                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1476                         SeedingTopoThresh{},
1477                         pfClusteringVars.view(),
1478                         params,
1479                         topology.view(),
1480                         pfRecHits.view(),
1481                         pfClusters.view(),
1482                         nSeeds.data());
1483     // prepareTopoInputs
1484     alpaka::exec<Acc1D>(queue,
1485                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1486                         PrepareTopoInputs{},
1487                         pfRecHits.view(),
1488                         pfClusteringVars.view(),
1489                         pfClusteringEdgeVars.view(),
1490                         nSeeds.data());
1491     // ECLCC
1492     alpaka::exec<Acc1D>(queue,
1493                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1494                         ECLCCInit{},
1495                         pfRecHits.view(),
1496                         pfClusteringVars.view(),
1497                         pfClusteringEdgeVars.view());
1498     alpaka::exec<Acc1D>(queue,
1499                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1500                         ECLCCCompute1{},
1501                         pfRecHits.view(),
1502                         pfClusteringVars.view(),
1503                         pfClusteringEdgeVars.view());
1504     alpaka::exec<Acc1D>(queue,
1505                         make_workdiv<Acc1D>(blocks, threadsPerBlock),
1506                         ECLCCFlatten{},
1507                         pfRecHits.view(),
1508                         pfClusteringVars.view(),
1509                         pfClusteringEdgeVars.view());
1510     // topoClusterContraction
1511     alpaka::exec<Acc1D>(queue,
1512                         make_workdiv<Acc1D>(1, threadsPerBlockForClustering),
1513                         TopoClusterContraction{},
1514                         pfRecHits.view(),
1515                         pfClusteringVars.view(),
1516                         pfClusters.view(),
1517                         nSeeds.data(),
1518                         nRHF);
1519   }
1520 
1521   void PFClusterProducerKernel::cluster(Queue& queue,
1522                                         const ::reco::PFClusterParamsSoA::ConstView params,
1523                                         const reco::PFRecHitHCALTopologyDeviceCollection& topology,
1524                                         reco::PFClusteringVarsDeviceCollection& pfClusteringVars,
1525                                         reco::PFClusteringEdgeVarsDeviceCollection& pfClusteringEdgeVars,
1526                                         const reco::PFRecHitDeviceCollection& pfRecHits,
1527                                         int nRH,
1528                                         reco::PFClusterDeviceCollection& pfClusters,
1529                                         reco::PFRecHitFractionDeviceCollection& pfrhFractions) {
1530     // fillRhfIndex
1531     alpaka::exec<Acc2D>(queue,
1532                         make_workdiv<Acc2D>({divide_up_by(nRH, 32), divide_up_by(nRH, 32)}, {32, 32}),
1533                         FillRhfIndex{},
1534                         pfRecHits.view(),
1535                         pfClusteringVars.view(),
1536                         pfrhFractions.view());
1537 
1538     // Run fastCluster
1539     alpaka::exec<Acc1D>(queue,
1540                         make_workdiv<Acc1D>(nRH, threadsPerBlockForClustering),
1541                         FastCluster{},
1542                         pfRecHits.view(),
1543                         params,
1544                         topology.view(),
1545                         pfClusteringVars.view(),
1546                         pfClusters.view(),
1547                         pfrhFractions.view());
1548     // exotic clustering kernel
1549     alpaka::exec<Acc1D>(queue,
1550                         make_workdiv<Acc1D>(blocksForExoticClusters,
1551                                             threadsPerBlockForClustering),  // uses 4 blocks to minimize memory usage
1552                         FastClusterExotic{},
1553                         pfRecHits.view(),
1554                         params,
1555                         topology.view(),
1556                         pfClusteringVars.view(),
1557                         pfClusters.view(),
1558                         pfrhFractions.view(),
1559                         globalClusterPos.data(),
1560                         globalPrevClusterPos.data(),
1561                         globalClusterEnergy.data(),
1562                         globalRhFracSum.data(),
1563                         globalSeeds.data(),
1564                         globalRechits.data());
1565   }
1566 
1567 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE