Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:28:22

0001 #ifndef RecoTracker_MkFitCore_standalone_attic_BestCands_h
0002 #define RecoTracker_MkFitCore_standalone_attic_BestCands_h
0003 
0004 #include "Config.h"
0005 
0006 #include <cstdio>
0007 #include <limits>
0008 
0009 namespace CandsGPU {
0010 
0011   constexpr int trkIdx_sentinel = -1;
0012   constexpr int hitIdx_sentinel = -1;
0013   constexpr int nhits_sentinel = -1;
0014   constexpr float chi2_sentinel = std::numeric_limits<float>::max();
0015 
0016   template <typename T>
0017   __device__ void swap_values(T& a, T& b) {
0018     T c(a);
0019     a = b;
0020     b = c;
0021   }
0022 
0023   template <int MaxCandsPerSeed, int BlockSize>
0024   struct BestCands {
0025     // AoS would generate bank conflicts when used in SM
0026     int trkIdx[MaxCandsPerSeed][BlockSize];
0027     int hitIdx[MaxCandsPerSeed][BlockSize];
0028     int nhits[MaxCandsPerSeed][BlockSize];
0029     float chi2[MaxCandsPerSeed][BlockSize];
0030 
0031     __device__ void reset(int itrack);
0032     __device__ void update(int itrack, int cand_trIdx, int cand_hitIdx, int cand_nhits, float cand_chi2);
0033     __device__ void heapify(int itrack, int idx, int heap_size);
0034     __device__ int left(int idx);
0035     __device__ int right(int idx);
0036 
0037     __device__ bool better(int icand_fst, int fst, int icand_snd, int snd);
0038 
0039     __device__ void heap_sort(int icand, int heap_size);
0040     __device__ void merge_cands_for_seed(int iseed, int icand);
0041     __device__ void swap_nodes(int icand_fst, int fst, int icand_snd, int snd);
0042     __device__ void copy_node(int icand_fst, int fst, int icand_snd, int snd);
0043 
0044     __device__ int count_valid_cands(int itrack);
0045 
0046     // TODO: Should really return a IdxChi2List
0047     __device__ void get_cand_info(
0048         const int tid, const int cid, int& my_trkIdx, int& my_hitIdx, int& my_nhits, float& my_chi2);
0049     __device__ int get_nhits(const int tid, const int cid) { return nhits[cid][tid]; }
0050 
0051     __device__ void print_heap(const int tid);
0052   };
0053 
0054   template <int M, int B>
0055   __device__ void BestCands<M, B>::reset(int itrack) {
0056     for (auto j = 0; j < M; ++j) {
0057       trkIdx[j][itrack] = trkIdx_sentinel;
0058       hitIdx[j][itrack] = hitIdx_sentinel;
0059       nhits[j][itrack] = nhits_sentinel;
0060       chi2[j][itrack] = chi2_sentinel;
0061     }
0062   }
0063 
0064   template <int M, int B>
0065   __device__ void BestCands<M, B>::update(int itrack, int cand_trIdx, int cand_hitIdx, int cand_nhits, float cand_chi2) {
0066     if (cand_nhits < nhits[0][itrack])
0067       return;
0068     if (cand_chi2 > chi2[0][itrack])
0069       return;
0070     trkIdx[0][itrack] = cand_trIdx;
0071     hitIdx[0][itrack] = cand_hitIdx;
0072     nhits[0][itrack] = cand_nhits;
0073     chi2[0][itrack] = cand_chi2;
0074 
0075     heapify(itrack, 0, M);
0076   }
0077 
0078   template <int M, int B>
0079   __device__ void BestCands<M, B>::print_heap(const int tid) {
0080     for (int cid = 0; cid < M; cid++) {
0081       printf(">>>>> tid %d rowIdx %d hitIdx %d nhits %d chi2 %f\n",
0082              tid,
0083              cid,
0084              hitIdx[cid][tid],
0085              nhits[cid][tid],
0086              chi2[cid][tid]);
0087     }
0088   }
0089 
0090   template <int M, int B>
0091   __device__ int BestCands<M, B>::left(int idx) {
0092     return (++idx << 1) - 1;
0093   }
0094 
0095   template <int M, int B>
0096   __device__ int BestCands<M, B>::right(int idx) {
0097     return ++idx << 1;
0098   }
0099 
0100   template <int M, int B>
0101   __device__ bool BestCands<M, B>::better(int icand_fst, int fst, int icand_snd, int snd) {
0102     return (nhits[fst][icand_fst] > nhits[snd][icand_snd]) ||
0103            ((nhits[fst][icand_fst] == nhits[snd][icand_snd]) && (chi2[fst][icand_fst] < chi2[snd][icand_snd]));
0104   }
0105 
0106   template <int M, int B>
0107   __device__ void BestCands<M, B>::swap_nodes(int icand_fst, int fst, int icand_snd, int snd) {
0108     swap_values(trkIdx[fst][icand_fst], trkIdx[snd][icand_snd]);
0109     swap_values(hitIdx[fst][icand_fst], hitIdx[snd][icand_snd]);
0110     swap_values(nhits[fst][icand_fst], nhits[snd][icand_snd]);
0111     swap_values(chi2[fst][icand_fst], chi2[snd][icand_snd]);
0112   }
0113 
0114   template <int M, int B>
0115   __device__ void BestCands<M, B>::copy_node(int icand_fst, int fst, int icand_snd, int snd) {
0116     trkIdx[snd][icand_snd] = trkIdx[fst][icand_fst];
0117     hitIdx[snd][icand_snd] = hitIdx[fst][icand_fst];
0118     nhits[snd][icand_snd] = nhits[fst][icand_fst];
0119     chi2[snd][icand_snd] = chi2[fst][icand_fst];
0120   }
0121 
0122   template <int M, int B>
0123   __device__ void BestCands<M, B>::heapify(int icand, int idx, int heap_size) {
0124     // We want to move idx down so the smallest value is at the root
0125     int smallest = -1;
0126     while (idx != smallest) {
0127       if (idx < 0 || idx >= heap_size / 2)
0128         return;
0129 
0130       smallest = idx;
0131       if (heap_size > left(idx) && better(icand, smallest, icand, left(idx)))
0132         smallest = left(idx);
0133       if (heap_size > right(idx) && better(icand, smallest, icand, right(idx)))
0134         smallest = right(idx);
0135 
0136       if (smallest != idx) {
0137         swap_nodes(icand, smallest, icand, idx);
0138         idx = smallest;
0139         smallest = -1;
0140       }
0141     }
0142   }
0143 
0144   template <int M, int B>
0145   __device__ void BestCands<M, B>::merge_cands_for_seed(int iseed, int icand) {
0146     int itrack = iseed * M + icand;
0147 // TODO: Need a better way to reduce candidates.
0148 //       So far, binary tree reduction is a bit slower than the naive approach
0149 #if 1
0150     if (icand) {
0151       heap_sort(itrack, M);
0152     }
0153     __syncthreads();  // cand 0 waits;
0154     if (icand)
0155       return;  // reduction by the first cand of each seed
0156 
0157     for (int i = itrack + 1; i < itrack + M; ++i) {  // over cands
0158       for (int j = 0; j < M; ++j) {                  // inside heap
0159         if (better(i, j, itrack, 0)) {
0160           copy_node(i, j, itrack, 0);
0161           heapify(itrack, 0, M);
0162         } else {
0163           break;
0164         }
0165       }
0166     }
0167     heap_sort(itrack, M);
0168     __syncthreads();  // TODO: Volta: sync only on MaxCandsPerSeeds threads
0169 #else
0170 
0171     for (int step = 2; step <= Config::maxCandsPerSeed; step <<= 1) {
0172       if (icand % step == step / 2) {
0173         heap_sort(itrack, M);
0174       }
0175       __syncthreads();
0176 
0177       if (icand % step == 0) {
0178         int i = itrack + step / 2;
0179         if ((i < iseed * M + M) && (i < B) && (icand + step / 2 < M)) {
0180           for (int j = 0; j < M; ++j) {  // inside heap
0181             if (better(i, j, itrack, 0)) {
0182               copy_node(i, j, itrack, 0);
0183               heapify(itrack, 0, M);
0184             } else {
0185               break;
0186             }
0187           }
0188         }
0189       }
0190       //__syncthreads();
0191     }
0192 
0193     if (icand == 0) {
0194       heap_sort(itrack, M);
0195     }
0196     __syncthreads();
0197 #endif
0198   }
0199 
0200   template <int M, int B>
0201   __device__ void BestCands<M, B>::heap_sort(int icand, int heap_size) {
0202     int num_unsorted_elts = heap_size;
0203     // Assume that we have a heap with the worst one at the root.
0204     for (int i = heap_size - 1; i > 0; --i) {
0205       swap_nodes(icand, 0, icand, i);  // worst at the end
0206       heapify(icand, 0, --num_unsorted_elts);
0207     }
0208   }
0209 
0210   template <int MaxCandsPerSeed, int BlockSize>
0211   __device__ void BestCands<MaxCandsPerSeed, BlockSize>::get_cand_info(
0212       const int tid, const int cid, int& my_trkIdx, int& my_hitIdx, int& my_nhits, float& my_chi2) {
0213     if (cid < MaxCandsPerSeed && tid < BlockSize) {
0214       my_trkIdx = trkIdx[cid][tid];
0215       my_hitIdx = hitIdx[cid][tid];
0216       my_nhits = nhits[cid][tid];
0217       my_chi2 = chi2[cid][tid];
0218     }
0219   }
0220 
0221   template <int M, int B>
0222   __device__ int BestCands<M, B>::count_valid_cands(int itrack) {
0223     int count = 0;
0224     for (int i = 0; i < M; ++i) {
0225       if (trkIdx[i][itrack] != trkIdx_sentinel)
0226         ++count;
0227     }
0228     return count;
0229   }
0230 
0231 }  // namespace CandsGPU
0232 
0233 #endif  // _BEST_CANDS_H_