Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-08-21 04:46:44

0001 // Original Author:  Theo Cuisset
0002 //         Created:  Nov 2023
0003 /** Produce samples for electron superclustering DNN training in TICL
0004 
0005  Pairs of seed-candidate tracksters (in compatible geometric windows) are iterated over, in similar manner as in TracksterLinkingBySuperclustering.
0006  For each of these pairs, the DNN features are computed and saved to a TTree. 
0007  Also saved is the best (=lowest) association score of the seed trackster with CaloParticles. The association score of the candidate trackster 
0008  with the same CaloParticle is also saved.
0009 */
0010 #include <cmath>
0011 #include <vector>
0012 #include <memory>
0013 #include <algorithm>
0014 
0015 #include <TTree.h>
0016 
0017 #include "FWCore/Framework/interface/Frameworkfwd.h"
0018 #include "FWCore/Framework/interface/one/EDAnalyzer.h"
0019 #include "FWCore/Framework/interface/Event.h"
0020 #include "FWCore/Framework/interface/MakerMacros.h"
0021 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0022 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0023 #include "FWCore/ParameterSet/interface/allowedValues.h"
0024 #include "FWCore/Utilities/interface/InputTag.h"
0025 #include "FWCore/ServiceRegistry/interface/Service.h"
0026 
0027 #include "CommonTools/UtilAlgos/interface/TFileService.h"
0028 
0029 #include "DataFormats/Math/interface/deltaPhi.h"
0030 #include "DataFormats/HGCalReco/interface/Trackster.h"
0031 #include "SimDataFormats/CaloAnalysis/interface/CaloParticle.h"
0032 #include "SimDataFormats/Associations/interface/TracksterToSimTracksterAssociator.h"
0033 
0034 #include "RecoHGCal/TICL/plugins/TracksterLinkingbySuperClusteringDNN.h"
0035 #include "RecoHGCal/TICL/interface/SuperclusteringDNNInputs.h"
0036 
0037 using namespace ticl;
0038 
0039 class SuperclusteringSampleDumper : public edm::one::EDAnalyzer<edm::one::SharedResources> {
0040 public:
0041   explicit SuperclusteringSampleDumper(const edm::ParameterSet&);
0042 
0043   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0044 
0045 private:
0046   void beginJob() override;
0047   void analyze(const edm::Event&, const edm::EventSetup&) override;
0048   bool checkExplainedVarianceRatioCut(ticl::Trackster const& ts) const;
0049 
0050   const edm::EDGetTokenT<std::vector<Trackster>> tracksters_clue3d_token_;
0051   const edm::EDGetTokenT<ticl::RecoToSimCollectionSimTracksters> tsRecoToSimCP_token_;
0052   float deltaEtaWindow_;
0053   float deltaPhiWindow_;
0054   float seedPtThreshold_;
0055   float candidateEnergyThreshold_;
0056   float explVarRatioCut_energyBoundary_;  // Boundary energy between low and high energy explVarRatio cut threshold
0057   float explVarRatioMinimum_lowEnergy_;  // Cut on explained variance ratio of tracksters to be considered as candidate, for trackster raw_energy < explVarRatioCut_energyBoundary
0058   float explVarRatioMinimum_highEnergy_;  // Cut on explained variance ratio of tracksters to be considered as candidate, for trackster raw_energy > explVarRatioCut_energyBoundary
0059 
0060   TTree* output_tree_;
0061   edm::EventID eventId_;
0062   std::unique_ptr<AbstractSuperclusteringDNNInput> dnnInput_;
0063   std::vector<std::vector<float>>
0064       features_;  // Outer index : feature number (split into branches), inner index : inference pair index
0065   std::vector<unsigned int> seedTracksterIdx_;       // ID of seed trackster used for inference pair
0066   std::vector<unsigned int> candidateTracksterIdx_;  // ID of candidate trackster used for inference pair
0067 
0068   std::vector<float>
0069       seedTracksterBestAssociationScore_;  // Best association score of seed trackster (seedTracksterIdx) with CaloParticle
0070   std::vector<long>
0071       seedTracksterBestAssociation_simTsIdx_;  // Index of SimTrackster that has the best association score to the seedTrackster
0072   std::vector<float> seedTracksterBestAssociation_caloParticleEnergy_;  // Energy of best associated CaloParticle to seed
0073 
0074   std::vector<float>
0075       candidateTracksterBestAssociationScore_;  // Best association score of candidate trackster (seedTracksterIdx) with CaloParticle
0076   std::vector<long>
0077       candidateTracksterBestAssociation_simTsIdx_;  // Index of SimTrackster that has the best association score to the candidate
0078 
0079   std::vector<float>
0080       candidateTracksterAssociationWithSeed_score_;  // Association score of candidate trackster with the CaloParticle of seedTracksterBestAssociation_simTsIdx_
0081 };
0082 
0083 SuperclusteringSampleDumper::SuperclusteringSampleDumper(const edm::ParameterSet& ps)
0084     : tracksters_clue3d_token_(consumes<std::vector<Trackster>>(ps.getParameter<edm::InputTag>("tracksters"))),
0085       tsRecoToSimCP_token_(
0086           consumes<ticl::RecoToSimCollectionSimTracksters>(ps.getParameter<edm::InputTag>("recoToSimAssociatorCP"))),
0087       deltaEtaWindow_(ps.getParameter<double>("deltaEtaWindow")),
0088       deltaPhiWindow_(ps.getParameter<double>("deltaPhiWindow")),
0089       seedPtThreshold_(ps.getParameter<double>("seedPtThreshold")),
0090       candidateEnergyThreshold_(ps.getParameter<double>("candidateEnergyThreshold")),
0091       explVarRatioCut_energyBoundary_(ps.getParameter<double>("candidateEnergyThreshold")),
0092       explVarRatioMinimum_lowEnergy_(ps.getParameter<double>("explVarRatioMinimum_lowEnergy")),
0093       explVarRatioMinimum_highEnergy_(ps.getParameter<double>("explVarRatioMinimum_highEnergy")),
0094       eventId_(),
0095       dnnInput_(makeSuperclusteringDNNInputFromString(ps.getParameter<std::string>("dnnInputsVersion"))),
0096       features_(dnnInput_->featureCount()) {
0097   usesResource("TFileService");
0098 }
0099 
0100 void SuperclusteringSampleDumper::beginJob() {
0101   edm::Service<TFileService> fs;
0102   output_tree_ = fs->make<TTree>("superclusteringTraining", "Superclustering training samples");
0103   output_tree_->Branch("Event", &eventId_);
0104   output_tree_->Branch("seedTracksterIdx", &seedTracksterIdx_);
0105   output_tree_->Branch("candidateTracksterIdx", &candidateTracksterIdx_);
0106   output_tree_->Branch("seedTracksterBestAssociationScore", &seedTracksterBestAssociationScore_);
0107   output_tree_->Branch("seedTracksterBestAssociation_simTsIdx", &seedTracksterBestAssociation_simTsIdx_);
0108   output_tree_->Branch("seedTracksterBestAssociation_caloParticleEnergy",
0109                        &seedTracksterBestAssociation_caloParticleEnergy_);
0110   output_tree_->Branch("candidateTracksterBestAssociationScore", &candidateTracksterBestAssociationScore_);
0111   output_tree_->Branch("candidateTracksterBestAssociation_simTsIdx", &candidateTracksterBestAssociation_simTsIdx_);
0112   output_tree_->Branch("candidateTracksterAssociationWithSeed_score", &candidateTracksterAssociationWithSeed_score_);
0113   std::vector<std::string> featureNames = dnnInput_->featureNames();
0114   assert(featureNames.size() == dnnInput_->featureCount());
0115   for (unsigned int i = 0; i < dnnInput_->featureCount(); i++) {
0116     output_tree_->Branch(("feature_" + featureNames[i]).c_str(), &features_[i]);
0117   }
0118 }
0119 
0120 /** 
0121  * Check if trackster passes cut on explained variance ratio. The DNN is trained only on pairs where both seed and candidate pass this cut
0122  * Explained variance ratio is (largest PCA eigenvalue) / (sum of PCA eigenvalues)
0123 */
0124 bool SuperclusteringSampleDumper::checkExplainedVarianceRatioCut(ticl::Trackster const& ts) const {
0125   float explVar_denominator =
0126       std::accumulate(std::begin(ts.eigenvalues()), std::end(ts.eigenvalues()), 0.f, std::plus<float>());
0127   if (explVar_denominator != 0.) {
0128     float explVarRatio = ts.eigenvalues()[0] / explVar_denominator;
0129     if (ts.raw_energy() > explVarRatioCut_energyBoundary_)
0130       return explVarRatio >= explVarRatioMinimum_highEnergy_;
0131     else
0132       return explVarRatio >= explVarRatioMinimum_lowEnergy_;
0133   } else
0134     return false;
0135 }
0136 
0137 void SuperclusteringSampleDumper::analyze(const edm::Event& evt, const edm::EventSetup& iSetup) {
0138   eventId_ = evt.id();
0139 
0140   edm::Handle<std::vector<Trackster>> inputTracksters;
0141   evt.getByToken(tracksters_clue3d_token_, inputTracksters);
0142 
0143   edm::Handle<ticl::RecoToSimCollectionSimTracksters> assoc_CP_recoToSim;
0144   evt.getByToken(tsRecoToSimCP_token_, assoc_CP_recoToSim);
0145 
0146   /* Sorting tracksters by decreasing order of pT (out-of-place sort). 
0147   inputTracksters[trackstersIndicesPt[0]], ..., inputTracksters[trackstersIndicesPt[N]] makes a list of tracksters sorted by decreasing pT
0148   Indices into this pT sorted collection will have the suffix _pt. Thus inputTracksters[index] and inputTracksters[trackstersIndicesPt[index_pt]] are correct
0149   */
0150   std::vector<unsigned int> trackstersIndicesPt(inputTracksters->size());
0151   std::iota(trackstersIndicesPt.begin(), trackstersIndicesPt.end(), 0);
0152   std::stable_sort(
0153       trackstersIndicesPt.begin(), trackstersIndicesPt.end(), [&inputTracksters](unsigned int i1, unsigned int i2) {
0154         return (*inputTracksters)[i1].raw_pt() > (*inputTracksters)[i2].raw_pt();
0155       });
0156 
0157   // Order of loops are reversed compared to SuperclusteringProducer (here outer is seed, inner is candidate), for performance reasons.
0158   // The same pairs seed-candidate should be present, just in a different order
0159   // First loop on seed tracksters
0160   for (unsigned int ts_seed_idx_pt = 0; ts_seed_idx_pt < inputTracksters->size(); ts_seed_idx_pt++) {
0161     const unsigned int ts_seed_idx_input =
0162         trackstersIndicesPt[ts_seed_idx_pt];  // Index of seed trackster in input collection (not in pT sorted collection)
0163     Trackster const& ts_seed = (*inputTracksters)[ts_seed_idx_input];
0164 
0165     if (ts_seed.raw_pt() < seedPtThreshold_)
0166       break;  // All further seeds will have lower pT than threshold (due to pT sorting)
0167 
0168     if (!checkExplainedVarianceRatioCut(ts_seed))
0169       continue;
0170 
0171     // Find best associated CaloParticle to the seed
0172     auto seed_assocs = assoc_CP_recoToSim->find({inputTracksters, ts_seed_idx_input});
0173     if (seed_assocs == assoc_CP_recoToSim->end())
0174       continue;  // No CaloParticle associations for the current trackster (extremly unlikely)
0175     ticl::RecoToSimCollectionSimTracksters::data_type const& seed_assocWithBestScore =
0176         *std::min_element(seed_assocs->val.begin(),
0177                           seed_assocs->val.end(),
0178                           [](ticl::RecoToSimCollectionSimTracksters::data_type const& assoc_1,
0179                              ticl::RecoToSimCollectionSimTracksters::data_type const& assoc_2) {
0180                             // assoc_* is of type : std::pair<edmRefIntoSimTracksterCollection, std::pair<sharedEnergy, associationScore>>
0181                             // Best score is smallest score
0182                             return assoc_1.second.second < assoc_2.second.second;
0183                           });
0184 
0185     // Second loop on superclustering candidates tracksters
0186     // Look only at candidate tracksters with lower pT than the seed (so all pairs are only looked at once)
0187     for (unsigned int ts_cand_idx_pt = ts_seed_idx_pt + 1; ts_cand_idx_pt < inputTracksters->size(); ts_cand_idx_pt++) {
0188       Trackster const& ts_cand = (*inputTracksters)[trackstersIndicesPt[ts_cand_idx_pt]];
0189       // Check that the two tracksters are geometrically compatible for superclustering (using deltaEta, deltaPhi window)
0190       // There is no need to run training or inference for tracksters very far apart
0191       if (!(std::abs(ts_seed.barycenter().Eta() - ts_cand.barycenter().Eta()) < deltaEtaWindow_ &&
0192             std::abs(deltaPhi(ts_seed.barycenter().Phi(), ts_cand.barycenter().Phi())) < deltaPhiWindow_ &&
0193             ts_cand.raw_energy() >= candidateEnergyThreshold_ && checkExplainedVarianceRatioCut(ts_cand)))
0194         continue;
0195 
0196       std::vector<float> features = dnnInput_->computeVector(ts_seed, ts_cand);
0197       assert(features.size() == features_.size());
0198       for (unsigned int feature_idx = 0; feature_idx < features_.size(); feature_idx++) {
0199         features_[feature_idx].push_back(features[feature_idx]);
0200       }
0201       seedTracksterIdx_.push_back(trackstersIndicesPt[ts_seed_idx_pt]);
0202       candidateTracksterIdx_.push_back(trackstersIndicesPt[ts_cand_idx_pt]);
0203 
0204       float candidateTracksterBestAssociationScore = 1.;  // Best association score of candidate with any CaloParticle
0205       long candidateTracksterBestAssociation_simTsIdx = -1;  // Corresponding CaloParticle simTrackster index
0206       float candidateTracksterAssociationWithSeed_score =
0207           1.;  // Association score of candidate with CaloParticle best associated with seed
0208 
0209       // First find associated CaloParticles with candidate
0210       auto cand_assocCP = assoc_CP_recoToSim->find(
0211           edm::Ref<ticl::TracksterCollection>(inputTracksters, trackstersIndicesPt[ts_cand_idx_pt]));
0212       if (cand_assocCP != assoc_CP_recoToSim->end()) {
0213         // find the association with best score
0214         ticl::RecoToSimCollectionSimTracksters::data_type const& cand_assocWithBestScore =
0215             *std::min_element(cand_assocCP->val.begin(),
0216                               cand_assocCP->val.end(),
0217                               [](ticl::RecoToSimCollectionSimTracksters::data_type const& assoc_1,
0218                                  ticl::RecoToSimCollectionSimTracksters::data_type const& assoc_2) {
0219                                 // assoc_* is of type : std::pair<edmRefIntoSimTracksterCollection, std::pair<sharedEnergy, associationScore>>
0220                                 return assoc_1.second.second < assoc_2.second.second;
0221                               });
0222         candidateTracksterBestAssociationScore = cand_assocWithBestScore.second.second;
0223         candidateTracksterBestAssociation_simTsIdx = cand_assocWithBestScore.first.key();
0224 
0225         // find the association score with the same CaloParticle as the seed
0226         auto cand_assocWithSeedCP =
0227             std::find_if(cand_assocCP->val.begin(),
0228                          cand_assocCP->val.end(),
0229                          [&seed_assocWithBestScore](ticl::RecoToSimCollectionSimTracksters::data_type const& assoc) {
0230                            // assoc is of type : std::pair<edmRefIntoSimTracksterCollection, std::pair<sharedEnergy, associationScore>>
0231                            return assoc.first == seed_assocWithBestScore.first;
0232                          });
0233         if (cand_assocWithSeedCP != cand_assocCP->val.end()) {
0234           candidateTracksterAssociationWithSeed_score = cand_assocWithSeedCP->second.second;
0235         }
0236       }
0237 
0238       seedTracksterBestAssociationScore_.push_back(seed_assocWithBestScore.second.second);
0239       seedTracksterBestAssociation_simTsIdx_.push_back(seed_assocWithBestScore.first.key());
0240       seedTracksterBestAssociation_caloParticleEnergy_.push_back(seed_assocWithBestScore.first->regressed_energy());
0241 
0242       candidateTracksterBestAssociationScore_.push_back(candidateTracksterBestAssociationScore);
0243       candidateTracksterBestAssociation_simTsIdx_.push_back(candidateTracksterBestAssociation_simTsIdx);
0244 
0245       candidateTracksterAssociationWithSeed_score_.push_back(candidateTracksterAssociationWithSeed_score);
0246     }
0247   }
0248 
0249   output_tree_->Fill();
0250   for (auto& feats : features_)
0251     feats.clear();
0252   seedTracksterIdx_.clear();
0253   candidateTracksterIdx_.clear();
0254   seedTracksterBestAssociationScore_.clear();
0255   seedTracksterBestAssociation_simTsIdx_.clear();
0256   seedTracksterBestAssociation_caloParticleEnergy_.clear();
0257   candidateTracksterBestAssociationScore_.clear();
0258   candidateTracksterBestAssociation_simTsIdx_.clear();
0259   candidateTracksterAssociationWithSeed_score_.clear();
0260 }
0261 
0262 void SuperclusteringSampleDumper::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0263   edm::ParameterSetDescription desc;
0264   desc.add<edm::InputTag>("tracksters", edm::InputTag("ticlTrackstersCLUE3DHigh"))
0265       ->setComment("Input trackster collection, same as what is used for superclustering inference.");
0266   desc.add<edm::InputTag>("recoToSimAssociatorCP",
0267                           edm::InputTag("tracksterSimTracksterAssociationLinkingbyCLUE3D", "recoToSim"));
0268   desc.ifValue(edm::ParameterDescription<std::string>("dnnInputsVersion", "v2", true),
0269                edm::allowedValues<std::string>("v1", "v2"))
0270       ->setComment(
0271           "DNN inputs version tag. Defines which set of features is fed to the DNN. Must match with the actual DNN.");
0272   // Cuts are intentionally looser than those used for inference in TracksterLinkingBySuperClustering.cpp
0273   desc.add<double>("deltaEtaWindow", 0.2)
0274       ->setComment(
0275           "Size of delta eta window to consider for superclustering. Seed-candidate pairs outside this window "
0276           "are not considered for DNN inference.");
0277   desc.add<double>("deltaPhiWindow", 0.7)
0278       ->setComment(
0279           "Size of delta phi window to consider for superclustering. Seed-candidate pairs outside this window "
0280           "are not considered for DNN inference.");
0281   desc.add<double>("seedPtThreshold", 3.)
0282       ->setComment("Minimum transverse momentum of trackster to be considered as seed of a supercluster");
0283   desc.add<double>("candidateEnergyThreshold", 1.5)
0284       ->setComment("Minimum energy of trackster to be considered as candidate for superclustering");
0285   desc.add<double>("explVarRatioCut_energyBoundary", 50.)
0286       ->setComment("Boundary energy between low and high energy explVarRatio cut threshold");
0287   desc.add<double>("explVarRatioMinimum_lowEnergy", 0.85)
0288       ->setComment(
0289           "Cut on explained variance ratio of tracksters to be considered as candidate, "
0290           "for trackster raw_energy < explVarRatioCut_energyBoundary");
0291   desc.add<double>("explVarRatioMinimum_highEnergy", 0.9)
0292       ->setComment(
0293           "Cut on explained variance ratio of tracksters to be considered as candidate, "
0294           "for trackster raw_energy > explVarRatioCut_energyBoundary");
0295   descriptions.add("superclusteringSampleDumper", desc);
0296 }
0297 
0298 DEFINE_FWK_MODULE(SuperclusteringSampleDumper);