Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:25:04

0001 
#include "RecoEgamma/EgammaTools/interface/EgammaDNNHelper.h"

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Utilities/interface/FileInPath.h"

#include <algorithm>
#include <fstream>
#include <iostream>
#include <memory>
#include <utility>
0007 using namespace egammaTools;
0008 
// Constructor: stores the DNN configuration and the model-selector functor,
// then eagerly loads one TF graph definition per configured model file and
// parses the matching scaler files.
// availableVars: full list of input variables the caller can provide; used
// to validate the variables requested by each scaler configuration.
EgammaDNNHelper::EgammaDNNHelper(const DNNConfiguration& cfg,
                                 const ModelSelector& modelSelector,
                                 const std::vector<std::string>& availableVars)
    : cfg_(cfg), modelSelector_(modelSelector), nModels_(cfg_.modelsFiles.size()), graphDefs_(cfg_.modelsFiles.size()) {
  initTensorFlowGraphs();
  initScalerFiles(availableVars);
}
0016 
0017 void EgammaDNNHelper::initTensorFlowGraphs() {
0018   // load the graph definition
0019   LogDebug("EgammaDNNHelper") << "Loading " << nModels_ << " graphs";
0020   size_t i = 0;
0021   for (const auto& model_file : cfg_.modelsFiles) {
0022     graphDefs_[i] =
0023         std::unique_ptr<tensorflow::GraphDef>(tensorflow::loadGraphDef(edm::FileInPath(model_file).fullPath()));
0024     i++;
0025   }
0026 }
0027 
0028 std::vector<tensorflow::Session*> EgammaDNNHelper::getSessions() const {
0029   std::vector<tensorflow::Session*> sessions;
0030   LogDebug("EgammaDNNHelper") << "Starting " << nModels_ << " TF sessions";
0031   sessions.reserve(graphDefs_.size());
0032   for (const auto& graphDef : graphDefs_) {
0033     sessions.push_back(tensorflow::createSession(graphDef.get()));
0034   }
0035   LogDebug("EgammaDNNHelper") << "TF sessions started";
0036   return sessions;
0037 }
0038 
0039 void EgammaDNNHelper::initScalerFiles(const std::vector<std::string>& availableVars) {
0040   for (const auto& scaler_file : cfg_.scalersFiles) {
0041     // Parse scaler configuration
0042     std::vector<ScalerConfiguration> features;
0043     std::ifstream inputfile_scaler{edm::FileInPath(scaler_file).fullPath()};
0044     int ninputs = 0;
0045     if (inputfile_scaler.fail()) {
0046       throw cms::Exception("MissingFile") << "Scaler file for PFid DNN not found";
0047     } else {
0048       // Now read mean, scale factors for each variable
0049       float par1, par2;
0050       std::string varName, type_str;
0051       uint type;
0052       while (inputfile_scaler >> varName >> type_str >> par1 >> par2) {
0053         if (type_str == "stdscale")
0054           type = 1;
0055         else if (type_str == "minmax")
0056           type = 2;
0057         else if (type_str == "custom1")  // 2*((X_train - minValues)/(MaxMinusMin)) -1.0
0058           type = 3;
0059         else
0060           type = 0;
0061         features.push_back(ScalerConfiguration{.varName = varName, .type = type, .par1 = par1, .par2 = par2});
0062         // Protection for mismatch between requested variables and the available ones
0063         auto match = std::find(availableVars.begin(), availableVars.end(), varName);
0064         if (match == std::end(availableVars)) {
0065           throw cms::Exception("MissingVariable")
0066               << "Requested variable (" << varName << ") not available between DNN inputs";
0067         }
0068         ninputs += 1;
0069       }
0070     }
0071     inputfile_scaler.close();
0072     featuresMap_.push_back(features);
0073     nInputs_.push_back(ninputs);
0074   }
0075 }
0076 
0077 std::pair<uint, std::vector<float>> EgammaDNNHelper::getScaledInputs(
0078     const std::map<std::string, float>& variables) const {
0079   // Call the modelSelector function passing the variables map to return
0080   // the modelIndex to be used for the current candidate
0081   const auto modelIndex = modelSelector_(variables);
0082   std::vector<float> inputs;
0083   // Loop on the list of requested variables and scaling values for the specific modelIndex
0084   // Different type of scaling are available: 0=no scaling, 1=standard scaler, 2=minmax
0085   for (auto& [varName, type, par1, par2] : featuresMap_[modelIndex]) {
0086     if (type == 1)  // Standard scaling
0087       inputs.push_back((variables.at(varName) - par1) / par2);
0088     else if (type == 2)  // MinMax
0089       inputs.push_back((variables.at(varName) - par1) / (par2 - par1));
0090     else if (type == 3)  //2*((X_train - minValues)/(MaxMinusMin)) -1.0
0091       inputs.push_back(2 * (variables.at(varName) - par1) / (par2 - par1) - 1.);
0092     else {
0093       inputs.push_back(variables.at(varName));  // Do nothing on the variable
0094     }
0095     //Protection for mismatch between requested variables and the available ones
0096     // have been added when the scaler config are loaded --> here we know that the variables are available
0097   }
0098   return std::make_pair(modelIndex, inputs);
0099 }
0100 
0101 std::vector<std::pair<uint, std::vector<float>>> EgammaDNNHelper::evaluate(
0102     const std::vector<std::map<std::string, float>>& candidates,
0103     const std::vector<tensorflow::Session*>& sessions) const {
0104   /*
0105     Evaluate the PFID DNN for all the electrons/photons. 
0106     nModels_ are defined depending on modelIndex  --> we need to build N input tensors to evaluate
0107     the DNNs with batching.
0108     
0109     1) Get all the variable for each candidate  vector<map<string:float>>
0110     2) Scale the input and select the variables for each model
0111     2) Prepare the input tensors for the  models
0112     3) Run the models and get the output for each candidate
0113     4) Sort the output by candidate index
0114     5) Return the DNN outputs along with the model index used on it
0115 
0116     */
0117   size_t nCandidates = candidates.size();
0118   std::vector<std::vector<uint>> indexMap(nModels_);  // for each model; the list of candidate index is saved
0119   std::vector<std::vector<float>> inputsVectors(nCandidates);
0120   std::vector<uint> counts(nModels_);
0121 
0122   LogDebug("EgammaDNNHelper") << "Working on " << nCandidates << " candidates";
0123 
0124   uint icand = 0;
0125   for (auto& candidate : candidates) {
0126     LogDebug("EgammaDNNHelper") << "Working on candidate: " << icand;
0127     const auto& [model_index, inputs] = getScaledInputs(candidate);
0128     counts[model_index] += 1;
0129     indexMap[model_index].push_back(icand);
0130     inputsVectors[icand] = inputs;
0131     icand++;
0132   }
0133 
0134   // Prepare one input tensors for each model
0135   std::vector<tensorflow::Tensor> input_tensors(nModels_);
0136   // Pointers for filling efficiently the input tensors
0137   std::vector<float*> input_tensors_pointer(nModels_);
0138   for (size_t i = 0; i < nModels_; i++) {
0139     LogDebug("EgammaDNNHelper") << "Initializing TF input " << i << " with rows:" << counts[i]
0140                                 << " and cols:" << nInputs_[i];
0141     input_tensors[i] = tensorflow::Tensor{tensorflow::DT_FLOAT, {counts[i], nInputs_[i]}};
0142     input_tensors_pointer[i] = input_tensors[i].flat<float>().data();
0143   }
0144 
0145   // Filling the input tensors
0146   for (size_t m = 0; m < nModels_; m++) {
0147     LogDebug("EgammaDNNHelper") << "Loading TF input tensor for model: " << m;
0148     float* T = input_tensors_pointer[m];
0149     for (size_t cand_index : indexMap[m]) {
0150       for (size_t k = 0; k < nInputs_[m]; k++, T++) {  //Note the input tensor pointer incremented
0151         *T = inputsVectors[cand_index][k];
0152       }
0153     }
0154   }
0155 
0156   // Define the output and run
0157   // The initial output is [(cand_index,(model_index, outputs)),.. ]
0158   std::vector<std::pair<uint, std::pair<uint, std::vector<float>>>> outputs;
0159   // Run all the models
0160   for (size_t m = 0; m < nModels_; m++) {
0161     if (counts[m] == 0)
0162       continue;  //Skip model witout inputs
0163     std::vector<tensorflow::Tensor> output;
0164     LogDebug("EgammaDNNHelper") << "Run model: " << m << " with " << counts[m] << "objects";
0165     tensorflow::run(sessions[m], {{cfg_.inputTensorName, input_tensors[m]}}, {cfg_.outputTensorName}, &output);
0166     // Get the output and save the ElectronDNNEstimator::outputDim numbers along with the ele index
0167     const auto& r = output[0].tensor<float, 2>();
0168     // Iterate on the list of elements in the batch --> many electrons
0169     LogDebug("EgammaDNNHelper") << "Model " << m << " has " << cfg_.outputDim[m] << " nodes!";
0170     for (uint b = 0; b < counts[m]; b++) {
0171       //auto outputDim=cfg_.outputDim;
0172       std::vector<float> result(cfg_.outputDim[m]);
0173       for (size_t k = 0; k < cfg_.outputDim[m]; k++) {
0174         result[k] = r(b, k);
0175         LogDebug("EgammaDNNHelper") << "For Object " << b + 1 << " : Node " << k + 1 << " score = " << r(b, k);
0176       }
0177       // Get the original index of the electorn in the original order
0178       const auto cand_index = indexMap[m][b];
0179       outputs.push_back(std::make_pair(cand_index, std::make_pair(m, result)));
0180     }
0181   }
0182   // Now we have just to re-order the outputs
0183   std::sort(outputs.begin(), outputs.end());
0184   std::vector<std::pair<uint, std::vector<float>>> final_outputs(outputs.size());
0185   std::transform(outputs.begin(), outputs.end(), final_outputs.begin(), [](auto a) { return a.second; });
0186 
0187   return final_outputs;
0188 }