#include "RecoEgamma/EgammaTools/interface/EgammaDNNHelper.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Utilities/interface/FileInPath.h"
#include <iostream>
#include <fstream>
#include <algorithm>  // std::find, std::sort, std::transform
using namespace egammaTools;

EgammaDNNHelper::EgammaDNNHelper(const DNNConfiguration& cfg,
                                 const ModelSelector& modelSelector,
                                 const std::vector<std::string>& availableVars)
    : cfg_(cfg), modelSelector_(modelSelector), nModels_(cfg_.modelsFiles.size()), graphDefs_(cfg_.modelsFiles.size()) {
  initTensorFlowGraphs();
  initScalerFiles(availableVars);
}

void EgammaDNNHelper::initTensorFlowGraphs() {
  // Load the graph definition of each configured model
  LogDebug("EgammaDNNHelper") << "Loading " << nModels_ << " graphs";
  size_t i = 0;
  for (const auto& model_file : cfg_.modelsFiles) {
    graphDefs_[i] =
        std::unique_ptr<tensorflow::GraphDef>(tensorflow::loadGraphDef(edm::FileInPath(model_file).fullPath()));
    i++;
  }
}

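// Create one TF session per loaded graph definition. The sessions are not owned by this
// helper; the caller is expected to manage their lifetime (typically closing them with
// tensorflow::closeSession once the evaluation is done).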
std::vector<tensorflow::Session*> EgammaDNNHelper::getSessions() const {
  std::vector<tensorflow::Session*> sessions;
  LogDebug("EgammaDNNHelper") << "Starting " << nModels_ << " TF sessions";
  for (const auto& graphDef : graphDefs_) {
    sessions.push_back(tensorflow::createSession(graphDef.get()));
  }
  LogDebug("EgammaDNNHelper") << "TF sessions started";
  return sessions;
}

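// Parse the scaler configuration files, one per model. Each line of a scaler file
// describes one input variable of the corresponding model, in the order expected by
// the network:
//   <varName> <scalerType> <par1> <par2>
// where <scalerType> is one of "stdscale", "minmax", "custom1" (anything else means
// no scaling). An illustrative (not real) line:  ele_pt stdscale 25.0 15.0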
void EgammaDNNHelper::initScalerFiles(const std::vector<std::string>& availableVars) {
  for (const auto& scaler_file : cfg_.scalersFiles) {
    // Parse scaler configuration
    std::vector<ScalerConfiguration> features;
    std::ifstream inputfile_scaler{edm::FileInPath(scaler_file).fullPath()};
    int ninputs = 0;
    if (inputfile_scaler.fail()) {
      throw cms::Exception("MissingFile") << "Scaler file for PFid DNN not found";
    } else {
      // Read the scaling type and parameters for each variable
      float par1, par2;
      std::string varName, type_str;
      uint type;
      while (inputfile_scaler >> varName >> type_str >> par1 >> par2) {
        if (type_str == "stdscale")
          type = 1;
        else if (type_str == "minmax")
          type = 2;
        else if (type_str == "custom1")  // 2*((X_train - minValues)/(MaxMinusMin)) -1.0
          type = 3;
        else
          type = 0;
        features.push_back(ScalerConfiguration{.varName = varName, .type = type, .par1 = par1, .par2 = par2});
        // Protection for a mismatch between the requested variables and the available ones
        auto match = std::find(availableVars.begin(), availableVars.end(), varName);
        if (match == std::end(availableVars)) {
          throw cms::Exception("MissingVariable")
              << "Requested variable (" << varName << ") not available among the DNN inputs";
        }
        ninputs += 1;
      }
    }
    inputfile_scaler.close();
    featuresMap_.push_back(features);
    nInputs_.push_back(ninputs);
  }
}

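// Scale the raw input variables of a single candidate according to the scaler
// configuration of the model chosen by the model selector. As implied by the formulas
// below, for "stdscale" par1/par2 are the mean and standard deviation of the variable,
// while for "minmax" and "custom1" they are its minimum and maximum in the training sample.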
std::pair<uint, std::vector<float>> EgammaDNNHelper::getScaledInputs(
    const std::map<std::string, float>& variables) const {
  // Call the modelSelector function passing the variables map to get
  // the modelIndex to be used for the current candidate
  const auto modelIndex = modelSelector_(variables);
  std::vector<float> inputs;
  // Loop on the list of requested variables and scaling values for the specific modelIndex.
  // Different types of scaling are available: 0=no scaling, 1=standard scaler, 2=min-max, 3=custom min-max in [-1,1]
  for (auto& [varName, type, par1, par2] : featuresMap_[modelIndex]) {
    if (type == 1)  // Standard scaling
      inputs.push_back((variables.at(varName) - par1) / par2);
    else if (type == 2)  // MinMax
      inputs.push_back((variables.at(varName) - par1) / (par2 - par1));
    else if (type == 3)  // 2*((X_train - minValues)/(MaxMinusMin)) -1.0
      inputs.push_back(2 * (variables.at(varName) - par1) / (par2 - par1) - 1.);
    else {
      inputs.push_back(variables.at(varName));  // Do nothing to the variable
    }
    // The protection against a mismatch between the requested variables and the available
    // ones is applied when the scaler configuration is loaded --> here we know that the
    // variables are available
  }
  return std::make_pair(modelIndex, inputs);
}

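// Evaluate the DNNs for a full collection of candidates. A minimal usage sketch (the
// cfg/modelSelector/candidatesVars names here are illustrative, not defined in this file):
//
//   EgammaDNNHelper helper{cfg, modelSelector, availableVars};
//   auto sessions = helper.getSessions();
//   auto scores = helper.evaluate(candidatesVars, sessions);  // one entry per candidate
//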
std::vector<std::pair<uint, std::vector<float>>> EgammaDNNHelper::evaluate(
    const std::vector<std::map<std::string, float>>& candidates,
    const std::vector<tensorflow::Session*>& sessions) const {
  /*
    Evaluate the PFID DNN for all the electrons/photons.
    nModels_ different models are available and each candidate is assigned to one of them
    by the model selector --> we need to build nModels_ input tensors to evaluate
    the DNNs with batching.

    1) Get all the variables for each candidate (vector<map<string,float>>)
    2) Scale the inputs and select the variables for each model
    3) Prepare the input tensors for the models
    4) Run the models and get the output for each candidate
    5) Sort the outputs by candidate index
    6) Return the DNN outputs along with the model index used for each candidate
  */
  size_t nCandidates = candidates.size();
  std::vector<std::vector<uint>> indexMap(nModels_);  // for each model, the list of candidate indices is saved
  std::vector<std::vector<float>> inputsVectors(nCandidates);
  std::vector<uint> counts(nModels_);

  LogDebug("EgammaDNNHelper") << "Working on " << nCandidates << " candidates";

  uint icand = 0;
  for (auto& candidate : candidates) {
    LogDebug("EgammaDNNHelper") << "Working on candidate: " << icand;
    const auto& [model_index, inputs] = getScaledInputs(candidate);
    counts[model_index] += 1;
    indexMap[model_index].push_back(icand);
    inputsVectors[icand] = inputs;
    icand++;
  }

  // Prepare one input tensor for each model
  std::vector<tensorflow::Tensor> input_tensors(nModels_);
  // Pointers for efficiently filling the input tensors
  std::vector<float*> input_tensors_pointer(nModels_);
  for (size_t i = 0; i < nModels_; i++) {
    LogDebug("EgammaDNNHelper") << "Initializing TF input " << i << " with rows:" << counts[i]
                                << " and cols:" << nInputs_[i];
    input_tensors[i] = tensorflow::Tensor{tensorflow::DT_FLOAT, {counts[i], nInputs_[i]}};
    input_tensors_pointer[i] = input_tensors[i].flat<float>().data();
  }

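  // The tensors are filled through their flat<float>() buffers: TF tensors are stored
  // row-major, so advancing the raw pointer writes the nInputs_[m] values of one
  // candidate (one row) after another.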
  // Filling the input tensors
  for (size_t m = 0; m < nModels_; m++) {
    LogDebug("EgammaDNNHelper") << "Loading TF input tensor for model: " << m;
    float* T = input_tensors_pointer[m];
    for (size_t cand_index : indexMap[m]) {
      for (size_t k = 0; k < nInputs_[m]; k++, T++) {  // Note: the input tensor pointer is incremented
        *T = inputsVectors[cand_index][k];
      }
    }
  }

  // Define the output and run
  // The intermediate output is [(cand_index, (model_index, outputs)), ...]
  std::vector<std::pair<uint, std::pair<uint, std::vector<float>>>> outputs;
  // Run all the models
  for (size_t m = 0; m < nModels_; m++) {
    if (counts[m] == 0)
      continue;  // Skip models without inputs
    std::vector<tensorflow::Tensor> output;
    LogDebug("EgammaDNNHelper") << "Run model: " << m << " with " << counts[m] << " objects";
    tensorflow::run(sessions[m], {{cfg_.inputTensorName, input_tensors[m]}}, {cfg_.outputTensorName}, &output);
    // Get the output and save the cfg_.outputDim[m] numbers along with the candidate index
    const auto& r = output[0].tensor<float, 2>();
    // Iterate on the list of elements in the batch --> many electrons/photons
    LogDebug("EgammaDNNHelper") << "Model " << m << " has " << cfg_.outputDim[m] << " nodes!";
    for (uint b = 0; b < counts[m]; b++) {
      std::vector<float> result(cfg_.outputDim[m]);
      for (size_t k = 0; k < cfg_.outputDim[m]; k++) {
        result[k] = r(b, k);
        LogDebug("EgammaDNNHelper") << "For Object " << b + 1 << " : Node " << k + 1 << " score = " << r(b, k);
      }
      // Get the index of the candidate in the original input order
      const auto cand_index = indexMap[m][b];
      outputs.push_back(std::make_pair(cand_index, std::make_pair(m, result)));
    }
  }
  // Now re-order the outputs by candidate index (the first element of each pair)
  std::sort(outputs.begin(), outputs.end());
  std::vector<std::pair<uint, std::vector<float>>> final_outputs(outputs.size());
  std::transform(outputs.begin(), outputs.end(), final_outputs.begin(), [](auto a) { return a.second; });

  return final_outputs;
}