Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:25:04

0001 
#include "RecoEgamma/EgammaTools/interface/EgammaDNNHelper.h"

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Utilities/interface/FileInPath.h"

#include <algorithm>
#include <fstream>
#include <iostream>
#include <memory>
#include <utility>
0007 using namespace egammaTools;
0008 
// Constructor: stores the DNN configuration and the model-selector functor,
// then eagerly loads one TF graph definition per configured model file and
// parses the matching scaler files.
// availableVars: full list of input variables the caller can provide; used
// to validate the variables requested by each scaler configuration.
EgammaDNNHelper::EgammaDNNHelper(const DNNConfiguration& cfg,
                                 const ModelSelector& modelSelector,
                                 const std::vector<std::string>& availableVars)
    : cfg_(cfg), modelSelector_(modelSelector), nModels_(cfg_.modelsFiles.size()), graphDefs_(cfg_.modelsFiles.size()) {
  initTensorFlowGraphs();
  initScalerFiles(availableVars);
}
0016 
0017 void EgammaDNNHelper::initTensorFlowGraphs() {
0018   // load the graph definition
0019   LogDebug("EgammaDNNHelper") << "Loading " << nModels_ << " graphs";
0020   size_t i = 0;
0021   for (const auto& model_file : cfg_.modelsFiles) {
0022     graphDefs_[i] =
0023         std::unique_ptr<tensorflow::GraphDef>(tensorflow::loadGraphDef(edm::FileInPath(model_file).fullPath()));
0024     i++;
0025   }
0026 }
0027 
0028 std::vector<tensorflow::Session*> EgammaDNNHelper::getSessions() const {
0029   std::vector<tensorflow::Session*> sessions;
0030   LogDebug("EgammaDNNHelper") << "Starting " << nModels_ << " TF sessions";
0031   sessions.reserve(graphDefs_.size());
0032   for (const auto& graphDef : graphDefs_) {
0033     sessions.push_back(tensorflow::createSession(graphDef.get()));
0034   }
0035   LogDebug("EgammaDNNHelper") << "TF sessions started";
0036   return sessions;
0037 }
0038 
0039 void EgammaDNNHelper::initScalerFiles(const std::vector<std::string>& availableVars) {
0040   for (const auto& scaler_file : cfg_.scalersFiles) {
0041     // Parse scaler configuration
0042     std::vector<ScalerConfiguration> features;
0043     std::ifstream inputfile_scaler{edm::FileInPath(scaler_file).fullPath()};
0044     int ninputs = 0;
0045     if (inputfile_scaler.fail()) {
0046       throw cms::Exception("MissingFile") << "Scaler file for PFid DNN not found";
0047     } else {
0048       // Now read mean, scale factors for each variable
0049       float par1, par2;
0050       std::string varName, type_str;
0051       uint type;
0052       while (inputfile_scaler >> varName >> type_str >> par1 >> par2) {
0053         if (type_str == "stdscale")
0054           type = 1;
0055         else if (type_str == "minmax")
0056           type = 2;
0057         else if (type_str == "custom1")  // 2*((X_train - minValues)/(MaxMinusMin)) -1.0
0058           type = 3;
0059         else
0060           type = 0;
0061         features.push_back(ScalerConfiguration{.varName = varName, .type = type, .par1 = par1, .par2 = par2});
0062         // Protection for mismatch between requested variables and the available ones
0063         auto match = std::find(availableVars.begin(), availableVars.end(), varName);
0064         if (match == std::end(availableVars)) {
0065           throw cms::Exception("MissingVariable")
0066               << "Requested variable (" << varName << ") not available between DNN inputs";
0067         }
0068         ninputs += 1;
0069       }
0070     }
0071     inputfile_scaler.close();
0072     featuresMap_.push_back(features);
0073     nInputs_.push_back(ninputs);
0074   }
0075 }
0076 
0077 std::pair<uint, std::vector<float>> EgammaDNNHelper::getScaledInputs(
0078     const std::map<std::string, float>& variables) const {
0079   // Call the modelSelector function passing the variables map to return
0080   // the modelIndex to be used for the current candidate
0081   const auto modelIndex = modelSelector_(variables);
0082   std::vector<float> inputs;
0083   // Loop on the list of requested variables and scaling values for the specific modelIndex
0084   // Different type of scaling are available: 0=no scaling, 1=standard scaler, 2=minmax
0085   for (auto& [varName, type, par1, par2] : featuresMap_[modelIndex]) {
0086     if (type == 1)  // Standard scaling
0087       inputs.push_back((variables.at(varName) - par1) / par2);
0088     else if (type == 2)  // MinMax
0089       inputs.push_back((variables.at(varName) - par1) / (par2 - par1));
0090     else if (type == 3)  //2*((X_train - minValues)/(MaxMinusMin)) -1.0
0091       inputs.push_back(2 * (variables.at(varName) - par1) / (par2 - par1) - 1.);
0092     else {
0093       inputs.push_back(variables.at(varName));  // Do nothing on the variable
0094     }
0095     //Protection for mismatch between requested variables and the available ones
0096     // have been added when the scaler config are loaded --> here we know that the variables are available
0097   }
0098   return std::make_pair(modelIndex, inputs);
0099 }
0100 
0101 std::vector<std::pair<uint, std::vector<float>>> EgammaDNNHelper::evaluate(
0102     const std::vector<std::map<std::string, float>>& candidates,
0103     const std::vector<tensorflow::Session*>& sessions) const {
0104   /*
0105     Evaluate the PFID DNN for all the electrons/photons. 
0106     nModels_ are defined depending on modelIndex  --> we need to build N input tensors to evaluate
0107     the DNNs with batching.
0108     
0109     1) Get all the variable for each candidate  vector<map<string:float>>
0110     2) Scale the input and select the variables for each model
0111     2) Prepare the input tensors for the  models
0112     3) Run the models and get the output for each candidate
0113     4) Sort the output by candidate index
0114     5) Return the DNN outputs along with the model index used on it
0115 
0116     */
0117   size_t nCandidates = candidates.size();
0118   std::vector<std::vector<uint>> indexMap(nModels_);  // for each model; the list of candidate index is saved
0119   std::vector<std::vector<float>> inputsVectors(nCandidates);
0120   std::vector<uint> counts(nModels_);
0121 
0122   LogDebug("EgammaDNNHelper") << "Working on " << nCandidates << " candidates";
0123 
0124   uint icand = 0;
0125   for (auto& candidate : candidates) {
0126     LogDebug("EgammaDNNHelper") << "Working on candidate: " << icand;
0127     const auto& [model_index, inputs] = getScaledInputs(candidate);
0128     counts[model_index] += 1;
0129     indexMap[model_index].push_back(icand);
0130     inputsVectors[icand] = inputs;
0131     icand++;
0132   }
0133 
0134   // Prepare one input tensors for each model
0135   std::vector<tensorflow::Tensor> input_tensors(nModels_);
0136   // Pointers for filling efficiently the input tensors
0137   std::vector<float*> input_tensors_pointer(nModels_);
0138   for (size_t i = 0; i < nModels_; i++) {
0139     LogDebug("EgammaDNNHelper") << "Initializing TF input " << i << " with rows:" << counts[i]
0140                                 << " and cols:" << nInputs_[i];
0141     input_tensors[i] = tensorflow::Tensor{tensorflow::DT_FLOAT, {counts[i], nInputs_[i]}};
0142     input_tensors_pointer[i] = input_tensors[i].flat<float>().data();
0143   }
0144 
0145   // Filling the input tensors
0146   for (size_t m = 0; m < nModels_; m++) {
0147     LogDebug("EgammaDNNHelper") << "Loading TF input tensor for model: " << m;
0148     float* T = input_tensors_pointer[m];
0149     for (size_t cand_index : indexMap[m]) {
0150       for (size_t k = 0; k < nInputs_[m]; k++, T++) {  //Note the input tensor pointer incremented
0151         *T = inputsVectors[cand_index][k];
0152       }
0153     }
0154   }
0155 
0156   // Define the output and run
0157   // The initial output is [(cand_index,(model_index, outputs)),.. ]
0158   std::vector<std::pair<uint, std::pair<uint, std::vector<float>>>> outputs;
0159   // Run all the models
0160   for (size_t m = 0; m < nModels_; m++) {
0161     if (counts[m] == 0)
0162       continue;  //Skip model witout inputs
0163     std::vector<tensorflow::Tensor> output;
0164     LogDebug("EgammaDNNHelper") << "Run model: " << m << " with " << counts[m] << "objects";
0165     tensorflow::run(sessions[m], {{cfg_.inputTensorName, input_tensors[m]}}, {cfg_.outputTensorName}, &output);
0166     // Get the output and save the ElectronDNNEstimator::outputDim numbers along with the ele index
0167     const auto& r = output[0].tensor<float, 2>();
0168     // Iterate on the list of elements in the batch --> many electrons
0169     LogDebug("EgammaDNNHelper") << "Model " << m << " has " << cfg_.outputDim[m] << " nodes!";
0170     for (uint b = 0; b < counts[m]; b++) {
0171       //auto outputDim=cfg_.outputDim;
0172       std::vector<float> result(cfg_.outputDim[m]);
0173       for (size_t k = 0; k < cfg_.outputDim[m]; k++) {
0174         result[k] = r(b, k);
0175         LogDebug("EgammaDNNHelper") << "For Object " << b + 1 << " : Node " << k + 1 << " score = " << r(b, k);
0176       }
0177       // Get the original index of the electorn in the original order
0178       const auto cand_index = indexMap[m][b];
0179       outputs.push_back(std::make_pair(cand_index, std::make_pair(m, result)));
0180     }
0181   }
0182   // Now we have just to re-order the outputs
0183   std::sort(outputs.begin(), outputs.end());
0184   std::vector<std::pair<uint, std::vector<float>>> final_outputs(outputs.size());
0185   std::transform(outputs.begin(), outputs.end(), final_outputs.begin(), [](auto a) { return a.second; });
0186 
0187   return final_outputs;
0188 }