Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:23:46

0001 #ifndef PhysicsTools_PatAlgos_BaseMVAValueMapProducer
0002 #define PhysicsTools_PatAlgos_BaseMVAValueMapProducer
0003 
0004 // -*- C++ -*-
0005 //
0006 // Package:    PhysicsTools/PatAlgos
0007 // Class:      BaseMVAValueMapProducer
0008 //
0009 /**\class BaseMVAValueMapProducer BaseMVAValueMapProducer.cc PhysicsTools/PatAlgos/plugins/BaseMVAValueMapProducer.cc
0010 
0011  Description: [one line class summary]
0012 
0013  Implementation:
0014      [Notes on implementation]
0015 */
0016 //
0017 // Original Author:  Andre Rizzi
0018 //         Created:  Mon, 07 Sep 2017 09:18:03 GMT
0019 //
0020 //
0021 
0022 // system include files
0023 #include <memory>
0024 
0025 // user include files
0026 #include "FWCore/Framework/interface/Frameworkfwd.h"
0027 #include "FWCore/Framework/interface/stream/EDProducer.h"
0028 
0029 #include "FWCore/Framework/interface/Event.h"
0030 #include "FWCore/Framework/interface/MakerMacros.h"
0031 
0032 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0033 #include "FWCore/Utilities/interface/StreamID.h"
0034 
0035 #include "TMVA/Factory.h"
0036 #include "TMVA/Reader.h"
0037 
0038 #include "CommonTools/Utils/interface/StringObjectFunction.h"
0039 #include "DataFormats/Common/interface/ValueMap.h"
0040 #include "CommonTools/MVAUtils/interface/TMVAZipReader.h"
0041 #include "DataFormats/PatCandidates/interface/Jet.h"
0042 #include "DataFormats/PatCandidates/interface/Muon.h"
0043 #include "DataFormats/PatCandidates/interface/Electron.h"
0044 
0045 #include "DataFormats/PatCandidates/interface/Jet.h"
0046 #include "DataFormats/PatCandidates/interface/Muon.h"
0047 #include "DataFormats/PatCandidates/interface/Electron.h"
0048 #include "PhysicsTools/TensorFlow/interface/TensorFlow.h"
0049 #include "PhysicsTools/ONNXRuntime/interface/ONNXRuntime.h"
0050 
0051 #include <string>
0052 //
0053 // class declaration
0054 //
0055 
0056 class BaseMVACache {
0057 public:
0058   BaseMVACache(const std::string& model_path, const std::string& backend, const bool disableONNXGraphOpt) {
0059     if (backend == "TF") {
0060       graph_.reset(tensorflow::loadGraphDef(model_path));
0061       tf_session_ = tensorflow::createSession(graph_.get());
0062     } else if (backend == "ONNX") {
0063       if (disableONNXGraphOpt) {
0064         Ort::SessionOptions sess_opts;
0065         sess_opts = cms::Ort::ONNXRuntime::defaultSessionOptions();
0066         sess_opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
0067         ort_ = std::make_unique<cms::Ort::ONNXRuntime>(model_path, &sess_opts);
0068       } else {
0069         ort_ = std::make_unique<cms::Ort::ONNXRuntime>(model_path);
0070       }
0071     }
0072   }
0073   ~BaseMVACache() { tensorflow::closeSession(tf_session_); }
0074 
0075   tensorflow::Session* getTFSession() const { return tf_session_; }
0076   const cms::Ort::ONNXRuntime& getONNXSession() const { return *ort_; }
0077 
0078 private:
0079   std::shared_ptr<tensorflow::GraphDef> graph_;
0080   tensorflow::Session* tf_session_ = nullptr;
0081   std::unique_ptr<cms::Ort::ONNXRuntime> ort_;
0082 };
0083 
0084 template <typename T>
0085 class BaseMVAValueMapProducer : public edm::stream::EDProducer<edm::GlobalCache<BaseMVACache>> {
0086 public:
0087   explicit BaseMVAValueMapProducer(const edm::ParameterSet& iConfig, const BaseMVACache* cache)
0088       : src_(consumes<edm::View<T>>(iConfig.getParameter<edm::InputTag>("src"))),
0089         name_(iConfig.getParameter<std::string>("name")),
0090         backend_(iConfig.getParameter<std::string>("backend")),
0091         weightfilename_(iConfig.getParameter<edm::FileInPath>("weightFile").fullPath()),
0092         tmva_(backend_ == "TMVA"),
0093         tf_(backend_ == "TF"),
0094         onnx_(backend_ == "ONNX"),
0095         batch_eval_(iConfig.getParameter<bool>("batch_eval")) {
0096     if (tmva_) {
0097       reader_ = new TMVA::Reader();
0098       isClassifier_ = iConfig.getParameter<bool>("isClassifier");
0099     }
0100 
0101     std::vector<edm::ParameterSet> const& varsPSet = iConfig.getParameter<std::vector<edm::ParameterSet>>("variables");
0102     values_.resize(varsPSet.size());
0103     size_t i = 0;
0104     for (const edm::ParameterSet& var_pset : varsPSet) {
0105       const std::string& vname = var_pset.getParameter<std::string>("name");
0106       if (var_pset.existsAs<std::string>("expr"))
0107         funcs_.emplace_back(
0108             std::pair<std::string, StringObjectFunction<T, true>>(vname, var_pset.getParameter<std::string>("expr")));
0109       positions_[vname] = i;
0110       if (tmva_)
0111         reader_->AddVariable(vname, (&values_.front()) + i);
0112       i++;
0113     }
0114 
0115     if (tmva_) {
0116       reco::details::loadTMVAWeights(reader_, name_, weightfilename_);
0117     }
0118     if (tf_ || onnx_) {
0119       inputTensorName_ = iConfig.getParameter<std::string>("inputTensorName");
0120       outputTensorName_ = iConfig.getParameter<std::string>("outputTensorName");
0121       output_names_ = iConfig.getParameter<std::vector<std::string>>("outputNames");
0122       for (const auto& s : iConfig.getParameter<std::vector<std::string>>("outputFormulas")) {
0123         output_formulas_.push_back(StringObjectFunction<std::vector<float>>(s));
0124       }
0125     }
0126 
0127     if (tmva_)
0128       produces<edm::ValueMap<float>>();
0129     else {
0130       for (const auto& n : output_names_) {
0131         produces<edm::ValueMap<float>>(n);
0132       }
0133     }
0134   }
0135   ~BaseMVAValueMapProducer() override {}
0136 
0137   void setValue(const std::string var, float val) {
0138     if (positions_.find(var) != positions_.end())
0139       values_[positions_[var]] = val;
0140   }
0141 
0142   static std::unique_ptr<BaseMVACache> initializeGlobalCache(const edm::ParameterSet& cfg);
0143   static void globalEndJob(const BaseMVACache* cache);
0144 
0145   static edm::ParameterSetDescription getDescription();
0146   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0147 
0148 private:
0149   void beginStream(edm::StreamID) override{};
0150   void produce(edm::Event&, const edm::EventSetup&) override;
0151   void endStream() override{};
0152 
0153   ///to be implemented in derived classes, filling values for additional variables
0154   virtual void readAdditionalCollections(edm::Event&, const edm::EventSetup&) {}
0155   virtual void fillAdditionalVariables(const T&) {}
0156 
0157   edm::EDGetTokenT<edm::View<T>> src_;
0158   std::map<std::string, size_t> positions_;
0159   std::vector<std::pair<std::string, StringObjectFunction<T, true>>> funcs_;
0160   std::vector<float> values_;
0161   TMVA::Reader* reader_;
0162 
0163   std::string name_;
0164   std::string backend_;
0165   std::string weightfilename_;
0166   bool isClassifier_;
0167   bool tmva_;
0168   bool tf_;
0169   bool onnx_;
0170   bool batch_eval_;
0171   std::string inputTensorName_;
0172   std::string outputTensorName_;
0173   std::vector<std::string> output_names_;
0174   std::vector<StringObjectFunction<std::vector<float>>> output_formulas_;
0175 };
0176 
// Evaluates the configured MVA for every object in the input collection and
// puts the score(s) into the event as ValueMap<float>(s) keyed by that
// collection.
template <typename T>
void BaseMVAValueMapProducer<T>::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
  edm::Handle<edm::View<T>> src;
  iEvent.getByToken(src_, src);
  readAdditionalCollections(iEvent, iSetup);
  // One score vector per output ValueMap: a single one for TMVA, otherwise
  // one per configured output name.
  std::vector<std::vector<float>> mvaOut((tmva_) ? 1 : output_names_.size());
  for (auto& v : mvaOut)
    v.reserve(src->size());

  if (batch_eval_) {
    // Batched path (TF/ONNX): gather every object's inputs into one row-major
    // buffer and run the model once over the whole collection.
    if (!src->empty()) {
      std::vector<float> data;
      data.reserve(src->size() * positions_.size());
      for (auto const& o : *src) {
        for (auto const& p : funcs_) {
          setValue(p.first, p.second(o));
        }
        fillAdditionalVariables(o);
        // values_ now holds this object's full input row; append it.
        data.insert(data.end(), values_.begin(), values_.end());
      }

      std::vector<float> outputs;
      if (tf_) {
        // Single input tensor of shape (nObjects, nVariables).
        tensorflow::TensorShape input_size{(long long int)src->size(), (long long int)positions_.size()};
        tensorflow::NamedTensorList input_tensors;
        input_tensors.resize(1);
        input_tensors[0] =
            tensorflow::NamedTensor(inputTensorName_, tensorflow::Tensor(tensorflow::DT_FLOAT, input_size));
        for (unsigned i = 0; i < data.size(); ++i) {
          input_tensors[0].second.flat<float>()(i) = data[i];
        }
        std::vector<tensorflow::Tensor> output_tensors;
        tensorflow::run(globalCache()->getTFSession(), input_tensors, {outputTensorName_}, &output_tensors);
        // Flatten the model output into `outputs` (nObjects * outdim values).
        for (unsigned i = 0; i < output_tensors.at(0).NumElements(); ++i) {
          outputs.push_back(output_tensors.at(0).flat<float>()(i));
        }
      } else if (onnx_) {
        cms::Ort::FloatArrays inputs{data};
        outputs =
            globalCache()->getONNXSession().run({inputTensorName_}, inputs, {}, {outputTensorName_}, src->size())[0];
      }

      // Per-object slice length of the flattened model output
      // (src->size() > 0 is guaranteed by the empty() check above).
      const unsigned outdim = outputs.size() / src->size();
      for (unsigned i = 0; i < src->size(); ++i) {
        std::vector<float> tmpOut(outputs.begin() + i * outdim, outputs.begin() + (i + 1) * outdim);
        // Post-process each raw output through its configured formula.
        for (size_t k = 0; k < output_names_.size(); k++) {
          mvaOut[k].push_back(output_formulas_[k](tmpOut));
        }
      }
    }
  } else {
    // Per-object path: fill the input row and evaluate the model once per object.
    for (auto const& o : *src) {
      for (auto const& p : funcs_) {
        setValue(p.first, p.second(o));
      }
      fillAdditionalVariables(o);
      if (tmva_) {
        // Classifier -> single MVA score; regression -> first regression target.
        mvaOut[0].push_back(isClassifier_ ? reader_->EvaluateMVA(name_) : reader_->EvaluateRegression(name_)[0]);
      } else {
        std::vector<float> tmpOut;
        if (tf_) {
          //currently support only one input tensor to reuse the TMVA like config
          tensorflow::TensorShape input_size{1, (long long int)positions_.size()};
          tensorflow::NamedTensorList input_tensors;
          input_tensors.resize(1);
          input_tensors[0] =
              tensorflow::NamedTensor(inputTensorName_, tensorflow::Tensor(tensorflow::DT_FLOAT, input_size));
          for (size_t j = 0; j < values_.size(); j++) {
            input_tensors[0].second.matrix<float>()(0, j) = values_[j];
          }
          std::vector<tensorflow::Tensor> outputs;
          tensorflow::run(globalCache()->getTFSession(), input_tensors, {outputTensorName_}, &outputs);
          // Copy the single-row output matrix into tmpOut.
          for (int k = 0; k < outputs.at(0).matrix<float>().dimension(1); k++)
            tmpOut.push_back(outputs.at(0).matrix<float>()(0, k));
        } else if (onnx_) {
          cms::Ort::FloatArrays inputs{values_};
          tmpOut = globalCache()->getONNXSession().run({inputTensorName_}, inputs, {}, {outputTensorName_})[0];
        }
        for (size_t k = 0; k < output_names_.size(); k++)
          mvaOut[k].push_back(output_formulas_[k](tmpOut));
      }
    }
  }

  // Wrap each score vector into a ValueMap and put it into the event; TF/ONNX
  // maps are labelled with the corresponding output name.
  size_t k = 0;
  for (auto& m : mvaOut) {
    std::unique_ptr<edm::ValueMap<float>> mvaV(new edm::ValueMap<float>());
    edm::ValueMap<float>::Filler filler(*mvaV);
    filler.insert(src, m.begin(), m.end());
    filler.fill();
    iEvent.put(std::move(mvaV), (tmva_) ? "" : output_names_[k]);
    k++;
  }
}
0271 
0272 template <typename T>
0273 std::unique_ptr<BaseMVACache> BaseMVAValueMapProducer<T>::initializeGlobalCache(const edm::ParameterSet& cfg) {
0274   std::string backend = cfg.getParameter<std::string>("backend");
0275   bool disableONNXGraphOpt = false;
0276   if (backend == "ONNX")
0277     disableONNXGraphOpt = cfg.getParameter<bool>("disableONNXGraphOpt");
0278   return std::make_unique<BaseMVACache>(
0279       cfg.getParameter<edm::FileInPath>("weightFile").fullPath(), backend, disableONNXGraphOpt);
0280 }
0281 
// Intentionally empty: the cache needs no end-of-job action here — the TF
// session is closed in ~BaseMVACache when the cache itself is destroyed.
template <typename T>
void BaseMVAValueMapProducer<T>::globalEndJob(const BaseMVACache* cache) {}
0284 
0285 template <typename T>
0286 edm::ParameterSetDescription BaseMVAValueMapProducer<T>::getDescription() {
0287   edm::ParameterSetDescription desc;
0288   desc.add<edm::InputTag>("src")->setComment("input physics object collection");
0289 
0290   desc.add<std::string>("name")->setComment("output score variable name");
0291   desc.add<edm::FileInPath>("weightFile")->setComment("xml weight file, or TF/ONNX model file");
0292   desc.add<bool>("batch_eval", false)->setComment("Run inference in batch instead of per-object");
0293 
0294   edm::ParameterSetDescription variable;
0295   variable.add<std::string>("name")->setComment("name of the variable, either created by expr, or internally by code");
0296   variable.addOptional<std::string>("expr")->setComment(
0297       "a function to define the content of the model input, absence of it means the leaf is computed internally");
0298   variable.setComment("a PSet to define an entry to the ML model");
0299   desc.addVPSet("variables", variable);
0300 
0301   auto itn = edm::ParameterDescription<std::string>(
0302       "inputTensorName", "", true, edm::Comment("Name of tensorflow input tensor in the model"));
0303   auto otn = edm::ParameterDescription<std::string>(
0304       "outputTensorName", "", true, edm::Comment("Name of tensorflow output tensor in the model"));
0305   auto on = edm::ParameterDescription<std::vector<std::string>>(
0306       "outputNames",
0307       std::vector<std::string>(),
0308       true,
0309       edm::Comment("Names of the output values to be used in the output valuemap"));
0310   auto of = edm::ParameterDescription<std::vector<std::string>>(
0311       "outputFormulas",
0312       std::vector<std::string>(),
0313       true,
0314       edm::Comment("Formulas to be used to post process the output"));
0315   auto dog = edm::ParameterDescription<bool>(
0316       "disableONNXGraphOpt", false, true, edm::Comment("Disable ONNX runtime graph optimization"));
0317 
0318   desc.ifValue(edm::ParameterDescription<std::string>(
0319                    "backend", "TMVA", true, edm::Comment("the backend to evaluate the model:tmva, tf or onnx")),
0320                "TMVA" >> edm::ParameterDescription<bool>(
0321                              "isClassifier", true, true, edm::Comment("a classification or regression")) or
0322                    "TF" >> (itn and otn and on and of) or "ONNX" >> (itn and otn and on and of and dog));
0323 
0324   return desc;
0325 }
0326 
0327 template <typename T>
0328 void BaseMVAValueMapProducer<T>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0329   edm::ParameterSetDescription desc = getDescription();
0330   std::string modname;
0331   if (typeid(T) == typeid(pat::Jet))
0332     modname += "Jet";
0333   else if (typeid(T) == typeid(pat::Muon))
0334     modname += "Muon";
0335   else if (typeid(T) == typeid(pat::Electron))
0336     modname += "Ele";
0337   modname += "BaseMVAValueMapProducer";
0338   descriptions.add(modname, desc);
0339 }
0340 
0341 #endif