![]() |
|
|||
File indexing completed on 2025-03-23 23:40:23
0001 ////////////////////////////////////////////////////////////////////////// 0002 // Forest.cxx // 0003 // =====================================================================// 0004 // This is the object implementation of a forest of decision trees. // 0005 // We need this to implement gradient boosting. // 0006 // References include // 0007 // *Elements of Statistical Learning by Hastie, // 0008 // Tibshirani, and Friedman. // 0009 // *Greedy Function Approximation: A Gradient Boosting Machine. // 0010 // Friedman. The Annals of Statistics, Vol. 29, No. 5. Oct 2001. // 0011 // *Inductive Learning of Tree-based Regression Models. Luis Torgo. // 0012 // // 0013 ////////////////////////////////////////////////////////////////////////// 0014 0015 /////////////////////////////////////////////////////////////////////////// 0016 // _______________________Includes_______________________________________// 0017 /////////////////////////////////////////////////////////////////////////// 0018 0019 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Forest.h" 0020 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Utilities.h" 0021 0022 #include "FWCore/ParameterSet/interface/FileInPath.h" 0023 0024 #include "TStopwatch.h" 0025 #include "TString.h" 0026 0027 #include <iostream> 0028 #include <sstream> 0029 #include <algorithm> 0030 #include <iterator> 0031 #include <fstream> 0032 #include <utility> 0033 0034 using namespace emtf; 0035 0036 ////////////////////////////////////////////////////////////////////////// 0037 // _______________________Constructor(s)________________________________// 0038 ////////////////////////////////////////////////////////////////////////// 0039 0040 Forest::Forest() { events = std::vector<std::vector<Event*>>(1); } 0041 0042 ////////////////////////////////////////////////////////////////////////// 0043 // ---------------------------------------------------------------------- 0044 ////////////////////////////////////////////////////////////////////////// 0045 0046 Forest::Forest(std::vector<Event*>& trainingEvents) { setTrainingEvents(trainingEvents); } 0047 0048 Forest::Forest(const Forest& forest) { 0049 transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const std::unique_ptr<Tree>& tree) { 0050 return std::make_unique<Tree>(*tree); 0051 }); 0052 } 0053 0054 Forest& Forest::operator=(const Forest& forest) { 0055 trees.resize(0); 0056 trees.reserve(forest.trees.size()); 0057 transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const std::unique_ptr<Tree>& tree) { 0058 return std::make_unique<Tree>(*tree); 0059 }); 0060 return *this; 0061 } 0062 0063 ////////////////////////////////////////////////////////////////////////// 0064 // ______________________Get/Set_Functions______________________________// 0065 ////////////////////////////////////////////////////////////////////////// 0066 0067 void Forest::setTrainingEvents(std::vector<Event*>& trainingEvents) { 0068 // tell the forest which events to use for training 0069 0070 Event* e = trainingEvents[0]; 0071 // Unused variable 0072 // unsigned int numrows = e->data.size(); 0073 0074 // Reset the events matrix. 0075 events = std::vector<std::vector<Event*>>(); 0076 0077 for (unsigned int i = 0; i < e->data.size(); i++) { 0078 events.push_back(trainingEvents); 0079 } 0080 } 0081 0082 ////////////////////////////////////////////////////////////////////////// 0083 // ---------------------------------------------------------------------- 0084 ////////////////////////////////////////////////////////////////////////// 0085 0086 // return a copy of the training events 0087 std::vector<Event*> Forest::getTrainingEvents() { return events[0]; } 0088 0089 ////////////////////////////////////////////////////////////////////////// 0090 // ---------------------------------------------------------------------- 0091 ////////////////////////////////////////////////////////////////////////// 0092 0093 // return the ith tree 0094 Tree* Forest::getTree(unsigned int i) { 0095 if (/*i>=0 && */ i < trees.size()) 0096 return trees[i].get(); 0097 else { 0098 //std::cout << i << "is an invalid input for getTree. Out of range." << std::endl; 0099 return nullptr; 0100 } 0101 } 0102 0103 ////////////////////////////////////////////////////////////////////////// 0104 // ______________________Various_Helpful_Functions______________________// 0105 ////////////////////////////////////////////////////////////////////////// 0106 0107 unsigned int Forest::size() const { 0108 // Return the number of trees in the forest. 0109 return trees.size(); 0110 } 0111 0112 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 0113 //*** Need to make a data structure that includes the next few functions *** 0114 //*** pertaining to events. These don't really have much to do with the *** 0115 //*** forest class. *** 0116 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 0117 0118 ////////////////////////////////////////////////////////////////////////// 0119 // ---------------------------------------------------------------------- 0120 ////////////////////////////////////////////////////////////////////////// 0121 0122 void Forest::listEvents(std::vector<std::vector<Event*>>& e) const { 0123 // Simply list the events in each event vector. We have multiple copies 0124 // of the events vector. Each copy is sorted according to a different 0125 // determining variable. 0126 std::cout << std::endl << "Listing Events... " << std::endl; 0127 0128 for (unsigned int i = 0; i < e.size(); i++) { 0129 std::cout << std::endl << "Variable " << i << " vector contents: " << std::endl; 0130 for (unsigned int j = 0; j < e[i].size(); j++) { 0131 e[i][j]->outputEvent(); 0132 } 0133 std::cout << std::endl; 0134 } 0135 } 0136 0137 ////////////////////////////////////////////////////////////////////////// 0138 // ---------------------------------------------------------------------- 0139 ////////////////////////////////////////////////////////////////////////// 0140 0141 // We have to initialize Event::sortingIndex outside of a function since 0142 // it is a static member. 0143 int Event::sortingIndex = 1; 0144 0145 bool compareEvents(Event* e1, Event* e2) { 0146 // Sort the events according to the variable given by the sortingIndex. 0147 return e1->data[Event::sortingIndex] < e2->data[Event::sortingIndex]; 0148 } 0149 ////////////////////////////////////////////////////////////////////////// 0150 // ---------------------------------------------------------------------- 0151 ////////////////////////////////////////////////////////////////////////// 0152 0153 bool compareEventsById(Event* e1, Event* e2) { 0154 // Sort the events by ID. We need this to produce rate plots. 0155 return e1->id < e2->id; 0156 } 0157 ////////////////////////////////////////////////////////////////////////// 0158 // ---------------------------------------------------------------------- 0159 ////////////////////////////////////////////////////////////////////////// 0160 0161 void Forest::sortEventVectors(std::vector<std::vector<Event*>>& e) const { 0162 // When a node chooses the optimum split point and split variable it needs 0163 // the events to be sorted according to the variable it is considering. 0164 0165 for (unsigned int i = 0; i < e.size(); i++) { 0166 Event::sortingIndex = i; 0167 std::sort(e[i].begin(), e[i].end(), compareEvents); 0168 } 0169 } 0170 0171 ////////////////////////////////////////////////////////////////////////// 0172 // ---------------------------------------------------------------------- 0173 ////////////////////////////////////////////////////////////////////////// 0174 0175 void Forest::rankVariables(std::vector<int>& rank) const { 0176 // This function ranks the determining variables according to their importance 0177 // in determining the fit. Use a low learning rate for better results. 0178 // Separates completely useless variables from useful ones well, 0179 // but isn't the best at separating variables of similar importance. 0180 // This is calculated using the error reduction on the training set. The function 0181 // should be changed to use the testing set, but this works fine for now. 0182 // I will try to change this in the future. 0183 0184 // Initialize the vector v, which will store the total error reduction 0185 // for each variable i in v[i]. 0186 std::vector<double> v(events.size(), 0); 0187 0188 //std::cout << std::endl << "Ranking Variables by Net Error Reduction... " << std::endl; 0189 0190 for (unsigned int j = 0; j < trees.size(); j++) { 0191 trees[j]->rankVariables(v); 0192 } 0193 0194 double max = *std::max_element(v.begin(), v.end()); 0195 0196 // Scale the importance. Maximum importance = 100. 0197 for (unsigned int i = 0; i < v.size(); i++) { 0198 v[i] = 100 * v[i] / max; 0199 } 0200 0201 // Change the storage format so that we can keep the index 0202 // and the value associated after sorting. 0203 std::vector<std::pair<double, int>> w(events.size()); 0204 0205 for (unsigned int i = 0; i < v.size(); i++) { 0206 w[i] = std::pair<double, int>(v[i], i); 0207 } 0208 0209 // Sort so that we can output in order of importance. 0210 std::sort(w.begin(), w.end()); 0211 0212 // Output the results. 0213 for (int i = (v.size() - 1); i >= 0; i--) { 0214 rank.push_back(w[i].second); 0215 // std::cout << "x" << w[i].second << ": " << w[i].first << std::endl; 0216 } 0217 0218 // std::cout << std::endl << "Done." << std::endl << std::endl; 0219 } 0220 0221 ////////////////////////////////////////////////////////////////////////// 0222 // ---------------------------------------------------------------------- 0223 ////////////////////////////////////////////////////////////////////////// 0224 0225 void Forest::saveSplitValues(const char* savefilename) const { 0226 // This function gathers all of the split values from the forest and puts them into lists. 0227 0228 std::ofstream splitvaluefile; 0229 splitvaluefile.open(savefilename); 0230 0231 // Initialize the matrix v, which will store the list of split values 0232 // for each variable i in v[i]. 0233 std::vector<std::vector<double>> v(events.size(), std::vector<double>()); 0234 0235 //std::cout << std::endl << "Gathering split values... " << std::endl; 0236 0237 // Gather the split values from each tree in the forest. 0238 for (unsigned int j = 0; j < trees.size(); j++) { 0239 trees[j]->getSplitValues(v); 0240 } 0241 0242 // Sort the lists of split values and remove the duplicates. 0243 for (unsigned int i = 0; i < v.size(); i++) { 0244 std::sort(v[i].begin(), v[i].end()); 0245 v[i].erase(unique(v[i].begin(), v[i].end()), v[i].end()); 0246 } 0247 0248 // Output the results after removing duplicates. 0249 // The 0th variable is special and is not used for splitting, so we start at 1. 0250 for (unsigned int i = 1; i < v.size(); i++) { 0251 TString splitValues; 0252 for (unsigned int j = 0; j < v[i].size(); j++) { 0253 std::stringstream ss; 0254 ss.precision(14); 0255 ss << std::scientific << v[i][j]; 0256 splitValues += ","; 0257 splitValues += ss.str().c_str(); 0258 } 0259 0260 splitValues = splitValues(1, splitValues.Length()); 0261 splitvaluefile << splitValues << std::endl << std::endl; 0262 ; 0263 } 0264 } 0265 ////////////////////////////////////////////////////////////////////////// 0266 // ______________________Update_Events_After_Fitting____________________// 0267 ////////////////////////////////////////////////////////////////////////// 0268 0269 void Forest::updateRegTargets(Tree* tree, double learningRate, LossFunction* l) { 0270 // Prepare the global vector of events for the next tree. 0271 // Update the fit for each event and set the new target value 0272 // for the next tree. 0273 0274 // Get the list of terminal nodes for this tree. 0275 std::list<Node*>& tn = tree->getTerminalNodes(); 0276 0277 // Loop through the terminal nodes. 0278 for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) { 0279 // Get the events in the current terminal region. 0280 std::vector<Event*>& v = (*it)->getEvents()[0]; 0281 0282 // Fit the events depending on the loss function criteria. 0283 double fit = l->fit(v); 0284 0285 // Scale the rate at which the algorithm converges. 0286 fit = learningRate * fit; 0287 0288 // Store the official fit value in the terminal node. 0289 (*it)->setFitValue(fit); 0290 0291 // Loop through each event in the terminal region and update the 0292 // the target for the next tree. 0293 for (unsigned int j = 0; j < v.size(); j++) { 0294 Event* e = v[j]; 0295 e->predictedValue += fit; 0296 e->data[0] = l->target(e); 0297 } 0298 0299 // Release memory. 0300 (*it)->getEvents() = std::vector<std::vector<Event*>>(); 0301 } 0302 } 0303 0304 ///////////////////////////////////////////////////////////////////////// 0305 // ---------------------------------------------------------------------- 0306 ///////////////////////////////////////////////////////////////////////// 0307 0308 void Forest::updateEvents(Tree* tree) { 0309 // Prepare the test events for the next tree. 0310 0311 // Get the list of terminal nodes for this tree. 0312 std::list<Node*>& tn = tree->getTerminalNodes(); 0313 0314 // Loop through the terminal nodes. 0315 for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) { 0316 std::vector<Event*>& v = (*it)->getEvents()[0]; 0317 double fit = (*it)->getFitValue(); 0318 0319 // Loop through each event in the terminal region and update the 0320 // the global event it maps to. 0321 for (unsigned int j = 0; j < v.size(); j++) { 0322 Event* e = v[j]; 0323 e->predictedValue += fit; 0324 } 0325 0326 // Release memory. 0327 (*it)->getEvents() = std::vector<std::vector<Event*>>(); 0328 } 0329 } 0330 0331 ////////////////////////////////////////////////////////////////////////// 0332 // ____________________Do/Test_the Regression___________________________// 0333 ////////////////////////////////////////////////////////////////////////// 0334 0335 void Forest::doRegression(int nodeLimit, 0336 int treeLimit, 0337 double learningRate, 0338 LossFunction* l, 0339 const char* savetreesdirectory, 0340 bool saveTrees) { 0341 // Build the forest using the training sample. 0342 0343 //std::cout << std::endl << "--Building Forest..." << std::endl << std::endl; 0344 0345 // The trees work with a matrix of events where the rows have the same set of events. Each row however 0346 // is sorted according to the feature variable given by event->data[row]. 0347 // If we only had one set of events we would have to sort it according to the 0348 // feature variable every time we want to calculate the best split point for that feature. 0349 // By keeping sorted copies we avoid the sorting operation during splint point calculation 0350 // and save computation time. If we do not sort each of the rows the regression will fail. 0351 //std::cout << "Sorting event vectors..." << std::endl; 0352 sortEventVectors(events); 0353 0354 // See how long the regression takes. 0355 TStopwatch timer; 0356 timer.Start(kTRUE); 0357 0358 for (unsigned int i = 0; i < (unsigned)treeLimit; i++) { 0359 // std::cout << "++Building Tree " << i << "... " << std::endl; 0360 trees.emplace_back(std::make_unique<Tree>(events)); 0361 Tree* tree = trees.back().get(); 0362 tree->buildTree(nodeLimit); 0363 0364 // Update the targets for the next tree to fit. 0365 updateRegTargets(tree, learningRate, l); 0366 0367 // Save trees to xml in some directory. 0368 std::ostringstream ss; 0369 ss << savetreesdirectory << "/" << i << ".xml"; 0370 std::string s = ss.str(); 0371 const char* c = s.c_str(); 0372 0373 if (saveTrees) 0374 tree->saveToXML(c); 0375 } 0376 //std::cout << std::endl; 0377 //std::cout << std::endl << "Done." << std::endl << std::endl; 0378 0379 // std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl; 0380 } 0381 0382 ////////////////////////////////////////////////////////////////////////// 0383 // ---------------------------------------------------------------------- 0384 ////////////////////////////////////////////////////////////////////////// 0385 0386 void Forest::predictEvents(std::vector<Event*>& eventsp, unsigned int numtrees) { 0387 // Predict values for eventsp by running them through the forest up to numtrees. 0388 0389 //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl; 0390 if (numtrees > trees.size()) { 0391 //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl; 0392 numtrees = trees.size(); 0393 } 0394 0395 // i iterates through the trees in the forest. Each tree corrects the last prediction. 0396 for (unsigned int i = 0; i < numtrees; i++) { 0397 //std::cout << "++Tree " << i << "..." << std::endl; 0398 appendCorrection(eventsp, i); 0399 } 0400 } 0401 0402 ////////////////////////////////////////////////////////////////////////// 0403 // ---------------------------------------------------------------------- 0404 ////////////////////////////////////////////////////////////////////////// 0405 0406 void Forest::appendCorrection(std::vector<Event*>& eventsp, int treenum) { 0407 // Update the prediction by appending the next correction. 0408 0409 Tree* tree = trees[treenum].get(); 0410 tree->filterEvents(eventsp); 0411 0412 // Update the events with their new prediction. 0413 updateEvents(tree); 0414 } 0415 0416 ////////////////////////////////////////////////////////////////////////// 0417 // ---------------------------------------------------------------------- 0418 ////////////////////////////////////////////////////////////////////////// 0419 0420 void Forest::predictEvent(Event* e, unsigned int numtrees) { 0421 // Predict values for eventsp by running them through the forest up to numtrees. 0422 0423 //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl; 0424 if (numtrees > trees.size()) { 0425 //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl; 0426 numtrees = trees.size(); 0427 } 0428 0429 // just like in line #2470 of https://root.cern.ch/doc/master/MethodBDT_8cxx_source.html for gradient boosting 0430 e->predictedValue = trees[0]->getBoostWeight(); 0431 0432 // i iterates through the trees in the forest. Each tree corrects the last prediction. 0433 for (unsigned int i = 0; i < numtrees; i++) { 0434 //std::cout << "++Tree " << i << "..." << std::endl; 0435 appendCorrection(e, i); 0436 } 0437 } 0438 0439 ////////////////////////////////////////////////////////////////////////// 0440 // ---------------------------------------------------------------------- 0441 ////////////////////////////////////////////////////////////////////////// 0442 0443 void Forest::appendCorrection(Event* e, int treenum) { 0444 // Update the prediction by appending the next correction. 0445 0446 Tree* tree = trees[treenum].get(); 0447 Node* terminalNode = tree->filterEvent(e); 0448 0449 // Update the event with its new prediction. 0450 double fit = terminalNode->getFitValue(); 0451 e->predictedValue += fit; 0452 } 0453 ///////////////////////////////////////////////////////////////////////////////////// 0454 // ---------------------------------------------------------------------------------- 0455 ///////////////////////////////////////////////////////////////////////////////////// 0456 0457 void Forest::loadForestFromXML(const char* directory, unsigned int numTrees) { 0458 // Load a forest that has already been created and stored into XML somewhere. 0459 0460 // Initialize the vector of trees. 0461 trees.resize(numTrees); 0462 trees.shrink_to_fit(); 0463 0464 // Load the Forest. 0465 // std::cout << std::endl << "Loading Forest from XML ... " << std::endl; 0466 for (unsigned int i = 0; i < numTrees; i++) { 0467 trees[i] = std::make_unique<Tree>(); 0468 0469 std::stringstream ss; 0470 ss << directory << "/" << i << ".xml"; 0471 0472 trees[i]->loadFromXML(edm::FileInPath(ss.str().c_str()).fullPath().c_str()); 0473 } 0474 0475 //std::cout << "Done." << std::endl << std::endl; 0476 } 0477 0478 void Forest::loadFromCondPayload(const L1TMuonEndCapForest::DForest& forest) { 0479 // Load a forest that has already been created and stored in CondDB. 0480 // Initialize the vector of trees. 0481 unsigned int numTrees = forest.size(); 0482 0483 trees.resize(numTrees); 0484 trees.shrink_to_fit(); 0485 0486 // Load the Forest. 0487 for (unsigned int i = 0; i < numTrees; i++) { 0488 trees[i] = std::make_unique<Tree>(); 0489 trees[i]->loadFromCondPayload(forest[i]); 0490 } 0491 } 0492 0493 ////////////////////////////////////////////////////////////////////////// 0494 // ___________________Stochastic_Sampling_&_Regression__________________// 0495 ////////////////////////////////////////////////////////////////////////// 0496 0497 void Forest::prepareRandomSubsample(double fraction) { 0498 // We use this for Stochastic Gradient Boosting. Basically you 0499 // take a subsample of the training events and build a tree using 0500 // those. Then use the tree built from the subsample to update 0501 // the predictions for all the events. 0502 0503 subSample = std::vector<std::vector<Event*>>(events.size()); 0504 size_t subSampleSize = fraction * events[0].size(); 0505 0506 // Randomize the first subSampleSize events in events[0]. 0507 shuffle(events[0].begin(), events[0].end(), subSampleSize); 0508 0509 // Get a copy of the random subset we just made. 0510 std::vector<Event*> v(events[0].begin(), events[0].begin() + subSampleSize); 0511 0512 // Initialize and sort the subSample collection. 0513 for (unsigned int i = 0; i < subSample.size(); i++) { 0514 subSample[i] = v; 0515 } 0516 0517 sortEventVectors(subSample); 0518 } 0519 0520 ////////////////////////////////////////////////////////////////////////// 0521 // ---------------------------------------------------------------------- 0522 ////////////////////////////////////////////////////////////////////////// 0523 0524 void Forest::doStochasticRegression( 0525 int nodeLimit, int treeLimit, double learningRate, double fraction, LossFunction* l) { 0526 // If the fraction of events to use is one then this algorithm is slower than doRegression due to the fact 0527 // that we have to sort the events every time we extract a subsample. Without random sampling we simply 0528 // use all of the events and keep them sorted. 0529 0530 // Anyways, this algorithm uses a portion of the events to train each tree. All of the events are updated 0531 // afterwards with the results from the subsample built tree. 0532 0533 // Prepare some things. 0534 sortEventVectors(events); 0535 trees.resize(treeLimit); 0536 trees.shrink_to_fit(); 0537 0538 // See how long the regression takes. 0539 TStopwatch timer; 0540 timer.Start(kTRUE); 0541 0542 // Output the current settings. 0543 // std::cout << std::endl << "Running stochastic regression ... " << std::endl; 0544 //std::cout << "# Nodes: " << nodeLimit << std::endl; 0545 //std::cout << "Learning Rate: " << learningRate << std::endl; 0546 //std::cout << "Bagging Fraction: " << fraction << std::endl; 0547 //std::cout << std::endl; 0548 0549 for (unsigned int i = 0; i < (unsigned)treeLimit; i++) { 0550 // Build the tree using a random subsample. 0551 prepareRandomSubsample(fraction); 0552 trees[i] = std::make_unique<Tree>(subSample); 0553 trees[i]->buildTree(nodeLimit); 0554 0555 // Fit all of the events based upon the tree we built using 0556 // the subsample of events. 0557 trees[i]->filterEvents(events[0]); 0558 0559 // Update the targets for the next tree to fit. 0560 updateRegTargets(trees[i].get(), learningRate, l); 0561 0562 // Save trees to xml in some directory. 0563 std::ostringstream ss; 0564 ss << "trees/" << i << ".xml"; 0565 std::string s = ss.str(); 0566 const char* c = s.c_str(); 0567 0568 trees[i]->saveToXML(c); 0569 } 0570 0571 //std::cout << std::endl << "Done." << std::endl << std::endl; 0572 0573 //std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl; 0574 }
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.2.1 LXR engine. The LXR team |
![]() ![]() |