|
||||
File indexing completed on 2024-04-06 12:20:55
0001 ////////////////////////////////////////////////////////////////////////// 0002 // Forest.cxx // 0003 // =====================================================================// 0004 // This is the object implementation of a forest of decision trees. // 0005 // We need this to implement gradient boosting. // 0006 // References include // 0007 // *Elements of Statistical Learning by Hastie, // 0008 // Tibshirani, and Friedman. // 0009 // *Greedy Function Approximation: A Gradient Boosting Machine. // 0010 // Friedman. The Annals of Statistics, Vol. 29, No. 5. Oct 2001. // 0011 // *Inductive Learning of Tree-based Regression Models. Luis Torgo. // 0012 // // 0013 ////////////////////////////////////////////////////////////////////////// 0014 0015 /////////////////////////////////////////////////////////////////////////// 0016 // _______________________Includes_______________________________________// 0017 /////////////////////////////////////////////////////////////////////////// 0018 0019 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Forest.h" 0020 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Utilities.h" 0021 0022 #include "FWCore/ParameterSet/interface/FileInPath.h" 0023 0024 #include "TStopwatch.h" 0025 #include "TString.h" 0026 0027 #include <iostream> 0028 #include <sstream> 0029 #include <algorithm> 0030 #include <iterator> 0031 #include <fstream> 0032 #include <utility> 0033 0034 using namespace emtf; 0035 0036 ////////////////////////////////////////////////////////////////////////// 0037 // _______________________Constructor(s)________________________________// 0038 ////////////////////////////////////////////////////////////////////////// 0039 0040 Forest::Forest() { events = std::vector<std::vector<Event*>>(1); } 0041 0042 ////////////////////////////////////////////////////////////////////////// 0043 // ---------------------------------------------------------------------- 0044 ////////////////////////////////////////////////////////////////////////// 0045 0046 Forest::Forest(std::vector<Event*>& trainingEvents) { setTrainingEvents(trainingEvents); } 0047 0048 ///////////////////////////////////////////////////////////////////////// 0049 // _______________________Destructor____________________________________// 0050 ////////////////////////////////////////////////////////////////////////// 0051 0052 Forest::~Forest() { 0053 // When the forest is destroyed it will delete the trees as well as the 0054 // events from the training and testing sets. 0055 // The user may want the events to remain after they destroy the forest 0056 // this should be changed in future upgrades. 0057 0058 for (unsigned int i = 0; i < trees.size(); i++) { 0059 if (trees[i]) 0060 delete trees[i]; 0061 } 0062 } 0063 0064 Forest::Forest(const Forest& forest) { 0065 transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const Tree* tree) { 0066 return new Tree(*tree); 0067 }); 0068 } 0069 0070 Forest& Forest::operator=(const Forest& forest) { 0071 for (unsigned int i = 0; i < trees.size(); i++) { 0072 if (trees[i]) 0073 delete trees[i]; 0074 } 0075 trees.resize(0); 0076 0077 transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const Tree* tree) { 0078 return new Tree(*tree); 0079 }); 0080 return *this; 0081 } 0082 0083 ////////////////////////////////////////////////////////////////////////// 0084 // ______________________Get/Set_Functions______________________________// 0085 ////////////////////////////////////////////////////////////////////////// 0086 0087 void Forest::setTrainingEvents(std::vector<Event*>& trainingEvents) { 0088 // tell the forest which events to use for training 0089 0090 Event* e = trainingEvents[0]; 0091 // Unused variable 0092 // unsigned int numrows = e->data.size(); 0093 0094 // Reset the events matrix. 0095 events = std::vector<std::vector<Event*>>(); 0096 0097 for (unsigned int i = 0; i < e->data.size(); i++) { 0098 events.push_back(trainingEvents); 0099 } 0100 } 0101 0102 ////////////////////////////////////////////////////////////////////////// 0103 // ---------------------------------------------------------------------- 0104 ////////////////////////////////////////////////////////////////////////// 0105 0106 // return a copy of the training events 0107 std::vector<Event*> Forest::getTrainingEvents() { return events[0]; } 0108 0109 ////////////////////////////////////////////////////////////////////////// 0110 // ---------------------------------------------------------------------- 0111 ////////////////////////////////////////////////////////////////////////// 0112 0113 // return the ith tree 0114 Tree* Forest::getTree(unsigned int i) { 0115 if (/*i>=0 && */ i < trees.size()) 0116 return trees[i]; 0117 else { 0118 //std::cout << i << "is an invalid input for getTree. Out of range." << std::endl; 0119 return nullptr; 0120 } 0121 } 0122 0123 ////////////////////////////////////////////////////////////////////////// 0124 // ______________________Various_Helpful_Functions______________________// 0125 ////////////////////////////////////////////////////////////////////////// 0126 0127 unsigned int Forest::size() { 0128 // Return the number of trees in the forest. 0129 return trees.size(); 0130 } 0131 0132 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 0133 //*** Need to make a data structure that includes the next few functions *** 0134 //*** pertaining to events. These don't really have much to do with the *** 0135 //*** forest class. *** 0136 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 0137 0138 ////////////////////////////////////////////////////////////////////////// 0139 // ---------------------------------------------------------------------- 0140 ////////////////////////////////////////////////////////////////////////// 0141 0142 void Forest::listEvents(std::vector<std::vector<Event*>>& e) { 0143 // Simply list the events in each event vector. We have multiple copies 0144 // of the events vector. Each copy is sorted according to a different 0145 // determining variable. 0146 std::cout << std::endl << "Listing Events... " << std::endl; 0147 0148 for (unsigned int i = 0; i < e.size(); i++) { 0149 std::cout << std::endl << "Variable " << i << " vector contents: " << std::endl; 0150 for (unsigned int j = 0; j < e[i].size(); j++) { 0151 e[i][j]->outputEvent(); 0152 } 0153 std::cout << std::endl; 0154 } 0155 } 0156 0157 ////////////////////////////////////////////////////////////////////////// 0158 // ---------------------------------------------------------------------- 0159 ////////////////////////////////////////////////////////////////////////// 0160 0161 // We have to initialize Event::sortingIndex outside of a function since 0162 // it is a static member. 0163 int Event::sortingIndex = 1; 0164 0165 bool compareEvents(Event* e1, Event* e2) { 0166 // Sort the events according to the variable given by the sortingIndex. 0167 return e1->data[Event::sortingIndex] < e2->data[Event::sortingIndex]; 0168 } 0169 ////////////////////////////////////////////////////////////////////////// 0170 // ---------------------------------------------------------------------- 0171 ////////////////////////////////////////////////////////////////////////// 0172 0173 bool compareEventsById(Event* e1, Event* e2) { 0174 // Sort the events by ID. We need this to produce rate plots. 0175 return e1->id < e2->id; 0176 } 0177 ////////////////////////////////////////////////////////////////////////// 0178 // ---------------------------------------------------------------------- 0179 ////////////////////////////////////////////////////////////////////////// 0180 0181 void Forest::sortEventVectors(std::vector<std::vector<Event*>>& e) { 0182 // When a node chooses the optimum split point and split variable it needs 0183 // the events to be sorted according to the variable it is considering. 0184 0185 for (unsigned int i = 0; i < e.size(); i++) { 0186 Event::sortingIndex = i; 0187 std::sort(e[i].begin(), e[i].end(), compareEvents); 0188 } 0189 } 0190 0191 ////////////////////////////////////////////////////////////////////////// 0192 // ---------------------------------------------------------------------- 0193 ////////////////////////////////////////////////////////////////////////// 0194 0195 void Forest::rankVariables(std::vector<int>& rank) { 0196 // This function ranks the determining variables according to their importance 0197 // in determining the fit. Use a low learning rate for better results. 0198 // Separates completely useless variables from useful ones well, 0199 // but isn't the best at separating variables of similar importance. 0200 // This is calculated using the error reduction on the training set. The function 0201 // should be changed to use the testing set, but this works fine for now. 0202 // I will try to change this in the future. 0203 0204 // Initialize the vector v, which will store the total error reduction 0205 // for each variable i in v[i]. 0206 std::vector<double> v(events.size(), 0); 0207 0208 //std::cout << std::endl << "Ranking Variables by Net Error Reduction... " << std::endl; 0209 0210 for (unsigned int j = 0; j < trees.size(); j++) { 0211 trees[j]->rankVariables(v); 0212 } 0213 0214 double max = *std::max_element(v.begin(), v.end()); 0215 0216 // Scale the importance. Maximum importance = 100. 0217 for (unsigned int i = 0; i < v.size(); i++) { 0218 v[i] = 100 * v[i] / max; 0219 } 0220 0221 // Change the storage format so that we can keep the index 0222 // and the value associated after sorting. 0223 std::vector<std::pair<double, int>> w(events.size()); 0224 0225 for (unsigned int i = 0; i < v.size(); i++) { 0226 w[i] = std::pair<double, int>(v[i], i); 0227 } 0228 0229 // Sort so that we can output in order of importance. 0230 std::sort(w.begin(), w.end()); 0231 0232 // Output the results. 0233 for (int i = (v.size() - 1); i >= 0; i--) { 0234 rank.push_back(w[i].second); 0235 // std::cout << "x" << w[i].second << ": " << w[i].first << std::endl; 0236 } 0237 0238 // std::cout << std::endl << "Done." << std::endl << std::endl; 0239 } 0240 0241 ////////////////////////////////////////////////////////////////////////// 0242 // ---------------------------------------------------------------------- 0243 ////////////////////////////////////////////////////////////////////////// 0244 0245 void Forest::saveSplitValues(const char* savefilename) { 0246 // This function gathers all of the split values from the forest and puts them into lists. 0247 0248 std::ofstream splitvaluefile; 0249 splitvaluefile.open(savefilename); 0250 0251 // Initialize the matrix v, which will store the list of split values 0252 // for each variable i in v[i]. 0253 std::vector<std::vector<double>> v(events.size(), std::vector<double>()); 0254 0255 //std::cout << std::endl << "Gathering split values... " << std::endl; 0256 0257 // Gather the split values from each tree in the forest. 0258 for (unsigned int j = 0; j < trees.size(); j++) { 0259 trees[j]->getSplitValues(v); 0260 } 0261 0262 // Sort the lists of split values and remove the duplicates. 0263 for (unsigned int i = 0; i < v.size(); i++) { 0264 std::sort(v[i].begin(), v[i].end()); 0265 v[i].erase(unique(v[i].begin(), v[i].end()), v[i].end()); 0266 } 0267 0268 // Output the results after removing duplicates. 0269 // The 0th variable is special and is not used for splitting, so we start at 1. 0270 for (unsigned int i = 1; i < v.size(); i++) { 0271 TString splitValues; 0272 for (unsigned int j = 0; j < v[i].size(); j++) { 0273 std::stringstream ss; 0274 ss.precision(14); 0275 ss << std::scientific << v[i][j]; 0276 splitValues += ","; 0277 splitValues += ss.str().c_str(); 0278 } 0279 0280 splitValues = splitValues(1, splitValues.Length()); 0281 splitvaluefile << splitValues << std::endl << std::endl; 0282 ; 0283 } 0284 } 0285 ////////////////////////////////////////////////////////////////////////// 0286 // ______________________Update_Events_After_Fitting____________________// 0287 ////////////////////////////////////////////////////////////////////////// 0288 0289 void Forest::updateRegTargets(Tree* tree, double learningRate, LossFunction* l) { 0290 // Prepare the global vector of events for the next tree. 0291 // Update the fit for each event and set the new target value 0292 // for the next tree. 0293 0294 // Get the list of terminal nodes for this tree. 0295 std::list<Node*>& tn = tree->getTerminalNodes(); 0296 0297 // Loop through the terminal nodes. 0298 for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) { 0299 // Get the events in the current terminal region. 0300 std::vector<Event*>& v = (*it)->getEvents()[0]; 0301 0302 // Fit the events depending on the loss function criteria. 0303 double fit = l->fit(v); 0304 0305 // Scale the rate at which the algorithm converges. 0306 fit = learningRate * fit; 0307 0308 // Store the official fit value in the terminal node. 0309 (*it)->setFitValue(fit); 0310 0311 // Loop through each event in the terminal region and update the 0312 // the target for the next tree. 0313 for (unsigned int j = 0; j < v.size(); j++) { 0314 Event* e = v[j]; 0315 e->predictedValue += fit; 0316 e->data[0] = l->target(e); 0317 } 0318 0319 // Release memory. 0320 (*it)->getEvents() = std::vector<std::vector<Event*>>(); 0321 } 0322 } 0323 0324 ///////////////////////////////////////////////////////////////////////// 0325 // ---------------------------------------------------------------------- 0326 ///////////////////////////////////////////////////////////////////////// 0327 0328 void Forest::updateEvents(Tree* tree) { 0329 // Prepare the test events for the next tree. 0330 0331 // Get the list of terminal nodes for this tree. 0332 std::list<Node*>& tn = tree->getTerminalNodes(); 0333 0334 // Loop through the terminal nodes. 0335 for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) { 0336 std::vector<Event*>& v = (*it)->getEvents()[0]; 0337 double fit = (*it)->getFitValue(); 0338 0339 // Loop through each event in the terminal region and update the 0340 // the global event it maps to. 0341 for (unsigned int j = 0; j < v.size(); j++) { 0342 Event* e = v[j]; 0343 e->predictedValue += fit; 0344 } 0345 0346 // Release memory. 0347 (*it)->getEvents() = std::vector<std::vector<Event*>>(); 0348 } 0349 } 0350 0351 ////////////////////////////////////////////////////////////////////////// 0352 // ____________________Do/Test_the Regression___________________________// 0353 ////////////////////////////////////////////////////////////////////////// 0354 0355 void Forest::doRegression(int nodeLimit, 0356 int treeLimit, 0357 double learningRate, 0358 LossFunction* l, 0359 const char* savetreesdirectory, 0360 bool saveTrees) { 0361 // Build the forest using the training sample. 0362 0363 //std::cout << std::endl << "--Building Forest..." << std::endl << std::endl; 0364 0365 // The trees work with a matrix of events where the rows have the same set of events. Each row however 0366 // is sorted according to the feature variable given by event->data[row]. 0367 // If we only had one set of events we would have to sort it according to the 0368 // feature variable every time we want to calculate the best split point for that feature. 0369 // By keeping sorted copies we avoid the sorting operation during splint point calculation 0370 // and save computation time. If we do not sort each of the rows the regression will fail. 0371 //std::cout << "Sorting event vectors..." << std::endl; 0372 sortEventVectors(events); 0373 0374 // See how long the regression takes. 0375 TStopwatch timer; 0376 timer.Start(kTRUE); 0377 0378 for (unsigned int i = 0; i < (unsigned)treeLimit; i++) { 0379 // std::cout << "++Building Tree " << i << "... " << std::endl; 0380 Tree* tree = new Tree(events); 0381 trees.push_back(tree); 0382 tree->buildTree(nodeLimit); 0383 0384 // Update the targets for the next tree to fit. 0385 updateRegTargets(tree, learningRate, l); 0386 0387 // Save trees to xml in some directory. 0388 std::ostringstream ss; 0389 ss << savetreesdirectory << "/" << i << ".xml"; 0390 std::string s = ss.str(); 0391 const char* c = s.c_str(); 0392 0393 if (saveTrees) 0394 tree->saveToXML(c); 0395 } 0396 //std::cout << std::endl; 0397 //std::cout << std::endl << "Done." << std::endl << std::endl; 0398 0399 // std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl; 0400 } 0401 0402 ////////////////////////////////////////////////////////////////////////// 0403 // ---------------------------------------------------------------------- 0404 ////////////////////////////////////////////////////////////////////////// 0405 0406 void Forest::predictEvents(std::vector<Event*>& eventsp, unsigned int numtrees) { 0407 // Predict values for eventsp by running them through the forest up to numtrees. 0408 0409 //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl; 0410 if (numtrees > trees.size()) { 0411 //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl; 0412 numtrees = trees.size(); 0413 } 0414 0415 // i iterates through the trees in the forest. Each tree corrects the last prediction. 0416 for (unsigned int i = 0; i < numtrees; i++) { 0417 //std::cout << "++Tree " << i << "..." << std::endl; 0418 appendCorrection(eventsp, i); 0419 } 0420 } 0421 0422 ////////////////////////////////////////////////////////////////////////// 0423 // ---------------------------------------------------------------------- 0424 ////////////////////////////////////////////////////////////////////////// 0425 0426 void Forest::appendCorrection(std::vector<Event*>& eventsp, int treenum) { 0427 // Update the prediction by appending the next correction. 0428 0429 Tree* tree = trees[treenum]; 0430 tree->filterEvents(eventsp); 0431 0432 // Update the events with their new prediction. 0433 updateEvents(tree); 0434 } 0435 0436 ////////////////////////////////////////////////////////////////////////// 0437 // ---------------------------------------------------------------------- 0438 ////////////////////////////////////////////////////////////////////////// 0439 0440 void Forest::predictEvent(Event* e, unsigned int numtrees) { 0441 // Predict values for eventsp by running them through the forest up to numtrees. 0442 0443 //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl; 0444 if (numtrees > trees.size()) { 0445 //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl; 0446 numtrees = trees.size(); 0447 } 0448 0449 // just like in line #2470 of https://root.cern.ch/doc/master/MethodBDT_8cxx_source.html for gradient boosting 0450 e->predictedValue = trees[0]->getBoostWeight(); 0451 0452 // i iterates through the trees in the forest. Each tree corrects the last prediction. 0453 for (unsigned int i = 0; i < numtrees; i++) { 0454 //std::cout << "++Tree " << i << "..." << std::endl; 0455 appendCorrection(e, i); 0456 } 0457 } 0458 0459 ////////////////////////////////////////////////////////////////////////// 0460 // ---------------------------------------------------------------------- 0461 ////////////////////////////////////////////////////////////////////////// 0462 0463 void Forest::appendCorrection(Event* e, int treenum) { 0464 // Update the prediction by appending the next correction. 0465 0466 Tree* tree = trees[treenum]; 0467 Node* terminalNode = tree->filterEvent(e); 0468 0469 // Update the event with its new prediction. 0470 double fit = terminalNode->getFitValue(); 0471 e->predictedValue += fit; 0472 } 0473 ///////////////////////////////////////////////////////////////////////////////////// 0474 // ---------------------------------------------------------------------------------- 0475 ///////////////////////////////////////////////////////////////////////////////////// 0476 0477 void Forest::loadForestFromXML(const char* directory, unsigned int numTrees) { 0478 // Load a forest that has already been created and stored into XML somewhere. 0479 0480 // Initialize the vector of trees. 0481 trees = std::vector<Tree*>(numTrees); 0482 0483 // Load the Forest. 0484 // std::cout << std::endl << "Loading Forest from XML ... " << std::endl; 0485 for (unsigned int i = 0; i < numTrees; i++) { 0486 trees[i] = new Tree(); 0487 0488 std::stringstream ss; 0489 ss << directory << "/" << i << ".xml"; 0490 0491 trees[i]->loadFromXML(edm::FileInPath(ss.str().c_str()).fullPath().c_str()); 0492 } 0493 0494 //std::cout << "Done." << std::endl << std::endl; 0495 } 0496 0497 void Forest::loadFromCondPayload(const L1TMuonEndCapForest::DForest& forest) { 0498 // Load a forest that has already been created and stored in CondDB. 0499 // Initialize the vector of trees. 0500 unsigned int numTrees = forest.size(); 0501 0502 // clean-up leftovers from previous initialization (if any) 0503 for (unsigned int i = 0; i < trees.size(); i++) { 0504 if (trees[i]) 0505 delete trees[i]; 0506 } 0507 0508 trees = std::vector<Tree*>(numTrees); 0509 0510 // Load the Forest. 0511 for (unsigned int i = 0; i < numTrees; i++) { 0512 trees[i] = new Tree(); 0513 trees[i]->loadFromCondPayload(forest[i]); 0514 } 0515 } 0516 0517 ////////////////////////////////////////////////////////////////////////// 0518 // ___________________Stochastic_Sampling_&_Regression__________________// 0519 ////////////////////////////////////////////////////////////////////////// 0520 0521 void Forest::prepareRandomSubsample(double fraction) { 0522 // We use this for Stochastic Gradient Boosting. Basically you 0523 // take a subsample of the training events and build a tree using 0524 // those. Then use the tree built from the subsample to update 0525 // the predictions for all the events. 0526 0527 subSample = std::vector<std::vector<Event*>>(events.size()); 0528 size_t subSampleSize = fraction * events[0].size(); 0529 0530 // Randomize the first subSampleSize events in events[0]. 0531 shuffle(events[0].begin(), events[0].end(), subSampleSize); 0532 0533 // Get a copy of the random subset we just made. 0534 std::vector<Event*> v(events[0].begin(), events[0].begin() + subSampleSize); 0535 0536 // Initialize and sort the subSample collection. 0537 for (unsigned int i = 0; i < subSample.size(); i++) { 0538 subSample[i] = v; 0539 } 0540 0541 sortEventVectors(subSample); 0542 } 0543 0544 ////////////////////////////////////////////////////////////////////////// 0545 // ---------------------------------------------------------------------- 0546 ////////////////////////////////////////////////////////////////////////// 0547 0548 void Forest::doStochasticRegression( 0549 int nodeLimit, int treeLimit, double learningRate, double fraction, LossFunction* l) { 0550 // If the fraction of events to use is one then this algorithm is slower than doRegression due to the fact 0551 // that we have to sort the events every time we extract a subsample. Without random sampling we simply 0552 // use all of the events and keep them sorted. 0553 0554 // Anyways, this algorithm uses a portion of the events to train each tree. All of the events are updated 0555 // afterwards with the results from the subsample built tree. 0556 0557 // Prepare some things. 0558 sortEventVectors(events); 0559 trees = std::vector<Tree*>(treeLimit); 0560 0561 // See how long the regression takes. 0562 TStopwatch timer; 0563 timer.Start(kTRUE); 0564 0565 // Output the current settings. 0566 // std::cout << std::endl << "Running stochastic regression ... " << std::endl; 0567 //std::cout << "# Nodes: " << nodeLimit << std::endl; 0568 //std::cout << "Learning Rate: " << learningRate << std::endl; 0569 //std::cout << "Bagging Fraction: " << fraction << std::endl; 0570 //std::cout << std::endl; 0571 0572 for (unsigned int i = 0; i < (unsigned)treeLimit; i++) { 0573 // Build the tree using a random subsample. 0574 prepareRandomSubsample(fraction); 0575 trees[i] = new Tree(subSample); 0576 trees[i]->buildTree(nodeLimit); 0577 0578 // Fit all of the events based upon the tree we built using 0579 // the subsample of events. 0580 trees[i]->filterEvents(events[0]); 0581 0582 // Update the targets for the next tree to fit. 0583 updateRegTargets(trees[i], learningRate, l); 0584 0585 // Save trees to xml in some directory. 0586 std::ostringstream ss; 0587 ss << "trees/" << i << ".xml"; 0588 std::string s = ss.str(); 0589 const char* c = s.c_str(); 0590 0591 trees[i]->saveToXML(c); 0592 } 0593 0594 //std::cout << std::endl << "Done." << std::endl << std::endl; 0595 0596 //std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl; 0597 }
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.2.1 LXR engine. The LXR team |