Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:03:54

0001 #ifndef DataFormats_Common_Trie_h
0002 #define DataFormats_Common_Trie_h
0003 /*
0004 ** 
0005 ** 
0006 ** Copyright (C) 2006 Julien Lemoine
0007 ** This program is free software; you can redistribute it and/or modify
0008 ** it under the terms of the GNU Lesser General Public License as published by
0009 ** the Free Software Foundation; either version 2 of the License, or
0010 ** (at your option) any later version.
0011 ** 
0012 ** This program is distributed in the hope that it will be useful,
0013 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
0014 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0015 ** GNU Lesser General Public License for more details.
0016 ** 
0017 ** You should have received a copy of the GNU Lesser General Public License
0018 ** along with this program; if not, write to the Free Software
0019 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0020 **
0021 **
0022 **   modified by Vincenzo Innocente on 15/08/2007
0023 **
0024 */
0025 
0026 #include <list>
0027 #include <string>
0028 
0029 namespace edm {
0030   // fwd declaration
0031   template <typename T>
0032   class TrieNode;
0033 
0034   /**
0035    * The goal of this class is to allocate Trie node by paquet of X
0036    * element in order to reduce heap-admin size
0037    */
0038   template <typename T>
0039   class TrieFactory {
0040   public:
0041     TrieFactory(unsigned paquetSize);
0042     ~TrieFactory();
0043 
0044   private:
0045     /// avoid default constructor
0046     TrieFactory() = delete;
0047     /// avoid copy constructor
0048     TrieFactory(const TrieFactory &e) = delete;
0049     /// avoid affectation operator
0050     TrieFactory &operator=(const TrieFactory &e) = delete;
0051 
0052   public:
0053     TrieNode<T> *newNode(const T &value);
0054     void clear();
0055 
0056   private:
0057     unsigned _paquetSize;
0058     std::list<TrieNode<T> *> _allocatedNodes;
0059     TrieNode<T> *_lastNodes;
0060     unsigned _nbUsedInLastNodes;
0061   };
0062 }  // namespace edm
0063 
0064 namespace edm {
0065   template <typename T>
0066   class TrieNodeIter;
0067 
0068   /**
0069    * @brief this class represent the node of a trie, it contains a
0070    * link to a sub node and a link to a brother (node which have the
0071    * same father)
0072    */
0073   template <typename T>
0074   class TrieNode {
0075   public:
0076     typedef TrieNodeIter<T> const_iterator;
0077 
0078     TrieNode();
0079     ~TrieNode();
0080 
0081   private:
0082     /// avoid copy constructor
0083     TrieNode(const TrieNode &e) = delete;
0084     /// avoid affectation operator
0085     TrieNode &operator=(const TrieNode &e) = delete;
0086 
0087   public:
0088     /// set value associed to node
0089     void setValue(const T &val);
0090     /// get value associed to node
0091     const T &value() const;
0092 
0093     /// get brother (return 0x0 this node has no brother)
0094     const TrieNode<T> *brother() const;
0095     TrieNode<T> *brother();
0096     /// get brother label
0097     unsigned char brotherLabel() const;
0098 
0099     /// initialize subnode iterator (std conforming)
0100     TrieNodeIter<T> begin() const;
0101     /// mark end of iteration (std conforming)
0102     TrieNodeIter<T> end() const;
0103 
0104     // get first sub Node
0105     const TrieNode<T> *subNode() const;
0106     TrieNode<T> *subNode();
0107     unsigned char subNodeLabel() const;
0108 
0109     // Looking for a sub node
0110     const TrieNode<T> *subNodeByLabel(unsigned char chr) const;
0111     TrieNode<T> *subNodeByLabel(unsigned char chr);
0112 
0113     // add an edge
0114     void addSubNode(unsigned char chr, TrieNode<T> *node);
0115 
0116     /// display content of node in output stream
0117     void display(std::ostream &os, unsigned offset, unsigned char label) const;
0118 
0119     /// clear content of TrieNode
0120     void clear();
0121 
0122   private:
0123     template <typename Node>
0124     Node _sequentialSearch(Node first, unsigned char label, unsigned char val) const;
0125     /// set brother (used by sort)
0126     void _setBrother(TrieNode<T> *brother, unsigned char brotherLabel);
0127     /// add a new brother
0128     void _addBrother(unsigned char chr, TrieNode<T> *brother);
0129     /**
0130      * @ brief get brother that has the label chr (return 0x0 if brother is
0131      * not found)
0132      */
0133     const TrieNode<T> *_getBrother(unsigned char chr) const;
0134     /**
0135      * @ brief get brother that has the label chr (return 0x0 if brother is
0136      * not found)
0137      */
0138     TrieNode<T> *_getBrother(unsigned char chr);
0139 
0140   private:
0141     /// pointer to brother (node with same father as this one)
0142     TrieNode<T> *_brother;
0143     /// character to go to brother (node with same father as this one)
0144     unsigned char _brotherLabel;
0145     /// pointer to first sub node
0146     TrieNode<T> *_firstSubNode;
0147     /// character to go to first subnode
0148     unsigned char _firstSubNodeLabel;
0149     /// value associed to this node
0150     T _value;
0151   };
0152 }  // namespace edm
0153 
0154 #include <ostream>
0155 
0156 namespace edm {
0157   //fwd declaration
0158   template <typename T>
0159   class TrieFactory;
0160   template <typename T>
0161   class TrieNode;
0162 
0163   /**
0164    * Implement a trie in memory with the smallest structure as possible
0165    * (use few RAM as possible)
0166    */
0167   template <typename T>
0168   class Trie {
0169   public:
0170     /// constuctor, empty is the value returned when no match in found
0171     /// in trie
0172     Trie(const T &empty);
0173     ~Trie();
0174 
0175   private:
0176     /// avoid default constructor
0177     Trie() = delete;
0178     /// avoid copy constructor
0179     Trie(const Trie &e) = delete;
0180     /// avoid affectation operator
0181     Trie &operator=(const Trie &e) = delete;
0182 
0183   public:
0184     /// add an entry in the Trie, if entry already exist an exception
0185     /// is throw
0186     void insert(std::string const &str, const T &value);
0187     void insert(const char *str, unsigned strLen, const T &value);
0188     /// associates a value to a string, if string is already in Trie,
0189     /// value is overwriten
0190     void setEntry(std::string const &str, const T &value);
0191     void setEntry(const char *str, unsigned strLen, const T &value);
0192     /// get an entry in the Trie
0193     const T &find(std::string const &str) const;
0194     const T &find(const char *str, unsigned strLen) const;
0195     /// get node matching a string
0196     const TrieNode<T> *node(std::string const &str) const;
0197     const TrieNode<T> *node(const char *str, unsigned strLen) const;
0198     ///  get initial TrieNode
0199     const TrieNode<T> *initialNode() const;
0200     /// display content of trie in output stream
0201     void display(std::ostream &os);
0202     /// clear the content of trie
0203     void clear();
0204 
0205   private:
0206     TrieNode<T> *_addEntry(const char *str, unsigned strLen);
0207 
0208   private:
0209     /// value returned when no match is found in trie
0210     T _empty;
0211     /// factory
0212     TrieFactory<T> *_factory;
0213     /// first node of trie
0214     TrieNode<T> *_initialNode;
0215   };
0216 }  // namespace edm
0217 
0218 #include <boost/iterator/iterator_facade.hpp>
0219 
0220 #include <string>
0221 #include <ostream>
0222 
0223 // iterators and visitors
0224 
0225 namespace edm {
0226 
0227   template <typename T>
0228   class TrieNodeIter : public boost::iterator_facade<TrieNodeIter<T>, TrieNode<T> const, boost::forward_traversal_tag> {
0229   public:
0230     typedef TrieNodeIter<T> self;
0231     typedef TrieNode<T> const node_base;
0232     TrieNodeIter() : m_node(nullptr), m_label(0) {}
0233 
0234     explicit TrieNodeIter(node_base *p) : m_node(p ? p->subNode() : nullptr), m_label(p ? p->subNodeLabel() : 0) {}
0235 
0236     unsigned char label() const { return m_label; }
0237 
0238   private:
0239     friend class boost::iterator_core_access;
0240 
0241     void increment() {
0242       m_label = m_node->brotherLabel();
0243       m_node = m_node->brother();
0244     }
0245 
0246     bool equal(self const &other) const { return this->m_node == other.m_node; }
0247 
0248     node_base &dereference() const { return *m_node; }
0249 
0250     node_base *m_node;
0251     unsigned char m_label;
0252   };
0253 
0254   /// visit each node of the trie
0255   template <typename V, typename T>
0256   void walkTrie(V &v, TrieNode<T> const &n, std::string const &label = "") {
0257     typedef TrieNodeIter<T> node_iterator;
0258     node_iterator e;
0259     for (node_iterator p(&n); p != e; ++p) {
0260       v((*p).value(), label + (char)p.label());
0261       walkTrie(v, *p, label + (char)p.label());
0262     }
0263   }
0264 
0265   /// visits only leaf nodes
0266   template <typename V, typename T>
0267   bool iterateTrieLeaves(V &v, TrieNode<T> const &n, std::string const &label = "") {
0268     typedef TrieNodeIter<T> node_iterator;
0269     node_iterator e;
0270     node_iterator p(&n);
0271     if (p == e)
0272       return true;
0273     for (; p != e; ++p) {
0274       if (iterateTrieLeaves(v, *p, label + (char)p.label()))
0275         v((*p).value(), label + (char)p.label());
0276     }
0277     return false;
0278   }
0279 
0280 }  // namespace edm
0281 
0282 //
0283 //----------------------------------------------------------------
0284 //
0285 // implementations
0286 
0287 template <typename T>
0288 edm::TrieFactory<T>::TrieFactory(unsigned paquetSize)
0289     : _paquetSize(paquetSize), _lastNodes(nullptr), _nbUsedInLastNodes(0) {
0290   _lastNodes = new TrieNode<T>[paquetSize];
0291 }
0292 
0293 template <typename T>
0294 edm::TrieFactory<T>::~TrieFactory() {
0295   typename std::list<TrieNode<T> *>::const_iterator it;
0296 
0297   for (it = _allocatedNodes.begin(); it != _allocatedNodes.end(); ++it)
0298     delete[] *it;
0299   if (_lastNodes)
0300     delete[] _lastNodes;
0301 }
0302 
0303 template <typename T>
0304 edm::TrieNode<T> *edm::TrieFactory<T>::newNode(const T &value) {
0305   if (_nbUsedInLastNodes == _paquetSize) {
0306     _allocatedNodes.push_back(_lastNodes);
0307     _nbUsedInLastNodes = 0;
0308     _lastNodes = new TrieNode<T>[_paquetSize];
0309   }
0310   TrieNode<T> *res = &_lastNodes[_nbUsedInLastNodes];
0311   ++_nbUsedInLastNodes;
0312   res->setValue(value);
0313   res->clear();
0314   return res;
0315 }
0316 
0317 template <typename T>
0318 void edm::TrieFactory<T>::clear() {
0319   typename std::list<TrieNode<T> *>::const_iterator it;
0320   for (it = _allocatedNodes.begin(); it != _allocatedNodes.end(); ++it)
0321     delete[] *it;
0322   _allocatedNodes.clear();
0323   _nbUsedInLastNodes = 0;
0324 }
0325 
0326 template <typename T>
0327 edm::TrieNode<T>::TrieNode()
0328     : _brother(nullptr),
0329       _brotherLabel(0),
0330       _firstSubNode(nullptr),
0331       _firstSubNodeLabel(0),
0332       _value()
0333 /// we can not set _value here because type is unknown. assert that
0334 /// the value is set later with setValue()
0335 {}
0336 
0337 template <typename T>
0338 edm::TrieNode<T>::~TrieNode() {
0339   // do not delete _brother and _firstSubNode because they are
0340   // allocated by factory (TrieFactory) and factory will delete them
0341 }
0342 
0343 template <typename T>
0344 edm::TrieNodeIter<T> edm::TrieNode<T>::begin() const {
0345   return const_iterator(this);
0346 }
0347 template <typename T>
0348 edm::TrieNodeIter<T> edm::TrieNode<T>::end() const {
0349   return const_iterator(nullptr);
0350 }
0351 
0352 template <typename T>
0353 void edm::TrieNode<T>::setValue(const T &val) {
0354   _value = val;
0355 }
0356 
0357 template <typename T>
0358 const T &edm::TrieNode<T>::value() const {
0359   return _value;
0360 }
0361 
0362 template <typename T>
0363 const edm::TrieNode<T> *edm::TrieNode<T>::brother() const {
0364   return _brother;
0365 }
0366 
0367 template <typename T>
0368 edm::TrieNode<T> *edm::TrieNode<T>::brother() {
0369   return _brother;
0370 }
0371 
0372 template <typename T>
0373 const edm::TrieNode<T> *edm::TrieNode<T>::_getBrother(unsigned char chr) const {
0374   const TrieNode<T> *brother = _brother;
0375   return _sequentialSearch(brother, _brotherLabel, chr);
0376 }
0377 
0378 template <typename T>
0379 edm::TrieNode<T> *edm::TrieNode<T>::_getBrother(unsigned char chr) {
0380   return _sequentialSearch(_brother, _brotherLabel, chr);
0381 }
0382 
0383 template <typename T>
0384 void edm::TrieNode<T>::_addBrother(unsigned char chr, TrieNode<T> *brother) {
0385   if (!_brother || _brotherLabel > chr) {
0386     brother->_setBrother(_brother, _brotherLabel);
0387     _brother = brother;
0388     _brotherLabel = chr;
0389   } else
0390     _brother->_addBrother(chr, brother);
0391 }
0392 
0393 template <typename T>
0394 unsigned char edm::TrieNode<T>::brotherLabel() const {
0395   return _brotherLabel;
0396 }
0397 
0398 template <typename T>
0399 const edm::TrieNode<T> *edm::TrieNode<T>::subNode() const {
0400   return _firstSubNode;
0401 }
0402 
0403 template <typename T>
0404 edm::TrieNode<T> *edm::TrieNode<T>::subNode() {
0405   return _firstSubNode;
0406 }
0407 
0408 template <typename T>
0409 unsigned char edm::TrieNode<T>::subNodeLabel() const {
0410   return _firstSubNodeLabel;
0411 }
0412 
0413 template <typename T>
0414 const edm::TrieNode<T> *edm::TrieNode<T>::subNodeByLabel(unsigned char chr) const {
0415   const TrieNode<T> *first = _firstSubNode;
0416   return _sequentialSearch(first, _firstSubNodeLabel, chr);
0417 }
0418 
0419 template <typename T>
0420 edm::TrieNode<T> *edm::TrieNode<T>::subNodeByLabel(unsigned char chr) {
0421   return _sequentialSearch(_firstSubNode, _firstSubNodeLabel, chr);
0422 }
0423 
0424 template <typename T>
0425 void edm::TrieNode<T>::addSubNode(unsigned char chr, TrieNode<T> *node) {
0426   if (!_firstSubNode || _firstSubNodeLabel > chr) {
0427     node->_setBrother(_firstSubNode, _firstSubNodeLabel);
0428     _firstSubNode = node;
0429     _firstSubNodeLabel = chr;
0430   } else
0431     _firstSubNode->_addBrother(chr, node);
0432 }
0433 
0434 template <typename T>
0435 template <typename Node>
0436 inline Node edm::TrieNode<T>::_sequentialSearch(Node first, unsigned char label, unsigned char val) const {
0437   if (first && label <= val) {
0438     if (label == val)
0439       return first;
0440     return first->_getBrother(val);
0441   }
0442   return 0x0;
0443 }
0444 
0445 template <typename T>
0446 void edm::TrieNode<T>::_setBrother(TrieNode<T> *brother, unsigned char brotherLabel) {
0447   _brother = brother;
0448   _brotherLabel = brotherLabel;
0449 }
0450 
0451 template <typename T>
0452 void edm::TrieNode<T>::display(std::ostream &os, unsigned offset, unsigned char label) const {
0453   unsigned int i;
0454   for (i = 0; i < offset; ++i)
0455     os << " ";
0456   if (label)
0457     os << "label[" << label << "] ";
0458   os << "value[" << _value << "]" << std::endl;
0459   if (_firstSubNode)
0460     _firstSubNode->display(os, offset + 2, _firstSubNodeLabel);
0461   if (_brother)
0462     _brother->display(os, offset, _brotherLabel);
0463 }
0464 
0465 template <typename T>
0466 void edm::TrieNode<T>::clear() {
0467   _brother = nullptr;
0468   _brotherLabel = 0;
0469   _firstSubNode = nullptr;
0470   _firstSubNodeLabel = 0;
0471 }
0472 
0473 #include <vector>
0474 #include <algorithm>
0475 #include <string>
0476 #include <cassert>
0477 
0478 #include "FWCore/Utilities/interface/EDMException.h"
0479 
0480 namespace edm {
0481   namespace detailsTrie {
0482     inline void errorInsert(std::string const &key) {
0483       Exception::throwThis(errors::InvalidReference,
0484                            "Trie::insert called with a key already in collection;\n"
0485                            "key value: ",
0486                            key.c_str());
0487     }
0488   }  // namespace detailsTrie
0489 }  // namespace edm
0490 
0491 template <typename T>
0492 edm::Trie<T>::Trie(const T &empty) : _empty(empty), _factory(nullptr), _initialNode(nullptr) {
0493   // initialize nodes by paquets of 10000
0494   _factory = new TrieFactory<T>(10000);
0495   _initialNode = _factory->newNode(_empty);
0496 }
0497 
0498 template <typename T>
0499 edm::Trie<T>::~Trie() {
0500   delete _factory;
0501 }
0502 
0503 template <typename T>
0504 void edm::Trie<T>::setEntry(std::string const &str, const T &value) {
0505   setEntry(str.c_str(), str.size(), value);
0506 }
0507 template <typename T>
0508 void edm::Trie<T>::setEntry(const char *str, unsigned strLen, const T &value) {
0509   TrieNode<T> *node = _addEntry(str, strLen);
0510   node->setValue(value);
0511 }
0512 
0513 template <typename T>
0514 edm::TrieNode<T> *edm::Trie<T>::_addEntry(const char *str, unsigned strLen) {
0515   unsigned pos = 0;
0516   bool found = true;
0517   TrieNode<T> *node = _initialNode, *previous = nullptr;
0518 
0519   // Look for the part of the word which is in Trie
0520   while (found && pos < strLen) {
0521     found = false;
0522     previous = node;
0523     node = node->subNodeByLabel(str[pos]);
0524     if (node) {
0525       found = true;
0526       ++pos;
0527     }
0528   }
0529 
0530   // Add part of the word which is not in Trie
0531   if (!node || pos != strLen) {
0532     node = previous;
0533     for (unsigned i = pos; i < strLen; ++i) {
0534       TrieNode<T> *newNode = _factory->newNode(_empty);
0535       node->addSubNode(str[pos], newNode);
0536       node = newNode;
0537       ++pos;
0538     }
0539   }
0540   assert(node != nullptr);
0541   return node;
0542 }
0543 
0544 template <typename T>
0545 void edm::Trie<T>::insert(std::string const &str, const T &value) {
0546   insert(str.c_str(), str.size(), value);
0547 }
0548 
0549 template <typename T>
0550 void edm::Trie<T>::insert(const char *str, unsigned strLen, const T &value) {
0551   TrieNode<T> *node = _addEntry(str, strLen);
0552 
0553   // Set the value on the last node
0554   if (node && node->value() != _empty)
0555     detailsTrie::errorInsert(std::string(str, strLen));
0556   node->setValue(value);
0557 }
0558 
0559 template <typename T>
0560 const T &edm::Trie<T>::find(std::string const &str) const {
0561   return find(str.c_str(), str.size());
0562 }
0563 
0564 template <typename T>
0565 const T &edm::Trie<T>::find(const char *str, unsigned strLen) const {
0566   unsigned pos = 0;
0567   bool found = true;
0568   const TrieNode<T> *node = _initialNode;
0569 
0570   while (found && pos < strLen) {
0571     found = false;
0572     node = node->subNodeByLabel(str[pos]);
0573     if (node) {
0574       found = true;
0575       ++pos;
0576     }
0577   }
0578   if (node && pos == strLen)  // The word is complet in the automaton
0579     return node->value();
0580   return _empty;
0581 }
0582 
0583 template <typename T>
0584 edm::TrieNode<T> const *edm::Trie<T>::node(std::string const &str) const {
0585   return node(str.c_str(), str.size());
0586 }
0587 
0588 template <typename T>
0589 edm::TrieNode<T> const *edm::Trie<T>::node(const char *str, unsigned strLen) const {
0590   unsigned pos = 0;
0591   bool found = true;
0592   const TrieNode<T> *node = _initialNode;
0593 
0594   while (found && pos < strLen) {
0595     found = false;
0596     node = node->subNodeByLabel(str[pos]);
0597     if (node) {
0598       found = true;
0599       ++pos;
0600     }
0601   }
0602   return node;
0603 }
0604 
0605 template <typename T>
0606 const edm::TrieNode<T> *edm::Trie<T>::initialNode() const {
0607   return _initialNode;
0608 }
0609 
0610 template <typename T>
0611 void edm::Trie<T>::clear() {
0612   _factory->clear();
0613   _initialNode = _factory->newNode(_empty);
0614 }
0615 
0616 template <typename T>
0617 void edm::Trie<T>::display(std::ostream &os) {
0618   if (_initialNode)
0619     _initialNode->display(os, 0, 0);
0620 }
0621 
0622 #endif  //  DataFormats_Common_Trie_h