Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:11:33

0001 #ifndef __SIMPLE_SAX_PARSER_H_
0002 #define __SIMPLE_SAX_PARSER_H_
0003 /*  A simple SAX-like parser. 
0004 
0005     And yes, I know the S in SAX stands for Simple.
0006         
0007     Licensed under GPLv3 license.
0008     
0009     TODO: incomplete support for entities.
0010     TODO: no support for DTD nor <?xml> preamble.
0011  */
0012 
0013 #include <string>
0014 #include <cstdio>
0015 #include <cstdlib>
0016 #include <cassert>
0017 #include <cstring>
0018 #include <iostream>
0019 #include <algorithm>
0020 #include <vector>
0021 
/** Tokenizer helper used by SimpleSAXParser::getToken().

    Only the declaration is visible in this header; the definition lives
    elsewhere.

    NOTE(review): judging from the signature and from how getToken() uses it,
    this presumably reads characters from @a in into @a *buffer until one of
    @a separators is met, may grow the buffer (updating @a *buffer and
    @a *maxSize), and uses @a *firstChar as a one-character lookahead.
    Confirm against the definition, including what the bool result means.
 */
bool fgettoken(std::istream &in, char **buffer, size_t *maxSize, const char *separators, int *firstChar);

/** A simple SAX parser which is able to parse the configuration.

    The state machine for the parser can be drawn by copying and pasting the
    following into graphviz:
  
    digraph {
    IN_DOCUMENT->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_DOCUMENT->IN_DATA [label="nextChar != '<'"];
    
    IN_BEGIN_TAG->IN_BEGIN_ELEMENT [label="nextChar >= 'a' && nextChar < 'Z'"];
    IN_BEGIN_TAG->IN_END_ELEMENT [label= "nextChar == '/'"];
    
    IN_BEGIN_ELEMENT->IN_END_ELEMENT [label="nextChar == '/'"];
    IN_BEGIN_ELEMENT->IN_ELEMENT_WHITESPACE [label="nextChar == ' '"];
    IN_BEGIN_ELEMENT->IN_END_TAG [label="nextChar == '>'"];
    
    IN_ELEMENT_WHITESPACE->IN_ELEMENT_WHITESPACE [ label = "nextChar == \"\\ \\t\\n\""]
    IN_ELEMENT_WHITESPACE->IN_ATTRIBUTE_KEY [ label = "nextChar >= 'a' && nextChar < 'Z'"]
    IN_ELEMENT_WHITESPACE->IN_END_ELEMENT [label="nextChar == '/'"]
    
    IN_END_ELEMENT->IN_END_TAG [label = "nextChar == '>'"];
    
    IN_END_TAG->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_END_TAG->IN_DATA [label="nextChar != '<'"]
    
    IN_DATA->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_DATA->IN_DATA_ENTITY [label="nextChar == '&'"];
    IN_DATA->IN_DONE [label = "nextChar == EOF"];
    
    IN_DATA_ENTITY->IN_DATA [label="nextChar == ';'"];
    
    IN_ATTRIBUTE_KEY->IN_BEGIN_ATTRIBUTE_VALUE [label = "nextChar == '='"]
    
    IN_BEGIN_ATTRIBUTE_VALUE->IN_STRING [label = "nextChar == '\"' || nextChar == '\'' "]
    
    IN_STRING->IN_END_ATTRIBUTE_VALUE [label = "nextChar == quote"]
    IN_STRING->IN_STRING_ENTITY [label = "nextChar == '&'"]
    
    IN_END_ATTRIBUTE_VALUE->IN_ELEMENT_WHITESPACE [label = "nextChar == ' '"]
    IN_END_ATTRIBUTE_VALUE->IN_END_ELEMENT [label = "nextChar == '/'"]
    IN_END_ATTRIBUTE_VALUE->IN_END_TAG [label = "nextChar == '>'"]
    
    IN_STRING_ENTITY->IN_STRING [label = "nextChar == ';'"]
    }    

    Usage: derive from this class and override startElement(), endElement()
    and data(); the defaults provided here do nothing.
    */
class SimpleSAXParser {
public:
  /** A single key/value attribute of an element. */
  struct Attribute {
    std::string key;
    std::string value;

    Attribute(const std::string &iKey, const std::string &iValue) : key(iKey), value(iValue) {}

    /** Orders attributes by key only; the value does not take part. */
    bool operator<(const Attribute &attribute) const { return this->key < attribute.key; }
  };

  using Attributes = std::vector<Attribute>;

  /** Error object carrying a human-readable message. */
  class ParserError {
  public:
    ParserError(const std::string &error) : m_error(error) {}

    /** Returns the stored message as a NUL-terminated string.

        Const-qualified so that it can be called on an error caught by
        const reference. The pointer is only valid while this object lives.
     */
    const char *error() const noexcept { return m_error.c_str(); }

  private:
    std::string m_error;
  };

  /** States of the parser; the names match the nodes of the graph above. */
  enum PARSER_STATES {
    IN_DOCUMENT,
    IN_BEGIN_TAG,
    IN_DONE,
    IN_BEGIN_ELEMENT,
    IN_ELEMENT_WHITESPACE,
    IN_END_ELEMENT,
    IN_ATTRIBUTE_KEY,
    IN_END_TAG,
    IN_DATA,
    IN_BEGIN_ATTRIBUTE_VALUE,
    IN_STRING,
    IN_END_ATTRIBUTE_VALUE,
    IN_STRING_ENTITY,
    IN_DATA_ENTITY
  };

  /** Binds the parser to the stream @a f.

      Keeps a reference to @a f (the stream must outlive the parser),
      allocates a 1024-byte token buffer and immediately reads one character
      from the stream as the initial lookahead.

      The initializers rely on the member declaration order: m_bufferSize is
      declared before m_buffer, and m_in before m_nextChar.

      NOTE(review): the buffer is created with new[] and later handed to
      fgettoken() by pointer-to-pointer together with its size. If fgettoken()
      reallocates it, the allocation and release functions must match; confirm
      against the fgettoken() and destructor definitions.
   */
  SimpleSAXParser(std::istream &f)
      : m_in(f), m_bufferSize(1024), m_buffer(new char[m_bufferSize]), m_nextChar(m_in.get()) {}

  /** Defined out of line (not visible in this header). */
  virtual ~SimpleSAXParser();

  /** Runs the parser; defined out of line (not visible in this header). */
  void parse();

  /** Hooks for derived classes. The default implementations do nothing.

      NOTE(review): presumably invoked by parse() when an element opens, when
      it closes and when character data is met; confirm against parse().
   */
  virtual void startElement(const std::string & /*name*/, Attributes & /*attributes*/) {}
  virtual void endElement(const std::string & /*name*/) {}
  virtual void data(const std::string & /*data*/) {}

  SimpleSAXParser(const SimpleSAXParser &) = delete;                   // stop default
  const SimpleSAXParser &operator=(const SimpleSAXParser &) = delete;  // stop default

private:
  /** Defined out of line (not visible in this header). */
  std::string parseEntity(const std::string &entity);

  /** Fetches a token through fgettoken(), using @a delim as the separator
      set, and returns a copy of the buffer contents as a string.

      NOTE(review): the bool result of fgettoken() is ignored here; if it can
      signal a failure that leaves m_buffer unusable, this needs handling.
   */
  std::string getToken(const char *delim) {
    fgettoken(m_in, &m_buffer, &m_bufferSize, delim, &m_nextChar);
    return m_buffer;
  }

  /** Same as above for a single separator character, except that the
      lookahead is afterwards replaced by the next character of the stream
      (presumably to step over the separator itself; confirm against
      fgettoken()).
   */
  std::string getToken(const char delim) {
    char buf[2] = {delim, 0};
    fgettoken(m_in, &m_buffer, &m_bufferSize, buf, &m_nextChar);
    m_nextChar = m_in.get();
    return m_buffer;
  }

  /** If the lookahead equals @a c, advances the lookahead by one character
      and returns true; otherwise leaves everything untouched and returns
      false.
   */
  bool skipChar(int c) {
    if (m_nextChar != c)
      return false;
    m_nextChar = m_in.get();
    return true;
  }

  /** Returns the current lookahead without consuming it. */
  int nextChar() const { return m_nextChar; }

  std::istream &m_in;    // input stream; not owned
  size_t m_bufferSize;   // size of m_buffer; may be updated by fgettoken()
  char *m_buffer;        // token buffer; may be replaced by fgettoken()
  int m_nextChar;        // one-character lookahead, as returned by istream::get()
  std::vector<std::string> m_elementTags;  // not used in this header; presumably the open-element stack
  Attributes m_attributes;                 // not used in this header; presumably the current element's attributes
};
0152 
0153 // NOTE: put in a .cc if this file is used in more than one place.
0154 #endif  // __SIMPLE_SAX_PARSER_H_