Attribute

PARSER_STATES

ParserError

SimpleSAXParser

Macros

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
#ifndef __SIMPLE_SAX_PARSER_H_
#define __SIMPLE_SAX_PARSER_H_
/*  A simple SAX-like parser. 

    And yes, I know the S in SAX stands for Simple.
        
    Licensed under GPLv3 license.
    
    TODO: incomplete support for entities.
    TODO: no support for DTD nor <?xml> preamble.
 */

#include <string>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <vector>

// Token reader used by SimpleSAXParser::getToken. Only the declaration is visible in this
// header; see the definition for the exact contract.
// NOTE(review): `buffer` and `maxSize` are passed by pointer, presumably so the callee can
// grow the buffer, and `firstChar` appears to carry the caller's one-character lookahead in
// and out -- confirm against the definition, together with the meaning of the bool result.
bool fgettoken(std::istream &in, char **buffer, size_t *maxSize, const char *separators, int *firstChar);

/** A simple SAX parser which is able to parse the configuration.

    The state machine of the parser can be drawn by copying and pasting the
    following into graphviz:
  
    digraph {
    IN_DOCUMENT->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_DOCUMENT->IN_DATA [label="nextChar != '<'"];
    
    IN_BEGIN_TAG->IN_BEGIN_ELEMENT [label="nextChar >= 'a' && nextChar < 'Z'"];
    IN_BEGIN_TAG->IN_END_ELEMENT [label= "nextChar == '/'"];
    
    IN_BEGIN_ELEMENT->IN_END_ELEMENT [label="nextChar == '/'"];
    IN_BEGIN_ELEMENT->IN_ELEMENT_WHITESPACE [label="nextChar == ' '"];
    IN_BEGIN_ELEMENT->IN_END_TAG [label="nextChar == '>'"];
    
    IN_ELEMENT_WHITESPACE->IN_ELEMENT_WHITESPACE [ label = "nextChar == \"\\ \\t\\n\""]
    IN_ELEMENT_WHITESPACE->IN_ATTRIBUTE_KEY [ label = "nextChar >= 'a' && nextChar < 'Z'"]
    IN_ELEMENT_WHITESPACE->IN_END_ELEMENT [label="nextChar == '/'"]
    
    IN_END_ELEMENT->IN_END_TAG [label = "nextChar == '>'"];
    
    IN_END_TAG->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_END_TAG->IN_DATA [label="nextChar != '<'"]
    
    IN_DATA->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_DATA->IN_DATA_ENTITY [label="nextChar == '&'"];
    IN_DATA->IN_DONE [label = "nextChar == EOF"];
    
    IN_DATA_ENTITY->IN_DATA [label="nextChar == ';'"];
    
    IN_ATTRIBUTE_KEY->IN_BEGIN_ATTRIBUTE_VALUE [label = "nextChar == '='"]
    
    IN_BEGIN_ATTRIBUTE_VALUE->IN_STRING [label = "nextChar == '\"' || nextChar == '\'' "]
    
    IN_STRING->IN_END_ATTRIBUTE_VALUE [label = "nextChar == quote"]
    IN_STRING->IN_STRING_ENTITY [label = "nextChar == '&'"]
    
    IN_END_ATTRIBUTE_VALUE->IN_ELEMENT_WHITESPACE [label = "nextChar == ' '"]
    IN_END_ATTRIBUTE_VALUE->IN_END_ELEMENT [label = "nextChar == '/'"]
    IN_END_ATTRIBUTE_VALUE->IN_END_TAG [label = "nextChar == '>'"]
    
    IN_STRING_ENTITY->IN_STRING [label = "nextChar == ';'"]
    }    
    */
class SimpleSAXParser {
public:
  /// Key/value pair describing one attribute handed to startElement().
  struct Attribute {
    std::string key;
    std::string value;

    Attribute(const std::string &iKey, const std::string &iValue) : key(iKey), value(iValue) {}

    /// Orders attributes by key only; the value takes no part in the comparison.
    bool operator<(const Attribute &attribute) const { return this->key < attribute.key; }
  };

  using Attributes = std::vector<Attribute>;

  /// Error object carrying a human readable message.
  class ParserError {
  public:
    ParserError(const std::string &error) : m_error(error) {}

    /// Returns the stored message. The pointer stays valid only while this
    /// object is alive. Declared const so that the message can also be read
    /// from an error caught by const reference.
    const char *error() const { return m_error.c_str(); }

  private:
    std::string m_error;
  };

  /// States of the parser; the transitions are listed in the graphviz
  /// description found in the comment preceding this class.
  /// The enumerator order is kept as is because it fixes the numeric values.
  enum PARSER_STATES {
    IN_DOCUMENT,
    IN_BEGIN_TAG,
    IN_DONE,
    IN_BEGIN_ELEMENT,
    IN_ELEMENT_WHITESPACE,
    IN_END_ELEMENT,
    IN_ATTRIBUTE_KEY,
    IN_END_TAG,
    IN_DATA,
    IN_BEGIN_ATTRIBUTE_VALUE,
    IN_STRING,
    IN_END_ATTRIBUTE_VALUE,
    IN_STRING_ENTITY,
    IN_DATA_ENTITY
  };

  /// Binds the parser to the stream @a f and reads the first character from it
  /// as the initial lookahead.
  ///
  /// Only a reference to @a f is stored, so the stream has to outlive the parser.
  /// The token buffer starts out with 1024 bytes and is value-initialized, so it
  /// always holds a terminated (empty) string, even before fgettoken has written
  /// to it for the first time.
  ///
  /// The initializers rely on the declaration order of the data members:
  /// m_bufferSize has to be declared before m_buffer.
  SimpleSAXParser(std::istream &f)
      : m_in(f), m_bufferSize(1024), m_buffer(new char[m_bufferSize]()), m_nextChar(m_in.get()) {}

  /// Defined out of line.
  /// NOTE(review): presumably releases m_buffer -- confirm in the definition.
  virtual ~SimpleSAXParser();

  /// Defined out of line.
  /// NOTE(review): presumably runs the state machine over the stream and invokes
  /// the callbacks below -- confirm in the definition.
  void parse(void);

  /// Callbacks meant to be overridden by derived classes; the default
  /// implementations do nothing.
  virtual void startElement(const std::string & /*name*/, Attributes & /*attributes*/) {}
  virtual void endElement(const std::string & /*name*/) {}
  virtual void data(const std::string & /*data*/) {}

  // The parser holds a stream reference and a raw buffer: copying is disabled.
  SimpleSAXParser(const SimpleSAXParser &) = delete;                   // stop default
  const SimpleSAXParser &operator=(const SimpleSAXParser &) = delete;  // stop default

private:
  /// Defined out of line.
  std::string parseEntity(const std::string &entity);

  /// Calls fgettoken with the separator set @a delim and returns a copy of the
  /// buffer contents as a string. fgettoken gets the addresses of the buffer,
  /// its size and the lookahead character, so it may update all three.
  /// The boolean result of fgettoken is not checked here.
  std::string getToken(const char *delim) {
    fgettoken(m_in, &m_buffer, &m_bufferSize, delim, &m_nextChar);
    return m_buffer;
  }

  /// Same as above for a single separator character. In addition the lookahead
  /// is replaced by the next character of the stream once fgettoken returns
  /// (presumably to step over the separator -- confirm against fgettoken).
  std::string getToken(const char delim) {
    const char separators[2] = {delim, 0};
    fgettoken(m_in, &m_buffer, &m_bufferSize, separators, &m_nextChar);
    m_nextChar = m_in.get();
    return m_buffer;
  }

  /// Consumes the lookahead character if it is equal to @a c.
  /// @return true if the character matched and the lookahead was advanced,
  ///         false if the lookahead was left untouched.
  bool skipChar(int c) {
    if (m_nextChar != c)
      return false;
    m_nextChar = m_in.get();
    return true;
  }

  /// Returns the current lookahead character without consuming it.
  int nextChar(void) { return m_nextChar; }

  std::istream &m_in;    // Input stream, not owned.
  size_t m_bufferSize;   // Size of m_buffer in bytes; must be declared before m_buffer.
  char *m_buffer;        // Token buffer handed to fgettoken.
  int m_nextChar;        // One character lookahead, as returned by std::istream::get().
  std::vector<std::string> m_elementTags;
  Attributes m_attributes;
};

// NOTE: put in a .cc if this file is used in more than one place.
#endif  // __SIMPLE_SAX_PARSER_H_