SimpleSAXParser.cc

CMSSW/Fireworks/Core/src/SimpleSAXParser.cc

Line Code

Line	Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260	`#include "Fireworks/Core/interface/SimpleSAXParser.h"` `/** Helper function to handle entities, i.e. 
characters specified with` `the "&label;" syntax.` `/` `std::string SimpleSAXParser::parseEntity(const std::string &entity) {` `if (entity == "quot")` `return "\"";` `else if (entity == "amp")` `return "&";` `else if (entity == "lt")` `return "<";` `else if (entity == "gt")` `return ">";` `throw ParserError("Unknown entity " + entity);` `}` `void debug_state_machine(enum SimpleSAXParser::PARSER_STATES state) {` `#ifdef SIMPLE_SAX_PARSER_DEBUG` `static char debug_states[] = {"IN_DOCUMENT",` `"IN_BEGIN_TAG",` `"IN_DONE",` `"IN_BEGIN_ELEMENT",` `"IN_ELEMENT_WHITESPACE",` `"IN_END_ELEMENT",` `"IN_ATTRIBUTE_KEY",` `"IN_END_TAG",` `"IN_DATA",` `"IN_BEGIN_ATTRIBUTE_VALUE",` `"IN_STRING",` `"IN_END_ATTRIBUTE_VALUE",` `"IN_STRING_ENTITY",` `"IN_DATA_ENTITY"};` `std::cerr << debug_states[state] << std::endl;` `#endif` `}` `/** Runs the state machine of the parser, invoking startElement(),` `setAttribute(), endElement(), data() virtual methods as approppriate.` `In order have the parser doing something usefull you need to derive from` `it and specialize the above mentioned virtual methods.` `Default implementation is in any case useful to check syntax.` `/` `void SimpleSAXParser::parse(void) {` `enum PARSER_STATES state = IN_DOCUMENT;` `// Current delimiters for strings in attributes.` `char stringDelims[] = "\"&";` `std::string attributeName;` `std::string attributeValue;` `std::string tmp;` `std::string currentData;` `while (state != IN_DONE) {` `debug_state_machine(state);` `switch (state) {` `// FIXME: IN_DOCUMENT should check the dtd...` `case IN_DOCUMENT:` `state = IN_DATA;` `if (skipChar('<'))` `state = IN_BEGIN_TAG;` `break;` `case IN_BEGIN_TAG:` `if (nextChar() >= 'A' && nextChar() <= 'z')` `state = IN_BEGIN_ELEMENT;` `else if (skipChar('/'))` `state = IN_END_ELEMENT;` `else` `throw ParserError("Bad tag");` `break;` `case IN_BEGIN_ELEMENT:` `m_attributes.clear();` `m_elementTags.push_back(getToken(" />"));` `if (nextChar() == ' ')` `state = IN_ELEMENT_WHITESPACE;` 
`else if (skipChar('/'))` `state = IN_END_ELEMENT;` `else if (skipChar('>')) {` `startElement(m_elementTags.back(), m_attributes);` `state = IN_END_TAG;` `} else` `throw ParserError("Bad element.");` `break;` `case IN_ELEMENT_WHITESPACE:` `while (skipChar(' ') \|\| skipChar('\n') \|\| skipChar('\t')) {` `}` `if (nextChar() >= 'A' && nextChar() <= 'z')` `state = IN_ATTRIBUTE_KEY;` `else if (nextChar() == '/')` `state = IN_END_ELEMENT;` `else` `throw ParserError("Syntax error in element" + m_elementTags.back());` `break;` `case IN_ATTRIBUTE_KEY:` `attributeName = getToken('=');` `state = IN_BEGIN_ATTRIBUTE_VALUE;` `break;` `case IN_BEGIN_ATTRIBUTE_VALUE:` `if (skipChar('"')) {` `state = IN_STRING;` `attributeValue.clear();` `stringDelims[0] = '\"';` `} else if (skipChar('\'')) {` `state = IN_STRING;` `attributeValue.clear();` `stringDelims[0] = '\'';` `} else` `throw ParserError("Expecting quotes.");` `break;` `case IN_STRING:` `attributeValue += getToken(stringDelims);` `if (skipChar(stringDelims[0])) {` `// Save the attributes in order, replacing those that are` `// specified more than once.` `Attribute attr(attributeName, attributeValue);` `Attributes::iterator i = std::lower_bound(m_attributes.begin(), m_attributes.end(), attr);` `if (i != m_attributes.end() && i->key == attr.key)` `throw ParserError("Attribute " + i->key + " defined more than once");` `m_attributes.insert(i, attr);` `state = IN_END_ATTRIBUTE_VALUE;` `} else if (skipChar(stringDelims[1]))` `state = IN_STRING_ENTITY;` `else` `throw ParserError("Unexpected end of input at " + attributeValue);` `break;` `case IN_END_ATTRIBUTE_VALUE:` `getToken(" />");` `if (nextChar() == ' ')` `state = IN_ELEMENT_WHITESPACE;` `else if (skipChar('/'))` `state = IN_END_ELEMENT;` `else if (skipChar('>')) {` `startElement(m_elementTags.back(), m_attributes);` `state = IN_END_TAG;` `}` `break;` `case IN_END_ELEMENT:` `tmp = getToken('>');` `if (!tmp.empty() && tmp != m_elementTags.back())` `throw 
ParserError("Non-matching closing element " + tmp + " for " + attributeValue);` `endElement(tmp);` `m_elementTags.pop_back();` `state = IN_END_TAG;` `break;` `case IN_END_TAG:` `if (nextChar() == EOF)` `return;` `else if (skipChar('<'))` `state = IN_BEGIN_TAG;` `else` `state = IN_DATA;` `break;` `case IN_DATA:` `currentData += getToken("<&");` `if (skipChar('&'))` `state = IN_DATA_ENTITY;` `else if (skipChar('<')) {` `data(currentData);` `currentData.clear();` `state = IN_BEGIN_TAG;` `} else if (nextChar() == EOF) {` `data(currentData);` `return;` `} else` `throw ParserError("Unexpected end of input in element " + m_elementTags.back() + currentData);` `break;` `case IN_DATA_ENTITY:` `currentData += parseEntity(getToken(';'));` `state = IN_DATA;` `break;` `case IN_STRING_ENTITY:` `attributeValue += parseEntity(getToken(';'));` `state = IN_STRING;` `break;` `case IN_DONE:` `return;` `}` `}` `}` `SimpleSAXParser::~SimpleSAXParser() { delete[] m_buffer; }` `/* Helper function which gets a token delimited by @a separator from the` `@a file and write it, 0 terminated in the buffer found in @a buffer.` `Notice that if the token is larger than @a maxSize, the buffer is` `reallocated and @a maxSize is updated to the new size.` `The trailing separator after a token is not put in the token and is left` `in the buffer. If @a nextChar is not 0, the delimiter is put there.` `@a in the input stream to be parsed.` `@a buffer a pointer to the buffer where to put the tokens. The buffer will` `be redimensioned accordingly, if the token is larger of the buffer.` `@a maxSize, a pointer to the size of the buffer. 
Notice that in case the` `buffer is reallocated to have more space, maxSize is updated with the new` `size.` `@a firstChar a pointer with the first character in the buffer, notice` `that the first charater in the stream must be obtained` `separately!!!` `@return whether or not we were able to get a (possibly empty) token from` `the file.` `/` `bool fgettoken(std::istream &in, char buffer, size_t maxSize, const char separators, int firstChar) {` `// if the passed first character is EOF or a separator,` `// return an empty otherwise use it as first character` `// of the buffer.` `if (firstChar == EOF \|\| (int)separators[0] == firstChar \|\| strchr(separators + 1, firstChar)) {` `(buffer)[0] = 0;` `return true;` `} else` `(buffer)[0] = (char)firstChar;` `size_t i = 1;` `while (true) {` `if (i >= maxSize) {` `maxSize += 1024;` `buffer = (char )realloc(buffer, maxSize);` `if (!buffer)` `return false;` `}` `int c = in.get();` `if (c == EOF) {` `(buffer)[i] = 0;` `firstChar = c;` `return false;` `}` `if (separators[0] == c \|\| strchr(separators + 1, c)) {` `(buffer)[i] = 0;` `firstChar = c;` `return true;` `}` `(buffer)[i++] = (char)c;` `}` `}`

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260

#include "Fireworks/Core/interface/SimpleSAXParser.h"

/** Helper function to handle entities, i.e. characters specified with
    the "&label;" syntax.

    Only the five entities predefined by XML are understood: quot, amp,
    lt, gt and apos. Numeric character references ("&#NN;") are not
    supported and are reported as unknown entities.

    @a entity the entity label, without the leading '&' and the
     trailing ';'.

    @return the text the entity stands for.

    Throws ParserError if @a entity is not one of the labels above.
  */
std::string SimpleSAXParser::parseEntity(const std::string &entity) {
  if (entity == "quot")
    return "\"";
  else if (entity == "amp")
    return "&";
  else if (entity == "lt")
    return "<";
  else if (entity == "gt")
    return ">";
  else if (entity == "apos")
    // Needed to put a literal ' inside a single-quoted attribute value,
    // which parse() accepts.
    return "'";
  throw ParserError("Unknown entity " + entity);
}

/** Debug helper: prints the symbolic name of @a state on std::cerr.

    The body is compiled only when SIMPLE_SAX_PARSER_DEBUG is defined;
    otherwise the function does nothing.

    NOTE(review): the names below are looked up by the numeric value of
    @a state, so their order has to match the declaration order of
    SimpleSAXParser::PARSER_STATES (declared in the header, not visible
    here) -- confirm when touching either of the two.
  */
void debug_state_machine(enum SimpleSAXParser::PARSER_STATES state) {
#ifdef SIMPLE_SAX_PARSER_DEBUG
  // String literals are const: binding them to a plain `char *` is ill-formed
  // since C++11, hence the const-qualified table.
  static const char *const debug_states[] = {"IN_DOCUMENT",
                                             "IN_BEGIN_TAG",
                                             "IN_DONE",
                                             "IN_BEGIN_ELEMENT",
                                             "IN_ELEMENT_WHITESPACE",
                                             "IN_END_ELEMENT",
                                             "IN_ATTRIBUTE_KEY",
                                             "IN_END_TAG",
                                             "IN_DATA",
                                             "IN_BEGIN_ATTRIBUTE_VALUE",
                                             "IN_STRING",
                                             "IN_END_ATTRIBUTE_VALUE",
                                             "IN_STRING_ENTITY",
                                             "IN_DATA_ENTITY"};

  // Never index past the table, even if the enum grows or the value is bogus.
  const unsigned index = static_cast<unsigned>(state);
  if (index < sizeof(debug_states) / sizeof(debug_states[0]))
    std::cerr << debug_states[index] << std::endl;
  else
    std::cerr << "UNKNOWN_STATE(" << index << ")" << std::endl;
#else
  // Avoid an unused-parameter warning in non-debug builds.
  (void)state;
#endif
}

/** Runs the state machine of the parser, invoking startElement(),
    setAttribute(), endElement(), data() virtual methods as approppriate. 
    In order have the parser doing something usefull you need to derive from
    it and specialize the above mentioned virtual methods.
    
    Default implementation is in any case useful to check syntax.
  */
void SimpleSAXParser::parse(void) {
  enum PARSER_STATES state = IN_DOCUMENT;
  // Current delimiters for strings in attributes.
  char stringDelims[] = "\"&";
  std::string attributeName;
  std::string attributeValue;
  std::string tmp;
  std::string currentData;

  while (state != IN_DONE) {
    debug_state_machine(state);

    switch (state) {
      // FIXME: IN_DOCUMENT should check the dtd...
      case IN_DOCUMENT:
        state = IN_DATA;
        if (skipChar('<'))
          state = IN_BEGIN_TAG;
        break;

      case IN_BEGIN_TAG:
        if (nextChar() >= 'A' && nextChar() <= 'z')
          state = IN_BEGIN_ELEMENT;
        else if (skipChar('/'))
          state = IN_END_ELEMENT;
        else
          throw ParserError("Bad tag");
        break;

      case IN_BEGIN_ELEMENT:
        m_attributes.clear();
        m_elementTags.push_back(getToken(" />"));
        if (nextChar() == ' ')
          state = IN_ELEMENT_WHITESPACE;
        else if (skipChar('/'))
          state = IN_END_ELEMENT;
        else if (skipChar('>')) {
          startElement(m_elementTags.back(), m_attributes);
          state = IN_END_TAG;
        } else
          throw ParserError("Bad element.");
        break;

      case IN_ELEMENT_WHITESPACE:
        while (skipChar(' ') || skipChar('\n') || skipChar('\t')) {
        }

        if (nextChar() >= 'A' && nextChar() <= 'z')
          state = IN_ATTRIBUTE_KEY;
        else if (nextChar() == '/')
          state = IN_END_ELEMENT;
        else
          throw ParserError("Syntax error in element" + m_elementTags.back());
        break;

      case IN_ATTRIBUTE_KEY:
        attributeName = getToken('=');
        state = IN_BEGIN_ATTRIBUTE_VALUE;
        break;

      case IN_BEGIN_ATTRIBUTE_VALUE:
        if (skipChar('"')) {
          state = IN_STRING;
          attributeValue.clear();
          stringDelims[0] = '\"';
        } else if (skipChar('\'')) {
          state = IN_STRING;
          attributeValue.clear();
          stringDelims[0] = '\'';
        } else
          throw ParserError("Expecting quotes.");
        break;

      case IN_STRING:
        attributeValue += getToken(stringDelims);
        if (skipChar(stringDelims[0])) {
          // Save the attributes in order, replacing those that are
          // specified more than once.
          Attribute attr(attributeName, attributeValue);
          Attributes::iterator i = std::lower_bound(m_attributes.begin(), m_attributes.end(), attr);
          if (i != m_attributes.end() && i->key == attr.key)
            throw ParserError("Attribute " + i->key + " defined more than once");
          m_attributes.insert(i, attr);
          state = IN_END_ATTRIBUTE_VALUE;
        } else if (skipChar(stringDelims[1]))
          state = IN_STRING_ENTITY;
        else
          throw ParserError("Unexpected end of input at " + attributeValue);
        break;

      case IN_END_ATTRIBUTE_VALUE:
        getToken(" />");
        if (nextChar() == ' ')
          state = IN_ELEMENT_WHITESPACE;
        else if (skipChar('/'))
          state = IN_END_ELEMENT;
        else if (skipChar('>')) {
          startElement(m_elementTags.back(), m_attributes);
          state = IN_END_TAG;
        }
        break;

      case IN_END_ELEMENT:
        tmp = getToken('>');
        if (!tmp.empty() && tmp != m_elementTags.back())
          throw ParserError("Non-matching closing element " + tmp + " for " + attributeValue);
        endElement(tmp);
        m_elementTags.pop_back();
        state = IN_END_TAG;
        break;

      case IN_END_TAG:
        if (nextChar() == EOF)
          return;
        else if (skipChar('<'))
          state = IN_BEGIN_TAG;
        else
          state = IN_DATA;
        break;

      case IN_DATA:
        currentData += getToken("<&");
        if (skipChar('&'))
          state = IN_DATA_ENTITY;
        else if (skipChar('<')) {
          data(currentData);
          currentData.clear();
          state = IN_BEGIN_TAG;
        } else if (nextChar() == EOF) {
          data(currentData);
          return;
        } else
          throw ParserError("Unexpected end of input in element " + m_elementTags.back() + currentData);
        break;

      case IN_DATA_ENTITY:
        currentData += parseEntity(getToken(';'));
        state = IN_DATA;
        break;

      case IN_STRING_ENTITY:
        attributeValue += parseEntity(getToken(';'));
        state = IN_STRING;
        break;

      case IN_DONE:
        return;
    }
  }
}

/** Releases the buffer pointed to by m_buffer.

    NOTE(review): fgettoken() in this file grows the buffer it is given
    with realloc(). If m_buffer is ever handed to it, releasing it with
    delete[] mixes allocation families -- confirm against the
    constructor and getToken() in the header.
  */
SimpleSAXParser::~SimpleSAXParser() {
  delete[] m_buffer;
}

/** Helper function which gets a token delimited by any of the characters
    in @a separators from the stream @a in and writes it, 0 terminated,
    in the buffer pointed to by @a *buffer.

    The token starts with the look-ahead character @a *firstChar, which
    the caller must have read from the stream beforehand, and continues
    with the characters read from @a in up to the first separator or the
    end of input. The separator is not put in the token: it is stored in
    @a *firstChar (EOF is stored there if the input ended).

    If the look-ahead character is itself EOF or a separator the token is
    empty, nothing is read from @a in and @a *firstChar is left untouched.

    A 0 byte always terminates a token, whatever @a separators contains.

    @a in the input stream to be parsed.

    @a buffer a pointer to the buffer where to put the token. The buffer
     must hold at least one byte and, since it is grown with realloc()
     when the token does not fit, it must come from malloc() / realloc().

    @a maxSize a pointer to the size of the buffer. When the buffer is
     grown (in steps of 1024 bytes) it is updated with the new size.

    @a separators 0 terminated list of the characters which end a token.
     It can be empty, in which case the token extends up to the end of
     the input.

    @a firstChar a pointer to the look-ahead character, see above.

    @return true if the token (possibly empty) was ended by a separator
            or by the look-ahead character; false if the end of input was
            reached while reading, or if the buffer could not be grown.
            In the latter case @a *buffer and @a *maxSize are left
            unchanged and the buffer holds the 0 terminated part of the
            token which fits, minus its last character.
  */
bool fgettoken(std::istream &in, char **buffer, size_t *maxSize, const char *separators, int *firstChar) {
  // If the passed first character is EOF or a separator, return an empty
  // token, otherwise use it as first character of the buffer.
  // Searching from separators[0], rather than special-casing it and then
  // searching from separators + 1, never reads past the terminator of an
  // empty list and treats all separators the same way.
  if (*firstChar == EOF || strchr(separators, *firstChar)) {
    (*buffer)[0] = 0;
    return true;
  }

  (*buffer)[0] = (char)*firstChar;
  size_t i = 1;

  while (true) {
    // Make sure index i is writable before reading further.
    if (i >= *maxSize) {
      const size_t grownSize = *maxSize + 1024;
      // Go through a temporary: on failure realloc() leaves the old block
      // allocated, and overwriting *buffer would both leak it and leave the
      // caller with a null buffer.
      char *grown = (char *)realloc(*buffer, grownSize);
      if (!grown) {
        // The buffer is full: sacrifice the last character to keep it
        // 0 terminated.
        (*buffer)[i - 1] = 0;
        return false;
      }
      *buffer = grown;
      *maxSize = grownSize;
    }

    int c = in.get();

    if (c == EOF) {
      (*buffer)[i] = 0;
      *firstChar = c;
      return false;
    }

    if (strchr(separators, c)) {
      (*buffer)[i] = 0;
      *firstChar = c;
      return true;
    }

    (*buffer)[i++] = (char)c;
  }
}