Core/src/SimpleSAXParser.cc

0001 #include "Fireworks/Core/interface/SimpleSAXParser.h"
0002
0003 /** Helper function to handle entities, i.e. characters specified with
0004     the "&label;" syntax.
0005   */
0006 std::string SimpleSAXParser::parseEntity(const std::string &entity) {
0007   if (entity == "quot")
0008     return "\"";
0009   else if (entity == "amp")
0010     return "&";
0011   else if (entity == "lt")
0012     return "<";
0013   else if (entity == "gt")
0014     return ">";
0015   throw ParserError("Unknown entity " + entity);
0016 }
0017
0018 void debug_state_machine(enum SimpleSAXParser::PARSER_STATES state) {
0019 #ifdef SIMPLE_SAX_PARSER_DEBUG
0020   static char *debug_states[] = {"IN_DOCUMENT",
0021                                  "IN_BEGIN_TAG",
0022                                  "IN_DONE",
0023                                  "IN_BEGIN_ELEMENT",
0024                                  "IN_ELEMENT_WHITESPACE",
0025                                  "IN_END_ELEMENT",
0026                                  "IN_ATTRIBUTE_KEY",
0027                                  "IN_END_TAG",
0028                                  "IN_DATA",
0029                                  "IN_BEGIN_ATTRIBUTE_VALUE",
0030                                  "IN_STRING",
0031                                  "IN_END_ATTRIBUTE_VALUE",
0032                                  "IN_STRING_ENTITY",
0033                                  "IN_DATA_ENTITY"};
0034
0035   std::cerr << debug_states[state] << std::endl;
0036 #endif
0037 }
0038
0039 /** Runs the state machine of the parser, invoking startElement(),
0040     setAttribute(), endElement(), data() virtual methods as approppriate.
0041     In order have the parser doing something usefull you need to derive from
0042     it and specialize the above mentioned virtual methods.
0043
0044     Default implementation is in any case useful to check syntax.
0045   */
0046 void SimpleSAXParser::parse(void) {
0047   enum PARSER_STATES state = IN_DOCUMENT;
0048   // Current delimiters for strings in attributes.
0049   char stringDelims[] = "\"&";
0050   std::string attributeName;
0051   std::string attributeValue;
0052   std::string tmp;
0053   std::string currentData;
0054
0055   while (state != IN_DONE) {
0056     debug_state_machine(state);
0057
0058     switch (state) {
0059       // FIXME: IN_DOCUMENT should check the dtd...
0060       case IN_DOCUMENT:
0061         state = IN_DATA;
0062         if (skipChar('<'))
0063           state = IN_BEGIN_TAG;
0064         break;
0065
0066       case IN_BEGIN_TAG:
0067         if (nextChar() >= 'A' && nextChar() <= 'z')
0068           state = IN_BEGIN_ELEMENT;
0069         else if (skipChar('/'))
0070           state = IN_END_ELEMENT;
0071         else
0072           throw ParserError("Bad tag");
0073         break;
0074
0075       case IN_BEGIN_ELEMENT:
0076         m_attributes.clear();
0077         m_elementTags.push_back(getToken(" />"));
0078         if (nextChar() == ' ')
0079           state = IN_ELEMENT_WHITESPACE;
0080         else if (skipChar('/'))
0081           state = IN_END_ELEMENT;
0082         else if (skipChar('>')) {
0083           startElement(m_elementTags.back(), m_attributes);
0084           state = IN_END_TAG;
0085         } else
0086           throw ParserError("Bad element.");
0087         break;
0088
0089       case IN_ELEMENT_WHITESPACE:
0090         while (skipChar(' ') || skipChar('\n') || skipChar('\t')) {
0091         }
0092
0093         if (nextChar() >= 'A' && nextChar() <= 'z')
0094           state = IN_ATTRIBUTE_KEY;
0095         else if (nextChar() == '/')
0096           state = IN_END_ELEMENT;
0097         else
0098           throw ParserError("Syntax error in element" + m_elementTags.back());
0099         break;
0100
0101       case IN_ATTRIBUTE_KEY:
0102         attributeName = getToken('=');
0103         state = IN_BEGIN_ATTRIBUTE_VALUE;
0104         break;
0105
0106       case IN_BEGIN_ATTRIBUTE_VALUE:
0107         if (skipChar('"')) {
0108           state = IN_STRING;
0109           attributeValue.clear();
0110           stringDelims[0] = '\"';
0111         } else if (skipChar('\'')) {
0112           state = IN_STRING;
0113           attributeValue.clear();
0114           stringDelims[0] = '\'';
0115         } else
0116           throw ParserError("Expecting quotes.");
0117         break;
0118
0119       case IN_STRING:
0120         attributeValue += getToken(stringDelims);
0121         if (skipChar(stringDelims[0])) {
0122           // Save the attributes in order, replacing those that are
0123           // specified more than once.
0124           Attribute attr(attributeName, attributeValue);
0125           Attributes::iterator i = std::lower_bound(m_attributes.begin(), m_attributes.end(), attr);
0126           if (i != m_attributes.end() && i->key == attr.key)
0127             throw ParserError("Attribute " + i->key + " defined more than once");
0128           m_attributes.insert(i, attr);
0129           state = IN_END_ATTRIBUTE_VALUE;
0130         } else if (skipChar(stringDelims[1]))
0131           state = IN_STRING_ENTITY;
0132         else
0133           throw ParserError("Unexpected end of input at " + attributeValue);
0134         break;
0135
0136       case IN_END_ATTRIBUTE_VALUE:
0137         getToken(" />");
0138         if (nextChar() == ' ')
0139           state = IN_ELEMENT_WHITESPACE;
0140         else if (skipChar('/'))
0141           state = IN_END_ELEMENT;
0142         else if (skipChar('>')) {
0143           startElement(m_elementTags.back(), m_attributes);
0144           state = IN_END_TAG;
0145         }
0146         break;
0147
0148       case IN_END_ELEMENT:
0149         tmp = getToken('>');
0150         if (!tmp.empty() && tmp != m_elementTags.back())
0151           throw ParserError("Non-matching closing element " + tmp + " for " + attributeValue);
0152         endElement(tmp);
0153         m_elementTags.pop_back();
0154         state = IN_END_TAG;
0155         break;
0156
0157       case IN_END_TAG:
0158         if (nextChar() == EOF)
0159           return;
0160         else if (skipChar('<'))
0161           state = IN_BEGIN_TAG;
0162         else
0163           state = IN_DATA;
0164         break;
0165
0166       case IN_DATA:
0167         currentData += getToken("<&");
0168         if (skipChar('&'))
0169           state = IN_DATA_ENTITY;
0170         else if (skipChar('<')) {
0171           data(currentData);
0172           currentData.clear();
0173           state = IN_BEGIN_TAG;
0174         } else if (nextChar() == EOF) {
0175           data(currentData);
0176           return;
0177         } else
0178           throw ParserError("Unexpected end of input in element " + m_elementTags.back() + currentData);
0179         break;
0180
0181       case IN_DATA_ENTITY:
0182         currentData += parseEntity(getToken(';'));
0183         state = IN_DATA;
0184         break;
0185
0186       case IN_STRING_ENTITY:
0187         attributeValue += parseEntity(getToken(';'));
0188         state = IN_STRING;
0189         break;
0190
0191       case IN_DONE:
0192         return;
0193     }
0194   }
0195 }
0196
0197 SimpleSAXParser::~SimpleSAXParser() { delete[] m_buffer; }
0198
/** Helper function which reads a token delimited by any of the characters
    in @a separators from @a in and writes it, 0 terminated, in the buffer
    pointed to by @a buffer.

    The trailing separator is not put in the token: it is consumed from the
    stream and stored in @a firstChar, so that it can be used as the pending
    character of the next call.

    @a in the input stream to be parsed.

    @a buffer a pointer to the buffer where to put the token. The buffer is
     grown with realloc() when the token does not fit, therefore it has to
     be compatible with realloc() / free(). It must have room for at least
     one character.

    @a maxSize a pointer to the size of the buffer. When the buffer is
     reallocated to get more space, maxSize is updated with the new size.

    @a separators a 0 terminated list of delimiters. The first character is
     compared on its own, the remaining ones are looked up with strchr().

    @a firstChar on input, the character which was already read from the
                 stream (it must be obtained separately!) and which becomes
                 the first character of the token. On output, the separator
                 (or EOF) which terminated the token. It is left untouched
                 when the function returns early with an empty token.

    @return true when a (possibly empty) token terminated by a separator was
            read; false when the stream ended before a separator was found
            (the buffer still holds what was read) or when the buffer could
            not be grown (the buffer keeps its old size and holds the
            truncated, 0 terminated token).
  */
bool fgettoken(std::istream &in, char **buffer, size_t *maxSize, const char *separators, int *firstChar) {
  // If the pending character is EOF or a separator the token is empty,
  // otherwise the pending character is the first one of the token.
  if (*firstChar == EOF || static_cast<int>(separators[0]) == *firstChar || strchr(separators + 1, *firstChar)) {
    (*buffer)[0] = 0;
    return true;
  }
  (*buffer)[0] = static_cast<char>(*firstChar);

  size_t i = 1;

  while (true) {
    // Make sure position i is available, either for the next character or
    // for the terminator.
    if (i >= *maxSize) {
      const size_t grownSize = *maxSize + 1024;
      // Go through a temporary: assigning the result of realloc() directly
      // would leak the old block and leave the caller with a null buffer
      // when the allocation fails.
      char *grown = static_cast<char *>(realloc(*buffer, grownSize));
      if (!grown) {
        if (*maxSize > 0)
          (*buffer)[*maxSize - 1] = 0;
        return false;
      }
      *buffer = grown;
      *maxSize = grownSize;
    }

    const int c = in.get();

    if (c == EOF) {
      (*buffer)[i] = 0;
      *firstChar = c;
      return false;
    }

    if (separators[0] == c || strchr(separators + 1, c)) {
      (*buffer)[i] = 0;
      *firstChar = c;
      return true;
    }

    (*buffer)[i++] = static_cast<char>(c);
  }
}