Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:11:44

0001 #include "Fireworks/Core/interface/SimpleSAXParser.h"
0002 
0003 /** Helper function to handle entities, i.e. characters specified with
0004     the "&label;" syntax.
0005   */
0006 std::string SimpleSAXParser::parseEntity(const std::string &entity) {
0007   if (entity == "quot")
0008     return "\"";
0009   else if (entity == "amp")
0010     return "&";
0011   else if (entity == "lt")
0012     return "<";
0013   else if (entity == "gt")
0014     return ">";
0015   throw ParserError("Unknown entity " + entity);
0016 }
0017 
0018 void debug_state_machine(enum SimpleSAXParser::PARSER_STATES state) {
0019 #ifdef SIMPLE_SAX_PARSER_DEBUG
0020   static char *debug_states[] = {"IN_DOCUMENT",
0021                                  "IN_BEGIN_TAG",
0022                                  "IN_DONE",
0023                                  "IN_BEGIN_ELEMENT",
0024                                  "IN_ELEMENT_WHITESPACE",
0025                                  "IN_END_ELEMENT",
0026                                  "IN_ATTRIBUTE_KEY",
0027                                  "IN_END_TAG",
0028                                  "IN_DATA",
0029                                  "IN_BEGIN_ATTRIBUTE_VALUE",
0030                                  "IN_STRING",
0031                                  "IN_END_ATTRIBUTE_VALUE",
0032                                  "IN_STRING_ENTITY",
0033                                  "IN_DATA_ENTITY"};
0034 
0035   std::cerr << debug_states[state] << std::endl;
0036 #endif
0037 }
0038 
0039 /** Runs the state machine of the parser, invoking startElement(),
0040     setAttribute(), endElement(), data() virtual methods as approppriate. 
0041     In order have the parser doing something usefull you need to derive from
0042     it and specialize the above mentioned virtual methods.
0043     
0044     Default implementation is in any case useful to check syntax.
0045   */
0046 void SimpleSAXParser::parse(void) {
0047   enum PARSER_STATES state = IN_DOCUMENT;
0048   // Current delimiters for strings in attributes.
0049   char stringDelims[] = "\"&";
0050   std::string attributeName;
0051   std::string attributeValue;
0052   std::string tmp;
0053   std::string currentData;
0054 
0055   while (state != IN_DONE) {
0056     debug_state_machine(state);
0057 
0058     switch (state) {
0059       // FIXME: IN_DOCUMENT should check the dtd...
0060       case IN_DOCUMENT:
0061         state = IN_DATA;
0062         if (skipChar('<'))
0063           state = IN_BEGIN_TAG;
0064         break;
0065 
0066       case IN_BEGIN_TAG:
0067         if (nextChar() >= 'A' && nextChar() <= 'z')
0068           state = IN_BEGIN_ELEMENT;
0069         else if (skipChar('/'))
0070           state = IN_END_ELEMENT;
0071         else
0072           throw ParserError("Bad tag");
0073         break;
0074 
0075       case IN_BEGIN_ELEMENT:
0076         m_attributes.clear();
0077         m_elementTags.push_back(getToken(" />"));
0078         if (nextChar() == ' ')
0079           state = IN_ELEMENT_WHITESPACE;
0080         else if (skipChar('/'))
0081           state = IN_END_ELEMENT;
0082         else if (skipChar('>')) {
0083           startElement(m_elementTags.back(), m_attributes);
0084           state = IN_END_TAG;
0085         } else
0086           throw ParserError("Bad element.");
0087         break;
0088 
0089       case IN_ELEMENT_WHITESPACE:
0090         while (skipChar(' ') || skipChar('\n') || skipChar('\t')) {
0091         }
0092 
0093         if (nextChar() >= 'A' && nextChar() <= 'z')
0094           state = IN_ATTRIBUTE_KEY;
0095         else if (nextChar() == '/')
0096           state = IN_END_ELEMENT;
0097         else
0098           throw ParserError("Syntax error in element" + m_elementTags.back());
0099         break;
0100 
0101       case IN_ATTRIBUTE_KEY:
0102         attributeName = getToken('=');
0103         state = IN_BEGIN_ATTRIBUTE_VALUE;
0104         break;
0105 
0106       case IN_BEGIN_ATTRIBUTE_VALUE:
0107         if (skipChar('"')) {
0108           state = IN_STRING;
0109           attributeValue.clear();
0110           stringDelims[0] = '\"';
0111         } else if (skipChar('\'')) {
0112           state = IN_STRING;
0113           attributeValue.clear();
0114           stringDelims[0] = '\'';
0115         } else
0116           throw ParserError("Expecting quotes.");
0117         break;
0118 
0119       case IN_STRING:
0120         attributeValue += getToken(stringDelims);
0121         if (skipChar(stringDelims[0])) {
0122           // Save the attributes in order, replacing those that are
0123           // specified more than once.
0124           Attribute attr(attributeName, attributeValue);
0125           Attributes::iterator i = std::lower_bound(m_attributes.begin(), m_attributes.end(), attr);
0126           if (i != m_attributes.end() && i->key == attr.key)
0127             throw ParserError("Attribute " + i->key + " defined more than once");
0128           m_attributes.insert(i, attr);
0129           state = IN_END_ATTRIBUTE_VALUE;
0130         } else if (skipChar(stringDelims[1]))
0131           state = IN_STRING_ENTITY;
0132         else
0133           throw ParserError("Unexpected end of input at " + attributeValue);
0134         break;
0135 
0136       case IN_END_ATTRIBUTE_VALUE:
0137         getToken(" />");
0138         if (nextChar() == ' ')
0139           state = IN_ELEMENT_WHITESPACE;
0140         else if (skipChar('/'))
0141           state = IN_END_ELEMENT;
0142         else if (skipChar('>')) {
0143           startElement(m_elementTags.back(), m_attributes);
0144           state = IN_END_TAG;
0145         }
0146         break;
0147 
0148       case IN_END_ELEMENT:
0149         tmp = getToken('>');
0150         if (!tmp.empty() && tmp != m_elementTags.back())
0151           throw ParserError("Non-matching closing element " + tmp + " for " + attributeValue);
0152         endElement(tmp);
0153         m_elementTags.pop_back();
0154         state = IN_END_TAG;
0155         break;
0156 
0157       case IN_END_TAG:
0158         if (nextChar() == EOF)
0159           return;
0160         else if (skipChar('<'))
0161           state = IN_BEGIN_TAG;
0162         else
0163           state = IN_DATA;
0164         break;
0165 
0166       case IN_DATA:
0167         currentData += getToken("<&");
0168         if (skipChar('&'))
0169           state = IN_DATA_ENTITY;
0170         else if (skipChar('<')) {
0171           data(currentData);
0172           currentData.clear();
0173           state = IN_BEGIN_TAG;
0174         } else if (nextChar() == EOF) {
0175           data(currentData);
0176           return;
0177         } else
0178           throw ParserError("Unexpected end of input in element " + m_elementTags.back() + currentData);
0179         break;
0180 
0181       case IN_DATA_ENTITY:
0182         currentData += parseEntity(getToken(';'));
0183         state = IN_DATA;
0184         break;
0185 
0186       case IN_STRING_ENTITY:
0187         attributeValue += parseEntity(getToken(';'));
0188         state = IN_STRING;
0189         break;
0190 
0191       case IN_DONE:
0192         return;
0193     }
0194   }
0195 }
0196 
0197 SimpleSAXParser::~SimpleSAXParser() { delete[] m_buffer; }
0198 
/** Enlarges @a *buffer by 1024 bytes using realloc() and updates
    @a *maxSize accordingly. In case of failure false is returned and both
    @a *buffer and @a *maxSize are left untouched, so that the old buffer is
    neither lost nor leaked.
  */
static bool growTokenBuffer(char **buffer, size_t *maxSize) {
  const size_t grownSize = *maxSize + 1024;
  char *grown = static_cast<char *>(realloc(*buffer, grownSize));
  if (!grown)
    return false;
  *buffer = grown;
  *maxSize = grownSize;
  return true;
}

/** Helper function which gets a token delimited by any of the characters in
    @a separators from the stream @a in and writes it, 0 terminated, in the
    buffer found in @a *buffer.

    The caller keeps one character of look-ahead in @a *firstChar. If on
    entry it is EOF or a separator, the token is empty and nothing is read
    from @a in. Otherwise it is used as the first character of the token and
    the following ones are read from @a in, up to the first separator or EOF.

    The trailing separator after a token is not put in the token: it is
    removed from the stream and stored in @a *firstChar (EOF is stored there
    if the input ends first). A 0 character always ends the token.

    Notice that if the token is larger than @a *maxSize, the buffer is
    reallocated and @a *maxSize is updated to the new size.

    @a in the input stream to be parsed.

    @a buffer a pointer to the buffer where to put the token. The buffer is
     enlarged with realloc(), in steps of 1024 bytes, if the token does not
     fit. For this reason it must either be null or come from malloc() /
     realloc(), and it must be released with free().

    @a maxSize a pointer to the size of the buffer. Notice that in case the
     buffer is reallocated to have more space, maxSize is updated with the
     new size.

    @a separators a 0 terminated list of the characters which end a token.

    @a firstChar a pointer to the look-ahead character described above.
                 Notice that the first character of the stream must be
                 obtained separately!!!

    @return true if the token is empty or was ended by a separator, false if
            the input ended while reading the token or if the buffer could
            not be enlarged. When the input ends the characters read so far
            are still available, 0 terminated, in the buffer. When memory is
            exhausted the buffer is kept and holds the 0 terminated part of
            the token which fits in it.
  */
bool fgettoken(std::istream &in, char **buffer, size_t *maxSize, const char *separators, int *firstChar) {
  // There must be room for at least one character before the first write.
  if ((!*buffer || *maxSize == 0) && !growTokenBuffer(buffer, maxSize))
    return false;

  // if the passed first character is EOF or a separator,
  // return an empty token otherwise use it as first character
  // of the buffer.
  if (*firstChar == EOF || strchr(separators, *firstChar)) {
    (*buffer)[0] = 0;
    return true;
  }
  (*buffer)[0] = static_cast<char>(*firstChar);

  size_t i = 1;

  while (true) {
    // Position i is written below, either with a character or with the
    // terminator: make sure it exists.
    if (i >= *maxSize && !growTokenBuffer(buffer, maxSize)) {
      // The buffer is full: give up the last character to terminate it.
      (*buffer)[i - 1] = 0;
      return false;
    }

    const int c = in.get();

    if (c == EOF) {
      (*buffer)[i] = 0;
      *firstChar = c;
      return false;
    }

    // Searching from the beginning of separators, rather than from its
    // second character, never reads past the end of an empty list.
    if (strchr(separators, c)) {
      (*buffer)[i] = 0;
      *firstChar = c;
      return true;
    }

    (*buffer)[i++] = static_cast<char>(c);
  }
}