Utilities/src/json_reader.cpp

#include "EventFilter/Utilities/interface/reader.h"
#include "EventFilter/Utilities/interface/value.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <utility>
0009
0010 #if _MSC_VER >= 1400             // VC++ 8.0
0011 #pragma warning(disable : 4996)  // disable warning about strdup being deprecated.
0012 #endif
0013
0014 namespace jsoncollector {
0015   namespace Json {
0016
0017     // Implementation of class Features
0018     // ////////////////////////////////
0019
0020     Features::Features() : allowComments_(true), strictRoot_(false) {}
0021
0022     Features Features::all() { return Features(); }
0023
0024     Features Features::strictMode() {
0025       Features features;
0026       features.allowComments_ = false;
0027       features.strictRoot_ = true;
0028       return features;
0029     }
0030
0031     // Implementation of class Reader
0032     // ////////////////////////////////
0033
0034     static inline bool in(Reader::Char c, Reader::Char c1, Reader::Char c2, Reader::Char c3, Reader::Char c4) {
0035       return c == c1 || c == c2 || c == c3 || c == c4;
0036     }
0037
0038     static inline bool in(
0039         Reader::Char c, Reader::Char c1, Reader::Char c2, Reader::Char c3, Reader::Char c4, Reader::Char c5) {
0040       return c == c1 || c == c2 || c == c3 || c == c4 || c == c5;
0041     }
0042
0043     static bool containsNewLine(Reader::Location begin, Reader::Location end) {
0044       for (; begin < end; ++begin)
0045         if (*begin == '\n' || *begin == '\r')
0046           return true;
0047       return false;
0048     }
0049
0050     static std::string codePointToUTF8(unsigned int cp) {
0051       std::string result;
0052
0053       // based on description from http://en.wikipedia.org/wiki/UTF-8
0054
0055       if (cp <= 0x7f) {
0056         result.resize(1);
0057         result[0] = static_cast<char>(cp);
0058       } else if (cp <= 0x7FF) {
0059         result.resize(2);
0060         result[1] = static_cast<char>(0x80 | (0x3f & cp));
0061         result[0] = static_cast<char>(0xC0 | (0x1f & (cp >> 6)));
0062       } else if (cp <= 0xFFFF) {
0063         result.resize(3);
0064         result[2] = static_cast<char>(0x80 | (0x3f & cp));
0065         result[1] = 0x80 | static_cast<char>((0x3f & (cp >> 6)));
0066         result[0] = 0xE0 | static_cast<char>((0xf & (cp >> 12)));
0067       } else if (cp <= 0x10FFFF) {
0068         result.resize(4);
0069         result[3] = static_cast<char>(0x80 | (0x3f & cp));
0070         result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
0071         result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
0072         result[0] = static_cast<char>(0xF0 | (0x7 & (cp >> 18)));
0073       }
0074
0075       return result;
0076     }
0077
0078     // Class Reader
0079     // //////////////////////////////////////////////////////////////////
0080
0081     Reader::Reader() : features_(Features::all()) {}
0082
0083     Reader::Reader(const Features &features) : features_(features) {}
0084
0085     bool Reader::parse(const std::string &document, Value &root, bool collectComments) {
0086       document_ = document;
0087       const char *begin = document_.c_str();
0088       const char *end = begin + document_.length();
0089       return parse(begin, end, root, collectComments);
0090     }
0091
0092     bool Reader::parse(std::istream &sin, Value &root, bool collectComments) {
0093       //std::istream_iterator<char> begin(sin);
0094       //std::istream_iterator<char> end;
0095       // Those would allow streamed input from a file, if parse() were a
0096       // template function.
0097
0098       // Since std::string is reference-counted, this at least does not
0099       // create an extra copy.
0100       std::string doc;
0101       std::getline(sin, doc, (char)EOF);
0102       return parse(doc, root, collectComments);
0103     }
0104
0105     bool Reader::parse(const char *beginDoc, const char *endDoc, Value &root, bool collectComments) {
0106       if (!features_.allowComments_) {
0107         collectComments = false;
0108       }
0109
0110       begin_ = beginDoc;
0111       end_ = endDoc;
0112       collectComments_ = collectComments;
0113       current_ = begin_;
0114       lastValueEnd_ = nullptr;
0115       lastValue_ = nullptr;
0116       commentsBefore_ = "";
0117       errors_.clear();
0118       while (!nodes_.empty())
0119         nodes_.pop();
0120       nodes_.push(&root);
0121
0122       bool successful = readValue();
0123       Token token;
0124       skipCommentTokens(token);
0125       if (collectComments_ && !commentsBefore_.empty())
0126         root.setComment(commentsBefore_, commentAfter);
0127       if (features_.strictRoot_) {
0128         if (!root.isArray() && !root.isObject()) {
0129           // Set error location to start of doc, ideally should be first token found in doc
0130           token.type_ = tokenError;
0131           token.start_ = beginDoc;
0132           token.end_ = endDoc;
0133           addError("A valid JSON document must be either an array or an object value.", token);
0134           return false;
0135         }
0136       }
0137       return successful;
0138     }
0139
0140     bool Reader::readValue() {
0141       Token token;
0142       skipCommentTokens(token);
0143       bool successful = true;
0144
0145       if (collectComments_ && !commentsBefore_.empty()) {
0146         currentValue().setComment(commentsBefore_, commentBefore);
0147         commentsBefore_ = "";
0148       }
0149
0150       switch (token.type_) {
0151         case tokenObjectBegin:
0152           successful = readObject(token);
0153           break;
0154         case tokenArrayBegin:
0155           successful = readArray(token);
0156           break;
0157         case tokenNumber:
0158           successful = decodeNumber(token);
0159           break;
0160         case tokenString:
0161           successful = decodeString(token);
0162           break;
0163         case tokenTrue:
0164           currentValue() = true;
0165           break;
0166         case tokenFalse:
0167           currentValue() = false;
0168           break;
0169         case tokenNull:
0170           currentValue() = Value();
0171           break;
0172         default:
0173           return addError("Syntax error: value, object or array expected.", token);
0174       }
0175
0176       if (collectComments_) {
0177         lastValueEnd_ = current_;
0178         lastValue_ = &currentValue();
0179       }
0180
0181       return successful;
0182     }
0183
0184     void Reader::skipCommentTokens(Token &token) {
0185       if (features_.allowComments_) {
0186         do {
0187           readToken(token);
0188         } while (token.type_ == tokenComment);
0189       } else {
0190         readToken(token);
0191       }
0192     }
0193
0194     bool Reader::expectToken(TokenType type, Token &token, const char *message) {
0195       readToken(token);
0196       if (token.type_ != type)
0197         return addError(message, token);
0198       return true;
0199     }
0200
0201     bool Reader::readToken(Token &token) {
0202       skipSpaces();
0203       token.start_ = current_;
0204       Char c = getNextChar();
0205       bool ok = true;
0206       switch (c) {
0207         case '{':
0208           token.type_ = tokenObjectBegin;
0209           break;
0210         case '}':
0211           token.type_ = tokenObjectEnd;
0212           break;
0213         case '[':
0214           token.type_ = tokenArrayBegin;
0215           break;
0216         case ']':
0217           token.type_ = tokenArrayEnd;
0218           break;
0219         case '"':
0220           token.type_ = tokenString;
0221           ok = readString();
0222           break;
0223         case '/':
0224           token.type_ = tokenComment;
0225           ok = readComment();
0226           break;
0227         case '0':
0228         case '1':
0229         case '2':
0230         case '3':
0231         case '4':
0232         case '5':
0233         case '6':
0234         case '7':
0235         case '8':
0236         case '9':
0237         case '-':
0238           token.type_ = tokenNumber;
0239           readNumber();
0240           break;
0241         case 't':
0242           token.type_ = tokenTrue;
0243           ok = match("rue", 3);
0244           break;
0245         case 'f':
0246           token.type_ = tokenFalse;
0247           ok = match("alse", 4);
0248           break;
0249         case 'n':
0250           token.type_ = tokenNull;
0251           ok = match("ull", 3);
0252           break;
0253         case ',':
0254           token.type_ = tokenArraySeparator;
0255           break;
0256         case ':':
0257           token.type_ = tokenMemberSeparator;
0258           break;
0259         case 0:
0260           token.type_ = tokenEndOfStream;
0261           break;
0262         default:
0263           ok = false;
0264           break;
0265       }
0266       if (!ok)
0267         token.type_ = tokenError;
0268       token.end_ = current_;
0269       return true;
0270     }
0271
0272     void Reader::skipSpaces() {
0273       while (current_ != end_) {
0274         Char c = *current_;
0275         if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
0276           ++current_;
0277         else
0278           break;
0279       }
0280     }
0281
0282     bool Reader::match(Location pattern, int patternLength) {
0283       if (end_ - current_ < patternLength)
0284         return false;
0285       int index = patternLength;
0286       while (index--)
0287         if (current_[index] != pattern[index])
0288           return false;
0289       current_ += patternLength;
0290       return true;
0291     }
0292
0293     bool Reader::readComment() {
0294       Location commentBegin = current_ - 1;
0295       Char c = getNextChar();
0296       bool successful = false;
0297       if (c == '*')
0298         successful = readCStyleComment();
0299       else if (c == '/')
0300         successful = readCppStyleComment();
0301       if (!successful)
0302         return false;
0303
0304       if (collectComments_) {
0305         CommentPlacement placement = commentBefore;
0306         if (lastValueEnd_ && !containsNewLine(lastValueEnd_, commentBegin)) {
0307           if (c != '*' || !containsNewLine(commentBegin, current_))
0308             placement = commentAfterOnSameLine;
0309         }
0310
0311         addComment(commentBegin, current_, placement);
0312       }
0313       return true;
0314     }
0315
0316     void Reader::addComment(Location begin, Location end, CommentPlacement placement) {
0317       assert(collectComments_);
0318       if (placement == commentAfterOnSameLine) {
0319         assert(lastValue_ != nullptr);
0320         lastValue_->setComment(std::string(begin, end), placement);
0321       } else {
0322         if (!commentsBefore_.empty())
0323           commentsBefore_ += "\n";
0324         commentsBefore_ += std::string(begin, end);
0325       }
0326     }
0327
0328     bool Reader::readCStyleComment() {
0329       while (current_ != end_) {
0330         Char c = getNextChar();
0331         if (c == '*' && *current_ == '/')
0332           break;
0333       }
0334       return getNextChar() == '/';
0335     }
0336
0337     bool Reader::readCppStyleComment() {
0338       while (current_ != end_) {
0339         Char c = getNextChar();
0340         if (c == '\r' || c == '\n')
0341           break;
0342       }
0343       return true;
0344     }
0345
0346     void Reader::readNumber() {
0347       while (current_ != end_) {
0348         if (!(*current_ >= '0' && *current_ <= '9') && !in(*current_, '.', 'e', 'E', '+', '-'))
0349           break;
0350         ++current_;
0351       }
0352     }
0353
0354     bool Reader::readString() {
0355       Char c = 0;
0356       while (current_ != end_) {
0357         c = getNextChar();
0358         if (c == '\\')
0359           getNextChar();
0360         else if (c == '"')
0361           break;
0362       }
0363       return c == '"';
0364     }
0365
0366     bool Reader::readObject(Token &tokenStart) {
0367       Token tokenName;
0368       std::string name;
0369       currentValue() = Value(objectValue);
0370       while (readToken(tokenName)) {
0371         bool initialTokenOk = true;
0372         while (tokenName.type_ == tokenComment && initialTokenOk)
0373           initialTokenOk = readToken(tokenName);
0374         if (!initialTokenOk)
0375           break;
0376         if (tokenName.type_ == tokenObjectEnd && name.empty())  // empty object
0377           return true;
0378         if (tokenName.type_ != tokenString)
0379           break;
0380
0381         name = "";
0382         if (!decodeString(tokenName, name))
0383           return recoverFromError(tokenObjectEnd);
0384
0385         Token colon;
0386         if (!readToken(colon) || colon.type_ != tokenMemberSeparator) {
0387           return addErrorAndRecover("Missing ':' after object member name", colon, tokenObjectEnd);
0388         }
0389         Value &value = currentValue()[name];
0390         nodes_.push(&value);
0391         bool ok = readValue();
0392         nodes_.pop();
0393         if (!ok)  // error already set
0394           return recoverFromError(tokenObjectEnd);
0395
0396         Token comma;
0397         if (!readToken(comma) ||
0398             (comma.type_ != tokenObjectEnd && comma.type_ != tokenArraySeparator && comma.type_ != tokenComment)) {
0399           return addErrorAndRecover("Missing ',' or '}' in object declaration", comma, tokenObjectEnd);
0400         }
0401         bool finalizeTokenOk = true;
0402         while (comma.type_ == tokenComment && finalizeTokenOk)
0403           finalizeTokenOk = readToken(comma);
0404         if (comma.type_ == tokenObjectEnd)
0405           return true;
0406       }
0407       return addErrorAndRecover("Missing '}' or object member name", tokenName, tokenObjectEnd);
0408     }
0409
0410     bool Reader::readArray(Token &tokenStart) {
0411       currentValue() = Value(arrayValue);
0412       skipSpaces();
0413       if (*current_ == ']')  // empty array
0414       {
0415         Token endArray;
0416         readToken(endArray);
0417         return true;
0418       }
0419       int index = 0;
0420       while (true) {
0421         Value &value = currentValue()[index++];
0422         nodes_.push(&value);
0423         bool ok = readValue();
0424         nodes_.pop();
0425         if (!ok)  // error already set
0426           return recoverFromError(tokenArrayEnd);
0427
0428         Token token;
0429         // Accept Comment after last item in the array.
0430         ok = readToken(token);
0431         while (token.type_ == tokenComment && ok) {
0432           ok = readToken(token);
0433         }
0434         bool badTokenType = (token.type_ != tokenArraySeparator && token.type_ != tokenArrayEnd);
0435         if (!ok || badTokenType) {
0436           return addErrorAndRecover("Missing ',' or ']' in array declaration", token, tokenArrayEnd);
0437         }
0438         if (token.type_ == tokenArrayEnd)
0439           break;
0440       }
0441       return true;
0442     }
0443
0444     bool Reader::decodeNumber(Token &token) {
0445       bool isDouble = false;
0446       for (Location inspect = token.start_; inspect != token.end_; ++inspect) {
0447         isDouble = isDouble || in(*inspect, '.', 'e', 'E', '+') || (*inspect == '-' && inspect != token.start_);
0448       }
0449       if (isDouble)
0450         return decodeDouble(token);
0451       Location current = token.start_;
0452       bool isNegative = *current == '-';
0453       if (isNegative)
0454         ++current;
0455       Value::UInt threshold = (isNegative ? Value::UInt(-Value::minInt) : Value::maxUInt) / 10;
0456       Value::UInt value = 0;
0457       while (current < token.end_) {
0458         Char c = *current++;
0459         if (c < '0' || c > '9')
0460           return addError("'" + std::string(token.start_, token.end_) + "' is not a number.", token);
0461         if (value >= threshold)
0462           return decodeDouble(token);
0463         value = value * 10 + Value::UInt(c - '0');
0464       }
0465       if (isNegative)
0466         currentValue() = -Value::Int(value);
0467       else if (value <= Value::UInt(Value::maxInt))
0468         currentValue() = Value::Int(value);
0469       else
0470         currentValue() = value;
0471       return true;
0472     }
0473
0474     bool Reader::decodeDouble(Token &token) {
0475       double value = 0;
0476       const int bufferSize = 32;
0477       int count;
0478       int length = int(token.end_ - token.start_);
0479       if (length <= bufferSize) {
0480         Char buffer[bufferSize];
0481         memcpy(buffer, token.start_, length);
0482         buffer[length] = 0;
0483         count = sscanf(buffer, "%lf", &value);
0484       } else {
0485         std::string buffer(token.start_, token.end_);
0486         count = sscanf(buffer.c_str(), "%lf", &value);
0487       }
0488
0489       if (count != 1)
0490         return addError("'" + std::string(token.start_, token.end_) + "' is not a number.", token);
0491       currentValue() = value;
0492       return true;
0493     }
0494
0495     bool Reader::decodeString(Token &token) {
0496       std::string decoded;
0497       if (!decodeString(token, decoded))
0498         return false;
0499       currentValue() = decoded;
0500       return true;
0501     }
0502
0503     bool Reader::decodeString(Token &token, std::string &decoded) {
0504       decoded.reserve(token.end_ - token.start_ - 2);
0505       Location current = token.start_ + 1;  // skip '"'
0506       Location end = token.end_ - 1;        // do not include '"'
0507       while (current != end) {
0508         Char c = *current++;
0509         if (c == '"')
0510           break;
0511         else if (c == '\\') {
0512           if (current == end)
0513             return addError("Empty escape sequence in string", token, current);
0514           Char escape = *current++;
0515           switch (escape) {
0516             case '"':
0517               decoded += '"';
0518               break;
0519             case '/':
0520               decoded += '/';
0521               break;
0522             case '\\':
0523               decoded += '\\';
0524               break;
0525             case 'b':
0526               decoded += '\b';
0527               break;
0528             case 'f':
0529               decoded += '\f';
0530               break;
0531             case 'n':
0532               decoded += '\n';
0533               break;
0534             case 'r':
0535               decoded += '\r';
0536               break;
0537             case 't':
0538               decoded += '\t';
0539               break;
0540             case 'u': {
0541               unsigned int unicode;
0542               if (!decodeUnicodeCodePoint(token, current, end, unicode))
0543                 return false;
0544               decoded += codePointToUTF8(unicode);
0545             } break;
0546             default:
0547               return addError("Bad escape sequence in string", token, current);
0548           }
0549         } else {
0550           decoded += c;
0551         }
0552       }
0553       return true;
0554     }
0555
0556     bool Reader::decodeUnicodeCodePoint(Token &token, Location &current, Location end, unsigned int &unicode) {
0557       if (!decodeUnicodeEscapeSequence(token, current, end, unicode))
0558         return false;
0559       if (unicode >= 0xD800 && unicode <= 0xDBFF) {
0560         // surrogate pairs
0561         if (end - current < 6)
0562           return addError("additional six characters expected to parse unicode surrogate pair.", token, current);
0563         unsigned int surrogatePair;
0564         if (*(current++) == '\\' && *(current++) == 'u') {
0565           if (decodeUnicodeEscapeSequence(token, current, end, surrogatePair)) {
0566             unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
0567           } else
0568             return false;
0569         } else
0570           return addError(
0571               "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current);
0572       }
0573       return true;
0574     }
0575
0576     bool Reader::decodeUnicodeEscapeSequence(Token &token, Location &current, Location end, unsigned int &unicode) {
0577       if (end - current < 4)
0578         return addError("Bad unicode escape sequence in string: four digits expected.", token, current);
0579       unicode = 0;
0580       for (int index = 0; index < 4; ++index) {
0581         Char c = *current++;
0582         unicode *= 16;
0583         if (c >= '0' && c <= '9')
0584           unicode += c - '0';
0585         else if (c >= 'a' && c <= 'f')
0586           unicode += c - 'a' + 10;
0587         else if (c >= 'A' && c <= 'F')
0588           unicode += c - 'A' + 10;
0589         else
0590           return addError("Bad unicode escape sequence in string: hexadecimal digit expected.", token, current);
0591       }
0592       return true;
0593     }
0594
0595     bool Reader::addError(const std::string &message, Token &token, Location extra) {
0596       ErrorInfo info;
0597       info.token_ = token;
0598       info.message_ = message;
0599       info.extra_ = extra;
0600       errors_.push_back(info);
0601       return false;
0602     }
0603
0604     bool Reader::recoverFromError(TokenType skipUntilToken) {
0605       int errorCount = int(errors_.size());
0606       Token skip;
0607       while (true) {
0608         if (!readToken(skip))
0609           errors_.resize(errorCount);  // discard errors caused by recovery
0610         if (skip.type_ == skipUntilToken || skip.type_ == tokenEndOfStream)
0611           break;
0612       }
0613       errors_.resize(errorCount);
0614       return false;
0615     }
0616
0617     bool Reader::addErrorAndRecover(const std::string &message, Token &token, TokenType skipUntilToken) {
0618       addError(message, token);
0619       return recoverFromError(skipUntilToken);
0620     }
0621
0622     Value &Reader::currentValue() { return *(nodes_.top()); }
0623
0624     Reader::Char Reader::getNextChar() {
0625       if (current_ == end_)
0626         return 0;
0627       return *current_++;
0628     }
0629
0630     void Reader::getLocationLineAndColumn(Location location, int &line, int &column) const {
0631       Location current = begin_;
0632       Location lastLineStart = current;
0633       line = 0;
0634       while (current < location && current != end_) {
0635         Char c = *current++;
0636         if (c == '\r') {
0637           if (*current == '\n')
0638             ++current;
0639           lastLineStart = current;
0640           ++line;
0641         } else if (c == '\n') {
0642           lastLineStart = current;
0643           ++line;
0644         }
0645       }
0646       // column & line start at 1
0647       column = int(location - lastLineStart) + 1;
0648       ++line;
0649     }
0650
0651     std::string Reader::getLocationLineAndColumn(Location location) const {
0652       int line, column;
0653       getLocationLineAndColumn(location, line, column);
0654       char buffer[18 + 16 + 16 + 1];
0655       sprintf(buffer, "Line %d, Column %d", line, column);
0656       return buffer;
0657     }
0658
0659     std::string Reader::getFormatedErrorMessages() const {
0660       std::string formattedMessage;
0661       for (Errors::const_iterator itError = errors_.begin(); itError != errors_.end(); ++itError) {
0662         const ErrorInfo &error = *itError;
0663         formattedMessage += "* " + getLocationLineAndColumn(error.token_.start_) + "\n";
0664         formattedMessage += "  " + error.message_ + "\n";
0665         if (error.extra_)
0666           formattedMessage += "See " + getLocationLineAndColumn(error.extra_) + " for detail.\n";
0667       }
0668       return formattedMessage;
0669     }
0670
0671     std::istream &operator>>(std::istream &sin, Value &root) {
0672       Json::Reader reader;
0673       bool ok = reader.parse(sin, root, true);
0674       //JSON_ASSERT( ok );
0675       if (!ok)
0676         throw std::runtime_error(reader.getFormatedErrorMessages());
0677       return sin;
0678     }
0679
0680   }  // namespace Json
0681 }  // namespace jsoncollector