annotate osx/include/kj/parse/char.h @ 162:d43aab368df9

Duplicate for patch testing
author Chris Cannam <cannam@all-day-breakfast.com>
date Wed, 30 Oct 2019 11:25:10 +0000
parents 45360b968bf4
children
rev   line source
cannam@147 1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
cannam@147 2 // Licensed under the MIT License:
cannam@147 3 //
cannam@147 4 // Permission is hereby granted, free of charge, to any person obtaining a copy
cannam@147 5 // of this software and associated documentation files (the "Software"), to deal
cannam@147 6 // in the Software without restriction, including without limitation the rights
cannam@147 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
cannam@147 8 // copies of the Software, and to permit persons to whom the Software is
cannam@147 9 // furnished to do so, subject to the following conditions:
cannam@147 10 //
cannam@147 11 // The above copyright notice and this permission notice shall be included in
cannam@147 12 // all copies or substantial portions of the Software.
cannam@147 13 //
cannam@147 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
cannam@147 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
cannam@147 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
cannam@147 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
cannam@147 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
cannam@147 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
cannam@147 20 // THE SOFTWARE.
cannam@147 21
cannam@147 22 // This file contains parsers useful for character stream inputs, including parsers to parse
cannam@147 23 // common kinds of tokens like identifiers, numbers, and quoted strings.
cannam@147 24
cannam@147 25 #ifndef KJ_PARSE_CHAR_H_
cannam@147 26 #define KJ_PARSE_CHAR_H_
cannam@147 27
cannam@147 28 #if defined(__GNUC__) && !KJ_HEADER_WARNINGS
cannam@147 29 #pragma GCC system_header
cannam@147 30 #endif
cannam@147 31
cannam@147 32 #include "common.h"
cannam@147 33 #include "../string.h"
cannam@147 34 #include <inttypes.h>
cannam@147 35
cannam@147 36 namespace kj {
cannam@147 37 namespace parse {
cannam@147 38
cannam@147 39 // =======================================================================================
cannam@147 40 // Exact char/string.
cannam@147 41
cannam@147 42 class ExactString_ {
cannam@147 43 public:
cannam@147 44 constexpr inline ExactString_(const char* str): str(str) {}
cannam@147 45
cannam@147 46 template <typename Input>
cannam@147 47 Maybe<Tuple<>> operator()(Input& input) const {
cannam@147 48 const char* ptr = str;
cannam@147 49
cannam@147 50 while (*ptr != '\0') {
cannam@147 51 if (input.atEnd() || input.current() != *ptr) return nullptr;
cannam@147 52 input.next();
cannam@147 53 ++ptr;
cannam@147 54 }
cannam@147 55
cannam@147 56 return Tuple<>();
cannam@147 57 }
cannam@147 58
cannam@147 59 private:
cannam@147 60 const char* str;
cannam@147 61 };
cannam@147 62
cannam@147 63 constexpr inline ExactString_ exactString(const char* str) {
cannam@147 64 return ExactString_(str);
cannam@147 65 }
cannam@147 66
cannam@147 67 template <char c>
cannam@147 68 constexpr ExactlyConst_<char, c> exactChar() {
cannam@147 69 // Returns a parser that matches exactly the character given by the template argument (returning
cannam@147 70 // no result).
cannam@147 71 return ExactlyConst_<char, c>();
cannam@147 72 }
cannam@147 73
cannam@147 74 // =======================================================================================
cannam@147 75 // Char ranges / sets
cannam@147 76
cannam@147 77 class CharGroup_ {
cannam@147 78 public:
cannam@147 79 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
cannam@147 80
cannam@147 81 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
cannam@147 82 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
cannam@147 83 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
cannam@147 84 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
cannam@147 85 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
cannam@147 86 }
cannam@147 87
cannam@147 88 constexpr inline CharGroup_ orAny(const char* chars) const {
cannam@147 89 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
cannam@147 90 }
cannam@147 91
cannam@147 92 constexpr inline CharGroup_ orChar(unsigned char c) const {
cannam@147 93 return CharGroup_(bits[0] | bit(c),
cannam@147 94 bits[1] | bit(c - 64),
cannam@147 95 bits[2] | bit(c - 128),
cannam@147 96 bits[3] | bit(c - 256));
cannam@147 97 }
cannam@147 98
cannam@147 99 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
cannam@147 100 return CharGroup_(bits[0] | other.bits[0],
cannam@147 101 bits[1] | other.bits[1],
cannam@147 102 bits[2] | other.bits[2],
cannam@147 103 bits[3] | other.bits[3]);
cannam@147 104 }
cannam@147 105
cannam@147 106 constexpr inline CharGroup_ invert() const {
cannam@147 107 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
cannam@147 108 }
cannam@147 109
cannam@147 110 constexpr inline bool contains(unsigned char c) const {
cannam@147 111 return (bits[c / 64] & (1ll << (c % 64))) != 0;
cannam@147 112 }
cannam@147 113
cannam@147 114 template <typename Input>
cannam@147 115 Maybe<char> operator()(Input& input) const {
cannam@147 116 if (input.atEnd()) return nullptr;
cannam@147 117 unsigned char c = input.current();
cannam@147 118 if (contains(c)) {
cannam@147 119 input.next();
cannam@147 120 return c;
cannam@147 121 } else {
cannam@147 122 return nullptr;
cannam@147 123 }
cannam@147 124 }
cannam@147 125
cannam@147 126 private:
cannam@147 127 typedef unsigned long long Bits64;
cannam@147 128
cannam@147 129 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
cannam@147 130 Bits64 bits[4];
cannam@147 131
cannam@147 132 static constexpr inline Bits64 oneBits(int count) {
cannam@147 133 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
cannam@147 134 }
cannam@147 135 static constexpr inline Bits64 bit(int index) {
cannam@147 136 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
cannam@147 137 }
cannam@147 138 };
cannam@147 139
cannam@147 140 constexpr inline CharGroup_ charRange(char first, char last) {
cannam@147 141 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
cannam@147 142 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
cannam@147 143 // character matched.
cannam@147 144 //
cannam@147 145 // The returned object has methods which can be used to match more characters. The following
cannam@147 146 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
cannam@147 147 //
cannam@147 148 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
cannam@147 149 //
cannam@147 150 // You can also use `.invert()` to match the opposite set of characters.
cannam@147 151
cannam@147 152 return CharGroup_().orRange(first, last);
cannam@147 153 }
cannam@147 154
cannam@147 155 #if _MSC_VER
cannam@147 156 #define anyOfChars(chars) CharGroup_().orAny(chars)
cannam@147 157 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
cannam@147 158 // building the compiler or schema parser. We don't know why this happens, but Harris found that
cannam@147 159 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
cannam@147 160 // Hopefully, MSVC will get fixed soon and we'll be able to remove this.
cannam@147 161 #else
cannam@147 162 constexpr inline CharGroup_ anyOfChars(const char* chars) {
cannam@147 163 // Returns a parser that accepts any of the characters in the given string (which should usually
cannam@147 164 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
cannam@147 165 // that function for more info.
cannam@147 166
cannam@147 167 return CharGroup_().orAny(chars);
cannam@147 168 }
cannam@147 169 #endif
cannam@147 170
cannam@147 171 // =======================================================================================
cannam@147 172
cannam@147 173 namespace _ { // private
cannam@147 174
cannam@147 175 struct ArrayToString {
cannam@147 176 inline String operator()(const Array<char>& arr) const {
cannam@147 177 return heapString(arr);
cannam@147 178 }
cannam@147 179 };
cannam@147 180
cannam@147 181 } // namespace _ (private)
cannam@147 182
cannam@147 183 template <typename SubParser>
cannam@147 184 constexpr inline auto charsToString(SubParser&& subParser)
cannam@147 185 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
cannam@147 186 // Wraps a parser that returns Array<char> such that it returns String instead.
cannam@147 187 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
cannam@147 188 }
cannam@147 189
cannam@147 190 // =======================================================================================
cannam@147 191 // Basic character classes.
cannam@147 192
cannam@147 193 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
cannam@147 194 constexpr auto digit = charRange('0', '9');
cannam@147 195 constexpr auto alphaNumeric = alpha.orGroup(digit);
cannam@147 196 constexpr auto nameStart = alpha.orChar('_');
cannam@147 197 constexpr auto nameChar = alphaNumeric.orChar('_');
cannam@147 198 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
cannam@147 199 constexpr auto octDigit = charRange('0', '7');
cannam@147 200 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
cannam@147 201 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
cannam@147 202
cannam@147 203 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
cannam@147 204
cannam@147 205 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
cannam@147 206 // Like discard(whitespace) but avoids some memory allocation.
cannam@147 207
cannam@147 208 // =======================================================================================
cannam@147 209 // Identifiers
cannam@147 210
cannam@147 211 namespace _ { // private
cannam@147 212
cannam@147 213 struct IdentifierToString {
cannam@147 214 inline String operator()(char first, const Array<char>& rest) const {
cannam@147 215 String result = heapString(rest.size() + 1);
cannam@147 216 result[0] = first;
cannam@147 217 memcpy(result.begin() + 1, rest.begin(), rest.size());
cannam@147 218 return result;
cannam@147 219 }
cannam@147 220 };
cannam@147 221
cannam@147 222 } // namespace _ (private)
cannam@147 223
cannam@147 224 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
cannam@147 225 // Parses an identifier (e.g. a C variable name).
cannam@147 226
cannam@147 227 // =======================================================================================
cannam@147 228 // Integers
cannam@147 229
cannam@147 230 namespace _ { // private
cannam@147 231
inline char parseDigit(char c) {
  // Converts a digit character to its numeric value: '0'-'9' map to 0-9, and letters of either
  // case map to 10 and up ('a'/'A' is 10). The input is not validated.
  int offset;
  if (c >= 'a') {
    offset = 'a' - 10;
  } else if (c >= 'A') {
    offset = 'A' - 10;
  } else {
    offset = '0';
  }
  return static_cast<char>(c - offset);
}
cannam@147 237
cannam@147 238 template <uint base>
cannam@147 239 struct ParseInteger {
cannam@147 240 inline uint64_t operator()(const Array<char>& digits) const {
cannam@147 241 return operator()('0', digits);
cannam@147 242 }
cannam@147 243 uint64_t operator()(char first, const Array<char>& digits) const {
cannam@147 244 uint64_t result = parseDigit(first);
cannam@147 245 for (char digit: digits) {
cannam@147 246 result = result * base + parseDigit(digit);
cannam@147 247 }
cannam@147 248 return result;
cannam@147 249 }
cannam@147 250 };
cannam@147 251
cannam@147 252
cannam@147 253 } // namespace _ (private)
cannam@147 254
cannam@147 255 constexpr auto integer = sequence(
cannam@147 256 oneOf(
cannam@147 257 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
cannam@147 258 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
cannam@147 259 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
cannam@147 260 notLookingAt(alpha.orAny("_.")));
cannam@147 261
cannam@147 262 // =======================================================================================
cannam@147 263 // Numbers (i.e. floats)
cannam@147 264
cannam@147 265 namespace _ { // private
cannam@147 266
cannam@147 267 struct ParseFloat {
cannam@147 268 double operator()(const Array<char>& digits,
cannam@147 269 const Maybe<Array<char>>& fraction,
cannam@147 270 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
cannam@147 271 };
cannam@147 272
cannam@147 273 } // namespace _ (private)
cannam@147 274
cannam@147 275 constexpr auto number = transform(
cannam@147 276 sequence(
cannam@147 277 oneOrMore(digit),
cannam@147 278 optional(sequence(exactChar<'.'>(), many(digit))),
cannam@147 279 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
cannam@147 280 notLookingAt(alpha.orAny("_."))),
cannam@147 281 _::ParseFloat());
cannam@147 282
cannam@147 283 // =======================================================================================
cannam@147 284 // Quoted strings
cannam@147 285
cannam@147 286 namespace _ { // private
cannam@147 287
struct InterpretEscape {
  // Functor mapping the letter that follows a backslash to the character it denotes. Letters
  // without a special meaning (such as quotes or the backslash itself) map to themselves.
  char operator()(char c) const {
    // Index-aligned tables: kLetters[i] is the escape letter, kValues[i] the resulting character.
    static const char kLetters[] = "abfnrtv";
    static const char kValues[] = "\a\b\f\n\r\t\v";

    for (unsigned i = 0; kLetters[i] != '\0'; ++i) {
      if (kLetters[i] == c) return kValues[i];
    }
    return c;
  }
};
cannam@147 302
cannam@147 303 struct ParseHexEscape {
cannam@147 304 inline char operator()(char first, char second) const {
cannam@147 305 return (parseDigit(first) << 4) | parseDigit(second);
cannam@147 306 }
cannam@147 307 };
cannam@147 308
cannam@147 309 struct ParseHexByte {
cannam@147 310 inline byte operator()(char first, char second) const {
cannam@147 311 return (parseDigit(first) << 4) | parseDigit(second);
cannam@147 312 }
cannam@147 313 };
cannam@147 314
cannam@147 315 struct ParseOctEscape {
cannam@147 316 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
cannam@147 317 char result = first - '0';
cannam@147 318 KJ_IF_MAYBE(digit1, second) {
cannam@147 319 result = (result << 3) | (*digit1 - '0');
cannam@147 320 KJ_IF_MAYBE(digit2, third) {
cannam@147 321 result = (result << 3) | (*digit2 - '0');
cannam@147 322 }
cannam@147 323 }
cannam@147 324 return result;
cannam@147 325 }
cannam@147 326 };
cannam@147 327
cannam@147 328 } // namespace _ (private)
cannam@147 329
cannam@147 330 constexpr auto escapeSequence =
cannam@147 331 sequence(exactChar<'\\'>(), oneOf(
cannam@147 332 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
cannam@147 333 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
cannam@147 334 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
cannam@147 335 _::ParseOctEscape())));
cannam@147 336 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
cannam@147 337 // a char.
cannam@147 338
cannam@147 339 constexpr auto doubleQuotedString = charsToString(sequence(
cannam@147 340 exactChar<'\"'>(),
cannam@147 341 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
cannam@147 342 exactChar<'\"'>()));
cannam@147 343 // Parses a C-style double-quoted string.
cannam@147 344
cannam@147 345 constexpr auto singleQuotedString = charsToString(sequence(
cannam@147 346 exactChar<'\''>(),
cannam@147 347 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
cannam@147 348 exactChar<'\''>()));
cannam@147 349 // Parses a C-style single-quoted string.
cannam@147 350
cannam@147 351 constexpr auto doubleQuotedHexBinary = sequence(
cannam@147 352 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
cannam@147 353 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
cannam@147 354 discardWhitespace,
cannam@147 355 exactChar<'\"'>());
cannam@147 356 // Parses a double-quoted hex binary literal. Returns Array<byte>.
cannam@147 357
cannam@147 358 } // namespace parse
cannam@147 359 } // namespace kj
cannam@147 360
cannam@147 361 #endif // KJ_PARSE_CHAR_H_