annotate osx/include/kj/parse/char.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 0994c39f1e94
children
rev   line source
cannam@62 1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
cannam@62 2 // Licensed under the MIT License:
cannam@62 3 //
cannam@62 4 // Permission is hereby granted, free of charge, to any person obtaining a copy
cannam@62 5 // of this software and associated documentation files (the "Software"), to deal
cannam@62 6 // in the Software without restriction, including without limitation the rights
cannam@62 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
cannam@62 8 // copies of the Software, and to permit persons to whom the Software is
cannam@62 9 // furnished to do so, subject to the following conditions:
cannam@62 10 //
cannam@62 11 // The above copyright notice and this permission notice shall be included in
cannam@62 12 // all copies or substantial portions of the Software.
cannam@62 13 //
cannam@62 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
cannam@62 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
cannam@62 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
cannam@62 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
cannam@62 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
cannam@62 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
cannam@62 20 // THE SOFTWARE.
cannam@62 21
cannam@62 22 // This file contains parsers useful for character stream inputs, including parsers to parse
cannam@62 23 // common kinds of tokens like identifiers, numbers, and quoted strings.
cannam@62 24
cannam@62 25 #ifndef KJ_PARSE_CHAR_H_
cannam@62 26 #define KJ_PARSE_CHAR_H_
cannam@62 27
cannam@62 28 #if defined(__GNUC__) && !KJ_HEADER_WARNINGS
cannam@62 29 #pragma GCC system_header
cannam@62 30 #endif
cannam@62 31
cannam@62 32 #include "common.h"
cannam@62 33 #include "../string.h"
cannam@62 34 #include <inttypes.h>
cannam@62 35
cannam@62 36 namespace kj {
cannam@62 37 namespace parse {
cannam@62 38
cannam@62 39 // =======================================================================================
cannam@62 40 // Exact char/string.
cannam@62 41
cannam@62 42 class ExactString_ {
cannam@62 43 public:
cannam@62 44 constexpr inline ExactString_(const char* str): str(str) {}
cannam@62 45
cannam@62 46 template <typename Input>
cannam@62 47 Maybe<Tuple<>> operator()(Input& input) const {
cannam@62 48 const char* ptr = str;
cannam@62 49
cannam@62 50 while (*ptr != '\0') {
cannam@62 51 if (input.atEnd() || input.current() != *ptr) return nullptr;
cannam@62 52 input.next();
cannam@62 53 ++ptr;
cannam@62 54 }
cannam@62 55
cannam@62 56 return Tuple<>();
cannam@62 57 }
cannam@62 58
cannam@62 59 private:
cannam@62 60 const char* str;
cannam@62 61 };
cannam@62 62
cannam@62 63 constexpr inline ExactString_ exactString(const char* str) {
cannam@62 64 return ExactString_(str);
cannam@62 65 }
cannam@62 66
cannam@62 67 template <char c>
cannam@62 68 constexpr ExactlyConst_<char, c> exactChar() {
cannam@62 69 // Returns a parser that matches exactly the character given by the template argument (returning
cannam@62 70 // no result).
cannam@62 71 return ExactlyConst_<char, c>();
cannam@62 72 }
cannam@62 73
cannam@62 74 // =======================================================================================
cannam@62 75 // Char ranges / sets
cannam@62 76
cannam@62 77 class CharGroup_ {
cannam@62 78 public:
cannam@62 79 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
cannam@62 80
cannam@62 81 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
cannam@62 82 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
cannam@62 83 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
cannam@62 84 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
cannam@62 85 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
cannam@62 86 }
cannam@62 87
cannam@62 88 constexpr inline CharGroup_ orAny(const char* chars) const {
cannam@62 89 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
cannam@62 90 }
cannam@62 91
cannam@62 92 constexpr inline CharGroup_ orChar(unsigned char c) const {
cannam@62 93 return CharGroup_(bits[0] | bit(c),
cannam@62 94 bits[1] | bit(c - 64),
cannam@62 95 bits[2] | bit(c - 128),
cannam@62 96 bits[3] | bit(c - 256));
cannam@62 97 }
cannam@62 98
cannam@62 99 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
cannam@62 100 return CharGroup_(bits[0] | other.bits[0],
cannam@62 101 bits[1] | other.bits[1],
cannam@62 102 bits[2] | other.bits[2],
cannam@62 103 bits[3] | other.bits[3]);
cannam@62 104 }
cannam@62 105
cannam@62 106 constexpr inline CharGroup_ invert() const {
cannam@62 107 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
cannam@62 108 }
cannam@62 109
cannam@62 110 constexpr inline bool contains(unsigned char c) const {
cannam@62 111 return (bits[c / 64] & (1ll << (c % 64))) != 0;
cannam@62 112 }
cannam@62 113
cannam@62 114 template <typename Input>
cannam@62 115 Maybe<char> operator()(Input& input) const {
cannam@62 116 if (input.atEnd()) return nullptr;
cannam@62 117 unsigned char c = input.current();
cannam@62 118 if (contains(c)) {
cannam@62 119 input.next();
cannam@62 120 return c;
cannam@62 121 } else {
cannam@62 122 return nullptr;
cannam@62 123 }
cannam@62 124 }
cannam@62 125
cannam@62 126 private:
cannam@62 127 typedef unsigned long long Bits64;
cannam@62 128
cannam@62 129 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
cannam@62 130 Bits64 bits[4];
cannam@62 131
cannam@62 132 static constexpr inline Bits64 oneBits(int count) {
cannam@62 133 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
cannam@62 134 }
cannam@62 135 static constexpr inline Bits64 bit(int index) {
cannam@62 136 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
cannam@62 137 }
cannam@62 138 };
cannam@62 139
cannam@62 140 constexpr inline CharGroup_ charRange(char first, char last) {
cannam@62 141 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
cannam@62 142 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
cannam@62 143 // character matched.
cannam@62 144 //
cannam@62 145 // The returned object has methods which can be used to match more characters. The following
cannam@62 146 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
cannam@62 147 //
cannam@62 148 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
cannam@62 149 //
cannam@62 150 // You can also use `.invert()` to match the opposite set of characters.
cannam@62 151
cannam@62 152 return CharGroup_().orRange(first, last);
cannam@62 153 }
cannam@62 154
cannam@62 155 #if _MSC_VER
cannam@62 156 #define anyOfChars(chars) CharGroup_().orAny(chars)
cannam@62 157 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
cannam@62 158 // building the compiler or schema parser. We don't know why this happens, but Harris found that
cannam@62 159 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
cannam@62 160 // Hopefully, MSVC will get fixed soon and we'll be able to remove this.
cannam@62 161 #else
cannam@62 162 constexpr inline CharGroup_ anyOfChars(const char* chars) {
cannam@62 163 // Returns a parser that accepts any of the characters in the given string (which should usually
cannam@62 164 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
cannam@62 165 // that function for more info.
cannam@62 166
cannam@62 167 return CharGroup_().orAny(chars);
cannam@62 168 }
cannam@62 169 #endif
cannam@62 170
cannam@62 171 // =======================================================================================
cannam@62 172
cannam@62 173 namespace _ { // private
cannam@62 174
cannam@62 175 struct ArrayToString {
cannam@62 176 inline String operator()(const Array<char>& arr) const {
cannam@62 177 return heapString(arr);
cannam@62 178 }
cannam@62 179 };
cannam@62 180
cannam@62 181 } // namespace _ (private)
cannam@62 182
cannam@62 183 template <typename SubParser>
cannam@62 184 constexpr inline auto charsToString(SubParser&& subParser)
cannam@62 185 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
cannam@62 186 // Wraps a parser that returns Array<char> such that it returns String instead.
cannam@62 187 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
cannam@62 188 }
cannam@62 189
cannam@62 190 // =======================================================================================
cannam@62 191 // Basic character classes.
cannam@62 192
cannam@62 193 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
cannam@62 194 constexpr auto digit = charRange('0', '9');
cannam@62 195 constexpr auto alphaNumeric = alpha.orGroup(digit);
cannam@62 196 constexpr auto nameStart = alpha.orChar('_');
cannam@62 197 constexpr auto nameChar = alphaNumeric.orChar('_');
cannam@62 198 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
cannam@62 199 constexpr auto octDigit = charRange('0', '7');
cannam@62 200 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
cannam@62 201 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
cannam@62 202
cannam@62 203 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
cannam@62 204
cannam@62 205 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
cannam@62 206 // Like discard(whitespace) but avoids some memory allocation.
cannam@62 207
cannam@62 208 // =======================================================================================
cannam@62 209 // Identifiers
cannam@62 210
cannam@62 211 namespace _ { // private
cannam@62 212
cannam@62 213 struct IdentifierToString {
cannam@62 214 inline String operator()(char first, const Array<char>& rest) const {
cannam@62 215 String result = heapString(rest.size() + 1);
cannam@62 216 result[0] = first;
cannam@62 217 memcpy(result.begin() + 1, rest.begin(), rest.size());
cannam@62 218 return result;
cannam@62 219 }
cannam@62 220 };
cannam@62 221
cannam@62 222 } // namespace _ (private)
cannam@62 223
cannam@62 224 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
cannam@62 225 // Parses an identifier (e.g. a C variable name).
cannam@62 226
cannam@62 227 // =======================================================================================
cannam@62 228 // Integers
cannam@62 229
cannam@62 230 namespace _ { // private
cannam@62 231
inline char parseDigit(char c) {
  // Maps a digit character to its numeric value: '0'..'9' -> 0..9, and letters (either case)
  // -> 10 and up, so 'a'/'A' is 10 and 'f'/'F' is 15. No validation is performed; the caller
  // is expected to have matched `c` against a digit character class beforehand.
  if (c >= 'a') return c - 'a' + 10;
  if (c >= 'A') return c - 'A' + 10;
  return c - '0';
}
cannam@62 237
cannam@62 238 template <uint base>
cannam@62 239 struct ParseInteger {
cannam@62 240 inline uint64_t operator()(const Array<char>& digits) const {
cannam@62 241 return operator()('0', digits);
cannam@62 242 }
cannam@62 243 uint64_t operator()(char first, const Array<char>& digits) const {
cannam@62 244 uint64_t result = parseDigit(first);
cannam@62 245 for (char digit: digits) {
cannam@62 246 result = result * base + parseDigit(digit);
cannam@62 247 }
cannam@62 248 return result;
cannam@62 249 }
cannam@62 250 };
cannam@62 251
cannam@62 252
cannam@62 253 } // namespace _ (private)
cannam@62 254
cannam@62 255 constexpr auto integer = sequence(
cannam@62 256 oneOf(
cannam@62 257 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
cannam@62 258 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
cannam@62 259 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
cannam@62 260 notLookingAt(alpha.orAny("_.")));
cannam@62 261
cannam@62 262 // =======================================================================================
cannam@62 263 // Numbers (i.e. floats)
cannam@62 264
cannam@62 265 namespace _ { // private
cannam@62 266
cannam@62 267 struct ParseFloat {
cannam@62 268 double operator()(const Array<char>& digits,
cannam@62 269 const Maybe<Array<char>>& fraction,
cannam@62 270 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
cannam@62 271 };
cannam@62 272
cannam@62 273 } // namespace _ (private)
cannam@62 274
cannam@62 275 constexpr auto number = transform(
cannam@62 276 sequence(
cannam@62 277 oneOrMore(digit),
cannam@62 278 optional(sequence(exactChar<'.'>(), many(digit))),
cannam@62 279 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
cannam@62 280 notLookingAt(alpha.orAny("_."))),
cannam@62 281 _::ParseFloat());
cannam@62 282
cannam@62 283 // =======================================================================================
cannam@62 284 // Quoted strings
cannam@62 285
cannam@62 286 namespace _ { // private
cannam@62 287
struct InterpretEscape {
  // Functor mapping the letter of a single-character escape (the character after the backslash)
  // to the character it denotes: a, b, f, n, r, t and v become the matching control characters;
  // any other character is returned unchanged.
  char operator()(char c) const {
    static const char kLetters[] = "abfnrtv";
    static const char kControls[] = "\a\b\f\n\r\t\v";

    // sizeof - 1 so that the terminating NUL is never treated as an escape letter.
    for (unsigned i = 0; i < sizeof(kLetters) - 1; ++i) {
      if (c == kLetters[i]) return kControls[i];
    }
    return c;
  }
};
cannam@62 302
cannam@62 303 struct ParseHexEscape {
cannam@62 304 inline char operator()(char first, char second) const {
cannam@62 305 return (parseDigit(first) << 4) | parseDigit(second);
cannam@62 306 }
cannam@62 307 };
cannam@62 308
cannam@62 309 struct ParseHexByte {
cannam@62 310 inline byte operator()(char first, char second) const {
cannam@62 311 return (parseDigit(first) << 4) | parseDigit(second);
cannam@62 312 }
cannam@62 313 };
cannam@62 314
cannam@62 315 struct ParseOctEscape {
cannam@62 316 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
cannam@62 317 char result = first - '0';
cannam@62 318 KJ_IF_MAYBE(digit1, second) {
cannam@62 319 result = (result << 3) | (*digit1 - '0');
cannam@62 320 KJ_IF_MAYBE(digit2, third) {
cannam@62 321 result = (result << 3) | (*digit2 - '0');
cannam@62 322 }
cannam@62 323 }
cannam@62 324 return result;
cannam@62 325 }
cannam@62 326 };
cannam@62 327
cannam@62 328 } // namespace _ (private)
cannam@62 329
cannam@62 330 constexpr auto escapeSequence =
cannam@62 331 sequence(exactChar<'\\'>(), oneOf(
cannam@62 332 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
cannam@62 333 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
cannam@62 334 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
cannam@62 335 _::ParseOctEscape())));
cannam@62 336 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
cannam@62 337 // a char.
cannam@62 338
cannam@62 339 constexpr auto doubleQuotedString = charsToString(sequence(
cannam@62 340 exactChar<'\"'>(),
cannam@62 341 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
cannam@62 342 exactChar<'\"'>()));
cannam@62 343 // Parses a C-style double-quoted string.
cannam@62 344
cannam@62 345 constexpr auto singleQuotedString = charsToString(sequence(
cannam@62 346 exactChar<'\''>(),
cannam@62 347 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
cannam@62 348 exactChar<'\''>()));
cannam@62 349 // Parses a C-style single-quoted string.
cannam@62 350
cannam@62 351 constexpr auto doubleQuotedHexBinary = sequence(
cannam@62 352 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
cannam@62 353 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
cannam@62 354 discardWhitespace,
cannam@62 355 exactChar<'\"'>());
cannam@62 356 // Parses a double-quoted hex binary literal. Returns Array<byte>.
cannam@62 357
cannam@62 358 } // namespace parse
cannam@62 359 } // namespace kj
cannam@62 360
cannam@62 361 #endif // KJ_PARSE_CHAR_H_