annotate osx/include/kj/parse/char.h @ 71:388bd4da45bf

Opus build for Windows (MinGW)
author Chris Cannam
date Fri, 25 Jan 2019 13:49:03 +0000
parents 0994c39f1e94
children
rev   line source
cannam@62 1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
cannam@62 2 // Licensed under the MIT License:
cannam@62 3 //
cannam@62 4 // Permission is hereby granted, free of charge, to any person obtaining a copy
cannam@62 5 // of this software and associated documentation files (the "Software"), to deal
cannam@62 6 // in the Software without restriction, including without limitation the rights
cannam@62 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
cannam@62 8 // copies of the Software, and to permit persons to whom the Software is
cannam@62 9 // furnished to do so, subject to the following conditions:
cannam@62 10 //
cannam@62 11 // The above copyright notice and this permission notice shall be included in
cannam@62 12 // all copies or substantial portions of the Software.
cannam@62 13 //
cannam@62 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
cannam@62 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
cannam@62 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
cannam@62 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
cannam@62 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
cannam@62 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
cannam@62 20 // THE SOFTWARE.
cannam@62 21
cannam@62 22 // This file contains parsers useful for character stream inputs, including parsers to parse
cannam@62 23 // common kinds of tokens like identifiers, numbers, and quoted strings.
cannam@62 24
cannam@62 25 #ifndef KJ_PARSE_CHAR_H_
cannam@62 26 #define KJ_PARSE_CHAR_H_
cannam@62 27
cannam@62 28 #if defined(__GNUC__) && !KJ_HEADER_WARNINGS
cannam@62 29 #pragma GCC system_header
cannam@62 30 #endif
cannam@62 31
cannam@62 32 #include "common.h"
cannam@62 33 #include "../string.h"
cannam@62 34 #include <inttypes.h>
cannam@62 35
cannam@62 36 namespace kj {
cannam@62 37 namespace parse {
cannam@62 38
cannam@62 39 // =======================================================================================
cannam@62 40 // Exact char/string.
cannam@62 41
cannam@62 42 class ExactString_ {
cannam@62 43 public:
cannam@62 44 constexpr inline ExactString_(const char* str): str(str) {}
cannam@62 45
cannam@62 46 template <typename Input>
cannam@62 47 Maybe<Tuple<>> operator()(Input& input) const {
cannam@62 48 const char* ptr = str;
cannam@62 49
cannam@62 50 while (*ptr != '\0') {
cannam@62 51 if (input.atEnd() || input.current() != *ptr) return nullptr;
cannam@62 52 input.next();
cannam@62 53 ++ptr;
cannam@62 54 }
cannam@62 55
cannam@62 56 return Tuple<>();
cannam@62 57 }
cannam@62 58
cannam@62 59 private:
cannam@62 60 const char* str;
cannam@62 61 };
cannam@62 62
cannam@62 63 constexpr inline ExactString_ exactString(const char* str) {
cannam@62 64 return ExactString_(str);
cannam@62 65 }
cannam@62 66
cannam@62 67 template <char c>
cannam@62 68 constexpr ExactlyConst_<char, c> exactChar() {
cannam@62 69 // Returns a parser that matches exactly the character given by the template argument (returning
cannam@62 70 // no result).
cannam@62 71 return ExactlyConst_<char, c>();
cannam@62 72 }
cannam@62 73
cannam@62 74 // =======================================================================================
cannam@62 75 // Char ranges / sets
cannam@62 76
cannam@62 77 class CharGroup_ {
cannam@62 78 public:
cannam@62 79 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
cannam@62 80
cannam@62 81 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
cannam@62 82 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
cannam@62 83 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
cannam@62 84 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
cannam@62 85 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
cannam@62 86 }
cannam@62 87
cannam@62 88 constexpr inline CharGroup_ orAny(const char* chars) const {
cannam@62 89 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
cannam@62 90 }
cannam@62 91
cannam@62 92 constexpr inline CharGroup_ orChar(unsigned char c) const {
cannam@62 93 return CharGroup_(bits[0] | bit(c),
cannam@62 94 bits[1] | bit(c - 64),
cannam@62 95 bits[2] | bit(c - 128),
cannam@62 96 bits[3] | bit(c - 256));
cannam@62 97 }
cannam@62 98
cannam@62 99 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
cannam@62 100 return CharGroup_(bits[0] | other.bits[0],
cannam@62 101 bits[1] | other.bits[1],
cannam@62 102 bits[2] | other.bits[2],
cannam@62 103 bits[3] | other.bits[3]);
cannam@62 104 }
cannam@62 105
cannam@62 106 constexpr inline CharGroup_ invert() const {
cannam@62 107 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
cannam@62 108 }
cannam@62 109
cannam@62 110 constexpr inline bool contains(unsigned char c) const {
cannam@62 111 return (bits[c / 64] & (1ll << (c % 64))) != 0;
cannam@62 112 }
cannam@62 113
cannam@62 114 template <typename Input>
cannam@62 115 Maybe<char> operator()(Input& input) const {
cannam@62 116 if (input.atEnd()) return nullptr;
cannam@62 117 unsigned char c = input.current();
cannam@62 118 if (contains(c)) {
cannam@62 119 input.next();
cannam@62 120 return c;
cannam@62 121 } else {
cannam@62 122 return nullptr;
cannam@62 123 }
cannam@62 124 }
cannam@62 125
cannam@62 126 private:
cannam@62 127 typedef unsigned long long Bits64;
cannam@62 128
cannam@62 129 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
cannam@62 130 Bits64 bits[4];
cannam@62 131
cannam@62 132 static constexpr inline Bits64 oneBits(int count) {
cannam@62 133 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
cannam@62 134 }
cannam@62 135 static constexpr inline Bits64 bit(int index) {
cannam@62 136 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
cannam@62 137 }
cannam@62 138 };
cannam@62 139
cannam@62 140 constexpr inline CharGroup_ charRange(char first, char last) {
cannam@62 141 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
cannam@62 142 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
cannam@62 143 // character matched.
cannam@62 144 //
cannam@62 145 // The returned object has methods which can be used to match more characters. The following
cannam@62 146 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
cannam@62 147 //
cannam@62 148 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
cannam@62 149 //
cannam@62 150 // You can also use `.invert()` to match the opposite set of characters.
cannam@62 151
cannam@62 152 return CharGroup_().orRange(first, last);
cannam@62 153 }
cannam@62 154
cannam@62 155 #if _MSC_VER
cannam@62 156 #define anyOfChars(chars) CharGroup_().orAny(chars)
cannam@62 157 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
cannam@62 158 // building the compiler or schema parser. We don't know why this happens, but Harris found that
cannam@62 159 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
cannam@62 160 // Hopefully, MSVC will get fixed soon and we'll be able to remove this.
cannam@62 161 #else
cannam@62 162 constexpr inline CharGroup_ anyOfChars(const char* chars) {
cannam@62 163 // Returns a parser that accepts any of the characters in the given string (which should usually
cannam@62 164 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
cannam@62 165 // that function for more info.
cannam@62 166
cannam@62 167 return CharGroup_().orAny(chars);
cannam@62 168 }
cannam@62 169 #endif
cannam@62 170
cannam@62 171 // =======================================================================================
cannam@62 172
cannam@62 173 namespace _ { // private
cannam@62 174
cannam@62 175 struct ArrayToString {
cannam@62 176 inline String operator()(const Array<char>& arr) const {
cannam@62 177 return heapString(arr);
cannam@62 178 }
cannam@62 179 };
cannam@62 180
cannam@62 181 } // namespace _ (private)
cannam@62 182
cannam@62 183 template <typename SubParser>
cannam@62 184 constexpr inline auto charsToString(SubParser&& subParser)
cannam@62 185 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
cannam@62 186 // Wraps a parser that returns Array<char> such that it returns String instead.
cannam@62 187 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
cannam@62 188 }
cannam@62 189
cannam@62 190 // =======================================================================================
cannam@62 191 // Basic character classes.
cannam@62 192
cannam@62 193 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
cannam@62 194 constexpr auto digit = charRange('0', '9');
cannam@62 195 constexpr auto alphaNumeric = alpha.orGroup(digit);
cannam@62 196 constexpr auto nameStart = alpha.orChar('_');
cannam@62 197 constexpr auto nameChar = alphaNumeric.orChar('_');
cannam@62 198 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
cannam@62 199 constexpr auto octDigit = charRange('0', '7');
cannam@62 200 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
cannam@62 201 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
cannam@62 202
cannam@62 203 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
cannam@62 204
cannam@62 205 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
cannam@62 206 // Like discard(whitespace) but avoids some memory allocation.
cannam@62 207
cannam@62 208 // =======================================================================================
cannam@62 209 // Identifiers
cannam@62 210
cannam@62 211 namespace _ { // private
cannam@62 212
cannam@62 213 struct IdentifierToString {
cannam@62 214 inline String operator()(char first, const Array<char>& rest) const {
cannam@62 215 String result = heapString(rest.size() + 1);
cannam@62 216 result[0] = first;
cannam@62 217 memcpy(result.begin() + 1, rest.begin(), rest.size());
cannam@62 218 return result;
cannam@62 219 }
cannam@62 220 };
cannam@62 221
cannam@62 222 } // namespace _ (private)
cannam@62 223
cannam@62 224 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
cannam@62 225 // Parses an identifier (e.g. a C variable name).
cannam@62 226
cannam@62 227 // =======================================================================================
cannam@62 228 // Integers
cannam@62 229
cannam@62 230 namespace _ { // private
cannam@62 231
inline char parseDigit(char c) {
  // Converts a digit character to its numeric value: '0'..'9' give 0..9, while 'A'.. and 'a'..
  // give 10 upward. No validation is done; the caller must pass a character that is already
  // known to be a digit in the relevant base.
  if (c >= 'a') return c - 'a' + 10;
  if (c >= 'A') return c - 'A' + 10;
  return c - '0';
}
cannam@62 237
cannam@62 238 template <uint base>
cannam@62 239 struct ParseInteger {
cannam@62 240 inline uint64_t operator()(const Array<char>& digits) const {
cannam@62 241 return operator()('0', digits);
cannam@62 242 }
cannam@62 243 uint64_t operator()(char first, const Array<char>& digits) const {
cannam@62 244 uint64_t result = parseDigit(first);
cannam@62 245 for (char digit: digits) {
cannam@62 246 result = result * base + parseDigit(digit);
cannam@62 247 }
cannam@62 248 return result;
cannam@62 249 }
cannam@62 250 };
cannam@62 251
cannam@62 252
cannam@62 253 } // namespace _ (private)
cannam@62 254
cannam@62 255 constexpr auto integer = sequence(
cannam@62 256 oneOf(
cannam@62 257 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
cannam@62 258 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
cannam@62 259 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
cannam@62 260 notLookingAt(alpha.orAny("_.")));
cannam@62 261
cannam@62 262 // =======================================================================================
cannam@62 263 // Numbers (i.e. floats)
cannam@62 264
cannam@62 265 namespace _ { // private
cannam@62 266
cannam@62 267 struct ParseFloat {
cannam@62 268 double operator()(const Array<char>& digits,
cannam@62 269 const Maybe<Array<char>>& fraction,
cannam@62 270 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
cannam@62 271 };
cannam@62 272
cannam@62 273 } // namespace _ (private)
cannam@62 274
cannam@62 275 constexpr auto number = transform(
cannam@62 276 sequence(
cannam@62 277 oneOrMore(digit),
cannam@62 278 optional(sequence(exactChar<'.'>(), many(digit))),
cannam@62 279 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
cannam@62 280 notLookingAt(alpha.orAny("_."))),
cannam@62 281 _::ParseFloat());
cannam@62 282
cannam@62 283 // =======================================================================================
cannam@62 284 // Quoted strings
cannam@62 285
cannam@62 286 namespace _ { // private
cannam@62 287
struct InterpretEscape {
  // Transform functor mapping the letter after a backslash to the character it stands for.
  // The seven letters a, b, f, n, r, t, v become their C control characters; any other character
  // (such as a quote, backslash or '?') is returned unchanged.
  char operator()(char c) const {
    const char letters[]  = {'a',  'b',  'f',  'n',  'r',  't',  'v'};
    const char controls[] = {'\a', '\b', '\f', '\n', '\r', '\t', '\v'};
    for (unsigned i = 0; i < sizeof(letters); ++i) {
      if (c == letters[i]) return controls[i];
    }
    return c;
  }
};
cannam@62 302
cannam@62 303 struct ParseHexEscape {
cannam@62 304 inline char operator()(char first, char second) const {
cannam@62 305 return (parseDigit(first) << 4) | parseDigit(second);
cannam@62 306 }
cannam@62 307 };
cannam@62 308
cannam@62 309 struct ParseHexByte {
cannam@62 310 inline byte operator()(char first, char second) const {
cannam@62 311 return (parseDigit(first) << 4) | parseDigit(second);
cannam@62 312 }
cannam@62 313 };
cannam@62 314
cannam@62 315 struct ParseOctEscape {
cannam@62 316 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
cannam@62 317 char result = first - '0';
cannam@62 318 KJ_IF_MAYBE(digit1, second) {
cannam@62 319 result = (result << 3) | (*digit1 - '0');
cannam@62 320 KJ_IF_MAYBE(digit2, third) {
cannam@62 321 result = (result << 3) | (*digit2 - '0');
cannam@62 322 }
cannam@62 323 }
cannam@62 324 return result;
cannam@62 325 }
cannam@62 326 };
cannam@62 327
cannam@62 328 } // namespace _ (private)
cannam@62 329
cannam@62 330 constexpr auto escapeSequence =
cannam@62 331 sequence(exactChar<'\\'>(), oneOf(
cannam@62 332 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
cannam@62 333 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
cannam@62 334 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
cannam@62 335 _::ParseOctEscape())));
cannam@62 336 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
cannam@62 337 // a char.
cannam@62 338
cannam@62 339 constexpr auto doubleQuotedString = charsToString(sequence(
cannam@62 340 exactChar<'\"'>(),
cannam@62 341 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
cannam@62 342 exactChar<'\"'>()));
cannam@62 343 // Parses a C-style double-quoted string.
cannam@62 344
cannam@62 345 constexpr auto singleQuotedString = charsToString(sequence(
cannam@62 346 exactChar<'\''>(),
cannam@62 347 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
cannam@62 348 exactChar<'\''>()));
cannam@62 349 // Parses a C-style single-quoted string.
cannam@62 350
cannam@62 351 constexpr auto doubleQuotedHexBinary = sequence(
cannam@62 352 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
cannam@62 353 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
cannam@62 354 discardWhitespace,
cannam@62 355 exactChar<'\"'>());
cannam@62 356 // Parses a double-quoted hex binary literal. Returns Array<byte>.
cannam@62 357
cannam@62 358 } // namespace parse
cannam@62 359 } // namespace kj
cannam@62 360
cannam@62 361 #endif // KJ_PARSE_CHAR_H_