Mercurial > hg > sv-dependency-builds
diff osx/include/kj/parse/char.h @ 62:0994c39f1e94
Cap'n Proto v0.6 + build for OSX
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Mon, 22 May 2017 10:01:37 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/osx/include/kj/parse/char.h Mon May 22 10:01:37 2017 +0100 @@ -0,0 +1,361 @@ +// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors +// Licensed under the MIT License: +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// This file contains parsers useful for character stream inputs, including parsers to parse +// common kinds of tokens like identifiers, numbers, and quoted strings. + +#ifndef KJ_PARSE_CHAR_H_ +#define KJ_PARSE_CHAR_H_ + +#if defined(__GNUC__) && !KJ_HEADER_WARNINGS +#pragma GCC system_header +#endif + +#include "common.h" +#include "../string.h" +#include <inttypes.h> + +namespace kj { +namespace parse { + +// ======================================================================================= +// Exact char/string. + +class ExactString_ { +public: + constexpr inline ExactString_(const char* str): str(str) {} + + template <typename Input> + Maybe<Tuple<>> operator()(Input& input) const { + const char* ptr = str; + + while (*ptr != '\0') { + if (input.atEnd() || input.current() != *ptr) return nullptr; + input.next(); + ++ptr; + } + + return Tuple<>(); + } + +private: + const char* str; +}; + +constexpr inline ExactString_ exactString(const char* str) { + return ExactString_(str); +} + +template <char c> +constexpr ExactlyConst_<char, c> exactChar() { + // Returns a parser that matches exactly the character given by the template argument (returning + // no result). + return ExactlyConst_<char, c>(); +} + +// ======================================================================================= +// Char ranges / sets + +class CharGroup_ { +public: + constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} + + constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { + return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), + bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), + bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), + bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); + } + + constexpr inline CharGroup_ orAny(const char* chars) const { + return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1); + } + + constexpr inline CharGroup_ orChar(unsigned char c) const { + return CharGroup_(bits[0] | bit(c), + bits[1] | bit(c - 64), + bits[2] | bit(c - 128), + bits[3] | bit(c - 256)); + } + + constexpr inline CharGroup_ orGroup(CharGroup_ other) const { + return CharGroup_(bits[0] | other.bits[0], + bits[1] | other.bits[1], + bits[2] | other.bits[2], + bits[3] | other.bits[3]); + } + + constexpr inline CharGroup_ invert() const { + return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); + } + + constexpr inline bool contains(unsigned char c) const { + return (bits[c / 64] & (1ll << (c % 64))) != 0; + } + + template <typename Input> + Maybe<char> operator()(Input& input) const { + if (input.atEnd()) return nullptr; + unsigned char c = input.current(); + if (contains(c)) { + input.next(); + return c; + } else { + return nullptr; + } + } + +private: + typedef unsigned long long Bits64; + + constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} + Bits64 bits[4]; + + static constexpr inline Bits64 oneBits(int count) { + return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); + } + static constexpr inline Bits64 bit(int index) { + return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index); + } +}; + +constexpr inline CharGroup_ charRange(char first, char last) { + // Create a parser which accepts any character in the range from `first` to `last`, inclusive. + // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the + // character matched. + // + // The returned object has methods which can be used to match more characters. The following + // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. + // + // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") + // + // You can also use `.invert()` to match the opposite set of characters. + + return CharGroup_().orRange(first, last); +} + +#if _MSC_VER +#define anyOfChars(chars) CharGroup_().orAny(chars) +// TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from +// building the compiler or schema parser. We don't know why this happens, but Harris found that +// this horrible, horrible hack makes things work. This is awful, but it's better than nothing. +// Hopefully, MSVC will get fixed soon and we'll be able to remove this. +#else +constexpr inline CharGroup_ anyOfChars(const char* chars) { + // Returns a parser that accepts any of the characters in the given string (which should usually + // be a literal). The returned parser is of the same type as returned by `charRange()` -- see + // that function for more info. + + return CharGroup_().orAny(chars); +} +#endif + +// ======================================================================================= + +namespace _ { // private + +struct ArrayToString { + inline String operator()(const Array<char>& arr) const { + return heapString(arr); + } +}; + +} // namespace _ (private) + +template <typename SubParser> +constexpr inline auto charsToString(SubParser&& subParser) + -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) { + // Wraps a parser that returns Array<char> such that it returns String instead. + return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString()); +} + +// ======================================================================================= +// Basic character classes. + +constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z'); +constexpr auto digit = charRange('0', '9'); +constexpr auto alphaNumeric = alpha.orGroup(digit); +constexpr auto nameStart = alpha.orChar('_'); +constexpr auto nameChar = alphaNumeric.orChar('_'); +constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F'); +constexpr auto octDigit = charRange('0', '7'); +constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v"); +constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert(); + +constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v")); + +constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v")))); +// Like discard(whitespace) but avoids some memory allocation. + +// ======================================================================================= +// Identifiers + +namespace _ { // private + +struct IdentifierToString { + inline String operator()(char first, const Array<char>& rest) const { + String result = heapString(rest.size() + 1); + result[0] = first; + memcpy(result.begin() + 1, rest.begin(), rest.size()); + return result; + } +}; + +} // namespace _ (private) + +constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString()); +// Parses an identifier (e.g. a C variable name). + +// ======================================================================================= +// Integers + +namespace _ { // private + +inline char parseDigit(char c) { + if (c < 'A') return c - '0'; + if (c < 'a') return c - 'A' + 10; + return c - 'a' + 10; +} + +template <uint base> +struct ParseInteger { + inline uint64_t operator()(const Array<char>& digits) const { + return operator()('0', digits); + } + uint64_t operator()(char first, const Array<char>& digits) const { + uint64_t result = parseDigit(first); + for (char digit: digits) { + result = result * base + parseDigit(digit); + } + return result; + } +}; + + +} // namespace _ (private) + +constexpr auto integer = sequence( + oneOf( + transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()), + transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()), + transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())), + notLookingAt(alpha.orAny("_."))); + +// ======================================================================================= +// Numbers (i.e. floats) + +namespace _ { // private + +struct ParseFloat { + double operator()(const Array<char>& digits, + const Maybe<Array<char>>& fraction, + const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const; +}; + +} // namespace _ (private) + +constexpr auto number = transform( + sequence( + oneOrMore(digit), + optional(sequence(exactChar<'.'>(), many(digit))), + optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))), + notLookingAt(alpha.orAny("_."))), + _::ParseFloat()); + +// ======================================================================================= +// Quoted strings + +namespace _ { // private + +struct InterpretEscape { + char operator()(char c) const { + switch (c) { + case 'a': return '\a'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'v': return '\v'; + default: return c; + } + } +}; + +struct ParseHexEscape { + inline char operator()(char first, char second) const { + return (parseDigit(first) << 4) | parseDigit(second); + } +}; + +struct ParseHexByte { + inline byte operator()(char first, char second) const { + return (parseDigit(first) << 4) | parseDigit(second); + } +}; + +struct ParseOctEscape { + inline char operator()(char first, Maybe<char> second, Maybe<char> third) const { + char result = first - '0'; + KJ_IF_MAYBE(digit1, second) { + result = (result << 3) | (*digit1 - '0'); + KJ_IF_MAYBE(digit2, third) { + result = (result << 3) | (*digit2 - '0'); + } + } + return result; + } +}; + +} // namespace _ (private) + +constexpr auto escapeSequence = + sequence(exactChar<'\\'>(), oneOf( + transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()), + transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()), + transform(sequence(octDigit, optional(octDigit), optional(octDigit)), + _::ParseOctEscape()))); +// A parser that parses a C-string-style escape sequence (starting with a backslash). Returns +// a char. + +constexpr auto doubleQuotedString = charsToString(sequence( + exactChar<'\"'>(), + many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)), + exactChar<'\"'>())); +// Parses a C-style double-quoted string. + +constexpr auto singleQuotedString = charsToString(sequence( + exactChar<'\''>(), + many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)), + exactChar<'\''>())); +// Parses a C-style single-quoted string. + +constexpr auto doubleQuotedHexBinary = sequence( + exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(), + oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())), + discardWhitespace, + exactChar<'\"'>()); +// Parses a double-quoted hex binary literal. Returns Array<byte>. + +} // namespace parse +} // namespace kj + +#endif // KJ_PARSE_CHAR_H_