comparison osx/include/kj/parse/char.h @ 62:0994c39f1e94

Cap'n Proto v0.6 + build for OSX
author Chris Cannam <cannam@all-day-breakfast.com>
date Mon, 22 May 2017 10:01:37 +0100
parents
children
comparison
equal deleted inserted replaced
61:d101c4099725 62:0994c39f1e94
1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
2 // Licensed under the MIT License:
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 // THE SOFTWARE.
21
22 // This file contains parsers useful for character stream inputs, including parsers to parse
23 // common kinds of tokens like identifiers, numbers, and quoted strings.
24
25 #ifndef KJ_PARSE_CHAR_H_
26 #define KJ_PARSE_CHAR_H_
27
28 #if defined(__GNUC__) && !KJ_HEADER_WARNINGS
29 #pragma GCC system_header
30 #endif
31
32 #include "common.h"
33 #include "../string.h"
34 #include <inttypes.h>
35
36 namespace kj {
37 namespace parse {
38
39 // =======================================================================================
40 // Exact char/string.
41
42 class ExactString_ {
43 public:
44 constexpr inline ExactString_(const char* str): str(str) {}
45
46 template <typename Input>
47 Maybe<Tuple<>> operator()(Input& input) const {
48 const char* ptr = str;
49
50 while (*ptr != '\0') {
51 if (input.atEnd() || input.current() != *ptr) return nullptr;
52 input.next();
53 ++ptr;
54 }
55
56 return Tuple<>();
57 }
58
59 private:
60 const char* str;
61 };
62
63 constexpr inline ExactString_ exactString(const char* str) {
64 return ExactString_(str);
65 }
66
67 template <char c>
68 constexpr ExactlyConst_<char, c> exactChar() {
69 // Returns a parser that matches exactly the character given by the template argument (returning
70 // no result).
71 return ExactlyConst_<char, c>();
72 }
73
74 // =======================================================================================
75 // Char ranges / sets
76
77 class CharGroup_ {
78 public:
79 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
80
81 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
82 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
83 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
84 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
85 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
86 }
87
88 constexpr inline CharGroup_ orAny(const char* chars) const {
89 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
90 }
91
92 constexpr inline CharGroup_ orChar(unsigned char c) const {
93 return CharGroup_(bits[0] | bit(c),
94 bits[1] | bit(c - 64),
95 bits[2] | bit(c - 128),
96 bits[3] | bit(c - 256));
97 }
98
99 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
100 return CharGroup_(bits[0] | other.bits[0],
101 bits[1] | other.bits[1],
102 bits[2] | other.bits[2],
103 bits[3] | other.bits[3]);
104 }
105
106 constexpr inline CharGroup_ invert() const {
107 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
108 }
109
110 constexpr inline bool contains(unsigned char c) const {
111 return (bits[c / 64] & (1ll << (c % 64))) != 0;
112 }
113
114 template <typename Input>
115 Maybe<char> operator()(Input& input) const {
116 if (input.atEnd()) return nullptr;
117 unsigned char c = input.current();
118 if (contains(c)) {
119 input.next();
120 return c;
121 } else {
122 return nullptr;
123 }
124 }
125
126 private:
127 typedef unsigned long long Bits64;
128
129 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
130 Bits64 bits[4];
131
132 static constexpr inline Bits64 oneBits(int count) {
133 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
134 }
135 static constexpr inline Bits64 bit(int index) {
136 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
137 }
138 };
139
140 constexpr inline CharGroup_ charRange(char first, char last) {
141 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
142 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
143 // character matched.
144 //
145 // The returned object has methods which can be used to match more characters. The following
146 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
147 //
148 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
149 //
150 // You can also use `.invert()` to match the opposite set of characters.
151
152 return CharGroup_().orRange(first, last);
153 }
154
155 #if _MSC_VER
156 #define anyOfChars(chars) CharGroup_().orAny(chars)
157 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
158 // building the compiler or schema parser. We don't know why this happens, but Harris found that
159 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
160 // Hopefully, MSVC will get fixed soon and we'll be able to remove this.
161 #else
162 constexpr inline CharGroup_ anyOfChars(const char* chars) {
163 // Returns a parser that accepts any of the characters in the given string (which should usually
164 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
165 // that function for more info.
166
167 return CharGroup_().orAny(chars);
168 }
169 #endif
170
171 // =======================================================================================
172
173 namespace _ { // private
174
175 struct ArrayToString {
176 inline String operator()(const Array<char>& arr) const {
177 return heapString(arr);
178 }
179 };
180
181 } // namespace _ (private)
182
183 template <typename SubParser>
184 constexpr inline auto charsToString(SubParser&& subParser)
185 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
186 // Wraps a parser that returns Array<char> such that it returns String instead.
187 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
188 }
189
190 // =======================================================================================
191 // Basic character classes.
192
193 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
194 constexpr auto digit = charRange('0', '9');
195 constexpr auto alphaNumeric = alpha.orGroup(digit);
196 constexpr auto nameStart = alpha.orChar('_');
197 constexpr auto nameChar = alphaNumeric.orChar('_');
198 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
199 constexpr auto octDigit = charRange('0', '7');
200 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
201 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
202
203 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
204
205 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
206 // Like discard(whitespace) but avoids some memory allocation.
207
208 // =======================================================================================
209 // Identifiers
210
211 namespace _ { // private
212
213 struct IdentifierToString {
214 inline String operator()(char first, const Array<char>& rest) const {
215 String result = heapString(rest.size() + 1);
216 result[0] = first;
217 memcpy(result.begin() + 1, rest.begin(), rest.size());
218 return result;
219 }
220 };
221
222 } // namespace _ (private)
223
224 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
225 // Parses an identifier (e.g. a C variable name).
226
227 // =======================================================================================
228 // Integers
229
230 namespace _ { // private
231
inline char parseDigit(char c) {
  // Converts one digit character to its numeric value: '0'-'9' map to 0-9, and letters map to
  // 10 upward ('a'/'A' -> 10, 'b'/'B' -> 11, ...).  No validation is performed; the caller is
  // expected to pass a character that is already known to be a digit of the relevant base.
  if (c >= 'a') {
    // Lower-case letter range.
    return c - 'a' + 10;
  } else if (c >= 'A') {
    // Upper-case letter range.
    return c - 'A' + 10;
  } else {
    // Everything below 'A' is treated as a decimal digit.
    return c - '0';
  }
}
237
238 template <uint base>
239 struct ParseInteger {
240 inline uint64_t operator()(const Array<char>& digits) const {
241 return operator()('0', digits);
242 }
243 uint64_t operator()(char first, const Array<char>& digits) const {
244 uint64_t result = parseDigit(first);
245 for (char digit: digits) {
246 result = result * base + parseDigit(digit);
247 }
248 return result;
249 }
250 };
251
252
253 } // namespace _ (private)
254
255 constexpr auto integer = sequence(
256 oneOf(
257 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
258 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
259 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
260 notLookingAt(alpha.orAny("_.")));
261
262 // =======================================================================================
263 // Numbers (i.e. floats)
264
265 namespace _ { // private
266
267 struct ParseFloat {
268 double operator()(const Array<char>& digits,
269 const Maybe<Array<char>>& fraction,
270 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
271 };
272
273 } // namespace _ (private)
274
275 constexpr auto number = transform(
276 sequence(
277 oneOrMore(digit),
278 optional(sequence(exactChar<'.'>(), many(digit))),
279 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
280 notLookingAt(alpha.orAny("_."))),
281 _::ParseFloat());
282
283 // =======================================================================================
284 // Quoted strings
285
286 namespace _ { // private
287
struct InterpretEscape {
  // Transform functor mapping the character that follows a backslash to the character the
  // escape denotes.  The letters a, b, f, n, r, t, v map to the corresponding C control
  // characters; any other character stands for itself.
  char operator()(char c) const {
    if (c == 'a') return '\a';
    if (c == 'b') return '\b';
    if (c == 'f') return '\f';
    if (c == 'n') return '\n';
    if (c == 'r') return '\r';
    if (c == 't') return '\t';
    if (c == 'v') return '\v';
    // Not a letter escape: the character is taken literally.
    return c;
  }
};
302
303 struct ParseHexEscape {
304 inline char operator()(char first, char second) const {
305 return (parseDigit(first) << 4) | parseDigit(second);
306 }
307 };
308
309 struct ParseHexByte {
310 inline byte operator()(char first, char second) const {
311 return (parseDigit(first) << 4) | parseDigit(second);
312 }
313 };
314
315 struct ParseOctEscape {
316 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
317 char result = first - '0';
318 KJ_IF_MAYBE(digit1, second) {
319 result = (result << 3) | (*digit1 - '0');
320 KJ_IF_MAYBE(digit2, third) {
321 result = (result << 3) | (*digit2 - '0');
322 }
323 }
324 return result;
325 }
326 };
327
328 } // namespace _ (private)
329
330 constexpr auto escapeSequence =
331 sequence(exactChar<'\\'>(), oneOf(
332 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
333 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
334 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
335 _::ParseOctEscape())));
336 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
337 // a char.
338
339 constexpr auto doubleQuotedString = charsToString(sequence(
340 exactChar<'\"'>(),
341 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
342 exactChar<'\"'>()));
343 // Parses a C-style double-quoted string.
344
345 constexpr auto singleQuotedString = charsToString(sequence(
346 exactChar<'\''>(),
347 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
348 exactChar<'\''>()));
349 // Parses a C-style single-quoted string.
350
351 constexpr auto doubleQuotedHexBinary = sequence(
352 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
353 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
354 discardWhitespace,
355 exactChar<'\"'>());
356 // Parses a double-quoted hex binary literal. Returns Array<byte>.
357
358 } // namespace parse
359 } // namespace kj
360
361 #endif // KJ_PARSE_CHAR_H_