Mercurial > hg > sv-dependency-builds
comparison osx/include/kj/parse/char.h @ 62:0994c39f1e94
Cap'n Proto v0.6 + build for OSX
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Mon, 22 May 2017 10:01:37 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
61:d101c4099725 | 62:0994c39f1e94 |
---|---|
1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors | |
2 // Licensed under the MIT License: | |
3 // | |
4 // Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 // of this software and associated documentation files (the "Software"), to deal | |
6 // in the Software without restriction, including without limitation the rights | |
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 // copies of the Software, and to permit persons to whom the Software is | |
9 // furnished to do so, subject to the following conditions: | |
10 // | |
11 // The above copyright notice and this permission notice shall be included in | |
12 // all copies or substantial portions of the Software. | |
13 // | |
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
20 // THE SOFTWARE. | |
21 | |
22 // This file contains parsers useful for character stream inputs, including parsers to parse | |
23 // common kinds of tokens like identifiers, numbers, and quoted strings. | |
24 | |
25 #ifndef KJ_PARSE_CHAR_H_ | |
26 #define KJ_PARSE_CHAR_H_ | |
27 | |
28 #if defined(__GNUC__) && !KJ_HEADER_WARNINGS | |
29 #pragma GCC system_header | |
30 #endif | |
31 | |
32 #include "common.h" | |
33 #include "../string.h" | |
34 #include <inttypes.h> | |
35 | |
36 namespace kj { | |
37 namespace parse { | |
38 | |
39 // ======================================================================================= | |
40 // Exact char/string. | |
41 | |
42 class ExactString_ { | |
43 public: | |
44 constexpr inline ExactString_(const char* str): str(str) {} | |
45 | |
46 template <typename Input> | |
47 Maybe<Tuple<>> operator()(Input& input) const { | |
48 const char* ptr = str; | |
49 | |
50 while (*ptr != '\0') { | |
51 if (input.atEnd() || input.current() != *ptr) return nullptr; | |
52 input.next(); | |
53 ++ptr; | |
54 } | |
55 | |
56 return Tuple<>(); | |
57 } | |
58 | |
59 private: | |
60 const char* str; | |
61 }; | |
62 | |
63 constexpr inline ExactString_ exactString(const char* str) { | |
64 return ExactString_(str); | |
65 } | |
66 | |
67 template <char c> | |
68 constexpr ExactlyConst_<char, c> exactChar() { | |
69 // Returns a parser that matches exactly the character given by the template argument (returning | |
70 // no result). | |
71 return ExactlyConst_<char, c>(); | |
72 } | |
73 | |
74 // ======================================================================================= | |
75 // Char ranges / sets | |
76 | |
77 class CharGroup_ { | |
78 public: | |
79 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} | |
80 | |
81 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { | |
82 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), | |
83 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), | |
84 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), | |
85 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); | |
86 } | |
87 | |
88 constexpr inline CharGroup_ orAny(const char* chars) const { | |
89 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1); | |
90 } | |
91 | |
92 constexpr inline CharGroup_ orChar(unsigned char c) const { | |
93 return CharGroup_(bits[0] | bit(c), | |
94 bits[1] | bit(c - 64), | |
95 bits[2] | bit(c - 128), | |
96 bits[3] | bit(c - 256)); | |
97 } | |
98 | |
99 constexpr inline CharGroup_ orGroup(CharGroup_ other) const { | |
100 return CharGroup_(bits[0] | other.bits[0], | |
101 bits[1] | other.bits[1], | |
102 bits[2] | other.bits[2], | |
103 bits[3] | other.bits[3]); | |
104 } | |
105 | |
106 constexpr inline CharGroup_ invert() const { | |
107 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); | |
108 } | |
109 | |
110 constexpr inline bool contains(unsigned char c) const { | |
111 return (bits[c / 64] & (1ll << (c % 64))) != 0; | |
112 } | |
113 | |
114 template <typename Input> | |
115 Maybe<char> operator()(Input& input) const { | |
116 if (input.atEnd()) return nullptr; | |
117 unsigned char c = input.current(); | |
118 if (contains(c)) { | |
119 input.next(); | |
120 return c; | |
121 } else { | |
122 return nullptr; | |
123 } | |
124 } | |
125 | |
126 private: | |
127 typedef unsigned long long Bits64; | |
128 | |
129 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} | |
130 Bits64 bits[4]; | |
131 | |
132 static constexpr inline Bits64 oneBits(int count) { | |
133 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); | |
134 } | |
135 static constexpr inline Bits64 bit(int index) { | |
136 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index); | |
137 } | |
138 }; | |
139 | |
140 constexpr inline CharGroup_ charRange(char first, char last) { | |
141 // Create a parser which accepts any character in the range from `first` to `last`, inclusive. | |
142 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the | |
143 // character matched. | |
144 // | |
145 // The returned object has methods which can be used to match more characters. The following | |
146 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. | |
147 // | |
148 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") | |
149 // | |
150 // You can also use `.invert()` to match the opposite set of characters. | |
151 | |
152 return CharGroup_().orRange(first, last); | |
153 } | |
154 | |
155 #if _MSC_VER | |
156 #define anyOfChars(chars) CharGroup_().orAny(chars) | |
157 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from | |
158 // building the compiler or schema parser. We don't know why this happens, but Harris found that | |
159 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing. | |
160 // Hopefully, MSVC will get fixed soon and we'll be able to remove this. | |
161 #else | |
162 constexpr inline CharGroup_ anyOfChars(const char* chars) { | |
163 // Returns a parser that accepts any of the characters in the given string (which should usually | |
164 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see | |
165 // that function for more info. | |
166 | |
167 return CharGroup_().orAny(chars); | |
168 } | |
169 #endif | |
170 | |
171 // ======================================================================================= | |
172 | |
173 namespace _ { // private | |
174 | |
175 struct ArrayToString { | |
176 inline String operator()(const Array<char>& arr) const { | |
177 return heapString(arr); | |
178 } | |
179 }; | |
180 | |
181 } // namespace _ (private) | |
182 | |
183 template <typename SubParser> | |
184 constexpr inline auto charsToString(SubParser&& subParser) | |
185 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) { | |
186 // Wraps a parser that returns Array<char> such that it returns String instead. | |
187 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString()); | |
188 } | |
189 | |
190 // ======================================================================================= | |
191 // Basic character classes. | |
192 | |
193 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z'); | |
194 constexpr auto digit = charRange('0', '9'); | |
195 constexpr auto alphaNumeric = alpha.orGroup(digit); | |
196 constexpr auto nameStart = alpha.orChar('_'); | |
197 constexpr auto nameChar = alphaNumeric.orChar('_'); | |
198 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F'); | |
199 constexpr auto octDigit = charRange('0', '7'); | |
200 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v"); | |
201 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert(); | |
202 | |
203 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v")); | |
204 | |
205 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v")))); | |
206 // Like discard(whitespace) but avoids some memory allocation. | |
207 | |
208 // ======================================================================================= | |
209 // Identifiers | |
210 | |
211 namespace _ { // private | |
212 | |
213 struct IdentifierToString { | |
214 inline String operator()(char first, const Array<char>& rest) const { | |
215 String result = heapString(rest.size() + 1); | |
216 result[0] = first; | |
217 memcpy(result.begin() + 1, rest.begin(), rest.size()); | |
218 return result; | |
219 } | |
220 }; | |
221 | |
222 } // namespace _ (private) | |
223 | |
224 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString()); | |
225 // Parses an identifier (e.g. a C variable name). | |
226 | |
227 // ======================================================================================= | |
228 // Integers | |
229 | |
230 namespace _ { // private | |
231 | |
inline char parseDigit(char c) {
  // Maps a digit character to its numeric value: '0'-'9' give 0-9, while 'a'-'f' and 'A'-'F'
  // give 10-15. No validation is done; callers must pass a character already known to be a digit
  // of the relevant base.
  if (c >= 'a') return c - 'a' + 10;
  if (c >= 'A') return c - 'A' + 10;
  return c - '0';
}
237 | |
238 template <uint base> | |
239 struct ParseInteger { | |
240 inline uint64_t operator()(const Array<char>& digits) const { | |
241 return operator()('0', digits); | |
242 } | |
243 uint64_t operator()(char first, const Array<char>& digits) const { | |
244 uint64_t result = parseDigit(first); | |
245 for (char digit: digits) { | |
246 result = result * base + parseDigit(digit); | |
247 } | |
248 return result; | |
249 } | |
250 }; | |
251 | |
252 | |
253 } // namespace _ (private) | |
254 | |
255 constexpr auto integer = sequence( | |
256 oneOf( | |
257 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()), | |
258 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()), | |
259 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())), | |
260 notLookingAt(alpha.orAny("_."))); | |
261 | |
262 // ======================================================================================= | |
263 // Numbers (i.e. floats) | |
264 | |
265 namespace _ { // private | |
266 | |
267 struct ParseFloat { | |
268 double operator()(const Array<char>& digits, | |
269 const Maybe<Array<char>>& fraction, | |
270 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const; | |
271 }; | |
272 | |
273 } // namespace _ (private) | |
274 | |
275 constexpr auto number = transform( | |
276 sequence( | |
277 oneOrMore(digit), | |
278 optional(sequence(exactChar<'.'>(), many(digit))), | |
279 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))), | |
280 notLookingAt(alpha.orAny("_."))), | |
281 _::ParseFloat()); | |
282 | |
283 // ======================================================================================= | |
284 // Quoted strings | |
285 | |
286 namespace _ { // private | |
287 | |
struct InterpretEscape {
  // Maps the letter of a single-character escape (the character following the backslash) to the
  // character it denotes. Anything other than a, b, f, n, r, t, v stands for itself, which covers
  // the quote characters, backslash and '?'.
  char operator()(char c) const {
    return c == 'a' ? '\a'
         : c == 'b' ? '\b'
         : c == 'f' ? '\f'
         : c == 'n' ? '\n'
         : c == 'r' ? '\r'
         : c == 't' ? '\t'
         : c == 'v' ? '\v'
         : c;
  }
};
302 | |
303 struct ParseHexEscape { | |
304 inline char operator()(char first, char second) const { | |
305 return (parseDigit(first) << 4) | parseDigit(second); | |
306 } | |
307 }; | |
308 | |
309 struct ParseHexByte { | |
310 inline byte operator()(char first, char second) const { | |
311 return (parseDigit(first) << 4) | parseDigit(second); | |
312 } | |
313 }; | |
314 | |
315 struct ParseOctEscape { | |
316 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const { | |
317 char result = first - '0'; | |
318 KJ_IF_MAYBE(digit1, second) { | |
319 result = (result << 3) | (*digit1 - '0'); | |
320 KJ_IF_MAYBE(digit2, third) { | |
321 result = (result << 3) | (*digit2 - '0'); | |
322 } | |
323 } | |
324 return result; | |
325 } | |
326 }; | |
327 | |
328 } // namespace _ (private) | |
329 | |
330 constexpr auto escapeSequence = | |
331 sequence(exactChar<'\\'>(), oneOf( | |
332 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()), | |
333 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()), | |
334 transform(sequence(octDigit, optional(octDigit), optional(octDigit)), | |
335 _::ParseOctEscape()))); | |
336 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns | |
337 // a char. | |
338 | |
339 constexpr auto doubleQuotedString = charsToString(sequence( | |
340 exactChar<'\"'>(), | |
341 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)), | |
342 exactChar<'\"'>())); | |
343 // Parses a C-style double-quoted string. | |
344 | |
345 constexpr auto singleQuotedString = charsToString(sequence( | |
346 exactChar<'\''>(), | |
347 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)), | |
348 exactChar<'\''>())); | |
349 // Parses a C-style single-quoted string. | |
350 | |
351 constexpr auto doubleQuotedHexBinary = sequence( | |
352 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(), | |
353 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())), | |
354 discardWhitespace, | |
355 exactChar<'\"'>()); | |
356 // Parses a double-quoted hex binary literal. Returns Array<byte>. | |
357 | |
358 } // namespace parse | |
359 } // namespace kj | |
360 | |
361 #endif // KJ_PARSE_CHAR_H_ |