cannam@226
|
1 /*
|
cannam@226
|
2 Copyright 2011-2017 David Robillard <http://drobilla.net>
|
cannam@226
|
3
|
cannam@226
|
4 Permission to use, copy, modify, and/or distribute this software for any
|
cannam@226
|
5 purpose with or without fee is hereby granted, provided that the above
|
cannam@226
|
6 copyright notice and this permission notice appear in all copies.
|
cannam@226
|
7
|
cannam@226
|
8 THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
cannam@226
|
9 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
cannam@226
|
10 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
cannam@226
|
11 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
cannam@226
|
12 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
cannam@226
|
13 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
cannam@226
|
14 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
cannam@226
|
15 */
|
cannam@226
|
16
|
cannam@226
|
17 #include "serd_internal.h"
|
cannam@226
|
18
|
cannam@226
|
19 #include <assert.h>
|
cannam@226
|
20 #include <ctype.h>
|
cannam@226
|
21 #include <errno.h>
|
cannam@226
|
22 #include <stdarg.h>
|
cannam@226
|
23 #include <stdint.h>
|
cannam@226
|
24 #include <stdio.h>
|
cannam@226
|
25 #include <stdlib.h>
|
cannam@226
|
26 #include <string.h>
|
cannam@226
|
27
|
cannam@226
|
/* Namespace IRI prefixes for XML Schema datatypes and core RDF terms. */
#define NS_XSD "http://www.w3.org/2001/XMLSchema#"
#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

/* Jump to the enclosing function's `except' label if `exp' is false.
   The expansion already ends in a semicolon. */
#define TRY_THROW(exp) if (!(exp)) goto except;

/* Return 0 from the enclosing function if `exp' is false.
   The expansion already ends in a semicolon. */
#define TRY_RET(exp) if (!(exp)) return 0;

/* With SERD_STACK_CHECK defined, assert that `ref' is the most recently
   recorded stack push; otherwise expand to nothing. */
#ifdef SERD_STACK_CHECK
#    define SERD_STACK_ASSERT_TOP(reader, ref) \
        assert(ref == reader->allocs[reader->n_allocs - 1]);
#else
#    define SERD_STACK_ASSERT_TOP(reader, ref)
#endif
|
cannam@226
|
40
|
cannam@226
|
41 typedef struct {
|
cannam@226
|
42 const uint8_t* filename;
|
cannam@226
|
43 unsigned line;
|
cannam@226
|
44 unsigned col;
|
cannam@226
|
45 } Cursor;
|
cannam@226
|
46
|
cannam@226
|
47 typedef uint32_t uchar;
|
cannam@226
|
48
|
cannam@226
|
49 /* Reference to a node in the stack (we can not use pointers since the
|
cannam@226
|
50 stack may be reallocated, invalidating any pointers to elements).
|
cannam@226
|
51 */
|
cannam@226
|
52 typedef size_t Ref;
|
cannam@226
|
53
|
cannam@226
|
54 typedef struct {
|
cannam@226
|
55 Ref graph;
|
cannam@226
|
56 Ref subject;
|
cannam@226
|
57 Ref predicate;
|
cannam@226
|
58 Ref object;
|
cannam@226
|
59 Ref datatype;
|
cannam@226
|
60 Ref lang;
|
cannam@226
|
61 SerdStatementFlags* flags;
|
cannam@226
|
62 } ReadContext;
|
cannam@226
|
63
|
cannam@226
|
64 struct SerdReaderImpl {
|
cannam@226
|
65 void* handle;
|
cannam@226
|
66 void (*free_handle)(void* ptr);
|
cannam@226
|
67 SerdBaseSink base_sink;
|
cannam@226
|
68 SerdPrefixSink prefix_sink;
|
cannam@226
|
69 SerdStatementSink statement_sink;
|
cannam@226
|
70 SerdEndSink end_sink;
|
cannam@226
|
71 SerdErrorSink error_sink;
|
cannam@226
|
72 void* error_handle;
|
cannam@226
|
73 Ref rdf_first;
|
cannam@226
|
74 Ref rdf_rest;
|
cannam@226
|
75 Ref rdf_nil;
|
cannam@226
|
76 SerdNode default_graph;
|
cannam@226
|
77 SerdByteSource source;
|
cannam@226
|
78 SerdStack stack;
|
cannam@226
|
79 SerdSyntax syntax;
|
cannam@226
|
80 unsigned next_id;
|
cannam@226
|
81 Cursor cur;
|
cannam@226
|
82 SerdStatus status;
|
cannam@226
|
83 uint8_t* buf;
|
cannam@226
|
84 uint8_t* bprefix;
|
cannam@226
|
85 size_t bprefix_len;
|
cannam@226
|
86 bool strict; ///< True iff strict parsing
|
cannam@226
|
87 bool eof;
|
cannam@226
|
88 bool seen_genid;
|
cannam@226
|
89 #ifdef SERD_STACK_CHECK
|
cannam@226
|
90 Ref* allocs; ///< Stack of push offsets
|
cannam@226
|
91 size_t n_allocs; ///< Number of stack pushes
|
cannam@226
|
92 #endif
|
cannam@226
|
93 };
|
cannam@226
|
94
|
cannam@226
|
95 static inline bool
|
cannam@226
|
96 supports_fancy_literals(const SerdReader* reader)
|
cannam@226
|
97 {
|
cannam@226
|
98 return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
|
cannam@226
|
99 }
|
cannam@226
|
100
|
cannam@226
|
101 static inline bool
|
cannam@226
|
102 supports_relative_iris(const SerdReader* reader)
|
cannam@226
|
103 {
|
cannam@226
|
104 return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
|
cannam@226
|
105 }
|
cannam@226
|
106
|
cannam@226
|
107 static int
|
cannam@226
|
108 r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...)
|
cannam@226
|
109 {
|
cannam@226
|
110 va_list args;
|
cannam@226
|
111 va_start(args, fmt);
|
cannam@226
|
112 const SerdError e = {
|
cannam@226
|
113 st, reader->cur.filename, reader->cur.line, reader->cur.col, fmt, &args
|
cannam@226
|
114 };
|
cannam@226
|
115 serd_error(reader->error_sink, reader->error_handle, &e);
|
cannam@226
|
116 va_end(args);
|
cannam@226
|
117 return 0;
|
cannam@226
|
118 }
|
cannam@226
|
119
|
cannam@226
|
/**
   fread-like wrapper for getc (which is faster).

   Reads a single byte from `stream` (a FILE*) into `buf`.  The `size` and
   `nmemb` arguments are ignored.

   @return 1 if a byte was read, or 0 at EOF/error, in which case a zero
   byte is stored in `buf`.
*/
static size_t
serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
{
    (void)size;
    (void)nmemb;

    uint8_t* const out = (uint8_t*)buf;
    const int      ch  = getc((FILE*)stream);
    const bool     got = (ch != EOF);

    *out = got ? (uint8_t)ch : 0;
    return got ? 1 : 0;
}
|
cannam@226
|
132
|
cannam@226
|
133 static inline uint8_t
|
cannam@226
|
134 peek_byte(SerdReader* reader)
|
cannam@226
|
135 {
|
cannam@226
|
136 return serd_byte_source_peek(&reader->source);
|
cannam@226
|
137 }
|
cannam@226
|
138
|
cannam@226
|
139 static inline uint8_t
|
cannam@226
|
140 eat_byte_safe(SerdReader* reader, const uint8_t byte)
|
cannam@226
|
141 {
|
cannam@226
|
142 assert(peek_byte(reader) == byte);
|
cannam@226
|
143 switch (byte) {
|
cannam@226
|
144 case '\0': reader->eof = (byte != '\0'); break;
|
cannam@226
|
145 case '\n': ++reader->cur.line; reader->cur.col = 0; break;
|
cannam@226
|
146 default: ++reader->cur.col;
|
cannam@226
|
147 }
|
cannam@226
|
148
|
cannam@226
|
149 reader->status = serd_byte_source_advance(&reader->source);
|
cannam@226
|
150 return byte;
|
cannam@226
|
151 }
|
cannam@226
|
152
|
cannam@226
|
153 static inline uint8_t
|
cannam@226
|
154 eat_byte_check(SerdReader* reader, const uint8_t byte)
|
cannam@226
|
155 {
|
cannam@226
|
156 const uint8_t c = peek_byte(reader);
|
cannam@226
|
157 if (c != byte) {
|
cannam@226
|
158 return r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
159 "expected `%c', not `%c'\n", byte, c);
|
cannam@226
|
160 }
|
cannam@226
|
161 return eat_byte_safe(reader, byte);
|
cannam@226
|
162 }
|
cannam@226
|
163
|
cannam@226
|
164 static inline bool
|
cannam@226
|
165 eat_string(SerdReader* reader, const char* str, unsigned n)
|
cannam@226
|
166 {
|
cannam@226
|
167 bool bad = false;
|
cannam@226
|
168 for (unsigned i = 0; i < n; ++i) {
|
cannam@226
|
169 bad |= eat_byte_check(reader, ((const uint8_t*)str)[i]);
|
cannam@226
|
170 }
|
cannam@226
|
171 return bad;
|
cannam@226
|
172 }
|
cannam@226
|
173
|
cannam@226
|
174 static Ref
|
cannam@226
|
175 push_node_padded(SerdReader* reader, size_t maxlen,
|
cannam@226
|
176 SerdType type, const char* str, size_t n_bytes)
|
cannam@226
|
177 {
|
cannam@226
|
178 void* mem = serd_stack_push_aligned(
|
cannam@226
|
179 &reader->stack, sizeof(SerdNode) + maxlen + 1, sizeof(SerdNode));
|
cannam@226
|
180
|
cannam@226
|
181 SerdNode* const node = (SerdNode*)mem;
|
cannam@226
|
182 node->n_bytes = node->n_chars = n_bytes;
|
cannam@226
|
183 node->flags = 0;
|
cannam@226
|
184 node->type = type;
|
cannam@226
|
185 node->buf = NULL;
|
cannam@226
|
186
|
cannam@226
|
187 uint8_t* buf = (uint8_t*)(node + 1);
|
cannam@226
|
188 memcpy(buf, str, n_bytes + 1);
|
cannam@226
|
189
|
cannam@226
|
190 #ifdef SERD_STACK_CHECK
|
cannam@226
|
191 reader->allocs = realloc(
|
cannam@226
|
192 reader->allocs, sizeof(uint8_t*) * (++reader->n_allocs));
|
cannam@226
|
193 reader->allocs[reader->n_allocs - 1] = ((uint8_t*)mem - reader->stack.buf);
|
cannam@226
|
194 #endif
|
cannam@226
|
195 return (uint8_t*)node - reader->stack.buf;
|
cannam@226
|
196 }
|
cannam@226
|
197
|
cannam@226
|
198 static Ref
|
cannam@226
|
199 push_node(SerdReader* reader, SerdType type, const char* str, size_t n_bytes)
|
cannam@226
|
200 {
|
cannam@226
|
201 return push_node_padded(reader, n_bytes, type, str, n_bytes);
|
cannam@226
|
202 }
|
cannam@226
|
203
|
cannam@226
|
204 static inline SerdNode*
|
cannam@226
|
205 deref(SerdReader* reader, const Ref ref)
|
cannam@226
|
206 {
|
cannam@226
|
207 if (ref) {
|
cannam@226
|
208 SerdNode* node = (SerdNode*)(reader->stack.buf + ref);
|
cannam@226
|
209 node->buf = (uint8_t*)node + sizeof(SerdNode);
|
cannam@226
|
210 return node;
|
cannam@226
|
211 }
|
cannam@226
|
212 return NULL;
|
cannam@226
|
213 }
|
cannam@226
|
214
|
cannam@226
|
215 static inline void
|
cannam@226
|
216 push_byte(SerdReader* reader, Ref ref, const uint8_t c)
|
cannam@226
|
217 {
|
cannam@226
|
218 SERD_STACK_ASSERT_TOP(reader, ref);
|
cannam@226
|
219 uint8_t* const s = serd_stack_push(&reader->stack, 1);
|
cannam@226
|
220 SerdNode* const node = (SerdNode*)(reader->stack.buf + ref);
|
cannam@226
|
221 ++node->n_bytes;
|
cannam@226
|
222 if (!(c & 0x80)) { // Starts with 0 bit, start of new character
|
cannam@226
|
223 ++node->n_chars;
|
cannam@226
|
224 }
|
cannam@226
|
225 *(s - 1) = c;
|
cannam@226
|
226 *s = '\0';
|
cannam@226
|
227 }
|
cannam@226
|
228
|
cannam@226
|
229 static inline void
|
cannam@226
|
230 push_replacement(SerdReader* reader, Ref dest)
|
cannam@226
|
231 {
|
cannam@226
|
232 push_byte(reader, dest, 0xEF);
|
cannam@226
|
233 push_byte(reader, dest, 0xBF);
|
cannam@226
|
234 push_byte(reader, dest, 0xBD);
|
cannam@226
|
235 }
|
cannam@226
|
236
|
cannam@226
|
237 static Ref
|
cannam@226
|
238 pop_node(SerdReader* reader, Ref ref)
|
cannam@226
|
239 {
|
cannam@226
|
240 if (ref && ref != reader->rdf_first && ref != reader->rdf_rest
|
cannam@226
|
241 && ref != reader->rdf_nil) {
|
cannam@226
|
242 #ifdef SERD_STACK_CHECK
|
cannam@226
|
243 SERD_STACK_ASSERT_TOP(reader, ref);
|
cannam@226
|
244 --reader->n_allocs;
|
cannam@226
|
245 #endif
|
cannam@226
|
246 SerdNode* const node = deref(reader, ref);
|
cannam@226
|
247 uint8_t* const top = reader->stack.buf + reader->stack.size;
|
cannam@226
|
248 serd_stack_pop_aligned(&reader->stack, top - (uint8_t*)node);
|
cannam@226
|
249 }
|
cannam@226
|
250 return 0;
|
cannam@226
|
251 }
|
cannam@226
|
252
|
cannam@226
|
253 static inline bool
|
cannam@226
|
254 emit_statement(SerdReader* reader, ReadContext ctx, Ref o, Ref d, Ref l)
|
cannam@226
|
255 {
|
cannam@226
|
256 SerdNode* graph = deref(reader, ctx.graph);
|
cannam@226
|
257 if (!graph && reader->default_graph.buf) {
|
cannam@226
|
258 graph = &reader->default_graph;
|
cannam@226
|
259 }
|
cannam@226
|
260 bool ret = !reader->statement_sink ||
|
cannam@226
|
261 !reader->statement_sink(
|
cannam@226
|
262 reader->handle, *ctx.flags, graph,
|
cannam@226
|
263 deref(reader, ctx.subject), deref(reader, ctx.predicate),
|
cannam@226
|
264 deref(reader, o), deref(reader, d), deref(reader, l));
|
cannam@226
|
265 *ctx.flags &= SERD_ANON_CONT|SERD_LIST_CONT; // Preserve only cont flags
|
cannam@226
|
266 return ret;
|
cannam@226
|
267 }
|
cannam@226
|
268
|
cannam@226
|
269 static bool
|
cannam@226
|
270 read_collection(SerdReader* reader, ReadContext ctx, Ref* dest);
|
cannam@226
|
271
|
cannam@226
|
272 static bool
|
cannam@226
|
273 read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot);
|
cannam@226
|
274
|
cannam@226
|
275 static inline uint8_t
|
cannam@226
|
276 read_HEX(SerdReader* reader)
|
cannam@226
|
277 {
|
cannam@226
|
278 const uint8_t c = peek_byte(reader);
|
cannam@226
|
279 if (is_digit(c) || in_range(c, 'A', 'F') || in_range(c, 'a', 'f')) {
|
cannam@226
|
280 return eat_byte_safe(reader, c);
|
cannam@226
|
281 } else {
|
cannam@226
|
282 return r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
283 "invalid hexadecimal digit `%c'\n", c);
|
cannam@226
|
284 }
|
cannam@226
|
285 }
|
cannam@226
|
286
|
cannam@226
|
287 // Read UCHAR escape, initial \ is already eaten by caller
|
cannam@226
|
288 static inline bool
|
cannam@226
|
289 read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code)
|
cannam@226
|
290 {
|
cannam@226
|
291 const uint8_t b = peek_byte(reader);
|
cannam@226
|
292 unsigned length = 0;
|
cannam@226
|
293 switch (b) {
|
cannam@226
|
294 case 'U':
|
cannam@226
|
295 length = 8;
|
cannam@226
|
296 break;
|
cannam@226
|
297 case 'u':
|
cannam@226
|
298 length = 4;
|
cannam@226
|
299 break;
|
cannam@226
|
300 default:
|
cannam@226
|
301 return false;
|
cannam@226
|
302 }
|
cannam@226
|
303 eat_byte_safe(reader, b);
|
cannam@226
|
304
|
cannam@226
|
305 uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
cannam@226
|
306 for (unsigned i = 0; i < length; ++i) {
|
cannam@226
|
307 if (!(buf[i] = read_HEX(reader))) {
|
cannam@226
|
308 return false;
|
cannam@226
|
309 }
|
cannam@226
|
310 }
|
cannam@226
|
311
|
cannam@226
|
312 uint32_t code;
|
cannam@226
|
313 sscanf((const char*)buf, "%X", &code);
|
cannam@226
|
314
|
cannam@226
|
315 unsigned size = 0;
|
cannam@226
|
316 if (code < 0x00000080) {
|
cannam@226
|
317 size = 1;
|
cannam@226
|
318 } else if (code < 0x00000800) {
|
cannam@226
|
319 size = 2;
|
cannam@226
|
320 } else if (code < 0x00010000) {
|
cannam@226
|
321 size = 3;
|
cannam@226
|
322 } else if (code < 0x00110000) {
|
cannam@226
|
323 size = 4;
|
cannam@226
|
324 } else {
|
cannam@226
|
325 r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
326 "unicode character 0x%X out of range\n", code);
|
cannam@226
|
327 push_replacement(reader, dest);
|
cannam@226
|
328 *char_code = 0xFFFD;
|
cannam@226
|
329 return true;
|
cannam@226
|
330 }
|
cannam@226
|
331
|
cannam@226
|
332 // Build output in buf
|
cannam@226
|
333 // (Note # of bytes = # of leading 1 bits in first byte)
|
cannam@226
|
334 uint32_t c = code;
|
cannam@226
|
335 switch (size) {
|
cannam@226
|
336 case 4:
|
cannam@226
|
337 buf[3] = 0x80 | (uint8_t)(c & 0x3F);
|
cannam@226
|
338 c >>= 6;
|
cannam@226
|
339 c |= (16 << 12); // set bit 4
|
cannam@226
|
340 case 3:
|
cannam@226
|
341 buf[2] = 0x80 | (uint8_t)(c & 0x3F);
|
cannam@226
|
342 c >>= 6;
|
cannam@226
|
343 c |= (32 << 6); // set bit 5
|
cannam@226
|
344 case 2:
|
cannam@226
|
345 buf[1] = 0x80 | (uint8_t)(c & 0x3F);
|
cannam@226
|
346 c >>= 6;
|
cannam@226
|
347 c |= 0xC0; // set bits 6 and 7
|
cannam@226
|
348 case 1:
|
cannam@226
|
349 buf[0] = (uint8_t)c;
|
cannam@226
|
350 }
|
cannam@226
|
351
|
cannam@226
|
352 for (unsigned i = 0; i < size; ++i) {
|
cannam@226
|
353 push_byte(reader, dest, buf[i]);
|
cannam@226
|
354 }
|
cannam@226
|
355 *char_code = code;
|
cannam@226
|
356 return true;
|
cannam@226
|
357 }
|
cannam@226
|
358
|
cannam@226
|
359 // Read ECHAR escape, initial \ is already eaten by caller
|
cannam@226
|
360 static inline bool
|
cannam@226
|
361 read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
|
cannam@226
|
362 {
|
cannam@226
|
363 const uint8_t c = peek_byte(reader);
|
cannam@226
|
364 switch (c) {
|
cannam@226
|
365 case 't':
|
cannam@226
|
366 eat_byte_safe(reader, 't');
|
cannam@226
|
367 push_byte(reader, dest, '\t');
|
cannam@226
|
368 return true;
|
cannam@226
|
369 case 'b':
|
cannam@226
|
370 eat_byte_safe(reader, 'b');
|
cannam@226
|
371 push_byte(reader, dest, '\b');
|
cannam@226
|
372 return true;
|
cannam@226
|
373 case 'n':
|
cannam@226
|
374 *flags |= SERD_HAS_NEWLINE;
|
cannam@226
|
375 eat_byte_safe(reader, 'n');
|
cannam@226
|
376 push_byte(reader, dest, '\n');
|
cannam@226
|
377 return true;
|
cannam@226
|
378 case 'r':
|
cannam@226
|
379 *flags |= SERD_HAS_NEWLINE;
|
cannam@226
|
380 eat_byte_safe(reader, 'r');
|
cannam@226
|
381 push_byte(reader, dest, '\r');
|
cannam@226
|
382 return true;
|
cannam@226
|
383 case 'f':
|
cannam@226
|
384 eat_byte_safe(reader, 'f');
|
cannam@226
|
385 push_byte(reader, dest, '\f');
|
cannam@226
|
386 return true;
|
cannam@226
|
387 case '\\': case '"': case '\'':
|
cannam@226
|
388 push_byte(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
389 return true;
|
cannam@226
|
390 default:
|
cannam@226
|
391 return false;
|
cannam@226
|
392 }
|
cannam@226
|
393 }
|
cannam@226
|
394
|
cannam@226
|
395 static inline SerdStatus
|
cannam@226
|
396 bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c)
|
cannam@226
|
397 {
|
cannam@226
|
398 r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c);
|
cannam@226
|
399 push_replacement(reader, dest);
|
cannam@226
|
400
|
cannam@226
|
401 // Skip bytes until the next start byte
|
cannam@226
|
402 for (uint8_t b = peek_byte(reader); (b & 0x80);) {
|
cannam@226
|
403 eat_byte_safe(reader, b);
|
cannam@226
|
404 b = peek_byte(reader);
|
cannam@226
|
405 }
|
cannam@226
|
406
|
cannam@226
|
407 return SERD_SUCCESS;
|
cannam@226
|
408 }
|
cannam@226
|
409
|
cannam@226
|
410 static SerdStatus
|
cannam@226
|
411 read_utf8_character(SerdReader* reader, Ref dest, uint8_t c)
|
cannam@226
|
412 {
|
cannam@226
|
413 unsigned size = 1;
|
cannam@226
|
414 if ((c & 0xE0) == 0xC0) { // Starts with `110'
|
cannam@226
|
415 size = 2;
|
cannam@226
|
416 } else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
|
cannam@226
|
417 size = 3;
|
cannam@226
|
418 } else if ((c & 0xF8) == 0xF0) { // Starts with `11110'
|
cannam@226
|
419 size = 4;
|
cannam@226
|
420 } else {
|
cannam@226
|
421 return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c);
|
cannam@226
|
422 }
|
cannam@226
|
423
|
cannam@226
|
424 char bytes[4];
|
cannam@226
|
425 bytes[0] = c;
|
cannam@226
|
426
|
cannam@226
|
427 // Check character validity
|
cannam@226
|
428 for (unsigned i = 1; i < size; ++i) {
|
cannam@226
|
429 if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) {
|
cannam@226
|
430 return bad_char(reader, dest, "invalid UTF-8 continuation 0x%X\n",
|
cannam@226
|
431 bytes[i]);
|
cannam@226
|
432 }
|
cannam@226
|
433 eat_byte_safe(reader, bytes[i]);
|
cannam@226
|
434 }
|
cannam@226
|
435
|
cannam@226
|
436 // Emit character
|
cannam@226
|
437 for (unsigned i = 0; i < size; ++i) {
|
cannam@226
|
438 push_byte(reader, dest, bytes[i]);
|
cannam@226
|
439 }
|
cannam@226
|
440 return SERD_SUCCESS;
|
cannam@226
|
441 }
|
cannam@226
|
442
|
cannam@226
|
443 // Read one character (possibly multi-byte)
|
cannam@226
|
444 // The first byte, c, has already been eaten by caller
|
cannam@226
|
445 static inline SerdStatus
|
cannam@226
|
446 read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c)
|
cannam@226
|
447 {
|
cannam@226
|
448 if (!(c & 0x80)) {
|
cannam@226
|
449 switch (c) {
|
cannam@226
|
450 case 0xA: case 0xD:
|
cannam@226
|
451 *flags |= SERD_HAS_NEWLINE;
|
cannam@226
|
452 break;
|
cannam@226
|
453 case '"': case '\'':
|
cannam@226
|
454 *flags |= SERD_HAS_QUOTE;
|
cannam@226
|
455 break;
|
cannam@226
|
456 }
|
cannam@226
|
457 push_byte(reader, dest, c);
|
cannam@226
|
458 return SERD_SUCCESS;
|
cannam@226
|
459 } else {
|
cannam@226
|
460 return read_utf8_character(reader, dest, c);
|
cannam@226
|
461 }
|
cannam@226
|
462 }
|
cannam@226
|
463
|
cannam@226
|
464 // [10] comment ::= '#' ( [^#xA #xD] )*
|
cannam@226
|
465 static void
|
cannam@226
|
466 read_comment(SerdReader* reader)
|
cannam@226
|
467 {
|
cannam@226
|
468 eat_byte_safe(reader, '#');
|
cannam@226
|
469 uint8_t c;
|
cannam@226
|
470 while (((c = peek_byte(reader)) != 0xA) && (c != 0xD) && c) {
|
cannam@226
|
471 eat_byte_safe(reader, c);
|
cannam@226
|
472 }
|
cannam@226
|
473 }
|
cannam@226
|
474
|
cannam@226
|
475 // [24] ws ::= #x9 | #xA | #xD | #x20 | comment
|
cannam@226
|
476 static inline bool
|
cannam@226
|
477 read_ws(SerdReader* reader)
|
cannam@226
|
478 {
|
cannam@226
|
479 const uint8_t c = peek_byte(reader);
|
cannam@226
|
480 switch (c) {
|
cannam@226
|
481 case 0x9: case 0xA: case 0xD: case 0x20:
|
cannam@226
|
482 eat_byte_safe(reader, c);
|
cannam@226
|
483 return true;
|
cannam@226
|
484 case '#':
|
cannam@226
|
485 read_comment(reader);
|
cannam@226
|
486 return true;
|
cannam@226
|
487 default:
|
cannam@226
|
488 return false;
|
cannam@226
|
489 }
|
cannam@226
|
490 }
|
cannam@226
|
491
|
cannam@226
|
492 static inline bool
|
cannam@226
|
493 read_ws_star(SerdReader* reader)
|
cannam@226
|
494 {
|
cannam@226
|
495 while (read_ws(reader)) {}
|
cannam@226
|
496 return true;
|
cannam@226
|
497 }
|
cannam@226
|
498
|
cannam@226
|
499 static inline bool
|
cannam@226
|
500 peek_delim(SerdReader* reader, const char delim)
|
cannam@226
|
501 {
|
cannam@226
|
502 read_ws_star(reader);
|
cannam@226
|
503 return peek_byte(reader) == delim;
|
cannam@226
|
504 }
|
cannam@226
|
505
|
cannam@226
|
506 static inline bool
|
cannam@226
|
507 eat_delim(SerdReader* reader, const char delim)
|
cannam@226
|
508 {
|
cannam@226
|
509 if (peek_delim(reader, delim)) {
|
cannam@226
|
510 eat_byte_safe(reader, delim);
|
cannam@226
|
511 return read_ws_star(reader);
|
cannam@226
|
512 }
|
cannam@226
|
513 return false;
|
cannam@226
|
514 }
|
cannam@226
|
515
|
cannam@226
|
516 // STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE
|
cannam@226
|
517 // Initial triple quotes are already eaten by caller
|
cannam@226
|
518 static Ref
|
cannam@226
|
519 read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
|
cannam@226
|
520 {
|
cannam@226
|
521 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
|
cannam@226
|
522 while (true) {
|
cannam@226
|
523 const uint8_t c = peek_byte(reader);
|
cannam@226
|
524 uint32_t code;
|
cannam@226
|
525 switch (c) {
|
cannam@226
|
526 case '\\':
|
cannam@226
|
527 eat_byte_safe(reader, c);
|
cannam@226
|
528 if (!read_ECHAR(reader, ref, flags) &&
|
cannam@226
|
529 !read_UCHAR(reader, ref, &code)) {
|
cannam@226
|
530 r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
531 "invalid escape `\\%c'\n", peek_byte(reader));
|
cannam@226
|
532 return pop_node(reader, ref);
|
cannam@226
|
533 }
|
cannam@226
|
534 break;
|
cannam@226
|
535 default:
|
cannam@226
|
536 if (c == q) {
|
cannam@226
|
537 eat_byte_safe(reader, q);
|
cannam@226
|
538 const uint8_t q2 = eat_byte_safe(reader, peek_byte(reader));
|
cannam@226
|
539 const uint8_t q3 = peek_byte(reader);
|
cannam@226
|
540 if (q2 == q && q3 == q) { // End of string
|
cannam@226
|
541 eat_byte_safe(reader, q3);
|
cannam@226
|
542 return ref;
|
cannam@226
|
543 } else {
|
cannam@226
|
544 *flags |= SERD_HAS_QUOTE;
|
cannam@226
|
545 push_byte(reader, ref, c);
|
cannam@226
|
546 read_character(reader, ref, flags, q2);
|
cannam@226
|
547 }
|
cannam@226
|
548 } else {
|
cannam@226
|
549 read_character(reader, ref, flags, eat_byte_safe(reader, c));
|
cannam@226
|
550 }
|
cannam@226
|
551 }
|
cannam@226
|
552 }
|
cannam@226
|
553 return ref;
|
cannam@226
|
554 }
|
cannam@226
|
555
|
cannam@226
|
556 // STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE
|
cannam@226
|
557 // Initial quote is already eaten by caller
|
cannam@226
|
558 static Ref
|
cannam@226
|
559 read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
|
cannam@226
|
560 {
|
cannam@226
|
561 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
|
cannam@226
|
562 while (true) {
|
cannam@226
|
563 const uint8_t c = peek_byte(reader);
|
cannam@226
|
564 uint32_t code;
|
cannam@226
|
565 switch (c) {
|
cannam@226
|
566 case '\n': case '\r':
|
cannam@226
|
567 r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string\n");
|
cannam@226
|
568 return pop_node(reader, ref);
|
cannam@226
|
569 case '\\':
|
cannam@226
|
570 eat_byte_safe(reader, c);
|
cannam@226
|
571 if (!read_ECHAR(reader, ref, flags) &&
|
cannam@226
|
572 !read_UCHAR(reader, ref, &code)) {
|
cannam@226
|
573 r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
574 "invalid escape `\\%c'\n", peek_byte(reader));
|
cannam@226
|
575 return pop_node(reader, ref);
|
cannam@226
|
576 }
|
cannam@226
|
577 break;
|
cannam@226
|
578 default:
|
cannam@226
|
579 if (c == q) {
|
cannam@226
|
580 eat_byte_check(reader, q);
|
cannam@226
|
581 return ref;
|
cannam@226
|
582 } else {
|
cannam@226
|
583 read_character(reader, ref, flags, eat_byte_safe(reader, c));
|
cannam@226
|
584 }
|
cannam@226
|
585 }
|
cannam@226
|
586 }
|
cannam@226
|
587 eat_byte_check(reader, q);
|
cannam@226
|
588 return ref;
|
cannam@226
|
589 }
|
cannam@226
|
590
|
cannam@226
|
591 static Ref
|
cannam@226
|
592 read_String(SerdReader* reader, SerdNodeFlags* flags)
|
cannam@226
|
593 {
|
cannam@226
|
594 const uint8_t q1 = peek_byte(reader);
|
cannam@226
|
595 eat_byte_safe(reader, q1);
|
cannam@226
|
596
|
cannam@226
|
597 const uint8_t q2 = peek_byte(reader);
|
cannam@226
|
598 if (q2 != q1) { // Short string (not triple quoted)
|
cannam@226
|
599 return read_STRING_LITERAL(reader, flags, q1);
|
cannam@226
|
600 }
|
cannam@226
|
601
|
cannam@226
|
602 eat_byte_safe(reader, q2);
|
cannam@226
|
603 const uint8_t q3 = peek_byte(reader);
|
cannam@226
|
604 if (q3 != q1) { // Empty short string ("" or '')
|
cannam@226
|
605 return push_node(reader, SERD_LITERAL, "", 0);
|
cannam@226
|
606 }
|
cannam@226
|
607
|
cannam@226
|
608 if (!supports_fancy_literals(reader)) {
|
cannam@226
|
609 return r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
610 "syntax does not support long literals\n");
|
cannam@226
|
611 }
|
cannam@226
|
612
|
cannam@226
|
613 eat_byte_safe(reader, q3);
|
cannam@226
|
614 return read_STRING_LITERAL_LONG(reader, flags, q1);
|
cannam@226
|
615 }
|
cannam@226
|
616
|
cannam@226
|
617 static bool
|
cannam@226
|
618 read_PN_CHARS_BASE(SerdReader* reader, Ref dest)
|
cannam@226
|
619 {
|
cannam@226
|
620 const uint8_t c = peek_byte(reader);
|
cannam@226
|
621 if ((c & 0x80)) { // Multi-byte character
|
cannam@226
|
622 return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
623 }
|
cannam@226
|
624 if (is_alpha(c)) {
|
cannam@226
|
625 push_byte(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
626 return true;
|
cannam@226
|
627 }
|
cannam@226
|
628 return false;
|
cannam@226
|
629 }
|
cannam@226
|
630
|
cannam@226
|
631 static bool
|
cannam@226
|
632 read_PN_CHARS(SerdReader* reader, Ref dest)
|
cannam@226
|
633 {
|
cannam@226
|
634 const uint8_t c = peek_byte(reader);
|
cannam@226
|
635 if ((c & 0x80)) { // Multi-byte character
|
cannam@226
|
636 return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
637 }
|
cannam@226
|
638
|
cannam@226
|
639 if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') {
|
cannam@226
|
640 push_byte(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
641 return true;
|
cannam@226
|
642 }
|
cannam@226
|
643 return false;
|
cannam@226
|
644 }
|
cannam@226
|
645
|
cannam@226
|
646 static bool
|
cannam@226
|
647 read_PERCENT(SerdReader* reader, Ref dest)
|
cannam@226
|
648 {
|
cannam@226
|
649 push_byte(reader, dest, eat_byte_safe(reader, '%'));
|
cannam@226
|
650 const uint8_t h1 = read_HEX(reader);
|
cannam@226
|
651 const uint8_t h2 = read_HEX(reader);
|
cannam@226
|
652 if (h1 && h2) {
|
cannam@226
|
653 push_byte(reader, dest, h1);
|
cannam@226
|
654 push_byte(reader, dest, h2);
|
cannam@226
|
655 return true;
|
cannam@226
|
656 }
|
cannam@226
|
657 return false;
|
cannam@226
|
658 }
|
cannam@226
|
659
|
cannam@226
|
660 static SerdStatus
|
cannam@226
|
661 read_PLX(SerdReader* reader, Ref dest)
|
cannam@226
|
662 {
|
cannam@226
|
663 uint8_t c = peek_byte(reader);
|
cannam@226
|
664 switch (c) {
|
cannam@226
|
665 case '%':
|
cannam@226
|
666 if (!read_PERCENT(reader, dest)) {
|
cannam@226
|
667 return SERD_ERR_BAD_SYNTAX;
|
cannam@226
|
668 }
|
cannam@226
|
669 return SERD_SUCCESS;
|
cannam@226
|
670 case '\\':
|
cannam@226
|
671 eat_byte_safe(reader, c);
|
cannam@226
|
672 if (is_alpha(c = peek_byte(reader))) {
|
cannam@226
|
673 // Escapes like \u \n etc. are not supported
|
cannam@226
|
674 return SERD_ERR_BAD_SYNTAX;
|
cannam@226
|
675 } else {
|
cannam@226
|
676 // Allow escaping of pretty much any other character
|
cannam@226
|
677 push_byte(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
678 return SERD_SUCCESS;
|
cannam@226
|
679 }
|
cannam@226
|
680 default:
|
cannam@226
|
681 return SERD_FAILURE;
|
cannam@226
|
682 }
|
cannam@226
|
683 }
|
cannam@226
|
684
|
cannam@226
|
685 static SerdStatus
|
cannam@226
|
686 read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot)
|
cannam@226
|
687 {
|
cannam@226
|
688 uint8_t c = peek_byte(reader);
|
cannam@226
|
689 SerdStatus st;
|
cannam@226
|
690 switch (c) {
|
cannam@226
|
691 case '0': case '1': case '2': case '3': case '4': case '5':
|
cannam@226
|
692 case '6': case '7': case '8': case '9': case ':': case '_':
|
cannam@226
|
693 push_byte(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
694 break;
|
cannam@226
|
695 default:
|
cannam@226
|
696 if ((st = read_PLX(reader, dest)) > SERD_FAILURE) {
|
cannam@226
|
697 return st;
|
cannam@226
|
698 } else if (st != SERD_SUCCESS && !read_PN_CHARS_BASE(reader, dest)) {
|
cannam@226
|
699 return SERD_FAILURE;
|
cannam@226
|
700 }
|
cannam@226
|
701 }
|
cannam@226
|
702
|
cannam@226
|
703 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.' | ';')*
|
cannam@226
|
704 if (c == '.' || c == ':') {
|
cannam@226
|
705 push_byte(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
706 } else if ((st = read_PLX(reader, dest)) > SERD_FAILURE) {
|
cannam@226
|
707 return st;
|
cannam@226
|
708 } else if (st != SERD_SUCCESS && !read_PN_CHARS(reader, dest)) {
|
cannam@226
|
709 break;
|
cannam@226
|
710 }
|
cannam@226
|
711 }
|
cannam@226
|
712
|
cannam@226
|
713 SerdNode* const n = deref(reader, dest);
|
cannam@226
|
714 if (n->buf[n->n_bytes - 1] == '.') {
|
cannam@226
|
715 // Ate trailing dot, pop it from stack/node and inform caller
|
cannam@226
|
716 --n->n_bytes;
|
cannam@226
|
717 serd_stack_pop(&reader->stack, 1);
|
cannam@226
|
718 *ate_dot = true;
|
cannam@226
|
719 }
|
cannam@226
|
720
|
cannam@226
|
721 return SERD_SUCCESS;
|
cannam@226
|
722 }
|
cannam@226
|
723
|
cannam@226
|
724 // Read the remainder of a PN_PREFIX after some initial characters
|
cannam@226
|
725 static SerdStatus
|
cannam@226
|
726 read_PN_PREFIX_tail(SerdReader* reader, Ref dest)
|
cannam@226
|
727 {
|
cannam@226
|
728 uint8_t c;
|
cannam@226
|
729 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
|
cannam@226
|
730 if (c == '.') {
|
cannam@226
|
731 push_byte(reader, dest, eat_byte_safe(reader, c));
|
cannam@226
|
732 } else if (!read_PN_CHARS(reader, dest)) {
|
cannam@226
|
733 break;
|
cannam@226
|
734 }
|
cannam@226
|
735 }
|
cannam@226
|
736
|
cannam@226
|
737 const SerdNode* const n = deref(reader, dest);
|
cannam@226
|
738 if (n->buf[n->n_bytes - 1] == '.' && !read_PN_CHARS(reader, dest)) {
|
cannam@226
|
739 r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n");
|
cannam@226
|
740 return SERD_ERR_BAD_SYNTAX;
|
cannam@226
|
741 }
|
cannam@226
|
742
|
cannam@226
|
743 return SERD_SUCCESS;
|
cannam@226
|
744 }
|
cannam@226
|
745
|
cannam@226
|
746 static SerdStatus
|
cannam@226
|
747 read_PN_PREFIX(SerdReader* reader, Ref dest)
|
cannam@226
|
748 {
|
cannam@226
|
749 if (read_PN_CHARS_BASE(reader, dest)) {
|
cannam@226
|
750 return read_PN_PREFIX_tail(reader, dest);
|
cannam@226
|
751 }
|
cannam@226
|
752 return SERD_FAILURE;
|
cannam@226
|
753 }
|
cannam@226
|
754
|
cannam@226
|
755 static Ref
|
cannam@226
|
756 read_LANGTAG(SerdReader* reader)
|
cannam@226
|
757 {
|
cannam@226
|
758 uint8_t c = peek_byte(reader);
|
cannam@226
|
759 if (!is_alpha(c)) {
|
cannam@226
|
760 return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c);
|
cannam@226
|
761 }
|
cannam@226
|
762 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
|
cannam@226
|
763 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
764 while ((c = peek_byte(reader)) && is_alpha(c)) {
|
cannam@226
|
765 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
766 }
|
cannam@226
|
767 while (peek_byte(reader) == '-') {
|
cannam@226
|
768 push_byte(reader, ref, eat_byte_safe(reader, '-'));
|
cannam@226
|
769 while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) {
|
cannam@226
|
770 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
771 }
|
cannam@226
|
772 }
|
cannam@226
|
773 return ref;
|
cannam@226
|
774 }
|
cannam@226
|
775
|
cannam@226
|
776 typedef enum { PREFIX, GOOD, BAD} SchemeState;
|
cannam@226
|
777
|
cannam@226
|
778 static inline bool
|
cannam@226
|
779 check_scheme(SerdReader* reader, uint8_t c, SchemeState* state)
|
cannam@226
|
780 {
|
cannam@226
|
781 if (!supports_relative_iris(reader) && *state == PREFIX) {
|
cannam@226
|
782 if (c == ':') {
|
cannam@226
|
783 *state = GOOD;
|
cannam@226
|
784 } else if (!isalpha(c)) {
|
cannam@226
|
785 *state = BAD;
|
cannam@226
|
786 return r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
787 "syntax does not support relative IRIs\n");
|
cannam@226
|
788 }
|
cannam@226
|
789 }
|
cannam@226
|
790 return true;
|
cannam@226
|
791 }
|
cannam@226
|
792
|
cannam@226
|
793 static Ref
|
cannam@226
|
794 read_IRIREF(SerdReader* reader)
|
cannam@226
|
795 {
|
cannam@226
|
796 TRY_RET(eat_byte_check(reader, '<'));
|
cannam@226
|
797 Ref ref = push_node(reader, SERD_URI, "", 0);
|
cannam@226
|
798 SchemeState scheme = PREFIX;
|
cannam@226
|
799 uint32_t code;
|
cannam@226
|
800 while (true) {
|
cannam@226
|
801 const uint8_t c = peek_byte(reader);
|
cannam@226
|
802 if (!check_scheme(reader, c, &scheme)) {
|
cannam@226
|
803 return pop_node(reader, ref);
|
cannam@226
|
804 }
|
cannam@226
|
805 switch (c) {
|
cannam@226
|
806 case '"': case '<': case '^': case '`': case '{': case '|': case '}':
|
cannam@226
|
807 r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
808 "invalid IRI character `%c'\n", c);
|
cannam@226
|
809 return pop_node(reader, ref);
|
cannam@226
|
810 case '>':
|
cannam@226
|
811 eat_byte_safe(reader, c);
|
cannam@226
|
812 return ref;
|
cannam@226
|
813 case '\\':
|
cannam@226
|
814 eat_byte_safe(reader, c);
|
cannam@226
|
815 if (!read_UCHAR(reader, ref, &code)) {
|
cannam@226
|
816 r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n");
|
cannam@226
|
817 return pop_node(reader, ref);
|
cannam@226
|
818 }
|
cannam@226
|
819 switch (code) {
|
cannam@226
|
820 case 0: case ' ': case '<': case '>':
|
cannam@226
|
821 r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
822 "invalid escaped IRI character %X %c\n", code, code);
|
cannam@226
|
823 return pop_node(reader, ref);
|
cannam@226
|
824 }
|
cannam@226
|
825 break;
|
cannam@226
|
826 default:
|
cannam@226
|
827 if (c <= 0x20) {
|
cannam@226
|
828 if (isprint(c)) {
|
cannam@226
|
829 r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
830 "invalid IRI character `%c' (escape %%%02X)\n", c, c);
|
cannam@226
|
831 } else {
|
cannam@226
|
832 r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
833 "invalid IRI character (escape %%%02X)\n", c, c);
|
cannam@226
|
834 }
|
cannam@226
|
835 if (reader->strict) {
|
cannam@226
|
836 return pop_node(reader, ref);
|
cannam@226
|
837 }
|
cannam@226
|
838 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
839 } else {
|
cannam@226
|
840 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
841 }
|
cannam@226
|
842 }
|
cannam@226
|
843 }
|
cannam@226
|
844 }
|
cannam@226
|
845
|
cannam@226
|
846 static bool
|
cannam@226
|
847 read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot)
|
cannam@226
|
848 {
|
cannam@226
|
849 if (read_prefix && read_PN_PREFIX(reader, dest) > SERD_FAILURE) {
|
cannam@226
|
850 return false;
|
cannam@226
|
851 } else if (peek_byte(reader) != ':') {
|
cannam@226
|
852 return false;
|
cannam@226
|
853 }
|
cannam@226
|
854
|
cannam@226
|
855 push_byte(reader, dest, eat_byte_safe(reader, ':'));
|
cannam@226
|
856 return read_PN_LOCAL(reader, dest, ate_dot) <= SERD_FAILURE;
|
cannam@226
|
857 }
|
cannam@226
|
858
|
cannam@226
|
859 static bool
|
cannam@226
|
860 read_0_9(SerdReader* reader, Ref str, bool at_least_one)
|
cannam@226
|
861 {
|
cannam@226
|
862 unsigned count = 0;
|
cannam@226
|
863 for (uint8_t c; is_digit((c = peek_byte(reader))); ++count) {
|
cannam@226
|
864 push_byte(reader, str, eat_byte_safe(reader, c));
|
cannam@226
|
865 }
|
cannam@226
|
866 if (at_least_one && count == 0) {
|
cannam@226
|
867 r_err(reader, SERD_ERR_BAD_SYNTAX, "expected digit\n");
|
cannam@226
|
868 }
|
cannam@226
|
869 return count;
|
cannam@226
|
870 }
|
cannam@226
|
871
|
cannam@226
|
872 static bool
|
cannam@226
|
873 read_number(SerdReader* reader, Ref* dest, Ref* datatype, bool* ate_dot)
|
cannam@226
|
874 {
|
cannam@226
|
875 #define XSD_DECIMAL NS_XSD "decimal"
|
cannam@226
|
876 #define XSD_DOUBLE NS_XSD "double"
|
cannam@226
|
877 #define XSD_INTEGER NS_XSD "integer"
|
cannam@226
|
878 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
|
cannam@226
|
879 uint8_t c = peek_byte(reader);
|
cannam@226
|
880 bool has_decimal = false;
|
cannam@226
|
881 if (c == '-' || c == '+') {
|
cannam@226
|
882 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
883 }
|
cannam@226
|
884 if ((c = peek_byte(reader)) == '.') {
|
cannam@226
|
885 has_decimal = true;
|
cannam@226
|
886 // decimal case 2 (e.g. '.0' or `-.0' or `+.0')
|
cannam@226
|
887 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
888 TRY_THROW(read_0_9(reader, ref, true));
|
cannam@226
|
889 } else {
|
cannam@226
|
890 // all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ...
|
cannam@226
|
891 TRY_THROW(is_digit(c));
|
cannam@226
|
892 read_0_9(reader, ref, true);
|
cannam@226
|
893 if ((c = peek_byte(reader)) == '.') {
|
cannam@226
|
894 has_decimal = true;
|
cannam@226
|
895
|
cannam@226
|
896 // Annoyingly, dot can be end of statement, so tentatively eat
|
cannam@226
|
897 eat_byte_safe(reader, c);
|
cannam@226
|
898 c = peek_byte(reader);
|
cannam@226
|
899 if (!is_digit(c) && c != 'e' && c != 'E') {
|
cannam@226
|
900 *dest = ref;
|
cannam@226
|
901 *ate_dot = true; // Force caller to deal with stupid grammar
|
cannam@226
|
902 return true; // Next byte is not a number character, done
|
cannam@226
|
903 }
|
cannam@226
|
904
|
cannam@226
|
905 push_byte(reader, ref, '.');
|
cannam@226
|
906 read_0_9(reader, ref, false);
|
cannam@226
|
907 }
|
cannam@226
|
908 }
|
cannam@226
|
909 c = peek_byte(reader);
|
cannam@226
|
910 if (c == 'e' || c == 'E') {
|
cannam@226
|
911 // double
|
cannam@226
|
912 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
913 switch ((c = peek_byte(reader))) {
|
cannam@226
|
914 case '+': case '-':
|
cannam@226
|
915 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
916 default: break;
|
cannam@226
|
917 }
|
cannam@226
|
918 TRY_THROW(read_0_9(reader, ref, true));
|
cannam@226
|
919 *datatype = push_node(reader, SERD_URI,
|
cannam@226
|
920 XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1);
|
cannam@226
|
921 } else if (has_decimal) {
|
cannam@226
|
922 *datatype = push_node(reader, SERD_URI,
|
cannam@226
|
923 XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1);
|
cannam@226
|
924 } else {
|
cannam@226
|
925 *datatype = push_node(reader, SERD_URI,
|
cannam@226
|
926 XSD_INTEGER, sizeof(XSD_INTEGER) - 1);
|
cannam@226
|
927 }
|
cannam@226
|
928 *dest = ref;
|
cannam@226
|
929 return true;
|
cannam@226
|
930 except:
|
cannam@226
|
931 pop_node(reader, *datatype);
|
cannam@226
|
932 pop_node(reader, ref);
|
cannam@226
|
933 return false;
|
cannam@226
|
934 }
|
cannam@226
|
935
|
cannam@226
|
936 static bool
|
cannam@226
|
937 read_iri(SerdReader* reader, Ref* dest, bool* ate_dot)
|
cannam@226
|
938 {
|
cannam@226
|
939 switch (peek_byte(reader)) {
|
cannam@226
|
940 case '<':
|
cannam@226
|
941 *dest = read_IRIREF(reader);
|
cannam@226
|
942 return true;
|
cannam@226
|
943 default:
|
cannam@226
|
944 *dest = push_node(reader, SERD_CURIE, "", 0);
|
cannam@226
|
945 return read_PrefixedName(reader, *dest, true, ate_dot);
|
cannam@226
|
946 }
|
cannam@226
|
947 }
|
cannam@226
|
948
|
cannam@226
|
949 static bool
|
cannam@226
|
950 read_literal(SerdReader* reader, Ref* dest,
|
cannam@226
|
951 Ref* datatype, Ref* lang, SerdNodeFlags* flags, bool* ate_dot)
|
cannam@226
|
952 {
|
cannam@226
|
953 Ref str = read_String(reader, flags);
|
cannam@226
|
954 if (!str) {
|
cannam@226
|
955 return false;
|
cannam@226
|
956 }
|
cannam@226
|
957
|
cannam@226
|
958 switch (peek_byte(reader)) {
|
cannam@226
|
959 case '@':
|
cannam@226
|
960 eat_byte_safe(reader, '@');
|
cannam@226
|
961 TRY_THROW(*lang = read_LANGTAG(reader));
|
cannam@226
|
962 break;
|
cannam@226
|
963 case '^':
|
cannam@226
|
964 eat_byte_safe(reader, '^');
|
cannam@226
|
965 eat_byte_check(reader, '^');
|
cannam@226
|
966 TRY_THROW(read_iri(reader, datatype, ate_dot));
|
cannam@226
|
967 break;
|
cannam@226
|
968 }
|
cannam@226
|
969 *dest = str;
|
cannam@226
|
970 return true;
|
cannam@226
|
971 except:
|
cannam@226
|
972 *datatype = pop_node(reader, *datatype);
|
cannam@226
|
973 *lang = pop_node(reader, *lang);
|
cannam@226
|
974 pop_node(reader, str);
|
cannam@226
|
975 return false;
|
cannam@226
|
976 }
|
cannam@226
|
977
|
cannam@226
|
/* Return true if c ends a bare token: tab, LF, CR, space, NUL, '#', '.',
   ';' or '<'. */
inline static bool
is_token_end(uint8_t c)
{
    const bool is_space_or_nul =
        (c == 0x9 || c == 0xA || c == 0xD || c == 0x20 || c == '\0');
    const bool is_delimiter =
        (c == '#' || c == '.' || c == ';' || c == '<');

    return is_space_or_nul || is_delimiter;
}
|
cannam@226
|
989
|
cannam@226
|
990 static bool
|
cannam@226
|
991 read_verb(SerdReader* reader, Ref* dest)
|
cannam@226
|
992 {
|
cannam@226
|
993 if (peek_byte(reader) == '<') {
|
cannam@226
|
994 return (*dest = read_IRIREF(reader));
|
cannam@226
|
995 } else {
|
cannam@226
|
996 /* Either a qname, or "a". Read the prefix first, and if it is in fact
|
cannam@226
|
997 "a", produce that instead.
|
cannam@226
|
998 */
|
cannam@226
|
999 *dest = push_node(reader, SERD_CURIE, "", 0);
|
cannam@226
|
1000 SerdNode* node = deref(reader, *dest);
|
cannam@226
|
1001 const SerdStatus st = read_PN_PREFIX(reader, *dest);
|
cannam@226
|
1002 bool ate_dot = false;
|
cannam@226
|
1003 if (!st && node->n_bytes == 1 && node->buf[0] == 'a' &&
|
cannam@226
|
1004 is_token_end(peek_byte(reader))) {
|
cannam@226
|
1005 pop_node(reader, *dest);
|
cannam@226
|
1006 return (*dest = push_node(reader, SERD_URI, NS_RDF "type", 47));
|
cannam@226
|
1007 } else if (st > SERD_FAILURE ||
|
cannam@226
|
1008 !read_PrefixedName(reader, *dest, false, &ate_dot) ||
|
cannam@226
|
1009 ate_dot) {
|
cannam@226
|
1010 return (*dest = pop_node(reader, *dest));
|
cannam@226
|
1011 } else {
|
cannam@226
|
1012 return true;
|
cannam@226
|
1013 }
|
cannam@226
|
1014 }
|
cannam@226
|
1015 return false;
|
cannam@226
|
1016 }
|
cannam@226
|
1017
|
cannam@226
|
1018 static Ref
|
cannam@226
|
1019 read_BLANK_NODE_LABEL(SerdReader* reader, bool* ate_dot)
|
cannam@226
|
1020 {
|
cannam@226
|
1021 eat_byte_safe(reader, '_');
|
cannam@226
|
1022 eat_byte_check(reader, ':');
|
cannam@226
|
1023 Ref ref = push_node(reader, SERD_BLANK,
|
cannam@226
|
1024 reader->bprefix ? (char*)reader->bprefix : "",
|
cannam@226
|
1025 reader->bprefix_len);
|
cannam@226
|
1026
|
cannam@226
|
1027 uint8_t c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9])
|
cannam@226
|
1028 if (is_digit(c) || c == '_') {
|
cannam@226
|
1029 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
1030 } else if (!read_PN_CHARS(reader, ref)) {
|
cannam@226
|
1031 r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start character\n");
|
cannam@226
|
1032 return pop_node(reader, ref);
|
cannam@226
|
1033 }
|
cannam@226
|
1034
|
cannam@226
|
1035 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
|
cannam@226
|
1036 if (c == '.') {
|
cannam@226
|
1037 push_byte(reader, ref, eat_byte_safe(reader, c));
|
cannam@226
|
1038 } else if (!read_PN_CHARS(reader, ref)) {
|
cannam@226
|
1039 break;
|
cannam@226
|
1040 }
|
cannam@226
|
1041 }
|
cannam@226
|
1042
|
cannam@226
|
1043 SerdNode* n = deref(reader, ref);
|
cannam@226
|
1044 if (n->buf[n->n_bytes - 1] == '.' && !read_PN_CHARS(reader, ref)) {
|
cannam@226
|
1045 // Ate trailing dot, pop it from stack/node and inform caller
|
cannam@226
|
1046 --n->n_bytes;
|
cannam@226
|
1047 serd_stack_pop(&reader->stack, 1);
|
cannam@226
|
1048 *ate_dot = true;
|
cannam@226
|
1049 }
|
cannam@226
|
1050
|
cannam@226
|
1051 if (reader->syntax == SERD_TURTLE) {
|
cannam@226
|
1052 if (is_digit(n->buf[reader->bprefix_len + 1])) {
|
cannam@226
|
1053 if ((n->buf[reader->bprefix_len]) == 'b') {
|
cannam@226
|
1054 ((char*)n->buf)[reader->bprefix_len] = 'B'; // Prevent clash
|
cannam@226
|
1055 reader->seen_genid = true;
|
cannam@226
|
1056 } else if (reader->seen_genid &&
|
cannam@226
|
1057 n->buf[reader->bprefix_len] == 'B') {
|
cannam@226
|
1058 r_err(reader, SERD_ERR_ID_CLASH,
|
cannam@226
|
1059 "found both `b' and `B' blank IDs, prefix required\n");
|
cannam@226
|
1060 return pop_node(reader, ref);
|
cannam@226
|
1061 }
|
cannam@226
|
1062 }
|
cannam@226
|
1063 }
|
cannam@226
|
1064 return ref;
|
cannam@226
|
1065 }
|
cannam@226
|
1066
|
cannam@226
|
1067 static void
|
cannam@226
|
1068 set_blank_id(SerdReader* reader, Ref ref, size_t buf_size)
|
cannam@226
|
1069 {
|
cannam@226
|
1070 SerdNode* node = deref(reader, ref);
|
cannam@226
|
1071 const char* prefix = reader->bprefix ? (const char*)reader->bprefix : "";
|
cannam@226
|
1072 node->n_bytes = node->n_chars = snprintf(
|
cannam@226
|
1073 (char*)node->buf, buf_size, "%sb%u", prefix, reader->next_id++);
|
cannam@226
|
1074 }
|
cannam@226
|
1075
|
cannam@226
|
1076 static size_t
|
cannam@226
|
1077 genid_size(SerdReader* reader)
|
cannam@226
|
1078 {
|
cannam@226
|
1079 return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0
|
cannam@226
|
1080 }
|
cannam@226
|
1081
|
cannam@226
|
1082 static Ref
|
cannam@226
|
1083 blank_id(SerdReader* reader)
|
cannam@226
|
1084 {
|
cannam@226
|
1085 Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
|
cannam@226
|
1086 set_blank_id(reader, ref, genid_size(reader));
|
cannam@226
|
1087 return ref;
|
cannam@226
|
1088 }
|
cannam@226
|
1089
|
cannam@226
|
1090 static Ref
|
cannam@226
|
1091 read_blankName(SerdReader* reader)
|
cannam@226
|
1092 {
|
cannam@226
|
1093 eat_byte_safe(reader, '=');
|
cannam@226
|
1094 if (eat_byte_check(reader, '=') != '=') {
|
cannam@226
|
1095 return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n");
|
cannam@226
|
1096 }
|
cannam@226
|
1097
|
cannam@226
|
1098 Ref subject = 0;
|
cannam@226
|
1099 bool ate_dot = false;
|
cannam@226
|
1100 read_ws_star(reader);
|
cannam@226
|
1101 read_iri(reader, &subject, &ate_dot);
|
cannam@226
|
1102 return subject;
|
cannam@226
|
1103 }
|
cannam@226
|
1104
|
cannam@226
|
1105 static bool
|
cannam@226
|
1106 read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest)
|
cannam@226
|
1107 {
|
cannam@226
|
1108 const SerdStatementFlags old_flags = *ctx.flags;
|
cannam@226
|
1109 bool empty;
|
cannam@226
|
1110 eat_byte_safe(reader, '[');
|
cannam@226
|
1111 if ((empty = peek_delim(reader, ']'))) {
|
cannam@226
|
1112 *ctx.flags |= (subject) ? SERD_EMPTY_S : SERD_EMPTY_O;
|
cannam@226
|
1113 } else {
|
cannam@226
|
1114 *ctx.flags |= (subject) ? SERD_ANON_S_BEGIN : SERD_ANON_O_BEGIN;
|
cannam@226
|
1115 if (peek_delim(reader, '=')) {
|
cannam@226
|
1116 if (!(*dest = read_blankName(reader)) ||
|
cannam@226
|
1117 !eat_delim(reader, ';')) {
|
cannam@226
|
1118 return false;
|
cannam@226
|
1119 }
|
cannam@226
|
1120 }
|
cannam@226
|
1121 }
|
cannam@226
|
1122
|
cannam@226
|
1123 if (!*dest) {
|
cannam@226
|
1124 *dest = blank_id(reader);
|
cannam@226
|
1125 }
|
cannam@226
|
1126 if (ctx.subject) {
|
cannam@226
|
1127 TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
|
cannam@226
|
1128 }
|
cannam@226
|
1129
|
cannam@226
|
1130 ctx.subject = *dest;
|
cannam@226
|
1131 if (!empty) {
|
cannam@226
|
1132 *ctx.flags &= ~(SERD_LIST_CONT);
|
cannam@226
|
1133 if (!subject) {
|
cannam@226
|
1134 *ctx.flags |= SERD_ANON_CONT;
|
cannam@226
|
1135 }
|
cannam@226
|
1136 bool ate_dot_in_list = false;
|
cannam@226
|
1137 read_predicateObjectList(reader, ctx, &ate_dot_in_list);
|
cannam@226
|
1138 if (ate_dot_in_list) {
|
cannam@226
|
1139 return r_err(reader, SERD_ERR_BAD_SYNTAX, "`.' inside blank\n");
|
cannam@226
|
1140 }
|
cannam@226
|
1141 read_ws_star(reader);
|
cannam@226
|
1142 if (reader->end_sink) {
|
cannam@226
|
1143 reader->end_sink(reader->handle, deref(reader, *dest));
|
cannam@226
|
1144 }
|
cannam@226
|
1145 *ctx.flags = old_flags;
|
cannam@226
|
1146 }
|
cannam@226
|
1147 return (eat_byte_check(reader, ']') == ']');
|
cannam@226
|
1148 }
|
cannam@226
|
1149
|
cannam@226
|
1150 /* If emit is true: recurses, calling statement_sink for every statement
|
cannam@226
|
1151 encountered, and leaves stack in original calling state (i.e. pops
|
cannam@226
|
1152 everything it pushes). */
|
cannam@226
|
1153 static bool
|
cannam@226
|
1154 read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot)
|
cannam@226
|
1155 {
|
cannam@226
|
1156 static const char* const XSD_BOOLEAN = NS_XSD "boolean";
|
cannam@226
|
1157 static const size_t XSD_BOOLEAN_LEN = 40;
|
cannam@226
|
1158
|
cannam@226
|
1159 #ifndef NDEBUG
|
cannam@226
|
1160 const size_t orig_stack_size = reader->stack.size;
|
cannam@226
|
1161 #endif
|
cannam@226
|
1162
|
cannam@226
|
1163 bool ret = false;
|
cannam@226
|
1164 bool simple = (ctx->subject != 0);
|
cannam@226
|
1165 SerdNode* node = NULL;
|
cannam@226
|
1166 Ref o = 0;
|
cannam@226
|
1167 Ref datatype = 0;
|
cannam@226
|
1168 Ref lang = 0;
|
cannam@226
|
1169 uint32_t flags = 0;
|
cannam@226
|
1170 const uint8_t c = peek_byte(reader);
|
cannam@226
|
1171 if (!supports_fancy_literals(reader)) {
|
cannam@226
|
1172 switch (c) {
|
cannam@226
|
1173 case '"': case ':': case '<': case '_': break;
|
cannam@226
|
1174 default: return r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
1175 "expected: ':', '<', or '_'\n");
|
cannam@226
|
1176 }
|
cannam@226
|
1177 }
|
cannam@226
|
1178 switch (c) {
|
cannam@226
|
1179 case '\0':
|
cannam@226
|
1180 case ')':
|
cannam@226
|
1181 return false;
|
cannam@226
|
1182 case '[':
|
cannam@226
|
1183 simple = false;
|
cannam@226
|
1184 TRY_THROW(ret = read_anon(reader, *ctx, false, &o));
|
cannam@226
|
1185 break;
|
cannam@226
|
1186 case '(':
|
cannam@226
|
1187 simple = false;
|
cannam@226
|
1188 TRY_THROW(ret = read_collection(reader, *ctx, &o));
|
cannam@226
|
1189 break;
|
cannam@226
|
1190 case '_':
|
cannam@226
|
1191 TRY_THROW(ret = (o = read_BLANK_NODE_LABEL(reader, ate_dot)));
|
cannam@226
|
1192 break;
|
cannam@226
|
1193 case '<': case ':':
|
cannam@226
|
1194 TRY_THROW(ret = read_iri(reader, &o, ate_dot));
|
cannam@226
|
1195 break;
|
cannam@226
|
1196 case '+': case '-': case '.': case '0': case '1': case '2': case '3':
|
cannam@226
|
1197 case '4': case '5': case '6': case '7': case '8': case '9':
|
cannam@226
|
1198 TRY_THROW(ret = read_number(reader, &o, &datatype, ate_dot));
|
cannam@226
|
1199 break;
|
cannam@226
|
1200 case '\"':
|
cannam@226
|
1201 case '\'':
|
cannam@226
|
1202 TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags, ate_dot));
|
cannam@226
|
1203 break;
|
cannam@226
|
1204 default:
|
cannam@226
|
1205 /* Either a boolean literal, or a qname. Read the prefix first, and if
|
cannam@226
|
1206 it is in fact a "true" or "false" literal, produce that instead.
|
cannam@226
|
1207 */
|
cannam@226
|
1208 node = deref(reader, o = push_node(reader, SERD_CURIE, "", 0));
|
cannam@226
|
1209 while (read_PN_CHARS_BASE(reader, o)) {}
|
cannam@226
|
1210 if ((node->n_bytes == 4 && !memcmp(node->buf, "true", 4)) ||
|
cannam@226
|
1211 (node->n_bytes == 5 && !memcmp(node->buf, "false", 5))) {
|
cannam@226
|
1212 node->type = SERD_LITERAL;
|
cannam@226
|
1213 datatype = push_node(
|
cannam@226
|
1214 reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN);
|
cannam@226
|
1215 ret = true;
|
cannam@226
|
1216 } else if (read_PN_PREFIX_tail(reader, o) > SERD_FAILURE) {
|
cannam@226
|
1217 ret = false;
|
cannam@226
|
1218 } else {
|
cannam@226
|
1219 ret = read_PrefixedName(reader, o, false, ate_dot);
|
cannam@226
|
1220 }
|
cannam@226
|
1221 }
|
cannam@226
|
1222
|
cannam@226
|
1223 if (simple && o) {
|
cannam@226
|
1224 deref(reader, o)->flags = flags;
|
cannam@226
|
1225 }
|
cannam@226
|
1226
|
cannam@226
|
1227 if (ret && emit && simple) {
|
cannam@226
|
1228 ret = emit_statement(reader, *ctx, o, datatype, lang);
|
cannam@226
|
1229 } else if (ret && !emit) {
|
cannam@226
|
1230 ctx->object = o;
|
cannam@226
|
1231 ctx->datatype = datatype;
|
cannam@226
|
1232 ctx->lang = lang;
|
cannam@226
|
1233 return true;
|
cannam@226
|
1234 }
|
cannam@226
|
1235
|
cannam@226
|
1236 except:
|
cannam@226
|
1237 pop_node(reader, lang);
|
cannam@226
|
1238 pop_node(reader, datatype);
|
cannam@226
|
1239 pop_node(reader, o);
|
cannam@226
|
1240 #ifndef NDEBUG
|
cannam@226
|
1241 assert(reader->stack.size == orig_stack_size);
|
cannam@226
|
1242 #endif
|
cannam@226
|
1243 return ret;
|
cannam@226
|
1244 }
|
cannam@226
|
1245
|
cannam@226
|
1246 static bool
|
cannam@226
|
1247 read_objectList(SerdReader* reader, ReadContext ctx, bool* ate_dot)
|
cannam@226
|
1248 {
|
cannam@226
|
1249 TRY_RET(read_object(reader, &ctx, true, ate_dot));
|
cannam@226
|
1250 while (!*ate_dot && eat_delim(reader, ',')) {
|
cannam@226
|
1251 TRY_RET(read_object(reader, &ctx, true, ate_dot));
|
cannam@226
|
1252 }
|
cannam@226
|
1253 return true;
|
cannam@226
|
1254 }
|
cannam@226
|
1255
|
cannam@226
|
1256 static bool
|
cannam@226
|
1257 read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot)
|
cannam@226
|
1258 {
|
cannam@226
|
1259 uint8_t c;
|
cannam@226
|
1260 while (true) {
|
cannam@226
|
1261 TRY_THROW(read_verb(reader, &ctx.predicate));
|
cannam@226
|
1262 read_ws_star(reader);
|
cannam@226
|
1263
|
cannam@226
|
1264 TRY_THROW(read_objectList(reader, ctx, ate_dot));
|
cannam@226
|
1265 ctx.predicate = pop_node(reader, ctx.predicate);
|
cannam@226
|
1266 if (*ate_dot) {
|
cannam@226
|
1267 return true;
|
cannam@226
|
1268 }
|
cannam@226
|
1269
|
cannam@226
|
1270 bool ate_semi = false;
|
cannam@226
|
1271 do {
|
cannam@226
|
1272 read_ws_star(reader);
|
cannam@226
|
1273 switch (c = peek_byte(reader)) {
|
cannam@226
|
1274 case 0:
|
cannam@226
|
1275 return false;
|
cannam@226
|
1276 case '.': case ']': case '}':
|
cannam@226
|
1277 return true;
|
cannam@226
|
1278 case ';':
|
cannam@226
|
1279 eat_byte_safe(reader, c);
|
cannam@226
|
1280 ate_semi = true;
|
cannam@226
|
1281 }
|
cannam@226
|
1282 } while (c == ';');
|
cannam@226
|
1283
|
cannam@226
|
1284 if (!ate_semi) {
|
cannam@226
|
1285 return r_err(reader, SERD_ERR_BAD_SYNTAX, "missing ';' or '.'\n");
|
cannam@226
|
1286 }
|
cannam@226
|
1287 }
|
cannam@226
|
1288
|
cannam@226
|
1289 pop_node(reader, ctx.predicate);
|
cannam@226
|
1290 return true;
|
cannam@226
|
1291 except:
|
cannam@226
|
1292 pop_node(reader, ctx.predicate);
|
cannam@226
|
1293 return false;
|
cannam@226
|
1294 }
|
cannam@226
|
1295
|
cannam@226
|
1296 static bool
|
cannam@226
|
1297 end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, bool ret)
|
cannam@226
|
1298 {
|
cannam@226
|
1299 pop_node(reader, n2);
|
cannam@226
|
1300 pop_node(reader, n1);
|
cannam@226
|
1301 *ctx.flags &= ~SERD_LIST_CONT;
|
cannam@226
|
1302 return ret && (eat_byte_safe(reader, ')') == ')');
|
cannam@226
|
1303 }
|
cannam@226
|
1304
|
cannam@226
|
1305 static bool
|
cannam@226
|
1306 read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
|
cannam@226
|
1307 {
|
cannam@226
|
1308 eat_byte_safe(reader, '(');
|
cannam@226
|
1309 bool end = peek_delim(reader, ')');
|
cannam@226
|
1310 *dest = end ? reader->rdf_nil : blank_id(reader);
|
cannam@226
|
1311 if (ctx.subject) {
|
cannam@226
|
1312 // subject predicate _:head
|
cannam@226
|
1313 *ctx.flags |= (end ? 0 : SERD_LIST_O_BEGIN);
|
cannam@226
|
1314 TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
|
cannam@226
|
1315 *ctx.flags |= SERD_LIST_CONT;
|
cannam@226
|
1316 } else {
|
cannam@226
|
1317 *ctx.flags |= (end ? 0 : SERD_LIST_S_BEGIN);
|
cannam@226
|
1318 }
|
cannam@226
|
1319
|
cannam@226
|
1320 if (end) {
|
cannam@226
|
1321 return end_collection(reader, ctx, 0, 0, true);
|
cannam@226
|
1322 }
|
cannam@226
|
1323
|
cannam@226
|
1324 /* The order of node allocation here is necessarily not in stack order,
|
cannam@226
|
1325 so we create two nodes and recycle them throughout. */
|
cannam@226
|
1326 Ref n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
|
cannam@226
|
1327 Ref n2 = 0;
|
cannam@226
|
1328 Ref node = n1;
|
cannam@226
|
1329 Ref rest = 0;
|
cannam@226
|
1330
|
cannam@226
|
1331 ctx.subject = *dest;
|
cannam@226
|
1332 while (!(end = peek_delim(reader, ')'))) {
|
cannam@226
|
1333 // _:node rdf:first object
|
cannam@226
|
1334 ctx.predicate = reader->rdf_first;
|
cannam@226
|
1335 bool ate_dot = false;
|
cannam@226
|
1336 if (!read_object(reader, &ctx, true, &ate_dot) || ate_dot) {
|
cannam@226
|
1337 return end_collection(reader, ctx, n1, n2, false);
|
cannam@226
|
1338 }
|
cannam@226
|
1339
|
cannam@226
|
1340 if (!(end = peek_delim(reader, ')'))) {
|
cannam@226
|
1341 /* Give rest a new ID. Done as late as possible to ensure it is
|
cannam@226
|
1342 used and > IDs generated by read_object above. */
|
cannam@226
|
1343 if (!rest) {
|
cannam@226
|
1344 rest = n2 = blank_id(reader); // First pass, push
|
cannam@226
|
1345 } else {
|
cannam@226
|
1346 set_blank_id(reader, rest, genid_size(reader));
|
cannam@226
|
1347 }
|
cannam@226
|
1348 }
|
cannam@226
|
1349
|
cannam@226
|
1350 // _:node rdf:rest _:rest
|
cannam@226
|
1351 *ctx.flags |= SERD_LIST_CONT;
|
cannam@226
|
1352 ctx.predicate = reader->rdf_rest;
|
cannam@226
|
1353 TRY_RET(emit_statement(reader, ctx,
|
cannam@226
|
1354 (end ? reader->rdf_nil : rest), 0, 0));
|
cannam@226
|
1355
|
cannam@226
|
1356 ctx.subject = rest; // _:node = _:rest
|
cannam@226
|
1357 rest = node; // _:rest = (old)_:node
|
cannam@226
|
1358 node = ctx.subject; // invariant
|
cannam@226
|
1359 }
|
cannam@226
|
1360
|
cannam@226
|
1361 return end_collection(reader, ctx, n1, n2, true);
|
cannam@226
|
1362 }
|
cannam@226
|
1363
|
cannam@226
|
1364 static Ref
|
cannam@226
|
1365 read_subject(SerdReader* reader, ReadContext ctx, Ref* dest, char* s_type)
|
cannam@226
|
1366 {
|
cannam@226
|
1367 bool ate_dot = false;
|
cannam@226
|
1368 switch ((*s_type = peek_byte(reader))) {
|
cannam@226
|
1369 case '[':
|
cannam@226
|
1370 read_anon(reader, ctx, true, dest);
|
cannam@226
|
1371 break;
|
cannam@226
|
1372 case '(':
|
cannam@226
|
1373 read_collection(reader, ctx, dest);
|
cannam@226
|
1374 break;
|
cannam@226
|
1375 case '_':
|
cannam@226
|
1376 *dest = read_BLANK_NODE_LABEL(reader, &ate_dot);
|
cannam@226
|
1377 break;
|
cannam@226
|
1378 default:
|
cannam@226
|
1379 TRY_RET(read_iri(reader, dest, &ate_dot));
|
cannam@226
|
1380 }
|
cannam@226
|
1381 return ate_dot ? pop_node(reader, *dest) : *dest;
|
cannam@226
|
1382 }
|
cannam@226
|
1383
|
cannam@226
|
1384 static Ref
|
cannam@226
|
1385 read_labelOrSubject(SerdReader* reader, ReadContext ctx)
|
cannam@226
|
1386 {
|
cannam@226
|
1387 Ref subject = 0;
|
cannam@226
|
1388 bool ate_dot = false;
|
cannam@226
|
1389 switch (peek_byte(reader)) {
|
cannam@226
|
1390 case '[':
|
cannam@226
|
1391 eat_byte_safe(reader, '[');
|
cannam@226
|
1392 read_ws_star(reader);
|
cannam@226
|
1393 TRY_RET(eat_byte_check(reader, ']'));
|
cannam@226
|
1394 return blank_id(reader);
|
cannam@226
|
1395 case '_':
|
cannam@226
|
1396 return read_BLANK_NODE_LABEL(reader, &ate_dot);
|
cannam@226
|
1397 default:
|
cannam@226
|
1398 read_iri(reader, &subject, &ate_dot);
|
cannam@226
|
1399 }
|
cannam@226
|
1400 return subject;
|
cannam@226
|
1401 }
|
cannam@226
|
1402
|
cannam@226
|
1403 static bool
|
cannam@226
|
1404 read_triples(SerdReader* reader, ReadContext ctx, bool* ate_dot)
|
cannam@226
|
1405 {
|
cannam@226
|
1406 bool ret = false;
|
cannam@226
|
1407 if (ctx.subject) {
|
cannam@226
|
1408 read_ws_star(reader);
|
cannam@226
|
1409 switch (peek_byte(reader)) {
|
cannam@226
|
1410 case '.':
|
cannam@226
|
1411 *ate_dot = eat_byte_safe(reader, '.');
|
cannam@226
|
1412 return false;
|
cannam@226
|
1413 case '}':
|
cannam@226
|
1414 return false;
|
cannam@226
|
1415 }
|
cannam@226
|
1416 ret = read_predicateObjectList(reader, ctx, ate_dot);
|
cannam@226
|
1417 }
|
cannam@226
|
1418 ctx.subject = ctx.predicate = 0;
|
cannam@226
|
1419 return ret;
|
cannam@226
|
1420 }
|
cannam@226
|
1421
|
cannam@226
|
1422 static bool
|
cannam@226
|
1423 read_base(SerdReader* reader, bool sparql, bool token)
|
cannam@226
|
1424 {
|
cannam@226
|
1425 if (token) {
|
cannam@226
|
1426 TRY_RET(eat_string(reader, "base", 4));
|
cannam@226
|
1427 }
|
cannam@226
|
1428
|
cannam@226
|
1429 Ref uri;
|
cannam@226
|
1430 read_ws_star(reader);
|
cannam@226
|
1431 TRY_RET(uri = read_IRIREF(reader));
|
cannam@226
|
1432 if (reader->base_sink) {
|
cannam@226
|
1433 reader->base_sink(reader->handle, deref(reader, uri));
|
cannam@226
|
1434 }
|
cannam@226
|
1435 pop_node(reader, uri);
|
cannam@226
|
1436
|
cannam@226
|
1437 read_ws_star(reader);
|
cannam@226
|
1438 if (!sparql) {
|
cannam@226
|
1439 return eat_byte_check(reader, '.');
|
cannam@226
|
1440 } else if (peek_byte(reader) == '.') {
|
cannam@226
|
1441 return r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
1442 "full stop after SPARQL BASE\n");
|
cannam@226
|
1443 }
|
cannam@226
|
1444 return true;
|
cannam@226
|
1445 }
|
cannam@226
|
1446
|
cannam@226
|
1447 static bool
|
cannam@226
|
1448 read_prefixID(SerdReader* reader, bool sparql, bool token)
|
cannam@226
|
1449 {
|
cannam@226
|
1450 if (token) {
|
cannam@226
|
1451 TRY_RET(eat_string(reader, "prefix", 6));
|
cannam@226
|
1452 }
|
cannam@226
|
1453
|
cannam@226
|
1454 read_ws_star(reader);
|
cannam@226
|
1455 bool ret = true;
|
cannam@226
|
1456 Ref name = push_node(reader, SERD_LITERAL, "", 0);
|
cannam@226
|
1457 if (read_PN_PREFIX(reader, name) > SERD_FAILURE) {
|
cannam@226
|
1458 return pop_node(reader, name);
|
cannam@226
|
1459 }
|
cannam@226
|
1460
|
cannam@226
|
1461 if (eat_byte_check(reader, ':') != ':') {
|
cannam@226
|
1462 return pop_node(reader, name);
|
cannam@226
|
1463 }
|
cannam@226
|
1464
|
cannam@226
|
1465 read_ws_star(reader);
|
cannam@226
|
1466 const Ref uri = read_IRIREF(reader);
|
cannam@226
|
1467 if (!uri) {
|
cannam@226
|
1468 pop_node(reader, name);
|
cannam@226
|
1469 return false;
|
cannam@226
|
1470 }
|
cannam@226
|
1471
|
cannam@226
|
1472 if (reader->prefix_sink) {
|
cannam@226
|
1473 ret = !reader->prefix_sink(reader->handle,
|
cannam@226
|
1474 deref(reader, name),
|
cannam@226
|
1475 deref(reader, uri));
|
cannam@226
|
1476 }
|
cannam@226
|
1477 pop_node(reader, uri);
|
cannam@226
|
1478 pop_node(reader, name);
|
cannam@226
|
1479 if (!sparql) {
|
cannam@226
|
1480 read_ws_star(reader);
|
cannam@226
|
1481 return eat_byte_check(reader, '.');
|
cannam@226
|
1482 }
|
cannam@226
|
1483 return ret;
|
cannam@226
|
1484 }
|
cannam@226
|
1485
|
cannam@226
|
1486 static bool
|
cannam@226
|
1487 read_directive(SerdReader* reader)
|
cannam@226
|
1488 {
|
cannam@226
|
1489 const bool sparql = peek_byte(reader) != '@';
|
cannam@226
|
1490 if (!sparql) {
|
cannam@226
|
1491 eat_byte_safe(reader, '@');
|
cannam@226
|
1492 switch (peek_byte(reader)) {
|
cannam@226
|
1493 case 'B': case 'P':
|
cannam@226
|
1494 return r_err(reader, SERD_ERR_BAD_SYNTAX,
|
cannam@226
|
1495 "uppercase directive\n");
|
cannam@226
|
1496 }
|
cannam@226
|
1497 }
|
cannam@226
|
1498
|
cannam@226
|
1499 switch (peek_byte(reader)) {
|
cannam@226
|
1500 case 'B': case 'b': return read_base(reader, sparql, true);
|
cannam@226
|
1501 case 'P': case 'p': return read_prefixID(reader, sparql, true);
|
cannam@226
|
1502 default:
|
cannam@226
|
1503 return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid directive\n");
|
cannam@226
|
1504 }
|
cannam@226
|
1505
|
cannam@226
|
1506 return true;
|
cannam@226
|
1507 }
|
cannam@226
|
1508
|
cannam@226
|
1509 static bool
|
cannam@226
|
1510 read_wrappedGraph(SerdReader* reader, ReadContext* ctx)
|
cannam@226
|
1511 {
|
cannam@226
|
1512 bool ate_dot = false;
|
cannam@226
|
1513 char s_type = 0;
|
cannam@226
|
1514 TRY_RET(eat_byte_check(reader, '{'));
|
cannam@226
|
1515 read_ws_star(reader);
|
cannam@226
|
1516 while (peek_byte(reader) != '}') {
|
cannam@226
|
1517 ctx->subject = 0;
|
cannam@226
|
1518 Ref subj = read_subject(reader, *ctx, &ctx->subject, &s_type);
|
cannam@226
|
1519 if (!subj ||
|
cannam@226
|
1520 (!read_triples(reader, *ctx, &ate_dot) && s_type != '[')) {
|
cannam@226
|
1521 return false;
|
cannam@226
|
1522 }
|
cannam@226
|
1523 pop_node(reader, subj);
|
cannam@226
|
1524 read_ws_star(reader);
|
cannam@226
|
1525 if (peek_byte(reader) == '.') {
|
cannam@226
|
1526 eat_byte_safe(reader, '.');
|
cannam@226
|
1527 }
|
cannam@226
|
1528 read_ws_star(reader);
|
cannam@226
|
1529 }
|
cannam@226
|
1530 return eat_byte_check(reader, '}');
|
cannam@226
|
1531 }
|
cannam@226
|
1532
|
cannam@226
|
1533 static int
|
cannam@226
|
1534 tokcmp(SerdReader* reader, Ref ref, const char* tok, size_t n)
|
cannam@226
|
1535 {
|
cannam@226
|
1536 SerdNode* node = deref(reader, ref);
|
cannam@226
|
1537 if (!node || node->n_bytes != n) {
|
cannam@226
|
1538 return -1;
|
cannam@226
|
1539 }
|
cannam@226
|
1540 const char* s1 = (const char*)node->buf;
|
cannam@226
|
1541 const char* s2 = tok;
|
cannam@226
|
1542 for (; n > 0 && *s2; s1++, s2++, --n) {
|
cannam@226
|
1543 if (toupper(*s1) != toupper(*s2)) {
|
cannam@226
|
1544 return ((*(uint8_t*)s1 < *(uint8_t*)s2) ? -1 : +1);
|
cannam@226
|
1545 }
|
cannam@226
|
1546 }
|
cannam@226
|
1547 return 0;
|
cannam@226
|
1548 }
|
cannam@226
|
1549
|
cannam@226
|
1550 static bool
|
cannam@226
|
1551 read_statement(SerdReader* reader)
|
cannam@226
|
1552 {
|
cannam@226
|
1553 SerdStatementFlags flags = 0;
|
cannam@226
|
1554 ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags };
|
cannam@226
|
1555 Ref subj = 0;
|
cannam@226
|
1556 bool ate_dot = false;
|
cannam@226
|
1557 char s_type = false;
|
cannam@226
|
1558 bool ret = true;
|
cannam@226
|
1559 read_ws_star(reader);
|
cannam@226
|
1560 switch (peek_byte(reader)) {
|
cannam@226
|
1561 case '\0':
|
cannam@226
|
1562 reader->eof = true;
|
cannam@226
|
1563 return reader->status <= SERD_FAILURE;
|
cannam@226
|
1564 case '@':
|
cannam@226
|
1565 TRY_RET(read_directive(reader));
|
cannam@226
|
1566 read_ws_star(reader);
|
cannam@226
|
1567 break;
|
cannam@226
|
1568 case '{':
|
cannam@226
|
1569 if (reader->syntax == SERD_TRIG) {
|
cannam@226
|
1570 TRY_RET(read_wrappedGraph(reader, &ctx));
|
cannam@226
|
1571 read_ws_star(reader);
|
cannam@226
|
1572 } else {
|
cannam@226
|
1573 return r_err(reader, SERD_ERR_BAD_SYNTAX, "graph in Turtle\n");
|
cannam@226
|
1574 }
|
cannam@226
|
1575 break;
|
cannam@226
|
1576 default:
|
cannam@226
|
1577 subj = read_subject(reader, ctx, &ctx.subject, &s_type);
|
cannam@226
|
1578 if (!tokcmp(reader, ctx.subject, "base", 4)) {
|
cannam@226
|
1579 ret = read_base(reader, true, false);
|
cannam@226
|
1580 } else if (!tokcmp(reader, ctx.subject, "prefix", 6)) {
|
cannam@226
|
1581 ret = read_prefixID(reader, true, false);
|
cannam@226
|
1582 } else if (!tokcmp(reader, ctx.subject, "graph", 5)) {
|
cannam@226
|
1583 read_ws_star(reader);
|
cannam@226
|
1584 TRY_RET((ctx.graph = read_labelOrSubject(reader, ctx)));
|
cannam@226
|
1585 read_ws_star(reader);
|
cannam@226
|
1586 TRY_RET(read_wrappedGraph(reader, &ctx));
|
cannam@226
|
1587 read_ws_star(reader);
|
cannam@226
|
1588 } else if (read_ws_star(reader) && peek_byte(reader) == '{') {
|
cannam@226
|
1589 if (s_type == '(' || (s_type == '[' && !*ctx.flags)) {
|
cannam@226
|
1590 return false; // invalid graph with complex label
|
cannam@226
|
1591 }
|
cannam@226
|
1592 ctx.graph = subj;
|
cannam@226
|
1593 ctx.subject = subj = 0;
|
cannam@226
|
1594 TRY_RET(read_wrappedGraph(reader, &ctx));
|
cannam@226
|
1595 read_ws_star(reader);
|
cannam@226
|
1596 } else if (!subj) {
|
cannam@226
|
1597 ret = r_err(reader, SERD_ERR_BAD_SYNTAX, "bad subject\n");
|
cannam@226
|
1598 } else if (!read_triples(reader, ctx, &ate_dot)) {
|
cannam@226
|
1599 ret = (s_type == '[');
|
cannam@226
|
1600 } else if (!ate_dot) {
|
cannam@226
|
1601 read_ws_star(reader);
|
cannam@226
|
1602 ret = (eat_byte_check(reader, '.') == '.');
|
cannam@226
|
1603 }
|
cannam@226
|
1604 pop_node(reader, subj);
|
cannam@226
|
1605 break;
|
cannam@226
|
1606 }
|
cannam@226
|
1607 return ret;
|
cannam@226
|
1608 }
|
cannam@226
|
1609
|
cannam@226
|
1610 static bool
|
cannam@226
|
1611 read_turtleDoc(SerdReader* reader)
|
cannam@226
|
1612 {
|
cannam@226
|
1613 while (!reader->eof) {
|
cannam@226
|
1614 TRY_RET(read_statement(reader));
|
cannam@226
|
1615 }
|
cannam@226
|
1616 return reader->status <= SERD_FAILURE;
|
cannam@226
|
1617 }
|
cannam@226
|
1618
|
cannam@226
|
1619 static bool
|
cannam@226
|
1620 read_trigDoc(SerdReader* reader)
|
cannam@226
|
1621 {
|
cannam@226
|
1622 while (!reader->eof) {
|
cannam@226
|
1623 TRY_RET(read_statement(reader));
|
cannam@226
|
1624 }
|
cannam@226
|
1625 return reader->status <= SERD_FAILURE;
|
cannam@226
|
1626 }
|
cannam@226
|
1627
|
cannam@226
|
1628 static bool
|
cannam@226
|
1629 read_nquadsDoc(SerdReader* reader)
|
cannam@226
|
1630 {
|
cannam@226
|
1631 while (!reader->eof) {
|
cannam@226
|
1632 SerdStatementFlags flags = 0;
|
cannam@226
|
1633 ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags };
|
cannam@226
|
1634 bool ate_dot = false;
|
cannam@226
|
1635 char s_type = false;
|
cannam@226
|
1636 read_ws_star(reader);
|
cannam@226
|
1637 if (peek_byte(reader) == '\0') {
|
cannam@226
|
1638 reader->eof = true;
|
cannam@226
|
1639 break;
|
cannam@226
|
1640 }
|
cannam@226
|
1641
|
cannam@226
|
1642 // subject predicate object
|
cannam@226
|
1643 if (!(ctx.subject = read_subject(reader, ctx, &ctx.subject, &s_type)) ||
|
cannam@226
|
1644 !read_ws_star(reader) ||
|
cannam@226
|
1645 !(ctx.predicate = read_IRIREF(reader)) ||
|
cannam@226
|
1646 !read_ws_star(reader) ||
|
cannam@226
|
1647 !read_object(reader, &ctx, false, &ate_dot)) {
|
cannam@226
|
1648 return false;
|
cannam@226
|
1649 }
|
cannam@226
|
1650
|
cannam@226
|
1651 if (!ate_dot) { // graphLabel?
|
cannam@226
|
1652 TRY_RET(read_ws_star(reader));
|
cannam@226
|
1653 switch (peek_byte(reader)) {
|
cannam@226
|
1654 case '.':
|
cannam@226
|
1655 break;
|
cannam@226
|
1656 case '_':
|
cannam@226
|
1657 ctx.graph = read_BLANK_NODE_LABEL(reader, &ate_dot);
|
cannam@226
|
1658 break;
|
cannam@226
|
1659 default:
|
cannam@226
|
1660 if (!(ctx.graph = read_IRIREF(reader))) {
|
cannam@226
|
1661 return false;
|
cannam@226
|
1662 }
|
cannam@226
|
1663 }
|
cannam@226
|
1664
|
cannam@226
|
1665 // Terminating '.'
|
cannam@226
|
1666 TRY_RET(read_ws_star(reader));
|
cannam@226
|
1667 eat_byte_check(reader, '.');
|
cannam@226
|
1668 }
|
cannam@226
|
1669
|
cannam@226
|
1670 TRY_RET(emit_statement(reader, ctx, ctx.object, ctx.datatype, ctx.lang));
|
cannam@226
|
1671 pop_node(reader, ctx.graph);
|
cannam@226
|
1672 pop_node(reader, ctx.lang);
|
cannam@226
|
1673 pop_node(reader, ctx.datatype);
|
cannam@226
|
1674 pop_node(reader, ctx.object);
|
cannam@226
|
1675 }
|
cannam@226
|
1676 return reader->status <= SERD_FAILURE;
|
cannam@226
|
1677 }
|
cannam@226
|
1678
|
cannam@226
|
1679 static bool
|
cannam@226
|
1680 read_doc(SerdReader* reader)
|
cannam@226
|
1681 {
|
cannam@226
|
1682 switch (reader->syntax) {
|
cannam@226
|
1683 case SERD_NQUADS: return read_nquadsDoc(reader);
|
cannam@226
|
1684 case SERD_TRIG: return read_trigDoc(reader);
|
cannam@226
|
1685 default: return read_turtleDoc(reader);
|
cannam@226
|
1686 }
|
cannam@226
|
1687 }
|
cannam@226
|
1688
|
cannam@226
|
1689 SERD_API
|
cannam@226
|
1690 SerdReader*
|
cannam@226
|
1691 serd_reader_new(SerdSyntax syntax,
|
cannam@226
|
1692 void* handle,
|
cannam@226
|
1693 void (*free_handle)(void*),
|
cannam@226
|
1694 SerdBaseSink base_sink,
|
cannam@226
|
1695 SerdPrefixSink prefix_sink,
|
cannam@226
|
1696 SerdStatementSink statement_sink,
|
cannam@226
|
1697 SerdEndSink end_sink)
|
cannam@226
|
1698 {
|
cannam@226
|
1699 const Cursor cur = { NULL, 0, 0 };
|
cannam@226
|
1700 SerdReader* me = (SerdReader*)calloc(1, sizeof(SerdReader));
|
cannam@226
|
1701 me->handle = handle;
|
cannam@226
|
1702 me->free_handle = free_handle;
|
cannam@226
|
1703 me->base_sink = base_sink;
|
cannam@226
|
1704 me->prefix_sink = prefix_sink;
|
cannam@226
|
1705 me->statement_sink = statement_sink;
|
cannam@226
|
1706 me->end_sink = end_sink;
|
cannam@226
|
1707 me->default_graph = SERD_NODE_NULL;
|
cannam@226
|
1708 me->stack = serd_stack_new(SERD_PAGE_SIZE);
|
cannam@226
|
1709 me->syntax = syntax;
|
cannam@226
|
1710 me->cur = cur;
|
cannam@226
|
1711 me->next_id = 1;
|
cannam@226
|
1712
|
cannam@226
|
1713 me->rdf_first = push_node(me, SERD_URI, NS_RDF "first", 48);
|
cannam@226
|
1714 me->rdf_rest = push_node(me, SERD_URI, NS_RDF "rest", 47);
|
cannam@226
|
1715 me->rdf_nil = push_node(me, SERD_URI, NS_RDF "nil", 46);
|
cannam@226
|
1716
|
cannam@226
|
1717 return me;
|
cannam@226
|
1718 }
|
cannam@226
|
1719
|
cannam@226
|
1720 SERD_API
|
cannam@226
|
1721 void
|
cannam@226
|
1722 serd_reader_set_strict(SerdReader* reader, bool strict)
|
cannam@226
|
1723 {
|
cannam@226
|
1724 reader->strict = strict;
|
cannam@226
|
1725 }
|
cannam@226
|
1726
|
cannam@226
|
1727 SERD_API
|
cannam@226
|
1728 void
|
cannam@226
|
1729 serd_reader_set_error_sink(SerdReader* reader,
|
cannam@226
|
1730 SerdErrorSink error_sink,
|
cannam@226
|
1731 void* error_handle)
|
cannam@226
|
1732 {
|
cannam@226
|
1733 reader->error_sink = error_sink;
|
cannam@226
|
1734 reader->error_handle = error_handle;
|
cannam@226
|
1735 }
|
cannam@226
|
1736
|
cannam@226
|
1737 SERD_API
|
cannam@226
|
1738 void
|
cannam@226
|
1739 serd_reader_free(SerdReader* reader)
|
cannam@226
|
1740 {
|
cannam@226
|
1741 pop_node(reader, reader->rdf_nil);
|
cannam@226
|
1742 pop_node(reader, reader->rdf_rest);
|
cannam@226
|
1743 pop_node(reader, reader->rdf_first);
|
cannam@226
|
1744 serd_node_free(&reader->default_graph);
|
cannam@226
|
1745
|
cannam@226
|
1746 #ifdef SERD_STACK_CHECK
|
cannam@226
|
1747 free(reader->allocs);
|
cannam@226
|
1748 #endif
|
cannam@226
|
1749 free(reader->stack.buf);
|
cannam@226
|
1750 free(reader->bprefix);
|
cannam@226
|
1751 if (reader->free_handle) {
|
cannam@226
|
1752 reader->free_handle(reader->handle);
|
cannam@226
|
1753 }
|
cannam@226
|
1754 free(reader);
|
cannam@226
|
1755 }
|
cannam@226
|
1756
|
cannam@226
|
1757 SERD_API
|
cannam@226
|
1758 void*
|
cannam@226
|
1759 serd_reader_get_handle(const SerdReader* reader)
|
cannam@226
|
1760 {
|
cannam@226
|
1761 return reader->handle;
|
cannam@226
|
1762 }
|
cannam@226
|
1763
|
cannam@226
|
1764 SERD_API
|
cannam@226
|
1765 void
|
cannam@226
|
1766 serd_reader_add_blank_prefix(SerdReader* reader,
|
cannam@226
|
1767 const uint8_t* prefix)
|
cannam@226
|
1768 {
|
cannam@226
|
1769 free(reader->bprefix);
|
cannam@226
|
1770 reader->bprefix_len = 0;
|
cannam@226
|
1771 reader->bprefix = NULL;
|
cannam@226
|
1772 if (prefix) {
|
cannam@226
|
1773 reader->bprefix_len = strlen((const char*)prefix);
|
cannam@226
|
1774 reader->bprefix = (uint8_t*)malloc(reader->bprefix_len + 1);
|
cannam@226
|
1775 memcpy(reader->bprefix, prefix, reader->bprefix_len + 1);
|
cannam@226
|
1776 }
|
cannam@226
|
1777 }
|
cannam@226
|
1778
|
cannam@226
|
1779 SERD_API
|
cannam@226
|
1780 void
|
cannam@226
|
1781 serd_reader_set_default_graph(SerdReader* reader,
|
cannam@226
|
1782 const SerdNode* graph)
|
cannam@226
|
1783 {
|
cannam@226
|
1784 serd_node_free(&reader->default_graph);
|
cannam@226
|
1785 reader->default_graph = serd_node_copy(graph);
|
cannam@226
|
1786 }
|
cannam@226
|
1787
|
cannam@226
|
1788 SERD_API
|
cannam@226
|
1789 SerdStatus
|
cannam@226
|
1790 serd_reader_read_file(SerdReader* reader,
|
cannam@226
|
1791 const uint8_t* uri)
|
cannam@226
|
1792 {
|
cannam@226
|
1793 uint8_t* const path = serd_file_uri_parse(uri, NULL);
|
cannam@226
|
1794 if (!path) {
|
cannam@226
|
1795 return SERD_ERR_BAD_ARG;
|
cannam@226
|
1796 }
|
cannam@226
|
1797
|
cannam@226
|
1798 FILE* fd = serd_fopen((const char*)path, "r");
|
cannam@226
|
1799 if (!fd) {
|
cannam@226
|
1800 free(path);
|
cannam@226
|
1801 return SERD_ERR_UNKNOWN;
|
cannam@226
|
1802 }
|
cannam@226
|
1803
|
cannam@226
|
1804 SerdStatus ret = serd_reader_read_file_handle(reader, fd, path);
|
cannam@226
|
1805 fclose(fd);
|
cannam@226
|
1806 free(path);
|
cannam@226
|
1807 return ret;
|
cannam@226
|
1808 }
|
cannam@226
|
1809
|
cannam@226
|
1810 static bool
|
cannam@226
|
1811 skip_bom(SerdReader* me)
|
cannam@226
|
1812 {
|
cannam@226
|
1813 if (peek_byte(me) == 0xEF) {
|
cannam@226
|
1814 eat_byte_safe(me, 0xEF);
|
cannam@226
|
1815 if (eat_byte_check(me, 0xBB) != 0xBB ||
|
cannam@226
|
1816 eat_byte_check(me, 0xBF) != 0xBF) {
|
cannam@226
|
1817 return r_err(me, SERD_ERR_BAD_SYNTAX, "corrupt byte order mark\n");
|
cannam@226
|
1818 }
|
cannam@226
|
1819 }
|
cannam@226
|
1820
|
cannam@226
|
1821 return true;
|
cannam@226
|
1822 }
|
cannam@226
|
1823
|
cannam@226
|
1824 SERD_API
|
cannam@226
|
1825 SerdStatus
|
cannam@226
|
1826 serd_reader_start_stream(SerdReader* me,
|
cannam@226
|
1827 FILE* file,
|
cannam@226
|
1828 const uint8_t* name,
|
cannam@226
|
1829 bool bulk)
|
cannam@226
|
1830 {
|
cannam@226
|
1831 return serd_reader_start_source_stream(
|
cannam@226
|
1832 me,
|
cannam@226
|
1833 bulk ? (SerdSource)fread : serd_file_read_byte,
|
cannam@226
|
1834 (SerdStreamErrorFunc)ferror,
|
cannam@226
|
1835 file,
|
cannam@226
|
1836 name,
|
cannam@226
|
1837 bulk ? SERD_PAGE_SIZE : 1);
|
cannam@226
|
1838 }
|
cannam@226
|
1839
|
cannam@226
|
1840 SERD_API
|
cannam@226
|
1841 SerdStatus
|
cannam@226
|
1842 serd_reader_start_source_stream(SerdReader* me,
|
cannam@226
|
1843 SerdSource read_func,
|
cannam@226
|
1844 SerdStreamErrorFunc error_func,
|
cannam@226
|
1845 void* stream,
|
cannam@226
|
1846 const uint8_t* name,
|
cannam@226
|
1847 size_t page_size)
|
cannam@226
|
1848 {
|
cannam@226
|
1849 const Cursor cur = { name, 1, 1 };
|
cannam@226
|
1850 me->cur = cur;
|
cannam@226
|
1851
|
cannam@226
|
1852 return serd_byte_source_open_source(
|
cannam@226
|
1853 &me->source, read_func, error_func, stream, page_size);
|
cannam@226
|
1854 }
|
cannam@226
|
1855
|
cannam@226
|
1856 static SerdStatus
|
cannam@226
|
1857 serd_reader_prepare(SerdReader* me)
|
cannam@226
|
1858 {
|
cannam@226
|
1859 me->eof = false;
|
cannam@226
|
1860 if ((me->status = serd_byte_source_prepare(&me->source))) {
|
cannam@226
|
1861 r_err(me, me->status, "read error: %s\n", strerror(errno));
|
cannam@226
|
1862 } else if (!skip_bom(me)) {
|
cannam@226
|
1863 me->status = SERD_ERR_BAD_SYNTAX;
|
cannam@226
|
1864 }
|
cannam@226
|
1865 return me->status;
|
cannam@226
|
1866 }
|
cannam@226
|
1867
|
cannam@226
|
1868 SERD_API
|
cannam@226
|
1869 SerdStatus
|
cannam@226
|
1870 serd_reader_read_chunk(SerdReader* me)
|
cannam@226
|
1871 {
|
cannam@226
|
1872 SerdStatus st = SERD_SUCCESS;
|
cannam@226
|
1873 if (!me->source.prepared) {
|
cannam@226
|
1874 if ((st = serd_reader_prepare(me))) {
|
cannam@226
|
1875 return st;
|
cannam@226
|
1876 }
|
cannam@226
|
1877 } else if (me->eof) {
|
cannam@226
|
1878 me->eof = false;
|
cannam@226
|
1879 if ((st = serd_byte_source_advance(&me->source))) {
|
cannam@226
|
1880 return st;
|
cannam@226
|
1881 }
|
cannam@226
|
1882 }
|
cannam@226
|
1883 return read_statement(me) ? SERD_SUCCESS : SERD_FAILURE;
|
cannam@226
|
1884 }
|
cannam@226
|
1885
|
cannam@226
|
1886 SERD_API
|
cannam@226
|
1887 SerdStatus
|
cannam@226
|
1888 serd_reader_end_stream(SerdReader* me)
|
cannam@226
|
1889 {
|
cannam@226
|
1890 return serd_byte_source_close(&me->source);
|
cannam@226
|
1891 }
|
cannam@226
|
1892
|
cannam@226
|
1893 SERD_API
|
cannam@226
|
1894 SerdStatus
|
cannam@226
|
1895 serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name)
|
cannam@226
|
1896 {
|
cannam@226
|
1897 return serd_reader_read_source(
|
cannam@226
|
1898 me, (SerdSource)fread, (SerdStreamErrorFunc)ferror,
|
cannam@226
|
1899 file, name, SERD_PAGE_SIZE);
|
cannam@226
|
1900 }
|
cannam@226
|
1901
|
cannam@226
|
1902 SERD_API
|
cannam@226
|
1903 SerdStatus
|
cannam@226
|
1904 serd_reader_read_source(SerdReader* me,
|
cannam@226
|
1905 SerdSource source,
|
cannam@226
|
1906 SerdStreamErrorFunc error,
|
cannam@226
|
1907 void* stream,
|
cannam@226
|
1908 const uint8_t* name,
|
cannam@226
|
1909 size_t page_size)
|
cannam@226
|
1910 {
|
cannam@226
|
1911 SerdStatus st = serd_reader_start_source_stream(
|
cannam@226
|
1912 me, source, error, stream, name, page_size);
|
cannam@226
|
1913
|
cannam@226
|
1914 if ((st = serd_reader_prepare(me))) {
|
cannam@226
|
1915 serd_reader_end_stream(me);
|
cannam@226
|
1916 return st;
|
cannam@226
|
1917 } else if (!read_doc(me)) {
|
cannam@226
|
1918 serd_reader_end_stream(me);
|
cannam@226
|
1919 return SERD_ERR_UNKNOWN;
|
cannam@226
|
1920 }
|
cannam@226
|
1921
|
cannam@226
|
1922 return serd_reader_end_stream(me);
|
cannam@226
|
1923 }
|
cannam@226
|
1924
|
cannam@226
|
1925 SERD_API
|
cannam@226
|
1926 SerdStatus
|
cannam@226
|
1927 serd_reader_read_string(SerdReader* me, const uint8_t* utf8)
|
cannam@226
|
1928 {
|
cannam@226
|
1929 const Cursor cur = { (const uint8_t*)"(string)", 1, 1 };
|
cannam@226
|
1930
|
cannam@226
|
1931 serd_byte_source_open_string(&me->source, utf8);
|
cannam@226
|
1932 me->cur = cur;
|
cannam@226
|
1933 me->eof = false;
|
cannam@226
|
1934
|
cannam@226
|
1935 SerdStatus st = serd_reader_prepare(me);
|
cannam@226
|
1936 if (!st) {
|
cannam@226
|
1937 st = read_doc(me) ? SERD_SUCCESS : SERD_ERR_UNKNOWN;
|
cannam@226
|
1938 }
|
cannam@226
|
1939
|
cannam@226
|
1940 serd_byte_source_close(&me->source);
|
cannam@226
|
1941
|
cannam@226
|
1942 return st;
|
cannam@226
|
1943 }
|