comparison ext/serd/src/reader.c @ 226:c5cdc9e6a4bf

Add these external library files
author Chris Cannam <cannam@all-day-breakfast.com>
date Fri, 09 Jun 2017 16:41:31 +0100
parents
children
comparison
equal deleted inserted replaced
225:025b3e2f7c17 226:c5cdc9e6a4bf
1 /*
2 Copyright 2011-2017 David Robillard <http://drobilla.net>
3
4 Permission to use, copy, modify, and/or distribute this software for any
5 purpose with or without fee is hereby granted, provided that the above
6 copyright notice and this permission notice appear in all copies.
7
8 THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include "serd_internal.h"
18
19 #include <assert.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdarg.h>
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27
/* Namespace prefixes of the XSD and RDF vocabularies. */
#define NS_XSD "http://www.w3.org/2001/XMLSchema#"
#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

/* Error propagation helpers.

   Each of these expands to a complete `if' statement, including its own
   trailing `;'.  Do not use them as the unbraced body of an `if' that has an
   `else'.  TRY_THROW jumps to an `except:' label which the enclosing function
   must provide; TRY_RET returns 0 from the enclosing function.
*/
#define TRY_THROW(exp) if (!(exp)) goto except;
#define TRY_RET(exp)   if (!(exp)) return 0;

/* When SERD_STACK_CHECK is defined, assert that `ref' is the most recently
   recorded push on the reader's stack; otherwise this expands to nothing.
   The arguments are parenthesised so any expression may be passed safely.
*/
#ifdef SERD_STACK_CHECK
#    define SERD_STACK_ASSERT_TOP(reader, ref) \
        assert((ref) == (reader)->allocs[(reader)->n_allocs - 1]);
#else
#    define SERD_STACK_ASSERT_TOP(reader, ref)
#endif
40
/** Position in the input, attached to every error that is reported. */
typedef struct {
    const uint8_t* filename;  ///< Name reported as the source of an error
    unsigned       line;      ///< Incremented whenever a '\n' is consumed
    unsigned       col;       ///< Reset to 0 on '\n', otherwise incremented
} Cursor;
46
/* 32-bit character value (presumably a Unicode code point; not referenced in
   the part of the file reviewed here). */
typedef uint32_t uchar;

/* Handle for a node on the reader's stack.

   A Ref is a byte offset from the start of the stack buffer, not a pointer:
   the stack may be reallocated, which would invalidate any pointer to one of
   its elements.  The value 0 is used to mean "no node".
*/
typedef size_t Ref;
53
54 typedef struct {
55 Ref graph;
56 Ref subject;
57 Ref predicate;
58 Ref object;
59 Ref datatype;
60 Ref lang;
61 SerdStatementFlags* flags;
62 } ReadContext;
63
64 struct SerdReaderImpl {
65 void* handle;
66 void (*free_handle)(void* ptr);
67 SerdBaseSink base_sink;
68 SerdPrefixSink prefix_sink;
69 SerdStatementSink statement_sink;
70 SerdEndSink end_sink;
71 SerdErrorSink error_sink;
72 void* error_handle;
73 Ref rdf_first;
74 Ref rdf_rest;
75 Ref rdf_nil;
76 SerdNode default_graph;
77 SerdByteSource source;
78 SerdStack stack;
79 SerdSyntax syntax;
80 unsigned next_id;
81 Cursor cur;
82 SerdStatus status;
83 uint8_t* buf;
84 uint8_t* bprefix;
85 size_t bprefix_len;
86 bool strict; ///< True iff strict parsing
87 bool eof;
88 bool seen_genid;
89 #ifdef SERD_STACK_CHECK
90 Ref* allocs; ///< Stack of push offsets
91 size_t n_allocs; ///< Number of stack pushes
92 #endif
93 };
94
95 static inline bool
96 supports_fancy_literals(const SerdReader* reader)
97 {
98 return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
99 }
100
101 static inline bool
102 supports_relative_iris(const SerdReader* reader)
103 {
104 return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
105 }
106
107 static int
108 r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...)
109 {
110 va_list args;
111 va_start(args, fmt);
112 const SerdError e = {
113 st, reader->cur.filename, reader->cur.line, reader->cur.col, fmt, &args
114 };
115 serd_error(reader->error_sink, reader->error_handle, &e);
116 va_end(args);
117 return 0;
118 }
119
/** fread-like wrapper for getc (which is faster).

    Reads a single byte from the FILE* `stream` into `buf`.  The `size` and
    `nmemb` parameters exist only to match the fread signature and are ignored.

    @return 1 if a byte was read, or 0 at end of file or on error, in which
    case a 0 byte is stored in `buf`.
*/
static size_t
serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
{
    uint8_t* const out = (uint8_t*)buf;
    const int      c   = getc((FILE*)stream);
    if (c != EOF) {
        *out = (uint8_t)c;
        return 1;
    }

    *out = 0;
    return 0;
}
132
133 static inline uint8_t
134 peek_byte(SerdReader* reader)
135 {
136 return serd_byte_source_peek(&reader->source);
137 }
138
139 static inline uint8_t
140 eat_byte_safe(SerdReader* reader, const uint8_t byte)
141 {
142 assert(peek_byte(reader) == byte);
143 switch (byte) {
144 case '\0': reader->eof = (byte != '\0'); break;
145 case '\n': ++reader->cur.line; reader->cur.col = 0; break;
146 default: ++reader->cur.col;
147 }
148
149 reader->status = serd_byte_source_advance(&reader->source);
150 return byte;
151 }
152
153 static inline uint8_t
154 eat_byte_check(SerdReader* reader, const uint8_t byte)
155 {
156 const uint8_t c = peek_byte(reader);
157 if (c != byte) {
158 return r_err(reader, SERD_ERR_BAD_SYNTAX,
159 "expected `%c', not `%c'\n", byte, c);
160 }
161 return eat_byte_safe(reader, byte);
162 }
163
164 static inline bool
165 eat_string(SerdReader* reader, const char* str, unsigned n)
166 {
167 bool bad = false;
168 for (unsigned i = 0; i < n; ++i) {
169 bad |= eat_byte_check(reader, ((const uint8_t*)str)[i]);
170 }
171 return bad;
172 }
173
174 static Ref
175 push_node_padded(SerdReader* reader, size_t maxlen,
176 SerdType type, const char* str, size_t n_bytes)
177 {
178 void* mem = serd_stack_push_aligned(
179 &reader->stack, sizeof(SerdNode) + maxlen + 1, sizeof(SerdNode));
180
181 SerdNode* const node = (SerdNode*)mem;
182 node->n_bytes = node->n_chars = n_bytes;
183 node->flags = 0;
184 node->type = type;
185 node->buf = NULL;
186
187 uint8_t* buf = (uint8_t*)(node + 1);
188 memcpy(buf, str, n_bytes + 1);
189
190 #ifdef SERD_STACK_CHECK
191 reader->allocs = realloc(
192 reader->allocs, sizeof(uint8_t*) * (++reader->n_allocs));
193 reader->allocs[reader->n_allocs - 1] = ((uint8_t*)mem - reader->stack.buf);
194 #endif
195 return (uint8_t*)node - reader->stack.buf;
196 }
197
198 static Ref
199 push_node(SerdReader* reader, SerdType type, const char* str, size_t n_bytes)
200 {
201 return push_node_padded(reader, n_bytes, type, str, n_bytes);
202 }
203
204 static inline SerdNode*
205 deref(SerdReader* reader, const Ref ref)
206 {
207 if (ref) {
208 SerdNode* node = (SerdNode*)(reader->stack.buf + ref);
209 node->buf = (uint8_t*)node + sizeof(SerdNode);
210 return node;
211 }
212 return NULL;
213 }
214
215 static inline void
216 push_byte(SerdReader* reader, Ref ref, const uint8_t c)
217 {
218 SERD_STACK_ASSERT_TOP(reader, ref);
219 uint8_t* const s = serd_stack_push(&reader->stack, 1);
220 SerdNode* const node = (SerdNode*)(reader->stack.buf + ref);
221 ++node->n_bytes;
222 if (!(c & 0x80)) { // Starts with 0 bit, start of new character
223 ++node->n_chars;
224 }
225 *(s - 1) = c;
226 *s = '\0';
227 }
228
229 static inline void
230 push_replacement(SerdReader* reader, Ref dest)
231 {
232 push_byte(reader, dest, 0xEF);
233 push_byte(reader, dest, 0xBF);
234 push_byte(reader, dest, 0xBD);
235 }
236
237 static Ref
238 pop_node(SerdReader* reader, Ref ref)
239 {
240 if (ref && ref != reader->rdf_first && ref != reader->rdf_rest
241 && ref != reader->rdf_nil) {
242 #ifdef SERD_STACK_CHECK
243 SERD_STACK_ASSERT_TOP(reader, ref);
244 --reader->n_allocs;
245 #endif
246 SerdNode* const node = deref(reader, ref);
247 uint8_t* const top = reader->stack.buf + reader->stack.size;
248 serd_stack_pop_aligned(&reader->stack, top - (uint8_t*)node);
249 }
250 return 0;
251 }
252
253 static inline bool
254 emit_statement(SerdReader* reader, ReadContext ctx, Ref o, Ref d, Ref l)
255 {
256 SerdNode* graph = deref(reader, ctx.graph);
257 if (!graph && reader->default_graph.buf) {
258 graph = &reader->default_graph;
259 }
260 bool ret = !reader->statement_sink ||
261 !reader->statement_sink(
262 reader->handle, *ctx.flags, graph,
263 deref(reader, ctx.subject), deref(reader, ctx.predicate),
264 deref(reader, o), deref(reader, d), deref(reader, l));
265 *ctx.flags &= SERD_ANON_CONT|SERD_LIST_CONT; // Preserve only cont flags
266 return ret;
267 }
268
269 static bool
270 read_collection(SerdReader* reader, ReadContext ctx, Ref* dest);
271
272 static bool
273 read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot);
274
275 static inline uint8_t
276 read_HEX(SerdReader* reader)
277 {
278 const uint8_t c = peek_byte(reader);
279 if (is_digit(c) || in_range(c, 'A', 'F') || in_range(c, 'a', 'f')) {
280 return eat_byte_safe(reader, c);
281 } else {
282 return r_err(reader, SERD_ERR_BAD_SYNTAX,
283 "invalid hexadecimal digit `%c'\n", c);
284 }
285 }
286
287 // Read UCHAR escape, initial \ is already eaten by caller
288 static inline bool
289 read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code)
290 {
291 const uint8_t b = peek_byte(reader);
292 unsigned length = 0;
293 switch (b) {
294 case 'U':
295 length = 8;
296 break;
297 case 'u':
298 length = 4;
299 break;
300 default:
301 return false;
302 }
303 eat_byte_safe(reader, b);
304
305 uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
306 for (unsigned i = 0; i < length; ++i) {
307 if (!(buf[i] = read_HEX(reader))) {
308 return false;
309 }
310 }
311
312 uint32_t code;
313 sscanf((const char*)buf, "%X", &code);
314
315 unsigned size = 0;
316 if (code < 0x00000080) {
317 size = 1;
318 } else if (code < 0x00000800) {
319 size = 2;
320 } else if (code < 0x00010000) {
321 size = 3;
322 } else if (code < 0x00110000) {
323 size = 4;
324 } else {
325 r_err(reader, SERD_ERR_BAD_SYNTAX,
326 "unicode character 0x%X out of range\n", code);
327 push_replacement(reader, dest);
328 *char_code = 0xFFFD;
329 return true;
330 }
331
332 // Build output in buf
333 // (Note # of bytes = # of leading 1 bits in first byte)
334 uint32_t c = code;
335 switch (size) {
336 case 4:
337 buf[3] = 0x80 | (uint8_t)(c & 0x3F);
338 c >>= 6;
339 c |= (16 << 12); // set bit 4
340 case 3:
341 buf[2] = 0x80 | (uint8_t)(c & 0x3F);
342 c >>= 6;
343 c |= (32 << 6); // set bit 5
344 case 2:
345 buf[1] = 0x80 | (uint8_t)(c & 0x3F);
346 c >>= 6;
347 c |= 0xC0; // set bits 6 and 7
348 case 1:
349 buf[0] = (uint8_t)c;
350 }
351
352 for (unsigned i = 0; i < size; ++i) {
353 push_byte(reader, dest, buf[i]);
354 }
355 *char_code = code;
356 return true;
357 }
358
359 // Read ECHAR escape, initial \ is already eaten by caller
360 static inline bool
361 read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
362 {
363 const uint8_t c = peek_byte(reader);
364 switch (c) {
365 case 't':
366 eat_byte_safe(reader, 't');
367 push_byte(reader, dest, '\t');
368 return true;
369 case 'b':
370 eat_byte_safe(reader, 'b');
371 push_byte(reader, dest, '\b');
372 return true;
373 case 'n':
374 *flags |= SERD_HAS_NEWLINE;
375 eat_byte_safe(reader, 'n');
376 push_byte(reader, dest, '\n');
377 return true;
378 case 'r':
379 *flags |= SERD_HAS_NEWLINE;
380 eat_byte_safe(reader, 'r');
381 push_byte(reader, dest, '\r');
382 return true;
383 case 'f':
384 eat_byte_safe(reader, 'f');
385 push_byte(reader, dest, '\f');
386 return true;
387 case '\\': case '"': case '\'':
388 push_byte(reader, dest, eat_byte_safe(reader, c));
389 return true;
390 default:
391 return false;
392 }
393 }
394
395 static inline SerdStatus
396 bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c)
397 {
398 r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c);
399 push_replacement(reader, dest);
400
401 // Skip bytes until the next start byte
402 for (uint8_t b = peek_byte(reader); (b & 0x80);) {
403 eat_byte_safe(reader, b);
404 b = peek_byte(reader);
405 }
406
407 return SERD_SUCCESS;
408 }
409
410 static SerdStatus
411 read_utf8_character(SerdReader* reader, Ref dest, uint8_t c)
412 {
413 unsigned size = 1;
414 if ((c & 0xE0) == 0xC0) { // Starts with `110'
415 size = 2;
416 } else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
417 size = 3;
418 } else if ((c & 0xF8) == 0xF0) { // Starts with `11110'
419 size = 4;
420 } else {
421 return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c);
422 }
423
424 char bytes[4];
425 bytes[0] = c;
426
427 // Check character validity
428 for (unsigned i = 1; i < size; ++i) {
429 if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) {
430 return bad_char(reader, dest, "invalid UTF-8 continuation 0x%X\n",
431 bytes[i]);
432 }
433 eat_byte_safe(reader, bytes[i]);
434 }
435
436 // Emit character
437 for (unsigned i = 0; i < size; ++i) {
438 push_byte(reader, dest, bytes[i]);
439 }
440 return SERD_SUCCESS;
441 }
442
443 // Read one character (possibly multi-byte)
444 // The first byte, c, has already been eaten by caller
445 static inline SerdStatus
446 read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c)
447 {
448 if (!(c & 0x80)) {
449 switch (c) {
450 case 0xA: case 0xD:
451 *flags |= SERD_HAS_NEWLINE;
452 break;
453 case '"': case '\'':
454 *flags |= SERD_HAS_QUOTE;
455 break;
456 }
457 push_byte(reader, dest, c);
458 return SERD_SUCCESS;
459 } else {
460 return read_utf8_character(reader, dest, c);
461 }
462 }
463
464 // [10] comment ::= '#' ( [^#xA #xD] )*
465 static void
466 read_comment(SerdReader* reader)
467 {
468 eat_byte_safe(reader, '#');
469 uint8_t c;
470 while (((c = peek_byte(reader)) != 0xA) && (c != 0xD) && c) {
471 eat_byte_safe(reader, c);
472 }
473 }
474
475 // [24] ws ::= #x9 | #xA | #xD | #x20 | comment
476 static inline bool
477 read_ws(SerdReader* reader)
478 {
479 const uint8_t c = peek_byte(reader);
480 switch (c) {
481 case 0x9: case 0xA: case 0xD: case 0x20:
482 eat_byte_safe(reader, c);
483 return true;
484 case '#':
485 read_comment(reader);
486 return true;
487 default:
488 return false;
489 }
490 }
491
492 static inline bool
493 read_ws_star(SerdReader* reader)
494 {
495 while (read_ws(reader)) {}
496 return true;
497 }
498
499 static inline bool
500 peek_delim(SerdReader* reader, const char delim)
501 {
502 read_ws_star(reader);
503 return peek_byte(reader) == delim;
504 }
505
506 static inline bool
507 eat_delim(SerdReader* reader, const char delim)
508 {
509 if (peek_delim(reader, delim)) {
510 eat_byte_safe(reader, delim);
511 return read_ws_star(reader);
512 }
513 return false;
514 }
515
516 // STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE
517 // Initial triple quotes are already eaten by caller
518 static Ref
519 read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
520 {
521 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
522 while (true) {
523 const uint8_t c = peek_byte(reader);
524 uint32_t code;
525 switch (c) {
526 case '\\':
527 eat_byte_safe(reader, c);
528 if (!read_ECHAR(reader, ref, flags) &&
529 !read_UCHAR(reader, ref, &code)) {
530 r_err(reader, SERD_ERR_BAD_SYNTAX,
531 "invalid escape `\\%c'\n", peek_byte(reader));
532 return pop_node(reader, ref);
533 }
534 break;
535 default:
536 if (c == q) {
537 eat_byte_safe(reader, q);
538 const uint8_t q2 = eat_byte_safe(reader, peek_byte(reader));
539 const uint8_t q3 = peek_byte(reader);
540 if (q2 == q && q3 == q) { // End of string
541 eat_byte_safe(reader, q3);
542 return ref;
543 } else {
544 *flags |= SERD_HAS_QUOTE;
545 push_byte(reader, ref, c);
546 read_character(reader, ref, flags, q2);
547 }
548 } else {
549 read_character(reader, ref, flags, eat_byte_safe(reader, c));
550 }
551 }
552 }
553 return ref;
554 }
555
556 // STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE
557 // Initial quote is already eaten by caller
558 static Ref
559 read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
560 {
561 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
562 while (true) {
563 const uint8_t c = peek_byte(reader);
564 uint32_t code;
565 switch (c) {
566 case '\n': case '\r':
567 r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string\n");
568 return pop_node(reader, ref);
569 case '\\':
570 eat_byte_safe(reader, c);
571 if (!read_ECHAR(reader, ref, flags) &&
572 !read_UCHAR(reader, ref, &code)) {
573 r_err(reader, SERD_ERR_BAD_SYNTAX,
574 "invalid escape `\\%c'\n", peek_byte(reader));
575 return pop_node(reader, ref);
576 }
577 break;
578 default:
579 if (c == q) {
580 eat_byte_check(reader, q);
581 return ref;
582 } else {
583 read_character(reader, ref, flags, eat_byte_safe(reader, c));
584 }
585 }
586 }
587 eat_byte_check(reader, q);
588 return ref;
589 }
590
591 static Ref
592 read_String(SerdReader* reader, SerdNodeFlags* flags)
593 {
594 const uint8_t q1 = peek_byte(reader);
595 eat_byte_safe(reader, q1);
596
597 const uint8_t q2 = peek_byte(reader);
598 if (q2 != q1) { // Short string (not triple quoted)
599 return read_STRING_LITERAL(reader, flags, q1);
600 }
601
602 eat_byte_safe(reader, q2);
603 const uint8_t q3 = peek_byte(reader);
604 if (q3 != q1) { // Empty short string ("" or '')
605 return push_node(reader, SERD_LITERAL, "", 0);
606 }
607
608 if (!supports_fancy_literals(reader)) {
609 return r_err(reader, SERD_ERR_BAD_SYNTAX,
610 "syntax does not support long literals\n");
611 }
612
613 eat_byte_safe(reader, q3);
614 return read_STRING_LITERAL_LONG(reader, flags, q1);
615 }
616
617 static bool
618 read_PN_CHARS_BASE(SerdReader* reader, Ref dest)
619 {
620 const uint8_t c = peek_byte(reader);
621 if ((c & 0x80)) { // Multi-byte character
622 return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
623 }
624 if (is_alpha(c)) {
625 push_byte(reader, dest, eat_byte_safe(reader, c));
626 return true;
627 }
628 return false;
629 }
630
631 static bool
632 read_PN_CHARS(SerdReader* reader, Ref dest)
633 {
634 const uint8_t c = peek_byte(reader);
635 if ((c & 0x80)) { // Multi-byte character
636 return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
637 }
638
639 if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') {
640 push_byte(reader, dest, eat_byte_safe(reader, c));
641 return true;
642 }
643 return false;
644 }
645
646 static bool
647 read_PERCENT(SerdReader* reader, Ref dest)
648 {
649 push_byte(reader, dest, eat_byte_safe(reader, '%'));
650 const uint8_t h1 = read_HEX(reader);
651 const uint8_t h2 = read_HEX(reader);
652 if (h1 && h2) {
653 push_byte(reader, dest, h1);
654 push_byte(reader, dest, h2);
655 return true;
656 }
657 return false;
658 }
659
660 static SerdStatus
661 read_PLX(SerdReader* reader, Ref dest)
662 {
663 uint8_t c = peek_byte(reader);
664 switch (c) {
665 case '%':
666 if (!read_PERCENT(reader, dest)) {
667 return SERD_ERR_BAD_SYNTAX;
668 }
669 return SERD_SUCCESS;
670 case '\\':
671 eat_byte_safe(reader, c);
672 if (is_alpha(c = peek_byte(reader))) {
673 // Escapes like \u \n etc. are not supported
674 return SERD_ERR_BAD_SYNTAX;
675 } else {
676 // Allow escaping of pretty much any other character
677 push_byte(reader, dest, eat_byte_safe(reader, c));
678 return SERD_SUCCESS;
679 }
680 default:
681 return SERD_FAILURE;
682 }
683 }
684
685 static SerdStatus
686 read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot)
687 {
688 uint8_t c = peek_byte(reader);
689 SerdStatus st;
690 switch (c) {
691 case '0': case '1': case '2': case '3': case '4': case '5':
692 case '6': case '7': case '8': case '9': case ':': case '_':
693 push_byte(reader, dest, eat_byte_safe(reader, c));
694 break;
695 default:
696 if ((st = read_PLX(reader, dest)) > SERD_FAILURE) {
697 return st;
698 } else if (st != SERD_SUCCESS && !read_PN_CHARS_BASE(reader, dest)) {
699 return SERD_FAILURE;
700 }
701 }
702
703 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.' | ';')*
704 if (c == '.' || c == ':') {
705 push_byte(reader, dest, eat_byte_safe(reader, c));
706 } else if ((st = read_PLX(reader, dest)) > SERD_FAILURE) {
707 return st;
708 } else if (st != SERD_SUCCESS && !read_PN_CHARS(reader, dest)) {
709 break;
710 }
711 }
712
713 SerdNode* const n = deref(reader, dest);
714 if (n->buf[n->n_bytes - 1] == '.') {
715 // Ate trailing dot, pop it from stack/node and inform caller
716 --n->n_bytes;
717 serd_stack_pop(&reader->stack, 1);
718 *ate_dot = true;
719 }
720
721 return SERD_SUCCESS;
722 }
723
724 // Read the remainder of a PN_PREFIX after some initial characters
725 static SerdStatus
726 read_PN_PREFIX_tail(SerdReader* reader, Ref dest)
727 {
728 uint8_t c;
729 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
730 if (c == '.') {
731 push_byte(reader, dest, eat_byte_safe(reader, c));
732 } else if (!read_PN_CHARS(reader, dest)) {
733 break;
734 }
735 }
736
737 const SerdNode* const n = deref(reader, dest);
738 if (n->buf[n->n_bytes - 1] == '.' && !read_PN_CHARS(reader, dest)) {
739 r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n");
740 return SERD_ERR_BAD_SYNTAX;
741 }
742
743 return SERD_SUCCESS;
744 }
745
746 static SerdStatus
747 read_PN_PREFIX(SerdReader* reader, Ref dest)
748 {
749 if (read_PN_CHARS_BASE(reader, dest)) {
750 return read_PN_PREFIX_tail(reader, dest);
751 }
752 return SERD_FAILURE;
753 }
754
755 static Ref
756 read_LANGTAG(SerdReader* reader)
757 {
758 uint8_t c = peek_byte(reader);
759 if (!is_alpha(c)) {
760 return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c);
761 }
762 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
763 push_byte(reader, ref, eat_byte_safe(reader, c));
764 while ((c = peek_byte(reader)) && is_alpha(c)) {
765 push_byte(reader, ref, eat_byte_safe(reader, c));
766 }
767 while (peek_byte(reader) == '-') {
768 push_byte(reader, ref, eat_byte_safe(reader, '-'));
769 while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) {
770 push_byte(reader, ref, eat_byte_safe(reader, c));
771 }
772 }
773 return ref;
774 }
775
776 typedef enum { PREFIX, GOOD, BAD} SchemeState;
777
778 static inline bool
779 check_scheme(SerdReader* reader, uint8_t c, SchemeState* state)
780 {
781 if (!supports_relative_iris(reader) && *state == PREFIX) {
782 if (c == ':') {
783 *state = GOOD;
784 } else if (!isalpha(c)) {
785 *state = BAD;
786 return r_err(reader, SERD_ERR_BAD_SYNTAX,
787 "syntax does not support relative IRIs\n");
788 }
789 }
790 return true;
791 }
792
793 static Ref
794 read_IRIREF(SerdReader* reader)
795 {
796 TRY_RET(eat_byte_check(reader, '<'));
797 Ref ref = push_node(reader, SERD_URI, "", 0);
798 SchemeState scheme = PREFIX;
799 uint32_t code;
800 while (true) {
801 const uint8_t c = peek_byte(reader);
802 if (!check_scheme(reader, c, &scheme)) {
803 return pop_node(reader, ref);
804 }
805 switch (c) {
806 case '"': case '<': case '^': case '`': case '{': case '|': case '}':
807 r_err(reader, SERD_ERR_BAD_SYNTAX,
808 "invalid IRI character `%c'\n", c);
809 return pop_node(reader, ref);
810 case '>':
811 eat_byte_safe(reader, c);
812 return ref;
813 case '\\':
814 eat_byte_safe(reader, c);
815 if (!read_UCHAR(reader, ref, &code)) {
816 r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n");
817 return pop_node(reader, ref);
818 }
819 switch (code) {
820 case 0: case ' ': case '<': case '>':
821 r_err(reader, SERD_ERR_BAD_SYNTAX,
822 "invalid escaped IRI character %X %c\n", code, code);
823 return pop_node(reader, ref);
824 }
825 break;
826 default:
827 if (c <= 0x20) {
828 if (isprint(c)) {
829 r_err(reader, SERD_ERR_BAD_SYNTAX,
830 "invalid IRI character `%c' (escape %%%02X)\n", c, c);
831 } else {
832 r_err(reader, SERD_ERR_BAD_SYNTAX,
833 "invalid IRI character (escape %%%02X)\n", c, c);
834 }
835 if (reader->strict) {
836 return pop_node(reader, ref);
837 }
838 push_byte(reader, ref, eat_byte_safe(reader, c));
839 } else {
840 push_byte(reader, ref, eat_byte_safe(reader, c));
841 }
842 }
843 }
844 }
845
846 static bool
847 read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot)
848 {
849 if (read_prefix && read_PN_PREFIX(reader, dest) > SERD_FAILURE) {
850 return false;
851 } else if (peek_byte(reader) != ':') {
852 return false;
853 }
854
855 push_byte(reader, dest, eat_byte_safe(reader, ':'));
856 return read_PN_LOCAL(reader, dest, ate_dot) <= SERD_FAILURE;
857 }
858
859 static bool
860 read_0_9(SerdReader* reader, Ref str, bool at_least_one)
861 {
862 unsigned count = 0;
863 for (uint8_t c; is_digit((c = peek_byte(reader))); ++count) {
864 push_byte(reader, str, eat_byte_safe(reader, c));
865 }
866 if (at_least_one && count == 0) {
867 r_err(reader, SERD_ERR_BAD_SYNTAX, "expected digit\n");
868 }
869 return count;
870 }
871
872 static bool
873 read_number(SerdReader* reader, Ref* dest, Ref* datatype, bool* ate_dot)
874 {
875 #define XSD_DECIMAL NS_XSD "decimal"
876 #define XSD_DOUBLE NS_XSD "double"
877 #define XSD_INTEGER NS_XSD "integer"
878 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
879 uint8_t c = peek_byte(reader);
880 bool has_decimal = false;
881 if (c == '-' || c == '+') {
882 push_byte(reader, ref, eat_byte_safe(reader, c));
883 }
884 if ((c = peek_byte(reader)) == '.') {
885 has_decimal = true;
886 // decimal case 2 (e.g. '.0' or `-.0' or `+.0')
887 push_byte(reader, ref, eat_byte_safe(reader, c));
888 TRY_THROW(read_0_9(reader, ref, true));
889 } else {
890 // all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ...
891 TRY_THROW(is_digit(c));
892 read_0_9(reader, ref, true);
893 if ((c = peek_byte(reader)) == '.') {
894 has_decimal = true;
895
896 // Annoyingly, dot can be end of statement, so tentatively eat
897 eat_byte_safe(reader, c);
898 c = peek_byte(reader);
899 if (!is_digit(c) && c != 'e' && c != 'E') {
900 *dest = ref;
901 *ate_dot = true; // Force caller to deal with stupid grammar
902 return true; // Next byte is not a number character, done
903 }
904
905 push_byte(reader, ref, '.');
906 read_0_9(reader, ref, false);
907 }
908 }
909 c = peek_byte(reader);
910 if (c == 'e' || c == 'E') {
911 // double
912 push_byte(reader, ref, eat_byte_safe(reader, c));
913 switch ((c = peek_byte(reader))) {
914 case '+': case '-':
915 push_byte(reader, ref, eat_byte_safe(reader, c));
916 default: break;
917 }
918 TRY_THROW(read_0_9(reader, ref, true));
919 *datatype = push_node(reader, SERD_URI,
920 XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1);
921 } else if (has_decimal) {
922 *datatype = push_node(reader, SERD_URI,
923 XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1);
924 } else {
925 *datatype = push_node(reader, SERD_URI,
926 XSD_INTEGER, sizeof(XSD_INTEGER) - 1);
927 }
928 *dest = ref;
929 return true;
930 except:
931 pop_node(reader, *datatype);
932 pop_node(reader, ref);
933 return false;
934 }
935
936 static bool
937 read_iri(SerdReader* reader, Ref* dest, bool* ate_dot)
938 {
939 switch (peek_byte(reader)) {
940 case '<':
941 *dest = read_IRIREF(reader);
942 return true;
943 default:
944 *dest = push_node(reader, SERD_CURIE, "", 0);
945 return read_PrefixedName(reader, *dest, true, ate_dot);
946 }
947 }
948
949 static bool
950 read_literal(SerdReader* reader, Ref* dest,
951 Ref* datatype, Ref* lang, SerdNodeFlags* flags, bool* ate_dot)
952 {
953 Ref str = read_String(reader, flags);
954 if (!str) {
955 return false;
956 }
957
958 switch (peek_byte(reader)) {
959 case '@':
960 eat_byte_safe(reader, '@');
961 TRY_THROW(*lang = read_LANGTAG(reader));
962 break;
963 case '^':
964 eat_byte_safe(reader, '^');
965 eat_byte_check(reader, '^');
966 TRY_THROW(read_iri(reader, datatype, ate_dot));
967 break;
968 }
969 *dest = str;
970 return true;
971 except:
972 *datatype = pop_node(reader, *datatype);
973 *lang = pop_node(reader, *lang);
974 pop_node(reader, str);
975 return false;
976 }
977
/** Return true iff `c` can directly follow a token.

    That is the case for whitespace (tab, LF, CR, space), the null byte, and
    the characters '#', '.', ';' and '<'.
*/
inline static bool
is_token_end(uint8_t c)
{
    const bool is_space = (c == 0x9 || c == 0xA || c == 0xD || c == 0x20);
    const bool is_delim = (c == '#' || c == '.' || c == ';' || c == '<');
    return is_space || is_delim || c == '\0';
}
989
990 static bool
991 read_verb(SerdReader* reader, Ref* dest)
992 {
993 if (peek_byte(reader) == '<') {
994 return (*dest = read_IRIREF(reader));
995 } else {
996 /* Either a qname, or "a". Read the prefix first, and if it is in fact
997 "a", produce that instead.
998 */
999 *dest = push_node(reader, SERD_CURIE, "", 0);
1000 SerdNode* node = deref(reader, *dest);
1001 const SerdStatus st = read_PN_PREFIX(reader, *dest);
1002 bool ate_dot = false;
1003 if (!st && node->n_bytes == 1 && node->buf[0] == 'a' &&
1004 is_token_end(peek_byte(reader))) {
1005 pop_node(reader, *dest);
1006 return (*dest = push_node(reader, SERD_URI, NS_RDF "type", 47));
1007 } else if (st > SERD_FAILURE ||
1008 !read_PrefixedName(reader, *dest, false, &ate_dot) ||
1009 ate_dot) {
1010 return (*dest = pop_node(reader, *dest));
1011 } else {
1012 return true;
1013 }
1014 }
1015 return false;
1016 }
1017
1018 static Ref
1019 read_BLANK_NODE_LABEL(SerdReader* reader, bool* ate_dot)
1020 {
1021 eat_byte_safe(reader, '_');
1022 eat_byte_check(reader, ':');
1023 Ref ref = push_node(reader, SERD_BLANK,
1024 reader->bprefix ? (char*)reader->bprefix : "",
1025 reader->bprefix_len);
1026
1027 uint8_t c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9])
1028 if (is_digit(c) || c == '_') {
1029 push_byte(reader, ref, eat_byte_safe(reader, c));
1030 } else if (!read_PN_CHARS(reader, ref)) {
1031 r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start character\n");
1032 return pop_node(reader, ref);
1033 }
1034
1035 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
1036 if (c == '.') {
1037 push_byte(reader, ref, eat_byte_safe(reader, c));
1038 } else if (!read_PN_CHARS(reader, ref)) {
1039 break;
1040 }
1041 }
1042
1043 SerdNode* n = deref(reader, ref);
1044 if (n->buf[n->n_bytes - 1] == '.' && !read_PN_CHARS(reader, ref)) {
1045 // Ate trailing dot, pop it from stack/node and inform caller
1046 --n->n_bytes;
1047 serd_stack_pop(&reader->stack, 1);
1048 *ate_dot = true;
1049 }
1050
1051 if (reader->syntax == SERD_TURTLE) {
1052 if (is_digit(n->buf[reader->bprefix_len + 1])) {
1053 if ((n->buf[reader->bprefix_len]) == 'b') {
1054 ((char*)n->buf)[reader->bprefix_len] = 'B'; // Prevent clash
1055 reader->seen_genid = true;
1056 } else if (reader->seen_genid &&
1057 n->buf[reader->bprefix_len] == 'B') {
1058 r_err(reader, SERD_ERR_ID_CLASH,
1059 "found both `b' and `B' blank IDs, prefix required\n");
1060 return pop_node(reader, ref);
1061 }
1062 }
1063 }
1064 return ref;
1065 }
1066
1067 static void
1068 set_blank_id(SerdReader* reader, Ref ref, size_t buf_size)
1069 {
1070 SerdNode* node = deref(reader, ref);
1071 const char* prefix = reader->bprefix ? (const char*)reader->bprefix : "";
1072 node->n_bytes = node->n_chars = snprintf(
1073 (char*)node->buf, buf_size, "%sb%u", prefix, reader->next_id++);
1074 }
1075
1076 static size_t
1077 genid_size(SerdReader* reader)
1078 {
1079 return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0
1080 }
1081
1082 static Ref
1083 blank_id(SerdReader* reader)
1084 {
1085 Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
1086 set_blank_id(reader, ref, genid_size(reader));
1087 return ref;
1088 }
1089
1090 static Ref
1091 read_blankName(SerdReader* reader)
1092 {
1093 eat_byte_safe(reader, '=');
1094 if (eat_byte_check(reader, '=') != '=') {
1095 return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n");
1096 }
1097
1098 Ref subject = 0;
1099 bool ate_dot = false;
1100 read_ws_star(reader);
1101 read_iri(reader, &subject, &ate_dot);
1102 return subject;
1103 }
1104
1105 static bool
1106 read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest)
1107 {
1108 const SerdStatementFlags old_flags = *ctx.flags;
1109 bool empty;
1110 eat_byte_safe(reader, '[');
1111 if ((empty = peek_delim(reader, ']'))) {
1112 *ctx.flags |= (subject) ? SERD_EMPTY_S : SERD_EMPTY_O;
1113 } else {
1114 *ctx.flags |= (subject) ? SERD_ANON_S_BEGIN : SERD_ANON_O_BEGIN;
1115 if (peek_delim(reader, '=')) {
1116 if (!(*dest = read_blankName(reader)) ||
1117 !eat_delim(reader, ';')) {
1118 return false;
1119 }
1120 }
1121 }
1122
1123 if (!*dest) {
1124 *dest = blank_id(reader);
1125 }
1126 if (ctx.subject) {
1127 TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
1128 }
1129
1130 ctx.subject = *dest;
1131 if (!empty) {
1132 *ctx.flags &= ~(SERD_LIST_CONT);
1133 if (!subject) {
1134 *ctx.flags |= SERD_ANON_CONT;
1135 }
1136 bool ate_dot_in_list = false;
1137 read_predicateObjectList(reader, ctx, &ate_dot_in_list);
1138 if (ate_dot_in_list) {
1139 return r_err(reader, SERD_ERR_BAD_SYNTAX, "`.' inside blank\n");
1140 }
1141 read_ws_star(reader);
1142 if (reader->end_sink) {
1143 reader->end_sink(reader->handle, deref(reader, *dest));
1144 }
1145 *ctx.flags = old_flags;
1146 }
1147 return (eat_byte_check(reader, ']') == ']');
1148 }
1149
1150 /* If emit is true: recurses, calling statement_sink for every statement
1151 encountered, and leaves stack in original calling state (i.e. pops
1152 everything it pushes). */
1153 static bool
1154 read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot)
1155 {
1156 static const char* const XSD_BOOLEAN = NS_XSD "boolean";
1157 static const size_t XSD_BOOLEAN_LEN = 40;
1158
1159 #ifndef NDEBUG
1160 const size_t orig_stack_size = reader->stack.size;
1161 #endif
1162
1163 bool ret = false;
1164 bool simple = (ctx->subject != 0);
1165 SerdNode* node = NULL;
1166 Ref o = 0;
1167 Ref datatype = 0;
1168 Ref lang = 0;
1169 uint32_t flags = 0;
1170 const uint8_t c = peek_byte(reader);
1171 if (!supports_fancy_literals(reader)) {
1172 switch (c) {
1173 case '"': case ':': case '<': case '_': break;
1174 default: return r_err(reader, SERD_ERR_BAD_SYNTAX,
1175 "expected: ':', '<', or '_'\n");
1176 }
1177 }
1178 switch (c) {
1179 case '\0':
1180 case ')':
1181 return false;
1182 case '[':
1183 simple = false;
1184 TRY_THROW(ret = read_anon(reader, *ctx, false, &o));
1185 break;
1186 case '(':
1187 simple = false;
1188 TRY_THROW(ret = read_collection(reader, *ctx, &o));
1189 break;
1190 case '_':
1191 TRY_THROW(ret = (o = read_BLANK_NODE_LABEL(reader, ate_dot)));
1192 break;
1193 case '<': case ':':
1194 TRY_THROW(ret = read_iri(reader, &o, ate_dot));
1195 break;
1196 case '+': case '-': case '.': case '0': case '1': case '2': case '3':
1197 case '4': case '5': case '6': case '7': case '8': case '9':
1198 TRY_THROW(ret = read_number(reader, &o, &datatype, ate_dot));
1199 break;
1200 case '\"':
1201 case '\'':
1202 TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags, ate_dot));
1203 break;
1204 default:
1205 /* Either a boolean literal, or a qname. Read the prefix first, and if
1206 it is in fact a "true" or "false" literal, produce that instead.
1207 */
1208 node = deref(reader, o = push_node(reader, SERD_CURIE, "", 0));
1209 while (read_PN_CHARS_BASE(reader, o)) {}
1210 if ((node->n_bytes == 4 && !memcmp(node->buf, "true", 4)) ||
1211 (node->n_bytes == 5 && !memcmp(node->buf, "false", 5))) {
1212 node->type = SERD_LITERAL;
1213 datatype = push_node(
1214 reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN);
1215 ret = true;
1216 } else if (read_PN_PREFIX_tail(reader, o) > SERD_FAILURE) {
1217 ret = false;
1218 } else {
1219 ret = read_PrefixedName(reader, o, false, ate_dot);
1220 }
1221 }
1222
1223 if (simple && o) {
1224 deref(reader, o)->flags = flags;
1225 }
1226
1227 if (ret && emit && simple) {
1228 ret = emit_statement(reader, *ctx, o, datatype, lang);
1229 } else if (ret && !emit) {
1230 ctx->object = o;
1231 ctx->datatype = datatype;
1232 ctx->lang = lang;
1233 return true;
1234 }
1235
1236 except:
1237 pop_node(reader, lang);
1238 pop_node(reader, datatype);
1239 pop_node(reader, o);
1240 #ifndef NDEBUG
1241 assert(reader->stack.size == orig_stack_size);
1242 #endif
1243 return ret;
1244 }
1245
1246 static bool
1247 read_objectList(SerdReader* reader, ReadContext ctx, bool* ate_dot)
1248 {
1249 TRY_RET(read_object(reader, &ctx, true, ate_dot));
1250 while (!*ate_dot && eat_delim(reader, ',')) {
1251 TRY_RET(read_object(reader, &ctx, true, ate_dot));
1252 }
1253 return true;
1254 }
1255
1256 static bool
1257 read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot)
1258 {
1259 uint8_t c;
1260 while (true) {
1261 TRY_THROW(read_verb(reader, &ctx.predicate));
1262 read_ws_star(reader);
1263
1264 TRY_THROW(read_objectList(reader, ctx, ate_dot));
1265 ctx.predicate = pop_node(reader, ctx.predicate);
1266 if (*ate_dot) {
1267 return true;
1268 }
1269
1270 bool ate_semi = false;
1271 do {
1272 read_ws_star(reader);
1273 switch (c = peek_byte(reader)) {
1274 case 0:
1275 return false;
1276 case '.': case ']': case '}':
1277 return true;
1278 case ';':
1279 eat_byte_safe(reader, c);
1280 ate_semi = true;
1281 }
1282 } while (c == ';');
1283
1284 if (!ate_semi) {
1285 return r_err(reader, SERD_ERR_BAD_SYNTAX, "missing ';' or '.'\n");
1286 }
1287 }
1288
1289 pop_node(reader, ctx.predicate);
1290 return true;
1291 except:
1292 pop_node(reader, ctx.predicate);
1293 return false;
1294 }
1295
1296 static bool
1297 end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, bool ret)
1298 {
1299 pop_node(reader, n2);
1300 pop_node(reader, n1);
1301 *ctx.flags &= ~SERD_LIST_CONT;
1302 return ret && (eat_byte_safe(reader, ')') == ')');
1303 }
1304
1305 static bool
1306 read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
1307 {
1308 eat_byte_safe(reader, '(');
1309 bool end = peek_delim(reader, ')');
1310 *dest = end ? reader->rdf_nil : blank_id(reader);
1311 if (ctx.subject) {
1312 // subject predicate _:head
1313 *ctx.flags |= (end ? 0 : SERD_LIST_O_BEGIN);
1314 TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
1315 *ctx.flags |= SERD_LIST_CONT;
1316 } else {
1317 *ctx.flags |= (end ? 0 : SERD_LIST_S_BEGIN);
1318 }
1319
1320 if (end) {
1321 return end_collection(reader, ctx, 0, 0, true);
1322 }
1323
1324 /* The order of node allocation here is necessarily not in stack order,
1325 so we create two nodes and recycle them throughout. */
1326 Ref n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
1327 Ref n2 = 0;
1328 Ref node = n1;
1329 Ref rest = 0;
1330
1331 ctx.subject = *dest;
1332 while (!(end = peek_delim(reader, ')'))) {
1333 // _:node rdf:first object
1334 ctx.predicate = reader->rdf_first;
1335 bool ate_dot = false;
1336 if (!read_object(reader, &ctx, true, &ate_dot) || ate_dot) {
1337 return end_collection(reader, ctx, n1, n2, false);
1338 }
1339
1340 if (!(end = peek_delim(reader, ')'))) {
1341 /* Give rest a new ID. Done as late as possible to ensure it is
1342 used and > IDs generated by read_object above. */
1343 if (!rest) {
1344 rest = n2 = blank_id(reader); // First pass, push
1345 } else {
1346 set_blank_id(reader, rest, genid_size(reader));
1347 }
1348 }
1349
1350 // _:node rdf:rest _:rest
1351 *ctx.flags |= SERD_LIST_CONT;
1352 ctx.predicate = reader->rdf_rest;
1353 TRY_RET(emit_statement(reader, ctx,
1354 (end ? reader->rdf_nil : rest), 0, 0));
1355
1356 ctx.subject = rest; // _:node = _:rest
1357 rest = node; // _:rest = (old)_:node
1358 node = ctx.subject; // invariant
1359 }
1360
1361 return end_collection(reader, ctx, n1, n2, true);
1362 }
1363
1364 static Ref
1365 read_subject(SerdReader* reader, ReadContext ctx, Ref* dest, char* s_type)
1366 {
1367 bool ate_dot = false;
1368 switch ((*s_type = peek_byte(reader))) {
1369 case '[':
1370 read_anon(reader, ctx, true, dest);
1371 break;
1372 case '(':
1373 read_collection(reader, ctx, dest);
1374 break;
1375 case '_':
1376 *dest = read_BLANK_NODE_LABEL(reader, &ate_dot);
1377 break;
1378 default:
1379 TRY_RET(read_iri(reader, dest, &ate_dot));
1380 }
1381 return ate_dot ? pop_node(reader, *dest) : *dest;
1382 }
1383
1384 static Ref
1385 read_labelOrSubject(SerdReader* reader, ReadContext ctx)
1386 {
1387 Ref subject = 0;
1388 bool ate_dot = false;
1389 switch (peek_byte(reader)) {
1390 case '[':
1391 eat_byte_safe(reader, '[');
1392 read_ws_star(reader);
1393 TRY_RET(eat_byte_check(reader, ']'));
1394 return blank_id(reader);
1395 case '_':
1396 return read_BLANK_NODE_LABEL(reader, &ate_dot);
1397 default:
1398 read_iri(reader, &subject, &ate_dot);
1399 }
1400 return subject;
1401 }
1402
1403 static bool
1404 read_triples(SerdReader* reader, ReadContext ctx, bool* ate_dot)
1405 {
1406 bool ret = false;
1407 if (ctx.subject) {
1408 read_ws_star(reader);
1409 switch (peek_byte(reader)) {
1410 case '.':
1411 *ate_dot = eat_byte_safe(reader, '.');
1412 return false;
1413 case '}':
1414 return false;
1415 }
1416 ret = read_predicateObjectList(reader, ctx, ate_dot);
1417 }
1418 ctx.subject = ctx.predicate = 0;
1419 return ret;
1420 }
1421
1422 static bool
1423 read_base(SerdReader* reader, bool sparql, bool token)
1424 {
1425 if (token) {
1426 TRY_RET(eat_string(reader, "base", 4));
1427 }
1428
1429 Ref uri;
1430 read_ws_star(reader);
1431 TRY_RET(uri = read_IRIREF(reader));
1432 if (reader->base_sink) {
1433 reader->base_sink(reader->handle, deref(reader, uri));
1434 }
1435 pop_node(reader, uri);
1436
1437 read_ws_star(reader);
1438 if (!sparql) {
1439 return eat_byte_check(reader, '.');
1440 } else if (peek_byte(reader) == '.') {
1441 return r_err(reader, SERD_ERR_BAD_SYNTAX,
1442 "full stop after SPARQL BASE\n");
1443 }
1444 return true;
1445 }
1446
1447 static bool
1448 read_prefixID(SerdReader* reader, bool sparql, bool token)
1449 {
1450 if (token) {
1451 TRY_RET(eat_string(reader, "prefix", 6));
1452 }
1453
1454 read_ws_star(reader);
1455 bool ret = true;
1456 Ref name = push_node(reader, SERD_LITERAL, "", 0);
1457 if (read_PN_PREFIX(reader, name) > SERD_FAILURE) {
1458 return pop_node(reader, name);
1459 }
1460
1461 if (eat_byte_check(reader, ':') != ':') {
1462 return pop_node(reader, name);
1463 }
1464
1465 read_ws_star(reader);
1466 const Ref uri = read_IRIREF(reader);
1467 if (!uri) {
1468 pop_node(reader, name);
1469 return false;
1470 }
1471
1472 if (reader->prefix_sink) {
1473 ret = !reader->prefix_sink(reader->handle,
1474 deref(reader, name),
1475 deref(reader, uri));
1476 }
1477 pop_node(reader, uri);
1478 pop_node(reader, name);
1479 if (!sparql) {
1480 read_ws_star(reader);
1481 return eat_byte_check(reader, '.');
1482 }
1483 return ret;
1484 }
1485
1486 static bool
1487 read_directive(SerdReader* reader)
1488 {
1489 const bool sparql = peek_byte(reader) != '@';
1490 if (!sparql) {
1491 eat_byte_safe(reader, '@');
1492 switch (peek_byte(reader)) {
1493 case 'B': case 'P':
1494 return r_err(reader, SERD_ERR_BAD_SYNTAX,
1495 "uppercase directive\n");
1496 }
1497 }
1498
1499 switch (peek_byte(reader)) {
1500 case 'B': case 'b': return read_base(reader, sparql, true);
1501 case 'P': case 'p': return read_prefixID(reader, sparql, true);
1502 default:
1503 return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid directive\n");
1504 }
1505
1506 return true;
1507 }
1508
1509 static bool
1510 read_wrappedGraph(SerdReader* reader, ReadContext* ctx)
1511 {
1512 bool ate_dot = false;
1513 char s_type = 0;
1514 TRY_RET(eat_byte_check(reader, '{'));
1515 read_ws_star(reader);
1516 while (peek_byte(reader) != '}') {
1517 ctx->subject = 0;
1518 Ref subj = read_subject(reader, *ctx, &ctx->subject, &s_type);
1519 if (!subj ||
1520 (!read_triples(reader, *ctx, &ate_dot) && s_type != '[')) {
1521 return false;
1522 }
1523 pop_node(reader, subj);
1524 read_ws_star(reader);
1525 if (peek_byte(reader) == '.') {
1526 eat_byte_safe(reader, '.');
1527 }
1528 read_ws_star(reader);
1529 }
1530 return eat_byte_check(reader, '}');
1531 }
1532
1533 static int
1534 tokcmp(SerdReader* reader, Ref ref, const char* tok, size_t n)
1535 {
1536 SerdNode* node = deref(reader, ref);
1537 if (!node || node->n_bytes != n) {
1538 return -1;
1539 }
1540 const char* s1 = (const char*)node->buf;
1541 const char* s2 = tok;
1542 for (; n > 0 && *s2; s1++, s2++, --n) {
1543 if (toupper(*s1) != toupper(*s2)) {
1544 return ((*(uint8_t*)s1 < *(uint8_t*)s2) ? -1 : +1);
1545 }
1546 }
1547 return 0;
1548 }
1549
1550 static bool
1551 read_statement(SerdReader* reader)
1552 {
1553 SerdStatementFlags flags = 0;
1554 ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags };
1555 Ref subj = 0;
1556 bool ate_dot = false;
1557 char s_type = false;
1558 bool ret = true;
1559 read_ws_star(reader);
1560 switch (peek_byte(reader)) {
1561 case '\0':
1562 reader->eof = true;
1563 return reader->status <= SERD_FAILURE;
1564 case '@':
1565 TRY_RET(read_directive(reader));
1566 read_ws_star(reader);
1567 break;
1568 case '{':
1569 if (reader->syntax == SERD_TRIG) {
1570 TRY_RET(read_wrappedGraph(reader, &ctx));
1571 read_ws_star(reader);
1572 } else {
1573 return r_err(reader, SERD_ERR_BAD_SYNTAX, "graph in Turtle\n");
1574 }
1575 break;
1576 default:
1577 subj = read_subject(reader, ctx, &ctx.subject, &s_type);
1578 if (!tokcmp(reader, ctx.subject, "base", 4)) {
1579 ret = read_base(reader, true, false);
1580 } else if (!tokcmp(reader, ctx.subject, "prefix", 6)) {
1581 ret = read_prefixID(reader, true, false);
1582 } else if (!tokcmp(reader, ctx.subject, "graph", 5)) {
1583 read_ws_star(reader);
1584 TRY_RET((ctx.graph = read_labelOrSubject(reader, ctx)));
1585 read_ws_star(reader);
1586 TRY_RET(read_wrappedGraph(reader, &ctx));
1587 read_ws_star(reader);
1588 } else if (read_ws_star(reader) && peek_byte(reader) == '{') {
1589 if (s_type == '(' || (s_type == '[' && !*ctx.flags)) {
1590 return false; // invalid graph with complex label
1591 }
1592 ctx.graph = subj;
1593 ctx.subject = subj = 0;
1594 TRY_RET(read_wrappedGraph(reader, &ctx));
1595 read_ws_star(reader);
1596 } else if (!subj) {
1597 ret = r_err(reader, SERD_ERR_BAD_SYNTAX, "bad subject\n");
1598 } else if (!read_triples(reader, ctx, &ate_dot)) {
1599 ret = (s_type == '[');
1600 } else if (!ate_dot) {
1601 read_ws_star(reader);
1602 ret = (eat_byte_check(reader, '.') == '.');
1603 }
1604 pop_node(reader, subj);
1605 break;
1606 }
1607 return ret;
1608 }
1609
1610 static bool
1611 read_turtleDoc(SerdReader* reader)
1612 {
1613 while (!reader->eof) {
1614 TRY_RET(read_statement(reader));
1615 }
1616 return reader->status <= SERD_FAILURE;
1617 }
1618
1619 static bool
1620 read_trigDoc(SerdReader* reader)
1621 {
1622 while (!reader->eof) {
1623 TRY_RET(read_statement(reader));
1624 }
1625 return reader->status <= SERD_FAILURE;
1626 }
1627
1628 static bool
1629 read_nquadsDoc(SerdReader* reader)
1630 {
1631 while (!reader->eof) {
1632 SerdStatementFlags flags = 0;
1633 ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags };
1634 bool ate_dot = false;
1635 char s_type = false;
1636 read_ws_star(reader);
1637 if (peek_byte(reader) == '\0') {
1638 reader->eof = true;
1639 break;
1640 }
1641
1642 // subject predicate object
1643 if (!(ctx.subject = read_subject(reader, ctx, &ctx.subject, &s_type)) ||
1644 !read_ws_star(reader) ||
1645 !(ctx.predicate = read_IRIREF(reader)) ||
1646 !read_ws_star(reader) ||
1647 !read_object(reader, &ctx, false, &ate_dot)) {
1648 return false;
1649 }
1650
1651 if (!ate_dot) { // graphLabel?
1652 TRY_RET(read_ws_star(reader));
1653 switch (peek_byte(reader)) {
1654 case '.':
1655 break;
1656 case '_':
1657 ctx.graph = read_BLANK_NODE_LABEL(reader, &ate_dot);
1658 break;
1659 default:
1660 if (!(ctx.graph = read_IRIREF(reader))) {
1661 return false;
1662 }
1663 }
1664
1665 // Terminating '.'
1666 TRY_RET(read_ws_star(reader));
1667 eat_byte_check(reader, '.');
1668 }
1669
1670 TRY_RET(emit_statement(reader, ctx, ctx.object, ctx.datatype, ctx.lang));
1671 pop_node(reader, ctx.graph);
1672 pop_node(reader, ctx.lang);
1673 pop_node(reader, ctx.datatype);
1674 pop_node(reader, ctx.object);
1675 }
1676 return reader->status <= SERD_FAILURE;
1677 }
1678
1679 static bool
1680 read_doc(SerdReader* reader)
1681 {
1682 switch (reader->syntax) {
1683 case SERD_NQUADS: return read_nquadsDoc(reader);
1684 case SERD_TRIG: return read_trigDoc(reader);
1685 default: return read_turtleDoc(reader);
1686 }
1687 }
1688
1689 SERD_API
1690 SerdReader*
1691 serd_reader_new(SerdSyntax syntax,
1692 void* handle,
1693 void (*free_handle)(void*),
1694 SerdBaseSink base_sink,
1695 SerdPrefixSink prefix_sink,
1696 SerdStatementSink statement_sink,
1697 SerdEndSink end_sink)
1698 {
1699 const Cursor cur = { NULL, 0, 0 };
1700 SerdReader* me = (SerdReader*)calloc(1, sizeof(SerdReader));
1701 me->handle = handle;
1702 me->free_handle = free_handle;
1703 me->base_sink = base_sink;
1704 me->prefix_sink = prefix_sink;
1705 me->statement_sink = statement_sink;
1706 me->end_sink = end_sink;
1707 me->default_graph = SERD_NODE_NULL;
1708 me->stack = serd_stack_new(SERD_PAGE_SIZE);
1709 me->syntax = syntax;
1710 me->cur = cur;
1711 me->next_id = 1;
1712
1713 me->rdf_first = push_node(me, SERD_URI, NS_RDF "first", 48);
1714 me->rdf_rest = push_node(me, SERD_URI, NS_RDF "rest", 47);
1715 me->rdf_nil = push_node(me, SERD_URI, NS_RDF "nil", 46);
1716
1717 return me;
1718 }
1719
1720 SERD_API
1721 void
1722 serd_reader_set_strict(SerdReader* reader, bool strict)
1723 {
1724 reader->strict = strict;
1725 }
1726
1727 SERD_API
1728 void
1729 serd_reader_set_error_sink(SerdReader* reader,
1730 SerdErrorSink error_sink,
1731 void* error_handle)
1732 {
1733 reader->error_sink = error_sink;
1734 reader->error_handle = error_handle;
1735 }
1736
1737 SERD_API
1738 void
1739 serd_reader_free(SerdReader* reader)
1740 {
1741 pop_node(reader, reader->rdf_nil);
1742 pop_node(reader, reader->rdf_rest);
1743 pop_node(reader, reader->rdf_first);
1744 serd_node_free(&reader->default_graph);
1745
1746 #ifdef SERD_STACK_CHECK
1747 free(reader->allocs);
1748 #endif
1749 free(reader->stack.buf);
1750 free(reader->bprefix);
1751 if (reader->free_handle) {
1752 reader->free_handle(reader->handle);
1753 }
1754 free(reader);
1755 }
1756
1757 SERD_API
1758 void*
1759 serd_reader_get_handle(const SerdReader* reader)
1760 {
1761 return reader->handle;
1762 }
1763
1764 SERD_API
1765 void
1766 serd_reader_add_blank_prefix(SerdReader* reader,
1767 const uint8_t* prefix)
1768 {
1769 free(reader->bprefix);
1770 reader->bprefix_len = 0;
1771 reader->bprefix = NULL;
1772 if (prefix) {
1773 reader->bprefix_len = strlen((const char*)prefix);
1774 reader->bprefix = (uint8_t*)malloc(reader->bprefix_len + 1);
1775 memcpy(reader->bprefix, prefix, reader->bprefix_len + 1);
1776 }
1777 }
1778
1779 SERD_API
1780 void
1781 serd_reader_set_default_graph(SerdReader* reader,
1782 const SerdNode* graph)
1783 {
1784 serd_node_free(&reader->default_graph);
1785 reader->default_graph = serd_node_copy(graph);
1786 }
1787
1788 SERD_API
1789 SerdStatus
1790 serd_reader_read_file(SerdReader* reader,
1791 const uint8_t* uri)
1792 {
1793 uint8_t* const path = serd_file_uri_parse(uri, NULL);
1794 if (!path) {
1795 return SERD_ERR_BAD_ARG;
1796 }
1797
1798 FILE* fd = serd_fopen((const char*)path, "r");
1799 if (!fd) {
1800 free(path);
1801 return SERD_ERR_UNKNOWN;
1802 }
1803
1804 SerdStatus ret = serd_reader_read_file_handle(reader, fd, path);
1805 fclose(fd);
1806 free(path);
1807 return ret;
1808 }
1809
1810 static bool
1811 skip_bom(SerdReader* me)
1812 {
1813 if (peek_byte(me) == 0xEF) {
1814 eat_byte_safe(me, 0xEF);
1815 if (eat_byte_check(me, 0xBB) != 0xBB ||
1816 eat_byte_check(me, 0xBF) != 0xBF) {
1817 return r_err(me, SERD_ERR_BAD_SYNTAX, "corrupt byte order mark\n");
1818 }
1819 }
1820
1821 return true;
1822 }
1823
1824 SERD_API
1825 SerdStatus
1826 serd_reader_start_stream(SerdReader* me,
1827 FILE* file,
1828 const uint8_t* name,
1829 bool bulk)
1830 {
1831 return serd_reader_start_source_stream(
1832 me,
1833 bulk ? (SerdSource)fread : serd_file_read_byte,
1834 (SerdStreamErrorFunc)ferror,
1835 file,
1836 name,
1837 bulk ? SERD_PAGE_SIZE : 1);
1838 }
1839
1840 SERD_API
1841 SerdStatus
1842 serd_reader_start_source_stream(SerdReader* me,
1843 SerdSource read_func,
1844 SerdStreamErrorFunc error_func,
1845 void* stream,
1846 const uint8_t* name,
1847 size_t page_size)
1848 {
1849 const Cursor cur = { name, 1, 1 };
1850 me->cur = cur;
1851
1852 return serd_byte_source_open_source(
1853 &me->source, read_func, error_func, stream, page_size);
1854 }
1855
1856 static SerdStatus
1857 serd_reader_prepare(SerdReader* me)
1858 {
1859 me->eof = false;
1860 if ((me->status = serd_byte_source_prepare(&me->source))) {
1861 r_err(me, me->status, "read error: %s\n", strerror(errno));
1862 } else if (!skip_bom(me)) {
1863 me->status = SERD_ERR_BAD_SYNTAX;
1864 }
1865 return me->status;
1866 }
1867
1868 SERD_API
1869 SerdStatus
1870 serd_reader_read_chunk(SerdReader* me)
1871 {
1872 SerdStatus st = SERD_SUCCESS;
1873 if (!me->source.prepared) {
1874 if ((st = serd_reader_prepare(me))) {
1875 return st;
1876 }
1877 } else if (me->eof) {
1878 me->eof = false;
1879 if ((st = serd_byte_source_advance(&me->source))) {
1880 return st;
1881 }
1882 }
1883 return read_statement(me) ? SERD_SUCCESS : SERD_FAILURE;
1884 }
1885
1886 SERD_API
1887 SerdStatus
1888 serd_reader_end_stream(SerdReader* me)
1889 {
1890 return serd_byte_source_close(&me->source);
1891 }
1892
1893 SERD_API
1894 SerdStatus
1895 serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name)
1896 {
1897 return serd_reader_read_source(
1898 me, (SerdSource)fread, (SerdStreamErrorFunc)ferror,
1899 file, name, SERD_PAGE_SIZE);
1900 }
1901
1902 SERD_API
1903 SerdStatus
1904 serd_reader_read_source(SerdReader* me,
1905 SerdSource source,
1906 SerdStreamErrorFunc error,
1907 void* stream,
1908 const uint8_t* name,
1909 size_t page_size)
1910 {
1911 SerdStatus st = serd_reader_start_source_stream(
1912 me, source, error, stream, name, page_size);
1913
1914 if ((st = serd_reader_prepare(me))) {
1915 serd_reader_end_stream(me);
1916 return st;
1917 } else if (!read_doc(me)) {
1918 serd_reader_end_stream(me);
1919 return SERD_ERR_UNKNOWN;
1920 }
1921
1922 return serd_reader_end_stream(me);
1923 }
1924
1925 SERD_API
1926 SerdStatus
1927 serd_reader_read_string(SerdReader* me, const uint8_t* utf8)
1928 {
1929 const Cursor cur = { (const uint8_t*)"(string)", 1, 1 };
1930
1931 serd_byte_source_open_string(&me->source, utf8);
1932 me->cur = cur;
1933 me->eof = false;
1934
1935 SerdStatus st = serd_reader_prepare(me);
1936 if (!st) {
1937 st = read_doc(me) ? SERD_SUCCESS : SERD_ERR_UNKNOWN;
1938 }
1939
1940 serd_byte_source_close(&me->source);
1941
1942 return st;
1943 }